From b98cd83253d24ad3aa1d02422a129f4a106468fa Mon Sep 17 00:00:00 2001
From: Igor Davidyuk
Date: Fri, 27 Aug 2021 11:26:29 +0300
Subject: [PATCH 01/31] introduce envoy config with gpus

added a message to director.proto with gpu info
---
 .../PyTorch_Kvasir_UNet/envoy/start_envoy.sh  |  2 +-
 .../{shard_config.yaml => envoy_config.yaml}  | 20 ++++++++++++++-----
 openfl-workspace/default/shard_descriptor.py  |  3 ++-
 openfl/interface/envoy.py                     | 15 +++++++-------
 openfl/protocols/director.proto               |  9 +++++++--
 .../pytorch_kvasir_unet/envoy/start_envoy.sh  |  2 +-
 6 files changed, 34 insertions(+), 17 deletions(-)
 rename openfl-workspace/default/{shard_config.yaml => envoy_config.yaml} (66%)

diff --git a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/start_envoy.sh b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/start_envoy.sh
index 222d3988e0..3f06f2113b 100755
--- a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/start_envoy.sh
+++ b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/start_envoy.sh
@@ -1,4 +1,4 @@
 #!/bin/bash
 set -e
 
-fx envoy start -n env_one --disable-tls --shard-config-path shard_config.yaml -dh localhost -dp 50051
\ No newline at end of file
+fx envoy start -n env_one --disable-tls --shard-config-path envoy_config.yaml -dh localhost -dp 50051
\ No newline at end of file
diff --git a/openfl-workspace/default/shard_config.yaml b/openfl-workspace/default/envoy_config.yaml
similarity index 66%
rename from openfl-workspace/default/shard_config.yaml
rename to openfl-workspace/default/envoy_config.yaml
index cb08ac22d2..c8b2560fb8 100644
--- a/openfl-workspace/default/shard_config.yaml
+++ b/openfl-workspace/default/envoy_config.yaml
@@ -1,6 +1,12 @@
 # Copyright (C) 2020-2021 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
+# SETUP ENVOY PARAMETERS
+
+# cuda_devices - lists the indices of the CUDA devices that the Envoy is allowed to use
+# in Federated experiments
+
+# SETUP SHARD DESCRIPTOR
 # To start envoy implement LocalShardDescriptor class in shard_descriptor module.
 # Alternatively, point to an implemented Shard Descriptor in 'template' field.
 
@@ -10,8 +16,12 @@
 # 2. sample_shape - shape of sample's numpy representaion that will by return from __getitem__
 # 3. target_shape - shape of target's numpy representaion that will by return from __getitem__
 
-template: shard_descriptor.LocalShardDescriptor
-params:
-    data_path: data
-    sample_shape: []
-    target_shape: []
\ No newline at end of file
+envoy:
+    cuda_devices: []
+
+shard_descriptor:
+    template: shard_descriptor.LocalShardDescriptor
+    params:
+        data_path: data
+        sample_shape: []
+        target_shape: []
\ No newline at end of file
diff --git a/openfl-workspace/default/shard_descriptor.py b/openfl-workspace/default/shard_descriptor.py
index 2f41db0f40..08db460737 100644
--- a/openfl-workspace/default/shard_descriptor.py
+++ b/openfl-workspace/default/shard_descriptor.py
@@ -16,6 +16,7 @@ def __init__(self, data_path: str, sample_shape: tuple, target_shape: tuple) ->
         """
         Initialize local Shard Descriptor.
 
-        Parameters are arbitrary, set up a shard_config.yaml as you need.
+        Parameters are arbitrary; set up the ShardDescriptor-related part
+        of the envoy_config.yaml as you need.
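+
+        For example, the shard_descriptor section of an envoy_config.yaml could
+        look like this (a minimal sketch; the shape values are illustrative and
+        depend on your dataset):
+
+            shard_descriptor:
+                template: shard_descriptor.LocalShardDescriptor
+                params:
+                    data_path: data
+                    sample_shape: ['300', '400', '3']
+                    target_shape: ['300', '400']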
""" super().__init__() diff --git a/openfl/interface/envoy.py b/openfl/interface/envoy.py index 2f95766d0f..1d85931cef 100644 --- a/openfl/interface/envoy.py +++ b/openfl/interface/envoy.py @@ -39,8 +39,8 @@ def envoy(context): help='The federation director port', type=click.IntRange(1, 65535)) @option('--tls/--disable-tls', default=True, is_flag=True, help='Use TLS or not (By default TLS is enabled)') -@option('-sc', '--shard-config-path', default='shard_config.yaml', - help='The shard config path', type=ClickPath(exists=True)) +@option('-sc', '--envoy-config-path', default='envoy_config.yaml', + help='The envoy config path', type=ClickPath(exists=True)) @option('-rc', '--root-cert-path', 'root_certificate', default=None, help='Path to a root CA cert', type=ClickPath(exists=True)) @option('-pk', '--private-key-path', 'private_key', default=None, @@ -54,7 +54,10 @@ def start_(shard_name, director_host, director_port, tls, shard_config_path, if is_directory_traversal(shard_config_path): click.echo('The shard config path is out of the openfl workspace scope.') sys.exit(1) + with open(shard_config_path) as stream: + envoy_config = safe_load(stream) + # pass envoy parameters shard_config_path = Path(shard_config_path).absolute() if root_certificate: root_certificate = Path(root_certificate).absolute() @@ -63,7 +66,7 @@ def start_(shard_name, director_host, director_port, tls, shard_config_path, if certificate: certificate = Path(certificate).absolute() - shard_descriptor = shard_descriptor_from_config(shard_config_path) + shard_descriptor = shard_descriptor_from_config(envoy_config.get('shard_descriptor', {})) envoy = Envoy( shard_name=shard_name, director_host=director_host, @@ -95,8 +98,8 @@ def create(envoy_path): (envoy_path / 'cert').mkdir(parents=True, exist_ok=True) (envoy_path / 'logs').mkdir(parents=True, exist_ok=True) (envoy_path / 'data').mkdir(parents=True, exist_ok=True) - shutil.copyfile(WORKSPACE / 'default/shard_config.yaml', - envoy_path / 'shard_config.yaml') + shutil.copyfile(WORKSPACE / 'default/envoy_config.yaml', + envoy_path / 'envoy_config.yaml') shutil.copyfile(WORKSPACE / 'default/shard_descriptor.py', envoy_path / 'shard_descriptor.py') shutil.copyfile(WORKSPACE / 'default/requirements.txt', @@ -105,8 +108,6 @@ def create(envoy_path): def shard_descriptor_from_config(shard_config_path: str): """Build a shard descriptor from config.""" - with open(shard_config_path) as stream: - shard_config = safe_load(stream) template = shard_config.get('template') if not template: raise Exception(f'You should define a shard ' diff --git a/openfl/protocols/director.proto b/openfl/protocols/director.proto index 1480b305b3..c1ca8fa620 100644 --- a/openfl/protocols/director.proto +++ b/openfl/protocols/director.proto @@ -7,13 +7,18 @@ import "google/protobuf/duration.proto"; import "federation.proto"; +message CudaDeviceInfo { + uint64 index = 1; + uint64 memory_total = 2; + uint64 memory_utilized = 3; +} + // Envoy Messages message NodeInfo { string name = 1; string adress = 2; - bool cuda_available = 3; - uint32 memory_size = 4; + repeated CudaDeviceInfo cuda_devices = 3; } message ShardInfo { diff --git a/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/envoy/start_envoy.sh b/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/envoy/start_envoy.sh index 222d3988e0..1dd6591439 100755 --- a/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/envoy/start_envoy.sh +++ 
b/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/envoy/start_envoy.sh
@@ -1,4 +1,4 @@
 #!/bin/bash
 set -e
 
-fx envoy start -n env_one --disable-tls --shard-config-path shard_config.yaml -dh localhost -dp 50051
\ No newline at end of file
+fx envoy start -n env_one --disable-tls --envoy-config-path envoy_config.yaml -dh localhost -dp 50051
\ No newline at end of file

From f64fc3911cef07ccb88e1b22f12635c1bb1d2f2f Mon Sep 17 00:00:00 2001
From: Igor Davidyuk
Date: Tue, 31 Aug 2021 12:17:12 +0300
Subject: [PATCH 02/31] added cuda version to message

---
 openfl/protocols/director.proto | 1 +
 1 file changed, 1 insertion(+)

diff --git a/openfl/protocols/director.proto b/openfl/protocols/director.proto
index c1ca8fa620..0e81f8b4a2 100644
--- a/openfl/protocols/director.proto
+++ b/openfl/protocols/director.proto
@@ -19,6 +19,7 @@ message NodeInfo {
   string name = 1;
   string adress = 2;
   repeated CudaDeviceInfo cuda_devices = 3;
+  string cuda_version = 4;
 }
 
 message ShardInfo {

From d3fe0557cd2d786d9d133197564cd1e36b641370 Mon Sep 17 00:00:00 2001
From: Igor Davidyuk
Date: Wed, 1 Sep 2021 18:22:29 +0300
Subject: [PATCH 03/31] introduced device_monitor plugin

---
 openfl-workspace/default/envoy_config.yaml    |  7 +++-
 openfl/component/envoy/envoy.py               | 10 ++++-
 openfl/interface/envoy.py                     | 18 +++++++-
 .../cuda_device_monitor.py                    | 29 +++++++++++++
 .../device_monitor.py                         | 19 +++++++++
 .../pynvml_monitor.py                         | 42 +++++++++++++++++++
 openfl/protocols/director.proto               | 20 ++++-----
 openfl/transport/grpc/director_client.py      |  8 ++--
 8 files changed, 135 insertions(+), 18 deletions(-)
 create mode 100644 openfl/plugins/processing_units_monitor/cuda_device_monitor.py
 create mode 100644 openfl/plugins/processing_units_monitor/device_monitor.py
 create mode 100644 openfl/plugins/processing_units_monitor/pynvml_monitor.py

diff --git a/openfl-workspace/default/envoy_config.yaml b/openfl-workspace/default/envoy_config.yaml
index c8b2560fb8..d468d1d5f9 100644
--- a/openfl-workspace/default/envoy_config.yaml
+++ b/openfl-workspace/default/envoy_config.yaml
@@ -16,8 +16,13 @@
 # 2. sample_shape - shape of sample's numpy representaion that will by return from __getitem__
 # 3. 
target_shape - shape of target's numpy representaion that will by return from __getitem__ -envoy: + +params: cuda_devices: [] + optional_plugin_components: + cuda_device_monitor: + template : openfl.plugins.processing_units_monitor.pynvml_monitor.PynvmlCUDADeviceMonitor + settings : [] shard_descriptor: template: shard_descriptor.LocalShardDescriptor diff --git a/openfl/component/envoy/envoy.py b/openfl/component/envoy/envoy.py index 26a4332478..a337b61baa 100644 --- a/openfl/component/envoy/envoy.py +++ b/openfl/component/envoy/envoy.py @@ -25,7 +25,7 @@ class Envoy: def __init__(self, *, shard_name, director_host, director_port, shard_descriptor, root_certificate: str = None, private_key: str = None, certificate: str = None, - tls: bool = True) -> None: + tls: bool = True, **envoy_params) -> None: """Initialize a envoy object.""" self.name = shard_name self.root_certificate = Path( @@ -41,7 +41,13 @@ def __init__(self, *, shard_name, director_host, director_port, shard_descriptor private_key=private_key, certificate=certificate ) + self.shard_descriptor = shard_descriptor + self.cuda_devices = envoy_params.get('cuda_devices', []) + + # Optional plugins + self.cuda_device_monitor = envoy_params.get('cuda_device_monitor', None) + self.executor = ThreadPoolExecutor() self.running_experiments = {} self.is_experiment_running = False @@ -87,7 +93,7 @@ def send_health_check(self): logger.info('The health check sender is started.') while True: timeout = self.director_client.send_health_check( - collaborator_name=self.name, + envoy_name=self.name, is_experiment_running=self.is_experiment_running ) time.sleep(timeout) diff --git a/openfl/interface/envoy.py b/openfl/interface/envoy.py index 1d85931cef..2435a69a19 100644 --- a/openfl/interface/envoy.py +++ b/openfl/interface/envoy.py @@ -54,6 +54,7 @@ def start_(shard_name, director_host, director_port, tls, shard_config_path, if is_directory_traversal(shard_config_path): click.echo('The shard config path is out of the openfl workspace scope.') sys.exit(1) + # Reed the Envoy config with open(shard_config_path) as stream: envoy_config = safe_load(stream) @@ -66,6 +67,20 @@ def start_(shard_name, director_host, director_port, tls, shard_config_path, if certificate: certificate = Path(certificate).absolute() + envoy_params = envoy_config.get('params', {}) + for plugin_name, plugin_settings in envoy_config.get('optional_plugin_components', {}).items(): + template = plugin_settings.get('template') + if not template: + raise Exception('You should put a template' + f'for plugin {plugin_name}') + class_name = template.split('.')[-1] + module_path = '.'.join(template.split('.')[:-1]) + plugin_params = plugin_settings.get('params', {}) + + module = import_module(module_path) + instance = getattr(module, class_name)(**plugin_params) + envoy_params[plugin_name] = instance + shard_descriptor = shard_descriptor_from_config(envoy_config.get('shard_descriptor', {})) envoy = Envoy( shard_name=shard_name, @@ -75,7 +90,8 @@ def start_(shard_name, director_host, director_port, tls, shard_config_path, tls=tls, root_certificate=root_certificate, private_key=private_key, - certificate=certificate + certificate=certificate, + **envoy_params ) envoy.start() diff --git a/openfl/plugins/processing_units_monitor/cuda_device_monitor.py b/openfl/plugins/processing_units_monitor/cuda_device_monitor.py new file mode 100644 index 0000000000..22195f3152 --- /dev/null +++ b/openfl/plugins/processing_units_monitor/cuda_device_monitor.py @@ -0,0 +1,29 @@ +# Copyright (C) 2020-2021 
Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +"""CUDA Device monitor plugin module.""" + +from .device_monitor import DeviceMonitor + + +class CUDADeviceMonitor(DeviceMonitor): + """CUDA Device monitor plugin.""" + + def get_driver_version(self) -> str: + """Get CUDA driver version.""" + raise NotImplementedError + + def get_device_memory_total(self, index: int) -> int: + """Get total memory available on the device.""" + raise NotImplementedError + + def get_device_memory_utilized(self, index: int) -> int: + """Get utilized memory on the device.""" + raise NotImplementedError + + def get_device_utilization(self, index: int) -> str: + """ + Get device utilization method. + + It is just a general method that returns a string that may be shown to the frontend user. + """ + raise NotImplementedError diff --git a/openfl/plugins/processing_units_monitor/device_monitor.py b/openfl/plugins/processing_units_monitor/device_monitor.py new file mode 100644 index 0000000000..de1d8f6dd8 --- /dev/null +++ b/openfl/plugins/processing_units_monitor/device_monitor.py @@ -0,0 +1,19 @@ +# Copyright (C) 2020-2021 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +"""Device monitor plugin module.""" + + +class DeviceMonitor: + """Device monitor plugin interface.""" + + def get_driver_version(self) -> str: + """Get device's driver version.""" + raise NotImplementedError + + def get_device_utilization(self, index: int) -> str: + """ + Get device utilization method. + + It is just a general method that returns a string that may be shown to the frontend user. + """ + raise NotImplementedError diff --git a/openfl/plugins/processing_units_monitor/pynvml_monitor.py b/openfl/plugins/processing_units_monitor/pynvml_monitor.py new file mode 100644 index 0000000000..7ed79fb87d --- /dev/null +++ b/openfl/plugins/processing_units_monitor/pynvml_monitor.py @@ -0,0 +1,42 @@ +# Copyright (C) 2020-2021 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +"""pynvml CUDA Device monitor plugin module.""" + +import pynvml + +from .cuda_device_monitor import CUDADeviceMonitor + + +class PynvmlCUDADeviceMonitor(CUDADeviceMonitor): + """CUDA Device monitor plugin using pynvml lib.""" + # required package: nvidia-ml-py3 + + def __init__(self) -> None: + super().__init__() + pynvml.nvmlInit() + + def get_driver_version(self) -> str: + """Get CUDA driver version.""" + return pynvml.nvmlSystemGetDriverVersion().decode("utf-8") + + def get_device_memory_total(self, index: int) -> int: + """Get total memory available on the device.""" + handle = pynvml.nvmlDeviceGetHandleByIndex(index) + info = pynvml.nvmlDeviceGetMemoryInfo(handle) + return info.total + + def get_device_memory_utilized(self, index: int) -> int: + """Get utilized memory on the device.""" + handle = pynvml.nvmlDeviceGetHandleByIndex(index) + info = pynvml.nvmlDeviceGetMemoryInfo(handle) + return info.used + + def get_device_utilization(self, index: int) -> str: + """ + Get device utilization method. + + It is just a general method that returns a string that may be shown to the frontend user. 
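+
+        For this pynvml-based implementation the string is the GPU utilization
+        percentage reported by nvmlDeviceGetUtilizationRates, e.g. '42%'
+        (the exact figure is illustrative and depends on the device load).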
+ """ + handle = pynvml.nvmlDeviceGetHandleByIndex(index) + info_utilization = pynvml.nvmlDeviceGetUtilizationRates(handle) + return str(info_utilization.gpu) + '%' diff --git a/openfl/protocols/director.proto b/openfl/protocols/director.proto index 0e81f8b4a2..b71ef96c9a 100644 --- a/openfl/protocols/director.proto +++ b/openfl/protocols/director.proto @@ -53,6 +53,15 @@ message ExperimentData { bytes npbytes = 2; //actual data } +message EnvoyStatus { + string name = 1; + bool is_experiment_running = 2; +} + +message EnvoyHealthCheckResponse { + google.protobuf.Duration health_check_period = 1; +} + // API Messages message ExperimentInfo { @@ -102,15 +111,6 @@ message RemoveExperimentResponse { bool acknowledgement = 1; } -message CollaboratorStatus { - string name = 1; - bool is_experiment_running = 2; -} - -message CollaboratorHealthCheckResponse { - google.protobuf.Duration health_check_period = 1; -} - message EnvoyInfo { ShardInfo shard_info = 1; bool is_online = 2; @@ -138,6 +138,6 @@ service FederationDirector { rpc GetTrainedModel (GetTrainedModelRequest) returns (TrainedModelResponse) {} rpc StreamMetrics (StreamMetricsRequest) returns (stream StreamMetricsResponse) {} rpc RemoveExperimentData (RemoveExperimentRequest) returns (RemoveExperimentResponse) {} - rpc CollaboratorHealthCheck (CollaboratorStatus) returns (CollaboratorHealthCheckResponse) {} + rpc EnvoyHealthCheck (EnvoyStatus) returns (EnvoyHealthCheckResponse) {} rpc GetEnvoys (GetEnvoysRequest) returns (GetEnvoysResponse) {} } diff --git a/openfl/transport/grpc/director_client.py b/openfl/transport/grpc/director_client.py index 260e4c9153..5d28b125f3 100644 --- a/openfl/transport/grpc/director_client.py +++ b/openfl/transport/grpc/director_client.py @@ -98,15 +98,15 @@ def _get_node_info(self): """Generate a node info message.""" return director_pb2.NodeInfo(name=self.shard_name) - def send_health_check(self, *, collaborator_name: str, is_experiment_running: bool) -> int: + def send_health_check(self, *, envoy_name: str, is_experiment_running: bool) -> int: """Send envoy health check.""" - status = director_pb2.CollaboratorStatus( - name=collaborator_name, + status = director_pb2.EnvoyStatus( + name=envoy_name, is_experiment_running=is_experiment_running, ) logger.debug(f'Sending health check status: {status}') - response = self.stub.CollaboratorHealthCheck(status) + response = self.stub.EnvoyHealthCheck(status) health_check_period = response.health_check_period.seconds return health_check_period From cc128e359f16b5e9e77938b6ab6939506c36755e Mon Sep 17 00:00:00 2001 From: Igor Davidyuk Date: Mon, 6 Sep 2021 17:19:59 +0300 Subject: [PATCH 04/31] cuda devices included in shard_info --- openfl/component/envoy/envoy.py | 5 +- openfl/protocols/director.proto | 5 +- openfl/protocols/director_pb2.py | 1810 ++++++++++++---------- openfl/protocols/director_pb2_grpc.py | 26 +- openfl/transport/grpc/director_client.py | 9 +- 5 files changed, 1030 insertions(+), 825 deletions(-) diff --git a/openfl/component/envoy/envoy.py b/openfl/component/envoy/envoy.py index a337b61baa..fe3b900542 100644 --- a/openfl/component/envoy/envoy.py +++ b/openfl/component/envoy/envoy.py @@ -74,7 +74,6 @@ def run(self): except Exception as exc: logger.exception(f'Collaborator failed with error: {exc}:') finally: - # Workspace cleaning should not be done by gRPC client! 
self.is_experiment_running = False @staticmethod @@ -113,7 +112,9 @@ def _run_collaborator(self, plan='plan/plan.yaml'): def start(self): """Start the envoy.""" try: - is_accepted = self.director_client.report_shard_info(self.shard_descriptor) + is_accepted = self.director_client.report_shard_info( + shard_descriptor=self.shard_descriptor, + cuda_devices=self.cuda_devices) except Exception as exc: logger.exception(f'Failed to report shard info: {exc}') else: diff --git a/openfl/protocols/director.proto b/openfl/protocols/director.proto index b71ef96c9a..4130590ea3 100644 --- a/openfl/protocols/director.proto +++ b/openfl/protocols/director.proto @@ -17,9 +17,7 @@ message CudaDeviceInfo { message NodeInfo { string name = 1; - string adress = 2; - repeated CudaDeviceInfo cuda_devices = 3; - string cuda_version = 4; + repeated uint64 cuda_devices = 2; } message ShardInfo { @@ -73,7 +71,6 @@ message ExperimentInfo { message SetNewExperimentResponse{ bool accepted = 1; - string tensorboard_address = 2; } message GetTrainedModelRequest { diff --git a/openfl/protocols/director_pb2.py b/openfl/protocols/director_pb2.py index 08abef2443..5617543436 100644 --- a/openfl/protocols/director_pb2.py +++ b/openfl/protocols/director_pb2.py @@ -16,858 +16,955 @@ import openfl.protocols.federation_pb2 as federation__pb2 DESCRIPTOR = _descriptor.FileDescriptor( - name='director.proto', - package='', - syntax='proto3', - serialized_options=None, - create_key=_descriptor._internal_create_key, - serialized_pb=b'\n\x0e\x64irector.proto\x1a\x1fgoogle/protobuf/timestamp.proto\x1a\x1egoogle/protobuf/duration.proto\x1a\x10\x66\x65\x64\x65ration.proto\"U\n\x08NodeInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0e\n\x06\x61\x64ress\x18\x02 \x01(\t\x12\x16\n\x0e\x63uda_available\x18\x03 \x01(\x08\x12\x13\n\x0bmemory_size\x18\x04 \x01(\r\"\x83\x01\n\tShardInfo\x12\x1c\n\tnode_info\x18\x01 \x01(\x0b\x32\t.NodeInfo\x12\x19\n\x11shard_description\x18\x02 \x01(\t\x12\x11\n\tn_samples\x18\x03 \x01(\x04\x12\x14\n\x0csample_shape\x18\x04 \x03(\t\x12\x14\n\x0ctarget_shape\x18\x05 \x03(\t\"(\n\x14ShardAcknowledgement\x12\x10\n\x08\x61\x63\x63\x65pted\x18\x01 \x01(\x08\"2\n\x15WaitExperimentRequest\x12\x19\n\x11\x63ollaborator_name\x18\x01 \x01(\t\"1\n\x16WaitExperimentResponse\x12\x17\n\x0f\x65xperiment_name\x18\x01 \x01(\t\"N\n\x18GetExperimentDataRequest\x12\x17\n\x0f\x65xperiment_name\x18\x01 \x01(\t\x12\x19\n\x11\x63ollaborator_name\x18\x02 \x01(\t\"/\n\x0e\x45xperimentData\x12\x0c\n\x04size\x18\x01 \x01(\r\x12\x0f\n\x07npbytes\x18\x02 \x01(\x0c\"\x86\x01\n\x0e\x45xperimentInfo\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x1a\n\x12\x63ollaborator_names\x18\x03 \x03(\t\x12(\n\x0f\x65xperiment_data\x18\x04 \x01(\x0b\x32\x0f.ExperimentData\x12 \n\x0bmodel_proto\x18\x05 \x01(\x0b\x32\x0b.ModelProto\"I\n\x18SetNewExperimentResponse\x12\x10\n\x08\x61\x63\x63\x65pted\x18\x01 \x01(\x08\x12\x1b\n\x13tensorboard_address\x18\x02 \x01(\t\"\x95\x01\n\x16GetTrainedModelRequest\x12\x17\n\x0f\x65xperiment_name\x18\x02 \x01(\t\x12\x35\n\nmodel_type\x18\x03 \x01(\x0e\x32!.GetTrainedModelRequest.ModelType\"+\n\tModelType\x12\x0e\n\nBEST_MODEL\x10\x00\x12\x0e\n\nLAST_MODEL\x10\x01\"8\n\x14TrainedModelResponse\x12 \n\x0bmodel_proto\x18\x01 \x01(\x0b\x32\x0b.ModelProto\"\x17\n\x15GetDatasetInfoRequest\"/\n\x14StreamMetricsRequest\x12\x17\n\x0f\x65xperiment_name\x18\x02 \x01(\t\"{\n\x15StreamMetricsResponse\x12\x15\n\rmetric_origin\x18\x01 \x01(\t\x12\x11\n\ttask_name\x18\x02 \x01(\t\x12\x13\n\x0bmetric_name\x18\x03 \x01(\t\x12\x14\n\x0cmetric_value\x18\x04 
\x01(\x02\x12\r\n\x05round\x18\x05 \x01(\r\"2\n\x17RemoveExperimentRequest\x12\x17\n\x0f\x65xperiment_name\x18\x02 \x01(\t\"3\n\x18RemoveExperimentResponse\x12\x17\n\x0f\x61\x63knowledgement\x18\x01 \x01(\x08\"A\n\x12\x43ollaboratorStatus\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1d\n\x15is_experiment_running\x18\x02 \x01(\x08\"Y\n\x1f\x43ollaboratorHealthCheckResponse\x12\x36\n\x13health_check_period\x18\x01 \x01(\x0b\x32\x19.google.protobuf.Duration\"\xc2\x01\n\tEnvoyInfo\x12\x1e\n\nshard_info\x18\x01 \x01(\x0b\x32\n.ShardInfo\x12\x11\n\tis_online\x18\x02 \x01(\x08\x12\x1d\n\x15is_experiment_running\x18\x03 \x01(\x08\x12\x30\n\x0clast_updated\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\x31\n\x0evalid_duration\x18\x05 \x01(\x0b\x32\x19.google.protobuf.Duration\"\x12\n\x10GetEnvoysRequest\"4\n\x11GetEnvoysResponse\x12\x1f\n\x0b\x65nvoy_infos\x18\x01 \x03(\x0b\x32\n.EnvoyInfo2\xb9\x05\n\x12\x46\x65\x64\x65rationDirector\x12\x37\n\x10\x41\x63knowledgeShard\x12\n.ShardInfo\x1a\x15.ShardAcknowledgement\"\x00\x12G\n\x0eWaitExperiment\x12\x16.WaitExperimentRequest\x1a\x17.WaitExperimentResponse\"\x00(\x01\x30\x01\x12\x43\n\x11GetExperimentData\x12\x19.GetExperimentDataRequest\x1a\x0f.ExperimentData\"\x00\x30\x01\x12\x42\n\x10SetNewExperiment\x12\x0f.ExperimentInfo\x1a\x19.SetNewExperimentResponse\"\x00(\x01\x12\x36\n\x0eGetDatasetInfo\x12\x16.GetDatasetInfoRequest\x1a\n.ShardInfo\"\x00\x12\x43\n\x0fGetTrainedModel\x12\x17.GetTrainedModelRequest\x1a\x15.TrainedModelResponse\"\x00\x12\x42\n\rStreamMetrics\x12\x15.StreamMetricsRequest\x1a\x16.StreamMetricsResponse\"\x00\x30\x01\x12M\n\x14RemoveExperimentData\x12\x18.RemoveExperimentRequest\x1a\x19.RemoveExperimentResponse\"\x00\x12R\n\x17\x43ollaboratorHealthCheck\x12\x13.CollaboratorStatus\x1a .CollaboratorHealthCheckResponse\"\x00\x12\x34\n\tGetEnvoys\x12\x11.GetEnvoysRequest\x1a\x12.GetEnvoysResponse\"\x00\x62\x06proto3' - , - dependencies=[google_dot_protobuf_dot_timestamp__pb2.DESCRIPTOR, - google_dot_protobuf_dot_duration__pb2.DESCRIPTOR, federation__pb2.DESCRIPTOR, ]) + name='director.proto', + package='', + syntax='proto3', + serialized_options=None, + create_key=_descriptor._internal_create_key, + serialized_pb=b'\n\x0e\x64irector.proto\x1a\x1fgoogle/protobuf/timestamp.proto\x1a\x1egoogle/protobuf/duration.proto\x1a\x10\x66\x65\x64\x65ration.proto\"\x1f\n\rRequestHeader\x12\x0e\n\x06sender\x18\x01 \x01(\t\"N\n\x0e\x43udaDeviceInfo\x12\r\n\x05index\x18\x01 \x01(\x04\x12\x14\n\x0cmemory_total\x18\x02 \x01(\x04\x12\x17\n\x0fmemory_utilized\x18\x03 \x01(\x04\".\n\x08NodeInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x14\n\x0c\x63uda_devices\x18\x02 \x03(\x04\"\x83\x01\n\tShardInfo\x12\x1c\n\tnode_info\x18\x01 \x01(\x0b\x32\t.NodeInfo\x12\x19\n\x11shard_description\x18\x02 \x01(\t\x12\x11\n\tn_samples\x18\x03 \x01(\x04\x12\x14\n\x0csample_shape\x18\x04 \x03(\t\x12\x14\n\x0ctarget_shape\x18\x05 \x03(\t\"(\n\x14ShardAcknowledgement\x12\x10\n\x08\x61\x63\x63\x65pted\x18\x01 \x01(\x08\"2\n\x15WaitExperimentRequest\x12\x19\n\x11\x63ollaborator_name\x18\x01 \x01(\t\"1\n\x16WaitExperimentResponse\x12\x17\n\x0f\x65xperiment_name\x18\x01 \x01(\t\"N\n\x18GetExperimentDataRequest\x12\x17\n\x0f\x65xperiment_name\x18\x01 \x01(\t\x12\x19\n\x11\x63ollaborator_name\x18\x02 \x01(\t\"/\n\x0e\x45xperimentData\x12\x0c\n\x04size\x18\x01 \x01(\r\x12\x0f\n\x07npbytes\x18\x02 \x01(\x0c\":\n\x0b\x45nvoyStatus\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1d\n\x15is_experiment_running\x18\x02 
\x01(\x08\"R\n\x18\x45nvoyHealthCheckResponse\x12\x36\n\x13health_check_period\x18\x01 \x01(\x0b\x32\x19.google.protobuf.Duration\"\xa6\x01\n\x0e\x45xperimentInfo\x12\x1e\n\x06header\x18\x01 \x01(\x0b\x32\x0e.RequestHeader\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x1a\n\x12\x63ollaborator_names\x18\x03 \x03(\t\x12(\n\x0f\x65xperiment_data\x18\x04 \x01(\x0b\x32\x0f.ExperimentData\x12 \n\x0bmodel_proto\x18\x05 \x01(\x0b\x32\x0b.ModelProto\",\n\x18SetNewExperimentResponse\x12\x10\n\x08\x61\x63\x63\x65pted\x18\x01 \x01(\x08\"\xb5\x01\n\x16GetTrainedModelRequest\x12\x1e\n\x06header\x18\x01 \x01(\x0b\x32\x0e.RequestHeader\x12\x17\n\x0f\x65xperiment_name\x18\x02 \x01(\t\x12\x35\n\nmodel_type\x18\x03 \x01(\x0e\x32!.GetTrainedModelRequest.ModelType\"+\n\tModelType\x12\x0e\n\nBEST_MODEL\x10\x00\x12\x0e\n\nLAST_MODEL\x10\x01\"8\n\x14TrainedModelResponse\x12 \n\x0bmodel_proto\x18\x01 \x01(\x0b\x32\x0b.ModelProto\"7\n\x15GetDatasetInfoRequest\x12\x1e\n\x06header\x18\x01 \x01(\x0b\x32\x0e.RequestHeader\"O\n\x14StreamMetricsRequest\x12\x1e\n\x06header\x18\x01 \x01(\x0b\x32\x0e.RequestHeader\x12\x17\n\x0f\x65xperiment_name\x18\x02 \x01(\t\"{\n\x15StreamMetricsResponse\x12\x15\n\rmetric_origin\x18\x01 \x01(\t\x12\x11\n\ttask_name\x18\x02 \x01(\t\x12\x13\n\x0bmetric_name\x18\x03 \x01(\t\x12\x14\n\x0cmetric_value\x18\x04 \x01(\x02\x12\r\n\x05round\x18\x05 \x01(\r\"R\n\x17RemoveExperimentRequest\x12\x1e\n\x06header\x18\x01 \x01(\x0b\x32\x0e.RequestHeader\x12\x17\n\x0f\x65xperiment_name\x18\x02 \x01(\t\"3\n\x18RemoveExperimentResponse\x12\x17\n\x0f\x61\x63knowledgement\x18\x01 \x01(\x08\"\xc2\x01\n\tEnvoyInfo\x12\x1e\n\nshard_info\x18\x01 \x01(\x0b\x32\n.ShardInfo\x12\x11\n\tis_online\x18\x02 \x01(\x08\x12\x1d\n\x15is_experiment_running\x18\x03 \x01(\x08\x12\x30\n\x0clast_updated\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\x31\n\x0evalid_duration\x18\x05 \x01(\x0b\x32\x19.google.protobuf.Duration\"\x12\n\x10GetEnvoysRequest\"4\n\x11GetEnvoysResponse\x12\x1f\n\x0b\x65nvoy_infos\x18\x01 \x03(\x0b\x32\n.EnvoyInfo2\xa4\x05\n\x12\x46\x65\x64\x65rationDirector\x12\x37\n\x10\x41\x63knowledgeShard\x12\n.ShardInfo\x1a\x15.ShardAcknowledgement\"\x00\x12G\n\x0eWaitExperiment\x12\x16.WaitExperimentRequest\x1a\x17.WaitExperimentResponse\"\x00(\x01\x30\x01\x12\x43\n\x11GetExperimentData\x12\x19.GetExperimentDataRequest\x1a\x0f.ExperimentData\"\x00\x30\x01\x12\x42\n\x10SetNewExperiment\x12\x0f.ExperimentInfo\x1a\x19.SetNewExperimentResponse\"\x00(\x01\x12\x36\n\x0eGetDatasetInfo\x12\x16.GetDatasetInfoRequest\x1a\n.ShardInfo\"\x00\x12\x43\n\x0fGetTrainedModel\x12\x17.GetTrainedModelRequest\x1a\x15.TrainedModelResponse\"\x00\x12\x42\n\rStreamMetrics\x12\x15.StreamMetricsRequest\x1a\x16.StreamMetricsResponse\"\x00\x30\x01\x12M\n\x14RemoveExperimentData\x12\x18.RemoveExperimentRequest\x1a\x19.RemoveExperimentResponse\"\x00\x12=\n\x10\x45nvoyHealthCheck\x12\x0c.EnvoyStatus\x1a\x19.EnvoyHealthCheckResponse\"\x00\x12\x34\n\tGetEnvoys\x12\x11.GetEnvoysRequest\x1a\x12.GetEnvoysResponse\"\x00\x62\x06proto3' + , + dependencies=[google_dot_protobuf_dot_timestamp__pb2.DESCRIPTOR,google_dot_protobuf_dot_duration__pb2.DESCRIPTOR,federation__pb2.DESCRIPTOR,]) + + _GETTRAINEDMODELREQUEST_MODELTYPE = _descriptor.EnumDescriptor( - name='ModelType', - full_name='GetTrainedModelRequest.ModelType', - filename=None, - file=DESCRIPTOR, - create_key=_descriptor._internal_create_key, - values=[ - _descriptor.EnumValueDescriptor( - name='BEST_MODEL', index=0, number=0, - serialized_options=None, - type=None, - 
create_key=_descriptor._internal_create_key), - _descriptor.EnumValueDescriptor( - name='LAST_MODEL', index=1, number=1, - serialized_options=None, - type=None, - create_key=_descriptor._internal_create_key), - ], - containing_type=None, - serialized_options=None, - serialized_start=915, - serialized_end=958, + name='ModelType', + full_name='GetTrainedModelRequest.ModelType', + filename=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + values=[ + _descriptor.EnumValueDescriptor( + name='BEST_MODEL', index=0, number=0, + serialized_options=None, + type=None, + create_key=_descriptor._internal_create_key), + _descriptor.EnumValueDescriptor( + name='LAST_MODEL', index=1, number=1, + serialized_options=None, + type=None, + create_key=_descriptor._internal_create_key), + ], + containing_type=None, + serialized_options=None, + serialized_start=1168, + serialized_end=1211, ) _sym_db.RegisterEnumDescriptor(_GETTRAINEDMODELREQUEST_MODELTYPE) + +_REQUESTHEADER = _descriptor.Descriptor( + name='RequestHeader', + full_name='RequestHeader', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='sender', full_name='RequestHeader.sender', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=101, + serialized_end=132, +) + + +_CUDADEVICEINFO = _descriptor.Descriptor( + name='CudaDeviceInfo', + full_name='CudaDeviceInfo', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='index', full_name='CudaDeviceInfo.index', index=0, + number=1, type=4, cpp_type=4, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='memory_total', full_name='CudaDeviceInfo.memory_total', index=1, + number=2, type=4, cpp_type=4, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='memory_utilized', full_name='CudaDeviceInfo.memory_utilized', index=2, + number=3, type=4, cpp_type=4, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=134, + serialized_end=212, +) + + _NODEINFO = _descriptor.Descriptor( - name='NodeInfo', - full_name='NodeInfo', - filename=None, - file=DESCRIPTOR, - containing_type=None, - 
create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='name', full_name='NodeInfo.name', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='adress', full_name='NodeInfo.adress', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='cuda_available', full_name='NodeInfo.cuda_available', index=2, - number=3, type=8, cpp_type=7, label=1, - has_default_value=False, default_value=False, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='memory_size', full_name='NodeInfo.memory_size', index=3, - number=4, type=13, cpp_type=3, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=101, - serialized_end=186, + name='NodeInfo', + full_name='NodeInfo', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='name', full_name='NodeInfo.name', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='cuda_devices', full_name='NodeInfo.cuda_devices', index=1, + number=2, type=4, cpp_type=4, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=214, + serialized_end=260, ) _SHARDINFO = _descriptor.Descriptor( - name='ShardInfo', - full_name='ShardInfo', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='node_info', full_name='ShardInfo.node_info', index=0, - number=1, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - 
name='shard_description', full_name='ShardInfo.shard_description', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='n_samples', full_name='ShardInfo.n_samples', index=2, - number=3, type=4, cpp_type=4, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='sample_shape', full_name='ShardInfo.sample_shape', index=3, - number=4, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='target_shape', full_name='ShardInfo.target_shape', index=4, - number=5, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=189, - serialized_end=320, + name='ShardInfo', + full_name='ShardInfo', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='node_info', full_name='ShardInfo.node_info', index=0, + number=1, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='shard_description', full_name='ShardInfo.shard_description', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='n_samples', full_name='ShardInfo.n_samples', index=2, + number=3, type=4, cpp_type=4, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='sample_shape', full_name='ShardInfo.sample_shape', index=3, + number=4, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='target_shape', full_name='ShardInfo.target_shape', index=4, + number=5, type=9, cpp_type=9, label=3, + 
has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=263, + serialized_end=394, ) _SHARDACKNOWLEDGEMENT = _descriptor.Descriptor( - name='ShardAcknowledgement', - full_name='ShardAcknowledgement', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='accepted', full_name='ShardAcknowledgement.accepted', index=0, - number=1, type=8, cpp_type=7, label=1, - has_default_value=False, default_value=False, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=322, - serialized_end=362, + name='ShardAcknowledgement', + full_name='ShardAcknowledgement', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='accepted', full_name='ShardAcknowledgement.accepted', index=0, + number=1, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=396, + serialized_end=436, ) _WAITEXPERIMENTREQUEST = _descriptor.Descriptor( - name='WaitExperimentRequest', - full_name='WaitExperimentRequest', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='collaborator_name', full_name='WaitExperimentRequest.collaborator_name', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=364, - serialized_end=414, + name='WaitExperimentRequest', + full_name='WaitExperimentRequest', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='collaborator_name', full_name='WaitExperimentRequest.collaborator_name', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), 
+ ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=438, + serialized_end=488, ) _WAITEXPERIMENTRESPONSE = _descriptor.Descriptor( - name='WaitExperimentResponse', - full_name='WaitExperimentResponse', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='experiment_name', full_name='WaitExperimentResponse.experiment_name', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=416, - serialized_end=465, + name='WaitExperimentResponse', + full_name='WaitExperimentResponse', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='experiment_name', full_name='WaitExperimentResponse.experiment_name', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=490, + serialized_end=539, ) _GETEXPERIMENTDATAREQUEST = _descriptor.Descriptor( - name='GetExperimentDataRequest', - full_name='GetExperimentDataRequest', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='experiment_name', full_name='GetExperimentDataRequest.experiment_name', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='collaborator_name', full_name='GetExperimentDataRequest.collaborator_name', - index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=467, - serialized_end=545, + name='GetExperimentDataRequest', + full_name='GetExperimentDataRequest', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='experiment_name', full_name='GetExperimentDataRequest.experiment_name', index=0, + number=1, type=9, 
cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='collaborator_name', full_name='GetExperimentDataRequest.collaborator_name', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=541, + serialized_end=619, ) _EXPERIMENTDATA = _descriptor.Descriptor( - name='ExperimentData', - full_name='ExperimentData', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='size', full_name='ExperimentData.size', index=0, - number=1, type=13, cpp_type=3, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='npbytes', full_name='ExperimentData.npbytes', index=1, - number=2, type=12, cpp_type=9, label=1, - has_default_value=False, default_value=b"", - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=547, - serialized_end=594, + name='ExperimentData', + full_name='ExperimentData', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='size', full_name='ExperimentData.size', index=0, + number=1, type=13, cpp_type=3, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='npbytes', full_name='ExperimentData.npbytes', index=1, + number=2, type=12, cpp_type=9, label=1, + has_default_value=False, default_value=b"", + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=621, + serialized_end=668, +) + + +_ENVOYSTATUS = _descriptor.Descriptor( + name='EnvoyStatus', + full_name='EnvoyStatus', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='name', full_name='EnvoyStatus.name', index=0, + number=1, type=9, cpp_type=9, 
label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='is_experiment_running', full_name='EnvoyStatus.is_experiment_running', index=1, + number=2, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=670, + serialized_end=728, +) + + +_ENVOYHEALTHCHECKRESPONSE = _descriptor.Descriptor( + name='EnvoyHealthCheckResponse', + full_name='EnvoyHealthCheckResponse', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='health_check_period', full_name='EnvoyHealthCheckResponse.health_check_period', index=0, + number=1, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=730, + serialized_end=812, ) _EXPERIMENTINFO = _descriptor.Descriptor( - name='ExperimentInfo', - full_name='ExperimentInfo', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='name', full_name='ExperimentInfo.name', index=0, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='collaborator_names', full_name='ExperimentInfo.collaborator_names', index=1, - number=3, type=9, cpp_type=9, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='experiment_data', full_name='ExperimentInfo.experiment_data', index=2, - number=4, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='model_proto', full_name='ExperimentInfo.model_proto', index=3, - number=5, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - 
extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=597, - serialized_end=731, + name='ExperimentInfo', + full_name='ExperimentInfo', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='header', full_name='ExperimentInfo.header', index=0, + number=1, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='name', full_name='ExperimentInfo.name', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='collaborator_names', full_name='ExperimentInfo.collaborator_names', index=2, + number=3, type=9, cpp_type=9, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='experiment_data', full_name='ExperimentInfo.experiment_data', index=3, + number=4, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='model_proto', full_name='ExperimentInfo.model_proto', index=4, + number=5, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=815, + serialized_end=981, ) _SETNEWEXPERIMENTRESPONSE = _descriptor.Descriptor( - name='SetNewExperimentResponse', - full_name='SetNewExperimentResponse', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='accepted', full_name='SetNewExperimentResponse.accepted', index=0, - number=1, type=8, cpp_type=7, label=1, - has_default_value=False, default_value=False, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='tensorboard_address', full_name='SetNewExperimentResponse.tensorboard_address', - index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, 
create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=733, - serialized_end=806, + name='SetNewExperimentResponse', + full_name='SetNewExperimentResponse', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='accepted', full_name='SetNewExperimentResponse.accepted', index=0, + number=1, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=983, + serialized_end=1027, ) _GETTRAINEDMODELREQUEST = _descriptor.Descriptor( - name='GetTrainedModelRequest', - full_name='GetTrainedModelRequest', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='experiment_name', full_name='GetTrainedModelRequest.experiment_name', index=0, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='model_type', full_name='GetTrainedModelRequest.model_type', index=1, - number=3, type=14, cpp_type=8, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - _GETTRAINEDMODELREQUEST_MODELTYPE, - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=809, - serialized_end=958, + name='GetTrainedModelRequest', + full_name='GetTrainedModelRequest', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='header', full_name='GetTrainedModelRequest.header', index=0, + number=1, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='experiment_name', full_name='GetTrainedModelRequest.experiment_name', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='model_type', full_name='GetTrainedModelRequest.model_type', index=2, + number=3, type=14, cpp_type=8, label=1, + has_default_value=False, default_value=0, + 
message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + _GETTRAINEDMODELREQUEST_MODELTYPE, + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1030, + serialized_end=1211, ) _TRAINEDMODELRESPONSE = _descriptor.Descriptor( - name='TrainedModelResponse', - full_name='TrainedModelResponse', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='model_proto', full_name='TrainedModelResponse.model_proto', index=0, - number=1, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=960, - serialized_end=1016, + name='TrainedModelResponse', + full_name='TrainedModelResponse', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='model_proto', full_name='TrainedModelResponse.model_proto', index=0, + number=1, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1213, + serialized_end=1269, ) _GETDATASETINFOREQUEST = _descriptor.Descriptor( - name='GetDatasetInfoRequest', - full_name='GetDatasetInfoRequest', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1018, - serialized_end=1041, + name='GetDatasetInfoRequest', + full_name='GetDatasetInfoRequest', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='header', full_name='GetDatasetInfoRequest.header', index=0, + number=1, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1271, + serialized_end=1326, ) _STREAMMETRICSREQUEST = _descriptor.Descriptor( - name='StreamMetricsRequest', - full_name='StreamMetricsRequest', - filename=None, - file=DESCRIPTOR, - containing_type=None, - 
create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='experiment_name', full_name='StreamMetricsRequest.experiment_name', index=0, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1043, - serialized_end=1090, + name='StreamMetricsRequest', + full_name='StreamMetricsRequest', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='header', full_name='StreamMetricsRequest.header', index=0, + number=1, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='experiment_name', full_name='StreamMetricsRequest.experiment_name', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1328, + serialized_end=1407, ) _STREAMMETRICSRESPONSE = _descriptor.Descriptor( - name='StreamMetricsResponse', - full_name='StreamMetricsResponse', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='metric_origin', full_name='StreamMetricsResponse.metric_origin', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='task_name', full_name='StreamMetricsResponse.task_name', index=1, - number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='metric_name', full_name='StreamMetricsResponse.metric_name', index=2, - number=3, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='metric_value', full_name='StreamMetricsResponse.metric_value', index=3, - number=4, type=2, cpp_type=6, label=1, - has_default_value=False, default_value=float(0), - 
message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='round', full_name='StreamMetricsResponse.round', index=4, - number=5, type=13, cpp_type=3, label=1, - has_default_value=False, default_value=0, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1092, - serialized_end=1215, + name='StreamMetricsResponse', + full_name='StreamMetricsResponse', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='metric_origin', full_name='StreamMetricsResponse.metric_origin', index=0, + number=1, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='task_name', full_name='StreamMetricsResponse.task_name', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='metric_name', full_name='StreamMetricsResponse.metric_name', index=2, + number=3, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='metric_value', full_name='StreamMetricsResponse.metric_value', index=3, + number=4, type=2, cpp_type=6, label=1, + has_default_value=False, default_value=float(0), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='round', full_name='StreamMetricsResponse.round', index=4, + number=5, type=13, cpp_type=3, label=1, + has_default_value=False, default_value=0, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1409, + serialized_end=1532, ) _REMOVEEXPERIMENTREQUEST = _descriptor.Descriptor( - name='RemoveExperimentRequest', - full_name='RemoveExperimentRequest', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='experiment_name', full_name='RemoveExperimentRequest.experiment_name', index=0, - 
number=2, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1217, - serialized_end=1267, + name='RemoveExperimentRequest', + full_name='RemoveExperimentRequest', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='header', full_name='RemoveExperimentRequest.header', index=0, + number=1, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='experiment_name', full_name='RemoveExperimentRequest.experiment_name', index=1, + number=2, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1534, + serialized_end=1616, ) _REMOVEEXPERIMENTRESPONSE = _descriptor.Descriptor( - name='RemoveExperimentResponse', - full_name='RemoveExperimentResponse', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='acknowledgement', full_name='RemoveExperimentResponse.acknowledgement', index=0, - number=1, type=8, cpp_type=7, label=1, - has_default_value=False, default_value=False, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1269, - serialized_end=1320, -) - -_COLLABORATORSTATUS = _descriptor.Descriptor( - name='CollaboratorStatus', - full_name='CollaboratorStatus', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='name', full_name='CollaboratorStatus.name', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='is_experiment_running', full_name='CollaboratorStatus.is_experiment_running', - index=1, - number=2, type=8, cpp_type=7, label=1, - has_default_value=False, default_value=False, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - 
serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1322, - serialized_end=1387, -) - -_COLLABORATORHEALTHCHECKRESPONSE = _descriptor.Descriptor( - name='CollaboratorHealthCheckResponse', - full_name='CollaboratorHealthCheckResponse', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='health_check_period', - full_name='CollaboratorHealthCheckResponse.health_check_period', index=0, - number=1, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1389, - serialized_end=1478, + name='RemoveExperimentResponse', + full_name='RemoveExperimentResponse', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='acknowledgement', full_name='RemoveExperimentResponse.acknowledgement', index=0, + number=1, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1618, + serialized_end=1669, ) _ENVOYINFO = _descriptor.Descriptor( - name='EnvoyInfo', - full_name='EnvoyInfo', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='shard_info', full_name='EnvoyInfo.shard_info', index=0, - number=1, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='is_online', full_name='EnvoyInfo.is_online', index=1, - number=2, type=8, cpp_type=7, label=1, - has_default_value=False, default_value=False, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='is_experiment_running', full_name='EnvoyInfo.is_experiment_running', index=2, - number=3, type=8, cpp_type=7, label=1, - has_default_value=False, default_value=False, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='last_updated', full_name='EnvoyInfo.last_updated', index=3, - number=4, type=11, cpp_type=10, label=1, - 
has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='valid_duration', full_name='EnvoyInfo.valid_duration', index=4, - number=5, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1481, - serialized_end=1675, + name='EnvoyInfo', + full_name='EnvoyInfo', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='shard_info', full_name='EnvoyInfo.shard_info', index=0, + number=1, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='is_online', full_name='EnvoyInfo.is_online', index=1, + number=2, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='is_experiment_running', full_name='EnvoyInfo.is_experiment_running', index=2, + number=3, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='last_updated', full_name='EnvoyInfo.last_updated', index=3, + number=4, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='valid_duration', full_name='EnvoyInfo.valid_duration', index=4, + number=5, type=11, cpp_type=10, label=1, + has_default_value=False, default_value=None, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1672, + serialized_end=1866, ) _GETENVOYSREQUEST = _descriptor.Descriptor( - name='GetEnvoysRequest', - full_name='GetEnvoysRequest', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - 
serialized_start=1677, - serialized_end=1695, + name='GetEnvoysRequest', + full_name='GetEnvoysRequest', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1868, + serialized_end=1886, ) _GETENVOYSRESPONSE = _descriptor.Descriptor( - name='GetEnvoysResponse', - full_name='GetEnvoysResponse', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='envoy_infos', full_name='GetEnvoysResponse.envoy_infos', index=0, - number=1, type=11, cpp_type=10, label=3, - has_default_value=False, default_value=[], - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=1697, - serialized_end=1749, + name='GetEnvoysResponse', + full_name='GetEnvoysResponse', + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name='envoy_infos', full_name='GetEnvoysResponse.envoy_infos', index=0, + number=1, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + ], + extensions=[ + ], + nested_types=[], + enum_types=[ + ], + serialized_options=None, + is_extendable=False, + syntax='proto3', + extension_ranges=[], + oneofs=[ + ], + serialized_start=1888, + serialized_end=1940, ) _SHARDINFO.fields_by_name['node_info'].message_type = _NODEINFO +_ENVOYHEALTHCHECKRESPONSE.fields_by_name['health_check_period'].message_type = google_dot_protobuf_dot_duration__pb2._DURATION +_EXPERIMENTINFO.fields_by_name['header'].message_type = _REQUESTHEADER _EXPERIMENTINFO.fields_by_name['experiment_data'].message_type = _EXPERIMENTDATA _EXPERIMENTINFO.fields_by_name['model_proto'].message_type = federation__pb2._MODELPROTO _GETTRAINEDMODELREQUEST.fields_by_name['model_type'].enum_type = _GETTRAINEDMODELREQUEST_MODELTYPE _GETTRAINEDMODELREQUEST_MODELTYPE.containing_type = _GETTRAINEDMODELREQUEST _TRAINEDMODELRESPONSE.fields_by_name['model_proto'].message_type = federation__pb2._MODELPROTO -_COLLABORATORHEALTHCHECKRESPONSE.fields_by_name[ - 'health_check_period'].message_type = google_dot_protobuf_dot_duration__pb2._DURATION +_GETDATASETINFOREQUEST.fields_by_name['header'].message_type = _REQUESTHEADER +_STREAMMETRICSREQUEST.fields_by_name['header'].message_type = _REQUESTHEADER +_REMOVEEXPERIMENTREQUEST.fields_by_name['header'].message_type = _REQUESTHEADER _ENVOYINFO.fields_by_name['shard_info'].message_type = _SHARDINFO _ENVOYINFO.fields_by_name[ 'last_updated'].message_type = google_dot_protobuf_dot_timestamp__pb2._TIMESTAMP _ENVOYINFO.fields_by_name[ 'valid_duration'].message_type = google_dot_protobuf_dot_duration__pb2._DURATION _GETENVOYSRESPONSE.fields_by_name['envoy_infos'].message_type = _ENVOYINFO +DESCRIPTOR.message_types_by_name['RequestHeader'] = 
_REQUESTHEADER +DESCRIPTOR.message_types_by_name['CudaDeviceInfo'] = _CUDADEVICEINFO DESCRIPTOR.message_types_by_name['NodeInfo'] = _NODEINFO DESCRIPTOR.message_types_by_name['ShardInfo'] = _SHARDINFO DESCRIPTOR.message_types_by_name['ShardAcknowledgement'] = _SHARDACKNOWLEDGEMENT @@ -875,6 +972,8 @@ DESCRIPTOR.message_types_by_name['WaitExperimentResponse'] = _WAITEXPERIMENTRESPONSE DESCRIPTOR.message_types_by_name['GetExperimentDataRequest'] = _GETEXPERIMENTDATAREQUEST DESCRIPTOR.message_types_by_name['ExperimentData'] = _EXPERIMENTDATA +DESCRIPTOR.message_types_by_name['EnvoyStatus'] = _ENVOYSTATUS +DESCRIPTOR.message_types_by_name['EnvoyHealthCheckResponse'] = _ENVOYHEALTHCHECKRESPONSE DESCRIPTOR.message_types_by_name['ExperimentInfo'] = _EXPERIMENTINFO DESCRIPTOR.message_types_by_name['SetNewExperimentResponse'] = _SETNEWEXPERIMENTRESPONSE DESCRIPTOR.message_types_by_name['GetTrainedModelRequest'] = _GETTRAINEDMODELREQUEST @@ -884,14 +983,25 @@ DESCRIPTOR.message_types_by_name['StreamMetricsResponse'] = _STREAMMETRICSRESPONSE DESCRIPTOR.message_types_by_name['RemoveExperimentRequest'] = _REMOVEEXPERIMENTREQUEST DESCRIPTOR.message_types_by_name['RemoveExperimentResponse'] = _REMOVEEXPERIMENTRESPONSE -DESCRIPTOR.message_types_by_name['CollaboratorStatus'] = _COLLABORATORSTATUS -DESCRIPTOR.message_types_by_name[ - 'CollaboratorHealthCheckResponse'] = _COLLABORATORHEALTHCHECKRESPONSE DESCRIPTOR.message_types_by_name['EnvoyInfo'] = _ENVOYINFO DESCRIPTOR.message_types_by_name['GetEnvoysRequest'] = _GETENVOYSREQUEST DESCRIPTOR.message_types_by_name['GetEnvoysResponse'] = _GETENVOYSRESPONSE _sym_db.RegisterFileDescriptor(DESCRIPTOR) +RequestHeader = _reflection.GeneratedProtocolMessageType('RequestHeader', (_message.Message,), { + 'DESCRIPTOR' : _REQUESTHEADER, + '__module__' : 'director_pb2' + # @@protoc_insertion_point(class_scope:RequestHeader) + }) +_sym_db.RegisterMessage(RequestHeader) + +CudaDeviceInfo = _reflection.GeneratedProtocolMessageType('CudaDeviceInfo', (_message.Message,), { + 'DESCRIPTOR' : _CUDADEVICEINFO, + '__module__' : 'director_pb2' + # @@protoc_insertion_point(class_scope:CudaDeviceInfo) + }) +_sym_db.RegisterMessage(CudaDeviceInfo) + NodeInfo = _reflection.GeneratedProtocolMessageType('NodeInfo', (_message.Message,), { 'DESCRIPTOR': _NODEINFO, '__module__': 'director_pb2' @@ -945,6 +1055,20 @@ }) _sym_db.RegisterMessage(ExperimentData) +EnvoyStatus = _reflection.GeneratedProtocolMessageType('EnvoyStatus', (_message.Message,), { + 'DESCRIPTOR' : _ENVOYSTATUS, + '__module__' : 'director_pb2' + # @@protoc_insertion_point(class_scope:EnvoyStatus) + }) +_sym_db.RegisterMessage(EnvoyStatus) + +EnvoyHealthCheckResponse = _reflection.GeneratedProtocolMessageType('EnvoyHealthCheckResponse', (_message.Message,), { + 'DESCRIPTOR' : _ENVOYHEALTHCHECKRESPONSE, + '__module__' : 'director_pb2' + # @@protoc_insertion_point(class_scope:EnvoyHealthCheckResponse) + }) +_sym_db.RegisterMessage(EnvoyHealthCheckResponse) + ExperimentInfo = _reflection.GeneratedProtocolMessageType('ExperimentInfo', (_message.Message,), { 'DESCRIPTOR': _EXPERIMENTINFO, '__module__': 'director_pb2' @@ -1016,22 +1140,6 @@ }) _sym_db.RegisterMessage(RemoveExperimentResponse) -CollaboratorStatus = _reflection.GeneratedProtocolMessageType('CollaboratorStatus', - (_message.Message,), { - 'DESCRIPTOR': _COLLABORATORSTATUS, - '__module__': 'director_pb2' - # @@protoc_insertion_point(class_scope:CollaboratorStatus) - }) -_sym_db.RegisterMessage(CollaboratorStatus) - -CollaboratorHealthCheckResponse = 
_reflection.GeneratedProtocolMessageType( - 'CollaboratorHealthCheckResponse', (_message.Message,), { - 'DESCRIPTOR': _COLLABORATORHEALTHCHECKRESPONSE, - '__module__': 'director_pb2' - # @@protoc_insertion_point(class_scope:CollaboratorHealthCheckResponse) - }) -_sym_db.RegisterMessage(CollaboratorHealthCheckResponse) - EnvoyInfo = _reflection.GeneratedProtocolMessageType('EnvoyInfo', (_message.Message,), { 'DESCRIPTOR': _ENVOYINFO, '__module__': 'director_pb2' @@ -1056,10 +1164,112 @@ _sym_db.RegisterMessage(GetEnvoysResponse) _FEDERATIONDIRECTOR = _descriptor.ServiceDescriptor( - name='FederationDirector', - full_name='FederationDirector', - file=DESCRIPTOR, + name='FederationDirector', + full_name='FederationDirector', + file=DESCRIPTOR, + index=0, + serialized_options=None, + create_key=_descriptor._internal_create_key, + serialized_start=1943, + serialized_end=2619, + methods=[ + _descriptor.MethodDescriptor( + name='AcknowledgeShard', + full_name='FederationDirector.AcknowledgeShard', index=0, + containing_service=None, + input_type=_SHARDINFO, + output_type=_SHARDACKNOWLEDGEMENT, + serialized_options=None, + create_key=_descriptor._internal_create_key, + ), + _descriptor.MethodDescriptor( + name='WaitExperiment', + full_name='FederationDirector.WaitExperiment', + index=1, + containing_service=None, + input_type=_WAITEXPERIMENTREQUEST, + output_type=_WAITEXPERIMENTRESPONSE, + serialized_options=None, + create_key=_descriptor._internal_create_key, + ), + _descriptor.MethodDescriptor( + name='GetExperimentData', + full_name='FederationDirector.GetExperimentData', + index=2, + containing_service=None, + input_type=_GETEXPERIMENTDATAREQUEST, + output_type=_EXPERIMENTDATA, + serialized_options=None, + create_key=_descriptor._internal_create_key, + ), + _descriptor.MethodDescriptor( + name='SetNewExperiment', + full_name='FederationDirector.SetNewExperiment', + index=3, + containing_service=None, + input_type=_EXPERIMENTINFO, + output_type=_SETNEWEXPERIMENTRESPONSE, + serialized_options=None, + create_key=_descriptor._internal_create_key, + ), + _descriptor.MethodDescriptor( + name='GetDatasetInfo', + full_name='FederationDirector.GetDatasetInfo', + index=4, + containing_service=None, + input_type=_GETDATASETINFOREQUEST, + output_type=_SHARDINFO, + serialized_options=None, + create_key=_descriptor._internal_create_key, + ), + _descriptor.MethodDescriptor( + name='GetTrainedModel', + full_name='FederationDirector.GetTrainedModel', + index=5, + containing_service=None, + input_type=_GETTRAINEDMODELREQUEST, + output_type=_TRAINEDMODELRESPONSE, + serialized_options=None, + create_key=_descriptor._internal_create_key, + ), + _descriptor.MethodDescriptor( + name='StreamMetrics', + full_name='FederationDirector.StreamMetrics', + index=6, + containing_service=None, + input_type=_STREAMMETRICSREQUEST, + output_type=_STREAMMETRICSRESPONSE, + serialized_options=None, + create_key=_descriptor._internal_create_key, + ), + _descriptor.MethodDescriptor( + name='RemoveExperimentData', + full_name='FederationDirector.RemoveExperimentData', + index=7, + containing_service=None, + input_type=_REMOVEEXPERIMENTREQUEST, + output_type=_REMOVEEXPERIMENTRESPONSE, + serialized_options=None, + create_key=_descriptor._internal_create_key, + ), + _descriptor.MethodDescriptor( + name='EnvoyHealthCheck', + full_name='FederationDirector.EnvoyHealthCheck', + index=8, + containing_service=None, + input_type=_ENVOYSTATUS, + output_type=_ENVOYHEALTHCHECKRESPONSE, + serialized_options=None, + 
create_key=_descriptor._internal_create_key, + ), + _descriptor.MethodDescriptor( + name='GetEnvoys', + full_name='FederationDirector.GetEnvoys', + index=9, + containing_service=None, + input_type=_GETENVOYSREQUEST, + output_type=_GETENVOYSRESPONSE, serialized_options=None, create_key=_descriptor._internal_create_key, serialized_start=1752, diff --git a/openfl/protocols/director_pb2_grpc.py b/openfl/protocols/director_pb2_grpc.py index 18064a3c81..dfd673aec3 100644 --- a/openfl/protocols/director_pb2_grpc.py +++ b/openfl/protocols/director_pb2_grpc.py @@ -54,10 +54,10 @@ def __init__(self, channel): request_serializer=director__pb2.RemoveExperimentRequest.SerializeToString, response_deserializer=director__pb2.RemoveExperimentResponse.FromString, ) - self.CollaboratorHealthCheck = channel.unary_unary( - '/FederationDirector/CollaboratorHealthCheck', - request_serializer=director__pb2.CollaboratorStatus.SerializeToString, - response_deserializer=director__pb2.CollaboratorHealthCheckResponse.FromString, + self.EnvoyHealthCheck = channel.unary_unary( + '/FederationDirector/EnvoyHealthCheck', + request_serializer=director__pb2.EnvoyStatus.SerializeToString, + response_deserializer=director__pb2.EnvoyHealthCheckResponse.FromString, ) self.GetEnvoys = channel.unary_unary( '/FederationDirector/GetEnvoys', @@ -120,7 +120,7 @@ def RemoveExperimentData(self, request, context): context.set_details('Method not implemented!') raise NotImplementedError('Method not implemented!') - def CollaboratorHealthCheck(self, request, context): + def EnvoyHealthCheck(self, request, context): """Missing associated documentation comment in .proto file.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) context.set_details('Method not implemented!') @@ -175,10 +175,10 @@ def add_FederationDirectorServicer_to_server(servicer, server): request_deserializer=director__pb2.RemoveExperimentRequest.FromString, response_serializer=director__pb2.RemoveExperimentResponse.SerializeToString, ), - 'CollaboratorHealthCheck': grpc.unary_unary_rpc_method_handler( - servicer.CollaboratorHealthCheck, - request_deserializer=director__pb2.CollaboratorStatus.FromString, - response_serializer=director__pb2.CollaboratorHealthCheckResponse.SerializeToString, + 'EnvoyHealthCheck': grpc.unary_unary_rpc_method_handler( + servicer.EnvoyHealthCheck, + request_deserializer=director__pb2.EnvoyStatus.FromString, + response_serializer=director__pb2.EnvoyHealthCheckResponse.SerializeToString, ), 'GetEnvoys': grpc.unary_unary_rpc_method_handler( servicer.GetEnvoys, @@ -332,7 +332,7 @@ def RemoveExperimentData(request, insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod - def CollaboratorHealthCheck(request, + def EnvoyHealthCheck(request, target, options=(), channel_credentials=None, @@ -342,9 +342,9 @@ def CollaboratorHealthCheck(request, wait_for_ready=None, timeout=None, metadata=None): - return grpc.experimental.unary_unary(request, target, '/FederationDirector/CollaboratorHealthCheck', - director__pb2.CollaboratorStatus.SerializeToString, - director__pb2.CollaboratorHealthCheckResponse.FromString, + return grpc.experimental.unary_unary(request, target, '/FederationDirector/EnvoyHealthCheck', + director__pb2.EnvoyStatus.SerializeToString, + director__pb2.EnvoyHealthCheckResponse.FromString, options, channel_credentials, insecure, call_credentials, compression, wait_for_ready, timeout, metadata) diff --git a/openfl/transport/grpc/director_client.py b/openfl/transport/grpc/director_client.py index 
5d28b125f3..f5f01e40e2 100644 --- a/openfl/transport/grpc/director_client.py +++ b/openfl/transport/grpc/director_client.py @@ -51,7 +51,7 @@ def __init__(self, *, director_host, director_port, shard_name, tls=True, channel = grpc.secure_channel(director_addr, credentials, options=options) self.stub = director_pb2_grpc.FederationDirectorStub(channel) - def report_shard_info(self, shard_descriptor) -> bool: + def report_shard_info(self, shard_descriptor, cuda_devices) -> bool: """Report shard info to the director.""" logger.info('Send report AcknowledgeShard') # True considered as successful registration @@ -62,7 +62,8 @@ def report_shard_info(self, shard_descriptor) -> bool: target_shape=shard_descriptor.target_shape ) - shard_info.node_info.CopyFrom(self._get_node_info()) + shard_info.node_info.name = self.shard_name + shard_info.node_info.cuda_devices = cuda_devices acknowledgement = self.stub.AcknowledgeShard(shard_info) return acknowledgement.accepted @@ -94,10 +95,6 @@ def _get_experiment_data(self): """Generate the experiment data request.""" yield director_pb2.WaitExperimentRequest(collaborator_name=self.shard_name) - def _get_node_info(self): - """Generate a node info message.""" - return director_pb2.NodeInfo(name=self.shard_name) - def send_health_check(self, *, envoy_name: str, is_experiment_running: bool) -> int: """Send envoy health check.""" status = director_pb2.EnvoyStatus( From a04cb1a10b1cf5d8fbdbc404f9e4938384e8fe40 Mon Sep 17 00:00:00 2001 From: Igor Davidyuk Date: Mon, 6 Sep 2021 18:26:32 +0300 Subject: [PATCH 05/31] experiment update --- .../PyTorch_Kvasir_UNet/envoy/envoy_config.yaml | 14 ++++++++++++++ .../PyTorch_Kvasir_UNet/envoy/shard_config.yaml | 2 +- .../PyTorch_Kvasir_UNet/envoy/start_envoy.sh | 2 +- openfl-workspace/default/envoy_config.yaml | 4 ++-- openfl/interface/envoy.py | 13 +++++++------ 5 files changed, 25 insertions(+), 10 deletions(-) create mode 100644 openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config.yaml diff --git a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config.yaml b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config.yaml new file mode 100644 index 0000000000..ee4c3fda35 --- /dev/null +++ b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config.yaml @@ -0,0 +1,14 @@ +params: + cuda_devices: [] + optional_plugin_components: + cuda_device_monitor: + template: openfl.plugins.processing_units_monitor.pynvml_monitor.PynvmlCUDADeviceMonitor + settings: [] + +shard_descriptor: + template: kvasir_shard_descriptor.KvasirShardDescriptor + params: + data_folder: kvasir_data + rank_worldsize: 1,90 + enforce_image_hw: '300,400' + \ No newline at end of file diff --git a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/shard_config.yaml b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/shard_config.yaml index aba8be3627..62cf42a45c 100644 --- a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/shard_config.yaml +++ b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/shard_config.yaml @@ -1,5 +1,5 @@ template: kvasir_shard_descriptor.KvasirShardDescriptor params: data_folder: kvasir_data - rank_worldsize: 1,90 + rank_worldsize: 1,10 enforce_image_hw: '300,400' \ No newline at end of file diff --git a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/start_envoy.sh b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/start_envoy.sh index 3f06f2113b..1dfda52241 100755 --- 
a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/start_envoy.sh +++ b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/start_envoy.sh @@ -1,4 +1,4 @@ #!/bin/bash set -e -fx envoy start -n env_one --disable-tls --shard-config-path envoy_config.yaml -dh localhost -dp 50051 \ No newline at end of file +fx envoy start -n env_one --disable-tls --envoy-config-path envoy_config.yaml -dh localhost -dp 50051 diff --git a/openfl-workspace/default/envoy_config.yaml b/openfl-workspace/default/envoy_config.yaml index d468d1d5f9..46d9844662 100644 --- a/openfl-workspace/default/envoy_config.yaml +++ b/openfl-workspace/default/envoy_config.yaml @@ -21,8 +21,8 @@ params: cuda_devices: [] optional_plugin_components: cuda_device_monitor: - template : openfl.plugins.processing_units_monitor.pynvml_monitor.PynvmlCUDADeviceMonitor - settings : [] + template: openfl.plugins.processing_units_monitor.pynvml_monitor.PynvmlCUDADeviceMonitor + settings: [] shard_descriptor: template: shard_descriptor.LocalShardDescriptor diff --git a/openfl/interface/envoy.py b/openfl/interface/envoy.py index 2435a69a19..f36fb3441f 100644 --- a/openfl/interface/envoy.py +++ b/openfl/interface/envoy.py @@ -39,7 +39,7 @@ def envoy(context): help='The federation director port', type=click.IntRange(1, 65535)) @option('--tls/--disable-tls', default=True, is_flag=True, help='Use TLS or not (By default TLS is enabled)') -@option('-sc', '--envoy-config-path', default='envoy_config.yaml', +@option('-ec', '--envoy-config-path', default='envoy_config.yaml', help='The envoy config path', type=ClickPath(exists=True)) @option('-rc', '--root-cert-path', 'root_certificate', default=None, help='Path to a root CA cert', type=ClickPath(exists=True)) @@ -47,7 +47,7 @@ def envoy(context): help='Path to a private key', type=ClickPath(exists=True)) @option('-oc', '--public-cert-path', 'certificate', default=None, help='Path to a signed certificate', type=ClickPath(exists=True)) -def start_(shard_name, director_host, director_port, tls, shard_config_path, +def start_(shard_name, director_host, director_port, tls, envoy_config_path, root_certificate, private_key, certificate): """Start the Envoy.""" logger.info('🧿 Starting the Envoy.') @@ -55,7 +55,7 @@ def start_(shard_name, director_host, director_port, tls, shard_config_path, click.echo('The shard config path is out of the openfl workspace scope.') sys.exit(1) # Reed the Envoy config - with open(shard_config_path) as stream: + with open(envoy_config_path) as stream: envoy_config = safe_load(stream) # pass envoy parameters @@ -122,12 +122,13 @@ def create(envoy_path): envoy_path / 'requirements.txt') -def shard_descriptor_from_config(shard_config_path: str): +def shard_descriptor_from_config(shard_config: dict): """Build a shard descriptor from config.""" + print(shard_config) template = shard_config.get('template') if not template: - raise Exception(f'You should define a shard ' - f'descriptor template in {shard_config_path}') + raise Exception('You should define a shard ' + 'descriptor template in the envoy config') class_name = template.split('.')[-1] module_path = '.'.join(template.split('.')[:-1]) params = shard_config.get('params', {}) From 33a7d5871da4acfee172ad1897461b39566815fa Mon Sep 17 00:00:00 2001 From: Igor Davidyuk Date: Tue, 7 Sep 2021 10:02:26 +0300 Subject: [PATCH 06/31] fix repeated field assignmet --- openfl/transport/grpc/director_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openfl/transport/grpc/director_client.py 
b/openfl/transport/grpc/director_client.py index f5f01e40e2..0a8307a4b2 100644 --- a/openfl/transport/grpc/director_client.py +++ b/openfl/transport/grpc/director_client.py @@ -63,7 +63,7 @@ def report_shard_info(self, shard_descriptor, cuda_devices) -> bool: ) shard_info.node_info.name = self.shard_name - shard_info.node_info.cuda_devices = cuda_devices + shard_info.node_info.cuda_devices[:] = cuda_devices acknowledgement = self.stub.AcknowledgeShard(shard_info) return acknowledgement.accepted From 5a7078f6916bd5b5da771ac7cc2fc11f1f98a8b1 Mon Sep 17 00:00:00 2001 From: Igor Davidyuk Date: Thu, 9 Sep 2021 10:35:00 +0300 Subject: [PATCH 07/31] cuda status updates --- .../envoy/envoy_config.yaml | 2 +- openfl/component/director/director.py | 13 +- openfl/component/envoy/envoy.py | 21 ++- openfl/protocols/director.proto | 7 +- openfl/protocols/director_pb2.py | 153 ++++++++++-------- openfl/protocols/director_pb2_grpc.py | 66 ++++---- openfl/transport/grpc/director_client.py | 23 ++- openfl/transport/grpc/director_server.py | 11 +- 8 files changed, 183 insertions(+), 113 deletions(-) diff --git a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config.yaml b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config.yaml index ee4c3fda35..35d1abb9c8 100644 --- a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config.yaml +++ b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config.yaml @@ -1,5 +1,5 @@ params: - cuda_devices: [] + cuda_devices: [1,2,5] optional_plugin_components: cuda_device_monitor: template: openfl.plugins.processing_units_monitor.pynvml_monitor.PynvmlCUDADeviceMonitor diff --git a/openfl/component/director/director.py b/openfl/component/director/director.py index 81b5231376..36377668c4 100644 --- a/openfl/component/director/director.py +++ b/openfl/component/director/director.py @@ -167,13 +167,14 @@ def remove_experiment_data(self, experiment_name: str, caller: str): and caller in self.experiments_registry[experiment_name].users): self.experiments_registry.remove(experiment_name) - def collaborator_health_check( - self, *, collaborator_name: str, is_experiment_running: bool + def envoy_health_check( + self, *, envoy_name: str, is_experiment_running: bool, + cuda_devices_status: list = None, ) -> int: """Accept health check from envoy.""" - shard_info = self._shard_registry.get(collaborator_name) + shard_info = self._shard_registry.get(envoy_name) if not shard_info: - raise Exception(f'Unknown shard {collaborator_name}') + raise Exception(f'Unknown shard {envoy_name}') hc_period = self.settings.get('envoy_health_check_period', ENVOY_HEALTH_CHECK_PERIOD) shard_info['is_online']: True @@ -181,6 +182,10 @@ def collaborator_health_check( shard_info['valid_duration'] = 2 * hc_period shard_info['last_updated'] = time.time() + if cuda_devices_status is not None: + for i in range(len(cuda_devices_status)): + shard_info['shard_info'].node_info.cuda_devices[i] = cuda_devices_status[i] + return hc_period def get_envoys(self) -> list: diff --git a/openfl/component/envoy/envoy.py b/openfl/component/envoy/envoy.py index fe3b900542..d8741d46eb 100644 --- a/openfl/component/envoy/envoy.py +++ b/openfl/component/envoy/envoy.py @@ -91,9 +91,28 @@ def send_health_check(self): """Send health check to the director.""" logger.info('The health check sender is started.') while True: + # Need a separate method 'Get self state' or smth + devices_status_kwargs = {} + + if self.cuda_device_monitor is not None: + cuda_devices_info = {} + 
for device_id in self.cuda_devices: + cuda_devices_info[str(device_id)] = { + 'memory_total': + self.cuda_device_monitor.get_device_memory_total(device_id), + 'memory_used': + self.cuda_device_monitor.get_device_memory_utilized(device_id), + 'device_utilization': + self.cuda_device_monitor.get_device_utilization(device_id)} + + devices_status_kwargs['cuda_driver_version'] = \ + self.cuda_device_monitor.get_driver_version() + devices_status_kwargs['cuda_devices_info'] = cuda_devices_info + timeout = self.director_client.send_health_check( envoy_name=self.name, - is_experiment_running=self.is_experiment_running + is_experiment_running=self.is_experiment_running, + **devices_status_kwargs, ) time.sleep(timeout) diff --git a/openfl/protocols/director.proto b/openfl/protocols/director.proto index 4130590ea3..0111e9009f 100644 --- a/openfl/protocols/director.proto +++ b/openfl/protocols/director.proto @@ -11,13 +11,15 @@ message CudaDeviceInfo { uint64 index = 1; uint64 memory_total = 2; uint64 memory_utilized = 3; + string device_utilization = 4; + string cuda_driver_version = 5; } // Envoy Messages message NodeInfo { string name = 1; - repeated uint64 cuda_devices = 2; + repeated CudaDeviceInfo cuda_devices = 2; } message ShardInfo { @@ -54,6 +56,7 @@ message ExperimentData { message EnvoyStatus { string name = 1; bool is_experiment_running = 2; + repeated CudaDeviceInfo cuda_devices = 3; } message EnvoyHealthCheckResponse { @@ -128,6 +131,7 @@ service FederationDirector { // Shard owner could also provide some public data for tests rpc WaitExperiment (stream WaitExperimentRequest) returns (stream WaitExperimentResponse) {} rpc GetExperimentData (GetExperimentDataRequest) returns (stream ExperimentData) {} + rpc EnvoyHealthCheck (EnvoyStatus) returns (EnvoyHealthCheckResponse) {} // API RPCs rpc SetNewExperiment (stream ExperimentInfo) returns (SetNewExperimentResponse) {} @@ -135,6 +139,5 @@ service FederationDirector { rpc GetTrainedModel (GetTrainedModelRequest) returns (TrainedModelResponse) {} rpc StreamMetrics (StreamMetricsRequest) returns (stream StreamMetricsResponse) {} rpc RemoveExperimentData (RemoveExperimentRequest) returns (RemoveExperimentResponse) {} - rpc EnvoyHealthCheck (EnvoyStatus) returns (EnvoyHealthCheckResponse) {} rpc GetEnvoys (GetEnvoysRequest) returns (GetEnvoysResponse) {} } diff --git a/openfl/protocols/director_pb2.py b/openfl/protocols/director_pb2.py index 5617543436..9d6732eebd 100644 --- a/openfl/protocols/director_pb2.py +++ b/openfl/protocols/director_pb2.py @@ -21,7 +21,7 @@ syntax='proto3', serialized_options=None, create_key=_descriptor._internal_create_key, - serialized_pb=b'\n\x0e\x64irector.proto\x1a\x1fgoogle/protobuf/timestamp.proto\x1a\x1egoogle/protobuf/duration.proto\x1a\x10\x66\x65\x64\x65ration.proto\"\x1f\n\rRequestHeader\x12\x0e\n\x06sender\x18\x01 \x01(\t\"N\n\x0e\x43udaDeviceInfo\x12\r\n\x05index\x18\x01 \x01(\x04\x12\x14\n\x0cmemory_total\x18\x02 \x01(\x04\x12\x17\n\x0fmemory_utilized\x18\x03 \x01(\x04\".\n\x08NodeInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x14\n\x0c\x63uda_devices\x18\x02 \x03(\x04\"\x83\x01\n\tShardInfo\x12\x1c\n\tnode_info\x18\x01 \x01(\x0b\x32\t.NodeInfo\x12\x19\n\x11shard_description\x18\x02 \x01(\t\x12\x11\n\tn_samples\x18\x03 \x01(\x04\x12\x14\n\x0csample_shape\x18\x04 \x03(\t\x12\x14\n\x0ctarget_shape\x18\x05 \x03(\t\"(\n\x14ShardAcknowledgement\x12\x10\n\x08\x61\x63\x63\x65pted\x18\x01 \x01(\x08\"2\n\x15WaitExperimentRequest\x12\x19\n\x11\x63ollaborator_name\x18\x01 
\x01(\t\"1\n\x16WaitExperimentResponse\x12\x17\n\x0f\x65xperiment_name\x18\x01 \x01(\t\"N\n\x18GetExperimentDataRequest\x12\x17\n\x0f\x65xperiment_name\x18\x01 \x01(\t\x12\x19\n\x11\x63ollaborator_name\x18\x02 \x01(\t\"/\n\x0e\x45xperimentData\x12\x0c\n\x04size\x18\x01 \x01(\r\x12\x0f\n\x07npbytes\x18\x02 \x01(\x0c\":\n\x0b\x45nvoyStatus\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1d\n\x15is_experiment_running\x18\x02 \x01(\x08\"R\n\x18\x45nvoyHealthCheckResponse\x12\x36\n\x13health_check_period\x18\x01 \x01(\x0b\x32\x19.google.protobuf.Duration\"\xa6\x01\n\x0e\x45xperimentInfo\x12\x1e\n\x06header\x18\x01 \x01(\x0b\x32\x0e.RequestHeader\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x1a\n\x12\x63ollaborator_names\x18\x03 \x03(\t\x12(\n\x0f\x65xperiment_data\x18\x04 \x01(\x0b\x32\x0f.ExperimentData\x12 \n\x0bmodel_proto\x18\x05 \x01(\x0b\x32\x0b.ModelProto\",\n\x18SetNewExperimentResponse\x12\x10\n\x08\x61\x63\x63\x65pted\x18\x01 \x01(\x08\"\xb5\x01\n\x16GetTrainedModelRequest\x12\x1e\n\x06header\x18\x01 \x01(\x0b\x32\x0e.RequestHeader\x12\x17\n\x0f\x65xperiment_name\x18\x02 \x01(\t\x12\x35\n\nmodel_type\x18\x03 \x01(\x0e\x32!.GetTrainedModelRequest.ModelType\"+\n\tModelType\x12\x0e\n\nBEST_MODEL\x10\x00\x12\x0e\n\nLAST_MODEL\x10\x01\"8\n\x14TrainedModelResponse\x12 \n\x0bmodel_proto\x18\x01 \x01(\x0b\x32\x0b.ModelProto\"7\n\x15GetDatasetInfoRequest\x12\x1e\n\x06header\x18\x01 \x01(\x0b\x32\x0e.RequestHeader\"O\n\x14StreamMetricsRequest\x12\x1e\n\x06header\x18\x01 \x01(\x0b\x32\x0e.RequestHeader\x12\x17\n\x0f\x65xperiment_name\x18\x02 \x01(\t\"{\n\x15StreamMetricsResponse\x12\x15\n\rmetric_origin\x18\x01 \x01(\t\x12\x11\n\ttask_name\x18\x02 \x01(\t\x12\x13\n\x0bmetric_name\x18\x03 \x01(\t\x12\x14\n\x0cmetric_value\x18\x04 \x01(\x02\x12\r\n\x05round\x18\x05 \x01(\r\"R\n\x17RemoveExperimentRequest\x12\x1e\n\x06header\x18\x01 \x01(\x0b\x32\x0e.RequestHeader\x12\x17\n\x0f\x65xperiment_name\x18\x02 \x01(\t\"3\n\x18RemoveExperimentResponse\x12\x17\n\x0f\x61\x63knowledgement\x18\x01 \x01(\x08\"\xc2\x01\n\tEnvoyInfo\x12\x1e\n\nshard_info\x18\x01 \x01(\x0b\x32\n.ShardInfo\x12\x11\n\tis_online\x18\x02 \x01(\x08\x12\x1d\n\x15is_experiment_running\x18\x03 \x01(\x08\x12\x30\n\x0clast_updated\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\x31\n\x0evalid_duration\x18\x05 \x01(\x0b\x32\x19.google.protobuf.Duration\"\x12\n\x10GetEnvoysRequest\"4\n\x11GetEnvoysResponse\x12\x1f\n\x0b\x65nvoy_infos\x18\x01 \x03(\x0b\x32\n.EnvoyInfo2\xa4\x05\n\x12\x46\x65\x64\x65rationDirector\x12\x37\n\x10\x41\x63knowledgeShard\x12\n.ShardInfo\x1a\x15.ShardAcknowledgement\"\x00\x12G\n\x0eWaitExperiment\x12\x16.WaitExperimentRequest\x1a\x17.WaitExperimentResponse\"\x00(\x01\x30\x01\x12\x43\n\x11GetExperimentData\x12\x19.GetExperimentDataRequest\x1a\x0f.ExperimentData\"\x00\x30\x01\x12\x42\n\x10SetNewExperiment\x12\x0f.ExperimentInfo\x1a\x19.SetNewExperimentResponse\"\x00(\x01\x12\x36\n\x0eGetDatasetInfo\x12\x16.GetDatasetInfoRequest\x1a\n.ShardInfo\"\x00\x12\x43\n\x0fGetTrainedModel\x12\x17.GetTrainedModelRequest\x1a\x15.TrainedModelResponse\"\x00\x12\x42\n\rStreamMetrics\x12\x15.StreamMetricsRequest\x1a\x16.StreamMetricsResponse\"\x00\x30\x01\x12M\n\x14RemoveExperimentData\x12\x18.RemoveExperimentRequest\x1a\x19.RemoveExperimentResponse\"\x00\x12=\n\x10\x45nvoyHealthCheck\x12\x0c.EnvoyStatus\x1a\x19.EnvoyHealthCheckResponse\"\x00\x12\x34\n\tGetEnvoys\x12\x11.GetEnvoysRequest\x1a\x12.GetEnvoysResponse\"\x00\x62\x06proto3' + 
serialized_pb=b'\n\x0e\x64irector.proto\x1a\x1fgoogle/protobuf/timestamp.proto\x1a\x1egoogle/protobuf/duration.proto\x1a\x10\x66\x65\x64\x65ration.proto\"\x1f\n\rRequestHeader\x12\x0e\n\x06sender\x18\x01 \x01(\t\"\x87\x01\n\x0e\x43udaDeviceInfo\x12\r\n\x05index\x18\x01 \x01(\x04\x12\x14\n\x0cmemory_total\x18\x02 \x01(\x04\x12\x17\n\x0fmemory_utilized\x18\x03 \x01(\x04\x12\x1a\n\x12\x64\x65vice_utilization\x18\x04 \x01(\t\x12\x1b\n\x13\x63uda_driver_version\x18\x05 \x01(\t\"?\n\x08NodeInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12%\n\x0c\x63uda_devices\x18\x02 \x03(\x0b\x32\x0f.CudaDeviceInfo\"\x83\x01\n\tShardInfo\x12\x1c\n\tnode_info\x18\x01 \x01(\x0b\x32\t.NodeInfo\x12\x19\n\x11shard_description\x18\x02 \x01(\t\x12\x11\n\tn_samples\x18\x03 \x01(\x04\x12\x14\n\x0csample_shape\x18\x04 \x03(\t\x12\x14\n\x0ctarget_shape\x18\x05 \x03(\t\"(\n\x14ShardAcknowledgement\x12\x10\n\x08\x61\x63\x63\x65pted\x18\x01 \x01(\x08\"2\n\x15WaitExperimentRequest\x12\x19\n\x11\x63ollaborator_name\x18\x01 \x01(\t\"1\n\x16WaitExperimentResponse\x12\x17\n\x0f\x65xperiment_name\x18\x01 \x01(\t\"N\n\x18GetExperimentDataRequest\x12\x17\n\x0f\x65xperiment_name\x18\x01 \x01(\t\x12\x19\n\x11\x63ollaborator_name\x18\x02 \x01(\t\"/\n\x0e\x45xperimentData\x12\x0c\n\x04size\x18\x01 \x01(\r\x12\x0f\n\x07npbytes\x18\x02 \x01(\x0c\"a\n\x0b\x45nvoyStatus\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1d\n\x15is_experiment_running\x18\x02 \x01(\x08\x12%\n\x0c\x63uda_devices\x18\x03 \x03(\x0b\x32\x0f.CudaDeviceInfo\"R\n\x18\x45nvoyHealthCheckResponse\x12\x36\n\x13health_check_period\x18\x01 \x01(\x0b\x32\x19.google.protobuf.Duration\"\xa6\x01\n\x0e\x45xperimentInfo\x12\x1e\n\x06header\x18\x01 \x01(\x0b\x32\x0e.RequestHeader\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x1a\n\x12\x63ollaborator_names\x18\x03 \x03(\t\x12(\n\x0f\x65xperiment_data\x18\x04 \x01(\x0b\x32\x0f.ExperimentData\x12 \n\x0bmodel_proto\x18\x05 \x01(\x0b\x32\x0b.ModelProto\",\n\x18SetNewExperimentResponse\x12\x10\n\x08\x61\x63\x63\x65pted\x18\x01 \x01(\x08\"\xb5\x01\n\x16GetTrainedModelRequest\x12\x1e\n\x06header\x18\x01 \x01(\x0b\x32\x0e.RequestHeader\x12\x17\n\x0f\x65xperiment_name\x18\x02 \x01(\t\x12\x35\n\nmodel_type\x18\x03 \x01(\x0e\x32!.GetTrainedModelRequest.ModelType\"+\n\tModelType\x12\x0e\n\nBEST_MODEL\x10\x00\x12\x0e\n\nLAST_MODEL\x10\x01\"8\n\x14TrainedModelResponse\x12 \n\x0bmodel_proto\x18\x01 \x01(\x0b\x32\x0b.ModelProto\"7\n\x15GetDatasetInfoRequest\x12\x1e\n\x06header\x18\x01 \x01(\x0b\x32\x0e.RequestHeader\"O\n\x14StreamMetricsRequest\x12\x1e\n\x06header\x18\x01 \x01(\x0b\x32\x0e.RequestHeader\x12\x17\n\x0f\x65xperiment_name\x18\x02 \x01(\t\"{\n\x15StreamMetricsResponse\x12\x15\n\rmetric_origin\x18\x01 \x01(\t\x12\x11\n\ttask_name\x18\x02 \x01(\t\x12\x13\n\x0bmetric_name\x18\x03 \x01(\t\x12\x14\n\x0cmetric_value\x18\x04 \x01(\x02\x12\r\n\x05round\x18\x05 \x01(\r\"R\n\x17RemoveExperimentRequest\x12\x1e\n\x06header\x18\x01 \x01(\x0b\x32\x0e.RequestHeader\x12\x17\n\x0f\x65xperiment_name\x18\x02 \x01(\t\"3\n\x18RemoveExperimentResponse\x12\x17\n\x0f\x61\x63knowledgement\x18\x01 \x01(\x08\"\xc2\x01\n\tEnvoyInfo\x12\x1e\n\nshard_info\x18\x01 \x01(\x0b\x32\n.ShardInfo\x12\x11\n\tis_online\x18\x02 \x01(\x08\x12\x1d\n\x15is_experiment_running\x18\x03 \x01(\x08\x12\x30\n\x0clast_updated\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\x31\n\x0evalid_duration\x18\x05 \x01(\x0b\x32\x19.google.protobuf.Duration\"\x12\n\x10GetEnvoysRequest\"4\n\x11GetEnvoysResponse\x12\x1f\n\x0b\x65nvoy_infos\x18\x01 
\x03(\x0b\x32\n.EnvoyInfo2\xa4\x05\n\x12\x46\x65\x64\x65rationDirector\x12\x37\n\x10\x41\x63knowledgeShard\x12\n.ShardInfo\x1a\x15.ShardAcknowledgement\"\x00\x12G\n\x0eWaitExperiment\x12\x16.WaitExperimentRequest\x1a\x17.WaitExperimentResponse\"\x00(\x01\x30\x01\x12\x43\n\x11GetExperimentData\x12\x19.GetExperimentDataRequest\x1a\x0f.ExperimentData\"\x00\x30\x01\x12=\n\x10\x45nvoyHealthCheck\x12\x0c.EnvoyStatus\x1a\x19.EnvoyHealthCheckResponse\"\x00\x12\x42\n\x10SetNewExperiment\x12\x0f.ExperimentInfo\x1a\x19.SetNewExperimentResponse\"\x00(\x01\x12\x36\n\x0eGetDatasetInfo\x12\x16.GetDatasetInfoRequest\x1a\n.ShardInfo\"\x00\x12\x43\n\x0fGetTrainedModel\x12\x17.GetTrainedModelRequest\x1a\x15.TrainedModelResponse\"\x00\x12\x42\n\rStreamMetrics\x12\x15.StreamMetricsRequest\x1a\x16.StreamMetricsResponse\"\x00\x30\x01\x12M\n\x14RemoveExperimentData\x12\x18.RemoveExperimentRequest\x1a\x19.RemoveExperimentResponse\"\x00\x12\x34\n\tGetEnvoys\x12\x11.GetEnvoysRequest\x1a\x12.GetEnvoysResponse\"\x00\x62\x06proto3' , dependencies=[google_dot_protobuf_dot_timestamp__pb2.DESCRIPTOR,google_dot_protobuf_dot_duration__pb2.DESCRIPTOR,federation__pb2.DESCRIPTOR,]) @@ -47,8 +47,8 @@ ], containing_type=None, serialized_options=None, - serialized_start=1168, - serialized_end=1211, + serialized_start=1282, + serialized_end=1325, ) _sym_db.RegisterEnumDescriptor(_GETTRAINEDMODELREQUEST_MODELTYPE) @@ -114,6 +114,20 @@ message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='device_utilization', full_name='CudaDeviceInfo.device_utilization', index=3, + number=4, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='cuda_driver_version', full_name='CudaDeviceInfo.cuda_driver_version', index=4, + number=5, type=9, cpp_type=9, label=1, + has_default_value=False, default_value=b"".decode('utf-8'), + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), ], extensions=[ ], @@ -126,8 +140,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=134, - serialized_end=212, + serialized_start=135, + serialized_end=270, ) @@ -148,7 +162,7 @@ serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( name='cuda_devices', full_name='NodeInfo.cuda_devices', index=1, - number=2, type=4, cpp_type=4, label=3, + number=2, type=11, cpp_type=10, label=3, has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, @@ -165,8 +179,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=214, - serialized_end=260, + serialized_start=272, + serialized_end=335, ) _SHARDINFO = _descriptor.Descriptor( @@ -224,8 +238,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=263, - serialized_end=394, + serialized_start=338, + serialized_end=469, ) _SHARDACKNOWLEDGEMENT = _descriptor.Descriptor( @@ -255,8 +269,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=396, - serialized_end=436, + serialized_start=471, + serialized_end=511, ) 
_WAITEXPERIMENTREQUEST = _descriptor.Descriptor( @@ -286,8 +300,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=438, - serialized_end=488, + serialized_start=513, + serialized_end=563, ) _WAITEXPERIMENTRESPONSE = _descriptor.Descriptor( @@ -317,8 +331,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=490, - serialized_end=539, + serialized_start=565, + serialized_end=614, ) _GETEXPERIMENTDATAREQUEST = _descriptor.Descriptor( @@ -355,8 +369,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=541, - serialized_end=619, + serialized_start=616, + serialized_end=694, ) _EXPERIMENTDATA = _descriptor.Descriptor( @@ -393,8 +407,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=621, - serialized_end=668, + serialized_start=696, + serialized_end=743, ) @@ -420,6 +434,13 @@ message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='cuda_devices', full_name='EnvoyStatus.cuda_devices', index=2, + number=3, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), ], extensions=[ ], @@ -432,8 +453,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=670, - serialized_end=728, + serialized_start=745, + serialized_end=842, ) @@ -464,8 +485,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=730, - serialized_end=812, + serialized_start=844, + serialized_end=926, ) _EXPERIMENTINFO = _descriptor.Descriptor( @@ -523,8 +544,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=815, - serialized_end=981, + serialized_start=929, + serialized_end=1095, ) _SETNEWEXPERIMENTRESPONSE = _descriptor.Descriptor( @@ -554,8 +575,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=983, - serialized_end=1027, + serialized_start=1097, + serialized_end=1141, ) _GETTRAINEDMODELREQUEST = _descriptor.Descriptor( @@ -600,8 +621,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1030, - serialized_end=1211, + serialized_start=1144, + serialized_end=1325, ) _TRAINEDMODELRESPONSE = _descriptor.Descriptor( @@ -631,8 +652,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1213, - serialized_end=1269, + serialized_start=1327, + serialized_end=1383, ) _GETDATASETINFOREQUEST = _descriptor.Descriptor( @@ -662,8 +683,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1271, - serialized_end=1326, + serialized_start=1385, + serialized_end=1440, ) _STREAMMETRICSREQUEST = _descriptor.Descriptor( @@ -700,8 +721,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1328, - serialized_end=1407, + serialized_start=1442, + serialized_end=1521, ) _STREAMMETRICSRESPONSE = _descriptor.Descriptor( @@ -759,8 +780,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1409, - serialized_end=1532, + serialized_start=1523, + serialized_end=1646, ) _REMOVEEXPERIMENTREQUEST = _descriptor.Descriptor( @@ -797,8 +818,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1534, - serialized_end=1616, + serialized_start=1648, + serialized_end=1730, ) _REMOVEEXPERIMENTRESPONSE = _descriptor.Descriptor( @@ -828,8 +849,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1618, - serialized_end=1669, + serialized_start=1732, + serialized_end=1783, ) _ENVOYINFO = _descriptor.Descriptor( @@ -887,8 +908,8 @@ 
extension_ranges=[], oneofs=[ ], - serialized_start=1672, - serialized_end=1866, + serialized_start=1786, + serialized_end=1980, ) _GETENVOYSREQUEST = _descriptor.Descriptor( @@ -911,8 +932,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1868, - serialized_end=1886, + serialized_start=1982, + serialized_end=2000, ) _GETENVOYSRESPONSE = _descriptor.Descriptor( @@ -942,11 +963,13 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1888, - serialized_end=1940, + serialized_start=2002, + serialized_end=2054, ) +_NODEINFO.fields_by_name['cuda_devices'].message_type = _CUDADEVICEINFO _SHARDINFO.fields_by_name['node_info'].message_type = _NODEINFO +_ENVOYSTATUS.fields_by_name['cuda_devices'].message_type = _CUDADEVICEINFO _ENVOYHEALTHCHECKRESPONSE.fields_by_name['health_check_period'].message_type = google_dot_protobuf_dot_duration__pb2._DURATION _EXPERIMENTINFO.fields_by_name['header'].message_type = _REQUESTHEADER _EXPERIMENTINFO.fields_by_name['experiment_data'].message_type = _EXPERIMENTDATA @@ -1170,8 +1193,8 @@ index=0, serialized_options=None, create_key=_descriptor._internal_create_key, - serialized_start=1943, - serialized_end=2619, + serialized_start=2057, + serialized_end=2733, methods=[ _descriptor.MethodDescriptor( name='AcknowledgeShard', @@ -1203,10 +1226,20 @@ serialized_options=None, create_key=_descriptor._internal_create_key, ), + _descriptor.MethodDescriptor( + name='EnvoyHealthCheck', + full_name='FederationDirector.EnvoyHealthCheck', + index=3, + containing_service=None, + input_type=_ENVOYSTATUS, + output_type=_ENVOYHEALTHCHECKRESPONSE, + serialized_options=None, + create_key=_descriptor._internal_create_key, + ), _descriptor.MethodDescriptor( name='SetNewExperiment', full_name='FederationDirector.SetNewExperiment', - index=3, + index=4, containing_service=None, input_type=_EXPERIMENTINFO, output_type=_SETNEWEXPERIMENTRESPONSE, @@ -1216,7 +1249,7 @@ _descriptor.MethodDescriptor( name='GetDatasetInfo', full_name='FederationDirector.GetDatasetInfo', - index=4, + index=5, containing_service=None, input_type=_GETDATASETINFOREQUEST, output_type=_SHARDINFO, @@ -1226,7 +1259,7 @@ _descriptor.MethodDescriptor( name='GetTrainedModel', full_name='FederationDirector.GetTrainedModel', - index=5, + index=6, containing_service=None, input_type=_GETTRAINEDMODELREQUEST, output_type=_TRAINEDMODELRESPONSE, @@ -1236,7 +1269,7 @@ _descriptor.MethodDescriptor( name='StreamMetrics', full_name='FederationDirector.StreamMetrics', - index=6, + index=7, containing_service=None, input_type=_STREAMMETRICSREQUEST, output_type=_STREAMMETRICSRESPONSE, @@ -1246,23 +1279,13 @@ _descriptor.MethodDescriptor( name='RemoveExperimentData', full_name='FederationDirector.RemoveExperimentData', - index=7, + index=8, containing_service=None, input_type=_REMOVEEXPERIMENTREQUEST, output_type=_REMOVEEXPERIMENTRESPONSE, serialized_options=None, create_key=_descriptor._internal_create_key, ), - _descriptor.MethodDescriptor( - name='EnvoyHealthCheck', - full_name='FederationDirector.EnvoyHealthCheck', - index=8, - containing_service=None, - input_type=_ENVOYSTATUS, - output_type=_ENVOYHEALTHCHECKRESPONSE, - serialized_options=None, - create_key=_descriptor._internal_create_key, - ), _descriptor.MethodDescriptor( name='GetEnvoys', full_name='FederationDirector.GetEnvoys', diff --git a/openfl/protocols/director_pb2_grpc.py b/openfl/protocols/director_pb2_grpc.py index dfd673aec3..1f64aa4808 100644 --- a/openfl/protocols/director_pb2_grpc.py +++ b/openfl/protocols/director_pb2_grpc.py @@ -29,6 +29,11 @@ def 
__init__(self, channel): request_serializer=director__pb2.GetExperimentDataRequest.SerializeToString, response_deserializer=director__pb2.ExperimentData.FromString, ) + self.EnvoyHealthCheck = channel.unary_unary( + '/FederationDirector/EnvoyHealthCheck', + request_serializer=director__pb2.EnvoyStatus.SerializeToString, + response_deserializer=director__pb2.EnvoyHealthCheckResponse.FromString, + ) self.SetNewExperiment = channel.stream_unary( '/FederationDirector/SetNewExperiment', request_serializer=director__pb2.ExperimentInfo.SerializeToString, @@ -54,11 +59,6 @@ def __init__(self, channel): request_serializer=director__pb2.RemoveExperimentRequest.SerializeToString, response_deserializer=director__pb2.RemoveExperimentResponse.FromString, ) - self.EnvoyHealthCheck = channel.unary_unary( - '/FederationDirector/EnvoyHealthCheck', - request_serializer=director__pb2.EnvoyStatus.SerializeToString, - response_deserializer=director__pb2.EnvoyHealthCheckResponse.FromString, - ) self.GetEnvoys = channel.unary_unary( '/FederationDirector/GetEnvoys', request_serializer=director__pb2.GetEnvoysRequest.SerializeToString, @@ -89,6 +89,12 @@ def GetExperimentData(self, request, context): context.set_details('Method not implemented!') raise NotImplementedError('Method not implemented!') + def EnvoyHealthCheck(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') + def SetNewExperiment(self, request_iterator, context): """API RPCs """ @@ -120,12 +126,6 @@ def RemoveExperimentData(self, request, context): context.set_details('Method not implemented!') raise NotImplementedError('Method not implemented!') - def EnvoyHealthCheck(self, request, context): - """Missing associated documentation comment in .proto file.""" - context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') - def GetEnvoys(self, request, context): """Missing associated documentation comment in .proto file.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) @@ -150,6 +150,11 @@ def add_FederationDirectorServicer_to_server(servicer, server): request_deserializer=director__pb2.GetExperimentDataRequest.FromString, response_serializer=director__pb2.ExperimentData.SerializeToString, ), + 'EnvoyHealthCheck': grpc.unary_unary_rpc_method_handler( + servicer.EnvoyHealthCheck, + request_deserializer=director__pb2.EnvoyStatus.FromString, + response_serializer=director__pb2.EnvoyHealthCheckResponse.SerializeToString, + ), 'SetNewExperiment': grpc.stream_unary_rpc_method_handler( servicer.SetNewExperiment, request_deserializer=director__pb2.ExperimentInfo.FromString, @@ -175,11 +180,6 @@ def add_FederationDirectorServicer_to_server(servicer, server): request_deserializer=director__pb2.RemoveExperimentRequest.FromString, response_serializer=director__pb2.RemoveExperimentResponse.SerializeToString, ), - 'EnvoyHealthCheck': grpc.unary_unary_rpc_method_handler( - servicer.EnvoyHealthCheck, - request_deserializer=director__pb2.EnvoyStatus.FromString, - response_serializer=director__pb2.EnvoyHealthCheckResponse.SerializeToString, - ), 'GetEnvoys': grpc.unary_unary_rpc_method_handler( servicer.GetEnvoys, request_deserializer=director__pb2.GetEnvoysRequest.FromString, @@ -246,6 +246,23 @@ def GetExperimentData(request, options, channel_credentials, insecure, call_credentials, 
compression, wait_for_ready, timeout, metadata) + @staticmethod + def EnvoyHealthCheck(request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/FederationDirector/EnvoyHealthCheck', + director__pb2.EnvoyStatus.SerializeToString, + director__pb2.EnvoyHealthCheckResponse.FromString, + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + @staticmethod def SetNewExperiment(request_iterator, target, @@ -331,23 +348,6 @@ def RemoveExperimentData(request, options, channel_credentials, insecure, call_credentials, compression, wait_for_ready, timeout, metadata) - @staticmethod - def EnvoyHealthCheck(request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/FederationDirector/EnvoyHealthCheck', - director__pb2.EnvoyStatus.SerializeToString, - director__pb2.EnvoyHealthCheckResponse.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) - @staticmethod def GetEnvoys(request, target, diff --git a/openfl/transport/grpc/director_client.py b/openfl/transport/grpc/director_client.py index 0a8307a4b2..3410d9e6b1 100644 --- a/openfl/transport/grpc/director_client.py +++ b/openfl/transport/grpc/director_client.py @@ -63,7 +63,10 @@ def report_shard_info(self, shard_descriptor, cuda_devices) -> bool: ) shard_info.node_info.name = self.shard_name - shard_info.node_info.cuda_devices[:] = cuda_devices + shard_info.node_info.cuda_devices.extend( + director_pb2.CudaDeviceInfo(index=cuda_device) + for cuda_device in cuda_devices + ) acknowledgement = self.stub.AcknowledgeShard(shard_info) return acknowledgement.accepted @@ -95,12 +98,28 @@ def _get_experiment_data(self): """Generate the experiment data request.""" yield director_pb2.WaitExperimentRequest(collaborator_name=self.shard_name) - def send_health_check(self, *, envoy_name: str, is_experiment_running: bool) -> int: + def send_health_check(self, *, envoy_name: str, is_experiment_running: bool, + cuda_devices_info: dict = None, + cuda_driver_version: str = None) -> int: """Send envoy health check.""" status = director_pb2.EnvoyStatus( name=envoy_name, is_experiment_running=is_experiment_running, ) + + cuda_messages = [] + if cuda_devices_info is not None: + cuda_messages = [director_pb2.CudaDeviceInfo( + index=device_index, + memory_total=description_dict['memory_total'], + memory_utilized=description_dict['memory_used'], + device_utilization=description_dict['device_utilization'], + cuda_driver_version=cuda_driver_version + ) for device_index, description_dict in cuda_devices_info.items() + ] + + status.cuda_devices.extend(cuda_messages) + logger.debug(f'Sending health check status: {status}') response = self.stub.EnvoyHealthCheck(status) diff --git a/openfl/transport/grpc/director_server.py b/openfl/transport/grpc/director_server.py index ea42516551..ccb4dc3f78 100644 --- a/openfl/transport/grpc/director_server.py +++ b/openfl/transport/grpc/director_server.py @@ -226,14 +226,15 @@ async def RemoveExperimentData(self, request, context): # NOQA:N802 response.acknowledgement = True return response - async def CollaboratorHealthCheck(self, request, context): # NOQA:N802 + async def 
EnvoyHealthCheck(self, request, context): # NOQA:N802 """Accept health check from envoy.""" - logger.debug(f'Request CollaboratorHealthCheck has got: {request}') - health_check_period = self.director.collaborator_health_check( - collaborator_name=request.name, + logger.debug(f'Request EnvoyHealthCheck has got: {request}') + health_check_period = self.director.envoy_health_check( + envoy_name=request.name, is_experiment_running=request.is_experiment_running, + cuda_devices_status=request.cuda_devices ) - resp = director_pb2.CollaboratorHealthCheckResponse() + resp = director_pb2.EnvoyHealthCheckResponse() resp.health_check_period.seconds = health_check_period return resp From e487ac4cdb93eb94e98e650e14149e59a6c0f224 Mon Sep 17 00:00:00 2001 From: Igor Davidyuk Date: Fri, 10 Sep 2021 14:39:21 +0300 Subject: [PATCH 08/31] fixes --- .../PyTorch_Kvasir_UNet/director/director_config.yaml | 4 ++-- .../interactive_api/PyTorch_Kvasir_UNet/envoy/start_envoy.sh | 2 +- openfl/component/envoy/envoy.py | 2 +- openfl/interface/envoy.py | 3 +-- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/director/director_config.yaml b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/director/director_config.yaml index 6d73f42176..0ca1182b06 100644 --- a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/director/director_config.yaml +++ b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/director/director_config.yaml @@ -1,6 +1,6 @@ settings: listen_host: localhost - listen_port: 50051 + listen_port: 50050 sample_shape: ['300', '400', '3'] target_shape: ['300', '400'] - envoy_health_check_period: 60 # in seconds \ No newline at end of file + envoy_health_check_period: 5 # in seconds \ No newline at end of file diff --git a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/start_envoy.sh b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/start_envoy.sh index 1dfda52241..ae9b4c27a0 100755 --- a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/start_envoy.sh +++ b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/start_envoy.sh @@ -1,4 +1,4 @@ #!/bin/bash set -e -fx envoy start -n env_one --disable-tls --envoy-config-path envoy_config.yaml -dh localhost -dp 50051 +fx envoy start -n env_one --disable-tls --envoy-config-path envoy_config.yaml -dh localhost -dp 50050 diff --git a/openfl/component/envoy/envoy.py b/openfl/component/envoy/envoy.py index d8741d46eb..f2694a3219 100644 --- a/openfl/component/envoy/envoy.py +++ b/openfl/component/envoy/envoy.py @@ -97,7 +97,7 @@ def send_health_check(self): if self.cuda_device_monitor is not None: cuda_devices_info = {} for device_id in self.cuda_devices: - cuda_devices_info[str(device_id)] = { + cuda_devices_info[device_id] = { 'memory_total': self.cuda_device_monitor.get_device_memory_total(device_id), 'memory_used': diff --git a/openfl/interface/envoy.py b/openfl/interface/envoy.py index f36fb3441f..64c185826b 100644 --- a/openfl/interface/envoy.py +++ b/openfl/interface/envoy.py @@ -68,7 +68,7 @@ def start_(shard_name, director_host, director_port, tls, envoy_config_path, certificate = Path(certificate).absolute() envoy_params = envoy_config.get('params', {}) - for plugin_name, plugin_settings in envoy_config.get('optional_plugin_components', {}).items(): + for plugin_name, plugin_settings in envoy_params.get('optional_plugin_components', {}).items(): template = plugin_settings.get('template') if not template: raise Exception('You should put a template' @@ -124,7 
+124,6 @@ def create(envoy_path): def shard_descriptor_from_config(shard_config: dict): """Build a shard descriptor from config.""" - print(shard_config) template = shard_config.get('template') if not template: raise Exception('You should define a shard ' From a73321006feb7ea5c6f22f423b3717fae2cb82b4 Mon Sep 17 00:00:00 2001 From: "Davidyuk, Igor" Date: Thu, 16 Sep 2021 19:10:38 +0300 Subject: [PATCH 09/31] envoy represented as dict --- .../envoy/sd_requirements.txt | 3 +- .../workspace/PyTorch_Kvasir_UNet.ipynb | 6 +- openfl/component/director/director.py | 35 +- openfl/protocols/director_pb2.py | 521 ++++++------------ openfl/transport/grpc/director_server.py | 26 +- 5 files changed, 214 insertions(+), 377 deletions(-) diff --git a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/sd_requirements.txt b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/sd_requirements.txt index 92606011b1..5c89593e48 100644 --- a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/sd_requirements.txt +++ b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/sd_requirements.txt @@ -1,2 +1,3 @@ numpy -pillow \ No newline at end of file +pillow +nvidia-ml-py3 \ No newline at end of file diff --git a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/workspace/PyTorch_Kvasir_UNet.ipynb b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/workspace/PyTorch_Kvasir_UNet.ipynb index dceaea21b8..b65ecfa4bd 100644 --- a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/workspace/PyTorch_Kvasir_UNet.ipynb +++ b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/workspace/PyTorch_Kvasir_UNet.ipynb @@ -81,7 +81,7 @@ "\n", "# 2) Run with TLS disabled (trusted environment)\n", "# Federation can also determine local fqdn automatically\n", - "federation = Federation(client_id='frontend', director_node_fqdn='localhost', director_port='50051', tls=False)\n" + "federation = Federation(client_id='frontend', director_node_fqdn='localhost', director_port='50050', tls=False)\n" ] }, { @@ -588,7 +588,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -602,7 +602,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.12" + "version": "3.7.10" } }, "nbformat": 4, diff --git a/openfl/component/director/director.py b/openfl/component/director/director.py index 36377668c4..21c989b041 100644 --- a/openfl/component/director/director.py +++ b/openfl/component/director/director.py @@ -45,15 +45,15 @@ def __init__( self.settings = settings or {} self.col_exp_queues = defaultdict(asyncio.Queue) - def acknowledge_shard(self, shard_info: director_pb2.ShardInfo) -> bool: + def acknowledge_shard(self, shard_info: dict) -> bool: """Save shard info to shard registry if it's acceptable.""" is_accepted = False - if (self.sample_shape != shard_info.sample_shape - or self.target_shape != shard_info.target_shape): + if (self.sample_shape != shard_info['sample_shape'] + or self.target_shape != shard_info['target_shape']): logger.info('Request was not accepted') return is_accepted logger.info('Request was accepted') - self._shard_registry[shard_info.node_info.name] = { + self._shard_registry[shard_info['node_info']['name']] = { 'shard_info': shard_info, 'is_online': True, 'is_experiment_running': False @@ -117,9 +117,9 @@ def get_dataset_info(self): """Get dataset info.""" return self.sample_shape, self.target_shape - def get_registered_shards(self) -> list: - 
"""Get registered shard infos.""" - return [shard_status['shard_info'] for shard_status in self._shard_registry.values()] + # def get_registered_shards(self) -> list: + # """Get registered shard infos.""" + # return [shard_status['shard_info'] for shard_status in self._shard_registry.values()] async def stream_metrics(self, experiment_name: str, caller: str): """ @@ -184,26 +184,21 @@ def envoy_health_check( if cuda_devices_status is not None: for i in range(len(cuda_devices_status)): - shard_info['shard_info'].node_info.cuda_devices[i] = cuda_devices_status[i] + shard_info['shard_info']['node_info']['cuda_devices'][i] = cuda_devices_status[i] return hc_period def get_envoys(self) -> list: """Get a status information about envoys.""" + logger.info(f'Shard registry: {self._shard_registry}') - envoy_infos = [] - for envoy in self._shard_registry.values(): - envoy_info = director_pb2.EnvoyInfo( - shard_info=envoy['shard_info'], - is_online=time.time() < envoy['last_updated'] + envoy['valid_duration'], - is_experiment_running=envoy['is_experiment_running'] - ) - envoy_info.valid_duration.seconds = envoy['valid_duration'] - envoy_info.last_updated.seconds = int(envoy['last_updated']) - - envoy_infos.append(envoy_info) + for envoy_info in self._shard_registry.values(): + envoy_info['is_online'] = ( + time.time() < envoy_info['last_updated'] + + envoy_info['valid_duration'] + ) - return envoy_infos + return self._shard_registry.values() async def start_experiment_execution_loop(self): """Run task to monitor and run experiments.""" diff --git a/openfl/protocols/director_pb2.py b/openfl/protocols/director_pb2.py index 9d6732eebd..038f52217d 100644 --- a/openfl/protocols/director_pb2.py +++ b/openfl/protocols/director_pb2.py @@ -6,11 +6,11 @@ from google.protobuf import message as _message from google.protobuf import reflection as _reflection from google.protobuf import symbol_database as _symbol_database - # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() + from google.protobuf import timestamp_pb2 as google_dot_protobuf_dot_timestamp__pb2 from google.protobuf import duration_pb2 as google_dot_protobuf_dot_duration__pb2 import openfl.protocols.federation_pb2 as federation__pb2 @@ -21,7 +21,7 @@ syntax='proto3', serialized_options=None, create_key=_descriptor._internal_create_key, - serialized_pb=b'\n\x0e\x64irector.proto\x1a\x1fgoogle/protobuf/timestamp.proto\x1a\x1egoogle/protobuf/duration.proto\x1a\x10\x66\x65\x64\x65ration.proto\"\x1f\n\rRequestHeader\x12\x0e\n\x06sender\x18\x01 \x01(\t\"\x87\x01\n\x0e\x43udaDeviceInfo\x12\r\n\x05index\x18\x01 \x01(\x04\x12\x14\n\x0cmemory_total\x18\x02 \x01(\x04\x12\x17\n\x0fmemory_utilized\x18\x03 \x01(\x04\x12\x1a\n\x12\x64\x65vice_utilization\x18\x04 \x01(\t\x12\x1b\n\x13\x63uda_driver_version\x18\x05 \x01(\t\"?\n\x08NodeInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12%\n\x0c\x63uda_devices\x18\x02 \x03(\x0b\x32\x0f.CudaDeviceInfo\"\x83\x01\n\tShardInfo\x12\x1c\n\tnode_info\x18\x01 \x01(\x0b\x32\t.NodeInfo\x12\x19\n\x11shard_description\x18\x02 \x01(\t\x12\x11\n\tn_samples\x18\x03 \x01(\x04\x12\x14\n\x0csample_shape\x18\x04 \x03(\t\x12\x14\n\x0ctarget_shape\x18\x05 \x03(\t\"(\n\x14ShardAcknowledgement\x12\x10\n\x08\x61\x63\x63\x65pted\x18\x01 \x01(\x08\"2\n\x15WaitExperimentRequest\x12\x19\n\x11\x63ollaborator_name\x18\x01 \x01(\t\"1\n\x16WaitExperimentResponse\x12\x17\n\x0f\x65xperiment_name\x18\x01 \x01(\t\"N\n\x18GetExperimentDataRequest\x12\x17\n\x0f\x65xperiment_name\x18\x01 \x01(\t\x12\x19\n\x11\x63ollaborator_name\x18\x02 
\x01(\t\"/\n\x0e\x45xperimentData\x12\x0c\n\x04size\x18\x01 \x01(\r\x12\x0f\n\x07npbytes\x18\x02 \x01(\x0c\"a\n\x0b\x45nvoyStatus\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1d\n\x15is_experiment_running\x18\x02 \x01(\x08\x12%\n\x0c\x63uda_devices\x18\x03 \x03(\x0b\x32\x0f.CudaDeviceInfo\"R\n\x18\x45nvoyHealthCheckResponse\x12\x36\n\x13health_check_period\x18\x01 \x01(\x0b\x32\x19.google.protobuf.Duration\"\xa6\x01\n\x0e\x45xperimentInfo\x12\x1e\n\x06header\x18\x01 \x01(\x0b\x32\x0e.RequestHeader\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x1a\n\x12\x63ollaborator_names\x18\x03 \x03(\t\x12(\n\x0f\x65xperiment_data\x18\x04 \x01(\x0b\x32\x0f.ExperimentData\x12 \n\x0bmodel_proto\x18\x05 \x01(\x0b\x32\x0b.ModelProto\",\n\x18SetNewExperimentResponse\x12\x10\n\x08\x61\x63\x63\x65pted\x18\x01 \x01(\x08\"\xb5\x01\n\x16GetTrainedModelRequest\x12\x1e\n\x06header\x18\x01 \x01(\x0b\x32\x0e.RequestHeader\x12\x17\n\x0f\x65xperiment_name\x18\x02 \x01(\t\x12\x35\n\nmodel_type\x18\x03 \x01(\x0e\x32!.GetTrainedModelRequest.ModelType\"+\n\tModelType\x12\x0e\n\nBEST_MODEL\x10\x00\x12\x0e\n\nLAST_MODEL\x10\x01\"8\n\x14TrainedModelResponse\x12 \n\x0bmodel_proto\x18\x01 \x01(\x0b\x32\x0b.ModelProto\"7\n\x15GetDatasetInfoRequest\x12\x1e\n\x06header\x18\x01 \x01(\x0b\x32\x0e.RequestHeader\"O\n\x14StreamMetricsRequest\x12\x1e\n\x06header\x18\x01 \x01(\x0b\x32\x0e.RequestHeader\x12\x17\n\x0f\x65xperiment_name\x18\x02 \x01(\t\"{\n\x15StreamMetricsResponse\x12\x15\n\rmetric_origin\x18\x01 \x01(\t\x12\x11\n\ttask_name\x18\x02 \x01(\t\x12\x13\n\x0bmetric_name\x18\x03 \x01(\t\x12\x14\n\x0cmetric_value\x18\x04 \x01(\x02\x12\r\n\x05round\x18\x05 \x01(\r\"R\n\x17RemoveExperimentRequest\x12\x1e\n\x06header\x18\x01 \x01(\x0b\x32\x0e.RequestHeader\x12\x17\n\x0f\x65xperiment_name\x18\x02 \x01(\t\"3\n\x18RemoveExperimentResponse\x12\x17\n\x0f\x61\x63knowledgement\x18\x01 \x01(\x08\"\xc2\x01\n\tEnvoyInfo\x12\x1e\n\nshard_info\x18\x01 \x01(\x0b\x32\n.ShardInfo\x12\x11\n\tis_online\x18\x02 \x01(\x08\x12\x1d\n\x15is_experiment_running\x18\x03 \x01(\x08\x12\x30\n\x0clast_updated\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\x31\n\x0evalid_duration\x18\x05 \x01(\x0b\x32\x19.google.protobuf.Duration\"\x12\n\x10GetEnvoysRequest\"4\n\x11GetEnvoysResponse\x12\x1f\n\x0b\x65nvoy_infos\x18\x01 \x03(\x0b\x32\n.EnvoyInfo2\xa4\x05\n\x12\x46\x65\x64\x65rationDirector\x12\x37\n\x10\x41\x63knowledgeShard\x12\n.ShardInfo\x1a\x15.ShardAcknowledgement\"\x00\x12G\n\x0eWaitExperiment\x12\x16.WaitExperimentRequest\x1a\x17.WaitExperimentResponse\"\x00(\x01\x30\x01\x12\x43\n\x11GetExperimentData\x12\x19.GetExperimentDataRequest\x1a\x0f.ExperimentData\"\x00\x30\x01\x12=\n\x10\x45nvoyHealthCheck\x12\x0c.EnvoyStatus\x1a\x19.EnvoyHealthCheckResponse\"\x00\x12\x42\n\x10SetNewExperiment\x12\x0f.ExperimentInfo\x1a\x19.SetNewExperimentResponse\"\x00(\x01\x12\x36\n\x0eGetDatasetInfo\x12\x16.GetDatasetInfoRequest\x1a\n.ShardInfo\"\x00\x12\x43\n\x0fGetTrainedModel\x12\x17.GetTrainedModelRequest\x1a\x15.TrainedModelResponse\"\x00\x12\x42\n\rStreamMetrics\x12\x15.StreamMetricsRequest\x1a\x16.StreamMetricsResponse\"\x00\x30\x01\x12M\n\x14RemoveExperimentData\x12\x18.RemoveExperimentRequest\x1a\x19.RemoveExperimentResponse\"\x00\x12\x34\n\tGetEnvoys\x12\x11.GetEnvoysRequest\x1a\x12.GetEnvoysResponse\"\x00\x62\x06proto3' + serialized_pb=b'\n\x0e\x64irector.proto\x1a\x1fgoogle/protobuf/timestamp.proto\x1a\x1egoogle/protobuf/duration.proto\x1a\x10\x66\x65\x64\x65ration.proto\"\x87\x01\n\x0e\x43udaDeviceInfo\x12\r\n\x05index\x18\x01 
\x01(\x04\x12\x14\n\x0cmemory_total\x18\x02 \x01(\x04\x12\x17\n\x0fmemory_utilized\x18\x03 \x01(\x04\x12\x1a\n\x12\x64\x65vice_utilization\x18\x04 \x01(\t\x12\x1b\n\x13\x63uda_driver_version\x18\x05 \x01(\t\"?\n\x08NodeInfo\x12\x0c\n\x04name\x18\x01 \x01(\t\x12%\n\x0c\x63uda_devices\x18\x02 \x03(\x0b\x32\x0f.CudaDeviceInfo\"\x83\x01\n\tShardInfo\x12\x1c\n\tnode_info\x18\x01 \x01(\x0b\x32\t.NodeInfo\x12\x19\n\x11shard_description\x18\x02 \x01(\t\x12\x11\n\tn_samples\x18\x03 \x01(\x04\x12\x14\n\x0csample_shape\x18\x04 \x03(\t\x12\x14\n\x0ctarget_shape\x18\x05 \x03(\t\"(\n\x14ShardAcknowledgement\x12\x10\n\x08\x61\x63\x63\x65pted\x18\x01 \x01(\x08\"2\n\x15WaitExperimentRequest\x12\x19\n\x11\x63ollaborator_name\x18\x01 \x01(\t\"1\n\x16WaitExperimentResponse\x12\x17\n\x0f\x65xperiment_name\x18\x01 \x01(\t\"N\n\x18GetExperimentDataRequest\x12\x17\n\x0f\x65xperiment_name\x18\x01 \x01(\t\x12\x19\n\x11\x63ollaborator_name\x18\x02 \x01(\t\"/\n\x0e\x45xperimentData\x12\x0c\n\x04size\x18\x01 \x01(\r\x12\x0f\n\x07npbytes\x18\x02 \x01(\x0c\"a\n\x0b\x45nvoyStatus\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x1d\n\x15is_experiment_running\x18\x02 \x01(\x08\x12%\n\x0c\x63uda_devices\x18\x03 \x03(\x0b\x32\x0f.CudaDeviceInfo\"R\n\x18\x45nvoyHealthCheckResponse\x12\x36\n\x13health_check_period\x18\x01 \x01(\x0b\x32\x19.google.protobuf.Duration\"\x86\x01\n\x0e\x45xperimentInfo\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x1a\n\x12\x63ollaborator_names\x18\x03 \x03(\t\x12(\n\x0f\x65xperiment_data\x18\x04 \x01(\x0b\x32\x0f.ExperimentData\x12 \n\x0bmodel_proto\x18\x05 \x01(\x0b\x32\x0b.ModelProto\",\n\x18SetNewExperimentResponse\x12\x10\n\x08\x61\x63\x63\x65pted\x18\x01 \x01(\x08\"\x95\x01\n\x16GetTrainedModelRequest\x12\x17\n\x0f\x65xperiment_name\x18\x02 \x01(\t\x12\x35\n\nmodel_type\x18\x03 \x01(\x0e\x32!.GetTrainedModelRequest.ModelType\"+\n\tModelType\x12\x0e\n\nBEST_MODEL\x10\x00\x12\x0e\n\nLAST_MODEL\x10\x01\"8\n\x14TrainedModelResponse\x12 \n\x0bmodel_proto\x18\x01 \x01(\x0b\x32\x0b.ModelProto\"\x17\n\x15GetDatasetInfoRequest\"/\n\x14StreamMetricsRequest\x12\x17\n\x0f\x65xperiment_name\x18\x02 \x01(\t\"{\n\x15StreamMetricsResponse\x12\x15\n\rmetric_origin\x18\x01 \x01(\t\x12\x11\n\ttask_name\x18\x02 \x01(\t\x12\x13\n\x0bmetric_name\x18\x03 \x01(\t\x12\x14\n\x0cmetric_value\x18\x04 \x01(\x02\x12\r\n\x05round\x18\x05 \x01(\r\"2\n\x17RemoveExperimentRequest\x12\x17\n\x0f\x65xperiment_name\x18\x02 \x01(\t\"3\n\x18RemoveExperimentResponse\x12\x17\n\x0f\x61\x63knowledgement\x18\x01 \x01(\x08\"\xc2\x01\n\tEnvoyInfo\x12\x1e\n\nshard_info\x18\x01 \x01(\x0b\x32\n.ShardInfo\x12\x11\n\tis_online\x18\x02 \x01(\x08\x12\x1d\n\x15is_experiment_running\x18\x03 \x01(\x08\x12\x30\n\x0clast_updated\x18\x04 \x01(\x0b\x32\x1a.google.protobuf.Timestamp\x12\x31\n\x0evalid_duration\x18\x05 \x01(\x0b\x32\x19.google.protobuf.Duration\"\x12\n\x10GetEnvoysRequest\"4\n\x11GetEnvoysResponse\x12\x1f\n\x0b\x65nvoy_infos\x18\x01 
\x03(\x0b\x32\n.EnvoyInfo2\xa4\x05\n\x12\x46\x65\x64\x65rationDirector\x12\x37\n\x10\x41\x63knowledgeShard\x12\n.ShardInfo\x1a\x15.ShardAcknowledgement\"\x00\x12G\n\x0eWaitExperiment\x12\x16.WaitExperimentRequest\x1a\x17.WaitExperimentResponse\"\x00(\x01\x30\x01\x12\x43\n\x11GetExperimentData\x12\x19.GetExperimentDataRequest\x1a\x0f.ExperimentData\"\x00\x30\x01\x12=\n\x10\x45nvoyHealthCheck\x12\x0c.EnvoyStatus\x1a\x19.EnvoyHealthCheckResponse\"\x00\x12\x42\n\x10SetNewExperiment\x12\x0f.ExperimentInfo\x1a\x19.SetNewExperimentResponse\"\x00(\x01\x12\x36\n\x0eGetDatasetInfo\x12\x16.GetDatasetInfoRequest\x1a\n.ShardInfo\"\x00\x12\x43\n\x0fGetTrainedModel\x12\x17.GetTrainedModelRequest\x1a\x15.TrainedModelResponse\"\x00\x12\x42\n\rStreamMetrics\x12\x15.StreamMetricsRequest\x1a\x16.StreamMetricsResponse\"\x00\x30\x01\x12M\n\x14RemoveExperimentData\x12\x18.RemoveExperimentRequest\x1a\x19.RemoveExperimentResponse\"\x00\x12\x34\n\tGetEnvoys\x12\x11.GetEnvoysRequest\x1a\x12.GetEnvoysResponse\"\x00\x62\x06proto3' , dependencies=[google_dot_protobuf_dot_timestamp__pb2.DESCRIPTOR,google_dot_protobuf_dot_duration__pb2.DESCRIPTOR,federation__pb2.DESCRIPTOR,]) @@ -47,44 +47,12 @@ ], containing_type=None, serialized_options=None, - serialized_start=1282, - serialized_end=1325, + serialized_start=1185, + serialized_end=1228, ) _sym_db.RegisterEnumDescriptor(_GETTRAINEDMODELREQUEST_MODELTYPE) -_REQUESTHEADER = _descriptor.Descriptor( - name='RequestHeader', - full_name='RequestHeader', - filename=None, - file=DESCRIPTOR, - containing_type=None, - create_key=_descriptor._internal_create_key, - fields=[ - _descriptor.FieldDescriptor( - name='sender', full_name='RequestHeader.sender', index=0, - number=1, type=9, cpp_type=9, label=1, - has_default_value=False, default_value=b"".decode('utf-8'), - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - ], - extensions=[ - ], - nested_types=[], - enum_types=[ - ], - serialized_options=None, - is_extendable=False, - syntax='proto3', - extension_ranges=[], - oneofs=[ - ], - serialized_start=101, - serialized_end=132, -) - - _CUDADEVICEINFO = _descriptor.Descriptor( name='CudaDeviceInfo', full_name='CudaDeviceInfo', @@ -140,8 +108,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=135, - serialized_end=270, + serialized_start=102, + serialized_end=237, ) @@ -179,10 +147,11 @@ extension_ranges=[], oneofs=[ ], - serialized_start=272, - serialized_end=335, + serialized_start=239, + serialized_end=302, ) + _SHARDINFO = _descriptor.Descriptor( name='ShardInfo', full_name='ShardInfo', @@ -238,10 +207,11 @@ extension_ranges=[], oneofs=[ ], - serialized_start=338, - serialized_end=469, + serialized_start=305, + serialized_end=436, ) + _SHARDACKNOWLEDGEMENT = _descriptor.Descriptor( name='ShardAcknowledgement', full_name='ShardAcknowledgement', @@ -269,10 +239,11 @@ extension_ranges=[], oneofs=[ ], - serialized_start=471, - serialized_end=511, + serialized_start=438, + serialized_end=478, ) + _WAITEXPERIMENTREQUEST = _descriptor.Descriptor( name='WaitExperimentRequest', full_name='WaitExperimentRequest', @@ -300,10 +271,11 @@ extension_ranges=[], oneofs=[ ], - serialized_start=513, - serialized_end=563, + serialized_start=480, + serialized_end=530, ) + _WAITEXPERIMENTRESPONSE = _descriptor.Descriptor( name='WaitExperimentResponse', full_name='WaitExperimentResponse', @@ -331,10 +303,11 @@ extension_ranges=[], oneofs=[ ], - 
serialized_start=565, - serialized_end=614, + serialized_start=532, + serialized_end=581, ) + _GETEXPERIMENTDATAREQUEST = _descriptor.Descriptor( name='GetExperimentDataRequest', full_name='GetExperimentDataRequest', @@ -369,10 +342,11 @@ extension_ranges=[], oneofs=[ ], - serialized_start=616, - serialized_end=694, + serialized_start=583, + serialized_end=661, ) + _EXPERIMENTDATA = _descriptor.Descriptor( name='ExperimentData', full_name='ExperimentData', @@ -407,8 +381,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=696, - serialized_end=743, + serialized_start=663, + serialized_end=710, ) @@ -453,8 +427,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=745, - serialized_end=842, + serialized_start=712, + serialized_end=809, ) @@ -485,10 +459,11 @@ extension_ranges=[], oneofs=[ ], - serialized_start=844, - serialized_end=926, + serialized_start=811, + serialized_end=893, ) + _EXPERIMENTINFO = _descriptor.Descriptor( name='ExperimentInfo', full_name='ExperimentInfo', @@ -498,35 +473,28 @@ create_key=_descriptor._internal_create_key, fields=[ _descriptor.FieldDescriptor( - name='header', full_name='ExperimentInfo.header', index=0, - number=1, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='name', full_name='ExperimentInfo.name', index=1, + name='name', full_name='ExperimentInfo.name', index=0, number=2, type=9, cpp_type=9, label=1, has_default_value=False, default_value=b"".decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( - name='collaborator_names', full_name='ExperimentInfo.collaborator_names', index=2, + name='collaborator_names', full_name='ExperimentInfo.collaborator_names', index=1, number=3, type=9, cpp_type=9, label=3, has_default_value=False, default_value=[], message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( - name='experiment_data', full_name='ExperimentInfo.experiment_data', index=3, + name='experiment_data', full_name='ExperimentInfo.experiment_data', index=2, number=4, type=11, cpp_type=10, label=1, has_default_value=False, default_value=None, message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( - name='model_proto', full_name='ExperimentInfo.model_proto', index=4, + name='model_proto', full_name='ExperimentInfo.model_proto', index=3, number=5, type=11, cpp_type=10, label=1, has_default_value=False, default_value=None, message_type=None, enum_type=None, containing_type=None, @@ -544,10 +512,11 @@ extension_ranges=[], oneofs=[ ], - serialized_start=929, - serialized_end=1095, + serialized_start=896, + serialized_end=1030, ) + _SETNEWEXPERIMENTRESPONSE = _descriptor.Descriptor( name='SetNewExperimentResponse', full_name='SetNewExperimentResponse', @@ -575,10 +544,11 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1097, - serialized_end=1141, + serialized_start=1032, + serialized_end=1076, ) + 
_GETTRAINEDMODELREQUEST = _descriptor.Descriptor( name='GetTrainedModelRequest', full_name='GetTrainedModelRequest', @@ -588,21 +558,14 @@ create_key=_descriptor._internal_create_key, fields=[ _descriptor.FieldDescriptor( - name='header', full_name='GetTrainedModelRequest.header', index=0, - number=1, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='experiment_name', full_name='GetTrainedModelRequest.experiment_name', index=1, + name='experiment_name', full_name='GetTrainedModelRequest.experiment_name', index=0, number=2, type=9, cpp_type=9, label=1, has_default_value=False, default_value=b"".decode('utf-8'), message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), _descriptor.FieldDescriptor( - name='model_type', full_name='GetTrainedModelRequest.model_type', index=2, + name='model_type', full_name='GetTrainedModelRequest.model_type', index=1, number=3, type=14, cpp_type=8, label=1, has_default_value=False, default_value=0, message_type=None, enum_type=None, containing_type=None, @@ -621,10 +584,11 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1144, - serialized_end=1325, + serialized_start=1079, + serialized_end=1228, ) + _TRAINEDMODELRESPONSE = _descriptor.Descriptor( name='TrainedModelResponse', full_name='TrainedModelResponse', @@ -652,10 +616,11 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1327, - serialized_end=1383, + serialized_start=1230, + serialized_end=1286, ) + _GETDATASETINFOREQUEST = _descriptor.Descriptor( name='GetDatasetInfoRequest', full_name='GetDatasetInfoRequest', @@ -664,13 +629,6 @@ containing_type=None, create_key=_descriptor._internal_create_key, fields=[ - _descriptor.FieldDescriptor( - name='header', full_name='GetDatasetInfoRequest.header', index=0, - number=1, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), ], extensions=[ ], @@ -683,10 +641,11 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1385, - serialized_end=1440, + serialized_start=1288, + serialized_end=1311, ) + _STREAMMETRICSREQUEST = _descriptor.Descriptor( name='StreamMetricsRequest', full_name='StreamMetricsRequest', @@ -696,14 +655,7 @@ create_key=_descriptor._internal_create_key, fields=[ _descriptor.FieldDescriptor( - name='header', full_name='StreamMetricsRequest.header', index=0, - number=1, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='experiment_name', full_name='StreamMetricsRequest.experiment_name', index=1, + name='experiment_name', full_name='StreamMetricsRequest.experiment_name', index=0, number=2, type=9, cpp_type=9, label=1, has_default_value=False, default_value=b"".decode('utf-8'), message_type=None, enum_type=None, containing_type=None, @@ -721,10 +673,11 @@ extension_ranges=[], oneofs=[ ], - 
serialized_start=1442, - serialized_end=1521, + serialized_start=1313, + serialized_end=1360, ) + _STREAMMETRICSRESPONSE = _descriptor.Descriptor( name='StreamMetricsResponse', full_name='StreamMetricsResponse', @@ -780,10 +733,11 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1523, - serialized_end=1646, + serialized_start=1362, + serialized_end=1485, ) + _REMOVEEXPERIMENTREQUEST = _descriptor.Descriptor( name='RemoveExperimentRequest', full_name='RemoveExperimentRequest', @@ -793,14 +747,7 @@ create_key=_descriptor._internal_create_key, fields=[ _descriptor.FieldDescriptor( - name='header', full_name='RemoveExperimentRequest.header', index=0, - number=1, type=11, cpp_type=10, label=1, - has_default_value=False, default_value=None, - message_type=None, enum_type=None, containing_type=None, - is_extension=False, extension_scope=None, - serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), - _descriptor.FieldDescriptor( - name='experiment_name', full_name='RemoveExperimentRequest.experiment_name', index=1, + name='experiment_name', full_name='RemoveExperimentRequest.experiment_name', index=0, number=2, type=9, cpp_type=9, label=1, has_default_value=False, default_value=b"".decode('utf-8'), message_type=None, enum_type=None, containing_type=None, @@ -818,10 +765,11 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1648, - serialized_end=1730, + serialized_start=1487, + serialized_end=1537, ) + _REMOVEEXPERIMENTRESPONSE = _descriptor.Descriptor( name='RemoveExperimentResponse', full_name='RemoveExperimentResponse', @@ -849,10 +797,11 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1732, - serialized_end=1783, + serialized_start=1539, + serialized_end=1590, ) + _ENVOYINFO = _descriptor.Descriptor( name='EnvoyInfo', full_name='EnvoyInfo', @@ -908,10 +857,11 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1786, - serialized_end=1980, + serialized_start=1593, + serialized_end=1787, ) + _GETENVOYSREQUEST = _descriptor.Descriptor( name='GetEnvoysRequest', full_name='GetEnvoysRequest', @@ -932,10 +882,11 @@ extension_ranges=[], oneofs=[ ], - serialized_start=1982, - serialized_end=2000, + serialized_start=1789, + serialized_end=1807, ) + _GETENVOYSRESPONSE = _descriptor.Descriptor( name='GetEnvoysResponse', full_name='GetEnvoysResponse', @@ -963,30 +914,23 @@ extension_ranges=[], oneofs=[ ], - serialized_start=2002, - serialized_end=2054, + serialized_start=1809, + serialized_end=1861, ) _NODEINFO.fields_by_name['cuda_devices'].message_type = _CUDADEVICEINFO _SHARDINFO.fields_by_name['node_info'].message_type = _NODEINFO _ENVOYSTATUS.fields_by_name['cuda_devices'].message_type = _CUDADEVICEINFO _ENVOYHEALTHCHECKRESPONSE.fields_by_name['health_check_period'].message_type = google_dot_protobuf_dot_duration__pb2._DURATION -_EXPERIMENTINFO.fields_by_name['header'].message_type = _REQUESTHEADER _EXPERIMENTINFO.fields_by_name['experiment_data'].message_type = _EXPERIMENTDATA _EXPERIMENTINFO.fields_by_name['model_proto'].message_type = federation__pb2._MODELPROTO _GETTRAINEDMODELREQUEST.fields_by_name['model_type'].enum_type = _GETTRAINEDMODELREQUEST_MODELTYPE _GETTRAINEDMODELREQUEST_MODELTYPE.containing_type = _GETTRAINEDMODELREQUEST _TRAINEDMODELRESPONSE.fields_by_name['model_proto'].message_type = federation__pb2._MODELPROTO -_GETDATASETINFOREQUEST.fields_by_name['header'].message_type = _REQUESTHEADER -_STREAMMETRICSREQUEST.fields_by_name['header'].message_type = _REQUESTHEADER -_REMOVEEXPERIMENTREQUEST.fields_by_name['header'].message_type = 
_REQUESTHEADER _ENVOYINFO.fields_by_name['shard_info'].message_type = _SHARDINFO -_ENVOYINFO.fields_by_name[ - 'last_updated'].message_type = google_dot_protobuf_dot_timestamp__pb2._TIMESTAMP -_ENVOYINFO.fields_by_name[ - 'valid_duration'].message_type = google_dot_protobuf_dot_duration__pb2._DURATION +_ENVOYINFO.fields_by_name['last_updated'].message_type = google_dot_protobuf_dot_timestamp__pb2._TIMESTAMP +_ENVOYINFO.fields_by_name['valid_duration'].message_type = google_dot_protobuf_dot_duration__pb2._DURATION _GETENVOYSRESPONSE.fields_by_name['envoy_infos'].message_type = _ENVOYINFO -DESCRIPTOR.message_types_by_name['RequestHeader'] = _REQUESTHEADER DESCRIPTOR.message_types_by_name['CudaDeviceInfo'] = _CUDADEVICEINFO DESCRIPTOR.message_types_by_name['NodeInfo'] = _NODEINFO DESCRIPTOR.message_types_by_name['ShardInfo'] = _SHARDINFO @@ -1011,13 +955,6 @@ DESCRIPTOR.message_types_by_name['GetEnvoysResponse'] = _GETENVOYSRESPONSE _sym_db.RegisterFileDescriptor(DESCRIPTOR) -RequestHeader = _reflection.GeneratedProtocolMessageType('RequestHeader', (_message.Message,), { - 'DESCRIPTOR' : _REQUESTHEADER, - '__module__' : 'director_pb2' - # @@protoc_insertion_point(class_scope:RequestHeader) - }) -_sym_db.RegisterMessage(RequestHeader) - CudaDeviceInfo = _reflection.GeneratedProtocolMessageType('CudaDeviceInfo', (_message.Message,), { 'DESCRIPTOR' : _CUDADEVICEINFO, '__module__' : 'director_pb2' @@ -1026,56 +963,52 @@ _sym_db.RegisterMessage(CudaDeviceInfo) NodeInfo = _reflection.GeneratedProtocolMessageType('NodeInfo', (_message.Message,), { - 'DESCRIPTOR': _NODEINFO, - '__module__': 'director_pb2' - # @@protoc_insertion_point(class_scope:NodeInfo) -}) + 'DESCRIPTOR' : _NODEINFO, + '__module__' : 'director_pb2' + # @@protoc_insertion_point(class_scope:NodeInfo) + }) _sym_db.RegisterMessage(NodeInfo) ShardInfo = _reflection.GeneratedProtocolMessageType('ShardInfo', (_message.Message,), { - 'DESCRIPTOR': _SHARDINFO, - '__module__': 'director_pb2' - # @@protoc_insertion_point(class_scope:ShardInfo) -}) + 'DESCRIPTOR' : _SHARDINFO, + '__module__' : 'director_pb2' + # @@protoc_insertion_point(class_scope:ShardInfo) + }) _sym_db.RegisterMessage(ShardInfo) -ShardAcknowledgement = _reflection.GeneratedProtocolMessageType('ShardAcknowledgement', - (_message.Message,), { - 'DESCRIPTOR': _SHARDACKNOWLEDGEMENT, - '__module__': 'director_pb2' - # @@protoc_insertion_point(class_scope:ShardAcknowledgement) - }) +ShardAcknowledgement = _reflection.GeneratedProtocolMessageType('ShardAcknowledgement', (_message.Message,), { + 'DESCRIPTOR' : _SHARDACKNOWLEDGEMENT, + '__module__' : 'director_pb2' + # @@protoc_insertion_point(class_scope:ShardAcknowledgement) + }) _sym_db.RegisterMessage(ShardAcknowledgement) -WaitExperimentRequest = _reflection.GeneratedProtocolMessageType('WaitExperimentRequest', - (_message.Message,), { - 'DESCRIPTOR': _WAITEXPERIMENTREQUEST, - '__module__': 'director_pb2' - # @@protoc_insertion_point(class_scope:WaitExperimentRequest) - }) +WaitExperimentRequest = _reflection.GeneratedProtocolMessageType('WaitExperimentRequest', (_message.Message,), { + 'DESCRIPTOR' : _WAITEXPERIMENTREQUEST, + '__module__' : 'director_pb2' + # @@protoc_insertion_point(class_scope:WaitExperimentRequest) + }) _sym_db.RegisterMessage(WaitExperimentRequest) -WaitExperimentResponse = _reflection.GeneratedProtocolMessageType('WaitExperimentResponse', - (_message.Message,), { - 'DESCRIPTOR': _WAITEXPERIMENTRESPONSE, - '__module__': 'director_pb2' - # @@protoc_insertion_point(class_scope:WaitExperimentResponse) - }) 
+WaitExperimentResponse = _reflection.GeneratedProtocolMessageType('WaitExperimentResponse', (_message.Message,), { + 'DESCRIPTOR' : _WAITEXPERIMENTRESPONSE, + '__module__' : 'director_pb2' + # @@protoc_insertion_point(class_scope:WaitExperimentResponse) + }) _sym_db.RegisterMessage(WaitExperimentResponse) -GetExperimentDataRequest = _reflection.GeneratedProtocolMessageType('GetExperimentDataRequest', - (_message.Message,), { - 'DESCRIPTOR': _GETEXPERIMENTDATAREQUEST, - '__module__': 'director_pb2' - # @@protoc_insertion_point(class_scope:GetExperimentDataRequest) - }) +GetExperimentDataRequest = _reflection.GeneratedProtocolMessageType('GetExperimentDataRequest', (_message.Message,), { + 'DESCRIPTOR' : _GETEXPERIMENTDATAREQUEST, + '__module__' : 'director_pb2' + # @@protoc_insertion_point(class_scope:GetExperimentDataRequest) + }) _sym_db.RegisterMessage(GetExperimentDataRequest) ExperimentData = _reflection.GeneratedProtocolMessageType('ExperimentData', (_message.Message,), { - 'DESCRIPTOR': _EXPERIMENTDATA, - '__module__': 'director_pb2' - # @@protoc_insertion_point(class_scope:ExperimentData) -}) + 'DESCRIPTOR' : _EXPERIMENTDATA, + '__module__' : 'director_pb2' + # @@protoc_insertion_point(class_scope:ExperimentData) + }) _sym_db.RegisterMessage(ExperimentData) EnvoyStatus = _reflection.GeneratedProtocolMessageType('EnvoyStatus', (_message.Message,), { @@ -1093,99 +1026,91 @@ _sym_db.RegisterMessage(EnvoyHealthCheckResponse) ExperimentInfo = _reflection.GeneratedProtocolMessageType('ExperimentInfo', (_message.Message,), { - 'DESCRIPTOR': _EXPERIMENTINFO, - '__module__': 'director_pb2' - # @@protoc_insertion_point(class_scope:ExperimentInfo) -}) + 'DESCRIPTOR' : _EXPERIMENTINFO, + '__module__' : 'director_pb2' + # @@protoc_insertion_point(class_scope:ExperimentInfo) + }) _sym_db.RegisterMessage(ExperimentInfo) -SetNewExperimentResponse = _reflection.GeneratedProtocolMessageType('SetNewExperimentResponse', - (_message.Message,), { - 'DESCRIPTOR': _SETNEWEXPERIMENTRESPONSE, - '__module__': 'director_pb2' - # @@protoc_insertion_point(class_scope:SetNewExperimentResponse) - }) +SetNewExperimentResponse = _reflection.GeneratedProtocolMessageType('SetNewExperimentResponse', (_message.Message,), { + 'DESCRIPTOR' : _SETNEWEXPERIMENTRESPONSE, + '__module__' : 'director_pb2' + # @@protoc_insertion_point(class_scope:SetNewExperimentResponse) + }) _sym_db.RegisterMessage(SetNewExperimentResponse) -GetTrainedModelRequest = _reflection.GeneratedProtocolMessageType('GetTrainedModelRequest', - (_message.Message,), { - 'DESCRIPTOR': _GETTRAINEDMODELREQUEST, - '__module__': 'director_pb2' - # @@protoc_insertion_point(class_scope:GetTrainedModelRequest) - }) +GetTrainedModelRequest = _reflection.GeneratedProtocolMessageType('GetTrainedModelRequest', (_message.Message,), { + 'DESCRIPTOR' : _GETTRAINEDMODELREQUEST, + '__module__' : 'director_pb2' + # @@protoc_insertion_point(class_scope:GetTrainedModelRequest) + }) _sym_db.RegisterMessage(GetTrainedModelRequest) -TrainedModelResponse = _reflection.GeneratedProtocolMessageType('TrainedModelResponse', - (_message.Message,), { - 'DESCRIPTOR': _TRAINEDMODELRESPONSE, - '__module__': 'director_pb2' - # @@protoc_insertion_point(class_scope:TrainedModelResponse) - }) +TrainedModelResponse = _reflection.GeneratedProtocolMessageType('TrainedModelResponse', (_message.Message,), { + 'DESCRIPTOR' : _TRAINEDMODELRESPONSE, + '__module__' : 'director_pb2' + # @@protoc_insertion_point(class_scope:TrainedModelResponse) + }) _sym_db.RegisterMessage(TrainedModelResponse) 
-GetDatasetInfoRequest = _reflection.GeneratedProtocolMessageType('GetDatasetInfoRequest', - (_message.Message,), { - 'DESCRIPTOR': _GETDATASETINFOREQUEST, - '__module__': 'director_pb2' - # @@protoc_insertion_point(class_scope:GetDatasetInfoRequest) - }) +GetDatasetInfoRequest = _reflection.GeneratedProtocolMessageType('GetDatasetInfoRequest', (_message.Message,), { + 'DESCRIPTOR' : _GETDATASETINFOREQUEST, + '__module__' : 'director_pb2' + # @@protoc_insertion_point(class_scope:GetDatasetInfoRequest) + }) _sym_db.RegisterMessage(GetDatasetInfoRequest) -StreamMetricsRequest = _reflection.GeneratedProtocolMessageType('StreamMetricsRequest', - (_message.Message,), { - 'DESCRIPTOR': _STREAMMETRICSREQUEST, - '__module__': 'director_pb2' - # @@protoc_insertion_point(class_scope:StreamMetricsRequest) - }) +StreamMetricsRequest = _reflection.GeneratedProtocolMessageType('StreamMetricsRequest', (_message.Message,), { + 'DESCRIPTOR' : _STREAMMETRICSREQUEST, + '__module__' : 'director_pb2' + # @@protoc_insertion_point(class_scope:StreamMetricsRequest) + }) _sym_db.RegisterMessage(StreamMetricsRequest) -StreamMetricsResponse = _reflection.GeneratedProtocolMessageType('StreamMetricsResponse', - (_message.Message,), { - 'DESCRIPTOR': _STREAMMETRICSRESPONSE, - '__module__': 'director_pb2' - # @@protoc_insertion_point(class_scope:StreamMetricsResponse) - }) +StreamMetricsResponse = _reflection.GeneratedProtocolMessageType('StreamMetricsResponse', (_message.Message,), { + 'DESCRIPTOR' : _STREAMMETRICSRESPONSE, + '__module__' : 'director_pb2' + # @@protoc_insertion_point(class_scope:StreamMetricsResponse) + }) _sym_db.RegisterMessage(StreamMetricsResponse) -RemoveExperimentRequest = _reflection.GeneratedProtocolMessageType('RemoveExperimentRequest', - (_message.Message,), { - 'DESCRIPTOR': _REMOVEEXPERIMENTREQUEST, - '__module__': 'director_pb2' - # @@protoc_insertion_point(class_scope:RemoveExperimentRequest) - }) +RemoveExperimentRequest = _reflection.GeneratedProtocolMessageType('RemoveExperimentRequest', (_message.Message,), { + 'DESCRIPTOR' : _REMOVEEXPERIMENTREQUEST, + '__module__' : 'director_pb2' + # @@protoc_insertion_point(class_scope:RemoveExperimentRequest) + }) _sym_db.RegisterMessage(RemoveExperimentRequest) -RemoveExperimentResponse = _reflection.GeneratedProtocolMessageType('RemoveExperimentResponse', - (_message.Message,), { - 'DESCRIPTOR': _REMOVEEXPERIMENTRESPONSE, - '__module__': 'director_pb2' - # @@protoc_insertion_point(class_scope:RemoveExperimentResponse) - }) +RemoveExperimentResponse = _reflection.GeneratedProtocolMessageType('RemoveExperimentResponse', (_message.Message,), { + 'DESCRIPTOR' : _REMOVEEXPERIMENTRESPONSE, + '__module__' : 'director_pb2' + # @@protoc_insertion_point(class_scope:RemoveExperimentResponse) + }) _sym_db.RegisterMessage(RemoveExperimentResponse) EnvoyInfo = _reflection.GeneratedProtocolMessageType('EnvoyInfo', (_message.Message,), { - 'DESCRIPTOR': _ENVOYINFO, - '__module__': 'director_pb2' - # @@protoc_insertion_point(class_scope:EnvoyInfo) -}) + 'DESCRIPTOR' : _ENVOYINFO, + '__module__' : 'director_pb2' + # @@protoc_insertion_point(class_scope:EnvoyInfo) + }) _sym_db.RegisterMessage(EnvoyInfo) -GetEnvoysRequest = _reflection.GeneratedProtocolMessageType('GetEnvoysRequest', - (_message.Message,), { - 'DESCRIPTOR': _GETENVOYSREQUEST, - '__module__': 'director_pb2' - # @@protoc_insertion_point(class_scope:GetEnvoysRequest) - }) +GetEnvoysRequest = _reflection.GeneratedProtocolMessageType('GetEnvoysRequest', (_message.Message,), { + 'DESCRIPTOR' : 
_GETENVOYSREQUEST, + '__module__' : 'director_pb2' + # @@protoc_insertion_point(class_scope:GetEnvoysRequest) + }) _sym_db.RegisterMessage(GetEnvoysRequest) -GetEnvoysResponse = _reflection.GeneratedProtocolMessageType('GetEnvoysResponse', - (_message.Message,), { - 'DESCRIPTOR': _GETENVOYSRESPONSE, - '__module__': 'director_pb2' - # @@protoc_insertion_point(class_scope:GetEnvoysResponse) - }) +GetEnvoysResponse = _reflection.GeneratedProtocolMessageType('GetEnvoysResponse', (_message.Message,), { + 'DESCRIPTOR' : _GETENVOYSRESPONSE, + '__module__' : 'director_pb2' + # @@protoc_insertion_point(class_scope:GetEnvoysResponse) + }) _sym_db.RegisterMessage(GetEnvoysResponse) + + _FEDERATIONDIRECTOR = _descriptor.ServiceDescriptor( name='FederationDirector', full_name='FederationDirector', @@ -1193,8 +1118,8 @@ index=0, serialized_options=None, create_key=_descriptor._internal_create_key, - serialized_start=2057, - serialized_end=2733, + serialized_start=1864, + serialized_end=2540, methods=[ _descriptor.MethodDescriptor( name='AcknowledgeShard', @@ -1295,110 +1220,8 @@ output_type=_GETENVOYSRESPONSE, serialized_options=None, create_key=_descriptor._internal_create_key, - serialized_start=1752, - serialized_end=2449, - methods=[ - _descriptor.MethodDescriptor( - name='AcknowledgeShard', - full_name='FederationDirector.AcknowledgeShard', - index=0, - containing_service=None, - input_type=_SHARDINFO, - output_type=_SHARDACKNOWLEDGEMENT, - serialized_options=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.MethodDescriptor( - name='WaitExperiment', - full_name='FederationDirector.WaitExperiment', - index=1, - containing_service=None, - input_type=_WAITEXPERIMENTREQUEST, - output_type=_WAITEXPERIMENTRESPONSE, - serialized_options=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.MethodDescriptor( - name='GetExperimentData', - full_name='FederationDirector.GetExperimentData', - index=2, - containing_service=None, - input_type=_GETEXPERIMENTDATAREQUEST, - output_type=_EXPERIMENTDATA, - serialized_options=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.MethodDescriptor( - name='SetNewExperiment', - full_name='FederationDirector.SetNewExperiment', - index=3, - containing_service=None, - input_type=_EXPERIMENTINFO, - output_type=_SETNEWEXPERIMENTRESPONSE, - serialized_options=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.MethodDescriptor( - name='GetDatasetInfo', - full_name='FederationDirector.GetDatasetInfo', - index=4, - containing_service=None, - input_type=_GETDATASETINFOREQUEST, - output_type=_SHARDINFO, - serialized_options=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.MethodDescriptor( - name='GetTrainedModel', - full_name='FederationDirector.GetTrainedModel', - index=5, - containing_service=None, - input_type=_GETTRAINEDMODELREQUEST, - output_type=_TRAINEDMODELRESPONSE, - serialized_options=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.MethodDescriptor( - name='StreamMetrics', - full_name='FederationDirector.StreamMetrics', - index=6, - containing_service=None, - input_type=_STREAMMETRICSREQUEST, - output_type=_STREAMMETRICSRESPONSE, - serialized_options=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.MethodDescriptor( - name='RemoveExperimentData', - full_name='FederationDirector.RemoveExperimentData', - index=7, - containing_service=None, - input_type=_REMOVEEXPERIMENTREQUEST, - output_type=_REMOVEEXPERIMENTRESPONSE, - 
serialized_options=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.MethodDescriptor( - name='CollaboratorHealthCheck', - full_name='FederationDirector.CollaboratorHealthCheck', - index=8, - containing_service=None, - input_type=_COLLABORATORSTATUS, - output_type=_COLLABORATORHEALTHCHECKRESPONSE, - serialized_options=None, - create_key=_descriptor._internal_create_key, - ), - _descriptor.MethodDescriptor( - name='GetEnvoys', - full_name='FederationDirector.GetEnvoys', - index=9, - containing_service=None, - input_type=_GETENVOYSREQUEST, - output_type=_GETENVOYSRESPONSE, - serialized_options=None, - create_key=_descriptor._internal_create_key, - ), - ]) + ), +]) _sym_db.RegisterServiceDescriptor(_FEDERATIONDIRECTOR) DESCRIPTOR.services_by_name['FederationDirector'] = _FEDERATIONDIRECTOR diff --git a/openfl/transport/grpc/director_server.py b/openfl/transport/grpc/director_server.py index ccb4dc3f78..e068921274 100644 --- a/openfl/transport/grpc/director_server.py +++ b/openfl/transport/grpc/director_server.py @@ -8,6 +8,8 @@ import uuid from pathlib import Path +from google.protobuf.json_format import MessageToDict +from google.protobuf.json_format import ParseDict from grpc import aio from grpc import ssl_server_credentials @@ -105,7 +107,8 @@ async def _run_server(self): async def AcknowledgeShard(self, shard_info, context): # NOQA:N802 """Receive acknowledge shard info.""" logger.info(f'AcknowledgeShard request has got: {shard_info}') - is_accepted = self.director.acknowledge_shard(shard_info) + dict_shard_info = MessageToDict(shard_info, preserving_proto_field_name=True) + is_accepted = self.director.acknowledge_shard(dict_shard_info) reply = director_pb2.ShardAcknowledgement(accepted=is_accepted) return reply @@ -229,10 +232,14 @@ async def RemoveExperimentData(self, request, context): # NOQA:N802 async def EnvoyHealthCheck(self, request, context): # NOQA:N802 """Accept health check from envoy.""" logger.debug(f'Request EnvoyHealthCheck has got: {request}') + cuda_devices_info = [ + MessageToDict(message, preserving_proto_field_name=True) + for message in request.cuda_devices + ] health_check_period = self.director.envoy_health_check( envoy_name=request.name, is_experiment_running=request.is_experiment_running, - cuda_devices_status=request.cuda_devices + cuda_devices_status=cuda_devices_info ) resp = director_pb2.EnvoyHealthCheckResponse() resp.health_check_period.seconds = health_check_period @@ -242,5 +249,16 @@ async def EnvoyHealthCheck(self, request, context): # NOQA:N802 async def GetEnvoys(self, request, context): # NOQA:N802 """Get a status information about envoys.""" envoy_infos = self.director.get_envoys() - - return director_pb2.GetEnvoysResponse(envoy_infos=envoy_infos) + response = [] + for envoy_info in envoy_infos: + envoy_info_message = director_pb2.EnvoyInfo( + shard_info=ParseDict(envoy_info['shard_info'], director_pb2.ShardInfo(), + ignore_unknown_fields=True), + is_online=envoy_info['is_online'], + is_experiment_running=envoy_info['is_experiment_running']) + envoy_info_message.valid_duration.seconds = envoy_info['valid_duration'] + envoy_info_message.last_updated.seconds = int(envoy_info['last_updated']) + + response.append(envoy_info_message) + + return director_pb2.GetEnvoysResponse(envoy_infos=response) From fbb85c56817c0b52e4dc035918c1ed2206e3ce6c Mon Sep 17 00:00:00 2001 From: "Davidyuk, Igor" Date: Fri, 17 Sep 2021 18:19:51 +0300 Subject: [PATCH 10/31] working example --- .../envoy/envoy_config.yaml | 2 +- .../envoy/envoy_config2.yaml 
| 14 ++++++++ .../PyTorch_Kvasir_UNet/envoy/start_envoy2.sh | 4 +++ .../workspace/PyTorch_Kvasir_UNet.ipynb | 30 ++++++++++------- openfl/component/collaborator/collaborator.py | 28 ++++++++++++++++ openfl/component/envoy/envoy.py | 1 + .../interface/interactive_api/experiment.py | 33 +++++++++---------- 7 files changed, 83 insertions(+), 29 deletions(-) create mode 100644 openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config2.yaml create mode 100644 openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/start_envoy2.sh diff --git a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config.yaml b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config.yaml index 35d1abb9c8..0677b34f11 100644 --- a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config.yaml +++ b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config.yaml @@ -9,6 +9,6 @@ shard_descriptor: template: kvasir_shard_descriptor.KvasirShardDescriptor params: data_folder: kvasir_data - rank_worldsize: 1,90 + rank_worldsize: 1,10 enforce_image_hw: '300,400' \ No newline at end of file diff --git a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config2.yaml b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config2.yaml new file mode 100644 index 0000000000..ce1a0d9c0c --- /dev/null +++ b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config2.yaml @@ -0,0 +1,14 @@ +params: + cuda_devices: [] + optional_plugin_components: + cuda_device_monitor: + template: openfl.plugins.processing_units_monitor.pynvml_monitor.PynvmlCUDADeviceMonitor + settings: [] + +shard_descriptor: + template: kvasir_shard_descriptor.KvasirShardDescriptor + params: + data_folder: kvasir_data + rank_worldsize: 2,10 + enforce_image_hw: '300,400' + \ No newline at end of file diff --git a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/start_envoy2.sh b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/start_envoy2.sh new file mode 100644 index 0000000000..d30661f66e --- /dev/null +++ b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/start_envoy2.sh @@ -0,0 +1,4 @@ +#!/bin/bash +set -e + +fx envoy start -n env_two --disable-tls --envoy-config-path envoy_config2.yaml -dh localhost -dp 50050 diff --git a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/workspace/PyTorch_Kvasir_UNet.ipynb b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/workspace/PyTorch_Kvasir_UNet.ipynb index b65ecfa4bd..a84dfc27b9 100644 --- a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/workspace/PyTorch_Kvasir_UNet.ipynb +++ b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/workspace/PyTorch_Kvasir_UNet.ipynb @@ -44,7 +44,7 @@ "outputs": [], "source": [ "# Install dependencies if not already installed\n", - "!pip install torchvision==0.8.1" + "!pip install torchvision" ] }, { @@ -91,6 +91,11 @@ "metadata": {}, "outputs": [], "source": [ + "# import time\n", + "# while True:\n", + "# shard_registry = federation.get_shard_registry()\n", + "# print(shard_registry)\n", + "# time.sleep(5)\n", "shard_registry = federation.get_shard_registry()\n", "shard_registry" ] @@ -385,10 +390,12 @@ " device='device', optimizer='optimizer') \n", "@TI.set_aggregation_function(aggregation_function)\n", "def train(unet_model, train_loader, optimizer, device, loss_fn=soft_dice_loss, some_parameter=None):\n", - " if not torch.cuda.is_available():\n", - " device = 'cpu'\n", - " else:\n", - " device = 'cuda'\n", + "# if not 
torch.cuda.is_available():\n", + "# device = 'cpu'\n", + "# else:\n", + "# device = 'cuda'\n", + "\n", + " print(f'\\n\\n TASK TRAIN GOT DEVICE {device}\\n\\n')\n", " \n", " function_defined_in_notebook(some_parameter)\n", " \n", @@ -414,11 +421,12 @@ "\n", "@TI.register_fl_task(model='unet_model', data_loader='val_loader', device='device') \n", "def validate(unet_model, val_loader, device):\n", - " if not torch.cuda.is_available():\n", - " device = 'cpu'\n", - " else:\n", - " device = 'cuda'\n", - " \n", + "# if not torch.cuda.is_available():\n", + "# device = 'cpu'\n", + "# else:\n", + "# device = 'cuda'\n", + " print(f'\\n\\n TASK VALIDATE GOT DEVICE {device}\\n\\n')\n", + " \n", " unet_model.eval()\n", " unet_model.to(device)\n", " \n", @@ -475,7 +483,7 @@ " data_loader=fed_dataset,\n", " rounds_to_train=2,\n", " opt_treatment='CONTINUE_GLOBAL',\n", - " )\n" + " device_assignment_policy='CUDA_PREFERRED')\n" ] }, { diff --git a/openfl/component/collaborator/collaborator.py b/openfl/component/collaborator/collaborator.py index 440fa25701..91b3deec41 100644 --- a/openfl/component/collaborator/collaborator.py +++ b/openfl/component/collaborator/collaborator.py @@ -6,6 +6,7 @@ from enum import Enum from logging import getLogger from time import sleep +from typing import Tuple from openfl.databases import TensorDB from openfl.pipelines import NoCompressionPipeline @@ -13,6 +14,12 @@ from openfl.protocols import utils from openfl.utilities import TensorKey +class DevicePolicy(Enum): + """Device assignment policy.""" + + CPU_ONLY = 1 + + CUDA_PREFERRED = 2 class OptTreatment(Enum): """Optimizer Methods.""" @@ -68,6 +75,7 @@ def __init__(self, task_runner, task_config, opt_treatment=OptTreatment.RESET, + device_assignment_policy=DevicePolicy.CPU_ONLY, delta_updates=False, compression_pipeline=None, db_store_rounds=1, @@ -104,8 +112,22 @@ def __init__(self, self.logger.error(f'Unknown opt_treatment: {opt_treatment}.') raise NotImplementedError(f'Unknown opt_treatment: {opt_treatment}.') + if hasattr(DevicePolicy, device_assignment_policy): + self.device_assignment_policy = DevicePolicy[device_assignment_policy] + else: + self.logger.error(f'Unknown device_assignment_policy: {device_assignment_policy}.') + raise NotImplementedError(f'Unknown device_assignment_policy: {device_assignment_policy}.') + self.task_runner.set_optimizer_treatment(self.opt_treatment.name) + def set_available_devices(self, cuda: Tuple[str]=[]): + """ + Set available CUDA devices. + + Cuda tuple contains string indeces, ('1', '3'). 
+ """ + self.cuda_devices = cuda + def run(self): """Run the collaborator.""" while True: @@ -162,6 +184,12 @@ def do_task(self, task, round_number): func_name = self.task_config[task]['function'] kwargs = self.task_config[task]['kwargs'] + if (self.device_assignment_policy.name == 'CUDA_PREFERRED' and + len(self.cuda_devices) > 0): + kwargs['device'] = 'cuda:' + str(self.cuda_devices[0]) + else: + kwargs['device'] = 'cpu' + # this would return a list of what tensors we require as TensorKeys required_tensorkeys_relative = self.task_runner.get_required_tensorkeys_for_function( func_name, diff --git a/openfl/component/envoy/envoy.py b/openfl/component/envoy/envoy.py index f2694a3219..3ddc65a765 100644 --- a/openfl/component/envoy/envoy.py +++ b/openfl/component/envoy/envoy.py @@ -126,6 +126,7 @@ def _run_collaborator(self, plan='plan/plan.yaml'): col = plan.get_collaborator(self.name, self.root_certificate, self.private_key, self.certificate, shard_descriptor=self.shard_descriptor) + col.set_available_devices(cuda=self.cuda_devices) col.run() def start(self): diff --git a/openfl/interface/interactive_api/experiment.py b/openfl/interface/interactive_api/experiment.py index 0ac5bff674..e9025423cc 100644 --- a/openfl/interface/interactive_api/experiment.py +++ b/openfl/interface/interactive_api/experiment.py @@ -121,18 +121,8 @@ def remove_experiment_data(self): self.logger.info(log_message) - def prepare_workspace_distribution( - self, model_provider, task_keeper, data_loader, - rounds_to_train, - delta_updates=False, opt_treatment='RESET'): + def prepare_workspace_distribution(self, model_provider, task_keeper, data_loader): """Prepare an archive from a user workspace.""" - self._prepare_plan(model_provider, task_keeper, data_loader, - rounds_to_train, - delta_updates=delta_updates, opt_treatment=opt_treatment, - model_interface_file='model_obj.pkl', - tasks_interface_file='tasks_obj.pkl', - dataloader_interface_file='loader_obj.pkl') - # Save serialized python objects to disc self._serialize_interface_objects(model_provider, task_keeper, data_loader) # Save the prepared plan @@ -145,15 +135,22 @@ def prepare_workspace_distribution( # Compress te workspace to restore it on collaborator self.arch_path = self._pack_the_workspace() - # DO CERTIFICATES exchange - def start(self, *, model_provider, task_keeper, data_loader, - rounds_to_train, delta_updates=False, opt_treatment='RESET'): + rounds_to_train, delta_updates=False, opt_treatment='RESET', + device_assignment_policy='CPU_ONLY'): """Prepare experiment and run.""" + self._prepare_plan(model_provider, task_keeper, data_loader, + rounds_to_train, + delta_updates=delta_updates, opt_treatment=opt_treatment, + device_assignment_policy=device_assignment_policy, + model_interface_file='model_obj.pkl', + tasks_interface_file='tasks_obj.pkl', + dataloader_interface_file='loader_obj.pkl') + self.prepare_workspace_distribution( - model_provider, task_keeper, data_loader, - rounds_to_train, delta_updates=delta_updates, opt_treatment=opt_treatment + model_provider, task_keeper, data_loader ) + self.logger.info('Starting experiment!') self.plan.resolve() initial_tensor_dict = self._get_initial_tensor_dict(model_provider) @@ -235,7 +232,8 @@ def _get_initial_tensor_dict(self, model_provider): def _prepare_plan(self, model_provider, task_keeper, data_loader, rounds_to_train, - delta_updates=False, opt_treatment='RESET', + delta_updates, opt_treatment, + device_assignment_policy, model_interface_file='model_obj.pkl', tasks_interface_file='tasks_obj.pkl', 
dataloader_interface_file='loader_obj.pkl', aggregation_function_interface_file='aggregation_function_obj.pkl'): @@ -271,6 +269,7 @@ def _prepare_plan(self, model_provider, task_keeper, data_loader, # Collaborator part plan.config['collaborator']['settings']['delta_updates'] = delta_updates plan.config['collaborator']['settings']['opt_treatment'] = opt_treatment + plan.config['collaborator']['settings']['device_assignment_policy'] = device_assignment_policy # DataLoader part for setting, value in data_loader.kwargs.items(): From d765e6eb3a5ff0cc4ab36eec3e15548b0a9b6a9b Mon Sep 17 00:00:00 2001 From: "Davidyuk, Igor" Date: Tue, 21 Sep 2021 11:22:14 +0300 Subject: [PATCH 11/31] flake8 fixes --- openfl/component/collaborator/collaborator.py | 16 +++++++++++----- openfl/component/director/director.py | 13 ++++++------- openfl/interface/interactive_api/experiment.py | 3 ++- .../processing_units_monitor/pynvml_monitor.py | 10 +++++++--- openfl/transport/grpc/director_server.py | 7 ++++--- 5 files changed, 30 insertions(+), 19 deletions(-) diff --git a/openfl/component/collaborator/collaborator.py b/openfl/component/collaborator/collaborator.py index 91b3deec41..2034066b8f 100644 --- a/openfl/component/collaborator/collaborator.py +++ b/openfl/component/collaborator/collaborator.py @@ -14,6 +14,7 @@ from openfl.protocols import utils from openfl.utilities import TensorKey + class DevicePolicy(Enum): """Device assignment policy.""" @@ -21,6 +22,7 @@ class DevicePolicy(Enum): CUDA_PREFERRED = 2 + class OptTreatment(Enum): """Optimizer Methods.""" @@ -116,14 +118,16 @@ def __init__(self, self.device_assignment_policy = DevicePolicy[device_assignment_policy] else: self.logger.error(f'Unknown device_assignment_policy: {device_assignment_policy}.') - raise NotImplementedError(f'Unknown device_assignment_policy: {device_assignment_policy}.') + raise NotImplementedError( + f'Unknown device_assignment_policy: {device_assignment_policy}.' + ) self.task_runner.set_optimizer_treatment(self.opt_treatment.name) - def set_available_devices(self, cuda: Tuple[str]=[]): + def set_available_devices(self, cuda: Tuple[str] = ()): """ Set available CUDA devices. - + Cuda tuple contains string indeces, ('1', '3'). """ self.cuda_devices = cuda @@ -184,8 +188,10 @@ def do_task(self, task, round_number): func_name = self.task_config[task]['function'] kwargs = self.task_config[task]['kwargs'] - if (self.device_assignment_policy.name == 'CUDA_PREFERRED' and - len(self.cuda_devices) > 0): + if (self.device_assignment_policy.name == ( + 'CUDA_PREFERRED' and len(self.cuda_devices) > 0 + ) + ): kwargs['device'] = 'cuda:' + str(self.cuda_devices[0]) else: kwargs['device'] = 'cpu' diff --git a/openfl/component/director/director.py b/openfl/component/director/director.py index 21c989b041..974cb2f407 100644 --- a/openfl/component/director/director.py +++ b/openfl/component/director/director.py @@ -117,9 +117,9 @@ def get_dataset_info(self): """Get dataset info.""" return self.sample_shape, self.target_shape - # def get_registered_shards(self) -> list: - # """Get registered shard infos.""" - # return [shard_status['shard_info'] for shard_status in self._shard_registry.values()] + def get_registered_shards(self) -> list: # Why is it here? 
+ """Get registered shard infos.""" + return [shard_status['shard_info'] for shard_status in self._shard_registry.values()] async def stream_metrics(self, experiment_name: str, caller: str): """ @@ -190,13 +190,12 @@ def envoy_health_check( def get_envoys(self) -> list: """Get a status information about envoys.""" - logger.info(f'Shard registry: {self._shard_registry}') for envoy_info in self._shard_registry.values(): envoy_info['is_online'] = ( - time.time() < envoy_info['last_updated'] + - envoy_info['valid_duration'] - ) + time.time() < envoy_info['last_updated'] + + envoy_info['valid_duration'] + ) return self._shard_registry.values() diff --git a/openfl/interface/interactive_api/experiment.py b/openfl/interface/interactive_api/experiment.py index e9025423cc..96c73b346f 100644 --- a/openfl/interface/interactive_api/experiment.py +++ b/openfl/interface/interactive_api/experiment.py @@ -269,7 +269,8 @@ def _prepare_plan(self, model_provider, task_keeper, data_loader, # Collaborator part plan.config['collaborator']['settings']['delta_updates'] = delta_updates plan.config['collaborator']['settings']['opt_treatment'] = opt_treatment - plan.config['collaborator']['settings']['device_assignment_policy'] = device_assignment_policy + plan.config['collaborator']['settings'][ + 'device_assignment_policy'] = device_assignment_policy # DataLoader part for setting, value in data_loader.kwargs.items(): diff --git a/openfl/plugins/processing_units_monitor/pynvml_monitor.py b/openfl/plugins/processing_units_monitor/pynvml_monitor.py index 7ed79fb87d..143f739043 100644 --- a/openfl/plugins/processing_units_monitor/pynvml_monitor.py +++ b/openfl/plugins/processing_units_monitor/pynvml_monitor.py @@ -1,6 +1,10 @@ # Copyright (C) 2020-2021 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -"""pynvml CUDA Device monitor plugin module.""" +""" +pynvml CUDA Device monitor plugin module. 
+ +Required package: nvidia-ml-py3 +""" import pynvml @@ -9,15 +13,15 @@ class PynvmlCUDADeviceMonitor(CUDADeviceMonitor): """CUDA Device monitor plugin using pynvml lib.""" - # required package: nvidia-ml-py3 def __init__(self) -> None: + """Initialize pynvml plugin.""" super().__init__() pynvml.nvmlInit() def get_driver_version(self) -> str: """Get CUDA driver version.""" - return pynvml.nvmlSystemGetDriverVersion().decode("utf-8") + return pynvml.nvmlSystemGetDriverVersion().decode('utf-8') def get_device_memory_total(self, index: int) -> int: """Get total memory available on the device.""" diff --git a/openfl/transport/grpc/director_server.py b/openfl/transport/grpc/director_server.py index e068921274..2ea0fe784e 100644 --- a/openfl/transport/grpc/director_server.py +++ b/openfl/transport/grpc/director_server.py @@ -235,7 +235,7 @@ async def EnvoyHealthCheck(self, request, context): # NOQA:N802 cuda_devices_info = [ MessageToDict(message, preserving_proto_field_name=True) for message in request.cuda_devices - ] + ] health_check_period = self.director.envoy_health_check( envoy_name=request.name, is_experiment_running=request.is_experiment_running, @@ -252,8 +252,9 @@ async def GetEnvoys(self, request, context): # NOQA:N802 response = [] for envoy_info in envoy_infos: envoy_info_message = director_pb2.EnvoyInfo( - shard_info=ParseDict(envoy_info['shard_info'], director_pb2.ShardInfo(), - ignore_unknown_fields=True), + shard_info=ParseDict( + envoy_info['shard_info'], director_pb2.ShardInfo(), + ignore_unknown_fields=True), is_online=envoy_info['is_online'], is_experiment_running=envoy_info['is_experiment_running']) envoy_info_message.valid_duration.seconds = envoy_info['valid_duration'] From dc401d61f0cc68f2b94bddbda3ebc4295aa8967e Mon Sep 17 00:00:00 2001 From: igor-davidyuk <76463150+igor-davidyuk@users.noreply.github.com> Date: Wed, 29 Sep 2021 11:04:22 +0300 Subject: [PATCH 12/31] Iliya's suggestion for template unpacking Co-authored-by: Ilya Trushkin --- openfl/interface/envoy.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/openfl/interface/envoy.py b/openfl/interface/envoy.py index 64c185826b..d2aa35871f 100644 --- a/openfl/interface/envoy.py +++ b/openfl/interface/envoy.py @@ -73,8 +73,7 @@ def start_(shard_name, director_host, director_port, tls, envoy_config_path, if not template: raise Exception('You should put a template' f'for plugin {plugin_name}') - class_name = template.split('.')[-1] - module_path = '.'.join(template.split('.')[:-1]) + module_path, _, class_name = template.rpartition('.') plugin_params = plugin_settings.get('params', {}) module = import_module(module_path) From 61854cf2ba00258357a382abb434d5f984ed6e1d Mon Sep 17 00:00:00 2001 From: "Davidyuk, Igor" Date: Wed, 29 Sep 2021 11:49:21 +0300 Subject: [PATCH 13/31] Required fixes --- .../envoy/envoy_config.yaml | 9 +++++---- .../envoy/envoy_config2.yaml | 9 +++++---- .../envoy/envoy_config_no_gpu.yaml | 15 +++++++++++++++ .../workspace/PyTorch_Kvasir_UNet.ipynb | 19 +++++++++++-------- openfl/component/collaborator/collaborator.py | 13 ++++++------- openfl/component/envoy/envoy.py | 10 +++++----- openfl/interface/envoy.py | 4 ++-- .../pynvml_monitor.py | 2 +- openfl/transport/grpc/director_client.py | 3 ++- openfl/transport/grpc/director_server.py | 6 +++--- 10 files changed, 55 insertions(+), 35 deletions(-) create mode 100644 openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config_no_gpu.yaml diff --git 
a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config.yaml b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config.yaml index 0677b34f11..8ba8760e76 100644 --- a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config.yaml +++ b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config.yaml @@ -1,9 +1,10 @@ params: cuda_devices: [1,2,5] - optional_plugin_components: - cuda_device_monitor: - template: openfl.plugins.processing_units_monitor.pynvml_monitor.PynvmlCUDADeviceMonitor - settings: [] + +optional_plugin_components: + cuda_device_monitor: + template: openfl.plugins.processing_units_monitor.pynvml_monitor.PynvmlCUDADeviceMonitor + settings: [] shard_descriptor: template: kvasir_shard_descriptor.KvasirShardDescriptor diff --git a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config2.yaml b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config2.yaml index ce1a0d9c0c..598b3af4d8 100644 --- a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config2.yaml +++ b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config2.yaml @@ -1,9 +1,10 @@ params: cuda_devices: [] - optional_plugin_components: - cuda_device_monitor: - template: openfl.plugins.processing_units_monitor.pynvml_monitor.PynvmlCUDADeviceMonitor - settings: [] + +# optional_plugin_components: +# cuda_device_monitor: +# template: openfl.plugins.processing_units_monitor.pynvml_monitor.PynvmlCUDADeviceMonitor +# settings: [] shard_descriptor: template: kvasir_shard_descriptor.KvasirShardDescriptor diff --git a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config_no_gpu.yaml b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config_no_gpu.yaml new file mode 100644 index 0000000000..598b3af4d8 --- /dev/null +++ b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config_no_gpu.yaml @@ -0,0 +1,15 @@ +params: + cuda_devices: [] + +# optional_plugin_components: +# cuda_device_monitor: +# template: openfl.plugins.processing_units_monitor.pynvml_monitor.PynvmlCUDADeviceMonitor +# settings: [] + +shard_descriptor: + template: kvasir_shard_descriptor.KvasirShardDescriptor + params: + data_folder: kvasir_data + rank_worldsize: 2,10 + enforce_image_hw: '300,400' + \ No newline at end of file diff --git a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/workspace/PyTorch_Kvasir_UNet.ipynb b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/workspace/PyTorch_Kvasir_UNet.ipynb index a84dfc27b9..677bc417b3 100644 --- a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/workspace/PyTorch_Kvasir_UNet.ipynb +++ b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/workspace/PyTorch_Kvasir_UNet.ipynb @@ -390,10 +390,17 @@ " device='device', optimizer='optimizer') \n", "@TI.set_aggregation_function(aggregation_function)\n", "def train(unet_model, train_loader, optimizer, device, loss_fn=soft_dice_loss, some_parameter=None):\n", - "# if not torch.cuda.is_available():\n", - "# device = 'cpu'\n", - "# else:\n", - "# device = 'cuda'\n", + " \n", + " \"\"\" \n", + " The following constructions, that may lead to resource race\n", + " is no longer needed:\n", + " \n", + " if not torch.cuda.is_available():\n", + " device = 'cpu'\n", + " else:\n", + " device = 'cuda'\n", + " \n", + " \"\"\"\n", "\n", " print(f'\\n\\n TASK TRAIN GOT DEVICE {device}\\n\\n')\n", " \n", @@ -421,10 +428,6 @@ "\n", "@TI.register_fl_task(model='unet_model', data_loader='val_loader', 
device='device') \n", "def validate(unet_model, val_loader, device):\n", - "# if not torch.cuda.is_available():\n", - "# device = 'cpu'\n", - "# else:\n", - "# device = 'cuda'\n", " print(f'\\n\\n TASK VALIDATE GOT DEVICE {device}\\n\\n')\n", " \n", " unet_model.eval()\n", diff --git a/openfl/component/collaborator/collaborator.py b/openfl/component/collaborator/collaborator.py index 2034066b8f..953e7634eb 100644 --- a/openfl/component/collaborator/collaborator.py +++ b/openfl/component/collaborator/collaborator.py @@ -111,13 +111,14 @@ def __init__(self, if hasattr(OptTreatment, opt_treatment): self.opt_treatment = OptTreatment[opt_treatment] else: - self.logger.error(f'Unknown opt_treatment: {opt_treatment}.') + self.logger.error(f'Unknown opt_treatment: {opt_treatment.name}.') raise NotImplementedError(f'Unknown opt_treatment: {opt_treatment}.') if hasattr(DevicePolicy, device_assignment_policy): self.device_assignment_policy = DevicePolicy[device_assignment_policy] else: - self.logger.error(f'Unknown device_assignment_policy: {device_assignment_policy}.') + self.logger.error('Unknown device_assignment_policy: ' + f'{device_assignment_policy.name}.') raise NotImplementedError( f'Unknown device_assignment_policy: {device_assignment_policy}.' ) @@ -188,11 +189,9 @@ def do_task(self, task, round_number): func_name = self.task_config[task]['function'] kwargs = self.task_config[task]['kwargs'] - if (self.device_assignment_policy.name == ( - 'CUDA_PREFERRED' and len(self.cuda_devices) > 0 - ) - ): - kwargs['device'] = 'cuda:' + str(self.cuda_devices[0]) + if (self.device_assignment_policy is DevicePolicy.CUDA_PREFERRED + and len(self.cuda_devices) > 0): + kwargs['device'] = f'cuda:{self.cuda_devices[0]}' else: kwargs['device'] = 'cpu' diff --git a/openfl/component/envoy/envoy.py b/openfl/component/envoy/envoy.py index 3ddc65a765..688234152a 100644 --- a/openfl/component/envoy/envoy.py +++ b/openfl/component/envoy/envoy.py @@ -25,7 +25,7 @@ class Envoy: def __init__(self, *, shard_name, director_host, director_port, shard_descriptor, root_certificate: str = None, private_key: str = None, certificate: str = None, - tls: bool = True, **envoy_params) -> None: + tls: bool = True, cuda_devices=(), cuda_device_monitor=None) -> None: """Initialize a envoy object.""" self.name = shard_name self.root_certificate = Path( @@ -43,10 +43,10 @@ def __init__(self, *, shard_name, director_host, director_port, shard_descriptor ) self.shard_descriptor = shard_descriptor - self.cuda_devices = envoy_params.get('cuda_devices', []) + self.cuda_devices = tuple(cuda_devices) # Optional plugins - self.cuda_device_monitor = envoy_params.get('cuda_device_monitor', None) + self.cuda_device_monitor = cuda_device_monitor self.executor = ThreadPoolExecutor() self.running_experiments = {} @@ -105,8 +105,8 @@ def send_health_check(self): 'device_utilization': self.cuda_device_monitor.get_device_utilization(device_id)} - devices_status_kwargs['cuda_driver_version'] = \ - self.cuda_device_monitor.get_driver_version() + devices_status_kwargs[ + 'cuda_driver_version'] = self.cuda_device_monitor.get_driver_version() devices_status_kwargs['cuda_devices_info'] = cuda_devices_info timeout = self.director_client.send_health_check( diff --git a/openfl/interface/envoy.py b/openfl/interface/envoy.py index d2aa35871f..55b5361f3b 100644 --- a/openfl/interface/envoy.py +++ b/openfl/interface/envoy.py @@ -54,7 +54,7 @@ def start_(shard_name, director_host, director_port, tls, envoy_config_path, if is_directory_traversal(shard_config_path): 
click.echo('The shard config path is out of the openfl workspace scope.') sys.exit(1) - # Reed the Envoy config + # Read the Envoy config with open(envoy_config_path) as stream: envoy_config = safe_load(stream) @@ -68,7 +68,7 @@ def start_(shard_name, director_host, director_port, tls, envoy_config_path, certificate = Path(certificate).absolute() envoy_params = envoy_config.get('params', {}) - for plugin_name, plugin_settings in envoy_params.get('optional_plugin_components', {}).items(): + for plugin_name, plugin_settings in envoy_config.get('optional_plugin_components', {}).items(): template = plugin_settings.get('template') if not template: raise Exception('You should put a template' diff --git a/openfl/plugins/processing_units_monitor/pynvml_monitor.py b/openfl/plugins/processing_units_monitor/pynvml_monitor.py index 143f739043..03e3041ea7 100644 --- a/openfl/plugins/processing_units_monitor/pynvml_monitor.py +++ b/openfl/plugins/processing_units_monitor/pynvml_monitor.py @@ -43,4 +43,4 @@ def get_device_utilization(self, index: int) -> str: """ handle = pynvml.nvmlDeviceGetHandleByIndex(index) info_utilization = pynvml.nvmlDeviceGetUtilizationRates(handle) - return str(info_utilization.gpu) + '%' + return f'{info_utilization.gpu}%' diff --git a/openfl/transport/grpc/director_client.py b/openfl/transport/grpc/director_client.py index 3410d9e6b1..0a870c828d 100644 --- a/openfl/transport/grpc/director_client.py +++ b/openfl/transport/grpc/director_client.py @@ -8,6 +8,7 @@ import grpc +from openfl.interface.interactive_api.shard_descriptor import ShardDescriptor from openfl.pipelines import NoCompressionPipeline from openfl.protocols import director_pb2 from openfl.protocols import director_pb2_grpc @@ -51,7 +52,7 @@ def __init__(self, *, director_host, director_port, shard_name, tls=True, channel = grpc.secure_channel(director_addr, credentials, options=options) self.stub = director_pb2_grpc.FederationDirectorStub(channel) - def report_shard_info(self, shard_descriptor, cuda_devices) -> bool: + def report_shard_info(self, shard_descriptor: ShardDescriptor, cuda_devices: tuple) -> bool: """Report shard info to the director.""" logger.info('Send report AcknowledgeShard') # True considered as successful registration diff --git a/openfl/transport/grpc/director_server.py b/openfl/transport/grpc/director_server.py index 2ea0fe784e..c9114ab958 100644 --- a/openfl/transport/grpc/director_server.py +++ b/openfl/transport/grpc/director_server.py @@ -249,7 +249,7 @@ async def EnvoyHealthCheck(self, request, context): # NOQA:N802 async def GetEnvoys(self, request, context): # NOQA:N802 """Get a status information about envoys.""" envoy_infos = self.director.get_envoys() - response = [] + envoy_statuses = [] for envoy_info in envoy_infos: envoy_info_message = director_pb2.EnvoyInfo( shard_info=ParseDict( @@ -260,6 +260,6 @@ async def GetEnvoys(self, request, context): # NOQA:N802 envoy_info_message.valid_duration.seconds = envoy_info['valid_duration'] envoy_info_message.last_updated.seconds = int(envoy_info['last_updated']) - response.append(envoy_info_message) + envoy_statuses.append(envoy_info_message) - return director_pb2.GetEnvoysResponse(envoy_infos=response) + return director_pb2.GetEnvoysResponse(envoy_infos=envoy_statuses) From 6bd00c8f0857fde9e132d66d5fff5a5b81c7b9e8 Mon Sep 17 00:00:00 2001 From: "Davidyuk, Igor" Date: Wed, 29 Sep 2021 12:01:23 +0300 Subject: [PATCH 14/31] enum fix in collaborator --- openfl/component/collaborator/collaborator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-)

diff --git a/openfl/component/collaborator/collaborator.py b/openfl/component/collaborator/collaborator.py
index 953e7634eb..a9b73cb479 100644
--- a/openfl/component/collaborator/collaborator.py
+++ b/openfl/component/collaborator/collaborator.py
@@ -76,8 +76,8 @@ def __init__(self,
                  client,
                  task_runner,
                  task_config,
-                 opt_treatment=OptTreatment.RESET,
-                 device_assignment_policy=DevicePolicy.CPU_ONLY,
+                 opt_treatment='RESET',
+                 device_assignment_policy='CPU_ONLY',
                  delta_updates=False,
                  compression_pipeline=None,
                  db_store_rounds=1,

From 309fba2b2edbf1bce624f91799325b47d3816145 Mon Sep 17 00:00:00 2001
From: "Davidyuk, Igor"
Date: Wed, 29 Sep 2021 12:29:43 +0300
Subject: [PATCH 15/31] fix envoy client test

---
 tests/openfl/communication/test_envoys_client.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/openfl/communication/test_envoys_client.py b/tests/openfl/communication/test_envoys_client.py
index c79fc974a4..04860054bd 100644
--- a/tests/openfl/communication/test_envoys_client.py
+++ b/tests/openfl/communication/test_envoys_client.py
@@ -41,7 +41,9 @@ def test_report_shard_info(director_client):
     shard_descriptor.sample_shape = [str(dim) for dim in (1, 2)]
     shard_descriptor.target_shape = [str(dim) for dim in (10,)]

-    director_client.report_shard_info(shard_descriptor)
+    cuda_devices = ()
+
+    director_client.report_shard_info(shard_descriptor, cuda_devices)
     director_client.stub.AcknowledgeShard.assert_called_once()

 if sys.version_info < (3, 8):

From c290ac385921dd1cdcd62a06d8c9bbbc28981558 Mon Sep 17 00:00:00 2001
From: "Davidyuk, Igor"
Date: Wed, 29 Sep 2021 13:05:39 +0300
Subject: [PATCH 16/31] avoid passing new device parameter to old task runners

---
 openfl/component/collaborator/collaborator.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/openfl/component/collaborator/collaborator.py b/openfl/component/collaborator/collaborator.py
index a9b73cb479..1fbf66b311 100644
--- a/openfl/component/collaborator/collaborator.py
+++ b/openfl/component/collaborator/collaborator.py
@@ -189,12 +189,6 @@ def do_task(self, task, round_number):
         func_name = self.task_config[task]['function']
         kwargs = self.task_config[task]['kwargs']

-        if (self.device_assignment_policy is DevicePolicy.CUDA_PREFERRED
-                and len(self.cuda_devices) > 0):
-            kwargs['device'] = f'cuda:{self.cuda_devices[0]}'
-        else:
-            kwargs['device'] = 'cpu'
-
         # this would return a list of what tensors we require as TensorKeys
         required_tensorkeys_relative = self.task_runner.get_required_tensorkeys_for_function(
             func_name,
@@ -229,6 +223,17 @@ def do_task(self, task, round_number):
             # New `Core` TaskRunner contains registry of tasks
             func = self.task_runner.TASK_REGISTRY[func_name]
             self.logger.info('Using Interactive Python API')
+
+            # So far 'kwargs' contained parameters read from the plan,
+            # those are parameters that the experiment owner registered for
+            # the task.
+            # There is another set of parameters that are created on the
+            # collaborator side, for instance, local processing unit identifiers.
+            if (self.device_assignment_policy is DevicePolicy.CUDA_PREFERRED
+                    and len(self.cuda_devices) > 0):
+                kwargs['device'] = f'cuda:{self.cuda_devices[0]}'
+            else:
+                kwargs['device'] = 'cpu'
         else:
             # TaskRunner subclassing API
             # Tasks are defined as methods of TaskRunner

From e0c853126d52351da27fd0f553334006ce22b5 Mon Sep 17 00:00:00 2001
From: Igor Davidyuk
Date: Fri, 8 Oct 2021 15:30:38 +0300
Subject: [PATCH 17/31] moved experiment

---
 .../Updated_Kvasir_with_Director.ipynb | 641 ++++++++++++++++++
 1 file changed, 641 insertions(+)
 create mode 100644 openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/workspace/Updated_Kvasir_with_Director.ipynb

diff --git a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/workspace/Updated_Kvasir_with_Director.ipynb b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/workspace/Updated_Kvasir_with_Director.ipynb
new file mode 100644
index 0000000000..933568e238
--- /dev/null
+++ b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/workspace/Updated_Kvasir_with_Director.ipynb
@@ -0,0 +1,641 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "liquid-jacket",
+   "metadata": {},
+   "source": [
+    "# Federated Kvasir with Director example\n",
+    "## Using low-level Python API"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "af0579f8",
+   "metadata": {},
+   "source": [
+    "# Long-Living entities update\n",
+    "\n",
+    "* We now may have a Director running on another machine.\n",
+    "* We use Federation API to communicate with Director.\n",
+    "* Federation object should hold a Director's client (for user service)\n",
+    "* Keeping in mind that several API instances may be connected to one Director.\n",
+    "\n",
+    "\n",
+    "* For now, we do not think about how a Director is started.\n",
+    "* But it knows the data shape and target shape for the DataScience problem in the Federation.\n",
+    "* Director holds the list of connected envoys, we do not need to specify it anymore.\n",
+    "* Director and Envoys are responsible for encrypting connections, we do not need to worry about certs.\n",
+    "\n",
+    "\n",
+    "* Yet we MUST have a cert to communicate to the Director.\n",
+    "* We MUST know the FQDN of a Director.\n",
+    "* Director communicates data and target shape to the Federation interface object.\n",
+    "\n",
+    "\n",
+    "* Experiment API may use this info to construct a dummy dataset and a `shard descriptor` stub."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "alike-sharing", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Install dependencies if not already installed\n", + "!pip install torchvision" + ] + }, + { + "cell_type": "markdown", + "id": "16986f22", + "metadata": {}, + "source": [ + "# Connect to the Federation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4485ac79", + "metadata": {}, + "outputs": [], + "source": [ + "# Create a federation\n", + "from openfl.interface.interactive_api.federation import Federation\n", + "\n", + "# please use the same identificator that was used in signed certificate\n", + "cliend_id = 'frontend'\n", + "\n", + "# 1) Run with API layer - Director mTLS \n", + "# If the user wants to enable mTLS their must provide CA root chain, and signed key pair to the federation interface\n", + "# cert_chain = 'cert/root_ca.crt'\n", + "# API_certificate = 'cert/frontend.crt'\n", + "# API_private_key = 'cert/frontend.key'\n", + "\n", + "# federation = Federation(client_id='frontend', director_node_fqdn='localhost', director_port='50051',\n", + "# cert_chain=cert_chain, api_cert=API_certificate, api_private_key=API_private_key)\n", + "\n", + "# --------------------------------------------------------------------------------------------------------------------\n", + "\n", + "# 2) Run with TLS disabled (trusted environment)\n", + "# Federation can also determine local fqdn automatically\n", + "federation = Federation(client_id='frontend', director_node_fqdn='localhost', director_port='50050', tls=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e35802d5", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "# import time\n", + "# while True:\n", + "# shard_registry = federation.get_shard_registry()\n", + "# print(shard_registry)\n", + "# time.sleep(5)\n", + "shard_registry = federation.get_shard_registry()\n", + "shard_registry" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "67ae50de", + "metadata": {}, + "outputs": [], + "source": [ + "federation.target_shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "920216d3", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# First, request a dummy_shard_desc that holds information about the federated dataset \n", + "dummy_shard_desc = federation.get_dummy_shard_descriptor(size=10)\n", + "sample, target = dummy_shard_desc[0]" + ] + }, + { + "cell_type": "markdown", + "id": "obvious-tyler", + "metadata": {}, + "source": [ + "## Creating a FL experiment using Interactive API" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "rubber-address", + "metadata": {}, + "outputs": [], + "source": [ + "from openfl.interface.interactive_api.experiment import TaskInterface, DataInterface, ModelInterface, FLExperiment" + ] + }, + { + "cell_type": "markdown", + "id": "sustainable-public", + "metadata": {}, + "source": [ + "### Register dataset" + ] + }, + { + "cell_type": "markdown", + "id": "unlike-texas", + "metadata": {}, + "source": [ + "We extract User dataset class implementation.\n", + "Is it convinient?\n", + "What if the dataset is not a class?" 
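If the local data is not already wrapped in a dataset class, a thin DataInterface adapter around the shard descriptor is usually enough. The sketch below is hypothetical (the PlainShardDataset name is not part of the tutorial) and mirrors only the methods this notebook already relies on: the shard_descriptor setter called by the Envoy, the two loaders, and the two size queries.

from torch.utils.data import DataLoader

from openfl.interface.interactive_api.experiment import DataInterface


class PlainShardDataset(DataInterface):
    """Serve samples straight from the shard descriptor, without a separate dataset class."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    @property
    def shard_descriptor(self):
        return self._shard_descriptor

    @shard_descriptor.setter
    def shard_descriptor(self, shard_descriptor):
        # Set by the Envoy on the collaborator side; the descriptor exposes __getitem__ and __len__.
        self._shard_descriptor = shard_descriptor

    def get_train_loader(self, **kwargs):
        return DataLoader(self._shard_descriptor, batch_size=self.kwargs['train_bs'])

    def get_valid_loader(self, **kwargs):
        return DataLoader(self._shard_descriptor, batch_size=self.kwargs['valid_bs'])

    def get_train_data_size(self):
        return len(self._shard_descriptor)

    def get_valid_data_size(self):
        return len(self._shard_descriptor)

A train/validation split like the one in the KvasirSD class below can be layered on top, but it is not required by the interface itself.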
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64f37dcf", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import os\n", + "import PIL\n", + "import numpy as np\n", + "from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler\n", + "from torchvision import transforms as tsf\n", + "\n", + "# Now you can implement you data loaders using dummy_shard_desc\n", + "class KvasirSD(DataInterface, Dataset):\n", + "\n", + " def __init__(self, validation_fraction=1/8, **kwargs):\n", + " super().__init__(**kwargs)\n", + " \n", + " self.validation_fraction = validation_fraction\n", + " \n", + " # Prepare transforms\n", + " self.img_trans = tsf.Compose([\n", + " tsf.ToPILImage(),\n", + " tsf.Resize((332, 332)),\n", + " tsf.ToTensor(),\n", + " tsf.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])])\n", + " self.mask_trans = tsf.Compose([\n", + " tsf.ToPILImage(),\n", + " tsf.Resize((332, 332), interpolation=PIL.Image.NEAREST),\n", + " tsf.ToTensor()])\n", + " \n", + " @property\n", + " def shard_descriptor(self):\n", + " return self._shard_descriptor\n", + " \n", + " @shard_descriptor.setter\n", + " def shard_descriptor(self, shard_descriptor):\n", + " \"\"\"\n", + " Describe per-collaborator procedures or sharding.\n", + "\n", + " This method will be called during a collaborator initialization.\n", + " Local shard_descriptor will be set by Envoy.\n", + " \"\"\"\n", + " self._shard_descriptor = shard_descriptor\n", + " \n", + " validation_size = max(1, int(len(self.shard_descriptor) * self.validation_fraction))\n", + " \n", + " self.train_indeces = np.arange(len(self.shard_descriptor) - validation_size)\n", + " self.val_indeces = np.arange(len(self.shard_descriptor) - validation_size, len(self.shard_descriptor))\n", + " \n", + "\n", + " def __getitem__(self, index):\n", + " img, mask = self.shard_descriptor[index]\n", + " img = self.img_trans(img).numpy()\n", + " mask = self.mask_trans(mask).numpy()\n", + " return img, mask\n", + "\n", + " def __len__(self):\n", + " return len(self.shard_descriptor)\n", + " \n", + " \n", + " def get_train_loader(self, **kwargs):\n", + " \"\"\"\n", + " Output of this method will be provided to tasks with optimizer in contract\n", + " \"\"\"\n", + " train_sampler = SubsetRandomSampler(self.train_indeces)\n", + " return DataLoader(\n", + " self, num_workers=8, batch_size=self.kwargs['train_bs'], sampler=train_sampler\n", + " )\n", + "\n", + " def get_valid_loader(self, **kwargs):\n", + " \"\"\"\n", + " Output of this method will be provided to tasks without optimizer in contract\n", + " \"\"\"\n", + " val_sampler = SubsetRandomSampler(self.val_indeces)\n", + " return DataLoader(self, num_workers=8, batch_size=self.kwargs['valid_bs'], sampler=val_sampler)\n", + "\n", + " def get_train_data_size(self):\n", + " \"\"\"\n", + " Information for aggregation\n", + " \"\"\"\n", + " return len(self.train_indeces)\n", + "\n", + " def get_valid_data_size(self):\n", + " \"\"\"\n", + " Information for aggregation\n", + " \"\"\"\n", + " return len(self.val_indeces)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d8df35f5", + "metadata": {}, + "outputs": [], + "source": [ + "fed_dataset = KvasirSD(train_bs=4, valid_bs=8)\n", + "fed_dataset.shard_descriptor = dummy_shard_desc\n", + "for i, (sample, target) in enumerate(fed_dataset.get_train_loader()):\n", + " print(sample.shape)" + ] + }, + { + "cell_type": "markdown", + "id": "caring-distinction", + "metadata": {}, + "source": [ + "### Describe a model 
and optimizer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "visible-victor", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "import torch.optim as optim" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "foreign-gospel", + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "UNet model definition\n", + "\"\"\"\n", + "from layers import soft_dice_coef, soft_dice_loss, DoubleConv, Down, Up\n", + "\n", + "\n", + "class UNet(nn.Module):\n", + " def __init__(self, n_channels=3, n_classes=1):\n", + " super().__init__()\n", + " self.inc = DoubleConv(n_channels, 64)\n", + " self.down1 = Down(64, 128)\n", + " self.down2 = Down(128, 256)\n", + " self.down3 = Down(256, 512)\n", + " self.up1 = Up(512, 256)\n", + " self.up2 = Up(256, 128)\n", + " self.up3 = Up(128, 64)\n", + " self.outc = nn.Conv2d(64, n_classes, 1)\n", + "\n", + " def forward(self, x):\n", + " x1 = self.inc(x)\n", + " x2 = self.down1(x1)\n", + " x3 = self.down2(x2)\n", + " x4 = self.down3(x3)\n", + " x = self.up1(x4, x3)\n", + " x = self.up2(x, x2)\n", + " x = self.up3(x, x1)\n", + " x = self.outc(x)\n", + " x = torch.sigmoid(x)\n", + " return x\n", + " \n", + "model_unet = UNet()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "greater-activation", + "metadata": {}, + "outputs": [], + "source": [ + "optimizer_adam = optim.Adam(model_unet.parameters(), lr=1e-4)" + ] + }, + { + "cell_type": "markdown", + "id": "caroline-passion", + "metadata": {}, + "source": [ + "#### Register model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "handled-teens", + "metadata": {}, + "outputs": [], + "source": [ + "from copy import deepcopy\n", + "\n", + "framework_adapter = 'openfl.plugins.frameworks_adapters.pytorch_adapter.FrameworkAdapterPlugin'\n", + "MI = ModelInterface(model=model_unet, optimizer=optimizer_adam, framework_plugin=framework_adapter)\n", + "\n", + "# Save the initial model state\n", + "initial_model = deepcopy(model_unet)" + ] + }, + { + "cell_type": "markdown", + "id": "portuguese-groove", + "metadata": {}, + "source": [ + "### Define and register FL tasks" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "increasing-builder", + "metadata": {}, + "outputs": [], + "source": [ + "TI = TaskInterface()\n", + "import torch\n", + "\n", + "import tqdm\n", + "from openfl.component.aggregation_functions import Median\n", + "\n", + "# The Interactive API supports registering functions definied in main module or imported.\n", + "def function_defined_in_notebook(some_parameter):\n", + " print(f'Also I accept a parameter and it is {some_parameter}')\n", + "\n", + "#The Interactive API supports overriding of the aggregation function\n", + "aggregation_function = Median()\n", + "\n", + "# Task interface currently supports only standalone functions.\n", + "@TI.add_kwargs(**{'some_parameter': 42})\n", + "@TI.register_fl_task(model='unet_model', data_loader='train_loader', \\\n", + " device='device', optimizer='optimizer') \n", + "@TI.set_aggregation_function(aggregation_function)\n", + "def train(unet_model, train_loader, optimizer, device, loss_fn=soft_dice_loss, some_parameter=None):\n", + " \n", + " \"\"\" \n", + " The following constructions, that may lead to resource race\n", + " is no longer needed:\n", + " \n", + " if not torch.cuda.is_available():\n", + " device = 'cpu'\n", + " else:\n", + " device = 'cuda'\n", + " \n", + " \"\"\"\n", + "\n", + " 
print(f'\\n\\n TASK TRAIN GOT DEVICE {device}\\n\\n')\n", + " \n", + " function_defined_in_notebook(some_parameter)\n", + " \n", + " train_loader = tqdm.tqdm(train_loader, desc=\"train\")\n", + " \n", + " unet_model.train()\n", + " unet_model.to(device)\n", + "\n", + " losses = []\n", + "\n", + " for data, target in train_loader:\n", + " data, target = torch.tensor(data).to(device), torch.tensor(\n", + " target).to(device, dtype=torch.float32)\n", + " optimizer.zero_grad()\n", + " output = unet_model(data)\n", + " loss = loss_fn(output=output, target=target)\n", + " loss.backward()\n", + " optimizer.step()\n", + " losses.append(loss.detach().cpu().numpy())\n", + " \n", + " return {'train_loss': np.mean(losses),}\n", + "\n", + "\n", + "@TI.register_fl_task(model='unet_model', data_loader='val_loader', device='device') \n", + "def validate(unet_model, val_loader, device):\n", + " print(f'\\n\\n TASK VALIDATE GOT DEVICE {device}\\n\\n')\n", + " \n", + " unet_model.eval()\n", + " unet_model.to(device)\n", + " \n", + " val_loader = tqdm.tqdm(val_loader, desc=\"validate\")\n", + "\n", + " val_score = 0\n", + " total_samples = 0\n", + "\n", + " with torch.no_grad():\n", + " for data, target in val_loader:\n", + " samples = target.shape[0]\n", + " total_samples += samples\n", + " data, target = torch.tensor(data).to(device), \\\n", + " torch.tensor(target).to(device, dtype=torch.int64)\n", + " output = unet_model(data)\n", + " val = soft_dice_coef(output, target)\n", + " val_score += val.sum().cpu().numpy()\n", + " \n", + " return {'dice_coef': val_score / total_samples,}" + ] + }, + { + "cell_type": "markdown", + "id": "derived-bride", + "metadata": {}, + "source": [ + "## Time to start a federated learning experiment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "mature-renewal", + "metadata": {}, + "outputs": [], + "source": [ + "# create an experimnet in federation\n", + "experiment_name = 'kvasir_test_experiment'\n", + "fl_experiment = FLExperiment(federation=federation, experiment_name=experiment_name)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "lightweight-causing", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# If I use autoreload I got a pickling error\n", + "\n", + "# The following command zips the workspace and python requirements to be transfered to collaborator nodes\n", + "fl_experiment.start(model_provider=MI, \n", + " task_keeper=TI,\n", + " data_loader=fed_dataset,\n", + " rounds_to_train=2,\n", + " opt_treatment='CONTINUE_GLOBAL',\n", + " device_assignment_policy='CUDA_PREFERRED')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f1543a36", + "metadata": {}, + "outputs": [], + "source": [ + "# If user want to stop IPython session, then reconnect and check how experiment is going \n", + "# fl_experiment.restore_experiment_state(MI)\n", + "\n", + "fl_experiment.stream_metrics()" + ] + }, + { + "cell_type": "markdown", + "id": "8c30b301", + "metadata": {}, + "source": [ + "## Now we validate the best model!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "55acff59", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "best_model = fl_experiment.get_best_model()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9479fb7f", + "metadata": {}, + "outputs": [], + "source": [ + "# We remove exremove_experiment_datamove_experiment_datamove_experiment_datariment data from director\n", + "fl_experiment.remove_experiment_data()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75c8aeab", + "metadata": {}, + "outputs": [], + "source": [ + "best_model.inc.conv[0].weight\n", + "# model_unet.inc.conv[0].weight" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2acb7e6", + "metadata": {}, + "outputs": [], + "source": [ + "# Validating initial model\n", + "validate(initial_model, fed_dataset.get_valid_loader(), 'cpu')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c12ca93f", + "metadata": {}, + "outputs": [], + "source": [ + "# Validating trained model\n", + "validate(best_model, fed_dataset.get_valid_loader(), 'cpu')" + ] + }, + { + "cell_type": "markdown", + "id": "1e6734f6", + "metadata": {}, + "source": [ + "## We can tune model further!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3940e75e", + "metadata": {}, + "outputs": [], + "source": [ + "MI = ModelInterface(model=best_model, optimizer=optimizer_adam, framework_plugin=framework_adapter)\n", + "fl_experiment.start(model_provider=MI, task_keeper=TI, data_loader=fed_dataset, rounds_to_train=4, \\\n", + " opt_treatment='CONTINUE_GLOBAL')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1bd786d2", + "metadata": {}, + "outputs": [], + "source": [ + "best_model = fl_experiment.get_best_model()\n", + "# Validating trained model\n", + "validate(best_model, fed_dataset.get_valid_loader(), 'cpu')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e00ff26c", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 3508cac67758da7d8f69c53a462cddc073b1b484 Mon Sep 17 00:00:00 2001 From: Igor Davidyuk Date: Fri, 8 Oct 2021 16:12:08 +0300 Subject: [PATCH 18/31] removed unsued files --- .../PyTorch_Kvasir_UNet/envoy/envoy_config2.yaml | 15 --------------- .../PyTorch_Kvasir_UNet/envoy/shard_config.yaml | 5 ----- .../PyTorch_Kvasir_UNet/envoy/start_envoy2.sh | 4 ---- .../envoy/start_envoy_with_tls.sh | 2 +- 4 files changed, 1 insertion(+), 25 deletions(-) delete mode 100644 openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config2.yaml delete mode 100644 openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/shard_config.yaml delete mode 100644 openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/start_envoy2.sh diff --git a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config2.yaml b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config2.yaml deleted file mode 100644 index 598b3af4d8..0000000000 --- 
a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config2.yaml +++ /dev/null @@ -1,15 +0,0 @@ -params: - cuda_devices: [] - -# optional_plugin_components: -# cuda_device_monitor: -# template: openfl.plugins.processing_units_monitor.pynvml_monitor.PynvmlCUDADeviceMonitor -# settings: [] - -shard_descriptor: - template: kvasir_shard_descriptor.KvasirShardDescriptor - params: - data_folder: kvasir_data - rank_worldsize: 2,10 - enforce_image_hw: '300,400' - \ No newline at end of file diff --git a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/shard_config.yaml b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/shard_config.yaml deleted file mode 100644 index 62cf42a45c..0000000000 --- a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/shard_config.yaml +++ /dev/null @@ -1,5 +0,0 @@ -template: kvasir_shard_descriptor.KvasirShardDescriptor -params: - data_folder: kvasir_data - rank_worldsize: 1,10 - enforce_image_hw: '300,400' \ No newline at end of file diff --git a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/start_envoy2.sh b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/start_envoy2.sh deleted file mode 100644 index d30661f66e..0000000000 --- a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/start_envoy2.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash -set -e - -fx envoy start -n env_two --disable-tls --envoy-config-path envoy_config2.yaml -dh localhost -dp 50050 diff --git a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/start_envoy_with_tls.sh b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/start_envoy_with_tls.sh index 0357a4c1a5..07a8353c4a 100755 --- a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/start_envoy_with_tls.sh +++ b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/start_envoy_with_tls.sh @@ -3,4 +3,4 @@ set -e ENVOY_NAME=$1 DIRECTOR_FQDN=$2 -fx envoy start -n "$ENVOY_NAME" --shard-config-path shard_config.yaml -dh "$DIRECTOR_FQDN" -dp 50051 -rc cert/root_ca.crt -pk cert/"$ENVOY_NAME".key -oc cert/"$ENVOY_NAME".crt +fx envoy start -n "$ENVOY_NAME" --shard-config-path shard_config.yaml -dh "$DIRECTOR_FQDN" -dp 50050 -rc cert/root_ca.crt -pk cert/"$ENVOY_NAME".key -oc cert/"$ENVOY_NAME".crt From 8338f05a014542774841e6fcc6f9e502d8d93e72 Mon Sep 17 00:00:00 2001 From: Igor Davidyuk Date: Fri, 15 Oct 2021 11:38:41 +0300 Subject: [PATCH 19/31] fix envoy cli after rebase --- openfl/interface/envoy.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/openfl/interface/envoy.py b/openfl/interface/envoy.py index 55b5361f3b..fdb8196ac5 100644 --- a/openfl/interface/envoy.py +++ b/openfl/interface/envoy.py @@ -55,11 +55,9 @@ def start_(shard_name, director_host, director_port, tls, envoy_config_path, click.echo('The shard config path is out of the openfl workspace scope.') sys.exit(1) # Read the Envoy config - with open(envoy_config_path) as stream: + with open(Path(envoy_config_path).absolute()) as stream: envoy_config = safe_load(stream) - # pass envoy parameters - shard_config_path = Path(shard_config_path).absolute() if root_certificate: root_certificate = Path(root_certificate).absolute() if private_key: @@ -67,6 +65,7 @@ def start_(shard_name, director_host, director_port, tls, envoy_config_path, if certificate: certificate = Path(certificate).absolute() + # Parse envoy parameters envoy_params = envoy_config.get('params', {}) for plugin_name, plugin_settings in envoy_config.get('optional_plugin_components', {}).items(): template = 
plugin_settings.get('template') @@ -80,6 +79,7 @@ def start_(shard_name, director_host, director_port, tls, envoy_config_path, instance = getattr(module, class_name)(**plugin_params) envoy_params[plugin_name] = instance + # Instantiate Shard Descriptor shard_descriptor = shard_descriptor_from_config(envoy_config.get('shard_descriptor', {})) envoy = Envoy( shard_name=shard_name, From c8ede9722d15b6a6b74b57b1609275cbbdbe4ee6 Mon Sep 17 00:00:00 2001 From: "Davidyuk, Igor" Date: Fri, 15 Oct 2021 12:13:42 +0300 Subject: [PATCH 20/31] director fix --- openfl/component/director/director.py | 1 - 1 file changed, 1 deletion(-) diff --git a/openfl/component/director/director.py b/openfl/component/director/director.py index 974cb2f407..c917b4aecf 100644 --- a/openfl/component/director/director.py +++ b/openfl/component/director/director.py @@ -11,7 +11,6 @@ from typing import Iterable from typing import Union -from openfl.protocols import director_pb2 from .experiment import Experiment from .experiment import ExperimentsRegistry From da673eba1873ecd5310dae3274f3d64863a51961 Mon Sep 17 00:00:00 2001 From: "Davidyuk, Igor" Date: Fri, 15 Oct 2021 14:35:01 +0300 Subject: [PATCH 21/31] fix tests --- openfl-workspace/default/envoy_config.yaml | 9 +++++---- setup.py | 3 ++- .../pytorch_kvasir_unet/envoy/envoy_config.yaml | 15 +++++++++++++++ .../pytorch_kvasir_unet/envoy/shard_config.yaml | 6 ------ .../tensorflow_mnist/envoy/envoy_config.yaml | 12 ++++++++++++ .../tensorflow_mnist/envoy/shard_config.yaml | 3 --- .../tensorflow_mnist/envoy/start_envoy.sh | 2 +- 7 files changed, 35 insertions(+), 15 deletions(-) create mode 100644 tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/envoy/envoy_config.yaml delete mode 100644 tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/envoy/shard_config.yaml create mode 100644 tests/github/interactive_api_director/experiments/tensorflow_mnist/envoy/envoy_config.yaml delete mode 100644 tests/github/interactive_api_director/experiments/tensorflow_mnist/envoy/shard_config.yaml diff --git a/openfl-workspace/default/envoy_config.yaml b/openfl-workspace/default/envoy_config.yaml index 46d9844662..b1ab9e3180 100644 --- a/openfl-workspace/default/envoy_config.yaml +++ b/openfl-workspace/default/envoy_config.yaml @@ -19,10 +19,11 @@ params: cuda_devices: [] - optional_plugin_components: - cuda_device_monitor: - template: openfl.plugins.processing_units_monitor.pynvml_monitor.PynvmlCUDADeviceMonitor - settings: [] + +optional_plugin_components: + cuda_device_monitor: + template: openfl.plugins.processing_units_monitor.pynvml_monitor.PynvmlCUDADeviceMonitor + settings: [] shard_descriptor: template: shard_descriptor.LocalShardDescriptor diff --git a/setup.py b/setup.py index ef83cf8792..6f39021198 100644 --- a/setup.py +++ b/setup.py @@ -70,7 +70,8 @@ 'cloudpickle', 'tensorboardX', 'tensorboard', - 'requests' + 'requests', + 'nvidia-ml-py3' ], python_requires='>=3.6, <3.9', project_urls={ diff --git a/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/envoy/envoy_config.yaml b/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/envoy/envoy_config.yaml new file mode 100644 index 0000000000..98e5039bf9 --- /dev/null +++ b/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/envoy/envoy_config.yaml @@ -0,0 +1,15 @@ +params: + cuda_devices: [] + +optional_plugin_components: + cuda_device_monitor: + template: openfl.plugins.processing_units_monitor.pynvml_monitor.PynvmlCUDADeviceMonitor + 
settings: [] + +shard_descriptor: + template: kvasir_shard_descriptor.KvasirShardDescriptor + params: + data_folder: kvasir_data + rank: 1 + worldsize: 90 + enforce_image_hw: [300, 400] \ No newline at end of file diff --git a/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/envoy/shard_config.yaml b/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/envoy/shard_config.yaml deleted file mode 100644 index 80a864f944..0000000000 --- a/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/envoy/shard_config.yaml +++ /dev/null @@ -1,6 +0,0 @@ -template: kvasir_shard_descriptor.KvasirShardDescriptor -params: - data_folder: kvasir_data - rank: 1 - worldsize: 90 - enforce_image_hw: [300, 400] \ No newline at end of file diff --git a/tests/github/interactive_api_director/experiments/tensorflow_mnist/envoy/envoy_config.yaml b/tests/github/interactive_api_director/experiments/tensorflow_mnist/envoy/envoy_config.yaml new file mode 100644 index 0000000000..413f439714 --- /dev/null +++ b/tests/github/interactive_api_director/experiments/tensorflow_mnist/envoy/envoy_config.yaml @@ -0,0 +1,12 @@ +params: + cuda_devices: [] + +optional_plugin_components: + cuda_device_monitor: + template: openfl.plugins.processing_units_monitor.pynvml_monitor.PynvmlCUDADeviceMonitor + settings: [] + +shard_descriptor: + template: shard_descriptor.MNISTShardDescriptor + params: + rank_worldsize: 1,90 \ No newline at end of file diff --git a/tests/github/interactive_api_director/experiments/tensorflow_mnist/envoy/shard_config.yaml b/tests/github/interactive_api_director/experiments/tensorflow_mnist/envoy/shard_config.yaml deleted file mode 100644 index d6dc9bccc6..0000000000 --- a/tests/github/interactive_api_director/experiments/tensorflow_mnist/envoy/shard_config.yaml +++ /dev/null @@ -1,3 +0,0 @@ -template: shard_descriptor.MNISTShardDescriptor -params: - rank_worldsize: 1,90 \ No newline at end of file diff --git a/tests/github/interactive_api_director/experiments/tensorflow_mnist/envoy/start_envoy.sh b/tests/github/interactive_api_director/experiments/tensorflow_mnist/envoy/start_envoy.sh index 222d3988e0..1dd6591439 100644 --- a/tests/github/interactive_api_director/experiments/tensorflow_mnist/envoy/start_envoy.sh +++ b/tests/github/interactive_api_director/experiments/tensorflow_mnist/envoy/start_envoy.sh @@ -1,4 +1,4 @@ #!/bin/bash set -e -fx envoy start -n env_one --disable-tls --shard-config-path shard_config.yaml -dh localhost -dp 50051 \ No newline at end of file +fx envoy start -n env_one --disable-tls --envoy-config-path envoy_config.yaml -dh localhost -dp 50051 \ No newline at end of file From 07c16bb78bbca8e149961d363e8c7513b2ffab74 Mon Sep 17 00:00:00 2001 From: "Davidyuk, Igor" Date: Fri, 15 Oct 2021 15:13:19 +0300 Subject: [PATCH 22/31] removed additional notebook --- .../Updated_Kvasir_with_Director.ipynb | 641 ------------------ 1 file changed, 641 deletions(-) delete mode 100644 openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/workspace/Updated_Kvasir_with_Director.ipynb diff --git a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/workspace/Updated_Kvasir_with_Director.ipynb b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/workspace/Updated_Kvasir_with_Director.ipynb deleted file mode 100644 index 933568e238..0000000000 --- a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/workspace/Updated_Kvasir_with_Director.ipynb +++ /dev/null @@ -1,641 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "liquid-jacket", - 
"metadata": {}, - "source": [ - "# Federated Kvasir with Director example\n", - "## Using low-level Python API" - ] - }, - { - "cell_type": "markdown", - "id": "af0579f8", - "metadata": {}, - "source": [ - "# Long-Living entities update\n", - "\n", - "* We now may have director running on another machine.\n", - "* We use Federation API to communicate with Director.\n", - "* Federation object should hold a Director's client (for user service)\n", - "* Keeping in mind that several API instances may be connacted to one Director.\n", - "\n", - "\n", - "* We do not think for now how we start a Director.\n", - "* But it knows the data shape and target shape for the DataScience problem in the Federation.\n", - "* Director holds the list of connected envoys, we do not need to specify it anymore.\n", - "* Director and Envoys are responsible for encrypting connections, we do not need to worry about certs.\n", - "\n", - "\n", - "* Yet we MUST have a cert to communicate to the Director.\n", - "* We MUST know the FQDN of a Director.\n", - "* Director communicates data and target shape to the Federation interface object.\n", - "\n", - "\n", - "* Experiment API may use this info to construct a dummy dataset and a `shard descriptor` stub." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "alike-sharing", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# Install dependencies if not already installed\n", - "!pip install torchvision" - ] - }, - { - "cell_type": "markdown", - "id": "16986f22", - "metadata": {}, - "source": [ - "# Connect to the Federation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4485ac79", - "metadata": {}, - "outputs": [], - "source": [ - "# Create a federation\n", - "from openfl.interface.interactive_api.federation import Federation\n", - "\n", - "# please use the same identificator that was used in signed certificate\n", - "cliend_id = 'frontend'\n", - "\n", - "# 1) Run with API layer - Director mTLS \n", - "# If the user wants to enable mTLS their must provide CA root chain, and signed key pair to the federation interface\n", - "# cert_chain = 'cert/root_ca.crt'\n", - "# API_certificate = 'cert/frontend.crt'\n", - "# API_private_key = 'cert/frontend.key'\n", - "\n", - "# federation = Federation(client_id='frontend', director_node_fqdn='localhost', director_port='50051',\n", - "# cert_chain=cert_chain, api_cert=API_certificate, api_private_key=API_private_key)\n", - "\n", - "# --------------------------------------------------------------------------------------------------------------------\n", - "\n", - "# 2) Run with TLS disabled (trusted environment)\n", - "# Federation can also determine local fqdn automatically\n", - "federation = Federation(client_id='frontend', director_node_fqdn='localhost', director_port='50050', tls=False)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e35802d5", - "metadata": { - "scrolled": false - }, - "outputs": [], - "source": [ - "# import time\n", - "# while True:\n", - "# shard_registry = federation.get_shard_registry()\n", - "# print(shard_registry)\n", - "# time.sleep(5)\n", - "shard_registry = federation.get_shard_registry()\n", - "shard_registry" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "67ae50de", - "metadata": {}, - "outputs": [], - "source": [ - "federation.target_shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "920216d3", - "metadata": { - "scrolled": true - }, - "outputs": [], - 
"source": [ - "# First, request a dummy_shard_desc that holds information about the federated dataset \n", - "dummy_shard_desc = federation.get_dummy_shard_descriptor(size=10)\n", - "sample, target = dummy_shard_desc[0]" - ] - }, - { - "cell_type": "markdown", - "id": "obvious-tyler", - "metadata": {}, - "source": [ - "## Creating a FL experiment using Interactive API" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "rubber-address", - "metadata": {}, - "outputs": [], - "source": [ - "from openfl.interface.interactive_api.experiment import TaskInterface, DataInterface, ModelInterface, FLExperiment" - ] - }, - { - "cell_type": "markdown", - "id": "sustainable-public", - "metadata": {}, - "source": [ - "### Register dataset" - ] - }, - { - "cell_type": "markdown", - "id": "unlike-texas", - "metadata": {}, - "source": [ - "We extract User dataset class implementation.\n", - "Is it convinient?\n", - "What if the dataset is not a class?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "64f37dcf", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "import os\n", - "import PIL\n", - "import numpy as np\n", - "from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler\n", - "from torchvision import transforms as tsf\n", - "\n", - "# Now you can implement you data loaders using dummy_shard_desc\n", - "class KvasirSD(DataInterface, Dataset):\n", - "\n", - " def __init__(self, validation_fraction=1/8, **kwargs):\n", - " super().__init__(**kwargs)\n", - " \n", - " self.validation_fraction = validation_fraction\n", - " \n", - " # Prepare transforms\n", - " self.img_trans = tsf.Compose([\n", - " tsf.ToPILImage(),\n", - " tsf.Resize((332, 332)),\n", - " tsf.ToTensor(),\n", - " tsf.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])])\n", - " self.mask_trans = tsf.Compose([\n", - " tsf.ToPILImage(),\n", - " tsf.Resize((332, 332), interpolation=PIL.Image.NEAREST),\n", - " tsf.ToTensor()])\n", - " \n", - " @property\n", - " def shard_descriptor(self):\n", - " return self._shard_descriptor\n", - " \n", - " @shard_descriptor.setter\n", - " def shard_descriptor(self, shard_descriptor):\n", - " \"\"\"\n", - " Describe per-collaborator procedures or sharding.\n", - "\n", - " This method will be called during a collaborator initialization.\n", - " Local shard_descriptor will be set by Envoy.\n", - " \"\"\"\n", - " self._shard_descriptor = shard_descriptor\n", - " \n", - " validation_size = max(1, int(len(self.shard_descriptor) * self.validation_fraction))\n", - " \n", - " self.train_indeces = np.arange(len(self.shard_descriptor) - validation_size)\n", - " self.val_indeces = np.arange(len(self.shard_descriptor) - validation_size, len(self.shard_descriptor))\n", - " \n", - "\n", - " def __getitem__(self, index):\n", - " img, mask = self.shard_descriptor[index]\n", - " img = self.img_trans(img).numpy()\n", - " mask = self.mask_trans(mask).numpy()\n", - " return img, mask\n", - "\n", - " def __len__(self):\n", - " return len(self.shard_descriptor)\n", - " \n", - " \n", - " def get_train_loader(self, **kwargs):\n", - " \"\"\"\n", - " Output of this method will be provided to tasks with optimizer in contract\n", - " \"\"\"\n", - " train_sampler = SubsetRandomSampler(self.train_indeces)\n", - " return DataLoader(\n", - " self, num_workers=8, batch_size=self.kwargs['train_bs'], sampler=train_sampler\n", - " )\n", - "\n", - " def get_valid_loader(self, **kwargs):\n", - " \"\"\"\n", - " Output of this method will be provided to tasks without 
optimizer in contract\n", - " \"\"\"\n", - " val_sampler = SubsetRandomSampler(self.val_indeces)\n", - " return DataLoader(self, num_workers=8, batch_size=self.kwargs['valid_bs'], sampler=val_sampler)\n", - "\n", - " def get_train_data_size(self):\n", - " \"\"\"\n", - " Information for aggregation\n", - " \"\"\"\n", - " return len(self.train_indeces)\n", - "\n", - " def get_valid_data_size(self):\n", - " \"\"\"\n", - " Information for aggregation\n", - " \"\"\"\n", - " return len(self.val_indeces)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d8df35f5", - "metadata": {}, - "outputs": [], - "source": [ - "fed_dataset = KvasirSD(train_bs=4, valid_bs=8)\n", - "fed_dataset.shard_descriptor = dummy_shard_desc\n", - "for i, (sample, target) in enumerate(fed_dataset.get_train_loader()):\n", - " print(sample.shape)" - ] - }, - { - "cell_type": "markdown", - "id": "caring-distinction", - "metadata": {}, - "source": [ - "### Describe a model and optimizer" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "visible-victor", - "metadata": {}, - "outputs": [], - "source": [ - "import torch\n", - "import torch.nn as nn\n", - "import torch.optim as optim" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "foreign-gospel", - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"\n", - "UNet model definition\n", - "\"\"\"\n", - "from layers import soft_dice_coef, soft_dice_loss, DoubleConv, Down, Up\n", - "\n", - "\n", - "class UNet(nn.Module):\n", - " def __init__(self, n_channels=3, n_classes=1):\n", - " super().__init__()\n", - " self.inc = DoubleConv(n_channels, 64)\n", - " self.down1 = Down(64, 128)\n", - " self.down2 = Down(128, 256)\n", - " self.down3 = Down(256, 512)\n", - " self.up1 = Up(512, 256)\n", - " self.up2 = Up(256, 128)\n", - " self.up3 = Up(128, 64)\n", - " self.outc = nn.Conv2d(64, n_classes, 1)\n", - "\n", - " def forward(self, x):\n", - " x1 = self.inc(x)\n", - " x2 = self.down1(x1)\n", - " x3 = self.down2(x2)\n", - " x4 = self.down3(x3)\n", - " x = self.up1(x4, x3)\n", - " x = self.up2(x, x2)\n", - " x = self.up3(x, x1)\n", - " x = self.outc(x)\n", - " x = torch.sigmoid(x)\n", - " return x\n", - " \n", - "model_unet = UNet()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "greater-activation", - "metadata": {}, - "outputs": [], - "source": [ - "optimizer_adam = optim.Adam(model_unet.parameters(), lr=1e-4)" - ] - }, - { - "cell_type": "markdown", - "id": "caroline-passion", - "metadata": {}, - "source": [ - "#### Register model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "handled-teens", - "metadata": {}, - "outputs": [], - "source": [ - "from copy import deepcopy\n", - "\n", - "framework_adapter = 'openfl.plugins.frameworks_adapters.pytorch_adapter.FrameworkAdapterPlugin'\n", - "MI = ModelInterface(model=model_unet, optimizer=optimizer_adam, framework_plugin=framework_adapter)\n", - "\n", - "# Save the initial model state\n", - "initial_model = deepcopy(model_unet)" - ] - }, - { - "cell_type": "markdown", - "id": "portuguese-groove", - "metadata": {}, - "source": [ - "### Define and register FL tasks" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "increasing-builder", - "metadata": {}, - "outputs": [], - "source": [ - "TI = TaskInterface()\n", - "import torch\n", - "\n", - "import tqdm\n", - "from openfl.component.aggregation_functions import Median\n", - "\n", - "# The Interactive API supports registering functions definied in main 
module or imported.\n", - "def function_defined_in_notebook(some_parameter):\n", - " print(f'Also I accept a parameter and it is {some_parameter}')\n", - "\n", - "#The Interactive API supports overriding of the aggregation function\n", - "aggregation_function = Median()\n", - "\n", - "# Task interface currently supports only standalone functions.\n", - "@TI.add_kwargs(**{'some_parameter': 42})\n", - "@TI.register_fl_task(model='unet_model', data_loader='train_loader', \\\n", - " device='device', optimizer='optimizer') \n", - "@TI.set_aggregation_function(aggregation_function)\n", - "def train(unet_model, train_loader, optimizer, device, loss_fn=soft_dice_loss, some_parameter=None):\n", - " \n", - " \"\"\" \n", - " The following constructions, that may lead to resource race\n", - " is no longer needed:\n", - " \n", - " if not torch.cuda.is_available():\n", - " device = 'cpu'\n", - " else:\n", - " device = 'cuda'\n", - " \n", - " \"\"\"\n", - "\n", - " print(f'\\n\\n TASK TRAIN GOT DEVICE {device}\\n\\n')\n", - " \n", - " function_defined_in_notebook(some_parameter)\n", - " \n", - " train_loader = tqdm.tqdm(train_loader, desc=\"train\")\n", - " \n", - " unet_model.train()\n", - " unet_model.to(device)\n", - "\n", - " losses = []\n", - "\n", - " for data, target in train_loader:\n", - " data, target = torch.tensor(data).to(device), torch.tensor(\n", - " target).to(device, dtype=torch.float32)\n", - " optimizer.zero_grad()\n", - " output = unet_model(data)\n", - " loss = loss_fn(output=output, target=target)\n", - " loss.backward()\n", - " optimizer.step()\n", - " losses.append(loss.detach().cpu().numpy())\n", - " \n", - " return {'train_loss': np.mean(losses),}\n", - "\n", - "\n", - "@TI.register_fl_task(model='unet_model', data_loader='val_loader', device='device') \n", - "def validate(unet_model, val_loader, device):\n", - " print(f'\\n\\n TASK VALIDATE GOT DEVICE {device}\\n\\n')\n", - " \n", - " unet_model.eval()\n", - " unet_model.to(device)\n", - " \n", - " val_loader = tqdm.tqdm(val_loader, desc=\"validate\")\n", - "\n", - " val_score = 0\n", - " total_samples = 0\n", - "\n", - " with torch.no_grad():\n", - " for data, target in val_loader:\n", - " samples = target.shape[0]\n", - " total_samples += samples\n", - " data, target = torch.tensor(data).to(device), \\\n", - " torch.tensor(target).to(device, dtype=torch.int64)\n", - " output = unet_model(data)\n", - " val = soft_dice_coef(output, target)\n", - " val_score += val.sum().cpu().numpy()\n", - " \n", - " return {'dice_coef': val_score / total_samples,}" - ] - }, - { - "cell_type": "markdown", - "id": "derived-bride", - "metadata": {}, - "source": [ - "## Time to start a federated learning experiment" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "mature-renewal", - "metadata": {}, - "outputs": [], - "source": [ - "# create an experimnet in federation\n", - "experiment_name = 'kvasir_test_experiment'\n", - "fl_experiment = FLExperiment(federation=federation, experiment_name=experiment_name)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "lightweight-causing", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "# If I use autoreload I got a pickling error\n", - "\n", - "# The following command zips the workspace and python requirements to be transfered to collaborator nodes\n", - "fl_experiment.start(model_provider=MI, \n", - " task_keeper=TI,\n", - " data_loader=fed_dataset,\n", - " rounds_to_train=2,\n", - " opt_treatment='CONTINUE_GLOBAL',\n", - " 
device_assignment_policy='CUDA_PREFERRED')\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f1543a36", - "metadata": {}, - "outputs": [], - "source": [ - "# If user want to stop IPython session, then reconnect and check how experiment is going \n", - "# fl_experiment.restore_experiment_state(MI)\n", - "\n", - "fl_experiment.stream_metrics()" - ] - }, - { - "cell_type": "markdown", - "id": "8c30b301", - "metadata": {}, - "source": [ - "## Now we validate the best model!" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "55acff59", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "best_model = fl_experiment.get_best_model()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9479fb7f", - "metadata": {}, - "outputs": [], - "source": [ - "# We remove exremove_experiment_datamove_experiment_datamove_experiment_datariment data from director\n", - "fl_experiment.remove_experiment_data()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "75c8aeab", - "metadata": {}, - "outputs": [], - "source": [ - "best_model.inc.conv[0].weight\n", - "# model_unet.inc.conv[0].weight" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a2acb7e6", - "metadata": {}, - "outputs": [], - "source": [ - "# Validating initial model\n", - "validate(initial_model, fed_dataset.get_valid_loader(), 'cpu')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c12ca93f", - "metadata": {}, - "outputs": [], - "source": [ - "# Validating trained model\n", - "validate(best_model, fed_dataset.get_valid_loader(), 'cpu')" - ] - }, - { - "cell_type": "markdown", - "id": "1e6734f6", - "metadata": {}, - "source": [ - "## We can tune model further!" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3940e75e", - "metadata": {}, - "outputs": [], - "source": [ - "MI = ModelInterface(model=best_model, optimizer=optimizer_adam, framework_plugin=framework_adapter)\n", - "fl_experiment.start(model_provider=MI, task_keeper=TI, data_loader=fed_dataset, rounds_to_train=4, \\\n", - " opt_treatment='CONTINUE_GLOBAL')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1bd786d2", - "metadata": {}, - "outputs": [], - "source": [ - "best_model = fl_experiment.get_best_model()\n", - "# Validating trained model\n", - "validate(best_model, fed_dataset.get_valid_loader(), 'cpu')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e00ff26c", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} From c512e335b7c0a1c4adfbcf93aef5d5c4eaf2ad6c Mon Sep 17 00:00:00 2001 From: "Davidyuk, Igor" Date: Fri, 15 Oct 2021 18:54:25 +0300 Subject: [PATCH 23/31] added plugin to setup.py and fixed envoy configs in tutorials --- .../envoy/envoy_config_one.yaml | 13 +++++++++++++ .../envoy/envoy_config_two.yaml | 13 +++++++++++++ .../envoy/shard_config_one.yaml | 4 ---- .../envoy/shard_config_two.yaml | 4 ---- .../PyTorch_TinyImageNet/envoy/envoy_config.yaml | 13 +++++++++++++ .../PyTorch_TinyImageNet/envoy/shard_config.yaml | 4 ---- .../Tensorflow_MNIST/envoy/envoy_config_one.yaml | 12 ++++++++++++ .../Tensorflow_MNIST/envoy/envoy_config_two.yaml | 12 ++++++++++++ .../Tensorflow_MNIST/envoy/shard_config_one.yaml | 3 --- .../Tensorflow_MNIST/envoy/shard_config_two.yaml | 3 --- setup.py | 3 ++- 11 files changed, 65 insertions(+), 19 deletions(-) create mode 100644 openfl-tutorials/interactive_api/PyTorch_Market_Re-ID/envoy/envoy_config_one.yaml create mode 100644 openfl-tutorials/interactive_api/PyTorch_Market_Re-ID/envoy/envoy_config_two.yaml delete mode 100644 openfl-tutorials/interactive_api/PyTorch_Market_Re-ID/envoy/shard_config_one.yaml delete mode 100644 openfl-tutorials/interactive_api/PyTorch_Market_Re-ID/envoy/shard_config_two.yaml create mode 100644 openfl-tutorials/interactive_api/PyTorch_TinyImageNet/envoy/envoy_config.yaml delete mode 100644 openfl-tutorials/interactive_api/PyTorch_TinyImageNet/envoy/shard_config.yaml create mode 100644 openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/envoy_config_one.yaml create mode 100644 openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/envoy_config_two.yaml delete mode 100644 openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/shard_config_one.yaml delete mode 100644 openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/shard_config_two.yaml diff --git a/openfl-tutorials/interactive_api/PyTorch_Market_Re-ID/envoy/envoy_config_one.yaml b/openfl-tutorials/interactive_api/PyTorch_Market_Re-ID/envoy/envoy_config_one.yaml new file mode 100644 index 0000000000..446ea261c7 --- /dev/null +++ b/openfl-tutorials/interactive_api/PyTorch_Market_Re-ID/envoy/envoy_config_one.yaml @@ -0,0 +1,13 @@ +params: + cuda_devices: [] + +optional_plugin_components: + cuda_device_monitor: + template: 
openfl.plugins.processing_units_monitor.pynvml_monitor.PynvmlCUDADeviceMonitor + settings: [] + +shard_descriptor: + template: market_shard_descriptor.MarketShardDescriptor + params: + datafolder: Market-1501-v15.09.15 + rank_worldsize: 1,2 \ No newline at end of file diff --git a/openfl-tutorials/interactive_api/PyTorch_Market_Re-ID/envoy/envoy_config_two.yaml b/openfl-tutorials/interactive_api/PyTorch_Market_Re-ID/envoy/envoy_config_two.yaml new file mode 100644 index 0000000000..8f964cb39e --- /dev/null +++ b/openfl-tutorials/interactive_api/PyTorch_Market_Re-ID/envoy/envoy_config_two.yaml @@ -0,0 +1,13 @@ +params: + cuda_devices: [] + +optional_plugin_components: + cuda_device_monitor: + template: openfl.plugins.processing_units_monitor.pynvml_monitor.PynvmlCUDADeviceMonitor + settings: [] + +shard_descriptor: + template: market_shard_descriptor.MarketShardDescriptor + params: + datafolder: Market-1501-v15.09.15 + rank_worldsize: 2,2 \ No newline at end of file diff --git a/openfl-tutorials/interactive_api/PyTorch_Market_Re-ID/envoy/shard_config_one.yaml b/openfl-tutorials/interactive_api/PyTorch_Market_Re-ID/envoy/shard_config_one.yaml deleted file mode 100644 index 68de28f6a9..0000000000 --- a/openfl-tutorials/interactive_api/PyTorch_Market_Re-ID/envoy/shard_config_one.yaml +++ /dev/null @@ -1,4 +0,0 @@ -template: market_shard_descriptor.MarketShardDescriptor -params: - datafolder: Market-1501-v15.09.15 - rank_worldsize: 1,2 \ No newline at end of file diff --git a/openfl-tutorials/interactive_api/PyTorch_Market_Re-ID/envoy/shard_config_two.yaml b/openfl-tutorials/interactive_api/PyTorch_Market_Re-ID/envoy/shard_config_two.yaml deleted file mode 100644 index 64f149c17c..0000000000 --- a/openfl-tutorials/interactive_api/PyTorch_Market_Re-ID/envoy/shard_config_two.yaml +++ /dev/null @@ -1,4 +0,0 @@ -template: market_shard_descriptor.MarketShardDescriptor -params: - datafolder: Market-1501-v15.09.15 - rank_worldsize: 2,2 \ No newline at end of file diff --git a/openfl-tutorials/interactive_api/PyTorch_TinyImageNet/envoy/envoy_config.yaml b/openfl-tutorials/interactive_api/PyTorch_TinyImageNet/envoy/envoy_config.yaml new file mode 100644 index 0000000000..850ae2b09a --- /dev/null +++ b/openfl-tutorials/interactive_api/PyTorch_TinyImageNet/envoy/envoy_config.yaml @@ -0,0 +1,13 @@ +params: + cuda_devices: [] + +optional_plugin_components: + cuda_device_monitor: + template: openfl.plugins.processing_units_monitor.pynvml_monitor.PynvmlCUDADeviceMonitor + settings: [] + +shard_descriptor: + template: tinyimagenet_shard_descriptor.TinyImageNetShardDescriptor + params: + data_folder: tinyimagenet_data + rank_worldsize: 1,2 diff --git a/openfl-tutorials/interactive_api/PyTorch_TinyImageNet/envoy/shard_config.yaml b/openfl-tutorials/interactive_api/PyTorch_TinyImageNet/envoy/shard_config.yaml deleted file mode 100644 index 5a8f930986..0000000000 --- a/openfl-tutorials/interactive_api/PyTorch_TinyImageNet/envoy/shard_config.yaml +++ /dev/null @@ -1,4 +0,0 @@ -template: tinyimagenet_shard_descriptor.TinyImageNetShardDescriptor -params: - data_folder: tinyimagenet_data - rank_worldsize: 1,2 diff --git a/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/envoy_config_one.yaml b/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/envoy_config_one.yaml new file mode 100644 index 0000000000..07bcc7c081 --- /dev/null +++ b/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/envoy_config_one.yaml @@ -0,0 +1,12 @@ +params: + cuda_devices: [] + +optional_plugin_components: + 
cuda_device_monitor: + template: openfl.plugins.processing_units_monitor.pynvml_monitor.PynvmlCUDADeviceMonitor + settings: [] + +shard_descriptor: + template: mnist_shard_descriptor.MnistShardDescriptor + params: + rank_worldsize: 1, 2 diff --git a/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/envoy_config_two.yaml b/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/envoy_config_two.yaml new file mode 100644 index 0000000000..04c643efb6 --- /dev/null +++ b/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/envoy_config_two.yaml @@ -0,0 +1,12 @@ +params: + cuda_devices: [] + +optional_plugin_components: + cuda_device_monitor: + template: openfl.plugins.processing_units_monitor.pynvml_monitor.PynvmlCUDADeviceMonitor + settings: [] + +shard_descriptor: + template: mnist_shard_descriptor.MnistShardDescriptor + params: + rank_worldsize: 2, 2 diff --git a/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/shard_config_one.yaml b/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/shard_config_one.yaml deleted file mode 100644 index bc819844c5..0000000000 --- a/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/shard_config_one.yaml +++ /dev/null @@ -1,3 +0,0 @@ -template: mnist_shard_descriptor.MnistShardDescriptor -params: - rank_worldsize: 1, 2 diff --git a/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/shard_config_two.yaml b/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/shard_config_two.yaml deleted file mode 100644 index 9e15a838cb..0000000000 --- a/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/shard_config_two.yaml +++ /dev/null @@ -1,3 +0,0 @@ -template: mnist_shard_descriptor.MnistShardDescriptor -params: - rank_worldsize: 2, 2 diff --git a/setup.py b/setup.py index 6f39021198..d170128f33 100644 --- a/setup.py +++ b/setup.py @@ -43,8 +43,9 @@ 'openfl.federated.task', 'openfl.federated.data', 'openfl.plugins', - 'openfl.plugins.interface_serializer', 'openfl.plugins.frameworks_adapters', + 'openfl.plugins.interface_serializer', + 'openfl.plugins.processing_units_monitor', 'openfl-workspace', 'openfl-docker', 'openfl-tutorials', From 3fb7616370564c215aa93d55dcfe6812815adfa3 Mon Sep 17 00:00:00 2001 From: "Davidyuk, Igor" Date: Mon, 18 Oct 2021 10:20:01 +0300 Subject: [PATCH 24/31] fixed default value for device monitor plugin --- .../envoy/envoy_config.yaml | 9 +++---- .../envoy/envoy_config_no_gpu.yaml | 5 +--- .../envoy/envoy_config_one.yaml | 5 +--- .../envoy/envoy_config_two.yaml | 5 +--- .../envoy/envoy_config.yaml | 5 +--- .../envoy/envoy_config_one.yaml | 5 +--- .../envoy/envoy_config_two.yaml | 5 +--- openfl/interface/envoy.py | 26 +++++++++++-------- .../envoy/envoy_config.yaml | 5 +--- .../tensorflow_mnist/envoy/envoy_config.yaml | 5 +--- 10 files changed, 27 insertions(+), 48 deletions(-) diff --git a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config.yaml b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config.yaml index 8ba8760e76..aae095f4e7 100644 --- a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config.yaml +++ b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config.yaml @@ -1,10 +1,10 @@ params: - cuda_devices: [1,2,5] + cuda_devices: [0,2] optional_plugin_components: - cuda_device_monitor: - template: openfl.plugins.processing_units_monitor.pynvml_monitor.PynvmlCUDADeviceMonitor - settings: [] + cuda_device_monitor: + template: openfl.plugins.processing_units_monitor.pynvml_monitor.PynvmlCUDADeviceMonitor + settings: [] 
shard_descriptor: template: kvasir_shard_descriptor.KvasirShardDescriptor @@ -12,4 +12,3 @@ shard_descriptor: data_folder: kvasir_data rank_worldsize: 1,10 enforce_image_hw: '300,400' - \ No newline at end of file diff --git a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config_no_gpu.yaml b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config_no_gpu.yaml index 598b3af4d8..1c121e534a 100644 --- a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config_no_gpu.yaml +++ b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/envoy_config_no_gpu.yaml @@ -1,10 +1,7 @@ params: cuda_devices: [] -# optional_plugin_components: -# cuda_device_monitor: -# template: openfl.plugins.processing_units_monitor.pynvml_monitor.PynvmlCUDADeviceMonitor -# settings: [] +optional_plugin_components: {} shard_descriptor: template: kvasir_shard_descriptor.KvasirShardDescriptor diff --git a/openfl-tutorials/interactive_api/PyTorch_Market_Re-ID/envoy/envoy_config_one.yaml b/openfl-tutorials/interactive_api/PyTorch_Market_Re-ID/envoy/envoy_config_one.yaml index 446ea261c7..8d05c44b0f 100644 --- a/openfl-tutorials/interactive_api/PyTorch_Market_Re-ID/envoy/envoy_config_one.yaml +++ b/openfl-tutorials/interactive_api/PyTorch_Market_Re-ID/envoy/envoy_config_one.yaml @@ -1,10 +1,7 @@ params: cuda_devices: [] -optional_plugin_components: - cuda_device_monitor: - template: openfl.plugins.processing_units_monitor.pynvml_monitor.PynvmlCUDADeviceMonitor - settings: [] +optional_plugin_components: {} shard_descriptor: template: market_shard_descriptor.MarketShardDescriptor diff --git a/openfl-tutorials/interactive_api/PyTorch_Market_Re-ID/envoy/envoy_config_two.yaml b/openfl-tutorials/interactive_api/PyTorch_Market_Re-ID/envoy/envoy_config_two.yaml index 8f964cb39e..195d09779f 100644 --- a/openfl-tutorials/interactive_api/PyTorch_Market_Re-ID/envoy/envoy_config_two.yaml +++ b/openfl-tutorials/interactive_api/PyTorch_Market_Re-ID/envoy/envoy_config_two.yaml @@ -1,10 +1,7 @@ params: cuda_devices: [] -optional_plugin_components: - cuda_device_monitor: - template: openfl.plugins.processing_units_monitor.pynvml_monitor.PynvmlCUDADeviceMonitor - settings: [] +optional_plugin_components: {} shard_descriptor: template: market_shard_descriptor.MarketShardDescriptor diff --git a/openfl-tutorials/interactive_api/PyTorch_TinyImageNet/envoy/envoy_config.yaml b/openfl-tutorials/interactive_api/PyTorch_TinyImageNet/envoy/envoy_config.yaml index 850ae2b09a..d4fc58e89c 100644 --- a/openfl-tutorials/interactive_api/PyTorch_TinyImageNet/envoy/envoy_config.yaml +++ b/openfl-tutorials/interactive_api/PyTorch_TinyImageNet/envoy/envoy_config.yaml @@ -1,10 +1,7 @@ params: cuda_devices: [] -optional_plugin_components: - cuda_device_monitor: - template: openfl.plugins.processing_units_monitor.pynvml_monitor.PynvmlCUDADeviceMonitor - settings: [] +optional_plugin_components: {} shard_descriptor: template: tinyimagenet_shard_descriptor.TinyImageNetShardDescriptor diff --git a/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/envoy_config_one.yaml b/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/envoy_config_one.yaml index 07bcc7c081..053d5ef9cb 100644 --- a/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/envoy_config_one.yaml +++ b/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/envoy_config_one.yaml @@ -1,10 +1,7 @@ params: cuda_devices: [] -optional_plugin_components: - cuda_device_monitor: - template: 
openfl.plugins.processing_units_monitor.pynvml_monitor.PynvmlCUDADeviceMonitor - settings: [] +optional_plugin_components: {} shard_descriptor: template: mnist_shard_descriptor.MnistShardDescriptor diff --git a/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/envoy_config_two.yaml b/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/envoy_config_two.yaml index 04c643efb6..b8b9685171 100644 --- a/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/envoy_config_two.yaml +++ b/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/envoy_config_two.yaml @@ -1,10 +1,7 @@ params: cuda_devices: [] -optional_plugin_components: - cuda_device_monitor: - template: openfl.plugins.processing_units_monitor.pynvml_monitor.PynvmlCUDADeviceMonitor - settings: [] +optional_plugin_components: {} shard_descriptor: template: mnist_shard_descriptor.MnistShardDescriptor diff --git a/openfl/interface/envoy.py b/openfl/interface/envoy.py index fdb8196ac5..d03d555ef4 100644 --- a/openfl/interface/envoy.py +++ b/openfl/interface/envoy.py @@ -67,17 +67,21 @@ def start_(shard_name, director_host, director_port, tls, envoy_config_path, # Parse envoy parameters envoy_params = envoy_config.get('params', {}) - for plugin_name, plugin_settings in envoy_config.get('optional_plugin_components', {}).items(): - template = plugin_settings.get('template') - if not template: - raise Exception('You should put a template' - f'for plugin {plugin_name}') - module_path, _, class_name = template.rpartition('.') - plugin_params = plugin_settings.get('params', {}) - - module = import_module(module_path) - instance = getattr(module, class_name)(**plugin_params) - envoy_params[plugin_name] = instance + + # Build optional plugin components + optional_plugins_section = envoy_config.get('optional_plugin_components', None) + if optional_plugins_section is not None: + for plugin_name, plugin_settings in optional_plugins_section.items(): + template = plugin_settings.get('template') + if not template: + raise Exception('You should put a template' + f'for plugin {plugin_name}') + module_path, _, class_name = template.rpartition('.') + plugin_params = plugin_settings.get('params', {}) + + module = import_module(module_path) + instance = getattr(module, class_name)(**plugin_params) + envoy_params[plugin_name] = instance # Instantiate Shard Descriptor shard_descriptor = shard_descriptor_from_config(envoy_config.get('shard_descriptor', {})) diff --git a/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/envoy/envoy_config.yaml b/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/envoy/envoy_config.yaml index 98e5039bf9..98c1d6721b 100644 --- a/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/envoy/envoy_config.yaml +++ b/tests/github/interactive_api_director/experiments/pytorch_kvasir_unet/envoy/envoy_config.yaml @@ -1,10 +1,7 @@ params: cuda_devices: [] -optional_plugin_components: - cuda_device_monitor: - template: openfl.plugins.processing_units_monitor.pynvml_monitor.PynvmlCUDADeviceMonitor - settings: [] +optional_plugin_components: {} shard_descriptor: template: kvasir_shard_descriptor.KvasirShardDescriptor diff --git a/tests/github/interactive_api_director/experiments/tensorflow_mnist/envoy/envoy_config.yaml b/tests/github/interactive_api_director/experiments/tensorflow_mnist/envoy/envoy_config.yaml index 413f439714..42c5c37031 100644 --- a/tests/github/interactive_api_director/experiments/tensorflow_mnist/envoy/envoy_config.yaml +++ 
b/tests/github/interactive_api_director/experiments/tensorflow_mnist/envoy/envoy_config.yaml @@ -1,10 +1,7 @@ params: cuda_devices: [] -optional_plugin_components: - cuda_device_monitor: - template: openfl.plugins.processing_units_monitor.pynvml_monitor.PynvmlCUDADeviceMonitor - settings: [] +optional_plugin_components: {} shard_descriptor: template: shard_descriptor.MNISTShardDescriptor From 2bee1f530eb40d4774bf4efc5b744091ed2b016b Mon Sep 17 00:00:00 2001 From: "Davidyuk, Igor" Date: Mon, 18 Oct 2021 12:18:35 +0300 Subject: [PATCH 25/31] fixed tensorflow test --- setup.py | 7 +++---- .../experiments/tensorflow_mnist/director/config.yaml | 2 +- .../tensorflow_mnist/envoy/shard_descriptor.py | 8 ++++---- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/setup.py b/setup.py index d170128f33..42c31ccfde 100644 --- a/setup.py +++ b/setup.py @@ -58,8 +58,8 @@ 'numpy', 'pandas', 'protobuf', - 'grpcio~=1.34.0', - 'grpcio-tools~=1.34.0', + 'grpcio>=1.34.0', + 'grpcio-tools>=1.34.0', 'rich==9.1.0', 'tqdm', 'scikit-learn', @@ -71,8 +71,7 @@ 'cloudpickle', 'tensorboardX', 'tensorboard', - 'requests', - 'nvidia-ml-py3' + 'requests' ], python_requires='>=3.6, <3.9', project_urls={ diff --git a/tests/github/interactive_api_director/experiments/tensorflow_mnist/director/config.yaml b/tests/github/interactive_api_director/experiments/tensorflow_mnist/director/config.yaml index 7211f71161..f3efd5ccfc 100644 --- a/tests/github/interactive_api_director/experiments/tensorflow_mnist/director/config.yaml +++ b/tests/github/interactive_api_director/experiments/tensorflow_mnist/director/config.yaml @@ -8,4 +8,4 @@ settings: sample_shape: ['784'] - target_shape: [] \ No newline at end of file + target_shape: ['0'] \ No newline at end of file diff --git a/tests/github/interactive_api_director/experiments/tensorflow_mnist/envoy/shard_descriptor.py b/tests/github/interactive_api_director/experiments/tensorflow_mnist/envoy/shard_descriptor.py index e8b4ad8758..764ba7048d 100644 --- a/tests/github/interactive_api_director/experiments/tensorflow_mnist/envoy/shard_descriptor.py +++ b/tests/github/interactive_api_director/experiments/tensorflow_mnist/envoy/shard_descriptor.py @@ -14,7 +14,7 @@ class MNISTShardDescriptor(ShardDescriptor): def __init__(self, rank_worldsize: str = '1,1') -> None: """Initialize KvasirShardDescriptor.""" super().__init__() - + (x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data() x_train = np.reshape(x_train, (-1, 784)) x_test = np.reshape(x_test, (-1, 784)) @@ -28,11 +28,11 @@ def __init__(self, rank_worldsize: str = '1,1') -> None: # Calculating data and target shapes - sample, target = self[0] + sample, _ = self[0] self._sample_shape = [str(dim) for dim in sample.shape] - self._target_shape = [str(dim) for dim in target.shape] + self._target_shape = ['0'] + - def __getitem__(self, index): """Return a item by the index.""" if index < len(self.X_train): From ef2321e1d1f07471581505800817ff529aad6357 Mon Sep 17 00:00:00 2001 From: "Davidyuk, Igor" Date: Mon, 18 Oct 2021 16:29:53 +0300 Subject: [PATCH 26/31] fix rebase --- openfl/interface/envoy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openfl/interface/envoy.py b/openfl/interface/envoy.py index d03d555ef4..6bf7da7d54 100644 --- a/openfl/interface/envoy.py +++ b/openfl/interface/envoy.py @@ -51,7 +51,7 @@ def start_(shard_name, director_host, director_port, tls, envoy_config_path, root_certificate, private_key, certificate): """Start the Envoy.""" logger.info('🧿 Starting the Envoy.') - if 
is_directory_traversal(shard_config_path): + if is_directory_traversal(envoy_config_path): click.echo('The shard config path is out of the openfl workspace scope.') sys.exit(1) # Read the Envoy config From 569405afd87c581fa85c8e75492c547c2830d8d7 Mon Sep 17 00:00:00 2001 From: "Davidyuk, Igor" Date: Tue, 19 Oct 2021 09:59:36 +0300 Subject: [PATCH 27/31] initialized docks --- docs/source/openfl/plugins.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/source/openfl/plugins.rst b/docs/source/openfl/plugins.rst index 2bc06bac45..7735edeb7c 100644 --- a/docs/source/openfl/plugins.rst +++ b/docs/source/openfl/plugins.rst @@ -71,3 +71,10 @@ As well as :code:`restore_object` that will load previously serialized object fr @staticmethod def restore_object(filename: str): + + +.. _device_monitor_plugin: + +CUDA Device Monitor plugin +###################### + From 482ba17a0fcc3e9fc6cab0dfa4979bc3c2b2535d Mon Sep 17 00:00:00 2001 From: "Davidyuk, Igor" Date: Tue, 19 Oct 2021 13:02:45 +0300 Subject: [PATCH 28/31] update docs --- docs/source/openfl/plugins.rst | 39 +++++++++++++++++-- .../workflow/director_based_workflow.rst | 16 ++++++++ 2 files changed, 51 insertions(+), 4 deletions(-) diff --git a/docs/source/openfl/plugins.rst b/docs/source/openfl/plugins.rst index 7735edeb7c..c7f08ac7d3 100644 --- a/docs/source/openfl/plugins.rst +++ b/docs/source/openfl/plugins.rst @@ -10,11 +10,17 @@ framework_adapter_ serializer_plugin_ + device_monitor_plugin_ |productName| is designed to be a flexible and extensible framework. Plugins are interchangeable parts of -|productName| components. Different plugins support varying usage scenarios. |productName| users are free to provide -their implementations of |productName| plugins to support desired behavior. +|productName| components. +A plugin may be :code:`required` or :code:`optional`. |productName| can run without optional plugins. +|productName| users are free to provide +their implementations of |productName| plugins to achieve a desired behavior. +Technically, a plugin is just a class, that satisfies a certain interface. One may enable a plugin by putting its +import path and initialization parameters to the config file of a corresponding |productName| component +or to the frontend Python API. Please refer to openfl-tutorials for more information. .. _framework_adapter: @@ -22,6 +28,7 @@ Framework Adapter ###################### Framework Adapter plugins enable |productName| support for Deep Learning frameworks usage in FL experiments. +It is a required plugin for the frontend API component and Envoy. All the framework-specific operations on model weights are isolated in this plugin so |productName| can be framework-agnostic. The Framework adapter plugin interface is simple: there are two required methods to load and extract tensors from a model and an optimizer. @@ -57,7 +64,7 @@ Experiment Serializer Serializer plugins are used on the Frontend API to serialize the Experiment components and then on Envoys to deserialize them back. Currently, the default serializer is based on pickling. - +It is a required plugin. A Serializer plugin must implement :code:`serialize` method that creates a python object's representation on disk. .. code-block:: python @@ -75,6 +82,30 @@ As well as :code:`restore_object` that will load previously serialized object fr .. 
_device_monitor_plugin: -CUDA Device Monitor plugin +CUDA Device Monitor ###################### +The CUDA Device Monitor plugin is an optional plugin for Envoy that can gather status information about GPU devices. +This information may be used by the Envoy and included in a health check message that is sent to the Director. +Thus the CUDA device statuses are visible to frontend users, who may query this Envoy registry information from the Director. + +A CUDA Device Monitor plugin must implement the following interface: + +.. code-block:: python + + class CUDADeviceMonitor: + + def get_driver_version(self) -> str: + ... + + def get_device_memory_total(self, index: int) -> int: + ... + + def get_device_memory_utilized(self, index: int) -> int: + ... + + def get_device_utilization(self, index: int) -> str: + """Return a general device utilization string that may be shown to the frontend user.""" + ... + + diff --git a/docs/source/workflow/director_based_workflow.rst b/docs/source/workflow/director_based_workflow.rst index 3b7c2cd098..58d1c12051 100644 --- a/docs/source/workflow/director_based_workflow.rst +++ b/docs/source/workflow/director_based_workflow.rst @@ -345,6 +345,22 @@ This method: * Compresses the whole workspace to an archive. * Sends the experiment archive to the Director so it may distribute the archive across the Federation and start the *Aggregator*. +FLExperiment's :code:`start()` method parameters +------------------------------------------------- + +* :code:`model_provider` - the :code:`ModelInterface` object defined earlier +* :code:`task_keeper` - the :code:`TaskInterface` object defined earlier +* :code:`data_loader` - the :code:`DataInterface` object defined earlier +* :code:`rounds_to_train` - the number of aggregation rounds to conduct before the experiment is considered finished +* :code:`delta_updates` - use calculated gradients instead of model checkpoints for aggregation +* :code:`opt_treatment` - optimizer state treatment in the federation. Possible values: 'RESET' - the optimizer state +  is initialized from noise each round; 'CONTINUE_LOCAL' - the optimizer state is reused locally by every collaborator; +  'CONTINUE_GLOBAL' - the optimizer state is aggregated. +* :code:`device_assignment_policy` - this setting may be 'CPU_ONLY' or 'CUDA_PREFERRED'. In the first case, the :code:`device` +  parameter (which is part of the task contract) passed to an FL task each round will be 'cpu'. With +  :code:`device_assignment_policy='CUDA_PREFERRED'`, the :code:`device` parameter will be 'cuda:{index}' if CUDA devices +  are enabled in the Envoy config, and 'cpu' otherwise.
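+
+As a minimal sketch, a call combining these parameters may look as follows. Here :code:`MI`, :code:`TI` and
+:code:`fed_dataset` stand for the :code:`ModelInterface`, :code:`TaskInterface` and :code:`DataInterface`
+objects defined earlier in this workflow, and the values shown are illustrative only:
+
+.. code-block:: python
+
+    fl_experiment.start(model_provider=MI,
+                        task_keeper=TI,
+                        data_loader=fed_dataset,
+                        rounds_to_train=2,
+                        opt_treatment='CONTINUE_GLOBAL',
+                        device_assignment_policy='CUDA_PREFERRED')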
+ Observing the Experiment execution ---------------------------------- From 509e014d3d3cdd7a3a4f5d4e1f3d21dc69c8d5b0 Mon Sep 17 00:00:00 2001 From: igor-davidyuk Date: Fri, 29 Oct 2021 09:27:50 +0300 Subject: [PATCH 29/31] restore setup.py content --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 42c31ccfde..57d7186838 100644 --- a/setup.py +++ b/setup.py @@ -58,8 +58,8 @@ 'numpy', 'pandas', 'protobuf', - 'grpcio>=1.34.0', - 'grpcio-tools>=1.34.0', + 'grpcio~=1.34.0', + 'grpcio-tools~=1.34.0', 'rich==9.1.0', 'tqdm', 'scikit-learn', From 56cab83d801073e763a6189ca1af6ca049b52dae Mon Sep 17 00:00:00 2001 From: igor-davidyuk Date: Fri, 29 Oct 2021 09:32:25 +0300 Subject: [PATCH 30/31] shard-config -> envoy-config renaming --- docs/source/workflow/director_based_workflow.rst | 4 ++-- .../PyTorch_Kvasir_UNet/envoy/start_envoy_with_tls.sh | 2 +- .../envoy/start_envoy_with_tls.sh | 2 +- .../PyTorch_TinyImageNet/envoy/start_envoy.sh | 2 +- .../envoy/start_envoy_with_tls.sh | 2 +- .../Tensorflow_MNIST/envoy/start_envoy.sh | 2 +- .../Tensorflow_MNIST/envoy/start_envoy_with_tls.sh | 2 +- .../envoy/envoy_config_one.yaml | 11 +++++++++++ .../envoy/envoy_config_three.yaml | 11 +++++++++++ .../envoy/envoy_config_two.yaml | 11 +++++++++++ .../envoy/shard_config_one.yaml | 5 ----- .../envoy/shard_config_three.yaml | 5 ----- .../envoy/shard_config_two.yaml | 5 ----- .../Tensorflow_Word_Prediction/envoy/start_envoy.sh | 2 +- .../envoy/start_envoy_with_tls.sh | 2 +- 15 files changed, 43 insertions(+), 25 deletions(-) create mode 100644 openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/envoy/envoy_config_one.yaml create mode 100644 openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/envoy/envoy_config_three.yaml create mode 100644 openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/envoy/envoy_config_two.yaml delete mode 100644 openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/envoy/shard_config_one.yaml delete mode 100644 openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/envoy/shard_config_three.yaml delete mode 100644 openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/envoy/shard_config_two.yaml diff --git a/docs/source/workflow/director_based_workflow.rst b/docs/source/workflow/director_based_workflow.rst index 58d1c12051..504bf992f3 100644 --- a/docs/source/workflow/director_based_workflow.rst +++ b/docs/source/workflow/director_based_workflow.rst @@ -118,7 +118,7 @@ To start the Envoy without mTLS use the following CLI command: .. 
code-block:: console $ fx envoy start -n env_one --disable-tls \ - --shard-config-path shard_config.yaml -d director_fqdn:port + --envoy-config-path envoy_config.yaml -d director_fqdn:port Alternatively, use the following command to establish a secured connection: @@ -127,7 +127,7 @@ Alternatively, use the following command to establish a secured connection: $ ENVOY_NAME=envoy_example_name $ fx envoy start -n "$ENVOY_NAME" \ - --shard-config-path shard_config.yaml \ + --envoy-config-path envoy_config.yaml \ -d director_fqdn:port -rc cert/root_ca.crt \ -pk cert/"$ENVOY_NAME".key -oc cert/"$ENVOY_NAME".crt diff --git a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/start_envoy_with_tls.sh b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/start_envoy_with_tls.sh index 07a8353c4a..97e3f4d893 100755 --- a/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/start_envoy_with_tls.sh +++ b/openfl-tutorials/interactive_api/PyTorch_Kvasir_UNet/envoy/start_envoy_with_tls.sh @@ -3,4 +3,4 @@ set -e ENVOY_NAME=$1 DIRECTOR_FQDN=$2 -fx envoy start -n "$ENVOY_NAME" --shard-config-path shard_config.yaml -dh "$DIRECTOR_FQDN" -dp 50050 -rc cert/root_ca.crt -pk cert/"$ENVOY_NAME".key -oc cert/"$ENVOY_NAME".crt +fx envoy start -n "$ENVOY_NAME" --envoy-config-path envoy_config.yaml -dh "$DIRECTOR_FQDN" -dp 50050 -rc cert/root_ca.crt -pk cert/"$ENVOY_NAME".key -oc cert/"$ENVOY_NAME".crt diff --git a/openfl-tutorials/interactive_api/PyTorch_Market_Re-ID/envoy/start_envoy_with_tls.sh b/openfl-tutorials/interactive_api/PyTorch_Market_Re-ID/envoy/start_envoy_with_tls.sh index 295f61b101..873ebcb3d6 100755 --- a/openfl-tutorials/interactive_api/PyTorch_Market_Re-ID/envoy/start_envoy_with_tls.sh +++ b/openfl-tutorials/interactive_api/PyTorch_Market_Re-ID/envoy/start_envoy_with_tls.sh @@ -3,4 +3,4 @@ set -e ENVOY_NAME=$1 DIRECTOR_FQDN=$2 -fx envoy start -n "$ENVOY_NAME" --shard-config-path shard_config.yaml -d "$DIRECTOR_FQDN":50051 -rc cert/root_ca.crt -pk cert/"$ENVOY_NAME".key -oc cert/"$ENVOY_NAME".crt \ No newline at end of file +fx envoy start -n "$ENVOY_NAME" --envoy-config-path envoy_config.yaml -d "$DIRECTOR_FQDN":50051 -rc cert/root_ca.crt -pk cert/"$ENVOY_NAME".key -oc cert/"$ENVOY_NAME".crt \ No newline at end of file diff --git a/openfl-tutorials/interactive_api/PyTorch_TinyImageNet/envoy/start_envoy.sh b/openfl-tutorials/interactive_api/PyTorch_TinyImageNet/envoy/start_envoy.sh index 222d3988e0..1dd6591439 100644 --- a/openfl-tutorials/interactive_api/PyTorch_TinyImageNet/envoy/start_envoy.sh +++ b/openfl-tutorials/interactive_api/PyTorch_TinyImageNet/envoy/start_envoy.sh @@ -1,4 +1,4 @@ #!/bin/bash set -e -fx envoy start -n env_one --disable-tls --shard-config-path shard_config.yaml -dh localhost -dp 50051 \ No newline at end of file +fx envoy start -n env_one --disable-tls --envoy-config-path envoy_config.yaml -dh localhost -dp 50051 \ No newline at end of file diff --git a/openfl-tutorials/interactive_api/PyTorch_TinyImageNet/envoy/start_envoy_with_tls.sh b/openfl-tutorials/interactive_api/PyTorch_TinyImageNet/envoy/start_envoy_with_tls.sh index 4f6375af62..06b2916a4f 100644 --- a/openfl-tutorials/interactive_api/PyTorch_TinyImageNet/envoy/start_envoy_with_tls.sh +++ b/openfl-tutorials/interactive_api/PyTorch_TinyImageNet/envoy/start_envoy_with_tls.sh @@ -3,4 +3,4 @@ set -e ENVOY_NAME=$1 DIRECTOR_FQDN=$2 -fx envoy start -n "$ENVOY_NAME" --shard-config-path shard_config.yaml -dh "$DIRECTOR_FQDN" -dp 50051 -rc cert/root_ca.crt -pk cert/"$ENVOY_NAME".key -oc 
cert/"$ENVOY_NAME".crt \ No newline at end of file +fx envoy start -n "$ENVOY_NAME" --envoy-config-path envoy_config.yaml -dh "$DIRECTOR_FQDN" -dp 50051 -rc cert/root_ca.crt -pk cert/"$ENVOY_NAME".key -oc cert/"$ENVOY_NAME".crt \ No newline at end of file diff --git a/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/start_envoy.sh b/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/start_envoy.sh index 962bd19daa..72a15413ed 100755 --- a/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/start_envoy.sh +++ b/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/start_envoy.sh @@ -3,4 +3,4 @@ set -e ENVOY_NAME=$1 SHARD_CONF=$2 -fx envoy start -n "$ENVOY_NAME" --disable-tls --shard-config-path "$SHARD_CONF" -dh localhost -dp 50051 +fx envoy start -n "$ENVOY_NAME" --disable-tls --envoy-config-path "$SHARD_CONF" -dh localhost -dp 50051 diff --git a/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/start_envoy_with_tls.sh b/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/start_envoy_with_tls.sh index d2f695f414..2f4b10144e 100755 --- a/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/start_envoy_with_tls.sh +++ b/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/start_envoy_with_tls.sh @@ -4,4 +4,4 @@ ENVOY_NAME=$1 SHARD_CONF=$2 DIRECTOR_FQDN=$3 -fx envoy start -n "$ENVOY_NAME" --shard-config-path "$SHARD_CONF" -dh "$DIRECTOR_FQDN" -dp 50051 -rc cert/root_ca.crt -pk cert/"$ENVOY_NAME".key -oc cert/"$ENVOY_NAME".crt +fx envoy start -n "$ENVOY_NAME" --envoy-config-path "$SHARD_CONF" -dh "$DIRECTOR_FQDN" -dp 50051 -rc cert/root_ca.crt -pk cert/"$ENVOY_NAME".key -oc cert/"$ENVOY_NAME".crt diff --git a/openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/envoy/envoy_config_one.yaml b/openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/envoy/envoy_config_one.yaml new file mode 100644 index 0000000000..3db7be6176 --- /dev/null +++ b/openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/envoy/envoy_config_one.yaml @@ -0,0 +1,11 @@ +# https://www.gutenberg.org/files/36668/36668-h/36668-h.htm +params: + cuda_devices: [] + +optional_plugin_components: {} + +shard_descriptor: + template: shard_descriptor.NextWordShardDescriptor + params: + title: Polish Fairy Tales + author: A. J. Gliński \ No newline at end of file diff --git a/openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/envoy/envoy_config_three.yaml b/openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/envoy/envoy_config_three.yaml new file mode 100644 index 0000000000..a8ebdd7b3d --- /dev/null +++ b/openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/envoy/envoy_config_three.yaml @@ -0,0 +1,11 @@ +# https://www.gutenberg.org/files/4357/4357-h/4357-h.htm +params: + cuda_devices: [] + +optional_plugin_components: {} + +shard_descriptor: + template: shard_descriptor.NextWordShardDescriptor + params: + title: American Fairy Tales + author: L. 
FRANK BAUM \ No newline at end of file diff --git a/openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/envoy/envoy_config_two.yaml b/openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/envoy/envoy_config_two.yaml new file mode 100644 index 0000000000..fbdf200ee5 --- /dev/null +++ b/openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/envoy/envoy_config_two.yaml @@ -0,0 +1,11 @@ +# https://www.gutenberg.org/cache/epub/7439/pg7439-images.html +params: + cuda_devices: [] + +optional_plugin_components: {} + +shard_descriptor: + template: shard_descriptor.NextWordShardDescriptor + params: + title: English Fairy Tales + author: Joseph Jacobs \ No newline at end of file diff --git a/openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/envoy/shard_config_one.yaml b/openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/envoy/shard_config_one.yaml deleted file mode 100644 index 5fb67c824c..0000000000 --- a/openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/envoy/shard_config_one.yaml +++ /dev/null @@ -1,5 +0,0 @@ -# https://www.gutenberg.org/files/36668/36668-h/36668-h.htm -template: shard_descriptor.NextWordShardDescriptor -params: - title: Polish Fairy Tales - author: A. J. Gliński \ No newline at end of file diff --git a/openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/envoy/shard_config_three.yaml b/openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/envoy/shard_config_three.yaml deleted file mode 100644 index a72b861201..0000000000 --- a/openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/envoy/shard_config_three.yaml +++ /dev/null @@ -1,5 +0,0 @@ -# https://www.gutenberg.org/files/4357/4357-h/4357-h.htm -template: shard_descriptor.NextWordShardDescriptor -params: - title: American Fairy Tales - author: L. 
FRANK BAUM \ No newline at end of file diff --git a/openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/envoy/shard_config_two.yaml b/openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/envoy/shard_config_two.yaml deleted file mode 100644 index b401d47b37..0000000000 --- a/openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/envoy/shard_config_two.yaml +++ /dev/null @@ -1,5 +0,0 @@ -# https://www.gutenberg.org/cache/epub/7439/pg7439-images.html -template: shard_descriptor.NextWordShardDescriptor -params: - title: English Fairy Tales - author: Joseph Jacobs \ No newline at end of file diff --git a/openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/envoy/start_envoy.sh b/openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/envoy/start_envoy.sh index eeab8b8be5..3ddcc4932a 100755 --- a/openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/envoy/start_envoy.sh +++ b/openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/envoy/start_envoy.sh @@ -1,4 +1,4 @@ #!/bin/bash set -e -fx envoy start -n env_one --disable-tls -dh localhost -dp 50051 -sc shard_config_one.yaml \ No newline at end of file +fx envoy start -n env_one --disable-tls -dh localhost -dp 50051 -ec envoy_config_one.yaml \ No newline at end of file diff --git a/openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/envoy/start_envoy_with_tls.sh b/openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/envoy/start_envoy_with_tls.sh index 295f61b101..873ebcb3d6 100755 --- a/openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/envoy/start_envoy_with_tls.sh +++ b/openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/envoy/start_envoy_with_tls.sh @@ -3,4 +3,4 @@ set -e ENVOY_NAME=$1 DIRECTOR_FQDN=$2 -fx envoy start -n "$ENVOY_NAME" --shard-config-path shard_config.yaml -d "$DIRECTOR_FQDN":50051 -rc cert/root_ca.crt -pk cert/"$ENVOY_NAME".key -oc cert/"$ENVOY_NAME".crt \ No newline at end of file +fx envoy start -n "$ENVOY_NAME" --envoy-config-path envoy_config.yaml -d "$DIRECTOR_FQDN":50051 -rc cert/root_ca.crt -pk cert/"$ENVOY_NAME".key -oc cert/"$ENVOY_NAME".crt \ No newline at end of file From f217a5a1b3eca89b186674c2d018afbb31842333 Mon Sep 17 00:00:00 2001 From: igor-davidyuk Date: Fri, 29 Oct 2021 09:44:47 +0300 Subject: [PATCH 31/31] more renamings: shard_config -> envoy_config --- .../PyTorch_Market_Re-ID/envoy/start_envoy.sh | 2 +- .../interactive_api/Tensorflow_MNIST/README.md | 4 ++-- .../interactive_api/Tensorflow_MNIST/envoy/start_envoy.sh | 4 ++-- .../Tensorflow_MNIST/envoy/start_envoy_with_tls.sh | 4 ++-- .../interactive_api/Tensorflow_Word_Prediction/README.md | 4 ++-- .../github/interactive_api_director/experiment_runner.py | 8 ++++---- 6 files changed, 13 insertions(+), 13 deletions(-) diff --git a/openfl-tutorials/interactive_api/PyTorch_Market_Re-ID/envoy/start_envoy.sh b/openfl-tutorials/interactive_api/PyTorch_Market_Re-ID/envoy/start_envoy.sh index ef23ba6261..e28c747843 100755 --- a/openfl-tutorials/interactive_api/PyTorch_Market_Re-ID/envoy/start_envoy.sh +++ b/openfl-tutorials/interactive_api/PyTorch_Market_Re-ID/envoy/start_envoy.sh @@ -1,4 +1,4 @@ #!/bin/bash set -e -fx envoy start -n env_one --disable-tls -dh localhost -dp 50051 -sc shard_config_one.yaml +fx envoy start -n env_one --disable-tls -dh localhost -dp 50051 -ec envoy_config_one.yaml diff --git a/openfl-tutorials/interactive_api/Tensorflow_MNIST/README.md b/openfl-tutorials/interactive_api/Tensorflow_MNIST/README.md index 209da5b944..1704d9e287 100644 --- 
a/openfl-tutorials/interactive_api/Tensorflow_MNIST/README.md +++ b/openfl-tutorials/interactive_api/Tensorflow_MNIST/README.md @@ -22,13 +22,13 @@ cd director_folder 2. Run envoy: ```sh cd envoy_folder -./start_envoy.sh env_one shard_config_one.yaml +./start_envoy.sh env_one envoy_config_one.yaml ``` Optional: start second envoy: - Copy `envoy_folder` to another place and run from there: ```sh -./start_envoy.sh env_two shard_config_two.yaml +./start_envoy.sh env_two envoy_config_two.yaml ``` 3. Run `Mnist_Classification_FL.ipybnb` jupyter notebook: diff --git a/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/start_envoy.sh b/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/start_envoy.sh index 72a15413ed..cdd84e7fb6 100755 --- a/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/start_envoy.sh +++ b/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/start_envoy.sh @@ -1,6 +1,6 @@ #!/bin/bash set -e ENVOY_NAME=$1 -SHARD_CONF=$2 +ENVOY_CONF=$2 -fx envoy start -n "$ENVOY_NAME" --disable-tls --envoy-config-path "$SHARD_CONF" -dh localhost -dp 50051 +fx envoy start -n "$ENVOY_NAME" --disable-tls --envoy-config-path "$ENVOY_CONF" -dh localhost -dp 50051 diff --git a/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/start_envoy_with_tls.sh b/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/start_envoy_with_tls.sh index 2f4b10144e..2585cc9a01 100755 --- a/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/start_envoy_with_tls.sh +++ b/openfl-tutorials/interactive_api/Tensorflow_MNIST/envoy/start_envoy_with_tls.sh @@ -1,7 +1,7 @@ #!/bin/bash set -e ENVOY_NAME=$1 -SHARD_CONF=$2 +ENVOY_CONF=$2 DIRECTOR_FQDN=$3 -fx envoy start -n "$ENVOY_NAME" --envoy-config-path "$SHARD_CONF" -dh "$DIRECTOR_FQDN" -dp 50051 -rc cert/root_ca.crt -pk cert/"$ENVOY_NAME".key -oc cert/"$ENVOY_NAME".crt +fx envoy start -n "$ENVOY_NAME" --envoy-config-path "$ENVOY_CONF" -dh "$DIRECTOR_FQDN" -dp 50051 -rc cert/root_ca.crt -pk cert/"$ENVOY_NAME".key -oc cert/"$ENVOY_NAME".crt diff --git a/openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/README.md b/openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/README.md index 037ac6def0..02c7b33085 100644 --- a/openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/README.md +++ b/openfl-tutorials/interactive_api/Tensorflow_Word_Prediction/README.md @@ -17,10 +17,10 @@ https://github.com/intel/openfl/issues/185 ## To run experiment: 1. Create a folder for each envoy (they can be subfolders of `envoy` for simulation purposes or folders on different machines in a real-life setting), in our case we should create three folders. -2. Put a relevant `shard_config` in each of the three folders and copy other files from `envoy` there as well. +2. Put a relevant `envoy_config` in each of the three folders and copy other files from `envoy` there as well. 3. Modify the `start_envoy` accordingly: 1. change `env_one` to `env_two`, `env_three` (or any unique envoy names you like) - 2. `shard_config_one.yaml` to `shard_config_two.yaml` and `shard_config_three.yaml`. + 2. `envoy_config_one.yaml` to `envoy_config_two.yaml` and `envoy_config_three.yaml`. 4. Install requirements for each envoy: `pip install -r sd_requirements.txt` 5. Run the director: execute `start_director.sh` in `director`. 6. Run the envoys: execute `start_envoy.sh` in each envoy folder. 
diff --git a/tests/github/interactive_api_director/experiment_runner.py b/tests/github/interactive_api_director/experiment_runner.py index f3685e018e..3a4036f9df 100644 --- a/tests/github/interactive_api_director/experiment_runner.py +++ b/tests/github/interactive_api_director/experiment_runner.py @@ -72,7 +72,7 @@ def create_director(director_path, recreate, config): shutil.copy(config, director_path) -def create_envoy(col_path, recreate, shard_config, shard_descriptor): +def create_envoy(col_path, recreate, envoy_config, shard_descriptor): logger.info(f'Creating the envoy in {col_path}!') if os.path.exists(col_path): if not recreate: @@ -82,7 +82,7 @@ def create_envoy(col_path, recreate, shard_config, shard_descriptor): f'fx envoy create-workspace -p {col_path}', shell=True ).wait() - shutil.copy(shard_config, col_path) + shutil.copy(envoy_config, col_path) shutil.copy(shard_descriptor, col_path) @@ -90,14 +90,14 @@ def create_federation( director_path: str, collaborator_paths: typing.Iterable[str], director_config, - shard_config, + envoy_config, shard_descriptor, recreate=False ): logger.info('Creating the federation!') create_director(director_path, recreate, director_config) for col_path in collaborator_paths: - create_envoy(col_path, recreate, shard_config, shard_descriptor) + create_envoy(col_path, recreate, envoy_config, shard_descriptor) # TODO: create mTLS logger.info('Federation was created')
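
The experiment_runner change above is a pure rename: the helper's `shard_config` parameter becomes `envoy_config`, so any driver code that passes the old keyword needs the same update. A minimal usage sketch under that assumption follows; the workspace folder names and file names are illustrative placeholders, not values taken from the patches.

    # Hypothetical driver for the renamed helper in
    # tests/github/interactive_api_director/experiment_runner.py.
    # All paths below are placeholders for a local simulation run.
    from tests.github.interactive_api_director.experiment_runner import create_federation

    create_federation(
        director_path='director',                       # Director workspace to create
        collaborator_paths=['envoy_one', 'envoy_two'],  # one workspace per Envoy
        director_config='director_config.yaml',
        envoy_config='envoy_config.yaml',               # renamed from shard_config
        shard_descriptor='shard_descriptor.py',
        recreate=True,                                   # recreate workspaces if they already exist
    )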