Move use_gpu from ClassyTrainer to ClassificationTask

vreis · facebook-github-bot · commit 80b1df6fdddb · 2020-04-07T14:57:22.000-07:00
Summary:
This is the first in a series of diffs to eliminate the ClassyTrainer abstraction. The only reason ClassyTrainer existed was to support elastic training, but PET v0.2 does not require changing our training loop. The plan is to move all attributes from ClassyTrainer into ClassificationTask.

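Start by moving use_gpu to the task. The task now defaults use_gpu to torch.cuda.is_available(); it can be overridden with the "use_gpu" config key or by calling set_use_gpu(). As a consequence, the --device command-line argument is removed, and prepare(), step(), train_step() and eval_step() no longer take a use_gpu argument.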

Differential Revision: D20801017

fbshipit-source-id: 9fd3322a4503498a969c2bdfa7301c8c99a8f790
diff --git a/classy_train.py b/classy_train.py
@@ -93,18 +93,13 @@ def main(args, config):
     # Configure hooks to do tensorboard logging, checkpoints and so on
     task.set_hooks(configure_hooks(args, config))
 
-    use_gpu = None
-    if args.device is not None:
-        use_gpu = args.device == "gpu"
-        assert torch.cuda.is_available() or not use_gpu, "CUDA is unavailable"
-
     # LocalTrainer is used for a single node. DistributedTrainer will setup
     # training to use PyTorch's DistributedDataParallel.
     trainer_class = {"none": LocalTrainer, "ddp": DistributedTrainer}[
         args.distributed_backend
     ]
 
-    trainer = trainer_class(use_gpu=use_gpu, num_dataloader_workers=args.num_workers)
+    trainer = trainer_class(num_dataloader_workers=args.num_workers)
 
     logging.info(
         f"Starting training on rank {get_rank()} worker. "
diff --git a/classy_vision/generic/opts.py b/classy_vision/generic/opts.py
@@ -18,12 +18,6 @@ def add_generic_args(parser):
     parser.add_argument(
         "--config_file", type=str, help="path to config file for model", required=True
     )
-    parser.add_argument(
-        "--device",
-        default=None,
-        type=str,
-        help="device to use: either 'cpu' or 'gpu'. If unspecified, will use GPU when available and CPU otherwise.",
-    )
     parser.add_argument(
         "--num_workers",
         default=4,
@@ -145,13 +139,6 @@ def check_generic_args(args):
     # check types and values:
     assert is_pos_int(args.num_workers), "incorrect number of workers"
     assert is_pos_int(args.visdom_port), "incorrect visdom port"
-    assert (
-        args.device is None or args.device == "cpu" or args.device == "gpu"
-    ), "unknown device"
-
-    # check that CUDA is available:
-    if args.device == "gpu":
-        assert torch.cuda.is_available(), "CUDA required to train on GPUs"
 
     # create checkpoint folder if it does not exist:
     if args.checkpoint_folder != "" and not os.path.exists(args.checkpoint_folder):
diff --git a/classy_vision/tasks/classification_task.py b/classy_vision/tasks/classification_task.py
@@ -142,6 +142,16 @@ def __init__(self):
         self.perf_log = []
         self.last_batch = None
         self.batch_norm_sync_mode = BatchNormSyncMode.DISABLED
+        self.use_gpu = torch.cuda.is_available()
+
+    def set_use_gpu(self, use_gpu: bool):
+        self.use_gpu = use_gpu
+
+        assert (
+            not self.use_gpu or torch.cuda.is_available()
+        ), "CUDA required to train on GPUs"
+
+        return self
 
     def set_checkpoint(self, checkpoint):
         """Sets checkpoint on task.
@@ -359,6 +369,10 @@ def from_config(cls, config: Dict[str, Any]) -> "ClassificationTask":
             .set_hooks(hooks)
         )
 
+        use_gpu = config.get("use_gpu")
+        if use_gpu is not None:
+            task.set_use_gpu(use_gpu)
+
         for phase_type in phase_types:
             task.set_dataset(datasets[phase_type], phase_type)
 
@@ -508,24 +522,19 @@ def build_dataloaders(
             for phase_type in self.datasets.keys()
         }
 
-    def prepare(
-        self,
-        num_dataloader_workers=0,
-        pin_memory=False,
-        use_gpu=False,
-        dataloader_mp_context=None,
-    ):
+    def prepare(self, num_dataloader_workers=0, dataloader_mp_context=None):
         """Prepares task for training, populates all derived attributes
 
         Args:
             num_dataloader_workers: Number of dataloading processes. If 0,
                 dataloading is done on main process
-            pin_memory: if true pin memory on GPU
-            use_gpu: if true, load model, optimizer, loss, etc on GPU
             dataloader_mp_context: Determines how processes are spawned.
                 Value must be one of None, "spawn", "fork", "forkserver".
                 If None, then context is inherited from parent process
         """
+
+        pin_memory = self.use_gpu and torch.cuda.device_count() > 1
+
         self.phases = self._build_phases()
         self.dataloaders = self.build_dataloaders(
             num_workers=num_dataloader_workers,
@@ -539,7 +548,7 @@ def prepare(
             self.base_model = apex.parallel.convert_syncbn_model(self.base_model)
 
         # move the model and loss to the right device
-        if use_gpu:
+        if self.use_gpu:
             self.base_model, self.loss = copy_model_to_gpu(self.base_model, self.loss)
         else:
             self.loss.cpu()
@@ -686,7 +695,7 @@ def set_classy_state(self, state):
         # Set up pytorch module in train vs eval mode, update optimizer.
         self._set_model_train_mode()
 
-    def eval_step(self, use_gpu):
+    def eval_step(self):
         self.last_batch = None
 
         # Process next sample
@@ -699,7 +708,7 @@ def eval_step(self, use_gpu):
 
         # Copy sample to GPU
         target = sample["target"]
-        if use_gpu:
+        if self.use_gpu:
             for key, value in sample.items():
                 sample[key] = recursive_copy_to_gpu(value, non_blocking=True)
 
@@ -726,12 +735,8 @@ def check_inf_nan(self, loss):
         if loss == float("inf") or loss == float("-inf") or loss != loss:
             raise FloatingPointError(f"Loss is infinity or NaN: {loss}")
 
-    def train_step(self, use_gpu):
-        """Train step to be executed in train loop
-
-        Args:
-            use_gpu: if true, execute training on GPU
-        """
+    def train_step(self):
+        """Train step to be executed in train loop."""
 
         self.last_batch = None
 
@@ -745,7 +750,7 @@ def train_step(self, use_gpu):
 
         # Copy sample to GPU
         target = sample["target"]
-        if use_gpu:
+        if self.use_gpu:
             for key, value in sample.items():
                 sample[key] = recursive_copy_to_gpu(value, non_blocking=True)
 
diff --git a/classy_vision/tasks/classy_task.py b/classy_vision/tasks/classy_task.py
@@ -86,11 +86,7 @@ def set_classy_state(self, state):
 
     @abstractmethod
     def prepare(
-        self,
-        num_dataloader_workers=0,
-        pin_memory=False,
-        use_gpu=False,
-        dataloader_mp_context=None,
+        self, num_dataloader_workers=0, pin_memory=False, dataloader_mp_context=None
     ) -> None:
         """
         Prepares the task for training.
@@ -102,19 +98,15 @@ def prepare(
             num_dataloader_workers: Number of workers to create for the dataloaders
             pin_memory: Whether the dataloaders should copy the Tensors into CUDA
                 pinned memory (default False)
-            use_gpu: True if training on GPUs, False otherwise
         """
         pass
 
     @abstractmethod
-    def train_step(self, use_gpu) -> None:
+    def train_step(self) -> None:
         """
         Run a train step.
 
         This corresponds to training over one batch of data from the dataloaders.
-
-        Args:
-            use_gpu: True if training on GPUs, False otherwise
         """
         pass
 
@@ -155,24 +147,21 @@ def on_end(self):
         pass
 
     @abstractmethod
-    def eval_step(self, use_gpu) -> None:
+    def eval_step(self) -> None:
         """
         Run an evaluation step.
 
         This corresponds to evaluating the model over one batch of data.
-
-        Args:
-            use_gpu: True if training on GPUs, False otherwise
         """
         pass
 
-    def step(self, use_gpu) -> None:
+    def step(self) -> None:
         from classy_vision.hooks import ClassyHookFunctions
 
         if self.train:
-            self.train_step(use_gpu)
+            self.train_step()
         else:
-            self.eval_step(use_gpu)
+            self.eval_step()
 
         for hook in self.hooks:
             hook.on_step(self)
diff --git a/classy_vision/tasks/fine_tuning_task.py b/classy_vision/tasks/fine_tuning_task.py
@@ -67,18 +67,12 @@ def _set_model_train_mode(self):
             self.base_model.train(phase["train"])
 
     def prepare(
-        self,
-        num_dataloader_workers: int = 0,
-        pin_memory: bool = False,
-        use_gpu: bool = False,
-        dataloader_mp_context=None,
+        self, num_dataloader_workers: int = 0, dataloader_mp_context=None
     ) -> None:
         assert (
             self.pretrained_checkpoint is not None
         ), "Need a pretrained checkpoint for fine tuning"
-        super().prepare(
-            num_dataloader_workers, pin_memory, use_gpu, dataloader_mp_context
-        )
+        super().prepare(num_dataloader_workers, dataloader_mp_context)
         if self.checkpoint is None:
             # no checkpoint exists, load the model's state from the pretrained
             # checkpoint
diff --git a/classy_vision/trainer/classy_trainer.py b/classy_vision/trainer/classy_trainer.py
@@ -27,25 +27,18 @@ class ClassyTrainer:
 
     def __init__(
         self,
-        use_gpu: Optional[bool] = None,
         num_dataloader_workers: int = 0,
         dataloader_mp_context: Optional[str] = None,
     ):
         """Constructor for ClassyTrainer.
 
         Args:
-            use_gpu: If true, then use GPUs for training.
-                If None, then check if we have GPUs available, if we do
-                then use GPU for training.
             num_dataloader_workers: Number of CPU processes doing dataloading
                 per GPU. If 0, then dataloading is done on main thread.
             dataloader_mp_context: Determines how to launch
                 new processes for dataloading. Must be one of "fork", "forkserver",
                 "spawn". If None, process launching is inherited from parent.
         """
-        if use_gpu is None:
-            use_gpu = torch.cuda.is_available()
-        self.use_gpu = use_gpu
         self.num_dataloader_workers = num_dataloader_workers
         self.dataloader_mp_context = dataloader_mp_context
 
@@ -57,11 +50,8 @@ def train(self, task: ClassyTask):
                 everything that is needed for training
         """
 
-        pin_memory = self.use_gpu and torch.cuda.device_count() > 1
         task.prepare(
             num_dataloader_workers=self.num_dataloader_workers,
-            pin_memory=pin_memory,
-            use_gpu=self.use_gpu,
             dataloader_mp_context=self.dataloader_mp_context,
         )
         assert isinstance(task, ClassyTask)
@@ -75,7 +65,7 @@ def train(self, task: ClassyTask):
             task.on_phase_start()
             while True:
                 try:
-                    task.step(self.use_gpu)
+                    task.step()
                 except StopIteration:
                     break
             task.on_phase_end()
diff --git a/classy_vision/trainer/distributed_trainer.py b/classy_vision/trainer/distributed_trainer.py
@@ -56,39 +56,19 @@ class DistributedTrainer(ClassyTrainer):
     """Distributed trainer for using multiple training processes
     """
 
-    def __init__(
-        self,
-        use_gpu: Optional[bool] = None,
-        num_dataloader_workers: int = 0,
-        dataloader_mp_context: Optional[str] = None,
-    ):
-        """Constructor for DistributedTrainer.
-
-        Args:
-            use_gpu: If true, then use GPU 0 for training.
-                If None, then check if we have GPUs available, if we do
-                then use GPU for training.
-            num_dataloader_workers: Number of CPU processes doing dataloading
-                per GPU. If 0, then dataloading is done on main thread.
-            dataloader_mp_context: Determines how to launch
-                new processes for dataloading. Must be one of "fork", "forkserver",
-                "spawn". If None, process launching is inherited from parent.
-        """
-        super().__init__(
-            use_gpu=use_gpu,
-            num_dataloader_workers=num_dataloader_workers,
-            dataloader_mp_context=dataloader_mp_context,
-        )
+    def train(self, task):
         _init_env_vars()
-        _init_distributed(self.use_gpu)
+        _init_distributed(task.use_gpu)
         logging.info(
             f"Done setting up distributed process_group with rank {get_rank()}"
             + f", world_size {get_world_size()}"
         )
         local_rank = int(os.environ["LOCAL_RANK"])
-        if self.use_gpu:
+        if task.use_gpu:
             logging.info("Using GPU, CUDA device index: {}".format(local_rank))
             set_cuda_device_index(local_rank)
         else:
             logging.info("Using CPU")
             set_cpu_device()
+
+        super().train(task)
diff --git a/classy_vision/trainer/local_trainer.py b/classy_vision/trainer/local_trainer.py
@@ -16,32 +16,12 @@ class LocalTrainer(ClassyTrainer):
     """Trainer to be used if you want want use only a single training process.
     """
 
-    def __init__(
-        self,
-        use_gpu: Optional[bool] = None,
-        num_dataloader_workers: int = 0,
-        dataloader_mp_context: Optional[str] = None,
-    ):
-        """Constructor for LocalTrainer.
-
-        Args:
-            use_gpu: If true, then use GPU 0 for training.
-                If None, then check if we have GPUs available, if we do
-                then use GPU for training.
-            num_dataloader_workers: Number of CPU processes doing dataloading
-                per GPU. If 0, then dataloading is done on main thread.
-            dataloader_mp_context: Determines how to launch
-                new processes for dataloading. Must be one of "fork", "forkserver",
-                "spawn". If None, process launching is inherited from parent.
-        """
-        super().__init__(
-            use_gpu=use_gpu,
-            num_dataloader_workers=num_dataloader_workers,
-            dataloader_mp_context=dataloader_mp_context,
-        )
-        if self.use_gpu:
+    def train(self, task):
+        if task.use_gpu:
             logging.info("Using GPU, CUDA device index: {}".format(0))
             set_cuda_device_index(0)
         else:
             logging.info("Using CPU")
             set_cpu_device()
+
+        super().train(task)
diff --git a/test/generic_util_test.py b/test/generic_util_test.py
@@ -437,7 +437,7 @@ def test_update_classy_state(self):
         task = build_task(config)
         task_2 = build_task(config)
         task_2.prepare()
-        trainer = LocalTrainer(use_gpu=False)
+        trainer = LocalTrainer()
         trainer.train(task)
         update_classy_state(task_2, task.get_classy_state(deep_copy=True))
         self._compare_states(task.get_classy_state(), task_2.get_classy_state())
@@ -449,13 +449,12 @@ def test_update_classy_model(self):
         """
         config = get_fast_test_task_config()
         task = build_task(config)
-        use_gpu = torch.cuda.is_available()
-        trainer = LocalTrainer(use_gpu=use_gpu)
+        trainer = LocalTrainer()
         trainer.train(task)
         for reset_heads in [False, True]:
             task_2 = build_task(config)
             # prepare task_2 for the right device
-            task_2.prepare(use_gpu=use_gpu)
+            task_2.prepare()
             update_classy_model(
                 task_2.model, task.model.get_classy_state(deep_copy=True), reset_heads
             )
diff --git a/test/hooks_checkpoint_hook_test.py b/test/hooks_checkpoint_hook_test.py
diff --git a/test/manual/tasks_classification_task_amp_test.py b/test/manual/tasks_classification_task_amp_test.py
diff --git a/test/tasks_classification_task_test.py b/test/tasks_classification_task_test.py