Commit 396047f

Updated distributed Demos (#215)
* added simple cluster template
* sets correct backend for possible combinations of gpu inputs
* simple slurm example
1 parent 83b756f commit 396047f

File tree

9 files changed: +228 −18 lines changed

MANIFEST.in

Lines changed: 6 additions & 1 deletion
@@ -13,11 +13,16 @@ include LICENSE
 exclude *.sh
 exclude *.toml
 exclude *.svg
-recursive-include examples *.py
 recursive-include pytorch_lightning *.py

+# include examples
+recursive-include examples *.py
+recursive-include examples *.md
+recursive-include examples *.sh
+
 # exclude tests from package
 recursive-exclude tests *
+recursive-exclude site *
 exclude tests

 # Exclude the documentation files
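
The new rules pull the example `.py`, `.md` and `.sh` files into the source distribution while keeping `tests/` and `site/` out. One way to verify the effect is to build an sdist and list its contents; a minimal sketch, assuming the sdist has been built into `dist/` (the tarball name pattern is illustrative, not guaranteed):

```python
import glob
import tarfile

# Pick the most recently built sdist (name pattern is an assumption).
sdist = sorted(glob.glob('dist/pytorch-lightning-*.tar.gz'))[-1]

with tarfile.open(sdist, 'r:gz') as tar:
    members = tar.getnames()

example_files = [m for m in members if '/examples/' in m]
excluded = [m for m in members if '/tests/' in m or '/site/' in m]

print(f'{len(example_files)} example files packaged')
assert not excluded, 'tests/ and site/ should not ship in the sdist'
```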

examples/new_project_templates/lightning_module_template.py

Lines changed: 4 additions & 4 deletions
@@ -240,15 +240,15 @@ def add_model_specific_args(parent_parser, root_dir):  # pragma: no cover
         parser.add_argument('--out_features', default=10, type=int)
         # use 500 for CPU, 50000 for GPU to see speed difference
         parser.add_argument('--hidden_dim', default=50000, type=int)
-        parser.opt_list('--drop_prob', default=0.2, options=[0.2, 0.5], type=float, tunable=False)
+        parser.opt_list('--drop_prob', default=0.2, options=[0.2, 0.5], type=float, tunable=True)
+        parser.opt_list('--learning_rate', default=0.001 * 8, type=float,
+                        options=[0.0001, 0.0005, 0.001],
+                        tunable=True)

         # data
         parser.add_argument('--data_root', default=os.path.join(root_dir, 'mnist'), type=str)

         # training params (opt)
-        parser.opt_list('--learning_rate', default=0.001 * 8, type=float,
-                        options=[0.0001, 0.0005, 0.001, 0.005],
-                        tunable=False)
         parser.opt_list('--optimizer_name', default='adam', type=str,
                         options=['adam'], tunable=False)
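
Making both `--drop_prob` and `--learning_rate` tunable defines a 2 × 3 grid, i.e. 6 combinations, which lines up with the 6 jobs the multi-node README below says get submitted (and with the new `--num_hyperparam_trials` default of 6). A standalone sketch of that grid using plain `itertools` rather than test-tube's own trial generation:

```python
from itertools import product

# The options declared above with parser.opt_list(..., tunable=True)
drop_probs = [0.2, 0.5]
learning_rates = [0.0001, 0.0005, 0.001]

trials = [{'drop_prob': dp, 'learning_rate': lr}
          for dp, lr in product(drop_probs, learning_rates)]

print(len(trials))  # 6 -> one cluster job per combination
```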

Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
+# Multi-node examples
+Use these templates for multi-node training.
+
+## Simplest example
+1. Modify this script with your CoolModel file.
+2. Update and submit [this bash script]()
+```bash
+sbatch minimal_multi_node_demo_script.sh
+```
+
+## Grid search on a cluster
+
+#### Option 1: Run on cluster using your own SLURM script
+The trainer and model will work on a cluster if you configure your SLURM script correctly.
+
+1. Update [this demo SLURM script]().
+2. Submit the script:
+```bash
+$ sbatch demo_script.sh
+```
+
+Most people have some way to automatically generate their own scripts.
+To run a grid search this way, you'd need to automatically generate scripts covering every combination of
+hyperparameters to search over (see the sketch after this diff).
+
+#### Option 2: Use test-tube to generate the SLURM scripts
+With test-tube we can automatically generate SLURM scripts for the different hyperparameter options.
+
+To run this demo:
+```bash
+source activate YourCondaEnv
+
+python multi_node_cluster_auto_slurm.py --email [email protected] --gpu_partition your_partition --conda_env YourCondaEnv
+```
+
+That will submit 6 jobs. Each job will have a specific combination of hyperparams, and each job will run on 2 nodes
+where each node has 8 GPUs.
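
For Option 1, a rough sketch of how those per-combination scripts could be generated; the `run.py` entry point, the output directory, and the SLURM resources shown are placeholders, not part of this commit:

```python
from itertools import product
from pathlib import Path

# Hypothetical grid matching the tunable options in the module template.
grid = list(product([0.2, 0.5], [0.0001, 0.0005, 0.001]))

template = """#!/bin/bash -l
#SBATCH --nodes=2
#SBATCH --gres=gpu:8
#SBATCH --ntasks-per-node=8
#SBATCH --mem=0
#SBATCH --time=02:00:00

srun python run.py --drop_prob {dp} --learning_rate {lr}
"""

out_dir = Path('generated_scripts')
out_dir.mkdir(exist_ok=True)
for i, (dp, lr) in enumerate(grid):
    script = out_dir / f'trial_{i}.sh'
    script.write_text(template.format(dp=dp, lr=lr))
    print(f'sbatch {script}')  # submit each generated script
```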

examples/new_project_templates/multi_node_examples/__init__.py

Whitespace-only changes.
Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
+#!/bin/bash
+#
+# Auto-generated by test-tube (https://github.com/williamFalcon/test-tube)
+#################
+
+# set a job name
+#SBATCH --job-name=lightning_test
+#################
+
+# a file for job output, you can check job progress
+#SBATCH --output=/slurm_output_%j.out
+#################
+
+# a file for errors
+#SBATCH --error=/slurm_output_%j.err
+#################
+
+# time needed for job
+#SBATCH --time=01:00:00
+#################
+
+# gpus per node
+#SBATCH --gres=gpu:8
+#################
+
+# cpus per job
+#SBATCH --cpus-per-task=10
+#################
+
+# number of requested nodes
+#SBATCH --nodes=2
+#################
+
+# memory per node (0 means all)
+#SBATCH --mem=0
+#################
+
+# slurm will send a signal this far out before it kills the job
+#SBATCH --signal=USR1@300
+#################
+
+# comment
+#SBATCH --comment=lightning_demo
+#################
+
+# 1 task per gpu
+#SBATCH --ntasks-per-node=8
+#################
+
+source activate YourEnv
+
+# debugging flags (optional)
+export NCCL_DEBUG=INFO
+export PYTHONFAULTHANDLER=1
+
+# random port between 12k and 20k
+export MASTER_PORT=$((12000 + RANDOM % 20000))
+
+srun python multi_node_own_slurm_script.py
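
This script starts 16 processes (2 nodes × 8 tasks), one per GPU, and exports `MASTER_PORT` for the distributed process group. Purely for orientation, a sketch of how each task's SLURM environment maps onto a global rank; Lightning derives this internally, so this is not its implementation:

```python
import os

# Per-task environment variables set by SLURM for this submission.
node_id = int(os.environ['SLURM_NODEID'])          # 0..1 with --nodes=2
local_rank = int(os.environ['SLURM_LOCALID'])      # 0..7 with --ntasks-per-node=8
tasks_per_node = int(os.environ['SLURM_NTASKS_PER_NODE'])
world_size = int(os.environ['SLURM_NTASKS'])       # 16 processes in total

global_rank = node_id * tasks_per_node + local_rank
master_port = os.environ['MASTER_PORT']            # exported by the script above

print(f'rank {global_rank}/{world_size} (node {node_id}, local {local_rank}), port {master_port}')
```
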
Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+from pytorch_lightning import Trainer
+from test_tube import Experiment
+import os
+
+
+def main():
+    # use the cool model from the main README.md
+    model = CoolModel()  # noqa: F821
+    exp = Experiment(save_dir=os.getcwd())
+
+    # train on 4 GPUs across 4 nodes
+    trainer = Trainer(
+        experiment=exp,
+        distributed_backend='ddp',
+        max_nb_epochs=10,
+        gpus=4,
+        nb_gpu_nodes=4
+    )
+
+    trainer.fit(model)
+
+
+if __name__ == '__main__':
+    main()
Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+#!/bin/bash -l
+
+# SLURM SUBMIT SCRIPT
+#SBATCH --nodes=4
+#SBATCH --gres=gpu:4
+#SBATCH --ntasks-per-node=4
+#SBATCH --mem=0
+#SBATCH --time=0-02:00:00
+
+# activate conda env
+conda activate my_env
+
+# run script from above
+python minimal_multi_node_demo.py
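
The resource requests here have to line up with the Trainer arguments in `minimal_multi_node_demo.py`: 4 nodes × 4 GPUs, one task per GPU. A tiny sanity-check sketch (plain variables, not Lightning API):

```python
# From minimal_multi_node_demo_script.sh
slurm_nodes = 4            # --nodes=4
slurm_gpus_per_node = 4    # --gres=gpu:4
slurm_tasks_per_node = 4   # --ntasks-per-node=4

# From the Trainer in minimal_multi_node_demo.py
trainer_gpus = 4
trainer_nodes = 4

assert slurm_nodes == trainer_nodes
assert slurm_gpus_per_node == trainer_gpus == slurm_tasks_per_node
print('world size:', trainer_nodes * trainer_gpus)  # 16 DDP processes
```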

examples/new_project_templates/multi_node_cluster_template.py renamed to examples/new_project_templates/multi_node_examples/multi_node_cluster_auto_slurm.py

Lines changed: 14 additions & 13 deletions
@@ -75,12 +75,12 @@ def main(hparams, cluster):
     # ------------------------
     # 4 INIT TRAINER
     # ------------------------
+    gpus = list(range(0, hparams.per_experiment_nb_gpus))
     trainer = Trainer(
         experiment=exp,
-        cluster=cluster,
         checkpoint_callback=checkpoint,
         early_stop_callback=early_stop,
-        gpus=hparams.gpus,
+        gpus=gpus,
         nb_gpu_nodes=hyperparams.nb_gpu_nodes
     )

@@ -99,7 +99,7 @@ def optimize_on_cluster(hyperparams):
     )

     # email for cluster coms
-    cluster.notify_job_status(email='add_email_here', on_done=True, on_fail=True)
+    cluster.notify_job_status(email=hyperparams.email, on_done=True, on_fail=True)

     # configure cluster
     cluster.per_experiment_nb_gpus = hyperparams.per_experiment_nb_gpus

@@ -109,7 +109,7 @@ def optimize_on_cluster(hyperparams):
     cluster.memory_mb_per_node = 0

     # any modules for code to run in env
-    cluster.add_command('source activate lightning')
+    cluster.add_command(f'source activate {hyperparams.conda_env}')

     # run only on 32GB voltas
     cluster.add_slurm_cmd(cmd='constraint', value='volta32gb',

@@ -121,7 +121,7 @@ def optimize_on_cluster(hyperparams):
     # creates and submits jobs to slurm
     cluster.optimize_parallel_cluster_gpu(
         main,
-        nb_trials=hyperparams.nb_hopt_trials,
+        nb_trials=hyperparams.num_hyperparam_trials,
         job_name=hyperparams.experiment_name
     )

@@ -139,15 +139,10 @@ def optimize_on_cluster(hyperparams):
     parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False)

     # cluster args not defined inside the model
-    parent_parser.add_argument('--gpu_partition', type=str, help='consult your cluster manual')

-    # TODO: make 1 param
     parent_parser.add_argument('--per_experiment_nb_gpus', type=int,
-                               default=2, help='how many gpus to use in a node')
-    parent_parser.add_argument('--gpus', type=str, default='-1',
-                               help='how many gpus to use in the node')
-
-    parent_parser.add_argument('--nb_gpu_nodes', type=int, default=1,
+                               default=8, help='how many gpus to use in a node')
+    parent_parser.add_argument('--nb_gpu_nodes', type=int, default=2,
                                help='how many nodes to use in a cluster')
     parent_parser.add_argument('--test_tube_save_path', type=str, default=test_tube_dir,
                                help='where to save logs')

@@ -157,9 +152,15 @@ def optimize_on_cluster(hyperparams):
                                help='where to save model')
     parent_parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a',
                                help='test tube exp name')
-    parent_parser.add_argument('--nb_hopt_trials', type=int, default=1,
+    parent_parser.add_argument('--num_hyperparam_trials', type=int, default=6,
                                help='how many grid search trials to run')

+    parent_parser.add_argument('--email', type=str, default='[email protected]',
+                               help='email for jobs')
+    parent_parser.add_argument('--conda_env', type=str, default='base',
+                               help='conda environment to activate for jobs')
+    parent_parser.add_argument('--gpu_partition', type=str, help='consult your cluster manual')
+
     # allow model to overwrite or extend args
     parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)
     hyperparams = parser.parse_args()
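
The hunks above are fragments of a test-tube `SlurmCluster` workflow. A condensed sketch of how they fit together end to end; the import path and the `SlurmCluster` constructor arguments are assumptions based on test-tube's documentation (verify against your version), and `main` here is a stub for the real training routine in this file:

```python
from test_tube import HyperOptArgumentParser
from test_tube.hpc import SlurmCluster  # import path assumed; check your test-tube version


def main(hparams, cluster):
    # Stand-in for the real main(hparams, cluster) training routine in this file.
    print('would train with', hparams)


if __name__ == '__main__':
    parser = HyperOptArgumentParser(strategy='grid_search')
    parser.add_argument('--email', type=str, default='[email protected]')  # placeholder address
    parser.add_argument('--conda_env', type=str, default='base')
    parser.add_argument('--experiment_name', type=str, default='pt_lightning_exp_a')
    parser.add_argument('--num_hyperparam_trials', type=int, default=6)
    parser.add_argument('--per_experiment_nb_gpus', type=int, default=8)
    hyperparams = parser.parse_args()

    # Constructor kwargs are an assumption based on test-tube examples.
    cluster = SlurmCluster(hyperparam_optimizer=hyperparams,
                           log_path='slurm_logs',
                           python_cmd='python')

    # The calls below all appear in the diff above.
    cluster.notify_job_status(email=hyperparams.email, on_done=True, on_fail=True)
    cluster.per_experiment_nb_gpus = hyperparams.per_experiment_nb_gpus
    cluster.add_command(f'source activate {hyperparams.conda_env}')

    # Creates and submits one SLURM job per hyperparameter combination.
    cluster.optimize_parallel_cluster_gpu(main,
                                          nb_trials=hyperparams.num_hyperparam_trials,
                                          job_name=hyperparams.experiment_name)
```
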
Lines changed: 70 additions & 0 deletions
@@ -0,0 +1,70 @@
+"""
+Multi-node example (GPU)
+"""
+import os
+import numpy as np
+import torch
+
+from test_tube import HyperOptArgumentParser, Experiment
+from pytorch_lightning import Trainer
+from examples.new_project_templates.lightning_module_template import LightningTemplateModel
+
+SEED = 2334
+torch.manual_seed(SEED)
+np.random.seed(SEED)
+
+
+def main(hparams):
+    """
+    Main training routine specific for this project
+    :param hparams:
+    :return:
+    """
+    # ------------------------
+    # 1 INIT LIGHTNING MODEL
+    # ------------------------
+    model = LightningTemplateModel(hparams)
+
+    # ------------------------
+    # 2 INIT TEST TUBE EXP
+    # ------------------------
+    # init experiment
+    exp = Experiment(
+        name='test_exp',
+        save_dir=hparams.log_dir,
+        autosave=False,
+        description='test demo'
+    )
+
+    # ------------------------
+    # 3 INIT TRAINER
+    # ------------------------
+    trainer = Trainer(
+        experiment=exp,
+        gpus=[0, 1, 2, 3, 4, 5, 6, 7],
+        nb_gpu_nodes=2
+    )
+
+    # ------------------------
+    # 4 START TRAINING
+    # ------------------------
+    trainer.fit(model)
+
+
+if __name__ == '__main__':
+    # use current dir for logging
+    root_dir = os.path.dirname(os.path.realpath(__file__))
+    log_dir = os.path.join(root_dir, 'pt_lightning_demo_logs')
+
+    parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False)
+    parent_parser.add_argument('--log_dir', type=str, default=log_dir,
+                               help='where to save logs')
+
+    # allow model to overwrite or extend args
+    parser = LightningTemplateModel.add_model_specific_args(parent_parser, root_dir)
+    hyperparams = parser.parse_args()
+
+    # ---------------------
+    # RUN TRAINING
+    # ---------------------
+    main(hyperparams)
