pytorch · teng-li · Feb 22, 2018 · Mar 12, 2018
diff --git a/imagenet/main.py b/imagenet/main.py
@@ -55,6 +55,9 @@
                     help='url used to set up distributed training')
 parser.add_argument('--dist-backend', default='gloo', type=str,
                     help='distributed backend')
+parser.add_argument('--local_rank', default=-1, type=int,
+                    help='If specified, the training will only run on the '
+                         'single given GPU device of the local_rank')
 
 best_prec1 = 0
 
@@ -63,12 +66,24 @@ def main():
     global args, best_prec1
     args = parser.parse_args()
 
+    if args.dist_url == "env://" and os.environ.get("WORLD_SIZE") is not None:
+        args.world_size = int(os.environ.get("WORLD_SIZE"))
+
     args.distributed = args.world_size > 1
 
     if args.distributed:
         dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                 world_size=args.world_size)
 
+    if args.local_rank > -1 and args.local_rank < torch.cuda.device_count():
+        print("=> using single device: {} for training".format(args.local_rank))
+        dp_device_ids = [args.local_rank]
+    else:
+        dp_device_ids = None
+
+    if dp_device_ids:
+        torch.cuda.set_device(args.local_rank)
+
     # create model
     if args.pretrained:
         print("=> using pre-trained model '{}'".format(args.arch))
@@ -79,13 +94,14 @@ def main():
 
     if not args.distributed:
         if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
-            model.features = torch.nn.DataParallel(model.features)
+            model.features = torch.nn.DataParallel(model.features,
+                                                   device_ids=dp_device_ids)
             model.cuda()
         else:
-            model = torch.nn.DataParallel(model).cuda()
+            model = torch.nn.DataParallel(model, device_ids=dp_device_ids).cuda()
     else:
         model.cuda()
-        model = torch.nn.parallel.DistributedDataParallel(model)
+        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=dp_device_ids)
 
     # define loss function (criterion) and optimizer
     criterion = nn.CrossEntropyLoss().cuda()