diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh
index 053a9be5e05487..fae1e867ae51e0 100755
--- a/.jenkins/caffe2/test.sh
+++ b/.jenkins/caffe2/test.sh
@@ -109,6 +109,10 @@ if [[ $BUILD_ENVIRONMENT == *-rocm* ]]; then
   # Our cuda top_k op has some asm code, the hipified version doesn't
   # compile yet, so we don't have top_k operator for now
   rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/top_k_test.py")
+
+  # this is a multi-gpu test
+  rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/data_parallel_model_test.py")
+
 fi
 
 # Python tests
diff --git a/caffe2/python/cnn.py b/caffe2/python/cnn.py
index 05eaa1b0495e00..f927020e6ae88f 100644
--- a/caffe2/python/cnn.py
+++ b/caffe2/python/cnn.py
@@ -5,7 +5,7 @@
 from __future__ import print_function
 from __future__ import unicode_literals
 
-from caffe2.python import brew, workspace
+from caffe2.python import brew
 from caffe2.python.model_helper import ModelHelper
 from caffe2.proto import caffe2_pb2
 import logging
@@ -17,7 +17,7 @@ class CNNModelHelper(ModelHelper):
     """
 
     def __init__(self, order="NCHW", name=None,
-                 use_gpu_engine=True, gpu_engine_exhaustive_search=False,
+                 use_cudnn=True, cudnn_exhaustive_search=False,
                  ws_nbytes_limit=None, init_params=True,
                  skip_sparse_optim=False,
                  param_model=None):
@@ -31,8 +31,8 @@ def __init__(self, order="NCHW", name=None,
 
         cnn_arg_scope = {
             'order': order,
-            'use_gpu_engine': use_gpu_engine,
-            'gpu_engine_exhaustive_search': gpu_engine_exhaustive_search,
+            'use_cudnn': use_cudnn,
+            'cudnn_exhaustive_search': cudnn_exhaustive_search,
         }
         if ws_nbytes_limit:
             cnn_arg_scope['ws_nbytes_limit'] = ws_nbytes_limit
@@ -45,8 +45,8 @@ def __init__(self, order="NCHW", name=None,
         )
 
         self.order = order
-        self.use_gpu_engine = use_gpu_engine
-        self.gpu_engine_exhaustive_search = gpu_engine_exhaustive_search
+        self.use_cudnn = use_cudnn
+        self.cudnn_exhaustive_search = cudnn_exhaustive_search
         self.ws_nbytes_limit = ws_nbytes_limit
         if self.order != "NHWC" and self.order != "NCHW":
             raise ValueError(
@@ -79,9 +79,9 @@ def ConvNd(self, *args, **kwargs):
         return brew.conv_nd(
             self,
             *args,
-            use_gpu_engine=self.use_gpu_engine,
+            use_cudnn=self.use_cudnn,
             order=self.order,
-            gpu_engine_exhaustive_search=self.gpu_engine_exhaustive_search,
+            cudnn_exhaustive_search=self.cudnn_exhaustive_search,
             ws_nbytes_limit=self.ws_nbytes_limit,
             **kwargs
         )
@@ -90,9 +90,9 @@ def Conv(self, *args, **kwargs):
         return brew.conv(
             self,
             *args,
-            use_gpu_engine=self.use_gpu_engine,
+            use_cudnn=self.use_cudnn,
             order=self.order,
-            gpu_engine_exhaustive_search=self.gpu_engine_exhaustive_search,
+            cudnn_exhaustive_search=self.cudnn_exhaustive_search,
             ws_nbytes_limit=self.ws_nbytes_limit,
             **kwargs
         )
@@ -101,9 +101,9 @@ def ConvTranspose(self, *args, **kwargs):
         return brew.conv_transpose(
             self,
             *args,
-            use_gpu_engine=self.use_gpu_engine,
+            use_cudnn=self.use_cudnn,
             order=self.order,
-            gpu_engine_exhaustive_search=self.gpu_engine_exhaustive_search,
+            cudnn_exhaustive_search=self.cudnn_exhaustive_search,
             ws_nbytes_limit=self.ws_nbytes_limit,
             **kwargs
         )
@@ -112,9 +112,9 @@ def GroupConv(self, *args, **kwargs):
         return brew.group_conv(
             self,
             *args,
-            use_gpu_engine=self.use_gpu_engine,
+            use_cudnn=self.use_cudnn,
             order=self.order,
-            gpu_engine_exhaustive_search=self.gpu_engine_exhaustive_search,
+            cudnn_exhaustive_search=self.cudnn_exhaustive_search,
             ws_nbytes_limit=self.ws_nbytes_limit,
             **kwargs
         )
@@ -123,9 +123,9 @@ def GroupConv_Deprecated(self, *args, **kwargs):
         return brew.group_conv_deprecated(
             self,
             *args,
-            use_gpu_engine=self.use_gpu_engine,
+            use_cudnn=self.use_cudnn,
             order=self.order,
-            gpu_engine_exhaustive_search=self.gpu_engine_exhaustive_search,
+            cudnn_exhaustive_search=self.cudnn_exhaustive_search,
             ws_nbytes_limit=self.ws_nbytes_limit,
             **kwargs
         )
@@ -147,16 +147,16 @@ def FC_Sparse(self, *args, **kwargs):
 
     def Dropout(self, *args, **kwargs):
         return brew.dropout(
-            self, *args, order=self.order, use_gpu_engine=self.use_gpu_engine, **kwargs
+            self, *args, order=self.order, use_cudnn=self.use_cudnn, **kwargs
         )
 
     def LRN(self, *args, **kwargs):
         return brew.lrn(
-            self, *args, order=self.order, use_gpu_engine=self.use_gpu_engine, **kwargs
+            self, *args, order=self.order, use_cudnn=self.use_cudnn, **kwargs
         )
 
     def Softmax(self, *args, **kwargs):
-        return brew.softmax(self, *args, use_gpu_engine=self.use_gpu_engine, **kwargs)
+        return brew.softmax(self, *args, use_cudnn=self.use_cudnn, **kwargs)
 
     def SpatialBN(self, *args, **kwargs):
         return brew.spatial_bn(self, *args, order=self.order, **kwargs)
@@ -169,7 +169,7 @@ def InstanceNorm(self, *args, **kwargs):
 
     def Relu(self, *args, **kwargs):
         return brew.relu(
-            self, *args, order=self.order, use_gpu_engine=self.use_gpu_engine, **kwargs
+            self, *args, order=self.order, use_cudnn=self.use_cudnn, **kwargs
         )
 
     def PRelu(self, *args, **kwargs):
@@ -187,7 +187,7 @@ def Sum(self, *args, **kwargs):
         return brew.sum(self, *args, **kwargs)
 
     def Transpose(self, *args, **kwargs):
-        return brew.transpose(self, *args, use_gpu_engine=self.use_gpu_engine, **kwargs)
+        return brew.transpose(self, *args, use_cudnn=self.use_cudnn, **kwargs)
 
     def Iter(self, *args, **kwargs):
         return brew.iter(self, *args, **kwargs)
@@ -197,7 +197,7 @@ def Accuracy(self, *args, **kwargs):
 
     def MaxPool(self, *args, **kwargs):
         return brew.max_pool(
-            self, *args, use_gpu_engine=self.use_gpu_engine, order=self.order, **kwargs
+            self, *args, use_cudnn=self.use_cudnn, order=self.order, **kwargs
         )
 
     def MaxPoolWithIndex(self, *args, **kwargs):
@@ -205,7 +205,7 @@ def MaxPoolWithIndex(self, *args, **kwargs):
 
     def AveragePool(self, *args, **kwargs):
         return brew.average_pool(
-            self, *args, use_gpu_engine=self.use_gpu_engine, order=self.order, **kwargs
+            self, *args, use_cudnn=self.use_cudnn, order=self.order, **kwargs
         )
 
     @property
@@ -235,11 +235,6 @@ def CPU(self):
     @property
     def GPU(self, gpu_id=0):
         device_option = caffe2_pb2.DeviceOption()
-        if workspace.has_hip_support:
-            device_option.device_type = caffe2_pb2.HIP
-            device_option.hip_gpu_id = gpu_id
-        else:
-            device_option.device_type = caffe2_pb2.CUDA
-            device_option.cuda_gpu_id = gpu_id
-
+        device_option.device_type = caffe2_pb2.CUDA
+        device_option.cuda_gpu_id = gpu_id
         return device_option
diff --git a/caffe2/python/core.py b/caffe2/python/core.py
index 5eb16fe844f8a0..b566e01da23e4e 100644
--- a/caffe2/python/core.py
+++ b/caffe2/python/core.py
@@ -2021,16 +2021,16 @@ def DeduplicateGradientSlices(self, g, aggregator='sum'):
             raise ValueError('{} is not supported'.format(aggregator))
         return GradientSlice(indices=unique, values=new_g)
 
-    def RunAllOnGPU(self, gpu_id=0, use_gpu_engine=False):
+    def RunAllOnGPU(self, gpu_id=0, use_cudnn=False):
         """A convenient function to run everything on the GPU."""
         device_option = caffe2_pb2.DeviceOption()
         device_option.device_type = caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA
         device_option.cuda_gpu_id = gpu_id
         device_option.hip_gpu_id = gpu_id
         self._net.device_option.CopyFrom(device_option)
-        if use_gpu_engine:
+        if use_cudnn:
             for op in self._net.op:
-                op.engine = "MIOPEN" if workspace.has_hip_support else 'CUDNN'
+                op.engine = 'CUDNN'
 
     def RunAllOnMKL(self):
         """A convenient function to run everything using MKLDNN."""
diff --git a/caffe2/python/core_gradients_test.py b/caffe2/python/core_gradients_test.py
index bf25806f20dde2..88b07eab347323 100644
--- a/caffe2/python/core_gradients_test.py
+++ b/caffe2/python/core_gradients_test.py
@@ -9,7 +9,7 @@
 import unittest
 
 from caffe2.proto import caffe2_pb2
-from caffe2.python import core, test_util
+from caffe2.python import core, test_util, workspace
 from caffe2.python.core import CreateOperator, GradientRegistry
 from caffe2.python import workspace
 
@@ -94,7 +94,7 @@ def assertOperatorListEqual(self, operatorDefList1, operatorDefList2):
 
     @given(device_option=st.sampled_from([
         None,
-        core.DeviceOption(caffe2_pb2.CUDA, 1)]))
+        core.DeviceOption(caffe2_pb2.HIP, hip_gpu_id=1) if workspace.has_hip_support else core.DeviceOption(caffe2_pb2.CUDA, cuda_gpu_id=1)]))
     def testDirect(self, device_option):
         operators = [
             CreateOperator('Direct', 'in', 'hidden'),
@@ -279,7 +279,7 @@ def testUseInputButInputHasBeenChanged(self):
 
     @given(device_option=st.sampled_from([
         None,
-        core.DeviceOption(caffe2_pb2.CUDA, 1)]))
+        core.DeviceOption(caffe2_pb2.HIP, hip_gpu_id=1) if workspace.has_hip_support else core.DeviceOption(caffe2_pb2.CUDA, cuda_gpu_id=1)]))
     def testMultiUseInput(self, device_option):
         """Test gradient for the following case:
diff --git a/caffe2/python/data_parallel_model.py b/caffe2/python/data_parallel_model.py
index 60e5c39bed1318..f60d9fe5c97336 100644
--- a/caffe2/python/data_parallel_model.py
+++ b/caffe2/python/data_parallel_model.py
@@ -127,15 +127,15 @@ def Parallelize(
         device scope was: {}".format(scope.CurrentDeviceScope())
 
     if devices is None:
-        devices = list(range(0, workspace.NumCudaDevices())),
+        devices = list(range(0, workspace.NumGpuDevices())),
 
     if not cpu_device:
         for gpu in devices:
-            if gpu >= workspace.NumCudaDevices():
+            if gpu >= workspace.NumGpuDevices():
                 log.warning("** Only {} GPUs available, GPUs {} requested".format(
-                    workspace.NumCudaDevices(), devices))
+                    workspace.NumGpuDevices(), devices))
                 break
-        model_helper_obj._device_type = caffe2_pb2.CUDA
+        model_helper_obj._device_type = caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA
         model_helper_obj._device_prefix = "gpu"
         model_helper_obj._shared_model = False
         device_name = "GPU"
@@ -165,7 +165,6 @@ def Parallelize(
     model_helper_obj._grad_names = []
 
     assert isinstance(model_helper_obj, model_helper.ModelHelper)
-
     # Keep track of params that were in the model before: they are not
     # data parallel, so we need to handle them separately
     non_datapar_params = copy.copy(model_helper_obj.params)
@@ -200,7 +199,10 @@ def Parallelize(
 
     # TODO: make into assert
    for device in devices:
-        device_opt = core.DeviceOption(model_helper_obj._device_type, device)
+        if workspace.has_hip_support:
+            device_opt = core.DeviceOption(model_helper_obj._device_type, hip_gpu_id=device)
+        else:
+            device_opt = core.DeviceOption(model_helper_obj._device_type, cuda_gpu_id=device)
         with core.DeviceScope(device_opt):
             with core.NameScope("{}_{}".format(model_helper_obj._device_prefix,
                                                device)):
@@ -307,7 +309,10 @@ def Parallelize(
     if param_update_builder_fun is not None:
         for device in devices:
-            device_opt = core.DeviceOption(model_helper_obj._device_type, device)
+            if workspace.has_hip_support:
+                device_opt = core.DeviceOption(model_helper_obj._device_type, hip_gpu_id=device)
+            else:
+                device_opt = core.DeviceOption(model_helper_obj._device_type, cuda_gpu_id=device)
             with
core.DeviceScope(device_opt): with core.NameScope( "{}_{}".format(model_helper_obj._device_prefix, device) @@ -352,7 +357,10 @@ def Parallelize( # i.e. making sure multi-precision copies of parameters are up-to-date if post_sync_builder_fun is not None: for device in devices: - device_opt = core.DeviceOption(model_helper_obj._device_type, device) + if workspace.has_hip_support: + device_opt = core.DeviceOption(model_helper_obj._device_type, hip_gpu_id=device) + else: + device_opt = core.DeviceOption(model_helper_obj._device_type, cuda_gpu_id=device) with core.DeviceScope(device_opt): with core.NameScope( "{}_{}".format(model_helper_obj._device_prefix, device) @@ -432,17 +440,17 @@ def Parallelize_BMUF( assert isinstance(model_helper_obj, model_helper.ModelHelper) if devices is None: - devices = list(range(0, workspace.NumCudaDevices())) + devices = list(range(0, workspace.NumGpuDevices())) if master_device is None: master_device = devices[0] if not cpu_device: for gpu in devices: - if gpu >= workspace.NumCudaDevices(): + if gpu >= workspace.NumGpuDevices(): log.warning("** Only {} GPUs available, GPUs {} requested".format( - workspace.NumCudaDevices(), devices)) + workspace.NumGpuDevices(), devices)) break - model_helper_obj._device_type = caffe2_pb2.CUDA + model_helper_obj._device_type = caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA model_helper_obj._device_prefix = "gpu" else: model_helper_obj._device_type = caffe2_pb2.CPU @@ -453,7 +461,10 @@ def Parallelize_BMUF( model_helper_obj._sync_barrier_net = None model_helper_obj._broadcast_context = None model_helper_obj._shared_model = False - master_dev_opt = core.DeviceOption(model_helper_obj._device_type, master_device) + if workspace.has_hip_support: + master_dev_opt = core.DeviceOption(model_helper_obj._device_type, hip_gpu_id=master_device) + else: + master_dev_opt = core.DeviceOption(model_helper_obj._device_type, cuda_gpu_id=master_device) # question: rendezvous structure num_shards = rendezvous['num_shards'] if rendezvous else 1 @@ -797,9 +808,9 @@ def builder_fun(model): if device is None: device = scope.CurrentDeviceScope() - device_prefix = "gpu" if device.device_type == caffe2_pb2.CUDA else "cpu" + device_prefix = "gpu" if (device.device_type == caffe2_pb2.CUDA or device.device_type == caffe2_pb2.HIP) else "cpu" - namescope = "{}_{}/".format(device_prefix, device.cuda_gpu_id) + namescope = "{}_{}/".format(device_prefix, device.hip_gpu_id if workspace.has_hip_support else device.cuda_gpu_id) for op in mnet.Proto().op: if "RecurrentNetwork" in op.type: raise("RecurrentNetwork conversion not yet supported") @@ -820,7 +831,10 @@ def builder_fun(model): def _ForEachDevice(devices, f, device_type, device_prefix, scoped=False, *args, **kwargs): for device in devices: - device_opt = core.DeviceOption(device_type, device) + if workspace.has_hip_support: + device_opt = core.DeviceOption(device_type, hip_gpu_id=device) + else: + device_opt = core.DeviceOption(device_type, cuda_gpu_id=device) with core.DeviceScope(device_opt): if scoped: with core.NameScope("{}_{}".format(device_prefix, device)): @@ -836,7 +850,10 @@ def create_grad(lossp): loss_grad = {} # Explicitly need to create gradients on each GPU for gpu_id in devices: - device = core.DeviceOption(model._device_type, gpu_id) + if workspace.has_hip_support: + device = core.DeviceOption(model._device_type, hip_gpu_id=gpu_id) + else: + device = core.DeviceOption(model._device_type, cuda_gpu_id=gpu_id) with core.DeviceScope(device): for l in losses_by_gpu[gpu_id]: lg = 
create_grad(l) @@ -954,7 +971,7 @@ def GetLearningRateBlobNames(model): if model._optimizer is not None: if model._device_type == caffe2_pb2.CPU: return [model._optimizer.get_cpu_blob_name('lr')] - elif model._device_type == caffe2_pb2.CUDA: + elif model._device_type == caffe2_pb2.CUDA or model._device_type == caffe2_pb2.HIP: return [model._optimizer.get_gpu_blob_name('lr', gpu, '') for gpu in model._devices] else: @@ -975,7 +992,10 @@ def _Broadcast(devices, model, net, param, use_nccl=False): if use_nccl: if _IsGPUBlob(model, param): - master_device_opt = core.DeviceOption(model._device_type, master_dev) + if workspace.has_hip_support: + master_device_opt = core.DeviceOption(model._device_type, hip_gpu_id=master_dev) + else: + master_device_opt = core.DeviceOption(model._device_type, cuda_gpu_id=master_dev) with core.DeviceScope(master_device_opt): # Note that the root is the root _rank_ and not the root # _device_. Thus we always use root=0, regardless of the @@ -986,10 +1006,12 @@ def _Broadcast(devices, model, net, param, use_nccl=False): root=0, ) return - for dev_idx in devices[1:]: if _IsGPUBlob(model, param): - device_opt = core.DeviceOption(caffe2_pb2.CUDA, dev_idx) + if workspace.has_hip_support: + device_opt = core.DeviceOption(caffe2_pb2.HIP, hip_gpu_id=dev_idx) + else: + device_opt = core.DeviceOption(caffe2_pb2.CUDA, cuda_gpu_id=dev_idx) else: device_opt = core.DeviceOption(caffe2_pb2.CPU, 0) with core.DeviceScope(device_opt): @@ -1010,6 +1032,8 @@ def _AllReduce(devices, model, net, param, use_nccl=False, control_input=None): if model._device_type == caffe2_pb2.CUDA: p2p_access_pattern = workspace.GetCudaPeerAccessPattern() + elif model._device_type == caffe2_pb2.HIP: + p2p_access_pattern = workspace.GetHipPeerAccessPattern() else: p2p_access_pattern = None @@ -1034,7 +1058,10 @@ def sumN(*dev_indices): blobs[i], 'gpu_{}/{}_gpu{}_copy'.format(devices[0], param, peer) ) - device_opt = core.DeviceOption(model._device_type, devices[0]) + if workspace.has_hip_support: + device_opt = core.DeviceOption(model._device_type, hip_gpu_id=devices[0]) + else: + device_opt = core.DeviceOption(model._device_type, cuda_gpu_id=devices[0]) with core.DeviceScope(device_opt): net.Sum(blobs, [blobs[0]], name='dpm') @@ -1140,8 +1167,10 @@ def _SyncAllParamsDistributed( max_concurrent_distributed_ops ): assert rendezvous['num_shards'] > 1 - - gpu_device_opt = core.DeviceOption(model._device_type, devices[0]) + if workspace.has_hip_support: + gpu_device_opt = core.DeviceOption(model._device_type, hip_gpu_id=devices[0]) + else: + gpu_device_opt = core.DeviceOption(model._device_type, cuda_gpu_id=devices[0]) cpu_device_opt = core.DeviceOption(caffe2_pb2.CPU) if model._broadcast_context is None: @@ -1316,8 +1345,10 @@ def _AllReduceBlobsDistributed( num_workers = model.net.Proto().num_workers assert num_workers > 1, "Please specify more than 1 worker" all_reduce_engine = rendezvous['engine'] - - master_device_opt = core.DeviceOption(model._device_type, devices[0]) + if workspace.has_hip_support: + master_device_opt = core.DeviceOption(model._device_type, hip_gpu_id=devices[0]) + else: + master_device_opt = core.DeviceOption(model._device_type, cuda_gpu_id=devices[0]) reducing_device_opt = master_device_opt @@ -1393,7 +1424,10 @@ def _AllReduceBlobsSingleHost(blob_names, devices, model, net, use_nccl): # Now we need to Allreduce blobs on all the GPUs. # Pick GPU #0 as a master GPU. 
- master_device_opt = core.DeviceOption(model._device_type, devices[0]) + if workspace.has_hip_support: + master_device_opt = core.DeviceOption(model._device_type, hip_gpu_id=devices[0]) + else: + master_device_opt = core.DeviceOption(model._device_type, cuda_gpu_id=devices[0]) last_out = None concatenated_idx = set() @@ -1439,7 +1473,10 @@ def _AllReduceBlobsSingleHost(blob_names, devices, model, net, use_nccl): name="note:data_parallel_model") for gpu, g in viewitems(model._device_grouped_blobs[blob_name]): - device_opt = core.DeviceOption(model._device_type, gpu) + if workspace.has_hip_support: + device_opt = core.DeviceOption(model._device_type, hip_gpu_id=gpu) + else: + device_opt = core.DeviceOption(model._device_type, cuda_gpu_id=gpu) with core.DeviceScope(device_opt): model.Copy(grad_idx_concat, g.indices) concatenated_idx.add(g.indices) @@ -1451,7 +1488,10 @@ def _AllReduceBlobsSingleHost(blob_names, devices, model, net, use_nccl): axis=0, name="note:data_parallel_model") for gpu, g in viewitems(model._device_grouped_blobs[blob_name]): - device_opt = core.DeviceOption(model._device_type, gpu) + if workspace.has_hip_support: + device_opt = core.DeviceOption(model._device_type, hip_gpu_id=gpu) + else: + device_opt = core.DeviceOption(model._device_type, cuda_gpu_id=gpu) with core.DeviceScope(device_opt): model.Copy(grad_val_concat, g.values) @@ -1526,11 +1566,15 @@ def _AnalyzeOperators(model): continue op_dev = op.device_option - op_gpu = op_dev.cuda_gpu_id + op_gpu = op_dev.hip_gpu_id if workspace.has_hip_support else op_dev.cuda_gpu_id # This avoids failing on operators that are only for CPU - if op_dev.device_type != caffe2_pb2.CUDA: - continue + if workspace.has_hip_support: + if op_dev.device_type != caffe2_pb2.HIP: + continue + else: + if op_dev.device_type != caffe2_pb2.CUDA: + continue namescope = "{}_{}/".format(model._device_prefix, op_gpu) for inp in list(op.input) + list(op.output): @@ -1572,14 +1616,16 @@ def map_ops(proto): def _IsGPUBlob(model, blob_name): if blob_name in model._blob_to_device: - return model._blob_to_device[blob_name].device_type == caffe2_pb2.CUDA + return model._blob_to_device[blob_name].device_type == caffe2_pb2.CUDA or \ + model._blob_to_device[blob_name].device_type == caffe2_pb2.HIP else: blob_name = "{}_{}/{}".format( model._device_prefix, model._devices[0], blob_name ) if blob_name not in model._blob_to_device: - return model._device_type == caffe2_pb2.CUDA - return model._blob_to_device[blob_name].device_type == caffe2_pb2.CUDA + return model._device_type == caffe2_pb2.CUDA or model._device_type == caffe2_pb2.HIP + return model._blob_to_device[blob_name].device_type == caffe2_pb2.CUDA or \ + model._blob_to_device[blob_name].device_type == caffe2_pb2.HIP def _GroupByDevice(model, devices, params, non_data_params): @@ -1890,7 +1936,10 @@ def _InterleaveOps(model): new_ops = [] ops = {d: [] for d in range(num_devices)} for op in orig_ops: - ops[op.device_option.cuda_gpu_id].append(op) + if workspace.has_hip_support: + ops[op.device_option.hip_gpu_id].append(op) + else: + ops[op.device_option.cuda_gpu_id].append(op) for j in range(num_ops_per_dev): tp = None diff --git a/caffe2/python/docs/generator.py b/caffe2/python/docs/generator.py index 1bc41b7d1ccbc5..b7fcb708d8951e 100644 --- a/caffe2/python/docs/generator.py +++ b/caffe2/python/docs/generator.py @@ -103,7 +103,8 @@ def __init__(self, name): def getDeviceImpl(self): deviceImplList = [] for device, impl in [('CPU', OpSchema.get_cpu_impl(self.op_name)), - ('CUDA', 
OpSchema.get_cuda_impl(self.op_name))]: + ('CUDA', OpSchema.get_cuda_impl(self.op_name)), + ('HIP', OpSchema.get_hip_impl(self.op_name))]: if not impl: continue deviceImplList.append((device, impl)) @@ -194,7 +195,8 @@ def generateDevices(self, formatter): self.getInfo(formatter, 'CPU', OpSchema.get_cpu_impl(self.name)), self.getInfo(formatter, - 'GPU', OpSchema.get_cuda_impl(self.name)), + 'HIP', OpSchema.get_hip_impl(self.name)) if workspace.has_hip_support else + self.getInfo(formatter, 'GPU', OpSchema.get_cuda_impl(self.name)), ] formatter.addList([i for i in devices if i]) diff --git a/caffe2/python/helpers/conv.py b/caffe2/python/helpers/conv.py index fb1aabe153e652..76f0443e121b52 100644 --- a/caffe2/python/helpers/conv.py +++ b/caffe2/python/helpers/conv.py @@ -23,9 +23,9 @@ def _ConvBase( BiasInitializer=None, group=1, transform_inputs=None, - use_gpu_engine=False, + use_cudnn=False, order="NCHW", - gpu_engine_exhaustive_search=False, + cudnn_exhaustive_search=False, ws_nbytes_limit=None, float16_compute=False, **kwargs @@ -45,22 +45,18 @@ def _ConvBase( requested_engine = kwargs.get('engine') if requested_engine is not None: - if workspace.has_gpu_support and use_gpu_engine and requested_engine != 'CUDNN': + if use_cudnn and requested_engine != 'CUDNN': raise ValueError( - 'When use_gpu_engine=True and has CUDA GPU, the only engine you can specify is ' + 'When use_cudnn=True, the only engine you can specify is ' '"CUDNN"') - elif workspace.has_hip_support and use_gpu_engine and requested_engine != 'MIOPEN': + elif not use_cudnn and requested_engine == 'CUDNN': raise ValueError( - 'When use_gpu_engine=True and has HIP GPU, the only engine you can specify is ' - '"MIOPEN"') - elif not use_gpu_engine and (requested_engine in {'CUDNN','MIOPEN'}): - raise ValueError( - 'When use_gpu_engine=False, the only engine you can specify is ' - '""') + 'When use_cudnn=False, the only engine you can specify is ' + '""') - if use_gpu_engine: - kwargs['engine'] = 'MIOPEN' if workspace.has_hip_support else 'CUDNN' - kwargs['exhaustive_search'] = gpu_engine_exhaustive_search + if use_cudnn: + kwargs['engine'] = 'CUDNN' + kwargs['exhaustive_search'] = cudnn_exhaustive_search if ws_nbytes_limit: kwargs['ws_nbytes_limit'] = ws_nbytes_limit @@ -199,9 +195,9 @@ def conv_transpose( kernel, weight_init=None, bias_init=None, - use_gpu_engine=False, + use_cudnn=False, order="NCHW", - gpu_engine_exhaustive_search=False, + cudnn_exhaustive_search=False, ws_nbytes_limit=None, **kwargs ): @@ -234,9 +230,9 @@ def conv_transpose( blob_out + '_b', model.param_init_net) model.AddParameter(weight, ParameterTags.WEIGHT) model.AddParameter(bias, ParameterTags.BIAS) - if use_gpu_engine: - kwargs['engine'] = 'MIOPEN' if workspace.has_hip_support else 'CUDNN' - kwargs['exhaustive_search'] = gpu_engine_exhaustive_search + if use_cudnn: + kwargs['engine'] = 'CUDNN' + kwargs['exhaustive_search'] = cudnn_exhaustive_search if ws_nbytes_limit: kwargs['ws_nbytes_limit'] = ws_nbytes_limit return model.net.ConvTranspose( @@ -280,9 +276,9 @@ def group_conv_deprecated( weight_init=None, bias_init=None, group=1, - use_gpu_engine=False, + use_cudnn=False, order="NCHW", - gpu_engine_exhaustive_search=False, + cudnn_exhaustive_search=False, ws_nbytes_limit=None, **kwargs ): @@ -294,9 +290,9 @@ def group_conv_deprecated( weight_init = weight_init if weight_init else ('XavierFill', {}) bias_init = bias_init if bias_init else ('ConstantFill', {}) use_bias = False if ("no_bias" in kwargs and kwargs["no_bias"]) else True - if use_gpu_engine: 
- kwargs['engine'] = 'MIOPEN' if workspace.has_hip_support else 'CUDNN' - kwargs['exhaustive_search'] = gpu_engine_exhaustive_search + if use_cudnn: + kwargs['engine'] = 'CUDNN' + kwargs['exhaustive_search'] = cudnn_exhaustive_search if ws_nbytes_limit: kwargs['ws_nbytes_limit'] = ws_nbytes_limit if dim_in % group: diff --git a/caffe2/python/hypothesis_test_util.py b/caffe2/python/hypothesis_test_util.py index f640f6db20eff8..e0083167c4b56e 100644 --- a/caffe2/python/hypothesis_test_util.py +++ b/caffe2/python/hypothesis_test_util.py @@ -278,7 +278,7 @@ def gradient_checker_device_option(): ) gcs_cpu_only = dict(gc=st.sampled_from([cpu_do]), dc=st.just([cpu_do])) -gcs_gpu_only = dict(gc=st.sampled_from([gpu_do]), dc=st.just([gpu_do])) +gcs_gpu_only = dict(gc=st.sampled_from([gpu_do]+[hip_do]), dc=st.just([gpu_do]+[hip_do])) gcs_no_hip = dict(gc=st.sampled_from(_device_options_no_hip), dc=st.just(_device_options_no_hip)) diff --git a/caffe2/python/model_device_test.py b/caffe2/python/model_device_test.py index 31cba3facb0559..3f438d35cad35e 100644 --- a/caffe2/python/model_device_test.py +++ b/caffe2/python/model_device_test.py @@ -124,7 +124,7 @@ def _testMiniAlexNet(self, order): cpu_device = caffe2_pb2.DeviceOption() cpu_device.device_type = caffe2_pb2.CPU gpu_device = caffe2_pb2.DeviceOption() - gpu_device.device_type = caffe2_pb2.CUDA + gpu_device.device_type = caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA checker = device_checker.DeviceChecker(0.05, [cpu_device, gpu_device]) ret = checker.CheckNet( @@ -136,7 +136,7 @@ def _testMiniAlexNet(self, order): ) self.assertEqual(ret, True) - @unittest.skipIf(not workspace.has_gpu_support, + @unittest.skipIf(not workspace.has_gpu_support and not workspace.has_hip_support, "No GPU support. Skipping test.") def testMiniAlexNetNCHW(self): self._testMiniAlexNet("NCHW") diff --git a/caffe2/python/model_helper.py b/caffe2/python/model_helper.py index e84369fbbbef7f..e03943e3f084f4 100644 --- a/caffe2/python/model_helper.py +++ b/caffe2/python/model_helper.py @@ -570,7 +570,10 @@ def rename_list(proto_list): rename_list(step_op.output) if device is not None: step_op.device_option.device_type = device.device_type - step_op.device_option.cuda_gpu_id = device.cuda_gpu_id + if workspace.has_hip_support: + step_op.device_option.hip_gpu_id = device.hip_gpu_id + else: + step_op.device_option.cuda_gpu_id = device.cuda_gpu_id rename_list(arg.n.external_input) rename_list(arg.n.external_output) @@ -584,7 +587,10 @@ def rename_list(proto_list): if device is not None: op.device_option.device_type = device.device_type - op.device_option.cuda_gpu_id = device.cuda_gpu_id + if workspace.has_hip_support: + op.device_option.hip_gpu_id = device.hip_gpu_id + else: + op.device_option.cuda_gpu_id = device.cuda_gpu_id validate_op(op) predict_proto.op.extend([op]) known_blobs.update(op.output) diff --git a/caffe2/python/muji.py b/caffe2/python/muji.py index b407f96d2391f8..186857fe569430 100644 --- a/caffe2/python/muji.py +++ b/caffe2/python/muji.py @@ -25,8 +25,12 @@ def OnGPU(gpu_id): specified gpu id. 
""" device_option = caffe2_pb2.DeviceOption() - device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = gpu_id + if workspace.has_hip_support: + device_option.device_type = caffe2_pb2.HIP + device_option.hip_gpu_id = gpu_id + else: + device_option.device_type = caffe2_pb2.CUDA + device_option.cuda_gpu_id = gpu_id return device_option @@ -48,7 +52,7 @@ def Allreduce(net, blobs, reduced_affix="_reduced", gpu_indices=None): "gpu_indices length and blobs length mismatch: %d vs %d" % (len(gpu_indices), len(blobs)) ) - pattern = workspace.GetCudaPeerAccessPattern() + pattern = workspace.GetHipPeerAccessPattern() if workspace.has_hip_support else workspace.GetCudaPeerAccessPattern() if len(blobs) == 2 and pattern.shape[0] >= 2 and np.all(pattern[:2, :2]): return Allreduce2(net, blobs, reduced_affix, gpu_indices) elif len(blobs) == 4 and pattern.shape[0] >= 4 and np.all(pattern[:4, :4]): diff --git a/caffe2/python/net_printer.py b/caffe2/python/net_printer.py index 4b5cddb61d244e..60b3598f3fb192 100644 --- a/caffe2/python/net_printer.py +++ b/caffe2/python/net_printer.py @@ -8,6 +8,7 @@ from caffe2.proto.caffe2_pb2 import OperatorDef, NetDef from caffe2.python.checkpoint import Job from caffe2.python.core import Net, ExecutionStep, Plan +from caffe2.python import workspace from caffe2.python.task import Task, TaskGroup, WorkspaceType, TaskOutput from collections import defaultdict from contextlib import contextmanager @@ -267,12 +268,13 @@ def call(op, inputs=None, outputs=None, factor_prefixes=False): def format_device_option(dev_opt): + gpu_id = dev_opt.hip_gpu_id if workspace.has_hip_support else dev_opt.cuda_gpu_id if not dev_opt or not ( - dev_opt.device_type or dev_opt.cuda_gpu_id or dev_opt.node_name): + dev_opt.device_type or gpu_id or dev_opt.node_name): return None return call( 'DeviceOption', - [dev_opt.device_type, dev_opt.cuda_gpu_id, "'%s'" % dev_opt.node_name]) + [dev_opt.device_type, gpu_id, "'%s'" % dev_opt.node_name]) @Printer.register(OperatorDef) diff --git a/caffe2/python/operator_test/conv_test.py b/caffe2/python/operator_test/conv_test.py index 1ec140931d68ac..0f8cf66b0ae1ec 100644 --- a/caffe2/python/operator_test/conv_test.py +++ b/caffe2/python/operator_test/conv_test.py @@ -521,7 +521,7 @@ def canonical(o): ["simple", "dag"] + (["async_dag"] if workspace.has_gpu_support or workspace.has_hip_support else [])), do=st.sampled_from(hu.device_options), - engine=st.sampled_from(["MIOPEN" if workspace.has_hip_support else "CUDNN", ""])) + engine=st.sampled_from(["CUDNN", ""])) def test_convolution_sync(self, net_type, num_workers, do, engine): m = ModelHelper(name="test_model") n = 1 @@ -532,7 +532,7 @@ def test_convolution_sync(self, net_type, num_workers, do, engine): w = 5 workspace.ResetWorkspace() - use_gpu_engine = (engine == 'CUDNN' or engine == 'MIOPEN') + use_cudnn = (engine == 'CUDNN') np.random.seed(1701) # Build a binary tree of conv layers, summing at each node. 
@@ -554,7 +554,7 @@ def test_convolution_sync(self, net_type, num_workers, do, engine): stride=1, pad=1, deterministic=1, - use_gpu_engine=use_gpu_engine, + use_cudnn=use_cudnn, engine=engine) brew.conv( m, bottom_2, mid_2, @@ -566,7 +566,7 @@ def test_convolution_sync(self, net_type, num_workers, do, engine): bias_init=('ConstantFill', dict(value=b2)), deterministic=1, cudnn_state=np.random.randint(0, 3), - use_gpu_engine=use_gpu_engine, + use_cudnn=use_cudnn, engine=engine) m.net.Sum([mid_1, mid_2], top) @@ -605,47 +605,42 @@ def run(): 1763719461732352.0, rtol=1e-5) - def test_use_gpu_engine_interactions(self): - """Make sure the use_gpu_engine and engine kwargs work as expected.""" + def test_use_cudnn_engine_interactions(self): + """Make sure the use_cudnn and engine kwargs work as expected.""" for model_default in [None, True, False]: arg_scope = {} if model_default is not None: - arg_scope['use_gpu_engine'] = model_default + arg_scope['use_cudnn'] = model_default else: model_default = True # the default model = ModelHelper(arg_scope=arg_scope) - self.assertEqual(model.arg_scope['use_gpu_engine'], model_default) + self.assertEqual(model.arg_scope['use_cudnn'], model_default) f = functools.partial(brew.conv, model, 'conv_in', 'conv_out', 10, 10, 5) - for op_gpu_engine in [None, True, False]: - for op_engine in [None, '', 'MIOPEN' if workspace.has_hip_support else 'CUDNN']: + for op_cudnn in [None, True, False]: + for op_engine in [None, '', 'CUDNN']: kwargs = {} - if op_gpu_engine is not None: - kwargs['use_gpu_engine'] = op_gpu_engine + if op_cudnn is not None: + kwargs['use_cudnn'] = op_cudnn else: - op_gpu_engine = False # the default + op_cudnn = False # the default if op_engine is not None: kwargs['engine'] = op_engine - calculated_gpu_engine = kwargs.get('use_gpu_engine', model_default) - if calculated_gpu_engine: - expected_engine_default = 'MIOPEN' if workspace.has_hip_support else 'CUDNN' - else: - expected_engine_default = '' + calculated_cudnn = kwargs.get('use_cudnn', model_default) expected_engine = kwargs.get( 'engine', - expected_engine_default) + 'CUDNN' if calculated_cudnn else '') - if ((calculated_gpu_engine is True and op_engine == '') or - (calculated_cudnn is False and op_engine == ('MIOPEN' if workspace.has_hip_support else 'CUDNN'))): + if ((calculated_cudnn is True and op_engine == '') or + (calculated_cudnn is False and op_engine == 'CUDNN')): with self.assertRaises(ValueError): f(**kwargs) else: f(**kwargs) - self.assertEqual(model.Proto().op[-1].engine, - expected_engine) + self.assertEqual(model.Proto().op[-1].engine, expected_engine) @given(op_type=st.sampled_from(["Conv", "Conv2D"]), N=st.integers(1, 4), G=st.integers(1, 4), DX=st.integers(1, 4), DY=st.integers(1, 4), diff --git a/caffe2/python/operator_test/pooling_test.py b/caffe2/python/operator_test/pooling_test.py index 956d0ec9619987..4301b5c60f66f3 100644 --- a/caffe2/python/operator_test/pooling_test.py +++ b/caffe2/python/operator_test/pooling_test.py @@ -209,12 +209,14 @@ def test_pooling_with_index(self, stride, pad, kernel, size, @given(sz=st.integers(1, 20), batch_size=st.integers(1, 4), - engine=st.sampled_from(["", "CUDNN"]), + engine=st.sampled_from(["", "MIOPEN" if workspace.has_hip_support else "CUDNN"]), op_type=st.sampled_from(["AveragePool", "AveragePool2D"]), **hu.gcs) @settings(max_examples=3, timeout=10) def test_global_avg_pool_nchw(self, op_type, sz, batch_size, engine, gc, dc): ''' Special test to stress the fast path of NCHW average pool ''' + if engine == 'MIOPEN': + 
assume(sz<16) op = core.CreateOperator( op_type, ["X"], @@ -233,7 +235,7 @@ def test_global_avg_pool_nchw(self, op_type, sz, batch_size, engine, gc, dc): @given(sz=st.integers(1, 20), batch_size=st.integers(1, 4), - engine=st.sampled_from(["", "CUDNN"]), + engine=st.sampled_from(["", "MIOPEN" if workspace.has_hip_support else "CUDNN"]), op_type=st.sampled_from(["MaxPool", "MaxPool2D"]), **hu.gcs) @settings(max_examples=3, timeout=10) @@ -241,7 +243,10 @@ def test_global_max_pool_nchw(self, op_type, sz, batch_size, engine, gc, dc): ''' Special test to stress the fast path of NCHW max pool ''' # CuDNN 5 does not support deterministic max pooling. - assume(workspace.GetCuDNNVersion() >= 6000 or engine != "CUDNN") + if engine == 'MIOPEN': + assume(sz<16) + if not workspace.has_hip_support: + assume(workspace.GetCuDNNVersion() >= 6000 or engine != "CUDNN") op = core.CreateOperator( op_type, ["X"], @@ -270,12 +275,14 @@ def test_global_max_pool_nchw(self, op_type, sz, order=st.sampled_from(["NCHW", "NHWC"]), op_type=st.sampled_from(["MaxPool", "AveragePool", "LpPool", "MaxPool2D", "AveragePool2D"]), - engine=st.sampled_from(["", "CUDNN"]), + engine=st.sampled_from(["", "MIOPEN" if workspace.has_hip_support else "CUDNN"]), **hu.gcs) def test_pooling(self, stride, pad, kernel, size, input_channels, batch_size, order, op_type, engine, gc, dc): assume(pad < kernel) + if engine == 'MIOPEN': + assume(op_type != "LpPool" and order == "NCHW") op = core.CreateOperator( op_type, ["X"], @@ -300,12 +307,14 @@ def test_pooling(self, stride, pad, kernel, size, batch_size=st.integers(1, 3), order=st.sampled_from(["NCHW", "NHWC"]), op_type=st.sampled_from(["MaxPool", "AveragePool", "LpPool"]), - engine=st.sampled_from(["", "CUDNN"]), + engine=st.sampled_from(["", "MIOPEN" if workspace.has_hip_support else "CUDNN"]), **hu.gcs) def test_global_pooling(self, size, input_channels, batch_size, order, op_type, engine, gc, dc): # CuDNN 5 does not support deterministic max pooling. 
assume(workspace.GetCuDNNVersion() >= 6000 or op_type != "MaxPool") + if engine == 'MIOPEN': + assume(op_type != "LpPool" and order == "NCHW") op = core.CreateOperator( op_type, ["X"], diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py index db870972f83946..b26cc24b7e09e8 100644 --- a/caffe2/python/optimizer.py +++ b/caffe2/python/optimizer.py @@ -75,9 +75,11 @@ def make_unique_blob_name(self, base_str): if current_scope is None: return self.get_cpu_blob_name(base_str) - if current_scope.device_type == caffe2_pb2.CUDA: + if current_scope.device_type == caffe2_pb2.CUDA or current_scope.device_type == caffe2_pb2.HIP: return self.get_gpu_blob_name( - base_str, current_scope.cuda_gpu_id, current_scope.node_name + base_str, + current_scope.hip_gpu_id if workspace.has_hip_support else current_scope.cuda_gpu_id, + current_scope.node_name ) else: return self.get_cpu_blob_name(base_str, current_scope.node_name) @@ -121,7 +123,7 @@ def build_lr(self, net, param_init_net, base_learning_rate, if self._local_lr_multiplier is not None: current_scope = scope.CurrentDeviceScope() if (current_scope is not None - and current_scope.device_type == caffe2_pb2.CUDA + and current_scope.device_type == (caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA) and not self._local_lr_multiplier_on_gpu): local_lr_multiplier = net.CopyFromCPUInput( self._local_lr_multiplier, @@ -241,7 +243,7 @@ def _run(self, net, param_init_net, param_info): self._add_local_lr_multiplier( lr_lars_multiplier, is_gpu_blob=(current_scope is not None - and current_scope.device_type == caffe2_pb2.CUDA), + and current_scope.device_type == (caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA)), ) # We need negative sign for LR when used directly with WeightedSum @@ -262,7 +264,7 @@ def _run(self, net, param_init_net, param_info): # to include device information. 
ONE = param_init_net.ConstantFill( [], - "ONE_{}_{}{}".format(dev.device_type, dev.cuda_gpu_id, dev.node_name), + "ONE_{}_{}{}".format(dev.device_type, dev.hip_gpu_id if workspace.has_hip_support else dev.cuda_gpu_id, dev.node_name), shape=[1], value=1.0 ) @@ -471,12 +473,12 @@ def _run(self, net, param_init_net, param_info): ONE = param_init_net.ConstantFill( [], - "ONE_{}_{}".format(dev.device_type, dev.cuda_gpu_id), + "ONE_{}_{}".format(dev.device_type, dev.hip_gpu_id if workspace.has_hip_support else dev.cuda_gpu_id), shape=[1], value=1.0 ) WD = param_init_net.ConstantFill( - [], "wd_{}_{}".format(dev.device_type, dev.cuda_gpu_id), + [], "wd_{}_{}".format(dev.device_type, dev.hip_gpu_id if workspace.has_hip_support else dev.cuda_gpu_id), shape=[1], value=self.weight_decay ) @@ -528,7 +530,7 @@ def _run(self, net, param_init_net, param_info): self._add_local_lr_multiplier( lr_lars_multiplier, is_gpu_blob=(current_scope is not None - and current_scope.device_type == caffe2_pb2.CUDA), + and current_scope.device_type == (caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA)), ) lr, _ = self.build_lr( @@ -649,7 +651,7 @@ def _run(self, net, param_init_net, param_info): self._add_local_lr_multiplier( lr_lars_multiplier, is_gpu_blob=(current_scope is not None - and current_scope.device_type == caffe2_pb2.CUDA), + and current_scope.device_type == (caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA)), ) lr, _ = self.build_lr( @@ -1126,7 +1128,7 @@ def _run(self, net, param_init_net, param_info): ONE = param_init_net.ConstantFill( [], - "ONE_{}_{}".format(dev.device_type, dev.cuda_gpu_id), + "ONE_{}_{}".format(dev.device_type, dev.hip_gpu_id if workspace.has_hip_support else dev.cuda_gpu_id), shape=[1], value=1.0 ) diff --git a/caffe2/python/optimizer_test.py b/caffe2/python/optimizer_test.py index bbb0c02625f09a..a136684cd0e38e 100644 --- a/caffe2/python/optimizer_test.py +++ b/caffe2/python/optimizer_test.py @@ -77,7 +77,7 @@ def check_optimizer(self, optimizer): tensor = workspace.FetchBlob(param) np.testing.assert_allclose(np.array([1.0]), tensor, atol=1e-5) - @unittest.skipIf(not workspace.has_gpu_support, "No GPU support") + @unittest.skipIf(not workspace.has_gpu_support and not workspace.has_hip_support , "No GPU support") def testGPUDense(self): super(TestMultiPrecisionSgd, self).testGPUDense(core.DataType.FLOAT16) @@ -434,11 +434,11 @@ def test_caffe2_cpu_vs_numpy(self): ) @unittest.skip("Results might vary too much. 
Only for individual use.") - @unittest.skipIf(not workspace.has_gpu_support, "No gpu support") + @unittest.skipIf(not workspace.has_gpu_support and not workspace.has_hip_support, "No gpu support") def test_caffe2_gpu_vs_numpy(self): n_dim = 1000000 n_iter = 50 - gpu_device_opt = core.DeviceOption(caffe2_pb2.CUDA, 0) + gpu_device_opt = core.DeviceOption(caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA, 0) with core.DeviceScope(gpu_device_opt): for zero_debias in [False, True]: for grad_coef in [1.0, 0.1, 0.01]: diff --git a/caffe2/python/optimizer_test_util.py b/caffe2/python/optimizer_test_util.py index dbb0dbeae2dd37..4c62d9ee21b797 100644 --- a/caffe2/python/optimizer_test_util.py +++ b/caffe2/python/optimizer_test_util.py @@ -70,7 +70,7 @@ def testDense(self): @unittest.skipIf(not workspace.has_gpu_support, "No gpu support") def testGPUDense(self, dtype=core.DataType.FLOAT): - device_opt = core.DeviceOption(caffe2_pb2.CUDA, 0) + device_opt = core.DeviceOption(caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA, 0) with core.DeviceScope(device_opt): model, _perfect_model, data, label = self._createDense(dtype) if dtype == core.DataType.FLOAT16: diff --git a/caffe2/python/scope_test.py b/caffe2/python/scope_test.py index 11f7a2c44046af..f4b3aa4a244d17 100644 --- a/caffe2/python/scope_test.py +++ b/caffe2/python/scope_test.py @@ -3,7 +3,7 @@ from __future__ import print_function from __future__ import unicode_literals -from caffe2.python import scope, core +from caffe2.python import scope, core, workspace from caffe2.proto import caffe2_pb2 import unittest @@ -18,7 +18,10 @@ def thread_runner(idx, testobj): testobj.assertEquals(scope.CurrentNameScope(), "") testobj.assertEquals(scope.CurrentDeviceScope(), None) namescope = "namescope_{}".format(idx) - dsc = core.DeviceOption(caffe2_pb2.CUDA, idx) + if workspace.has_hip_support: + dsc = core.DeviceOption(caffe2_pb2.HIP, hip_gpu_id=idx) + else: + dsc = core.DeviceOption(caffe2_pb2.CUDA, cuda_gpu_id=idx) with scope.DeviceScope(dsc): with scope.NameScope(namescope): testobj.assertEquals(scope.CurrentNameScope(), namescope + "/") @@ -58,7 +61,10 @@ def testNamescopeAssertion(self): def testDevicescopeBasic(self): self.assertEquals(scope.CurrentDeviceScope(), None) - dsc = core.DeviceOption(caffe2_pb2.CUDA, 9) + if workspace.has_hip_support: + dsc = core.DeviceOption(caffe2_pb2.HIP, hip_gpu_id=9) + else: + dsc = core.DeviceOption(caffe2_pb2.CUDA, cuda_gpu_id=9) with scope.DeviceScope(dsc): self.assertEquals(scope.CurrentDeviceScope(), dsc) @@ -67,7 +73,10 @@ def testDevicescopeBasic(self): def testEmptyDevicescopeBasic(self): self.assertEquals(scope.CurrentDeviceScope(), None) - dsc = core.DeviceOption(caffe2_pb2.CUDA, 9) + if workspace.has_hip_support: + dsc = core.DeviceOption(caffe2_pb2.HIP, hip_gpu_id=9) + else: + dsc = core.DeviceOption(caffe2_pb2.CUDA, cuda_gpu_id=9) with scope.DeviceScope(dsc): self.assertEquals(scope.CurrentDeviceScope(), dsc) with scope.EmptyDeviceScope(): @@ -78,7 +87,10 @@ def testEmptyDevicescopeBasic(self): def testDevicescopeAssertion(self): self.assertEquals(scope.CurrentDeviceScope(), None) - dsc = core.DeviceOption(caffe2_pb2.CUDA, 9) + if workspace.has_hip_support: + dsc = core.DeviceOption(caffe2_pb2.HIP, hip_gpu_id=9) + else: + dsc = core.DeviceOption(caffe2_pb2.CUDA, cuda_gpu_id=9) try: with scope.DeviceScope(dsc): diff --git a/caffe2/python/workspace_test.py b/caffe2/python/workspace_test.py index 661a92d7c1f68d..72935a94ac45e7 100644 --- a/caffe2/python/workspace_test.py +++ 
b/caffe2/python/workspace_test.py @@ -589,8 +589,8 @@ def test_simple_transform(self, input_dim, output_dim, batch_size): conv = brew.conv(m, fc2, "conv", dim_in=output_dim, dim_out=output_dim, - use_gpu_engine=True, - engine="MIOPEN" if workspace.has_hip_support else "CUDNN", + use_cudnn=True, + engine="CUDNN", kernel=3) conv.Relu([], conv)\ @@ -631,8 +631,8 @@ def test_apply_transform_if_faster(self, value): dim_in=5, dim_out=5, kernel=3, - use_gpu_engine=True, - engine="MIOPEN" if workspace.has_hip_support else "CUDNN") + use_cudnn=True, + engine="CUDNN") conv.Relu([], conv)\ .Softmax([], "pred") \