From c8328ce768db33e0d01ad67c896e5b7e5a0dc9b8 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 14 Aug 2018 15:55:01 -0700 Subject: [PATCH 01/28] py changes and docs --- .jenkins/caffe2/amd/binary_tests.sh | 16 + .jenkins/caffe2/amd/build_amd.sh | 71 ++++ .../caffe2/amd/install_dependencies_amd.sh | 87 +++++ .jenkins/caffe2/amd/python_tests.sh | 40 +++ caffe2/python/convnet_benchmarks.py | 101 +++++- caffe2/python/core.py | 50 ++- caffe2/python/core_gradients_test.py | 6 +- caffe2/python/core_test.py | 311 +++++++++++++----- caffe2/python/data_parallel_model.py | 121 +++++-- caffe2/python/docs/generator.py | 6 +- caffe2/python/functional_test.py | 2 +- caffe2/python/gradient_check_test.py | 4 +- caffe2/python/hypothesis_test_util.py | 2 +- caffe2/python/memonger_test.py | 11 +- caffe2/python/model_device_test.py | 4 +- caffe2/python/model_helper.py | 10 +- caffe2/python/muji.py | 10 +- caffe2/python/net_printer.py | 6 +- .../operator_test/activation_ops_test.py | 6 +- .../python/operator_test/boolean_mask_test.py | 8 +- caffe2/python/operator_test/ceil_op_test.py | 4 +- caffe2/python/operator_test/conv_test.py | 40 ++- caffe2/python/operator_test/copy_ops_test.py | 30 +- .../elementwise_op_broadcast_test.py | 4 +- caffe2/python/operator_test/pooling_test.py | 19 +- caffe2/python/optimizer.py | 55 ++-- caffe2/python/optimizer_test.py | 6 +- caffe2/python/optimizer_test_util.py | 2 +- caffe2/python/scope_test.py | 22 +- caffe2/python/workspace.py | 18 +- caffe2/python/workspace_test.py | 11 +- rocm-docs/caffe2-build.md | 108 ++++++ 32 files changed, 934 insertions(+), 257 deletions(-) create mode 100755 .jenkins/caffe2/amd/binary_tests.sh create mode 100644 .jenkins/caffe2/amd/build_amd.sh create mode 100755 .jenkins/caffe2/amd/install_dependencies_amd.sh create mode 100755 .jenkins/caffe2/amd/python_tests.sh create mode 100644 rocm-docs/caffe2-build.md diff --git a/.jenkins/caffe2/amd/binary_tests.sh b/.jenkins/caffe2/amd/binary_tests.sh new file mode 100755 index 00000000000000..132cc9c5606be0 --- /dev/null +++ b/.jenkins/caffe2/amd/binary_tests.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +set -ex + +LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +ROOT_DIR=$(cd "$LOCAL_DIR"/../../../ && pwd) + +cd "$ROOT_DIR" + +echo "Running C++ tests.." + +for file in $(find "${ROOT_DIR}/build_caffe2/bin" -executable -type f); do + if [[ "$file" =~ "test" ]]; then + $file + fi +done diff --git a/.jenkins/caffe2/amd/build_amd.sh b/.jenkins/caffe2/amd/build_amd.sh new file mode 100644 index 00000000000000..5c5dd0a3af41d5 --- /dev/null +++ b/.jenkins/caffe2/amd/build_amd.sh @@ -0,0 +1,71 @@ +#!/bin/bash + +set -ex + +# The INSTALL_PREFIX here must match up with test.sh +INSTALL_PREFIX="/usr/local/caffe2" +LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +ROOT_DIR=$(cd "$LOCAL_DIR"/../../.. && pwd) +CMAKE_ARGS=() + +############################################################################## +# Explicitly set Python executable. +############################################################################### +# On Ubuntu 16.04 the default Python is still 2.7. 
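+# If several interpreters are on PATH, `which python` may not resolve to the
+# interpreter the Python packages below are installed for; pinning it
+# explicitly (e.g. PYTHON="$(which python2.7)") is a possible hedge.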
+PYTHON="$(which python)"
+
+###############################################################################
+# Set cmake args
+###############################################################################
+CMAKE_ARGS+=("-DBUILD_BINARY=ON")
+CMAKE_ARGS+=("-DBUILD_TEST=ON")
+CMAKE_ARGS+=("-DINSTALL_TEST=ON")
+CMAKE_ARGS+=("-DUSE_OBSERVERS=ON")
+CMAKE_ARGS+=("-DUSE_ZSTD=ON")
+CMAKE_ARGS+=("-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX}")
+
+# TODO: This is patching the official FindHIP to properly handle
+# cmake generator expressions. A PR is open in the upstream repo here:
+# https://github.com/ROCm-Developer-Tools/HIP/pull/516
+# Remove this hack once it's merged.
+if [[ -f /opt/rocm/hip/cmake/FindHIP.cmake ]]; then
+  sudo sed -i 's/\ -I${dir}/\ $<$:-I${dir}>/' /opt/rocm/hip/cmake/FindHIP.cmake
+fi
+
+export LANG=C.UTF-8
+export LC_ALL=C.UTF-8
+export HCC_AMDGPU_TARGET=gfx900
+
+########## HIPIFY Caffe2 operators
+${PYTHON} "${ROOT_DIR}/tools/amd_build/build_caffe2_amd.py"
+
+MAX_JOBS=$(nproc)
+
+###############################################################################
+# Configure and make
+###############################################################################
+# Run cmake from ./build_caffe2 directory so it doesn't conflict with the
+# standard PyTorch build directory. Eventually these won't need to
+# be separate.
+rm -rf build_caffe2
+mkdir build_caffe2
+cd ./build_caffe2
+
+# Configure ("${CMAKE_ARGS[@]}" keeps each argument intact under word
+# splitting, unlike unquoted ${CMAKE_ARGS[*]})
+cmake "${ROOT_DIR}" "${CMAKE_ARGS[@]}" "$@"
+
+# Build
+if [ "$(uname)" == "Linux" ]; then
+  make "-j${MAX_JOBS}" install
+else
+  echo "Don't know how to build on $(uname)"
+  exit 1
+fi
+
+###############################################################################
+# Install ONNX
+###############################################################################
+
+# Install ONNX into a local directory
+pip install --user -b /tmp/pip_install_onnx "file://${ROOT_DIR}/third_party/onnx#egg=onnx"
+
diff --git a/.jenkins/caffe2/amd/install_dependencies_amd.sh b/.jenkins/caffe2/amd/install_dependencies_amd.sh
new file mode 100755
index 00000000000000..51c2fe98ce4aa0
--- /dev/null
+++ b/.jenkins/caffe2/amd/install_dependencies_amd.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+
+set -ex
+
+apt-get update
+apt-get install -y --no-install-recommends \
+  autoconf \
+  build-essential \
+  ca-certificates \
+  cmake \
+  curl \
+  git \
+  libgoogle-glog-dev \
+  libhiredis-dev \
+  libiomp-dev \
+  libleveldb-dev \
+  liblmdb-dev \
+  libopencv-dev \
+  libpthread-stubs0-dev \
+  libsnappy-dev \
+  libprotobuf-dev \
+  protobuf-compiler \
+  python-dev \
+  python-setuptools \
+  python-pip
+
+pip install networkx==2.0
+
+pip install --user \
+  click \
+  future \
+  hypothesis \
+  jupyter \
+  numpy \
+  protobuf \
+  pytest \
+  pyyaml \
+  scipy==0.19.1 \
+  scikit-image \
+  tabulate \
+  virtualenv \
+  mock \
+  typing \
+  typing-extensions
+
+CLANG_VERSION=3.8
+
+apt-get update
+apt-get install -y --no-install-recommends clang-"$CLANG_VERSION"
+rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+
+# Use update-alternatives to make this clang version the default gcc/g++
+update-alternatives --install /usr/bin/gcc gcc /usr/bin/clang-"$CLANG_VERSION" 50
+update-alternatives --install /usr/bin/g++ g++ /usr/bin/clang++-"$CLANG_VERSION" 50
+
+
+apt-get install -y --no-install-recommends \
+  rocm-dev \
+  rocm-utils \
+  rocm-libs \
+  rocfft \
+  miopen-hip \
+  miopengemm \
+  rocblas \
+  rocrand
+
+
+if [[ -f /opt/rocm/hip/cmake/FindHIP.cmake ]]; then
+  sudo sed -i 's/\ -I${dir}/\ $<$:-I${dir}>/' /opt/rocm/hip/cmake/FindHIP.cmake
+fi
+
+if [[ -f
/opt/rocm/hip/cmake/FindHIP.cmake ]]; then + sudo sed -i 's/set(_hip_build_configuration "${CMAKE_BUILD_TYPE}")/string(TOUPPER _hip_build_configuration "${CMAKE_BUILD_TYPE}")/' /opt/rocm/hip/cmake/FindHIP.cmake +fi + +git clone --recursive https://github.com/ROCmSoftwarePlatform/Thrust.git /data/Thrust +rm -rf /data/Thrust/thrust/system/cuda/detail/cub-hip +git clone --recursive https://github.com/ROCmSoftwarePlatform/cub-hip.git /data/Thrust/thrust/system/cuda/detail/cub-hip + + +export PATH=/opt/rocm/bin:$PATH +export PATH=/opt/rocm/hcc/bin:$PATH +export PATH=/opt/rocm/hip/bin:$PATH +export PATH=/opt/rocm/opencl/bin:$PATH +export THRUST_ROOT=/data/Thrust +export HIP_PLATFORM=hcc diff --git a/.jenkins/caffe2/amd/python_tests.sh b/.jenkins/caffe2/amd/python_tests.sh new file mode 100755 index 00000000000000..46daa7c3e98a6f --- /dev/null +++ b/.jenkins/caffe2/amd/python_tests.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +set -ex + +LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +ROOT_DIR=$(cd "$LOCAL_DIR"/../../../ && pwd) + +cd "$ROOT_DIR" + +# Get the relative path to where the caffe2 python module was installed +CAFFE2_PYPATH="$ROOT_DIR/build_caffe2/caffe2" + +rocm_ignore_test=() +# need to debug +rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/arg_ops_test.py") +rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/piecewise_linear_transform_test.py") +rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/softmax_ops_test.py") +rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/unique_ops_test.py") +rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/model_device_test.py") +rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/data_parallel_model_test.py") + +# Need to go through roi ops to replace max(...) with fmaxf(...) +rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/roi_align_rotated_op_test.py") + +# cuda top_k op has some asm code, the hipified version doesn't +# compile yet, so we don't have top_k operator for now +rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/top_k_test.py") + + +# Python tests +echo "Running Python tests.." 
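+# Quoting note: each rocm_ignore_test entry holds "--ignore <path>" as a
+# single string, so the array expansion below is left unquoted on purpose;
+# with "${rocm_ignore_test[@]}" pytest would receive one space-containing
+# argument per entry and fail to parse it.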
+python \ + -m pytest \ + -v \ + --ignore "$CAFFE2_PYPATH/python/test/executor_test.py" \ + --ignore "$CAFFE2_PYPATH/python/operator_test/matmul_op_test.py" \ + --ignore "$CAFFE2_PYPATH/python/operator_test/pack_ops_test.py" \ + --ignore "$CAFFE2_PYPATH/python/mkl/mkl_sbn_speed_test.py" \ + ${rocm_ignore_test[@]} \ + "$CAFFE2_PYPATH/python" \ No newline at end of file diff --git a/caffe2/python/convnet_benchmarks.py b/caffe2/python/convnet_benchmarks.py index 3aac78c18df16b..6b8af12f5cec71 100644 --- a/caffe2/python/convnet_benchmarks.py +++ b/caffe2/python/convnet_benchmarks.py @@ -59,11 +59,12 @@ """ import argparse +import os -from caffe2.python import workspace, brew, model_helper +from caffe2.python import workspace, brew, model_helper, core +from caffe2.proto import caffe2_pb2 - -def MLP(order, cudnn_ws): +def MLP(order, cudnn_ws, model_path=""): model = model_helper.ModelHelper(name="MLP") d = 256 depth = 20 @@ -98,7 +99,7 @@ def MLP(order, cudnn_ws): return model, d -def AlexNet(order, cudnn_ws): +def AlexNet(order, cudnn_ws, model_path=""): my_arg_scope = { 'order': order, 'use_cudnn': True, @@ -191,7 +192,7 @@ def AlexNet(order, cudnn_ws): return model, 224 -def OverFeat(order, cudnn_ws): +def OverFeat(order, cudnn_ws, model_path=""): my_arg_scope = { 'order': order, 'use_cudnn': True, @@ -277,7 +278,7 @@ def OverFeat(order, cudnn_ws): return model, 231 -def VGGA(order, cudnn_ws): +def VGGA(order, cudnn_ws, model_path=""): my_arg_scope = { 'order': order, 'use_cudnn': True, @@ -475,7 +476,7 @@ def _InceptionModule( return output -def Inception(order, cudnn_ws): +def Inception(order, cudnn_ws, model_path=""): my_arg_scope = { 'order': order, 'use_cudnn': True, @@ -562,6 +563,84 @@ def Inception(order, cudnn_ws): model.net.AveragedLoss(xent, "loss") return model, 224 +def Resnet50(order, cudnn_ws, model_path=""): + if model_path == "": + print("ERROR: please specify paths to init_net and predict_net protobufs for Resnet50") + exit(1) + device_opts = caffe2_pb2.DeviceOption() + device_opts.device_type = caffe2_pb2.HIP + device_opts.hip_gpu_id = 0 + + INIT_NET_PB = os.path.join(model_path, "init_net.pb") + PREDICT_NET_PB = os.path.join(model_path, "predict_net.pb") + init_def = caffe2_pb2.NetDef() + with open(INIT_NET_PB, 'rb') as f: + init_def.ParseFromString(f.read()) + init_def.device_option.CopyFrom(device_opts) + + net_def = caffe2_pb2.NetDef() + with open(PREDICT_NET_PB, 'rb') as f: + net_def.ParseFromString(f.read()) + net_def.device_option.CopyFrom(device_opts) + + init_net = core.Net(init_def) + predict_net = core.Net(net_def) + for op in init_net.Proto().op: + op.device_option.CopyFrom(device_opts) + for op in predict_net.Proto().op: + op.device_option.CopyFrom(device_opts) + my_arg_scope = { + 'order': order, + } + model = model_helper.ModelHelper( + name="resnet50", + arg_scope=my_arg_scope, + ) + + model.param_init_net = init_net + model.net = predict_net + xent = model.net.LabelCrossEntropy(["gpu_0/softmax", "label"], "xent") + model.net.AveragedLoss(xent, "loss") + return model, 224 + +def Inception_v2(order, cudnn_ws, model_path=""): + if model_path == "": + print("ERROR: please specify paths to init_net and predict_net protobufs for Inception_v2") + exit(1) + device_opts = caffe2_pb2.DeviceOption() + device_opts.device_type = caffe2_pb2.HIP + device_opts.hip_gpu_id = 0 + + INIT_NET_PB = os.path.join(model_path, "init_net.pb") + PREDICT_NET_PB = os.path.join(model_path, "predict_net.pb") + init_def = caffe2_pb2.NetDef() + with open(INIT_NET_PB, 'rb') as f: + 
init_def.ParseFromString(f.read()) + init_def.device_option.CopyFrom(device_opts) + + net_def = caffe2_pb2.NetDef() + with open(PREDICT_NET_PB, 'rb') as f: + net_def.ParseFromString(f.read()) + net_def.device_option.CopyFrom(device_opts) + + init_net = core.Net(init_def) + predict_net = core.Net(net_def) + + my_arg_scope = { + 'order': order, + } + + model = model_helper.ModelHelper( + name="GoogleNet", + arg_scope=my_arg_scope, + ) + + model.param_init_net = init_net + model.net = predict_net + xent = model.net.LabelCrossEntropy(["prob", "label"], "xent") + model.net.AveragedLoss(xent, "loss") + return model, 224 + def AddParameterUpdate(model): """ Simple plain SGD update -- not tuned to actually train the models """ @@ -575,7 +654,7 @@ def AddParameterUpdate(model): def Benchmark(model_gen, arg): - model, input_size = model_gen(arg.order, arg.cudnn_ws) + model, input_size = model_gen(arg.order, arg.cudnn_ws, arg.model_path) model.Proto().type = arg.net_type model.Proto().num_workers = arg.num_workers @@ -590,7 +669,7 @@ def Benchmark(model_gen, arg): model.param_init_net.GaussianFill( [], - "data", + "gpu_0/data" if arg.model == "Resnet50" else "data", shape=input_shape, mean=0.0, std=1.0 @@ -701,6 +780,7 @@ def GetArgumentParser(): parser.add_argument("--num_workers", type=int, default=2) parser.add_argument("--use-nvtx", default=False, action='store_true') parser.add_argument("--htrace_span_log_path", type=str) + parser.add_argument("--model_path", type=str, default="", help="set path to init net and predict_net protobufs") return parser @@ -723,5 +803,8 @@ def GetArgumentParser(): 'VGGA': VGGA, 'Inception': Inception, 'MLP': MLP, + 'Resnet50': Resnet50, + 'Inception_v2':Inception_v2 + } Benchmark(model_map[args.model], args) diff --git a/caffe2/python/core.py b/caffe2/python/core.py index 1423cfea3a6c09..b566e01da23e4e 100644 --- a/caffe2/python/core.py +++ b/caffe2/python/core.py @@ -85,6 +85,7 @@ def IsOperatorWithEngine(op_type, engine): def DeviceOption( device_type, cuda_gpu_id=0, + hip_gpu_id=0, random_seed=None, node_name=None, numa_node_id=None, @@ -93,6 +94,7 @@ def DeviceOption( option = caffe2_pb2.DeviceOption() option.device_type = device_type option.cuda_gpu_id = cuda_gpu_id + option.hip_gpu_id = hip_gpu_id if node_name is not None: option.node_name = node_name if random_seed is not None: @@ -115,7 +117,7 @@ def device_option_equal(opt1, opt2, ignore_node_name=True, ignore_random_seed=Tr if not opt1.device_type or not opt2.device_type: # At least one option is for CPU, check if both are for CPU. 
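    # (caffe2_pb2.CPU == 0, so a falsy device_type denotes a CPU option and
    # the cuda_gpu_id/hip_gpu_id fields are irrelevant to the comparison.)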
return not opt1.device_type and not opt2.device_type - return opt1.cuda_gpu_id == opt2.cuda_gpu_id + return (opt1.cuda_gpu_id == opt2.cuda_gpu_id) and (opt1.hip_gpu_id == opt2.hip_gpu_id) def InferBlobDevices(net): @@ -2022,12 +2024,13 @@ def DeduplicateGradientSlices(self, g, aggregator='sum'): def RunAllOnGPU(self, gpu_id=0, use_cudnn=False): """A convenient function to run everything on the GPU.""" device_option = caffe2_pb2.DeviceOption() - device_option.device_type = caffe2_pb2.CUDA + device_option.device_type = caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA device_option.cuda_gpu_id = gpu_id + device_option.hip_gpu_id = gpu_id self._net.device_option.CopyFrom(device_option) if use_cudnn: for op in self._net.op: - op.engine = "CUDNN" + op.engine = 'CUDNN' def RunAllOnMKL(self): """A convenient function to run everything using MKLDNN.""" @@ -2186,27 +2189,38 @@ def extend_ops(self, new_ops): def copy_func_between_devices(src, dst): CPU = caffe2_pb2.CPU - CUDA = caffe2_pb2.CUDA + if workspace.has_hip_support: + GPU = caffe2_pb2.HIP + else: + GPU = caffe2_pb2.CUDA if src.device_type == CPU and dst.device_type == CPU: return None - if src.device_type == CUDA and dst.device_type == CUDA: - if src.cuda_gpu_id == dst.cuda_gpu_id: - return None + if src.device_type == GPU and dst.device_type == GPU: + def fun(net, *args, **kw): + with DeviceScope(dst): + return net.Copy(*args, **kw) + + if workspace.has_hip_support: + if src.hip_gpu_id == dst.hip_gpu_id: + return None + else: + return fun else: - def fun(net, *args, **kw): - with DeviceScope(dst): - return net.Copy(*args, **kw) - return fun + if src.cuda_gpu_id == dst.cuda_gpu_id: + return None + else: + return fun + - if src.device_type == CUDA and dst.device_type == CPU: + if src.device_type == GPU and dst.device_type == CPU: def fun(net, *args, **kw): with DeviceScope(src): return net.CopyGPUToCPU(*args, **kw) return fun - if src.device_type == CPU and dst.device_type == CUDA: + if src.device_type == CPU and dst.device_type == GPU: def fun(net, *args, **kw): with DeviceScope(dst): return net.CopyCPUToGPU(*args, **kw) @@ -2221,7 +2235,12 @@ def device_equal(src, dst): comparison between empty device_options and {device_type:0, cuda_gpu_id:0} returns not equal in some cases. ''' - return src.device_type == dst.device_type and src.cuda_gpu_id == dst.cuda_gpu_id + if workspace.has_hip_support: + gpu_id_eq = src.hip_gpu_id == dst.hip_gpu_id + else: + gpu_id_eq = src.cuda_gpu_id == dst.cuda_gpu_id + + return src.device_type == dst.device_type and gpu_id_eq def update_placeholder_op_output(op, blob_to_device): @@ -2332,10 +2351,13 @@ def InjectCrossDeviceCopies(net, blob_to_device=None, blob_remap=None, def _gen_new_name(blob, device_option): CPU = caffe2_pb2.CPU CUDA = caffe2_pb2.CUDA + HIP = caffe2_pb2.HIP if device_option.device_type == CPU: suffix = '_cpu' elif device_option.device_type == CUDA: suffix = '_cuda_' + str(device_option.cuda_gpu_id) + elif device_option.device_type == HIP: + suffix = '_hip_' + str(device_option.hip_gpu_id) else: raise RuntimeError( "Unknown device type: {}". 
diff --git a/caffe2/python/core_gradients_test.py b/caffe2/python/core_gradients_test.py index bf25806f20dde2..88b07eab347323 100644 --- a/caffe2/python/core_gradients_test.py +++ b/caffe2/python/core_gradients_test.py @@ -9,7 +9,7 @@ import unittest from caffe2.proto import caffe2_pb2 -from caffe2.python import core, test_util +from caffe2.python import core, test_util, workspace from caffe2.python.core import CreateOperator, GradientRegistry from caffe2.python import workspace @@ -94,7 +94,7 @@ def assertOperatorListEqual(self, operatorDefList1, operatorDefList2): @given(device_option=st.sampled_from([ None, - core.DeviceOption(caffe2_pb2.CUDA, 1)])) + core.DeviceOption(caffe2_pb2.HIP, hip_gpu_id=1) if workspace.has_hip_support else core.DeviceOption(caffe2_pb2.CUDA, cuda_gpu_id=1)])) def testDirect(self, device_option): operators = [ CreateOperator('Direct', 'in', 'hidden'), @@ -279,7 +279,7 @@ def testUseInputButInputHasBeenChanged(self): @given(device_option=st.sampled_from([ None, - core.DeviceOption(caffe2_pb2.CUDA, 1)])) + core.DeviceOption(caffe2_pb2.HIP, hip_gpu_id=1) if workspace.has_hip_support else core.DeviceOption(caffe2_pb2.CUDA, cuda_gpu_id=1)])) def testMultiUseInput(self, device_option): """Test gradient for the following case: diff --git a/caffe2/python/core_test.py b/caffe2/python/core_test.py index b7099d20eae863..f47c79ac847a4a 100644 --- a/caffe2/python/core_test.py +++ b/caffe2/python/core_test.py @@ -82,18 +82,30 @@ def testDeviceScope(self): self.assertFalse(op.HasField('device_option')) # explicitly setting a device device_option = caffe2_pb2.DeviceOption() - device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = 1 - op = core.CreateOperator("Relu", "x", "y", device_option=device_option) - self.assertTrue(op.HasField('device_option')) - self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + if workspace.has_hip_support: + device_option.device_type = caffe2_pb2.HIP + device_option.hip_gpu_id = 1 + op = core.CreateOperator("Relu", "x", "y", device_option=device_option) + self.assertTrue(op.HasField('device_option')) + self.assertEqual(op.device_option.device_type, caffe2_pb2.HIP) + self.assertEqual(op.device_option.hip_gpu_id, 1) + else: + device_option.device_type = caffe2_pb2.CUDA + device_option.cuda_gpu_id = 1 + op = core.CreateOperator("Relu", "x", "y", device_option=device_option) + self.assertTrue(op.HasField('device_option')) + self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA) + self.assertEqual(op.device_option.cuda_gpu_id, 1) with core.DeviceScope(device_option): # from device scope op = core.CreateOperator("Relu", "x", "y") self.assertTrue(op.HasField('device_option')) - self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + if workspace.has_hip_support: + self.assertEqual(op.device_option.device_type, caffe2_pb2.HIP) + self.assertEqual(op.device_option.hip_gpu_id, 1) + else: + self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA) + self.assertEqual(op.device_option.cuda_gpu_id, 1) # from an overridden device option override_device = caffe2_pb2.DeviceOption() override_device.device_type = caffe2_pb2.CPU @@ -108,14 +120,22 @@ def testDeviceScope(self): def testNameAndDeviceScopeTogether(self): device_option = caffe2_pb2.DeviceOption() - device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = 1 + if workspace.has_hip_support: + device_option.device_type = caffe2_pb2.HIP + 
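+            # (caffe2_pb2.HIP carries enum value 6, which is why the raw
+            # value 6 appears in device_type assertions later in this file.)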
device_option.hip_gpu_id = 1 + else: + device_option.device_type = caffe2_pb2.CUDA + device_option.cuda_gpu_id = 1 with core.DeviceScope(device_option): with core.NameScope("foo"): op = core.CreateOperator("Relu", "x", "y") self.assertTrue(op.HasField('device_option')) - self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + if workspace.has_hip_support: + self.assertEqual(op.device_option.device_type, caffe2_pb2.HIP) + self.assertEqual(op.device_option.hip_gpu_id, 1) + else: + self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA) + self.assertEqual(op.device_option.cuda_gpu_id, 1) self.assertEqual(len(op.input), 1) self.assertEqual(op.input[0], "foo/x") self.assertEqual(len(op.output), 1) @@ -220,8 +240,12 @@ def testSetInputRecordWithoutBlobs(self): class TestCreateOperator(test_util.TestCase): def testCreate(self): device_option = caffe2_pb2.DeviceOption() - device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = 1 + if workspace.has_hip_support: + device_option.device_type = caffe2_pb2.HIP + device_option.hip_gpu_id = 1 + else: + device_option.device_type = caffe2_pb2.CUDA + device_option.cuda_gpu_id = 1 op = core.CreateOperator( "Ludicrous", "x", "y", name="ludicrous", control_input="z", device_option=device_option, @@ -236,8 +260,12 @@ def testCreate(self): self.assertEqual(len(op.control_input), 1) self.assertEqual(op.control_input[0], "z") self.assertTrue(op.HasField('device_option')) - self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + if workspace.has_hip_support: + self.assertEqual(op.device_option.device_type, caffe2_pb2.HIP) + self.assertEqual(op.device_option.hip_gpu_id, 1) + else: + self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA) + self.assertEqual(op.device_option.cuda_gpu_id, 1) self.assertTrue(len(op.arg), 3) # can't guarantee ordering of kwargs, so generate a set of args @@ -540,12 +568,15 @@ def test_check_equal_default_value(self): opt2 = caffe2_pb2.DeviceOption() opt1.device_type = 0 self.assertTrue(core.device_option_equal(opt1, opt2)) - opt1.cuda_gpu_id = 5 + if workspace.has_hip_support: + opt1.hip_gpu_id = 5 + else: + opt1.cuda_gpu_id = 5 # opt1 still is on CPU, so the options should be equal self.assertTrue(core.device_option_equal(opt1, opt2)) opt2.device_type = 0 self.assertTrue(core.device_option_equal(opt1, opt2)) - opt1.device_type = 1 + opt1.device_type = 6 if workspace.has_hip_support else 1 self.assertFalse(core.device_option_equal(opt1, opt2)) @@ -609,14 +640,18 @@ def test_inject_copy(self): self.assertEqual(op.input[2], "fc_b") -@unittest.skipIf(not workspace.has_gpu_support, 'No GPU support') +@unittest.skipIf(not workspace.has_gpu_support and not workspace.has_hip_support, 'No GPU support') class TestInferDevice(test_util.TestCase): def setUp(self): device_option = caffe2_pb2.DeviceOption() - device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = 1 - self.cuda_option = device_option + if workspace.has_hip_support: + device_option.device_type = caffe2_pb2.HIP + device_option.hip_gpu_id = 1 + else: + device_option.device_type = caffe2_pb2.CUDA + device_option.cuda_gpu_id = 1 + self.gpu_option = device_option self.cpu_option = caffe2_pb2.DeviceOption() def _test_op( @@ -628,7 +663,7 @@ def _test_op( inputs=None, outputs=None ): - op_option = self.cuda_option if not op_option else op_option + op_option = self.gpu_option if not op_option else op_option inputs = 
["blob_1"] if not inputs else inputs outputs = ["blob_2"] if not outputs else outputs with core.DeviceScope(op_option): @@ -656,9 +691,9 @@ def _test_op( def test_infer_device(self): self._test_op( "FC", - self.cuda_option, - self.cuda_option, - op_option=self.cuda_option, + self.gpu_option, + self.gpu_option, + op_option=self.gpu_option, inputs=["data", "fc_w", "fc_b"], outputs=["fc_1"] ) @@ -666,17 +701,17 @@ def test_infer_device(self): def test_infer_device_split_by_lengths(self): self._test_op( "SplitByLengths", - [self.cuda_option, self.cpu_option], - self.cuda_option, - op_option=self.cuda_option, + [self.gpu_option, self.cpu_option], + self.gpu_option, + op_option=self.gpu_option, inputs=["data", "fc_w"], outputs=["fc_1"] ) def test_infer_device_cross_device(self): - self._test_op("CopyGPUToCPU", self.cuda_option, self.cpu_option) - self._test_op("CopyCPUToGPU", self.cpu_option, self.cuda_option) - self._test_op("CopyFromCPUInput", self.cpu_option, self.cuda_option) + self._test_op("CopyGPUToCPU", self.gpu_option, self.cpu_option) + self._test_op("CopyCPUToGPU", self.cpu_option, self.gpu_option) + self._test_op("CopyFromCPUInput", self.cpu_option, self.gpu_option) self._test_op( "CopyFromCPUInput", self.cpu_option, @@ -686,7 +721,7 @@ def test_infer_device_cross_device(self): def test_device_inference_function(self): # ConcatOp. - op_option = self.cuda_option + op_option = self.gpu_option with core.DeviceScope(op_option): op = core.CreateOperator( 'Concat', @@ -698,7 +733,7 @@ def test_device_inference_function(self): self.assertEqual(output_dev[1], self.cpu_option) #SplitOp. - op_option = self.cuda_option + op_option = self.gpu_option with core.DeviceScope(op_option): op = core.CreateOperator( 'Split', @@ -713,8 +748,12 @@ def test_inject_copy(self): net = core.Net("test") init_net = core.Net("init") device_option = caffe2_pb2.DeviceOption() - device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = 1 + if workspace.has_hip_support: + device_option.device_type = caffe2_pb2.HIP + device_option.hip_gpu_id = 1 + else: + device_option.device_type = caffe2_pb2.CUDA + device_option.cuda_gpu_id = 1 weight = init_net.XavierFill([], 'fc_w', shape=[10, 100]) bias = init_net.ConstantFill([], 'fc_b', shape=[10, ]) @@ -727,11 +766,18 @@ def test_inject_copy(self): ) op = new_net._net.op[-1] self.assertEqual(op.type, "FC") - self.assertEqual(op.input[0], "data_cuda_1") - self.assertEqual(op.input[1], "fc_w_cuda_1") - self.assertEqual(op.input[2], "fc_b_cuda_1") - self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + if workspace.has_hip_support: + self.assertEqual(op.input[0], "data_hip_1") + self.assertEqual(op.input[1], "fc_w_hip_1") + self.assertEqual(op.input[2], "fc_b_hip_1") + self.assertEqual(op.device_option.device_type, 6) + self.assertEqual(op.device_option.hip_gpu_id, 1) + else: + self.assertEqual(op.input[0], "data_cuda_1") + self.assertEqual(op.input[1], "fc_w_cuda_1") + self.assertEqual(op.input[2], "fc_b_cuda_1") + self.assertEqual(op.device_option.device_type, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) self.assertEqual(new_net._net.op[-2].type, "CopyCPUToGPU") self.assertEqual(new_net._net.op[0].type, "CopyCPUToGPU") self.assertNotEqual(blob_to_device["fc_w"], device_option) @@ -740,8 +786,12 @@ def test_cross_nets(self): net = core.Net("test") init_net = core.Net("init") device_option = caffe2_pb2.DeviceOption() - device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = 1 + if 
workspace.has_hip_support: + device_option.device_type = caffe2_pb2.HIP + device_option.hip_gpu_id = 1 + else: + device_option.device_type = caffe2_pb2.CUDA + device_option.cuda_gpu_id = 1 weight = init_net.XavierFill([], 'fc_w', shape=[10, 100]) bias = init_net.ConstantFill([], 'fc_b', shape=[10, ]) const = init_net.ConstantFill([], 'const', shape=[], value=1.) @@ -756,28 +806,53 @@ def test_cross_nets(self): ) op = nets[1]._net.op[0] self.assertEqual(op.type, "CopyCPUToGPU") - self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 1) - self.assertEqual(op.output[0], "fc_w_cuda_1") + if workspace.has_hip_support: + self.assertEqual(op.device_option.device_type, 6) + self.assertEqual(op.device_option.device_type, 6) + self.assertEqual(op.device_option.hip_gpu_id, 1) + self.assertEqual(op.output[0], "fc_w_hip_1") + else: + self.assertEqual(op.device_option.device_type, 1) + self.assertEqual(op.device_option.device_type, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.output[0], "fc_w_cuda_1") op = nets[1]._net.op[1] self.assertEqual(op.type, "CopyCPUToGPU") - self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 1) - self.assertEqual(op.output[0], "fc_b_cuda_1") + if workspace.has_hip_support: + self.assertEqual(op.device_option.device_type, 6) + self.assertEqual(op.device_option.hip_gpu_id, 1) + self.assertEqual(op.output[0], "fc_b_hip_1") + else: + self.assertEqual(op.device_option.device_type, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.output[0], "fc_b_cuda_1") op = nets[1]._net.op[2] self.assertEqual(op.type, "FC") self.assertEqual(op.input[0], "data") - self.assertEqual(op.input[1], "fc_w_cuda_1") - self.assertEqual(op.input[2], "fc_b_cuda_1") - self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + if workspace.has_hip_support: + self.assertEqual(op.input[1], "fc_w_hip_1") + self.assertEqual(op.input[2], "fc_b_hip_1") + self.assertEqual(op.device_option.device_type, 6) + self.assertEqual(op.device_option.hip_gpu_id, 1) + else: + self.assertEqual(op.input[1], "fc_w_cuda_1") + self.assertEqual(op.input[2], "fc_b_cuda_1") + self.assertEqual(op.device_option.device_type, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) op = nets[1]._net.op[3] self.assertEqual(op.type, "Add") self.assertEqual(op.input[0], "fc1") - self.assertEqual(op.input[1], "const_cuda_1") + if workspace.has_hip_support: + self.assertEqual(op.input[1], "const_hip_1") + else: + self.assertEqual(op.input[1], "const_cuda_1") # check that moved blob is in input to the new net - for c in ["data", "fc_w", "fc_b", "const_cuda_1"]: - self.assertTrue(c in nets[1]._net.external_input) + if workspace.has_hip_support: + for c in ["data", "fc_w", "fc_b", "const_hip_1"]: + self.assertTrue(c in nets[1]._net.external_input) + else: + for c in ["data", "fc_w", "fc_b", "const_cuda_1"]: + self.assertTrue(c in nets[1]._net.external_input) """ For reference, net.Proto() should be like: name: "" @@ -877,8 +952,12 @@ def test_cross_nets_no_change(self): def test_inject_copy_multi_use(self): net = core.Net("test") device_option = caffe2_pb2.DeviceOption() - device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = 1 + if workspace.has_hip_support: + device_option.device_type = caffe2_pb2.HIP + device_option.hip_gpu_id = 1 + else: + device_option.device_type = caffe2_pb2.CUDA + device_option.cuda_gpu_id = 1 with 
core.DeviceScope(device_option): net.Relu("data", "relu1") @@ -886,23 +965,38 @@ def test_inject_copy_multi_use(self): with core.DeviceScope(device_option): net.Relu("data", "relu3") net.Relu("data", "relu4") - device_option.cuda_gpu_id = 0 + if workspace.has_hip_support: + device_option.hip_gpu_id = 0 + else: + device_option.cuda_gpu_id = 0 with core.DeviceScope(device_option): net.Relu("data", "relu5") - device_option.cuda_gpu_id = 1 + if workspace.has_hip_support: + device_option.hip_gpu_id = 1 + else: + device_option.cuda_gpu_id = 1 with core.DeviceScope(device_option): net.Relu("data", "relu6") new_net, _ = core.InjectCrossDeviceCopies(net) op = new_net._net.op[0] self.assertEqual(op.type, "CopyCPUToGPU") - self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 1) - self.assertEqual(op.output[0], "data_cuda_1") + if workspace.has_hip_support: + self.assertEqual(op.device_option.device_type, 6) + self.assertEqual(op.device_option.hip_gpu_id, 1) + self.assertEqual(op.output[0], "data_hip_1") + else: + self.assertEqual(op.device_option.device_type, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.output[0], "data_cuda_1") op = new_net._net.op[1] self.assertEqual(op.type, "Relu") - self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + if workspace.has_hip_support: + self.assertEqual(op.device_option.device_type, 6) + self.assertEqual(op.device_option.hip_gpu_id, 1) + else: + self.assertEqual(op.device_option.device_type, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) self.assertEqual(op.output[0], "relu1") op = new_net._net.op[2] self.assertEqual(op.type, "Relu") @@ -910,9 +1004,14 @@ def test_inject_copy_multi_use(self): self.assertEqual(op.output[0], "relu2") op = new_net._net.op[3] self.assertEqual(op.type, "Relu") - self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 1) - self.assertEqual(op.input[0], "data_cuda_1") + if workspace.has_hip_support: + self.assertEqual(op.device_option.device_type, 6) + self.assertEqual(op.device_option.hip_gpu_id, 1) + self.assertEqual(op.input[0], "data_hip_1") + else: + self.assertEqual(op.device_option.device_type, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.input[0], "data_cuda_1") self.assertEqual(op.output[0], "relu3") op = new_net._net.op[4] self.assertEqual(op.type, "Relu") @@ -920,20 +1019,35 @@ def test_inject_copy_multi_use(self): self.assertEqual(op.output[0], "relu4") op = new_net._net.op[5] self.assertEqual(op.type, "CopyCPUToGPU") - self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 0) - self.assertEqual(op.output[0], "data_cuda_0") + if workspace.has_hip_support: + self.assertEqual(op.device_option.device_type, 6) + self.assertEqual(op.device_option.hip_gpu_id, 0) + self.assertEqual(op.output[0], "data_hip_0") + else: + self.assertEqual(op.device_option.device_type, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 0) + self.assertEqual(op.output[0], "data_cuda_0") op = new_net._net.op[6] self.assertEqual(op.type, "Relu") - self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 0) - self.assertEqual(op.input[0], "data_cuda_0") + if workspace.has_hip_support: + self.assertEqual(op.device_option.device_type, 6) + self.assertEqual(op.device_option.hip_gpu_id, 0) + self.assertEqual(op.input[0], "data_hip_0") + else: + 
self.assertEqual(op.device_option.device_type, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 0) + self.assertEqual(op.input[0], "data_cuda_0") self.assertEqual(op.output[0], "relu5") op = new_net._net.op[7] self.assertEqual(op.type, "Relu") - self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 1) - self.assertEqual(op.input[0], "data_cuda_1") + if workspace.has_hip_support: + self.assertEqual(op.device_option.device_type, 6) + self.assertEqual(op.device_option.hip_gpu_id, 1) + self.assertEqual(op.input[0], "data_hip_1") + else: + self.assertEqual(op.device_option.device_type, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.input[0], "data_cuda_1") self.assertEqual(op.output[0], "relu6") """ For reference, net.Proto() should be like: @@ -1025,8 +1139,12 @@ def test_inject_copy_placeholder_ops(self): cpu_device.append(caffe2_pb2.DeviceOption()) cpu_device[i].node_name = 'node:' + str(i) gpu_device.append(caffe2_pb2.DeviceOption()) - gpu_device[i].device_type = caffe2_pb2.CUDA - gpu_device[i].cuda_gpu_id = 0 + if workspace.has_hip_support: + gpu_device[i].device_type = caffe2_pb2.HIP + gpu_device[i].hip_gpu_id = 0 + else: + gpu_device[i].device_type = caffe2_pb2.CUDA + gpu_device[i].cuda_gpu_id = 0 gpu_device[i].node_name = 'node:' + str(i) send_node = 'node:0' recv_node = 'node:1' @@ -1065,13 +1183,21 @@ def test_inject_copy_placeholder_ops(self): # Verify (init_net) op = init_net._net.op[2] self.assertEqual(op.type, "CopyGPUToCPU") - self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 0) + if workspace.has_hip_support: + self.assertEqual(op.device_option.device_type, 6) + self.assertEqual(op.device_option.hip_gpu_id, 0) + else: + self.assertEqual(op.device_option.device_type, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 0) self.assertEqual(op.output[0], "fc_w_cpu") op = init_net._net.op[3] self.assertEqual(op.type, "CopyGPUToCPU") - self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 0) + if workspace.has_hip_support: + self.assertEqual(op.device_option.device_type, 6) + self.assertEqual(op.device_option.hip_gpu_id, 0) + else: + self.assertEqual(op.device_option.device_type, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 0) self.assertEqual(op.output[0], "fc_b_cpu") op = init_net._net.op[4] self.assertEqual(op.type, placeholder_send) @@ -1093,8 +1219,12 @@ def test_inject_copy_placeholder_ops(self): def test_blob_inplace(self): net = core.Net("test") device_option = caffe2_pb2.DeviceOption() - device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = 1 + if workspace.has_hip_support: + device_option.device_type = caffe2_pb2.HIP + device_option.hip_gpu_id = 1 + else: + device_option.device_type = caffe2_pb2.CUDA + device_option.cuda_gpu_id = 1 net.Adagrad(['param', 'moment', 'grad', 'lr'], ['param', 'moment']) with core.DeviceScope(device_option): @@ -1103,10 +1233,15 @@ def test_blob_inplace(self): op = net._net.op[1] self.assertEqual(op.type, 'CopyCPUToGPU') self.assertEqual(op.input[0], 'param') - self.assertEqual(op.output[0], 'param_cuda_1') + if workspace.has_hip_support: + self.assertEqual(op.output[0], 'param_hip_1') + else: + self.assertEqual(op.output[0], 'param_cuda_1') op = net._net.op[2] - self.assertEqual(op.input[0], 'param_cuda_1') - + if workspace.has_hip_support: + self.assertEqual(op.input[0], 'param_hip_1') + else: + self.assertEqual(op.input[0], 'param_cuda_1') 
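        # In-place writes to a blob that was never copied across devices must
        # still pass the cross-device checker: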
net.Relu('nonsense_input', 'moment') # should not raise inplace error core.InjectCrossDeviceCopies(net) diff --git a/caffe2/python/data_parallel_model.py b/caffe2/python/data_parallel_model.py index 60e5c39bed1318..f60d9fe5c97336 100644 --- a/caffe2/python/data_parallel_model.py +++ b/caffe2/python/data_parallel_model.py @@ -127,15 +127,15 @@ def Parallelize( device scope was: {}".format(scope.CurrentDeviceScope()) if devices is None: - devices = list(range(0, workspace.NumCudaDevices())), + devices = list(range(0, workspace.NumGpuDevices())), if not cpu_device: for gpu in devices: - if gpu >= workspace.NumCudaDevices(): + if gpu >= workspace.NumGpuDevices(): log.warning("** Only {} GPUs available, GPUs {} requested".format( - workspace.NumCudaDevices(), devices)) + workspace.NumGpuDevices(), devices)) break - model_helper_obj._device_type = caffe2_pb2.CUDA + model_helper_obj._device_type = caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA model_helper_obj._device_prefix = "gpu" model_helper_obj._shared_model = False device_name = "GPU" @@ -165,7 +165,6 @@ def Parallelize( model_helper_obj._grad_names = [] assert isinstance(model_helper_obj, model_helper.ModelHelper) - # Keep track of params that were in the model before: they are not # data parallel, so we need to handle them separately non_datapar_params = copy.copy(model_helper_obj.params) @@ -200,7 +199,10 @@ def Parallelize( # TODO: make into assert for device in devices: - device_opt = core.DeviceOption(model_helper_obj._device_type, device) + if workspace.has_hip_support: + device_opt = core.DeviceOption(model_helper_obj._device_type, hip_gpu_id=device) + else: + device_opt = core.DeviceOption(model_helper_obj._device_type, cuda_gpu_id=device) with core.DeviceScope(device_opt): with core.NameScope("{}_{}".format(model_helper_obj._device_prefix, device)): @@ -307,7 +309,10 @@ def Parallelize( if param_update_builder_fun is not None: for device in devices: - device_opt = core.DeviceOption(model_helper_obj._device_type, device) + if workspace.has_hip_support: + device_opt = core.DeviceOption(model_helper_obj._device_type, hip_gpu_id=device) + else: + device_opt = core.DeviceOption(model_helper_obj._device_type, cuda_gpu_id=device) with core.DeviceScope(device_opt): with core.NameScope( "{}_{}".format(model_helper_obj._device_prefix, device) @@ -352,7 +357,10 @@ def Parallelize( # i.e. 
making sure multi-precision copies of parameters are up-to-date if post_sync_builder_fun is not None: for device in devices: - device_opt = core.DeviceOption(model_helper_obj._device_type, device) + if workspace.has_hip_support: + device_opt = core.DeviceOption(model_helper_obj._device_type, hip_gpu_id=device) + else: + device_opt = core.DeviceOption(model_helper_obj._device_type, cuda_gpu_id=device) with core.DeviceScope(device_opt): with core.NameScope( "{}_{}".format(model_helper_obj._device_prefix, device) @@ -432,17 +440,17 @@ def Parallelize_BMUF( assert isinstance(model_helper_obj, model_helper.ModelHelper) if devices is None: - devices = list(range(0, workspace.NumCudaDevices())) + devices = list(range(0, workspace.NumGpuDevices())) if master_device is None: master_device = devices[0] if not cpu_device: for gpu in devices: - if gpu >= workspace.NumCudaDevices(): + if gpu >= workspace.NumGpuDevices(): log.warning("** Only {} GPUs available, GPUs {} requested".format( - workspace.NumCudaDevices(), devices)) + workspace.NumGpuDevices(), devices)) break - model_helper_obj._device_type = caffe2_pb2.CUDA + model_helper_obj._device_type = caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA model_helper_obj._device_prefix = "gpu" else: model_helper_obj._device_type = caffe2_pb2.CPU @@ -453,7 +461,10 @@ def Parallelize_BMUF( model_helper_obj._sync_barrier_net = None model_helper_obj._broadcast_context = None model_helper_obj._shared_model = False - master_dev_opt = core.DeviceOption(model_helper_obj._device_type, master_device) + if workspace.has_hip_support: + master_dev_opt = core.DeviceOption(model_helper_obj._device_type, hip_gpu_id=master_device) + else: + master_dev_opt = core.DeviceOption(model_helper_obj._device_type, cuda_gpu_id=master_device) # question: rendezvous structure num_shards = rendezvous['num_shards'] if rendezvous else 1 @@ -797,9 +808,9 @@ def builder_fun(model): if device is None: device = scope.CurrentDeviceScope() - device_prefix = "gpu" if device.device_type == caffe2_pb2.CUDA else "cpu" + device_prefix = "gpu" if (device.device_type == caffe2_pb2.CUDA or device.device_type == caffe2_pb2.HIP) else "cpu" - namescope = "{}_{}/".format(device_prefix, device.cuda_gpu_id) + namescope = "{}_{}/".format(device_prefix, device.hip_gpu_id if workspace.has_hip_support else device.cuda_gpu_id) for op in mnet.Proto().op: if "RecurrentNetwork" in op.type: raise("RecurrentNetwork conversion not yet supported") @@ -820,7 +831,10 @@ def builder_fun(model): def _ForEachDevice(devices, f, device_type, device_prefix, scoped=False, *args, **kwargs): for device in devices: - device_opt = core.DeviceOption(device_type, device) + if workspace.has_hip_support: + device_opt = core.DeviceOption(device_type, hip_gpu_id=device) + else: + device_opt = core.DeviceOption(device_type, cuda_gpu_id=device) with core.DeviceScope(device_opt): if scoped: with core.NameScope("{}_{}".format(device_prefix, device)): @@ -836,7 +850,10 @@ def create_grad(lossp): loss_grad = {} # Explicitly need to create gradients on each GPU for gpu_id in devices: - device = core.DeviceOption(model._device_type, gpu_id) + if workspace.has_hip_support: + device = core.DeviceOption(model._device_type, hip_gpu_id=gpu_id) + else: + device = core.DeviceOption(model._device_type, cuda_gpu_id=gpu_id) with core.DeviceScope(device): for l in losses_by_gpu[gpu_id]: lg = create_grad(l) @@ -954,7 +971,7 @@ def GetLearningRateBlobNames(model): if model._optimizer is not None: if model._device_type == caffe2_pb2.CPU: 
return [model._optimizer.get_cpu_blob_name('lr')] - elif model._device_type == caffe2_pb2.CUDA: + elif model._device_type == caffe2_pb2.CUDA or model._device_type == caffe2_pb2.HIP: return [model._optimizer.get_gpu_blob_name('lr', gpu, '') for gpu in model._devices] else: @@ -975,7 +992,10 @@ def _Broadcast(devices, model, net, param, use_nccl=False): if use_nccl: if _IsGPUBlob(model, param): - master_device_opt = core.DeviceOption(model._device_type, master_dev) + if workspace.has_hip_support: + master_device_opt = core.DeviceOption(model._device_type, hip_gpu_id=master_dev) + else: + master_device_opt = core.DeviceOption(model._device_type, cuda_gpu_id=master_dev) with core.DeviceScope(master_device_opt): # Note that the root is the root _rank_ and not the root # _device_. Thus we always use root=0, regardless of the @@ -986,10 +1006,12 @@ def _Broadcast(devices, model, net, param, use_nccl=False): root=0, ) return - for dev_idx in devices[1:]: if _IsGPUBlob(model, param): - device_opt = core.DeviceOption(caffe2_pb2.CUDA, dev_idx) + if workspace.has_hip_support: + device_opt = core.DeviceOption(caffe2_pb2.HIP, hip_gpu_id=dev_idx) + else: + device_opt = core.DeviceOption(caffe2_pb2.CUDA, cuda_gpu_id=dev_idx) else: device_opt = core.DeviceOption(caffe2_pb2.CPU, 0) with core.DeviceScope(device_opt): @@ -1010,6 +1032,8 @@ def _AllReduce(devices, model, net, param, use_nccl=False, control_input=None): if model._device_type == caffe2_pb2.CUDA: p2p_access_pattern = workspace.GetCudaPeerAccessPattern() + elif model._device_type == caffe2_pb2.HIP: + p2p_access_pattern = workspace.GetHipPeerAccessPattern() else: p2p_access_pattern = None @@ -1034,7 +1058,10 @@ def sumN(*dev_indices): blobs[i], 'gpu_{}/{}_gpu{}_copy'.format(devices[0], param, peer) ) - device_opt = core.DeviceOption(model._device_type, devices[0]) + if workspace.has_hip_support: + device_opt = core.DeviceOption(model._device_type, hip_gpu_id=devices[0]) + else: + device_opt = core.DeviceOption(model._device_type, cuda_gpu_id=devices[0]) with core.DeviceScope(device_opt): net.Sum(blobs, [blobs[0]], name='dpm') @@ -1140,8 +1167,10 @@ def _SyncAllParamsDistributed( max_concurrent_distributed_ops ): assert rendezvous['num_shards'] > 1 - - gpu_device_opt = core.DeviceOption(model._device_type, devices[0]) + if workspace.has_hip_support: + gpu_device_opt = core.DeviceOption(model._device_type, hip_gpu_id=devices[0]) + else: + gpu_device_opt = core.DeviceOption(model._device_type, cuda_gpu_id=devices[0]) cpu_device_opt = core.DeviceOption(caffe2_pb2.CPU) if model._broadcast_context is None: @@ -1316,8 +1345,10 @@ def _AllReduceBlobsDistributed( num_workers = model.net.Proto().num_workers assert num_workers > 1, "Please specify more than 1 worker" all_reduce_engine = rendezvous['engine'] - - master_device_opt = core.DeviceOption(model._device_type, devices[0]) + if workspace.has_hip_support: + master_device_opt = core.DeviceOption(model._device_type, hip_gpu_id=devices[0]) + else: + master_device_opt = core.DeviceOption(model._device_type, cuda_gpu_id=devices[0]) reducing_device_opt = master_device_opt @@ -1393,7 +1424,10 @@ def _AllReduceBlobsSingleHost(blob_names, devices, model, net, use_nccl): # Now we need to Allreduce blobs on all the GPUs. # Pick GPU #0 as a master GPU. 
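+    # (the sum is materialized on devices[0] and the reduced value is then
+    # broadcast back to the remaining devices)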
- master_device_opt = core.DeviceOption(model._device_type, devices[0]) + if workspace.has_hip_support: + master_device_opt = core.DeviceOption(model._device_type, hip_gpu_id=devices[0]) + else: + master_device_opt = core.DeviceOption(model._device_type, cuda_gpu_id=devices[0]) last_out = None concatenated_idx = set() @@ -1439,7 +1473,10 @@ def _AllReduceBlobsSingleHost(blob_names, devices, model, net, use_nccl): name="note:data_parallel_model") for gpu, g in viewitems(model._device_grouped_blobs[blob_name]): - device_opt = core.DeviceOption(model._device_type, gpu) + if workspace.has_hip_support: + device_opt = core.DeviceOption(model._device_type, hip_gpu_id=gpu) + else: + device_opt = core.DeviceOption(model._device_type, cuda_gpu_id=gpu) with core.DeviceScope(device_opt): model.Copy(grad_idx_concat, g.indices) concatenated_idx.add(g.indices) @@ -1451,7 +1488,10 @@ def _AllReduceBlobsSingleHost(blob_names, devices, model, net, use_nccl): axis=0, name="note:data_parallel_model") for gpu, g in viewitems(model._device_grouped_blobs[blob_name]): - device_opt = core.DeviceOption(model._device_type, gpu) + if workspace.has_hip_support: + device_opt = core.DeviceOption(model._device_type, hip_gpu_id=gpu) + else: + device_opt = core.DeviceOption(model._device_type, cuda_gpu_id=gpu) with core.DeviceScope(device_opt): model.Copy(grad_val_concat, g.values) @@ -1526,11 +1566,15 @@ def _AnalyzeOperators(model): continue op_dev = op.device_option - op_gpu = op_dev.cuda_gpu_id + op_gpu = op_dev.hip_gpu_id if workspace.has_hip_support else op_dev.cuda_gpu_id # This avoids failing on operators that are only for CPU - if op_dev.device_type != caffe2_pb2.CUDA: - continue + if workspace.has_hip_support: + if op_dev.device_type != caffe2_pb2.HIP: + continue + else: + if op_dev.device_type != caffe2_pb2.CUDA: + continue namescope = "{}_{}/".format(model._device_prefix, op_gpu) for inp in list(op.input) + list(op.output): @@ -1572,14 +1616,16 @@ def map_ops(proto): def _IsGPUBlob(model, blob_name): if blob_name in model._blob_to_device: - return model._blob_to_device[blob_name].device_type == caffe2_pb2.CUDA + return model._blob_to_device[blob_name].device_type == caffe2_pb2.CUDA or \ + model._blob_to_device[blob_name].device_type == caffe2_pb2.HIP else: blob_name = "{}_{}/{}".format( model._device_prefix, model._devices[0], blob_name ) if blob_name not in model._blob_to_device: - return model._device_type == caffe2_pb2.CUDA - return model._blob_to_device[blob_name].device_type == caffe2_pb2.CUDA + return model._device_type == caffe2_pb2.CUDA or model._device_type == caffe2_pb2.HIP + return model._blob_to_device[blob_name].device_type == caffe2_pb2.CUDA or \ + model._blob_to_device[blob_name].device_type == caffe2_pb2.HIP def _GroupByDevice(model, devices, params, non_data_params): @@ -1890,7 +1936,10 @@ def _InterleaveOps(model): new_ops = [] ops = {d: [] for d in range(num_devices)} for op in orig_ops: - ops[op.device_option.cuda_gpu_id].append(op) + if workspace.has_hip_support: + ops[op.device_option.hip_gpu_id].append(op) + else: + ops[op.device_option.cuda_gpu_id].append(op) for j in range(num_ops_per_dev): tp = None diff --git a/caffe2/python/docs/generator.py b/caffe2/python/docs/generator.py index 1bc41b7d1ccbc5..b7fcb708d8951e 100644 --- a/caffe2/python/docs/generator.py +++ b/caffe2/python/docs/generator.py @@ -103,7 +103,8 @@ def __init__(self, name): def getDeviceImpl(self): deviceImplList = [] for device, impl in [('CPU', OpSchema.get_cpu_impl(self.op_name)), - ('CUDA', 
OpSchema.get_cuda_impl(self.op_name))]: + ('CUDA', OpSchema.get_cuda_impl(self.op_name)), + ('HIP', OpSchema.get_hip_impl(self.op_name))]: if not impl: continue deviceImplList.append((device, impl)) @@ -194,7 +195,8 @@ def generateDevices(self, formatter): self.getInfo(formatter, 'CPU', OpSchema.get_cpu_impl(self.name)), self.getInfo(formatter, - 'GPU', OpSchema.get_cuda_impl(self.name)), + 'HIP', OpSchema.get_hip_impl(self.name)) if workspace.has_hip_support else + self.getInfo(formatter, 'GPU', OpSchema.get_cuda_impl(self.name)), ] formatter.addList([i for i in devices if i]) diff --git a/caffe2/python/functional_test.py b/caffe2/python/functional_test.py index e7803e829bb431..db252d8f704d0f 100644 --- a/caffe2/python/functional_test.py +++ b/caffe2/python/functional_test.py @@ -46,7 +46,7 @@ def _tensor_splits(draw, add_axis=False): class TestFunctional(hu.HypothesisTestCase): - @given(X=hu.tensor(), engine=st.sampled_from(["", "CUDNN"]), **hu.gcs) + @given(X=hu.tensor(), engine=st.sampled_from(["", "MIOPEN" if workspace.has_hip_support else "CUDNN"]), **hu.gcs) def test_relu(self, X, engine, gc, dc): X += 0.02 * np.sign(X) X[X == 0.0] += 0.02 diff --git a/caffe2/python/gradient_check_test.py b/caffe2/python/gradient_check_test.py index f1c190aa6efac6..52da4e7dd49304 100644 --- a/caffe2/python/gradient_check_test.py +++ b/caffe2/python/gradient_check_test.py @@ -23,9 +23,9 @@ import unittest -if workspace.has_gpu_support and workspace.NumCudaDevices() > 0: +if (workspace.has_gpu_support or workspace.has_hip_support) and workspace.NumGpuDevices() > 0: gpu_device_option = caffe2_pb2.DeviceOption() - gpu_device_option.device_type = caffe2_pb2.CUDA + gpu_device_option.device_type = caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA cpu_device_option = caffe2_pb2.DeviceOption() gpu_device_checker = device_checker.DeviceChecker( 0.01, [gpu_device_option] diff --git a/caffe2/python/hypothesis_test_util.py b/caffe2/python/hypothesis_test_util.py index f640f6db20eff8..e0083167c4b56e 100644 --- a/caffe2/python/hypothesis_test_util.py +++ b/caffe2/python/hypothesis_test_util.py @@ -278,7 +278,7 @@ def gradient_checker_device_option(): ) gcs_cpu_only = dict(gc=st.sampled_from([cpu_do]), dc=st.just([cpu_do])) -gcs_gpu_only = dict(gc=st.sampled_from([gpu_do]), dc=st.just([gpu_do])) +gcs_gpu_only = dict(gc=st.sampled_from([gpu_do]+[hip_do]), dc=st.just([gpu_do]+[hip_do])) gcs_no_hip = dict(gc=st.sampled_from(_device_options_no_hip), dc=st.just(_device_options_no_hip)) diff --git a/caffe2/python/memonger_test.py b/caffe2/python/memonger_test.py index 6536280d8a6057..cb5712f425e223 100644 --- a/caffe2/python/memonger_test.py +++ b/caffe2/python/memonger_test.py @@ -223,13 +223,13 @@ def test_gradient_optim(self, input_dim, output_dim, batch_size): np.testing.assert_almost_equal(loss, optimized_loss) np.testing.assert_almost_equal(grad, optimized_grad) - @unittest.skipIf(not workspace.has_gpu_support, "No gpu support.") + @unittest.skipIf(not workspace.has_gpu_support and not workspace.has_hip_support, "No gpu support.") def test_memonger_mix_cpu_gpu(self): ''' Check that memonger does not make blobs cross CPU/GPU boundary ''' m = model_helper.ModelHelper() - with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)): + with core.DeviceScope(core.DeviceOption(caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA, 0)): fc1 = brew.fc(m, "data", "fc1", dim_in=2, dim_out=2) fc2 = brew.fc(m, fc1, "fc2", dim_in=2, dim_out=2) fc3 = brew.fc(m, fc2, "fc3", dim_in=2, dim_out=2) @@ -259,7 
+259,10 @@ def test_memonger_mix_cpu_gpu(self): # Create set of blobs on CPU side and GPU side and check they don't # overlap - device_blobs = {caffe2_pb2.CPU: set(), caffe2_pb2.CUDA: set()} + if workspace.has_hip_support: + device_blobs = {caffe2_pb2.CPU: set(), caffe2_pb2.HIP: set()} + else: + device_blobs = {caffe2_pb2.CPU: set(), caffe2_pb2.CUDA: set()} for op in optim_proto.op: if op.type not in ['CopyCPUToGPU', "CopyGPUToCPU"]: dev = op.device_option.device_type @@ -267,7 +270,7 @@ def test_memonger_mix_cpu_gpu(self): device_blobs[dev].add(b) device_crossers = device_blobs[caffe2_pb2.CPU].intersection( - device_blobs[caffe2_pb2.CUDA] + device_blobs[caffe2_pb2.HIP] if workspace.has_hip_support else device_blobs[caffe2_pb2.CUDA] ) self.assertEquals(device_crossers, set()) diff --git a/caffe2/python/model_device_test.py b/caffe2/python/model_device_test.py index 31cba3facb0559..3f438d35cad35e 100644 --- a/caffe2/python/model_device_test.py +++ b/caffe2/python/model_device_test.py @@ -124,7 +124,7 @@ def _testMiniAlexNet(self, order): cpu_device = caffe2_pb2.DeviceOption() cpu_device.device_type = caffe2_pb2.CPU gpu_device = caffe2_pb2.DeviceOption() - gpu_device.device_type = caffe2_pb2.CUDA + gpu_device.device_type = caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA checker = device_checker.DeviceChecker(0.05, [cpu_device, gpu_device]) ret = checker.CheckNet( @@ -136,7 +136,7 @@ def _testMiniAlexNet(self, order): ) self.assertEqual(ret, True) - @unittest.skipIf(not workspace.has_gpu_support, + @unittest.skipIf(not workspace.has_gpu_support and not workspace.has_hip_support, "No GPU support. Skipping test.") def testMiniAlexNetNCHW(self): self._testMiniAlexNet("NCHW") diff --git a/caffe2/python/model_helper.py b/caffe2/python/model_helper.py index e84369fbbbef7f..e03943e3f084f4 100644 --- a/caffe2/python/model_helper.py +++ b/caffe2/python/model_helper.py @@ -570,7 +570,10 @@ def rename_list(proto_list): rename_list(step_op.output) if device is not None: step_op.device_option.device_type = device.device_type - step_op.device_option.cuda_gpu_id = device.cuda_gpu_id + if workspace.has_hip_support: + step_op.device_option.hip_gpu_id = device.hip_gpu_id + else: + step_op.device_option.cuda_gpu_id = device.cuda_gpu_id rename_list(arg.n.external_input) rename_list(arg.n.external_output) @@ -584,7 +587,10 @@ def rename_list(proto_list): if device is not None: op.device_option.device_type = device.device_type - op.device_option.cuda_gpu_id = device.cuda_gpu_id + if workspace.has_hip_support: + op.device_option.hip_gpu_id = device.hip_gpu_id + else: + op.device_option.cuda_gpu_id = device.cuda_gpu_id validate_op(op) predict_proto.op.extend([op]) known_blobs.update(op.output) diff --git a/caffe2/python/muji.py b/caffe2/python/muji.py index b407f96d2391f8..186857fe569430 100644 --- a/caffe2/python/muji.py +++ b/caffe2/python/muji.py @@ -25,8 +25,12 @@ def OnGPU(gpu_id): specified gpu id. 
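    Illustrative use (assumption, not part of the original docstring):
        relu = core.CreateOperator("Relu", "x", "y", device_option=OnGPU(0))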
""" device_option = caffe2_pb2.DeviceOption() - device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = gpu_id + if workspace.has_hip_support: + device_option.device_type = caffe2_pb2.HIP + device_option.hip_gpu_id = gpu_id + else: + device_option.device_type = caffe2_pb2.CUDA + device_option.cuda_gpu_id = gpu_id return device_option @@ -48,7 +52,7 @@ def Allreduce(net, blobs, reduced_affix="_reduced", gpu_indices=None): "gpu_indices length and blobs length mismatch: %d vs %d" % (len(gpu_indices), len(blobs)) ) - pattern = workspace.GetCudaPeerAccessPattern() + pattern = workspace.GetHipPeerAccessPattern() if workspace.has_hip_support else workspace.GetCudaPeerAccessPattern() if len(blobs) == 2 and pattern.shape[0] >= 2 and np.all(pattern[:2, :2]): return Allreduce2(net, blobs, reduced_affix, gpu_indices) elif len(blobs) == 4 and pattern.shape[0] >= 4 and np.all(pattern[:4, :4]): diff --git a/caffe2/python/net_printer.py b/caffe2/python/net_printer.py index 4b5cddb61d244e..60b3598f3fb192 100644 --- a/caffe2/python/net_printer.py +++ b/caffe2/python/net_printer.py @@ -8,6 +8,7 @@ from caffe2.proto.caffe2_pb2 import OperatorDef, NetDef from caffe2.python.checkpoint import Job from caffe2.python.core import Net, ExecutionStep, Plan +from caffe2.python import workspace from caffe2.python.task import Task, TaskGroup, WorkspaceType, TaskOutput from collections import defaultdict from contextlib import contextmanager @@ -267,12 +268,13 @@ def call(op, inputs=None, outputs=None, factor_prefixes=False): def format_device_option(dev_opt): + gpu_id = dev_opt.hip_gpu_id if workspace.has_hip_support else dev_opt.cuda_gpu_id if not dev_opt or not ( - dev_opt.device_type or dev_opt.cuda_gpu_id or dev_opt.node_name): + dev_opt.device_type or gpu_id or dev_opt.node_name): return None return call( 'DeviceOption', - [dev_opt.device_type, dev_opt.cuda_gpu_id, "'%s'" % dev_opt.node_name]) + [dev_opt.device_type, gpu_id, "'%s'" % dev_opt.node_name]) @Printer.register(OperatorDef) diff --git a/caffe2/python/operator_test/activation_ops_test.py b/caffe2/python/operator_test/activation_ops_test.py index 0d35110740825b..9d5d961f71c756 100644 --- a/caffe2/python/operator_test/activation_ops_test.py +++ b/caffe2/python/operator_test/activation_ops_test.py @@ -17,7 +17,7 @@ class TestActivations(hu.HypothesisTestCase): @given(X=hu.tensor(), in_place=st.booleans(), - engine=st.sampled_from(["", "CUDNN"]), **mu.gcs) + engine=st.sampled_from(["", "MIOPEN" if workspace.has_hip_support else "CUDNN"]), **mu.gcs) def test_relu(self, X, in_place, engine, gc, dc): if gc == mu.mkl_do: in_place = False @@ -43,7 +43,7 @@ def relu_ref(X): @unittest.skipIf(not workspace.has_gpu_support, "Relu for float16 can only run on GPU now.") @given(X=hu.tensor(dtype=np.float16), in_place=st.booleans(), - engine=st.sampled_from(["", "CUDNN"]), **hu.gcs_gpu_only) + engine=st.sampled_from([""] if workspace.has_hip_support else ["", "CUDNN"]), **hu.gcs_gpu_only) def test_relu_fp16(self, X, in_place, engine, gc, dc): op = core.CreateOperator( "Relu", @@ -102,7 +102,7 @@ def relu_n_ref(X): @given(X=hu.tensor(), alpha=st.floats(min_value=0.1, max_value=2.0), - in_place=st.booleans(), engine=st.sampled_from(["", "CUDNN"]), + in_place=st.booleans(), engine=st.sampled_from([""] if workspace.has_hip_support else ["", "CUDNN"]), **hu.gcs) def test_elu(self, X, alpha, in_place, engine, gc, dc): op = core.CreateOperator( diff --git a/caffe2/python/operator_test/boolean_mask_test.py b/caffe2/python/operator_test/boolean_mask_test.py index 
638248d60bafe5..0980cc5c043e78 100644 --- a/caffe2/python/operator_test/boolean_mask_test.py +++ b/caffe2/python/operator_test/boolean_mask_test.py @@ -7,7 +7,7 @@ import hypothesis.strategies as st from caffe2.proto import caffe2_pb2 -from caffe2.python import core +from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu @@ -47,10 +47,10 @@ def ref(x, mask): @staticmethod def _dtype_conversion(x, dtype, gc, dc): - """SequenceMask only supports fp16 with CUDA.""" + """SequenceMask only supports fp16 with CUDA/HIP.""" if dtype == np.float16: - assume(gc.device_type == caffe2_pb2.CUDA) - dc = [d for d in dc if d.device_type == caffe2_pb2.CUDA] + assume(gc.device_type == caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA) + dc = [d for d in dc if d.device_type == (caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA)] x = x.astype(dtype) return x, dc diff --git a/caffe2/python/operator_test/ceil_op_test.py b/caffe2/python/operator_test/ceil_op_test.py index 79293fc6453232..1a77c2965b6aba 100644 --- a/caffe2/python/operator_test/ceil_op_test.py +++ b/caffe2/python/operator_test/ceil_op_test.py @@ -3,7 +3,7 @@ from __future__ import print_function from __future__ import unicode_literals -from caffe2.python import core +from caffe2.python import core, workspace from hypothesis import given import hypothesis.strategies as st import caffe2.python.hypothesis_test_util as hu @@ -15,7 +15,7 @@ class TestCeil(hu.HypothesisTestCase): @given(X=hu.tensor(), - engine=st.sampled_from(["", "CUDNN"]), + engine=st.sampled_from([""] if workspace.has_hip_support else ["", "CUDNN"]), **hu.gcs) def test_ceil(self, X, gc, dc, engine): op = core.CreateOperator("Ceil", ["X"], ["Y"], engine=engine) diff --git a/caffe2/python/operator_test/conv_test.py b/caffe2/python/operator_test/conv_test.py index f460c191b02f5f..0f8cf66b0ae1ec 100644 --- a/caffe2/python/operator_test/conv_test.py +++ b/caffe2/python/operator_test/conv_test.py @@ -37,6 +37,15 @@ def _cudnn_supports( return False return True +def _miopen_supports( + dilation=False, + nhwc=False, + backward=False, +): + """Return True if MIOPEN supports this configuration.""" + if nhwc or dilation: + return False + return True def _cudnn_convolution_algo_count(direction): try: @@ -192,7 +201,7 @@ def test_convolution_separate_stride_pad_layout(self, op_type, output_channels=st.integers(1, 8), batch_size=st.integers(1, 3), order=st.sampled_from(["NCHW", "NHWC"]), - engine=st.sampled_from(["", "CUDNN", "MKLDNN"]), + engine=st.sampled_from(["", "MIOPEN" if workspace.has_hip_support else "CUDNN", "MKLDNN"]), use_bias=st.booleans(), force_algo_fwd=_cudnn_convolution_algo_count("fwd"), force_algo_dgrad=_cudnn_convolution_algo_count("dgrad"), @@ -209,6 +218,10 @@ def test_convolution_gradients(self, op_type, stride, pad, kernel, dilation, assume(_cudnn_supports(dilation=(dilation > 1), nhwc=(order == 'NHWC'), backward=True)) + if engine == 'MIOPEN': + assume(_cudnn_supports(dilation=(dilation > 1), + nhwc=(order == 'NHWC'), + backward=True)) assume(engine != "MKLDNN" or use_bias is True) @@ -451,8 +464,12 @@ def test_convolution_layout(self, op_type, stride, pad, kernel, dilation, for order in ["NCHW", "NHWC"]: engine_list = [''] - if _cudnn_supports(dilation=(dilation > 1), nhwc=(order == 'NHWC')): - engine_list.append('CUDNN') + if workspace.has_hip_support: + if _miopen_supports(dilation=(dilation > 1), nhwc=(order == 'NHWC')): + engine_list.append('MIOPEN') + else: + if _cudnn_supports(dilation=(dilation > 1), 
nhwc=(order == 'NHWC')): + engine_list.append('CUDNN') for engine in engine_list: op = core.CreateOperator( @@ -503,9 +520,9 @@ def canonical(o): net_type=st.sampled_from( ["simple", "dag"] + (["async_dag"] if workspace.has_gpu_support or workspace.has_hip_support else [])), - engine=st.sampled_from(["CUDNN", ""]), - **hu.gcs_no_hip) - def test_convolution_sync(self, net_type, num_workers, engine, gc, dc): + do=st.sampled_from(hu.device_options), + engine=st.sampled_from(["CUDNN", ""])) + def test_convolution_sync(self, net_type, num_workers, do, engine): m = ModelHelper(name="test_model") n = 1 d = 2 @@ -557,8 +574,8 @@ def test_convolution_sync(self, net_type, num_workers, engine, gc, dc): m.net.SquaredL2Distance(["0_0_flat", "label"], "xent") m.net.AveragedLoss("xent", "loss") input_to_grad = m.AddGradientOperators(["loss"]) - m.Proto().device_option.CopyFrom(gc) - m.param_init_net.Proto().device_option.CopyFrom(gc) + m.Proto().device_option.CopyFrom(do) + m.param_init_net.Proto().device_option.CopyFrom(do) m.Proto().type = net_type m.Proto().num_workers = num_workers self.ws.run(m.param_init_net) @@ -570,10 +587,10 @@ def run(): for input_blob in input_blobs: self.ws.create_blob(input_blob).feed( np.random.randn(n, d, h, w).astype(np.float32), - device_option=gc) + device_option=do) self.ws.create_blob("label").feed( np.random.randn(n, d * h * w).astype(np.float32), - device_option=gc) + device_option=do) self.ws.run(m.net) gradients = [ self.ws.blobs[str(input_to_grad[input_blob])].fetch() @@ -623,8 +640,7 @@ def test_use_cudnn_engine_interactions(self): f(**kwargs) else: f(**kwargs) - self.assertEqual(model.Proto().op[-1].engine, - expected_engine) + self.assertEqual(model.Proto().op[-1].engine, expected_engine) @given(op_type=st.sampled_from(["Conv", "Conv2D"]), N=st.integers(1, 4), G=st.integers(1, 4), DX=st.integers(1, 4), DY=st.integers(1, 4), diff --git a/caffe2/python/operator_test/copy_ops_test.py b/caffe2/python/operator_test/copy_ops_test.py index 05a018ff90a2c3..3b07090a2b3283 100644 --- a/caffe2/python/operator_test/copy_ops_test.py +++ b/caffe2/python/operator_test/copy_ops_test.py @@ -40,21 +40,29 @@ def run_test_copy_gradient(self, device_opt): def test_copy_gradient_cpu(self): self.run_test_copy_gradient(core.DeviceOption(caffe2_pb2.CPU, 0)) - @unittest.skipIf(workspace.NumCudaDevices() < 1, "Need at least 1 GPU.") + num_gpu = 0 + if workspace.has_hip_support: + num_gpu = workspace.NumHipDevices() + else: + num_gpu = workspace.NumCudaDevices() + + @unittest.skipIf(num_gpu < 1, "Need at least 1 GPU.") def test_copy_gradient_gpu(self): - self.run_test_copy_gradient(core.DeviceOption(caffe2_pb2.CUDA, 0)) + self.run_test_copy_gradient(core.DeviceOption(caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA, 0)) - @unittest.skipIf(workspace.NumCudaDevices() < 2, "Need at least 2 GPU.") + @unittest.skipIf(num_gpu < 2, "Need at least 2 GPU.") def test_copy_gradient_multiple_gpus(self): model = model_helper.ModelHelper(name="copy_test") with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)): x_cpu = model.net.AddExternalInputs("x_cpu") - with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)): + gpu_device = caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA + + with core.DeviceScope(core.DeviceOption(gpu_device, 0)): x_gpu_1 = model.CopyCPUToGPU(x_cpu, "x_gpu_1") - with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 1)): + with core.DeviceScope(core.DeviceOption(gpu_device, 1)): x_gpu_2 = model.Copy(x_gpu_1, "x_gpu_2") loss = 
model.AveragedLoss(x_gpu_2, "loss")
         gradient_map = model.AddGradientOperators([loss])
@@ -80,20 +88,20 @@ def get_op_with_output(model, output_blob_name):
 
         self.assertEqual(
             get_op_with_output(model, "x_gpu_2_grad").device_option,
-            core.DeviceOption(caffe2_pb2.CUDA, 1),
+            core.DeviceOption(gpu_device, 1),
         )
         self.assertEqual(
             get_op_with_output(model, "x_cpu_grad").device_option,
-            core.DeviceOption(caffe2_pb2.CUDA, 0),
+            core.DeviceOption(gpu_device, 0),
         )
 
-    @unittest.skipIf(workspace.NumCudaDevices() < 1, "Need at least 1 GPU.")
+    @unittest.skipIf(num_gpu < 1, "Need at least 1 GPU.")
     def test_cpu2gpu_gpu2cpu_sparse_gradients(self):
         model = model_helper.ModelHelper(name="copy_test")
         v = model.param_init_net.UniformFill([], ["v"], shape=[16, 4])
         indices = model.param_init_net.UniformFill([], ["v"], shape=[16, 4])
         cpu_opt = core.DeviceOption(caffe2_pb2.CPU, 0)
-        gpu_opt = core.DeviceOption(caffe2_pb2.CUDA, 0)
+        gpu_opt = core.DeviceOption(caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA, 0)
 
         with core.DeviceScope(gpu_opt):
             vcpu = model.CopyGPUToCPU(v, "vcpu")
@@ -112,13 +120,13 @@ def test_cpu2gpu_gpu2cpu_sparse_gradients(self):
         self.assertTrue("v" in gradient_map)
         self.assertTrue(isinstance(gradient_map['v'], core.GradientSlice))
 
-    @unittest.skipIf(workspace.NumCudaDevices() < 1, "Need at least 1 GPU.")
+    @unittest.skipIf(num_gpu < 1, "Need at least 1 GPU.")
     def test_cpu2gpu_gpu2cpu_gradients(self):
         model = model_helper.ModelHelper(name="copy_test")
 
         batch = 32
         cpu_opt = core.DeviceOption(caffe2_pb2.CPU, 0)
-        gpu_opt = core.DeviceOption(caffe2_pb2.CUDA, 0)
+        gpu_opt = core.DeviceOption(caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA, 0)
 
         with core.NameScope("cpu"):
             with core.DeviceScope(cpu_opt):
diff --git a/caffe2/python/operator_test/elementwise_op_broadcast_test.py b/caffe2/python/operator_test/elementwise_op_broadcast_test.py
index e767a0db161a9f..74325052bb5e37 100644
--- a/caffe2/python/operator_test/elementwise_op_broadcast_test.py
+++ b/caffe2/python/operator_test/elementwise_op_broadcast_test.py
@@ -405,8 +405,8 @@ def test_sum_reduce(self, gc, dc):
             np.testing.assert_array_almost_equal(out, res)
             self.assertDeviceChecks(dc, op, [X, Y], [0])
 
-        # fp64 is not supported with the CUDA op
-        dc_cpu_only = [d for d in dc if d.device_type != caffe2_pb2.CUDA]
+        # fp64 is not supported with the CUDA/HIP op
+        dc_cpu_only = [d for d in dc if (d.device_type != caffe2_pb2.CUDA and d.device_type != caffe2_pb2.HIP)]
         self.assertDeviceChecks(dc_cpu_only, op, [X, Y], [0])
 
     @unittest.skipIf(not workspace.has_gpu_support, "No gpu support")
diff --git a/caffe2/python/operator_test/pooling_test.py b/caffe2/python/operator_test/pooling_test.py
index 956d0ec9619987..4301b5c60f66f3 100644
--- a/caffe2/python/operator_test/pooling_test.py
+++ b/caffe2/python/operator_test/pooling_test.py
@@ -209,12 +209,14 @@ def test_pooling_with_index(self, stride, pad, kernel, size,
 
     @given(sz=st.integers(1, 20),
            batch_size=st.integers(1, 4),
-           engine=st.sampled_from(["", "CUDNN"]),
+           engine=st.sampled_from(["", "MIOPEN" if workspace.has_hip_support else "CUDNN"]),
            op_type=st.sampled_from(["AveragePool", "AveragePool2D"]),
            **hu.gcs)
     @settings(max_examples=3, timeout=10)
     def test_global_avg_pool_nchw(self, op_type, sz, batch_size, engine, gc, dc):
         ''' Special test to stress the fast path of NCHW average pool '''
+        if engine == 'MIOPEN':
+            assume(sz<16)
         op = core.CreateOperator(
             op_type,
             ["X"],
@@ -233,7 +235,7 @@ def test_global_avg_pool_nchw(self, op_type, sz, batch_size, engine, gc, dc):
@given(sz=st.integers(1, 20), batch_size=st.integers(1, 4), - engine=st.sampled_from(["", "CUDNN"]), + engine=st.sampled_from(["", "MIOPEN" if workspace.has_hip_support else "CUDNN"]), op_type=st.sampled_from(["MaxPool", "MaxPool2D"]), **hu.gcs) @settings(max_examples=3, timeout=10) @@ -241,7 +243,10 @@ def test_global_max_pool_nchw(self, op_type, sz, batch_size, engine, gc, dc): ''' Special test to stress the fast path of NCHW max pool ''' # CuDNN 5 does not support deterministic max pooling. - assume(workspace.GetCuDNNVersion() >= 6000 or engine != "CUDNN") + if engine == 'MIOPEN': + assume(sz<16) + if not workspace.has_hip_support: + assume(workspace.GetCuDNNVersion() >= 6000 or engine != "CUDNN") op = core.CreateOperator( op_type, ["X"], @@ -270,12 +275,14 @@ def test_global_max_pool_nchw(self, op_type, sz, order=st.sampled_from(["NCHW", "NHWC"]), op_type=st.sampled_from(["MaxPool", "AveragePool", "LpPool", "MaxPool2D", "AveragePool2D"]), - engine=st.sampled_from(["", "CUDNN"]), + engine=st.sampled_from(["", "MIOPEN" if workspace.has_hip_support else "CUDNN"]), **hu.gcs) def test_pooling(self, stride, pad, kernel, size, input_channels, batch_size, order, op_type, engine, gc, dc): assume(pad < kernel) + if engine == 'MIOPEN': + assume(op_type != "LpPool" and order == "NCHW") op = core.CreateOperator( op_type, ["X"], @@ -300,12 +307,14 @@ def test_pooling(self, stride, pad, kernel, size, batch_size=st.integers(1, 3), order=st.sampled_from(["NCHW", "NHWC"]), op_type=st.sampled_from(["MaxPool", "AveragePool", "LpPool"]), - engine=st.sampled_from(["", "CUDNN"]), + engine=st.sampled_from(["", "MIOPEN" if workspace.has_hip_support else "CUDNN"]), **hu.gcs) def test_global_pooling(self, size, input_channels, batch_size, order, op_type, engine, gc, dc): # CuDNN 5 does not support deterministic max pooling. 
assume(workspace.GetCuDNNVersion() >= 6000 or op_type != "MaxPool") + if engine == 'MIOPEN': + assume(op_type != "LpPool" and order == "NCHW") op = core.CreateOperator( op_type, ["X"], diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py index 2ebcf1d92a1240..b26cc24b7e09e8 100644 --- a/caffe2/python/optimizer.py +++ b/caffe2/python/optimizer.py @@ -75,9 +75,11 @@ def make_unique_blob_name(self, base_str): if current_scope is None: return self.get_cpu_blob_name(base_str) - if current_scope.device_type == caffe2_pb2.CUDA: + if current_scope.device_type == caffe2_pb2.CUDA or current_scope.device_type == caffe2_pb2.HIP: return self.get_gpu_blob_name( - base_str, current_scope.cuda_gpu_id, current_scope.node_name + base_str, + current_scope.hip_gpu_id if workspace.has_hip_support else current_scope.cuda_gpu_id, + current_scope.node_name ) else: return self.get_cpu_blob_name(base_str, current_scope.node_name) @@ -121,7 +123,7 @@ def build_lr(self, net, param_init_net, base_learning_rate, if self._local_lr_multiplier is not None: current_scope = scope.CurrentDeviceScope() if (current_scope is not None - and current_scope.device_type == caffe2_pb2.CUDA + and current_scope.device_type == (caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA) and not self._local_lr_multiplier_on_gpu): local_lr_multiplier = net.CopyFromCPUInput( self._local_lr_multiplier, @@ -204,14 +206,6 @@ def scale_learning_rate(self, *args, **kwargs): raise NotImplementedError( "Optimizer Need to Implement `scale_learning_rate` method.") - def create_lars_inputs(self, param_init_net, weight_decay, trust, lr_max): - wd = param_init_net.ConstantFill([], "weight_decay", - shape=[1], value=weight_decay) - trust = param_init_net.ConstantFill([], "trust", shape=[1], value=trust) - lr_max = param_init_net.ConstantFill([], "lr_max", shape=[1], - value=lr_max) - return wd, trust, lr_max - class SgdOptimizer(Optimizer): def __init__(self, base_learning_rate=0.01, policy='fixed', @@ -241,18 +235,15 @@ def _run(self, net, param_init_net, param_info): if self.lars is not None and not isinstance(grad, core.GradientSlice): assert self.lars >= 0, ( 'Lars offset must be nonnegative, got {}'.format(self.lars)) - wd, trust, lr_max = self.create_lars_inputs( - param_init_net, 0.0, 1.0, np.finfo(np.float32).max) lr_lars_multiplier = net.Lars( - [param, grad, wd, trust, lr_max], + [param, grad], self.make_unique_blob_name(str(param) + "_lars"), - offset=self.lars, - lr_min=0.0) + offset=self.lars) current_scope = scope.CurrentDeviceScope() self._add_local_lr_multiplier( lr_lars_multiplier, is_gpu_blob=(current_scope is not None - and current_scope.device_type == caffe2_pb2.CUDA), + and current_scope.device_type == (caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA)), ) # We need negative sign for LR when used directly with WeightedSum @@ -273,7 +264,7 @@ def _run(self, net, param_init_net, param_info): # to include device information. 
ONE = param_init_net.ConstantFill( [], - "ONE_{}_{}{}".format(dev.device_type, dev.cuda_gpu_id, dev.node_name), + "ONE_{}_{}{}".format(dev.device_type, dev.hip_gpu_id if workspace.has_hip_support else dev.cuda_gpu_id, dev.node_name), shape=[1], value=1.0 ) @@ -482,12 +473,12 @@ def _run(self, net, param_init_net, param_info): ONE = param_init_net.ConstantFill( [], - "ONE_{}_{}".format(dev.device_type, dev.cuda_gpu_id), + "ONE_{}_{}".format(dev.device_type, dev.hip_gpu_id if workspace.has_hip_support else dev.cuda_gpu_id), shape=[1], value=1.0 ) WD = param_init_net.ConstantFill( - [], "wd_{}_{}".format(dev.device_type, dev.cuda_gpu_id), + [], "wd_{}_{}".format(dev.device_type, dev.hip_gpu_id if workspace.has_hip_support else dev.cuda_gpu_id), shape=[1], value=self.weight_decay ) @@ -531,19 +522,15 @@ def _run(self, net, param_init_net, param_info): if self.lars is not None and not isinstance(grad, core.GradientSlice): assert self.lars >= 0, ( 'Lars offset must be nonnegative, got {}'.format(self.lars)) - wd, trust, lr_max = self.create_lars_inputs( - param_init_net, 0.0, 1.0, np.finfo(np.float32).max) lr_lars_multiplier = net.Lars( - [param, grad, wd, trust, lr_max], + [param, grad], self.make_unique_blob_name(str(param) + "_lars"), - offset=self.lars, - lr_min=0.0) - + offset=self.lars) current_scope = scope.CurrentDeviceScope() self._add_local_lr_multiplier( lr_lars_multiplier, is_gpu_blob=(current_scope is not None - and current_scope.device_type == caffe2_pb2.CUDA), + and current_scope.device_type == (caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA)), ) lr, _ = self.build_lr( @@ -656,18 +643,15 @@ def _run(self, net, param_init_net, param_info): if self.lars is not None and not isinstance(grad, core.GradientSlice): assert self.lars >= 0, ( 'Lars offset must be nonnegative, got {}'.format(self.lars)) - wd, trust, lr_max = self.create_lars_inputs( - param_init_net, 0.0, 1.0, np.finfo(np.float32).max) lr_lars_multiplier = net.Lars( - [param, grad, wd, trust, lr_max], + [param, grad], self.make_unique_blob_name(str(param) + "_lars"), - offset=self.lars, - lr_min=0.0) + offset=self.lars) current_scope = scope.CurrentDeviceScope() self._add_local_lr_multiplier( lr_lars_multiplier, is_gpu_blob=(current_scope is not None - and current_scope.device_type == caffe2_pb2.CUDA), + and current_scope.device_type == (caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA)), ) lr, _ = self.build_lr( @@ -1144,7 +1128,7 @@ def _run(self, net, param_init_net, param_info): ONE = param_init_net.ConstantFill( [], - "ONE_{}_{}".format(dev.device_type, dev.cuda_gpu_id), + "ONE_{}_{}".format(dev.device_type, dev.hip_gpu_id if workspace.has_hip_support else dev.cuda_gpu_id), shape=[1], value=1.0 ) @@ -1439,8 +1423,7 @@ def build_ftrl(model, engine="SIMD", **kwargs): def build_gftrl(model, engine="", **kwargs): - if engine == "SIMD": - assert core.IsOperator('GFtrl_ENGINE_SIMD') + # SIMD version of GFTRL is not supported gftrl_optimizer = GFtrlOptimizer(engine=engine, **kwargs) return _build(model, gftrl_optimizer) diff --git a/caffe2/python/optimizer_test.py b/caffe2/python/optimizer_test.py index bbb0c02625f09a..a136684cd0e38e 100644 --- a/caffe2/python/optimizer_test.py +++ b/caffe2/python/optimizer_test.py @@ -77,7 +77,7 @@ def check_optimizer(self, optimizer): tensor = workspace.FetchBlob(param) np.testing.assert_allclose(np.array([1.0]), tensor, atol=1e-5) - @unittest.skipIf(not workspace.has_gpu_support, "No GPU support") + @unittest.skipIf(not workspace.has_gpu_support and not 
workspace.has_hip_support , "No GPU support") def testGPUDense(self): super(TestMultiPrecisionSgd, self).testGPUDense(core.DataType.FLOAT16) @@ -434,11 +434,11 @@ def test_caffe2_cpu_vs_numpy(self): ) @unittest.skip("Results might vary too much. Only for individual use.") - @unittest.skipIf(not workspace.has_gpu_support, "No gpu support") + @unittest.skipIf(not workspace.has_gpu_support and not workspace.has_hip_support, "No gpu support") def test_caffe2_gpu_vs_numpy(self): n_dim = 1000000 n_iter = 50 - gpu_device_opt = core.DeviceOption(caffe2_pb2.CUDA, 0) + gpu_device_opt = core.DeviceOption(caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA, 0) with core.DeviceScope(gpu_device_opt): for zero_debias in [False, True]: for grad_coef in [1.0, 0.1, 0.01]: diff --git a/caffe2/python/optimizer_test_util.py b/caffe2/python/optimizer_test_util.py index dbb0dbeae2dd37..4c62d9ee21b797 100644 --- a/caffe2/python/optimizer_test_util.py +++ b/caffe2/python/optimizer_test_util.py @@ -70,7 +70,7 @@ def testDense(self): @unittest.skipIf(not workspace.has_gpu_support, "No gpu support") def testGPUDense(self, dtype=core.DataType.FLOAT): - device_opt = core.DeviceOption(caffe2_pb2.CUDA, 0) + device_opt = core.DeviceOption(caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA, 0) with core.DeviceScope(device_opt): model, _perfect_model, data, label = self._createDense(dtype) if dtype == core.DataType.FLOAT16: diff --git a/caffe2/python/scope_test.py b/caffe2/python/scope_test.py index 11f7a2c44046af..f4b3aa4a244d17 100644 --- a/caffe2/python/scope_test.py +++ b/caffe2/python/scope_test.py @@ -3,7 +3,7 @@ from __future__ import print_function from __future__ import unicode_literals -from caffe2.python import scope, core +from caffe2.python import scope, core, workspace from caffe2.proto import caffe2_pb2 import unittest @@ -18,7 +18,10 @@ def thread_runner(idx, testobj): testobj.assertEquals(scope.CurrentNameScope(), "") testobj.assertEquals(scope.CurrentDeviceScope(), None) namescope = "namescope_{}".format(idx) - dsc = core.DeviceOption(caffe2_pb2.CUDA, idx) + if workspace.has_hip_support: + dsc = core.DeviceOption(caffe2_pb2.HIP, hip_gpu_id=idx) + else: + dsc = core.DeviceOption(caffe2_pb2.CUDA, cuda_gpu_id=idx) with scope.DeviceScope(dsc): with scope.NameScope(namescope): testobj.assertEquals(scope.CurrentNameScope(), namescope + "/") @@ -58,7 +61,10 @@ def testNamescopeAssertion(self): def testDevicescopeBasic(self): self.assertEquals(scope.CurrentDeviceScope(), None) - dsc = core.DeviceOption(caffe2_pb2.CUDA, 9) + if workspace.has_hip_support: + dsc = core.DeviceOption(caffe2_pb2.HIP, hip_gpu_id=9) + else: + dsc = core.DeviceOption(caffe2_pb2.CUDA, cuda_gpu_id=9) with scope.DeviceScope(dsc): self.assertEquals(scope.CurrentDeviceScope(), dsc) @@ -67,7 +73,10 @@ def testDevicescopeBasic(self): def testEmptyDevicescopeBasic(self): self.assertEquals(scope.CurrentDeviceScope(), None) - dsc = core.DeviceOption(caffe2_pb2.CUDA, 9) + if workspace.has_hip_support: + dsc = core.DeviceOption(caffe2_pb2.HIP, hip_gpu_id=9) + else: + dsc = core.DeviceOption(caffe2_pb2.CUDA, cuda_gpu_id=9) with scope.DeviceScope(dsc): self.assertEquals(scope.CurrentDeviceScope(), dsc) with scope.EmptyDeviceScope(): @@ -78,7 +87,10 @@ def testEmptyDevicescopeBasic(self): def testDevicescopeAssertion(self): self.assertEquals(scope.CurrentDeviceScope(), None) - dsc = core.DeviceOption(caffe2_pb2.CUDA, 9) + if workspace.has_hip_support: + dsc = core.DeviceOption(caffe2_pb2.HIP, hip_gpu_id=9) + else: + dsc = 
core.DeviceOption(caffe2_pb2.CUDA, cuda_gpu_id=9) try: with scope.DeviceScope(dsc): diff --git a/caffe2/python/workspace.py b/caffe2/python/workspace.py index c033e0684bb5d8..79ac32b6269dc5 100644 --- a/caffe2/python/workspace.py +++ b/caffe2/python/workspace.py @@ -47,7 +47,7 @@ NumCudaDevices = C.num_cuda_devices GetCUDAVersion = C.get_cuda_version GetCuDNNVersion = C.get_cudnn_version - + NumGpuDevices = NumCudaDevices def GetCudaPeerAccessPattern(): return np.asarray(C.get_cuda_peer_access_pattern()) @@ -59,6 +59,18 @@ def GetCudaPeerAccessPattern(): GetCudaPeerAccessPattern = lambda: np.array([]) # noqa GetDeviceProperties = lambda x: None # noqa +if has_hip_support: + NumHipDevices = C.num_hip_devices + NumGpuDevices = NumHipDevices + def GetHipPeerAccessPattern(): + return np.asarray(C.get_hip_peer_access_pattern()) + + GetDeviceProperties = C.get_device_properties +else: + NumHipDevices = lambda: 0 # noqa + GetHipPeerAccessPattern = lambda: np.array([]) # noqa + GetDeviceProperties = lambda x: None # noqa + IsNUMAEnabled = C.is_numa_enabled GetNumNUMANodes = C.get_num_numa_nodes GetBlobNUMANode = C.get_blob_numa_node @@ -322,10 +334,10 @@ def FeedBlob(name, arr, device_option=None): if device_option is None: device_option = scope.CurrentDeviceScope() - if device_option and device_option.device_type == caffe2_pb2.CUDA: + if device_option and (device_option.device_type == caffe2_pb2.CUDA or device_option.device_type == caffe2_pb2.HIP): if arr.dtype == np.dtype('float64'): logger.warning( - "CUDA operators do not support 64-bit doubles, " + + "CUDA/HIP operators do not support 64-bit doubles, " + "please use arr.astype(np.float32) or np.int32 for ints." + " Blob: {}".format(name) + " type: {}".format(str(arr.dtype)) diff --git a/caffe2/python/workspace_test.py b/caffe2/python/workspace_test.py index 5da37c7f22efca..72935a94ac45e7 100644 --- a/caffe2/python/workspace_test.py +++ b/caffe2/python/workspace_test.py @@ -317,7 +317,7 @@ def testCreateWorkspace(self): self.assertTrue("test" in workspaces) -@unittest.skipIf(not workspace.has_gpu_support, "No gpu support.") +@unittest.skipIf(not workspace.has_gpu_support and not workspace.has_hip_support, "No gpu support.") class TestWorkspaceGPU(test_util.TestCase): def setUp(self): @@ -339,6 +339,7 @@ def testFetchBlobGPU(self): self.assertEqual(fetched_again.shape, (1, 2, 3, 4)) np.testing.assert_array_equal(fetched_again, 2.0) + @unittest.skipIf(not workspace.has_gpu_support, "No gpu support.") def testGetCudaPeerAccessPattern(self): pattern = workspace.GetCudaPeerAccessPattern() self.assertEqual(type(pattern), np.ndarray) @@ -346,6 +347,14 @@ def testGetCudaPeerAccessPattern(self): self.assertEqual(pattern.shape[0], pattern.shape[1]) self.assertEqual(pattern.shape[0], workspace.NumCudaDevices()) + @unittest.skipIf(not workspace.has_hip_support, "No hip support.") + def testGetHipPeerAccessPattern(self): + pattern = workspace.GetHipPeerAccessPattern() + self.assertEqual(type(pattern), np.ndarray) + self.assertEqual(pattern.ndim, 2) + self.assertEqual(pattern.shape[0], pattern.shape[1]) + self.assertEqual(pattern.shape[0], workspace.NumHipDevices()) + @unittest.skipIf(not workspace.C.has_mkldnn, "No MKLDNN support.") class TestWorkspaceMKLDNN(test_util.TestCase): diff --git a/rocm-docs/caffe2-build.md b/rocm-docs/caffe2-build.md new file mode 100644 index 00000000000000..74ff92b72a26a9 --- /dev/null +++ b/rocm-docs/caffe2-build.md @@ -0,0 +1,108 @@ +# Caffe2: Building From Source on ROCm Platform + +## Intro +This instruction provides a 
starting point for building caffe2 on AMD GPUs (the Caffe2 ROCm port) from source.
+*Note*: it is recommended to start with a clean Ubuntu 16.04 system.
+
+## Install docker
+
+If your machine doesn't have docker installed, follow the steps [here](https://docs.docker.com/install/linux/docker-ce/ubuntu/#install-docker-ce) to install docker.
+
+## Install ROCm
+
+Install the ROCm stack following the steps at [link](https://github.com/RadeonOpenCompute/ROCm/blob/master/README.md) if your machine doesn't have ROCm already.
+
+Once the machine is ready with the ROCm stack, there are two ways to use caffe2:
+* Run the docker container with caffe2 installed in it.
+
+* Build caffe2 from source inside a docker container with all the dependencies.
+
+## Launch docker container with caffe2 pre-installed
+```
+docker run -it --network=host --device=/dev/kfd --device=/dev/dri --group-add video rocm/caffe2:rocm1.8.2-v2
+```
+
+To run benchmarks, skip directly to the benchmarks section of the document.
+
+## Build Caffe2 from source
+### Pull the docker image
+```
+docker pull rocm/caffe2:unbuilt-rocm1.8.2-v2
+```
+This docker image has all the dependencies for caffe2 pre-installed.
+
+### Pull the latest caffe2 source
+
+Check out the `caffe2_specific` branch of this repository.
+* Using https
+```
+git clone --recurse-submodules -b caffe2_specific https://github.com/ROCmSoftwarePlatform/pytorch.git
+```
+* Using ssh
+```
+git clone --recurse-submodules -b caffe2_specific git@github.com:ROCmSoftwarePlatform/pytorch.git
+```
+Navigate to the repo directory
+```
+cd pytorch
+```
+
+### Launch the docker container
+```
+docker run -it --network=host --device=/dev/kfd --device=/dev/dri --group-add video -v $PWD:/pytorch rocm/caffe2:unbuilt-rocm1.8.2-v2
+```
+Navigate to the pytorch directory inside the container: `cd /pytorch`.
+
+### Build the caffe2 project from source
+
+* Run the command
+
+  `.jenkins/caffe2/amd/build_amd.sh`
+
+* Test the rocm-caffe2 installation
+
+  Make sure the following environment variables are set:
+  ```
+  export LD_LIBRARY_PATH=/usr/local/caffe2/lib
+  export PYTHONPATH=/usr/local/caffe2/lib/python2.7/dist-packages
+  ```
+  Then check that caffe2 imports correctly:
+  ```
+  cd build_caffe2 && python -c 'from caffe2.python import core' 2>/dev/null && echo "Success" || echo "Failure"
+  ```
+
+## Run benchmarks
+
+Navigate to the build directory, `cd /pytorch/build_caffe2`, to run benchmarks.
+
+The caffe2 benchmarking script supports the following networks:
+1. MLP
+2. AlexNet
+3. OverFeat
+4. VGGA
+5. Inception
+6. Inception_v2
+7. Resnet50
+
+*Special case:* Inception_v2 and Resnet50 need their corresponding protobuf files to run the benchmarks. The protobufs can be downloaded from the caffe2 model zoo using the command below, substituting `<model_name>` with `inception_v2` or `resnet50`:
+
+```
+python caffe2/python/models/download.py <model_name>
+```
+This will download the protobufs to the current working directory.
+
+To run benchmarks for MLP, AlexNet, OverFeat, VGGA, or Inception, run the command below, replacing `<network>` with one of those networks:
+
+```
+python caffe2/python/convnet_benchmarks.py --batch_size 64 --model <network> --engine MIOPEN --layer_wise_benchmark True --net_type simple
+```
+To run Inception_v2 or Resnet50, add the additional argument `--model_path`, which should point to the model directory downloaded above.
+
+```
+python caffe2/python/convnet_benchmarks.py --batch_size 64 --model <network> --engine MIOPEN --layer_wise_benchmark True --net_type simple --model_path <model_directory>
+```
+

From 50886a625c3d9e2923aa91cf4f4fd6dfc1f7d963 Mon Sep 17 00:00:00 2001
From: root
Date: Tue, 14 Aug 2018 15:57:02 -0700
Subject: [PATCH 02/28] add execute permissions to build script

---
 .jenkins/caffe2/amd/build_amd.sh | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 .jenkins/caffe2/amd/build_amd.sh

diff --git a/.jenkins/caffe2/amd/build_amd.sh b/.jenkins/caffe2/amd/build_amd.sh
old mode 100644
new mode 100755

From 0a337b42a84223a18cc2699508b4d35de2b9593a Mon Sep 17 00:00:00 2001
From: rohithkrn
Date: Mon, 27 Aug 2018 18:47:06 -0700
Subject: [PATCH 03/28] update docs

---
 rocm-docs/caffe2-build.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rocm-docs/caffe2-build.md b/rocm-docs/caffe2-build.md
index 74ff92b72a26a9..d19116a018b7bd 100644
--- a/rocm-docs/caffe2-build.md
+++ b/rocm-docs/caffe2-build.md
@@ -27,7 +27,7 @@ To run benchmarks, skip directly to the benchmarks section of the document.
 ## Build Caffe2 from source
 ### Pull the docker image
 ```
-docker pull rocm/caffe2:unbuilt-rocm1.8.2-v2
+docker pull rocm/caffe2:unbuilt-rocm1.8.2-v3
 ```
 This docker image has all the dependencies for caffe2 pre-installed.
 
@@ -49,7 +49,7 @@ cd pytorch
 
 ### Launch the docker container
 ```
-docker run -it --network=host --device=/dev/kfd --device=/dev/dri --group-add video -v $PWD:/pytorch rocm/caffe2:unbuilt-rocm1.8.2-v2
+docker run -it --network=host --device=/dev/kfd --device=/dev/dri --group-add video -v $PWD:/pytorch rocm/caffe2:unbuilt-rocm1.8.2-v3
 ```
 Navigate to the pytorch directory inside the container: `cd /pytorch`.
 

From 53acb376bbaf159657b273033802793df73b96a0 Mon Sep 17 00:00:00 2001
From: rohithkrn
Date: Wed, 29 Aug 2018 11:34:44 -0700
Subject: [PATCH 04/28] Update caffe2-build.md

---
 rocm-docs/caffe2-build.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rocm-docs/caffe2-build.md b/rocm-docs/caffe2-build.md
index d19116a018b7bd..0050a01b3c1c61 100644
--- a/rocm-docs/caffe2-build.md
+++ b/rocm-docs/caffe2-build.md
@@ -19,7 +19,7 @@ Once the machine is ready with the ROCm stack, there are two ways to use caffe2:
 ## Launch docker container with caffe2 pre-installed
 ```
-docker run -it --network=host --device=/dev/kfd --device=/dev/dri --group-add video rocm/caffe2:rocm1.8.2-v3
+docker run -it --network=host --device=/dev/kfd --device=/dev/dri --group-add video rocm/caffe2:rocm1.9-v2
 ```
 
 To run benchmarks, skip directly to the benchmarks section of the document.
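A pattern that recurs throughout this series is the inline backend switch `caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA`. For reference, the same selection can be written once as a helper. The sketch below is illustrative only; `gpu_device_option` is not a function introduced by these patches, though it relies on the `hip_gpu_id` field that the series adds to `DeviceOption`:

```python
from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace


def gpu_device_option(gpu_id=0):
    """Return a DeviceOption for whichever GPU backend this build has.

    Hypothetical helper mirroring the inline conditionals used in the
    patches above: HIP on ROCm builds, CUDA otherwise.
    """
    if workspace.has_hip_support:
        # ROCm build: target a HIP device.
        return core.DeviceOption(caffe2_pb2.HIP, hip_gpu_id=gpu_id)
    # CUDA build: target a CUDA device.
    return core.DeviceOption(caffe2_pb2.CUDA, cuda_gpu_id=gpu_id)


# Example: a call site such as
#   core.DeviceOption(caffe2_pb2.HIP if workspace.has_hip_support
#                     else caffe2_pb2.CUDA, 0)
# reduces to:
#   gpu_device_option(0)
```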
From 6423f9d942eca24fbf05525bf992a179aa5f8305 Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Mon, 10 Sep 2018 13:56:36 -0700 Subject: [PATCH 05/28] add thinlto --- .jenkins/caffe2/amd/build_amd.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.jenkins/caffe2/amd/build_amd.sh b/.jenkins/caffe2/amd/build_amd.sh index 5c5dd0a3af41d5..be9248e1385155 100755 --- a/.jenkins/caffe2/amd/build_amd.sh +++ b/.jenkins/caffe2/amd/build_amd.sh @@ -35,9 +35,11 @@ fi export LANG=C.UTF-8 export LC_ALL=C.UTF-8 export HCC_AMDGPU_TARGET=gfx900 +export KMTHINLTO=1 ########## HIPIFY Caffe2 operators ${PYTHON} "${ROOT_DIR}/tools/amd_build/build_caffe2_amd.py" +${PYTHON} "${ROOT_DIR}/tools/amd_build/build_pytorch_amd.py" MAX_JOBS=$(nproc) From 740e2e78feb681d48a6358e5dbdabcde7d1c2cfa Mon Sep 17 00:00:00 2001 From: Yaxun Sam Liu Date: Wed, 12 Sep 2018 16:14:34 -0400 Subject: [PATCH 06/28] cmake fixes --- caffe2/CMakeLists.txt | 4 ++-- cmake/Dependencies.cmake | 23 +++++++++++------------ 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 4e9b3eef8937f4..cc4d03da58fecf 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -360,7 +360,7 @@ if(USE_ROCM) hip_add_library(caffe2_hip ${Caffe2_HIP_SRCS}) # Since PyTorch files contain HIP headers, these flags are required for the necessary definitions to be added. - set_target_properties(caffe2_hip PROPERTIES COMPILE_FLAGS ${HIP_HIPCC_FLAGS}) + target_compile_options(caffe2_hip PRIVATE ${HIP_HIPCC_FLAGS}) target_link_libraries(caffe2_hip PUBLIC caffe2) target_link_libraries(caffe2_hip PUBLIC ${Caffe2_HIP_DEPENDENCY_LIBS}) @@ -536,7 +536,7 @@ if (BUILD_PYTHON) if(USE_ROCM) hip_add_library(caffe2_pybind11_state_hip MODULE ${Caffe2_HIP_PYTHON_SRCS}) set_target_properties(caffe2_pybind11_state_hip PROPERTIES LINKER_LANGUAGE HIP) - set_target_properties(caffe2_pybind11_state_hip PROPERTIES COMPILE_FLAGS "${HIP_HIPCC_FLAGS} -fvisibility=hidden") + target_compile_options(caffe2_pybind11_state_hip PRIVATE ${HIP_HIPCC_FLAGS} -fvisibility=hidden) set_target_properties(caffe2_pybind11_state_hip PROPERTIES PREFIX "") set_target_properties(caffe2_pybind11_state_hip PROPERTIES SUFFIX ${PY_EXT_SUFFIX}) if (APPLE) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index dbec0d07ebf6e3..aeb5a4a46b4d14 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -531,18 +531,17 @@ if(NOT BUILD_ATEN_MOBILE) message(INFO "Compiling with HIP for AMD.") caffe2_update_option(USE_ROCM ON) - set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -fPIC") - set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -D__HIP_PLATFORM_HCC__=1") - set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -DCUDA_HAS_FP16=1") - set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -D__HIP_NO_HALF_OPERATORS__=1") - set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -D__HIP_NO_HALF_CONVERSIONS__=1") - set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -Wno-macro-redefined") - set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -Wno-inconsistent-missing-override") - set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -Wno-exceptions") - set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -Wno-shift-count-negative") - set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -Wno-shift-count-overflow") - set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -Wno-unused-command-line-argument") - set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -Wno-duplicate-decl-specifier") + list(APPEND HIP_HIPCC_FLAGS -fPIC) + list(APPEND HIP_HIPCC_FLAGS -D__HIP_PLATFORM_HCC__=1) + list(APPEND HIP_HIPCC_FLAGS -DCUDA_HAS_FP16=1) + list(APPEND HIP_HIPCC_FLAGS 
-D__HIP_NO_HALF_OPERATORS__=1) + list(APPEND HIP_HIPCC_FLAGS -D__HIP_NO_HALF_CONVERSIONS__=1) + list(APPEND HIP_HIPCC_FLAGS -Wno-macro-redefined) + list(APPEND HIP_HIPCC_FLAGS -Wno-inconsistent-missing-override) + list(APPEND HIP_HIPCC_FLAGS -Wno-exceptions) + list(APPEND HIP_HIPCC_FLAGS -Wno-shift-count-negative) + list(APPEND HIP_HIPCC_FLAGS -Wno-shift-count-overflow) + list(APPEND HIP_HIPCC_FLAGS -Wno-unused-command-line-argument) set(Caffe2_HIP_INCLUDES ${hip_INCLUDE_DIRS} ${hcc_INCLUDE_DIRS} ${hsa_INCLUDE_DIRS} ${rocrand_INCLUDE_DIRS} ${hiprand_INCLUDE_DIRS} ${rocblas_INCLUDE_DIRS} ${miopen_INCLUDE_DIRS} ${thrust_INCLUDE_DIRS} $ ${Caffe2_HIP_INCLUDES}) From 0d892b050a7883e08b0519a9b27f6db5a697ca41 Mon Sep 17 00:00:00 2001 From: Yaxun Sam Liu Date: Wed, 12 Sep 2018 16:12:36 -0400 Subject: [PATCH 07/28] Replace hc::__lane_id with __lane_id --- aten/src/THC/THCAsmUtils.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aten/src/THC/THCAsmUtils.cuh b/aten/src/THC/THCAsmUtils.cuh index c419fface4c299..89e97d9783ef6f 100644 --- a/aten/src/THC/THCAsmUtils.cuh +++ b/aten/src/THC/THCAsmUtils.cuh @@ -86,7 +86,7 @@ struct Bitfield { __device__ __forceinline__ int getLaneId() { #if defined(__HIP_PLATFORM_HCC__) - return hc::__lane_id(); + return __lane_id(); #else int laneId; asm("mov.s32 %0, %laneid;" : "=r"(laneId) ); From 48e6c170214890f0d12307a1139ff79bee749962 Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Mon, 17 Sep 2018 14:17:10 -0700 Subject: [PATCH 08/28] update tests --- caffe2/python/operator_test/conv_test.py | 8 ++++---- caffe2/python/operator_test/group_conv_test.py | 9 +++++---- caffe2/python/operator_test/rnn_cell_test.py | 9 ++++++++- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/caffe2/python/operator_test/conv_test.py b/caffe2/python/operator_test/conv_test.py index bca8e327031a54..c4eb708e0de801 100644 --- a/caffe2/python/operator_test/conv_test.py +++ b/caffe2/python/operator_test/conv_test.py @@ -584,8 +584,8 @@ def test_convolution_sync(self, net_type, num_workers, engine, gc, dc): m.net.SquaredL2Distance(["0_0_flat", "label"], "xent") m.net.AveragedLoss("xent", "loss") input_to_grad = m.AddGradientOperators(["loss"]) - m.Proto().device_option.CopyFrom(do) - m.param_init_net.Proto().device_option.CopyFrom(do) + m.Proto().device_option.CopyFrom(gc) + m.param_init_net.Proto().device_option.CopyFrom(gc) m.Proto().type = net_type m.Proto().num_workers = num_workers self.ws.run(m.param_init_net) @@ -597,10 +597,10 @@ def run(): for input_blob in input_blobs: self.ws.create_blob(input_blob).feed( np.random.randn(n, d, h, w).astype(np.float32), - device_option=do) + device_option=gc) self.ws.create_blob("label").feed( np.random.randn(n, d * h * w).astype(np.float32), - device_option=do) + device_option=gc) self.ws.run(m.net) gradients = [ self.ws.blobs[str(input_to_grad[input_blob])].fetch() diff --git a/caffe2/python/operator_test/group_conv_test.py b/caffe2/python/operator_test/group_conv_test.py index fd4c5adf0075d6..082c0c20c46a62 100644 --- a/caffe2/python/operator_test/group_conv_test.py +++ b/caffe2/python/operator_test/group_conv_test.py @@ -7,7 +7,7 @@ import hypothesis.strategies as st from caffe2.proto import caffe2_pb2 -from caffe2.python import core +from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu import unittest @@ -38,7 +38,8 @@ def test_group_convolution( order, engine, use_bias, gc, dc): assume(size >= kernel) # TODO: Group conv in NHWC not implemented for GPU yet. 
- assume(group == 1 or order == "NCHW" or gc.device_type != caffe2_pb2.CUDA) + gpu_device_type = caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA + assume(group == 1 or order == "NCHW" or gc.device_type != gpu_device_type) input_channels = input_channels_per_group * group output_channels = output_channels_per_group * group @@ -65,8 +66,8 @@ def test_group_convolution( w = w.transpose((0, 3, 1, 2)) inputs = [X, w, b] if use_bias else [X, w] - - self.assertDeviceChecks(dc, op, inputs, [0]) + if order != 'NHWC' or group == 1: + self.assertDeviceChecks(dc, op, inputs, [0]) for i in range(len(inputs)): self.assertGradientChecks(gc, op, inputs, i, [0]) diff --git a/caffe2/python/operator_test/rnn_cell_test.py b/caffe2/python/operator_test/rnn_cell_test.py index 9d9bb38e178517..99e9ec17078843 100644 --- a/caffe2/python/operator_test/rnn_cell_test.py +++ b/caffe2/python/operator_test/rnn_cell_test.py @@ -1174,6 +1174,10 @@ def test_lstm_extract_predictor_net(self): shapes[b] = workspace.FetchBlob(b).shape # But export in CPU + if workspace.has_hip_support: + device = core.DeviceOption(caffe2_pb2.CPU, hip_gpu_id=1) + else: + device = core.DeviceOption(caffe2_pb2.CPU, cuda_gpu_id=1) (predict_net, export_blobs) = ExtractPredictorNet( net_proto=model.net.Proto(), input_blobs=["input"], @@ -1216,7 +1220,10 @@ def test_lstm_extract_predictor_net(self): if arg.name == "step_net": for step_op in arg.n.op: self.assertEqual(0, step_op.device_option.device_type) - self.assertEqual(1, step_op.device_option.cuda_gpu_id) + if workspace.has_hip_support: + self.assertEqual(1, step_op.device_option.hip_gpu_id) + else: + self.assertEqual(1, step_op.device_option.cuda_gpu_id) elif arg.name == 'backward_step_net': self.assertEqual(caffe2_pb2.NetDef(), arg.n) From 48371a673b1ea289e94a8949b14d0b55b8a199eb Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Mon, 17 Sep 2018 14:44:57 -0700 Subject: [PATCH 09/28] update optimizer.py --- caffe2/python/optimizer.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py index 1fd7801339535c..3b372f4372e949 100644 --- a/caffe2/python/optimizer.py +++ b/caffe2/python/optimizer.py @@ -210,6 +210,13 @@ def scale_learning_rate(self, *args, **kwargs): raise NotImplementedError( "Optimizer Need to Implement `scale_learning_rate` method.") + def create_lars_inputs(self, param_init_net, weight_decay, trust, lr_max): + wd = param_init_net.ConstantFill([], "weight_decay", + shape=[1], value=weight_decay) + trust = param_init_net.ConstantFill([], "trust", shape=[1], value=trust) + lr_max = param_init_net.ConstantFill([], "lr_max", shape=[1], + value=lr_max) + return wd, trust, lr_max class SgdOptimizer(Optimizer): def __init__(self, base_learning_rate=0.01, policy='fixed', @@ -239,10 +246,13 @@ def _run(self, net, param_init_net, param_info): if self.lars is not None and not isinstance(grad, core.GradientSlice): assert self.lars >= 0, ( 'Lars offset must be nonnegative, got {}'.format(self.lars)) + wd, trust, lr_max = self.create_lars_inputs( + param_init_net, 0.0, 1.0, np.finfo(np.float32).max) lr_lars_multiplier = net.Lars( - [param, grad], + [param, grad, wd, trust, lr_max], self.make_unique_blob_name(str(param) + "_lars"), - offset=self.lars) + offset=self.lars, + lr_min=0.0) current_scope = scope.CurrentDeviceScope() self._add_local_lr_multiplier( lr_lars_multiplier, @@ -526,10 +536,13 @@ def _run(self, net, param_init_net, param_info): if self.lars is not None and not 
isinstance(grad, core.GradientSlice): assert self.lars >= 0, ( 'Lars offset must be nonnegative, got {}'.format(self.lars)) + wd, trust, lr_max = self.create_lars_inputs( + param_init_net, 0.0, 1.0, np.finfo(np.float32).max) lr_lars_multiplier = net.Lars( - [param, grad], + [param, grad, wd, trust, lr_max], self.make_unique_blob_name(str(param) + "_lars"), - offset=self.lars) + offset=self.lars, + lr_min=0.0) current_scope = scope.CurrentDeviceScope() self._add_local_lr_multiplier( lr_lars_multiplier, @@ -660,10 +673,13 @@ def _run(self, net, param_init_net, param_info): if self.lars is not None and not isinstance(grad, core.GradientSlice): assert self.lars >= 0, ( 'Lars offset must be nonnegative, got {}'.format(self.lars)) + wd, trust, lr_max = self.create_lars_inputs( + param_init_net, 0.0, 1.0, np.finfo(np.float32).max) lr_lars_multiplier = net.Lars( - [param, grad], + [param, grad, wd, trust, lr_max], self.make_unique_blob_name(str(param) + "_lars"), - offset=self.lars) + offset=self.lars, + lr_min=0.0) current_scope = scope.CurrentDeviceScope() self._add_local_lr_multiplier( lr_lars_multiplier, From 2419ec5d71574582289ac96f48bb2f595de1acf5 Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Mon, 17 Sep 2018 17:15:32 -0700 Subject: [PATCH 10/28] add ignore tests --- .jenkins/caffe2/amd/binary_tests.sh | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/.jenkins/caffe2/amd/binary_tests.sh b/.jenkins/caffe2/amd/binary_tests.sh index 132cc9c5606be0..1476ce7a0663e2 100755 --- a/.jenkins/caffe2/amd/binary_tests.sh +++ b/.jenkins/caffe2/amd/binary_tests.sh @@ -11,6 +11,16 @@ echo "Running C++ tests.." for file in $(find "${ROOT_DIR}/build_caffe2/bin" -executable -type f); do if [[ "$file" =~ "test" ]]; then - $file + case "$file" in + # skip tests we know are hanging or bad + */mkl_utils_test|*/aten/integer_divider_test) + continue + ;; + */scalar_tensor_test|*/basic|*/native_test) + continue + ;; + *) + "$file" + esac fi done From 9154b6a3d0fe5ab4071caec4bcc3f615eee13f23 Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Wed, 19 Sep 2018 22:41:41 -0700 Subject: [PATCH 11/28] resolve merge conflict --- caffe2/python/operator_test/ceil_op_test.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/caffe2/python/operator_test/ceil_op_test.py b/caffe2/python/operator_test/ceil_op_test.py index 196e66fe0ec9ea..079e669370722f 100644 --- a/caffe2/python/operator_test/ceil_op_test.py +++ b/caffe2/python/operator_test/ceil_op_test.py @@ -3,13 +3,9 @@ from __future__ import print_function from __future__ import unicode_literals -<<<<<<< HEAD from caffe2.python import core, workspace -======= -from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial ->>>>>>> upstream/master from hypothesis import given import hypothesis.strategies as st import numpy as np From c2dc0b77c68adcd88b6d66b0e42e46edbce9d98f Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Fri, 21 Sep 2018 14:48:21 -0700 Subject: [PATCH 12/28] update docker images versions in the docs --- rocm-docs/caffe2-build.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rocm-docs/caffe2-build.md b/rocm-docs/caffe2-build.md index 0050a01b3c1c61..2bec136b774581 100644 --- a/rocm-docs/caffe2-build.md +++ b/rocm-docs/caffe2-build.md @@ -19,7 +19,7 @@ Once the machine is ready with ROCm stack, there are two ways to use caffe2 ## Launch docker container with caffe2 pre-installed ``` -docker run -it --network=host 
--device=/dev/kfd --device=/dev/dri --group-add video rocm/caffe2:rocm1.8.2-v3 +docker run -it --network=host --device=/dev/kfd --device=/dev/dri --group-add video rocm/caffe2:rocm1.9-v2 ``` To run benchmarks, skip directly to benchmarks section of the document. @@ -27,7 +27,7 @@ To run benchmarks, skip directly to benchmarks section of the document. ## Build Caffe2 from source ### Pull the docker image ``` -docker pull rocm/caffe2:unbuilt-rocm1.8.2-v3 +docker pull rocm/caffe2:unbuilt-rocm1.9-v1 ``` This docker image has all the dependencies for caffe2 pre-installed. From daced6b05956be30bdd8a08a33dc2d70e60d98ae Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Mon, 24 Sep 2018 13:34:52 -0700 Subject: [PATCH 13/28] fix bug in rnn_cell test --- caffe2/python/operator_test/rnn_cell_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/caffe2/python/operator_test/rnn_cell_test.py b/caffe2/python/operator_test/rnn_cell_test.py index 99e9ec17078843..8804dc3bbdbcbc 100644 --- a/caffe2/python/operator_test/rnn_cell_test.py +++ b/caffe2/python/operator_test/rnn_cell_test.py @@ -1182,7 +1182,7 @@ def test_lstm_extract_predictor_net(self): net_proto=model.net.Proto(), input_blobs=["input"], output_blobs=[output], - device=core.DeviceOption(caffe2_pb2.CPU, 1), + device=device, ) # Create the net and run once to see it is valid From b1b216e78cc840dbde10a91806aa092a025f009d Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Mon, 24 Sep 2018 13:35:11 -0700 Subject: [PATCH 14/28] exclude fp16 for hip --- caffe2/python/operator_test/boolean_mask_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/caffe2/python/operator_test/boolean_mask_test.py b/caffe2/python/operator_test/boolean_mask_test.py index dda13c76f636be..695cb3818deb5f 100644 --- a/caffe2/python/operator_test/boolean_mask_test.py +++ b/caffe2/python/operator_test/boolean_mask_test.py @@ -47,10 +47,10 @@ def ref(x, mask): @staticmethod def _dtype_conversion(x, dtype, gc, dc): - """SequenceMask only supports fp16 with CUDA/HIP.""" + """SequenceMask only supports fp16 with CUDA.""" if dtype == np.float16: - assume(gc.device_type == caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA) - dc = [d for d in dc if d.device_type == (caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA)] + assume(gc.device_type == caffe2_pb2.CUDA) + dc = [d for d in dc if d.device_type == caffe2_pb2.CUDA] x = x.astype(dtype) return x, dc From 3309875d97de7274afccadd6e8ccd406badcc2ce Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Tue, 25 Sep 2018 11:32:54 -0700 Subject: [PATCH 15/28] add correct conditioning in workspace.py --- caffe2/python/workspace.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/caffe2/python/workspace.py b/caffe2/python/workspace.py index 9e69bace2e09c4..a1e7bf40ae7971 100644 --- a/caffe2/python/workspace.py +++ b/caffe2/python/workspace.py @@ -57,7 +57,6 @@ def GetCudaPeerAccessPattern(): GetCuDNNVersion = lambda: 0 # noqa GetCuDNNVersion = lambda: 0 # noqa GetCudaPeerAccessPattern = lambda: np.array([]) # noqa - GetDeviceProperties = lambda x: None # noqa if has_hip_support: NumHipDevices = C.num_hip_devices @@ -69,8 +68,10 @@ def GetHipPeerAccessPattern(): else: NumHipDevices = lambda: 0 # noqa GetHipPeerAccessPattern = lambda: np.array([]) # noqa - GetDeviceProperties = lambda x: None # noqa +if not has_gpu_support and not has_hip_support: + GetDeviceProperties = lambda x: None # noqa + IsNUMAEnabled = C.is_numa_enabled GetNumNUMANodes = 
C.get_num_numa_nodes GetBlobNUMANode = C.get_blob_numa_node From 413e94085494d94ea315d17ad1e65a3c7f48edd3 Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Tue, 25 Sep 2018 16:46:56 -0700 Subject: [PATCH 16/28] fix device options in hypothesis_test_util --- caffe2/python/hypothesis_test_util.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/caffe2/python/hypothesis_test_util.py b/caffe2/python/hypothesis_test_util.py index 69ee82a932856d..af8d7f6a561146 100644 --- a/caffe2/python/hypothesis_test_util.py +++ b/caffe2/python/hypothesis_test_util.py @@ -256,6 +256,7 @@ def tensors1d(n, min_len=1, max_len=64, dtype=np.float32, elements=None): # temporarily skip some flaky tests on ROCM before it's getting more mature. _device_options_no_hip = [cpu_do] + ([gpu_do] if workspace.has_gpu_support else []) device_options = _device_options_no_hip + ([hip_do] if workspace.has_hip_support else []) + _device_options_hip_or_gpu = [hip_do] if workspace.has_hip_support else [gpu_do] # Include device option for each GPU expanded_device_options = [cpu_do] + ( @@ -278,7 +279,8 @@ def gradient_checker_device_option(): ) gcs_cpu_only = dict(gc=st.sampled_from([cpu_do]), dc=st.just([cpu_do])) -gcs_gpu_only = dict(gc=st.sampled_from([gpu_do]+[hip_do]), dc=st.just([gpu_do]+[hip_do])) +gcs_gpu_only = dict(gc=st.sampled_from([gpu_do]), dc=st.just([gpu_do])) +gcs_gpu_or_hip_only = dict(gc=st.sampled_from(_device_options_hip_or_gpu), dc=st.just(_device_options_hip_or_gpu)) gcs_no_hip = dict(gc=st.sampled_from(_device_options_no_hip), dc=st.just(_device_options_no_hip)) From 30cfc8e5516d9278ef2c9ff779fb852952ccc53a Mon Sep 17 00:00:00 2001 From: Ashish Date: Tue, 25 Sep 2018 16:49:35 -0700 Subject: [PATCH 17/28] Fixed a typo --- caffe2/python/operator_test/conv_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/caffe2/python/operator_test/conv_test.py b/caffe2/python/operator_test/conv_test.py index 071ab68ebd0e58..a3cb4e3f05749e 100644 --- a/caffe2/python/operator_test/conv_test.py +++ b/caffe2/python/operator_test/conv_test.py @@ -226,9 +226,9 @@ def test_convolution_gradients( nhwc=(order == 'NHWC'), backward=True)) if engine == 'MIOPEN': - assume(_cudnn_supports(dilation=(dilation > 1), - nhwc=(order == 'NHWC'), - backward=True)) + assume(_miopen_supports(dilation=(dilation > 1), + nhwc=(order == 'NHWC'), + backward=True)) assume(engine != "MKLDNN" or use_bias is True) From 402461b8396a5a6ce599da234f5fcae6f5c4734b Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Tue, 25 Sep 2018 17:28:00 -0700 Subject: [PATCH 18/28] fix typo --- caffe2/python/hypothesis_test_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/caffe2/python/hypothesis_test_util.py b/caffe2/python/hypothesis_test_util.py index af8d7f6a561146..04700cec76f313 100644 --- a/caffe2/python/hypothesis_test_util.py +++ b/caffe2/python/hypothesis_test_util.py @@ -256,7 +256,7 @@ def tensors1d(n, min_len=1, max_len=64, dtype=np.float32, elements=None): # temporarily skip some flaky tests on ROCM before it's getting more mature. 
_device_options_no_hip = [cpu_do] + ([gpu_do] if workspace.has_gpu_support else []) device_options = _device_options_no_hip + ([hip_do] if workspace.has_hip_support else []) - _device_options_hip_or_gpu = [hip_do] if workspace.has_hip_support else [gpu_do] +_device_options_hip_or_gpu = [hip_do] if workspace.has_hip_support else [gpu_do] # Include device option for each GPU expanded_device_options = [cpu_do] + ( From 4e6de586497b9cbd136dceb5d26e7bfb57ac005f Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Wed, 26 Sep 2018 13:08:53 -0700 Subject: [PATCH 19/28] modify resnet benchmarking script --- caffe2/python/convnet_benchmarks.py | 128 ++++++++++++++++++---------- 1 file changed, 85 insertions(+), 43 deletions(-) diff --git a/caffe2/python/convnet_benchmarks.py b/caffe2/python/convnet_benchmarks.py index 6b8af12f5cec71..fe9420902ad928 100644 --- a/caffe2/python/convnet_benchmarks.py +++ b/caffe2/python/convnet_benchmarks.py @@ -61,8 +61,10 @@ import argparse import os -from caffe2.python import workspace, brew, model_helper, core +from caffe2.python import workspace, brew, model_helper, core, net_drawer, memonger from caffe2.proto import caffe2_pb2 +from caffe2.python import data_parallel_model as dpm +from caffe2.python.models import resnet def MLP(order, cudnn_ws, model_path=""): model = model_helper.ModelHelper(name="MLP") @@ -563,45 +565,81 @@ def Inception(order, cudnn_ws, model_path=""): model.net.AveragedLoss(xent, "loss") return model, 224 -def Resnet50(order, cudnn_ws, model_path=""): - if model_path == "": - print("ERROR: please specify paths to init_net and predict_net protobufs for Resnet50") - exit(1) - device_opts = caffe2_pb2.DeviceOption() - device_opts.device_type = caffe2_pb2.HIP - device_opts.hip_gpu_id = 0 - INIT_NET_PB = os.path.join(model_path, "init_net.pb") - PREDICT_NET_PB = os.path.join(model_path, "predict_net.pb") - init_def = caffe2_pb2.NetDef() - with open(INIT_NET_PB, 'rb') as f: - init_def.ParseFromString(f.read()) - init_def.device_option.CopyFrom(device_opts) +def Resnet50(args): + gpus = [0] + device_opt = core.DeviceOption(caffe2_pb2.HIP) + device_opt.hip_gpu_id = gpus[0] + + # Batch size of 32 sums up to roughly 5GB of memory per device + batch_per_device = args.batch_size + total_batch_size = batch_per_device * len(gpus) + + num_labels = 1000 + + base_learning_rate = 0.0004 * total_batch_size + + # Weight decay (L2 regularization) + weight_decay = 1e-4 + + num_epochs = 10 + + ################## + # Define the Model + ################## + train_model = model_helper.ModelHelper(name="resnet50_train") + + def create_resnet50_model_ops(model, loss_scale=1.0): + # residual network + [softmax, loss] = resnet.create_resnet50(model, + "data", + num_input_channels=3, + num_labels=num_labels, + label="label", ) + prefix = model.net.Proto().name + loss = model.net.Scale(loss, prefix + "_loss", scale=loss_scale) + brew.accuracy(model, [softmax, "label"], prefix + "_accuracy") + return [loss] + + def add_parameter_update_ops(model): + brew.add_weight_decay(model, weight_decay) + iter = brew.iter(model, "iter") + lr = model.net.LearningRate([iter], + "lr", + base_lr=base_learning_rate, + policy="fixed", + gamma=0.1) + for param in model.GetParams(): + param_grad = model.param_to_grad[param] + param_momentum = model.param_init_net.ConstantFill( + [param], param + '_momentum', value=0.0 ) + + model.net.MomentumSGDUpdate( + [param_grad, param_momentum, lr, param], + [param_grad, param_momentum, param], + momentum=0.9, + nesterov=1) + + def 
optimize_gradient_memory(model, loss): + model.net._net = memonger.share_grad_blobs( + model.net, + loss, + set(model.param_to_grad.values()), + namescope="", + share_activations=False) + + + with core.NameScope(""): + with core.DeviceScope(device_opt): + losses = create_resnet50_model_ops(train_model) + if not args.forward_only: + blobs_to_gradients = train_model.AddGradientOperators(losses) + add_parameter_update_ops(train_model) + if not args.forward_only: + optimize_gradient_memory(train_model, [blobs_to_gradients[losses[0]]]) + + return train_model, 224 - net_def = caffe2_pb2.NetDef() - with open(PREDICT_NET_PB, 'rb') as f: - net_def.ParseFromString(f.read()) - net_def.device_option.CopyFrom(device_opts) - - init_net = core.Net(init_def) - predict_net = core.Net(net_def) - for op in init_net.Proto().op: - op.device_option.CopyFrom(device_opts) - for op in predict_net.Proto().op: - op.device_option.CopyFrom(device_opts) - my_arg_scope = { - 'order': order, - } - model = model_helper.ModelHelper( - name="resnet50", - arg_scope=my_arg_scope, - ) - - model.param_init_net = init_net - model.net = predict_net - xent = model.net.LabelCrossEntropy(["gpu_0/softmax", "label"], "xent") - model.net.AveragedLoss(xent, "loss") - return model, 224 def Inception_v2(order, cudnn_ws, model_path=""): if model_path == "": @@ -654,7 +692,10 @@ def AddParameterUpdate(model): def Benchmark(model_gen, arg): - model, input_size = model_gen(arg.order, arg.cudnn_ws, arg.model_path) + if arg.model == 'Resnet50': + model, input_size = model_gen(arg) + else: + model, input_size = model_gen(arg.order, arg.cudnn_ws, arg.model_path) model.Proto().type = arg.net_type model.Proto().num_workers = arg.num_workers @@ -669,7 +710,7 @@ def Benchmark(model_gen, arg): model.param_init_net.GaussianFill( [], - "gpu_0/data" if arg.model == "Resnet50" else "data", + "data", shape=input_shape, mean=0.0, std=1.0 @@ -686,8 +727,9 @@ def Benchmark(model_gen, arg): print('{}: running forward only.'.format(arg.model)) else: print('{}: running forward-backward.'.format(arg.model)) - model.AddGradientOperators(["loss"]) - AddParameterUpdate(model) + if not arg.model == "Resnet50": + model.AddGradientOperators(["loss"]) + AddParameterUpdate(model) if arg.order == 'NHWC': print( '==WARNING==\n' @@ -805,6 +847,6 @@ def GetArgumentParser(): 'MLP': MLP, 'Resnet50': Resnet50, 'Inception_v2':Inception_v2 - } Benchmark(model_map[args.model], args) + From 98ba4a1fd85a4564431ed5acb64971fd71fd415f Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Wed, 26 Sep 2018 13:10:56 -0700 Subject: [PATCH 20/28] filter group > 1 in conv_test --- caffe2/python/operator_test/conv_test.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/caffe2/python/operator_test/conv_test.py b/caffe2/python/operator_test/conv_test.py index 071ab68ebd0e58..bca477c0278958 100644 --- a/caffe2/python/operator_test/conv_test.py +++ b/caffe2/python/operator_test/conv_test.py @@ -45,9 +45,10 @@ def _miopen_supports( dilation=False, nhwc=False, backward=False, + group=False, ): """Return True if MIOPEN supports this configuration.""" - if nhwc or dilation: + if nhwc or dilation or group: return False return True @@ -226,9 +227,10 @@ def test_convolution_gradients( nhwc=(order == 'NHWC'), backward=True)) if engine == 'MIOPEN': - assume(_cudnn_supports(dilation=(dilation > 1), + assume(_miopen_supports(dilation=(dilation > 1), nhwc=(order == 'NHWC'), - backward=True)) + backward=True, + group=(group > 1))) assume(engine != "MKLDNN" or use_bias is True) From 
83b85e0d9d202b2531d61d71dbff64076d377d57 Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Wed, 26 Sep 2018 13:43:25 -0700 Subject: [PATCH 21/28] fix whitespaces Dependencies.cmake --- cmake/Dependencies.cmake | 1 - 1 file changed, 1 deletion(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 7498b4540915a7..869a563d05a27a 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -548,7 +548,6 @@ if(NOT BUILD_ATEN_MOBILE) list(APPEND HIP_HIPCC_FLAGS -Wno-shift-count-negative) list(APPEND HIP_HIPCC_FLAGS -Wno-shift-count-overflow) list(APPEND HIP_HIPCC_FLAGS -Wno-unused-command-line-argument) - list(APPEND HIP_HIPCC_FLAGS -Wno-duplicate-decl-specifier) if(CMAKE_BUILD_TYPE MATCHES Debug) From 775d102613944e68c3d1a021d712500ea1ca8ca6 Mon Sep 17 00:00:00 2001 From: Ashish Date: Wed, 26 Sep 2018 13:56:31 -0700 Subject: [PATCH 22/28] Using MIOpen group conv API --- caffe2/operators/hip/conv_op_miopen.cc | 536 ++++++------------------- 1 file changed, 128 insertions(+), 408 deletions(-) diff --git a/caffe2/operators/hip/conv_op_miopen.cc b/caffe2/operators/hip/conv_op_miopen.cc index 5b9df03dc78953..13f2428bf9daa2 100644 --- a/caffe2/operators/hip/conv_op_miopen.cc +++ b/caffe2/operators/hip/conv_op_miopen.cc @@ -48,7 +48,11 @@ class MIOPENConvOpBase : public ConvPoolOpBase { if ((operator_def.type().substr(0, 6) == "Conv") || (operator_def.type().substr(0, 14) == "ConvGradient")) { - mode_ = miopenConvolution; + if(group_ > 1) { + mode_ = miopenGroupConv; + } else{ + mode_ = miopenConvolution; + } } else if ( (operator_def.type().substr(0, 7) == "Trans") || (operator_def.type().substr(0, 15) == "TransGradient")) { @@ -57,6 +61,12 @@ class MIOPENConvOpBase : public ConvPoolOpBase { LOG(FATAL) << "Unsupported convolution method: " << operator_def.type(); } + if(mode_ == miopenGroupConv) { + OPERATOR_NEEDS_FEATURE( + dilation_h() == 1 && dilation_w() == 1, + "MIOpen convolution does not support dilation for groups > 1."); + } + MIOPEN_ENFORCE(miopenInitConvolutionDescriptor( conv_desc_, mode_, @@ -66,6 +76,9 @@ class MIOPENConvOpBase : public ConvPoolOpBase { stride_w(), dilation_h(), dilation_w())); + + MIOPEN_ENFORCE(miopenSetConvolutionGroupCount( + conv_desc_, group_)); } ~MIOPENConvOpBase() { @@ -155,9 +168,6 @@ class MIOPENConvGradientOp final : public MIOPENConvOpBase { bwdDataWsSize_(0), bwdWeiAlgo_(miopenConvolutionBwdWeightsAlgoGEMM), bwdDataAlgo_(miopenConvolutionBwdDataAlgoGEMM) { - OPERATOR_NEEDS_FEATURE( - group_ == 1, - "Group convolution not supported yet for MIOpen ConvGradient."); CAFFE_ENFORCE( !(no_bias_ && OutputSize() == 3), "If bias is not present, you should not have 3 grad output."); @@ -247,186 +257,70 @@ bool MIOPENConvOp::DoRunWithType() { "If you set group, the number of output channels should be divisible " "by group."); - if (group_ > 1) { - int group_offset_filter = Weight.size() / group_; - - MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - weight_desc_, - miopenTypeWrapper::type, - M / group_, - C / group_, - kernel_h(), - kernel_w())); - - MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - bottom_desc_, miopenTypeWrapper::type, N, C / group_, H, W)); - - MIOPEN_ENFORCE(miopenGetConvolutionForwardOutputDim( - conv_desc_, - bottom_desc_, - weight_desc_, - &N_out, - &C_out, - &H_out, - &W_out)); - - MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - top_desc_, miopenTypeWrapper::type, N_out, C_out, H_out, W_out)); - - if (InputSize() == 3) { - MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - bias_desc_, miopenTypeWrapper::type, 1, Y->dim32(1), 1, 
1)); + MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( + bottom_desc_, miopenTypeWrapper::type, N, C, H, W)); + + MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( + weight_desc_, + miopenTypeWrapper::type, + M, + C / group_, + kernel_h(), + kernel_w())); + + MIOPEN_ENFORCE(miopenGetConvolutionForwardOutputDim( + conv_desc_, + bottom_desc_, + weight_desc_, + &N_out, + &C_out, + &H_out, + &W_out)); + + MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( + top_desc_, miopenTypeWrapper::type, N_out, C_out, H_out, W_out)); + + if (InputSize() == 3) { MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - top_desc_for_bias_, - miopenTypeWrapper::type, - Y->dim32(0), - Y->dim32(1), - H_out, - W_out)); - } - - int group_offset_X = C / group_ * H * W * D; - int group_offset_Y = M / group_ * H_out * W_out * D_out; + bias_desc_, miopenTypeWrapper::type, 1, M, 1, 1)); + } - while (!bestAlgoFound_) { + while (!bestAlgoFound_) { miopenConvAlgoPerf_t perf; MIOPEN_ENFORCE(miopenConvolutionForwardGetWorkSpaceSize( - miopen_wrapper_.inline_miopen_handle(), - weight_desc_, - bottom_desc_, - conv_desc_, - top_desc_, - &fwdConvWsSize_)); - if ((fwdConvWsSize_ > 0) && (fwdConvWs_ == nullptr)) { - HIP_CHECK(hipMalloc(&fwdConvWs_, fwdConvWsSize_)); - } - - miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { - MIOPEN_ENFORCE(miopenFindConvolutionForwardAlgorithm( - state->miopen_handle(), - bottom_desc_, - X.template data(), + miopen_wrapper_.inline_miopen_handle(), weight_desc_, - Weight.template data(), - conv_desc_, - top_desc_, - Y->template mutable_data(), - requestAlgoCount_, - &returnedAlgoCount_, - &perf, - fwdConvWs_, - fwdConvWsSize_, - false)); - }); - bestAlgoFound_ = true; - fwdAlgo_ = perf.fwd_algo; - } - - for (int g = 0; g < group_; g++) { - miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { - MIOPEN_ENFORCE(miopenConvolutionForward( - state->miopen_handle(), - &alpha_, bottom_desc_, - X.template data() + g * group_offset_X, - weight_desc_, - Weight.template data() + g * group_offset_filter, conv_desc_, - fwdAlgo_, - &beta_, top_desc_, - Y->template mutable_data() + g * group_offset_Y, - fwdConvWs_, - fwdConvWsSize_)); - }); - } - hipDeviceSynchronize(); - - // BIAS - if (InputSize() == 3) { - auto& bias = Input(BIAS); - - CAFFE_ENFORCE_EQ(bias.ndim(), 1); - CAFFE_ENFORCE_EQ(bias.dim32(0), M); - miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { - MIOPEN_ENFORCE(miopenConvolutionForwardBias( - state->miopen_handle(), - &alpha_, - bias_desc_, - bias.template data(), - &beta_, - top_desc_for_bias_, - Y->template mutable_data())); - }); - } - - hipDeviceSynchronize(); - } else { - MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - weight_desc_, - miopenTypeWrapper::type, - M, - C, - kernel_h(), - kernel_w())); - - MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - bottom_desc_, miopenTypeWrapper::type, N, C, H, W)); - - MIOPEN_ENFORCE(miopenGetConvolutionForwardOutputDim( - conv_desc_, - bottom_desc_, - weight_desc_, - &N_out, - &C_out, - &H_out, - &W_out)); - - MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - top_desc_, miopenTypeWrapper::type, N_out, C_out, H_out, W_out)); - - if (InputSize() == 3) { - MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - bias_desc_, miopenTypeWrapper::type, 1, C_out, 1, 1)); - } - - while (!bestAlgoFound_) { - miopenConvAlgoPerf_t perf; - - MIOPEN_ENFORCE(miopenConvolutionForwardGetWorkSpaceSize( - miopen_wrapper_.inline_miopen_handle(), - weight_desc_, - bottom_desc_, - conv_desc_, - top_desc_, - &fwdConvWsSize_)); - + &fwdConvWsSize_)); if 
((fwdConvWsSize_ > 0) && (fwdConvWs_ == nullptr)) { - HIP_CHECK(hipMalloc(&fwdConvWs_, fwdConvWsSize_)); + HIP_CHECK(hipMalloc(&fwdConvWs_, fwdConvWsSize_)); } miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { - MIOPEN_ENFORCE(miopenFindConvolutionForwardAlgorithm( - state->miopen_handle(), - bottom_desc_, - X.template data(), - weight_desc_, - Weight.template data(), - conv_desc_, - top_desc_, - Y->template mutable_data(), - requestAlgoCount_, - &returnedAlgoCount_, - &perf, - fwdConvWs_, - fwdConvWsSize_, - false)); + MIOPEN_ENFORCE(miopenFindConvolutionForwardAlgorithm( + state->miopen_handle(), + bottom_desc_, + X.template data(), + weight_desc_, + Weight.template data(), + conv_desc_, + top_desc_, + Y->template mutable_data(), + requestAlgoCount_, + &returnedAlgoCount_, + &perf, + fwdConvWs_, + fwdConvWsSize_, + false)); }); bestAlgoFound_ = true; fwdAlgo_ = perf.fwd_algo; - } - miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { + } + + miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { MIOPEN_ENFORCE(miopenConvolutionForward( state->miopen_handle(), &alpha_, @@ -441,10 +335,9 @@ bool MIOPENConvOp::DoRunWithType() { Y->template mutable_data(), fwdConvWs_, fwdConvWsSize_)); - }); + }); - // BIAS - if (InputSize() == 3) { + if (InputSize() == 3) { auto& bias = Input(BIAS); CAFFE_ENFORCE_EQ(bias.ndim(), 1); @@ -459,13 +352,10 @@ bool MIOPENConvOp::DoRunWithType() { top_desc_, Y->template mutable_data())); }); - } - - hipDeviceSynchronize(); } - return true; } + // TODO : enable fp16 support. bool MIOPENConvOp::RunOnDevice() { if (Input(0).IsType()) { @@ -535,215 +425,46 @@ bool MIOPENConvGradientOp::DoRunWithType() { bool doBwdDataComputation = (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))); - if (group_ > 1) { - int group_offset_filter = Weight.size() / group_; - MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - weight_desc_, - miopenTypeWrapper::type, - M / group_, - C / group_, - kernel_h(), - kernel_w())); - - MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - bottom_desc_, miopenTypeWrapper::type, N, C / group_, H, W)); - - MIOPEN_ENFORCE(miopenGetConvolutionForwardOutputDim( - conv_desc_, - bottom_desc_, - weight_desc_, - &N_out, - &C_out, - &H_out, - &W_out)); - - MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - top_desc_, miopenTypeWrapper::type, N_out, C_out, H_out, W_out)); - - if (!no_bias_) { + MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( + bottom_desc_, miopenTypeWrapper::type, N, C, H, W)); + + MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( + weight_desc_, + miopenTypeWrapper::type, + M, + C / group_, + kernel_h(), + kernel_w())); + + MIOPEN_ENFORCE(miopenGetConvolutionForwardOutputDim( + conv_desc_, + bottom_desc_, + weight_desc_, + &N_out, + &C_out, + &H_out, + &W_out)); + + MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( + top_desc_, miopenTypeWrapper::type, N_out, C_out, H_out, W_out)); + + if (!no_bias_) { MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( bias_desc_, miopenTypeWrapper::type, 1, M, 1, 1)); - MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - top_desc_for_bias_, - miopenTypeWrapper::type, - dY.dim32(0), - M, - H_out, - W_out)); - } - - int group_offset_X = C / group_ * H * W * D; - int group_offset_Y = M / group_ * H_out * W_out * D_out; + } - while ((!bestDataAlgoFound_) && doBwdDataComputation) { + while ((!bestDataAlgoFound_) && doBwdDataComputation) { miopenConvAlgoPerf_t perf; MIOPEN_ENFORCE(miopenConvolutionBackwardDataGetWorkSpaceSize( - miopen_wrapper_.inline_miopen_handle(), - top_desc_, - 
weight_desc_, - conv_desc_, - bottom_desc_, - &bwdDataWsSize_)); - if ((bwdDataWsSize_ > 0) && (bwdDataWs_ == nullptr)) { - HIP_CHECK(hipMalloc(&bwdDataWs_, bwdDataWsSize_)); - } - - miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { - MIOPEN_ENFORCE(miopenFindConvolutionBackwardDataAlgorithm( - state->miopen_handle(), + miopen_wrapper_.inline_miopen_handle(), top_desc_, - dY.template data(), weight_desc_, - Weight.template data(), - conv_desc_, - bottom_desc_, - dX->template mutable_data(), - requestAlgoCount_, - &returnedAlgoCount_, - &perf, - bwdDataWs_, - bwdDataWsSize_, - false)); - }); - - bestDataAlgoFound_ = true; - bwdDataAlgo_ = perf.bwd_data_algo; - } - - while (!bestWeightAlgoFound_) { - miopenConvAlgoPerf_t perf; - - MIOPEN_ENFORCE(miopenConvolutionBackwardWeightsGetWorkSpaceSize( - miopen_wrapper_.inline_miopen_handle(), - top_desc_, - bottom_desc_, - conv_desc_, - weight_desc_, - &bwdWeightWsSize_)); - if ((bwdWeightWsSize_ > 0) && (bwdWeightWs_ == nullptr)) { - HIP_CHECK(hipMalloc(&bwdWeightWs_, bwdWeightWsSize_)); - } - - miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { - MIOPEN_ENFORCE(miopenFindConvolutionBackwardWeightsAlgorithm( - state->miopen_handle(), - top_desc_, - dY.template data(), - bottom_desc_, - X.template data(), conv_desc_, - weight_desc_, - dW->template mutable_data(), - requestAlgoCount_, - &returnedAlgoCount_, - &perf, - bwdWeightWs_, - bwdWeightWsSize_, - false)); - }); - bestWeightAlgoFound_ = true; - bwdWeiAlgo_ = perf.bwd_weights_algo; - } - - for (int g = 0; g < group_; g++) { - if (doBwdDataComputation) { - miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { - MIOPEN_ENFORCE(miopenConvolutionBackwardData( - state->miopen_handle(), - &alpha_, - top_desc_, - dY.template data() + g * group_offset_Y, - weight_desc_, - Weight.template data() + g * group_offset_filter, - conv_desc_, - bwdDataAlgo_, - &beta_, - bottom_desc_, - dX->template mutable_data() + g * group_offset_X, - bwdDataWs_, - bwdDataWsSize_)); - }); - } - - miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { - MIOPEN_ENFORCE(miopenConvolutionBackwardWeights( - state->miopen_handle(), - &alpha_, - top_desc_, - dY.template data() + g * group_offset_Y, bottom_desc_, - X.template data() + g * group_offset_X, - conv_desc_, - bwdWeiAlgo_, - &beta_, - weight_desc_, - dW->template mutable_data() + g * group_offset_filter, - bwdWeightWs_, - bwdWeightWsSize_)); - }); - } - - // Synchronize the work across groups. 
- hipDeviceSynchronize(); - - ////////////////////////////////////// BIAS /////////////////////////// - if (!no_bias_) { - auto* dbias = Output(BIAS_OR_INPUT_GRAD); - dbias->Resize(M); - miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { - MIOPEN_ENFORCE(miopenConvolutionBackwardBias( - state->miopen_handle(), - &alpha_, - top_desc_for_bias_, - dY.template data(), - &beta_, - bias_desc_, - dbias->template mutable_data())); - }); - } - } else // No group - { - MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - weight_desc_, - miopenTypeWrapper::type, - M, - C, - kernel_h(), - kernel_w())); - - MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - bottom_desc_, miopenTypeWrapper::type, N, C, H, W)); - - MIOPEN_ENFORCE(miopenGetConvolutionForwardOutputDim( - conv_desc_, - bottom_desc_, - weight_desc_, - &N_out, - &C_out, - &H_out, - &W_out)); - - MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - top_desc_, miopenTypeWrapper::type, N_out, C_out, H_out, W_out)); - - if (!no_bias_) { - MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - bias_desc_, miopenTypeWrapper::type, 1, M, 1, 1)); - } - - while ((!bestDataAlgoFound_) && doBwdDataComputation) { - miopenConvAlgoPerf_t perf; - - MIOPEN_ENFORCE(miopenConvolutionBackwardDataGetWorkSpaceSize( - miopen_wrapper_.inline_miopen_handle(), - top_desc_, - weight_desc_, - conv_desc_, - bottom_desc_, - &bwdDataWsSize_)); + &bwdDataWsSize_)); if ((bwdDataWsSize_ > 0) && (bwdDataWs_ == nullptr)) { - HIP_CHECK(hipMalloc(&bwdDataWs_, bwdDataWsSize_)); + HIP_CHECK(hipMalloc(&bwdDataWs_, bwdDataWsSize_)); } miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { @@ -766,22 +487,22 @@ bool MIOPENConvGradientOp::DoRunWithType() { bestDataAlgoFound_ = true; bwdDataAlgo_ = perf.bwd_data_algo; - } + } - while (!bestWeightAlgoFound_) { + while (!bestWeightAlgoFound_) { miopenConvAlgoPerf_t perf; MIOPEN_ENFORCE(miopenConvolutionBackwardWeightsGetWorkSpaceSize( - miopen_wrapper_.inline_miopen_handle(), - top_desc_, - bottom_desc_, - conv_desc_, - weight_desc_, - &bwdWeightWsSize_)); + miopen_wrapper_.inline_miopen_handle(), + top_desc_, + bottom_desc_, + conv_desc_, + weight_desc_, + &bwdWeightWsSize_)); if ((bwdWeightWsSize_ > 0) && (bwdWeightWs_ == nullptr)) { HIP_CHECK(hipMalloc(&bwdWeightWs_, bwdWeightWsSize_)); } - + miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { MIOPEN_ENFORCE(miopenFindConvolutionBackwardWeightsAlgorithm( state->miopen_handle(), @@ -801,9 +522,9 @@ bool MIOPENConvGradientOp::DoRunWithType() { }); bestWeightAlgoFound_ = true; bwdWeiAlgo_ = perf.bwd_weights_algo; - } + } - if (doBwdDataComputation) { + if (doBwdDataComputation) { miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { MIOPEN_ENFORCE(miopenConvolutionBackwardData( state->miopen_handle(), @@ -820,30 +541,30 @@ bool MIOPENConvGradientOp::DoRunWithType() { bwdDataWs_, bwdDataWsSize_)); }); - } - - miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { - MIOPEN_ENFORCE(miopenConvolutionBackwardWeights( - state->miopen_handle(), - &alpha_, - top_desc_, - dY.template data(), - bottom_desc_, - X.template data(), - conv_desc_, - bwdWeiAlgo_, - &beta_, - weight_desc_, - dW->template mutable_data(), - bwdWeightWs_, - bwdWeightWsSize_)); - }); - - // Synchronize the work across groups. 
- hipDeviceSynchronize(); + } - ////////////////////////////////////// BIAS /////////////////////////// - if (!no_bias_) { + miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { + MIOPEN_ENFORCE(miopenConvolutionBackwardWeights( + state->miopen_handle(), + &alpha_, + top_desc_, + dY.template data(), + bottom_desc_, + X.template data(), + conv_desc_, + bwdWeiAlgo_, + &beta_, + weight_desc_, + dW->template mutable_data(), + bwdWeightWs_, + bwdWeightWsSize_)); + }); + + // Synchronize the work across groups. + hipDeviceSynchronize(); + + ////////////////////////////////////// BIAS /////////////////////////// + if (!no_bias_) { auto* dbias = Output(BIAS_OR_INPUT_GRAD); dbias->Resize(M); miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { @@ -856,7 +577,6 @@ bool MIOPENConvGradientOp::DoRunWithType() { bias_desc_, dbias->template mutable_data())); }); - } } return true; From 0c8b222bacb9d6f1e64ac8e1a48d7c25bcf38e27 Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Wed, 26 Sep 2018 14:06:58 -0700 Subject: [PATCH 23/28] address comments, whitespaces --- caffe2/python/core.py | 2 +- caffe2/python/data_parallel_model.py | 3 +++ caffe2/python/operator_test/activation_ops_test.py | 2 +- caffe2/python/optimizer.py | 5 ++++- 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/caffe2/python/core.py b/caffe2/python/core.py index 75e7ccc3f53630..f08d72347c15b8 100644 --- a/caffe2/python/core.py +++ b/caffe2/python/core.py @@ -2118,7 +2118,7 @@ def RunAllOnGPU(self, gpu_id=0, use_cudnn=False): self._net.device_option.CopyFrom(device_option) if use_cudnn: for op in self._net.op: - op.engine = 'CUDNN' + op.engine = "CUDNN" def RunAllOnMKL(self): """A convenient function to run everything using MKLDNN.""" diff --git a/caffe2/python/data_parallel_model.py b/caffe2/python/data_parallel_model.py index 4b1aa22a282dc7..b6d425a699f220 100644 --- a/caffe2/python/data_parallel_model.py +++ b/caffe2/python/data_parallel_model.py @@ -1020,6 +1020,7 @@ def _Broadcast(devices, model, net, param, use_nccl=False): root=0, ) return + for dev_idx in devices[1:]: if _IsGPUBlob(model, param): if workspace.has_hip_support: @@ -1181,6 +1182,7 @@ def _SyncAllParamsDistributed( max_concurrent_distributed_ops ): assert rendezvous['num_shards'] > 1 + if workspace.has_hip_support: gpu_device_opt = core.DeviceOption(model._device_type, hip_gpu_id=devices[0]) else: @@ -1359,6 +1361,7 @@ def _AllReduceBlobsDistributed( num_workers = model.net.Proto().num_workers assert num_workers > 1, "Please specify more than 1 worker" all_reduce_engine = rendezvous['engine'] + if workspace.has_hip_support: master_device_opt = core.DeviceOption(model._device_type, hip_gpu_id=devices[0]) else: diff --git a/caffe2/python/operator_test/activation_ops_test.py b/caffe2/python/operator_test/activation_ops_test.py index aff55bcb7834de..200647c95998e4 100644 --- a/caffe2/python/operator_test/activation_ops_test.py +++ b/caffe2/python/operator_test/activation_ops_test.py @@ -16,7 +16,7 @@ import unittest -class TestActivations(hu.HypothesisTestCase): +class TestActivations(serial.SerializedTestCase): @serial.given(X=hu.tensor(), in_place=st.booleans(), engine=st.sampled_from(["", "MIOPEN" if workspace.has_hip_support else "CUDNN"]), **mu.gcs) def test_relu(self, X, in_place, engine, gc, dc): diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py index 799c3aa36fd2ca..b9838ed0ee1950 100644 --- a/caffe2/python/optimizer.py +++ b/caffe2/python/optimizer.py @@ -218,6 +218,7 @@ def 
create_lars_inputs(self, param_init_net, weight_decay, trust, lr_max): value=lr_max) return wd, trust, lr_max + class SgdOptimizer(Optimizer): def __init__(self, base_learning_rate=0.01, policy='fixed', momentum=0.0, nesterov=1, sparse_dedup_aggregator=None, @@ -543,6 +544,7 @@ def _run(self, net, param_init_net, param_info): self.make_unique_blob_name(str(param) + "_lars"), offset=self.lars, lr_min=0.0) + current_scope = scope.CurrentDeviceScope() self._add_local_lr_multiplier( lr_lars_multiplier, @@ -1453,7 +1455,8 @@ def build_ftrl(model, engine="SIMD", **kwargs): def build_gftrl(model, engine="", **kwargs): - # SIMD version of GFTRL is not supported + if engine == "SIMD": + assert core.IsOperator('GFtrl_ENGINE_SIMD') gftrl_optimizer = GFtrlOptimizer(engine=engine, **kwargs) return _build(model, gftrl_optimizer) From 8bed68c63a0d9b98b672590aaab1972324f82563 Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Wed, 26 Sep 2018 14:12:39 -0700 Subject: [PATCH 24/28] will instead add a readme for this soon --- .../caffe2/amd/install_dependencies_amd.sh | 87 ------------------- 1 file changed, 87 deletions(-) delete mode 100755 .jenkins/caffe2/amd/install_dependencies_amd.sh diff --git a/.jenkins/caffe2/amd/install_dependencies_amd.sh b/.jenkins/caffe2/amd/install_dependencies_amd.sh deleted file mode 100755 index 51c2fe98ce4aa0..00000000000000 --- a/.jenkins/caffe2/amd/install_dependencies_amd.sh +++ /dev/null @@ -1,87 +0,0 @@ -#!/bin/bash - -set -ex - -apt-get update -apt-get install -y --no-install-recommends \ - autoconf \ - build-essential \ - ca-certificates \ - cmake \ - curl \ - git \ - libgoogle-glog-dev \ - libhiredis-dev \ - libiomp-dev \ - libleveldb-dev \ - liblmdb-dev \ - libopencv-dev \ - libpthread-stubs0-dev \ - libsnappy-dev \ - libprotobuf-dev \ - protobuf-compiler \ - python-dev \ - python-setuptools \ - python-pip - -pip install networkx==2.0 - -pip install --user \ - click \ - future \ - hypothesis \ - jupyter \ - numpy \ - protobuf \ - pytest \ - pyyaml \ - scipy==0.19.1 \ - scikit-image \ - tabulate \ - virtualenv \ - mock \ - typing \ - typing-extensions \ - pyyaml - -CLANG_VERSION=3.8 - -apt-get update -apt-get install -y --no-install-recommends clang-"$CLANG_VERSION" -rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* - -# Use update-alternatives to make this version the default -update-alternatives --install /usr/bin/gcc gcc /usr/bin/clang-"$CLANG_VERSION" 50 -update-alternatives --install /usr/bin/g++ g++ /usr/bin/clang++-"$CLANG_VERSION" 50 - - -apt-get install -y --no-install-recommends \ - rocm-dev \ - rocm-utils \ - rocm-libs \ - rocfft \ - miopen-hip \ - miopengemm \ - rocblas \ - rocrand - - -if [[ -f /opt/rocm/hip/cmake/FindHIP.cmake ]]; then - sudo sed -i 's/\ -I${dir}/\ $<$:-I${dir}>/' /opt/rocm/hip/cmake/FindHIP.cmake -fi - -if [[ -f /opt/rocm/hip/cmake/FindHIP.cmake ]]; then - sudo sed -i 's/set(_hip_build_configuration "${CMAKE_BUILD_TYPE}")/string(TOUPPER _hip_build_configuration "${CMAKE_BUILD_TYPE}")/' /opt/rocm/hip/cmake/FindHIP.cmake -fi - -git clone --recursive https://github.com/ROCmSoftwarePlatform/Thrust.git /data/Thrust -rm -rf /data/Thrust/thrust/system/cuda/detail/cub-hip -git clone --recursive https://github.com/ROCmSoftwarePlatform/cub-hip.git /data/Thrust/thrust/system/cuda/detail/cub-hip - - -export PATH=/opt/rocm/bin:$PATH -export PATH=/opt/rocm/hcc/bin:$PATH -export PATH=/opt/rocm/hip/bin:$PATH -export PATH=/opt/rocm/opencl/bin:$PATH -export THRUST_ROOT=/data/Thrust -export HIP_PLATFORM=hcc From 65f24b16792652ee505a20769aadab7a8c792800 Mon Sep 
17 00:00:00 2001
From: rohithkrn
Date: Wed, 26 Sep 2018 15:41:19 -0700
Subject: [PATCH 25/28] remove unused variables

---
 caffe2/python/convnet_benchmarks.py | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/caffe2/python/convnet_benchmarks.py b/caffe2/python/convnet_benchmarks.py
index fe9420902ad928..dfb729e9aeb90e 100644
--- a/caffe2/python/convnet_benchmarks.py
+++ b/caffe2/python/convnet_benchmarks.py
@@ -570,20 +570,12 @@ def Resnet50(args):
     gpus = [0]
     device_opt = core.DeviceOption(caffe2_pb2.HIP)
     device_opt.hip_gpu_id = gpus[0]
-
-    # Batch size of 32 sums up to roughly 5GB of memory per device
-    batch_per_device = args.batch_size
-    total_batch_size = batch_per_device * len(gpus)
-
     num_labels = 1000
-
-    base_learning_rate = 0.0004 * total_batch_size
+    base_learning_rate = 0.0004 * args.batch_size

     # Weight decay (L2 regularization)
     weight_decay = 1e-4
-    num_epochs = 10
-
     ##################
     # Define the Model
     ##################

From fc7fcc1323adeaa793c5e92227096cb8e69db45b Mon Sep 17 00:00:00 2001
From: rohithkrn
Date: Wed, 26 Sep 2018 20:26:58 -0700
Subject: [PATCH 26/28] whitespace changes

---
 caffe2/python/convnet_benchmarks.py | 1 -
 caffe2/python/optimizer.py          | 2 +-
 caffe2/python/workspace.py          | 2 ++
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/caffe2/python/convnet_benchmarks.py b/caffe2/python/convnet_benchmarks.py
index dfb729e9aeb90e..89bbab6bf751a6 100644
--- a/caffe2/python/convnet_benchmarks.py
+++ b/caffe2/python/convnet_benchmarks.py
@@ -841,4 +841,3 @@ def GetArgumentParser():
         'Inception_v2':Inception_v2
     }
     Benchmark(model_map[args.model], args)
-

diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py
index b9838ed0ee1950..adc648c8218d19 100644
--- a/caffe2/python/optimizer.py
+++ b/caffe2/python/optimizer.py
@@ -1456,7 +1456,7 @@ def build_ftrl(model, engine="SIMD", **kwargs):
 def build_gftrl(model, engine="", **kwargs):
     if engine == "SIMD":
-       assert core.IsOperator('GFtrl_ENGINE_SIMD')
+        assert core.IsOperator('GFtrl_ENGINE_SIMD')
     gftrl_optimizer = GFtrlOptimizer(engine=engine, **kwargs)
     return _build(model, gftrl_optimizer)

diff --git a/caffe2/python/workspace.py b/caffe2/python/workspace.py
index a1e7bf40ae7971..c474fe334b6f1e 100644
--- a/caffe2/python/workspace.py
+++ b/caffe2/python/workspace.py
@@ -48,6 +48,7 @@
 GetCUDAVersion = C.get_cuda_version
 GetCuDNNVersion = C.get_cudnn_version
 NumGpuDevices = NumCudaDevices
+
 def GetCudaPeerAccessPattern():
     return np.asarray(C.get_cuda_peer_access_pattern())
@@ -61,6 +62,7 @@ def GetCudaPeerAccessPattern():
 if has_hip_support:
     NumHipDevices = C.num_hip_devices
     NumGpuDevices = NumHipDevices
+
     def GetHipPeerAccessPattern():
         return np.asarray(C.get_hip_peer_access_pattern())

From d03ae3f5799f3cd238fef7091d28f8be90b89d08 Mon Sep 17 00:00:00 2001
From: rohithkrn
Date: Thu, 27 Sep 2018 10:51:20 -0700
Subject: [PATCH 27/28] update-docs

---
 rocm-docs/caffe2-build.md | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/rocm-docs/caffe2-build.md b/rocm-docs/caffe2-build.md
index 2bec136b774581..6b1de555235f1c 100644
--- a/rocm-docs/caffe2-build.md
+++ b/rocm-docs/caffe2-build.md
@@ -27,20 +27,19 @@ To run benchmarks, skip directly to benchmarks section of the document.
 ## Build Caffe2 from source
 ### Pull the docker image
 ```
-docker pull rocm/caffe2:unbuilt-rocm1.9-v1
+docker pull rocm/caffe2:unbuilt-rocm1.9-v2
 ```
 This docker image has all the dependencies for caffe2 pre-installed.
 ### Pull the latest caffe2 source:
-Checkout the `caffe2_specific` branch of this repository.
 * Using https
 ```
-git clone --recurse-submodules -b caffe2_specific https://github.com/ROCmSoftwarePlatform/pytorch.git
+git clone --recurse-submodules https://github.com/ROCmSoftwarePlatform/pytorch.git
 ```
 * Using ssh
 ```
-git clone --recurse-submodules -b caffe2_specific git@github.com:ROCmSoftwarePlatform/pytorch.git
+git clone --recurse-submodules git@github.com:ROCmSoftwarePlatform/pytorch.git
 ```
 Navigate to repo directory
 ```
@@ -49,7 +48,7 @@ cd pytorch
 ```
 ### Launch the docker container
 ```
-docker run -it --network=host --device=/dev/kfd --device=/dev/dri --group-add video -v $PWD:/pytorch rocm/caffe2:unbuilt-rocm1.8.2-v3
+docker run -it --network=host --device=/dev/kfd --device=/dev/dri --group-add video -v $PWD:/pytorch rocm/caffe2:unbuilt-rocm1.9-v2
 ```
 Navigate to pytorch directory `cd /pytorch` inside the container.
@@ -62,11 +61,10 @@ Navigate to pytorch directory `cd /pytorch` inside the container.
 * Test the rocm-caffe2 Installation
-Before running the tests, make sure that the required environment variables are set:
 ```
 cd build_caffe2 && python -c 'from caffe2.python import core' 2>/dev/null && echo "Success" || echo "Failure"
 ```
-Make sure the the following environment variables are set.
+If the test fails, make sure the following environment variables are set.
 ```
 LD_LIBRARY_PATH=/usr/local/caffe2/lib
@@ -86,23 +84,21 @@ Caffe2 benchmarking script supports the following networks.
 1. MLP
 2. AlexNet
 3. OverFeat
 4. VGGA
 5. Inception
 6. Inception_v2
 7. Resnet50
-*Special case:* Inception_v2 and Resnet50 will need their corresponding protobuf files to run the benchmarks. Protobufs can be downloaded from caffe2 model zoo using the below command. Substitute model_name with `inception_v2` or `resnet50`
-
+*Special case:* Inception_v2 will need model protobuf files to run the benchmarks. Protobufs can be downloaded from the caffe2 model zoo using the below command.
 ```
-python caffe2/python/models/download.py <model_name>
+python caffe2/python/models/download.py inception_v2
 ```
 This will download the protobufs to current working directory.
-To run benchmarks for networks MLP, AlexNet, OverFeat, VGGA, Inception, run the command replacing `<network>` with one of the networks.
+To run benchmarks for networks MLP, AlexNet, OverFeat, VGGA, Inception, Resnet50, run the command replacing `<network>` with one of the networks.
 ```
 python caffe2/python/convnet_benchmarks.py --batch_size 64 --model <network> --engine MIOPEN --layer_wise_benchmark True --net_type simple
 ```
-To run Inception_v2 or Resnet50, please add additional argument `--model_path` to the above command which should point to the model directories downloaded above.
+To run Inception_v2, please add the additional argument `--model_path` to the above command, which should point to the model directory downloaded above.
 ```
 python caffe2/python/convnet_benchmarks.py --batch_size 64 --model <network> --engine MIOPEN --layer_wise_benchmark True --net_type simple --model_path <path_to_model_protobufs>
 ```
-

From 56fa470426c62b161c9804c8c70dc4f0d4694cbe Mon Sep 17 00:00:00 2001
From: rohithkrn
Date: Thu, 27 Sep 2018 12:45:52 -0700
Subject: [PATCH 28/28] change benchmarking command

---
 rocm-docs/caffe2-build.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rocm-docs/caffe2-build.md b/rocm-docs/caffe2-build.md
index 6b1de555235f1c..7ba7e5952dfcd3 100644
--- a/rocm-docs/caffe2-build.md
+++ b/rocm-docs/caffe2-build.md
@@ -93,12 +93,12 @@ This will download the protobufs to current working directory.
 To run benchmarks for networks MLP, AlexNet, OverFeat, VGGA, Inception, Resnet50, run the command replacing `<network>` with one of the networks.
 ```
-python caffe2/python/convnet_benchmarks.py --batch_size 64 --model <network> --engine MIOPEN --layer_wise_benchmark True --net_type simple
+python caffe2/python/convnet_benchmarks.py --batch_size 32 --model <network> --engine MIOPEN
 ```
 To run Inception_v2, please add the additional argument `--model_path` to the above command, which should point to the model directory downloaded above.
 ```
-python caffe2/python/convnet_benchmarks.py --batch_size 64 --model <network> --engine MIOPEN --layer_wise_benchmark True --net_type simple --model_path <path_to_model_protobufs>
+python caffe2/python/convnet_benchmarks.py --batch_size 32 --model <network> --engine MIOPEN --model_path <path_to_model_protobufs>
 ```
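
For orientation, here is how the `Resnet50` benchmark path added in [PATCH 19/28] above binds the model to an AMD GPU. This is a minimal illustrative sketch distilled from the patched `caffe2/python/convnet_benchmarks.py`, not part of the patch series itself; it assumes a ROCm-enabled Caffe2 build in which `workspace.has_hip_support` is true.

```
# Minimal sketch (assumes a ROCm-enabled Caffe2 build); mirrors the
# Resnet50(args) path in caffe2/python/convnet_benchmarks.py.
from caffe2.proto import caffe2_pb2
from caffe2.python import core, model_helper
from caffe2.python.models import resnet

# A HIP device option pins all ops created in its scope to an AMD GPU.
device_opt = core.DeviceOption(caffe2_pb2.HIP)
device_opt.hip_gpu_id = 0  # first GPU

train_model = model_helper.ModelHelper(name="resnet50_train")
with core.DeviceScope(device_opt):
    # create_resnet50 builds the residual network and returns [softmax, loss].
    softmax, loss = resnet.create_resnet50(
        train_model, "data", num_input_channels=3,
        num_labels=1000, label="label")
```

The `--engine MIOPEN` flag used in the benchmark commands above selects the MIOpen-backed operators for this graph (see the conv_op_miopen.cc changes in [PATCH 22/28]).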