From c8328ce768db33e0d01ad67c896e5b7e5a0dc9b8 Mon Sep 17 00:00:00 2001 From: root Date: Tue, 14 Aug 2018 15:55:01 -0700 Subject: [PATCH 01/28] py changes and docs --- .jenkins/caffe2/amd/binary_tests.sh | 16 + .jenkins/caffe2/amd/build_amd.sh | 71 ++++ .../caffe2/amd/install_dependencies_amd.sh | 87 +++++ .jenkins/caffe2/amd/python_tests.sh | 40 +++ caffe2/python/convnet_benchmarks.py | 101 +++++- caffe2/python/core.py | 50 ++- caffe2/python/core_gradients_test.py | 6 +- caffe2/python/core_test.py | 311 +++++++++++++----- caffe2/python/data_parallel_model.py | 121 +++++-- caffe2/python/docs/generator.py | 6 +- caffe2/python/functional_test.py | 2 +- caffe2/python/gradient_check_test.py | 4 +- caffe2/python/hypothesis_test_util.py | 2 +- caffe2/python/memonger_test.py | 11 +- caffe2/python/model_device_test.py | 4 +- caffe2/python/model_helper.py | 10 +- caffe2/python/muji.py | 10 +- caffe2/python/net_printer.py | 6 +- .../operator_test/activation_ops_test.py | 6 +- .../python/operator_test/boolean_mask_test.py | 8 +- caffe2/python/operator_test/ceil_op_test.py | 4 +- caffe2/python/operator_test/conv_test.py | 40 ++- caffe2/python/operator_test/copy_ops_test.py | 30 +- .../elementwise_op_broadcast_test.py | 4 +- caffe2/python/operator_test/pooling_test.py | 19 +- caffe2/python/optimizer.py | 55 ++-- caffe2/python/optimizer_test.py | 6 +- caffe2/python/optimizer_test_util.py | 2 +- caffe2/python/scope_test.py | 22 +- caffe2/python/workspace.py | 18 +- caffe2/python/workspace_test.py | 11 +- rocm-docs/caffe2-build.md | 108 ++++++ 32 files changed, 934 insertions(+), 257 deletions(-) create mode 100755 .jenkins/caffe2/amd/binary_tests.sh create mode 100644 .jenkins/caffe2/amd/build_amd.sh create mode 100755 .jenkins/caffe2/amd/install_dependencies_amd.sh create mode 100755 .jenkins/caffe2/amd/python_tests.sh create mode 100644 rocm-docs/caffe2-build.md diff --git a/.jenkins/caffe2/amd/binary_tests.sh b/.jenkins/caffe2/amd/binary_tests.sh new file mode 100755 index 00000000000000..132cc9c5606be0 --- /dev/null +++ b/.jenkins/caffe2/amd/binary_tests.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +set -ex + +LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +ROOT_DIR=$(cd "$LOCAL_DIR"/../../../ && pwd) + +cd "$ROOT_DIR" + +echo "Running C++ tests.." + +for file in $(find "${ROOT_DIR}/build_caffe2/bin" -executable -type f); do + if [[ "$file" =~ "test" ]]; then + $file + fi +done diff --git a/.jenkins/caffe2/amd/build_amd.sh b/.jenkins/caffe2/amd/build_amd.sh new file mode 100644 index 00000000000000..5c5dd0a3af41d5 --- /dev/null +++ b/.jenkins/caffe2/amd/build_amd.sh @@ -0,0 +1,71 @@ +#!/bin/bash + +set -ex + +# The INSTALL_PREFIX here must match up with test.sh +INSTALL_PREFIX="/usr/local/caffe2" +LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +ROOT_DIR=$(cd "$LOCAL_DIR"/../../.. && pwd) +CMAKE_ARGS=() + +############################################################################## +# Explicitly set Python executable. +############################################################################### +# On Ubuntu 16.04 the default Python is still 2.7. 
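+# If several interpreters are on PATH, `which python` may not resolve to the
+# interpreter the Python packages below are installed for; pinning it
+# explicitly (e.g. PYTHON="$(which python2.7)") is a possible hedge.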
+PYTHON="$(which python)"
+
+###############################################################################
+# Set cmake args
+###############################################################################
+CMAKE_ARGS+=("-DBUILD_BINARY=ON")
+CMAKE_ARGS+=("-DBUILD_TEST=ON")
+CMAKE_ARGS+=("-DINSTALL_TEST=ON")
+CMAKE_ARGS+=("-DUSE_OBSERVERS=ON")
+CMAKE_ARGS+=("-DUSE_ZSTD=ON")
+CMAKE_ARGS+=("-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX}")
+
+# TODO: This is patching the official FindHIP to properly handle
+# cmake generator expressions. A PR is open in the upstream repo here:
+# https://github.com/ROCm-Developer-Tools/HIP/pull/516
+# Remove this hack once it's merged.
+if [[ -f /opt/rocm/hip/cmake/FindHIP.cmake ]]; then
+  sudo sed -i 's/\ -I${dir}/\ $<$:-I${dir}>/' /opt/rocm/hip/cmake/FindHIP.cmake
+fi
+
+export LANG=C.UTF-8
+export LC_ALL=C.UTF-8
+export HCC_AMDGPU_TARGET=gfx900
+
+########## HIPIFY Caffe2 operators
+${PYTHON} "${ROOT_DIR}/tools/amd_build/build_caffe2_amd.py"
+
+MAX_JOBS=$(nproc)
+
+###############################################################################
+# Configure and make
+###############################################################################
+# Run cmake from ./build_caffe2 directory so it doesn't conflict with the
+# standard PyTorch build directory. Eventually these won't need to
+# be separate.
+rm -rf build_caffe2
+mkdir build_caffe2
+cd ./build_caffe2
+
+# Configure ("${CMAKE_ARGS[@]}" keeps each argument intact under word
+# splitting, unlike unquoted ${CMAKE_ARGS[*]})
+cmake "${ROOT_DIR}" "${CMAKE_ARGS[@]}" "$@"
+
+# Build
+if [ "$(uname)" == "Linux" ]; then
+  make "-j${MAX_JOBS}" install
+else
+  echo "Don't know how to build on $(uname)"
+  exit 1
+fi
+
+###############################################################################
+# Install ONNX
+###############################################################################
+
+# Install ONNX into a local directory
+pip install --user -b /tmp/pip_install_onnx "file://${ROOT_DIR}/third_party/onnx#egg=onnx"
+
diff --git a/.jenkins/caffe2/amd/install_dependencies_amd.sh b/.jenkins/caffe2/amd/install_dependencies_amd.sh
new file mode 100755
index 00000000000000..51c2fe98ce4aa0
--- /dev/null
+++ b/.jenkins/caffe2/amd/install_dependencies_amd.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+
+set -ex
+
+apt-get update
+apt-get install -y --no-install-recommends \
+  autoconf \
+  build-essential \
+  ca-certificates \
+  cmake \
+  curl \
+  git \
+  libgoogle-glog-dev \
+  libhiredis-dev \
+  libiomp-dev \
+  libleveldb-dev \
+  liblmdb-dev \
+  libopencv-dev \
+  libpthread-stubs0-dev \
+  libsnappy-dev \
+  libprotobuf-dev \
+  protobuf-compiler \
+  python-dev \
+  python-setuptools \
+  python-pip
+
+pip install networkx==2.0
+
+pip install --user \
+  click \
+  future \
+  hypothesis \
+  jupyter \
+  numpy \
+  protobuf \
+  pytest \
+  pyyaml \
+  scipy==0.19.1 \
+  scikit-image \
+  tabulate \
+  virtualenv \
+  mock \
+  typing \
+  typing-extensions
+
+CLANG_VERSION=3.8
+
+apt-get update
+apt-get install -y --no-install-recommends clang-"$CLANG_VERSION"
+rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+
+# Use update-alternatives to make this clang version the default gcc/g++
+update-alternatives --install /usr/bin/gcc gcc /usr/bin/clang-"$CLANG_VERSION" 50
+update-alternatives --install /usr/bin/g++ g++ /usr/bin/clang++-"$CLANG_VERSION" 50
+
+
+apt-get install -y --no-install-recommends \
+  rocm-dev \
+  rocm-utils \
+  rocm-libs \
+  rocfft \
+  miopen-hip \
+  miopengemm \
+  rocblas \
+  rocrand
+
+
+if [[ -f /opt/rocm/hip/cmake/FindHIP.cmake ]]; then
+  sudo sed -i 's/\ -I${dir}/\ $<$:-I${dir}>/' /opt/rocm/hip/cmake/FindHIP.cmake
+fi
+
+if [[ -f
/opt/rocm/hip/cmake/FindHIP.cmake ]]; then + sudo sed -i 's/set(_hip_build_configuration "${CMAKE_BUILD_TYPE}")/string(TOUPPER _hip_build_configuration "${CMAKE_BUILD_TYPE}")/' /opt/rocm/hip/cmake/FindHIP.cmake +fi + +git clone --recursive https://github.com/ROCmSoftwarePlatform/Thrust.git /data/Thrust +rm -rf /data/Thrust/thrust/system/cuda/detail/cub-hip +git clone --recursive https://github.com/ROCmSoftwarePlatform/cub-hip.git /data/Thrust/thrust/system/cuda/detail/cub-hip + + +export PATH=/opt/rocm/bin:$PATH +export PATH=/opt/rocm/hcc/bin:$PATH +export PATH=/opt/rocm/hip/bin:$PATH +export PATH=/opt/rocm/opencl/bin:$PATH +export THRUST_ROOT=/data/Thrust +export HIP_PLATFORM=hcc diff --git a/.jenkins/caffe2/amd/python_tests.sh b/.jenkins/caffe2/amd/python_tests.sh new file mode 100755 index 00000000000000..46daa7c3e98a6f --- /dev/null +++ b/.jenkins/caffe2/amd/python_tests.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +set -ex + +LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +ROOT_DIR=$(cd "$LOCAL_DIR"/../../../ && pwd) + +cd "$ROOT_DIR" + +# Get the relative path to where the caffe2 python module was installed +CAFFE2_PYPATH="$ROOT_DIR/build_caffe2/caffe2" + +rocm_ignore_test=() +# need to debug +rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/arg_ops_test.py") +rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/piecewise_linear_transform_test.py") +rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/softmax_ops_test.py") +rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/unique_ops_test.py") +rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/model_device_test.py") +rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/data_parallel_model_test.py") + +# Need to go through roi ops to replace max(...) with fmaxf(...) +rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/roi_align_rotated_op_test.py") + +# cuda top_k op has some asm code, the hipified version doesn't +# compile yet, so we don't have top_k operator for now +rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/top_k_test.py") + + +# Python tests +echo "Running Python tests.." 
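+# Quoting note: each rocm_ignore_test entry holds "--ignore <path>" as a
+# single string, so the array expansion below is left unquoted on purpose;
+# with "${rocm_ignore_test[@]}" pytest would receive one space-containing
+# argument per entry and fail to parse it.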
+python \ + -m pytest \ + -v \ + --ignore "$CAFFE2_PYPATH/python/test/executor_test.py" \ + --ignore "$CAFFE2_PYPATH/python/operator_test/matmul_op_test.py" \ + --ignore "$CAFFE2_PYPATH/python/operator_test/pack_ops_test.py" \ + --ignore "$CAFFE2_PYPATH/python/mkl/mkl_sbn_speed_test.py" \ + ${rocm_ignore_test[@]} \ + "$CAFFE2_PYPATH/python" \ No newline at end of file diff --git a/caffe2/python/convnet_benchmarks.py b/caffe2/python/convnet_benchmarks.py index 3aac78c18df16b..6b8af12f5cec71 100644 --- a/caffe2/python/convnet_benchmarks.py +++ b/caffe2/python/convnet_benchmarks.py @@ -59,11 +59,12 @@ """ import argparse +import os -from caffe2.python import workspace, brew, model_helper +from caffe2.python import workspace, brew, model_helper, core +from caffe2.proto import caffe2_pb2 - -def MLP(order, cudnn_ws): +def MLP(order, cudnn_ws, model_path=""): model = model_helper.ModelHelper(name="MLP") d = 256 depth = 20 @@ -98,7 +99,7 @@ def MLP(order, cudnn_ws): return model, d -def AlexNet(order, cudnn_ws): +def AlexNet(order, cudnn_ws, model_path=""): my_arg_scope = { 'order': order, 'use_cudnn': True, @@ -191,7 +192,7 @@ def AlexNet(order, cudnn_ws): return model, 224 -def OverFeat(order, cudnn_ws): +def OverFeat(order, cudnn_ws, model_path=""): my_arg_scope = { 'order': order, 'use_cudnn': True, @@ -277,7 +278,7 @@ def OverFeat(order, cudnn_ws): return model, 231 -def VGGA(order, cudnn_ws): +def VGGA(order, cudnn_ws, model_path=""): my_arg_scope = { 'order': order, 'use_cudnn': True, @@ -475,7 +476,7 @@ def _InceptionModule( return output -def Inception(order, cudnn_ws): +def Inception(order, cudnn_ws, model_path=""): my_arg_scope = { 'order': order, 'use_cudnn': True, @@ -562,6 +563,84 @@ def Inception(order, cudnn_ws): model.net.AveragedLoss(xent, "loss") return model, 224 +def Resnet50(order, cudnn_ws, model_path=""): + if model_path == "": + print("ERROR: please specify paths to init_net and predict_net protobufs for Resnet50") + exit(1) + device_opts = caffe2_pb2.DeviceOption() + device_opts.device_type = caffe2_pb2.HIP + device_opts.hip_gpu_id = 0 + + INIT_NET_PB = os.path.join(model_path, "init_net.pb") + PREDICT_NET_PB = os.path.join(model_path, "predict_net.pb") + init_def = caffe2_pb2.NetDef() + with open(INIT_NET_PB, 'rb') as f: + init_def.ParseFromString(f.read()) + init_def.device_option.CopyFrom(device_opts) + + net_def = caffe2_pb2.NetDef() + with open(PREDICT_NET_PB, 'rb') as f: + net_def.ParseFromString(f.read()) + net_def.device_option.CopyFrom(device_opts) + + init_net = core.Net(init_def) + predict_net = core.Net(net_def) + for op in init_net.Proto().op: + op.device_option.CopyFrom(device_opts) + for op in predict_net.Proto().op: + op.device_option.CopyFrom(device_opts) + my_arg_scope = { + 'order': order, + } + model = model_helper.ModelHelper( + name="resnet50", + arg_scope=my_arg_scope, + ) + + model.param_init_net = init_net + model.net = predict_net + xent = model.net.LabelCrossEntropy(["gpu_0/softmax", "label"], "xent") + model.net.AveragedLoss(xent, "loss") + return model, 224 + +def Inception_v2(order, cudnn_ws, model_path=""): + if model_path == "": + print("ERROR: please specify paths to init_net and predict_net protobufs for Inception_v2") + exit(1) + device_opts = caffe2_pb2.DeviceOption() + device_opts.device_type = caffe2_pb2.HIP + device_opts.hip_gpu_id = 0 + + INIT_NET_PB = os.path.join(model_path, "init_net.pb") + PREDICT_NET_PB = os.path.join(model_path, "predict_net.pb") + init_def = caffe2_pb2.NetDef() + with open(INIT_NET_PB, 'rb') as f: + 
init_def.ParseFromString(f.read()) + init_def.device_option.CopyFrom(device_opts) + + net_def = caffe2_pb2.NetDef() + with open(PREDICT_NET_PB, 'rb') as f: + net_def.ParseFromString(f.read()) + net_def.device_option.CopyFrom(device_opts) + + init_net = core.Net(init_def) + predict_net = core.Net(net_def) + + my_arg_scope = { + 'order': order, + } + + model = model_helper.ModelHelper( + name="GoogleNet", + arg_scope=my_arg_scope, + ) + + model.param_init_net = init_net + model.net = predict_net + xent = model.net.LabelCrossEntropy(["prob", "label"], "xent") + model.net.AveragedLoss(xent, "loss") + return model, 224 + def AddParameterUpdate(model): """ Simple plain SGD update -- not tuned to actually train the models """ @@ -575,7 +654,7 @@ def AddParameterUpdate(model): def Benchmark(model_gen, arg): - model, input_size = model_gen(arg.order, arg.cudnn_ws) + model, input_size = model_gen(arg.order, arg.cudnn_ws, arg.model_path) model.Proto().type = arg.net_type model.Proto().num_workers = arg.num_workers @@ -590,7 +669,7 @@ def Benchmark(model_gen, arg): model.param_init_net.GaussianFill( [], - "data", + "gpu_0/data" if arg.model == "Resnet50" else "data", shape=input_shape, mean=0.0, std=1.0 @@ -701,6 +780,7 @@ def GetArgumentParser(): parser.add_argument("--num_workers", type=int, default=2) parser.add_argument("--use-nvtx", default=False, action='store_true') parser.add_argument("--htrace_span_log_path", type=str) + parser.add_argument("--model_path", type=str, default="", help="set path to init net and predict_net protobufs") return parser @@ -723,5 +803,8 @@ def GetArgumentParser(): 'VGGA': VGGA, 'Inception': Inception, 'MLP': MLP, + 'Resnet50': Resnet50, + 'Inception_v2':Inception_v2 + } Benchmark(model_map[args.model], args) diff --git a/caffe2/python/core.py b/caffe2/python/core.py index 1423cfea3a6c09..b566e01da23e4e 100644 --- a/caffe2/python/core.py +++ b/caffe2/python/core.py @@ -85,6 +85,7 @@ def IsOperatorWithEngine(op_type, engine): def DeviceOption( device_type, cuda_gpu_id=0, + hip_gpu_id=0, random_seed=None, node_name=None, numa_node_id=None, @@ -93,6 +94,7 @@ def DeviceOption( option = caffe2_pb2.DeviceOption() option.device_type = device_type option.cuda_gpu_id = cuda_gpu_id + option.hip_gpu_id = hip_gpu_id if node_name is not None: option.node_name = node_name if random_seed is not None: @@ -115,7 +117,7 @@ def device_option_equal(opt1, opt2, ignore_node_name=True, ignore_random_seed=Tr if not opt1.device_type or not opt2.device_type: # At least one option is for CPU, check if both are for CPU. 
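    # (caffe2_pb2.CPU == 0, so a falsy device_type denotes a CPU option and
    # the cuda_gpu_id/hip_gpu_id fields are irrelevant to the comparison.)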
return not opt1.device_type and not opt2.device_type - return opt1.cuda_gpu_id == opt2.cuda_gpu_id + return (opt1.cuda_gpu_id == opt2.cuda_gpu_id) and (opt1.hip_gpu_id == opt2.hip_gpu_id) def InferBlobDevices(net): @@ -2022,12 +2024,13 @@ def DeduplicateGradientSlices(self, g, aggregator='sum'): def RunAllOnGPU(self, gpu_id=0, use_cudnn=False): """A convenient function to run everything on the GPU.""" device_option = caffe2_pb2.DeviceOption() - device_option.device_type = caffe2_pb2.CUDA + device_option.device_type = caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA device_option.cuda_gpu_id = gpu_id + device_option.hip_gpu_id = gpu_id self._net.device_option.CopyFrom(device_option) if use_cudnn: for op in self._net.op: - op.engine = "CUDNN" + op.engine = 'CUDNN' def RunAllOnMKL(self): """A convenient function to run everything using MKLDNN.""" @@ -2186,27 +2189,38 @@ def extend_ops(self, new_ops): def copy_func_between_devices(src, dst): CPU = caffe2_pb2.CPU - CUDA = caffe2_pb2.CUDA + if workspace.has_hip_support: + GPU = caffe2_pb2.HIP + else: + GPU = caffe2_pb2.CUDA if src.device_type == CPU and dst.device_type == CPU: return None - if src.device_type == CUDA and dst.device_type == CUDA: - if src.cuda_gpu_id == dst.cuda_gpu_id: - return None + if src.device_type == GPU and dst.device_type == GPU: + def fun(net, *args, **kw): + with DeviceScope(dst): + return net.Copy(*args, **kw) + + if workspace.has_hip_support: + if src.hip_gpu_id == dst.hip_gpu_id: + return None + else: + return fun else: - def fun(net, *args, **kw): - with DeviceScope(dst): - return net.Copy(*args, **kw) - return fun + if src.cuda_gpu_id == dst.cuda_gpu_id: + return None + else: + return fun + - if src.device_type == CUDA and dst.device_type == CPU: + if src.device_type == GPU and dst.device_type == CPU: def fun(net, *args, **kw): with DeviceScope(src): return net.CopyGPUToCPU(*args, **kw) return fun - if src.device_type == CPU and dst.device_type == CUDA: + if src.device_type == CPU and dst.device_type == GPU: def fun(net, *args, **kw): with DeviceScope(dst): return net.CopyCPUToGPU(*args, **kw) @@ -2221,7 +2235,12 @@ def device_equal(src, dst): comparison between empty device_options and {device_type:0, cuda_gpu_id:0} returns not equal in some cases. ''' - return src.device_type == dst.device_type and src.cuda_gpu_id == dst.cuda_gpu_id + if workspace.has_hip_support: + gpu_id_eq = src.hip_gpu_id == dst.hip_gpu_id + else: + gpu_id_eq = src.cuda_gpu_id == dst.cuda_gpu_id + + return src.device_type == dst.device_type and gpu_id_eq def update_placeholder_op_output(op, blob_to_device): @@ -2332,10 +2351,13 @@ def InjectCrossDeviceCopies(net, blob_to_device=None, blob_remap=None, def _gen_new_name(blob, device_option): CPU = caffe2_pb2.CPU CUDA = caffe2_pb2.CUDA + HIP = caffe2_pb2.HIP if device_option.device_type == CPU: suffix = '_cpu' elif device_option.device_type == CUDA: suffix = '_cuda_' + str(device_option.cuda_gpu_id) + elif device_option.device_type == HIP: + suffix = '_hip_' + str(device_option.hip_gpu_id) else: raise RuntimeError( "Unknown device type: {}". 
diff --git a/caffe2/python/core_gradients_test.py b/caffe2/python/core_gradients_test.py index bf25806f20dde2..88b07eab347323 100644 --- a/caffe2/python/core_gradients_test.py +++ b/caffe2/python/core_gradients_test.py @@ -9,7 +9,7 @@ import unittest from caffe2.proto import caffe2_pb2 -from caffe2.python import core, test_util +from caffe2.python import core, test_util, workspace from caffe2.python.core import CreateOperator, GradientRegistry from caffe2.python import workspace @@ -94,7 +94,7 @@ def assertOperatorListEqual(self, operatorDefList1, operatorDefList2): @given(device_option=st.sampled_from([ None, - core.DeviceOption(caffe2_pb2.CUDA, 1)])) + core.DeviceOption(caffe2_pb2.HIP, hip_gpu_id=1) if workspace.has_hip_support else core.DeviceOption(caffe2_pb2.CUDA, cuda_gpu_id=1)])) def testDirect(self, device_option): operators = [ CreateOperator('Direct', 'in', 'hidden'), @@ -279,7 +279,7 @@ def testUseInputButInputHasBeenChanged(self): @given(device_option=st.sampled_from([ None, - core.DeviceOption(caffe2_pb2.CUDA, 1)])) + core.DeviceOption(caffe2_pb2.HIP, hip_gpu_id=1) if workspace.has_hip_support else core.DeviceOption(caffe2_pb2.CUDA, cuda_gpu_id=1)])) def testMultiUseInput(self, device_option): """Test gradient for the following case: diff --git a/caffe2/python/core_test.py b/caffe2/python/core_test.py index b7099d20eae863..f47c79ac847a4a 100644 --- a/caffe2/python/core_test.py +++ b/caffe2/python/core_test.py @@ -82,18 +82,30 @@ def testDeviceScope(self): self.assertFalse(op.HasField('device_option')) # explicitly setting a device device_option = caffe2_pb2.DeviceOption() - device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = 1 - op = core.CreateOperator("Relu", "x", "y", device_option=device_option) - self.assertTrue(op.HasField('device_option')) - self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + if workspace.has_hip_support: + device_option.device_type = caffe2_pb2.HIP + device_option.hip_gpu_id = 1 + op = core.CreateOperator("Relu", "x", "y", device_option=device_option) + self.assertTrue(op.HasField('device_option')) + self.assertEqual(op.device_option.device_type, caffe2_pb2.HIP) + self.assertEqual(op.device_option.hip_gpu_id, 1) + else: + device_option.device_type = caffe2_pb2.CUDA + device_option.cuda_gpu_id = 1 + op = core.CreateOperator("Relu", "x", "y", device_option=device_option) + self.assertTrue(op.HasField('device_option')) + self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA) + self.assertEqual(op.device_option.cuda_gpu_id, 1) with core.DeviceScope(device_option): # from device scope op = core.CreateOperator("Relu", "x", "y") self.assertTrue(op.HasField('device_option')) - self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + if workspace.has_hip_support: + self.assertEqual(op.device_option.device_type, caffe2_pb2.HIP) + self.assertEqual(op.device_option.hip_gpu_id, 1) + else: + self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA) + self.assertEqual(op.device_option.cuda_gpu_id, 1) # from an overridden device option override_device = caffe2_pb2.DeviceOption() override_device.device_type = caffe2_pb2.CPU @@ -108,14 +120,22 @@ def testDeviceScope(self): def testNameAndDeviceScopeTogether(self): device_option = caffe2_pb2.DeviceOption() - device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = 1 + if workspace.has_hip_support: + device_option.device_type = caffe2_pb2.HIP + 
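+            # (caffe2_pb2.HIP carries enum value 6, which is why the raw
+            # value 6 appears in device_type assertions later in this file.)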
device_option.hip_gpu_id = 1 + else: + device_option.device_type = caffe2_pb2.CUDA + device_option.cuda_gpu_id = 1 with core.DeviceScope(device_option): with core.NameScope("foo"): op = core.CreateOperator("Relu", "x", "y") self.assertTrue(op.HasField('device_option')) - self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + if workspace.has_hip_support: + self.assertEqual(op.device_option.device_type, caffe2_pb2.HIP) + self.assertEqual(op.device_option.hip_gpu_id, 1) + else: + self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA) + self.assertEqual(op.device_option.cuda_gpu_id, 1) self.assertEqual(len(op.input), 1) self.assertEqual(op.input[0], "foo/x") self.assertEqual(len(op.output), 1) @@ -220,8 +240,12 @@ def testSetInputRecordWithoutBlobs(self): class TestCreateOperator(test_util.TestCase): def testCreate(self): device_option = caffe2_pb2.DeviceOption() - device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = 1 + if workspace.has_hip_support: + device_option.device_type = caffe2_pb2.HIP + device_option.hip_gpu_id = 1 + else: + device_option.device_type = caffe2_pb2.CUDA + device_option.cuda_gpu_id = 1 op = core.CreateOperator( "Ludicrous", "x", "y", name="ludicrous", control_input="z", device_option=device_option, @@ -236,8 +260,12 @@ def testCreate(self): self.assertEqual(len(op.control_input), 1) self.assertEqual(op.control_input[0], "z") self.assertTrue(op.HasField('device_option')) - self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + if workspace.has_hip_support: + self.assertEqual(op.device_option.device_type, caffe2_pb2.HIP) + self.assertEqual(op.device_option.hip_gpu_id, 1) + else: + self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA) + self.assertEqual(op.device_option.cuda_gpu_id, 1) self.assertTrue(len(op.arg), 3) # can't guarantee ordering of kwargs, so generate a set of args @@ -540,12 +568,15 @@ def test_check_equal_default_value(self): opt2 = caffe2_pb2.DeviceOption() opt1.device_type = 0 self.assertTrue(core.device_option_equal(opt1, opt2)) - opt1.cuda_gpu_id = 5 + if workspace.has_hip_support: + opt1.hip_gpu_id = 5 + else: + opt1.cuda_gpu_id = 5 # opt1 still is on CPU, so the options should be equal self.assertTrue(core.device_option_equal(opt1, opt2)) opt2.device_type = 0 self.assertTrue(core.device_option_equal(opt1, opt2)) - opt1.device_type = 1 + opt1.device_type = 6 if workspace.has_hip_support else 1 self.assertFalse(core.device_option_equal(opt1, opt2)) @@ -609,14 +640,18 @@ def test_inject_copy(self): self.assertEqual(op.input[2], "fc_b") -@unittest.skipIf(not workspace.has_gpu_support, 'No GPU support') +@unittest.skipIf(not workspace.has_gpu_support and not workspace.has_hip_support, 'No GPU support') class TestInferDevice(test_util.TestCase): def setUp(self): device_option = caffe2_pb2.DeviceOption() - device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = 1 - self.cuda_option = device_option + if workspace.has_hip_support: + device_option.device_type = caffe2_pb2.HIP + device_option.hip_gpu_id = 1 + else: + device_option.device_type = caffe2_pb2.CUDA + device_option.cuda_gpu_id = 1 + self.gpu_option = device_option self.cpu_option = caffe2_pb2.DeviceOption() def _test_op( @@ -628,7 +663,7 @@ def _test_op( inputs=None, outputs=None ): - op_option = self.cuda_option if not op_option else op_option + op_option = self.gpu_option if not op_option else op_option inputs = 
["blob_1"] if not inputs else inputs outputs = ["blob_2"] if not outputs else outputs with core.DeviceScope(op_option): @@ -656,9 +691,9 @@ def _test_op( def test_infer_device(self): self._test_op( "FC", - self.cuda_option, - self.cuda_option, - op_option=self.cuda_option, + self.gpu_option, + self.gpu_option, + op_option=self.gpu_option, inputs=["data", "fc_w", "fc_b"], outputs=["fc_1"] ) @@ -666,17 +701,17 @@ def test_infer_device(self): def test_infer_device_split_by_lengths(self): self._test_op( "SplitByLengths", - [self.cuda_option, self.cpu_option], - self.cuda_option, - op_option=self.cuda_option, + [self.gpu_option, self.cpu_option], + self.gpu_option, + op_option=self.gpu_option, inputs=["data", "fc_w"], outputs=["fc_1"] ) def test_infer_device_cross_device(self): - self._test_op("CopyGPUToCPU", self.cuda_option, self.cpu_option) - self._test_op("CopyCPUToGPU", self.cpu_option, self.cuda_option) - self._test_op("CopyFromCPUInput", self.cpu_option, self.cuda_option) + self._test_op("CopyGPUToCPU", self.gpu_option, self.cpu_option) + self._test_op("CopyCPUToGPU", self.cpu_option, self.gpu_option) + self._test_op("CopyFromCPUInput", self.cpu_option, self.gpu_option) self._test_op( "CopyFromCPUInput", self.cpu_option, @@ -686,7 +721,7 @@ def test_infer_device_cross_device(self): def test_device_inference_function(self): # ConcatOp. - op_option = self.cuda_option + op_option = self.gpu_option with core.DeviceScope(op_option): op = core.CreateOperator( 'Concat', @@ -698,7 +733,7 @@ def test_device_inference_function(self): self.assertEqual(output_dev[1], self.cpu_option) #SplitOp. - op_option = self.cuda_option + op_option = self.gpu_option with core.DeviceScope(op_option): op = core.CreateOperator( 'Split', @@ -713,8 +748,12 @@ def test_inject_copy(self): net = core.Net("test") init_net = core.Net("init") device_option = caffe2_pb2.DeviceOption() - device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = 1 + if workspace.has_hip_support: + device_option.device_type = caffe2_pb2.HIP + device_option.hip_gpu_id = 1 + else: + device_option.device_type = caffe2_pb2.CUDA + device_option.cuda_gpu_id = 1 weight = init_net.XavierFill([], 'fc_w', shape=[10, 100]) bias = init_net.ConstantFill([], 'fc_b', shape=[10, ]) @@ -727,11 +766,18 @@ def test_inject_copy(self): ) op = new_net._net.op[-1] self.assertEqual(op.type, "FC") - self.assertEqual(op.input[0], "data_cuda_1") - self.assertEqual(op.input[1], "fc_w_cuda_1") - self.assertEqual(op.input[2], "fc_b_cuda_1") - self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + if workspace.has_hip_support: + self.assertEqual(op.input[0], "data_hip_1") + self.assertEqual(op.input[1], "fc_w_hip_1") + self.assertEqual(op.input[2], "fc_b_hip_1") + self.assertEqual(op.device_option.device_type, 6) + self.assertEqual(op.device_option.hip_gpu_id, 1) + else: + self.assertEqual(op.input[0], "data_cuda_1") + self.assertEqual(op.input[1], "fc_w_cuda_1") + self.assertEqual(op.input[2], "fc_b_cuda_1") + self.assertEqual(op.device_option.device_type, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) self.assertEqual(new_net._net.op[-2].type, "CopyCPUToGPU") self.assertEqual(new_net._net.op[0].type, "CopyCPUToGPU") self.assertNotEqual(blob_to_device["fc_w"], device_option) @@ -740,8 +786,12 @@ def test_cross_nets(self): net = core.Net("test") init_net = core.Net("init") device_option = caffe2_pb2.DeviceOption() - device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = 1 + if 
workspace.has_hip_support: + device_option.device_type = caffe2_pb2.HIP + device_option.hip_gpu_id = 1 + else: + device_option.device_type = caffe2_pb2.CUDA + device_option.cuda_gpu_id = 1 weight = init_net.XavierFill([], 'fc_w', shape=[10, 100]) bias = init_net.ConstantFill([], 'fc_b', shape=[10, ]) const = init_net.ConstantFill([], 'const', shape=[], value=1.) @@ -756,28 +806,53 @@ def test_cross_nets(self): ) op = nets[1]._net.op[0] self.assertEqual(op.type, "CopyCPUToGPU") - self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 1) - self.assertEqual(op.output[0], "fc_w_cuda_1") + if workspace.has_hip_support: + self.assertEqual(op.device_option.device_type, 6) + self.assertEqual(op.device_option.device_type, 6) + self.assertEqual(op.device_option.hip_gpu_id, 1) + self.assertEqual(op.output[0], "fc_w_hip_1") + else: + self.assertEqual(op.device_option.device_type, 1) + self.assertEqual(op.device_option.device_type, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.output[0], "fc_w_cuda_1") op = nets[1]._net.op[1] self.assertEqual(op.type, "CopyCPUToGPU") - self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 1) - self.assertEqual(op.output[0], "fc_b_cuda_1") + if workspace.has_hip_support: + self.assertEqual(op.device_option.device_type, 6) + self.assertEqual(op.device_option.hip_gpu_id, 1) + self.assertEqual(op.output[0], "fc_b_hip_1") + else: + self.assertEqual(op.device_option.device_type, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.output[0], "fc_b_cuda_1") op = nets[1]._net.op[2] self.assertEqual(op.type, "FC") self.assertEqual(op.input[0], "data") - self.assertEqual(op.input[1], "fc_w_cuda_1") - self.assertEqual(op.input[2], "fc_b_cuda_1") - self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + if workspace.has_hip_support: + self.assertEqual(op.input[1], "fc_w_hip_1") + self.assertEqual(op.input[2], "fc_b_hip_1") + self.assertEqual(op.device_option.device_type, 6) + self.assertEqual(op.device_option.hip_gpu_id, 1) + else: + self.assertEqual(op.input[1], "fc_w_cuda_1") + self.assertEqual(op.input[2], "fc_b_cuda_1") + self.assertEqual(op.device_option.device_type, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) op = nets[1]._net.op[3] self.assertEqual(op.type, "Add") self.assertEqual(op.input[0], "fc1") - self.assertEqual(op.input[1], "const_cuda_1") + if workspace.has_hip_support: + self.assertEqual(op.input[1], "const_hip_1") + else: + self.assertEqual(op.input[1], "const_cuda_1") # check that moved blob is in input to the new net - for c in ["data", "fc_w", "fc_b", "const_cuda_1"]: - self.assertTrue(c in nets[1]._net.external_input) + if workspace.has_hip_support: + for c in ["data", "fc_w", "fc_b", "const_hip_1"]: + self.assertTrue(c in nets[1]._net.external_input) + else: + for c in ["data", "fc_w", "fc_b", "const_cuda_1"]: + self.assertTrue(c in nets[1]._net.external_input) """ For reference, net.Proto() should be like: name: "" @@ -877,8 +952,12 @@ def test_cross_nets_no_change(self): def test_inject_copy_multi_use(self): net = core.Net("test") device_option = caffe2_pb2.DeviceOption() - device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = 1 + if workspace.has_hip_support: + device_option.device_type = caffe2_pb2.HIP + device_option.hip_gpu_id = 1 + else: + device_option.device_type = caffe2_pb2.CUDA + device_option.cuda_gpu_id = 1 with 
core.DeviceScope(device_option): net.Relu("data", "relu1") @@ -886,23 +965,38 @@ def test_inject_copy_multi_use(self): with core.DeviceScope(device_option): net.Relu("data", "relu3") net.Relu("data", "relu4") - device_option.cuda_gpu_id = 0 + if workspace.has_hip_support: + device_option.hip_gpu_id = 0 + else: + device_option.cuda_gpu_id = 0 with core.DeviceScope(device_option): net.Relu("data", "relu5") - device_option.cuda_gpu_id = 1 + if workspace.has_hip_support: + device_option.hip_gpu_id = 1 + else: + device_option.cuda_gpu_id = 1 with core.DeviceScope(device_option): net.Relu("data", "relu6") new_net, _ = core.InjectCrossDeviceCopies(net) op = new_net._net.op[0] self.assertEqual(op.type, "CopyCPUToGPU") - self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 1) - self.assertEqual(op.output[0], "data_cuda_1") + if workspace.has_hip_support: + self.assertEqual(op.device_option.device_type, 6) + self.assertEqual(op.device_option.hip_gpu_id, 1) + self.assertEqual(op.output[0], "data_hip_1") + else: + self.assertEqual(op.device_option.device_type, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.output[0], "data_cuda_1") op = new_net._net.op[1] self.assertEqual(op.type, "Relu") - self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 1) + if workspace.has_hip_support: + self.assertEqual(op.device_option.device_type, 6) + self.assertEqual(op.device_option.hip_gpu_id, 1) + else: + self.assertEqual(op.device_option.device_type, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) self.assertEqual(op.output[0], "relu1") op = new_net._net.op[2] self.assertEqual(op.type, "Relu") @@ -910,9 +1004,14 @@ def test_inject_copy_multi_use(self): self.assertEqual(op.output[0], "relu2") op = new_net._net.op[3] self.assertEqual(op.type, "Relu") - self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 1) - self.assertEqual(op.input[0], "data_cuda_1") + if workspace.has_hip_support: + self.assertEqual(op.device_option.device_type, 6) + self.assertEqual(op.device_option.hip_gpu_id, 1) + self.assertEqual(op.input[0], "data_hip_1") + else: + self.assertEqual(op.device_option.device_type, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.input[0], "data_cuda_1") self.assertEqual(op.output[0], "relu3") op = new_net._net.op[4] self.assertEqual(op.type, "Relu") @@ -920,20 +1019,35 @@ def test_inject_copy_multi_use(self): self.assertEqual(op.output[0], "relu4") op = new_net._net.op[5] self.assertEqual(op.type, "CopyCPUToGPU") - self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 0) - self.assertEqual(op.output[0], "data_cuda_0") + if workspace.has_hip_support: + self.assertEqual(op.device_option.device_type, 6) + self.assertEqual(op.device_option.hip_gpu_id, 0) + self.assertEqual(op.output[0], "data_hip_0") + else: + self.assertEqual(op.device_option.device_type, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 0) + self.assertEqual(op.output[0], "data_cuda_0") op = new_net._net.op[6] self.assertEqual(op.type, "Relu") - self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 0) - self.assertEqual(op.input[0], "data_cuda_0") + if workspace.has_hip_support: + self.assertEqual(op.device_option.device_type, 6) + self.assertEqual(op.device_option.hip_gpu_id, 0) + self.assertEqual(op.input[0], "data_hip_0") + else: + 
self.assertEqual(op.device_option.device_type, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 0) + self.assertEqual(op.input[0], "data_cuda_0") self.assertEqual(op.output[0], "relu5") op = new_net._net.op[7] self.assertEqual(op.type, "Relu") - self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 1) - self.assertEqual(op.input[0], "data_cuda_1") + if workspace.has_hip_support: + self.assertEqual(op.device_option.device_type, 6) + self.assertEqual(op.device_option.hip_gpu_id, 1) + self.assertEqual(op.input[0], "data_hip_1") + else: + self.assertEqual(op.device_option.device_type, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 1) + self.assertEqual(op.input[0], "data_cuda_1") self.assertEqual(op.output[0], "relu6") """ For reference, net.Proto() should be like: @@ -1025,8 +1139,12 @@ def test_inject_copy_placeholder_ops(self): cpu_device.append(caffe2_pb2.DeviceOption()) cpu_device[i].node_name = 'node:' + str(i) gpu_device.append(caffe2_pb2.DeviceOption()) - gpu_device[i].device_type = caffe2_pb2.CUDA - gpu_device[i].cuda_gpu_id = 0 + if workspace.has_hip_support: + gpu_device[i].device_type = caffe2_pb2.HIP + gpu_device[i].hip_gpu_id = 0 + else: + gpu_device[i].device_type = caffe2_pb2.CUDA + gpu_device[i].cuda_gpu_id = 0 gpu_device[i].node_name = 'node:' + str(i) send_node = 'node:0' recv_node = 'node:1' @@ -1065,13 +1183,21 @@ def test_inject_copy_placeholder_ops(self): # Verify (init_net) op = init_net._net.op[2] self.assertEqual(op.type, "CopyGPUToCPU") - self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 0) + if workspace.has_hip_support: + self.assertEqual(op.device_option.device_type, 6) + self.assertEqual(op.device_option.hip_gpu_id, 0) + else: + self.assertEqual(op.device_option.device_type, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 0) self.assertEqual(op.output[0], "fc_w_cpu") op = init_net._net.op[3] self.assertEqual(op.type, "CopyGPUToCPU") - self.assertEqual(op.device_option.device_type, 1) - self.assertEqual(op.device_option.cuda_gpu_id, 0) + if workspace.has_hip_support: + self.assertEqual(op.device_option.device_type, 6) + self.assertEqual(op.device_option.hip_gpu_id, 0) + else: + self.assertEqual(op.device_option.device_type, 1) + self.assertEqual(op.device_option.cuda_gpu_id, 0) self.assertEqual(op.output[0], "fc_b_cpu") op = init_net._net.op[4] self.assertEqual(op.type, placeholder_send) @@ -1093,8 +1219,12 @@ def test_inject_copy_placeholder_ops(self): def test_blob_inplace(self): net = core.Net("test") device_option = caffe2_pb2.DeviceOption() - device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = 1 + if workspace.has_hip_support: + device_option.device_type = caffe2_pb2.HIP + device_option.hip_gpu_id = 1 + else: + device_option.device_type = caffe2_pb2.CUDA + device_option.cuda_gpu_id = 1 net.Adagrad(['param', 'moment', 'grad', 'lr'], ['param', 'moment']) with core.DeviceScope(device_option): @@ -1103,10 +1233,15 @@ def test_blob_inplace(self): op = net._net.op[1] self.assertEqual(op.type, 'CopyCPUToGPU') self.assertEqual(op.input[0], 'param') - self.assertEqual(op.output[0], 'param_cuda_1') + if workspace.has_hip_support: + self.assertEqual(op.output[0], 'param_hip_1') + else: + self.assertEqual(op.output[0], 'param_cuda_1') op = net._net.op[2] - self.assertEqual(op.input[0], 'param_cuda_1') - + if workspace.has_hip_support: + self.assertEqual(op.input[0], 'param_hip_1') + else: + self.assertEqual(op.input[0], 'param_cuda_1') 
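        # In-place writes to a blob that was never copied across devices must
        # still pass the cross-device checker: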
net.Relu('nonsense_input', 'moment') # should not raise inplace error core.InjectCrossDeviceCopies(net) diff --git a/caffe2/python/data_parallel_model.py b/caffe2/python/data_parallel_model.py index 60e5c39bed1318..f60d9fe5c97336 100644 --- a/caffe2/python/data_parallel_model.py +++ b/caffe2/python/data_parallel_model.py @@ -127,15 +127,15 @@ def Parallelize( device scope was: {}".format(scope.CurrentDeviceScope()) if devices is None: - devices = list(range(0, workspace.NumCudaDevices())), + devices = list(range(0, workspace.NumGpuDevices())), if not cpu_device: for gpu in devices: - if gpu >= workspace.NumCudaDevices(): + if gpu >= workspace.NumGpuDevices(): log.warning("** Only {} GPUs available, GPUs {} requested".format( - workspace.NumCudaDevices(), devices)) + workspace.NumGpuDevices(), devices)) break - model_helper_obj._device_type = caffe2_pb2.CUDA + model_helper_obj._device_type = caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA model_helper_obj._device_prefix = "gpu" model_helper_obj._shared_model = False device_name = "GPU" @@ -165,7 +165,6 @@ def Parallelize( model_helper_obj._grad_names = [] assert isinstance(model_helper_obj, model_helper.ModelHelper) - # Keep track of params that were in the model before: they are not # data parallel, so we need to handle them separately non_datapar_params = copy.copy(model_helper_obj.params) @@ -200,7 +199,10 @@ def Parallelize( # TODO: make into assert for device in devices: - device_opt = core.DeviceOption(model_helper_obj._device_type, device) + if workspace.has_hip_support: + device_opt = core.DeviceOption(model_helper_obj._device_type, hip_gpu_id=device) + else: + device_opt = core.DeviceOption(model_helper_obj._device_type, cuda_gpu_id=device) with core.DeviceScope(device_opt): with core.NameScope("{}_{}".format(model_helper_obj._device_prefix, device)): @@ -307,7 +309,10 @@ def Parallelize( if param_update_builder_fun is not None: for device in devices: - device_opt = core.DeviceOption(model_helper_obj._device_type, device) + if workspace.has_hip_support: + device_opt = core.DeviceOption(model_helper_obj._device_type, hip_gpu_id=device) + else: + device_opt = core.DeviceOption(model_helper_obj._device_type, cuda_gpu_id=device) with core.DeviceScope(device_opt): with core.NameScope( "{}_{}".format(model_helper_obj._device_prefix, device) @@ -352,7 +357,10 @@ def Parallelize( # i.e. 
making sure multi-precision copies of parameters are up-to-date if post_sync_builder_fun is not None: for device in devices: - device_opt = core.DeviceOption(model_helper_obj._device_type, device) + if workspace.has_hip_support: + device_opt = core.DeviceOption(model_helper_obj._device_type, hip_gpu_id=device) + else: + device_opt = core.DeviceOption(model_helper_obj._device_type, cuda_gpu_id=device) with core.DeviceScope(device_opt): with core.NameScope( "{}_{}".format(model_helper_obj._device_prefix, device) @@ -432,17 +440,17 @@ def Parallelize_BMUF( assert isinstance(model_helper_obj, model_helper.ModelHelper) if devices is None: - devices = list(range(0, workspace.NumCudaDevices())) + devices = list(range(0, workspace.NumGpuDevices())) if master_device is None: master_device = devices[0] if not cpu_device: for gpu in devices: - if gpu >= workspace.NumCudaDevices(): + if gpu >= workspace.NumGpuDevices(): log.warning("** Only {} GPUs available, GPUs {} requested".format( - workspace.NumCudaDevices(), devices)) + workspace.NumGpuDevices(), devices)) break - model_helper_obj._device_type = caffe2_pb2.CUDA + model_helper_obj._device_type = caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA model_helper_obj._device_prefix = "gpu" else: model_helper_obj._device_type = caffe2_pb2.CPU @@ -453,7 +461,10 @@ def Parallelize_BMUF( model_helper_obj._sync_barrier_net = None model_helper_obj._broadcast_context = None model_helper_obj._shared_model = False - master_dev_opt = core.DeviceOption(model_helper_obj._device_type, master_device) + if workspace.has_hip_support: + master_dev_opt = core.DeviceOption(model_helper_obj._device_type, hip_gpu_id=master_device) + else: + master_dev_opt = core.DeviceOption(model_helper_obj._device_type, cuda_gpu_id=master_device) # question: rendezvous structure num_shards = rendezvous['num_shards'] if rendezvous else 1 @@ -797,9 +808,9 @@ def builder_fun(model): if device is None: device = scope.CurrentDeviceScope() - device_prefix = "gpu" if device.device_type == caffe2_pb2.CUDA else "cpu" + device_prefix = "gpu" if (device.device_type == caffe2_pb2.CUDA or device.device_type == caffe2_pb2.HIP) else "cpu" - namescope = "{}_{}/".format(device_prefix, device.cuda_gpu_id) + namescope = "{}_{}/".format(device_prefix, device.hip_gpu_id if workspace.has_hip_support else device.cuda_gpu_id) for op in mnet.Proto().op: if "RecurrentNetwork" in op.type: raise("RecurrentNetwork conversion not yet supported") @@ -820,7 +831,10 @@ def builder_fun(model): def _ForEachDevice(devices, f, device_type, device_prefix, scoped=False, *args, **kwargs): for device in devices: - device_opt = core.DeviceOption(device_type, device) + if workspace.has_hip_support: + device_opt = core.DeviceOption(device_type, hip_gpu_id=device) + else: + device_opt = core.DeviceOption(device_type, cuda_gpu_id=device) with core.DeviceScope(device_opt): if scoped: with core.NameScope("{}_{}".format(device_prefix, device)): @@ -836,7 +850,10 @@ def create_grad(lossp): loss_grad = {} # Explicitly need to create gradients on each GPU for gpu_id in devices: - device = core.DeviceOption(model._device_type, gpu_id) + if workspace.has_hip_support: + device = core.DeviceOption(model._device_type, hip_gpu_id=gpu_id) + else: + device = core.DeviceOption(model._device_type, cuda_gpu_id=gpu_id) with core.DeviceScope(device): for l in losses_by_gpu[gpu_id]: lg = create_grad(l) @@ -954,7 +971,7 @@ def GetLearningRateBlobNames(model): if model._optimizer is not None: if model._device_type == caffe2_pb2.CPU: 
return [model._optimizer.get_cpu_blob_name('lr')] - elif model._device_type == caffe2_pb2.CUDA: + elif model._device_type == caffe2_pb2.CUDA or model._device_type == caffe2_pb2.HIP: return [model._optimizer.get_gpu_blob_name('lr', gpu, '') for gpu in model._devices] else: @@ -975,7 +992,10 @@ def _Broadcast(devices, model, net, param, use_nccl=False): if use_nccl: if _IsGPUBlob(model, param): - master_device_opt = core.DeviceOption(model._device_type, master_dev) + if workspace.has_hip_support: + master_device_opt = core.DeviceOption(model._device_type, hip_gpu_id=master_dev) + else: + master_device_opt = core.DeviceOption(model._device_type, cuda_gpu_id=master_dev) with core.DeviceScope(master_device_opt): # Note that the root is the root _rank_ and not the root # _device_. Thus we always use root=0, regardless of the @@ -986,10 +1006,12 @@ def _Broadcast(devices, model, net, param, use_nccl=False): root=0, ) return - for dev_idx in devices[1:]: if _IsGPUBlob(model, param): - device_opt = core.DeviceOption(caffe2_pb2.CUDA, dev_idx) + if workspace.has_hip_support: + device_opt = core.DeviceOption(caffe2_pb2.HIP, hip_gpu_id=dev_idx) + else: + device_opt = core.DeviceOption(caffe2_pb2.CUDA, cuda_gpu_id=dev_idx) else: device_opt = core.DeviceOption(caffe2_pb2.CPU, 0) with core.DeviceScope(device_opt): @@ -1010,6 +1032,8 @@ def _AllReduce(devices, model, net, param, use_nccl=False, control_input=None): if model._device_type == caffe2_pb2.CUDA: p2p_access_pattern = workspace.GetCudaPeerAccessPattern() + elif model._device_type == caffe2_pb2.HIP: + p2p_access_pattern = workspace.GetHipPeerAccessPattern() else: p2p_access_pattern = None @@ -1034,7 +1058,10 @@ def sumN(*dev_indices): blobs[i], 'gpu_{}/{}_gpu{}_copy'.format(devices[0], param, peer) ) - device_opt = core.DeviceOption(model._device_type, devices[0]) + if workspace.has_hip_support: + device_opt = core.DeviceOption(model._device_type, hip_gpu_id=devices[0]) + else: + device_opt = core.DeviceOption(model._device_type, cuda_gpu_id=devices[0]) with core.DeviceScope(device_opt): net.Sum(blobs, [blobs[0]], name='dpm') @@ -1140,8 +1167,10 @@ def _SyncAllParamsDistributed( max_concurrent_distributed_ops ): assert rendezvous['num_shards'] > 1 - - gpu_device_opt = core.DeviceOption(model._device_type, devices[0]) + if workspace.has_hip_support: + gpu_device_opt = core.DeviceOption(model._device_type, hip_gpu_id=devices[0]) + else: + gpu_device_opt = core.DeviceOption(model._device_type, cuda_gpu_id=devices[0]) cpu_device_opt = core.DeviceOption(caffe2_pb2.CPU) if model._broadcast_context is None: @@ -1316,8 +1345,10 @@ def _AllReduceBlobsDistributed( num_workers = model.net.Proto().num_workers assert num_workers > 1, "Please specify more than 1 worker" all_reduce_engine = rendezvous['engine'] - - master_device_opt = core.DeviceOption(model._device_type, devices[0]) + if workspace.has_hip_support: + master_device_opt = core.DeviceOption(model._device_type, hip_gpu_id=devices[0]) + else: + master_device_opt = core.DeviceOption(model._device_type, cuda_gpu_id=devices[0]) reducing_device_opt = master_device_opt @@ -1393,7 +1424,10 @@ def _AllReduceBlobsSingleHost(blob_names, devices, model, net, use_nccl): # Now we need to Allreduce blobs on all the GPUs. # Pick GPU #0 as a master GPU. 
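+    # (the sum is materialized on devices[0] and the reduced value is then
+    # broadcast back to the remaining devices)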
- master_device_opt = core.DeviceOption(model._device_type, devices[0]) + if workspace.has_hip_support: + master_device_opt = core.DeviceOption(model._device_type, hip_gpu_id=devices[0]) + else: + master_device_opt = core.DeviceOption(model._device_type, cuda_gpu_id=devices[0]) last_out = None concatenated_idx = set() @@ -1439,7 +1473,10 @@ def _AllReduceBlobsSingleHost(blob_names, devices, model, net, use_nccl): name="note:data_parallel_model") for gpu, g in viewitems(model._device_grouped_blobs[blob_name]): - device_opt = core.DeviceOption(model._device_type, gpu) + if workspace.has_hip_support: + device_opt = core.DeviceOption(model._device_type, hip_gpu_id=gpu) + else: + device_opt = core.DeviceOption(model._device_type, cuda_gpu_id=gpu) with core.DeviceScope(device_opt): model.Copy(grad_idx_concat, g.indices) concatenated_idx.add(g.indices) @@ -1451,7 +1488,10 @@ def _AllReduceBlobsSingleHost(blob_names, devices, model, net, use_nccl): axis=0, name="note:data_parallel_model") for gpu, g in viewitems(model._device_grouped_blobs[blob_name]): - device_opt = core.DeviceOption(model._device_type, gpu) + if workspace.has_hip_support: + device_opt = core.DeviceOption(model._device_type, hip_gpu_id=gpu) + else: + device_opt = core.DeviceOption(model._device_type, cuda_gpu_id=gpu) with core.DeviceScope(device_opt): model.Copy(grad_val_concat, g.values) @@ -1526,11 +1566,15 @@ def _AnalyzeOperators(model): continue op_dev = op.device_option - op_gpu = op_dev.cuda_gpu_id + op_gpu = op_dev.hip_gpu_id if workspace.has_hip_support else op_dev.cuda_gpu_id # This avoids failing on operators that are only for CPU - if op_dev.device_type != caffe2_pb2.CUDA: - continue + if workspace.has_hip_support: + if op_dev.device_type != caffe2_pb2.HIP: + continue + else: + if op_dev.device_type != caffe2_pb2.CUDA: + continue namescope = "{}_{}/".format(model._device_prefix, op_gpu) for inp in list(op.input) + list(op.output): @@ -1572,14 +1616,16 @@ def map_ops(proto): def _IsGPUBlob(model, blob_name): if blob_name in model._blob_to_device: - return model._blob_to_device[blob_name].device_type == caffe2_pb2.CUDA + return model._blob_to_device[blob_name].device_type == caffe2_pb2.CUDA or \ + model._blob_to_device[blob_name].device_type == caffe2_pb2.HIP else: blob_name = "{}_{}/{}".format( model._device_prefix, model._devices[0], blob_name ) if blob_name not in model._blob_to_device: - return model._device_type == caffe2_pb2.CUDA - return model._blob_to_device[blob_name].device_type == caffe2_pb2.CUDA + return model._device_type == caffe2_pb2.CUDA or model._device_type == caffe2_pb2.HIP + return model._blob_to_device[blob_name].device_type == caffe2_pb2.CUDA or \ + model._blob_to_device[blob_name].device_type == caffe2_pb2.HIP def _GroupByDevice(model, devices, params, non_data_params): @@ -1890,7 +1936,10 @@ def _InterleaveOps(model): new_ops = [] ops = {d: [] for d in range(num_devices)} for op in orig_ops: - ops[op.device_option.cuda_gpu_id].append(op) + if workspace.has_hip_support: + ops[op.device_option.hip_gpu_id].append(op) + else: + ops[op.device_option.cuda_gpu_id].append(op) for j in range(num_ops_per_dev): tp = None diff --git a/caffe2/python/docs/generator.py b/caffe2/python/docs/generator.py index 1bc41b7d1ccbc5..b7fcb708d8951e 100644 --- a/caffe2/python/docs/generator.py +++ b/caffe2/python/docs/generator.py @@ -103,7 +103,8 @@ def __init__(self, name): def getDeviceImpl(self): deviceImplList = [] for device, impl in [('CPU', OpSchema.get_cpu_impl(self.op_name)), - ('CUDA', 
OpSchema.get_cuda_impl(self.op_name))]: + ('CUDA', OpSchema.get_cuda_impl(self.op_name)), + ('HIP', OpSchema.get_hip_impl(self.op_name))]: if not impl: continue deviceImplList.append((device, impl)) @@ -194,7 +195,8 @@ def generateDevices(self, formatter): self.getInfo(formatter, 'CPU', OpSchema.get_cpu_impl(self.name)), self.getInfo(formatter, - 'GPU', OpSchema.get_cuda_impl(self.name)), + 'HIP', OpSchema.get_hip_impl(self.name)) if workspace.has_hip_support else + self.getInfo(formatter, 'GPU', OpSchema.get_cuda_impl(self.name)), ] formatter.addList([i for i in devices if i]) diff --git a/caffe2/python/functional_test.py b/caffe2/python/functional_test.py index e7803e829bb431..db252d8f704d0f 100644 --- a/caffe2/python/functional_test.py +++ b/caffe2/python/functional_test.py @@ -46,7 +46,7 @@ def _tensor_splits(draw, add_axis=False): class TestFunctional(hu.HypothesisTestCase): - @given(X=hu.tensor(), engine=st.sampled_from(["", "CUDNN"]), **hu.gcs) + @given(X=hu.tensor(), engine=st.sampled_from(["", "MIOPEN" if workspace.has_hip_support else "CUDNN"]), **hu.gcs) def test_relu(self, X, engine, gc, dc): X += 0.02 * np.sign(X) X[X == 0.0] += 0.02 diff --git a/caffe2/python/gradient_check_test.py b/caffe2/python/gradient_check_test.py index f1c190aa6efac6..52da4e7dd49304 100644 --- a/caffe2/python/gradient_check_test.py +++ b/caffe2/python/gradient_check_test.py @@ -23,9 +23,9 @@ import unittest -if workspace.has_gpu_support and workspace.NumCudaDevices() > 0: +if (workspace.has_gpu_support or workspace.has_hip_support) and workspace.NumGpuDevices() > 0: gpu_device_option = caffe2_pb2.DeviceOption() - gpu_device_option.device_type = caffe2_pb2.CUDA + gpu_device_option.device_type = caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA cpu_device_option = caffe2_pb2.DeviceOption() gpu_device_checker = device_checker.DeviceChecker( 0.01, [gpu_device_option] diff --git a/caffe2/python/hypothesis_test_util.py b/caffe2/python/hypothesis_test_util.py index f640f6db20eff8..e0083167c4b56e 100644 --- a/caffe2/python/hypothesis_test_util.py +++ b/caffe2/python/hypothesis_test_util.py @@ -278,7 +278,7 @@ def gradient_checker_device_option(): ) gcs_cpu_only = dict(gc=st.sampled_from([cpu_do]), dc=st.just([cpu_do])) -gcs_gpu_only = dict(gc=st.sampled_from([gpu_do]), dc=st.just([gpu_do])) +gcs_gpu_only = dict(gc=st.sampled_from([gpu_do]+[hip_do]), dc=st.just([gpu_do]+[hip_do])) gcs_no_hip = dict(gc=st.sampled_from(_device_options_no_hip), dc=st.just(_device_options_no_hip)) diff --git a/caffe2/python/memonger_test.py b/caffe2/python/memonger_test.py index 6536280d8a6057..cb5712f425e223 100644 --- a/caffe2/python/memonger_test.py +++ b/caffe2/python/memonger_test.py @@ -223,13 +223,13 @@ def test_gradient_optim(self, input_dim, output_dim, batch_size): np.testing.assert_almost_equal(loss, optimized_loss) np.testing.assert_almost_equal(grad, optimized_grad) - @unittest.skipIf(not workspace.has_gpu_support, "No gpu support.") + @unittest.skipIf(not workspace.has_gpu_support and not workspace.has_hip_support, "No gpu support.") def test_memonger_mix_cpu_gpu(self): ''' Check that memonger does not make blobs cross CPU/GPU boundary ''' m = model_helper.ModelHelper() - with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)): + with core.DeviceScope(core.DeviceOption(caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA, 0)): fc1 = brew.fc(m, "data", "fc1", dim_in=2, dim_out=2) fc2 = brew.fc(m, fc1, "fc2", dim_in=2, dim_out=2) fc3 = brew.fc(m, fc2, "fc3", dim_in=2, dim_out=2) @@ -259,7 
+259,10 @@ def test_memonger_mix_cpu_gpu(self): # Create set of blobs on CPU side and GPU side and check they don't # overlap - device_blobs = {caffe2_pb2.CPU: set(), caffe2_pb2.CUDA: set()} + if workspace.has_hip_support: + device_blobs = {caffe2_pb2.CPU: set(), caffe2_pb2.HIP: set()} + else: + device_blobs = {caffe2_pb2.CPU: set(), caffe2_pb2.CUDA: set()} for op in optim_proto.op: if op.type not in ['CopyCPUToGPU', "CopyGPUToCPU"]: dev = op.device_option.device_type @@ -267,7 +270,7 @@ def test_memonger_mix_cpu_gpu(self): device_blobs[dev].add(b) device_crossers = device_blobs[caffe2_pb2.CPU].intersection( - device_blobs[caffe2_pb2.CUDA] + device_blobs[caffe2_pb2.HIP] if workspace.has_hip_support else device_blobs[caffe2_pb2.CUDA] ) self.assertEquals(device_crossers, set()) diff --git a/caffe2/python/model_device_test.py b/caffe2/python/model_device_test.py index 31cba3facb0559..3f438d35cad35e 100644 --- a/caffe2/python/model_device_test.py +++ b/caffe2/python/model_device_test.py @@ -124,7 +124,7 @@ def _testMiniAlexNet(self, order): cpu_device = caffe2_pb2.DeviceOption() cpu_device.device_type = caffe2_pb2.CPU gpu_device = caffe2_pb2.DeviceOption() - gpu_device.device_type = caffe2_pb2.CUDA + gpu_device.device_type = caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA checker = device_checker.DeviceChecker(0.05, [cpu_device, gpu_device]) ret = checker.CheckNet( @@ -136,7 +136,7 @@ def _testMiniAlexNet(self, order): ) self.assertEqual(ret, True) - @unittest.skipIf(not workspace.has_gpu_support, + @unittest.skipIf(not workspace.has_gpu_support and not workspace.has_hip_support, "No GPU support. Skipping test.") def testMiniAlexNetNCHW(self): self._testMiniAlexNet("NCHW") diff --git a/caffe2/python/model_helper.py b/caffe2/python/model_helper.py index e84369fbbbef7f..e03943e3f084f4 100644 --- a/caffe2/python/model_helper.py +++ b/caffe2/python/model_helper.py @@ -570,7 +570,10 @@ def rename_list(proto_list): rename_list(step_op.output) if device is not None: step_op.device_option.device_type = device.device_type - step_op.device_option.cuda_gpu_id = device.cuda_gpu_id + if workspace.has_hip_support: + step_op.device_option.hip_gpu_id = device.hip_gpu_id + else: + step_op.device_option.cuda_gpu_id = device.cuda_gpu_id rename_list(arg.n.external_input) rename_list(arg.n.external_output) @@ -584,7 +587,10 @@ def rename_list(proto_list): if device is not None: op.device_option.device_type = device.device_type - op.device_option.cuda_gpu_id = device.cuda_gpu_id + if workspace.has_hip_support: + op.device_option.hip_gpu_id = device.hip_gpu_id + else: + op.device_option.cuda_gpu_id = device.cuda_gpu_id validate_op(op) predict_proto.op.extend([op]) known_blobs.update(op.output) diff --git a/caffe2/python/muji.py b/caffe2/python/muji.py index b407f96d2391f8..186857fe569430 100644 --- a/caffe2/python/muji.py +++ b/caffe2/python/muji.py @@ -25,8 +25,12 @@ def OnGPU(gpu_id): specified gpu id. 
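    Illustrative use (assumption, not part of the original docstring):
        relu = core.CreateOperator("Relu", "x", "y", device_option=OnGPU(0))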
""" device_option = caffe2_pb2.DeviceOption() - device_option.device_type = caffe2_pb2.CUDA - device_option.cuda_gpu_id = gpu_id + if workspace.has_hip_support: + device_option.device_type = caffe2_pb2.HIP + device_option.hip_gpu_id = gpu_id + else: + device_option.device_type = caffe2_pb2.CUDA + device_option.cuda_gpu_id = gpu_id return device_option @@ -48,7 +52,7 @@ def Allreduce(net, blobs, reduced_affix="_reduced", gpu_indices=None): "gpu_indices length and blobs length mismatch: %d vs %d" % (len(gpu_indices), len(blobs)) ) - pattern = workspace.GetCudaPeerAccessPattern() + pattern = workspace.GetHipPeerAccessPattern() if workspace.has_hip_support else workspace.GetCudaPeerAccessPattern() if len(blobs) == 2 and pattern.shape[0] >= 2 and np.all(pattern[:2, :2]): return Allreduce2(net, blobs, reduced_affix, gpu_indices) elif len(blobs) == 4 and pattern.shape[0] >= 4 and np.all(pattern[:4, :4]): diff --git a/caffe2/python/net_printer.py b/caffe2/python/net_printer.py index 4b5cddb61d244e..60b3598f3fb192 100644 --- a/caffe2/python/net_printer.py +++ b/caffe2/python/net_printer.py @@ -8,6 +8,7 @@ from caffe2.proto.caffe2_pb2 import OperatorDef, NetDef from caffe2.python.checkpoint import Job from caffe2.python.core import Net, ExecutionStep, Plan +from caffe2.python import workspace from caffe2.python.task import Task, TaskGroup, WorkspaceType, TaskOutput from collections import defaultdict from contextlib import contextmanager @@ -267,12 +268,13 @@ def call(op, inputs=None, outputs=None, factor_prefixes=False): def format_device_option(dev_opt): + gpu_id = dev_opt.hip_gpu_id if workspace.has_hip_support else dev_opt.cuda_gpu_id if not dev_opt or not ( - dev_opt.device_type or dev_opt.cuda_gpu_id or dev_opt.node_name): + dev_opt.device_type or gpu_id or dev_opt.node_name): return None return call( 'DeviceOption', - [dev_opt.device_type, dev_opt.cuda_gpu_id, "'%s'" % dev_opt.node_name]) + [dev_opt.device_type, gpu_id, "'%s'" % dev_opt.node_name]) @Printer.register(OperatorDef) diff --git a/caffe2/python/operator_test/activation_ops_test.py b/caffe2/python/operator_test/activation_ops_test.py index 0d35110740825b..9d5d961f71c756 100644 --- a/caffe2/python/operator_test/activation_ops_test.py +++ b/caffe2/python/operator_test/activation_ops_test.py @@ -17,7 +17,7 @@ class TestActivations(hu.HypothesisTestCase): @given(X=hu.tensor(), in_place=st.booleans(), - engine=st.sampled_from(["", "CUDNN"]), **mu.gcs) + engine=st.sampled_from(["", "MIOPEN" if workspace.has_hip_support else "CUDNN"]), **mu.gcs) def test_relu(self, X, in_place, engine, gc, dc): if gc == mu.mkl_do: in_place = False @@ -43,7 +43,7 @@ def relu_ref(X): @unittest.skipIf(not workspace.has_gpu_support, "Relu for float16 can only run on GPU now.") @given(X=hu.tensor(dtype=np.float16), in_place=st.booleans(), - engine=st.sampled_from(["", "CUDNN"]), **hu.gcs_gpu_only) + engine=st.sampled_from([""] if workspace.has_hip_support else ["", "CUDNN"]), **hu.gcs_gpu_only) def test_relu_fp16(self, X, in_place, engine, gc, dc): op = core.CreateOperator( "Relu", @@ -102,7 +102,7 @@ def relu_n_ref(X): @given(X=hu.tensor(), alpha=st.floats(min_value=0.1, max_value=2.0), - in_place=st.booleans(), engine=st.sampled_from(["", "CUDNN"]), + in_place=st.booleans(), engine=st.sampled_from([""] if workspace.has_hip_support else ["", "CUDNN"]), **hu.gcs) def test_elu(self, X, alpha, in_place, engine, gc, dc): op = core.CreateOperator( diff --git a/caffe2/python/operator_test/boolean_mask_test.py b/caffe2/python/operator_test/boolean_mask_test.py index 
638248d60bafe5..0980cc5c043e78 100644 --- a/caffe2/python/operator_test/boolean_mask_test.py +++ b/caffe2/python/operator_test/boolean_mask_test.py @@ -7,7 +7,7 @@ import hypothesis.strategies as st from caffe2.proto import caffe2_pb2 -from caffe2.python import core +from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu @@ -47,10 +47,10 @@ def ref(x, mask): @staticmethod def _dtype_conversion(x, dtype, gc, dc): - """SequenceMask only supports fp16 with CUDA.""" + """SequenceMask only supports fp16 with CUDA/HIP.""" if dtype == np.float16: - assume(gc.device_type == caffe2_pb2.CUDA) - dc = [d for d in dc if d.device_type == caffe2_pb2.CUDA] + assume(gc.device_type == caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA) + dc = [d for d in dc if d.device_type == (caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA)] x = x.astype(dtype) return x, dc diff --git a/caffe2/python/operator_test/ceil_op_test.py b/caffe2/python/operator_test/ceil_op_test.py index 79293fc6453232..1a77c2965b6aba 100644 --- a/caffe2/python/operator_test/ceil_op_test.py +++ b/caffe2/python/operator_test/ceil_op_test.py @@ -3,7 +3,7 @@ from __future__ import print_function from __future__ import unicode_literals -from caffe2.python import core +from caffe2.python import core, workspace from hypothesis import given import hypothesis.strategies as st import caffe2.python.hypothesis_test_util as hu @@ -15,7 +15,7 @@ class TestCeil(hu.HypothesisTestCase): @given(X=hu.tensor(), - engine=st.sampled_from(["", "CUDNN"]), + engine=st.sampled_from([""] if workspace.has_hip_support else ["", "CUDNN"]), **hu.gcs) def test_ceil(self, X, gc, dc, engine): op = core.CreateOperator("Ceil", ["X"], ["Y"], engine=engine) diff --git a/caffe2/python/operator_test/conv_test.py b/caffe2/python/operator_test/conv_test.py index f460c191b02f5f..0f8cf66b0ae1ec 100644 --- a/caffe2/python/operator_test/conv_test.py +++ b/caffe2/python/operator_test/conv_test.py @@ -37,6 +37,15 @@ def _cudnn_supports( return False return True +def _miopen_supports( + dilation=False, + nhwc=False, + backward=False, +): + """Return True if MIOPEN supports this configuration.""" + if nhwc or dilation: + return False + return True def _cudnn_convolution_algo_count(direction): try: @@ -192,7 +201,7 @@ def test_convolution_separate_stride_pad_layout(self, op_type, output_channels=st.integers(1, 8), batch_size=st.integers(1, 3), order=st.sampled_from(["NCHW", "NHWC"]), - engine=st.sampled_from(["", "CUDNN", "MKLDNN"]), + engine=st.sampled_from(["", "MIOPEN" if workspace.has_hip_support else "CUDNN", "MKLDNN"]), use_bias=st.booleans(), force_algo_fwd=_cudnn_convolution_algo_count("fwd"), force_algo_dgrad=_cudnn_convolution_algo_count("dgrad"), @@ -209,6 +218,10 @@ def test_convolution_gradients(self, op_type, stride, pad, kernel, dilation, assume(_cudnn_supports(dilation=(dilation > 1), nhwc=(order == 'NHWC'), backward=True)) + if engine == 'MIOPEN': + assume(_cudnn_supports(dilation=(dilation > 1), + nhwc=(order == 'NHWC'), + backward=True)) assume(engine != "MKLDNN" or use_bias is True) @@ -451,8 +464,12 @@ def test_convolution_layout(self, op_type, stride, pad, kernel, dilation, for order in ["NCHW", "NHWC"]: engine_list = [''] - if _cudnn_supports(dilation=(dilation > 1), nhwc=(order == 'NHWC')): - engine_list.append('CUDNN') + if workspace.has_hip_support: + if _miopen_supports(dilation=(dilation > 1), nhwc=(order == 'NHWC')): + engine_list.append('MIOPEN') + else: + if _cudnn_supports(dilation=(dilation > 1), 
nhwc=(order == 'NHWC')): + engine_list.append('CUDNN') for engine in engine_list: op = core.CreateOperator( @@ -503,9 +520,9 @@ def canonical(o): net_type=st.sampled_from( ["simple", "dag"] + (["async_dag"] if workspace.has_gpu_support or workspace.has_hip_support else [])), - engine=st.sampled_from(["CUDNN", ""]), - **hu.gcs_no_hip) - def test_convolution_sync(self, net_type, num_workers, engine, gc, dc): + do=st.sampled_from(hu.device_options), + engine=st.sampled_from(["CUDNN", ""])) + def test_convolution_sync(self, net_type, num_workers, do, engine): m = ModelHelper(name="test_model") n = 1 d = 2 @@ -557,8 +574,8 @@ def test_convolution_sync(self, net_type, num_workers, engine, gc, dc): m.net.SquaredL2Distance(["0_0_flat", "label"], "xent") m.net.AveragedLoss("xent", "loss") input_to_grad = m.AddGradientOperators(["loss"]) - m.Proto().device_option.CopyFrom(gc) - m.param_init_net.Proto().device_option.CopyFrom(gc) + m.Proto().device_option.CopyFrom(do) + m.param_init_net.Proto().device_option.CopyFrom(do) m.Proto().type = net_type m.Proto().num_workers = num_workers self.ws.run(m.param_init_net) @@ -570,10 +587,10 @@ def run(): for input_blob in input_blobs: self.ws.create_blob(input_blob).feed( np.random.randn(n, d, h, w).astype(np.float32), - device_option=gc) + device_option=do) self.ws.create_blob("label").feed( np.random.randn(n, d * h * w).astype(np.float32), - device_option=gc) + device_option=do) self.ws.run(m.net) gradients = [ self.ws.blobs[str(input_to_grad[input_blob])].fetch() @@ -623,8 +640,7 @@ def test_use_cudnn_engine_interactions(self): f(**kwargs) else: f(**kwargs) - self.assertEqual(model.Proto().op[-1].engine, - expected_engine) + self.assertEqual(model.Proto().op[-1].engine, expected_engine) @given(op_type=st.sampled_from(["Conv", "Conv2D"]), N=st.integers(1, 4), G=st.integers(1, 4), DX=st.integers(1, 4), DY=st.integers(1, 4), diff --git a/caffe2/python/operator_test/copy_ops_test.py b/caffe2/python/operator_test/copy_ops_test.py index 05a018ff90a2c3..3b07090a2b3283 100644 --- a/caffe2/python/operator_test/copy_ops_test.py +++ b/caffe2/python/operator_test/copy_ops_test.py @@ -40,21 +40,29 @@ def run_test_copy_gradient(self, device_opt): def test_copy_gradient_cpu(self): self.run_test_copy_gradient(core.DeviceOption(caffe2_pb2.CPU, 0)) - @unittest.skipIf(workspace.NumCudaDevices() < 1, "Need at least 1 GPU.") + num_gpu = 0 + if workspace.has_hip_support: + num_gpu = workspace.NumHipDevices() + else: + num_gpu = workspace.NumCudaDevices() + + @unittest.skipIf(num_gpu < 1, "Need at least 1 GPU.") def test_copy_gradient_gpu(self): - self.run_test_copy_gradient(core.DeviceOption(caffe2_pb2.CUDA, 0)) + self.run_test_copy_gradient(core.DeviceOption(caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA, 0)) - @unittest.skipIf(workspace.NumCudaDevices() < 2, "Need at least 2 GPU.") + @unittest.skipIf(num_gpu < 2, "Need at least 2 GPU.") def test_copy_gradient_multiple_gpus(self): model = model_helper.ModelHelper(name="copy_test") with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)): x_cpu = model.net.AddExternalInputs("x_cpu") - with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)): + gpu_device = caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA + + with core.DeviceScope(core.DeviceOption(gpu_device, 0)): x_gpu_1 = model.CopyCPUToGPU(x_cpu, "x_gpu_1") - with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 1)): + with core.DeviceScope(core.DeviceOption(gpu_device, 1)): x_gpu_2 = model.Copy(x_gpu_1, "x_gpu_2") loss = 
model.AveragedLoss(x_gpu_2, "loss")
         gradient_map = model.AddGradientOperators([loss])
@@ -80,20 +88,20 @@ def get_op_with_output(model, output_blob_name):
 
         self.assertEqual(
             get_op_with_output(model, "x_gpu_2_grad").device_option,
-            core.DeviceOption(caffe2_pb2.CUDA, 1),
+            core.DeviceOption(gpu_device, 1),
         )
         self.assertEqual(
             get_op_with_output(model, "x_cpu_grad").device_option,
-            core.DeviceOption(caffe2_pb2.CUDA, 0),
+            core.DeviceOption(gpu_device, 0),
         )
 
-    @unittest.skipIf(workspace.NumCudaDevices() < 1, "Need at least 1 GPU.")
+    @unittest.skipIf(num_gpu < 1, "Need at least 1 GPU.")
     def test_cpu2gpu_gpu2cpu_sparse_gradients(self):
         model = model_helper.ModelHelper(name="copy_test")
         v = model.param_init_net.UniformFill([], ["v"], shape=[16, 4])
         indices = model.param_init_net.UniformFill([], ["v"], shape=[16, 4])
         cpu_opt = core.DeviceOption(caffe2_pb2.CPU, 0)
-        gpu_opt = core.DeviceOption(caffe2_pb2.CUDA, 0)
+        gpu_opt = core.DeviceOption(caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA, 0)
 
         with core.DeviceScope(gpu_opt):
             vcpu = model.CopyGPUToCPU(v, "vcpu")
@@ -112,13 +120,13 @@ def test_cpu2gpu_gpu2cpu_sparse_gradients(self):
         self.assertTrue("v" in gradient_map)
         self.assertTrue(isinstance(gradient_map['v'], core.GradientSlice))
 
-    @unittest.skipIf(workspace.NumCudaDevices() < 1, "Need at least 1 GPU.")
+    @unittest.skipIf(num_gpu < 1, "Need at least 1 GPU.")
     def test_cpu2gpu_gpu2cpu_gradients(self):
         model = model_helper.ModelHelper(name="copy_test")
 
         batch = 32
         cpu_opt = core.DeviceOption(caffe2_pb2.CPU, 0)
-        gpu_opt = core.DeviceOption(caffe2_pb2.CUDA, 0)
+        gpu_opt = core.DeviceOption(caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA, 0)
 
         with core.NameScope("cpu"):
             with core.DeviceScope(cpu_opt):
diff --git a/caffe2/python/operator_test/elementwise_op_broadcast_test.py b/caffe2/python/operator_test/elementwise_op_broadcast_test.py
index e767a0db161a9f..74325052bb5e37 100644
--- a/caffe2/python/operator_test/elementwise_op_broadcast_test.py
+++ b/caffe2/python/operator_test/elementwise_op_broadcast_test.py
@@ -405,8 +405,8 @@ def test_sum_reduce(self, gc, dc):
             np.testing.assert_array_almost_equal(out, res)
             self.assertDeviceChecks(dc, op, [X, Y], [0])
 
-        # fp64 is not supported with the CUDA op
-        dc_cpu_only = [d for d in dc if d.device_type != caffe2_pb2.CUDA]
+        # fp64 is not supported with the CUDA/HIP op
+        dc_cpu_only = [d for d in dc if (d.device_type != caffe2_pb2.CUDA and d.device_type != caffe2_pb2.HIP)]
         self.assertDeviceChecks(dc_cpu_only, op, [X, Y], [0])
 
     @unittest.skipIf(not workspace.has_gpu_support, "No gpu support")
diff --git a/caffe2/python/operator_test/pooling_test.py b/caffe2/python/operator_test/pooling_test.py
index 956d0ec9619987..4301b5c60f66f3 100644
--- a/caffe2/python/operator_test/pooling_test.py
+++ b/caffe2/python/operator_test/pooling_test.py
@@ -209,12 +209,14 @@ def test_pooling_with_index(self, stride, pad, kernel, size,
 
     @given(sz=st.integers(1, 20),
            batch_size=st.integers(1, 4),
-           engine=st.sampled_from(["", "CUDNN"]),
+           engine=st.sampled_from(["", "MIOPEN" if workspace.has_hip_support else "CUDNN"]),
            op_type=st.sampled_from(["AveragePool", "AveragePool2D"]),
            **hu.gcs)
     @settings(max_examples=3, timeout=10)
     def test_global_avg_pool_nchw(self, op_type, sz, batch_size, engine, gc, dc):
         ''' Special test to stress the fast path of NCHW average pool '''
+        if engine == 'MIOPEN':
+            assume(sz<16)
         op = core.CreateOperator(
             op_type,
             ["X"],
@@ -233,7 +235,7 @@ def test_global_avg_pool_nchw(self, op_type, sz, batch_size, engine, gc, dc):
@given(sz=st.integers(1, 20), batch_size=st.integers(1, 4), - engine=st.sampled_from(["", "CUDNN"]), + engine=st.sampled_from(["", "MIOPEN" if workspace.has_hip_support else "CUDNN"]), op_type=st.sampled_from(["MaxPool", "MaxPool2D"]), **hu.gcs) @settings(max_examples=3, timeout=10) @@ -241,7 +243,10 @@ def test_global_max_pool_nchw(self, op_type, sz, batch_size, engine, gc, dc): ''' Special test to stress the fast path of NCHW max pool ''' # CuDNN 5 does not support deterministic max pooling. - assume(workspace.GetCuDNNVersion() >= 6000 or engine != "CUDNN") + if engine == 'MIOPEN': + assume(sz<16) + if not workspace.has_hip_support: + assume(workspace.GetCuDNNVersion() >= 6000 or engine != "CUDNN") op = core.CreateOperator( op_type, ["X"], @@ -270,12 +275,14 @@ def test_global_max_pool_nchw(self, op_type, sz, order=st.sampled_from(["NCHW", "NHWC"]), op_type=st.sampled_from(["MaxPool", "AveragePool", "LpPool", "MaxPool2D", "AveragePool2D"]), - engine=st.sampled_from(["", "CUDNN"]), + engine=st.sampled_from(["", "MIOPEN" if workspace.has_hip_support else "CUDNN"]), **hu.gcs) def test_pooling(self, stride, pad, kernel, size, input_channels, batch_size, order, op_type, engine, gc, dc): assume(pad < kernel) + if engine == 'MIOPEN': + assume(op_type != "LpPool" and order == "NCHW") op = core.CreateOperator( op_type, ["X"], @@ -300,12 +307,14 @@ def test_pooling(self, stride, pad, kernel, size, batch_size=st.integers(1, 3), order=st.sampled_from(["NCHW", "NHWC"]), op_type=st.sampled_from(["MaxPool", "AveragePool", "LpPool"]), - engine=st.sampled_from(["", "CUDNN"]), + engine=st.sampled_from(["", "MIOPEN" if workspace.has_hip_support else "CUDNN"]), **hu.gcs) def test_global_pooling(self, size, input_channels, batch_size, order, op_type, engine, gc, dc): # CuDNN 5 does not support deterministic max pooling. 
assume(workspace.GetCuDNNVersion() >= 6000 or op_type != "MaxPool") + if engine == 'MIOPEN': + assume(op_type != "LpPool" and order == "NCHW") op = core.CreateOperator( op_type, ["X"], diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py index 2ebcf1d92a1240..b26cc24b7e09e8 100644 --- a/caffe2/python/optimizer.py +++ b/caffe2/python/optimizer.py @@ -75,9 +75,11 @@ def make_unique_blob_name(self, base_str): if current_scope is None: return self.get_cpu_blob_name(base_str) - if current_scope.device_type == caffe2_pb2.CUDA: + if current_scope.device_type == caffe2_pb2.CUDA or current_scope.device_type == caffe2_pb2.HIP: return self.get_gpu_blob_name( - base_str, current_scope.cuda_gpu_id, current_scope.node_name + base_str, + current_scope.hip_gpu_id if workspace.has_hip_support else current_scope.cuda_gpu_id, + current_scope.node_name ) else: return self.get_cpu_blob_name(base_str, current_scope.node_name) @@ -121,7 +123,7 @@ def build_lr(self, net, param_init_net, base_learning_rate, if self._local_lr_multiplier is not None: current_scope = scope.CurrentDeviceScope() if (current_scope is not None - and current_scope.device_type == caffe2_pb2.CUDA + and current_scope.device_type == (caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA) and not self._local_lr_multiplier_on_gpu): local_lr_multiplier = net.CopyFromCPUInput( self._local_lr_multiplier, @@ -204,14 +206,6 @@ def scale_learning_rate(self, *args, **kwargs): raise NotImplementedError( "Optimizer Need to Implement `scale_learning_rate` method.") - def create_lars_inputs(self, param_init_net, weight_decay, trust, lr_max): - wd = param_init_net.ConstantFill([], "weight_decay", - shape=[1], value=weight_decay) - trust = param_init_net.ConstantFill([], "trust", shape=[1], value=trust) - lr_max = param_init_net.ConstantFill([], "lr_max", shape=[1], - value=lr_max) - return wd, trust, lr_max - class SgdOptimizer(Optimizer): def __init__(self, base_learning_rate=0.01, policy='fixed', @@ -241,18 +235,15 @@ def _run(self, net, param_init_net, param_info): if self.lars is not None and not isinstance(grad, core.GradientSlice): assert self.lars >= 0, ( 'Lars offset must be nonnegative, got {}'.format(self.lars)) - wd, trust, lr_max = self.create_lars_inputs( - param_init_net, 0.0, 1.0, np.finfo(np.float32).max) lr_lars_multiplier = net.Lars( - [param, grad, wd, trust, lr_max], + [param, grad], self.make_unique_blob_name(str(param) + "_lars"), - offset=self.lars, - lr_min=0.0) + offset=self.lars) current_scope = scope.CurrentDeviceScope() self._add_local_lr_multiplier( lr_lars_multiplier, is_gpu_blob=(current_scope is not None - and current_scope.device_type == caffe2_pb2.CUDA), + and current_scope.device_type == (caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA)), ) # We need negative sign for LR when used directly with WeightedSum @@ -273,7 +264,7 @@ def _run(self, net, param_init_net, param_info): # to include device information. 
ONE = param_init_net.ConstantFill( [], - "ONE_{}_{}{}".format(dev.device_type, dev.cuda_gpu_id, dev.node_name), + "ONE_{}_{}{}".format(dev.device_type, dev.hip_gpu_id if workspace.has_hip_support else dev.cuda_gpu_id, dev.node_name), shape=[1], value=1.0 ) @@ -482,12 +473,12 @@ def _run(self, net, param_init_net, param_info): ONE = param_init_net.ConstantFill( [], - "ONE_{}_{}".format(dev.device_type, dev.cuda_gpu_id), + "ONE_{}_{}".format(dev.device_type, dev.hip_gpu_id if workspace.has_hip_support else dev.cuda_gpu_id), shape=[1], value=1.0 ) WD = param_init_net.ConstantFill( - [], "wd_{}_{}".format(dev.device_type, dev.cuda_gpu_id), + [], "wd_{}_{}".format(dev.device_type, dev.hip_gpu_id if workspace.has_hip_support else dev.cuda_gpu_id), shape=[1], value=self.weight_decay ) @@ -531,19 +522,15 @@ def _run(self, net, param_init_net, param_info): if self.lars is not None and not isinstance(grad, core.GradientSlice): assert self.lars >= 0, ( 'Lars offset must be nonnegative, got {}'.format(self.lars)) - wd, trust, lr_max = self.create_lars_inputs( - param_init_net, 0.0, 1.0, np.finfo(np.float32).max) lr_lars_multiplier = net.Lars( - [param, grad, wd, trust, lr_max], + [param, grad], self.make_unique_blob_name(str(param) + "_lars"), - offset=self.lars, - lr_min=0.0) - + offset=self.lars) current_scope = scope.CurrentDeviceScope() self._add_local_lr_multiplier( lr_lars_multiplier, is_gpu_blob=(current_scope is not None - and current_scope.device_type == caffe2_pb2.CUDA), + and current_scope.device_type == (caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA)), ) lr, _ = self.build_lr( @@ -656,18 +643,15 @@ def _run(self, net, param_init_net, param_info): if self.lars is not None and not isinstance(grad, core.GradientSlice): assert self.lars >= 0, ( 'Lars offset must be nonnegative, got {}'.format(self.lars)) - wd, trust, lr_max = self.create_lars_inputs( - param_init_net, 0.0, 1.0, np.finfo(np.float32).max) lr_lars_multiplier = net.Lars( - [param, grad, wd, trust, lr_max], + [param, grad], self.make_unique_blob_name(str(param) + "_lars"), - offset=self.lars, - lr_min=0.0) + offset=self.lars) current_scope = scope.CurrentDeviceScope() self._add_local_lr_multiplier( lr_lars_multiplier, is_gpu_blob=(current_scope is not None - and current_scope.device_type == caffe2_pb2.CUDA), + and current_scope.device_type == (caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA)), ) lr, _ = self.build_lr( @@ -1144,7 +1128,7 @@ def _run(self, net, param_init_net, param_info): ONE = param_init_net.ConstantFill( [], - "ONE_{}_{}".format(dev.device_type, dev.cuda_gpu_id), + "ONE_{}_{}".format(dev.device_type, dev.hip_gpu_id if workspace.has_hip_support else dev.cuda_gpu_id), shape=[1], value=1.0 ) @@ -1439,8 +1423,7 @@ def build_ftrl(model, engine="SIMD", **kwargs): def build_gftrl(model, engine="", **kwargs): - if engine == "SIMD": - assert core.IsOperator('GFtrl_ENGINE_SIMD') + # SIMD version of GFTRL is not supported gftrl_optimizer = GFtrlOptimizer(engine=engine, **kwargs) return _build(model, gftrl_optimizer) diff --git a/caffe2/python/optimizer_test.py b/caffe2/python/optimizer_test.py index bbb0c02625f09a..a136684cd0e38e 100644 --- a/caffe2/python/optimizer_test.py +++ b/caffe2/python/optimizer_test.py @@ -77,7 +77,7 @@ def check_optimizer(self, optimizer): tensor = workspace.FetchBlob(param) np.testing.assert_allclose(np.array([1.0]), tensor, atol=1e-5) - @unittest.skipIf(not workspace.has_gpu_support, "No GPU support") + @unittest.skipIf(not workspace.has_gpu_support and not 
workspace.has_hip_support , "No GPU support") def testGPUDense(self): super(TestMultiPrecisionSgd, self).testGPUDense(core.DataType.FLOAT16) @@ -434,11 +434,11 @@ def test_caffe2_cpu_vs_numpy(self): ) @unittest.skip("Results might vary too much. Only for individual use.") - @unittest.skipIf(not workspace.has_gpu_support, "No gpu support") + @unittest.skipIf(not workspace.has_gpu_support and not workspace.has_hip_support, "No gpu support") def test_caffe2_gpu_vs_numpy(self): n_dim = 1000000 n_iter = 50 - gpu_device_opt = core.DeviceOption(caffe2_pb2.CUDA, 0) + gpu_device_opt = core.DeviceOption(caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA, 0) with core.DeviceScope(gpu_device_opt): for zero_debias in [False, True]: for grad_coef in [1.0, 0.1, 0.01]: diff --git a/caffe2/python/optimizer_test_util.py b/caffe2/python/optimizer_test_util.py index dbb0dbeae2dd37..4c62d9ee21b797 100644 --- a/caffe2/python/optimizer_test_util.py +++ b/caffe2/python/optimizer_test_util.py @@ -70,7 +70,7 @@ def testDense(self): @unittest.skipIf(not workspace.has_gpu_support, "No gpu support") def testGPUDense(self, dtype=core.DataType.FLOAT): - device_opt = core.DeviceOption(caffe2_pb2.CUDA, 0) + device_opt = core.DeviceOption(caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA, 0) with core.DeviceScope(device_opt): model, _perfect_model, data, label = self._createDense(dtype) if dtype == core.DataType.FLOAT16: diff --git a/caffe2/python/scope_test.py b/caffe2/python/scope_test.py index 11f7a2c44046af..f4b3aa4a244d17 100644 --- a/caffe2/python/scope_test.py +++ b/caffe2/python/scope_test.py @@ -3,7 +3,7 @@ from __future__ import print_function from __future__ import unicode_literals -from caffe2.python import scope, core +from caffe2.python import scope, core, workspace from caffe2.proto import caffe2_pb2 import unittest @@ -18,7 +18,10 @@ def thread_runner(idx, testobj): testobj.assertEquals(scope.CurrentNameScope(), "") testobj.assertEquals(scope.CurrentDeviceScope(), None) namescope = "namescope_{}".format(idx) - dsc = core.DeviceOption(caffe2_pb2.CUDA, idx) + if workspace.has_hip_support: + dsc = core.DeviceOption(caffe2_pb2.HIP, hip_gpu_id=idx) + else: + dsc = core.DeviceOption(caffe2_pb2.CUDA, cuda_gpu_id=idx) with scope.DeviceScope(dsc): with scope.NameScope(namescope): testobj.assertEquals(scope.CurrentNameScope(), namescope + "/") @@ -58,7 +61,10 @@ def testNamescopeAssertion(self): def testDevicescopeBasic(self): self.assertEquals(scope.CurrentDeviceScope(), None) - dsc = core.DeviceOption(caffe2_pb2.CUDA, 9) + if workspace.has_hip_support: + dsc = core.DeviceOption(caffe2_pb2.HIP, hip_gpu_id=9) + else: + dsc = core.DeviceOption(caffe2_pb2.CUDA, cuda_gpu_id=9) with scope.DeviceScope(dsc): self.assertEquals(scope.CurrentDeviceScope(), dsc) @@ -67,7 +73,10 @@ def testDevicescopeBasic(self): def testEmptyDevicescopeBasic(self): self.assertEquals(scope.CurrentDeviceScope(), None) - dsc = core.DeviceOption(caffe2_pb2.CUDA, 9) + if workspace.has_hip_support: + dsc = core.DeviceOption(caffe2_pb2.HIP, hip_gpu_id=9) + else: + dsc = core.DeviceOption(caffe2_pb2.CUDA, cuda_gpu_id=9) with scope.DeviceScope(dsc): self.assertEquals(scope.CurrentDeviceScope(), dsc) with scope.EmptyDeviceScope(): @@ -78,7 +87,10 @@ def testEmptyDevicescopeBasic(self): def testDevicescopeAssertion(self): self.assertEquals(scope.CurrentDeviceScope(), None) - dsc = core.DeviceOption(caffe2_pb2.CUDA, 9) + if workspace.has_hip_support: + dsc = core.DeviceOption(caffe2_pb2.HIP, hip_gpu_id=9) + else: + dsc = 
core.DeviceOption(caffe2_pb2.CUDA, cuda_gpu_id=9) try: with scope.DeviceScope(dsc): diff --git a/caffe2/python/workspace.py b/caffe2/python/workspace.py index c033e0684bb5d8..79ac32b6269dc5 100644 --- a/caffe2/python/workspace.py +++ b/caffe2/python/workspace.py @@ -47,7 +47,7 @@ NumCudaDevices = C.num_cuda_devices GetCUDAVersion = C.get_cuda_version GetCuDNNVersion = C.get_cudnn_version - + NumGpuDevices = NumCudaDevices def GetCudaPeerAccessPattern(): return np.asarray(C.get_cuda_peer_access_pattern()) @@ -59,6 +59,18 @@ def GetCudaPeerAccessPattern(): GetCudaPeerAccessPattern = lambda: np.array([]) # noqa GetDeviceProperties = lambda x: None # noqa +if has_hip_support: + NumHipDevices = C.num_hip_devices + NumGpuDevices = NumHipDevices + def GetHipPeerAccessPattern(): + return np.asarray(C.get_hip_peer_access_pattern()) + + GetDeviceProperties = C.get_device_properties +else: + NumHipDevices = lambda: 0 # noqa + GetHipPeerAccessPattern = lambda: np.array([]) # noqa + GetDeviceProperties = lambda x: None # noqa + IsNUMAEnabled = C.is_numa_enabled GetNumNUMANodes = C.get_num_numa_nodes GetBlobNUMANode = C.get_blob_numa_node @@ -322,10 +334,10 @@ def FeedBlob(name, arr, device_option=None): if device_option is None: device_option = scope.CurrentDeviceScope() - if device_option and device_option.device_type == caffe2_pb2.CUDA: + if device_option and (device_option.device_type == caffe2_pb2.CUDA or device_option.device_type == caffe2_pb2.HIP): if arr.dtype == np.dtype('float64'): logger.warning( - "CUDA operators do not support 64-bit doubles, " + + "CUDA/HIP operators do not support 64-bit doubles, " + "please use arr.astype(np.float32) or np.int32 for ints." + " Blob: {}".format(name) + " type: {}".format(str(arr.dtype)) diff --git a/caffe2/python/workspace_test.py b/caffe2/python/workspace_test.py index 5da37c7f22efca..72935a94ac45e7 100644 --- a/caffe2/python/workspace_test.py +++ b/caffe2/python/workspace_test.py @@ -317,7 +317,7 @@ def testCreateWorkspace(self): self.assertTrue("test" in workspaces) -@unittest.skipIf(not workspace.has_gpu_support, "No gpu support.") +@unittest.skipIf(not workspace.has_gpu_support and not workspace.has_hip_support, "No gpu support.") class TestWorkspaceGPU(test_util.TestCase): def setUp(self): @@ -339,6 +339,7 @@ def testFetchBlobGPU(self): self.assertEqual(fetched_again.shape, (1, 2, 3, 4)) np.testing.assert_array_equal(fetched_again, 2.0) + @unittest.skipIf(not workspace.has_gpu_support, "No gpu support.") def testGetCudaPeerAccessPattern(self): pattern = workspace.GetCudaPeerAccessPattern() self.assertEqual(type(pattern), np.ndarray) @@ -346,6 +347,14 @@ def testGetCudaPeerAccessPattern(self): self.assertEqual(pattern.shape[0], pattern.shape[1]) self.assertEqual(pattern.shape[0], workspace.NumCudaDevices()) + @unittest.skipIf(not workspace.has_hip_support, "No hip support.") + def testGetHipPeerAccessPattern(self): + pattern = workspace.GetHipPeerAccessPattern() + self.assertEqual(type(pattern), np.ndarray) + self.assertEqual(pattern.ndim, 2) + self.assertEqual(pattern.shape[0], pattern.shape[1]) + self.assertEqual(pattern.shape[0], workspace.NumHipDevices()) + @unittest.skipIf(not workspace.C.has_mkldnn, "No MKLDNN support.") class TestWorkspaceMKLDNN(test_util.TestCase): diff --git a/rocm-docs/caffe2-build.md b/rocm-docs/caffe2-build.md new file mode 100644 index 00000000000000..74ff92b72a26a9 --- /dev/null +++ b/rocm-docs/caffe2-build.md @@ -0,0 +1,108 @@ +# Caffe2: Building From Source on ROCm Platform + +## Intro +This instruction provides a 
starting point for building caffe2 on AMD GPUs (the Caffe2 ROCm port) from source.
+*Note*: it is recommended to start with a clean Ubuntu 16.04 system.
+
+## Install docker
+
+If your machine doesn't have docker installed, follow the steps [here](https://docs.docker.com/install/linux/docker-ce/ubuntu/#install-docker-ce) to install docker.
+
+## Install ROCm
+
+Install the ROCm stack following the steps at [link](https://github.com/RadeonOpenCompute/ROCm/blob/master/README.md) if your machine doesn't have ROCm already.
+
+Once the machine is ready with the ROCm stack, there are two ways to use caffe2:
+* Run the docker container with caffe2 installed in it.
+
+* Build caffe2 from source inside a docker container with all the dependencies.
+
+## Launch docker container with caffe2 pre-installed
+```
+docker run -it --network=host --device=/dev/kfd --device=/dev/dri --group-add video rocm/caffe2:rocm1.8.2-v2
+```
+
+To run benchmarks, skip directly to the benchmarks section of the document.
+
+## Build Caffe2 from source
+### Pull the docker image
+```
+docker pull rocm/caffe2:unbuilt-rocm1.8.2-v2
+```
+This docker image has all the dependencies for caffe2 pre-installed.
+
+### Pull the latest caffe2 source
+
+Check out the `caffe2_specific` branch of this repository.
+* Using https
+```
+git clone --recurse-submodules -b caffe2_specific https://github.com/ROCmSoftwarePlatform/pytorch.git
+```
+* Using ssh
+```
+git clone --recurse-submodules -b caffe2_specific git@github.com:ROCmSoftwarePlatform/pytorch.git
+```
+Navigate to the repo directory
+```
+cd pytorch
+```
+
+### Launch the docker container
+```
+docker run -it --network=host --device=/dev/kfd --device=/dev/dri --group-add video -v $PWD:/pytorch rocm/caffe2:unbuilt-rocm1.8.2-v2
+```
+Navigate to the pytorch directory inside the container: `cd /pytorch`.
+
+### Build the caffe2 project from source
+
+* Run the command
+
+  `.jenkins/caffe2/amd/build_amd.sh`
+
+* Test the rocm-caffe2 installation
+
+  Make sure the following environment variables are set:
+  ```
+  export LD_LIBRARY_PATH=/usr/local/caffe2/lib
+  export PYTHONPATH=/usr/local/caffe2/lib/python2.7/dist-packages
+  ```
+  Then check that caffe2 imports correctly:
+  ```
+  cd build_caffe2 && python -c 'from caffe2.python import core' 2>/dev/null && echo "Success" || echo "Failure"
+  ```
+
+## Run benchmarks
+
+Navigate to the build directory, `cd /pytorch/build_caffe2`, to run benchmarks.
+
+The caffe2 benchmarking script supports the following networks:
+1. MLP
+2. AlexNet
+3. OverFeat
+4. VGGA
+5. Inception
+6. Inception_v2
+7. Resnet50
+
+*Special case:* Inception_v2 and Resnet50 need their corresponding protobuf files to run the benchmarks. The protobufs can be downloaded from the caffe2 model zoo using the command below, substituting `<model_name>` with `inception_v2` or `resnet50`:
+
+```
+python caffe2/python/models/download.py <model_name>
+```
+This will download the protobufs to the current working directory.
+
+To run benchmarks for MLP, AlexNet, OverFeat, VGGA, or Inception, run the command below, replacing `<network>` with one of those networks:
+
+```
+python caffe2/python/convnet_benchmarks.py --batch_size 64 --model <network> --engine MIOPEN --layer_wise_benchmark True --net_type simple
+```
+To run Inception_v2 or Resnet50, add the additional argument `--model_path`, which should point to the model directory downloaded above.
+
+```
+python caffe2/python/convnet_benchmarks.py --batch_size 64 --model <network> --engine MIOPEN --layer_wise_benchmark True --net_type simple --model_path <model_directory>
+```
+

From 50886a625c3d9e2923aa91cf4f4fd6dfc1f7d963 Mon Sep 17 00:00:00 2001
From: root
Date: Tue, 14 Aug 2018 15:57:02 -0700
Subject: [PATCH 02/28] add execute permissions to build script

---
 .jenkins/caffe2/amd/build_amd.sh | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 .jenkins/caffe2/amd/build_amd.sh

diff --git a/.jenkins/caffe2/amd/build_amd.sh b/.jenkins/caffe2/amd/build_amd.sh
old mode 100644
new mode 100755

From 0a337b42a84223a18cc2699508b4d35de2b9593a Mon Sep 17 00:00:00 2001
From: rohithkrn
Date: Mon, 27 Aug 2018 18:47:06 -0700
Subject: [PATCH 03/28] update docs

---
 rocm-docs/caffe2-build.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rocm-docs/caffe2-build.md b/rocm-docs/caffe2-build.md
index 74ff92b72a26a9..d19116a018b7bd 100644
--- a/rocm-docs/caffe2-build.md
+++ b/rocm-docs/caffe2-build.md
@@ -27,7 +27,7 @@ To run benchmarks, skip directly to the benchmarks section of the document.
 ## Build Caffe2 from source
 ### Pull the docker image
 ```
-docker pull rocm/caffe2:unbuilt-rocm1.8.2-v2
+docker pull rocm/caffe2:unbuilt-rocm1.8.2-v3
 ```
 This docker image has all the dependencies for caffe2 pre-installed.
 
@@ -49,7 +49,7 @@ cd pytorch
 
 ### Launch the docker container
 ```
-docker run -it --network=host --device=/dev/kfd --device=/dev/dri --group-add video -v $PWD:/pytorch rocm/caffe2:unbuilt-rocm1.8.2-v2
+docker run -it --network=host --device=/dev/kfd --device=/dev/dri --group-add video -v $PWD:/pytorch rocm/caffe2:unbuilt-rocm1.8.2-v3
 ```
 Navigate to the pytorch directory inside the container: `cd /pytorch`.
 

From 53acb376bbaf159657b273033802793df73b96a0 Mon Sep 17 00:00:00 2001
From: rohithkrn
Date: Wed, 29 Aug 2018 11:34:44 -0700
Subject: [PATCH 04/28] Update caffe2-build.md

---
 rocm-docs/caffe2-build.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/rocm-docs/caffe2-build.md b/rocm-docs/caffe2-build.md
index d19116a018b7bd..0050a01b3c1c61 100644
--- a/rocm-docs/caffe2-build.md
+++ b/rocm-docs/caffe2-build.md
@@ -19,7 +19,7 @@ Once the machine is ready with the ROCm stack, there are two ways to use caffe2:
 ## Launch docker container with caffe2 pre-installed
 ```
-docker run -it --network=host --device=/dev/kfd --device=/dev/dri --group-add video rocm/caffe2:rocm1.8.2-v3
+docker run -it --network=host --device=/dev/kfd --device=/dev/dri --group-add video rocm/caffe2:rocm1.9-v2
 ```
 
 To run benchmarks, skip directly to the benchmarks section of the document.
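A pattern that recurs throughout this series is the inline backend switch `caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA`. For reference, the same selection can be written once as a helper. The sketch below is illustrative only; `gpu_device_option` is not a function introduced by these patches, though it relies on the `hip_gpu_id` field that the series adds to `DeviceOption`:

```python
from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace


def gpu_device_option(gpu_id=0):
    """Return a DeviceOption for whichever GPU backend this build has.

    Hypothetical helper mirroring the inline conditionals used in the
    patches above: HIP on ROCm builds, CUDA otherwise.
    """
    if workspace.has_hip_support:
        # ROCm build: target a HIP device.
        return core.DeviceOption(caffe2_pb2.HIP, hip_gpu_id=gpu_id)
    # CUDA build: target a CUDA device.
    return core.DeviceOption(caffe2_pb2.CUDA, cuda_gpu_id=gpu_id)


# Example: a call site such as
#   core.DeviceOption(caffe2_pb2.HIP if workspace.has_hip_support
#                     else caffe2_pb2.CUDA, 0)
# reduces to:
#   gpu_device_option(0)
```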
From 6423f9d942eca24fbf05525bf992a179aa5f8305 Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Mon, 10 Sep 2018 13:56:36 -0700 Subject: [PATCH 05/28] add thinlto --- .jenkins/caffe2/amd/build_amd.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.jenkins/caffe2/amd/build_amd.sh b/.jenkins/caffe2/amd/build_amd.sh index 5c5dd0a3af41d5..be9248e1385155 100755 --- a/.jenkins/caffe2/amd/build_amd.sh +++ b/.jenkins/caffe2/amd/build_amd.sh @@ -35,9 +35,11 @@ fi export LANG=C.UTF-8 export LC_ALL=C.UTF-8 export HCC_AMDGPU_TARGET=gfx900 +export KMTHINLTO=1 ########## HIPIFY Caffe2 operators ${PYTHON} "${ROOT_DIR}/tools/amd_build/build_caffe2_amd.py" +${PYTHON} "${ROOT_DIR}/tools/amd_build/build_pytorch_amd.py" MAX_JOBS=$(nproc) From 740e2e78feb681d48a6358e5dbdabcde7d1c2cfa Mon Sep 17 00:00:00 2001 From: Yaxun Sam Liu Date: Wed, 12 Sep 2018 16:14:34 -0400 Subject: [PATCH 06/28] cmake fixes --- caffe2/CMakeLists.txt | 4 ++-- cmake/Dependencies.cmake | 23 +++++++++++------------ 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 4e9b3eef8937f4..cc4d03da58fecf 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -360,7 +360,7 @@ if(USE_ROCM) hip_add_library(caffe2_hip ${Caffe2_HIP_SRCS}) # Since PyTorch files contain HIP headers, these flags are required for the necessary definitions to be added. - set_target_properties(caffe2_hip PROPERTIES COMPILE_FLAGS ${HIP_HIPCC_FLAGS}) + target_compile_options(caffe2_hip PRIVATE ${HIP_HIPCC_FLAGS}) target_link_libraries(caffe2_hip PUBLIC caffe2) target_link_libraries(caffe2_hip PUBLIC ${Caffe2_HIP_DEPENDENCY_LIBS}) @@ -536,7 +536,7 @@ if (BUILD_PYTHON) if(USE_ROCM) hip_add_library(caffe2_pybind11_state_hip MODULE ${Caffe2_HIP_PYTHON_SRCS}) set_target_properties(caffe2_pybind11_state_hip PROPERTIES LINKER_LANGUAGE HIP) - set_target_properties(caffe2_pybind11_state_hip PROPERTIES COMPILE_FLAGS "${HIP_HIPCC_FLAGS} -fvisibility=hidden") + target_compile_options(caffe2_pybind11_state_hip PRIVATE ${HIP_HIPCC_FLAGS} -fvisibility=hidden) set_target_properties(caffe2_pybind11_state_hip PROPERTIES PREFIX "") set_target_properties(caffe2_pybind11_state_hip PROPERTIES SUFFIX ${PY_EXT_SUFFIX}) if (APPLE) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index dbec0d07ebf6e3..aeb5a4a46b4d14 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -531,18 +531,17 @@ if(NOT BUILD_ATEN_MOBILE) message(INFO "Compiling with HIP for AMD.") caffe2_update_option(USE_ROCM ON) - set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -fPIC") - set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -D__HIP_PLATFORM_HCC__=1") - set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -DCUDA_HAS_FP16=1") - set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -D__HIP_NO_HALF_OPERATORS__=1") - set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -D__HIP_NO_HALF_CONVERSIONS__=1") - set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -Wno-macro-redefined") - set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -Wno-inconsistent-missing-override") - set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -Wno-exceptions") - set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -Wno-shift-count-negative") - set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -Wno-shift-count-overflow") - set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -Wno-unused-command-line-argument") - set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} -Wno-duplicate-decl-specifier") + list(APPEND HIP_HIPCC_FLAGS -fPIC) + list(APPEND HIP_HIPCC_FLAGS -D__HIP_PLATFORM_HCC__=1) + list(APPEND HIP_HIPCC_FLAGS -DCUDA_HAS_FP16=1) + list(APPEND HIP_HIPCC_FLAGS 
-D__HIP_NO_HALF_OPERATORS__=1) + list(APPEND HIP_HIPCC_FLAGS -D__HIP_NO_HALF_CONVERSIONS__=1) + list(APPEND HIP_HIPCC_FLAGS -Wno-macro-redefined) + list(APPEND HIP_HIPCC_FLAGS -Wno-inconsistent-missing-override) + list(APPEND HIP_HIPCC_FLAGS -Wno-exceptions) + list(APPEND HIP_HIPCC_FLAGS -Wno-shift-count-negative) + list(APPEND HIP_HIPCC_FLAGS -Wno-shift-count-overflow) + list(APPEND HIP_HIPCC_FLAGS -Wno-unused-command-line-argument) set(Caffe2_HIP_INCLUDES ${hip_INCLUDE_DIRS} ${hcc_INCLUDE_DIRS} ${hsa_INCLUDE_DIRS} ${rocrand_INCLUDE_DIRS} ${hiprand_INCLUDE_DIRS} ${rocblas_INCLUDE_DIRS} ${miopen_INCLUDE_DIRS} ${thrust_INCLUDE_DIRS} $ ${Caffe2_HIP_INCLUDES}) From 0d892b050a7883e08b0519a9b27f6db5a697ca41 Mon Sep 17 00:00:00 2001 From: Yaxun Sam Liu Date: Wed, 12 Sep 2018 16:12:36 -0400 Subject: [PATCH 07/28] Replace hc::__lane_id with __lane_id --- aten/src/THC/THCAsmUtils.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/aten/src/THC/THCAsmUtils.cuh b/aten/src/THC/THCAsmUtils.cuh index c419fface4c299..89e97d9783ef6f 100644 --- a/aten/src/THC/THCAsmUtils.cuh +++ b/aten/src/THC/THCAsmUtils.cuh @@ -86,7 +86,7 @@ struct Bitfield { __device__ __forceinline__ int getLaneId() { #if defined(__HIP_PLATFORM_HCC__) - return hc::__lane_id(); + return __lane_id(); #else int laneId; asm("mov.s32 %0, %laneid;" : "=r"(laneId) ); From 48e6c170214890f0d12307a1139ff79bee749962 Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Mon, 17 Sep 2018 14:17:10 -0700 Subject: [PATCH 08/28] update tests --- caffe2/python/operator_test/conv_test.py | 8 ++++---- caffe2/python/operator_test/group_conv_test.py | 9 +++++---- caffe2/python/operator_test/rnn_cell_test.py | 9 ++++++++- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/caffe2/python/operator_test/conv_test.py b/caffe2/python/operator_test/conv_test.py index bca8e327031a54..c4eb708e0de801 100644 --- a/caffe2/python/operator_test/conv_test.py +++ b/caffe2/python/operator_test/conv_test.py @@ -584,8 +584,8 @@ def test_convolution_sync(self, net_type, num_workers, engine, gc, dc): m.net.SquaredL2Distance(["0_0_flat", "label"], "xent") m.net.AveragedLoss("xent", "loss") input_to_grad = m.AddGradientOperators(["loss"]) - m.Proto().device_option.CopyFrom(do) - m.param_init_net.Proto().device_option.CopyFrom(do) + m.Proto().device_option.CopyFrom(gc) + m.param_init_net.Proto().device_option.CopyFrom(gc) m.Proto().type = net_type m.Proto().num_workers = num_workers self.ws.run(m.param_init_net) @@ -597,10 +597,10 @@ def run(): for input_blob in input_blobs: self.ws.create_blob(input_blob).feed( np.random.randn(n, d, h, w).astype(np.float32), - device_option=do) + device_option=gc) self.ws.create_blob("label").feed( np.random.randn(n, d * h * w).astype(np.float32), - device_option=do) + device_option=gc) self.ws.run(m.net) gradients = [ self.ws.blobs[str(input_to_grad[input_blob])].fetch() diff --git a/caffe2/python/operator_test/group_conv_test.py b/caffe2/python/operator_test/group_conv_test.py index fd4c5adf0075d6..082c0c20c46a62 100644 --- a/caffe2/python/operator_test/group_conv_test.py +++ b/caffe2/python/operator_test/group_conv_test.py @@ -7,7 +7,7 @@ import hypothesis.strategies as st from caffe2.proto import caffe2_pb2 -from caffe2.python import core +from caffe2.python import core, workspace import caffe2.python.hypothesis_test_util as hu import unittest @@ -38,7 +38,8 @@ def test_group_convolution( order, engine, use_bias, gc, dc): assume(size >= kernel) # TODO: Group conv in NHWC not implemented for GPU yet. 
- assume(group == 1 or order == "NCHW" or gc.device_type != caffe2_pb2.CUDA) + gpu_device_type = caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA + assume(group == 1 or order == "NCHW" or gc.device_type != gpu_device_type) input_channels = input_channels_per_group * group output_channels = output_channels_per_group * group @@ -65,8 +66,8 @@ def test_group_convolution( w = w.transpose((0, 3, 1, 2)) inputs = [X, w, b] if use_bias else [X, w] - - self.assertDeviceChecks(dc, op, inputs, [0]) + if order != 'NHWC' or group == 1: + self.assertDeviceChecks(dc, op, inputs, [0]) for i in range(len(inputs)): self.assertGradientChecks(gc, op, inputs, i, [0]) diff --git a/caffe2/python/operator_test/rnn_cell_test.py b/caffe2/python/operator_test/rnn_cell_test.py index 9d9bb38e178517..99e9ec17078843 100644 --- a/caffe2/python/operator_test/rnn_cell_test.py +++ b/caffe2/python/operator_test/rnn_cell_test.py @@ -1174,6 +1174,10 @@ def test_lstm_extract_predictor_net(self): shapes[b] = workspace.FetchBlob(b).shape # But export in CPU + if workspace.has_hip_support: + device = core.DeviceOption(caffe2_pb2.CPU, hip_gpu_id=1) + else: + device = core.DeviceOption(caffe2_pb2.CPU, cuda_gpu_id=1) (predict_net, export_blobs) = ExtractPredictorNet( net_proto=model.net.Proto(), input_blobs=["input"], @@ -1216,7 +1220,10 @@ def test_lstm_extract_predictor_net(self): if arg.name == "step_net": for step_op in arg.n.op: self.assertEqual(0, step_op.device_option.device_type) - self.assertEqual(1, step_op.device_option.cuda_gpu_id) + if workspace.has_hip_support: + self.assertEqual(1, step_op.device_option.hip_gpu_id) + else: + self.assertEqual(1, step_op.device_option.cuda_gpu_id) elif arg.name == 'backward_step_net': self.assertEqual(caffe2_pb2.NetDef(), arg.n) From 48371a673b1ea289e94a8949b14d0b55b8a199eb Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Mon, 17 Sep 2018 14:44:57 -0700 Subject: [PATCH 09/28] update optimizer.py --- caffe2/python/optimizer.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py index 1fd7801339535c..3b372f4372e949 100644 --- a/caffe2/python/optimizer.py +++ b/caffe2/python/optimizer.py @@ -210,6 +210,13 @@ def scale_learning_rate(self, *args, **kwargs): raise NotImplementedError( "Optimizer Need to Implement `scale_learning_rate` method.") + def create_lars_inputs(self, param_init_net, weight_decay, trust, lr_max): + wd = param_init_net.ConstantFill([], "weight_decay", + shape=[1], value=weight_decay) + trust = param_init_net.ConstantFill([], "trust", shape=[1], value=trust) + lr_max = param_init_net.ConstantFill([], "lr_max", shape=[1], + value=lr_max) + return wd, trust, lr_max class SgdOptimizer(Optimizer): def __init__(self, base_learning_rate=0.01, policy='fixed', @@ -239,10 +246,13 @@ def _run(self, net, param_init_net, param_info): if self.lars is not None and not isinstance(grad, core.GradientSlice): assert self.lars >= 0, ( 'Lars offset must be nonnegative, got {}'.format(self.lars)) + wd, trust, lr_max = self.create_lars_inputs( + param_init_net, 0.0, 1.0, np.finfo(np.float32).max) lr_lars_multiplier = net.Lars( - [param, grad], + [param, grad, wd, trust, lr_max], self.make_unique_blob_name(str(param) + "_lars"), - offset=self.lars) + offset=self.lars, + lr_min=0.0) current_scope = scope.CurrentDeviceScope() self._add_local_lr_multiplier( lr_lars_multiplier, @@ -526,10 +536,13 @@ def _run(self, net, param_init_net, param_info): if self.lars is not None and not 
isinstance(grad, core.GradientSlice): assert self.lars >= 0, ( 'Lars offset must be nonnegative, got {}'.format(self.lars)) + wd, trust, lr_max = self.create_lars_inputs( + param_init_net, 0.0, 1.0, np.finfo(np.float32).max) lr_lars_multiplier = net.Lars( - [param, grad], + [param, grad, wd, trust, lr_max], self.make_unique_blob_name(str(param) + "_lars"), - offset=self.lars) + offset=self.lars, + lr_min=0.0) current_scope = scope.CurrentDeviceScope() self._add_local_lr_multiplier( lr_lars_multiplier, @@ -660,10 +673,13 @@ def _run(self, net, param_init_net, param_info): if self.lars is not None and not isinstance(grad, core.GradientSlice): assert self.lars >= 0, ( 'Lars offset must be nonnegative, got {}'.format(self.lars)) + wd, trust, lr_max = self.create_lars_inputs( + param_init_net, 0.0, 1.0, np.finfo(np.float32).max) lr_lars_multiplier = net.Lars( - [param, grad], + [param, grad, wd, trust, lr_max], self.make_unique_blob_name(str(param) + "_lars"), - offset=self.lars) + offset=self.lars, + lr_min=0.0) current_scope = scope.CurrentDeviceScope() self._add_local_lr_multiplier( lr_lars_multiplier, From 2419ec5d71574582289ac96f48bb2f595de1acf5 Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Mon, 17 Sep 2018 17:15:32 -0700 Subject: [PATCH 10/28] add ignore tests --- .jenkins/caffe2/amd/binary_tests.sh | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/.jenkins/caffe2/amd/binary_tests.sh b/.jenkins/caffe2/amd/binary_tests.sh index 132cc9c5606be0..1476ce7a0663e2 100755 --- a/.jenkins/caffe2/amd/binary_tests.sh +++ b/.jenkins/caffe2/amd/binary_tests.sh @@ -11,6 +11,16 @@ echo "Running C++ tests.." for file in $(find "${ROOT_DIR}/build_caffe2/bin" -executable -type f); do if [[ "$file" =~ "test" ]]; then - $file + case "$file" in + # skip tests we know are hanging or bad + */mkl_utils_test|*/aten/integer_divider_test) + continue + ;; + */scalar_tensor_test|*/basic|*/native_test) + continue + ;; + *) + "$file" + esac fi done From 9154b6a3d0fe5ab4071caec4bcc3f615eee13f23 Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Wed, 19 Sep 2018 22:41:41 -0700 Subject: [PATCH 11/28] resolve merge conflict --- caffe2/python/operator_test/ceil_op_test.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/caffe2/python/operator_test/ceil_op_test.py b/caffe2/python/operator_test/ceil_op_test.py index 196e66fe0ec9ea..079e669370722f 100644 --- a/caffe2/python/operator_test/ceil_op_test.py +++ b/caffe2/python/operator_test/ceil_op_test.py @@ -3,13 +3,9 @@ from __future__ import print_function from __future__ import unicode_literals -<<<<<<< HEAD from caffe2.python import core, workspace -======= -from caffe2.python import core import caffe2.python.hypothesis_test_util as hu import caffe2.python.serialized_test.serialized_test_util as serial ->>>>>>> upstream/master from hypothesis import given import hypothesis.strategies as st import numpy as np From c2dc0b77c68adcd88b6d66b0e42e46edbce9d98f Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Fri, 21 Sep 2018 14:48:21 -0700 Subject: [PATCH 12/28] update docker images versions in the docs --- rocm-docs/caffe2-build.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rocm-docs/caffe2-build.md b/rocm-docs/caffe2-build.md index 0050a01b3c1c61..2bec136b774581 100644 --- a/rocm-docs/caffe2-build.md +++ b/rocm-docs/caffe2-build.md @@ -19,7 +19,7 @@ Once the machine is ready with ROCm stack, there are two ways to use caffe2 ## Launch docker container with caffe2 pre-installed ``` -docker run -it --network=host 
--device=/dev/kfd --device=/dev/dri --group-add video rocm/caffe2:rocm1.8.2-v3 +docker run -it --network=host --device=/dev/kfd --device=/dev/dri --group-add video rocm/caffe2:rocm1.9-v2 ``` To run benchmarks, skip directly to benchmarks section of the document. @@ -27,7 +27,7 @@ To run benchmarks, skip directly to benchmarks section of the document. ## Build Caffe2 from source ### Pull the docker image ``` -docker pull rocm/caffe2:unbuilt-rocm1.8.2-v3 +docker pull rocm/caffe2:unbuilt-rocm1.9-v1 ``` This docker image has all the dependencies for caffe2 pre-installed. From daced6b05956be30bdd8a08a33dc2d70e60d98ae Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Mon, 24 Sep 2018 13:34:52 -0700 Subject: [PATCH 13/28] fix bug in rnn_cell test --- caffe2/python/operator_test/rnn_cell_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/caffe2/python/operator_test/rnn_cell_test.py b/caffe2/python/operator_test/rnn_cell_test.py index 99e9ec17078843..8804dc3bbdbcbc 100644 --- a/caffe2/python/operator_test/rnn_cell_test.py +++ b/caffe2/python/operator_test/rnn_cell_test.py @@ -1182,7 +1182,7 @@ def test_lstm_extract_predictor_net(self): net_proto=model.net.Proto(), input_blobs=["input"], output_blobs=[output], - device=core.DeviceOption(caffe2_pb2.CPU, 1), + device=device, ) # Create the net and run once to see it is valid From b1b216e78cc840dbde10a91806aa092a025f009d Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Mon, 24 Sep 2018 13:35:11 -0700 Subject: [PATCH 14/28] exclude fp16 for hip --- caffe2/python/operator_test/boolean_mask_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/caffe2/python/operator_test/boolean_mask_test.py b/caffe2/python/operator_test/boolean_mask_test.py index dda13c76f636be..695cb3818deb5f 100644 --- a/caffe2/python/operator_test/boolean_mask_test.py +++ b/caffe2/python/operator_test/boolean_mask_test.py @@ -47,10 +47,10 @@ def ref(x, mask): @staticmethod def _dtype_conversion(x, dtype, gc, dc): - """SequenceMask only supports fp16 with CUDA/HIP.""" + """SequenceMask only supports fp16 with CUDA.""" if dtype == np.float16: - assume(gc.device_type == caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA) - dc = [d for d in dc if d.device_type == (caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA)] + assume(gc.device_type == caffe2_pb2.CUDA) + dc = [d for d in dc if d.device_type == caffe2_pb2.CUDA] x = x.astype(dtype) return x, dc From 3309875d97de7274afccadd6e8ccd406badcc2ce Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Tue, 25 Sep 2018 11:32:54 -0700 Subject: [PATCH 15/28] add correct conditioning in workspace.py --- caffe2/python/workspace.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/caffe2/python/workspace.py b/caffe2/python/workspace.py index 9e69bace2e09c4..a1e7bf40ae7971 100644 --- a/caffe2/python/workspace.py +++ b/caffe2/python/workspace.py @@ -57,7 +57,6 @@ def GetCudaPeerAccessPattern(): GetCuDNNVersion = lambda: 0 # noqa GetCuDNNVersion = lambda: 0 # noqa GetCudaPeerAccessPattern = lambda: np.array([]) # noqa - GetDeviceProperties = lambda x: None # noqa if has_hip_support: NumHipDevices = C.num_hip_devices @@ -69,8 +68,10 @@ def GetHipPeerAccessPattern(): else: NumHipDevices = lambda: 0 # noqa GetHipPeerAccessPattern = lambda: np.array([]) # noqa - GetDeviceProperties = lambda x: None # noqa +if not has_gpu_support and not has_hip_support: + GetDeviceProperties = lambda x: None # noqa + IsNUMAEnabled = C.is_numa_enabled GetNumNUMANodes = 
C.get_num_numa_nodes GetBlobNUMANode = C.get_blob_numa_node From 413e94085494d94ea315d17ad1e65a3c7f48edd3 Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Tue, 25 Sep 2018 16:46:56 -0700 Subject: [PATCH 16/28] fix device options in hypothesis_test_util --- caffe2/python/hypothesis_test_util.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/caffe2/python/hypothesis_test_util.py b/caffe2/python/hypothesis_test_util.py index 69ee82a932856d..af8d7f6a561146 100644 --- a/caffe2/python/hypothesis_test_util.py +++ b/caffe2/python/hypothesis_test_util.py @@ -256,6 +256,7 @@ def tensors1d(n, min_len=1, max_len=64, dtype=np.float32, elements=None): # temporarily skip some flaky tests on ROCM before it's getting more mature. _device_options_no_hip = [cpu_do] + ([gpu_do] if workspace.has_gpu_support else []) device_options = _device_options_no_hip + ([hip_do] if workspace.has_hip_support else []) + _device_options_hip_or_gpu = [hip_do] if workspace.has_hip_support else [gpu_do] # Include device option for each GPU expanded_device_options = [cpu_do] + ( @@ -278,7 +279,8 @@ def gradient_checker_device_option(): ) gcs_cpu_only = dict(gc=st.sampled_from([cpu_do]), dc=st.just([cpu_do])) -gcs_gpu_only = dict(gc=st.sampled_from([gpu_do]+[hip_do]), dc=st.just([gpu_do]+[hip_do])) +gcs_gpu_only = dict(gc=st.sampled_from([gpu_do]), dc=st.just([gpu_do])) +gcs_gpu_or_hip_only = dict(gc=st.sampled_from(_device_options_hip_or_gpu), dc=st.just(_device_options_hip_or_gpu)) gcs_no_hip = dict(gc=st.sampled_from(_device_options_no_hip), dc=st.just(_device_options_no_hip)) From 30cfc8e5516d9278ef2c9ff779fb852952ccc53a Mon Sep 17 00:00:00 2001 From: Ashish Date: Tue, 25 Sep 2018 16:49:35 -0700 Subject: [PATCH 17/28] Fixed a typo --- caffe2/python/operator_test/conv_test.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/caffe2/python/operator_test/conv_test.py b/caffe2/python/operator_test/conv_test.py index 071ab68ebd0e58..a3cb4e3f05749e 100644 --- a/caffe2/python/operator_test/conv_test.py +++ b/caffe2/python/operator_test/conv_test.py @@ -226,9 +226,9 @@ def test_convolution_gradients( nhwc=(order == 'NHWC'), backward=True)) if engine == 'MIOPEN': - assume(_cudnn_supports(dilation=(dilation > 1), - nhwc=(order == 'NHWC'), - backward=True)) + assume(_miopen_supports(dilation=(dilation > 1), + nhwc=(order == 'NHWC'), + backward=True)) assume(engine != "MKLDNN" or use_bias is True) From 402461b8396a5a6ce599da234f5fcae6f5c4734b Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Tue, 25 Sep 2018 17:28:00 -0700 Subject: [PATCH 18/28] fix typo --- caffe2/python/hypothesis_test_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/caffe2/python/hypothesis_test_util.py b/caffe2/python/hypothesis_test_util.py index af8d7f6a561146..04700cec76f313 100644 --- a/caffe2/python/hypothesis_test_util.py +++ b/caffe2/python/hypothesis_test_util.py @@ -256,7 +256,7 @@ def tensors1d(n, min_len=1, max_len=64, dtype=np.float32, elements=None): # temporarily skip some flaky tests on ROCM before it's getting more mature. 
_device_options_no_hip = [cpu_do] + ([gpu_do] if workspace.has_gpu_support else []) device_options = _device_options_no_hip + ([hip_do] if workspace.has_hip_support else []) - _device_options_hip_or_gpu = [hip_do] if workspace.has_hip_support else [gpu_do] +_device_options_hip_or_gpu = [hip_do] if workspace.has_hip_support else [gpu_do] # Include device option for each GPU expanded_device_options = [cpu_do] + ( From 4e6de586497b9cbd136dceb5d26e7bfb57ac005f Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Wed, 26 Sep 2018 13:08:53 -0700 Subject: [PATCH 19/28] modify resnet benchmarking script --- caffe2/python/convnet_benchmarks.py | 128 ++++++++++++++++++---------- 1 file changed, 85 insertions(+), 43 deletions(-) diff --git a/caffe2/python/convnet_benchmarks.py b/caffe2/python/convnet_benchmarks.py index 6b8af12f5cec71..fe9420902ad928 100644 --- a/caffe2/python/convnet_benchmarks.py +++ b/caffe2/python/convnet_benchmarks.py @@ -61,8 +61,10 @@ import argparse import os -from caffe2.python import workspace, brew, model_helper, core +from caffe2.python import workspace, brew, model_helper, core, net_drawer, memonger from caffe2.proto import caffe2_pb2 +from caffe2.python import data_parallel_model as dpm +from caffe2.python.models import resnet def MLP(order, cudnn_ws, model_path=""): model = model_helper.ModelHelper(name="MLP") @@ -563,45 +565,81 @@ def Inception(order, cudnn_ws, model_path=""): model.net.AveragedLoss(xent, "loss") return model, 224 -def Resnet50(order, cudnn_ws, model_path=""): - if model_path == "": - print("ERROR: please specify paths to init_net and predict_net protobufs for Resnet50") - exit(1) - device_opts = caffe2_pb2.DeviceOption() - device_opts.device_type = caffe2_pb2.HIP - device_opts.hip_gpu_id = 0 - INIT_NET_PB = os.path.join(model_path, "init_net.pb") - PREDICT_NET_PB = os.path.join(model_path, "predict_net.pb") - init_def = caffe2_pb2.NetDef() - with open(INIT_NET_PB, 'rb') as f: - init_def.ParseFromString(f.read()) - init_def.device_option.CopyFrom(device_opts) +def Resnet50(args): + gpus = [0] + device_opt = core.DeviceOption(caffe2_pb2.HIP) + device_opt.hip_gpu_id = gpus[0] + + # Batch size of 32 sums up to roughly 5GB of memory per device + batch_per_device = args.batch_size + total_batch_size = batch_per_device * len(gpus) + + num_labels = 1000 + + base_learning_rate = 0.0004 * total_batch_size + + # Weight decay (L2 regularization) + weight_decay = 1e-4 + + num_epochs = 10 + + ################## + # Define the Model + ################## + train_model = model_helper.ModelHelper(name="resnet50_train") + + def create_resnet50_model_ops(model, loss_scale=1.0): + # residual network + [softmax, loss] = resnet.create_resnet50(model, + "data", + num_input_channels=3, + num_labels=num_labels, + label="label", ) + prefix = model.net.Proto().name + loss = model.net.Scale(loss, prefix + "_loss", scale=loss_scale) + brew.accuracy(model, [softmax, "label"], prefix + "_accuracy") + return [loss] + + def add_parameter_update_ops(model): + brew.add_weight_decay(model, weight_decay) + iter = brew.iter(model, "iter") + lr = model.net.LearningRate([iter], + "lr", + base_lr=base_learning_rate, + policy="fixed", + gamma=0.1) + for param in model.GetParams(): + param_grad = model.param_to_grad[param] + param_momentum = model.param_init_net.ConstantFill( + [param], param + '_momentum', value=0.0 ) + + model.net.MomentumSGDUpdate( + [param_grad, param_momentum, lr, param], + [param_grad, param_momentum, param], + momentum=0.9, + nesterov=1) + + def 
optimize_gradient_memory(model, loss): + model.net._net = memonger.share_grad_blobs( + model.net, + loss, + set(model.param_to_grad.values()), + namescope="", + share_activations=False) + + + with core.NameScope(""): + with core.DeviceScope(device_opt): + losses = create_resnet50_model_ops(train_model) + if not args.forward_only: + blobs_to_gradients = train_model.AddGradientOperators(losses) + add_parameter_update_ops(train_model) + if not args.forward_only: + optimize_gradient_memory(train_model, [blobs_to_gradients[losses[0]]]) + + return train_model, 224 - net_def = caffe2_pb2.NetDef() - with open(PREDICT_NET_PB, 'rb') as f: - net_def.ParseFromString(f.read()) - net_def.device_option.CopyFrom(device_opts) - - init_net = core.Net(init_def) - predict_net = core.Net(net_def) - for op in init_net.Proto().op: - op.device_option.CopyFrom(device_opts) - for op in predict_net.Proto().op: - op.device_option.CopyFrom(device_opts) - my_arg_scope = { - 'order': order, - } - model = model_helper.ModelHelper( - name="resnet50", - arg_scope=my_arg_scope, - ) - - model.param_init_net = init_net - model.net = predict_net - xent = model.net.LabelCrossEntropy(["gpu_0/softmax", "label"], "xent") - model.net.AveragedLoss(xent, "loss") - return model, 224 def Inception_v2(order, cudnn_ws, model_path=""): if model_path == "": @@ -654,7 +692,10 @@ def AddParameterUpdate(model): def Benchmark(model_gen, arg): - model, input_size = model_gen(arg.order, arg.cudnn_ws, arg.model_path) + if arg.model == 'Resnet50': + model, input_size = model_gen(arg) + else: + model, input_size = model_gen(arg.order, arg.cudnn_ws, arg.model_path) model.Proto().type = arg.net_type model.Proto().num_workers = arg.num_workers @@ -669,7 +710,7 @@ def Benchmark(model_gen, arg): model.param_init_net.GaussianFill( [], - "gpu_0/data" if arg.model == "Resnet50" else "data", + "data", shape=input_shape, mean=0.0, std=1.0 @@ -686,8 +727,9 @@ def Benchmark(model_gen, arg): print('{}: running forward only.'.format(arg.model)) else: print('{}: running forward-backward.'.format(arg.model)) - model.AddGradientOperators(["loss"]) - AddParameterUpdate(model) + if not arg.model == "Resnet50": + model.AddGradientOperators(["loss"]) + AddParameterUpdate(model) if arg.order == 'NHWC': print( '==WARNING==\n' @@ -805,6 +847,6 @@ def GetArgumentParser(): 'MLP': MLP, 'Resnet50': Resnet50, 'Inception_v2':Inception_v2 - } Benchmark(model_map[args.model], args) + From 98ba4a1fd85a4564431ed5acb64971fd71fd415f Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Wed, 26 Sep 2018 13:10:56 -0700 Subject: [PATCH 20/28] filter group > 1 in conv_test --- caffe2/python/operator_test/conv_test.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/caffe2/python/operator_test/conv_test.py b/caffe2/python/operator_test/conv_test.py index 071ab68ebd0e58..bca477c0278958 100644 --- a/caffe2/python/operator_test/conv_test.py +++ b/caffe2/python/operator_test/conv_test.py @@ -45,9 +45,10 @@ def _miopen_supports( dilation=False, nhwc=False, backward=False, + group=False, ): """Return True if MIOPEN supports this configuration.""" - if nhwc or dilation: + if nhwc or dilation or group: return False return True @@ -226,9 +227,10 @@ def test_convolution_gradients( nhwc=(order == 'NHWC'), backward=True)) if engine == 'MIOPEN': - assume(_cudnn_supports(dilation=(dilation > 1), + assume(_miopen_supports(dilation=(dilation > 1), nhwc=(order == 'NHWC'), - backward=True)) + backward=True, + group=(group > 1))) assume(engine != "MKLDNN" or use_bias is True) From 
83b85e0d9d202b2531d61d71dbff64076d377d57 Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Wed, 26 Sep 2018 13:43:25 -0700 Subject: [PATCH 21/28] fix whitespaces Dependencies.cmake --- cmake/Dependencies.cmake | 1 - 1 file changed, 1 deletion(-) diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 7498b4540915a7..869a563d05a27a 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -548,7 +548,6 @@ if(NOT BUILD_ATEN_MOBILE) list(APPEND HIP_HIPCC_FLAGS -Wno-shift-count-negative) list(APPEND HIP_HIPCC_FLAGS -Wno-shift-count-overflow) list(APPEND HIP_HIPCC_FLAGS -Wno-unused-command-line-argument) - list(APPEND HIP_HIPCC_FLAGS -Wno-duplicate-decl-specifier) if(CMAKE_BUILD_TYPE MATCHES Debug) From 775d102613944e68c3d1a021d712500ea1ca8ca6 Mon Sep 17 00:00:00 2001 From: Ashish Date: Wed, 26 Sep 2018 13:56:31 -0700 Subject: [PATCH 22/28] Using MIOpen group conv API --- caffe2/operators/hip/conv_op_miopen.cc | 536 ++++++------------------- 1 file changed, 128 insertions(+), 408 deletions(-) diff --git a/caffe2/operators/hip/conv_op_miopen.cc b/caffe2/operators/hip/conv_op_miopen.cc index 5b9df03dc78953..13f2428bf9daa2 100644 --- a/caffe2/operators/hip/conv_op_miopen.cc +++ b/caffe2/operators/hip/conv_op_miopen.cc @@ -48,7 +48,11 @@ class MIOPENConvOpBase : public ConvPoolOpBase { if ((operator_def.type().substr(0, 6) == "Conv") || (operator_def.type().substr(0, 14) == "ConvGradient")) { - mode_ = miopenConvolution; + if(group_ > 1) { + mode_ = miopenGroupConv; + } else{ + mode_ = miopenConvolution; + } } else if ( (operator_def.type().substr(0, 7) == "Trans") || (operator_def.type().substr(0, 15) == "TransGradient")) { @@ -57,6 +61,12 @@ class MIOPENConvOpBase : public ConvPoolOpBase { LOG(FATAL) << "Unsupported convolution method: " << operator_def.type(); } + if(mode_ == miopenGroupConv) { + OPERATOR_NEEDS_FEATURE( + dilation_h() == 1 && dilation_w() == 1, + "MIOpen convolution does not support dilation for groups > 1."); + } + MIOPEN_ENFORCE(miopenInitConvolutionDescriptor( conv_desc_, mode_, @@ -66,6 +76,9 @@ class MIOPENConvOpBase : public ConvPoolOpBase { stride_w(), dilation_h(), dilation_w())); + + MIOPEN_ENFORCE(miopenSetConvolutionGroupCount( + conv_desc_, group_)); } ~MIOPENConvOpBase() { @@ -155,9 +168,6 @@ class MIOPENConvGradientOp final : public MIOPENConvOpBase { bwdDataWsSize_(0), bwdWeiAlgo_(miopenConvolutionBwdWeightsAlgoGEMM), bwdDataAlgo_(miopenConvolutionBwdDataAlgoGEMM) { - OPERATOR_NEEDS_FEATURE( - group_ == 1, - "Group convolution not supported yet for MIOpen ConvGradient."); CAFFE_ENFORCE( !(no_bias_ && OutputSize() == 3), "If bias is not present, you should not have 3 grad output."); @@ -247,186 +257,70 @@ bool MIOPENConvOp::DoRunWithType() { "If you set group, the number of output channels should be divisible " "by group."); - if (group_ > 1) { - int group_offset_filter = Weight.size() / group_; - - MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - weight_desc_, - miopenTypeWrapper::type, - M / group_, - C / group_, - kernel_h(), - kernel_w())); - - MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - bottom_desc_, miopenTypeWrapper::type, N, C / group_, H, W)); - - MIOPEN_ENFORCE(miopenGetConvolutionForwardOutputDim( - conv_desc_, - bottom_desc_, - weight_desc_, - &N_out, - &C_out, - &H_out, - &W_out)); - - MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - top_desc_, miopenTypeWrapper::type, N_out, C_out, H_out, W_out)); - - if (InputSize() == 3) { - MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - bias_desc_, miopenTypeWrapper::type, 1, Y->dim32(1), 1, 
1)); + MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( + bottom_desc_, miopenTypeWrapper::type, N, C, H, W)); + + MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( + weight_desc_, + miopenTypeWrapper::type, + M, + C / group_, + kernel_h(), + kernel_w())); + + MIOPEN_ENFORCE(miopenGetConvolutionForwardOutputDim( + conv_desc_, + bottom_desc_, + weight_desc_, + &N_out, + &C_out, + &H_out, + &W_out)); + + MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( + top_desc_, miopenTypeWrapper::type, N_out, C_out, H_out, W_out)); + + if (InputSize() == 3) { MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - top_desc_for_bias_, - miopenTypeWrapper::type, - Y->dim32(0), - Y->dim32(1), - H_out, - W_out)); - } - - int group_offset_X = C / group_ * H * W * D; - int group_offset_Y = M / group_ * H_out * W_out * D_out; + bias_desc_, miopenTypeWrapper::type, 1, M, 1, 1)); + } - while (!bestAlgoFound_) { + while (!bestAlgoFound_) { miopenConvAlgoPerf_t perf; MIOPEN_ENFORCE(miopenConvolutionForwardGetWorkSpaceSize( - miopen_wrapper_.inline_miopen_handle(), - weight_desc_, - bottom_desc_, - conv_desc_, - top_desc_, - &fwdConvWsSize_)); - if ((fwdConvWsSize_ > 0) && (fwdConvWs_ == nullptr)) { - HIP_CHECK(hipMalloc(&fwdConvWs_, fwdConvWsSize_)); - } - - miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { - MIOPEN_ENFORCE(miopenFindConvolutionForwardAlgorithm( - state->miopen_handle(), - bottom_desc_, - X.template data(), + miopen_wrapper_.inline_miopen_handle(), weight_desc_, - Weight.template data(), - conv_desc_, - top_desc_, - Y->template mutable_data(), - requestAlgoCount_, - &returnedAlgoCount_, - &perf, - fwdConvWs_, - fwdConvWsSize_, - false)); - }); - bestAlgoFound_ = true; - fwdAlgo_ = perf.fwd_algo; - } - - for (int g = 0; g < group_; g++) { - miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { - MIOPEN_ENFORCE(miopenConvolutionForward( - state->miopen_handle(), - &alpha_, bottom_desc_, - X.template data() + g * group_offset_X, - weight_desc_, - Weight.template data() + g * group_offset_filter, conv_desc_, - fwdAlgo_, - &beta_, top_desc_, - Y->template mutable_data() + g * group_offset_Y, - fwdConvWs_, - fwdConvWsSize_)); - }); - } - hipDeviceSynchronize(); - - // BIAS - if (InputSize() == 3) { - auto& bias = Input(BIAS); - - CAFFE_ENFORCE_EQ(bias.ndim(), 1); - CAFFE_ENFORCE_EQ(bias.dim32(0), M); - miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { - MIOPEN_ENFORCE(miopenConvolutionForwardBias( - state->miopen_handle(), - &alpha_, - bias_desc_, - bias.template data(), - &beta_, - top_desc_for_bias_, - Y->template mutable_data())); - }); - } - - hipDeviceSynchronize(); - } else { - MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - weight_desc_, - miopenTypeWrapper::type, - M, - C, - kernel_h(), - kernel_w())); - - MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - bottom_desc_, miopenTypeWrapper::type, N, C, H, W)); - - MIOPEN_ENFORCE(miopenGetConvolutionForwardOutputDim( - conv_desc_, - bottom_desc_, - weight_desc_, - &N_out, - &C_out, - &H_out, - &W_out)); - - MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - top_desc_, miopenTypeWrapper::type, N_out, C_out, H_out, W_out)); - - if (InputSize() == 3) { - MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - bias_desc_, miopenTypeWrapper::type, 1, C_out, 1, 1)); - } - - while (!bestAlgoFound_) { - miopenConvAlgoPerf_t perf; - - MIOPEN_ENFORCE(miopenConvolutionForwardGetWorkSpaceSize( - miopen_wrapper_.inline_miopen_handle(), - weight_desc_, - bottom_desc_, - conv_desc_, - top_desc_, - &fwdConvWsSize_)); - + &fwdConvWsSize_)); if 
((fwdConvWsSize_ > 0) && (fwdConvWs_ == nullptr)) { - HIP_CHECK(hipMalloc(&fwdConvWs_, fwdConvWsSize_)); + HIP_CHECK(hipMalloc(&fwdConvWs_, fwdConvWsSize_)); } miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { - MIOPEN_ENFORCE(miopenFindConvolutionForwardAlgorithm( - state->miopen_handle(), - bottom_desc_, - X.template data(), - weight_desc_, - Weight.template data(), - conv_desc_, - top_desc_, - Y->template mutable_data(), - requestAlgoCount_, - &returnedAlgoCount_, - &perf, - fwdConvWs_, - fwdConvWsSize_, - false)); + MIOPEN_ENFORCE(miopenFindConvolutionForwardAlgorithm( + state->miopen_handle(), + bottom_desc_, + X.template data(), + weight_desc_, + Weight.template data(), + conv_desc_, + top_desc_, + Y->template mutable_data(), + requestAlgoCount_, + &returnedAlgoCount_, + &perf, + fwdConvWs_, + fwdConvWsSize_, + false)); }); bestAlgoFound_ = true; fwdAlgo_ = perf.fwd_algo; - } - miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { + } + + miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { MIOPEN_ENFORCE(miopenConvolutionForward( state->miopen_handle(), &alpha_, @@ -441,10 +335,9 @@ bool MIOPENConvOp::DoRunWithType() { Y->template mutable_data(), fwdConvWs_, fwdConvWsSize_)); - }); + }); - // BIAS - if (InputSize() == 3) { + if (InputSize() == 3) { auto& bias = Input(BIAS); CAFFE_ENFORCE_EQ(bias.ndim(), 1); @@ -459,13 +352,10 @@ bool MIOPENConvOp::DoRunWithType() { top_desc_, Y->template mutable_data())); }); - } - - hipDeviceSynchronize(); } - return true; } + // TODO : enable fp16 support. bool MIOPENConvOp::RunOnDevice() { if (Input(0).IsType()) { @@ -535,215 +425,46 @@ bool MIOPENConvGradientOp::DoRunWithType() { bool doBwdDataComputation = (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2))); - if (group_ > 1) { - int group_offset_filter = Weight.size() / group_; - MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - weight_desc_, - miopenTypeWrapper::type, - M / group_, - C / group_, - kernel_h(), - kernel_w())); - - MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - bottom_desc_, miopenTypeWrapper::type, N, C / group_, H, W)); - - MIOPEN_ENFORCE(miopenGetConvolutionForwardOutputDim( - conv_desc_, - bottom_desc_, - weight_desc_, - &N_out, - &C_out, - &H_out, - &W_out)); - - MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - top_desc_, miopenTypeWrapper::type, N_out, C_out, H_out, W_out)); - - if (!no_bias_) { + MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( + bottom_desc_, miopenTypeWrapper::type, N, C, H, W)); + + MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( + weight_desc_, + miopenTypeWrapper::type, + M, + C / group_, + kernel_h(), + kernel_w())); + + MIOPEN_ENFORCE(miopenGetConvolutionForwardOutputDim( + conv_desc_, + bottom_desc_, + weight_desc_, + &N_out, + &C_out, + &H_out, + &W_out)); + + MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( + top_desc_, miopenTypeWrapper::type, N_out, C_out, H_out, W_out)); + + if (!no_bias_) { MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( bias_desc_, miopenTypeWrapper::type, 1, M, 1, 1)); - MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - top_desc_for_bias_, - miopenTypeWrapper::type, - dY.dim32(0), - M, - H_out, - W_out)); - } - - int group_offset_X = C / group_ * H * W * D; - int group_offset_Y = M / group_ * H_out * W_out * D_out; + } - while ((!bestDataAlgoFound_) && doBwdDataComputation) { + while ((!bestDataAlgoFound_) && doBwdDataComputation) { miopenConvAlgoPerf_t perf; MIOPEN_ENFORCE(miopenConvolutionBackwardDataGetWorkSpaceSize( - miopen_wrapper_.inline_miopen_handle(), - top_desc_, - 
weight_desc_, - conv_desc_, - bottom_desc_, - &bwdDataWsSize_)); - if ((bwdDataWsSize_ > 0) && (bwdDataWs_ == nullptr)) { - HIP_CHECK(hipMalloc(&bwdDataWs_, bwdDataWsSize_)); - } - - miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { - MIOPEN_ENFORCE(miopenFindConvolutionBackwardDataAlgorithm( - state->miopen_handle(), + miopen_wrapper_.inline_miopen_handle(), top_desc_, - dY.template data(), weight_desc_, - Weight.template data(), - conv_desc_, - bottom_desc_, - dX->template mutable_data(), - requestAlgoCount_, - &returnedAlgoCount_, - &perf, - bwdDataWs_, - bwdDataWsSize_, - false)); - }); - - bestDataAlgoFound_ = true; - bwdDataAlgo_ = perf.bwd_data_algo; - } - - while (!bestWeightAlgoFound_) { - miopenConvAlgoPerf_t perf; - - MIOPEN_ENFORCE(miopenConvolutionBackwardWeightsGetWorkSpaceSize( - miopen_wrapper_.inline_miopen_handle(), - top_desc_, - bottom_desc_, - conv_desc_, - weight_desc_, - &bwdWeightWsSize_)); - if ((bwdWeightWsSize_ > 0) && (bwdWeightWs_ == nullptr)) { - HIP_CHECK(hipMalloc(&bwdWeightWs_, bwdWeightWsSize_)); - } - - miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { - MIOPEN_ENFORCE(miopenFindConvolutionBackwardWeightsAlgorithm( - state->miopen_handle(), - top_desc_, - dY.template data(), - bottom_desc_, - X.template data(), conv_desc_, - weight_desc_, - dW->template mutable_data(), - requestAlgoCount_, - &returnedAlgoCount_, - &perf, - bwdWeightWs_, - bwdWeightWsSize_, - false)); - }); - bestWeightAlgoFound_ = true; - bwdWeiAlgo_ = perf.bwd_weights_algo; - } - - for (int g = 0; g < group_; g++) { - if (doBwdDataComputation) { - miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { - MIOPEN_ENFORCE(miopenConvolutionBackwardData( - state->miopen_handle(), - &alpha_, - top_desc_, - dY.template data() + g * group_offset_Y, - weight_desc_, - Weight.template data() + g * group_offset_filter, - conv_desc_, - bwdDataAlgo_, - &beta_, - bottom_desc_, - dX->template mutable_data() + g * group_offset_X, - bwdDataWs_, - bwdDataWsSize_)); - }); - } - - miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { - MIOPEN_ENFORCE(miopenConvolutionBackwardWeights( - state->miopen_handle(), - &alpha_, - top_desc_, - dY.template data() + g * group_offset_Y, bottom_desc_, - X.template data() + g * group_offset_X, - conv_desc_, - bwdWeiAlgo_, - &beta_, - weight_desc_, - dW->template mutable_data() + g * group_offset_filter, - bwdWeightWs_, - bwdWeightWsSize_)); - }); - } - - // Synchronize the work across groups. 
- hipDeviceSynchronize(); - - ////////////////////////////////////// BIAS /////////////////////////// - if (!no_bias_) { - auto* dbias = Output(BIAS_OR_INPUT_GRAD); - dbias->Resize(M); - miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { - MIOPEN_ENFORCE(miopenConvolutionBackwardBias( - state->miopen_handle(), - &alpha_, - top_desc_for_bias_, - dY.template data(), - &beta_, - bias_desc_, - dbias->template mutable_data())); - }); - } - } else // No group - { - MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - weight_desc_, - miopenTypeWrapper::type, - M, - C, - kernel_h(), - kernel_w())); - - MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - bottom_desc_, miopenTypeWrapper::type, N, C, H, W)); - - MIOPEN_ENFORCE(miopenGetConvolutionForwardOutputDim( - conv_desc_, - bottom_desc_, - weight_desc_, - &N_out, - &C_out, - &H_out, - &W_out)); - - MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - top_desc_, miopenTypeWrapper::type, N_out, C_out, H_out, W_out)); - - if (!no_bias_) { - MIOPEN_ENFORCE(miopenSet4dTensorDescriptor( - bias_desc_, miopenTypeWrapper::type, 1, M, 1, 1)); - } - - while ((!bestDataAlgoFound_) && doBwdDataComputation) { - miopenConvAlgoPerf_t perf; - - MIOPEN_ENFORCE(miopenConvolutionBackwardDataGetWorkSpaceSize( - miopen_wrapper_.inline_miopen_handle(), - top_desc_, - weight_desc_, - conv_desc_, - bottom_desc_, - &bwdDataWsSize_)); + &bwdDataWsSize_)); if ((bwdDataWsSize_ > 0) && (bwdDataWs_ == nullptr)) { - HIP_CHECK(hipMalloc(&bwdDataWs_, bwdDataWsSize_)); + HIP_CHECK(hipMalloc(&bwdDataWs_, bwdDataWsSize_)); } miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { @@ -766,22 +487,22 @@ bool MIOPENConvGradientOp::DoRunWithType() { bestDataAlgoFound_ = true; bwdDataAlgo_ = perf.bwd_data_algo; - } + } - while (!bestWeightAlgoFound_) { + while (!bestWeightAlgoFound_) { miopenConvAlgoPerf_t perf; MIOPEN_ENFORCE(miopenConvolutionBackwardWeightsGetWorkSpaceSize( - miopen_wrapper_.inline_miopen_handle(), - top_desc_, - bottom_desc_, - conv_desc_, - weight_desc_, - &bwdWeightWsSize_)); + miopen_wrapper_.inline_miopen_handle(), + top_desc_, + bottom_desc_, + conv_desc_, + weight_desc_, + &bwdWeightWsSize_)); if ((bwdWeightWsSize_ > 0) && (bwdWeightWs_ == nullptr)) { HIP_CHECK(hipMalloc(&bwdWeightWs_, bwdWeightWsSize_)); } - + miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { MIOPEN_ENFORCE(miopenFindConvolutionBackwardWeightsAlgorithm( state->miopen_handle(), @@ -801,9 +522,9 @@ bool MIOPENConvGradientOp::DoRunWithType() { }); bestWeightAlgoFound_ = true; bwdWeiAlgo_ = perf.bwd_weights_algo; - } + } - if (doBwdDataComputation) { + if (doBwdDataComputation) { miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { MIOPEN_ENFORCE(miopenConvolutionBackwardData( state->miopen_handle(), @@ -820,30 +541,30 @@ bool MIOPENConvGradientOp::DoRunWithType() { bwdDataWs_, bwdDataWsSize_)); }); - } - - miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { - MIOPEN_ENFORCE(miopenConvolutionBackwardWeights( - state->miopen_handle(), - &alpha_, - top_desc_, - dY.template data(), - bottom_desc_, - X.template data(), - conv_desc_, - bwdWeiAlgo_, - &beta_, - weight_desc_, - dW->template mutable_data(), - bwdWeightWs_, - bwdWeightWsSize_)); - }); - - // Synchronize the work across groups. 
- hipDeviceSynchronize(); + } - ////////////////////////////////////// BIAS /////////////////////////// - if (!no_bias_) { + miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { + MIOPEN_ENFORCE(miopenConvolutionBackwardWeights( + state->miopen_handle(), + &alpha_, + top_desc_, + dY.template data(), + bottom_desc_, + X.template data(), + conv_desc_, + bwdWeiAlgo_, + &beta_, + weight_desc_, + dW->template mutable_data(), + bwdWeightWs_, + bwdWeightWsSize_)); + }); + + // Synchronize the work across groups. + hipDeviceSynchronize(); + + ////////////////////////////////////// BIAS /////////////////////////// + if (!no_bias_) { auto* dbias = Output(BIAS_OR_INPUT_GRAD); dbias->Resize(M); miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) { @@ -856,7 +577,6 @@ bool MIOPENConvGradientOp::DoRunWithType() { bias_desc_, dbias->template mutable_data())); }); - } } return true; From 0c8b222bacb9d6f1e64ac8e1a48d7c25bcf38e27 Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Wed, 26 Sep 2018 14:06:58 -0700 Subject: [PATCH 23/28] address comments, whitespaces --- caffe2/python/core.py | 2 +- caffe2/python/data_parallel_model.py | 3 +++ caffe2/python/operator_test/activation_ops_test.py | 2 +- caffe2/python/optimizer.py | 5 ++++- 4 files changed, 9 insertions(+), 3 deletions(-) diff --git a/caffe2/python/core.py b/caffe2/python/core.py index 75e7ccc3f53630..f08d72347c15b8 100644 --- a/caffe2/python/core.py +++ b/caffe2/python/core.py @@ -2118,7 +2118,7 @@ def RunAllOnGPU(self, gpu_id=0, use_cudnn=False): self._net.device_option.CopyFrom(device_option) if use_cudnn: for op in self._net.op: - op.engine = 'CUDNN' + op.engine = "CUDNN" def RunAllOnMKL(self): """A convenient function to run everything using MKLDNN.""" diff --git a/caffe2/python/data_parallel_model.py b/caffe2/python/data_parallel_model.py index 4b1aa22a282dc7..b6d425a699f220 100644 --- a/caffe2/python/data_parallel_model.py +++ b/caffe2/python/data_parallel_model.py @@ -1020,6 +1020,7 @@ def _Broadcast(devices, model, net, param, use_nccl=False): root=0, ) return + for dev_idx in devices[1:]: if _IsGPUBlob(model, param): if workspace.has_hip_support: @@ -1181,6 +1182,7 @@ def _SyncAllParamsDistributed( max_concurrent_distributed_ops ): assert rendezvous['num_shards'] > 1 + if workspace.has_hip_support: gpu_device_opt = core.DeviceOption(model._device_type, hip_gpu_id=devices[0]) else: @@ -1359,6 +1361,7 @@ def _AllReduceBlobsDistributed( num_workers = model.net.Proto().num_workers assert num_workers > 1, "Please specify more than 1 worker" all_reduce_engine = rendezvous['engine'] + if workspace.has_hip_support: master_device_opt = core.DeviceOption(model._device_type, hip_gpu_id=devices[0]) else: diff --git a/caffe2/python/operator_test/activation_ops_test.py b/caffe2/python/operator_test/activation_ops_test.py index aff55bcb7834de..200647c95998e4 100644 --- a/caffe2/python/operator_test/activation_ops_test.py +++ b/caffe2/python/operator_test/activation_ops_test.py @@ -16,7 +16,7 @@ import unittest -class TestActivations(hu.HypothesisTestCase): +class TestActivations(serial.SerializedTestCase): @serial.given(X=hu.tensor(), in_place=st.booleans(), engine=st.sampled_from(["", "MIOPEN" if workspace.has_hip_support else "CUDNN"]), **mu.gcs) def test_relu(self, X, in_place, engine, gc, dc): diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py index 799c3aa36fd2ca..b9838ed0ee1950 100644 --- a/caffe2/python/optimizer.py +++ b/caffe2/python/optimizer.py @@ -218,6 +218,7 @@ def 
create_lars_inputs(self, param_init_net, weight_decay, trust, lr_max): value=lr_max) return wd, trust, lr_max + class SgdOptimizer(Optimizer): def __init__(self, base_learning_rate=0.01, policy='fixed', momentum=0.0, nesterov=1, sparse_dedup_aggregator=None, @@ -543,6 +544,7 @@ def _run(self, net, param_init_net, param_info): self.make_unique_blob_name(str(param) + "_lars"), offset=self.lars, lr_min=0.0) + current_scope = scope.CurrentDeviceScope() self._add_local_lr_multiplier( lr_lars_multiplier, @@ -1453,7 +1455,8 @@ def build_ftrl(model, engine="SIMD", **kwargs): def build_gftrl(model, engine="", **kwargs): - # SIMD version of GFTRL is not supported + if engine == "SIMD": + assert core.IsOperator('GFtrl_ENGINE_SIMD') gftrl_optimizer = GFtrlOptimizer(engine=engine, **kwargs) return _build(model, gftrl_optimizer) From 8bed68c63a0d9b98b672590aaab1972324f82563 Mon Sep 17 00:00:00 2001 From: rohithkrn Date: Wed, 26 Sep 2018 14:12:39 -0700 Subject: [PATCH 24/28] will instead add a readme for this soon --- .../caffe2/amd/install_dependencies_amd.sh | 87 ------------------- 1 file changed, 87 deletions(-) delete mode 100755 .jenkins/caffe2/amd/install_dependencies_amd.sh diff --git a/.jenkins/caffe2/amd/install_dependencies_amd.sh b/.jenkins/caffe2/amd/install_dependencies_amd.sh deleted file mode 100755 index 51c2fe98ce4aa0..00000000000000 --- a/.jenkins/caffe2/amd/install_dependencies_amd.sh +++ /dev/null @@ -1,87 +0,0 @@ -#!/bin/bash - -set -ex - -apt-get update -apt-get install -y --no-install-recommends \ - autoconf \ - build-essential \ - ca-certificates \ - cmake \ - curl \ - git \ - libgoogle-glog-dev \ - libhiredis-dev \ - libiomp-dev \ - libleveldb-dev \ - liblmdb-dev \ - libopencv-dev \ - libpthread-stubs0-dev \ - libsnappy-dev \ - libprotobuf-dev \ - protobuf-compiler \ - python-dev \ - python-setuptools \ - python-pip - -pip install networkx==2.0 - -pip install --user \ - click \ - future \ - hypothesis \ - jupyter \ - numpy \ - protobuf \ - pytest \ - pyyaml \ - scipy==0.19.1 \ - scikit-image \ - tabulate \ - virtualenv \ - mock \ - typing \ - typing-extensions \ - pyyaml - -CLANG_VERSION=3.8 - -apt-get update -apt-get install -y --no-install-recommends clang-"$CLANG_VERSION" -rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* - -# Use update-alternatives to make this version the default -update-alternatives --install /usr/bin/gcc gcc /usr/bin/clang-"$CLANG_VERSION" 50 -update-alternatives --install /usr/bin/g++ g++ /usr/bin/clang++-"$CLANG_VERSION" 50 - - -apt-get install -y --no-install-recommends \ - rocm-dev \ - rocm-utils \ - rocm-libs \ - rocfft \ - miopen-hip \ - miopengemm \ - rocblas \ - rocrand - - -if [[ -f /opt/rocm/hip/cmake/FindHIP.cmake ]]; then - sudo sed -i 's/\ -I${dir}/\ $<$:-I${dir}>/' /opt/rocm/hip/cmake/FindHIP.cmake -fi - -if [[ -f /opt/rocm/hip/cmake/FindHIP.cmake ]]; then - sudo sed -i 's/set(_hip_build_configuration "${CMAKE_BUILD_TYPE}")/string(TOUPPER _hip_build_configuration "${CMAKE_BUILD_TYPE}")/' /opt/rocm/hip/cmake/FindHIP.cmake -fi - -git clone --recursive https://github.com/ROCmSoftwarePlatform/Thrust.git /data/Thrust -rm -rf /data/Thrust/thrust/system/cuda/detail/cub-hip -git clone --recursive https://github.com/ROCmSoftwarePlatform/cub-hip.git /data/Thrust/thrust/system/cuda/detail/cub-hip - - -export PATH=/opt/rocm/bin:$PATH -export PATH=/opt/rocm/hcc/bin:$PATH -export PATH=/opt/rocm/hip/bin:$PATH -export PATH=/opt/rocm/opencl/bin:$PATH -export THRUST_ROOT=/data/Thrust -export HIP_PLATFORM=hcc From 65f24b16792652ee505a20769aadab7a8c792800 Mon Sep 
17 00:00:00 2001
From: rohithkrn
Date: Wed, 26 Sep 2018 15:41:19 -0700
Subject: [PATCH 25/28] remove unused variables

---
 caffe2/python/convnet_benchmarks.py | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/caffe2/python/convnet_benchmarks.py b/caffe2/python/convnet_benchmarks.py
index fe9420902ad928..dfb729e9aeb90e 100644
--- a/caffe2/python/convnet_benchmarks.py
+++ b/caffe2/python/convnet_benchmarks.py
@@ -570,20 +570,12 @@ def Resnet50(args):
     gpus = [0]
     device_opt = core.DeviceOption(caffe2_pb2.HIP)
     device_opt.hip_gpu_id = gpus[0]
-
-    # Batch size of 32 sums up to roughly 5GB of memory per device
-    batch_per_device = args.batch_size
-    total_batch_size = batch_per_device * len(gpus)
-
     num_labels = 1000
-
-    base_learning_rate = 0.0004 * total_batch_size
+    base_learning_rate = 0.0004 * args.batch_size

     # Weight decay (L2 regularization)
     weight_decay = 1e-4
-    num_epochs = 10
-
     ##################
     # Define the Model
     ##################

From fc7fcc1323adeaa793c5e92227096cb8e69db45b Mon Sep 17 00:00:00 2001
From: rohithkrn
Date: Wed, 26 Sep 2018 20:26:58 -0700
Subject: [PATCH 26/28] whitespace changes

---
 caffe2/python/convnet_benchmarks.py | 1 -
 caffe2/python/optimizer.py          | 2 +-
 caffe2/python/workspace.py          | 2 ++
 3 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/caffe2/python/convnet_benchmarks.py b/caffe2/python/convnet_benchmarks.py
index dfb729e9aeb90e..89bbab6bf751a6 100644
--- a/caffe2/python/convnet_benchmarks.py
+++ b/caffe2/python/convnet_benchmarks.py
@@ -841,4 +841,3 @@ def GetArgumentParser():
         'Inception_v2':Inception_v2
     }
     Benchmark(model_map[args.model], args)
-

diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py
index b9838ed0ee1950..adc648c8218d19 100644
--- a/caffe2/python/optimizer.py
+++ b/caffe2/python/optimizer.py
@@ -1456,7 +1456,7 @@ def build_ftrl(model, engine="SIMD", **kwargs):
 def build_gftrl(model, engine="", **kwargs):
     if engine == "SIMD":
-       assert core.IsOperator('GFtrl_ENGINE_SIMD')
+        assert core.IsOperator('GFtrl_ENGINE_SIMD')
     gftrl_optimizer = GFtrlOptimizer(engine=engine, **kwargs)
     return _build(model, gftrl_optimizer)

diff --git a/caffe2/python/workspace.py b/caffe2/python/workspace.py
index a1e7bf40ae7971..c474fe334b6f1e 100644
--- a/caffe2/python/workspace.py
+++ b/caffe2/python/workspace.py
@@ -48,6 +48,7 @@
 GetCUDAVersion = C.get_cuda_version
 GetCuDNNVersion = C.get_cudnn_version
 NumGpuDevices = NumCudaDevices
+
 def GetCudaPeerAccessPattern():
     return np.asarray(C.get_cuda_peer_access_pattern())
@@ -61,6 +62,7 @@ def GetCudaPeerAccessPattern():
 if has_hip_support:
     NumHipDevices = C.num_hip_devices
     NumGpuDevices = NumHipDevices
+
     def GetHipPeerAccessPattern():
         return np.asarray(C.get_hip_peer_access_pattern())

From d03ae3f5799f3cd238fef7091d28f8be90b89d08 Mon Sep 17 00:00:00 2001
From: rohithkrn
Date: Thu, 27 Sep 2018 10:51:20 -0700
Subject: [PATCH 27/28] update-docs

---
 rocm-docs/caffe2-build.md | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

diff --git a/rocm-docs/caffe2-build.md b/rocm-docs/caffe2-build.md
index 2bec136b774581..6b1de555235f1c 100644
--- a/rocm-docs/caffe2-build.md
+++ b/rocm-docs/caffe2-build.md
@@ -27,20 +27,19 @@ To run benchmarks, skip directly to benchmarks section of the document.
 ## Build Caffe2 from source
 ### Pull the docker image
 ```
-docker pull rocm/caffe2:unbuilt-rocm1.9-v1
+docker pull rocm/caffe2:unbuilt-rocm1.9-v2
 ```
 This docker image has all the dependencies for caffe2 pre-installed.
 ### Pull the latest caffe2 source:
-Checkout the `caffe2_specific` branch of this repository.
 * Using https
 ```
-git clone --recurse-submodules -b caffe2_specific https://github.com/ROCmSoftwarePlatform/pytorch.git
+git clone --recurse-submodules https://github.com/ROCmSoftwarePlatform/pytorch.git
 ```
 * Using ssh
 ```
-git clone --recurse-submodules -b caffe2_specific git@github.com:ROCmSoftwarePlatform/pytorch.git
+git clone --recurse-submodules git@github.com:ROCmSoftwarePlatform/pytorch.git
 ```
 Navigate to repo directory
 ```
@@ -49,7 +48,7 @@ cd pytorch
 ```
 ### Launch the docker container
 ```
-docker run -it --network=host --device=/dev/kfd --device=/dev/dri --group-add video -v $PWD:/pytorch rocm/caffe2:unbuilt-rocm1.8.2-v3
+docker run -it --network=host --device=/dev/kfd --device=/dev/dri --group-add video -v $PWD:/pytorch rocm/caffe2:unbuilt-rocm1.9-v2
 ```
 Navigate to pytorch directory `cd /pytorch` inside the container.
@@ -62,11 +61,10 @@ Navigate to pytorch directory `cd /pytorch` inside the container.
 * Test the rocm-caffe2 Installation
-Before running the tests, make sure that the required environment variables are set:
 ```
 cd build_caffe2 && python -c 'from caffe2.python import core' 2>/dev/null && echo "Success" || echo "Failure"
 ```
-Make sure the the following environment variables are set.
+If the test fails, make sure the following environment variables are set.
 ```
 LD_LIBRARY_PATH=/usr/local/caffe2/lib
@@ -86,23 +84,21 @@ Caffe2 benchmarking script supports the following networks.
 1. MLP
 2. AlexNet
 3. OverFeat
 4. VGGA
 5. Inception
 6. Inception_v2
 7. Resnet50
-*Special case:* Inception_v2 and Resnet50 will need their corresponding protobuf files to run the benchmarks. Protobufs can be downloaded from caffe2 model zoo using the below command. Substitute model_name with `inception_v2` or `resnet50`
-
+*Special case:* Inception_v2 will need model protobuf files to run the benchmarks. Protobufs can be downloaded from the caffe2 model zoo using the below command.
 ```
-python caffe2/python/models/download.py <model_name>
+python caffe2/python/models/download.py inception_v2
 ```
 This will download the protobufs to current working directory.
-To run benchmarks for networks MLP, AlexNet, OverFeat, VGGA, Inception, run the command replacing `<network>` with one of the networks.
+To run benchmarks for networks MLP, AlexNet, OverFeat, VGGA, Inception, Resnet50, run the command replacing `<network>` with one of the networks.
 ```
 python caffe2/python/convnet_benchmarks.py --batch_size 64 --model <network> --engine MIOPEN --layer_wise_benchmark True --net_type simple
 ```
-To run Inception_v2 or Resnet50, please add additional argument `--model_path` to the above command which should point to the model directories downloaded above.
+To run Inception_v2, please add the additional argument `--model_path` to the above command, which should point to the model directory downloaded above.
 ```
 python caffe2/python/convnet_benchmarks.py --batch_size 64 --model <network> --engine MIOPEN --layer_wise_benchmark True --net_type simple --model_path <path_to_model_protobufs>
 ```
-

From 56fa470426c62b161c9804c8c70dc4f0d4694cbe Mon Sep 17 00:00:00 2001
From: rohithkrn
Date: Thu, 27 Sep 2018 12:45:52 -0700
Subject: [PATCH 28/28] change benchmarking command

---
 rocm-docs/caffe2-build.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/rocm-docs/caffe2-build.md b/rocm-docs/caffe2-build.md
index 6b1de555235f1c..7ba7e5952dfcd3 100644
--- a/rocm-docs/caffe2-build.md
+++ b/rocm-docs/caffe2-build.md
@@ -93,12 +93,12 @@ This will download the protobufs to current working directory.
 To run benchmarks for networks MLP, AlexNet, OverFeat, VGGA, Inception, Resnet50, run the command replacing `<network>` with one of the networks.
 ```
-python caffe2/python/convnet_benchmarks.py --batch_size 64 --model <network> --engine MIOPEN --layer_wise_benchmark True --net_type simple
+python caffe2/python/convnet_benchmarks.py --batch_size 32 --model <network> --engine MIOPEN
 ```
 To run Inception_v2, please add the additional argument `--model_path` to the above command, which should point to the model directory downloaded above.
 ```
-python caffe2/python/convnet_benchmarks.py --batch_size 64 --model <network> --engine MIOPEN --layer_wise_benchmark True --net_type simple --model_path <path_to_model_protobufs>
+python caffe2/python/convnet_benchmarks.py --batch_size 32 --model <network> --engine MIOPEN --model_path <path_to_model_protobufs>
 ```
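
For orientation, here is how the `Resnet50` benchmark path added in [PATCH 19/28] above binds the model to an AMD GPU. This is a minimal illustrative sketch distilled from the patched `caffe2/python/convnet_benchmarks.py`, not part of the patch series itself; it assumes a ROCm-enabled Caffe2 build in which `workspace.has_hip_support` is true.

```
# Minimal sketch (assumes a ROCm-enabled Caffe2 build); mirrors the
# Resnet50(args) path in caffe2/python/convnet_benchmarks.py.
from caffe2.proto import caffe2_pb2
from caffe2.python import core, model_helper
from caffe2.python.models import resnet

# A HIP device option pins all ops created in its scope to an AMD GPU.
device_opt = core.DeviceOption(caffe2_pb2.HIP)
device_opt.hip_gpu_id = 0  # first GPU

train_model = model_helper.ModelHelper(name="resnet50_train")
with core.DeviceScope(device_opt):
    # create_resnet50 builds the residual network and returns [softmax, loss].
    softmax, loss = resnet.create_resnet50(
        train_model, "data", num_input_channels=3,
        num_labels=1000, label="label")
```

The `--engine MIOPEN` flag used in the benchmark commands above selects the MIOpen-backed operators for this graph (see the conv_op_miopen.cc changes in [PATCH 22/28]).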