diff --git a/.jenkins/caffe2/amd/binary_tests.sh b/.jenkins/caffe2/amd/binary_tests.sh
new file mode 100755
index 00000000000000..1476ce7a0663e2
--- /dev/null
+++ b/.jenkins/caffe2/amd/binary_tests.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+set -ex
+
+LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+ROOT_DIR=$(cd "$LOCAL_DIR"/../../../ && pwd)
+
+cd "$ROOT_DIR"
+
+echo "Running C++ tests.."
+
+for file in $(find "${ROOT_DIR}/build_caffe2/bin" -executable -type f); do
+	if [[ "$file" =~ "test" ]]; then
+		case "$file" in
+		    # skip tests we know are hanging or bad
+		    */mkl_utils_test|*/aten/integer_divider_test)
+		      continue
+		      ;;
+		    */scalar_tensor_test|*/basic|*/native_test)
+				continue
+			  ;;
+			*)
+			  "$file"
+		esac
+	fi
+done
diff --git a/.jenkins/caffe2/amd/build_amd.sh b/.jenkins/caffe2/amd/build_amd.sh
new file mode 100755
index 00000000000000..be9248e1385155
--- /dev/null
+++ b/.jenkins/caffe2/amd/build_amd.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+
+set -ex
+
+# The INSTALL_PREFIX here must match up with test.sh
+INSTALL_PREFIX="/usr/local/caffe2"
+LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+ROOT_DIR=$(cd "$LOCAL_DIR"/../../.. && pwd)
+CMAKE_ARGS=()
+
+##############################################################################
+# Explicitly set Python executable.
+###############################################################################
+# On Ubuntu 16.04 the default Python is still 2.7.
+PYTHON="$(which python)"
+
+###############################################################################
+# Set cmake args
+###############################################################################
+CMAKE_ARGS+=("-DBUILD_BINARY=ON")
+CMAKE_ARGS+=("-DBUILD_TEST=ON")
+CMAKE_ARGS+=("-DINSTALL_TEST=ON")
+CMAKE_ARGS+=("-DUSE_OBSERVERS=ON")
+CMAKE_ARGS+=("-DUSE_ZSTD=ON")
+CMAKE_ARGS+=("-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX}")
+
+# TODO: This is patching the official FindHip to properly handly
+# cmake generator expression. A PR is opened in the upstream repo here:
+# https://github.com/ROCm-Developer-Tools/HIP/pull/516
+# remove this hack once it's merged.
+if [[ -f /opt/rocm/hip/cmake/FindHIP.cmake ]]; then
+  sudo sed -i 's/\ -I${dir}/\ $<$<BOOL:${dir}>:-I${dir}>/' /opt/rocm/hip/cmake/FindHIP.cmake
+fi
+
+export LANG=C.UTF-8
+export LC_ALL=C.UTF-8
+export HCC_AMDGPU_TARGET=gfx900
+export KMTHINLTO=1
+
+########## HIPIFY Caffe2 operators
+${PYTHON} "${ROOT_DIR}/tools/amd_build/build_caffe2_amd.py"
+${PYTHON} "${ROOT_DIR}/tools/amd_build/build_pytorch_amd.py"
+
+MAX_JOBS=$(nproc)
+
+###############################################################################
+# Configure and make
+###############################################################################
+# Run cmake from ./build_caffe2 directory so it doesn't conflict with
+# standard PyTorch build directory. Eventually these won't need to
+# be separate.
+rm -rf build_caffe2
+mkdir build_caffe2
+cd ./build_caffe2
+
+# Configure
+cmake "${ROOT_DIR}" ${CMAKE_ARGS[*]} "$@"
+
+# Build
+if [ "$(uname)" == "Linux" ]; then
+  make "-j${MAX_JOBS}" install
+else
+  echo "Don't know how to build on $(uname)"
+  exit 1
+fi
+
+###############################################################################
+# Install ONNX
+###############################################################################
+
+# Install ONNX into a local directory
+pip install --user -b /tmp/pip_install_onnx "file://${ROOT_DIR}/third_party/onnx#egg=onnx"
+
diff --git a/.jenkins/caffe2/amd/python_tests.sh b/.jenkins/caffe2/amd/python_tests.sh
new file mode 100755
index 00000000000000..46daa7c3e98a6f
--- /dev/null
+++ b/.jenkins/caffe2/amd/python_tests.sh
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+set -ex
+
+LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+ROOT_DIR=$(cd "$LOCAL_DIR"/../../../ && pwd)
+
+cd "$ROOT_DIR"
+
+# Get the relative path to where the caffe2 python module was installed
+CAFFE2_PYPATH="$ROOT_DIR/build_caffe2/caffe2"
+
+rocm_ignore_test=()
+# need to debug
+rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/arg_ops_test.py")
+rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/piecewise_linear_transform_test.py")
+rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/softmax_ops_test.py")
+rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/unique_ops_test.py")
+rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/model_device_test.py")
+rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/data_parallel_model_test.py")
+
+# Need to go through roi ops to replace max(...) with fmaxf(...)
+rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/roi_align_rotated_op_test.py")
+
+# cuda top_k op has some asm code, the hipified version doesn't
+# compile yet, so we don't have top_k operator for now
+rocm_ignore_test+=("--ignore $CAFFE2_PYPATH/python/operator_test/top_k_test.py")
+
+
+# Python tests
+echo "Running Python tests.."
+python \
+  -m pytest \
+  -v \
+  --ignore "$CAFFE2_PYPATH/python/test/executor_test.py" \
+  --ignore "$CAFFE2_PYPATH/python/operator_test/matmul_op_test.py" \
+  --ignore "$CAFFE2_PYPATH/python/operator_test/pack_ops_test.py" \
+  --ignore "$CAFFE2_PYPATH/python/mkl/mkl_sbn_speed_test.py" \
+  ${rocm_ignore_test[@]} \
+  "$CAFFE2_PYPATH/python" 
\ No newline at end of file
diff --git a/caffe2/operators/hip/conv_op_miopen.cc b/caffe2/operators/hip/conv_op_miopen.cc
index 5b9df03dc78953..13f2428bf9daa2 100644
--- a/caffe2/operators/hip/conv_op_miopen.cc
+++ b/caffe2/operators/hip/conv_op_miopen.cc
@@ -48,7 +48,11 @@ class MIOPENConvOpBase : public ConvPoolOpBase<HIPContext> {
 
     if ((operator_def.type().substr(0, 6) == "Conv") ||
         (operator_def.type().substr(0, 14) == "ConvGradient")) {
-      mode_ = miopenConvolution;
+      if(group_ > 1) {
+        mode_ = miopenGroupConv;
+      } else{
+        mode_ = miopenConvolution;
+      }
     } else if (
         (operator_def.type().substr(0, 7) == "Trans") ||
         (operator_def.type().substr(0, 15) == "TransGradient")) {
@@ -57,6 +61,12 @@ class MIOPENConvOpBase : public ConvPoolOpBase<HIPContext> {
       LOG(FATAL) << "Unsupported convolution method: " << operator_def.type();
     }
 
+    if(mode_ == miopenGroupConv) {
+      OPERATOR_NEEDS_FEATURE(
+        dilation_h() == 1 && dilation_w() == 1,
+        "MIOpen convolution does not support dilation for groups > 1.");
+    }
+
     MIOPEN_ENFORCE(miopenInitConvolutionDescriptor(
         conv_desc_,
         mode_,
@@ -66,6 +76,9 @@ class MIOPENConvOpBase : public ConvPoolOpBase<HIPContext> {
         stride_w(),
         dilation_h(),
         dilation_w()));
+
+    MIOPEN_ENFORCE(miopenSetConvolutionGroupCount(
+        conv_desc_, group_));
   }
 
   ~MIOPENConvOpBase() {
@@ -155,9 +168,6 @@ class MIOPENConvGradientOp final : public MIOPENConvOpBase {
         bwdDataWsSize_(0),
         bwdWeiAlgo_(miopenConvolutionBwdWeightsAlgoGEMM),
         bwdDataAlgo_(miopenConvolutionBwdDataAlgoGEMM) {
-    OPERATOR_NEEDS_FEATURE(
-        group_ == 1,
-        "Group convolution not supported yet for MIOpen ConvGradient.");
     CAFFE_ENFORCE(
         !(no_bias_ && OutputSize() == 3),
         "If bias is not present, you should not have 3 grad output.");
@@ -247,186 +257,70 @@ bool MIOPENConvOp::DoRunWithType() {
       "If you set group, the number of output channels should be divisible "
       "by group.");
 
-  if (group_ > 1) {
-    int group_offset_filter = Weight.size() / group_;
-
-    MIOPEN_ENFORCE(miopenSet4dTensorDescriptor(
-        weight_desc_,
-        miopenTypeWrapper<T_W>::type,
-        M / group_,
-        C / group_,
-        kernel_h(),
-        kernel_w()));
-
-    MIOPEN_ENFORCE(miopenSet4dTensorDescriptor(
-        bottom_desc_, miopenTypeWrapper<T_X>::type, N, C / group_, H, W));
-
-    MIOPEN_ENFORCE(miopenGetConvolutionForwardOutputDim(
-        conv_desc_,
-        bottom_desc_,
-        weight_desc_,
-        &N_out,
-        &C_out,
-        &H_out,
-        &W_out));
-
-    MIOPEN_ENFORCE(miopenSet4dTensorDescriptor(
-        top_desc_, miopenTypeWrapper<T_X>::type, N_out, C_out, H_out, W_out));
-
-    if (InputSize() == 3) {
-      MIOPEN_ENFORCE(miopenSet4dTensorDescriptor(
-          bias_desc_, miopenTypeWrapper<T_B>::type, 1, Y->dim32(1), 1, 1));
+  MIOPEN_ENFORCE(miopenSet4dTensorDescriptor(
+      bottom_desc_, miopenTypeWrapper<T_X>::type, N, C, H, W));
+
+  MIOPEN_ENFORCE(miopenSet4dTensorDescriptor(
+      weight_desc_,
+      miopenTypeWrapper<T_W>::type,
+      M,
+      C / group_,
+      kernel_h(),
+      kernel_w()));
+
+  MIOPEN_ENFORCE(miopenGetConvolutionForwardOutputDim(
+      conv_desc_,
+      bottom_desc_,
+      weight_desc_,
+      &N_out,
+      &C_out,
+      &H_out,
+      &W_out));
+
+  MIOPEN_ENFORCE(miopenSet4dTensorDescriptor(
+      top_desc_, miopenTypeWrapper<T_X>::type, N_out, C_out, H_out, W_out));
+
+  if (InputSize() == 3) {
       MIOPEN_ENFORCE(miopenSet4dTensorDescriptor(
-          top_desc_for_bias_,
-          miopenTypeWrapper<T_X>::type,
-          Y->dim32(0),
-          Y->dim32(1),
-          H_out,
-          W_out));
-    }
-
-    int group_offset_X = C / group_ * H * W * D;
-    int group_offset_Y = M / group_ * H_out * W_out * D_out;
+          bias_desc_, miopenTypeWrapper<T_B>::type, 1, M, 1, 1));
+  }
 
-    while (!bestAlgoFound_) {
+  while (!bestAlgoFound_) {
       miopenConvAlgoPerf_t perf;
 
       MIOPEN_ENFORCE(miopenConvolutionForwardGetWorkSpaceSize(
-        miopen_wrapper_.inline_miopen_handle(),
-        weight_desc_,
-        bottom_desc_,
-        conv_desc_,
-        top_desc_,
-        &fwdConvWsSize_));
-      if ((fwdConvWsSize_ > 0) && (fwdConvWs_ == nullptr)) {
-        HIP_CHECK(hipMalloc(&fwdConvWs_, fwdConvWsSize_));
-      }
-
-      miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) {
-        MIOPEN_ENFORCE(miopenFindConvolutionForwardAlgorithm(
-          state->miopen_handle(),
-          bottom_desc_,
-          X.template data<T_X>(),
+          miopen_wrapper_.inline_miopen_handle(),
           weight_desc_,
-          Weight.template data<T_W>(),
-          conv_desc_,
-          top_desc_,
-          Y->template mutable_data<T_Y>(),
-          requestAlgoCount_,
-          &returnedAlgoCount_,
-          &perf,
-          fwdConvWs_,
-          fwdConvWsSize_,
-          false));
-      });
-      bestAlgoFound_ = true;
-      fwdAlgo_ = perf.fwd_algo;
-    }
-
-    for (int g = 0; g < group_; g++) {
-      miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) {
-        MIOPEN_ENFORCE(miopenConvolutionForward(
-          state->miopen_handle(),
-          &alpha_,
           bottom_desc_,
-          X.template data<T_X>() + g * group_offset_X,
-          weight_desc_,
-          Weight.template data<T_W>() + g * group_offset_filter,
           conv_desc_,
-          fwdAlgo_,
-          &beta_,
           top_desc_,
-          Y->template mutable_data<T_Y>() + g * group_offset_Y,
-          fwdConvWs_,
-          fwdConvWsSize_));
-      });
-    }
-    hipDeviceSynchronize();
-
-    // BIAS
-    if (InputSize() == 3) {
-      auto& bias = Input(BIAS);
-
-      CAFFE_ENFORCE_EQ(bias.ndim(), 1);
-      CAFFE_ENFORCE_EQ(bias.dim32(0), M);
-      miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) {
-        MIOPEN_ENFORCE(miopenConvolutionForwardBias(
-          state->miopen_handle(),
-          &alpha_,
-          bias_desc_,
-          bias.template data<T_B>(),
-          &beta_,
-          top_desc_for_bias_,
-          Y->template mutable_data<T_Y>()));
-      });
-    }
-
-    hipDeviceSynchronize();
-  } else {
-    MIOPEN_ENFORCE(miopenSet4dTensorDescriptor(
-        weight_desc_,
-        miopenTypeWrapper<T_W>::type,
-        M,
-        C,
-        kernel_h(),
-        kernel_w()));
-
-    MIOPEN_ENFORCE(miopenSet4dTensorDescriptor(
-        bottom_desc_, miopenTypeWrapper<T_X>::type, N, C, H, W));
-
-    MIOPEN_ENFORCE(miopenGetConvolutionForwardOutputDim(
-        conv_desc_,
-        bottom_desc_,
-        weight_desc_,
-        &N_out,
-        &C_out,
-        &H_out,
-        &W_out));
-
-    MIOPEN_ENFORCE(miopenSet4dTensorDescriptor(
-        top_desc_, miopenTypeWrapper<T_X>::type, N_out, C_out, H_out, W_out));
-
-    if (InputSize() == 3) {
-      MIOPEN_ENFORCE(miopenSet4dTensorDescriptor(
-          bias_desc_, miopenTypeWrapper<T_B>::type, 1, C_out, 1, 1));
-    }
-
-    while (!bestAlgoFound_) {
-      miopenConvAlgoPerf_t perf;
-
-      MIOPEN_ENFORCE(miopenConvolutionForwardGetWorkSpaceSize(
-        miopen_wrapper_.inline_miopen_handle(),
-        weight_desc_,
-        bottom_desc_,
-        conv_desc_,
-        top_desc_,
-        &fwdConvWsSize_));
-
+          &fwdConvWsSize_));
       if ((fwdConvWsSize_ > 0) && (fwdConvWs_ == nullptr)) {
-        HIP_CHECK(hipMalloc(&fwdConvWs_, fwdConvWsSize_));
+          HIP_CHECK(hipMalloc(&fwdConvWs_, fwdConvWsSize_));
       }
 
       miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) {
-        MIOPEN_ENFORCE(miopenFindConvolutionForwardAlgorithm(
-          state->miopen_handle(),
-          bottom_desc_,
-          X.template data<T_X>(),
-          weight_desc_,
-          Weight.template data<T_W>(),
-          conv_desc_,
-          top_desc_,
-          Y->template mutable_data<T_Y>(),
-          requestAlgoCount_,
-          &returnedAlgoCount_,
-          &perf,
-          fwdConvWs_,
-          fwdConvWsSize_,
-          false));
+          MIOPEN_ENFORCE(miopenFindConvolutionForwardAlgorithm(
+            state->miopen_handle(),
+            bottom_desc_,
+            X.template data<T_X>(),
+            weight_desc_,
+            Weight.template data<T_W>(),
+            conv_desc_,
+            top_desc_,
+            Y->template mutable_data<T_Y>(),
+            requestAlgoCount_,
+            &returnedAlgoCount_,
+            &perf,
+            fwdConvWs_,
+            fwdConvWsSize_,
+            false));
       });
       bestAlgoFound_ = true;
       fwdAlgo_ = perf.fwd_algo;
-    }
-    miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) {
+  }
+
+  miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) {
       MIOPEN_ENFORCE(miopenConvolutionForward(
         state->miopen_handle(),
         &alpha_,
@@ -441,10 +335,9 @@ bool MIOPENConvOp::DoRunWithType() {
         Y->template mutable_data<T_Y>(),
         fwdConvWs_,
         fwdConvWsSize_));
-    });
+  });
 
-    // BIAS
-    if (InputSize() == 3) {
+  if (InputSize() == 3) {
       auto& bias = Input(BIAS);
 
       CAFFE_ENFORCE_EQ(bias.ndim(), 1);
@@ -459,13 +352,10 @@ bool MIOPENConvOp::DoRunWithType() {
           top_desc_,
           Y->template mutable_data<T_Y>()));
       });
-    }
-
-    hipDeviceSynchronize();
   }
-
   return true;
 }
+
 // TODO : enable fp16 support.
 bool MIOPENConvOp::RunOnDevice() {
   if (Input(0).IsType<float>()) {
@@ -535,215 +425,46 @@ bool MIOPENConvGradientOp::DoRunWithType() {
 
   bool doBwdDataComputation = (OutputSize() == 3 || (no_bias_ && (OutputSize() == 2)));
 
-  if (group_ > 1) {
-    int group_offset_filter = Weight.size() / group_;
-    MIOPEN_ENFORCE(miopenSet4dTensorDescriptor(
-        weight_desc_,
-        miopenTypeWrapper<T_X>::type,
-        M / group_,
-        C / group_,
-        kernel_h(),
-        kernel_w()));
-
-    MIOPEN_ENFORCE(miopenSet4dTensorDescriptor(
-        bottom_desc_, miopenTypeWrapper<T_X>::type, N, C / group_, H, W));
-
-    MIOPEN_ENFORCE(miopenGetConvolutionForwardOutputDim(
-        conv_desc_,
-        bottom_desc_,
-        weight_desc_,
-        &N_out,
-        &C_out,
-        &H_out,
-        &W_out));
-
-    MIOPEN_ENFORCE(miopenSet4dTensorDescriptor(
-        top_desc_, miopenTypeWrapper<T_X>::type, N_out, C_out, H_out, W_out));
-
-    if (!no_bias_) {
+  MIOPEN_ENFORCE(miopenSet4dTensorDescriptor(
+      bottom_desc_, miopenTypeWrapper<T_X>::type, N, C, H, W));
+
+  MIOPEN_ENFORCE(miopenSet4dTensorDescriptor(
+      weight_desc_,
+      miopenTypeWrapper<T_X>::type,
+      M,
+      C / group_,
+      kernel_h(),
+      kernel_w()));
+
+  MIOPEN_ENFORCE(miopenGetConvolutionForwardOutputDim(
+      conv_desc_,
+      bottom_desc_,
+      weight_desc_,
+      &N_out,
+      &C_out,
+      &H_out,
+      &W_out));
+
+  MIOPEN_ENFORCE(miopenSet4dTensorDescriptor(
+      top_desc_, miopenTypeWrapper<T_X>::type, N_out, C_out, H_out, W_out));
+
+  if (!no_bias_) {
       MIOPEN_ENFORCE(miopenSet4dTensorDescriptor(
           bias_desc_, miopenTypeWrapper<T_B>::type, 1, M, 1, 1));
-      MIOPEN_ENFORCE(miopenSet4dTensorDescriptor(
-          top_desc_for_bias_,
-          miopenTypeWrapper<T_X>::type,
-          dY.dim32(0),
-          M,
-          H_out,
-          W_out));
-    }
-
-    int group_offset_X = C / group_ * H * W * D;
-    int group_offset_Y = M / group_ * H_out * W_out * D_out;
+  }
 
-    while ((!bestDataAlgoFound_) && doBwdDataComputation) {
+  while ((!bestDataAlgoFound_) && doBwdDataComputation) {
       miopenConvAlgoPerf_t perf;
 
       MIOPEN_ENFORCE(miopenConvolutionBackwardDataGetWorkSpaceSize(
-        miopen_wrapper_.inline_miopen_handle(),
-        top_desc_,
-        weight_desc_,
-        conv_desc_,
-        bottom_desc_,
-        &bwdDataWsSize_));
-      if ((bwdDataWsSize_ > 0) && (bwdDataWs_ == nullptr)) {
-        HIP_CHECK(hipMalloc(&bwdDataWs_, bwdDataWsSize_));
-      }
-
-      miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) {
-        MIOPEN_ENFORCE(miopenFindConvolutionBackwardDataAlgorithm(
-          state->miopen_handle(),
+          miopen_wrapper_.inline_miopen_handle(),
           top_desc_,
-          dY.template data<T_DY>(),
           weight_desc_,
-          Weight.template data<T_W>(),
-          conv_desc_,
-          bottom_desc_,
-          dX->template mutable_data<T_DX>(),
-          requestAlgoCount_,
-          &returnedAlgoCount_,
-          &perf,
-          bwdDataWs_,
-          bwdDataWsSize_,
-          false));
-      });
-
-      bestDataAlgoFound_ = true;
-      bwdDataAlgo_ = perf.bwd_data_algo;
-    }
-
-    while (!bestWeightAlgoFound_) {
-      miopenConvAlgoPerf_t perf;
-
-      MIOPEN_ENFORCE(miopenConvolutionBackwardWeightsGetWorkSpaceSize(
-        miopen_wrapper_.inline_miopen_handle(),
-        top_desc_,
-        bottom_desc_,
-        conv_desc_,
-        weight_desc_,
-        &bwdWeightWsSize_));
-      if ((bwdWeightWsSize_ > 0) && (bwdWeightWs_ == nullptr)) {
-        HIP_CHECK(hipMalloc(&bwdWeightWs_, bwdWeightWsSize_));
-      }
-
-      miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) {
-        MIOPEN_ENFORCE(miopenFindConvolutionBackwardWeightsAlgorithm(
-          state->miopen_handle(),
-          top_desc_,
-          dY.template data<T_DY>(),
-          bottom_desc_,
-          X.template data<T_X>(),
           conv_desc_,
-          weight_desc_,
-          dW->template mutable_data<T_DW>(),
-          requestAlgoCount_,
-          &returnedAlgoCount_,
-          &perf,
-          bwdWeightWs_,
-          bwdWeightWsSize_,
-          false));
-      });
-      bestWeightAlgoFound_ = true;
-      bwdWeiAlgo_ = perf.bwd_weights_algo;
-    }
-
-    for (int g = 0; g < group_; g++) {
-      if (doBwdDataComputation) {
-        miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) {
-          MIOPEN_ENFORCE(miopenConvolutionBackwardData(
-            state->miopen_handle(),
-            &alpha_,
-            top_desc_,
-            dY.template data<T_DY>() + g * group_offset_Y,
-            weight_desc_,
-            Weight.template data<T_W>() + g * group_offset_filter,
-            conv_desc_,
-            bwdDataAlgo_,
-            &beta_,
-            bottom_desc_,
-            dX->template mutable_data<T_DX>() + g * group_offset_X,
-            bwdDataWs_,
-            bwdDataWsSize_));
-        });
-      }
-
-      miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) {
-        MIOPEN_ENFORCE(miopenConvolutionBackwardWeights(
-          state->miopen_handle(),
-          &alpha_,
-          top_desc_,
-          dY.template data<T_DY>() + g * group_offset_Y,
           bottom_desc_,
-          X.template data<T_X>() + g * group_offset_X,
-          conv_desc_,
-          bwdWeiAlgo_,
-          &beta_,
-          weight_desc_,
-          dW->template mutable_data<T_DW>() + g * group_offset_filter,
-          bwdWeightWs_,
-          bwdWeightWsSize_));
-      });
-    }
-
-    // Synchronize the work across groups.
-    hipDeviceSynchronize();
-
-    ////////////////////////////////////// BIAS ///////////////////////////
-    if (!no_bias_) {
-      auto* dbias = Output(BIAS_OR_INPUT_GRAD);
-      dbias->Resize(M);
-      miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) {
-        MIOPEN_ENFORCE(miopenConvolutionBackwardBias(
-          state->miopen_handle(),
-          &alpha_,
-          top_desc_for_bias_,
-          dY.template data<T_DY>(),
-          &beta_,
-          bias_desc_,
-          dbias->template mutable_data<T_DB>()));
-      });
-    }
-  } else // No group
-  {
-    MIOPEN_ENFORCE(miopenSet4dTensorDescriptor(
-        weight_desc_,
-        miopenTypeWrapper<T_X>::type,
-        M,
-        C,
-        kernel_h(),
-        kernel_w()));
-
-    MIOPEN_ENFORCE(miopenSet4dTensorDescriptor(
-        bottom_desc_, miopenTypeWrapper<T_X>::type, N, C, H, W));
-
-    MIOPEN_ENFORCE(miopenGetConvolutionForwardOutputDim(
-        conv_desc_,
-        bottom_desc_,
-        weight_desc_,
-        &N_out,
-        &C_out,
-        &H_out,
-        &W_out));
-
-    MIOPEN_ENFORCE(miopenSet4dTensorDescriptor(
-        top_desc_, miopenTypeWrapper<T_X>::type, N_out, C_out, H_out, W_out));
-
-    if (!no_bias_) {
-      MIOPEN_ENFORCE(miopenSet4dTensorDescriptor(
-          bias_desc_, miopenTypeWrapper<T_B>::type, 1, M, 1, 1));
-    }
-
-    while ((!bestDataAlgoFound_) && doBwdDataComputation) {
-      miopenConvAlgoPerf_t perf;
-
-      MIOPEN_ENFORCE(miopenConvolutionBackwardDataGetWorkSpaceSize(
-        miopen_wrapper_.inline_miopen_handle(),
-        top_desc_,
-        weight_desc_,
-        conv_desc_,
-        bottom_desc_,
-        &bwdDataWsSize_));
+          &bwdDataWsSize_));
       if ((bwdDataWsSize_ > 0) && (bwdDataWs_ == nullptr)) {
-        HIP_CHECK(hipMalloc(&bwdDataWs_, bwdDataWsSize_));
+          HIP_CHECK(hipMalloc(&bwdDataWs_, bwdDataWsSize_));
       }
 
       miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) {
@@ -766,22 +487,22 @@ bool MIOPENConvGradientOp::DoRunWithType() {
 
       bestDataAlgoFound_ = true;
       bwdDataAlgo_ = perf.bwd_data_algo;
-    }
+  }
 
-    while (!bestWeightAlgoFound_) {
+  while (!bestWeightAlgoFound_) {
       miopenConvAlgoPerf_t perf;
 
       MIOPEN_ENFORCE(miopenConvolutionBackwardWeightsGetWorkSpaceSize(
-        miopen_wrapper_.inline_miopen_handle(),
-        top_desc_,
-        bottom_desc_,
-        conv_desc_,
-        weight_desc_,
-        &bwdWeightWsSize_));
+          miopen_wrapper_.inline_miopen_handle(),
+          top_desc_,
+          bottom_desc_,
+          conv_desc_,
+          weight_desc_,
+          &bwdWeightWsSize_));
       if ((bwdWeightWsSize_ > 0) && (bwdWeightWs_ == nullptr)) {
         HIP_CHECK(hipMalloc(&bwdWeightWs_, bwdWeightWsSize_));
       }
-      
+
       miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) {
         MIOPEN_ENFORCE(miopenFindConvolutionBackwardWeightsAlgorithm(
           state->miopen_handle(),
@@ -801,9 +522,9 @@ bool MIOPENConvGradientOp::DoRunWithType() {
       });
       bestWeightAlgoFound_ = true;
       bwdWeiAlgo_ = perf.bwd_weights_algo;
-    }
+  }
 
-    if (doBwdDataComputation) {
+  if (doBwdDataComputation) {
       miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) {
         MIOPEN_ENFORCE(miopenConvolutionBackwardData(
           state->miopen_handle(),
@@ -820,30 +541,30 @@ bool MIOPENConvGradientOp::DoRunWithType() {
           bwdDataWs_,
           bwdDataWsSize_));
         });
-    }
-
-    miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) {
-      MIOPEN_ENFORCE(miopenConvolutionBackwardWeights(
-        state->miopen_handle(),
-        &alpha_,
-        top_desc_,
-        dY.template data<T_DY>(),
-        bottom_desc_,
-        X.template data<T_X>(),
-        conv_desc_,
-        bwdWeiAlgo_,
-        &beta_,
-        weight_desc_,
-        dW->template mutable_data<T_DW>(),
-        bwdWeightWs_,
-        bwdWeightWsSize_));
-    });
-
-    // Synchronize the work across groups.
-    hipDeviceSynchronize();
+  }
 
-    ////////////////////////////////////// BIAS ///////////////////////////
-    if (!no_bias_) {
+  miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) {
+    MIOPEN_ENFORCE(miopenConvolutionBackwardWeights(
+      state->miopen_handle(),
+      &alpha_,
+      top_desc_,
+      dY.template data<T_DY>(),
+      bottom_desc_,
+      X.template data<T_X>(),
+      conv_desc_,
+      bwdWeiAlgo_,
+      &beta_,
+      weight_desc_,
+      dW->template mutable_data<T_DW>(),
+      bwdWeightWs_,
+      bwdWeightWsSize_));
+  });
+
+  // Synchronize the work across groups.
+  hipDeviceSynchronize();
+
+  ////////////////////////////////////// BIAS ///////////////////////////
+  if (!no_bias_) {
       auto* dbias = Output(BIAS_OR_INPUT_GRAD);
       dbias->Resize(M);
       miopen_wrapper_.with_miopen_state(miopen_state_, [&](MIOPENState* state) {
@@ -856,7 +577,6 @@ bool MIOPENConvGradientOp::DoRunWithType() {
           bias_desc_,
           dbias->template mutable_data<T_DB>()));
       });
-    }
   }
 
   return true;
diff --git a/caffe2/python/convnet_benchmarks.py b/caffe2/python/convnet_benchmarks.py
index 3aac78c18df16b..89bbab6bf751a6 100644
--- a/caffe2/python/convnet_benchmarks.py
+++ b/caffe2/python/convnet_benchmarks.py
@@ -59,11 +59,14 @@
 """
 
 import argparse
+import os
 
-from caffe2.python import workspace, brew, model_helper
+from caffe2.python import workspace, brew, model_helper, core, net_drawer, memonger
+from caffe2.proto import caffe2_pb2
+from caffe2.python import data_parallel_model as dpm
+from caffe2.python.models import resnet
 
-
-def MLP(order, cudnn_ws):
+def MLP(order, cudnn_ws, model_path=""):
     model = model_helper.ModelHelper(name="MLP")
     d = 256
     depth = 20
@@ -98,7 +101,7 @@ def MLP(order, cudnn_ws):
     return model, d
 
 
-def AlexNet(order, cudnn_ws):
+def AlexNet(order, cudnn_ws, model_path=""):
     my_arg_scope = {
         'order': order,
         'use_cudnn': True,
@@ -191,7 +194,7 @@ def AlexNet(order, cudnn_ws):
     return model, 224
 
 
-def OverFeat(order, cudnn_ws):
+def OverFeat(order, cudnn_ws, model_path=""):
     my_arg_scope = {
         'order': order,
         'use_cudnn': True,
@@ -277,7 +280,7 @@ def OverFeat(order, cudnn_ws):
     return model, 231
 
 
-def VGGA(order, cudnn_ws):
+def VGGA(order, cudnn_ws, model_path=""):
     my_arg_scope = {
         'order': order,
         'use_cudnn': True,
@@ -475,7 +478,7 @@ def _InceptionModule(
     return output
 
 
-def Inception(order, cudnn_ws):
+def Inception(order, cudnn_ws, model_path=""):
     my_arg_scope = {
         'order': order,
         'use_cudnn': True,
@@ -563,6 +566,112 @@ def Inception(order, cudnn_ws):
     return model, 224
 
 
+def Resnet50(args):
+    gpus = [0]
+    device_opt = core.DeviceOption(caffe2_pb2.HIP)
+    device_opt.hip_gpu_id = gpus[0]
+    num_labels = 1000
+    base_learning_rate = 0.0004 * args.batch_size
+
+    # Weight decay (L2 regularization)
+    weight_decay = 1e-4
+
+    ##################
+    # Define the Model
+    ##################
+    train_model = model_helper.ModelHelper(name="resnet50_train")
+
+    def create_resnet50_model_ops(model, loss_scale=1.0):
+        # residual network
+        [softmax, loss] = resnet.create_resnet50(model,
+                                                 "data",
+                                                 num_input_channels=3,
+                                                 num_labels=num_labels,
+                                                 label="label", )
+        prefix = model.net.Proto().name
+        loss = model.net.Scale(loss, prefix + "_loss", scale=loss_scale)
+        brew.accuracy(model, [softmax, "label"], prefix + "_accuracy")
+        return [loss]
+
+    def add_parameter_update_ops(model):
+        brew.add_weight_decay(model, weight_decay)
+        iter = brew.iter(model, "iter")
+        lr = model.net.LearningRate([iter],
+                                    "lr",
+                                    base_lr=base_learning_rate,
+                                    policy="fixed",
+                                    gamma=0.1)
+        for param in model.GetParams():
+            param_grad = model.param_to_grad[param]
+            param_momentum = model.param_init_net.ConstantFill(
+                [param], param + '_momentum', value=0.0 )
+
+            model.net.MomentumSGDUpdate(
+                [param_grad, param_momentum, lr, param],
+                [param_grad, param_momentum, param],
+                momentum=0.9,
+                nesterov=1)
+
+    def optimize_gradient_memory(model, loss):
+        model.net._net = memonger.share_grad_blobs(
+            model.net,
+            loss,
+            set(model.param_to_grad.values()),
+            namescope="",
+            share_activations=False)
+
+
+    with core.NameScope(""):
+        with core.DeviceScope(device_opt):
+            losses = create_resnet50_model_ops(train_model)
+            if not args.forward_only:
+                blobs_to_gradients = train_model.AddGradientOperators(losses)
+                add_parameter_update_ops(train_model)
+        if not args.forward_only:
+            optimize_gradient_memory(train_model, [blobs_to_gradients[losses[0]]])
+
+    return train_model, 224
+
+
+def Inception_v2(order, cudnn_ws, model_path=""):
+    if model_path == "":
+        print("ERROR: please specify paths to init_net and predict_net protobufs for Inception_v2")
+        exit(1)
+    device_opts = caffe2_pb2.DeviceOption()
+    device_opts.device_type = caffe2_pb2.HIP
+    device_opts.hip_gpu_id = 0
+
+    INIT_NET_PB = os.path.join(model_path, "init_net.pb")
+    PREDICT_NET_PB = os.path.join(model_path, "predict_net.pb")
+    init_def = caffe2_pb2.NetDef()
+    with open(INIT_NET_PB, 'rb') as f:
+        init_def.ParseFromString(f.read())
+        init_def.device_option.CopyFrom(device_opts)
+
+    net_def = caffe2_pb2.NetDef()
+    with open(PREDICT_NET_PB, 'rb') as f:
+        net_def.ParseFromString(f.read())
+        net_def.device_option.CopyFrom(device_opts)
+
+    init_net = core.Net(init_def)
+    predict_net = core.Net(net_def)
+
+    my_arg_scope = {
+        'order': order,
+    }
+
+    model = model_helper.ModelHelper(
+        name="GoogleNet",
+        arg_scope=my_arg_scope,
+    )
+
+    model.param_init_net = init_net
+    model.net = predict_net
+    xent = model.net.LabelCrossEntropy(["prob", "label"], "xent")
+    model.net.AveragedLoss(xent, "loss")
+    return model, 224
+
+
 def AddParameterUpdate(model):
     """ Simple plain SGD update -- not tuned to actually train the models """
     ITER = brew.iter(model, "iter")
@@ -575,7 +684,10 @@ def AddParameterUpdate(model):
 
 
 def Benchmark(model_gen, arg):
-    model, input_size = model_gen(arg.order, arg.cudnn_ws)
+    if arg.model == 'Resnet50':
+        model, input_size = model_gen(arg)
+    else:
+        model, input_size = model_gen(arg.order, arg.cudnn_ws, arg.model_path)
     model.Proto().type = arg.net_type
     model.Proto().num_workers = arg.num_workers
 
@@ -607,8 +719,9 @@ def Benchmark(model_gen, arg):
         print('{}: running forward only.'.format(arg.model))
     else:
         print('{}: running forward-backward.'.format(arg.model))
-        model.AddGradientOperators(["loss"])
-        AddParameterUpdate(model)
+        if not arg.model == "Resnet50":
+            model.AddGradientOperators(["loss"])
+            AddParameterUpdate(model)
         if arg.order == 'NHWC':
             print(
                 '==WARNING==\n'
@@ -701,6 +814,7 @@ def GetArgumentParser():
     parser.add_argument("--num_workers", type=int, default=2)
     parser.add_argument("--use-nvtx", default=False, action='store_true')
     parser.add_argument("--htrace_span_log_path", type=str)
+    parser.add_argument("--model_path", type=str, default="", help="set path to init net and predict_net protobufs")
     return parser
 
 
@@ -723,5 +837,7 @@ def GetArgumentParser():
             'VGGA': VGGA,
             'Inception': Inception,
             'MLP': MLP,
+            'Resnet50': Resnet50,
+            'Inception_v2':Inception_v2
         }
         Benchmark(model_map[args.model], args)
diff --git a/caffe2/python/core.py b/caffe2/python/core.py
index 6850c02fc13964..f08d72347c15b8 100644
--- a/caffe2/python/core.py
+++ b/caffe2/python/core.py
@@ -85,6 +85,7 @@ def IsOperatorWithEngine(op_type, engine):
 def DeviceOption(
     device_type,
     cuda_gpu_id=0,
+    hip_gpu_id=0,
     random_seed=None,
     node_name=None,
     numa_node_id=None,
@@ -93,6 +94,7 @@ def DeviceOption(
     option = caffe2_pb2.DeviceOption()
     option.device_type = device_type
     option.cuda_gpu_id = cuda_gpu_id
+    option.hip_gpu_id = hip_gpu_id
     if node_name is not None:
         option.node_name = node_name
     if random_seed is not None:
@@ -115,7 +117,7 @@ def device_option_equal(opt1, opt2, ignore_node_name=True, ignore_random_seed=Tr
     if not opt1.device_type or not opt2.device_type:
         # At least one option is for CPU, check if both are for CPU.
         return not opt1.device_type and not opt2.device_type
-    return opt1.cuda_gpu_id == opt2.cuda_gpu_id
+    return (opt1.cuda_gpu_id == opt2.cuda_gpu_id) and (opt1.hip_gpu_id == opt2.hip_gpu_id)
 
 
 def InferBlobDevices(net):
@@ -2110,8 +2112,9 @@ def DeduplicateGradientSlices(self, g, aggregator='sum'):
     def RunAllOnGPU(self, gpu_id=0, use_cudnn=False):
         """A convenient function to run everything on the GPU."""
         device_option = caffe2_pb2.DeviceOption()
-        device_option.device_type = caffe2_pb2.CUDA
+        device_option.device_type = caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA
         device_option.cuda_gpu_id = gpu_id
+        device_option.hip_gpu_id = gpu_id
         self._net.device_option.CopyFrom(device_option)
         if use_cudnn:
             for op in self._net.op:
@@ -2280,27 +2283,38 @@ def remap_input(op, blob_name_remapping):
 
 def copy_func_between_devices(src, dst):
     CPU = caffe2_pb2.CPU
-    CUDA = caffe2_pb2.CUDA
+    if workspace.has_hip_support:
+        GPU = caffe2_pb2.HIP
+    else:
+        GPU = caffe2_pb2.CUDA
 
     if src.device_type == CPU and dst.device_type == CPU:
         return None
 
-    if src.device_type == CUDA and dst.device_type == CUDA:
-        if src.cuda_gpu_id == dst.cuda_gpu_id:
-            return None
+    if src.device_type == GPU and dst.device_type == GPU:
+        def fun(net, *args, **kw):
+            with DeviceScope(dst):
+                return net.Copy(*args, **kw)
+
+        if workspace.has_hip_support:
+            if src.hip_gpu_id == dst.hip_gpu_id:
+                return None
+            else:
+                return fun    
         else:
-            def fun(net, *args, **kw):
-                with DeviceScope(dst):
-                    return net.Copy(*args, **kw)
-            return fun
+            if src.cuda_gpu_id == dst.cuda_gpu_id:
+                return None
+            else:
+                return fun
+               
 
-    if src.device_type == CUDA and dst.device_type == CPU:
+    if src.device_type == GPU and dst.device_type == CPU:
         def fun(net, *args, **kw):
             with DeviceScope(src):
                 return net.CopyGPUToCPU(*args, **kw)
         return fun
 
-    if src.device_type == CPU and dst.device_type == CUDA:
+    if src.device_type == CPU and dst.device_type == GPU:
         def fun(net, *args, **kw):
             with DeviceScope(dst):
                 return net.CopyCPUToGPU(*args, **kw)
@@ -2315,7 +2329,12 @@ def device_equal(src, dst):
     comparison between empty device_options and {device_type:0, cuda_gpu_id:0}
     returns not equal in some cases.
     '''
-    return src.device_type == dst.device_type and src.cuda_gpu_id == dst.cuda_gpu_id
+    if workspace.has_hip_support:
+        gpu_id_eq = src.hip_gpu_id == dst.hip_gpu_id
+    else:
+        gpu_id_eq = src.cuda_gpu_id == dst.cuda_gpu_id
+
+    return src.device_type == dst.device_type and gpu_id_eq
 
 
 def update_placeholder_op_output(op, blob_to_device):
@@ -2426,10 +2445,13 @@ def InjectCrossDeviceCopies(net, blob_to_device=None, blob_remap=None,
                     def _gen_new_name(blob, device_option):
                         CPU = caffe2_pb2.CPU
                         CUDA = caffe2_pb2.CUDA
+                        HIP = caffe2_pb2.HIP
                         if device_option.device_type == CPU:
                             suffix = '_cpu'
                         elif device_option.device_type == CUDA:
                             suffix = '_cuda_' + str(device_option.cuda_gpu_id)
+                        elif device_option.device_type == HIP:
+                            suffix = '_hip_' + str(device_option.hip_gpu_id)
                         else:
                             raise RuntimeError(
                                 "Unknown device type: {}".
diff --git a/caffe2/python/core_gradients_test.py b/caffe2/python/core_gradients_test.py
index bf25806f20dde2..88b07eab347323 100644
--- a/caffe2/python/core_gradients_test.py
+++ b/caffe2/python/core_gradients_test.py
@@ -9,7 +9,7 @@
 import unittest
 
 from caffe2.proto import caffe2_pb2
-from caffe2.python import core, test_util
+from caffe2.python import core, test_util, workspace
 from caffe2.python.core import CreateOperator, GradientRegistry
 from caffe2.python import workspace
 
@@ -94,7 +94,7 @@ def assertOperatorListEqual(self, operatorDefList1, operatorDefList2):
 
     @given(device_option=st.sampled_from([
         None,
-        core.DeviceOption(caffe2_pb2.CUDA, 1)]))
+        core.DeviceOption(caffe2_pb2.HIP, hip_gpu_id=1) if workspace.has_hip_support else core.DeviceOption(caffe2_pb2.CUDA, cuda_gpu_id=1)]))
     def testDirect(self, device_option):
         operators = [
             CreateOperator('Direct', 'in', 'hidden'),
@@ -279,7 +279,7 @@ def testUseInputButInputHasBeenChanged(self):
 
     @given(device_option=st.sampled_from([
         None,
-        core.DeviceOption(caffe2_pb2.CUDA, 1)]))
+        core.DeviceOption(caffe2_pb2.HIP, hip_gpu_id=1) if workspace.has_hip_support else core.DeviceOption(caffe2_pb2.CUDA, cuda_gpu_id=1)]))
     def testMultiUseInput(self, device_option):
         """Test gradient for the following case:
 
diff --git a/caffe2/python/core_test.py b/caffe2/python/core_test.py
index 7120843f33152d..67a9a795c573b6 100644
--- a/caffe2/python/core_test.py
+++ b/caffe2/python/core_test.py
@@ -82,18 +82,30 @@ def testDeviceScope(self):
         self.assertFalse(op.HasField('device_option'))
         # explicitly setting a device
         device_option = caffe2_pb2.DeviceOption()
-        device_option.device_type = caffe2_pb2.CUDA
-        device_option.cuda_gpu_id = 1
-        op = core.CreateOperator("Relu", "x", "y", device_option=device_option)
-        self.assertTrue(op.HasField('device_option'))
-        self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA)
-        self.assertEqual(op.device_option.cuda_gpu_id, 1)
+        if workspace.has_hip_support:
+            device_option.device_type = caffe2_pb2.HIP
+            device_option.hip_gpu_id = 1
+            op = core.CreateOperator("Relu", "x", "y", device_option=device_option)
+            self.assertTrue(op.HasField('device_option'))
+            self.assertEqual(op.device_option.device_type, caffe2_pb2.HIP)
+            self.assertEqual(op.device_option.hip_gpu_id, 1)
+        else:
+            device_option.device_type = caffe2_pb2.CUDA
+            device_option.cuda_gpu_id = 1
+            op = core.CreateOperator("Relu", "x", "y", device_option=device_option)
+            self.assertTrue(op.HasField('device_option'))
+            self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA)
+            self.assertEqual(op.device_option.cuda_gpu_id, 1)
         with core.DeviceScope(device_option):
             # from device scope
             op = core.CreateOperator("Relu", "x", "y")
             self.assertTrue(op.HasField('device_option'))
-            self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA)
-            self.assertEqual(op.device_option.cuda_gpu_id, 1)
+            if workspace.has_hip_support:
+                self.assertEqual(op.device_option.device_type, caffe2_pb2.HIP)
+                self.assertEqual(op.device_option.hip_gpu_id, 1)
+            else:
+                self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA)
+                self.assertEqual(op.device_option.cuda_gpu_id, 1)
             # from an overridden device option
             override_device = caffe2_pb2.DeviceOption()
             override_device.device_type = caffe2_pb2.CPU
@@ -108,14 +120,22 @@ def testDeviceScope(self):
 
     def testNameAndDeviceScopeTogether(self):
         device_option = caffe2_pb2.DeviceOption()
-        device_option.device_type = caffe2_pb2.CUDA
-        device_option.cuda_gpu_id = 1
+        if workspace.has_hip_support:
+            device_option.device_type = caffe2_pb2.HIP
+            device_option.hip_gpu_id = 1
+        else:
+            device_option.device_type = caffe2_pb2.CUDA
+            device_option.cuda_gpu_id = 1
         with core.DeviceScope(device_option):
             with core.NameScope("foo"):
                 op = core.CreateOperator("Relu", "x", "y")
                 self.assertTrue(op.HasField('device_option'))
-                self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA)
-                self.assertEqual(op.device_option.cuda_gpu_id, 1)
+                if workspace.has_hip_support:
+                    self.assertEqual(op.device_option.device_type, caffe2_pb2.HIP)
+                    self.assertEqual(op.device_option.hip_gpu_id, 1)
+                else:
+                    self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA)
+                    self.assertEqual(op.device_option.cuda_gpu_id, 1)
                 self.assertEqual(len(op.input), 1)
                 self.assertEqual(op.input[0], "foo/x")
                 self.assertEqual(len(op.output), 1)
@@ -254,8 +274,12 @@ def testSetInputRecordWithoutBlobs(self):
 class TestCreateOperator(test_util.TestCase):
     def testCreate(self):
         device_option = caffe2_pb2.DeviceOption()
-        device_option.device_type = caffe2_pb2.CUDA
-        device_option.cuda_gpu_id = 1
+        if workspace.has_hip_support:
+            device_option.device_type = caffe2_pb2.HIP
+            device_option.hip_gpu_id = 1
+        else:
+            device_option.device_type = caffe2_pb2.CUDA
+            device_option.cuda_gpu_id = 1
         op = core.CreateOperator(
             "Ludicrous", "x", "y", name="ludicrous",
             control_input="z", device_option=device_option,
@@ -270,8 +294,12 @@ def testCreate(self):
         self.assertEqual(len(op.control_input), 1)
         self.assertEqual(op.control_input[0], "z")
         self.assertTrue(op.HasField('device_option'))
-        self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA)
-        self.assertEqual(op.device_option.cuda_gpu_id, 1)
+        if workspace.has_hip_support:
+            self.assertEqual(op.device_option.device_type, caffe2_pb2.HIP)
+            self.assertEqual(op.device_option.hip_gpu_id, 1)
+        else:    
+            self.assertEqual(op.device_option.device_type, caffe2_pb2.CUDA)
+            self.assertEqual(op.device_option.cuda_gpu_id, 1)
         self.assertTrue(len(op.arg), 3)
 
         # can't guarantee ordering of kwargs, so generate a set of args
@@ -574,12 +602,15 @@ def test_check_equal_default_value(self):
         opt2 = caffe2_pb2.DeviceOption()
         opt1.device_type = 0
         self.assertTrue(core.device_option_equal(opt1, opt2))
-        opt1.cuda_gpu_id = 5
+        if workspace.has_hip_support:
+            opt1.hip_gpu_id = 5
+        else:
+            opt1.cuda_gpu_id = 5
         # opt1 still is on CPU, so the options should be equal
         self.assertTrue(core.device_option_equal(opt1, opt2))
         opt2.device_type = 0
         self.assertTrue(core.device_option_equal(opt1, opt2))
-        opt1.device_type = 1
+        opt1.device_type = 6 if workspace.has_hip_support else 1
         self.assertFalse(core.device_option_equal(opt1, opt2))
 
 
@@ -643,14 +674,18 @@ def test_inject_copy(self):
         self.assertEqual(op.input[2], "fc_b")
 
 
-@unittest.skipIf(not workspace.has_gpu_support, 'No GPU support')
+@unittest.skipIf(not workspace.has_gpu_support and not workspace.has_hip_support, 'No GPU support')
 class TestInferDevice(test_util.TestCase):
 
     def setUp(self):
         device_option = caffe2_pb2.DeviceOption()
-        device_option.device_type = caffe2_pb2.CUDA
-        device_option.cuda_gpu_id = 1
-        self.cuda_option = device_option
+        if workspace.has_hip_support:
+            device_option.device_type = caffe2_pb2.HIP
+            device_option.hip_gpu_id = 1
+        else:
+            device_option.device_type = caffe2_pb2.CUDA
+            device_option.cuda_gpu_id = 1
+        self.gpu_option = device_option
         self.cpu_option = caffe2_pb2.DeviceOption()
 
     def _test_op(
@@ -662,7 +697,7 @@ def _test_op(
         inputs=None,
         outputs=None
     ):
-        op_option = self.cuda_option if not op_option else op_option
+        op_option = self.gpu_option if not op_option else op_option
         inputs = ["blob_1"] if not inputs else inputs
         outputs = ["blob_2"] if not outputs else outputs
         with core.DeviceScope(op_option):
@@ -690,9 +725,9 @@ def _test_op(
     def test_infer_device(self):
         self._test_op(
             "FC",
-            self.cuda_option,
-            self.cuda_option,
-            op_option=self.cuda_option,
+            self.gpu_option,
+            self.gpu_option,
+            op_option=self.gpu_option,
             inputs=["data", "fc_w", "fc_b"],
             outputs=["fc_1"]
         )
@@ -700,17 +735,17 @@ def test_infer_device(self):
     def test_infer_device_split_by_lengths(self):
         self._test_op(
             "SplitByLengths",
-            [self.cuda_option, self.cpu_option],
-            self.cuda_option,
-            op_option=self.cuda_option,
+            [self.gpu_option, self.cpu_option],
+            self.gpu_option,
+            op_option=self.gpu_option,
             inputs=["data", "fc_w"],
             outputs=["fc_1"]
         )
 
     def test_infer_device_cross_device(self):
-        self._test_op("CopyGPUToCPU", self.cuda_option, self.cpu_option)
-        self._test_op("CopyCPUToGPU", self.cpu_option, self.cuda_option)
-        self._test_op("CopyFromCPUInput", self.cpu_option, self.cuda_option)
+        self._test_op("CopyGPUToCPU", self.gpu_option, self.cpu_option)
+        self._test_op("CopyCPUToGPU", self.cpu_option, self.gpu_option)
+        self._test_op("CopyFromCPUInput", self.cpu_option, self.gpu_option)
         self._test_op(
             "CopyFromCPUInput",
             self.cpu_option,
@@ -720,7 +755,7 @@ def test_infer_device_cross_device(self):
 
     def test_device_inference_function(self):
         # ConcatOp.
-        op_option = self.cuda_option
+        op_option = self.gpu_option
         with core.DeviceScope(op_option):
             op = core.CreateOperator(
                 'Concat',
@@ -732,7 +767,7 @@ def test_device_inference_function(self):
         self.assertEqual(output_dev[1], self.cpu_option)
 
         #SplitOp.
-        op_option = self.cuda_option
+        op_option = self.gpu_option
         with core.DeviceScope(op_option):
             op = core.CreateOperator(
                 'Split',
@@ -747,8 +782,12 @@ def test_inject_copy(self):
         net = core.Net("test")
         init_net = core.Net("init")
         device_option = caffe2_pb2.DeviceOption()
-        device_option.device_type = caffe2_pb2.CUDA
-        device_option.cuda_gpu_id = 1
+        if workspace.has_hip_support:
+            device_option.device_type = caffe2_pb2.HIP
+            device_option.hip_gpu_id = 1
+        else:
+            device_option.device_type = caffe2_pb2.CUDA
+            device_option.cuda_gpu_id = 1
         weight = init_net.XavierFill([], 'fc_w', shape=[10, 100])
         bias = init_net.ConstantFill([], 'fc_b', shape=[10, ])
 
@@ -761,11 +800,18 @@ def test_inject_copy(self):
         )
         op = new_net._net.op[-1]
         self.assertEqual(op.type, "FC")
-        self.assertEqual(op.input[0], "data_cuda_1")
-        self.assertEqual(op.input[1], "fc_w_cuda_1")
-        self.assertEqual(op.input[2], "fc_b_cuda_1")
-        self.assertEqual(op.device_option.device_type, 1)
-        self.assertEqual(op.device_option.cuda_gpu_id, 1)
+        if workspace.has_hip_support:
+            self.assertEqual(op.input[0], "data_hip_1")
+            self.assertEqual(op.input[1], "fc_w_hip_1")
+            self.assertEqual(op.input[2], "fc_b_hip_1")
+            self.assertEqual(op.device_option.device_type, 6)
+            self.assertEqual(op.device_option.hip_gpu_id, 1)
+        else:
+            self.assertEqual(op.input[0], "data_cuda_1")
+            self.assertEqual(op.input[1], "fc_w_cuda_1")
+            self.assertEqual(op.input[2], "fc_b_cuda_1")
+            self.assertEqual(op.device_option.device_type, 1)
+            self.assertEqual(op.device_option.cuda_gpu_id, 1)
         self.assertEqual(new_net._net.op[-2].type, "CopyCPUToGPU")
         self.assertEqual(new_net._net.op[0].type, "CopyCPUToGPU")
         self.assertNotEqual(blob_to_device["fc_w"], device_option)
@@ -774,8 +820,12 @@ def test_cross_nets(self):
         net = core.Net("test")
         init_net = core.Net("init")
         device_option = caffe2_pb2.DeviceOption()
-        device_option.device_type = caffe2_pb2.CUDA
-        device_option.cuda_gpu_id = 1
+        if workspace.has_hip_support:
+            device_option.device_type = caffe2_pb2.HIP
+            device_option.hip_gpu_id = 1
+        else:
+            device_option.device_type = caffe2_pb2.CUDA
+            device_option.cuda_gpu_id = 1
         weight = init_net.XavierFill([], 'fc_w', shape=[10, 100])
         bias = init_net.ConstantFill([], 'fc_b', shape=[10, ])
         const = init_net.ConstantFill([], 'const', shape=[], value=1.)
@@ -790,28 +840,53 @@ def test_cross_nets(self):
         )
         op = nets[1]._net.op[0]
         self.assertEqual(op.type, "CopyCPUToGPU")
-        self.assertEqual(op.device_option.device_type, 1)
-        self.assertEqual(op.device_option.cuda_gpu_id, 1)
-        self.assertEqual(op.output[0], "fc_w_cuda_1")
+        if workspace.has_hip_support:
+            self.assertEqual(op.device_option.device_type, 6)
+            self.assertEqual(op.device_option.device_type, 6)
+            self.assertEqual(op.device_option.hip_gpu_id, 1)
+            self.assertEqual(op.output[0], "fc_w_hip_1")
+        else:
+            self.assertEqual(op.device_option.device_type, 1)
+            self.assertEqual(op.device_option.device_type, 1)
+            self.assertEqual(op.device_option.cuda_gpu_id, 1)
+            self.assertEqual(op.output[0], "fc_w_cuda_1")
         op = nets[1]._net.op[1]
         self.assertEqual(op.type, "CopyCPUToGPU")
-        self.assertEqual(op.device_option.device_type, 1)
-        self.assertEqual(op.device_option.cuda_gpu_id, 1)
-        self.assertEqual(op.output[0], "fc_b_cuda_1")
+        if workspace.has_hip_support:
+            self.assertEqual(op.device_option.device_type, 6)
+            self.assertEqual(op.device_option.hip_gpu_id, 1)
+            self.assertEqual(op.output[0], "fc_b_hip_1")
+        else:
+            self.assertEqual(op.device_option.device_type, 1)
+            self.assertEqual(op.device_option.cuda_gpu_id, 1)
+            self.assertEqual(op.output[0], "fc_b_cuda_1")
         op = nets[1]._net.op[2]
         self.assertEqual(op.type, "FC")
         self.assertEqual(op.input[0], "data")
-        self.assertEqual(op.input[1], "fc_w_cuda_1")
-        self.assertEqual(op.input[2], "fc_b_cuda_1")
-        self.assertEqual(op.device_option.device_type, 1)
-        self.assertEqual(op.device_option.cuda_gpu_id, 1)
+        if workspace.has_hip_support:
+            self.assertEqual(op.input[1], "fc_w_hip_1")
+            self.assertEqual(op.input[2], "fc_b_hip_1")
+            self.assertEqual(op.device_option.device_type, 6)
+            self.assertEqual(op.device_option.hip_gpu_id, 1)
+        else:
+            self.assertEqual(op.input[1], "fc_w_cuda_1")
+            self.assertEqual(op.input[2], "fc_b_cuda_1")
+            self.assertEqual(op.device_option.device_type, 1)
+            self.assertEqual(op.device_option.cuda_gpu_id, 1)
         op = nets[1]._net.op[3]
         self.assertEqual(op.type, "Add")
         self.assertEqual(op.input[0], "fc1")
-        self.assertEqual(op.input[1], "const_cuda_1")
+        if workspace.has_hip_support:
+            self.assertEqual(op.input[1], "const_hip_1")
+        else:
+            self.assertEqual(op.input[1], "const_cuda_1")
         # check that moved blob is in input to the new net
-        for c in ["data", "fc_w", "fc_b", "const_cuda_1"]:
-            self.assertTrue(c in nets[1]._net.external_input)
+        if workspace.has_hip_support:
+            for c in ["data", "fc_w", "fc_b", "const_hip_1"]:
+                self.assertTrue(c in nets[1]._net.external_input)
+        else:
+            for c in ["data", "fc_w", "fc_b", "const_cuda_1"]:
+                self.assertTrue(c in nets[1]._net.external_input)
         """
 For reference, net.Proto() should be like:
 name: ""
@@ -911,8 +986,12 @@ def test_cross_nets_no_change(self):
     def test_inject_copy_multi_use(self):
         net = core.Net("test")
         device_option = caffe2_pb2.DeviceOption()
-        device_option.device_type = caffe2_pb2.CUDA
-        device_option.cuda_gpu_id = 1
+        if workspace.has_hip_support:
+            device_option.device_type = caffe2_pb2.HIP
+            device_option.hip_gpu_id = 1
+        else:
+            device_option.device_type = caffe2_pb2.CUDA
+            device_option.cuda_gpu_id = 1
 
         with core.DeviceScope(device_option):
             net.Relu("data", "relu1")
@@ -920,23 +999,38 @@ def test_inject_copy_multi_use(self):
         with core.DeviceScope(device_option):
             net.Relu("data", "relu3")
         net.Relu("data", "relu4")
-        device_option.cuda_gpu_id = 0
+        if workspace.has_hip_support:
+            device_option.hip_gpu_id = 0
+        else:
+            device_option.cuda_gpu_id = 0
         with core.DeviceScope(device_option):
             net.Relu("data", "relu5")
-        device_option.cuda_gpu_id = 1
+        if workspace.has_hip_support:
+            device_option.hip_gpu_id = 1
+        else:
+            device_option.cuda_gpu_id = 1
         with core.DeviceScope(device_option):
             net.Relu("data", "relu6")
 
         new_net, _ = core.InjectCrossDeviceCopies(net)
         op = new_net._net.op[0]
         self.assertEqual(op.type, "CopyCPUToGPU")
-        self.assertEqual(op.device_option.device_type, 1)
-        self.assertEqual(op.device_option.cuda_gpu_id, 1)
-        self.assertEqual(op.output[0], "data_cuda_1")
+        if workspace.has_hip_support:
+            self.assertEqual(op.device_option.device_type, 6)
+            self.assertEqual(op.device_option.hip_gpu_id, 1)
+            self.assertEqual(op.output[0], "data_hip_1")
+        else:
+            self.assertEqual(op.device_option.device_type, 1)
+            self.assertEqual(op.device_option.cuda_gpu_id, 1)
+            self.assertEqual(op.output[0], "data_cuda_1")
         op = new_net._net.op[1]
         self.assertEqual(op.type, "Relu")
-        self.assertEqual(op.device_option.device_type, 1)
-        self.assertEqual(op.device_option.cuda_gpu_id, 1)
+        if workspace.has_hip_support:
+            self.assertEqual(op.device_option.device_type, 6)
+            self.assertEqual(op.device_option.hip_gpu_id, 1)
+        else:
+            self.assertEqual(op.device_option.device_type, 1)
+            self.assertEqual(op.device_option.cuda_gpu_id, 1)
         self.assertEqual(op.output[0], "relu1")
         op = new_net._net.op[2]
         self.assertEqual(op.type, "Relu")
@@ -944,9 +1038,14 @@ def test_inject_copy_multi_use(self):
         self.assertEqual(op.output[0], "relu2")
         op = new_net._net.op[3]
         self.assertEqual(op.type, "Relu")
-        self.assertEqual(op.device_option.device_type, 1)
-        self.assertEqual(op.device_option.cuda_gpu_id, 1)
-        self.assertEqual(op.input[0], "data_cuda_1")
+        if workspace.has_hip_support:
+            self.assertEqual(op.device_option.device_type, 6)
+            self.assertEqual(op.device_option.hip_gpu_id, 1)
+            self.assertEqual(op.input[0], "data_hip_1")
+        else:
+            self.assertEqual(op.device_option.device_type, 1)
+            self.assertEqual(op.device_option.cuda_gpu_id, 1)
+            self.assertEqual(op.input[0], "data_cuda_1")
         self.assertEqual(op.output[0], "relu3")
         op = new_net._net.op[4]
         self.assertEqual(op.type, "Relu")
@@ -954,20 +1053,35 @@ def test_inject_copy_multi_use(self):
         self.assertEqual(op.output[0], "relu4")
         op = new_net._net.op[5]
         self.assertEqual(op.type, "CopyCPUToGPU")
-        self.assertEqual(op.device_option.device_type, 1)
-        self.assertEqual(op.device_option.cuda_gpu_id, 0)
-        self.assertEqual(op.output[0], "data_cuda_0")
+        if workspace.has_hip_support:
+            self.assertEqual(op.device_option.device_type, 6)
+            self.assertEqual(op.device_option.hip_gpu_id, 0)
+            self.assertEqual(op.output[0], "data_hip_0")
+        else:
+            self.assertEqual(op.device_option.device_type, 1)
+            self.assertEqual(op.device_option.cuda_gpu_id, 0)
+            self.assertEqual(op.output[0], "data_cuda_0")
         op = new_net._net.op[6]
         self.assertEqual(op.type, "Relu")
-        self.assertEqual(op.device_option.device_type, 1)
-        self.assertEqual(op.device_option.cuda_gpu_id, 0)
-        self.assertEqual(op.input[0], "data_cuda_0")
+        if workspace.has_hip_support:
+            self.assertEqual(op.device_option.device_type, 6)
+            self.assertEqual(op.device_option.hip_gpu_id, 0)
+            self.assertEqual(op.input[0], "data_hip_0")
+        else:
+            self.assertEqual(op.device_option.device_type, 1)
+            self.assertEqual(op.device_option.cuda_gpu_id, 0)
+            self.assertEqual(op.input[0], "data_cuda_0")
         self.assertEqual(op.output[0], "relu5")
         op = new_net._net.op[7]
         self.assertEqual(op.type, "Relu")
-        self.assertEqual(op.device_option.device_type, 1)
-        self.assertEqual(op.device_option.cuda_gpu_id, 1)
-        self.assertEqual(op.input[0], "data_cuda_1")
+        if workspace.has_hip_support:
+            self.assertEqual(op.device_option.device_type, 6)
+            self.assertEqual(op.device_option.hip_gpu_id, 1)
+            self.assertEqual(op.input[0], "data_hip_1")
+        else:
+            self.assertEqual(op.device_option.device_type, 1)
+            self.assertEqual(op.device_option.cuda_gpu_id, 1)
+            self.assertEqual(op.input[0], "data_cuda_1")
         self.assertEqual(op.output[0], "relu6")
         """
 For reference, net.Proto() should be like:
@@ -1059,8 +1173,12 @@ def test_inject_copy_placeholder_ops(self):
             cpu_device.append(caffe2_pb2.DeviceOption())
             cpu_device[i].node_name = 'node:' + str(i)
             gpu_device.append(caffe2_pb2.DeviceOption())
-            gpu_device[i].device_type = caffe2_pb2.CUDA
-            gpu_device[i].cuda_gpu_id = 0
+            if workspace.has_hip_support:
+                gpu_device[i].device_type = caffe2_pb2.HIP
+                gpu_device[i].hip_gpu_id = 0
+            else:
+                gpu_device[i].device_type = caffe2_pb2.CUDA
+                gpu_device[i].cuda_gpu_id = 0
             gpu_device[i].node_name = 'node:' + str(i)
         send_node = 'node:0'
         recv_node = 'node:1'
@@ -1099,13 +1217,21 @@ def test_inject_copy_placeholder_ops(self):
         # Verify (init_net)
         op = init_net._net.op[2]
         self.assertEqual(op.type, "CopyGPUToCPU")
-        self.assertEqual(op.device_option.device_type, 1)
-        self.assertEqual(op.device_option.cuda_gpu_id, 0)
+        if workspace.has_hip_support:
+            self.assertEqual(op.device_option.device_type, 6)
+            self.assertEqual(op.device_option.hip_gpu_id, 0)
+        else:
+            self.assertEqual(op.device_option.device_type, 1)
+            self.assertEqual(op.device_option.cuda_gpu_id, 0)
         self.assertEqual(op.output[0], "fc_w_cpu")
         op = init_net._net.op[3]
         self.assertEqual(op.type, "CopyGPUToCPU")
-        self.assertEqual(op.device_option.device_type, 1)
-        self.assertEqual(op.device_option.cuda_gpu_id, 0)
+        if workspace.has_hip_support:
+            self.assertEqual(op.device_option.device_type, 6)
+            self.assertEqual(op.device_option.hip_gpu_id, 0)
+        else:
+            self.assertEqual(op.device_option.device_type, 1)
+            self.assertEqual(op.device_option.cuda_gpu_id, 0)
         self.assertEqual(op.output[0], "fc_b_cpu")
         op = init_net._net.op[4]
         self.assertEqual(op.type, placeholder_send)
@@ -1127,8 +1253,12 @@ def test_inject_copy_placeholder_ops(self):
     def test_blob_inplace(self):
         net = core.Net("test")
         device_option = caffe2_pb2.DeviceOption()
-        device_option.device_type = caffe2_pb2.CUDA
-        device_option.cuda_gpu_id = 1
+        if workspace.has_hip_support:
+            device_option.device_type = caffe2_pb2.HIP
+            device_option.hip_gpu_id = 1
+        else:
+            device_option.device_type = caffe2_pb2.CUDA
+            device_option.cuda_gpu_id = 1
 
         net.Adagrad(['param', 'moment', 'grad', 'lr'], ['param', 'moment'])
         with core.DeviceScope(device_option):
@@ -1137,10 +1267,15 @@ def test_blob_inplace(self):
         op = net._net.op[1]
         self.assertEqual(op.type, 'CopyCPUToGPU')
         self.assertEqual(op.input[0], 'param')
-        self.assertEqual(op.output[0], 'param_cuda_1')
+        if workspace.has_hip_support:
+            self.assertEqual(op.output[0], 'param_hip_1')
+        else:
+            self.assertEqual(op.output[0], 'param_cuda_1')
         op = net._net.op[2]
-        self.assertEqual(op.input[0], 'param_cuda_1')
-
+        if workspace.has_hip_support:
+            self.assertEqual(op.input[0], 'param_hip_1')
+        else:
+            self.assertEqual(op.input[0], 'param_cuda_1')
         net.Relu('nonsense_input', 'moment')
         # should not raise inplace error
         core.InjectCrossDeviceCopies(net)
diff --git a/caffe2/python/data_parallel_model.py b/caffe2/python/data_parallel_model.py
index 89770dc6ea7d9a..b6d425a699f220 100644
--- a/caffe2/python/data_parallel_model.py
+++ b/caffe2/python/data_parallel_model.py
@@ -136,17 +136,17 @@ def Parallelize(
 
     if devices is None:
         if not cpu_device:
-            devices = list(range(0, workspace.NumCudaDevices()))
+            devices = list(range(0, workspace.NumGpuDevices()))
         else:
             devices = list(range(0, cpu_count()))
 
     if not cpu_device:
         for gpu in devices:
-            if gpu >= workspace.NumCudaDevices():
+            if gpu >= workspace.NumGpuDevices():
                 log.warning("** Only {} GPUs available, GPUs {} requested".format(
-                    workspace.NumCudaDevices(), devices))
+                    workspace.NumGpuDevices(), devices))
                 break
-        model_helper_obj._device_type = caffe2_pb2.CUDA
+        model_helper_obj._device_type = caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA
         model_helper_obj._device_prefix = "gpu"
         model_helper_obj._shared_model = False
         device_name = "GPU"
@@ -176,7 +176,6 @@ def Parallelize(
     model_helper_obj._grad_names = []
 
     assert isinstance(model_helper_obj, model_helper.ModelHelper)
-
     # Keep track of params that were in the model before: they are not
     # data parallel, so we need to handle them separately
     non_datapar_params = copy.copy(model_helper_obj.params)
@@ -211,7 +210,10 @@ def Parallelize(
         # TODO: make into assert
 
     for device in devices:
-        device_opt = core.DeviceOption(model_helper_obj._device_type, device)
+        if workspace.has_hip_support:
+            device_opt = core.DeviceOption(model_helper_obj._device_type, hip_gpu_id=device)
+        else:
+            device_opt = core.DeviceOption(model_helper_obj._device_type, cuda_gpu_id=device)
         with core.DeviceScope(device_opt):
             with core.NameScope("{}_{}".format(model_helper_obj._device_prefix,
                                                device)):
@@ -321,7 +323,10 @@ def Parallelize(
 
     if param_update_builder_fun is not None:
         for device in devices:
-            device_opt = core.DeviceOption(model_helper_obj._device_type, device)
+            if workspace.has_hip_support:
+                device_opt = core.DeviceOption(model_helper_obj._device_type, hip_gpu_id=device)
+            else:
+                device_opt = core.DeviceOption(model_helper_obj._device_type, cuda_gpu_id=device)
             with core.DeviceScope(device_opt):
                 with core.NameScope(
                     "{}_{}".format(model_helper_obj._device_prefix, device)
@@ -366,7 +371,10 @@ def Parallelize(
     # i.e. making sure multi-precision copies of parameters are up-to-date
     if post_sync_builder_fun is not None:
         for device in devices:
-            device_opt = core.DeviceOption(model_helper_obj._device_type, device)
+            if workspace.has_hip_support:
+                device_opt = core.DeviceOption(model_helper_obj._device_type, hip_gpu_id=device)
+            else:
+                device_opt = core.DeviceOption(model_helper_obj._device_type, cuda_gpu_id=device)
             with core.DeviceScope(device_opt):
                 with core.NameScope(
                     "{}_{}".format(model_helper_obj._device_prefix, device)
@@ -446,17 +454,17 @@ def Parallelize_BMUF(
     assert isinstance(model_helper_obj, model_helper.ModelHelper)
 
     if devices is None:
-        devices = list(range(0, workspace.NumCudaDevices()))
+        devices = list(range(0, workspace.NumGpuDevices()))
     if master_device is None:
         master_device = devices[0]
 
     if not cpu_device:
         for gpu in devices:
-            if gpu >= workspace.NumCudaDevices():
+            if gpu >= workspace.NumGpuDevices():
                 log.warning("** Only {} GPUs available, GPUs {} requested".format(
-                    workspace.NumCudaDevices(), devices))
+                    workspace.NumGpuDevices(), devices))
                 break
-        model_helper_obj._device_type = caffe2_pb2.CUDA
+        model_helper_obj._device_type = caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA
         model_helper_obj._device_prefix = "gpu"
     else:
         model_helper_obj._device_type = caffe2_pb2.CPU
@@ -467,7 +475,10 @@ def Parallelize_BMUF(
     model_helper_obj._sync_barrier_net = None
     model_helper_obj._broadcast_context = None
     model_helper_obj._shared_model = False
-    master_dev_opt = core.DeviceOption(model_helper_obj._device_type, master_device)
+    if workspace.has_hip_support:
+        master_dev_opt = core.DeviceOption(model_helper_obj._device_type, hip_gpu_id=master_device)
+    else:
+        master_dev_opt = core.DeviceOption(model_helper_obj._device_type, cuda_gpu_id=master_device)
 
     # question: rendezvous structure
     num_shards = rendezvous['num_shards'] if rendezvous else 1
@@ -811,9 +822,9 @@ def builder_fun(model):
     if device is None:
         device = scope.CurrentDeviceScope()
 
-    device_prefix = "gpu" if device.device_type == caffe2_pb2.CUDA else "cpu"
+    device_prefix = "gpu" if (device.device_type == caffe2_pb2.CUDA or device.device_type == caffe2_pb2.HIP) else "cpu"
 
-    namescope = "{}_{}/".format(device_prefix, device.cuda_gpu_id)
+    namescope = "{}_{}/".format(device_prefix, device.hip_gpu_id if workspace.has_hip_support else device.cuda_gpu_id)
     for op in mnet.Proto().op:
         if "RecurrentNetwork" in op.type:
             raise("RecurrentNetwork conversion not yet supported")
@@ -834,7 +845,10 @@ def builder_fun(model):
 def _ForEachDevice(devices, f, device_type, device_prefix, scoped=False,
                    *args, **kwargs):
     for device in devices:
-        device_opt = core.DeviceOption(device_type, device)
+        if workspace.has_hip_support:
+            device_opt = core.DeviceOption(device_type, hip_gpu_id=device)
+        else:
+            device_opt = core.DeviceOption(device_type, cuda_gpu_id=device)
         with core.DeviceScope(device_opt):
             if scoped:
                 with core.NameScope("{}_{}".format(device_prefix, device)):
@@ -850,7 +864,10 @@ def create_grad(lossp):
     loss_grad = {}
     # Explicitly need to create gradients on each GPU
     for gpu_id in devices:
-        device = core.DeviceOption(model._device_type, gpu_id)
+        if workspace.has_hip_support:
+            device = core.DeviceOption(model._device_type, hip_gpu_id=gpu_id)
+        else:
+            device = core.DeviceOption(model._device_type, cuda_gpu_id=gpu_id)
         with core.DeviceScope(device):
             for l in losses_by_gpu[gpu_id]:
                 lg = create_grad(l)
@@ -968,7 +985,7 @@ def GetLearningRateBlobNames(model):
     if model._optimizer is not None:
         if model._device_type == caffe2_pb2.CPU:
             return [model._optimizer.get_cpu_blob_name('lr')]
-        elif model._device_type == caffe2_pb2.CUDA:
+        elif model._device_type == caffe2_pb2.CUDA or model._device_type == caffe2_pb2.HIP:
             return [model._optimizer.get_gpu_blob_name('lr', gpu, '')
                     for gpu in model._devices]
         else:
@@ -989,7 +1006,10 @@ def _Broadcast(devices, model, net, param, use_nccl=False):
 
     if use_nccl:
         if _IsGPUBlob(model, param):
-            master_device_opt = core.DeviceOption(model._device_type, master_dev)
+            if workspace.has_hip_support:
+                master_device_opt = core.DeviceOption(model._device_type, hip_gpu_id=master_dev)
+            else:
+                master_device_opt = core.DeviceOption(model._device_type, cuda_gpu_id=master_dev)
             with core.DeviceScope(master_device_opt):
                 # Note that the root is the root _rank_ and not the root
                 # _device_. Thus we always use root=0, regardless of the
@@ -1003,7 +1023,10 @@ def _Broadcast(devices, model, net, param, use_nccl=False):
 
     for dev_idx in devices[1:]:
         if _IsGPUBlob(model, param):
-            device_opt = core.DeviceOption(caffe2_pb2.CUDA, dev_idx)
+            if workspace.has_hip_support:
+                device_opt = core.DeviceOption(caffe2_pb2.HIP, hip_gpu_id=dev_idx)
+            else:
+                device_opt = core.DeviceOption(caffe2_pb2.CUDA, cuda_gpu_id=dev_idx)
         else:
             device_opt = core.DeviceOption(caffe2_pb2.CPU, 0)
         with core.DeviceScope(device_opt):
@@ -1024,6 +1047,8 @@ def _AllReduce(devices, model, net, param, use_nccl=False, control_input=None):
 
     if model._device_type == caffe2_pb2.CUDA:
         p2p_access_pattern = workspace.GetCudaPeerAccessPattern()
+    elif model._device_type == caffe2_pb2.HIP:
+        p2p_access_pattern = workspace.GetHipPeerAccessPattern()
     else:
         p2p_access_pattern = None
 
@@ -1048,7 +1073,10 @@ def sumN(*dev_indices):
                     blobs[i],
                     'gpu_{}/{}_gpu{}_copy'.format(devices[0], param, peer)
                 )
-        device_opt = core.DeviceOption(model._device_type, devices[0])
+        if workspace.has_hip_support:
+            device_opt = core.DeviceOption(model._device_type, hip_gpu_id=devices[0])
+        else:
+            device_opt = core.DeviceOption(model._device_type, cuda_gpu_id=devices[0])
         with core.DeviceScope(device_opt):
             net.Sum(blobs, [blobs[0]], name='dpm')
 
@@ -1155,7 +1183,10 @@ def _SyncAllParamsDistributed(
 ):
     assert rendezvous['num_shards'] > 1
 
-    gpu_device_opt = core.DeviceOption(model._device_type, devices[0])
+    if workspace.has_hip_support:
+        gpu_device_opt = core.DeviceOption(model._device_type, hip_gpu_id=devices[0])
+    else:
+        gpu_device_opt = core.DeviceOption(model._device_type, cuda_gpu_id=devices[0])
     cpu_device_opt = core.DeviceOption(caffe2_pb2.CPU)
 
     if model._broadcast_context is None:
@@ -1330,8 +1361,11 @@ def _AllReduceBlobsDistributed(
     num_workers = model.net.Proto().num_workers
     assert num_workers > 1, "Please specify more than 1 worker"
     all_reduce_engine = rendezvous['engine']
-
-    master_device_opt = core.DeviceOption(model._device_type, devices[0])
+    
+    if workspace.has_hip_support:
+        master_device_opt = core.DeviceOption(model._device_type, hip_gpu_id=devices[0])
+    else:
+        master_device_opt = core.DeviceOption(model._device_type, cuda_gpu_id=devices[0])
 
     reducing_device_opt = master_device_opt
 
@@ -1407,7 +1441,10 @@ def _AllReduceBlobsSingleHost(blob_names, devices, model, net, use_nccl):
 
     # Now we need to Allreduce blobs on all the GPUs.
     # Pick GPU #0 as a master GPU.
-    master_device_opt = core.DeviceOption(model._device_type, devices[0])
+    if workspace.has_hip_support:
+        master_device_opt = core.DeviceOption(model._device_type, hip_gpu_id=devices[0])
+    else:
+        master_device_opt = core.DeviceOption(model._device_type, cuda_gpu_id=devices[0])
     last_out = None
     concatenated_idx = set()
 
@@ -1453,7 +1490,10 @@ def _AllReduceBlobsSingleHost(blob_names, devices, model, net, use_nccl):
                             name="note:data_parallel_model")
 
                         for gpu, g in viewitems(model._device_grouped_blobs[blob_name]):
-                            device_opt = core.DeviceOption(model._device_type, gpu)
+                            if workspace.has_hip_support:
+                                device_opt = core.DeviceOption(model._device_type, hip_gpu_id=gpu)
+                            else:
+                                device_opt = core.DeviceOption(model._device_type, cuda_gpu_id=gpu)
                             with core.DeviceScope(device_opt):
                                 model.Copy(grad_idx_concat, g.indices)
                                 concatenated_idx.add(g.indices)
@@ -1465,7 +1505,10 @@ def _AllReduceBlobsSingleHost(blob_names, devices, model, net, use_nccl):
                         axis=0, name="note:data_parallel_model")
 
                     for gpu, g in viewitems(model._device_grouped_blobs[blob_name]):
-                        device_opt = core.DeviceOption(model._device_type, gpu)
+                        if workspace.has_hip_support:
+                            device_opt = core.DeviceOption(model._device_type, hip_gpu_id=gpu)
+                        else:
+                            device_opt = core.DeviceOption(model._device_type, cuda_gpu_id=gpu)
                         with core.DeviceScope(device_opt):
                             model.Copy(grad_val_concat, g.values)
 
@@ -1540,11 +1583,15 @@ def _AnalyzeOperators(model):
             continue
 
         op_dev = op.device_option
-        op_gpu = op_dev.cuda_gpu_id
+        op_gpu = op_dev.hip_gpu_id if workspace.has_hip_support else op_dev.cuda_gpu_id
 
         # This avoids failing on operators that are only for CPU
-        if op_dev.device_type != caffe2_pb2.CUDA:
-            continue
+        if workspace.has_hip_support:
+            if op_dev.device_type != caffe2_pb2.HIP:
+                continue
+        else:
+            if op_dev.device_type != caffe2_pb2.CUDA:
+                continue
 
         namescope = "{}_{}/".format(model._device_prefix, op_gpu)
         for inp in list(op.input) + list(op.output):
@@ -1586,14 +1633,16 @@ def map_ops(proto):
 
 def _IsGPUBlob(model, blob_name):
     if blob_name in model._blob_to_device:
-        return model._blob_to_device[blob_name].device_type == caffe2_pb2.CUDA
+        return model._blob_to_device[blob_name].device_type == caffe2_pb2.CUDA or \
+                model._blob_to_device[blob_name].device_type == caffe2_pb2.HIP
     else:
         blob_name = "{}_{}/{}".format(
             model._device_prefix, model._devices[0], blob_name
         )
         if blob_name not in model._blob_to_device:
-            return model._device_type == caffe2_pb2.CUDA
-        return model._blob_to_device[blob_name].device_type == caffe2_pb2.CUDA
+            return model._device_type == caffe2_pb2.CUDA or model._device_type == caffe2_pb2.HIP
+        return model._blob_to_device[blob_name].device_type == caffe2_pb2.CUDA or \
+                model._blob_to_device[blob_name].device_type == caffe2_pb2.HIP
 
 
 def _GroupByDevice(model, devices, params, non_data_params):
@@ -1904,7 +1953,10 @@ def _InterleaveOps(model):
     new_ops = []
     ops = {d: [] for d in range(num_devices)}
     for op in orig_ops:
-        ops[op.device_option.cuda_gpu_id].append(op)
+        if workspace.has_hip_support:
+            ops[op.device_option.hip_gpu_id].append(op)
+        else:
+            ops[op.device_option.cuda_gpu_id].append(op)
 
     for j in range(num_ops_per_dev):
         tp = None
diff --git a/caffe2/python/docs/generator.py b/caffe2/python/docs/generator.py
index 1bc41b7d1ccbc5..b7fcb708d8951e 100644
--- a/caffe2/python/docs/generator.py
+++ b/caffe2/python/docs/generator.py
@@ -103,7 +103,8 @@ def __init__(self, name):
     def getDeviceImpl(self):
         deviceImplList = []
         for device, impl in [('CPU', OpSchema.get_cpu_impl(self.op_name)),
-                             ('CUDA', OpSchema.get_cuda_impl(self.op_name))]:
+                             ('CUDA', OpSchema.get_cuda_impl(self.op_name)),
+                             ('HIP', OpSchema.get_hip_impl(self.op_name))]:
             if not impl:
                 continue
             deviceImplList.append((device, impl))
@@ -194,7 +195,8 @@ def generateDevices(self, formatter):
             self.getInfo(formatter,
                          'CPU', OpSchema.get_cpu_impl(self.name)),
             self.getInfo(formatter,
-                         'GPU', OpSchema.get_cuda_impl(self.name)),
+                         'HIP', OpSchema.get_hip_impl(self.name)) if workspace.has_hip_support else
+                        self.getInfo(formatter, 'GPU', OpSchema.get_cuda_impl(self.name)),
         ]
         formatter.addList([i for i in devices if i])
 
diff --git a/caffe2/python/functional_test.py b/caffe2/python/functional_test.py
index e7803e829bb431..db252d8f704d0f 100644
--- a/caffe2/python/functional_test.py
+++ b/caffe2/python/functional_test.py
@@ -46,7 +46,7 @@ def _tensor_splits(draw, add_axis=False):
 
 
 class TestFunctional(hu.HypothesisTestCase):
-    @given(X=hu.tensor(), engine=st.sampled_from(["", "CUDNN"]), **hu.gcs)
+    @given(X=hu.tensor(), engine=st.sampled_from(["", "MIOPEN" if workspace.has_hip_support else "CUDNN"]), **hu.gcs)
     def test_relu(self, X, engine, gc, dc):
         X += 0.02 * np.sign(X)
         X[X == 0.0] += 0.02
diff --git a/caffe2/python/gradient_check_test.py b/caffe2/python/gradient_check_test.py
index f1c190aa6efac6..52da4e7dd49304 100644
--- a/caffe2/python/gradient_check_test.py
+++ b/caffe2/python/gradient_check_test.py
@@ -23,9 +23,9 @@
 import unittest
 
 
-if workspace.has_gpu_support and workspace.NumCudaDevices() > 0:
+if (workspace.has_gpu_support or workspace.has_hip_support) and workspace.NumGpuDevices() > 0:
     gpu_device_option = caffe2_pb2.DeviceOption()
-    gpu_device_option.device_type = caffe2_pb2.CUDA
+    gpu_device_option.device_type = caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA
     cpu_device_option = caffe2_pb2.DeviceOption()
     gpu_device_checker = device_checker.DeviceChecker(
         0.01, [gpu_device_option]
diff --git a/caffe2/python/hypothesis_test_util.py b/caffe2/python/hypothesis_test_util.py
index 5cc18f99bd9eb9..04700cec76f313 100644
--- a/caffe2/python/hypothesis_test_util.py
+++ b/caffe2/python/hypothesis_test_util.py
@@ -256,6 +256,7 @@ def tensors1d(n, min_len=1, max_len=64, dtype=np.float32, elements=None):
 # temporarily skip some flaky tests on ROCM before it's getting more mature.
 _device_options_no_hip = [cpu_do] + ([gpu_do] if workspace.has_gpu_support else [])
 device_options = _device_options_no_hip + ([hip_do] if workspace.has_hip_support else [])
+_device_options_hip_or_gpu = [hip_do] if workspace.has_hip_support else [gpu_do] 
 
 # Include device option for each GPU
 expanded_device_options = [cpu_do] + (
@@ -279,6 +280,7 @@ def gradient_checker_device_option():
 
 gcs_cpu_only = dict(gc=st.sampled_from([cpu_do]), dc=st.just([cpu_do]))
 gcs_gpu_only = dict(gc=st.sampled_from([gpu_do]), dc=st.just([gpu_do]))
+gcs_gpu_or_hip_only = dict(gc=st.sampled_from(_device_options_hip_or_gpu), dc=st.just(_device_options_hip_or_gpu))
 gcs_no_hip = dict(gc=st.sampled_from(_device_options_no_hip), dc=st.just(_device_options_no_hip))
 
 
diff --git a/caffe2/python/memonger_test.py b/caffe2/python/memonger_test.py
index 6536280d8a6057..cb5712f425e223 100644
--- a/caffe2/python/memonger_test.py
+++ b/caffe2/python/memonger_test.py
@@ -223,13 +223,13 @@ def test_gradient_optim(self, input_dim, output_dim, batch_size):
         np.testing.assert_almost_equal(loss, optimized_loss)
         np.testing.assert_almost_equal(grad, optimized_grad)
 
-    @unittest.skipIf(not workspace.has_gpu_support, "No gpu support.")
+    @unittest.skipIf(not workspace.has_gpu_support and not workspace.has_hip_support, "No gpu support.")
     def test_memonger_mix_cpu_gpu(self):
         '''
         Check that memonger does not make blobs cross CPU/GPU boundary
         '''
         m = model_helper.ModelHelper()
-        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
+        with core.DeviceScope(core.DeviceOption(caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA, 0)):
             fc1 = brew.fc(m, "data", "fc1", dim_in=2, dim_out=2)
             fc2 = brew.fc(m, fc1, "fc2", dim_in=2, dim_out=2)
             fc3 = brew.fc(m, fc2, "fc3", dim_in=2, dim_out=2)
@@ -259,7 +259,10 @@ def test_memonger_mix_cpu_gpu(self):
 
         # Create set of blobs on CPU side and GPU side and check they don't
         # overlap
-        device_blobs = {caffe2_pb2.CPU: set(), caffe2_pb2.CUDA: set()}
+        if workspace.has_hip_support:
+            device_blobs = {caffe2_pb2.CPU: set(), caffe2_pb2.HIP: set()}
+        else:
+            device_blobs = {caffe2_pb2.CPU: set(), caffe2_pb2.CUDA: set()}
         for op in optim_proto.op:
             if op.type not in ['CopyCPUToGPU', "CopyGPUToCPU"]:
                 dev = op.device_option.device_type
@@ -267,7 +270,7 @@ def test_memonger_mix_cpu_gpu(self):
                     device_blobs[dev].add(b)
 
         device_crossers = device_blobs[caffe2_pb2.CPU].intersection(
-            device_blobs[caffe2_pb2.CUDA]
+            device_blobs[caffe2_pb2.HIP] if workspace.has_hip_support else device_blobs[caffe2_pb2.CUDA]
         )
         self.assertEquals(device_crossers, set())
 
diff --git a/caffe2/python/model_device_test.py b/caffe2/python/model_device_test.py
index 31cba3facb0559..3f438d35cad35e 100644
--- a/caffe2/python/model_device_test.py
+++ b/caffe2/python/model_device_test.py
@@ -124,7 +124,7 @@ def _testMiniAlexNet(self, order):
         cpu_device = caffe2_pb2.DeviceOption()
         cpu_device.device_type = caffe2_pb2.CPU
         gpu_device = caffe2_pb2.DeviceOption()
-        gpu_device.device_type = caffe2_pb2.CUDA
+        gpu_device.device_type = caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA
 
         checker = device_checker.DeviceChecker(0.05, [cpu_device, gpu_device])
         ret = checker.CheckNet(
@@ -136,7 +136,7 @@ def _testMiniAlexNet(self, order):
         )
         self.assertEqual(ret, True)
 
-    @unittest.skipIf(not workspace.has_gpu_support,
+    @unittest.skipIf(not workspace.has_gpu_support and not workspace.has_hip_support,
                      "No GPU support. Skipping test.")
     def testMiniAlexNetNCHW(self):
         self._testMiniAlexNet("NCHW")
diff --git a/caffe2/python/model_helper.py b/caffe2/python/model_helper.py
index f8e3f32bb2c225..d8c5e7a63ef6f6 100644
--- a/caffe2/python/model_helper.py
+++ b/caffe2/python/model_helper.py
@@ -596,7 +596,10 @@ def rename_list(proto_list):
                             rename_list(step_op.output)
                             if device is not None:
                                 step_op.device_option.device_type = device.device_type
-                                step_op.device_option.cuda_gpu_id = device.cuda_gpu_id
+                                if workspace.has_hip_support:
+                                    step_op.device_option.hip_gpu_id = device.hip_gpu_id
+                                else:
+                                    step_op.device_option.cuda_gpu_id = device.cuda_gpu_id
 
                         rename_list(arg.n.external_input)
                         rename_list(arg.n.external_output)
@@ -610,7 +613,10 @@ def rename_list(proto_list):
 
             if device is not None:
                 op.device_option.device_type = device.device_type
-                op.device_option.cuda_gpu_id = device.cuda_gpu_id
+                if workspace.has_hip_support:
+                    op.device_option.hip_gpu_id = device.hip_gpu_id
+                else:
+                    op.device_option.cuda_gpu_id = device.cuda_gpu_id
             validate_op(op)
             predict_proto.op.extend([op])
             known_blobs.update(op.output)
diff --git a/caffe2/python/muji.py b/caffe2/python/muji.py
index b407f96d2391f8..186857fe569430 100644
--- a/caffe2/python/muji.py
+++ b/caffe2/python/muji.py
@@ -25,8 +25,12 @@ def OnGPU(gpu_id):
   specified gpu id.
   """
     device_option = caffe2_pb2.DeviceOption()
-    device_option.device_type = caffe2_pb2.CUDA
-    device_option.cuda_gpu_id = gpu_id
+    if workspace.has_hip_support:
+        device_option.device_type = caffe2_pb2.HIP  
+        device_option.hip_gpu_id = gpu_id
+    else:
+        device_option.device_type = caffe2_pb2.CUDA
+        device_option.cuda_gpu_id = gpu_id
     return device_option
 
 
@@ -48,7 +52,7 @@ def Allreduce(net, blobs, reduced_affix="_reduced", gpu_indices=None):
             "gpu_indices length and blobs length mismatch: %d vs %d" %
             (len(gpu_indices), len(blobs))
         )
-    pattern = workspace.GetCudaPeerAccessPattern()
+    pattern = workspace.GetHipPeerAccessPattern() if workspace.has_hip_support else workspace.GetCudaPeerAccessPattern()
     if len(blobs) == 2 and pattern.shape[0] >= 2 and np.all(pattern[:2, :2]):
         return Allreduce2(net, blobs, reduced_affix, gpu_indices)
     elif len(blobs) == 4 and pattern.shape[0] >= 4 and np.all(pattern[:4, :4]):
diff --git a/caffe2/python/net_printer.py b/caffe2/python/net_printer.py
index 4b5cddb61d244e..60b3598f3fb192 100644
--- a/caffe2/python/net_printer.py
+++ b/caffe2/python/net_printer.py
@@ -8,6 +8,7 @@
 from caffe2.proto.caffe2_pb2 import OperatorDef, NetDef
 from caffe2.python.checkpoint import Job
 from caffe2.python.core import Net, ExecutionStep, Plan
+from caffe2.python import workspace
 from caffe2.python.task import Task, TaskGroup, WorkspaceType, TaskOutput
 from collections import defaultdict
 from contextlib import contextmanager
@@ -267,12 +268,13 @@ def call(op, inputs=None, outputs=None, factor_prefixes=False):
 
 
 def format_device_option(dev_opt):
+    gpu_id = dev_opt.hip_gpu_id if workspace.has_hip_support else dev_opt.cuda_gpu_id
     if not dev_opt or not (
-            dev_opt.device_type or dev_opt.cuda_gpu_id or dev_opt.node_name):
+            dev_opt.device_type or gpu_id or dev_opt.node_name):
         return None
     return call(
         'DeviceOption',
-        [dev_opt.device_type, dev_opt.cuda_gpu_id, "'%s'" % dev_opt.node_name])
+        [dev_opt.device_type, gpu_id, "'%s'" % dev_opt.node_name])
 
 
 @Printer.register(OperatorDef)
diff --git a/caffe2/python/operator_test/activation_ops_test.py b/caffe2/python/operator_test/activation_ops_test.py
index 5be8b689f115cb..200647c95998e4 100644
--- a/caffe2/python/operator_test/activation_ops_test.py
+++ b/caffe2/python/operator_test/activation_ops_test.py
@@ -18,7 +18,7 @@
 
 class TestActivations(serial.SerializedTestCase):
     @serial.given(X=hu.tensor(), in_place=st.booleans(),
-           engine=st.sampled_from(["", "CUDNN"]), **mu.gcs)
+           engine=st.sampled_from(["", "MIOPEN" if workspace.has_hip_support else "CUDNN"]), **mu.gcs)
     def test_relu(self, X, in_place, engine, gc, dc):
         if gc == mu.mkl_do:
             in_place = False
@@ -44,7 +44,7 @@ def relu_ref(X):
     @unittest.skipIf(not workspace.has_gpu_support,
                      "Relu for float16 can only run on GPU now.")
     @given(X=hu.tensor(dtype=np.float16), in_place=st.booleans(),
-           engine=st.sampled_from(["", "CUDNN"]), **hu.gcs_gpu_only)
+           engine=st.sampled_from([""] if workspace.has_hip_support else ["", "CUDNN"]), **hu.gcs_gpu_only)
     def test_relu_fp16(self, X, in_place, engine, gc, dc):
         op = core.CreateOperator(
             "Relu",
@@ -103,7 +103,7 @@ def relu_n_ref(X):
 
     @serial.given(X=hu.tensor(),
            alpha=st.floats(min_value=0.1, max_value=2.0),
-           in_place=st.booleans(), engine=st.sampled_from(["", "CUDNN"]),
+           in_place=st.booleans(), engine=st.sampled_from([""] if workspace.has_hip_support else ["", "CUDNN"]),
            **hu.gcs)
     def test_elu(self, X, alpha, in_place, engine, gc, dc):
         op = core.CreateOperator(
diff --git a/caffe2/python/operator_test/boolean_mask_test.py b/caffe2/python/operator_test/boolean_mask_test.py
index 8811f5667503b8..695cb3818deb5f 100644
--- a/caffe2/python/operator_test/boolean_mask_test.py
+++ b/caffe2/python/operator_test/boolean_mask_test.py
@@ -3,7 +3,7 @@
 from __future__ import print_function
 
 from caffe2.proto import caffe2_pb2
-from caffe2.python import core
+from caffe2.python import core, workspace
 import caffe2.python.hypothesis_test_util as hu
 import caffe2.python.serialized_test.serialized_test_util as serial
 from hypothesis import assume, given
diff --git a/caffe2/python/operator_test/ceil_op_test.py b/caffe2/python/operator_test/ceil_op_test.py
index 130364261ea166..079e669370722f 100644
--- a/caffe2/python/operator_test/ceil_op_test.py
+++ b/caffe2/python/operator_test/ceil_op_test.py
@@ -3,7 +3,7 @@
 from __future__ import print_function
 from __future__ import unicode_literals
 
-from caffe2.python import core
+from caffe2.python import core, workspace
 import caffe2.python.hypothesis_test_util as hu
 import caffe2.python.serialized_test.serialized_test_util as serial
 from hypothesis import given
@@ -16,7 +16,7 @@
 class TestCeil(serial.SerializedTestCase):
 
     @serial.given(X=hu.tensor(),
-           engine=st.sampled_from(["", "CUDNN"]),
+           engine=st.sampled_from([""] if workspace.has_hip_support else ["", "CUDNN"]),
            **hu.gcs)
     def test_ceil(self, X, gc, dc, engine):
         op = core.CreateOperator("Ceil", ["X"], ["Y"], engine=engine)
diff --git a/caffe2/python/operator_test/conv_test.py b/caffe2/python/operator_test/conv_test.py
index d29d724b89c29d..a3cb4e3f05749e 100644
--- a/caffe2/python/operator_test/conv_test.py
+++ b/caffe2/python/operator_test/conv_test.py
@@ -41,6 +41,15 @@ def _cudnn_supports(
             return False
     return True
 
+def _miopen_supports(
+        dilation=False,
+        nhwc=False,
+        backward=False,
+):
+    """Return True if MIOPEN supports this configuration."""
+    if nhwc or dilation:
+        return False
+    return True
 
 def _cudnn_convolution_algo_count(direction):
     try:
@@ -195,7 +204,7 @@ def test_convolution_separate_stride_pad_layout(
            batch_size=st.integers(1, 3),
            group=st.integers(1, 2),
            order=st.sampled_from(["NCHW", "NHWC"]),
-           engine=st.sampled_from(["", "CUDNN", "MKLDNN"]),
+           engine=st.sampled_from(["", "MIOPEN" if workspace.has_hip_support else "CUDNN", "MKLDNN"]),
            use_bias=st.booleans(),
            force_algo_fwd=_cudnn_convolution_algo_count("fwd"),
            force_algo_dgrad=_cudnn_convolution_algo_count("dgrad"),
@@ -216,6 +225,10 @@ def test_convolution_gradients(
             assume(_cudnn_supports(dilation=(dilation > 1),
                                    nhwc=(order == 'NHWC'),
                                    backward=True))
+        if engine == 'MIOPEN':
+            assume(_miopen_supports(dilation=(dilation > 1),
+                                    nhwc=(order == 'NHWC'),
+                                    backward=True))
 
         assume(engine != "MKLDNN" or use_bias is True)
 
@@ -462,8 +475,12 @@ def test_convolution_layout(self, op_type, stride, pad, kernel, dilation,
 
         for order in ["NCHW", "NHWC"]:
             engine_list = ['']
-            if _cudnn_supports(dilation=(dilation > 1), nhwc=(order == 'NHWC')):
-                engine_list.append('CUDNN')
+            if workspace.has_hip_support:
+                if _miopen_supports(dilation=(dilation > 1), nhwc=(order == 'NHWC')):
+                    engine_list.append('MIOPEN')
+            else:
+                if _cudnn_supports(dilation=(dilation > 1), nhwc=(order == 'NHWC')):
+                    engine_list.append('CUDNN')
 
             for engine in engine_list:
                 op = core.CreateOperator(
@@ -635,8 +652,7 @@ def test_use_cudnn_engine_interactions(self):
                             f(**kwargs)
                     else:
                         f(**kwargs)
-                        self.assertEqual(model.Proto().op[-1].engine,
-                                         expected_engine)
+                        self.assertEqual(model.Proto().op[-1].engine, expected_engine)
 
     @serial.given(
         op_type=st.sampled_from(["Conv", "Conv2D"]), N=st.integers(1, 4),
diff --git a/caffe2/python/operator_test/copy_ops_test.py b/caffe2/python/operator_test/copy_ops_test.py
index 05a018ff90a2c3..3b07090a2b3283 100644
--- a/caffe2/python/operator_test/copy_ops_test.py
+++ b/caffe2/python/operator_test/copy_ops_test.py
@@ -40,21 +40,29 @@ def run_test_copy_gradient(self, device_opt):
     def test_copy_gradient_cpu(self):
         self.run_test_copy_gradient(core.DeviceOption(caffe2_pb2.CPU, 0))
 
-    @unittest.skipIf(workspace.NumCudaDevices() < 1, "Need at least 1 GPU.")
+    num_gpu = 0
+    if workspace.has_hip_support:
+        num_gpu = workspace.NumHipDevices()
+    else:
+        num_gpu = workspace.NumCudaDevices()
+
+    @unittest.skipIf(num_gpu < 1, "Need at least 1 GPU.")
     def test_copy_gradient_gpu(self):
-        self.run_test_copy_gradient(core.DeviceOption(caffe2_pb2.CUDA, 0))
+        self.run_test_copy_gradient(core.DeviceOption(caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA, 0))
 
-    @unittest.skipIf(workspace.NumCudaDevices() < 2, "Need at least 2 GPU.")
+    @unittest.skipIf(num_gpu < 2, "Need at least 2 GPU.")
     def test_copy_gradient_multiple_gpus(self):
         model = model_helper.ModelHelper(name="copy_test")
 
         with core.DeviceScope(core.DeviceOption(caffe2_pb2.CPU, 0)):
             x_cpu = model.net.AddExternalInputs("x_cpu")
 
-        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
+        gpu_device = caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA
+
+        with core.DeviceScope(core.DeviceOption(gpu_device, 0)):
             x_gpu_1 = model.CopyCPUToGPU(x_cpu, "x_gpu_1")
 
-        with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 1)):
+        with core.DeviceScope(core.DeviceOption(gpu_device, 1)):
             x_gpu_2 = model.Copy(x_gpu_1, "x_gpu_2")
             loss = model.AveragedLoss(x_gpu_2, "loss")
             gradient_map = model.AddGradientOperators([loss])
@@ -80,20 +88,20 @@ def get_op_with_output(model, output_blob_name):
 
         self.assertEqual(
             get_op_with_output(model, "x_gpu_2_grad").device_option,
-            core.DeviceOption(caffe2_pb2.CUDA, 1),
+            core.DeviceOption(gpu_device, 1),
         )
         self.assertEqual(
             get_op_with_output(model, "x_cpu_grad").device_option,
-            core.DeviceOption(caffe2_pb2.CUDA, 0),
+            core.DeviceOption(gpu_device, 0),
         )
 
-    @unittest.skipIf(workspace.NumCudaDevices() < 1, "Need at least 1 GPU.")
+    @unittest.skipIf(num_gpu < 1, "Need at least 1 GPU.")
     def test_cpu2gpu_gpu2cpu_sparse_gradients(self):
         model = model_helper.ModelHelper(name="copy_test")
         v = model.param_init_net.UniformFill([], ["v"], shape=[16, 4])
         indices = model.param_init_net.UniformFill([], ["v"], shape=[16, 4])
         cpu_opt = core.DeviceOption(caffe2_pb2.CPU, 0)
-        gpu_opt = core.DeviceOption(caffe2_pb2.CUDA, 0)
+        gpu_opt = core.DeviceOption(caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA, 0)
 
         with core.DeviceScope(gpu_opt):
             vcpu = model.CopyGPUToCPU(v, "vcpu")
@@ -112,13 +120,13 @@ def test_cpu2gpu_gpu2cpu_sparse_gradients(self):
         self.assertTrue("v" in gradient_map)
         self.assertTrue(isinstance(gradient_map['v'], core.GradientSlice))
 
-    @unittest.skipIf(workspace.NumCudaDevices() < 1, "Need at least 1 GPU.")
+    @unittest.skipIf(num_gpu < 1, "Need at least 1 GPU.")
     def test_cpu2gpu_gpu2cpu_gradients(self):
         model = model_helper.ModelHelper(name="copy_test")
 
         batch = 32
         cpu_opt = core.DeviceOption(caffe2_pb2.CPU, 0)
-        gpu_opt = core.DeviceOption(caffe2_pb2.CUDA, 0)
+        gpu_opt = core.DeviceOption(caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA, 0)
 
         with core.NameScope("cpu"):
             with core.DeviceScope(cpu_opt):
diff --git a/caffe2/python/operator_test/elementwise_op_broadcast_test.py b/caffe2/python/operator_test/elementwise_op_broadcast_test.py
index 161f5fc0724b14..9f618715dd98ea 100644
--- a/caffe2/python/operator_test/elementwise_op_broadcast_test.py
+++ b/caffe2/python/operator_test/elementwise_op_broadcast_test.py
@@ -406,8 +406,8 @@ def test_sum_reduce(self, gc, dc):
         np.testing.assert_array_almost_equal(out, res)
         self.assertDeviceChecks(dc, op, [X, Y], [0])
 
-        # fp64 is not supported with the CUDA op
-        dc_cpu_only = [d for d in dc if d.device_type != caffe2_pb2.CUDA]
+        # fp64 is not supported with the CUDA/HIP op
+        dc_cpu_only = [d for d in dc if (d.device_type != caffe2_pb2.CUDA or d.device_type != caffe2_pb2.HIP)]
         self.assertDeviceChecks(dc_cpu_only, op, [X, Y], [0])
 
     @unittest.skipIf(not workspace.has_gpu_support, "No gpu support")
diff --git a/caffe2/python/operator_test/group_conv_test.py b/caffe2/python/operator_test/group_conv_test.py
index fd4c5adf0075d6..082c0c20c46a62 100644
--- a/caffe2/python/operator_test/group_conv_test.py
+++ b/caffe2/python/operator_test/group_conv_test.py
@@ -7,7 +7,7 @@
 import hypothesis.strategies as st
 
 from caffe2.proto import caffe2_pb2
-from caffe2.python import core
+from caffe2.python import core, workspace
 import caffe2.python.hypothesis_test_util as hu
 
 import unittest
@@ -38,7 +38,8 @@ def test_group_convolution(
             order, engine, use_bias, gc, dc):
         assume(size >= kernel)
         # TODO: Group conv in NHWC not implemented for GPU yet.
-        assume(group == 1 or order == "NCHW" or gc.device_type != caffe2_pb2.CUDA)
+        gpu_device_type = caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA
+        assume(group == 1 or order == "NCHW" or gc.device_type != gpu_device_type)
         input_channels = input_channels_per_group * group
         output_channels = output_channels_per_group * group
 
@@ -65,8 +66,8 @@ def test_group_convolution(
             w = w.transpose((0, 3, 1, 2))
 
         inputs = [X, w, b] if use_bias else [X, w]
-
-        self.assertDeviceChecks(dc, op, inputs, [0])
+        if order != 'NHWC' or group == 1:
+            self.assertDeviceChecks(dc, op, inputs, [0])
         for i in range(len(inputs)):
             self.assertGradientChecks(gc, op, inputs, i, [0])
 
diff --git a/caffe2/python/operator_test/pooling_test.py b/caffe2/python/operator_test/pooling_test.py
index 956d0ec9619987..4301b5c60f66f3 100644
--- a/caffe2/python/operator_test/pooling_test.py
+++ b/caffe2/python/operator_test/pooling_test.py
@@ -209,12 +209,14 @@ def test_pooling_with_index(self, stride, pad, kernel, size,
 
     @given(sz=st.integers(1, 20),
            batch_size=st.integers(1, 4),
-           engine=st.sampled_from(["", "CUDNN"]),
+           engine=st.sampled_from(["", "MIOPEN" if workspace.has_hip_support else "CUDNN"]),
            op_type=st.sampled_from(["AveragePool", "AveragePool2D"]),
            **hu.gcs)
     @settings(max_examples=3, timeout=10)
     def test_global_avg_pool_nchw(self, op_type, sz, batch_size, engine, gc, dc):
         ''' Special test to stress the fast path of NCHW average pool '''
+        if engine == 'MIOPEN':
+            assume(sz<16)
         op = core.CreateOperator(
             op_type,
             ["X"],
@@ -233,7 +235,7 @@ def test_global_avg_pool_nchw(self, op_type, sz, batch_size, engine, gc, dc):
 
     @given(sz=st.integers(1, 20),
            batch_size=st.integers(1, 4),
-           engine=st.sampled_from(["", "CUDNN"]),
+           engine=st.sampled_from(["", "MIOPEN" if workspace.has_hip_support else "CUDNN"]),
            op_type=st.sampled_from(["MaxPool", "MaxPool2D"]),
            **hu.gcs)
     @settings(max_examples=3, timeout=10)
@@ -241,7 +243,10 @@ def test_global_max_pool_nchw(self, op_type, sz,
                                   batch_size, engine, gc, dc):
         ''' Special test to stress the fast path of NCHW max pool '''
         # CuDNN 5 does not support deterministic max pooling.
-        assume(workspace.GetCuDNNVersion() >= 6000 or engine != "CUDNN")
+        if engine == 'MIOPEN':
+            assume(sz<16)
+        if not workspace.has_hip_support:
+            assume(workspace.GetCuDNNVersion() >= 6000 or engine != "CUDNN")
         op = core.CreateOperator(
             op_type,
             ["X"],
@@ -270,12 +275,14 @@ def test_global_max_pool_nchw(self, op_type, sz,
            order=st.sampled_from(["NCHW", "NHWC"]),
            op_type=st.sampled_from(["MaxPool", "AveragePool", "LpPool",
                                    "MaxPool2D", "AveragePool2D"]),
-           engine=st.sampled_from(["", "CUDNN"]),
+           engine=st.sampled_from(["", "MIOPEN" if workspace.has_hip_support else "CUDNN"]),
            **hu.gcs)
     def test_pooling(self, stride, pad, kernel, size,
                      input_channels, batch_size,
                      order, op_type, engine, gc, dc):
         assume(pad < kernel)
+        if engine == 'MIOPEN':
+            assume(op_type != "LpPool" and order == "NCHW")
         op = core.CreateOperator(
             op_type,
             ["X"],
@@ -300,12 +307,14 @@ def test_pooling(self, stride, pad, kernel, size,
            batch_size=st.integers(1, 3),
            order=st.sampled_from(["NCHW", "NHWC"]),
            op_type=st.sampled_from(["MaxPool", "AveragePool", "LpPool"]),
-           engine=st.sampled_from(["", "CUDNN"]),
+           engine=st.sampled_from(["", "MIOPEN" if workspace.has_hip_support else "CUDNN"]),
            **hu.gcs)
     def test_global_pooling(self, size, input_channels, batch_size,
                             order, op_type, engine, gc, dc):
         # CuDNN 5 does not support deterministic max pooling.
         assume(workspace.GetCuDNNVersion() >= 6000 or op_type != "MaxPool")
+        if engine == 'MIOPEN':
+            assume(op_type != "LpPool" and order == "NCHW")
         op = core.CreateOperator(
             op_type,
             ["X"],
diff --git a/caffe2/python/operator_test/rnn_cell_test.py b/caffe2/python/operator_test/rnn_cell_test.py
index 9d9bb38e178517..8804dc3bbdbcbc 100644
--- a/caffe2/python/operator_test/rnn_cell_test.py
+++ b/caffe2/python/operator_test/rnn_cell_test.py
@@ -1174,11 +1174,15 @@ def test_lstm_extract_predictor_net(self):
             shapes[b] = workspace.FetchBlob(b).shape
 
         # But export in CPU
+        if workspace.has_hip_support:
+            device = core.DeviceOption(caffe2_pb2.CPU, hip_gpu_id=1)
+        else:
+            device = core.DeviceOption(caffe2_pb2.CPU, cuda_gpu_id=1)
         (predict_net, export_blobs) = ExtractPredictorNet(
             net_proto=model.net.Proto(),
             input_blobs=["input"],
             output_blobs=[output],
-            device=core.DeviceOption(caffe2_pb2.CPU, 1),
+            device=device,
         )
 
         # Create the net and run once to see it is valid
@@ -1216,7 +1220,10 @@ def test_lstm_extract_predictor_net(self):
                     if arg.name == "step_net":
                         for step_op in arg.n.op:
                             self.assertEqual(0, step_op.device_option.device_type)
-                            self.assertEqual(1, step_op.device_option.cuda_gpu_id)
+                            if workspace.has_hip_support:
+                                self.assertEqual(1, step_op.device_option.hip_gpu_id)
+                            else:
+                                self.assertEqual(1, step_op.device_option.cuda_gpu_id)
                     elif arg.name == 'backward_step_net':
                         self.assertEqual(caffe2_pb2.NetDef(), arg.n)
 
diff --git a/caffe2/python/optimizer.py b/caffe2/python/optimizer.py
index 482d16a0dfa6a6..adc648c8218d19 100644
--- a/caffe2/python/optimizer.py
+++ b/caffe2/python/optimizer.py
@@ -79,9 +79,11 @@ def make_unique_blob_name(self, base_str):
         if current_scope is None:
             return self.get_cpu_blob_name(base_str)
 
-        if current_scope.device_type == caffe2_pb2.CUDA:
+        if current_scope.device_type == caffe2_pb2.CUDA or current_scope.device_type == caffe2_pb2.HIP:
             return self.get_gpu_blob_name(
-                base_str, current_scope.cuda_gpu_id, current_scope.node_name
+                base_str, 
+                current_scope.hip_gpu_id if workspace.has_hip_support else current_scope.cuda_gpu_id, 
+                current_scope.node_name
             )
         else:
             return self.get_cpu_blob_name(base_str, current_scope.node_name)
@@ -125,7 +127,7 @@ def build_lr(self, net, param_init_net, base_learning_rate,
         if self._local_lr_multiplier is not None:
             current_scope = scope.CurrentDeviceScope()
             if (current_scope is not None
-                    and current_scope.device_type == caffe2_pb2.CUDA
+                    and current_scope.device_type == (caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA)
                     and not self._local_lr_multiplier_on_gpu):
                 local_lr_multiplier = net.CopyFromCPUInput(
                     self._local_lr_multiplier,
@@ -256,7 +258,7 @@ def _run(self, net, param_init_net, param_info):
             self._add_local_lr_multiplier(
                 lr_lars_multiplier,
                 is_gpu_blob=(current_scope is not None
-                    and current_scope.device_type == caffe2_pb2.CUDA),
+                    and current_scope.device_type == (caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA)),
             )
 
         # We need negative sign for LR when used directly with WeightedSum
@@ -277,7 +279,7 @@ def _run(self, net, param_init_net, param_info):
         # to include device information.
         ONE = param_init_net.ConstantFill(
             [],
-            "ONE_{}_{}{}".format(dev.device_type, dev.cuda_gpu_id, dev.node_name),
+            "ONE_{}_{}{}".format(dev.device_type, dev.hip_gpu_id if workspace.has_hip_support else dev.cuda_gpu_id, dev.node_name),
             shape=[1],
             value=1.0
         )
@@ -486,12 +488,12 @@ def _run(self, net, param_init_net, param_info):
 
         ONE = param_init_net.ConstantFill(
             [],
-            "ONE_{}_{}".format(dev.device_type, dev.cuda_gpu_id),
+            "ONE_{}_{}".format(dev.device_type, dev.hip_gpu_id if workspace.has_hip_support else dev.cuda_gpu_id),
             shape=[1],
             value=1.0
         )
         WD = param_init_net.ConstantFill(
-            [], "wd_{}_{}".format(dev.device_type, dev.cuda_gpu_id),
+            [], "wd_{}_{}".format(dev.device_type, dev.hip_gpu_id if workspace.has_hip_support else dev.cuda_gpu_id),
             shape=[1], value=self.weight_decay
         )
 
@@ -547,7 +549,7 @@ def _run(self, net, param_init_net, param_info):
             self._add_local_lr_multiplier(
                 lr_lars_multiplier,
                 is_gpu_blob=(current_scope is not None
-                    and current_scope.device_type == caffe2_pb2.CUDA),
+                    and current_scope.device_type == (caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA)),
             )
 
         lr, _ = self.build_lr(
@@ -684,7 +686,7 @@ def _run(self, net, param_init_net, param_info):
             self._add_local_lr_multiplier(
                 lr_lars_multiplier,
                 is_gpu_blob=(current_scope is not None
-                    and current_scope.device_type == caffe2_pb2.CUDA),
+                    and current_scope.device_type == (caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA)),
             )
 
         lr, _ = self.build_lr(
@@ -1158,7 +1160,7 @@ def _run(self, net, param_init_net, param_info):
 
         ONE = param_init_net.ConstantFill(
             [],
-            "ONE_{}_{}".format(dev.device_type, dev.cuda_gpu_id),
+            "ONE_{}_{}".format(dev.device_type, dev.hip_gpu_id if workspace.has_hip_support else dev.cuda_gpu_id),
             shape=[1],
             value=1.0
         )
diff --git a/caffe2/python/optimizer_test.py b/caffe2/python/optimizer_test.py
index bbb0c02625f09a..a136684cd0e38e 100644
--- a/caffe2/python/optimizer_test.py
+++ b/caffe2/python/optimizer_test.py
@@ -77,7 +77,7 @@ def check_optimizer(self, optimizer):
             tensor = workspace.FetchBlob(param)
             np.testing.assert_allclose(np.array([1.0]), tensor, atol=1e-5)
 
-    @unittest.skipIf(not workspace.has_gpu_support, "No GPU support")
+    @unittest.skipIf(not workspace.has_gpu_support and not workspace.has_hip_support , "No GPU support")
     def testGPUDense(self):
         super(TestMultiPrecisionSgd, self).testGPUDense(core.DataType.FLOAT16)
 
@@ -434,11 +434,11 @@ def test_caffe2_cpu_vs_numpy(self):
                 )
 
     @unittest.skip("Results might vary too much. Only for individual use.")
-    @unittest.skipIf(not workspace.has_gpu_support, "No gpu support")
+    @unittest.skipIf(not workspace.has_gpu_support and not workspace.has_hip_support, "No gpu support")
     def test_caffe2_gpu_vs_numpy(self):
         n_dim = 1000000
         n_iter = 50
-        gpu_device_opt = core.DeviceOption(caffe2_pb2.CUDA, 0)
+        gpu_device_opt = core.DeviceOption(caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA, 0)
         with core.DeviceScope(gpu_device_opt):
             for zero_debias in [False, True]:
                 for grad_coef in [1.0, 0.1, 0.01]:
diff --git a/caffe2/python/optimizer_test_util.py b/caffe2/python/optimizer_test_util.py
index dbb0dbeae2dd37..4c62d9ee21b797 100644
--- a/caffe2/python/optimizer_test_util.py
+++ b/caffe2/python/optimizer_test_util.py
@@ -70,7 +70,7 @@ def testDense(self):
 
     @unittest.skipIf(not workspace.has_gpu_support, "No gpu support")
     def testGPUDense(self, dtype=core.DataType.FLOAT):
-        device_opt = core.DeviceOption(caffe2_pb2.CUDA, 0)
+        device_opt = core.DeviceOption(caffe2_pb2.HIP if workspace.has_hip_support else caffe2_pb2.CUDA, 0)
         with core.DeviceScope(device_opt):
             model, _perfect_model, data, label = self._createDense(dtype)
             if dtype == core.DataType.FLOAT16:
diff --git a/caffe2/python/scope_test.py b/caffe2/python/scope_test.py
index 11f7a2c44046af..f4b3aa4a244d17 100644
--- a/caffe2/python/scope_test.py
+++ b/caffe2/python/scope_test.py
@@ -3,7 +3,7 @@
 from __future__ import print_function
 from __future__ import unicode_literals
 
-from caffe2.python import scope, core
+from caffe2.python import scope, core, workspace
 from caffe2.proto import caffe2_pb2
 
 import unittest
@@ -18,7 +18,10 @@ def thread_runner(idx, testobj):
     testobj.assertEquals(scope.CurrentNameScope(), "")
     testobj.assertEquals(scope.CurrentDeviceScope(), None)
     namescope = "namescope_{}".format(idx)
-    dsc = core.DeviceOption(caffe2_pb2.CUDA, idx)
+    if workspace.has_hip_support:
+        dsc = core.DeviceOption(caffe2_pb2.HIP, hip_gpu_id=idx)
+    else:
+        dsc = core.DeviceOption(caffe2_pb2.CUDA, cuda_gpu_id=idx)
     with scope.DeviceScope(dsc):
         with scope.NameScope(namescope):
             testobj.assertEquals(scope.CurrentNameScope(), namescope + "/")
@@ -58,7 +61,10 @@ def testNamescopeAssertion(self):
     def testDevicescopeBasic(self):
         self.assertEquals(scope.CurrentDeviceScope(), None)
 
-        dsc = core.DeviceOption(caffe2_pb2.CUDA, 9)
+        if workspace.has_hip_support:
+            dsc = core.DeviceOption(caffe2_pb2.HIP, hip_gpu_id=9)
+        else:
+            dsc = core.DeviceOption(caffe2_pb2.CUDA, cuda_gpu_id=9)
         with scope.DeviceScope(dsc):
             self.assertEquals(scope.CurrentDeviceScope(), dsc)
 
@@ -67,7 +73,10 @@ def testDevicescopeBasic(self):
     def testEmptyDevicescopeBasic(self):
         self.assertEquals(scope.CurrentDeviceScope(), None)
 
-        dsc = core.DeviceOption(caffe2_pb2.CUDA, 9)
+        if workspace.has_hip_support:
+            dsc = core.DeviceOption(caffe2_pb2.HIP, hip_gpu_id=9)
+        else:
+            dsc = core.DeviceOption(caffe2_pb2.CUDA, cuda_gpu_id=9)
         with scope.DeviceScope(dsc):
             self.assertEquals(scope.CurrentDeviceScope(), dsc)
             with scope.EmptyDeviceScope():
@@ -78,7 +87,10 @@ def testEmptyDevicescopeBasic(self):
     def testDevicescopeAssertion(self):
         self.assertEquals(scope.CurrentDeviceScope(), None)
 
-        dsc = core.DeviceOption(caffe2_pb2.CUDA, 9)
+        if workspace.has_hip_support:
+            dsc = core.DeviceOption(caffe2_pb2.HIP, hip_gpu_id=9)
+        else:
+            dsc = core.DeviceOption(caffe2_pb2.CUDA, cuda_gpu_id=9)
 
         try:
             with scope.DeviceScope(dsc):
diff --git a/caffe2/python/workspace.py b/caffe2/python/workspace.py
index a41cc153177639..c474fe334b6f1e 100644
--- a/caffe2/python/workspace.py
+++ b/caffe2/python/workspace.py
@@ -47,7 +47,8 @@
     NumCudaDevices = C.num_cuda_devices
     GetCUDAVersion = C.get_cuda_version
     GetCuDNNVersion = C.get_cudnn_version
-
+    NumGpuDevices = NumCudaDevices
+    
     def GetCudaPeerAccessPattern():
         return np.asarray(C.get_cuda_peer_access_pattern())
 
@@ -57,8 +58,22 @@ def GetCudaPeerAccessPattern():
     GetCuDNNVersion = lambda: 0 # noqa
     GetCuDNNVersion = lambda: 0 # noqa
     GetCudaPeerAccessPattern = lambda: np.array([]) # noqa
-    GetDeviceProperties = lambda x: None # noqa
 
+if has_hip_support:
+    NumHipDevices = C.num_hip_devices
+    NumGpuDevices = NumHipDevices
+    
+    def GetHipPeerAccessPattern():
+        return np.asarray(C.get_hip_peer_access_pattern())
+
+    GetDeviceProperties = C.get_device_properties
+else:
+    NumHipDevices = lambda: 0 # noqa
+    GetHipPeerAccessPattern = lambda: np.array([]) # noqa
+
+if not has_gpu_support and not has_hip_support:
+    GetDeviceProperties = lambda x: None # noqa
+    
 IsNUMAEnabled = C.is_numa_enabled
 GetNumNUMANodes = C.get_num_numa_nodes
 GetBlobNUMANode = C.get_blob_numa_node
diff --git a/caffe2/python/workspace_test.py b/caffe2/python/workspace_test.py
index 5da37c7f22efca..72935a94ac45e7 100644
--- a/caffe2/python/workspace_test.py
+++ b/caffe2/python/workspace_test.py
@@ -317,7 +317,7 @@ def testCreateWorkspace(self):
         self.assertTrue("test" in workspaces)
 
 
-@unittest.skipIf(not workspace.has_gpu_support, "No gpu support.")
+@unittest.skipIf(not workspace.has_gpu_support and not workspace.has_hip_support, "No gpu support.")
 class TestWorkspaceGPU(test_util.TestCase):
 
     def setUp(self):
@@ -339,6 +339,7 @@ def testFetchBlobGPU(self):
         self.assertEqual(fetched_again.shape, (1, 2, 3, 4))
         np.testing.assert_array_equal(fetched_again, 2.0)
 
+    @unittest.skipIf(not workspace.has_gpu_support, "No gpu support.")
     def testGetCudaPeerAccessPattern(self):
         pattern = workspace.GetCudaPeerAccessPattern()
         self.assertEqual(type(pattern), np.ndarray)
@@ -346,6 +347,14 @@ def testGetCudaPeerAccessPattern(self):
         self.assertEqual(pattern.shape[0], pattern.shape[1])
         self.assertEqual(pattern.shape[0], workspace.NumCudaDevices())
 
+    @unittest.skipIf(not workspace.has_hip_support, "No hip support.")
+    def testGetHipPeerAccessPattern(self):
+        pattern = workspace.GetHipPeerAccessPattern()
+        self.assertEqual(type(pattern), np.ndarray)
+        self.assertEqual(pattern.ndim, 2)
+        self.assertEqual(pattern.shape[0], pattern.shape[1])
+        self.assertEqual(pattern.shape[0], workspace.NumHipDevices())
+
 
 @unittest.skipIf(not workspace.C.has_mkldnn, "No MKLDNN support.")
 class TestWorkspaceMKLDNN(test_util.TestCase):
diff --git a/rocm-docs/caffe2-build.md b/rocm-docs/caffe2-build.md
new file mode 100644
index 00000000000000..7ba7e5952dfcd3
--- /dev/null
+++ b/rocm-docs/caffe2-build.md
@@ -0,0 +1,104 @@
+# Caffe2: Building From Source on ROCm Platform
+
+## Intro
+This instruction provides a starting point to build caffe2 on AMD GPUs (Caffe2 ROCm port) from source.
+*Note*: it is recommended to start with a clean Ubuntu 16.04 system
+
+## Install docker
+
+ If your machine doesn't have docker installed, follow the steps [here](https://docs.docker.com/install/linux/docker-ce/ubuntu/#install-docker-ce) to install docker.
+
+## Install ROCm
+
+Install ROCm stack following steps at [link](https://github.com/RadeonOpenCompute/ROCm/blob/master/README.md) if your machine doesn't have ROCm already.
+
+Once the machine is ready with ROCm stack, there are two ways to use caffe2 
+* Run the docker container with caffe2 installed in it.
+
+* Build caffe2 from source inside a docker with all the dependencies.
+
+## Launch docker container with caffe2 pre-installed
+```
+docker run -it --network=host --device=/dev/kfd --device=/dev/dri --group-add video rocm/caffe2:rocm1.9-v2
+```
+
+To run benchmarks, skip directly to benchmarks section of the document.
+
+## Build Caffe2 from source
+### Pull the docker image
+```
+docker pull rocm/caffe2:unbuilt-rocm1.9-v2
+```
+This docker image has all the dependencies for caffe2 pre-installed.
+
+### Pull the latest caffe2 source:
+
+* Using https 
+```
+git clone --recurse-submodules https://github.com/ROCmSoftwarePlatform/pytorch.git
+```
+* Using ssh
+```
+git clone --recurse-submodules git@github.com:ROCmSoftwarePlatform/pytorch.git
+```
+Navigate to repo directory
+```
+cd pytorch
+```
+
+### Launch the docker container
+```	
+docker run -it --network=host --device=/dev/kfd --device=/dev/dri --group-add video -v $PWD:/pytorch rocm/caffe2:unbuilt-rocm1.9-v2
+``` 
+Navigate to pytorch directory `cd /pytorch` inside the container.
+
+### Build caffe2 Project from source
+
+* Run the command  
+
+	`.jenkins/caffe2/amd/build_amd.sh`
+
+	
+* Test the rocm-caffe2 Installation 
+
+	```
+	cd build_caffe2 && python -c 'from caffe2.python import core' 2>/dev/null && echo "Success" || echo "Failure"
+	```
+If the test fails, make sure the following environment variables are set.
+
+```
+LD_lIBRARY_PATH=/usr/local/caffe2/lib
+PYTHONPATH=/usr/local/caffe2/lib/python2.7/dist-packages
+```
+
+## Run benchmarks
+
+Navigate to build directory, `cd /pytorch/build_caffe2` to run benchmarks.
+
+Caffe2 benchmarking script supports the following networks.
+1. MLP
+2. AlexNet
+3. OverFeat
+4. VGGA
+5. Inception
+6. Inception_v2
+7. Resnet50 
+
+*Special case:* Inception_v2 will need model protobuf files to run the benchmarks. Protobufs can be downloaded from caffe2 model zoo using the below command.
+```
+python caffe2/python/models/download.py inception_v2
+```
+This will download the protobufs to current working directory.
+
+To run benchmarks for networks MLP, AlexNet, OverFeat, VGGA, Inception, Resnet50 run the command replacing `<name_of_the_netwrok>` with one of the networks. 
+
+```
+python caffe2/python/convnet_benchmarks.py --batch_size 32 --model <name_of_the_network> --engine MIOPEN
+
+```
+To run Inception_v2, please add additional argument `--model_path` to the above command which should point to the model directories downloaded above.
+
+```
+python caffe2/python/convnet_benchmarks.py --batch_size 32 --model <name_of_the_network> --engine MIOPEN  --model_path <path_to_model_protobufs>
+
+```