ROCm
diff --git a/‎aten/src/ATen/Dispatch.h
Lines changed: 7 additions & 7 deletions b/‎aten/src/ATen/Dispatch.h
Lines changed: 7 additions & 7 deletions
diff --git a/‎aten/src/ATen/native/TensorFactories.cpp
Lines changed: 3 additions & 0 deletions b/‎aten/src/ATen/native/TensorFactories.cpp
Lines changed: 3 additions & 0 deletions
diff --git a/‎binaries/benchmark_helper.cc
Lines changed: 5 additions & 1 deletion b/‎binaries/benchmark_helper.cc
Lines changed: 5 additions & 1 deletion
diff --git a/‎binaries/benchmark_helper.h
Lines changed: 1 addition & 0 deletions b/‎binaries/benchmark_helper.h
Lines changed: 1 addition & 0 deletions
diff --git a/‎binaries/caffe2_benchmark.cc
Lines changed: 6 additions & 1 deletion b/‎binaries/caffe2_benchmark.cc
Lines changed: 6 additions & 1 deletion
diff --git a/‎binaries/predictor_verifier.cc
Lines changed: 1 addition & 1 deletion b/‎binaries/predictor_verifier.cc
Lines changed: 1 addition & 1 deletion
diff --git a/‎caffe2/CMakeLists.txt
Lines changed: 1 addition & 0 deletions b/‎caffe2/CMakeLists.txt
Lines changed: 1 addition & 0 deletions
diff --git a/‎caffe2/core/operator.h
Lines changed: 7 additions & 5 deletions b/‎caffe2/core/operator.h
Lines changed: 7 additions & 5 deletions
diff --git a/‎caffe2/ideep/operators/operator_fallback_ideep.cc
Lines changed: 10 additions & 0 deletions b/‎caffe2/ideep/operators/operator_fallback_ideep.cc
Lines changed: 10 additions & 0 deletions
diff --git a/‎caffe2/image/image_input_op.h
Lines changed: 13 additions & 2 deletions b/‎caffe2/image/image_input_op.h
Lines changed: 13 additions & 2 deletions
diff --git a/‎caffe2/mobile/contrib/ios/ios_caffe.cc
Lines changed: 1 addition & 1 deletion b/‎caffe2/mobile/contrib/ios/ios_caffe.cc
Lines changed: 1 addition & 1 deletion
diff --git a/‎caffe2/mobile/contrib/ios/ios_caffe.h
Lines changed: 1 addition & 1 deletion b/‎caffe2/mobile/contrib/ios/ios_caffe.h
Lines changed: 1 addition & 1 deletion
diff --git a/‎caffe2/mobile/contrib/ios/ios_caffe_predictor.h
Lines changed: 1 addition & 1 deletion b/‎caffe2/mobile/contrib/ios/ios_caffe_predictor.h
Lines changed: 1 addition & 1 deletion
diff --git a/‎caffe2/mobile/contrib/opengl/core/GLPredictor.h
Lines changed: 1 addition & 1 deletion b/‎caffe2/mobile/contrib/opengl/core/GLPredictor.h
Lines changed: 1 addition & 1 deletion
diff --git a/‎caffe2/mobile/contrib/opengl/core/rewrite_net.h
Lines changed: 1 addition & 1 deletion b/‎caffe2/mobile/contrib/opengl/core/rewrite_net.h
Lines changed: 1 addition & 1 deletion
diff --git a/‎caffe2/onnx/backend_rep.h
Lines changed: 1 addition & 1 deletion b/‎caffe2/onnx/backend_rep.h
Lines changed: 1 addition & 1 deletion
diff --git a/‎caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.h
Lines changed: 15 additions & 1 deletion b/‎caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.h
Lines changed: 15 additions & 1 deletion
diff --git a/‎caffe2/operators/lengths_reducer_ops.h
Lines changed: 15 additions & 1 deletion b/‎caffe2/operators/lengths_reducer_ops.h
Lines changed: 15 additions & 1 deletion
diff --git a/‎caffe2/operators/one_hot_ops.cc
Lines changed: 3 additions & 0 deletions b/‎caffe2/operators/one_hot_ops.cc
Lines changed: 3 additions & 0 deletions
diff --git a/‎caffe2/operators/one_hot_ops.h
Lines changed: 8 additions & 0 deletions b/‎caffe2/operators/one_hot_ops.h
Lines changed: 8 additions & 0 deletions
diff --git a/‎caffe2/operators/order_switch_ops.cc
Lines changed: 9 additions & 22 deletions b/‎caffe2/operators/order_switch_ops.cc
Lines changed: 9 additions & 22 deletions
@@ -17,7 +17,7 @@
       AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__)       \
       AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__)         \
       default:                                                                \
-        AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \
+        AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'");  \
     }                                                                         \
   }()
 
@@ -27,9 +27,9 @@
     switch (the_type.scalarType()) {                                          \
       AT_PRIVATE_CASE_TYPE(at::ScalarType::Double, double, __VA_ARGS__)       \
       AT_PRIVATE_CASE_TYPE(at::ScalarType::Float, float, __VA_ARGS__)         \
-      AT_PRIVATE_CASE_TYPE(at::ScalarType::Half, Half, __VA_ARGS__)           \
+      AT_PRIVATE_CASE_TYPE(at::ScalarType::Half, at::Half, __VA_ARGS__)       \
       default:                                                                \
-        AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \
+        AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'");  \
     }                                                                         \
   }()
 
@@ -43,7 +43,7 @@
       AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__)        \
       AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__)       \
       default:                                                                \
-        AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \
+        AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'");  \
     }                                                                         \
   }()
 
@@ -59,7 +59,7 @@
       AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__)        \
       AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__)       \
       default:                                                                \
-        AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \
+        AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'");  \
     }                                                                         \
   }()
 
@@ -74,8 +74,8 @@
       AT_PRIVATE_CASE_TYPE(at::ScalarType::Int, int32_t, __VA_ARGS__)         \
       AT_PRIVATE_CASE_TYPE(at::ScalarType::Long, int64_t, __VA_ARGS__)        \
       AT_PRIVATE_CASE_TYPE(at::ScalarType::Short, int16_t, __VA_ARGS__)       \
-      AT_PRIVATE_CASE_TYPE(at::ScalarType::Half, Half, __VA_ARGS__)           \
+      AT_PRIVATE_CASE_TYPE(at::ScalarType::Half, at::Half, __VA_ARGS__)       \
       default:                                                                \
-        AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'"); \
+        AT_ERROR(#NAME, " not implemented for '", the_type.toString(), "'");  \
     }                                                                         \
   }()
@@ -581,6 +581,9 @@ Tensor hamming_window(
     double beta,
     const TensorOptions& options) {
   window_function_checks("hamming_window", options, window_length);
+  if (window_length == 0) {
+    return native::empty({0}, options);
+  }
   if (window_length == 1) {
     return native::ones({1}, options);
   }
 
@@ -215,7 +215,8 @@ void runNetwork(
     const bool wipe_cache,
     const bool run_individual,
     const int warmup,
-    const int iter) {
+    const int iter,
+    const int sleep_before_run) {
   if (!net_def.has_name()) {
     net_def.set_name("benchmark");
   }
@@ -234,6 +235,9 @@ void runNetwork(
   if (wipe_cache) {
     caffe2::wipe_cache();
   }
+  if (sleep_before_run > 0) {
+    sleep(sleep_before_run);
+  }
   LOG(INFO) << "Main runs.";
   CAFFE_ENFORCE(
       iter >= 0,
 
@@ -96,4 +96,5 @@ void runNetwork(
     const bool,
     const bool,
     const int,
+    const int,
     const int);
@@ -62,6 +62,10 @@ CAFFE2_DEFINE_bool(
     run_individual,
     false,
     "Whether to benchmark individual operators.");
+CAFFE2_DEFINE_int(
+    sleep_before_run,
+    0,
+    "The seconds to sleep before starting the benchmarking.");
 CAFFE2_DEFINE_bool(
     text_output,
     false,
@@ -115,7 +119,8 @@ int main(int argc, char** argv) {
       caffe2::FLAGS_wipe_cache,
       caffe2::FLAGS_run_individual,
       caffe2::FLAGS_warmup,
-      caffe2::FLAGS_iter);
+      caffe2::FLAGS_iter,
+      caffe2::FLAGS_sleep_before_run);
 
   writeOutput(
       workspace,
 
@@ -16,7 +16,7 @@
 
 #include "caffe2/core/flags.h"
 #include "caffe2/core/init.h"
-#include "caffe2/core/predictor.h"
+#include "caffe2/predictor/predictor.h"
 #include "caffe2/utils/proto_utils.h"
 
 CAFFE2_DEFINE_string(init_net, "", "The given path to the init protobuffer.");
 
@@ -65,6 +65,7 @@ if(BUILD_CAFFE2)
   add_subdirectory(proto)
   add_subdirectory(contrib)
   add_subdirectory(core)
+  add_subdirectory(predictor)
   add_subdirectory(core/nomnigraph)
   add_subdirectory(core/dispatch)
   if (USE_NVRTC)
 
@@ -533,10 +533,11 @@ class Operator : public OperatorBase {
     return fillers;
   }
 
-#define DISABLE_INPUT_FILLERS(Context)                                  \
-  std::vector<TensorFiller<Context>> InputFillers(                      \
-      const std::vector<std::vector<TIndex>>& /* unused */) override {  \
-    throw UnsupportedOperatorFeature("Op does not have input fillers"); \
+#define DISABLE_INPUT_FILLERS(Context)                                 \
+  std::vector<TensorFiller<Context>> InputFillers(                     \
+      const std::vector<std::vector<TIndex>>& /* unused */) override { \
+    throw UnsupportedOperatorFeature(                                  \
+        OperatorBase::type() + " does not have input fillers");        \
   }
 
   void SparseLengthsFillerHelper(
@@ -554,7 +555,8 @@ class Operator : public OperatorBase {
       size_t segment_index,
       std::vector<TensorFiller<Context>>* fillers) {
     CAFFE_ENFORCE_EQ(shapes[segment_index].size(), 1);
-    // TODO: what would be a proper #segments
+    // TODO (mnaumov): distribution of value
+    (*fillers)[value_index].Min(0).Max(shapes[value_index].front() * 2);
     (*fillers)[segment_index].SparseSegments(shapes[value_index].front() - 1);
   }
 
 
@@ -7,6 +7,8 @@
 #include <caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.h>
 #include <caffe2/operators/conv_transpose_op.h>
 #include <caffe2/operators/cross_entropy_op.h>
+#include <caffe2/operators/ctc_beam_search_decoder_op.h>
+#include <caffe2/operators/ctc_greedy_decoder_op.h>
 #include <caffe2/operators/dropout_op.h>
 #include <caffe2/operators/elementwise_ops.h>
 #include <caffe2/operators/filler_op.h>
@@ -112,4 +114,12 @@ REGISTER_IDEEP_OPERATOR(
     PRelu,
     IDEEPFallbackOp<PReluOp<float, CPUContext>>);
 
+// ctc decoder operators
+REGISTER_IDEEP_OPERATOR(
+    CTCGreedyDecoder,
+    IDEEPFallbackOp<CTCGreedyDecoderOp<CPUContext>>);
+REGISTER_IDEEP_OPERATOR(
+    CTCBeamSearchDecoder,
+    IDEEPFallbackOp<CTCBeamSearchDecoderOp<CPUContext>>);
+
 } // namespace caffe2
@@ -658,8 +658,16 @@ bool ImageInputOp<Context>::GetImageAndLabelAndInfoFromDBValue(
         for (int j = 0; j < additional_output_proto.int64_data_size(); ++j) {
           additional_output[j] = additional_output_proto.int64_data(j);
         }
-      }
-      else {
+      } else if (additional_output_proto.data_type() == TensorProto::UINT8) {
+        uint8_t* additional_output =
+            prefetched_additional_outputs_[i].template mutable_data<uint8_t>() +
+            item_id * additional_output_proto.int32_data_size();
+
+        for (int j = 0; j < additional_output_proto.int32_data_size(); ++j) {
+          additional_output[j] =
+              static_cast<uint8_t>(additional_output_proto.int32_data(j));
+        }
+      } else {
         LOG(FATAL) << "Unsupported output type.";
       }
     }
@@ -1148,6 +1156,9 @@ bool ImageInputOp<Context>::Prefetch() {
           } else if (
               additional_output_proto.data_type() == TensorProto::INT64) {
             prefetched_additional_outputs_[i].template mutable_data<int64_t>();
+          } else if (
+              additional_output_proto.data_type() == TensorProto::UINT8) {
+            prefetched_additional_outputs_[i].template mutable_data<uint8_t>();
           } else {
             LOG(FATAL) << "Unsupported output type.";
           }
 
@@ -1,8 +1,8 @@
 
 #include "ios_caffe.h"
-#include "caffe2/core/predictor.h"
 #include "caffe2/core/tensor.h"
 #include "caffe2/mobile/contrib/ios/ios_caffe_predictor.h"
+#include "caffe2/predictor/predictor.h"
 
 Caffe2IOSPredictor* MakeCaffe2Predictor(const std::string& init_net_str,
                                         const std::string& predict_net_str,
 
@@ -3,9 +3,9 @@
 
 #include <string>
 #include <vector>
-#include "caffe2/core/predictor.h"
 #include "caffe2/mobile/contrib/ios/ios_caffe_defines.h"
 #include "caffe2/mobile/contrib/ios/ios_caffe_predictor.h"
+#include "caffe2/predictor/predictor.h"
 
 extern "C" {
 
 
@@ -3,8 +3,8 @@
 
 #include <string>
 #include "caffe2/core/net.h"
-#include "caffe2/core/predictor.h"
 #include "caffe2/mobile/contrib/ios/ios_caffe_defines.h"
+#include "caffe2/predictor/predictor.h"
 
 struct Tensor {
   std::vector<int64_t> dims;
 
@@ -3,7 +3,7 @@
 
 #include "GLImage.h"
 #include "caffe2/core/net.h"
-#include "caffe2/core/predictor.h"
+#include "caffe2/predictor/predictor.h"
 
 namespace caffe2 {
 class GLPredictor : public Predictor {
 
@@ -1,7 +1,7 @@
 
 #pragma once
 #include "GLPredictor.h"
-#include "caffe2/core/predictor.h"
+#include "caffe2/predictor/predictor.h"
 
 namespace caffe2 {
 bool tryConvertToOpenGL(const NetDef& initNet,
 
@@ -1,6 +1,6 @@
 #pragma once
 
-#include "caffe2/core/predictor.h"
+#include "caffe2/predictor/predictor.h"
 #include "caffe2/proto/caffe2.pb.h"
 
 #include <memory>
 
@@ -68,7 +68,21 @@ class SparseLengthsFused8BitRowwiseOp : public Operator<Context> {
     return true;
   }
 
-  USE_VALUE_KEY_LENGTH_INPUT_FILLERS(Context, DATA, INDICES, LENGTHS)
+  std::vector<TensorFiller<Context>> InputFillers(
+      const std::vector<std::vector<TIndex>>& shapes) override {
+    CAFFE_ENFORCE_EQ(shapes.size(), Operator<Context>::Inputs().size());
+    auto fillers = Operator<Context>::InputFillers(shapes);
+    if (with_weights) {
+      // TODO: enable the fillers
+      throw UnsupportedOperatorFeature(
+          OperatorBase::type() + " does not have input fillers");
+    }
+    Operator<Context>::SparseLengthsFillerHelper(
+        shapes, INDICES, LENGTHS, &fillers);
+    Operator<Context>::SparseSegmentsFillerHelper(
+        shapes, DATA, INDICES, &fillers);
+    return fillers;
+  }
 
  private:
   enum {
 
@@ -92,7 +92,21 @@ class CPUSparseLengthsReductionOp : public Operator<CPUContext> {
     return true;
   }
 
-  USE_VALUE_KEY_LENGTH_INPUT_FILLERS(CPUContext, DATA, INDICES, LENGTHS)
+  std::vector<TensorFiller<CPUContext>> InputFillers(
+      const std::vector<std::vector<TIndex>>& shapes) override {
+    CAFFE_ENFORCE_EQ(shapes.size(), Operator<CPUContext>::Inputs().size());
+    auto fillers = Operator<CPUContext>::InputFillers(shapes);
+    if (USE_WEIGHT) {
+      // TODO: enable the fillers
+      throw UnsupportedOperatorFeature(
+          OperatorBase::type() + " does not have input fillers");
+    }
+    Operator<CPUContext>::SparseLengthsFillerHelper(
+        shapes, INDICES, LENGTHS, &fillers);
+    Operator<CPUContext>::SparseSegmentsFillerHelper(
+        shapes, DATA, INDICES, &fillers);
+    return fillers;
+  }
 
  private:
   enum {
 
@@ -172,6 +172,9 @@ class SegmentOneHotOp : public Operator<CPUContext> {
   SegmentOneHotOp(const OperatorDef& operator_def, Workspace* ws)
       : Operator(operator_def, ws) {}
 
+  // TODO: enable input filler
+  DISABLE_INPUT_FILLERS(CPUContext)
+
   bool RunOnDevice() override {
     auto& lengths = Input(0);
     auto& indices = Input(1);
 
@@ -13,6 +13,9 @@ class OneHotOp final : public Operator<Context> {
  public:
   USE_OPERATOR_CONTEXT_FUNCTIONS;
 
+  // TODO: enable input filler
+  DISABLE_INPUT_FILLERS(Context)
+
   OneHotOp(const OperatorDef& operator_def, Workspace* ws)
       : Operator<Context>(operator_def, ws) {}
 
@@ -58,6 +61,8 @@ class BatchOneHotOp final : public Operator<Context> {
   BatchOneHotOp(const OperatorDef& operator_def, Workspace* ws)
       : Operator<Context>(operator_def, ws) {}
 
+  USE_VALUE_KEY_LENGTH_INPUT_FILLERS(Context, X, VALS, LENS)
+
   bool RunOnDevice() override {
     return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(this, Input(X));
   }
@@ -83,6 +88,9 @@ class BatchBucketOneHotOp final : public Operator<Context> {
 
   bool RunOnDevice() override;
 
+  // TODO: enable input filler
+  DISABLE_INPUT_FILLERS(Context)
+
  protected:
   INPUT_TAGS(X, LENS, BOUNDARIES);
   OUTPUT_TAGS(ONE_HOT);
 
@@ -10,16 +10,10 @@ bool NHWC2NCHWOp<float, CPUContext>::RunOnDevice() {
   const int N = X.dim32(0), H = X.dim32(1), W = X.dim32(2), C = X.dim32(3);
   Y->Resize(N, C, H, W);
   const float* Xdata = X.data<float>();
-  float* Ydata = Y->mutable_data<float>();
-  for (int n = 0; n < N; ++n) {
-    for (int h = 0; h < H; ++h) {
-      for (int w = 0; w < W; ++w) {
-        for (int c = 0; c < C; ++c) {
-          Ydata[((n * C + c) * H + h) * W + w] = *(Xdata++);
-        }
-      }
-    }
-  }
+  float* Ydata = Y->template mutable_data<float>();
+  std::array<int, 4> dims = {N, H, W, C};
+  std::array<int, 4> axes = {0, 3, 1, 2};
+  math::Transpose(4, dims.data(), axes.data(), Xdata, Ydata, &context_);
   return true;
 }
 
@@ -31,20 +25,13 @@ bool NCHW2NHWCOp<float, CPUContext>::RunOnDevice() {
   const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
   Y->Resize(N, H, W, C);
   const float* Xdata = X.data<float>();
-  float* Ydata = Y->mutable_data<float>();
-  for (int n = 0; n < N; ++n) {
-    for (int c = 0; c < C; ++c) {
-      for (int h = 0; h < H; ++h) {
-        for (int w = 0; w < W; ++w) {
-          Ydata[((n * H + h) * W + w) * C + c] = *(Xdata++);
-        }
-      }
-    }
-  }
+  float* Ydata = Y->template mutable_data<float>();
+  std::array<int, 4> dims = {N, C, H, W};
+  std::array<int, 4> axes = {0, 2, 3, 1};
+  math::Transpose(4, dims.data(), axes.data(), Xdata, Ydata, &context_);
   return true;
 }
 
-
 REGISTER_CPU_OPERATOR(NHWC2NCHW, NHWC2NCHWOp<float, CPUContext>);
 REGISTER_CPU_OPERATOR(NCHW2NHWC, NCHW2NHWCOp<float, CPUContext>);
 
@@ -102,4 +89,4 @@ class GetNCHW2NHWCGradient : public GradientMakerBase {
   }
 };
 REGISTER_GRADIENT(NCHW2NHWC, GetNCHW2NHWCGradient);
-}  // namespace caffe2
+} // namespace caffe2