
Integrate from upstream #254


Merged
22 commits merged on Oct 9, 2018
Commits
83b4dc6  Remove Type.tensor(). (#12360) (gchanan, Oct 8, 2018)
00aedfc  constant pooling pass (#12222) (Oct 8, 2018)
f1f521f  make bench_gen.py work for 3d conv (#12433) (jspark1105, Oct 8, 2018)
e7653c7  New chaining/partitioning algorithm for async_scheduling for inferenc… (Oct 8, 2018)
7103d0d  Add python bindings (#12253) (bwasti, Oct 8, 2018)
cf2b88f  Induce edges on subgraphs (#12255) (bwasti, Oct 8, 2018)
d181e0f  Add move{Node,Edge,Subgraph} for Graph move-like semantics (#12303) (bwasti, Oct 8, 2018)
a55b9f7  Implement 3D and 4D parallelization in Caffe2 thread pool (#12455) (Oct 8, 2018)
d4b4c1f  Add missing url links to README.md file. (#12440) (marcemq, Oct 8, 2018)
5bac465  Fix TestJit.test_alexnet expect file (#12458) (Oct 8, 2018)
d0e1dca  fix expect file (#12465) (Oct 8, 2018)
c3987a0  Fix issues with ATenOp handling methods where `self` is not the first… (Oct 8, 2018)
c5d7494  Use open-source NCCL2 in PyTorch (#12359) (teng-li, Oct 8, 2018)
dd4b9b0  Back out "Back out "[caffe2] Use custom CPU thread pool in async_sche… (Oct 8, 2018)
1ee6fc4  Delete noexcept on the move constructor of OrderedDict (#12369) (ezyang, Oct 8, 2018)
5a0d2c7  Add clamping functionality to stats_put_ops (BlueberryDS, Oct 8, 2018)
cdead5a  Enable CircleCI for Linux jobs (#12389) (Oct 9, 2018)
d400502  Fix a bunch of warnings in TestNN (ssnl, Oct 9, 2018)
8414094  cleanup controlflow (#12235) (bwasti, Oct 9, 2018)
c959be9  Create named functions construct (#12237) (bwasti, Oct 9, 2018)
1a0d82e  fix import for script module with control flow blocks (#12351) (Oct 9, 2018)
ca71c11  Merge remote-tracking branch 'rocm_upstream/upstream' into ifu (iotamudelta, Oct 9, 2018)
693 changes: 377 additions & 316 deletions .circleci/config.yml

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions .gitmodules
@@ -76,3 +76,6 @@
[submodule "third_party/ideep"]
path = third_party/ideep
url = https://github.com/intel/ideep
[submodule "third_party/nccl/nccl"]
path = third_party/nccl/nccl
url = https://github.com/NVIDIA/nccl
6 changes: 3 additions & 3 deletions README.md
@@ -88,7 +88,7 @@ You get the best of speed and flexibility for your crazy research.

PyTorch is not a Python binding into a monolithic C++ framework.
It is built to be deeply integrated into Python.
You can use it naturally like you would use NumPy / SciPy / scikit-learn etc.
You can use it naturally like you would use [NumPy](http://www.numpy.org/) / [SciPy](https://www.scipy.org/) / [scikit-learn](http://scikit-learn.org) etc.
You can write your new neural network layers in Python itself, using your favorite libraries
and use packages such as Cython and Numba.
Our goal is to not reinvent the wheel where appropriate.
@@ -104,7 +104,7 @@ We hope you never spend hours debugging your code because of bad stack traces or
### Fast and Lean

PyTorch has minimal framework overhead. We integrate acceleration libraries
such as Intel MKL and NVIDIA (cuDNN, NCCL) to maximize speed.
such as [Intel MKL](https://software.intel.com/mkl) and NVIDIA (cuDNN, NCCL) to maximize speed.
At the core, its CPU and GPU Tensor and neural network backends
(TH, THC, THNN, THCUNN) are mature and have been tested for years.

@@ -226,7 +226,7 @@ should increase shared memory size either with `--ipc=host` or `--shm-size` comm

### Building the Documentation

To build documentation in various formats, you will need Sphinx and the
To build documentation in various formats, you will need [Sphinx](http://www.sphinx-doc.org) and the
readthedocs theme.

```
2 changes: 1 addition & 1 deletion aten/src/ATen/function_wrapper.py
@@ -780,7 +780,7 @@ def emit_nn_body(option):
# _out variants must create buffers and insert them in the
# arguments list between output and input arguments
for buffer in option['buffers']:
body.append('Tensor {} = tensor();'.format(buffer['name']))
body.append('Tensor {} = at::empty({{0}}, this->options());'.format(buffer['name']))
actuals = [arg['name'] for arg in option['arguments'] if arg.get('output')]
actuals += [buffer['name'] for buffer in option['buffers']]
actuals += [arg['name'] for arg in option['arguments'] if not arg.get('output')]
3 changes: 0 additions & 3 deletions aten/src/ATen/native/native_functions.yaml
@@ -1901,9 +1901,6 @@
SparseCPU: new_with_size_sparse
SparseCUDA: new_with_size_sparse

- func: tensor(Type dtype) -> Tensor
variants: []

- func: tensor(Type dtype, IntList size) -> Tensor
variants: []

10 changes: 4 additions & 6 deletions binaries/bench_gen/bench_gen.py
@@ -6,6 +6,7 @@
from __future__ import unicode_literals

import argparse
import ast

from caffe2.python.model_helper import ModelHelper
from caffe2.python.predictor import mobile_exporter
@@ -15,18 +16,15 @@
def parse_kwarg(kwarg_str):
key, value = kwarg_str.split('=')
try:
value = int(value)
value = ast.literal_eval(value)
except ValueError:
try:
value = float(value)
except ValueError:
pass
pass
return key, value


def main(args):
# User defined keyword arguments
kwargs = {"order": "NCHW"}
kwargs = {"order": "NCHW", "use_cudnn": False}
kwargs.update(dict(args.kwargs))

model = ModelHelper(name=args.benchmark_name)
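The `ast.literal_eval` change in this file lets benchmark kwargs carry any Python literal (booleans, lists, nested values) rather than only ints and floats, while still falling back to the raw string for bare identifiers. A standalone sketch of the new `parse_kwarg` behavior:

```python
import ast

def parse_kwarg(kwarg_str):
    # Split "key=value" and parse the value as a Python literal.
    # ast.literal_eval handles ints, floats, booleans, strings, and lists,
    # which the old int()/float() fallback chain could not.
    key, value = kwarg_str.split('=')
    try:
        value = ast.literal_eval(value)
    except ValueError:
        pass  # keep the raw string if it is not a valid literal
    return key, value

print(parse_kwarg("kernel=3"))         # ('kernel', 3)
print(parse_kwarg("use_cudnn=False"))  # ('use_cudnn', False)
print(parse_kwarg("order=NCHW"))       # ('order', 'NCHW')
```

Note that a bare string like `NCHW` makes `literal_eval` raise `ValueError`, so it passes through unchanged, exactly as the old code's final `pass` did.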
42 changes: 42 additions & 0 deletions c10/test/registry_test.cpp
@@ -13,6 +13,7 @@ class Foo {
explicit Foo(int x) {
// LOG(INFO) << "Foo " << x;
}
virtual ~Foo() {}
};

C10_DECLARE_REGISTRY(FooRegistry, Foo, int);
@@ -46,4 +47,45 @@ TEST(RegistryTest, ReturnNullOnNonExistingCreator) {
EXPECT_EQ(FooRegistry()->Create("Non-existing bar", 1), nullptr);
}

// C10_REGISTER_CLASS_WITH_PRIORITY defines static variable
void RegisterFooDefault() {
C10_REGISTER_CLASS_WITH_PRIORITY(
FooRegistry, FooWithPriority, c10::REGISTRY_DEFAULT, Foo);
}

void RegisterFooDefaultAgain() {
C10_REGISTER_CLASS_WITH_PRIORITY(
FooRegistry, FooWithPriority, c10::REGISTRY_DEFAULT, Foo);
}

void RegisterFooBarFallback() {
C10_REGISTER_CLASS_WITH_PRIORITY(
FooRegistry, FooWithPriority, c10::REGISTRY_FALLBACK, Bar);
}

void RegisterFooBarPreferred() {
C10_REGISTER_CLASS_WITH_PRIORITY(
FooRegistry, FooWithPriority, c10::REGISTRY_PREFERRED, Bar);
}

TEST(RegistryTest, RegistryPriorities) {
FooRegistry()->SetTerminate(false);
RegisterFooDefault();

// throws because Foo is already registered with default priority
EXPECT_THROW(RegisterFooDefaultAgain(), std::runtime_error);

#ifdef __GXX_RTTI
// not going to register Bar because Foo is registered with Default priority
RegisterFooBarFallback();
std::unique_ptr<Foo> bar1(FooRegistry()->Create("FooWithPriority", 1));
EXPECT_EQ(dynamic_cast<Bar*>(bar1.get()), nullptr);

// will register Bar because of higher priority
RegisterFooBarPreferred();
std::unique_ptr<Foo> bar2(FooRegistry()->Create("FooWithPriority", 1));
EXPECT_NE(dynamic_cast<Bar*>(bar2.get()), nullptr);
#endif
}

} // namespace c10_test
96 changes: 83 additions & 13 deletions c10/util/Registry.h
@@ -24,15 +24,21 @@
namespace c10 {

template <typename KeyType>
inline void PrintOffendingKey(const KeyType& /*key*/) {
printf("[key type printing not supported]\n");
inline std::string KeyStrRepr(const KeyType& /*key*/) {
return "[key type printing not supported]";
}

template <>
inline void PrintOffendingKey(const std::string& key) {
printf("Offending key: %s.\n", key.c_str());
inline std::string KeyStrRepr(const std::string& key) {
return key;
}

enum RegistryPriority {
REGISTRY_FALLBACK = 1,
REGISTRY_DEFAULT = 2,
REGISTRY_PREFERRED = 3,
};

/**
* @brief A template class that allows one to register classes by keys.
*
@@ -48,9 +54,12 @@ class Registry {
public:
typedef std::function<ObjectPtrType(Args...)> Creator;

Registry() : registry_() {}
Registry() : registry_(), priority_(), terminate_(true) {}

void Register(const SrcType& key, Creator creator) {
void Register(
const SrcType& key,
Creator creator,
const RegistryPriority priority = REGISTRY_DEFAULT) {
std::lock_guard<std::mutex> lock(register_mutex_);
// The if statement below is essentially the same as the following line:
// CHECK_EQ(registry_.count(key), 0) << "Key " << key
@@ -59,18 +68,40 @@
// carried out at static initialization time, we do not want to have an
// explicit dependency on glog's initialization function.
if (registry_.count(key) != 0) {
printf("Key already registered.\n");
PrintOffendingKey(key);
std::exit(1);
auto cur_priority = priority_[key];
if (priority > cur_priority) {
std::string warn_msg =
"Overwriting already registered item for key " + KeyStrRepr(key);
fprintf(stderr, "%s\n", warn_msg.c_str());
registry_[key] = creator;
priority_[key] = priority;
} else if (priority == cur_priority) {
std::string err_msg =
"Key already registered with the same priority: " + KeyStrRepr(key);
fprintf(stderr, "%s\n", err_msg.c_str());
if (terminate_) {
std::exit(1);
} else {
throw std::runtime_error(err_msg);
}
} else {
std::string warn_msg =
"Higher priority item already registered, skipping registration of " +
KeyStrRepr(key);
fprintf(stderr, "%s\n", warn_msg.c_str());
}
} else {
registry_[key] = creator;
priority_[key] = priority;
}
registry_[key] = creator;
}

void Register(
const SrcType& key,
Creator creator,
const std::string& help_msg) {
Register(key, creator);
const std::string& help_msg,
const RegistryPriority priority = REGISTRY_DEFAULT) {
Register(key, creator, priority);
help_message_[key] = help_msg;
}

@@ -109,8 +140,16 @@ class Registry {
return it->second.c_str();
}

// Used for testing, if terminate is unset, Registry throws instead of
// calling std::exit
void SetTerminate(bool terminate) {
terminate_ = terminate;
}

private:
std::unordered_map<SrcType, Creator> registry_;
std::unordered_map<SrcType, RegistryPriority> priority_;
bool terminate_;
std::unordered_map<SrcType, std::string> help_message_;
std::mutex register_mutex_;

@@ -120,14 +159,23 @@
template <class SrcType, class ObjectPtrType, class... Args>
class Registerer {
public:
Registerer(
explicit Registerer(
const SrcType& key,
Registry<SrcType, ObjectPtrType, Args...>* registry,
typename Registry<SrcType, ObjectPtrType, Args...>::Creator creator,
const std::string& help_msg = "") {
registry->Register(key, creator, help_msg);
}

explicit Registerer(
const SrcType& key,
const RegistryPriority priority,
Registry<SrcType, ObjectPtrType, Args...>* registry,
typename Registry<SrcType, ObjectPtrType, Args...>::Creator creator,
const std::string& help_msg = "") {
registry->Register(key, creator, help_msg, priority);
}

template <class DerivedType>
static ObjectPtrType DefaultCreator(Args... args) {
return ObjectPtrType(new DerivedType(args...));
@@ -187,13 +235,27 @@ class Registerer {
static Registerer##RegistryName C10_ANONYMOUS_VARIABLE(g_##RegistryName)( \
key, RegistryName(), ##__VA_ARGS__);

#define C10_REGISTER_TYPED_CREATOR_WITH_PRIORITY( \
RegistryName, key, priority, ...) \
static Registerer##RegistryName C10_ANONYMOUS_VARIABLE(g_##RegistryName)( \
key, priority, RegistryName(), ##__VA_ARGS__);

#define C10_REGISTER_TYPED_CLASS(RegistryName, key, ...) \
static Registerer##RegistryName C10_ANONYMOUS_VARIABLE(g_##RegistryName)( \
key, \
RegistryName(), \
Registerer##RegistryName::DefaultCreator<__VA_ARGS__>, \
::c10::demangle_type<__VA_ARGS__>());

#define C10_REGISTER_TYPED_CLASS_WITH_PRIORITY( \
RegistryName, key, priority, ...) \
static Registerer##RegistryName C10_ANONYMOUS_VARIABLE(g_##RegistryName)( \
key, \
priority, \
RegistryName(), \
Registerer##RegistryName::DefaultCreator<__VA_ARGS__>, \
::c10::demangle_type<__VA_ARGS__>());

// C10_DECLARE_REGISTRY and C10_DEFINE_REGISTRY are hard-wired to use
// std::string as the key type, because that is the most commonly used cases.
#define C10_DECLARE_REGISTRY(RegistryName, ObjectType, ...) \
@@ -218,9 +280,17 @@
#define C10_REGISTER_CREATOR(RegistryName, key, ...) \
C10_REGISTER_TYPED_CREATOR(RegistryName, #key, __VA_ARGS__)

#define C10_REGISTER_CREATOR_WITH_PRIORITY(RegistryName, key, priority, ...) \
C10_REGISTER_TYPED_CREATOR_WITH_PRIORITY( \
RegistryName, #key, priority, __VA_ARGS__)

#define C10_REGISTER_CLASS(RegistryName, key, ...) \
C10_REGISTER_TYPED_CLASS(RegistryName, #key, __VA_ARGS__)

#define C10_REGISTER_CLASS_WITH_PRIORITY(RegistryName, key, priority, ...) \
C10_REGISTER_TYPED_CLASS_WITH_PRIORITY( \
RegistryName, #key, priority, __VA_ARGS__)

} // namespace c10

#endif // C10_UTIL_REGISTRY_H_
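The priority rules this diff adds to `Registry::Register` can be summarized on their own: a higher-priority registration overwrites the existing creator, an equal priority is an error, and a lower priority is skipped with a warning. The following is a minimal Python model of those semantics only, not the C10 implementation; the class and method names here are illustrative:

```python
import sys
import threading

REGISTRY_FALLBACK, REGISTRY_DEFAULT, REGISTRY_PREFERRED = 1, 2, 3

class Registry:
    def __init__(self):
        self._creators = {}
        self._priorities = {}
        self._lock = threading.Lock()  # registrations may race at startup

    def register(self, key, creator, priority=REGISTRY_DEFAULT):
        with self._lock:
            if key not in self._creators:
                self._creators[key] = creator
                self._priorities[key] = priority
            elif priority > self._priorities[key]:
                # Higher priority wins: overwrite the existing creator.
                print("Overwriting already registered item for key " + key,
                      file=sys.stderr)
                self._creators[key] = creator
                self._priorities[key] = priority
            elif priority == self._priorities[key]:
                # Equal priority is a programming error.
                raise RuntimeError(
                    "Key already registered with the same priority: " + key)
            else:
                # Lower priority: keep the existing creator, skip this one.
                print("Higher priority item already registered, skipping " + key,
                      file=sys.stderr)

    def create(self, key, *args):
        creator = self._creators.get(key)
        return creator(*args) if creator is not None else None
```

This mirrors the flow exercised by `RegistryTest.RegistryPriorities` above: after a default-priority registration, a fallback registration for the same key is a no-op, while a preferred registration replaces it.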
13 changes: 4 additions & 9 deletions caffe2/contrib/aten/gen_op.py
@@ -237,12 +237,7 @@ def find_factory_methods(decls):
}
defined_inferred_type = False

if 'Tensor' in o['method_of']:
# make sure 'self' is the first argument. currently Declarations.yaml
# does not always do this. Instead it keeps the argument list the same order
# as the Type method.
o['arguments'] = self_as_first_argument(o['arguments'])
elif 'namespace' not in o['method_of']:
if 'namespace' not in o['method_of'] and 'Tensor' not in o['method_of']:
# methods on type like 'ones' or 'zeros' always take a
# string attribute that is translated into the at::Type object
# e.g. "Float" is at::kFloat
@@ -289,11 +284,11 @@ def find_factory_methods(decls):
assignment = CT(t).substitute(env, offset=i, output=get_output(o, i))
env['assignments'].append(assignment)

if 'Tensor' in o['method_of']:
if 'namespace' in o['method_of']:
env['invocation'] = CT("at::${name}(${arguments})").substitute(env)
elif 'Tensor' in o['method_of']:
env['invocation'] = "self.{}({})".format(
o['name'], ', '.join(env['arguments'][1:]))
elif 'namespace' in o['method_of']:
env['invocation'] = CT("at::${name}(${arguments})").substitute(env)
else:
assert('Type' in o['method_of'])
env['invocation'] = CT(
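The reordering in gen_op.py makes the generated ATenOp prefer the `at::` free-function form whenever an op exists in the namespace, instead of the Tensor-method form that assumed `self` comes first in the argument list. A rough Python sketch of the new dispatch order (simplified: the real generator builds C++ strings from templates, and the `Type` branch is reduced to an assertion here):

```python
def build_invocation(op):
    # op: {"name": ..., "method_of": [...], "arguments": [arg names in order]}
    args = ", ".join(op["arguments"])
    if "namespace" in op["method_of"]:
        # Checked first now: correct even when `self` is not the first argument.
        return "at::{}({})".format(op["name"], args)
    elif "Tensor" in op["method_of"]:
        # Method form: only valid when the first argument really is `self`.
        return "self.{}({})".format(op["name"], ", ".join(op["arguments"][1:]))
    raise AssertionError("expected a Type method")
```

For an op available both ways, the old code emitted `self.add(other)` after reshuffling arguments; the new order emits `at::add(self, other)` and leaves the argument list untouched.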
2 changes: 1 addition & 1 deletion caffe2/core/hip/net_async_hip_thread_pool_hip.cc
@@ -23,7 +23,7 @@ C10_DEFINE_int(

namespace caffe2 {

std::shared_ptr<TaskThreadPool>
std::shared_ptr<TaskThreadPoolBase>
GetAsyncNetHIPThreadPool(int hip_gpu_id, int pool_size, bool create_new) {
// For GPU, use per device thread pools of predefined constant size
if (pool_size != c10::FLAGS_caffe2_threads_per_hip_gpu) {
2 changes: 1 addition & 1 deletion caffe2/core/net.cc
@@ -173,7 +173,7 @@ unique_ptr<NetBase> CreateNet(
return net;
}

TaskThreadPool* ExecutorHelper::GetPool(
TaskThreadPoolBase* ExecutorHelper::GetPool(
const DeviceOption& /* unused */) const {
CAFFE_THROW("Not implemented");
}
2 changes: 1 addition & 1 deletion caffe2/core/net.h
@@ -130,7 +130,7 @@ class CAFFE2_API NetBase : public Observable<NetBase> {
class CAFFE2_API ExecutorHelper {
public:
ExecutorHelper() {}
virtual TaskThreadPool* GetPool(const DeviceOption& option) const;
virtual TaskThreadPoolBase* GetPool(const DeviceOption& option) const;
virtual ~ExecutorHelper() {}
};
