diff --git a/docs/advanced/pycpp/numpy.rst b/docs/advanced/pycpp/numpy.rst
index 57a52f6a82..fbcebc4211 100644
--- a/docs/advanced/pycpp/numpy.rst
+++ b/docs/advanced/pycpp/numpy.rst
@@ -239,27 +239,13 @@ by the compiler. The result is returned as a NumPy array of type
 The scalar argument ``z`` is transparently replicated 4 times.  The input
 arrays ``x`` and ``y`` are automatically converted into the right types (they
 are of type  ``numpy.dtype.int64`` but need to be ``numpy.dtype.int32`` and
-``numpy.dtype.float32``, respectively)
+``numpy.dtype.float32``, respectively).
 
-Sometimes we might want to explicitly exclude an argument from the vectorization
-because it makes little sense to wrap it in a NumPy array. For instance,
-suppose the function signature was
+.. note::
 
-.. code-block:: cpp
-
-    double my_func(int x, float y, my_custom_type *z);
-
-This can be done with a stateful Lambda closure:
-
-.. code-block:: cpp
-
-    // Vectorize a lambda function with a capture object (e.g. to exclude some arguments from the vectorization)
-    m.def("vectorized_func",
-        [](py::array_t<int> x, py::array_t<float> y, my_custom_type *z) {
-            auto stateful_closure = [z](int x, float y) { return my_func(x, y, z); };
-            return py::vectorize(stateful_closure)(x, y);
-        }
-    );
+    Only arithmetic, complex, and POD types passed by value or by ``const &``
+    reference are vectorized; all other arguments are passed through as-is.
+    Functions taking rvalue reference arguments cannot be vectorized.
 
 In cases where the computation is too complicated to be reduced to
 ``vectorize``, it will be necessary to create and access the buffer contents
diff --git a/include/pybind11/cast.h b/include/pybind11/cast.h
index c997b90c01..0a8ef83188 100644
--- a/include/pybind11/cast.h
+++ b/include/pybind11/cast.h
@@ -15,6 +15,7 @@
 #include "descr.h"
 #include <array>
 #include <limits>
+#include <tuple>
 
 NAMESPACE_BEGIN(pybind11)
 NAMESPACE_BEGIN(detail)
@@ -390,7 +391,7 @@ class type_caster_generic {
  */
 template <typename T>
 using cast_op_type =
-    conditional_t<std::is_pointer<typename std::remove_reference<T>::type>::value,
+    conditional_t<std::is_pointer<remove_reference_t<T>>::value,
         typename std::add_pointer<intrinsic_t<T>>::type,
         typename std::add_lvalue_reference<intrinsic_t<T>>::type>;
 
@@ -883,7 +884,7 @@ template <typename CharT> struct type_caster<CharT, enable_if_t<is_std_char_type
     }
 
     static PYBIND11_DESCR name() { return type_descr(_(PYBIND11_STRING_NAME)); }
-    template <typename _T> using cast_op_type = typename std::remove_reference<pybind11::detail::cast_op_type<_T>>::type;
+    template <typename _T> using cast_op_type = remove_reference_t<pybind11::detail::cast_op_type<_T>>;
 };
 
 template <typename T1, typename T2> class type_caster<std::pair<T1, T2>> {
diff --git a/include/pybind11/common.h b/include/pybind11/common.h
index ddf966ea76..387300922e 100644
--- a/include/pybind11/common.h
+++ b/include/pybind11/common.h
@@ -348,10 +348,12 @@ inline internals &get_internals();
 using std::enable_if_t;
 using std::conditional_t;
 using std::remove_cv_t;
+using std::remove_reference_t;
 #else
 template <bool B, typename T = void> using enable_if_t = typename std::enable_if<B, T>::type;
 template <bool B, typename T, typename F> using conditional_t = typename std::conditional<B, T, F>::type;
 template <typename T> using remove_cv_t = typename std::remove_cv<T>::type;
+template <typename T> using remove_reference_t = typename std::remove_reference<T>::type;
 #endif
 
 /// Index sequences
@@ -365,6 +367,12 @@ template<size_t ...S> struct make_index_sequence_impl <0, S...> { typedef index_
 template<size_t N> using make_index_sequence = typename make_index_sequence_impl<N>::type;
 #endif
 
+/// Make an index sequence of the indices of true arguments
+template <typename ISeq, size_t, bool...> struct select_indices_impl { using type = ISeq; };
+template <size_t... IPrev, size_t I, bool B, bool... Bs> struct select_indices_impl<index_sequence<IPrev...>, I, B, Bs...>
+    : select_indices_impl<conditional_t<B, index_sequence<IPrev..., I>, index_sequence<IPrev...>>, I + 1, Bs...> {};
+template <bool... Bs> using select_indices = typename select_indices_impl<index_sequence<>, 0, Bs...>::type;
+
 /// Backports of std::bool_constant and std::negation to accomodate older compilers
 template <bool B> using bool_constant = std::integral_constant<bool, B>;
 template <typename T> struct negation : bool_constant<!T::value> { };
@@ -504,6 +512,14 @@ struct is_input_iterator<T, void_t<decltype(*std::declval<T &>()), decltype(++st
 /// Ignore that a variable is unused in compiler warnings
 inline void ignore_unused(const int *) { }
 
+/// Apply a function over each element of a parameter pack
+#ifdef __cpp_fold_expressions
+#define PYBIND11_EXPAND_SIDE_EFFECTS(PATTERN) (((PATTERN), void()), ...)
+#else
+using expand_side_effects = bool[];
+#define PYBIND11_EXPAND_SIDE_EFFECTS(PATTERN) pybind11::detail::expand_side_effects{ ((PATTERN), void(), false)..., false }
+#endif
+
 NAMESPACE_END(detail)
 
 /// Returns a named pointer that is shared among all extension modules (using the same
diff --git a/include/pybind11/eigen.h b/include/pybind11/eigen.h
index 8ceb941014..53d4dab08f 100644
--- a/include/pybind11/eigen.h
+++ b/include/pybind11/eigen.h
@@ -542,7 +542,7 @@ struct type_caster<Type, enable_if_t<is_eigen_other<Type>::value>> {
 template<typename Type>
 struct type_caster<Type, enable_if_t<is_eigen_sparse<Type>::value>> {
     typedef typename Type::Scalar Scalar;
-    typedef typename std::remove_reference<decltype(*std::declval<Type>().outerIndexPtr())>::type StorageIndex;
+    typedef remove_reference_t<decltype(*std::declval<Type>().outerIndexPtr())> StorageIndex;
     typedef typename Type::Index Index;
     static constexpr bool rowMajor = Type::IsRowMajor;
 
diff --git a/include/pybind11/numpy.h b/include/pybind11/numpy.h
index 8901d9744d..1e12ede194 100644
--- a/include/pybind11/numpy.h
+++ b/include/pybind11/numpy.h
@@ -530,7 +530,7 @@ class array : public buffer {
           const void *ptr = nullptr, handle base = handle()) {
 
         if (strides->empty())
-            *strides = default_strides(*shape, dt.itemsize());
+            *strides = c_strides(*shape, dt.itemsize());
 
         auto ndim = shape->size();
         if (ndim != strides->size())
@@ -758,15 +758,21 @@ class array : public buffer {
             throw std::domain_error("array is not writeable");
     }
 
-    static std::vector<ssize_t> default_strides(const std::vector<ssize_t>& shape, ssize_t itemsize) {
+    // Default, C-style strides
+    static std::vector<ssize_t> c_strides(const std::vector<ssize_t> &shape, ssize_t itemsize) {
         auto ndim = shape.size();
-        std::vector<ssize_t> strides(ndim);
-        if (ndim) {
-            std::fill(strides.begin(), strides.end(), itemsize);
-            for (size_t i = 0; i < ndim - 1; i++)
-                for (size_t j = 0; j < ndim - 1 - i; j++)
-                    strides[j] *= shape[ndim - 1 - i];
-        }
+        std::vector<ssize_t> strides(ndim, itemsize);
+        for (size_t i = ndim - 1; i > 0; --i)
+            strides[i - 1] = strides[i] * shape[i];
+        return strides;
+    }
+
+    // F-style strides; default when constructing an array_t with `ExtraFlags & f_style`
+    static std::vector<ssize_t> f_strides(const std::vector<ssize_t> &shape, ssize_t itemsize) {
+        auto ndim = shape.size();
+        std::vector<ssize_t> strides(ndim, itemsize);
+        for (size_t i = 1; i < ndim; ++i)
+            strides[i] = strides[i - 1] * shape[i - 1];
         return strides;
     }
 
@@ -797,6 +803,11 @@ class array : public buffer {
 };
 
 template <typename T, int ExtraFlags = array::forcecast> class array_t : public array {
+private:
+    struct private_ctor {};
+    // Delegating constructor needed when both moving and accessing in the same constructor
+    array_t(private_ctor, ShapeContainer &&shape, StridesContainer &&strides, const T *ptr, handle base)
+        : array(std::move(shape), std::move(strides), ptr, base) {}
 public:
     static_assert(!detail::array_info<T>::is_array, "Array types cannot be used with array_t");
 
@@ -822,7 +833,9 @@ template <typename T, int ExtraFlags = array::forcecast> class array_t : public
         : array(std::move(shape), std::move(strides), ptr, base) { }
 
     explicit array_t(ShapeContainer shape, const T *ptr = nullptr, handle base = handle())
-        : array(std::move(shape), ptr, base) { }
+        : array_t(private_ctor{}, std::move(shape),
+                ExtraFlags & f_style ? f_strides(*shape, itemsize()) : c_strides(*shape, itemsize()),
+                ptr, base) { }
 
     explicit array_t(size_t count, const T *ptr = nullptr, handle base = handle())
         : array({count}, {}, ptr, base) { }
@@ -1276,8 +1289,8 @@ template <size_t N> class multi_array_iterator {
         return *this;
     }
 
-    template <size_t K, class T> const T& data() const {
-        return *reinterpret_cast<T*>(m_common_iterator[K].data());
+    template <size_t K, class T = void> T* data() const {
+        return reinterpret_cast<T*>(m_common_iterator[K].data());
     }
 
 private:
@@ -1327,7 +1340,7 @@ enum class broadcast_trivial { non_trivial, c_trivial, f_trivial };
 // buffer; returns `non_trivial` otherwise.
 template <size_t N>
 broadcast_trivial broadcast(const std::array<buffer_info, N> &buffers, ssize_t &ndim, std::vector<ssize_t> &shape) {
-    ndim = std::accumulate(buffers.begin(), buffers.end(), ssize_t(0), [](ssize_t res, const buffer_info& buf) {
+    ndim = std::accumulate(buffers.begin(), buffers.end(), ssize_t(0), [](ssize_t res, const buffer_info &buf) {
         return std::max(res, buf.ndim);
     });
 
@@ -1398,21 +1411,63 @@ broadcast_trivial broadcast(const std::array<buffer_info, N> &buffers, ssize_t &
         broadcast_trivial::non_trivial;
 }
 
+template <typename T>
+struct vectorize_arg {
+    static_assert(!std::is_rvalue_reference<T>::value, "Functions with rvalue reference arguments cannot be vectorized");
+    // The wrapped function gets called with this type:
+    using call_type = remove_reference_t<T>;
+    // Is this a vectorized argument?
+    static constexpr bool vectorize =
+        satisfies_any_of<call_type, std::is_arithmetic, is_complex, std::is_pod>::value &&
+        satisfies_none_of<call_type, std::is_pointer, std::is_array, is_std_array, std::is_enum>::value &&
+        (!std::is_reference<T>::value ||
+         (std::is_lvalue_reference<T>::value && std::is_const<call_type>::value));
+    // Accept this type: an array for vectorized types, otherwise the type as-is:
+    using type = conditional_t<vectorize, array_t<remove_cv_t<call_type>, array::forcecast>, T>;
+};
+
 template <typename Func, typename Return, typename... Args>
 struct vectorize_helper {
-    typename std::remove_reference<Func>::type f;
+private:
     static constexpr size_t N = sizeof...(Args);
+    static constexpr size_t NVectorized = constexpr_sum(vectorize_arg<Args>::vectorize...);
+    static_assert(NVectorized >= 1,
+            "pybind11::vectorize(...) requires a function with at least one vectorizable argument");
 
+public:
     template <typename T>
-    explicit vectorize_helper(T&&f) : f(std::forward<T>(f)) { }
+    explicit vectorize_helper(T &&f) : f(std::forward<T>(f)) { }
 
-    object operator()(array_t<Args, array::forcecast>... args) {
-        return run(args..., make_index_sequence<N>());
+    object operator()(typename vectorize_arg<Args>::type... args) {
+        return run(args...,
+                   make_index_sequence<N>(),
+                   select_indices<vectorize_arg<Args>::vectorize...>(),
+                   make_index_sequence<NVectorized>());
     }
 
-    template <size_t ... Index> object run(array_t<Args, array::forcecast>&... args, index_sequence<Index...> index) {
-        /* Request buffers from all parameters */
-        std::array<buffer_info, N> buffers {{ args.request()... }};
+private:
+    remove_reference_t<Func> f;
+
+    template <size_t Index> using param_n_t = typename pack_element<Index, typename vectorize_arg<Args>::call_type...>::type;
+
+    // Runs a vectorized function given arguments tuple and three index sequences:
+    //     - Index is the full set of 0 ... (N-1) argument indices;
+    //     - VIndex is the subset of argument indices with vectorized parameters, letting us access
+    //       vectorized arguments (anything not in this sequence is passed through)
+    //     - BIndex is a incremental sequence (beginning at 0) of the same size as VIndex, so that
+    //       we can store vectorized buffer_infos in an array (argument VIndex has its buffer at
+    //       index BIndex in the array).
+    template <size_t... Index, size_t... VIndex, size_t... BIndex> object run(
+            typename vectorize_arg<Args>::type &...args,
+            index_sequence<Index...> i_seq, index_sequence<VIndex...> vi_seq, index_sequence<BIndex...> bi_seq) {
+
+        // Pointers to values the function was called with; the vectorized ones set here will start
+        // out as array_t<T> pointers, but they will be changed them to T pointers before we make
+        // call the wrapped function.  Non-vectorized pointers are left as-is.
+        std::array<void *, N> params{{ &args... }};
+
+        // The array of `buffer_info`s of vectorized arguments:
+        std::array<buffer_info, NVectorized> buffers{{ reinterpret_cast<array *>(params[VIndex])->request()... }};
 
         /* Determine dimensions parameters of output array */
         ssize_t nd = 0;
@@ -1420,61 +1475,79 @@ struct vectorize_helper {
         auto trivial = broadcast(buffers, nd, shape);
         size_t ndim = (size_t) nd;
 
-        ssize_t size = 1;
-        std::vector<ssize_t> strides(ndim);
-        if (ndim > 0) {
-            if (trivial == broadcast_trivial::f_trivial) {
-                strides[0] = sizeof(Return);
-                for (size_t i = 1; i < ndim; ++i) {
-                    strides[i] = strides[i - 1] * shape[i - 1];
-                    size *= shape[i - 1];
-                }
-                size *= shape[ndim - 1];
-            }
-            else {
-                strides[ndim-1] = sizeof(Return);
-                for (size_t i = ndim - 1; i > 0; --i) {
-                    strides[i - 1] = strides[i] * shape[i];
-                    size *= shape[i];
-                }
-                size *= shape[0];
-            }
+        size_t size = std::accumulate(shape.begin(), shape.end(), (size_t) 1, std::multiplies<size_t>());
+
+        // If all arguments are 0-dimension arrays (i.e. single values) return a plain value (i.e.
+        // not wrapped in an array).
+        if (size == 1 && ndim == 0) {
+            PYBIND11_EXPAND_SIDE_EFFECTS(params[VIndex] = buffers[BIndex].ptr);
+            return cast(f(*reinterpret_cast<param_n_t<Index> *>(params[Index])...));
         }
 
-        if (size == 1)
-            return cast(f(*reinterpret_cast<Args *>(buffers[Index].ptr)...));
+        array_t<Return> result;
+        if (trivial == broadcast_trivial::f_trivial) result = array_t<Return, array::f_style>(shape);
+        else result = array_t<Return>(shape);
 
-        array_t<Return> result(shape, strides);
-        auto buf = result.request();
-        auto output = (Return *) buf.ptr;
+        if (size == 0) return result;
 
         /* Call the function */
-        if (trivial == broadcast_trivial::non_trivial) {
-            apply_broadcast<Index...>(buffers, buf, index);
-        } else {
-            for (ssize_t i = 0; i < size; ++i)
-                output[i] = f((reinterpret_cast<Args *>(buffers[Index].ptr)[buffers[Index].size == 1 ? 0 : i])...);
-        }
+        if (trivial == broadcast_trivial::non_trivial)
+            apply_broadcast(buffers, params, result, i_seq, vi_seq, bi_seq);
+        else
+            apply_trivial(buffers, params, result.mutable_data(), size, i_seq, vi_seq, bi_seq);
 
         return result;
     }
 
-    template <size_t... Index>
-    void apply_broadcast(const std::array<buffer_info, N> &buffers,
-                         buffer_info &output, index_sequence<Index...>) {
-        using input_iterator = multi_array_iterator<N>;
-        using output_iterator = array_iterator<Return>;
+    template <size_t... Index, size_t... VIndex, size_t... BIndex>
+    void apply_trivial(std::array<buffer_info, NVectorized> &buffers,
+                       std::array<void *, N> &params,
+                       Return *out,
+                       size_t size,
+                       index_sequence<Index...>, index_sequence<VIndex...>, index_sequence<BIndex...>) {
+
+        // Initialize an array of mutable byte references and sizes with references set to the
+        // appropriate pointer in `params`; as we iterate, we'll increment each pointer by its size
+        // (except for singletons, which get an increment of 0).
+        std::array<std::pair<unsigned char *&, const size_t>, NVectorized> vecparams{{
+            std::pair<unsigned char *&, const size_t>(
+                    reinterpret_cast<unsigned char *&>(params[VIndex] = buffers[BIndex].ptr),
+                    buffers[BIndex].size == 1 ? 0 : sizeof(param_n_t<VIndex>)
+            )...
+        }};
+
+        for (size_t i = 0; i < size; ++i) {
+            out[i] = f(*reinterpret_cast<param_n_t<Index> *>(params[Index])...);
+            for (auto &x : vecparams) x.first += x.second;
+        }
+    }
+
+    template <size_t... Index, size_t... VIndex, size_t... BIndex>
+    void apply_broadcast(std::array<buffer_info, NVectorized> &buffers,
+                         std::array<void *, N> &params,
+                         array_t<Return> &output_array,
+                         index_sequence<Index...>, index_sequence<VIndex...>, index_sequence<BIndex...>) {
 
-        input_iterator input_iter(buffers, output.shape);
-        output_iterator output_end = array_end<Return>(output);
+        buffer_info output = output_array.request();
+        multi_array_iterator<NVectorized> input_iter(buffers, output.shape);
 
-        for (output_iterator iter = array_begin<Return>(output);
-             iter != output_end; ++iter, ++input_iter) {
-            *iter = f((input_iter.template data<Index, Args>())...);
+        for (array_iterator<Return> iter = array_begin<Return>(output), end = array_end<Return>(output);
+             iter != end;
+             ++iter, ++input_iter) {
+            PYBIND11_EXPAND_SIDE_EFFECTS((
+                params[VIndex] = input_iter.template data<BIndex>()
+            ));
+            *iter = f(*reinterpret_cast<param_n_t<Index> *>(std::get<Index>(params))...);
         }
     }
 };
 
+template <typename Func, typename Return, typename... Args>
+vectorize_helper<Func, Return, Args...>
+vectorize_extractor(const Func &f, Return (*) (Args ...)) {
+    return detail::vectorize_helper<Func, Return, Args...>(f);
+}
+
 template <typename T, int Flags> struct handle_type_name<array_t<T, Flags>> {
     static PYBIND11_DESCR name() {
         return _("numpy.ndarray[") + npy_format_descriptor<T>::name() + _("]");
@@ -1483,22 +1556,32 @@ template <typename T, int Flags> struct handle_type_name<array_t<T, Flags>> {
 
 NAMESPACE_END(detail)
 
-template <typename Func, typename Return, typename... Args>
-detail::vectorize_helper<Func, Return, Args...>
-vectorize(const Func &f, Return (*) (Args ...)) {
-    return detail::vectorize_helper<Func, Return, Args...>(f);
-}
-
+// Vanilla pointer vectorizer:
 template <typename Return, typename... Args>
-detail::vectorize_helper<Return (*) (Args ...), Return, Args...>
+detail::vectorize_helper<Return (*)(Args...), Return, Args...>
 vectorize(Return (*f) (Args ...)) {
-    return vectorize<Return (*) (Args ...), Return, Args...>(f, f);
+    return detail::vectorize_helper<Return (*)(Args...), Return, Args...>(f);
 }
 
-template <typename Func, typename FuncType = typename detail::remove_class<decltype(&std::remove_reference<Func>::type::operator())>::type>
+// lambda vectorizer:
+template <typename Func, typename FuncType = typename detail::remove_class<decltype(&detail::remove_reference_t<Func>::operator())>::type>
 auto vectorize(Func &&f) -> decltype(
-        vectorize(std::forward<Func>(f), (FuncType *) nullptr)) {
-    return vectorize(std::forward<Func>(f), (FuncType *) nullptr);
+        detail::vectorize_extractor(std::forward<Func>(f), (FuncType *) nullptr)) {
+    return detail::vectorize_extractor(std::forward<Func>(f), (FuncType *) nullptr);
+}
+
+// Vectorize a class method (non-const):
+template <typename Return, typename Class, typename... Args,
+          typename Helper = detail::vectorize_helper<decltype(std::mem_fn(std::declval<Return (Class::*)(Args...)>())), Return, Class *, Args...>>
+Helper vectorize(Return (Class::*f)(Args...)) {
+    return Helper(std::mem_fn(f));
+}
+
+// Vectorize a class method (non-const):
+template <typename Return, typename Class, typename... Args,
+          typename Helper = detail::vectorize_helper<decltype(std::mem_fn(std::declval<Return (Class::*)(Args...) const>())), Return, const Class *, Args...>>
+Helper vectorize(Return (Class::*f)(Args...) const) {
+    return Helper(std::mem_fn(f));
 }
 
 NAMESPACE_END(pybind11)
diff --git a/include/pybind11/pybind11.h b/include/pybind11/pybind11.h
index c4e6131e4a..4954dcf4d3 100644
--- a/include/pybind11/pybind11.h
+++ b/include/pybind11/pybind11.h
@@ -56,12 +56,12 @@ class cpp_function : public function {
     /// Construct a cpp_function from a lambda function (possibly with internal state)
     template <typename Func, typename... Extra, typename = detail::enable_if_t<
         detail::satisfies_none_of<
-            typename std::remove_reference<Func>::type,
+            detail::remove_reference_t<Func>,
             std::is_function, std::is_pointer, std::is_member_pointer
         >::value>
     >
     cpp_function(Func &&f, const Extra&... extra) {
-        using FuncType = typename detail::remove_class<decltype(&std::remove_reference<Func>::type::operator())>::type;
+        using FuncType = typename detail::remove_class<decltype(&detail::remove_reference_t<Func>::operator())>::type;
         initialize(std::forward<Func>(f),
                    (FuncType *) nullptr, extra...);
     }
@@ -93,7 +93,7 @@ class cpp_function : public function {
     template <typename Func, typename Return, typename... Args, typename... Extra>
     void initialize(Func &&f, Return (*)(Args...), const Extra&... extra) {
 
-        struct capture { typename std::remove_reference<Func>::type f; };
+        struct capture { detail::remove_reference_t<Func> f; };
 
         /* Store the function including any extra state it might have (e.g. a lambda capture object) */
         auto rec = make_function_record();
@@ -947,8 +947,7 @@ class class_ : public detail::generic_type {
         set_operator_new<type>(&record);
 
         /* Register base classes specified via template arguments to class_, if any */
-        bool unused[] = { (add_base<options>(record), false)..., false };
-        (void) unused;
+        PYBIND11_EXPAND_SIDE_EFFECTS(add_base<options>(record));
 
         /* Process optional arguments, if any */
         process_attributes<Extra...>::init(extra..., &record);
diff --git a/include/pybind11/pytypes.h b/include/pybind11/pytypes.h
index 8a8499c134..b186c0b3f7 100644
--- a/include/pybind11/pytypes.h
+++ b/include/pybind11/pytypes.h
@@ -44,7 +44,7 @@ using tuple_accessor = accessor<accessor_policies::tuple_item>;
 
 /// Tag and check to identify a class which implements the Python object API
 class pyobject_tag { };
-template <typename T> using is_pyobject = std::is_base_of<pyobject_tag, typename std::remove_reference<T>::type>;
+template <typename T> using is_pyobject = std::is_base_of<pyobject_tag, remove_reference_t<T>>;
 
 /** \rst
     A mixin class which adds common functions to `handle`, `object` and various accessors.
diff --git a/tests/test_numpy_vectorize.cpp b/tests/test_numpy_vectorize.cpp
index 34197469c7..b1f820813d 100644
--- a/tests/test_numpy_vectorize.cpp
+++ b/tests/test_numpy_vectorize.cpp
@@ -20,6 +20,17 @@ std::complex<double> my_func3(std::complex<double> c) {
     return c * std::complex<double>(2.f);
 }
 
+struct VectorizeTestClass {
+    VectorizeTestClass(int v) : value{v} {};
+    float method(int x, float y) { return y + (float) (x + value); }
+    int value = 0;
+};
+
+struct NonPODClass {
+    NonPODClass(int v) : value{v} {}
+    int value;
+};
+
 test_initializer numpy_vectorize([](py::module &m) {
     // Vectorize all arguments of a function (though non-vector arguments are also allowed)
     m.def("vectorized_func", py::vectorize(my_func));
@@ -40,6 +51,22 @@ test_initializer numpy_vectorize([](py::module &m) {
     m.def("selective_func", [](py::array_t<std::complex<float>, py::array::c_style>) { return "Complex float branch taken."; });
 
 
+    // Passthrough test: references and non-pod types should be automatically passed through (in the
+    // function definition below, only `b`, `d`, and `g` are vectorized):
+    py::class_<NonPODClass>(m, "NonPODClass").def(py::init<int>());
+    m.def("vec_passthrough", py::vectorize(
+        [](double *a, double b, py::array_t<double> c, const int &d, int &e, NonPODClass f, const double g) {
+            return *a + b + c.at(0) + d + e + f.value + g;
+        }
+    ));
+
+    py::class_<VectorizeTestClass> vtc(m, "VectorizeTestClass");
+    vtc .def(py::init<int>())
+        .def_readwrite("value", &VectorizeTestClass::value);
+
+    // Automatic vectorizing of methods
+    vtc.def("method", py::vectorize(&VectorizeTestClass::method));
+
     // Internal optimization test for whether the input is trivially broadcastable:
     py::enum_<py::detail::broadcast_trivial>(m, "trivial")
         .value("f_trivial", py::detail::broadcast_trivial::f_trivial)
diff --git a/tests/test_numpy_vectorize.py b/tests/test_numpy_vectorize.py
index 7ae777227d..362b036262 100644
--- a/tests/test_numpy_vectorize.py
+++ b/tests/test_numpy_vectorize.py
@@ -159,3 +159,45 @@ def test_trivial_broadcasting():
     assert vectorized_func(1, y2, 1).flags.f_contiguous
     assert vectorized_func(z1[1::4, 1::4], y2, 1).flags.f_contiguous
     assert vectorized_func(y1[1::4, 1::4], z2, 1).flags.c_contiguous
+
+
+def test_passthrough_arguments(doc):
+    from pybind11_tests import vec_passthrough, NonPODClass
+
+    assert doc(vec_passthrough) == (
+        "vec_passthrough("
+        "arg0: float, arg1: numpy.ndarray[float64], arg2: numpy.ndarray[float64], "
+        "arg3: numpy.ndarray[int32], arg4: int, arg5: m.NonPODClass, arg6: numpy.ndarray[float64]"
+        ") -> object")
+
+    b = np.array([[10, 20, 30]], dtype='float64')
+    c = np.array([100, 200])  # NOT a vectorized argument
+    d = np.array([[1000], [2000], [3000]], dtype='int')
+    g = np.array([[1000000, 2000000, 3000000]], dtype='int')  # requires casting
+    assert np.all(
+        vec_passthrough(1, b, c, d, 10000, NonPODClass(100000), g) ==
+        np.array([[1111111, 2111121, 3111131],
+                  [1112111, 2112121, 3112131],
+                  [1113111, 2113121, 3113131]]))
+
+
+def test_method_vectorization():
+    from pybind11_tests import VectorizeTestClass
+
+    o = VectorizeTestClass(3)
+    x = np.array([1, 2], dtype='int')
+    y = np.array([[10], [20]], dtype='float32')
+    assert np.all(o.method(x, y) == [[14, 15], [24, 25]])
+
+
+def test_array_collapse():
+    from pybind11_tests import vectorized_func
+
+    assert not isinstance(vectorized_func(1, 2, 3), np.ndarray)
+    assert not isinstance(vectorized_func(np.array(1), 2, 3), np.ndarray)
+    z = vectorized_func([1], 2, 3)
+    assert isinstance(z, np.ndarray)
+    assert z.shape == (1, )
+    z = vectorized_func(1, [[[2]]], 3)
+    assert isinstance(z, np.ndarray)
+    assert z.shape == (1, 1, 1)