Use rocblas_gemm_ex for batched gemms with broadcasted B (#1354)

turneram · web-flow · commit a10a8ef11824 · 2022-09-13T17:12:14.000-04:00
Improves performance for 4/6 GEMMs used by huggingface BERT models with batch_size > 1 by using a non-batched rocBLAS call for GEMMs where the B input has a broadcasted batch dimension.
The four verify tests added reflect the actual configurations used by bert-base-cased, with varied batch sizes.

Also adds a matcher to simplify_reshapes to move multibroadcasts after concats.
diff --git a/src/simplify_reshapes.cpp b/src/simplify_reshapes.cpp
@@ -271,6 +271,44 @@ struct find_nested_slice
     }
 };
 
+struct find_concat_multibroadcasts
+{
+    auto matcher() const
+    {
+        return match::name("concat")(match::all_of[match::inputs()](match::name("multibroadcast")));
+    }
+
+    // Rewrites concat(multibroadcast(x), ...) into multibroadcast(concat(x, ...)).
+    void apply(module& m, const match::matcher_result& mr) const
+    {
+        auto ins      = mr.result;
+        auto op       = any_cast<op::concat>(ins->get_operator());
+        auto out_lens = ins->get_shape().lens();
+        auto inputs   = ins->inputs();
+
+        // Only apply when concat axis is not a broadcasted dimension
+        if(std::any_of(inputs.begin(), inputs.end(), [&](auto i) {
+               return i->get_shape().strides()[op.axis] == 0;
+           }))
+        {
+            return;
+        }
+
+        // Use inputs of multibroadcast ops as inputs to new concat op
+        std::transform(inputs.begin(), inputs.end(), inputs.begin(), [](auto i) {
+            return i->inputs().front();
+        });
+
+        // Reduce axis by the number of prepended dims only; broadcasted size-1 dims keep their slot
+        const auto in_rank = inputs.front()->get_shape().lens().size();
+        op.axis -= static_cast<decltype(op.axis)>(out_lens.size() - in_rank);
+
+        auto concat = m.insert_instruction(ins, op, inputs);
+        m.replace_instruction(
+            ins, migraphx::make_op("multibroadcast", {{"out_lens", out_lens}}), concat);
+    }
+};
+
 struct find_concat_transpose
 {
     auto matcher() const
@@ -764,6 +802,7 @@ void simplify_reshapes::apply(module& m) const
                             find_reshaper{},
                             find_transpose{},
                             find_concat_transpose{},
+                            find_concat_multibroadcasts{},
                             find_nested_convert{},
                             find_nested_slice{},
                             find_nested_concat{},
diff --git a/src/targets/gpu/gemm_impl.cpp b/src/targets/gpu/gemm_impl.cpp
@@ -176,8 +176,13 @@ void gemm_impl(context& ctx,
 
         auto num_matrices = std::accumulate(
             out_lens.rbegin() + 2, out_lens.rend(), std::size_t{1}, std::multiplies<std::size_t>());
-        if(num_matrices == 1)
+        if(num_matrices == 1 or (num_matrices > 1 and get_batch_stride(args[1]) == 0))
         {
+            // If the batch dimension of B is broadcasted, then we can
+            // multiply m by the batch_size and use rocblas_gemm_ex
+            // instead of rocblas_gemm_strided_batched_ex.
+            m *= num_matrices;
+
             // the rocblas_gemm API handles inputs and output matrices as
             // column-major format. When doing a C = A * B, we actually do
             // C^T = (B^T) * (A^T). That is the reason we input args[1] as
diff --git a/test/simplify_reshapes_test.cpp b/test/simplify_reshapes_test.cpp
@@ -48,6 +48,26 @@ inline std::vector<std::vector<std::size_t>> to_lens(const std::vector<migraphx:
     return result;
 }
 
+migraphx::module make_concat_multibroadcast(const std::vector<size_t>& in_lens,
+                                            const std::vector<size_t>& mbcast_lens,
+                                            const int axis)
+{
+    // Test fixture: three float parameters of shape in_lens are each multibroadcast to
+    // mbcast_lens and then concatenated along axis; the concat is the module's return value.
+    migraphx::module m;
+    const migraphx::shape param_shape{migraphx::shape::float_type, in_lens};
+    const auto broadcast = migraphx::make_op("multibroadcast", {{"out_lens", mbcast_lens}});
+    // Create every parameter first, then the broadcasts, then the concat.
+    auto x  = m.add_parameter("x", param_shape);
+    auto y  = m.add_parameter("y", param_shape);
+    auto z  = m.add_parameter("z", param_shape);
+    auto bx = m.add_instruction(broadcast, x);
+    auto by = m.add_instruction(broadcast, y);
+    auto bz = m.add_instruction(broadcast, z);
+    m.add_return({m.add_instruction(migraphx::make_op("concat", {{"axis", axis}}), bx, by, bz)});
+    return m;
+}
+
 TEST_CASE(double_contig)
 {
     migraphx::program p;
@@ -337,6 +357,87 @@ TEST_CASE(nop_convert)
     EXPECT(std::distance(m.begin(), m.end()) == n - 1);
 }
 
+TEST_CASE(concat_multibroadcasts1)
+{
+    // A {3, 4} input gains a leading batch dim, so concat axis 2 is expected to become axis 1
+    // after the pass, with a multibroadcast placed directly behind the new concat.
+    auto m = make_concat_multibroadcast({3, 4}, {2, 3, 4}, 2);
+    const auto lens_before  = m.get_output_shapes().back().lens();
+    const auto count_before = std::distance(m.begin(), m.end());
+    run_pass(m);
+    const auto named = [&](const char* name) {
+        return std::find_if(m.begin(), m.end(), [&](const auto& i) { return i.name() == name; });
+    };
+
+    EXPECT(m.get_output_shapes().back().lens() == lens_before);
+    // The module must end up two instructions shorter than before the pass.
+    EXPECT(std::distance(m.begin(), m.end()) == count_before - 2);
+    const auto concat_it = named("concat");
+    EXPECT(bool{concat_it != m.end()});
+    const auto bcast_it = named("multibroadcast");
+    // The first multibroadcast must sit immediately after the first concat.
+    EXPECT(std::distance(m.begin(), concat_it) == std::distance(m.begin(), bcast_it) - 1);
+    EXPECT(migraphx::any_cast<migraphx::op::concat>(concat_it->get_operator()).axis == 1);
+}
+
+TEST_CASE(concat_multibroadcasts2)
+{
+    // Same-rank broadcast of the middle dim ({3, 1, 4} -> {3, 2, 4}) with the concat axis (0)
+    // in front of the broadcasted dim: the axis is expected to stay 0 after the pass.
+    auto m = make_concat_multibroadcast({3, 1, 4}, {3, 2, 4}, 0);
+    const auto lens_before  = m.get_output_shapes().back().lens();
+    const auto count_before = std::distance(m.begin(), m.end());
+    run_pass(m);
+    const auto named = [&](const char* name) {
+        return std::find_if(m.begin(), m.end(), [&](const auto& i) { return i.name() == name; });
+    };
+
+    EXPECT(m.get_output_shapes().back().lens() == lens_before);
+    // The module must end up two instructions shorter than before the pass.
+    EXPECT(std::distance(m.begin(), m.end()) == count_before - 2);
+    const auto concat_it = named("concat");
+    EXPECT(bool{concat_it != m.end()});
+    const auto bcast_it = named("multibroadcast");
+    // The first multibroadcast must sit immediately after the first concat.
+    EXPECT(std::distance(m.begin(), concat_it) == std::distance(m.begin(), bcast_it) - 1);
+    EXPECT(migraphx::any_cast<migraphx::op::concat>(concat_it->get_operator()).axis == 0);
+}
+
+TEST_CASE(concat_multibroadcasts3)
+{
+    // Same-rank broadcast of the middle dim ({3, 1, 4} -> {3, 2, 4}) with the concat axis (2)
+    // behind the broadcasted dim: the axis is expected to stay 2 after the pass.
+    auto m = make_concat_multibroadcast({3, 1, 4}, {3, 2, 4}, 2);
+    const auto lens_before  = m.get_output_shapes().back().lens();
+    const auto count_before = std::distance(m.begin(), m.end());
+    run_pass(m);
+    const auto named = [&](const char* name) {
+        return std::find_if(m.begin(), m.end(), [&](const auto& i) { return i.name() == name; });
+    };
+
+    EXPECT(m.get_output_shapes().back().lens() == lens_before);
+    // The module must end up two instructions shorter than before the pass.
+    EXPECT(std::distance(m.begin(), m.end()) == count_before - 2);
+    const auto concat_it = named("concat");
+    EXPECT(bool{concat_it != m.end()});
+    const auto bcast_it = named("multibroadcast");
+    // The first multibroadcast must sit immediately after the first concat.
+    EXPECT(std::distance(m.begin(), concat_it) == std::distance(m.begin(), bcast_it) - 1);
+    EXPECT(migraphx::any_cast<migraphx::op::concat>(concat_it->get_operator()).axis == 2);
+}
+
+TEST_CASE(concat_multibroadcasts4)
+{
+    // The concat axis (0) is the batch dim introduced by the multibroadcast, i.e. a broadcasted
+    // dimension, so the pass is expected to leave the module exactly as it was built.
+    auto m = make_concat_multibroadcast({3, 4}, {2, 3, 4}, 0);
+
+    // Snapshot the module before the pass for the equality check below.
+    const auto untouched = m;
+    run_pass(m);
+    EXPECT(untouched == m);
+}
+
 TEST_CASE(concat_transpose1)
 {
     migraphx::module m;
diff --git a/test/verify/test_unbatched_gemm_1.cpp b/test/verify/test_unbatched_gemm_1.cpp
@@ -0,0 +1,58 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "verify_program.hpp"
+#include <migraphx/program.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/apply_alpha_beta.hpp>
+struct test_unbatched_gemm_1 : verify_program<test_unbatched_gemm_1>
+{
+    // Feeds a dot op to add_apply_alpha_beta (alpha = beta = 1) with a {4, 384, 768} parameter,
+    // a concat (axis 2) of three batch-broadcast {768, 768} literals, and a {4, 384, 2304} input.
+    migraphx::program create_program() const
+    {
+        migraphx::program p;
+        auto* mm = p.get_main_module();
+        const migraphx::shape a_shape{migraphx::shape::float_type, {4, 384, 768}};
+        const migraphx::shape w_shape{migraphx::shape::float_type, {768, 768}};
+        const migraphx::shape c_shape{migraphx::shape::float_type, {4, 384, 2304}};
+        // Every literal is expanded with the same multibroadcast to a batch of 4.
+        const auto batched = migraphx::make_op("multibroadcast", {{"out_lens", {4, 768, 768}}});
+
+        auto a  = mm->add_parameter("1", a_shape);
+        auto w0 = mm->add_literal(migraphx::generate_literal(w_shape));
+        auto b0 = mm->add_instruction(batched, w0);
+        auto w1 = mm->add_literal(migraphx::generate_literal(w_shape));
+        auto b1 = mm->add_instruction(batched, w1);
+        auto w2 = mm->add_literal(migraphx::generate_literal(w_shape));
+        auto b2 = mm->add_instruction(batched, w2);
+        auto b  = mm->add_instruction(migraphx::make_op("concat", {{"axis", 2}}), b0, b1, b2);
+        auto c  = mm->add_parameter("3", c_shape);
+
+        // The trailing arguments are alpha and beta, both 1.
+        migraphx::add_apply_alpha_beta(*mm, {a, b, c}, migraphx::make_op("dot"), 1.0f, 1.0f);
+        return p;
+    }
+};
diff --git a/test/verify/test_unbatched_gemm_2.cpp b/test/verify/test_unbatched_gemm_2.cpp
@@ -0,0 +1,46 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "verify_program.hpp"
+#include <migraphx/program.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/make_op.hpp>
+#include <migraphx/apply_alpha_beta.hpp>
+struct test_unbatched_gemm_2 : verify_program<test_unbatched_gemm_2>
+{
+    // Dot of a {4, 384, 768} parameter with a {768, 768} literal multibroadcast to 4 batches.
+    migraphx::program create_program() const
+    {
+        migraphx::program p;
+        auto* mm = p.get_main_module();
+        const migraphx::shape a_shape{migraphx::shape::float_type, {4, 384, 768}};
+        const migraphx::shape w_shape{migraphx::shape::float_type, {768, 768}};
+        auto a = mm->add_parameter("1", a_shape);
+        auto w = mm->add_literal(migraphx::generate_literal(w_shape));
+        auto b = mm->add_instruction(
+            migraphx::make_op("multibroadcast", {{"out_lens", {4, 768, 768}}}), w);
+        mm->add_instruction(migraphx::make_op("dot"), a, b);
+        return p;
+    }
+};