implement views::concat (P2542) #2


Merged
merged 3,758 commits into main from concat on Dec 22, 2024
Conversation

@changkhothuychung (Owner) commented Nov 17, 2024


github-actions bot commented Nov 17, 2024

⚠️ C/C++ code formatter, clang-format found issues in your code. ⚠️

You can test this locally with the following command:
git-clang-format --diff 497b2ebb9edcfd5315586b796f47589e9820b4b9 bf9d232583ff7e6ad7bbb9e554a826bba5d529cb -- libcxx/include/__ranges/concat_view.h
View the diff from clang-format here.
diff --git a/libcxx/include/__ranges/concat_view.h b/libcxx/include/__ranges/concat_view.h
index 78bf3f4c5..fdc4e98c6 100644
--- a/libcxx/include/__ranges/concat_view.h
+++ b/libcxx/include/__ranges/concat_view.h
@@ -48,52 +48,40 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-//#if _LIBCPP_STD_VER >= 20
+// #if _LIBCPP_STD_VER >= 20
 
 namespace ranges {
 
+template <class View, class... Views>
+struct last_view : last_view<Views...> {};
 
-template<class View, class... Views> 
-struct last_view : last_view<Views...>
-{
-
-}; 
-
-template<class View>
-struct last_view<View> 
-{
-    using type = View; 
+template <class View>
+struct last_view<View> {
+  using type = View;
 };
 
 template <class Ref, class RRef, class It>
-concept concat_indirectly_readable_impl =  
-requires (const It it) { 
+concept concat_indirectly_readable_impl = requires(const It it) {
   { *it } -> convertible_to<Ref>;
   { ranges::iter_move(it) } -> convertible_to<RRef>;
 };
 
-
 template <class... Rs>
-using concat_reference_t = common_reference_t<range_reference_t<Rs>...>; 
+using concat_reference_t = common_reference_t<range_reference_t<Rs>...>;
 
 template <class... Rs>
-using concat_value_t = common_type_t<range_value_t<Rs>...>; 
+using concat_value_t = common_type_t<range_value_t<Rs>...>;
 
 template <class... Rs>
-using concat_rvalue_reference_t = common_reference_t<range_rvalue_reference_t<Rs>...>; 
+using concat_rvalue_reference_t = common_reference_t<range_rvalue_reference_t<Rs>...>;
 
 template <class... Rs>
-concept concat_indirectly_readable = 
-  common_reference_with<concat_reference_t<Rs...>&&, 
-                        concat_value_t<Rs...>&> &&
-  common_reference_with<concat_reference_t<Rs...>&&, 
-                        concat_rvalue_reference_t<Rs...>&&> &&
-  common_reference_with<concat_rvalue_reference_t<Rs...>&&, 
-                        concat_value_t<Rs...> const&> &&
-  (concat_indirectly_readable_impl<concat_reference_t<Rs...>, 
-                                   concat_rvalue_reference_t<Rs...>, 
-                                   iterator_t<Rs>> && ...);
-
+concept concat_indirectly_readable =
+    common_reference_with<concat_reference_t<Rs...>&&, concat_value_t<Rs...>&> &&
+    common_reference_with<concat_reference_t<Rs...>&&, concat_rvalue_reference_t<Rs...>&&> &&
+    common_reference_with<concat_rvalue_reference_t<Rs...>&&, concat_value_t<Rs...> const&> &&
+    (concat_indirectly_readable_impl<concat_reference_t<Rs...>, concat_rvalue_reference_t<Rs...>, iterator_t<Rs>> &&
+     ...);
 
 template <class... Rs>
 concept concatable = requires { // exposition only
@@ -103,301 +91,274 @@ concept concatable = requires { // exposition only
 } && concat_indirectly_readable<Rs...>;
 
 template <bool Const, class... Rs>
-concept concat_is_random_access = 
-   all_random_access<Const, Rs...> &&
-   (sized_range<maybe_const<Const, Rs>> && ...);
+concept concat_is_random_access = all_random_access<Const, Rs...> && (sized_range<maybe_const<Const, Rs>> && ...);
 
 template <class R>
-concept constant_time_reversible =          // exposition only
-(bidirectional_range<R> && common_range<R>) ||
-(sized_range<R> && random_access_range<R>);
+concept constant_time_reversible = // exposition only
+    (bidirectional_range<R> && common_range<R>) || (sized_range<R> && random_access_range<R>);
 
 template <bool Const, class... Rs>
-concept concat_is_bidirectional = 
-    (bidirectional_range<maybe_const<Const, V>> 
-    && ... && 
-    constant-time-reversible<maybe_const<Const, Fs>>);
-
+concept concat_is_bidirectional =
+    (bidirectional_range<maybe_const<Const, V>> && ... && constant - time - reversible<maybe_const<Const, Fs>>);
 
 template <input_range... Views>
-    requires (view<Views> && ...) && (sizeof...(Views) > 0) &&
-              concatable<Views...>
-class concat_view : public view_interface<concat_view<Views...>> 
-{
-    tuple<Views...> views_;                   // exposition only
+  requires(view<Views> && ...) && (sizeof...(Views) > 0) && concatable<Views...>
+class concat_view : public view_interface<concat_view<Views...>> {
+  tuple<Views...> views_; // exposition only
 
-    template <bool Const>
-    class iterator;                           // exposition only
-    class sentinel; 
-
-    public:
+  template <bool Const>
+  class iterator; // exposition only
+  class sentinel;
 
-    constexpr concat_view() = default;
+public:
+  constexpr concat_view() = default;
 
-    constexpr explicit concat_view(Views... views): tuple(views...) {}
+  constexpr explicit concat_view(Views... views) : tuple(views...) {}
 
-    constexpr iterator<false> begin() requires(!(simple-view<Views> && ...))
-    {
-        iterator<false> it(this, in_place_index<0>, ranges::begin(get<0>(views_)));
-        it.template satisfy<0>();
-        return it;
-    }
+  constexpr iterator<false> begin()
+    requires(!(simple - view<Views> && ...))
+  {
+    iterator<false> it(this, in_place_index<0>, ranges::begin(get<0>(views_)));
+    it.template satisfy<0>();
+    return it;
+  }
 
-    constexpr iterator<true> begin() const
+  constexpr iterator<true> begin() const
     requires((range<const Views> && ...) && concatable<const Views...>)
-    {
-        iterator<true> it(this, in_place_index<0>, ranges::begin(get<0>(views_)));
-        it.template satisfy<0>();
-        return it;
+  {
+    iterator<true> it(this, in_place_index<0>, ranges::begin(get<0>(views_)));
+    it.template satisfy<0>();
+    return it;
+  }
+
+  constexpr auto end()
+    requires(!(simple - view<Views> && ...))
+  {
+    if constexpr (common_range<last_view<Views...>::type>) {
+      // last_view to be implemented
+      constexpr auto N = sizeof...(Views);
+      return iterator<false>(this, in_place_index<N - 1>, ranges::end(get<N - 1>(views_)));
+    } else {
+      return default_sentinel;
     }
-
-    constexpr auto end() requires(!(simple-view<Views> && ...))
-    {
-        if constexpr (common_range<last_view<Views...>::type>) {
-            // last_view to be implemented 
-            constexpr auto N = sizeof...(Views);
-            return iterator<false>(this, in_place_index<N - 1>, 
-                            ranges::end(get<N - 1>(views_)));
-        } else {
-            return default_sentinel;
-        }
-    }
-
-    constexpr auto end() const requires(range<const Views>&&...)
-    {
-        if constexpr (common_range<last_view>) {
-            // last_view to be implemented 
-            constexpr auto N = sizeof...(Views);
-            return iterator<true>(this, in_place_index<N - 1>, 
-                            ranges::end(get<N - 1>(views_)));
-        } else {
-            return default_sentinel;
-        }
+  }
+
+  constexpr auto end() const
+    requires(range<const Views> && ...)
+  {
+    if constexpr (common_range<last_view>) {
+      // last_view to be implemented
+      constexpr auto N = sizeof...(Views);
+      return iterator<true>(this, in_place_index<N - 1>, ranges::end(get<N - 1>(views_)));
+    } else {
+      return default_sentinel;
     }
-
-    constexpr auto size() requires(sized_range<Views>&&...)
-    {
-        return apply([](auto... sizes) {
-            using CT = make_unsigned_like_t<common_type_t<decltype(sizes)...>>;
-            return (CT(sizes) + ...);
+  }
+
+  constexpr auto size()
+    requires(sized_range<Views> && ...)
+  {
+    return apply(
+        [](auto... sizes) {
+          using CT = make_unsigned_like_t<common_type_t<decltype(sizes)...>>;
+          return (CT(sizes) + ...);
         },
         tuple_transform(ranges::size, views_));
-    }
-
-    constexpr auto size() const requires(sized_range<const Views>&&...)
-    {
-        return apply([](auto... sizes) {
-            using CT = make_unsigned_like_t<common_type_t<decltype(sizes)...>>;
-            return (CT(sizes) + ...);
+  }
+
+  constexpr auto size() const
+    requires(sized_range<const Views> && ...)
+  {
+    return apply(
+        [](auto... sizes) {
+          using CT = make_unsigned_like_t<common_type_t<decltype(sizes)...>>;
+          return (CT(sizes) + ...);
         },
         tuple_transform(ranges::size, views_));
-    }
-  
+  }
 };
 
-
-
-// begin class iterator 
-
+// begin class iterator
 
 template <input_range... Views>
-    requires (view<Views> && ...) && (sizeof...(Views) > 0) &&
-              concatable<Views...>
-  template <bool Const>
-  class concat_view<Views...>::iterator {
-  
-  public:
-    //using iterator_category = see below;                  // not always present.
-    using iterator_concept = _If<random_access_range<_View...>,
-                                random_access_iterator_tag,
-                                _If<bidirectional_range<_View...>,
-                                    bidirectional_iterator_tag,
-                                    _If<forward_range<_View...>,
-                                        forward_iterator_tag,
-                                        /* else */ input_iterator_tag 
-                                        >
-                                    >
-                                >;
-    //using value_type = concat-value-t<maybe-const<Const, Views>...>;
-    //using difference_type = common_type_t<range_difference_t<maybe-const<Const, Views>>...>;
-
-  private:
-    using base_iter =                                     // exposition only
+  requires(view<Views> && ...) && (sizeof...(Views) > 0) && concatable<Views...>
+template <bool Const>
+class concat_view<Views...>::iterator {
+public:
+  // using iterator_category = see below;                  // not always present.
+  using iterator_concept =
+      _If<random_access_range<_View...>,
+          random_access_iterator_tag,
+          _If<bidirectional_range<_View...>,
+              bidirectional_iterator_tag,
+              _If<forward_range<_View...>,
+                  forward_iterator_tag,
+                  /* else */ input_iterator_tag > > >;
+  // using value_type = concat-value-t<maybe-const<Const, Views>...>;
+  // using difference_type = common_type_t<range_difference_t<maybe-const<Const, Views>>...>;
+
+private:
+  using base_iter = // exposition only
       variant<iterator_t<maybe_const<Const, Views>>...>;
-    
-    maybe_const<Const, concat_view>* parent_ = nullptr;   // exposition only
-    base_iter it_;                                        // exposition only
-
-    template <std::size_t N>
-    constexpr void satisfy()
-    {
-        if constexpr (N < (sizeof...(Views) - 1)) {
-            if (get<N>(it_) == ranges::end(get<N>(parent_->views_))) {
-                it_.template emplace<N + 1>(ranges::begin(get<N + 1>(parent_->views_)));
-                satisfy<N + 1>();
-            }
-        }
-    }
 
-    template <std::size_t N>
-    constexpr void prev()
-    {
-        if constexpr (N == 0) {
-            --get<0>(it_);
-        } else {
-            if (get<N>(it_) == ranges::begin(get<N>(parent_->views_))) {
-                using prev_view = maybe-const<Const, tuple_element_t<N - 1, tuple<Views...>>>;
-                if constexpr (common_range<prev_view>) {
-                    it_.template emplace<N - 1>(ranges::end(get<N - 1>(parent_->views_)));
-                } else {
-                    it_.template emplace<N - 1>(
-                        ranges::next(ranges::begin(get<N - 1>(parent_->views_)),
-                                    ranges::size(get<N - 1>(parent_->views_))));
-                }
-                prev<N - 1>();
-            } else {
-                --get<N>(it_);
-            }
-        }
-    }
+  maybe_const<Const, concat_view>* parent_ = nullptr; // exposition only
+  base_iter it_;                                      // exposition only
 
-    template <std::size_t N>
-    constexpr void advance_fwd(difference_type offset, difference_type steps)
-    {
-        using underlying_diff_type = iter_difference_t<variant_alternative_t<N, base-iter>>;
-        if constexpr (N == sizeof...(Views) - 1) {
-            get<N>(it_) += static_cast<underlying_diff_type>(steps);
-        } 
-        else {
-            auto n_size = ranges::distance(get<N>(parent_->views_));
-            if (offset + steps < n_size) {
-                get<N>(it_) += static_cast<underlying_diff_type>(steps);
-            } else {
-                it_.template emplace<N + 1>(ranges::begin(get<N + 1>(parent_->views_)));
-                advance-fwd<N + 1>(0, offset + steps - n_size);
-            }
-        }
+  template <std::size_t N>
+  constexpr void satisfy() {
+    if constexpr (N < (sizeof...(Views) - 1)) {
+      if (get<N>(it_) == ranges::end(get<N>(parent_->views_))) {
+        it_.template emplace<N + 1>(ranges::begin(get<N + 1>(parent_->views_)));
+        satisfy<N + 1>();
+      }
     }
-
-    template <std::size_t N>
-    constexpr void advance_bwd(difference_type offset, difference_type steps)
-    {
-        using underlying_diff_type = iter_difference_t<variant_alternative_t<N, base-iter>>;
-        if constexpr (N == 0) {
-            get<N>(it_) -= static_cast<underlying_diff_type>(steps);
+  }
+
+  template <std::size_t N>
+  constexpr void prev() {
+    if constexpr (N == 0) {
+      --get<0>(it_);
+    } else {
+      if (get<N>(it_) == ranges::begin(get<N>(parent_->views_))) {
+        using prev_view = maybe - const<Const, tuple_element_t<N - 1, tuple<Views...>>>;
+        if constexpr (common_range<prev_view>) {
+          it_.template emplace<N - 1>(ranges::end(get<N - 1>(parent_->views_)));
         } else {
-            if (offset >= steps) {
-                get<N>(it_) -= static_cast<underlying_diff_type>(steps);
-            } else {
-                auto prev_size = ranges::distance(get<N - 1>(parent_->views_));
-                it_.template emplace<N - 1>(ranges::begin(get<N - 1>(parent_->views_)) + prev_size);
-                advance-bwd<N - 1>(prev_size, steps - offset);
-            }
+          it_.template emplace<N - 1>(
+              ranges::next(ranges::begin(get<N - 1>(parent_->views_)), ranges::size(get<N - 1>(parent_->views_))));
         }
+        prev<N - 1>();
+      } else {
+        --get<N>(it_);
+      }
+    }
+  }
+
+  template <std::size_t N>
+  constexpr void advance_fwd(difference_type offset, difference_type steps) {
+    using underlying_diff_type = iter_difference_t<variant_alternative_t<N, base - iter>>;
+    if constexpr (N == sizeof...(Views) - 1) {
+      get<N>(it_) += static_cast<underlying_diff_type>(steps);
+    } else {
+      auto n_size = ranges::distance(get<N>(parent_->views_));
+      if (offset + steps < n_size) {
+        get<N>(it_) += static_cast<underlying_diff_type>(steps);
+      } else {
+        it_.template emplace<N + 1>(ranges::begin(get<N + 1>(parent_->views_)));
+        advance - fwd<N + 1>(0, offset + steps - n_size);
+      }
+    }
+  }
+
+  template <std::size_t N>
+  constexpr void advance_bwd(difference_type offset, difference_type steps) {
+    using underlying_diff_type = iter_difference_t<variant_alternative_t<N, base - iter>>;
+    if constexpr (N == 0) {
+      get<N>(it_) -= static_cast<underlying_diff_type>(steps);
+    } else {
+      if (offset >= steps) {
+        get<N>(it_) -= static_cast<underlying_diff_type>(steps);
+      } else {
+        auto prev_size = ranges::distance(get<N - 1>(parent_->views_));
+        it_.template emplace<N - 1>(ranges::begin(get<N - 1>(parent_->views_)) + prev_size);
+        advance - bwd<N - 1>(prev_size, steps - offset);
+      }
     }
+  }
 
-    template <class... Args>
-    explicit constexpr iterator(maybe-const<Const, concat_view>* parent, Args&&... args) 
-        requires constructible_from<base-iter, Args&&...> 
-        : it_(std::forward<Args>...), parent_(parent) {}
+  template <class... Args>
+  explicit constexpr iterator(maybe - const<Const, concat_view>* parent, Args&&... args)
+    requires constructible_from<base - iter, Args&&...>
+      : it_(std::forward<Args>...), parent_(parent) {}
 
-  public:
+public:
+  iterator() = default;
 
-    iterator() = default;
+  constexpr iterator(iterator<!Const> i)
+    requires Const && (convertible_to<iterator_t<Views>, iterator_t<const Views>> && ...)
+      : it_(std::move(i.it_)), parent_(i.parent_) {}
 
-    constexpr iterator(iterator<!Const> i) 
-        requires Const && (convertible_to<iterator_t<Views>, iterator_t<const Views>> && ...)
-        : it_(std::move(i.it_)), parent_(i.parent_) {}
+  constexpr decltype(auto) operator*() const {
+    using reference = concat_reference_t<maybe_const<Const, Views>...>;
+    return std::visit([](auto&& it) -> reference { return *it; }, it_);
+  }
 
-    constexpr decltype(auto) operator*() const
-    {
-        using reference = concat_reference_t<maybe_const<Const, Views>...>;
-        return std::visit([](auto&& it) -> reference { return *it; }, it_); 
-    }
+  constexpr iterator& operator++() {
+    constexpr auto i = it_.index();
+    ++get<i>(it_);
+    satisfy<i>();
+    return *this;
+  }
 
-    constexpr iterator& operator++()
-    {
-        constexpr auto i = it_.index(); 
-        ++get<i>(it_);
-        satisfy<i>();
-        return *this;
-    }
+  constexpr void operator++(int) { ++*this; }
 
-    constexpr void operator++(int)
-    {
-        ++*this; 
-    }
+  /*
 
-    /*
+  constexpr iterator operator++(int)
+      requires all-forward<Const, Views...>;
 
-    constexpr iterator operator++(int) 
-        requires all-forward<Const, Views...>;
-    
-    constexpr iterator& operator--() 
-        requires concat-is-bidirectional<Const, Views...>;
+  constexpr iterator& operator--()
+      requires concat-is-bidirectional<Const, Views...>;
 
-    constexpr iterator operator--(int) 
-        requires concat-is-bidirectional<Const, Views...>;
+  constexpr iterator operator--(int)
+      requires concat-is-bidirectional<Const, Views...>;
 
-    constexpr iterator& operator+=(difference_type n) 
-        requires concat-is-random-access<Const, Views...>;
+  constexpr iterator& operator+=(difference_type n)
+      requires concat-is-random-access<Const, Views...>;
 
-    constexpr iterator& operator-=(difference_type n) 
-        requires concat-is-random-access<Const, Views...>;
+  constexpr iterator& operator-=(difference_type n)
+      requires concat-is-random-access<Const, Views...>;
 
-    constexpr decltype(auto) operator[](difference_type n) const
-        requires concat-is-random-access<Const, Views...>;
+  constexpr decltype(auto) operator[](difference_type n) const
+      requires concat-is-random-access<Const, Views...>;
 
-    friend constexpr bool operator==(const iterator& x, const iterator& y)
-        requires(equality_comparable<iterator_t<maybe-const<Const, Views>>>&&...);
+  friend constexpr bool operator==(const iterator& x, const iterator& y)
+      requires(equality_comparable<iterator_t<maybe-const<Const, Views>>>&&...);
 
-    friend constexpr bool operator==(const iterator& it, default_sentinel_t);
+  friend constexpr bool operator==(const iterator& it, default_sentinel_t);
 
-    friend constexpr bool operator<(const iterator& x, const iterator& y)
-        requires all-random-access<Const, Views...>;
+  friend constexpr bool operator<(const iterator& x, const iterator& y)
+      requires all-random-access<Const, Views...>;
 
-    friend constexpr bool operator>(const iterator& x, const iterator& y)
-        requires all-random-access<Const, Views...>;
+  friend constexpr bool operator>(const iterator& x, const iterator& y)
+      requires all-random-access<Const, Views...>;
 
-    friend constexpr bool operator<=(const iterator& x, const iterator& y)
-        requires all-random-access<Const, Views...>;
+  friend constexpr bool operator<=(const iterator& x, const iterator& y)
+      requires all-random-access<Const, Views...>;
 
-    friend constexpr bool operator>=(const iterator& x, const iterator& y)
-        requires all-random-access<Const, Views...>;
+  friend constexpr bool operator>=(const iterator& x, const iterator& y)
+      requires all-random-access<Const, Views...>;
 
-    friend constexpr auto operator<=>(const iterator& x, const iterator& y)
-        requires (all-random-access<Const, Views...> &&
-         (three_way_comparable<maybe-const<Const, Views>> &&...));
+  friend constexpr auto operator<=>(const iterator& x, const iterator& y)
+      requires (all-random-access<Const, Views...> &&
+       (three_way_comparable<maybe-const<Const, Views>> &&...));
 
-    friend constexpr iterator operator+(const iterator& it, difference_type n)
-        requires concat-is-random-access<Const, Views...>;
+  friend constexpr iterator operator+(const iterator& it, difference_type n)
+      requires concat-is-random-access<Const, Views...>;
 
-    friend constexpr iterator operator+(difference_type n, const iterator& it)
-        requires concat-is-random-access<Const, Views...>;
+  friend constexpr iterator operator+(difference_type n, const iterator& it)
+      requires concat-is-random-access<Const, Views...>;
 
-    friend constexpr iterator operator-(const iterator& it, difference_type n)
-        requires concat-is-random-access<Const, Views...>;
+  friend constexpr iterator operator-(const iterator& it, difference_type n)
+      requires concat-is-random-access<Const, Views...>;
 
-    friend constexpr difference_type operator-(const iterator& x, const iterator& y) 
-        requires concat-is-random-access<Const, Views...>;
+  friend constexpr difference_type operator-(const iterator& x, const iterator& y)
+      requires concat-is-random-access<Const, Views...>;
 
-    friend constexpr difference_type operator-(const iterator& x, default_sentinel_t) 
-        requires see below;
+  friend constexpr difference_type operator-(const iterator& x, default_sentinel_t)
+      requires see below;
 
-    friend constexpr difference_type operator-(default_sentinel_t, const iterator& x) 
-        requires see below;
+  friend constexpr difference_type operator-(default_sentinel_t, const iterator& x)
+      requires see below;
 
-    friend constexpr decltype(auto) iter_move(const iterator& it) noexcept(see below);
+  friend constexpr decltype(auto) iter_move(const iterator& it) noexcept(see below);
 
-    friend constexpr void iter_swap(const iterator& x, const iterator& y) noexcept(see below)
-        requires see below;
+  friend constexpr void iter_swap(const iterator& x, const iterator& y) noexcept(see below)
+      requires see below;
 
-    */
+  */
 };
 
-
-
-
 } // namespace ranges
 
 #endif // _LIBCPP_STD_VER >= 20

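For context, a minimal sketch of how the view this patch implements is meant to be used, assuming a conforming C++26 `std::views::concat` as specified by P2542 (illustrative only, not part of the patch or its tests):

```cpp
#include <array>
#include <iostream>
#include <ranges>
#include <vector>

int main() {
  std::vector<int> v{1, 2, 3};
  std::array<int, 2> a{4, 5};

  // concat presents the elements of v followed by those of a as one range;
  // its reference type is the common reference of the underlying ranges'
  // reference types (int& here).
  for (int x : std::views::concat(v, a))
    std::cout << x << ' '; // prints: 1 2 3 4 5
  std::cout << '\n';
}
```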
changkhothuychung pushed a commit that referenced this pull request Nov 17, 2024
… depobj construct (llvm#114221)

A codegen crash occurs when a depend object is initialized with
omp_all_memory in the depobj directive (llvm#114214).
The root cause of the issue looks to be improper handling of the
dependency list when omp_all_memory is specified.

The change introduces the use of OMPTaskDataTy to manage dependencies.
The buildDependences function is called to construct the dependency
list, and the list is iterated over to emit and store the dependencies.

Reduced Test Case:
```
#include <omp.h>

int main() {
  omp_depend_t obj;
#pragma omp depobj(obj) depend(inout: omp_all_memory)
}
```

```
 #1 0x0000000003de6623 SignalHandler(int) Signals.cpp:0:0
 #2 0x00007f8e4a6b990f (/lib64/libpthread.so.0+0x1690f)
 #3 0x00007f8e4a117d2a raise (/lib64/libc.so.6+0x4ad2a)
 llvm#4 0x00007f8e4a1193e4 abort (/lib64/libc.so.6+0x4c3e4)
 llvm#5 0x00007f8e4a10fc69 __assert_fail_base (/lib64/libc.so.6+0x42c69)
 llvm#6 0x00007f8e4a10fcf1 __assert_fail (/lib64/libc.so.6+0x42cf1)
 llvm#7 0x0000000004114367 clang::CodeGen::CodeGenFunction::EmitOMPDepobjDirective(clang::OMPDepobjDirective const&) (/opt/cray/pe/cce/18.0.1/cce-clang/x86_64/bin/clang-18+0x4114367)
 llvm#8 0x00000000040f8fac clang::CodeGen::CodeGenFunction::EmitStmt(clang::Stmt const*, llvm::ArrayRef<clang::Attr const*>) (/opt/cray/pe/cce/18.0.1/cce-clang/x86_64/bin/clang-18+0x40f8fac)
 llvm#9 0x00000000040ff4fb clang::CodeGen::CodeGenFunction::EmitCompoundStmtWithoutScope(clang::CompoundStmt const&, bool, clang::CodeGen::AggValueSlot) (/opt/cray/pe/cce/18.0.1/cce-clang/x86_64/bin/clang-18+0x40ff4fb)
llvm#10 0x00000000041847b2 clang::CodeGen::CodeGenFunction::EmitFunctionBody(clang::Stmt const*) (/opt/cray/pe/cce/18.0.1/cce-clang/x86_64/bin/clang-18+0x41847b2)
llvm#11 0x0000000004199e4a clang::CodeGen::CodeGenFunction::GenerateCode(clang::GlobalDecl, llvm::Function*, clang::CodeGen::CGFunctionInfo const&) (/opt/cray/pe/cce/18.0.1/cce-clang/x86_64/bin/clang-18+0x4199e4a)
llvm#12 0x00000000041f7b9d clang::CodeGen::CodeGenModule::EmitGlobalFunctionDefinition(clang::GlobalDecl, llvm::GlobalValue*) (/opt/cray/pe/cce/18.0.1/cce-clang/x86_64/bin/clang-18+0x41f7b9d)
llvm#13 0x00000000041f16a3 clang::CodeGen::CodeGenModule::EmitGlobalDefinition(clang::GlobalDecl, llvm::GlobalValue*) (/opt/cray/pe/cce/18.0.1/cce-clang/x86_64/bin/clang-18+0x41f16a3)
llvm#14 0x00000000041fd954 clang::CodeGen::CodeGenModule::EmitDeferred() (/opt/cray/pe/cce/18.0.1/cce-clang/x86_64/bin/clang-18+0x41fd954)
llvm#15 0x0000000004200277 clang::CodeGen::CodeGenModule::Release() (/opt/cray/pe/cce/18.0.1/cce-clang/x86_64/bin/clang-18+0x4200277)
llvm#16 0x00000000046b6a49 (anonymous namespace)::CodeGeneratorImpl::HandleTranslationUnit(clang::ASTContext&) ModuleBuilder.cpp:0:0
llvm#17 0x00000000046b4cb6 clang::BackendConsumer::HandleTranslationUnit(clang::ASTContext&) (/opt/cray/pe/cce/18.0.1/cce-clang/x86_64/bin/clang-18+0x46b4cb6)
llvm#18 0x0000000006204d5c clang::ParseAST(clang::Sema&, bool, bool) (/opt/cray/pe/cce/18.0.1/cce-clang/x86_64/bin/clang-18+0x6204d5c)
llvm#19 0x000000000496b278 clang::FrontendAction::Execute() (/opt/cray/pe/cce/18.0.1/cce-clang/x86_64/bin/clang-18+0x496b278)
llvm#20 0x00000000048dd074 clang::CompilerInstance::ExecuteAction(clang::FrontendAction&) (/opt/cray/pe/cce/18.0.1/cce-clang/x86_64/bin/clang-18+0x48dd074)
llvm#21 0x0000000004a38092 clang::ExecuteCompilerInvocation(clang::CompilerInstance*) (/opt/cray/pe/cce/18.0.1/cce-clang/x86_64/bin/clang-18+0x4a38092)
llvm#22 0x0000000000fd4e9c cc1_main(llvm::ArrayRef<char const*>, char const*, void*) (/opt/cray/pe/cce/18.0.1/cce-clang/x86_64/bin/clang-18+0xfd4e9c)
llvm#23 0x0000000000fcca73 ExecuteCC1Tool(llvm::SmallVectorImpl<char const*>&, llvm::ToolContext const&) driver.cpp:0:0
llvm#24 0x0000000000fd140c clang_main(int, char**, llvm::ToolContext const&) (/opt/cray/pe/cce/18.0.1/cce-clang/x86_64/bin/clang-18+0xfd140c)
llvm#25 0x0000000000ee2ef3 main (/opt/cray/pe/cce/18.0.1/cce-clang/x86_64/bin/clang-18+0xee2ef3)
llvm#26 0x00007f8e4a10224c __libc_start_main (/lib64/libc.so.6+0x3524c)
llvm#27 0x0000000000fcaae9 _start /home/abuild/rpmbuild/BUILD/glibc-2.31/csu/../sysdeps/x86_64/start.S:120:0
clang: error: unable to execute command: Aborted
```

---------

Co-authored-by: Chandra Ghale <[email protected]>
winner245 and others added 28 commits December 19, 2024 11:48
This PR simplifies the internal bitwise logic of the `flip()` function
for `vector<bool>`, and creates new tests to validate the changes.
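For reference, the user-visible behaviour of the member whose internals that change reworks; a plain usage sketch, unrelated to the PR's new tests:

```cpp
#include <cassert>
#include <vector>

int main() {
  std::vector<bool> v{true, false, true};
  v.flip(); // complements every element in place
  assert(!v[0] && v[1] && !v[2]);
}
```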
…120518)

TestFirmwareCorefiles.py has a helper utility,
create-empty-corefile.cpp, which creates corefiles with different
metadata to specify the binary that should be loaded. It normally uses
an actual binary's UUID for the metadata, and it uses the binary's
cputype/cpusubtype for the corefile's mach header.

There is one test where it creates a corefile with metadata for a UUID
that cannot be found -- it is given no binary -- and in that case, the
cputype/cpusubtype it sets in the core file mach header was
uninitialized data. Through luck, on Darwin systems, the uninitialized
data typically matched a CPU_TYPE from machine.h and the test would
work. But when the value doesn't match one of those defines, lldb would
reject the corefile entirely, and the test would fail. This has been an
infrequent failure on the CI bots for a while and I couldn't ever repro
it. There's a recent configuration where it was happening every time and
I was able to track it down.

rdar://141727563
…lvm#116462)"

This reverts commit 89da344.

Reason: buildbot breakages e.g., https://lab.llvm.org/buildbot/#/builders/55/builds/4556 (for which the reverted patch is the only code change)
…m#120536)

This allows us to write more range based for loops because we no
longer need the iterator. It also matches IR's Use class.
)" (llvm#120594)

This reverts commit e0526b0.

The `v_minmax/maxmin_f16`(GFX11) needs to be updated to t16 with
`v_minmax/maxmin_num_f16`(GFX12) together since they share the same
codegen pattern. Revert the old patch and resubmit
)

This can be used with /llvmlibthin to create thin archives without an
index, which is a prerequisite for porting
https://reviews.llvm.org/D117284 to lld-link.

Creating files like this is already possible with `llvm-ar rcS`, so this
doesn't add additional problems.
Add tests for horizontal add patterns with missing/undemanded elements, which typically prevent folding to the optimal (add (shuffle a, b), (shuffle a, b)) pattern.
**Note:** The register reading and writing depends on new register
flavor support in thread_get_state/thread_set_state in the kernel, which
will be first available in macOS 15.4.

The Apple M4 line of cores includes the Scalable Matrix Extension (SME)
feature. The M4s do not implement Scalable Vector Extension (SVE),
although the processor is in Streaming SVE Mode when the SME is being
used. The most obvious side effects of being in SSVE Mode are that (on
the M4 cores) NEON instructions cannot be used, and watchpoints may get
false positives, since the address comparisons are done at a lowered
granularity.

When SSVE mode is enabled, the kernel will provide the Streaming Vector
Length register, which is a maximum of 64 bytes with the M4. Also
provided are SVCR (with bits indicating if SSVE mode and SME mode are
enabled), TPIDR2, SVL. Then the SVE registers Z0..31 (SVL bytes long),
P0..15 (SVL/8 bytes), the ZA matrix register (SVL*SVL bytes), and the M4
supports SME2, so the ZT0 register (64 bytes).

When SSVE/SME are disabled, none of these registers are provided by the
kernel - reads and writes of them will fail.

Unlike Linux, lldb cannot modify the SVL through a thread_set_state
call, or change the processor state's SSVE/SME status. There is also no
way for a process to request a lowered SVL size today, so the work that
David did to handle VL/SVL changing while stepping through a process is
not an issue on Darwin today. But debugserver should be providing
everything necessary so we can reuse all of David's work on resizing the
register contexts in lldb if it happens in the future. debugserver
sends svl, svcr, and tpidr2 in the expedited registers when a thread
stops, if SSVE|SME mode are enabled (if the kernel allows it to read the
ARM_SME_STATE register set).

While the maximum SVL is 64 bytes on M4, the AArch64 maximum possible
SVL is 256; this would give us a 64k ZA register. If debugserver sized
all of its register contexts assuming the largest possible SVL, we could
easily use 2MB more memory for the register contexts of all threads in a
process -- and on iOS et al, processes must run within a small memory
allotment and this would push us over that.

Much of the work in debugserver was changing the arm64 register context
from being a static compile-time array of register sets, to being
initialized at runtime if debugserver is running on a machine with SME.
The ZA is only created to the machine's actual maximum SVL. The size of
the 32 SVE Z registers is less significant so I am statically allocating
those to the architecturally largest possible SVL value today.

Also, debugserver includes information about registers that share the
same part of the register file. e.g. S0 and D0 are the lower parts of
the NEON 128-bit V0 register. And when running on an SME machine, v0 is
the lower 128 bits of the SVE Z0 register. So the register maps used
when defining the VFP registers must differ depending on the
capabilities of the cpu at runtime.

I also changed register reading in debugserver, where formerly when
debugserver was asked to read a register, and the thread_get_state read
of that register failed, it would return all zeros. This is necessary
when constructing a `g` packet that gets all registers - because there
is no separation between register bytes, the offsets are fixed. But when
we are asking for a single register (e.g. Z0) when not in SSVE/SME mode,
this should return an error.

This does mean that when you're running on an SME-capable machine, but
not in SME mode, and do `register read -a`, lldb will report that 48 SVE
registers were unavailable and 5 SME registers were unavailable. But
that's only when `-a` is used.

The register reading and writing depends on new register flavor support
in thread_get_state/thread_set_state in the kernel, which is not yet in
a release. The test case I wrote is skipped on current OSes. I pilfered
the SME register setup from some of David's existing SME test files;
there were a few Linux specific details in those tests that they weren't
easy to reuse on Darwin.

rdar://121608074
…#120488)

This patch converts the profile for memprof_missing_leaf.ll to the
recently introduced YAML-based text format.
Note that PointerUnion::{is,get} have been soft deprecated in
PointerUnion.h:

  // FIXME: Replace the uses of is(), get() and dyn_cast() with
  //        isa<T>, cast<T> and the llvm::dyn_cast<T>

I'm not touching PointerUnion::dyn_cast for now because it's a bit
complicated; we could blindly migrate it to dyn_cast_if_present, but
we should probably use dyn_cast when the operand is known to be
non-null.
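A sketch of the migration pattern described above; `Decl` and `Stmt` are stand-in pointee types for illustration, not taken from the patch:

```cpp
#include "llvm/ADT/PointerUnion.h"

// Stand-in pointee types; they need alignment > 1 so the union has a spare
// low bit to discriminate the two alternatives.
struct Decl { int DeclBits; };
struct Stmt { int StmtBits; };

// Soft-deprecated members and their replacements:
//   PU.is<Decl *>()       ->  isa<Decl *>(PU)
//   PU.get<Decl *>()      ->  cast<Decl *>(PU)
//   PU.dyn_cast<Stmt *>() ->  dyn_cast<Stmt *>(PU), or
//                             dyn_cast_if_present<Stmt *>(PU) if PU may be null.
void migrate(llvm::PointerUnion<Decl *, Stmt *> PU) {
  if (llvm::isa<Decl *>(PU)) { // assumes PU is known to be non-null here
    Decl *D = llvm::cast<Decl *>(PU);
    (void)D;
  }
  if (auto *S = llvm::dyn_cast_if_present<Stmt *>(PU))
    (void)S;
}
```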
Prior to this patch, we required that all users had the same VL in order
to optimize. But as the FIXME said, we can use the largest VL to
optimize, as long as we can determine what the largest is. This patch
implements the FIXME.
This also removes the `RangesReleased` which doesn't give much insight
to whether we should adjust the heuristic of doing page release.
…no_overlap_error_icf.yaml (llvm#120330)

Fixing broken test - calling `sed` in a cross-platform compatible way. 
Verified to pass on Mac (which uses BSD sed).
adoptRef in WebKit constructs Ref/RefPtr so treat it as such in
isCtorOfRefCounted. Also removed the support for makeRef and makeRefPtr
as they don't exist any more.
…120466)

Rename HLSL resource-related intrinsics to be consistent with the naming
conventions discussed in [wg-hlsl:0014].

This is an entirely mechanical change, consisting of the following
commands and automated formatting.

```sh
git grep -l handle.fromBinding | xargs perl -pi -e \
  's/(dx|spv)(.)handle.fromBinding/$1$2resource$2handlefrombinding/g'
git grep -l typedBufferLoad_checkbit | xargs perl -pi -e \
  's/(dx|spv)(.)typedBufferLoad_checkbit/$1$2resource$2loadchecked$2typedbuffer/g'
git grep -l typedBufferLoad | xargs perl -pi -e \
  's/(dx|spv)(.)typedBufferLoad/$1$2resource$2load$2typedbuffer/g'
git grep -l typedBufferStore | xargs perl -pi -e \
  's/(dx|spv)(.)typedBufferStore/$1$2resource$2store$2typedbuffer/g'
git grep -l bufferUpdateCounter | xargs perl -pi -e \
  's/(dx|spv)(.)bufferUpdateCounter/$1$2resource$2updatecounter/g'
git grep -l cast_handle | xargs perl -pi -e \
  's/(dx|spv)(.)cast.handle/$1$2resource$2casthandle/g'
```

[wg-hlsl:0014]: https://github.com/llvm/wg-hlsl/blob/main/proposals/0014-consistent-naming-for-dx-intrinsics.md
- **[AMDGPU] Add new test.**
- **[AMDGPU] Emit S_CBRANCH_SCC for floating-point conditions.**

---------

Co-authored-by: Konstantina Mitropoulou <[email protected]>
…c...` api (llvm#117635)

- update `VectorUtils::isVectorIntrinsicWithScalarOpAtArg` to use TTI for
all uses, to allow specification of target-specific intrinsics
- add TTI to the `isVectorIntrinsicWithStructReturnOverloadAtField` api
- update TTI api to provide `isTargetIntrinsicWith...` functions and
  consistently name them
- move `isTriviallyScalarizable` to VectorUtils
  
- update all uses of the api and provide the TTI parameter

Resolves llvm#117030
This patch introduces IndexedCallstackIdConveter as a convenience
wrapper around FrameIdConverter and CallStackIdConverter just for
tests.

With the new wrapper, we get to replace idioms like:

  FrameIdConverter<decltype(MemProfData.Frames)> FrameIdConv(
      MemProfData.Frames);
  CallStackIdConverter<decltype(MemProfData.CallStacks)> CSIdConv(
      MemProfData.CallStacks, FrameIdConv);

with:

  IndexedCallstackIdConveter CSIdConv(MemProfData);

Unfortunately, this exact pattern occurs in tests only; the
combinations of the frame ID converter and call stack ID converter are
diverse in production code.
These two constructs are very simple and similar, and only support 3
different clauses, two of which are already implemented.  This patch
adds AST nodes for both constructs, and leaves the device_num clause
unimplemented, but enables the other two.
This is a very simple sema implementation that just required the AST node
plus the existing diagnostics. This patch adds tests and the required AST
node, and enables it for 'init' and 'shutdown' (only!)
MaskRay and others added 17 commits December 22, 2024 00:01
Note that PointerUnion::{is,get} have been soft deprecated in
PointerUnion.h:

  // FIXME: Replace the uses of is(), get() and dyn_cast() with
  //        isa<T>, cast<T> and the llvm::dyn_cast<T>
Split DerivedIV simplification off from
llvm#112145 and use to remove the
need for extra checks in createScalarIVSteps. Required an extra
simplification run after IV transforms.
The paper is fixing a wording bug, so there's nothing to do for
implementations.
…lvm#118499)

Try to runtime-unroll loops with early-continues depending on
loop-varying loads; this helps with branch-prediction for the
early-continues and can significantly improve performance
for such loops

Builds on top of llvm#118317.

PR: llvm#118499.
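A sketch of the loop shape that change targets (illustrative only, not taken from the patch or its tests): the continue is controlled by a load that varies per iteration, so runtime unrolling exposes several independent early-exit branches per trip to the branch predictor.

```cpp
// The early continue depends on a loop-varying load (vals[i]); this is the
// pattern the new unrolling heuristic looks for.
long sum_filtered(const int *vals, const int *weights, long n) {
  long sum = 0;
  for (long i = 0; i < n; ++i) {
    if (vals[i] == 0) // early-continue on a loop-varying load
      continue;
    sum += static_cast<long>(vals[i]) * weights[i];
  }
  return sum;
}
```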
…m#120737)

The existing comparison does not insert symbols in the intended place.

Closes llvm#120559.

---------

Co-authored-by: Bjorn Pettersson <[email protected]>
…llvm#120899)

Add const to `import*Renderer` member functions and recursively to
functions called from them.
I didn't do that for `import*Matcher` functions because they mutate
class variables.
… cost-comparison

Helps with debugging to show that the fold found the match, and shows the old + new costs to indicate whether the fold was/wasn't profitable.
Add a number of tests with dereferenceable assumptions and different
alignment info.
Helps with debugging to show that the fold found the match.
Interferes with constant folding of the pcmpgt node.

Yet another example where topological node sorting would have helped us.

Fixes llvm#120906
@changkhothuychung merged commit daf965a into main Dec 22, 2024
53 checks passed
@changkhothuychung deleted the concat branch December 22, 2024 21:18
@changkhothuychung restored the concat branch December 22, 2024 21:21
changkhothuychung pushed a commit that referenced this pull request Feb 22, 2025
For function declarations (i.e. when the func op has no entry block), the
FunctionOpInterface methods `insertArgument` and `eraseArgument` will
cause a segfault. This PR guards against manipulating the missing entry
block by checking whether the func op is external.

An example can be seen in google/heir#1324

The segfault trace

```
 #1 0x0000560f1289d9db PrintStackTraceSignalHandler(void*) /proc/self/cwd/external/llvm-project/llvm/lib/Support/Unix/Signals.inc:874:1
 #2 0x0000560f1289b116 llvm::sys::RunSignalHandlers() /proc/self/cwd/external/llvm-project/llvm/lib/Support/Signals.cpp:105:5
 #3 0x0000560f1289e145 SignalHandler(int) /proc/self/cwd/external/llvm-project/llvm/lib/Support/Unix/Signals.inc:415:1
 llvm#4 0x00007f829a3d9520 (/lib/x86_64-linux-gnu/libc.so.6+0x42520)
 llvm#5 0x0000560f1257f8bc void __gnu_cxx::new_allocator<mlir::BlockArgument>::construct<mlir::BlockArgument, mlir::BlockArgument>(mlir::BlockArgument*, mlir::BlockArgument&&) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/ext/new_allocator.h:162:23
 llvm#6 0x0000560f1257f84d void std::allocator_traits<std::allocator<mlir::BlockArgument> >::construct<mlir::BlockArgument, mlir::BlockArgument>(std::allocator<mlir::BlockArgument>&, mlir::BlockArgument*, mlir::BlockArgument&&) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/alloc_traits.h:520:2
 llvm#7 0x0000560f12580498 void std::vector<mlir::BlockArgument, std::allocator<mlir::BlockArgument> >::_M_insert_aux<mlir::BlockArgument>(__gnu_cxx::__normal_iterator<mlir::BlockArgument*, std::vector<mlir::BlockArgument, std::allocator<mlir::BlockArgument> > >, mlir::BlockArgument&&) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/vector.tcc:405:7
 llvm#8 0x0000560f1257cf7e std::vector<mlir::BlockArgument, std::allocator<mlir::BlockArgument> >::insert(__gnu_cxx::__normal_iterator<mlir::BlockArgument const*, std::vector<mlir::BlockArgument, std::allocator<mlir::BlockArgument> > >, mlir::BlockArgument const&) /usr/lib/gcc/x86_64-linux-gnu/11/../../../../include/c++/11/bits/vector.tcc:154:6
 llvm#9 0x0000560f1257b349 mlir::Block::insertArgument(unsigned int, mlir::Type, mlir::Location) /proc/self/cwd/external/llvm-project/mlir/lib/IR/Block.cpp:178:13
llvm#10 0x0000560f123d2a1c mlir::function_interface_impl::insertFunctionArguments(mlir::FunctionOpInterface, llvm::ArrayRef<unsigned int>, mlir::TypeRange, llvm::ArrayRef<mlir::DictionaryAttr>, llvm::ArrayRef<mlir::Location>, unsigned int, mlir::Type) /proc/self/cwd/external/llvm-project/mlir/lib/Interfaces/FunctionInterfaces.cpp:232:11
llvm#11 0x0000560f0be6b727 mlir::detail::FunctionOpInterfaceTrait<mlir::func::FuncOp>::insertArguments(llvm::ArrayRef<unsigned int>, mlir::TypeRange, llvm::ArrayRef<mlir::DictionaryAttr>, llvm::ArrayRef<mlir::Location>) /proc/self/cwd/bazel-out/k8-dbg/bin/external/llvm-project/mlir/include/mlir/Interfaces/FunctionInterfaces.h.inc:809:7
llvm#12 0x0000560f0be6b536 mlir::detail::FunctionOpInterfaceTrait<mlir::func::FuncOp>::insertArgument(unsigned int, mlir::Type, mlir::DictionaryAttr, mlir::Location) /proc/self/cwd/bazel-out/k8-dbg/bin/external/llvm-project/mlir/include/mlir/Interfaces/FunctionInterfaces.h.inc:796:7
```
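A minimal sketch of the guard described above; this is a hypothetical caller-side check, whereas the actual patch places the check inside the FunctionOpInterface implementation:

```cpp
#include "mlir/Interfaces/FunctionInterfaces.h"

// A declaration-only func op has no entry block, so block-argument
// manipulation must be skipped for it.
bool addLeadingArgIfDefined(mlir::FunctionOpInterface funcOp,
                            mlir::Type argType, mlir::Location loc) {
  if (funcOp.isExternal()) // external: declaration only, no entry block
    return false;
  funcOp.insertArgument(/*argIndex=*/0, argType,
                        /*argAttrs=*/mlir::DictionaryAttr(), loc);
  return true;
}
```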
changkhothuychung pushed a commit that referenced this pull request Mar 8, 2025
When compiling VLS SVE, the compiler often replaces VL-based offsets
with immediate-based ones. This leads to a mismatch in the allowed
addressing modes due to SVE loads/stores generally expecting immediate
offsets relative to VL. For example, given:
```c

svfloat64_t foo(const double *x) {
  svbool_t pg = svptrue_b64();
  return svld1_f64(pg, x+svcntd());
}
```

When compiled with `-msve-vector-bits=128`, we currently generate:
```gas
foo:
        ptrue   p0.d
        mov     x8, #2
        ld1d    { z0.d }, p0/z, [x0, x8, lsl #3]
        ret
```

Instead, we could be generating:
```gas
foo:
        ldr     z0, [x0, #1, mul vl]
        ret
```

Likewise for other types, stores, and other VLS lengths.

This patch achieves the above by extending `SelectAddrModeIndexedSVE`
to let constants through when `vscale` is known.
changkhothuychung pushed a commit that referenced this pull request Mar 26, 2025
…1027)

No codegen regression on either target. The two builtin_ffs implied on
nvptx CSE away.

```
define internal i64 @__gpu_read_first_lane_u64(i64 noundef %__lane_mask, i64 noundef %__x) #2 {
entry:
  %shr = lshr i64 %__x, 32
  %conv = trunc nuw i64 %shr to i32
  %conv1 = trunc i64 %__x to i32
  %conv2 = trunc i64 %__lane_mask to i32
  %0 = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 %conv2, i1 true)
  %iszero = icmp eq i32 %conv2, 0
  %sub = select i1 %iszero, i32 -1, i32 %0
  %1 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 %conv2, i32 %conv, i32 %sub, i32 31)
  %conv4 = sext i32 %1 to i64
  %shl = shl nsw i64 %conv4, 32
  %2 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 %conv2, i32 %conv1, i32 %sub, i32 31)
  %conv7 = zext i32 %2 to i64
  %or = or disjoint i64 %shl, %conv7
  ret i64 %or
}
; becomes

define internal i64 @__gpu_competing_read_first_lane_u64(i64 noundef %__lane_mask, i64 noundef %__x) #2 {
entry:
  %shr = lshr i64 %__x, 32
  %conv = trunc nuw i64 %shr to i32
  %conv1 = trunc i64 %__x to i32
  %conv.i = trunc i64 %__lane_mask to i32
  %0 = tail call range(i32 0, 33) i32 @llvm.cttz.i32(i32 %conv.i, i1 true)
  %iszero = icmp eq i32 %conv.i, 0
  %sub.i = select i1 %iszero, i32 -1, i32 %0
  %1 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 %conv.i, i32 %conv, i32 %sub.i, i32 31)
  %conv4 = zext i32 %1 to i64
  %shl = shl nuw i64 %conv4, 32
  %2 = tail call i32 @llvm.nvvm.shfl.sync.idx.i32(i32 %conv.i, i32 %conv1, i32 %sub.i, i32 31)
  %conv7 = zext i32 %2 to i64
  %or = or disjoint i64 %shl, %conv7
  ret i64 %or
}
```

The sext vs zext difference is vaguely interesting, but since the bits
are immediately discarded in either case it makes no odds. The amdgcn one
doesn't need CSE; the readfirstlane function is a single call to an
intrinsic.

Drive-by fix to __gpu_match_all_u32: it was calling first_lane_u64 and
could use first_lane_u32 instead. Added the missing call to the gpuintrin.c
test case, and a stray missing static as well.
changkhothuychung pushed a commit that referenced this pull request Mar 26, 2025
…too. (llvm#132267)

Observed in Wine when trying to intercept `ExitThread`, which forwards
to `ntdll.RtlExitUserThread`.

`gdb` interprets it as `xchg %ax,%ax`.
`llvm-mc` outputs simply `nop`.

```
==Asan-i386-calls-Dynamic-Test.exe==964==interception_win: unhandled instruction at 0x7be27cf0: 66 90 55 89 e5 56 50 8b
```

```
Wine-gdb> bt
#0  0x789a1766 in __interception::GetInstructionSize (address=<optimized out>, rel_offset=<optimized out>) at C:/llvm-mingw/llvm-mingw/llvm-project/compiler-rt/lib/interception/interception_win.cpp:983
#1  0x789ab480 in __sanitizer::SharedPrintfCode(bool, char const*, char*) () at C:/llvm-mingw/llvm-mingw/llvm-project/compiler-rt/lib/sanitizer_common/sanitizer_printf.cpp:311
#2  0x789a18e7 in __interception::OverrideFunctionWithHotPatch (old_func=2078440688, new_func=2023702608, orig_old_func=warning: (Internal error: pc 0x792f1a2c in read in CU, but not in symtab.)warning: (Error: pc 0x792f1a2c in address map, but not in symtab.)0x792f1a2c) at C:/llvm-mingw/llvm-mingw/llvm-project/compiler-rt/lib/interception/interception_win.cpp:1118
#3  0x789a1f34 in __interception::OverrideFunction (old_func=2078440688, new_func=2023702608, orig_old_func=warning: (Internal error: pc 0x792f1a2c in read in CU, but not in symtab.)warning: (Error: pc 0x792f1a2c in address map, but not in symtab.)0x792f1a2c) at C:/llvm-mingw/llvm-mingw/llvm-project/compiler-rt/lib/interception/interception_win.cpp:1224
llvm#4  0x789a24ce in __interception::OverrideFunction (func_name=0x78a0bc43 <vtable for __asan::AsanThreadContext+1163> "ExitThread", new_func=2023702608, orig_old_func=warning: (Internal error: pc 0x792f1a2c in read in CU, but not in symtab.)warning: (Error: pc 0x792f1a2c in address map, but not in symtab.)0x792f1a2c)    at C:/llvm-mingw/llvm-mingw/llvm-project/compiler-rt/lib/interception/interception_win.cpp:1369
llvm#5  0x789f40ef in __asan::InitializePlatformInterceptors () at C:/llvm-mingw/llvm-mingw/llvm-project/compiler-rt/lib/asan/asan_win.cpp:190
llvm#6  0x789e0c3c in __asan::InitializeAsanInterceptors () at C:/llvm-mingw/llvm-mingw/llvm-project/compiler-rt/lib/asan/asan_interceptors.cpp:802
llvm#7  0x789ee6b5 in __asan::AsanInitInternal () at C:/llvm-mingw/llvm-mingw/llvm-project/compiler-rt/lib/asan/asan_rtl.cpp:442
llvm#8  0x789eefb0 in __asan::AsanInitFromRtl () at C:/llvm-mingw/llvm-mingw/llvm-project/compiler-rt/lib/asan/asan_rtl.cpp:522
llvm#9  __asan::AsanInitializer::AsanInitializer (this=<optimized out>) at C:/llvm-mingw/llvm-mingw/llvm-project/compiler-rt/lib/asan/asan_rtl.cpp:542
llvm#10 __cxx_global_var_init () at C:/llvm-mingw/llvm-mingw/llvm-project/compiler-rt/lib/asan/asan_rtl.cpp:546
...
Wine-gdb> disassemble /r 2078440688,2078440688+20
Dump of assembler code from 0x7be27cf0 to 0x7be27d04:
   0x7be27cf0 <_RtlExitUserThread@4+0>: 66 90                   xchg   %ax,%ax
...
```
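The fix amounts to teaching the size decoder that `66 90` is a two-byte instruction. A toy sketch of that rule (not the sanitizer source), matching instructions on their leading bytes:

```cpp
#include <cstddef>
#include <cstdint>

// The hot-patch prologue scanner asks for the size of each instruction it
// skips; "66 90" (operand-size-prefixed NOP, i.e. xchg %ax,%ax) must be
// recognised as 2 bytes or the scanner reports "unhandled instruction".
std::size_t instruction_size(const std::uint8_t *p) {
  if (p[0] == 0x90)                 // 90 : one-byte NOP
    return 1;
  if (p[0] == 0x66 && p[1] == 0x90) // 66 90 : two-byte NOP
    return 2;
  return 0;                         // unknown -> caller bails out
}
```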
changkhothuychung pushed a commit that referenced this pull request Apr 9, 2025
…#134130)

This should fix failures caused by
llvm#133967
Attn: @sarnex
Thanks

Signed-off-by: Arvind Sudarsanam <[email protected]>
changkhothuychung pushed a commit that referenced this pull request Apr 9, 2025
…d A520 (llvm#132246)

Inefficient SVE codegen occurs on at least two in-order cores,
those being Cortex-A510 and Cortex-A520. For example a simple vector
add

```
void foo(float *a, float *b, float *dst, unsigned n) {
    for (unsigned i = 0; i < n; ++i)
        dst[i] = a[i] + b[i];
}
```

The inner loop vectorizes into the following interleaved sequence
of instructions.

```
        add     x12, x1, x10
        ld1b    { z0.b }, p0/z, [x1, x10]
        add     x13, x2, x10
        ld1b    { z1.b }, p0/z, [x2, x10]
        ldr     z2, [x12, #1, mul vl]
        ldr     z3, [x13, #1, mul vl]
        dech    x11
        add     x12, x0, x10
        fadd    z0.s, z1.s, z0.s
        fadd    z1.s, z3.s, z2.s
        st1b    { z0.b }, p0, [x0, x10]
        addvl   x10, x10, #2
        str     z1, [x12, #1, mul vl]
```

By adjusting the target features to prefer fixed over scalable if the
cost is equal we get the following vectorized loop.

```
         ldp q0, q3, [x11, #-16]
         subs    x13, x13, #8
         ldp q1, q2, [x10, #-16]
         add x10, x10, #32
         add x11, x11, #32
         fadd    v0.4s, v1.4s, v0.4s
         fadd    v1.4s, v2.4s, v3.4s
         stp q0, q1, [x12, #-16]
         add x12, x12, #32
```

Which is more efficient.
changkhothuychung pushed a commit that referenced this pull request Apr 9, 2025
… A510/A520 (llvm#134606)

Recommit. This work was done by llvm#132246 but failed buildbots due to the
test introduced needing updates

Inefficient SVE codegen occurs on at least two in-order cores, those
being Cortex-A510 and Cortex-A520. For example a simple vector add

```
void foo(float *a, float *b, float *dst, unsigned n) {
    for (unsigned i = 0; i < n; ++i)
        dst[i] = a[i] + b[i];
}
```

The inner loop vectorizes into the following interleaved sequence of
instructions.

```
        add     x12, x1, x10
        ld1b    { z0.b }, p0/z, [x1, x10]
        add     x13, x2, x10
        ld1b    { z1.b }, p0/z, [x2, x10]
        ldr     z2, [x12, #1, mul vl]
        ldr     z3, [x13, #1, mul vl]
        dech    x11
        add     x12, x0, x10
        fadd    z0.s, z1.s, z0.s
        fadd    z1.s, z3.s, z2.s
        st1b    { z0.b }, p0, [x0, x10]
        addvl   x10, x10, #2
        str     z1, [x12, #1, mul vl]
```

By adjusting the target features to prefer fixed over scalable if the
cost is equal we get the following vectorized loop.

```
         ldp q0, q3, [x11, #-16]
         subs    x13, x13, #8
         ldp q1, q2, [x10, #-16]
         add x10, x10, #32
         add x11, x11, #32
         fadd    v0.4s, v1.4s, v0.4s
         fadd    v1.4s, v2.4s, v3.4s
         stp q0, q1, [x12, #-16]
         add x12, x12, #32
```

Which is more efficient.