diff --git a/libcxx/include/__algorithm/find.h b/libcxx/include/__algorithm/find.h
index 24b8b2f96443c..a7d9374b3a1c8 100644
--- a/libcxx/include/__algorithm/find.h
+++ b/libcxx/include/__algorithm/find.h
@@ -106,10 +106,10 @@ __find_bool(__bit_iterator<_Cp, _IsConst> __first, typename __size_difference_ty
   if (__first.__ctz_ != 0) {
     __storage_type __clz_f = static_cast<__storage_type>(__bits_per_word - __first.__ctz_);
     __storage_type __dn    = std::min(__clz_f, __n);
-    __storage_type __m     = (~__storage_type(0) << __first.__ctz_) & (~__storage_type(0) >> (__clz_f - __dn));
+    __storage_type __m     = std::__middle_mask<__storage_type>(__clz_f - __dn, __first.__ctz_);
     __storage_type __b     = std::__invert_if<!_ToFind>(*__first.__seg_) & __m;
     if (__b)
-      return _It(__first.__seg_, static_cast<unsigned>(std::__libcpp_ctz(__b)));
+      return _It(__first.__seg_, static_cast<unsigned>(std::__countr_zero(__b)));
     if (__n == __dn)
       return __first + __n;
     __n -= __dn;
@@ -119,14 +119,14 @@ __find_bool(__bit_iterator<_Cp, _IsConst> __first, typename __size_difference_ty
   for (; __n >= __bits_per_word; ++__first.__seg_, __n -= __bits_per_word) {
     __storage_type __b = std::__invert_if<!_ToFind>(*__first.__seg_);
     if (__b)
-      return _It(__first.__seg_, static_cast<unsigned>(std::__libcpp_ctz(__b)));
+      return _It(__first.__seg_, static_cast<unsigned>(std::__countr_zero(__b)));
   }
   // do last partial word
   if (__n > 0) {
-    __storage_type __m = ~__storage_type(0) >> (__bits_per_word - __n);
+    __storage_type __m = std::__trailing_mask<__storage_type>(__bits_per_word - __n);
     __storage_type __b = std::__invert_if<!_ToFind>(*__first.__seg_) & __m;
     if (__b)
-      return _It(__first.__seg_, static_cast<unsigned>(std::__libcpp_ctz(__b)));
+      return _It(__first.__seg_, static_cast<unsigned>(std::__countr_zero(__b)));
   }
   return _It(__first.__seg_, static_cast<unsigned>(__n));
 }
diff --git a/libcxx/include/__bit/countr.h b/libcxx/include/__bit/countr.h
index 2f7571133bd03..46c43921fc60d 100644
--- a/libcxx/include/__bit/countr.h
+++ b/libcxx/include/__bit/countr.h
@@ -12,9 +12,11 @@
 #ifndef _LIBCPP___BIT_COUNTR_H
 #define _LIBCPP___BIT_COUNTR_H
 
+#include <__assert>
 #include <__bit/rotate.h>
 #include <__concepts/arithmetic.h>
 #include <__config>
+#include <__type_traits/is_unsigned.h>
 #include <limits>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
@@ -38,20 +40,24 @@ _LIBCPP_BEGIN_NAMESPACE_STD
   return __builtin_ctzll(__x);
 }
 
+// A constexpr implementation for C++11 and later (using clang extensions for constexpr support)
+// Precondition: __t != 0 (the caller __countr_zero handles __t == 0 as a special case)
 template <class _Tp>
-[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 int __countr_zero(_Tp __t) _NOEXCEPT {
-#if __has_builtin(__builtin_ctzg)
-  return __builtin_ctzg(__t, numeric_limits<_Tp>::digits);
-#else  // __has_builtin(__builtin_ctzg)
-  if (__t == 0)
-    return numeric_limits<_Tp>::digits;
-  if (sizeof(_Tp) <= sizeof(unsigned int))
+[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int __countr_zero_impl(_Tp __t) _NOEXCEPT {
+  _LIBCPP_ASSERT_INTERNAL(__t != 0, "__countr_zero_impl called with zero value");
+  static_assert(is_unsigned<_Tp>::value, "__countr_zero_impl only works with unsigned types");
+  if _LIBCPP_CONSTEXPR (sizeof(_Tp) <= sizeof(unsigned int)) {
     return std::__libcpp_ctz(static_cast<unsigned int>(__t));
-  else if (sizeof(_Tp) <= sizeof(unsigned long))
+  } else if _LIBCPP_CONSTEXPR (sizeof(_Tp) <= sizeof(unsigned long)) {
     return std::__libcpp_ctz(static_cast<unsigned long>(__t));
-  else if (sizeof(_Tp) <= sizeof(unsigned long long))
+  } else if _LIBCPP_CONSTEXPR (sizeof(_Tp) <= sizeof(unsigned long long)) {
     return std::__libcpp_ctz(static_cast<unsigned long long>(__t));
-  else {
+  } else {
+#if _LIBCPP_STD_VER == 11
+    unsigned long long __ull       = static_cast<unsigned long long>(__t);
+    const unsigned int __ulldigits = numeric_limits<unsigned long long>::digits;
+    return __ull == 0ull ? __ulldigits + std::__countr_zero_impl<_Tp>(__t >> __ulldigits) : std::__libcpp_ctz(__ull);
+#else
     int __ret                      = 0;
     const unsigned int __ulldigits = numeric_limits<unsigned long long>::digits;
     while (static_cast<unsigned long long>(__t) == 0uLL) {
@@ -59,8 +65,18 @@ template <class _Tp>
       __t >>= __ulldigits;
     }
     return __ret + std::__libcpp_ctz(static_cast<unsigned long long>(__t));
+#endif
   }
-#endif // __has_builtin(__builtin_ctzg)
+}
+
+template <class _Tp>
+[[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int __countr_zero(_Tp __t) _NOEXCEPT {
+  static_assert(is_unsigned<_Tp>::value, "__countr_zero only works with unsigned types");
+#if __has_builtin(__builtin_ctzg) // TODO (LLVM 21): This can be dropped once we only support Clang >= 19.
+  return __builtin_ctzg(__t, numeric_limits<_Tp>::digits);
+#else
+  return __t != 0 ? std::__countr_zero_impl(__t) : numeric_limits<_Tp>::digits;
+#endif
 }
 
 #if _LIBCPP_STD_VER >= 20
diff --git a/libcxx/include/__bit_reference b/libcxx/include/__bit_reference
index 079d1de9dd523..aca300f851593 100644
--- a/libcxx/include/__bit_reference
+++ b/libcxx/include/__bit_reference
@@ -67,9 +67,30 @@ struct __size_difference_type_traits<_Cp, __void_t<typename _Cp::difference_type
   using size_type       = typename _Cp::size_type;
 };
 
+// The `__x_mask` functions are designed to work exclusively with any unsigned `_StorageType`s, including small
+// integral types such as unsigned char/short, `uint8_t`, and `uint16_t`. To prevent undefined behavior or
+// ambiguities due to integral promotions for the small integral types, all intermediate bitwise operations are
+// explicitly cast back to the unsigned `_StorageType`.
+
+// Creates a mask of type `_StorageType` with a specified number of leading zeros (__clz) and sets all remaining
+// bits to one.
+template <class _StorageType>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _StorageType __trailing_mask(unsigned __clz) {
+  static_assert(is_unsigned<_StorageType>::value, "__trailing_mask only works with unsigned types");
+  return static_cast<_StorageType>(~static_cast<_StorageType>(0)) >> __clz;
+}
+
+// Creates a mask of type `_StorageType` with a specified number of leading zeros (__clz), a specified number of
+// trailing zeros (__ctz), and sets all bits in between to one.
+template <class _StorageType>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _StorageType __middle_mask(unsigned __clz, unsigned __ctz) {
+  static_assert(is_unsigned<_StorageType>::value, "__middle_mask only works with unsigned types");
+  return (static_cast<_StorageType>(~static_cast<_StorageType>(0)) << __ctz) &
+         std::__trailing_mask<_StorageType>(__clz);
+}
+
 // This function is designed to operate correctly even for smaller integral types like `uint8_t`, `uint16_t`,
-// or `unsigned short`. Casting back to _StorageType is crucial to prevent undefined behavior that can arise
-// from integral promotions.
+// or `unsigned short`.
 // See https://github.com/llvm/llvm-project/pull/122410.
 template <class _StoragePointer>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void
@@ -79,12 +100,11 @@ __fill_masked_range(_StoragePointer __word, unsigned __clz, unsigned __ctz, bool
   using _StorageType = typename pointer_traits<_StoragePointer>::element_type;
   _LIBCPP_ASSERT_VALID_INPUT_RANGE(
       __ctz + __clz < sizeof(_StorageType) * CHAR_BIT, "__fill_masked_range called with invalid range");
-  _StorageType __m = static_cast<_StorageType>(static_cast<_StorageType>(~static_cast<_StorageType>(0)) >> __clz) &
-                     static_cast<_StorageType>(static_cast<_StorageType>(~static_cast<_StorageType>(0)) << __ctz);
+  _StorageType __m = std::__middle_mask<_StorageType>(__clz, __ctz);
   if (__fill_val)
     *__word |= __m;
   else
-    *__word &= static_cast<_StorageType>(~__m);
+    *__word &= ~__m;
 }
 
 template <class _Cp, bool = __has_storage_type<_Cp>::value>
diff --git a/libcxx/include/__fwd/bit_reference.h b/libcxx/include/__fwd/bit_reference.h
index d65f043e89ad6..73f3bc07736dd 100644
--- a/libcxx/include/__fwd/bit_reference.h
+++ b/libcxx/include/__fwd/bit_reference.h
@@ -10,9 +10,6 @@
 #define _LIBCPP___FWD_BIT_REFERENCE_H
 
 #include <__config>
-#include <__memory/pointer_traits.h>
-#include <__type_traits/enable_if.h>
-#include <__type_traits/is_unsigned.h>
 
 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
@@ -30,6 +27,12 @@ template <class _StoragePointer>
 _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void
 __fill_masked_range(_StoragePointer __word, unsigned __ctz, unsigned __clz, bool __fill_val);
 
+template <class _StorageType>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _StorageType __trailing_mask(unsigned __clz);
+
+template <class _StorageType>
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _StorageType __middle_mask(unsigned __clz, unsigned __ctz);
+
 _LIBCPP_END_NAMESPACE_STD
 
 #endif // _LIBCPP___FWD_BIT_REFERENCE_H
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp
index dd89f2c5ae944..3aaeb9c2f345f 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/find.pass.cpp
@@ -16,6 +16,7 @@
 // ADDITIONAL_COMPILE_FLAGS(cl-style-warnings): /wd4245 /wd4305 /wd4310 /wd4389 /wd4805
 // ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=20000000
 // ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-ops-limit): -fconstexpr-ops-limit=80000000
+// XFAIL: FROZEN-CXX03-HEADERS-FIXME
 
 // <algorithm>
 
@@ -31,6 +32,7 @@
 #include <vector>
 #include <type_traits>
 
+#include "sized_allocator.h"
 #include "test_macros.h"
 #include "test_iterators.h"
 #include "type_algorithms.h"
@@ -228,7 +230,8 @@ TEST_CONSTEXPR_CXX20 bool test() {
 
   types::for_each(types::integral_types(), TestIntegerPromotions());
 
-  { // Test vector<bool>::iterator optimization
+  {
+    // Test vector<bool>::iterator optimization
     std::vector<bool> vec(256 + 8);
     for (ptrdiff_t i = 8; i <= 256; i *= 2) {
       for (size_t offset = 0; offset < 8; offset += 2) {
@@ -238,6 +241,39 @@ TEST_CONSTEXPR_CXX20 bool test() {
         assert(std::find(vec.begin() + offset, vec.end(), false) == vec.begin() + offset + i);
       }
     }
+
+    // Verify that the std::vector<bool>::iterator optimization works properly for allocators with custom size types
+    // Fix https://github.com/llvm/llvm-project/issues/122528
+    {
+      using Alloc = sized_allocator<bool, std::uint8_t, std::int8_t>;
+      std::vector<bool, Alloc> in(100, false, Alloc(1));
+      in[in.size() - 2] = true;
+      assert(std::find(in.begin(), in.end(), true) == in.end() - 2);
+    }
+    {
+      using Alloc = sized_allocator<bool, std::uint16_t, std::int16_t>;
+      std::vector<bool, Alloc> in(199, false, Alloc(1));
+      in[in.size() - 2] = true;
+      assert(std::find(in.begin(), in.end(), true) == in.end() - 2);
+    }
+    {
+      using Alloc = sized_allocator<bool, unsigned short, short>;
+      std::vector<bool, Alloc> in(200, false, Alloc(1));
+      in[in.size() - 2] = true;
+      assert(std::find(in.begin(), in.end(), true) == in.end() - 2);
+    }
+    {
+      using Alloc = sized_allocator<bool, std::uint32_t, std::int32_t>;
+      std::vector<bool, Alloc> in(205, false, Alloc(1));
+      in[in.size() - 2] = true;
+      assert(std::find(in.begin(), in.end(), true) == in.end() - 2);
+    }
+    {
+      using Alloc = sized_allocator<bool, std::uint64_t, std::int64_t>;
+      std::vector<bool, Alloc> in(257, false, Alloc(1));
+      in[in.size() - 2] = true;
+      assert(std::find(in.begin(), in.end(), true) == in.end() - 2);
+    }
   }
 
   return true;
diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/ranges.find.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/ranges.find.pass.cpp
index 46accc5e36f0a..d7e6be9928a2d 100644
--- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/ranges.find.pass.cpp
+++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.find/ranges.find.pass.cpp
@@ -34,6 +34,7 @@
 #include <vector>
 
 #include "almost_satisfies_types.h"
+#include "sized_allocator.h"
 #include "test_iterators.h"
 
 struct NotEqualityComparable {};
@@ -202,7 +203,8 @@ constexpr bool test() {
     }
   }
 
-  { // Test vector<bool>::iterator optimization
+  {
+    // Test vector<bool>::iterator optimization
     std::vector<bool> vec(256 + 8);
     for (ptrdiff_t i = 8; i <= 256; i *= 2) {
       for (size_t offset = 0; offset < 8; offset += 2) {
@@ -215,6 +217,39 @@ constexpr bool test() {
                std::ranges::begin(vec) + offset + i);
       }
     }
+
+    // Verify that the std::vector<bool>::iterator optimization works properly for allocators with custom size types
+    // See https://github.com/llvm/llvm-project/issues/122528
+    {
+      using Alloc = sized_allocator<bool, std::uint8_t, std::int8_t>;
+      std::vector<bool, Alloc> in(100, false, Alloc(1));
+      in[in.size() - 2] = true;
+      assert(std::ranges::find(in, true) == in.end() - 2);
+    }
+    {
+      using Alloc = sized_allocator<bool, std::uint16_t, std::int16_t>;
+      std::vector<bool, Alloc> in(199, false, Alloc(1));
+      in[in.size() - 2] = true;
+      assert(std::ranges::find(in, true) == in.end() - 2);
+    }
+    {
+      using Alloc = sized_allocator<bool, unsigned short, short>;
+      std::vector<bool, Alloc> in(200, false, Alloc(1));
+      in[in.size() - 2] = true;
+      assert(std::ranges::find(in, true) == in.end() - 2);
+    }
+    {
+      using Alloc = sized_allocator<bool, std::uint32_t, std::int32_t>;
+      std::vector<bool, Alloc> in(205, false, Alloc(1));
+      in[in.size() - 2] = true;
+      assert(std::ranges::find(in, true) == in.end() - 2);
+    }
+    {
+      using Alloc = sized_allocator<bool, std::uint64_t, std::int64_t>;
+      std::vector<bool, Alloc> in(257, false, Alloc(1));
+      in[in.size() - 2] = true;
+      assert(std::ranges::find(in, true) == in.end() - 2);
+    }
   }
 
   return true;
diff --git a/libcxx/test/std/utilities/template.bitset/bitset.members/left_shift_eq.pass.cpp b/libcxx/test/std/utilities/template.bitset/bitset.members/left_shift_eq.pass.cpp
index 521a4ff13abba..265ecb23b2985 100644
--- a/libcxx/test/std/utilities/template.bitset/bitset.members/left_shift_eq.pass.cpp
+++ b/libcxx/test/std/utilities/template.bitset/bitset.members/left_shift_eq.pass.cpp
@@ -6,6 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 
+// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=15000000
+
 // bitset<N>& operator<<=(size_t pos); // constexpr since C++23
 
 #include <bitset>
@@ -18,20 +20,20 @@
 
 template <std::size_t N>
 TEST_CONSTEXPR_CXX23 bool test_left_shift() {
-    std::vector<std::bitset<N> > const cases = get_test_cases<N>();
-    for (std::size_t c = 0; c != cases.size(); ++c) {
-        for (std::size_t s = 0; s <= N+1; ++s) {
-            std::bitset<N> v1 = cases[c];
-            std::bitset<N> v2 = v1;
-            v1 <<= s;
-            for (std::size_t i = 0; i < v1.size(); ++i)
-                if (i < s)
-                    assert(v1[i] == 0);
-                else
-                    assert(v1[i] == v2[i-s]);
-        }
+  std::vector<std::bitset<N> > const cases = get_test_cases<N>();
+  for (std::size_t c = 0; c != cases.size(); ++c) {
+    for (std::size_t s = 0; s <= N + 1; ++s) {
+      std::bitset<N> v1 = cases[c];
+      std::bitset<N> v2 = v1;
+      v1 <<= s;
+      for (std::size_t i = 0; i < v1.size(); ++i)
+        if (i < s)
+          assert(v1[i] == 0);
+        else
+          assert(v1[i] == v2[i - s]);
     }
-    return true;
+  }
+  return true;
 }
 
 int main(int, char**) {