sethp · sethp · Jan 5, 2024
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
@@ -7126,23 +7126,11 @@ struct BitCastBuffer {
   inline static void copyBitsFrom(APInt &LHS, const BitSlice &Dst,
                                   const APInt &RHS, const BitSlice &Src) {
     assert(Src.size() == Dst.size());
-
-    if (Src.start() > 0 || Src.end() < RHS.getBitWidth() ||
-        RHS.getBitWidth() != Dst.size()) {
-      APInt Val = RHS.lshr(Src.start()).trunc(Src.size()).zext(Dst.size());
-      LHS.insertBits(Val, Dst.start());
-      return;
-    }
-    LHS.insertBits(RHS, Dst.start());
+    LHS.insertBits(RHS, Src.start(), Dst.start(), Src.size());
   }
 
   inline static void clearBits(APInt &Int, const BitSlice &Which) {
-    unsigned Bit = Which.start(), Rem = Which.size() % 64;
-    if (Rem > 0) // else APInt crashes when Bit == 0
-      Int.insertBits(0ull, Bit, Rem);
-    Bit += Rem;
-    for (unsigned End = Which.end(); Bit < End; Bit += 64)
-      Int.insertBits(0ull, Bit, 64u);
+    Int.clearBits(Which.start(), Which.end());
   }
 
   static llvm::FormattedBytes formatInt(const APInt &Int) {

diff --git a/llvm/include/llvm/ADT/APInt.h b/llvm/include/llvm/ADT/APInt.h
@@ -19,6 +19,7 @@
 #include "llvm/Support/MathExtras.h"
 #include <cassert>
 #include <climits>
+#include <cstdint>
 #include <cstring>
 #include <optional>
 #include <utility>
@@ -229,8 +230,8 @@ class [[nodiscard]] APInt {
   /// \p hiBit.
   ///
   /// \param numBits the intended bit width of the result
-  /// \param loBit the index of the lowest bit set.
-  /// \param hiBit the index of the highest bit set.
+  /// \param loBit the index of the lowest bit set. (i.e. inclusive)
+  /// \param hiBit the index after the highest bit set. (i.e. exclusive)
   ///
   /// \returns An APInt value with the requested bits set.
   static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit) {
@@ -1392,6 +1393,20 @@ class [[nodiscard]] APInt {
     *this &= Keep;
   }
 
+  /// Clear the bits from loBit (inclusive) to hiBit (exclusive) to 0.
+  /// Requires \p loBit <= \p hiBit <= BitWidth; an empty range is a no-op.
+  void clearBits(unsigned loBit, unsigned hiBit) {
+    assert(hiBit <= BitWidth && "hiBit out of range");
+    assert(loBit <= BitWidth && "loBit out of range");
+    assert(loBit <= hiBit && "loBit greater than hiBit");
+    if (loBit == hiBit) // nothing to clear; also avoids a shift by 64 below
+      return;
+    if (isSingleWord())
+      U.VAL &= ~(maskTrailingOnes<uint64_t>(hiBit - loBit) << loBit);
+    else
+      clearBitsSlowCase(loBit, hiBit);
+  }
+
   /// Set the sign bit to 0.
   void clearSignBit() { clearBit(BitWidth - 1); }
 
@@ -1417,6 +1432,10 @@ class [[nodiscard]] APInt {
     ++(*this);
   }
 
+  /// Insert \p numBits bits of \p SubBits, starting at bit \p srcBit, into
+  /// bits [bitPosition, bitPosition + numBits) of this APInt.
+  void insertBits(const APInt &SubBits, unsigned srcBit, unsigned bitPosition,
+                  unsigned numBits);
   /// Insert the bits from a smaller APInt starting at bitPosition.
   void insertBits(const APInt &SubBits, unsigned bitPosition);
   void insertBits(uint64_t SubBits, unsigned bitPosition, unsigned numBits);
@@ -2005,6 +2024,9 @@ class [[nodiscard]] APInt {
   /// out-of-line slow case for setBits.
   void setBitsSlowCase(unsigned loBit, unsigned hiBit);
 
+  /// out-of-line slow case for clearBits.
+  void clearBitsSlowCase(unsigned loBit, unsigned hiBit);
+
   /// out-of-line slow case for flipAllBits.
   void flipAllBitsSlowCase();
 

diff --git a/llvm/lib/Support/APInt.cpp b/llvm/lib/Support/APInt.cpp
@@ -25,6 +25,8 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cmath>
+#include <cstdint>
+#include <cstring>
 #include <optional>
 
 using namespace llvm;
@@ -334,6 +336,37 @@ void APInt::setBitsSlowCase(unsigned loBit, unsigned hiBit) {
     U.pVal[word] = WORDTYPE_MAX;
 }
 
+void APInt::clearBitsSlowCase(unsigned loBit, unsigned hiBit) {
+  // An empty range touches nothing; bail out before indexing, because for
+  // hiBit == BitWidth on a word boundary whichWord(hiBit) is one past the end.
+  if (loBit == hiBit)
+    return;
+
+  unsigned loWord = whichWord(loBit);
+  unsigned hiWord = whichWord(hiBit);
+  unsigned loPos = whichBit(loBit);
+  unsigned hiPos = whichBit(hiBit);
+
+  // Range lies inside one word: clear bits [loPos, hiPos) of that word.
+  if (loWord == hiWord) {
+    U.pVal[loWord] &= ~(maskTrailingOnes<uint64_t>(hiPos - loPos) << loPos);
+    return;
+  }
+
+  // Partial low word: keep only the loPos bits that sit below the range.
+  if (loPos > 0) {
+    U.pVal[loWord] &= maskTrailingOnes<uint64_t>(loPos);
+    loWord++;
+  }
+
+  // Partial high word: clear the hiPos low bits that fall inside the range.
+  if (hiPos > 0)
+    U.pVal[hiWord] &= maskTrailingZeros<uint64_t>(hiPos);
+  assert(hiWord >= loWord);
+
+  // Fill any whole words between loWord and hiWord (possibly none) with zeros.
+  memset(&U.pVal[loWord], 0, (hiWord - loWord) * APINT_WORD_SIZE);
+}
+
 // Complement a bignum in-place.
 static void tcComplement(APInt::WordType *dst, unsigned parts) {
   for (unsigned i = 0; i < parts; i++)
@@ -365,62 +398,67 @@ void APInt::flipBit(unsigned bitPosition) {
   setBitVal(bitPosition, !(*this)[bitPosition]);
 }
 
-void APInt::insertBits(const APInt &subBits, unsigned bitPosition) {
-  unsigned subBitWidth = subBits.getBitWidth();
-  assert((subBitWidth + bitPosition) <= BitWidth && "Illegal bit insertion");
+void APInt::insertBits(const APInt &SubBits, unsigned bitPosition) {
+  insertBits(SubBits, 0, bitPosition, SubBits.getBitWidth());
+}
+
+void APInt::insertBits(const APInt &SubBits, unsigned srcBit,
+                       unsigned bitPosition, unsigned numBits) {
+  assert((numBits + bitPosition) <= BitWidth && "Illegal bit insertion");
+  assert((numBits + srcBit) <= SubBits.getBitWidth() && "Illegal bit range");
+  // the below suffices because no two APInts share a subrange of their memory
+  assert(&SubBits != this &&
+         "Illegal bit insertion (aliased)"); // neither we nor memcpy handles
+                                             // overlapping memory ranges well
 
   // inserting no bits is a noop.
-  if (subBitWidth == 0)
+  if (numBits == 0)
     return;
 
-  // Insertion is a direct copy.
-  if (subBitWidth == BitWidth) {
-    *this = subBits;
-    return;
-  }
-
-  // Single word result can be done as a direct bitmask.
-  if (isSingleWord()) {
-    uint64_t mask = WORDTYPE_MAX >> (APINT_BITS_PER_WORD - subBitWidth);
-    U.VAL &= ~(mask << bitPosition);
-    U.VAL |= (subBits.U.VAL << bitPosition);
-    return;
-  }
+  uint64_t *Base = isSingleWord() ? &U.VAL : U.pVal;
+  uint64_t *loWord = Base + whichWord(bitPosition);
+  uint64_t *hi1Word = Base + whichWord(bitPosition + numBits - 1);
 
   unsigned loBit = whichBit(bitPosition);
-  unsigned loWord = whichWord(bitPosition);
-  unsigned hi1Word = whichWord(bitPosition + subBitWidth - 1);
-
-  // Insertion within a single word can be done as a direct bitmask.
-  if (loWord == hi1Word) {
-    uint64_t mask = WORDTYPE_MAX >> (APINT_BITS_PER_WORD - subBitWidth);
-    U.pVal[loWord] &= ~(mask << loBit);
-    U.pVal[loWord] |= (subBits.U.VAL << loBit);
-    return;
-  }
-
-  // Insert on word boundaries.
-  if (loBit == 0) {
-    // Direct copy whole words.
-    unsigned numWholeSubWords = subBitWidth / APINT_BITS_PER_WORD;
-    memcpy(U.pVal + loWord, subBits.getRawData(),
+  if (loBit > 0) {
+    // Insertion within a single word can be done as a direct bitmask.
+    uint64_t nb = std::min(APINT_BITS_PER_WORD - loBit, numBits);
+    uint64_t mask = maskTrailingOnes<uint64_t>(nb);
+    *loWord &= ~(mask << loBit);
+    *loWord |= (SubBits.extractBitsAsZExtValue(nb, srcBit) << loBit);
+    ++loWord;
+    srcBit += nb;
+    numBits -= nb;
+  }
+
+  if (loWord > hi1Word)
+    return; // src bits all fit within the single word above
+
+  // set whole words (if any).
+  if (whichBit(srcBit) == 0) {
+    // We get to memcpy directly
+    unsigned numWholeSubWords = numBits / APINT_BITS_PER_WORD;
+    memcpy(loWord, SubBits.getRawData() + whichWord(srcBit),
            numWholeSubWords * APINT_WORD_SIZE);
-
-    // Mask+insert remaining bits.
-    unsigned remainingBits = subBitWidth % APINT_BITS_PER_WORD;
-    if (remainingBits != 0) {
-      uint64_t mask = WORDTYPE_MAX >> (APINT_BITS_PER_WORD - remainingBits);
-      U.pVal[hi1Word] &= ~mask;
-      U.pVal[hi1Word] |= subBits.getWord(subBitWidth - 1);
+    srcBit += numWholeSubWords * APINT_BITS_PER_WORD;
+  } else {
+    // Unaligned src: each whole inserted word merges two adjacent src words.
+    const unsigned srcShift = whichBit(srcBit);
+    const uint64_t *srcWord = SubBits.getRawData() + whichWord(srcBit);
+    uint64_t *wholeEnd = loWord + numBits / APINT_BITS_PER_WORD;
+    for (; loWord != wholeEnd; ++loWord, srcBit += APINT_BITS_PER_WORD) {
+      *loWord = *srcWord >> srcShift;
+      *loWord |= *(++srcWord) << (APINT_BITS_PER_WORD - srcShift);
     }
-    return;
   }
 
-  // General case - set/clear individual bits in dst based on src.
-  // TODO - there is scope for optimization here, but at the moment this code
-  // path is barely used so prefer readability over performance.
-  for (unsigned i = 0; i != subBitWidth; ++i)
-    setBitVal(bitPosition + i, subBits[i]);
+  // Mask+insert remaining bits.
+  unsigned remainingBits = numBits % APINT_BITS_PER_WORD;
+  if (remainingBits != 0) {
+    uint64_t mask = maskTrailingOnes<uint64_t>(remainingBits);
+    *hi1Word &= ~mask;
+    *hi1Word |= SubBits.extractBitsAsZExtValue(remainingBits, srcBit);
+  }
 }
 
 void APInt::insertBits(uint64_t subBits, unsigned bitPosition, unsigned numBits) {
@@ -432,6 +470,10 @@ void APInt::insertBits(uint64_t subBits, unsigned bitPosition, unsigned numBits)
     return;
   }
 
+  // inserting no bits is a noop.
+  if (numBits == 0)
+    return;
+
   unsigned loBit = whichBit(bitPosition);
   unsigned loWord = whichWord(bitPosition);
   unsigned hiWord = whichWord(bitPosition + numBits - 1);