From 34ffb08e16526f9c7df7caa643c451a6526e2d66 Mon Sep 17 00:00:00 2001 From: ominitay <37453713+ominitay@users.noreply.github.com> Date: Sun, 9 Apr 2023 15:45:26 +0100 Subject: [PATCH 01/28] std.math.big.int: Implement depositBits and extractBits This change implements depositBits and extractBits (equivalents of PDEP and PEXT) for Zig's big ints. This change lays the groundwork for implementation of `@depositBits` and `@extractBits`. Tests have been added to check the behaviour of these two functions. The functions currently don't handle negative values (though negative values may be converted to twos complement externally), and aren't optimal in either memory or performance. --- lib/std/math/big/int.zig | 74 +++++++++++++++++++++++++++++++++++ lib/std/math/big/int_test.zig | 54 +++++++++++++++++++++++++ 2 files changed, 128 insertions(+) diff --git a/lib/std/math/big/int.zig b/lib/std/math/big/int.zig index 93ad1ccbe26a..fa97fe1c6f00 100644 --- a/lib/std/math/big/int.zig +++ b/lib/std/math/big/int.zig @@ -1735,6 +1735,80 @@ pub const Mutable = struct { y.shiftRight(y.toConst(), norm_shift); } + // TODO this function is quite inefficient and could be optimised + /// r = @depositBits(source, mask) + /// + /// Asserts that `source` and `mask` are positive + /// + /// `limbs_buffer` is used as a working area. It must have length of at least `mask.limbs.len`. 
+ pub fn depositBits(r: *Mutable, source: Const, mask: Const, limbs_buffer: []Limb) void { + assert(source.positive); + assert(mask.positive); + + r.positive = true; + std.mem.set(Limb, r.limbs, 0); + + var mut_mask = Mutable{ .limbs = limbs_buffer[0..mask.limbs.len], .positive = undefined, .len = undefined }; + mut_mask.copy(mask); + + var mask_bit_index = mut_mask.toConst().ctz(); + var i: usize = 0; + while (!mut_mask.eqZero()) : ({ + mask_bit_index = mut_mask.toConst().ctz(); + i += 1; + }) { + const mask_limb_index = mask_bit_index / limb_bits; + const mask_limb_bit = @intCast(u6, mask_bit_index % limb_bits); + + const i_limb_index = i / limb_bits; + const i_limb_bit = @intCast(u6, i % limb_bits); + + mut_mask.limbs[mask_limb_index] &= ~(@as(Limb, 1) << mask_limb_bit); // Unset the mask bit + const source_bit_set = source.limbs[i_limb_index] & (@as(Limb, 1) << i_limb_bit) != 0; + + r.limbs[mask_limb_index] |= @as(Limb, @boolToInt(source_bit_set)) << mask_limb_bit; + } + + r.normalize(r.limbs.len); + } + + // TODO this function is quite inefficient and could be optimised + /// r = @extractBits(source, mask) + /// + /// Asserts that `source` and `mask` are positive + /// + /// `limbs_buffer` is used as a working area. It must have length of at least `mask.limbs.len`. 
+ pub fn extractBits(r: *Mutable, source: Const, mask: Const, limbs_buffer: []Limb) void { + assert(source.positive); + assert(mask.positive); + + r.positive = true; + std.mem.set(Limb, r.limbs, 0); + + var mut_mask = Mutable{ .limbs = limbs_buffer[0..mask.limbs.len], .positive = undefined, .len = undefined }; + mut_mask.copy(mask); + + var mask_bit_index = mut_mask.toConst().ctz(); + var i: usize = 0; + while (!mut_mask.eqZero()) : ({ + mask_bit_index = mut_mask.toConst().ctz(); + i += 1; + }) { + const mask_limb_index = mask_bit_index / limb_bits; + const mask_limb_bit = @intCast(u6, mask_bit_index % limb_bits); + + const i_limb_index = i / limb_bits; + const i_limb_bit = @intCast(u6, i % limb_bits); + + mut_mask.limbs[mask_limb_index] &= ~(@as(Limb, 1) << mask_limb_bit); // Unset the mask bit + const source_bit_set = source.limbs[mask_limb_index] & (@as(Limb, 1) << mask_limb_bit) != 0; + + r.limbs[i_limb_index] |= @as(Limb, @boolToInt(source_bit_set)) << i_limb_bit; + } + + r.normalize(r.limbs.len); + } + /// If a is positive, this passes through to truncate. /// If a is negative, then r is set to positive with the bit pattern ~(a - 1). /// r may alias a. 
diff --git a/lib/std/math/big/int_test.zig b/lib/std/math/big/int_test.zig index 624bdc0b83af..57c51e8f9476 100644 --- a/lib/std/math/big/int_test.zig +++ b/lib/std/math/big/int_test.zig @@ -2800,6 +2800,60 @@ fn popCountTest(val: *const Managed, bit_count: usize, expected: usize) !void { try testing.expectEqual(expected, val.toConst().popCount(bit_count)); } +test "big int extractBits" { + try extractBitsTest(0x12345678, 0x0, 0x0); + try extractBitsTest(0x12345678, 0xf0f0f0f0, 0x1357); + try extractBitsTest(0x12345678, 0xff00ff00, 0x1256); + try extractBitsTest(0x12345678, 0xffff, 0x5678); + + try extractBitsTest(0x12345678_90123456_78901234_56789012, 0xff << 64, 0x56); + try extractBitsTest(0x12345678_90123456_78901234_56789012, (0xff << 64) | 0xff00f, 0x56892); +} + +fn extractBitsTest(comptime source: comptime_int, comptime mask: comptime_int, comptime expected: comptime_int) !void { + var source_bigint = try Managed.initSet(testing.allocator, source); + defer source_bigint.deinit(); + var mask_bigint = try Managed.initSet(testing.allocator, mask); + defer mask_bigint.deinit(); + const limbs = try testing.allocator.alloc(Limb, mask_bigint.limbs.len); + defer testing.allocator.free(limbs); + var result = Mutable{ .limbs = limbs, .positive = undefined, .len = undefined }; + + const limbs_buffer = try testing.allocator.alloc(Limb, mask_bigint.limbs.len); + defer testing.allocator.free(limbs_buffer); + + result.extractBits(source_bigint.toConst(), mask_bigint.toConst(), limbs_buffer); + + try testing.expectEqual(std.math.Order.eq, result.toConst().orderAgainstScalar(expected)); +} + +test "big int depositBits" { + try depositBitsTest(0x12345678, 0x0, 0x0); + try depositBitsTest(0x12345678, 0xf0f0f0f0, 0x50607080); + try depositBitsTest(0x12345678, 0xff00ff00, 0x56007800); + try depositBitsTest(0x12345678, 0xffff, 0x5678); + + try depositBitsTest(0x1234, 0xff << 64, 0x34_00000000_00000000); + try depositBitsTest(0x12345678, (0xff << 64) | 0xff00f, 
0x45_00000000_00067008); +} + +fn depositBitsTest(comptime source: comptime_int, comptime mask: comptime_int, comptime expected: comptime_int) !void { + var source_bigint = try Managed.initSet(testing.allocator, source); + defer source_bigint.deinit(); + var mask_bigint = try Managed.initSet(testing.allocator, mask); + defer mask_bigint.deinit(); + const limbs = try testing.allocator.alloc(Limb, mask_bigint.limbs.len); + defer testing.allocator.free(limbs); + var result = Mutable{ .limbs = limbs, .positive = undefined, .len = undefined }; + + const limbs_buffer = try testing.allocator.alloc(Limb, mask_bigint.limbs.len); + defer testing.allocator.free(limbs_buffer); + + result.depositBits(source_bigint.toConst(), mask_bigint.toConst(), limbs_buffer); + + try testing.expectEqual(std.math.Order.eq, result.toConst().orderAgainstScalar(expected)); +} + test "big int conversion read/write twos complement" { var a = try Managed.initSet(testing.allocator, (1 << 493) - 1); defer a.deinit(); From 32ff10178028cb3de7e42848809f88c193367d59 Mon Sep 17 00:00:00 2001 From: ominitay <37453713+ominitay@users.noreply.github.com> Date: Tue, 11 Apr 2023 12:25:30 +0100 Subject: [PATCH 02/28] std.math.big.int: Conversion from 2's complement Implements std.math.big.int.Mutable.convertFromTwosComplement, to match convertToTwosComplement. --- lib/std/math/big/int.zig | 34 ++++++++++++++++++++++++++++++++++ lib/std/math/big/int_test.zig | 27 +++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/lib/std/math/big/int.zig b/lib/std/math/big/int.zig index fa97fe1c6f00..f896d75ee856 100644 --- a/lib/std/math/big/int.zig +++ b/lib/std/math/big/int.zig @@ -1844,6 +1844,40 @@ pub const Mutable = struct { r.normalize(r.len); } + /// Converts a twos-complement value to a magnitude, and sets the sign of `r` to match. + /// `a.positive` is ignored + /// r may alias a + /// + /// Asserts `r` has enough storage to store the result. 
+ /// The upper bound is `calcTwosCompLimbCount(bit_count)` + pub fn convertFromTwosComplement(r: *Mutable, a: Const, signedness: Signedness, bit_count: usize) void { + const req_limbs = calcTwosCompLimbCount(bit_count); + if (req_limbs == 0 or a.eqZero()) { + r.set(0); + return; + } + + const bit = @truncate(Log2Limb, bit_count - 1); + const signmask = @as(Limb, 1) << bit; + const mask = (signmask << 1) -% 1; + + if (signedness == .unsigned or req_limbs > a.limbs.len or a.limbs[req_limbs - 1] & signmask == 0) { + r.truncate(a, signedness, bit_count); + return; + } + + r.copy(a); + assert(r.limbs.len >= req_limbs); + r.len = req_limbs; + + r.addScalar(r.toConst(), -1); + llnot(r.limbs[0..r.len]); + r.limbs[r.len - 1] &= mask; + + r.positive = false; + r.normalize(r.len); + } + /// Truncate an integer to a number of bits, following 2s-complement semantics. /// r may alias a. /// diff --git a/lib/std/math/big/int_test.zig b/lib/std/math/big/int_test.zig index 57c51e8f9476..90c1cf719de2 100644 --- a/lib/std/math/big/int_test.zig +++ b/lib/std/math/big/int_test.zig @@ -2854,6 +2854,33 @@ fn depositBitsTest(comptime source: comptime_int, comptime mask: comptime_int, c try testing.expectEqual(std.math.Order.eq, result.toConst().orderAgainstScalar(expected)); } +test "big int conversion to/from twos complement" { + var a = try Managed.initSet(testing.allocator, maxInt(u64)); + defer a.deinit(); + var b = try Managed.initSet(testing.allocator, maxInt(u32)); + defer b.deinit(); + var c = try Managed.initSet(testing.allocator, maxInt(u493)); + defer c.deinit(); + + var m_a = a.toMutable(); + m_a.convertToTwosComplement(m_a.toConst(), .unsigned, 64); + try testing.expectEqual(m_a.toConst().orderAgainstScalar(maxInt(u64)), .eq); + m_a.convertFromTwosComplement(m_a.toConst(), .signed, 64); + try testing.expectEqual(m_a.toConst().orderAgainstScalar(-1), .eq); + + var m_b = b.toMutable(); + m_b.convertToTwosComplement(m_b.toConst(), .unsigned, 32); + try 
testing.expectEqual(m_b.toConst().orderAgainstScalar(maxInt(u32)), .eq); + m_b.convertFromTwosComplement(m_b.toConst(), .signed, 32); + try testing.expectEqual(m_b.toConst().orderAgainstScalar(-1), .eq); + + var m_c = c.toMutable(); + m_c.convertToTwosComplement(m_c.toConst(), .unsigned, 493); + try testing.expectEqual(m_c.toConst().orderAgainstScalar(maxInt(u493)), .eq); + m_c.convertFromTwosComplement(m_c.toConst(), .signed, 493); + try testing.expectEqual(m_c.toConst().orderAgainstScalar(-1), .eq); +} + test "big int conversion read/write twos complement" { var a = try Managed.initSet(testing.allocator, (1 << 493) - 1); defer a.deinit(); From a33d4f6ba7b889264e4e6f12c88c8d685c0ac075 Mon Sep 17 00:00:00 2001 From: ominitay <37453713+ominitay@users.noreply.github.com> Date: Wed, 12 Apr 2023 21:00:14 +0100 Subject: [PATCH 03/28] Write docs for `@depositBits` and `@extractBits` --- doc/langref.html.in | 56 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/doc/langref.html.in b/doc/langref.html.in index 6f3e9961c389..e9b14c66200b 100644 --- a/doc/langref.html.in +++ b/doc/langref.html.in @@ -8558,6 +8558,34 @@ test "main" { {#see_also|@cVaArg|@cVaCopy|@cVaEnd#} {#header_close#} + {#header_open|@depositBits#} +
{#syntax#}@depositBits(source: T, mask: T) T{#endsyntax#}+
+ {#syntax#}@TypeOf(source){#endsyntax#} must be an integer type. +
++ Uses a mask to transfer contiguous lower bits in the {#syntax#}source{#endsyntax#} operand to the destination, transferring them to the corresponding bits in the destination that are set in the mask. All other bits in the destination are zeroed. +
++ Currently, only x86 processors with BMI2 enabled support this in hardware. On processors without support for the instruction, it will be emulated. AMD processors before Zen 3 implement the corresponding instruction (PDEP) in microcode. It may be faster to use an alternative method in both of these cases. +
++ Example: +
+ + + {#syntax_block|zig|@depositBits test#} +const std = @import("std"); + +test "deposit bits" { + comptime { + try std.testing.expectEqual(@depositBits(0x00001234, 0xf0f0f0f0), 0x10203040); + } +} + {#end_syntax_block#} + {#see_also|@extractBits#} + {#header_close#} + {#header_open|@divExact#}{#syntax#}@divExact(numerator: T, denominator: T) T{#endsyntax#}
@@ -8726,6 +8754,34 @@ export fn @"A function name that is a complete sentence."() void {} {#see_also|@export#} {#header_close#} + {#header_open|@extractBits#} +
{#syntax#}@extractBits(source: T, mask: T) T{#endsyntax#}+
+ {#syntax#}T{#endsyntax#} must be an integer type. +
++ Uses a mask to transfer bits in the {#syntax#}source{#endsyntax#} operand to the destination, writing them as contiguous lower bits in the destination. The upper bits of the destination are zeroed. +
++ Currently, only x86 processors with BMI2 enabled support this in hardware. On processors without support for the instruction, it will be emulated. AMD processors before Zen 3 implement the corresponding instruction (PEXT) in microcode. It may be faster to use an alternative method in both of these cases. +
++ Example: +
+ + + {#syntax_block|zig|@extractBits test#} +const std = @import("std"); + +test "extract bits" { + comptime { + try std.testing.expectEqual(@extractBits(0x12345678, 0xf0f0f0f0), 0x00001357); + } +} + {#end_syntax_block#} + {#see_also|@depositBits#} + {#header_close#} + {#header_open|@fence#}{#syntax#}@fence(order: AtomicOrder) void{#endsyntax#}
From 9bd3bf791a319ea76f22e56142ee45946703f900 Mon Sep 17 00:00:00 2001 From: ominitay <37453713+ominitay@users.noreply.github.com> Date: Wed, 12 Apr 2023 21:00:40 +0100 Subject: [PATCH 04/28] Implement `@depositBits` and `@extractBits` Incomplete: currently only implemented for 64-bit-or-smaller integers for x86(-64) in the LLVM backend. --- lib/std/zig/AstGen.zig | 21 ++++ lib/std/zig/BuiltinFn.zig | 15 +++ lib/std/zig/Zir.zig | 6 ++ src/Air.zig | 9 ++ src/Liveness.zig | 4 + src/Sema.zig | 186 +++++++++++++++++++++++++++++++++++ src/arch/aarch64/CodeGen.zig | 3 + src/arch/arm/CodeGen.zig | 3 + src/arch/riscv64/CodeGen.zig | 3 + src/arch/sparc64/CodeGen.zig | 3 + src/arch/wasm/CodeGen.zig | 4 + src/arch/x86_64/CodeGen.zig | 3 + src/codegen/c.zig | 3 + src/codegen/llvm.zig | 159 ++++++++++++++++++++++++++++++ src/print_air.zig | 2 + src/print_zir.zig | 2 + 16 files changed, 426 insertions(+) diff --git a/lib/std/zig/AstGen.zig b/lib/std/zig/AstGen.zig index a52007eabf24..e483a9e890e6 100644 --- a/lib/std/zig/AstGen.zig +++ b/lib/std/zig/AstGen.zig @@ -9691,6 +9691,9 @@ fn builtinCall( }); return rvalue(gz, ri, result, node); }, + + .deposit_bits => return depositExtractBits(gz, scope, ri, node, params, .deposit_bits), + .extract_bits => return depositExtractBits(gz, scope, ri, node, params, .extract_bits), } } @@ -9958,6 +9961,24 @@ fn overflowArithmetic( return rvalue(gz, ri, result, node); } +fn depositExtractBits( + gz: *GenZir, + scope: *Scope, + ri: ResultInfo, + node: Ast.Node.Index, + params: []const Ast.Node.Index, + tag: Zir.Inst.Extended, +) InnerError!Zir.Inst.Ref { + const lhs = try expr(gz, scope, .{ .rl = .none }, params[0]); + const rhs = try expr(gz, scope, .{ .rl = .none }, params[1]); + const result = try gz.addExtendedPayload(tag, Zir.Inst.BinNode{ + .node = gz.nodeIndexToRelative(node), + .lhs = lhs, + .rhs = rhs, + }); + return rvalue(gz, ri, result, node); +} + fn callExpr( gz: *GenZir, scope: *Scope, diff --git a/lib/std/zig/BuiltinFn.zig 
b/lib/std/zig/BuiltinFn.zig index 11d6a17303c8..37f648893da2 100644 --- a/lib/std/zig/BuiltinFn.zig +++ b/lib/std/zig/BuiltinFn.zig @@ -35,6 +35,7 @@ pub const Tag = enum { c_va_copy, c_va_end, c_va_start, + deposit_bits, div_exact, div_floor, div_trunc, @@ -46,6 +47,7 @@ pub const Tag = enum { error_cast, @"export", @"extern", + extract_bits, fence, field, field_parent_ptr, @@ -405,6 +407,12 @@ pub const list = list: { .illegal_outside_function = true, }, }, + .{ + "@depositBits", .{ + .tag = .deposit_bits, + .param_count = 2, + }, + }, .{ "@divExact", .{ @@ -483,6 +491,13 @@ pub const list = list: { .param_count = 2, }, }, + .{ + "@extractBits", + .{ + .tag = .extract_bits, + .param_count = 2, + }, + }, .{ "@fence", .{ diff --git a/lib/std/zig/Zir.zig b/lib/std/zig/Zir.zig index 64e8a1c8050f..b9f3e10dd007 100644 --- a/lib/std/zig/Zir.zig +++ b/lib/std/zig/Zir.zig @@ -2060,6 +2060,12 @@ pub const Inst = struct { /// Guaranteed to not have the `ptr_cast` flag. /// Uses the `pl_node` union field with payload `FieldParentPtr`. field_parent_ptr, + /// Implements the `@depositBits` builtin. + /// `operand` is payload index to `BinNode`. + deposit_bits, + /// Implements the `@extractBits` builtin. + /// `operand` is payload index to `BinNode`. + extract_bits, pub const InstData = struct { opcode: Extended, diff --git a/src/Air.zig b/src/Air.zig index 9554c55561a5..1b7d8d77b9e9 100644 --- a/src/Air.zig +++ b/src/Air.zig @@ -848,6 +848,13 @@ pub const Inst = struct { /// Operand is unused and set to Ref.none work_group_id, + /// Implements @depositBits builtin. + /// Uses the `bin_op` field. + deposit_bits, + /// Implements @extractBits builtin. + /// Uses the `bin_op` field. 
+ extract_bits, + pub fn fromCmpOp(op: std.math.CompareOperator, optimized: bool) Tag { switch (op) { .lt => return if (optimized) .cmp_lt_optimized else .cmp_lt, @@ -1318,6 +1325,8 @@ pub fn typeOfIndex(air: *const Air, inst: Air.Inst.Index, ip: *const InternPool) .div_exact_optimized, .rem_optimized, .mod_optimized, + .deposit_bits, + .extract_bits, => return air.typeOf(datas[@intFromEnum(inst)].bin_op.lhs, ip), .sqrt, diff --git a/src/Liveness.zig b/src/Liveness.zig index 4ca28758e222..dd9f4fe24776 100644 --- a/src/Liveness.zig +++ b/src/Liveness.zig @@ -286,6 +286,8 @@ pub fn categorizeOperand( .cmp_gte_optimized, .cmp_gt_optimized, .cmp_neq_optimized, + .deposit_bits, + .extract_bits, => { const o = air_datas[@intFromEnum(inst)].bin_op; if (o.lhs == operand_ref) return matchOperandSmallIndex(l, inst, 0, .none); @@ -955,6 +957,8 @@ fn analyzeInst( .memset, .memset_safe, .memcpy, + .deposit_bits, + .extract_bits, => { const o = inst_datas[@intFromEnum(inst)].bin_op; return analyzeOperands(a, pass, data, inst, .{ o.lhs, o.rhs, .none }); diff --git a/src/Sema.zig b/src/Sema.zig index d3989f630cb5..c42343161de4 100644 --- a/src/Sema.zig +++ b/src/Sema.zig @@ -1260,6 +1260,8 @@ fn analyzeBodyInner( .work_group_id => try sema.zirWorkItem( block, extended, extended.opcode), .in_comptime => try sema.zirInComptime( block), .closure_get => try sema.zirClosureGet( block, extended), + .deposit_bits => try sema.zirDepositExtractBits(block, extended, .deposit_bits), + .extract_bits => try sema.zirDepositExtractBits(block, extended, .extract_bits), // zig fmt: on .fence => { @@ -26390,6 +26392,84 @@ fn zirInComptime( return if (block.is_comptime) .bool_true else .bool_false; } +fn zirDepositExtractBits( + sema: *Sema, + block: *Block, + extended: Zir.Inst.Extended.InstData, + air_tag: Air.Inst.Tag, +) CompileError!Air.Inst.Ref { + const extra = sema.code.extraData(Zir.Inst.BinNode, extended.operand).data; + const src = LazySrcLoc.nodeOffset(extra.node); + + const lhs_src: 
LazySrcLoc = .{ .node_offset_builtin_call_arg0 = extra.node }; + const rhs_src: LazySrcLoc = .{ .node_offset_builtin_call_arg1 = extra.node }; + + const uncasted_lhs = try sema.resolveInst(extra.lhs); + const uncasted_rhs = try sema.resolveInst(extra.rhs); + + const lhs_ty = sema.typeOf(uncasted_lhs); + const rhs_ty = sema.typeOf(uncasted_rhs); + + if (lhs_ty.zigTypeTag() != .Int) { + return sema.fail(block, lhs_src, "expected integer type, found '{}'", .{lhs_ty.fmt(sema.mod)}); + } + + if (rhs_ty.zigTypeTag() != .Int) { + return sema.fail(block, rhs_src, "expected integer type, found '{}'", .{rhs_ty.fmt(sema.mod)}); + } + + const instructions = &[_]Air.Inst.Ref{ uncasted_lhs, uncasted_rhs }; + const dest_ty = try sema.resolvePeerTypes(block, src, instructions, .{ + .override = &[_]?LazySrcLoc{ lhs_src, rhs_src }, + }); + + assert(dest_ty.zigTypeTag() == .Int); + + const lhs = try sema.coerce(block, dest_ty, uncasted_lhs, lhs_src); + const rhs = try sema.coerce(block, dest_ty, uncasted_rhs, rhs_src); + + const maybe_lhs_val = try sema.resolveMaybeUndefVal(lhs); + const maybe_rhs_val = try sema.resolveMaybeUndefVal(rhs); + + // If either of the operands are zero, the result is zero + // If either of the operands are undefined, the result is undefined + if (maybe_lhs_val) |lhs_val| { + if (lhs_val.isUndef()) return sema.addConstUndef(dest_ty); + if (try lhs_val.compareAllWithZeroAdvanced(.eq, sema)) { + return sema.addConstant(dest_ty, Value.zero); + } + } + if (maybe_rhs_val) |rhs_val| { + if (rhs_val.isUndef()) return sema.addConstUndef(dest_ty); + if (try rhs_val.compareAllWithZeroAdvanced(.eq, sema)) { + return sema.addConstant(dest_ty, Value.zero); + } + } + + if (maybe_lhs_val) |lhs_val| { + if (maybe_rhs_val) |rhs_val| { + const dest_val = switch (air_tag) { + .deposit_bits => try sema.intDepositBits(lhs_val, rhs_val, dest_ty), + .extract_bits => try sema.intExtractBits(lhs_val, rhs_val, dest_ty), + else => unreachable, + }; + + return 
sema.addConstant(dest_ty, dest_val); + } + } + + const runtime_src = if (maybe_lhs_val == null) lhs_src else rhs_src; + try sema.requireRuntimeBlock(block, src, runtime_src); + + return block.addInst(.{ + .tag = air_tag, + .data = .{ .bin_op = .{ + .lhs = lhs, + .rhs = rhs, + } }, + }); +} + fn requireRuntimeBlock(sema: *Sema, block: *Block, src: LazySrcLoc, runtime_src: ?LazySrcLoc) !void { if (block.is_comptime) { const msg = msg: { @@ -38992,6 +39072,112 @@ fn intAddWithOverflowScalar( }; } +fn intDepositBits( + sema: *Sema, + lhs: Value, + rhs: Value, + ty: Type, +) !Value { + // TODO is this a performance issue? maybe we should try the operation without + // resorting to BigInt first. For non-bigints, @intDeposit could be used? + const target = sema.mod.getTarget(); + const arena = sema.arena; + const info = ty.intInfo(target); + + var lhs_space: Value.BigIntSpace = undefined; + var rhs_space: Value.BigIntSpace = undefined; + const lhs_bigint = lhs.toBigInt(&lhs_space, target); + const rhs_bigint = rhs.toBigInt(&rhs_space, target); + + const result_limbs = try arena.alloc( + std.math.big.Limb, + std.math.big.int.calcTwosCompLimbCount(info.bits), + ); + + const source_limbs = try arena.alloc( + std.math.big.Limb, + std.math.big.int.calcTwosCompLimbCount(info.bits), + ); + defer arena.free(source_limbs); + + const mask_limbs = try arena.alloc( + std.math.big.Limb, + std.math.big.int.calcTwosCompLimbCount(info.bits), + ); + defer arena.free(mask_limbs); + + const limbs_buffer = try arena.alloc( + std.math.big.Limb, + rhs_bigint.limbs.len, + ); + defer arena.free(limbs_buffer); + + var source = std.math.big.int.Mutable{ .limbs = source_limbs, .positive = undefined, .len = undefined }; + var mask = std.math.big.int.Mutable{ .limbs = mask_limbs, .positive = undefined, .len = undefined }; + var result = std.math.big.int.Mutable{ .limbs = result_limbs, .positive = undefined, .len = undefined }; + + source.convertToTwosComplement(lhs_bigint, info.signedness, 
info.bits); + mask.convertToTwosComplement(rhs_bigint, info.signedness, info.bits); + + result.depositBits(source.toConst(), mask.toConst(), limbs_buffer); + + result.convertFromTwosComplement(result.toConst(), info.signedness, info.bits); + return Value.fromBigInt(arena, result.toConst()); +} + +fn intExtractBits( + sema: *Sema, + lhs: Value, + rhs: Value, + ty: Type, +) !Value { + // TODO is this a performance issue? maybe we should try the operation without + // resorting to BigInt first. For non-bigints, @intExtract could be used? + const target = sema.mod.getTarget(); + const arena = sema.arena; + const info = ty.intInfo(target); + + var lhs_space: Value.BigIntSpace = undefined; + var rhs_space: Value.BigIntSpace = undefined; + const lhs_bigint = lhs.toBigInt(&lhs_space, target); + const rhs_bigint = rhs.toBigInt(&rhs_space, target); + + const result_limbs = try arena.alloc( + std.math.big.Limb, + std.math.big.int.calcTwosCompLimbCount(info.bits), + ); + + const source_limbs = try arena.alloc( + std.math.big.Limb, + std.math.big.int.calcTwosCompLimbCount(info.bits), + ); + defer arena.free(source_limbs); + + const mask_limbs = try arena.alloc( + std.math.big.Limb, + std.math.big.int.calcTwosCompLimbCount(info.bits), + ); + defer arena.free(mask_limbs); + + const limbs_buffer = try arena.alloc( + std.math.big.Limb, + rhs_bigint.limbs.len, + ); + defer arena.free(limbs_buffer); + + var source = std.math.big.int.Mutable{ .limbs = source_limbs, .positive = undefined, .len = undefined }; + var mask = std.math.big.int.Mutable{ .limbs = mask_limbs, .positive = undefined, .len = undefined }; + var result = std.math.big.int.Mutable{ .limbs = result_limbs, .positive = undefined, .len = undefined }; + + source.convertToTwosComplement(lhs_bigint, info.signedness, info.bits); + mask.convertToTwosComplement(rhs_bigint, info.signedness, info.bits); + + result.extractBits(source.toConst(), mask.toConst(), limbs_buffer); + + result.convertFromTwosComplement(result.toConst(), 
info.signedness, info.bits); + return Value.fromBigInt(arena, result.toConst()); +} + /// Asserts the values are comparable. Both operands have type `ty`. /// For vectors, returns true if the comparison is true for ALL elements. /// diff --git a/src/arch/aarch64/CodeGen.zig b/src/arch/aarch64/CodeGen.zig index ddde72345efe..5a327f1a0ae6 100644 --- a/src/arch/aarch64/CodeGen.zig +++ b/src/arch/aarch64/CodeGen.zig @@ -899,6 +899,9 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void { .work_item_id => unreachable, .work_group_size => unreachable, .work_group_id => unreachable, + + .deposit_bits => return self.fail("TODO implement deposit_bits", .{}), + .extract_bits => return self.fail("TODO implement extract_bits", .{}), // zig fmt: on } diff --git a/src/arch/arm/CodeGen.zig b/src/arch/arm/CodeGen.zig index 86d4e8f7fdd6..d55c69d48a7a 100644 --- a/src/arch/arm/CodeGen.zig +++ b/src/arch/arm/CodeGen.zig @@ -885,6 +885,9 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void { .work_item_id => unreachable, .work_group_size => unreachable, .work_group_id => unreachable, + + .deposit_bits => return self.fail("TODO implement deposit_bits", .{}), + .extract_bits => return self.fail("TODO implement extract_bits", .{}), // zig fmt: on } diff --git a/src/arch/riscv64/CodeGen.zig b/src/arch/riscv64/CodeGen.zig index 5abe3afcfd2a..d45904d7e98c 100644 --- a/src/arch/riscv64/CodeGen.zig +++ b/src/arch/riscv64/CodeGen.zig @@ -713,6 +713,9 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void { .work_item_id => unreachable, .work_group_size => unreachable, .work_group_id => unreachable, + + .deposit_bits => return self.fail("TODO implement deposit_bits", .{}), + .extract_bits => return self.fail("TODO implement extract_bits", .{}), // zig fmt: on } if (std.debug.runtime_safety) { diff --git a/src/arch/sparc64/CodeGen.zig b/src/arch/sparc64/CodeGen.zig index 19c18ec4a6b0..f2fbb813affd 100644 --- a/src/arch/sparc64/CodeGen.zig 
+++ b/src/arch/sparc64/CodeGen.zig @@ -732,6 +732,9 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void { .work_item_id => unreachable, .work_group_size => unreachable, .work_group_id => unreachable, + + .deposit_bits => @panic("TODO implement deposit_bits"), + .extract_bits => @panic("TODO implement extract_bits"), // zig fmt: on } diff --git a/src/arch/wasm/CodeGen.zig b/src/arch/wasm/CodeGen.zig index 83159ec80e7d..fcf8bd362a0d 100644 --- a/src/arch/wasm/CodeGen.zig +++ b/src/arch/wasm/CodeGen.zig @@ -2058,6 +2058,10 @@ fn genInst(func: *CodeGen, inst: Air.Inst.Index) InnerError!void { .work_group_size, .work_group_id, => unreachable, + + .deposit_bits, + .extract_bits, + => |tag| return func.fail("TODO implement {s}", .{@tagName(tag)}), }; } diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index c165baf7e885..41f00c9c239d 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig @@ -2195,6 +2195,9 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void { .work_item_id => unreachable, .work_group_size => unreachable, .work_group_id => unreachable, + + .deposit_bits => return self.fail("TODO implement deposit_bits", .{}), + .extract_bits => return self.fail("TODO implement extract_bits", .{}), // zig fmt: on } diff --git a/src/codegen/c.zig b/src/codegen/c.zig index 818267a8b819..80da6ff96482 100644 --- a/src/codegen/c.zig +++ b/src/codegen/c.zig @@ -3466,6 +3466,9 @@ fn genBodyInner(f: *Function, body: []const Air.Inst.Index) error{ AnalysisFail, .work_group_size, .work_group_id, => unreachable, + + .deposit_bits => return f.fail("TODO: C backend: implement deposit_bits", .{}), + .extract_bits => return f.fail("TODO: C backend: implement extract_bits", .{}), // zig fmt: on }; if (result_value == .new_local) { diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig index db0eaa3ce5e6..8fad9c1d77da 100644 --- a/src/codegen/llvm.zig +++ b/src/codegen/llvm.zig @@ -5103,6 +5103,9 @@ pub 
const FuncGen = struct { .work_item_id => try self.airWorkItemId(inst), .work_group_size => try self.airWorkGroupSize(inst), .work_group_id => try self.airWorkGroupId(inst), + + .deposit_bits => try self.airDepositBits(inst), + .extract_bits => try self.airExtractBits(inst), // zig fmt: on }; if (val != .none) try self.func_inst_table.putNoClobber(self.gpa, inst.toRef(), val); @@ -10295,6 +10298,162 @@ pub const FuncGen = struct { return self.amdgcnWorkIntrinsic(dimension, 0, "amdgcn.workgroup.id"); } + fn airDepositBits(self: *FuncGen, inst: Air.Inst.Index) !?*llvm.Value { + if (self.liveness.isUnused(inst)) return null; + + const bin_op = self.air.instructions.items(.data)[inst].bin_op; + const lhs = try self.resolveInst(bin_op.lhs); + const rhs = try self.resolveInst(bin_op.rhs); + const inst_ty = self.air.typeOfIndex(inst); + + const target = self.dg.module.getTarget(); + const params = [2]*llvm.Value{ lhs, rhs }; + switch (target.cpu.arch) { + .x86, .x86_64 => |tag| blk: { + // Doesn't have pdep + if (!std.Target.x86.featureSetHas(target.cpu.features, .bmi2)) break :blk; + + const bits = inst_ty.intInfo(target).bits; + const supports_64 = tag == .x86_64; + // Integer size doesn't match the available instruction(s) + if (!(bits <= 32 or (bits <= 64 and supports_64))) break :blk; + + return self.buildDepositBitsNative(inst_ty, params); + }, + else => {}, + } + + return self.buildDepositBitsEmulated(inst_ty, params); + } + + fn buildDepositBitsNative( + self: *FuncGen, + ty: Type, + params: [2]*llvm.Value, + ) !*llvm.Value { + const target = self.dg.module.getTarget(); + + assert(target.cpu.arch.isX86()); + assert(std.Target.x86.featureSetHas(target.cpu.features, .bmi2)); + + const bits = ty.intInfo(target).bits; + const intrinsic_name = switch (bits) { + 1...32 => "llvm.x86.bmi.pdep.32", + 33...64 => "llvm.x86.bmi.pdep.64", + else => unreachable, + }; + const needs_extend = bits != 32 and bits != 64; + + var params_cast = params; + + // Cast to either a 32 or 
64-bit integer + if (needs_extend) { + const llvm_extend_ty = self.context.intType(if (bits <= 32) 32 else 64); + params_cast = .{ + self.builder.buildZExt(params[0], llvm_extend_ty, ""), + self.builder.buildZExt(params[1], llvm_extend_ty, ""), + }; + } + + const llvm_fn = self.getIntrinsic(intrinsic_name, &.{}); + const result = self.builder.buildCall(llvm_fn.globalGetValueType(), llvm_fn, ¶ms_cast, 2, .Fast, .Auto, ""); + + // No cast needed! + if (!needs_extend) return result; + + // Cast back to the original integer size + const llvm_trunc_ty = try self.dg.lowerType(ty); + return self.builder.buildTrunc(result, llvm_trunc_ty, ""); + } + + fn buildDepositBitsEmulated( + self: *FuncGen, + ty: Type, + params: [2]*llvm.Value, + ) !*llvm.Value { + _ = ty; + _ = params; + return self.dg.todo("implement deposit_bits emulation", .{}); + } + + fn airExtractBits(self: *FuncGen, inst: Air.Inst.Index) !?*llvm.Value { + if (self.liveness.isUnused(inst)) return null; + + const bin_op = self.air.instructions.items(.data)[inst].bin_op; + const lhs = try self.resolveInst(bin_op.lhs); + const rhs = try self.resolveInst(bin_op.rhs); + const inst_ty = self.air.typeOfIndex(inst); + + const target = self.dg.module.getTarget(); + const params = [2]*llvm.Value{ lhs, rhs }; + switch (target.cpu.arch) { + .x86, .x86_64 => |tag| blk: { + // Doesn't have pext + if (!std.Target.x86.featureSetHas(target.cpu.features, .bmi2)) break :blk; + + const bits = inst_ty.intInfo(target).bits; + const supports_64 = tag == .x86_64; + // Integer size doesn't match the available instruction(s) + if (!(bits <= 32 or (bits <= 64 and supports_64))) break :blk; + + return self.buildExtractBitsNative(inst_ty, params); + }, + else => {}, + } + + return self.buildExtractBitsEmulated(inst_ty, params); + } + + fn buildExtractBitsNative( + self: *FuncGen, + ty: Type, + params: [2]*llvm.Value, + ) !*llvm.Value { + const target = self.dg.module.getTarget(); + + assert(target.cpu.arch.isX86()); + 
assert(std.Target.x86.featureSetHas(target.cpu.features, .bmi2)); + + const bits = ty.intInfo(target).bits; + const intrinsic_name = switch (bits) { + 1...32 => "llvm.x86.bmi.pext.32", + 33...64 => "llvm.x86.bmi.pext.64", + else => unreachable, + }; + const needs_extend = bits != 32 and bits != 64; + + var params_cast = params; + + // Cast to either a 32 or 64-bit integer + if (needs_extend) { + const llvm_extend_ty = self.context.intType(if (bits <= 32) 32 else 64); + params_cast = .{ + self.builder.buildZExt(params[0], llvm_extend_ty, ""), + self.builder.buildZExt(params[1], llvm_extend_ty, ""), + }; + } + + const llvm_fn = self.getIntrinsic(intrinsic_name, &.{}); + const result = self.builder.buildCall(llvm_fn.globalGetValueType(), llvm_fn, ¶ms_cast, 2, .Fast, .Auto, ""); + + // No cast needed! + if (!needs_extend) return result; + + // Cast back to the original integer size + const llvm_trunc_ty = try self.dg.lowerType(ty); + return self.builder.buildTrunc(result, llvm_trunc_ty, ""); + } + + fn buildExtractBitsEmulated( + self: *FuncGen, + ty: Type, + params: [2]*llvm.Value, + ) !*llvm.Value { + _ = ty; + _ = params; + return self.dg.todo("implement extract_bits emulation", .{}); + } + fn getErrorNameTable(self: *FuncGen) Allocator.Error!Builder.Variable.Index { const o = self.dg.object; const mod = o.module; diff --git a/src/print_air.zig b/src/print_air.zig index 12e2825d4ef0..e1a8a4ceeeb7 100644 --- a/src/print_air.zig +++ b/src/print_air.zig @@ -162,6 +162,8 @@ const Writer = struct { .memcpy, .memset, .memset_safe, + .deposit_bits, + .extract_bits, => try w.writeBinOp(s, inst), .is_null, diff --git a/src/print_zir.zig b/src/print_zir.zig index dfe94d397097..311d1d1c2240 100644 --- a/src/print_zir.zig +++ b/src/print_zir.zig @@ -591,6 +591,8 @@ const Writer = struct { .wasm_memory_grow, .prefetch, .c_va_arg, + .deposit_bits, + .extract_bits, => { const inst_data = self.code.extraData(Zir.Inst.BinNode, extended.operand).data; const src = 
LazySrcLoc.nodeOffset(inst_data.node); From 2de5fccf5cf0d8d0ff74d8016d329e6be56fd825 Mon Sep 17 00:00:00 2001 From: ominitay <37453713+ominitay@users.noreply.github.com> Date: Thu, 13 Apr 2023 19:33:55 +0100 Subject: [PATCH 05/28] LLVM: Implement emulation for `@depositBits` --- src/codegen/llvm.zig | 75 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 72 insertions(+), 3 deletions(-) diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig index 8fad9c1d77da..8999ff5554bd 100644 --- a/src/codegen/llvm.zig +++ b/src/codegen/llvm.zig @@ -10366,14 +10366,83 @@ pub const FuncGen = struct { return self.builder.buildTrunc(result, llvm_trunc_ty, ""); } + // TODO Should this belong in compiler-rt? + // + // Implements @depositBits(source, mask) in software + // (i.e. without platform-specific instructions) + // + // var bb = 1; + // var result = 0; + // do { + // const bit = mask & -mask; + // mask &= ~bit; + // const source_bit = source & bb; + // if (source_bit) result |= bit; + // bb += bb; + // } while (mask) + // + // return result; fn buildDepositBitsEmulated( self: *FuncGen, ty: Type, params: [2]*llvm.Value, ) !*llvm.Value { - _ = ty; - _ = params; - return self.dg.todo("implement deposit_bits emulation", .{}); + const llvm_ty = try self.dg.lowerType(ty); + + const source = params[0]; + const mask_start = params[1]; + const zero = llvm_ty.constNull(); + const one = llvm_ty.constInt(1, .False); + const minus_one = llvm_ty.constInt(@bitCast(c_ulonglong, @as(c_longlong, -1)), .True); + + const prev_block = self.builder.getInsertBlock(); + const loop_block = self.context.appendBasicBlock(self.llvm_func, "Loop"); + const after_block = self.context.appendBasicBlock(self.llvm_func, "After"); + + _ = self.builder.buildBr(loop_block); + self.builder.positionBuilderAtEnd(loop_block); + const mask_phi = self.builder.buildPhi(llvm_ty, ""); + const result_phi = self.builder.buildPhi(llvm_ty, ""); + const bb_phi = self.builder.buildPhi(llvm_ty, ""); + const 
minus_mask = self.builder.buildSub(zero, mask_phi, ""); + const bit = self.builder.buildAnd(mask_phi, minus_mask, ""); + const not_bit = self.builder.buildXor(bit, minus_one, ""); + const new_mask = self.builder.buildAnd(mask_phi, not_bit, ""); + const source_bit = self.builder.buildAnd(source, bb_phi, ""); + const source_bit_set = self.builder.buildICmp(.NE, source_bit, zero, ""); + const bit_or_zero = self.builder.buildSelect(source_bit_set, bit, zero, ""); // avoid using control flow + const new_result = self.builder.buildOr(result_phi, bit_or_zero, ""); + const new_bb = self.builder.buildAdd(bb_phi, bb_phi, ""); + const while_cond = self.builder.buildICmp(.NE, new_mask, zero, ""); + _ = self.builder.buildCondBr(while_cond, loop_block, after_block); + + mask_phi.addIncoming( + &[2]*llvm.Value{ mask_start, new_mask }, + &[2]*llvm.BasicBlock{ prev_block, loop_block }, + 2, + ); + + result_phi.addIncoming( + &[2]*llvm.Value{ zero, new_result }, + &[2]*llvm.BasicBlock{ prev_block, loop_block }, + 2, + ); + + bb_phi.addIncoming( + &[2]*llvm.Value{ one, new_bb }, + &[2]*llvm.BasicBlock{ prev_block, loop_block }, + 2, + ); + + self.builder.positionBuilderAtEnd(after_block); + const final_result = self.builder.buildPhi(llvm_ty, ""); + final_result.addIncoming( + &[1]*llvm.Value{ new_result }, + &[1]*llvm.BasicBlock{ loop_block }, + 1, + ); + + return final_result; } fn airExtractBits(self: *FuncGen, inst: Air.Inst.Index) !?*llvm.Value { From a2850aabe696e731e26e4a445b817267a5cab80c Mon Sep 17 00:00:00 2001 From: ominitay <37453713+ominitay@users.noreply.github.com> Date: Thu, 13 Apr 2023 21:53:31 +0100 Subject: [PATCH 06/28] LLVM: Implement emulation for `@extractBits` --- src/codegen/llvm.zig | 77 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 74 insertions(+), 3 deletions(-) diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig index 8999ff5554bd..193ebd88c73b 100644 --- a/src/codegen/llvm.zig +++ b/src/codegen/llvm.zig @@ -10513,14 +10513,85 @@ pub 
const FuncGen = struct { return self.builder.buildTrunc(result, llvm_trunc_ty, ""); } + // TODO Should this belong in compiler-rt? + // + // Implements @extractBits(source, mask) in software + // (i.e. without platform-specific instructions) + // + // var bb = 1; + // var result = 0; + // do { + // const bit = mask & -mask; + // mask &= ~bit; + // const source_bit = source & bit; + // if (source_bit != 0) result |= bb; + // bb += bb; + // } while (mask) + // + // return result; fn buildExtractBitsEmulated( self: *FuncGen, ty: Type, params: [2]*llvm.Value, ) !*llvm.Value { - _ = ty; - _ = params; - return self.dg.todo("implement extract_bits emulation", .{}); + const llvm_ty = try self.dg.lowerType(ty); + + const zero = llvm_ty.constNull(); + const one = llvm_ty.constInt(1, .False); + const minus_one = llvm_ty.constInt(@bitCast(c_ulonglong, @as(c_longlong, -1)), .True); + const source = params[0]; + const start_mask = params[1]; + const start_result = zero; + const start_bb = one; + + const prev_block = self.builder.getInsertBlock(); + const loop_block = self.context.appendBasicBlock(self.llvm_func, "Loop"); + const after_block = self.context.appendBasicBlock(self.llvm_func, "After"); + + _ = self.builder.buildBr(loop_block); + self.builder.positionBuilderAtEnd(loop_block); + const mask_phi = self.builder.buildPhi(llvm_ty, ""); + const result_phi = self.builder.buildPhi(llvm_ty, ""); + const bb_phi = self.builder.buildPhi(llvm_ty, ""); + const minus_mask = self.builder.buildSub(zero, mask_phi, ""); + const bit = self.builder.buildAnd(mask_phi, minus_mask, ""); + const not_bit = self.builder.buildXor(bit, minus_one, ""); + const new_mask = self.builder.buildAnd(mask_phi, not_bit, ""); + const source_bit = self.builder.buildAnd(source, bit, ""); + const source_bit_set = self.builder.buildICmp(.NE, source_bit, zero, ""); + const bb_or_zero = self.builder.buildSelect(source_bit_set, bb_phi, zero, ""); // avoid using control flow + const new_result = 
self.builder.buildOr(result_phi, bb_or_zero, ""); + const new_bb = self.builder.buildAdd(bb_phi, bb_phi, ""); + const while_cond = self.builder.buildICmp(.NE, new_mask, zero, ""); + _ = self.builder.buildCondBr(while_cond, loop_block, after_block); + + mask_phi.addIncoming( + &[2]*llvm.Value{ start_mask, new_mask }, + &[2]*llvm.BasicBlock{ prev_block, loop_block }, + 2, + ); + + result_phi.addIncoming( + &[2]*llvm.Value{ start_result, new_result }, + &[2]*llvm.BasicBlock{ prev_block, loop_block }, + 2, + ); + + bb_phi.addIncoming( + &[2]*llvm.Value{ start_bb, new_bb }, + &[2]*llvm.BasicBlock{ prev_block, loop_block }, + 2, + ); + + self.builder.positionBuilderAtEnd(after_block); + const final_result = self.builder.buildPhi(llvm_ty, ""); + final_result.addIncoming( + &[1]*llvm.Value{ new_result }, + &[1]*llvm.BasicBlock{ loop_block }, + 1, + ); + + return final_result; } fn getErrorNameTable(self: *FuncGen) Allocator.Error!Builder.Variable.Index { From 9760841abdf658c3a5567995a7a0c3a579615254 Mon Sep 17 00:00:00 2001 From: ominitay <37453713+ominitay@users.noreply.github.com> Date: Fri, 14 Apr 2023 12:45:18 +0100 Subject: [PATCH 07/28] std.math.big.int: Fix index out-of-bounds --- lib/std/math/big/int.zig | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lib/std/math/big/int.zig b/lib/std/math/big/int.zig index f896d75ee856..ee8d50d2df1f 100644 --- a/lib/std/math/big/int.zig +++ b/lib/std/math/big/int.zig @@ -1764,6 +1764,9 @@ pub const Mutable = struct { const i_limb_bit = @intCast(u6, i % limb_bits); mut_mask.limbs[mask_limb_index] &= ~(@as(Limb, 1) << mask_limb_bit); // Unset the mask bit + + if (i_limb_index >= source.limbs.len) break; // Stop when we reach the end of `source` (we can treat the rest as zeroes) + const source_bit_set = source.limbs[i_limb_index] & (@as(Limb, 1) << i_limb_bit) != 0; r.limbs[mask_limb_index] |= @as(Limb, @boolToInt(source_bit_set)) << mask_limb_bit; @@ -1801,6 +1804,9 @@ pub const Mutable = struct { const i_limb_bit = 
@intCast(u6, i % limb_bits); mut_mask.limbs[mask_limb_index] &= ~(@as(Limb, 1) << mask_limb_bit); // Unset the mask bit + + if (mask_limb_index >= source.limbs.len) break; // Stop when we reach the end of `source` (we can treat the rest as zeroes) + const source_bit_set = source.limbs[mask_limb_index] & (@as(Limb, 1) << mask_limb_bit) != 0; r.limbs[i_limb_index] |= @as(Limb, @boolToInt(source_bit_set)) << i_limb_bit; From 566a88851f98cfb35ed605482b79732c5e2af1f8 Mon Sep 17 00:00:00 2001 From: ominitay <37453713+ominitay@users.noreply.github.com> Date: Fri, 14 Apr 2023 12:45:37 +0100 Subject: [PATCH 08/28] Add behaviour tests for `@depositBits` and `@extractBits` --- test/behavior/deposit_extract_bits.zig | 58 ++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 test/behavior/deposit_extract_bits.zig diff --git a/test/behavior/deposit_extract_bits.zig b/test/behavior/deposit_extract_bits.zig new file mode 100644 index 000000000000..9f2bafe22560 --- /dev/null +++ b/test/behavior/deposit_extract_bits.zig @@ -0,0 +1,58 @@ +const std = @import("std"); +const builtin = @import("builtin"); +const mem = std.mem; +const expect = std.testing.expect; +const expectEqual = std.testing.expectEqual; + +test "@depositBits" { + if (builtin.zig_backend != .stage2_llvm) return error.SkipZigTest; // TODO + + const S = struct { + pub fn doTheTest() !void { + var a: u64 = 0; + var b: u64 = 0xFFFF_FFFF_FFFF_FFFF; + var c: u64 = 0x1234_5678_9012_3456; + var d: u64 = 0x00F0_FF00_F00F_00FF; + var e: u128 = @as(u128, d) << 64; + + try expect(@depositBits(b, a) == 0); + try expect(@depositBits(a, b) == 0); + + try expect(@depositBits(b, c) == c); + try expect(@depositBits(b, d) == d); + + try expect(@depositBits(c, d) == 0x0000_1200_3004_0056); + try expect(@depositBits(c, e) == 0x0000_1200_3004_0056 << 64); + } + }; + + try S.doTheTest(); + try comptime S.doTheTest(); +} + +test "@extractBits" { + if (builtin.zig_backend != .stage2_llvm) return error.SkipZigTest; // 
TODO + + const S = struct { + pub fn doTheTest() !void { + var a: u64 = 0; + var b: u64 = 0xFFFF_FFFF_FFFF_FFFF; + var c: u64 = 0x1234_5678_9012_3456; + var d: u64 = 0x00F0_FF00_F00F_00FF; + var e: u128 = @as(u128, c) << 64; + var f: u128 = @as(u128, d) << 64; + + try expect(@extractBits(b, a) == 0); + try expect(@extractBits(a, b) == 0); + + try expect(@extractBits(c, b) == c); + try expect(@extractBits(d, b) == d); + + try expect(@extractBits(c, d) == 0x0356_9256); + try expect(@extractBits(e, f) == 0x0356_9256); + } + }; + + try S.doTheTest(); + try comptime S.doTheTest(); +} From db280cef7d660713b7ec06b5c8267864e170449e Mon Sep 17 00:00:00 2001 From: ominitay <37453713+ominitay@users.noreply.github.com> Date: Sat, 15 Apr 2023 12:00:06 +0100 Subject: [PATCH 09/28] zig fmt --- src/codegen/llvm.zig | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig index 193ebd88c73b..4dbadbcc45d3 100644 --- a/src/codegen/llvm.zig +++ b/src/codegen/llvm.zig @@ -10437,8 +10437,8 @@ pub const FuncGen = struct { self.builder.positionBuilderAtEnd(after_block); const final_result = self.builder.buildPhi(llvm_ty, ""); final_result.addIncoming( - &[1]*llvm.Value{ new_result }, - &[1]*llvm.BasicBlock{ loop_block }, + &[1]*llvm.Value{new_result}, + &[1]*llvm.BasicBlock{loop_block}, 1, ); @@ -10586,8 +10586,8 @@ pub const FuncGen = struct { self.builder.positionBuilderAtEnd(after_block); const final_result = self.builder.buildPhi(llvm_ty, ""); final_result.addIncoming( - &[1]*llvm.Value{ new_result }, - &[1]*llvm.BasicBlock{ loop_block }, + &[1]*llvm.Value{new_result}, + &[1]*llvm.BasicBlock{loop_block}, 1, ); From 9020b2f9c706510f87760e36f4378742b0cfd06a Mon Sep 17 00:00:00 2001 From: ominitay <37453713+ominitay@users.noreply.github.com> Date: Sat, 15 Apr 2023 12:02:06 +0100 Subject: [PATCH 10/28] Replace `u6` with `Log2Limb` --- lib/std/math/big/int.zig | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff 
--git a/lib/std/math/big/int.zig b/lib/std/math/big/int.zig index ee8d50d2df1f..9d6ca2653ac9 100644 --- a/lib/std/math/big/int.zig +++ b/lib/std/math/big/int.zig @@ -1758,10 +1758,10 @@ pub const Mutable = struct { i += 1; }) { const mask_limb_index = mask_bit_index / limb_bits; - const mask_limb_bit = @intCast(u6, mask_bit_index % limb_bits); + const mask_limb_bit = @intCast(Log2Limb, mask_bit_index % limb_bits); const i_limb_index = i / limb_bits; - const i_limb_bit = @intCast(u6, i % limb_bits); + const i_limb_bit = @intCast(Log2Limb, i % limb_bits); mut_mask.limbs[mask_limb_index] &= ~(@as(Limb, 1) << mask_limb_bit); // Unset the mask bit @@ -1798,10 +1798,10 @@ pub const Mutable = struct { i += 1; }) { const mask_limb_index = mask_bit_index / limb_bits; - const mask_limb_bit = @intCast(u6, mask_bit_index % limb_bits); + const mask_limb_bit = @intCast(Log2Limb, mask_bit_index % limb_bits); const i_limb_index = i / limb_bits; - const i_limb_bit = @intCast(u6, i % limb_bits); + const i_limb_bit = @intCast(Log2Limb, i % limb_bits); mut_mask.limbs[mask_limb_index] &= ~(@as(Limb, 1) << mask_limb_bit); // Unset the mask bit From eecdf99fe9de8d95ccbe206d557eaf92e8a0ccfe Mon Sep 17 00:00:00 2001 From: ominitay <37453713+ominitay@users.noreply.github.com> Date: Mon, 17 Apr 2023 16:14:56 +0100 Subject: [PATCH 11/28] big.int.depositBits/extractBits: Remove limbs_buffer Removes the requirement to copy and modify `mask`, removing the need to clone `mask` into a `Mutable` bigint. 
--- lib/std/math/big/int.zig | 76 ++++++++++++++++++++--------------- lib/std/math/big/int_test.zig | 10 +---- src/Sema.zig | 16 +------- 3 files changed, 48 insertions(+), 54 deletions(-) diff --git a/lib/std/math/big/int.zig b/lib/std/math/big/int.zig index 9d6ca2653ac9..c1bcc9248ba0 100644 --- a/lib/std/math/big/int.zig +++ b/lib/std/math/big/int.zig @@ -1739,31 +1739,37 @@ pub const Mutable = struct { /// r = @depositBits(source, mask) /// /// Asserts that `source` and `mask` are positive - /// - /// `limbs_buffer` is used as a working area. It must have length of at least `mask.limbs.len`. - pub fn depositBits(r: *Mutable, source: Const, mask: Const, limbs_buffer: []Limb) void { + pub fn depositBits(r: *Mutable, source: Const, mask: Const) void { assert(source.positive); assert(mask.positive); r.positive = true; std.mem.set(Limb, r.limbs, 0); - var mut_mask = Mutable{ .limbs = limbs_buffer[0..mask.limbs.len], .positive = undefined, .len = undefined }; - mut_mask.copy(mask); - - var mask_bit_index = mut_mask.toConst().ctz(); + var mask_limb: Limb = mask.limbs[0]; + var mask_limb_index: Limb = 0; var i: usize = 0; - while (!mut_mask.eqZero()) : ({ - mask_bit_index = mut_mask.toConst().ctz(); - i += 1; - }) { - const mask_limb_index = mask_bit_index / limb_bits; - const mask_limb_bit = @intCast(Log2Limb, mask_bit_index % limb_bits); + outer: while (true) : (i += 1) { + // Find next bit in mask + const mask_limb_bit: Log2Limb = limb_bit: while (true) { + const mask_limb_tz = @ctz(mask_limb); + if (mask_limb_tz != @sizeOf(Limb) * 8) { + const cast_limb_bit = @intCast(Log2Limb, mask_limb_tz); + mask_limb ^= @as(Limb, 1) << cast_limb_bit; + break :limb_bit cast_limb_bit; + } - const i_limb_index = i / limb_bits; - const i_limb_bit = @intCast(Log2Limb, i % limb_bits); + mask_limb_index += 1; + // No more limbs, we've finished iterating the mask + if (mask_limb_index >= mask.limbs.len) { + break :outer; + } + + mask_limb = mask.limbs[mask_limb_index]; + }; - 
mut_mask.limbs[mask_limb_index] &= ~(@as(Limb, 1) << mask_limb_bit); // Unset the mask bit + const i_limb_index = i / limb_bits; + const i_limb_bit = @truncate(Log2Limb, i); if (i_limb_index >= source.limbs.len) break; // Stop when we reach the end of `source` (we can treat the rest as zeroes) @@ -1779,31 +1785,37 @@ pub const Mutable = struct { /// r = @extractBits(source, mask) /// /// Asserts that `source` and `mask` are positive - /// - /// `limbs_buffer` is used as a working area. It must have length of at least `mask.limbs.len`. - pub fn extractBits(r: *Mutable, source: Const, mask: Const, limbs_buffer: []Limb) void { + pub fn extractBits(r: *Mutable, source: Const, mask: Const) void { assert(source.positive); assert(mask.positive); r.positive = true; std.mem.set(Limb, r.limbs, 0); - var mut_mask = Mutable{ .limbs = limbs_buffer[0..mask.limbs.len], .positive = undefined, .len = undefined }; - mut_mask.copy(mask); - - var mask_bit_index = mut_mask.toConst().ctz(); + var mask_limb: Limb = mask.limbs[0]; + var mask_limb_index: Limb = 0; var i: usize = 0; - while (!mut_mask.eqZero()) : ({ - mask_bit_index = mut_mask.toConst().ctz(); - i += 1; - }) { - const mask_limb_index = mask_bit_index / limb_bits; - const mask_limb_bit = @intCast(Log2Limb, mask_bit_index % limb_bits); + outer: while (true) : (i += 1) { + // Find next bit in mask + const mask_limb_bit: Log2Limb = limb_bit: while (true) { + const mask_limb_tz = @ctz(mask_limb); + if (mask_limb_tz != @sizeOf(Limb) * 8) { + const cast_limb_bit = @intCast(Log2Limb, mask_limb_tz); + mask_limb ^= @as(Limb, 1) << cast_limb_bit; + break :limb_bit cast_limb_bit; + } - const i_limb_index = i / limb_bits; - const i_limb_bit = @intCast(Log2Limb, i % limb_bits); + mask_limb_index += 1; + // No more limbs, we've finished iterating the mask + if (mask_limb_index >= mask.limbs.len) { + break :outer; + } + + mask_limb = mask.limbs[mask_limb_index]; + }; - mut_mask.limbs[mask_limb_index] &= ~(@as(Limb, 1) << mask_limb_bit); // 
Unset the mask bit + const i_limb_index = i / limb_bits; + const i_limb_bit = @truncate(Log2Limb, i); if (mask_limb_index >= source.limbs.len) break; // Stop when we reach the end of `source` (we can treat the rest as zeroes) diff --git a/lib/std/math/big/int_test.zig b/lib/std/math/big/int_test.zig index 90c1cf719de2..e7e5a956666a 100644 --- a/lib/std/math/big/int_test.zig +++ b/lib/std/math/big/int_test.zig @@ -2819,10 +2819,7 @@ fn extractBitsTest(comptime source: comptime_int, comptime mask: comptime_int, c defer testing.allocator.free(limbs); var result = Mutable{ .limbs = limbs, .positive = undefined, .len = undefined }; - const limbs_buffer = try testing.allocator.alloc(Limb, mask_bigint.limbs.len); - defer testing.allocator.free(limbs_buffer); - - result.extractBits(source_bigint.toConst(), mask_bigint.toConst(), limbs_buffer); + result.extractBits(source_bigint.toConst(), mask_bigint.toConst()); try testing.expectEqual(std.math.Order.eq, result.toConst().orderAgainstScalar(expected)); } @@ -2846,10 +2843,7 @@ fn depositBitsTest(comptime source: comptime_int, comptime mask: comptime_int, c defer testing.allocator.free(limbs); var result = Mutable{ .limbs = limbs, .positive = undefined, .len = undefined }; - const limbs_buffer = try testing.allocator.alloc(Limb, mask_bigint.limbs.len); - defer testing.allocator.free(limbs_buffer); - - result.depositBits(source_bigint.toConst(), mask_bigint.toConst(), limbs_buffer); + result.depositBits(source_bigint.toConst(), mask_bigint.toConst()); try testing.expectEqual(std.math.Order.eq, result.toConst().orderAgainstScalar(expected)); } diff --git a/src/Sema.zig b/src/Sema.zig index c42343161de4..2fec3ddd2a3e 100644 --- a/src/Sema.zig +++ b/src/Sema.zig @@ -39106,12 +39106,6 @@ fn intDepositBits( ); defer arena.free(mask_limbs); - const limbs_buffer = try arena.alloc( - std.math.big.Limb, - rhs_bigint.limbs.len, - ); - defer arena.free(limbs_buffer); - var source = std.math.big.int.Mutable{ .limbs = source_limbs, 
.positive = undefined, .len = undefined }; var mask = std.math.big.int.Mutable{ .limbs = mask_limbs, .positive = undefined, .len = undefined }; var result = std.math.big.int.Mutable{ .limbs = result_limbs, .positive = undefined, .len = undefined }; @@ -39119,7 +39113,7 @@ fn intDepositBits( source.convertToTwosComplement(lhs_bigint, info.signedness, info.bits); mask.convertToTwosComplement(rhs_bigint, info.signedness, info.bits); - result.depositBits(source.toConst(), mask.toConst(), limbs_buffer); + result.depositBits(source.toConst(), mask.toConst()); result.convertFromTwosComplement(result.toConst(), info.signedness, info.bits); return Value.fromBigInt(arena, result.toConst()); @@ -39159,12 +39153,6 @@ fn intExtractBits( ); defer arena.free(mask_limbs); - const limbs_buffer = try arena.alloc( - std.math.big.Limb, - rhs_bigint.limbs.len, - ); - defer arena.free(limbs_buffer); - var source = std.math.big.int.Mutable{ .limbs = source_limbs, .positive = undefined, .len = undefined }; var mask = std.math.big.int.Mutable{ .limbs = mask_limbs, .positive = undefined, .len = undefined }; var result = std.math.big.int.Mutable{ .limbs = result_limbs, .positive = undefined, .len = undefined }; @@ -39172,7 +39160,7 @@ fn intExtractBits( source.convertToTwosComplement(lhs_bigint, info.signedness, info.bits); mask.convertToTwosComplement(rhs_bigint, info.signedness, info.bits); - result.extractBits(source.toConst(), mask.toConst(), limbs_buffer); + result.extractBits(source.toConst(), mask.toConst()); result.convertFromTwosComplement(result.toConst(), info.signedness, info.bits); return Value.fromBigInt(arena, result.toConst()); From 13d4205f2299ad2d13073e7271cc85cfbabf5482 Mon Sep 17 00:00:00 2001 From: ominitay <37453713+ominitay@users.noreply.github.com> Date: Tue, 18 Apr 2023 14:52:46 +0100 Subject: [PATCH 12/28] Disallow signed integer types for deposit/extract --- lib/std/math/big/int.zig | 34 ----------------- lib/std/math/big/int_test.zig | 27 -------------- 
src/Sema.zig | 70 ++++++++--------------------------- 3 files changed, 16 insertions(+), 115 deletions(-) diff --git a/lib/std/math/big/int.zig b/lib/std/math/big/int.zig index c1bcc9248ba0..520048d5b7c1 100644 --- a/lib/std/math/big/int.zig +++ b/lib/std/math/big/int.zig @@ -1862,40 +1862,6 @@ pub const Mutable = struct { r.normalize(r.len); } - /// Converts a twos-complement value to a magnitude, and sets the sign of `r` to match. - /// `a.positive` is ignored - /// r may alias a - /// - /// Asserts `r` has enough storage to store the result. - /// The upper bound is `calcTwosCompLimbCount(bit_count)` - pub fn convertFromTwosComplement(r: *Mutable, a: Const, signedness: Signedness, bit_count: usize) void { - const req_limbs = calcTwosCompLimbCount(bit_count); - if (req_limbs == 0 or a.eqZero()) { - r.set(0); - return; - } - - const bit = @truncate(Log2Limb, bit_count - 1); - const signmask = @as(Limb, 1) << bit; - const mask = (signmask << 1) -% 1; - - if (signedness == .unsigned or req_limbs > a.limbs.len or a.limbs[req_limbs - 1] & signmask == 0) { - r.truncate(a, signedness, bit_count); - return; - } - - r.copy(a); - assert(r.limbs.len >= req_limbs); - r.len = req_limbs; - - r.addScalar(r.toConst(), -1); - llnot(r.limbs[0..r.len]); - r.limbs[r.len - 1] &= mask; - - r.positive = false; - r.normalize(r.len); - } - /// Truncate an integer to a number of bits, following 2s-complement semantics. /// r may alias a. 
/// diff --git a/lib/std/math/big/int_test.zig b/lib/std/math/big/int_test.zig index e7e5a956666a..c9dabaa31e30 100644 --- a/lib/std/math/big/int_test.zig +++ b/lib/std/math/big/int_test.zig @@ -2848,33 +2848,6 @@ fn depositBitsTest(comptime source: comptime_int, comptime mask: comptime_int, c try testing.expectEqual(std.math.Order.eq, result.toConst().orderAgainstScalar(expected)); } -test "big int conversion to/from twos complement" { - var a = try Managed.initSet(testing.allocator, maxInt(u64)); - defer a.deinit(); - var b = try Managed.initSet(testing.allocator, maxInt(u32)); - defer b.deinit(); - var c = try Managed.initSet(testing.allocator, maxInt(u493)); - defer c.deinit(); - - var m_a = a.toMutable(); - m_a.convertToTwosComplement(m_a.toConst(), .unsigned, 64); - try testing.expectEqual(m_a.toConst().orderAgainstScalar(maxInt(u64)), .eq); - m_a.convertFromTwosComplement(m_a.toConst(), .signed, 64); - try testing.expectEqual(m_a.toConst().orderAgainstScalar(-1), .eq); - - var m_b = b.toMutable(); - m_b.convertToTwosComplement(m_b.toConst(), .unsigned, 32); - try testing.expectEqual(m_b.toConst().orderAgainstScalar(maxInt(u32)), .eq); - m_b.convertFromTwosComplement(m_b.toConst(), .signed, 32); - try testing.expectEqual(m_b.toConst().orderAgainstScalar(-1), .eq); - - var m_c = c.toMutable(); - m_c.convertToTwosComplement(m_c.toConst(), .unsigned, 493); - try testing.expectEqual(m_c.toConst().orderAgainstScalar(maxInt(u493)), .eq); - m_c.convertFromTwosComplement(m_c.toConst(), .signed, 493); - try testing.expectEqual(m_c.toConst().orderAgainstScalar(-1), .eq); -} - test "big int conversion read/write twos complement" { var a = try Managed.initSet(testing.allocator, (1 << 493) - 1); defer a.deinit(); diff --git a/src/Sema.zig b/src/Sema.zig index 2fec3ddd2a3e..8b1e42ac319e 100644 --- a/src/Sema.zig +++ b/src/Sema.zig @@ -26410,12 +26410,12 @@ fn zirDepositExtractBits( const lhs_ty = sema.typeOf(uncasted_lhs); const rhs_ty = sema.typeOf(uncasted_rhs); - if 
(lhs_ty.zigTypeTag() != .Int) { - return sema.fail(block, lhs_src, "expected integer type, found '{}'", .{lhs_ty.fmt(sema.mod)}); + if (!lhs_ty.isUnsignedInt()) { + return sema.fail(block, lhs_src, "expected unsigned integer type, found '{}'", .{lhs_ty.fmt(sema.mod)}); } - if (rhs_ty.zigTypeTag() != .Int) { - return sema.fail(block, rhs_src, "expected integer type, found '{}'", .{rhs_ty.fmt(sema.mod)}); + if (!rhs_ty.isUnsignedInt()) { + return sema.fail(block, rhs_src, "expected unsigned integer type, found '{}'", .{rhs_ty.fmt(sema.mod)}); } const instructions = &[_]Air.Inst.Ref{ uncasted_lhs, uncasted_rhs }; @@ -26434,16 +26434,12 @@ fn zirDepositExtractBits( // If either of the operands are zero, the result is zero // If either of the operands are undefined, the result is undefined if (maybe_lhs_val) |lhs_val| { + if (try lhs_val.compareAllWithZeroAdvanced(.eq, sema)) return sema.addConstant(dest_ty, Value.zero); if (lhs_val.isUndef()) return sema.addConstUndef(dest_ty); - if (try lhs_val.compareAllWithZeroAdvanced(.eq, sema)) { - return sema.addConstant(dest_ty, Value.zero); - } } if (maybe_rhs_val) |rhs_val| { + if (try rhs_val.compareAllWithZeroAdvanced(.eq, sema)) return sema.addConstant(dest_ty, Value.zero); if (rhs_val.isUndef()) return sema.addConstUndef(dest_ty); - if (try rhs_val.compareAllWithZeroAdvanced(.eq, sema)) { - return sema.addConstant(dest_ty, Value.zero); - } } if (maybe_lhs_val) |lhs_val| { @@ -39084,38 +39080,21 @@ fn intDepositBits( const arena = sema.arena; const info = ty.intInfo(target); + assert(ty.intInfo(target).signedness == .unsigned); + var lhs_space: Value.BigIntSpace = undefined; var rhs_space: Value.BigIntSpace = undefined; - const lhs_bigint = lhs.toBigInt(&lhs_space, target); - const rhs_bigint = rhs.toBigInt(&rhs_space, target); + const source = lhs.toBigInt(&lhs_space, target); + const mask = rhs.toBigInt(&rhs_space, target); const result_limbs = try arena.alloc( std.math.big.Limb, 
std.math.big.int.calcTwosCompLimbCount(info.bits), ); - const source_limbs = try arena.alloc( - std.math.big.Limb, - std.math.big.int.calcTwosCompLimbCount(info.bits), - ); - defer arena.free(source_limbs); - - const mask_limbs = try arena.alloc( - std.math.big.Limb, - std.math.big.int.calcTwosCompLimbCount(info.bits), - ); - defer arena.free(mask_limbs); - - var source = std.math.big.int.Mutable{ .limbs = source_limbs, .positive = undefined, .len = undefined }; - var mask = std.math.big.int.Mutable{ .limbs = mask_limbs, .positive = undefined, .len = undefined }; var result = std.math.big.int.Mutable{ .limbs = result_limbs, .positive = undefined, .len = undefined }; - source.convertToTwosComplement(lhs_bigint, info.signedness, info.bits); - mask.convertToTwosComplement(rhs_bigint, info.signedness, info.bits); - - result.depositBits(source.toConst(), mask.toConst()); - - result.convertFromTwosComplement(result.toConst(), info.signedness, info.bits); + result.depositBits(source, mask); return Value.fromBigInt(arena, result.toConst()); } @@ -39131,38 +39110,21 @@ fn intExtractBits( const arena = sema.arena; const info = ty.intInfo(target); + assert(ty.intInfo(target).signedness == .unsigned); + var lhs_space: Value.BigIntSpace = undefined; var rhs_space: Value.BigIntSpace = undefined; - const lhs_bigint = lhs.toBigInt(&lhs_space, target); - const rhs_bigint = rhs.toBigInt(&rhs_space, target); + const source = lhs.toBigInt(&lhs_space, target); + const mask = rhs.toBigInt(&rhs_space, target); const result_limbs = try arena.alloc( std.math.big.Limb, std.math.big.int.calcTwosCompLimbCount(info.bits), ); - const source_limbs = try arena.alloc( - std.math.big.Limb, - std.math.big.int.calcTwosCompLimbCount(info.bits), - ); - defer arena.free(source_limbs); - - const mask_limbs = try arena.alloc( - std.math.big.Limb, - std.math.big.int.calcTwosCompLimbCount(info.bits), - ); - defer arena.free(mask_limbs); - - var source = std.math.big.int.Mutable{ .limbs = source_limbs, 
.positive = undefined, .len = undefined }; - var mask = std.math.big.int.Mutable{ .limbs = mask_limbs, .positive = undefined, .len = undefined }; var result = std.math.big.int.Mutable{ .limbs = result_limbs, .positive = undefined, .len = undefined }; - source.convertToTwosComplement(lhs_bigint, info.signedness, info.bits); - mask.convertToTwosComplement(rhs_bigint, info.signedness, info.bits); - - result.extractBits(source.toConst(), mask.toConst()); - - result.convertFromTwosComplement(result.toConst(), info.signedness, info.bits); + result.extractBits(source, mask); return Value.fromBigInt(arena, result.toConst()); } From 313d258f365bee477cd5dafd93a744fad4eb3a80 Mon Sep 17 00:00:00 2001 From: ominitay <37453713+ominitay@users.noreply.github.com> Date: Wed, 19 Apr 2023 11:18:19 +0100 Subject: [PATCH 13/28] Actually use deposit/extract behaviour test --- test/behavior.zig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/behavior.zig b/test/behavior.zig index 3081f6c9f969..d131b498e9b0 100644 --- a/test/behavior.zig +++ b/test/behavior.zig @@ -21,9 +21,10 @@ test { _ = @import("behavior/comptime_memory.zig"); _ = @import("behavior/const_slice_child.zig"); _ = @import("behavior/decltest.zig"); - _ = @import("behavior/duplicated_test_names.zig"); _ = @import("behavior/defer.zig"); + _ = @import("behavior/deposit_extract_bits.zig"); _ = @import("behavior/destructure.zig"); + _ = @import("behavior/duplicated_test_names.zig"); _ = @import("behavior/empty_tuple_fields.zig"); _ = @import("behavior/empty_union.zig"); _ = @import("behavior/enum.zig"); From 5a42ecbe628da07e5048c780242f8e5f6b84ecdc Mon Sep 17 00:00:00 2001 From: ominitay <37453713+ominitay@users.noreply.github.com> Date: Wed, 19 Apr 2023 11:40:21 +0100 Subject: [PATCH 14/28] Enable langref tests for deposit and extract --- doc/langref.html.in | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/doc/langref.html.in b/doc/langref.html.in index 
e9b14c66200b..3728b9a71637 100644 --- a/doc/langref.html.in +++ b/doc/langref.html.in @@ -8561,7 +8561,7 @@ test "main" { {#header_open|@depositBits#}
{#syntax#}@depositBits(source: T, mask: T) T{#endsyntax#}
- {#syntax#}@TypeOf(source){#endsyntax#} must be an integer type. + {#syntax#}T{#endsyntax#} must be an unsigned integer type, or a {#syntax#}comptime_int{#endsyntax#} (for which both parameters must be positive). {#syntax#}T{#endsyntax#} is determined by peer-type resolution.
Uses a mask to transfer contiguous lower bits in the {#syntax#}source{#endsyntax#} operand to the destination, transferring them to the corresponding bits in the destination that are set in the mask. All other bits in the destination are zeroed. @@ -8573,8 +8573,7 @@ test "main" { Example:
- - {#syntax_block|zig|@depositBits test#} + {#code_begin|test|test_depositbits_builtin#} const std = @import("std"); test "deposit bits" { @@ -8582,7 +8581,7 @@ test "deposit bits" { try std.testing.expectEqual(@depositBits(0x00001234, 0xf0f0f0f0), 0x10203040); } } - {#end_syntax_block#} + {#code_end#} {#see_also|@extractBits#} {#header_close#} @@ -8757,7 +8756,7 @@ export fn @"A function name that is a complete sentence."() void {} {#header_open|@extractBits#}{#syntax#}@extractBits(source: T, mask: T) T{#endsyntax#}
- {#syntax#}T{#endsyntax#} must be an integer type. + {#syntax#}T{#endsyntax#} must be an unsigned integer type, or a {#syntax#}comptime_int{#endsyntax#} (for which both parameters must be positive). {#syntax#}T{#endsyntax#} is determined by peer-type resolution.
Uses a mask to transfer bits in the {#syntax#}source{#endsyntax#} operand to the destination, writing them as contiguous lower bits in the destination. The upper bits of the destination are zeroed. @@ -8769,8 +8768,7 @@ export fn @"A function name that is a complete sentence."() void {} Example:
- - {#syntax_block|zig|@extractBits test#} + {#code_begin|test|test_extractbits_builtin#} const std = @import("std"); test "extract bits" { @@ -8778,7 +8776,7 @@ export fn @"A function name that is a complete sentence."() void {} try std.testing.expectEqual(@extractBits(0x12345678, 0xf0f0f0f0), 0x00001357); } } - {#end_syntax_block#} + {#code_end#} {#see_also|@depositBits#} {#header_close#} From fc8eadb5f9f15e8786885d2af15683d259d02579 Mon Sep 17 00:00:00 2001 From: ominitay <37453713+ominitay@users.noreply.github.com> Date: Wed, 19 Apr 2023 19:16:56 +0100 Subject: [PATCH 15/28] Allow use of `comptime_int` with deposit/extract --- src/Sema.zig | 52 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 17 deletions(-) diff --git a/src/Sema.zig b/src/Sema.zig index 8b1e42ac319e..4b95cd15b6a4 100644 --- a/src/Sema.zig +++ b/src/Sema.zig @@ -26398,6 +26398,7 @@ fn zirDepositExtractBits( extended: Zir.Inst.Extended.InstData, air_tag: Air.Inst.Tag, ) CompileError!Air.Inst.Ref { + const target = sema.mod.getTarget(); const extra = sema.code.extraData(Zir.Inst.BinNode, extended.operand).data; const src = LazySrcLoc.nodeOffset(extra.node); @@ -26410,12 +26411,12 @@ fn zirDepositExtractBits( const lhs_ty = sema.typeOf(uncasted_lhs); const rhs_ty = sema.typeOf(uncasted_rhs); - if (!lhs_ty.isUnsignedInt()) { - return sema.fail(block, lhs_src, "expected unsigned integer type, found '{}'", .{lhs_ty.fmt(sema.mod)}); + if (!lhs_ty.isUnsignedInt() and lhs_ty.zigTypeTag() != .ComptimeInt) { + return sema.fail(block, lhs_src, "expected unsigned integer or 'comptime_int', found '{}'", .{lhs_ty.fmt(sema.mod)}); } - if (!rhs_ty.isUnsignedInt()) { - return sema.fail(block, rhs_src, "expected unsigned integer type, found '{}'", .{rhs_ty.fmt(sema.mod)}); + if (!rhs_ty.isUnsignedInt() and rhs_ty.zigTypeTag() != .ComptimeInt) { + return sema.fail(block, rhs_src, "expected unsigned integer or 'comptime_int', found '{}'", .{rhs_ty.fmt(sema.mod)}); } const instructions = &[_]Air.Inst.Ref{ uncasted_lhs,
uncasted_rhs }; @@ -26423,7 +26424,30 @@ fn zirDepositExtractBits( .override = &[_]?LazySrcLoc{ lhs_src, rhs_src }, }); - assert(dest_ty.zigTypeTag() == .Int); + // This branch is only true if *both* parameters are comptime_ints. + if (dest_ty.zigTypeTag() == .ComptimeInt) { + const builtin_name = switch (air_tag) { + .deposit_bits => "@depositBits", + .extract_bits => "@extractBits", + else => unreachable, + }; + + const lhs_val = (try sema.resolveMaybeUndefVal(uncasted_lhs)).?; + if (lhs_val.compareHetero(.lt, Value.zero, target)) { + const err = try sema.errMsg(block, lhs_src, "use of negative value '{}'", .{lhs_val.fmtValue(lhs_ty, sema.mod)}); + try sema.errNote(block, src, err, "{s} requires parameters of type 'comptime_int' be positive", .{builtin_name}); + return sema.failWithOwnedErrorMsg(err); + } + + const rhs_val = (try sema.resolveMaybeUndefVal(uncasted_rhs)).?; + if (rhs_val.compareHetero(.lt, Value.zero, target)) { + const err = try sema.errMsg(block, rhs_src, "use of negative value '{}'", .{rhs_val.fmtValue(rhs_ty, sema.mod)}); + try sema.errNote(block, src, err, "{s} requires parameters of type 'comptime_int' be positive", .{builtin_name}); + return sema.failWithOwnedErrorMsg(err); + } + } + + assert(dest_ty.isUnsignedInt() or dest_ty.zigTypeTag() == .ComptimeInt); const lhs = try sema.coerce(block, dest_ty, uncasted_lhs, lhs_src); const rhs = try sema.coerce(block, dest_ty, uncasted_rhs, rhs_src); @@ -26445,8 +26469,8 @@ fn zirDepositExtractBits( if (maybe_lhs_val) |lhs_val| { if (maybe_rhs_val) |rhs_val| { const dest_val = switch (air_tag) { - .deposit_bits => try sema.intDepositBits(lhs_val, rhs_val, dest_ty), - .extract_bits => try sema.intExtractBits(lhs_val, rhs_val, dest_ty), + .deposit_bits => try sema.intDepositBits(lhs_val, rhs_val), + .extract_bits => try sema.intExtractBits(lhs_val, rhs_val), else => unreachable, }; @@ -39068,19 +39092,16 @@ fn intAddWithOverflowScalar( }; } +/// Asserts that the values are positive fn intDepositBits( 
sema: *Sema, lhs: Value, rhs: Value, - ty: Type, ) !Value { // TODO is this a performance issue? maybe we should try the operation without // resorting to BigInt first. For non-bigints, @intDeposit could be used? const target = sema.mod.getTarget(); const arena = sema.arena; - const info = ty.intInfo(target); - - assert(ty.intInfo(target).signedness == .unsigned); var lhs_space: Value.BigIntSpace = undefined; var rhs_space: Value.BigIntSpace = undefined; @@ -39089,7 +39110,7 @@ fn intDepositBits( const result_limbs = try arena.alloc( std.math.big.Limb, - std.math.big.int.calcTwosCompLimbCount(info.bits), + mask.limbs.len, ); var result = std.math.big.int.Mutable{ .limbs = result_limbs, .positive = undefined, .len = undefined }; @@ -39098,19 +39119,16 @@ fn intDepositBits( return Value.fromBigInt(arena, result.toConst()); } +/// Asserts that the values are positive fn intExtractBits( sema: *Sema, lhs: Value, rhs: Value, - ty: Type, ) !Value { // TODO is this a performance issue? maybe we should try the operation without // resorting to BigInt first. For non-bigints, @intExtract could be used? 
const target = sema.mod.getTarget(); const arena = sema.arena; - const info = ty.intInfo(target); - - assert(ty.intInfo(target).signedness == .unsigned); var lhs_space: Value.BigIntSpace = undefined; var rhs_space: Value.BigIntSpace = undefined; @@ -39119,7 +39137,7 @@ fn intExtractBits( const result_limbs = try arena.alloc( std.math.big.Limb, - std.math.big.int.calcTwosCompLimbCount(info.bits), + mask.limbs.len, ); var result = std.math.big.int.Mutable{ .limbs = result_limbs, .positive = undefined, .len = undefined }; From 9c14b2687fbfc6db670e8c107b960a85a53c69d9 Mon Sep 17 00:00:00 2001 From: ominitay <37453713+ominitay@users.noreply.github.com> Date: Wed, 19 Apr 2023 22:11:58 +0100 Subject: [PATCH 16/28] Improve compile errors for negative values --- src/Sema.zig | 73 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 48 insertions(+), 25 deletions(-) diff --git a/src/Sema.zig b/src/Sema.zig index 4b95cd15b6a4..5cc6b4cb187a 100644 --- a/src/Sema.zig +++ b/src/Sema.zig @@ -26424,37 +26424,60 @@ fn zirDepositExtractBits( .override = &[_]?LazySrcLoc{ lhs_src, rhs_src }, }); - // This branch is only true if *both* parameters are comptime_ints. 
- if (dest_ty.zigTypeTag() == .ComptimeInt) { - const builtin_name = switch (air_tag) { - .deposit_bits => "@depositBits", - .extract_bits => "@extractBits", - else => unreachable, - }; - - const lhs_val = (try sema.resolveMaybeUndefVal(uncasted_lhs)).?; - if (lhs_val.compareHetero(.lt, Value.zero, target)) { - const err = try sema.errMsg(block, lhs_src, "use of negative value '{}'", .{lhs_val.fmtValue(lhs_ty, sema.mod)}); - try sema.errNote(block, src, err, "{s} requires parameters of type 'comptime_int' be positive", .{builtin_name}); - return sema.failWithOwnedErrorMsg(err); - } - - const rhs_val = (try sema.resolveMaybeUndefVal(uncasted_rhs)).?; - if (rhs_val.compareHetero(.lt, Value.zero, target)) { - const err = try sema.errMsg(block, rhs_src, "use of negative value '{}'", .{rhs_val.fmtValue(rhs_ty, sema.mod)}); - try sema.errNote(block, src, err, "{s} requires parameters of type 'comptime_int' be positive", .{builtin_name}); - return sema.failWithOwnedErrorMsg(err); - } - } + const builtin_name = switch (air_tag) { + .deposit_bits => "@depositBits", + .extract_bits => "@extractBits", + else => unreachable, + }; - assert(dest_ty.isUnsignedInt() or dest_ty.zigTypeTag() == .ComptimeInt); + // Coercion errors are intercepted to add a note if the caller is attempting to pass a negative comptime_int + const lhs = sema.coerce(block, dest_ty, uncasted_lhs, lhs_src) catch |err| switch (err) { + error.AnalysisFail => { + const msg = sema.err orelse return err; + const val = (try sema.resolveMaybeUndefVal(uncasted_lhs)).?; + if (val.compareHetero(.lt, Value.zero, target)) { + try sema.errNote(block, src, msg, "parameters to {s} must be positive", .{builtin_name}); + } + return err; + }, + else => return err, + }; - const lhs = try sema.coerce(block, dest_ty, uncasted_lhs, lhs_src); - const rhs = try sema.coerce(block, dest_ty, uncasted_rhs, rhs_src); + const rhs = sema.coerce(block, dest_ty, uncasted_rhs, rhs_src) catch |err| switch (err) { + error.AnalysisFail => { + 
const msg = sema.err orelse return err; + const val = (try sema.resolveMaybeUndefVal(uncasted_rhs)).?; + if (val.compareHetero(.lt, Value.zero, target)) { + try sema.errNote(block, src, msg, "parameters to {s} must be positive", .{builtin_name}); + } + return err; + }, + else => return err, + }; const maybe_lhs_val = try sema.resolveMaybeUndefVal(lhs); const maybe_rhs_val = try sema.resolveMaybeUndefVal(rhs); + // We check for negative values here only if the type is a comptime_int, as negative values + // would have otherwise been filtered out by coercion and the unsigned type restriction + if (dest_ty.zigTypeTag() == .ComptimeInt) { + if (maybe_lhs_val) |lhs_val| { + if (!lhs_val.isUndef() and lhs_val.compareHetero(.lt, Value.zero, target)) { + const err = try sema.errMsg(block, lhs_src, "use of negative value '{}'", .{lhs_val.fmtValue(lhs_ty, sema.mod)}); + try sema.errNote(block, src, err, "parameters to {s} must be positive", .{builtin_name}); + return sema.failWithOwnedErrorMsg(err); + } + } + + if (maybe_rhs_val) |rhs_val| { + if (!rhs_val.isUndef() and rhs_val.compareHetero(.lt, Value.zero, target)) { + const err = try sema.errMsg(block, rhs_src, "use of negative value '{}'", .{rhs_val.fmtValue(rhs_ty, sema.mod)}); + try sema.errNote(block, src, err, "parameters to {s} must be positive", .{builtin_name}); + return sema.failWithOwnedErrorMsg(err); + } + } + } + // If either of the operands are zero, the result is zero // If either of the operands are undefined, the result is undefined if (maybe_lhs_val) |lhs_val| { From 4eff8317925064b7ea9d029603174d4997d4c3e6 Mon Sep 17 00:00:00 2001 From: ominitay <37453713+ominitay@users.noreply.github.com> Date: Fri, 21 Apr 2023 09:53:30 +0100 Subject: [PATCH 17/28] update comments --- src/codegen/llvm.zig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig index 4dbadbcc45d3..45549fd8ec23 100644 --- a/src/codegen/llvm.zig +++ b/src/codegen/llvm.zig @@ 
-10366,7 +10366,7 @@ pub const FuncGen = struct { return self.builder.buildTrunc(result, llvm_trunc_ty, ""); } - // TODO Should this belong in compiler-rt? + // TODO Move this to compiler-rt (see #14609) // // Implements @depositBits(source, mask) in software // (i.e. without platform-specific instructions) @@ -10513,7 +10513,7 @@ pub const FuncGen = struct { return self.builder.buildTrunc(result, llvm_trunc_ty, ""); } - // TODO Should this belong in compiler-rt? + // TODO Move this to compiler-rt (see #14609) // // Implements @extractBits(source, mask) in software // (i.e. without platform-specific instructions) From 69e893da7817998027b3072ed88cd55cffa9c37b Mon Sep 17 00:00:00 2001 From: ominitay <37453713+ominitay@users.noreply.github.com> Date: Thu, 9 Nov 2023 16:12:41 +0000 Subject: [PATCH 18/28] Bring branch up-to-date --- lib/std/math/big/int.zig | 16 +- lib/std/zig/AstRlAnnotate.zig | 5 + src/Air.zig | 2 + src/Liveness/Verify.zig | 2 + src/Sema.zig | 61 +++---- src/codegen/llvm.zig | 300 ++++++++++++++++++---------------- src/codegen/llvm/Builder.zig | 43 +++++ 7 files changed, 251 insertions(+), 178 deletions(-) diff --git a/lib/std/math/big/int.zig b/lib/std/math/big/int.zig index 520048d5b7c1..357d2c93e785 100644 --- a/lib/std/math/big/int.zig +++ b/lib/std/math/big/int.zig @@ -1744,7 +1744,7 @@ pub const Mutable = struct { assert(mask.positive); r.positive = true; - std.mem.set(Limb, r.limbs, 0); + @memset(r.limbs, 0); var mask_limb: Limb = mask.limbs[0]; var mask_limb_index: Limb = 0; @@ -1754,7 +1754,7 @@ pub const Mutable = struct { const mask_limb_bit: Log2Limb = limb_bit: while (true) { const mask_limb_tz = @ctz(mask_limb); if (mask_limb_tz != @sizeOf(Limb) * 8) { - const cast_limb_bit = @intCast(Log2Limb, mask_limb_tz); + const cast_limb_bit: Log2Limb = @intCast(mask_limb_tz); mask_limb ^= @as(Limb, 1) << cast_limb_bit; break :limb_bit cast_limb_bit; } @@ -1769,13 +1769,13 @@ pub const Mutable = struct { }; const i_limb_index = i / limb_bits; - 
const i_limb_bit = @truncate(Log2Limb, i); + const i_limb_bit: Log2Limb = @truncate(i); if (i_limb_index >= source.limbs.len) break; // Stop when we reach the end of `source` (we can treat the rest as zeroes) const source_bit_set = source.limbs[i_limb_index] & (@as(Limb, 1) << i_limb_bit) != 0; - r.limbs[mask_limb_index] |= @as(Limb, @boolToInt(source_bit_set)) << mask_limb_bit; + r.limbs[mask_limb_index] |= @as(Limb, @intFromBool(source_bit_set)) << mask_limb_bit; } r.normalize(r.limbs.len); @@ -1790,7 +1790,7 @@ pub const Mutable = struct { assert(mask.positive); r.positive = true; - std.mem.set(Limb, r.limbs, 0); + @memset(r.limbs, 0); var mask_limb: Limb = mask.limbs[0]; var mask_limb_index: Limb = 0; @@ -1800,7 +1800,7 @@ pub const Mutable = struct { const mask_limb_bit: Log2Limb = limb_bit: while (true) { const mask_limb_tz = @ctz(mask_limb); if (mask_limb_tz != @sizeOf(Limb) * 8) { - const cast_limb_bit = @intCast(Log2Limb, mask_limb_tz); + const cast_limb_bit: Log2Limb = @intCast(mask_limb_tz); mask_limb ^= @as(Limb, 1) << cast_limb_bit; break :limb_bit cast_limb_bit; } @@ -1815,13 +1815,13 @@ pub const Mutable = struct { }; const i_limb_index = i / limb_bits; - const i_limb_bit = @truncate(Log2Limb, i); + const i_limb_bit: Log2Limb = @truncate(i); if (mask_limb_index >= source.limbs.len) break; // Stop when we reach the end of `source` (we can treat the rest as zeroes) const source_bit_set = source.limbs[mask_limb_index] & (@as(Limb, 1) << mask_limb_bit) != 0; - r.limbs[i_limb_index] |= @as(Limb, @boolToInt(source_bit_set)) << i_limb_bit; + r.limbs[i_limb_index] |= @as(Limb, @intFromBool(source_bit_set)) << i_limb_bit; } r.normalize(r.limbs.len); diff --git a/lib/std/zig/AstRlAnnotate.zig b/lib/std/zig/AstRlAnnotate.zig index 4a1203ca09fc..edf221caa103 100644 --- a/lib/std/zig/AstRlAnnotate.zig +++ b/lib/std/zig/AstRlAnnotate.zig @@ -1100,5 +1100,10 @@ fn builtinCall(astrl: *AstRlAnnotate, block: ?*Block, ri: ResultInfo, node: Ast. 
_ = try astrl.expr(args[4], block, ResultInfo.type_only); return false; }, + .deposit_bits, .extract_bits => { + _ = try astrl.expr(args[0], block, ResultInfo.none); + _ = try astrl.expr(args[1], block, ResultInfo.none); + return false; + }, } } diff --git a/src/Air.zig b/src/Air.zig index 1b7d8d77b9e9..f607f3c29447 100644 --- a/src/Air.zig +++ b/src/Air.zig @@ -1799,6 +1799,8 @@ pub fn mustLower(air: Air, inst: Air.Inst.Index, ip: *const InternPool) bool { .work_item_id, .work_group_size, .work_group_id, + .deposit_bits, + .extract_bits, => false, .assembly => { diff --git a/src/Liveness/Verify.zig b/src/Liveness/Verify.zig index 4392f25e101d..f43b498f3a46 100644 --- a/src/Liveness/Verify.zig +++ b/src/Liveness/Verify.zig @@ -257,6 +257,8 @@ fn verifyBody(self: *Verify, body: []const Air.Inst.Index) Error!void { .memset, .memset_safe, .memcpy, + .deposit_bits, + .extract_bits, => { const bin_op = data[@intFromEnum(inst)].bin_op; try self.verifyInstOperands(inst, .{ bin_op.lhs, bin_op.rhs, .none }); diff --git a/src/Sema.zig b/src/Sema.zig index 5cc6b4cb187a..31b456cb6909 100644 --- a/src/Sema.zig +++ b/src/Sema.zig @@ -26398,7 +26398,10 @@ fn zirDepositExtractBits( extended: Zir.Inst.Extended.InstData, air_tag: Air.Inst.Tag, ) CompileError!Air.Inst.Ref { + const mod = sema.mod; + const target = sema.mod.getTarget(); + _ = target; const extra = sema.code.extraData(Zir.Inst.BinNode, extended.operand).data; const src = LazySrcLoc.nodeOffset(extra.node); @@ -26411,11 +26414,11 @@ fn zirDepositExtractBits( const lhs_ty = sema.typeOf(uncasted_lhs); const rhs_ty = sema.typeOf(uncasted_rhs); - if (!lhs_ty.isUnsignedInt() and lhs_ty.zigTypeTag() != .ComptimeInt) { + if (!lhs_ty.isUnsignedInt(mod) and lhs_ty.zigTypeTag(mod) != .ComptimeInt) { return sema.fail(block, lhs_src, "expected unsigned integer or 'comptime_int', found '{}'", .{lhs_ty.fmt(sema.mod)}); } - if (!rhs_ty.isUnsignedInt() and rhs_ty.zigTypeTag() != .ComptimeInt) { + if (!rhs_ty.isUnsignedInt(mod) and 
rhs_ty.zigTypeTag(mod) != .ComptimeInt) { return sema.fail(block, rhs_src, "expected unsigned integer or 'comptime_int', found '{}'", .{rhs_ty.fmt(sema.mod)}); } @@ -26434,8 +26437,8 @@ fn zirDepositExtractBits( const lhs = sema.coerce(block, dest_ty, uncasted_lhs, lhs_src) catch |err| switch (err) { error.AnalysisFail => { const msg = sema.err orelse return err; - const val = (try sema.resolveMaybeUndefVal(uncasted_lhs)).?; - if (val.compareHetero(.lt, Value.zero, target)) { + const val = (try sema.resolveValue(uncasted_lhs)).?; + if (val.orderAgainstZero(mod) == .lt) { try sema.errNote(block, src, msg, "parameters to {s} must be positive", .{builtin_name}); } return err; @@ -26446,8 +26449,8 @@ fn zirDepositExtractBits( const rhs = sema.coerce(block, dest_ty, uncasted_rhs, rhs_src) catch |err| switch (err) { error.AnalysisFail => { const msg = sema.err orelse return err; - const val = (try sema.resolveMaybeUndefVal(uncasted_rhs)).?; - if (val.compareHetero(.lt, Value.zero, target)) { + const val = (try sema.resolveValue(uncasted_rhs)).?; + if (val.orderAgainstZero(mod) == .lt) { try sema.errNote(block, src, msg, "parameters to {s} must be positive", .{builtin_name}); } return err; @@ -26455,25 +26458,25 @@ fn zirDepositExtractBits( else => return err, }; - const maybe_lhs_val = try sema.resolveMaybeUndefVal(lhs); - const maybe_rhs_val = try sema.resolveMaybeUndefVal(rhs); + const maybe_lhs_val = try sema.resolveValue(lhs); + const maybe_rhs_val = try sema.resolveValue(rhs); // We check for negative values here only if the type is a comptime_int, as negative values // would have otherwise been filtered out by coercion and the unsigned type restriction - if (dest_ty.zigTypeTag() == .ComptimeInt) { + if (dest_ty.zigTypeTag(mod) == .ComptimeInt) { if (maybe_lhs_val) |lhs_val| { - if (!lhs_val.isUndef() and lhs_val.compareHetero(.lt, Value.zero, target)) { + if (!lhs_val.isUndef(mod) and lhs_val.orderAgainstZero(mod) == .lt) { const err = try sema.errMsg(block, 
lhs_src, "use of negative value '{}'", .{lhs_val.fmtValue(lhs_ty, sema.mod)}); try sema.errNote(block, src, err, "parameters to {s} must be positive", .{builtin_name}); - return sema.failWithOwnedErrorMsg(err); + return sema.failWithOwnedErrorMsg(block, err); } } if (maybe_rhs_val) |rhs_val| { - if (!rhs_val.isUndef() and rhs_val.compareHetero(.lt, Value.zero, target)) { + if (!rhs_val.isUndef(mod) and rhs_val.orderAgainstZero(mod) == .lt) { const err = try sema.errMsg(block, rhs_src, "use of negative value '{}'", .{rhs_val.fmtValue(rhs_ty, sema.mod)}); try sema.errNote(block, src, err, "parameters to {s} must be positive", .{builtin_name}); - return sema.failWithOwnedErrorMsg(err); + return sema.failWithOwnedErrorMsg(block, err); } } } @@ -26481,23 +26484,23 @@ fn zirDepositExtractBits( // If either of the operands are zero, the result is zero // If either of the operands are undefined, the result is undefined if (maybe_lhs_val) |lhs_val| { - if (try lhs_val.compareAllWithZeroAdvanced(.eq, sema)) return sema.addConstant(dest_ty, Value.zero); - if (lhs_val.isUndef()) return sema.addConstUndef(dest_ty); + if (lhs_val.orderAgainstZero(mod) == .eq) return Air.internedToRef((try mod.intValue(dest_ty, 0)).toIntern()); + if (lhs_val.isUndef(mod)) return try mod.undefRef(dest_ty); } if (maybe_rhs_val) |rhs_val| { - if (try rhs_val.compareAllWithZeroAdvanced(.eq, sema)) return sema.addConstant(dest_ty, Value.zero); - if (rhs_val.isUndef()) return sema.addConstUndef(dest_ty); + if (rhs_val.orderAgainstZero(mod) == .eq) return Air.internedToRef((try mod.intValue(dest_ty, 0)).toIntern()); + if (rhs_val.isUndef(mod)) return mod.undefRef(dest_ty); } if (maybe_lhs_val) |lhs_val| { if (maybe_rhs_val) |rhs_val| { const dest_val = switch (air_tag) { - .deposit_bits => try sema.intDepositBits(lhs_val, rhs_val), - .extract_bits => try sema.intExtractBits(lhs_val, rhs_val), + .deposit_bits => try sema.intDepositBits(lhs_val, rhs_val, dest_ty), + .extract_bits => try 
sema.intExtractBits(lhs_val, rhs_val, dest_ty), else => unreachable, }; - return sema.addConstant(dest_ty, dest_val); + return Air.internedToRef(dest_val.toIntern()); } } @@ -39120,16 +39123,17 @@ fn intDepositBits( sema: *Sema, lhs: Value, rhs: Value, + ty: Type, ) !Value { // TODO is this a performance issue? maybe we should try the operation without // resorting to BigInt first. For non-bigints, @intDeposit could be used? - const target = sema.mod.getTarget(); + const mod = sema.mod; const arena = sema.arena; var lhs_space: Value.BigIntSpace = undefined; var rhs_space: Value.BigIntSpace = undefined; - const source = lhs.toBigInt(&lhs_space, target); - const mask = rhs.toBigInt(&rhs_space, target); + const source = lhs.toBigInt(&lhs_space, mod); + const mask = rhs.toBigInt(&rhs_space, mod); const result_limbs = try arena.alloc( std.math.big.Limb, @@ -39139,7 +39143,7 @@ fn intDepositBits( var result = std.math.big.int.Mutable{ .limbs = result_limbs, .positive = undefined, .len = undefined }; result.depositBits(source, mask); - return Value.fromBigInt(arena, result.toConst()); + return try mod.intValue_big(ty, result.toConst()); } /// Asserts that the values are positive @@ -39147,16 +39151,17 @@ fn intExtractBits( sema: *Sema, lhs: Value, rhs: Value, + ty: Type, ) !Value { // TODO is this a performance issue? maybe we should try the operation without // resorting to BigInt first. For non-bigints, @intExtract could be used? 
- const target = sema.mod.getTarget(); + const mod = sema.mod; const arena = sema.arena; var lhs_space: Value.BigIntSpace = undefined; var rhs_space: Value.BigIntSpace = undefined; - const source = lhs.toBigInt(&lhs_space, target); - const mask = rhs.toBigInt(&rhs_space, target); + const source = lhs.toBigInt(&lhs_space, mod); + const mask = rhs.toBigInt(&rhs_space, mod); const result_limbs = try arena.alloc( std.math.big.Limb, @@ -39166,7 +39171,7 @@ fn intExtractBits( var result = std.math.big.int.Mutable{ .limbs = result_limbs, .positive = undefined, .len = undefined }; result.extractBits(source, mask); - return Value.fromBigInt(arena, result.toConst()); + return try mod.intValue_big(ty, result.toConst()); } /// Asserts the values are comparable. Both operands have type `ty`. diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig index 45549fd8ec23..c569d050bb08 100644 --- a/src/codegen/llvm.zig +++ b/src/codegen/llvm.zig @@ -10298,48 +10298,52 @@ pub const FuncGen = struct { return self.amdgcnWorkIntrinsic(dimension, 0, "amdgcn.workgroup.id"); } - fn airDepositBits(self: *FuncGen, inst: Air.Inst.Index) !?*llvm.Value { - if (self.liveness.isUnused(inst)) return null; + fn airDepositBits(self: *FuncGen, inst: Air.Inst.Index) !Builder.Value { + if (self.liveness.isUnused(inst)) return .none; + + const o = self.dg.object; const bin_op = self.air.instructions.items(.data)[inst].bin_op; const lhs = try self.resolveInst(bin_op.lhs); const rhs = try self.resolveInst(bin_op.rhs); - const inst_ty = self.air.typeOfIndex(inst); + const inst_ty = self.typeOfIndex(inst); + const ty = try o.lowerType(inst_ty); - const target = self.dg.module.getTarget(); - const params = [2]*llvm.Value{ lhs, rhs }; + const target = o.module.getTarget(); + const params = [2]Builder.Value{ lhs, rhs }; switch (target.cpu.arch) { .x86, .x86_64 => |tag| blk: { // Doesn't have pdep if (!std.Target.x86.featureSetHas(target.cpu.features, .bmi2)) break :blk; - const bits = 
inst_ty.intInfo(target).bits; + const bits = inst_ty.intInfo(o.module).bits; const supports_64 = tag == .x86_64; // Integer size doesn't match the available instruction(s) if (!(bits <= 32 or (bits <= 64 and supports_64))) break :blk; - return self.buildDepositBitsNative(inst_ty, params); + return try self.buildDepositBitsNative(ty, params); }, else => {}, } - return self.buildDepositBitsEmulated(inst_ty, params); + return try self.buildDepositBitsEmulated(ty, params); } fn buildDepositBitsNative( self: *FuncGen, - ty: Type, - params: [2]*llvm.Value, - ) !*llvm.Value { - const target = self.dg.module.getTarget(); + ty: Builder.Type, + params: [2]Builder.Value, + ) !Builder.Value { + const o = self.dg.object; + const target = o.module.getTarget(); assert(target.cpu.arch.isX86()); assert(std.Target.x86.featureSetHas(target.cpu.features, .bmi2)); - const bits = ty.intInfo(target).bits; - const intrinsic_name = switch (bits) { - 1...32 => "llvm.x86.bmi.pdep.32", - 33...64 => "llvm.x86.bmi.pdep.64", + const bits = ty.scalarBits(&o.builder); + const intrinsic: Builder.Intrinsic = switch (bits) { + 1...32 => .@"x86.bmi.pdep.32", + 33...64 => .@"x86.bmi.pdep.64", else => unreachable, }; const needs_extend = bits != 32 and bits != 64; @@ -10348,22 +10352,27 @@ pub const FuncGen = struct { // Cast to either a 32 or 64-bit integer if (needs_extend) { - const llvm_extend_ty = self.context.intType(if (bits <= 32) 32 else 64); + const extend_ty = try o.builder.intType(if (bits <= 32) 32 else 64); params_cast = .{ - self.builder.buildZExt(params[0], llvm_extend_ty, ""), - self.builder.buildZExt(params[1], llvm_extend_ty, ""), + try self.wip.cast(.zext, params[0], extend_ty, ""), + try self.wip.cast(.zext, params[1], extend_ty, ""), }; } - const llvm_fn = self.getIntrinsic(intrinsic_name, &.{}); - const result = self.builder.buildCall(llvm_fn.globalGetValueType(), llvm_fn, ¶ms_cast, 2, .Fast, .Auto, ""); + const result = try self.wip.callIntrinsic( + .fast, + .none, + intrinsic, + 
&.{}, + ¶ms_cast, + "", + ); // No cast needed! if (!needs_extend) return result; // Cast back to the original integer size - const llvm_trunc_ty = try self.dg.lowerType(ty); - return self.builder.buildTrunc(result, llvm_trunc_ty, ""); + return try self.wip.cast(.trunc, result, ty, ""); } // TODO Move this to compiler-rt (see #14609) @@ -10384,109 +10393,112 @@ pub const FuncGen = struct { // return result; fn buildDepositBitsEmulated( self: *FuncGen, - ty: Type, - params: [2]*llvm.Value, - ) !*llvm.Value { - const llvm_ty = try self.dg.lowerType(ty); + ty: Builder.Type, + params: [2]Builder.Value, + ) !Builder.Value { + const o = self.dg.object; const source = params[0]; - const mask_start = params[1]; - const zero = llvm_ty.constNull(); - const one = llvm_ty.constInt(1, .False); - const minus_one = llvm_ty.constInt(@bitCast(c_ulonglong, @as(c_longlong, -1)), .True); - - const prev_block = self.builder.getInsertBlock(); - const loop_block = self.context.appendBasicBlock(self.llvm_func, "Loop"); - const after_block = self.context.appendBasicBlock(self.llvm_func, "After"); - - _ = self.builder.buildBr(loop_block); - self.builder.positionBuilderAtEnd(loop_block); - const mask_phi = self.builder.buildPhi(llvm_ty, ""); - const result_phi = self.builder.buildPhi(llvm_ty, ""); - const bb_phi = self.builder.buildPhi(llvm_ty, ""); - const minus_mask = self.builder.buildSub(zero, mask_phi, ""); - const bit = self.builder.buildAnd(mask_phi, minus_mask, ""); - const not_bit = self.builder.buildXor(bit, minus_one, ""); - const new_mask = self.builder.buildAnd(mask_phi, not_bit, ""); - const source_bit = self.builder.buildAnd(source, bb_phi, ""); - const source_bit_set = self.builder.buildICmp(.NE, source_bit, zero, ""); - const bit_or_zero = self.builder.buildSelect(source_bit_set, bit, zero, ""); // avoid using control flow - const new_result = self.builder.buildOr(result_phi, bit_or_zero, ""); - const new_bb = self.builder.buildAdd(bb_phi, bb_phi, ""); - const while_cond = 
self.builder.buildICmp(.NE, new_mask, zero, ""); - _ = self.builder.buildCondBr(while_cond, loop_block, after_block); - - mask_phi.addIncoming( - &[2]*llvm.Value{ mask_start, new_mask }, - &[2]*llvm.BasicBlock{ prev_block, loop_block }, - 2, + const start_mask = params[1]; + const zero = try o.builder.intValue(ty, 0); + const one = try o.builder.intValue(ty, 1); + + const prev_block = self.wip.cursor.block; + const loop_block = try self.wip.block(2, "Loop"); + const after_block = try self.wip.block(1, "After"); + + _ = try self.wip.br(loop_block); + self.wip.cursor = .{ .block = loop_block }; + const mask_phi = try self.wip.phi(ty, ""); + const result_phi = try self.wip.phi(ty, ""); + const bb_phi = try self.wip.phi(ty, ""); + const minus_mask = try self.wip.neg(mask_phi.toValue(), ""); + const bit = try self.wip.bin(.@"and", mask_phi.toValue(), minus_mask, ""); + const not_bit = try self.wip.not(bit, ""); + const new_mask = try self.wip.bin(.@"and", mask_phi.toValue(), not_bit, ""); + const source_bit = try self.wip.bin(.@"and", source, bb_phi.toValue(), ""); + const source_bit_set = try self.wip.icmp(.ne, source_bit, zero, ""); + const bit_or_zero = try self.wip.select(.normal, source_bit_set, bit, zero, ""); // avoid using control flow + const new_result = try self.wip.bin(.@"or", result_phi.toValue(), bit_or_zero, ""); + const new_bb = try self.wip.bin(.@"add", bb_phi.toValue(), bb_phi.toValue(), ""); + const while_cond = try self.wip.icmp(.ne, new_mask, zero, ""); + _ = try self.wip.brCond(while_cond, loop_block, after_block); + + try mask_phi.finish( + &.{ start_mask, new_mask }, + &.{ prev_block, loop_block }, + &self.wip, ); - result_phi.addIncoming( - &[2]*llvm.Value{ zero, new_result }, - &[2]*llvm.BasicBlock{ prev_block, loop_block }, - 2, + try result_phi.finish( + &.{ zero, new_result }, + &.{ prev_block, loop_block }, + &self.wip, ); - bb_phi.addIncoming( - &[2]*llvm.Value{ one, new_bb }, - &[2]*llvm.BasicBlock{ prev_block, loop_block }, - 2, + try 
bb_phi.finish( + &.{ one, new_bb }, + &.{ prev_block, loop_block }, + &self.wip, ); - self.builder.positionBuilderAtEnd(after_block); - const final_result = self.builder.buildPhi(llvm_ty, ""); - final_result.addIncoming( - &[1]*llvm.Value{new_result}, - &[1]*llvm.BasicBlock{loop_block}, - 1, + self.wip.cursor = .{ .block = after_block }; + const final_result = try self.wip.phi(ty, ""); + try final_result.finish( + &.{new_result}, + &.{loop_block}, + &self.wip, ); - return final_result; + return final_result.toValue(); } - fn airExtractBits(self: *FuncGen, inst: Air.Inst.Index) !?*llvm.Value { - if (self.liveness.isUnused(inst)) return null; + fn airExtractBits(self: *FuncGen, inst: Air.Inst.Index) !Builder.Value { + if (self.liveness.isUnused(inst)) return .none; + + const o = self.dg.object; const bin_op = self.air.instructions.items(.data)[inst].bin_op; const lhs = try self.resolveInst(bin_op.lhs); const rhs = try self.resolveInst(bin_op.rhs); - const inst_ty = self.air.typeOfIndex(inst); + const inst_ty = self.typeOfIndex(inst); + const ty = try o.lowerType(inst_ty); - const target = self.dg.module.getTarget(); - const params = [2]*llvm.Value{ lhs, rhs }; + const target = o.module.getTarget(); + const params = [2]Builder.Value{ lhs, rhs }; switch (target.cpu.arch) { .x86, .x86_64 => |tag| blk: { // Doesn't have pext if (!std.Target.x86.featureSetHas(target.cpu.features, .bmi2)) break :blk; - const bits = inst_ty.intInfo(target).bits; + const bits = inst_ty.intInfo(o.module).bits; const supports_64 = tag == .x86_64; // Integer size doesn't match the available instruction(s) if (!(bits <= 32 or (bits <= 64 and supports_64))) break :blk; - return self.buildExtractBitsNative(inst_ty, params); + return self.buildExtractBitsNative(ty, params); }, else => {}, } - return self.buildExtractBitsEmulated(inst_ty, params); + return self.buildExtractBitsEmulated(ty, params); } fn buildExtractBitsNative( self: *FuncGen, - ty: Type, - params: [2]*llvm.Value, - ) !*llvm.Value { 
- const target = self.dg.module.getTarget(); + ty: Builder.Type, + params: [2]Builder.Value, + ) !Builder.Value { + const o = self.dg.object; + const target = o.module.getTarget(); assert(target.cpu.arch.isX86()); assert(std.Target.x86.featureSetHas(target.cpu.features, .bmi2)); - const bits = ty.intInfo(target).bits; - const intrinsic_name = switch (bits) { - 1...32 => "llvm.x86.bmi.pext.32", - 33...64 => "llvm.x86.bmi.pext.64", + const bits = ty.scalarBits(&o.builder); + const intrinsic: Builder.Intrinsic = switch (bits) { + 1...32 => .@"x86.bmi.pext.32", + 33...64 => .@"x86.bmi.pext.64", else => unreachable, }; const needs_extend = bits != 32 and bits != 64; @@ -10495,22 +10507,27 @@ pub const FuncGen = struct { // Cast to either a 32 or 64-bit integer if (needs_extend) { - const llvm_extend_ty = self.context.intType(if (bits <= 32) 32 else 64); + const extend_ty = try o.builder.intType(if (bits <= 32) 32 else 64); params_cast = .{ - self.builder.buildZExt(params[0], llvm_extend_ty, ""), - self.builder.buildZExt(params[1], llvm_extend_ty, ""), + try self.wip.cast(.zext, params[0], extend_ty, ""), + try self.wip.cast(.zext, params[1], extend_ty, ""), }; } - const llvm_fn = self.getIntrinsic(intrinsic_name, &.{}); - const result = self.builder.buildCall(llvm_fn.globalGetValueType(), llvm_fn, ¶ms_cast, 2, .Fast, .Auto, ""); + const result = try self.wip.callIntrinsic( + .fast, + .none, + intrinsic, + &.{}, + ¶ms_cast, + "", + ); // No cast needed! 
if (!needs_extend) return result; // Cast back to the original integer size - const llvm_trunc_ty = try self.dg.lowerType(ty); - return self.builder.buildTrunc(result, llvm_trunc_ty, ""); + return try self.wip.cast(.trunc, result, ty, ""); } // TODO Move this to compiler-rt (see #14609) @@ -10531,67 +10548,66 @@ pub const FuncGen = struct { // return result; fn buildExtractBitsEmulated( self: *FuncGen, - ty: Type, - params: [2]*llvm.Value, - ) !*llvm.Value { - const llvm_ty = try self.dg.lowerType(ty); + ty: Builder.Type, + params: [2]Builder.Value, + ) !Builder.Value { + const o = self.dg.object; - const zero = llvm_ty.constNull(); - const one = llvm_ty.constInt(1, .False); - const minus_one = llvm_ty.constInt(@bitCast(c_ulonglong, @as(c_longlong, -1)), .True); const source = params[0]; const start_mask = params[1]; + const zero = try o.builder.intValue(ty, 0); + const one = try o.builder.intValue(ty, 1); const start_result = zero; const start_bb = one; - const prev_block = self.builder.getInsertBlock(); - const loop_block = self.context.appendBasicBlock(self.llvm_func, "Loop"); - const after_block = self.context.appendBasicBlock(self.llvm_func, "After"); - - _ = self.builder.buildBr(loop_block); - self.builder.positionBuilderAtEnd(loop_block); - const mask_phi = self.builder.buildPhi(llvm_ty, ""); - const result_phi = self.builder.buildPhi(llvm_ty, ""); - const bb_phi = self.builder.buildPhi(llvm_ty, ""); - const minus_mask = self.builder.buildSub(zero, mask_phi, ""); - const bit = self.builder.buildAnd(mask_phi, minus_mask, ""); - const not_bit = self.builder.buildXor(bit, minus_one, ""); - const new_mask = self.builder.buildAnd(mask_phi, not_bit, ""); - const source_bit = self.builder.buildAnd(source, bit, ""); - const source_bit_set = self.builder.buildICmp(.NE, source_bit, zero, ""); - const bb_or_zero = self.builder.buildSelect(source_bit_set, bb_phi, zero, ""); // avoid using control flow - const new_result = self.builder.buildOr(result_phi, bb_or_zero, 
""); - const new_bb = self.builder.buildAdd(bb_phi, bb_phi, ""); - const while_cond = self.builder.buildICmp(.NE, new_mask, zero, ""); - _ = self.builder.buildCondBr(while_cond, loop_block, after_block); - - mask_phi.addIncoming( - &[2]*llvm.Value{ start_mask, new_mask }, - &[2]*llvm.BasicBlock{ prev_block, loop_block }, - 2, + const prev_block = self.wip.cursor.block; + const loop_block = try self.wip.block(2, "Loop"); + const after_block = try self.wip.block(1, "After"); + + _ = try self.wip.br(loop_block); + self.wip.cursor = .{ .block = loop_block }; + const mask_phi = try self.wip.phi(ty, ""); + const result_phi = try self.wip.phi(ty, ""); + const bb_phi = try self.wip.phi(ty, ""); + const minus_mask = try self.wip.neg(mask_phi.toValue(), ""); + const bit = try self.wip.bin(.@"and", mask_phi.toValue(), minus_mask, ""); + const not_bit = try self.wip.not(bit, ""); + const new_mask = try self.wip.bin(.@"and", mask_phi.toValue(), not_bit, ""); + const source_bit = try self.wip.bin(.@"and", source, bit, ""); + const source_bit_set = try self.wip.icmp(.ne, source_bit, zero, ""); + const bb_or_zero = try self.wip.select(.normal, source_bit_set, bb_phi.toValue(), zero, ""); // avoid using control flow + const new_result = try self.wip.bin(.@"or", result_phi.toValue(), bb_or_zero, ""); + const new_bb = try self.wip.bin(.@"add", bb_phi.toValue(), bb_phi.toValue(), ""); + const while_cond = try self.wip.icmp(.ne, new_mask, zero, ""); + _ = try self.wip.brCond(while_cond, loop_block, after_block); + + try mask_phi.finish( + &.{ start_mask, new_mask }, + &.{ prev_block, loop_block }, + &self.wip, ); - result_phi.addIncoming( - &[2]*llvm.Value{ start_result, new_result }, - &[2]*llvm.BasicBlock{ prev_block, loop_block }, - 2, + try result_phi.finish( + &.{ start_result, new_result }, + &.{ prev_block, loop_block }, + &self.wip, ); - bb_phi.addIncoming( - &[2]*llvm.Value{ start_bb, new_bb }, - &[2]*llvm.BasicBlock{ prev_block, loop_block }, - 2, + try bb_phi.finish( + &.{ 
start_bb, new_bb }, + &.{ prev_block, loop_block }, + &self.wip, ); - self.builder.positionBuilderAtEnd(after_block); - const final_result = self.builder.buildPhi(llvm_ty, ""); - final_result.addIncoming( - &[1]*llvm.Value{new_result}, - &[1]*llvm.BasicBlock{loop_block}, - 1, + self.wip.cursor = .{ .block = after_block }; + const final_result = try self.wip.phi(ty, ""); + try final_result.finish( + &.{new_result}, + &.{loop_block}, + &self.wip, ); - return final_result; + return final_result.toValue(); } fn getErrorNameTable(self: *FuncGen) Allocator.Error!Builder.Variable.Index { diff --git a/src/codegen/llvm/Builder.zig b/src/codegen/llvm/Builder.zig index 000223499b6f..30cb86e69432 100644 --- a/src/codegen/llvm/Builder.zig +++ b/src/codegen/llvm/Builder.zig @@ -2733,6 +2733,12 @@ pub const Intrinsic = enum { @"wasm.memory.size", @"wasm.memory.grow", + // x86 PDEP/PEXT + @"x86.bmi.pdep.32", + @"x86.bmi.pdep.64", + @"x86.bmi.pext.32", + @"x86.bmi.pext.64", + const Signature = struct { ret_len: u8, params: []const Parameter, @@ -3903,6 +3909,43 @@ pub const Intrinsic = enum { }, .attrs = &.{ .nocallback, .nofree, .nosync, .nounwind, .willreturn }, }, + + .@"x86.bmi.pext.32" = .{ + .ret_len = 1, + .params = &.{ + .{ .kind = .{ .type = .i32 } }, + .{ .kind = .{ .type = .i32 } }, + .{ .kind = .{ .type = .i32 } }, + }, + .attrs = &.{ .nocallback, .nofree, .nosync, .nounwind, .{ .memory = Attribute.Memory.all(.none) } }, + }, + .@"x86.bmi.pext.64" = .{ + .ret_len = 1, + .params = &.{ + .{ .kind = .{ .type = .i64 } }, + .{ .kind = .{ .type = .i64 } }, + .{ .kind = .{ .type = .i64 } }, + }, + .attrs = &.{ .nocallback, .nofree, .nosync, .nounwind, .{ .memory = Attribute.Memory.all(.none) } }, + }, + .@"x86.bmi.pdep.32" = .{ + .ret_len = 1, + .params = &.{ + .{ .kind = .{ .type = .i32 } }, + .{ .kind = .{ .type = .i32 } }, + .{ .kind = .{ .type = .i32 } }, + }, + .attrs = &.{ .nocallback, .nofree, .nosync, .nounwind, .{ .memory = Attribute.Memory.all(.none) } }, + }, + 
.@"x86.bmi.pdep.64" = .{ + .ret_len = 1, + .params = &.{ + .{ .kind = .{ .type = .i64 } }, + .{ .kind = .{ .type = .i64 } }, + .{ .kind = .{ .type = .i64 } }, + }, + .attrs = &.{ .nocallback, .nofree, .nosync, .nounwind, .{ .memory = Attribute.Memory.all(.none) } }, + }, }); }; From 71f8db45a9c3702f9372706737747680bea8ef90 Mon Sep 17 00:00:00 2001 From: ominitay <37453713+ominitay@users.noreply.github.com> Date: Sun, 12 Nov 2023 00:17:01 +0000 Subject: [PATCH 19/28] x86: Implement `@depositBits` and `@extractBits` --- src/arch/x86_64/CodeGen.zig | 138 ++++++++++++++++++++++++- src/arch/x86_64/Encoding.zig | 2 + src/arch/x86_64/Mir.zig | 4 + src/arch/x86_64/encodings.zig | 5 + test/behavior/deposit_extract_bits.zig | 47 +++++++-- 5 files changed, 187 insertions(+), 9 deletions(-) diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index 41f00c9c239d..7430efdf0eec 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig @@ -2196,8 +2196,8 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void { .work_group_size => unreachable, .work_group_id => unreachable, - .deposit_bits => return self.fail("TODO implement deposit_bits", .{}), - .extract_bits => return self.fail("TODO implement extract_bits", .{}), + .deposit_bits => try self.airDepositBits(inst), + .extract_bits => try self.airExtractBits(inst), // zig fmt: on } @@ -5572,6 +5572,140 @@ fn airPtrSlicePtrPtr(self: *Self, inst: Air.Inst.Index) !void { return self.finishAir(inst, dst_mcv, .{ ty_op.operand, .none, .none }); } +fn airDepositBits(self: *Self, inst: Air.Inst.Index) !void { + const mod = self.bin_file.options.module.?; + + const bin_op = self.air.instructions.items(.data)[inst].bin_op; + const lhs_mcv = try self.resolveInst(bin_op.lhs); + const rhs_mcv = try self.resolveInst(bin_op.rhs); + const dest_ty = self.typeOfIndex(inst); + + const abi_size: u32 = @intCast(@max(dest_ty.abiSize(mod), 4)); + + if (!self.hasFeature(.bmi2) or abi_size > 8) + return 
self.fail("TODO implement depositBits without bmi2", .{}); + + var lhs_copied_to_dest = false; + const dest_mcv: MCValue = dest: { + if (rhs_mcv.isRegister() and self.reuseOperand(inst, bin_op.rhs, 1, rhs_mcv)) + break :dest rhs_mcv; + + if (lhs_mcv.isRegister() and self.reuseOperand(inst, bin_op.lhs, 0, lhs_mcv)) + break :dest lhs_mcv; + + lhs_copied_to_dest = true; + break :dest try self.copyToRegisterWithInstTracking(inst, dest_ty, lhs_mcv); + }; + + const lhs_lock: ?RegisterLock = switch (lhs_mcv) { + .register => |reg| self.register_manager.lockRegAssumeUnused(reg), + else => null, + }; + defer if (lhs_lock) |lock| self.register_manager.unlockReg(lock); + + const rhs_lock: ?RegisterLock = switch (rhs_mcv) { + .register => |reg| self.register_manager.lockReg(reg), + else => null, + }; + defer if (rhs_lock) |lock| self.register_manager.unlockReg(lock); + + const dest_lock = self.register_manager.lockReg(dest_mcv.getReg().?); + defer if (dest_lock) |lock| self.register_manager.unlockReg(lock); + + const dest_reg = registerAlias(dest_mcv.getReg().?, abi_size); + const lhs_reg = if (lhs_copied_to_dest) dest_reg else registerAlias(if (lhs_mcv.getReg()) |reg| reg else try self.copyToTmpRegister(dest_ty, lhs_mcv), abi_size); + + if (rhs_mcv.isMemory()) { + try self.asmRegisterRegisterMemory( + .{ ._, .pdep }, + dest_reg, + lhs_reg, + try rhs_mcv.mem(self, Memory.Size.fromSize(abi_size)), + ); + } else { + const rhs_reg = registerAlias( + if (rhs_mcv.getReg()) |reg| reg else try self.copyToTmpRegister(dest_ty, rhs_mcv), + abi_size, + ); + + try self.asmRegisterRegisterRegister( + .{ ._, .pdep }, + dest_reg, + lhs_reg, + rhs_reg, + ); + } + + return self.finishAir(inst, .{ .register = dest_reg }, .{ bin_op.lhs, bin_op.rhs, .none }); +} + +fn airExtractBits(self: *Self, inst: Air.Inst.Index) !void { + const mod = self.bin_file.options.module.?; + + const bin_op = self.air.instructions.items(.data)[inst].bin_op; + const lhs_mcv = try self.resolveInst(bin_op.lhs); + const 
rhs_mcv = try self.resolveInst(bin_op.rhs); + const dest_ty = self.typeOfIndex(inst); + + const abi_size: u32 = @intCast(@max(dest_ty.abiSize(mod), 4)); + + if (!self.hasFeature(.bmi2) or abi_size > 8) + return self.fail("TODO implement extractBits without bmi2", .{}); + + var lhs_copied_to_dest = false; + const dest_mcv: MCValue = dest: { + if (rhs_mcv.isRegister() and self.reuseOperand(inst, bin_op.rhs, 1, rhs_mcv)) + break :dest rhs_mcv; + + if (lhs_mcv.isRegister() and self.reuseOperand(inst, bin_op.lhs, 0, lhs_mcv)) + break :dest lhs_mcv; + + lhs_copied_to_dest = true; + break :dest try self.copyToRegisterWithInstTracking(inst, dest_ty, lhs_mcv); + }; + + const lhs_lock: ?RegisterLock = switch (lhs_mcv) { + .register => |reg| self.register_manager.lockRegAssumeUnused(reg), + else => null, + }; + defer if (lhs_lock) |lock| self.register_manager.unlockReg(lock); + + const rhs_lock: ?RegisterLock = switch (rhs_mcv) { + .register => |reg| self.register_manager.lockReg(reg), + else => null, + }; + defer if (rhs_lock) |lock| self.register_manager.unlockReg(lock); + + const dest_lock = self.register_manager.lockReg(dest_mcv.getReg().?); + defer if (dest_lock) |lock| self.register_manager.unlockReg(lock); + + const dest_reg = registerAlias(dest_mcv.getReg().?, abi_size); + const lhs_reg = if (lhs_copied_to_dest) dest_reg else registerAlias(if (lhs_mcv.getReg()) |reg| reg else try self.copyToTmpRegister(dest_ty, lhs_mcv), abi_size); + + if (rhs_mcv.isMemory()) { + try self.asmRegisterRegisterMemory( + .{ ._, .pext }, + dest_reg, + lhs_reg, + try rhs_mcv.mem(self, Memory.Size.fromSize(abi_size)), + ); + } else { + const rhs_reg = registerAlias( + if (rhs_mcv.getReg()) |reg| reg else try self.copyToTmpRegister(dest_ty, rhs_mcv), + abi_size, + ); + + try self.asmRegisterRegisterRegister( + .{ ._, .pext }, + dest_reg, + lhs_reg, + rhs_reg, + ); + } + + return self.finishAir(inst, .{ .register = dest_reg }, .{ bin_op.lhs, bin_op.rhs, .none }); +} + fn elemOffset(self: 
*Self, index_ty: Type, index: MCValue, elem_size: u64) !Register { const reg: Register = blk: { switch (index) { diff --git a/src/arch/x86_64/Encoding.zig b/src/arch/x86_64/Encoding.zig index e4c2a39d18c4..43a7fbbbfb94 100644 --- a/src/arch/x86_64/Encoding.zig +++ b/src/arch/x86_64/Encoding.zig @@ -245,6 +245,7 @@ pub const Mnemonic = enum { neg, nop, not, @"or", pause, pop, popcnt, popfq, push, pushfq, + pdep, pext, rcl, rcr, ret, rol, ror, sal, sar, sbb, scas, scasb, scasd, scasq, scasw, @@ -782,6 +783,7 @@ pub const Feature = enum { avx, avx2, bmi, + bmi2, f16c, fma, lzcnt, diff --git a/src/arch/x86_64/Mir.zig b/src/arch/x86_64/Mir.zig index d2dd6237a5e6..9b9126e26249 100644 --- a/src/arch/x86_64/Mir.zig +++ b/src/arch/x86_64/Mir.zig @@ -384,6 +384,10 @@ pub const Inst = struct { @"or", /// Spin loop hint pause, + /// Parallel bits deposit + pdep, + /// Parallel bits extract + pext, /// Pop pop, /// Return the count of number of bits set to 1 diff --git a/src/arch/x86_64/encodings.zig b/src/arch/x86_64/encodings.zig index d4a7dcafe7bd..dd36cf1e68ea 100644 --- a/src/arch/x86_64/encodings.zig +++ b/src/arch/x86_64/encodings.zig @@ -486,6 +486,11 @@ pub const table = [_]Entry{ .{ .pause, .zo, &.{}, &.{ 0xf3, 0x90 }, 0, .none, .none }, + .{ .pdep, .rvm, &.{ .r32, .r32, .rm32 }, &.{ 0xf2, 0x0f, 0x38, 0xf5 }, 0, .vex_lz_w0, .bmi2 }, + .{ .pdep, .rvm, &.{ .r64, .r64, .rm64 }, &.{ 0xf2, 0x0f, 0x38, 0xf5 }, 0, .vex_lz_w1, .bmi2 }, + .{ .pext, .rvm, &.{ .r32, .r32, .rm32 }, &.{ 0xf3, 0x0f, 0x38, 0xf5 }, 0, .vex_lz_w0, .bmi2 }, + .{ .pext, .rvm, &.{ .r64, .r64, .rm64 }, &.{ 0xf3, 0x0f, 0x38, 0xf5 }, 0, .vex_lz_w1, .bmi2 }, + .{ .pop, .o, &.{ .r16 }, &.{ 0x58 }, 0, .short, .none }, .{ .pop, .o, &.{ .r64 }, &.{ 0x58 }, 0, .none, .none }, .{ .pop, .m, &.{ .rm16 }, &.{ 0x8f }, 0, .short, .none }, diff --git a/test/behavior/deposit_extract_bits.zig b/test/behavior/deposit_extract_bits.zig index 9f2bafe22560..33c7f338148b 100644 --- a/test/behavior/deposit_extract_bits.zig +++ 
b/test/behavior/deposit_extract_bits.zig @@ -5,7 +5,10 @@ const expect = std.testing.expect; const expectEqual = std.testing.expectEqual; test "@depositBits" { - if (builtin.zig_backend != .stage2_llvm) return error.SkipZigTest; // TODO + switch (builtin.zig_backend) { + .stage2_llvm, .stage2_x86_64 => {}, + else => return error.SkipZigTest, // TODO + } const S = struct { pub fn doTheTest() !void { @@ -13,7 +16,6 @@ test "@depositBits" { var b: u64 = 0xFFFF_FFFF_FFFF_FFFF; var c: u64 = 0x1234_5678_9012_3456; var d: u64 = 0x00F0_FF00_F00F_00FF; - var e: u128 = @as(u128, d) << 64; try expect(@depositBits(b, a) == 0); try expect(@depositBits(a, b) == 0); @@ -22,7 +24,22 @@ test "@depositBits" { try expect(@depositBits(b, d) == d); try expect(@depositBits(c, d) == 0x0000_1200_3004_0056); - try expect(@depositBits(c, e) == 0x0000_1200_3004_0056 << 64); + } + }; + + try S.doTheTest(); + try comptime S.doTheTest(); +} + +test "@depositBits u128" { + if (builtin.zig_backend != .stage2_llvm) return error.SkipZigTest; + + const S = struct { + pub fn doTheTest() !void { + var a: u64 = 0x1234_5678_9012_3456; + var b: u128 = 0x00F0_FF00_F00F_00FF << 64; + + try expect(@depositBits(a, b) == 0x0000_1200_3004_0056 << 64); } }; @@ -31,7 +48,10 @@ test "@depositBits" { } test "@extractBits" { - if (builtin.zig_backend != .stage2_llvm) return error.SkipZigTest; // TODO + switch (builtin.zig_backend) { + .stage2_llvm, .stage2_x86_64 => {}, + else => return error.SkipZigTest, // TODO + } const S = struct { pub fn doTheTest() !void { @@ -39,8 +59,6 @@ test "@extractBits" { var b: u64 = 0xFFFF_FFFF_FFFF_FFFF; var c: u64 = 0x1234_5678_9012_3456; var d: u64 = 0x00F0_FF00_F00F_00FF; - var e: u128 = @as(u128, c) << 64; - var f: u128 = @as(u128, d) << 64; try expect(@extractBits(b, a) == 0); try expect(@extractBits(a, b) == 0); @@ -49,7 +67,22 @@ test "@extractBits" { try expect(@extractBits(d, b) == d); try expect(@extractBits(c, d) == 0x0356_9256); - try expect(@extractBits(e, f) == 
0x0356_9256); + } + }; + + try S.doTheTest(); + try comptime S.doTheTest(); +} + +test "@extractBits u128" { + if (builtin.zig_backend != .stage2_llvm) return error.SkipZigTest; // TODO + + const S = struct { + pub fn doTheTest() !void { + var a: u128 = 0x1234_5678_9012_3456 << 64; + var b: u128 = 0x00F0_FF00_F00F_00FF << 64; + + try expect(@extractBits(a, b) == 0x0356_9256); } }; From 5f66df1ad537eeaf4a87a97cc2d487d0a8a8c438 Mon Sep 17 00:00:00 2001 From: ominitay <37453713+ominitay@users.noreply.github.com> Date: Thu, 25 Jan 2024 02:31:01 +0000 Subject: [PATCH 20/28] update deposit/extract to master --- src/arch/x86_64/CodeGen.zig | 8 +++---- src/codegen/llvm.zig | 4 ++-- test/behavior/deposit_extract_bits.zig | 32 +++++++++++++++++++------- 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index 7430efdf0eec..74c0b83c742e 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig @@ -5573,9 +5573,9 @@ fn airPtrSlicePtrPtr(self: *Self, inst: Air.Inst.Index) !void { } fn airDepositBits(self: *Self, inst: Air.Inst.Index) !void { - const mod = self.bin_file.options.module.?; + const mod = self.bin_file.comp.module.?; - const bin_op = self.air.instructions.items(.data)[inst].bin_op; + const bin_op = self.air.instructions.items(.data)[@intFromEnum(inst)].bin_op; const lhs_mcv = try self.resolveInst(bin_op.lhs); const rhs_mcv = try self.resolveInst(bin_op.rhs); const dest_ty = self.typeOfIndex(inst); @@ -5640,9 +5640,9 @@ fn airDepositBits(self: *Self, inst: Air.Inst.Index) !void { } fn airExtractBits(self: *Self, inst: Air.Inst.Index) !void { - const mod = self.bin_file.options.module.?; + const mod = self.bin_file.comp.module.?; - const bin_op = self.air.instructions.items(.data)[inst].bin_op; + const bin_op = self.air.instructions.items(.data)[@intFromEnum(inst)].bin_op; const lhs_mcv = try self.resolveInst(bin_op.lhs); const rhs_mcv = try self.resolveInst(bin_op.rhs); const 
dest_ty = self.typeOfIndex(inst); diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig index c569d050bb08..7ef82b0482c5 100644 --- a/src/codegen/llvm.zig +++ b/src/codegen/llvm.zig @@ -10303,7 +10303,7 @@ pub const FuncGen = struct { const o = self.dg.object; - const bin_op = self.air.instructions.items(.data)[inst].bin_op; + const bin_op = self.air.instructions.items(.data)[@intFromEnum(inst)].bin_op; const lhs = try self.resolveInst(bin_op.lhs); const rhs = try self.resolveInst(bin_op.rhs); const inst_ty = self.typeOfIndex(inst); @@ -10458,7 +10458,7 @@ pub const FuncGen = struct { const o = self.dg.object; - const bin_op = self.air.instructions.items(.data)[inst].bin_op; + const bin_op = self.air.instructions.items(.data)[@intFromEnum(inst)].bin_op; const lhs = try self.resolveInst(bin_op.lhs); const rhs = try self.resolveInst(bin_op.rhs); const inst_ty = self.typeOfIndex(inst); diff --git a/test/behavior/deposit_extract_bits.zig b/test/behavior/deposit_extract_bits.zig index 33c7f338148b..03f56ea81a24 100644 --- a/test/behavior/deposit_extract_bits.zig +++ b/test/behavior/deposit_extract_bits.zig @@ -17,6 +17,11 @@ test "@depositBits" { var c: u64 = 0x1234_5678_9012_3456; var d: u64 = 0x00F0_FF00_F00F_00FF; + _ = &a; + _ = &b; + _ = &c; + _ = &d; + try expect(@depositBits(b, a) == 0); try expect(@depositBits(a, b) == 0); @@ -36,8 +41,11 @@ test "@depositBits u128" { const S = struct { pub fn doTheTest() !void { - var a: u64 = 0x1234_5678_9012_3456; - var b: u128 = 0x00F0_FF00_F00F_00FF << 64; + const a: u64 = 0x1234_5678_9012_3456; + const b: u128 = 0x00F0_FF00_F00F_00FF << 64; + + _ = &a; + _ = &b; try expect(@depositBits(a, b) == 0x0000_1200_3004_0056 << 64); } @@ -55,10 +63,15 @@ test "@extractBits" { const S = struct { pub fn doTheTest() !void { - var a: u64 = 0; - var b: u64 = 0xFFFF_FFFF_FFFF_FFFF; - var c: u64 = 0x1234_5678_9012_3456; - var d: u64 = 0x00F0_FF00_F00F_00FF; + const a: u64 = 0; + const b: u64 = 0xFFFF_FFFF_FFFF_FFFF; + const c: u64 = 
0x1234_5678_9012_3456; + const d: u64 = 0x00F0_FF00_F00F_00FF; + + _ = &a; + _ = &b; + _ = &c; + _ = &d; try expect(@extractBits(b, a) == 0); try expect(@extractBits(a, b) == 0); @@ -79,8 +92,11 @@ test "@extractBits u128" { const S = struct { pub fn doTheTest() !void { - var a: u128 = 0x1234_5678_9012_3456 << 64; - var b: u128 = 0x00F0_FF00_F00F_00FF << 64; + const a: u128 = 0x1234_5678_9012_3456 << 64; + const b: u128 = 0x00F0_FF00_F00F_00FF << 64; + + _ = &a; + _ = &b; try expect(@extractBits(a, b) == 0x0356_9256); } From e0b463035216daa0d2a46b283bdc9d5e3297a0ed Mon Sep 17 00:00:00 2001 From: ominitay <37453713+ominitay@users.noreply.github.com> Date: Thu, 25 Jan 2024 14:45:03 +0000 Subject: [PATCH 21/28] zig fmt --- src/codegen/llvm.zig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig index 7ef82b0482c5..1f5ece7a3056 100644 --- a/src/codegen/llvm.zig +++ b/src/codegen/llvm.zig @@ -10420,7 +10420,7 @@ pub const FuncGen = struct { const source_bit_set = try self.wip.icmp(.ne, source_bit, zero, ""); const bit_or_zero = try self.wip.select(.normal, source_bit_set, bit, zero, ""); // avoid using control flow const new_result = try self.wip.bin(.@"or", result_phi.toValue(), bit_or_zero, ""); - const new_bb = try self.wip.bin(.@"add", bb_phi.toValue(), bb_phi.toValue(), ""); + const new_bb = try self.wip.bin(.add, bb_phi.toValue(), bb_phi.toValue(), ""); const while_cond = try self.wip.icmp(.ne, new_mask, zero, ""); _ = try self.wip.brCond(while_cond, loop_block, after_block); @@ -10577,7 +10577,7 @@ pub const FuncGen = struct { const source_bit_set = try self.wip.icmp(.ne, source_bit, zero, ""); const bb_or_zero = try self.wip.select(.normal, source_bit_set, bb_phi.toValue(), zero, ""); // avoid using control flow const new_result = try self.wip.bin(.@"or", result_phi.toValue(), bb_or_zero, ""); - const new_bb = try self.wip.bin(.@"add", bb_phi.toValue(), bb_phi.toValue(), ""); + const new_bb = try 
self.wip.bin(.add, bb_phi.toValue(), bb_phi.toValue(), ""); const while_cond = try self.wip.icmp(.ne, new_mask, zero, ""); _ = try self.wip.brCond(while_cond, loop_block, after_block); From 4bcaab9c23d78362756359af711742c8819c0da9 Mon Sep 17 00:00:00 2001 From: ominitay <37453713+ominitay@users.noreply.github.com> Date: Thu, 25 Jan 2024 17:18:19 +0000 Subject: [PATCH 22/28] Don't compile tests for deposit/extract when unsupported --- test/behavior/deposit_extract_bits.zig | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/test/behavior/deposit_extract_bits.zig b/test/behavior/deposit_extract_bits.zig index 03f56ea81a24..e6b364d77eec 100644 --- a/test/behavior/deposit_extract_bits.zig +++ b/test/behavior/deposit_extract_bits.zig @@ -4,11 +4,20 @@ const mem = std.mem; const expect = std.testing.expect; const expectEqual = std.testing.expectEqual; -test "@depositBits" { +fn runnerSupportsPextPdep(int_width: u16) bool { switch (builtin.zig_backend) { - .stage2_llvm, .stage2_x86_64 => {}, - else => return error.SkipZigTest, // TODO + .stage2_llvm => return true, + .stage2_x86_64 => { + if (int_width > 64) return false; + if (!builtin.cpu.features.isEnabled(@intFromEnum(std.Target.x86.Feature.bmi2))) return false; + return true; + }, + else => return false, } +} + +test "@depositBits" { + if (comptime !runnerSupportsPextPdep(64)) return error.SkipZigTest; // TODO const S = struct { pub fn doTheTest() !void { @@ -37,7 +46,7 @@ test "@depositBits" { } test "@depositBits u128" { - if (builtin.zig_backend != .stage2_llvm) return error.SkipZigTest; + if (comptime !runnerSupportsPextPdep(128)) return error.SkipZigTest; // TODO const S = struct { pub fn doTheTest() !void { @@ -56,10 +65,7 @@ test "@depositBits u128" { } test "@extractBits" { - switch (builtin.zig_backend) { - .stage2_llvm, .stage2_x86_64 => {}, - else => return error.SkipZigTest, // TODO - } + if (comptime !runnerSupportsPextPdep(64)) return error.SkipZigTest; // TODO 
const S = struct { pub fn doTheTest() !void { @@ -88,7 +94,7 @@ test "@extractBits" { } test "@extractBits u128" { - if (builtin.zig_backend != .stage2_llvm) return error.SkipZigTest; // TODO + if (comptime !runnerSupportsPextPdep(128)) return error.SkipZigTest; // TODO const S = struct { pub fn doTheTest() !void { From 432e1cbe0f104029de28f04f602ae0349bad85b3 Mon Sep 17 00:00:00 2001 From: ominitay <37453713+ominitay@users.noreply.github.com> Date: Fri, 15 Mar 2024 21:12:24 +0000 Subject: [PATCH 23/28] Bring branch up-to-date with llvm backend changes --- src/codegen/llvm.zig | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig index 1f5ece7a3056..00ee1ccd0735 100644 --- a/src/codegen/llvm.zig +++ b/src/codegen/llvm.zig @@ -10360,7 +10360,7 @@ pub const FuncGen = struct { } const result = try self.wip.callIntrinsic( - .fast, + .normal, .none, intrinsic, &.{}, @@ -10424,19 +10424,19 @@ pub const FuncGen = struct { const while_cond = try self.wip.icmp(.ne, new_mask, zero, ""); _ = try self.wip.brCond(while_cond, loop_block, after_block); - try mask_phi.finish( + mask_phi.finish( &.{ start_mask, new_mask }, &.{ prev_block, loop_block }, &self.wip, ); - try result_phi.finish( + result_phi.finish( &.{ zero, new_result }, &.{ prev_block, loop_block }, &self.wip, ); - try bb_phi.finish( + bb_phi.finish( &.{ one, new_bb }, &.{ prev_block, loop_block }, &self.wip, @@ -10444,7 +10444,7 @@ pub const FuncGen = struct { self.wip.cursor = .{ .block = after_block }; const final_result = try self.wip.phi(ty, ""); - try final_result.finish( + final_result.finish( &.{new_result}, &.{loop_block}, &self.wip, @@ -10515,7 +10515,7 @@ pub const FuncGen = struct { } const result = try self.wip.callIntrinsic( - .fast, + .normal, .none, intrinsic, &.{}, @@ -10581,19 +10581,19 @@ pub const FuncGen = struct { const while_cond = try self.wip.icmp(.ne, new_mask, zero, ""); _ = try self.wip.brCond(while_cond, 
loop_block, after_block); - try mask_phi.finish( + mask_phi.finish( &.{ start_mask, new_mask }, &.{ prev_block, loop_block }, &self.wip, ); - try result_phi.finish( + result_phi.finish( &.{ start_result, new_result }, &.{ prev_block, loop_block }, &self.wip, ); - try bb_phi.finish( + bb_phi.finish( &.{ start_bb, new_bb }, &.{ prev_block, loop_block }, &self.wip, @@ -10601,7 +10601,7 @@ pub const FuncGen = struct { self.wip.cursor = .{ .block = after_block }; const final_result = try self.wip.phi(ty, ""); - try final_result.finish( + final_result.finish( &.{new_result}, &.{loop_block}, &self.wip, From 725019e74331431f85429811cb2aa85cbbfe744f Mon Sep 17 00:00:00 2001 From: ominitay <37453713+ominitay@users.noreply.github.com> Date: Tue, 26 Mar 2024 22:13:54 +0000 Subject: [PATCH 24/28] Emulate pdep and pext in compiler-rt Implements compiler-rt functions to emulate the PEXT and PDEP instructions from BMI2. These also implement the same functionality for arbitrarily-big integers. The existing emulation of these instructions has been removed from the LLVM backend, and replaced with calls to these compiler-rt functions. Some rework has been done in the backend to reduce code duplication. 
--- lib/compiler_rt.zig | 1 + lib/compiler_rt/pdeppext.zig | 177 +++++++++++++++++ src/codegen/llvm.zig | 373 ++++++++++------------------------- 3 files changed, 284 insertions(+), 267 deletions(-) create mode 100644 lib/compiler_rt/pdeppext.zig diff --git a/lib/compiler_rt.zig b/lib/compiler_rt.zig index 173e6af85a5e..5e1cc86abebc 100644 --- a/lib/compiler_rt.zig +++ b/lib/compiler_rt.zig @@ -9,6 +9,7 @@ comptime { _ = @import("compiler_rt/popcount.zig"); _ = @import("compiler_rt/bswap.zig"); _ = @import("compiler_rt/cmp.zig"); + _ = @import("compiler_rt/pdeppext.zig"); _ = @import("compiler_rt/shift.zig"); _ = @import("compiler_rt/negXi2.zig"); diff --git a/lib/compiler_rt/pdeppext.zig b/lib/compiler_rt/pdeppext.zig new file mode 100644 index 000000000000..c9784f946b23 --- /dev/null +++ b/lib/compiler_rt/pdeppext.zig @@ -0,0 +1,177 @@ +const std = @import("std"); +const builtin = @import("builtin"); +const common = @import("common.zig"); + +const Limb = u32; +const Log2Limb = u5; + +comptime { + @export(__pdep_bigint, .{ .name = "__pdep_bigint", .linkage = common.linkage, .visibility = common.visibility }); + @export(__pdep_u32, .{ .name = "__pdep_u32", .linkage = common.linkage, .visibility = common.visibility }); + @export(__pdep_u64, .{ .name = "__pdep_u64", .linkage = common.linkage, .visibility = common.visibility }); + @export(__pdep_u128, .{ .name = "__pdep_u128", .linkage = common.linkage, .visibility = common.visibility }); + + @export(__pext_bigint, .{ .name = "__pext_bigint", .linkage = common.linkage, .visibility = common.visibility }); + @export(__pext_u32, .{ .name = "__pext_u32", .linkage = common.linkage, .visibility = common.visibility }); + @export(__pext_u64, .{ .name = "__pext_u64", .linkage = common.linkage, .visibility = common.visibility }); + @export(__pext_u128, .{ .name = "__pext_u128", .linkage = common.linkage, .visibility = common.visibility }); +} + +const endian = builtin.cpu.arch.endian(); + +inline fn limb(x: []const Limb, i: 
usize) Limb { + return if (endian == .little) x[i] else x[x.len - 1 - i]; +} + +inline fn limb_ptr(x: []Limb, i: usize) *Limb { + return if (endian == .little) &x[i] else &x[x.len - 1 - i]; +} + +inline fn limb_set(x: []Limb, i: usize, v: Limb) void { + if (endian == .little) { + x[i] = v; + } else { + x[x.len - 1 - i] = v; + } +} + +// Code for bigint pdep and pext largely taken from std.math.big.int.depositBits and extractBits + +inline fn pdep_bigint(result: []Limb, source: []const Limb, mask: []const Limb) void { + @memset(result, 0); + + var mask_limb: Limb = limb(mask, 0); + var mask_limb_index: usize = 0; + var i: usize = 0; + + outer: while (true) : (i += 1) { + // Find the lowest set bit in mask + const mask_limb_bit: Log2Limb = limb_bit: while (true) { + const mask_limb_tz = @ctz(mask_limb); + if (mask_limb_tz != @bitSizeOf(Limb)) { + const cast_limb_bit: Log2Limb = @intCast(mask_limb_tz); + mask_limb ^= @as(Limb, 1) << cast_limb_bit; + break :limb_bit cast_limb_bit; + } + + mask_limb_index += 1; + if (mask_limb_index >= mask.len) break :outer; + + mask_limb = limb(mask, mask_limb_index); + }; + + const i_limb_index = i / 32; + const i_limb_bit: Log2Limb = @truncate(i); + + if (i_limb_index >= source.len) break; + + const source_bit_set = limb(source, i_limb_index) & (@as(Limb, 1) << i_limb_bit) != 0; + + limb_ptr(result, mask_limb_index).* |= @as(Limb, @intFromBool(source_bit_set)) << mask_limb_bit; + } +} + +pub fn __pdep_bigint(r: [*]Limb, s: [*]const Limb, m: [*]const Limb, bits: usize) callconv(.C) void { + const result = r[0 .. std.math.divCeil(usize, bits, 32) catch unreachable]; + const source = s[0 .. std.math.divCeil(usize, bits, 32) catch unreachable]; + const mask = m[0 .. 
std.math.divCeil(usize, bits, 32) catch unreachable]; + + pdep_bigint(result, source, mask); +} + +inline fn pext_bigint(result: []Limb, source: []const Limb, mask: []const Limb) void { + @memset(result, 0); + + var mask_limb: Limb = limb(mask, 0); + var mask_limb_index: usize = 0; + var i: usize = 0; + + outer: while (true) : (i += 1) { + const mask_limb_bit: Log2Limb = limb_bit: while (true) { + const mask_limb_tz = @ctz(mask_limb); + if (mask_limb_tz != @bitSizeOf(Limb)) { + const cast_limb_bit: Log2Limb = @intCast(mask_limb_tz); + mask_limb ^= @as(Limb, 1) << cast_limb_bit; + break :limb_bit cast_limb_bit; + } + + mask_limb_index += 1; + if (mask_limb_index >= mask.len) break :outer; + + mask_limb = limb(mask, mask_limb_index); + }; + + const i_limb_index = i / 32; + const i_limb_bit: Log2Limb = @truncate(i); + + if (i_limb_index >= source.len) break; + + const source_bit_set = limb(source, mask_limb_index) & (@as(Limb, 1) << mask_limb_bit) != 0; + + limb_ptr(result, i_limb_index).* |= @as(Limb, @intFromBool(source_bit_set)) << i_limb_bit; + } +} + +pub fn __pext_bigint(r: [*]Limb, s: [*]const Limb, m: [*]const Limb, bits: usize) callconv(.C) void { + const result = r[0 .. std.math.divCeil(usize, bits, 32) catch unreachable]; + const source = s[0 .. std.math.divCeil(usize, bits, 32) catch unreachable]; + const mask = m[0 .. 
std.math.divCeil(usize, bits, 32) catch unreachable]; + + pext_bigint(result, source, mask); +} + +inline fn pdep_uX(comptime T: type, source: T, mask_: T) T { + var bb: T = 1; + var result: T = 0; + var mask = mask_; + + while (mask != 0) { + const bit = mask & ~(mask - 1); + mask &= ~bit; + const source_bit = source & bb; + if (source_bit != 0) result |= bit; + bb += bb; + } + + return result; +} + +pub fn __pdep_u32(source: u32, mask: u32) callconv(.C) u32 { + return pdep_uX(u32, source, mask); +} + +pub fn __pdep_u64(source: u64, mask: u64) callconv(.C) u64 { + return pdep_uX(u64, source, mask); +} + +pub fn __pdep_u128(source: u128, mask: u128) callconv(.C) u128 { + return pdep_uX(u128, source, mask); +} + +inline fn pext_uX(comptime T: type, source: T, mask_: T) T { + var bb: T = 1; + var result: T = 0; + var mask = mask_; + + while (mask != 0) { + const bit = mask & ~(mask - 1); + mask &= ~bit; + const source_bit = source & bit; + if (source_bit != 0) result |= bb; + bb += bb; + } + + return result; +} + +pub fn __pext_u32(source: u32, mask: u32) callconv(.C) u32 { + return pext_uX(u32, source, mask); +} + +pub fn __pext_u64(source: u64, mask: u64) callconv(.C) u64 { + return pext_uX(u64, source, mask); +} + +pub fn __pext_u128(source: u128, mask: u128) callconv(.C) u128 { + return pext_uX(u128, source, mask); +} diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig index 00ee1ccd0735..da9da70faf40 100644 --- a/src/codegen/llvm.zig +++ b/src/codegen/llvm.zig @@ -5104,8 +5104,8 @@ pub const FuncGen = struct { .work_group_size => try self.airWorkGroupSize(inst), .work_group_id => try self.airWorkGroupId(inst), - .deposit_bits => try self.airDepositBits(inst), - .extract_bits => try self.airExtractBits(inst), + .deposit_bits, + .extract_bits => |tag| try self.airDepositExtractBits(inst, tag), // zig fmt: on }; if (val != .none) try self.func_inst_table.putNoClobber(self.gpa, inst.toRef(), val); @@ -10298,316 +10298,155 @@ pub const FuncGen = struct { return 
self.amdgcnWorkIntrinsic(dimension, 0, "amdgcn.workgroup.id"); } - fn airDepositBits(self: *FuncGen, inst: Air.Inst.Index) !Builder.Value { + fn airDepositExtractBits(self: *FuncGen, inst: Air.Inst.Index, tag: Air.Inst.Tag) !Builder.Value { if (self.liveness.isUnused(inst)) return .none; const o = self.dg.object; const bin_op = self.air.instructions.items(.data)[@intFromEnum(inst)].bin_op; - const lhs = try self.resolveInst(bin_op.lhs); - const rhs = try self.resolveInst(bin_op.rhs); + const source = try self.resolveInst(bin_op.lhs); + const mask = try self.resolveInst(bin_op.rhs); const inst_ty = self.typeOfIndex(inst); - const ty = try o.lowerType(inst_ty); const target = o.module.getTarget(); - const params = [2]Builder.Value{ lhs, rhs }; + + const llvm_ty = try o.lowerType(inst_ty); + const bits: u16 = @intCast(llvm_ty.scalarBits(&o.builder)); + switch (target.cpu.arch) { - .x86, .x86_64 => |tag| blk: { + .x86, .x86_64 => |arch| blk: { // Doesn't have pdep if (!std.Target.x86.featureSetHas(target.cpu.features, .bmi2)) break :blk; - const bits = inst_ty.intInfo(o.module).bits; - const supports_64 = tag == .x86_64; + const supports_64 = arch == .x86_64; // Integer size doesn't match the available instruction(s) if (!(bits <= 32 or (bits <= 64 and supports_64))) break :blk; - return try self.buildDepositBitsNative(ty, params); - }, - else => {}, - } + const compiler_rt_bits = compilerRtIntBits(bits); - return try self.buildDepositBitsEmulated(ty, params); - } - - fn buildDepositBitsNative( - self: *FuncGen, - ty: Builder.Type, - params: [2]Builder.Value, - ) !Builder.Value { - const o = self.dg.object; - const target = o.module.getTarget(); + var buf: ["x86.bmi.pdep.32".len]u8 = undefined; + const intrinsic = std.meta.stringToEnum(Builder.Intrinsic, std.fmt.bufPrint(&buf, "x86.bmi.{s}.{d}", .{ + switch (tag) { + .deposit_bits => "pdep", + .extract_bits => "pext", + else => unreachable, + }, + compiler_rt_bits, + }) catch unreachable).?; - 
assert(target.cpu.arch.isX86()); - assert(std.Target.x86.featureSetHas(target.cpu.features, .bmi2)); + const needs_extend = bits != compiler_rt_bits; + const extended_ty = if (needs_extend) try o.builder.intType(compiler_rt_bits) else llvm_ty; - const bits = ty.scalarBits(&o.builder); - const intrinsic: Builder.Intrinsic = switch (bits) { - 1...32 => .@"x86.bmi.pdep.32", - 33...64 => .@"x86.bmi.pdep.64", - else => unreachable, - }; - const needs_extend = bits != 32 and bits != 64; + const params = .{ + if (needs_extend) try self.wip.cast(.zext, source, extended_ty, "") else source, + if (needs_extend) try self.wip.cast(.zext, mask, extended_ty, "") else mask, + }; - var params_cast = params; + const result = try self.wip.callIntrinsic( + .normal, + .none, + intrinsic, + &.{}, + ¶ms, + "", + ); - // Cast to either a 32 or 64-bit integer - if (needs_extend) { - const extend_ty = try o.builder.intType(if (bits <= 32) 32 else 64); - params_cast = .{ - try self.wip.cast(.zext, params[0], extend_ty, ""), - try self.wip.cast(.zext, params[1], extend_ty, ""), - }; + return if (needs_extend) try self.wip.cast(.trunc, result, llvm_ty, "") else result; + }, + else => {}, } - const result = try self.wip.callIntrinsic( - .normal, - .none, - intrinsic, - &.{}, - ¶ms_cast, - "", - ); + return try self.genDepositExtractBitsEmulated(tag, bits, source, mask, llvm_ty); + } - // No cast needed! - if (!needs_extend) return result; - - // Cast back to the original integer size - return try self.wip.cast(.trunc, result, ty, ""); - } - - // TODO Move this to compiler-rt (see #14609) - // - // Implements @depositBits(source, mask) in software - // (i.e. 
without platform-specific instructions) - // - // var bb = 1; - // var result = 0; - // do { - // const bit = mask & -mask; - // mask &= ~bit; - // const source_bit = source & bb; - // if (source_bit) result |= bit; - // bb += bb; - // } while (mask) - // - // return result; - fn buildDepositBitsEmulated( - self: *FuncGen, - ty: Builder.Type, - params: [2]Builder.Value, - ) !Builder.Value { + fn genDepositExtractBitsEmulated(self: *FuncGen, tag: Air.Inst.Tag, bits: u16, source: Builder.Value, mask: Builder.Value, ty: Builder.Type) !Builder.Value { const o = self.dg.object; + const mod = o.module; - const source = params[0]; - const start_mask = params[1]; - const zero = try o.builder.intValue(ty, 0); - const one = try o.builder.intValue(ty, 1); - - const prev_block = self.wip.cursor.block; - const loop_block = try self.wip.block(2, "Loop"); - const after_block = try self.wip.block(1, "After"); - - _ = try self.wip.br(loop_block); - self.wip.cursor = .{ .block = loop_block }; - const mask_phi = try self.wip.phi(ty, ""); - const result_phi = try self.wip.phi(ty, ""); - const bb_phi = try self.wip.phi(ty, ""); - const minus_mask = try self.wip.neg(mask_phi.toValue(), ""); - const bit = try self.wip.bin(.@"and", mask_phi.toValue(), minus_mask, ""); - const not_bit = try self.wip.not(bit, ""); - const new_mask = try self.wip.bin(.@"and", mask_phi.toValue(), not_bit, ""); - const source_bit = try self.wip.bin(.@"and", source, bb_phi.toValue(), ""); - const source_bit_set = try self.wip.icmp(.ne, source_bit, zero, ""); - const bit_or_zero = try self.wip.select(.normal, source_bit_set, bit, zero, ""); // avoid using control flow - const new_result = try self.wip.bin(.@"or", result_phi.toValue(), bit_or_zero, ""); - const new_bb = try self.wip.bin(.add, bb_phi.toValue(), bb_phi.toValue(), ""); - const while_cond = try self.wip.icmp(.ne, new_mask, zero, ""); - _ = try self.wip.brCond(while_cond, loop_block, after_block); - - mask_phi.finish( - &.{ start_mask, new_mask }, - 
&.{ prev_block, loop_block }, - &self.wip, - ); - - result_phi.finish( - &.{ zero, new_result }, - &.{ prev_block, loop_block }, - &self.wip, - ); + if (bits <= 128) { + const compiler_rt_bits = compilerRtIntBits(bits); + const needs_extend = bits != compiler_rt_bits; + const extended_ty = if (needs_extend) try o.builder.intType(compiler_rt_bits) else ty; - bb_phi.finish( - &.{ one, new_bb }, - &.{ prev_block, loop_block }, - &self.wip, - ); + const fn_name = try o.builder.strtabStringFmt("__{s}_u{d}", .{ + switch (tag) { + .deposit_bits => "pdep", + .extract_bits => "pext", + else => unreachable, + }, + compiler_rt_bits, + }); - self.wip.cursor = .{ .block = after_block }; - const final_result = try self.wip.phi(ty, ""); - final_result.finish( - &.{new_result}, - &.{loop_block}, - &self.wip, - ); + const params = .{ + if (needs_extend) try self.wip.cast(.zext, source, extended_ty, "") else source, + if (needs_extend) try self.wip.cast(.zext, mask, extended_ty, "") else mask, + }; - return final_result.toValue(); - } + const libc_fn = try self.getLibcFunction(fn_name, &.{ extended_ty, extended_ty }, extended_ty); + const result = try self.wip.call( + .normal, + .ccc, + .none, + libc_fn.typeOf(&o.builder), + libc_fn.toValue(&o.builder), + ¶ms, + "", + ); - fn airExtractBits(self: *FuncGen, inst: Air.Inst.Index) !Builder.Value { - if (self.liveness.isUnused(inst)) return .none; + return if (needs_extend) try self.wip.cast(.trunc, result, ty, "") else result; + } - const o = self.dg.object; + // Rounded bits to the nearest 32, as limb size is 32. 
+ const extended_bits = (((bits - 1) / 32) + 1) * 32; + const needs_extend = bits != extended_bits; + const extended_ty = if (needs_extend) try o.builder.intType(extended_bits) else ty; - const bin_op = self.air.instructions.items(.data)[@intFromEnum(inst)].bin_op; - const lhs = try self.resolveInst(bin_op.lhs); - const rhs = try self.resolveInst(bin_op.rhs); - const inst_ty = self.typeOfIndex(inst); - const ty = try o.lowerType(inst_ty); + const source_extended = if (needs_extend) try self.wip.cast(.zext, source, extended_ty, "") else source; + const mask_extended = if (needs_extend) try self.wip.cast(.zext, mask, extended_ty, "") else mask; + const zeroes_extended = try o.builder.intValue(extended_ty, 0); - const target = o.module.getTarget(); - const params = [2]Builder.Value{ lhs, rhs }; - switch (target.cpu.arch) { - .x86, .x86_64 => |tag| blk: { - // Doesn't have pext - if (!std.Target.x86.featureSetHas(target.cpu.features, .bmi2)) break :blk; + const alignment = Type.u32.abiAlignment(mod).toLlvm(); - const bits = inst_ty.intInfo(o.module).bits; - const supports_64 = tag == .x86_64; - // Integer size doesn't match the available instruction(s) - if (!(bits <= 32 or (bits <= 64 and supports_64))) break :blk; + const source_pointer = try self.buildAlloca(extended_ty, alignment); + const mask_pointer = try self.buildAlloca(extended_ty, alignment); + const result_pointer = try self.buildAlloca(extended_ty, alignment); - return self.buildExtractBitsNative(ty, params); - }, - else => {}, - } + _ = try self.wip.store(.normal, source_extended, source_pointer, alignment); + _ = try self.wip.store(.normal, mask_extended, mask_pointer, alignment); + _ = try self.wip.store(.normal, zeroes_extended, result_pointer, alignment); - return self.buildExtractBitsEmulated(ty, params); - } + const fn_name = try o.builder.strtabStringFmt("__{s}_bigint", .{switch (tag) { + .deposit_bits => "pdep", + .extract_bits => "pext", + else => unreachable, + }}); - fn buildExtractBitsNative( 
- self: *FuncGen, - ty: Builder.Type, - params: [2]Builder.Value, - ) !Builder.Value { - const o = self.dg.object; - const target = o.module.getTarget(); + const pointer_ty = source_pointer.typeOfWip(&self.wip); + const usize_ty = try o.lowerType(Type.usize); + const void_ty = try o.lowerType(Type.void); - assert(target.cpu.arch.isX86()); - assert(std.Target.x86.featureSetHas(target.cpu.features, .bmi2)); + const bits_value = try o.builder.intValue(usize_ty, bits); - const bits = ty.scalarBits(&o.builder); - const intrinsic: Builder.Intrinsic = switch (bits) { - 1...32 => .@"x86.bmi.pext.32", - 33...64 => .@"x86.bmi.pext.64", - else => unreachable, + const params = .{ + result_pointer, + source_pointer, + mask_pointer, + bits_value, }; - const needs_extend = bits != 32 and bits != 64; - - var params_cast = params; - // Cast to either a 32 or 64-bit integer - if (needs_extend) { - const extend_ty = try o.builder.intType(if (bits <= 32) 32 else 64); - params_cast = .{ - try self.wip.cast(.zext, params[0], extend_ty, ""), - try self.wip.cast(.zext, params[1], extend_ty, ""), - }; - } - - const result = try self.wip.callIntrinsic( + const libc_fn = try self.getLibcFunction(fn_name, &.{ pointer_ty, pointer_ty, pointer_ty, usize_ty }, void_ty); + _ = try self.wip.call( .normal, + .ccc, .none, - intrinsic, - &.{}, - ¶ms_cast, + libc_fn.typeOf(&o.builder), + libc_fn.toValue(&o.builder), + ¶ms, "", ); - // No cast needed! - if (!needs_extend) return result; - - // Cast back to the original integer size - return try self.wip.cast(.trunc, result, ty, ""); - } - - // TODO Move this to compiler-rt (see #14609) - // - // Implements @extractBits(source, mask) in software - // (i.e. 
without platform-specific instructions) - // - // var bb = 1; - // var result = 0; - // do { - // const bit = mask & -mask; - // mask &= ~bit; - // const source_bit = source & bit; - // if (source_bit != 0) result |= bb; - // bb += bb; - // } while (mask) - // - // return result; - fn buildExtractBitsEmulated( - self: *FuncGen, - ty: Builder.Type, - params: [2]Builder.Value, - ) !Builder.Value { - const o = self.dg.object; - - const source = params[0]; - const start_mask = params[1]; - const zero = try o.builder.intValue(ty, 0); - const one = try o.builder.intValue(ty, 1); - const start_result = zero; - const start_bb = one; - - const prev_block = self.wip.cursor.block; - const loop_block = try self.wip.block(2, "Loop"); - const after_block = try self.wip.block(1, "After"); - - _ = try self.wip.br(loop_block); - self.wip.cursor = .{ .block = loop_block }; - const mask_phi = try self.wip.phi(ty, ""); - const result_phi = try self.wip.phi(ty, ""); - const bb_phi = try self.wip.phi(ty, ""); - const minus_mask = try self.wip.neg(mask_phi.toValue(), ""); - const bit = try self.wip.bin(.@"and", mask_phi.toValue(), minus_mask, ""); - const not_bit = try self.wip.not(bit, ""); - const new_mask = try self.wip.bin(.@"and", mask_phi.toValue(), not_bit, ""); - const source_bit = try self.wip.bin(.@"and", source, bit, ""); - const source_bit_set = try self.wip.icmp(.ne, source_bit, zero, ""); - const bb_or_zero = try self.wip.select(.normal, source_bit_set, bb_phi.toValue(), zero, ""); // avoid using control flow - const new_result = try self.wip.bin(.@"or", result_phi.toValue(), bb_or_zero, ""); - const new_bb = try self.wip.bin(.add, bb_phi.toValue(), bb_phi.toValue(), ""); - const while_cond = try self.wip.icmp(.ne, new_mask, zero, ""); - _ = try self.wip.brCond(while_cond, loop_block, after_block); - - mask_phi.finish( - &.{ start_mask, new_mask }, - &.{ prev_block, loop_block }, - &self.wip, - ); - - result_phi.finish( - &.{ start_result, new_result }, - &.{ prev_block, 
loop_block }, - &self.wip, - ); - - bb_phi.finish( - &.{ start_bb, new_bb }, - &.{ prev_block, loop_block }, - &self.wip, - ); - - self.wip.cursor = .{ .block = after_block }; - const final_result = try self.wip.phi(ty, ""); - final_result.finish( - &.{new_result}, - &.{loop_block}, - &self.wip, - ); - - return final_result.toValue(); + const result = try self.wip.load(.normal, extended_ty, result_pointer, alignment, ""); + return if (needs_extend) try self.wip.cast(.trunc, result, ty, "") else result; } fn getErrorNameTable(self: *FuncGen) Allocator.Error!Builder.Variable.Index { From e1915f93cf4364c5aa01dfe8440b870e5fd19c6c Mon Sep 17 00:00:00 2001 From: ominitay <37453713+ominitay@users.noreply.github.com> Date: Wed, 27 Mar 2024 01:14:03 +0000 Subject: [PATCH 25/28] Include depositBits/extractBits emulation in x86 backend Adds calls into compiler-rt in the x86 backend for depositBits and extractBits. This brings the x86 backend on-par with the LLVM backend, now fully supporting these builtins for all targets and integer sizes. Some refactoring has been applied to reduce code duplication. 
--- src/arch/x86_64/CodeGen.zig | 209 ++++++++++++++++++++---------------- 1 file changed, 119 insertions(+), 90 deletions(-) diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index 74c0b83c742e..2c217f3f4c33 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig @@ -2196,8 +2196,9 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void { .work_group_size => unreachable, .work_group_id => unreachable, - .deposit_bits => try self.airDepositBits(inst), - .extract_bits => try self.airExtractBits(inst), + .deposit_bits, + .extract_bits, + => |tag| try self.airDepositExtractBits(inst, tag), // zig fmt: on } @@ -5572,97 +5573,112 @@ fn airPtrSlicePtrPtr(self: *Self, inst: Air.Inst.Index) !void { return self.finishAir(inst, dst_mcv, .{ ty_op.operand, .none, .none }); } -fn airDepositBits(self: *Self, inst: Air.Inst.Index) !void { +fn airDepositExtractBits(self: *Self, inst: Air.Inst.Index, tag: Air.Inst.Tag) !void { const mod = self.bin_file.comp.module.?; const bin_op = self.air.instructions.items(.data)[@intFromEnum(inst)].bin_op; - const lhs_mcv = try self.resolveInst(bin_op.lhs); - const rhs_mcv = try self.resolveInst(bin_op.rhs); - const dest_ty = self.typeOfIndex(inst); + const dest_ty = self.typeOfIndex(inst); const abi_size: u32 = @intCast(@max(dest_ty.abiSize(mod), 4)); - if (!self.hasFeature(.bmi2) or abi_size > 8) - return self.fail("TODO implement depositBits without bmi2", .{}); - - var lhs_copied_to_dest = false; - const dest_mcv: MCValue = dest: { - if (rhs_mcv.isRegister() and self.reuseOperand(inst, bin_op.rhs, 1, rhs_mcv)) - break :dest rhs_mcv; + const result = if (!self.hasFeature(.bmi2) or abi_size > 8) + try genDepositExtractBitsEmulated(self, inst, tag, bin_op.lhs, bin_op.rhs, dest_ty, abi_size) + else + try genDepositExtractBitsNative(self, inst, tag, bin_op.lhs, bin_op.rhs, dest_ty, abi_size); - if (lhs_mcv.isRegister() and self.reuseOperand(inst, bin_op.lhs, 0, lhs_mcv)) - break :dest 
lhs_mcv; + return self.finishAir(inst, result, .{ bin_op.lhs, bin_op.rhs, .none }); +} - lhs_copied_to_dest = true; - break :dest try self.copyToRegisterWithInstTracking(inst, dest_ty, lhs_mcv); - }; +fn genDepositExtractBitsEmulated( + self: *Self, + inst: Air.Inst.Index, + tag: Air.Inst.Tag, + lhs: Air.Inst.Ref, + rhs: Air.Inst.Ref, + dest_ty: Type, + abi_size: u32, +) !MCValue { + const mod = self.bin_file.comp.module.?; - const lhs_lock: ?RegisterLock = switch (lhs_mcv) { - .register => |reg| self.register_manager.lockRegAssumeUnused(reg), - else => null, - }; - defer if (lhs_lock) |lock| self.register_manager.unlockReg(lock); + var callee_buf: ["__pdep_bigint".len]u8 = undefined; + const callee = std.fmt.bufPrint(&callee_buf, "__{s}_{s}", .{ + switch (tag) { + .deposit_bits => "pdep", + .extract_bits => "pext", + else => unreachable, + }, + switch (abi_size) { + 0...4 => "u32", + 5...8 => "u64", + 9...16 => "u128", + else => "bigint", + }, + }) catch unreachable; - const rhs_lock: ?RegisterLock = switch (rhs_mcv) { - .register => |reg| self.register_manager.lockReg(reg), - else => null, - }; - defer if (rhs_lock) |lock| self.register_manager.unlockReg(lock); + if (abi_size <= 16) return try self.genCall(.{ .lib = .{ + .return_type = dest_ty.toIntern(), + .param_types = &.{ dest_ty.toIntern(), dest_ty.toIntern() }, + .callee = callee, + } }, &.{ dest_ty, dest_ty }, &.{ .{ .air_ref = lhs }, .{ .air_ref = rhs } }); - const dest_lock = self.register_manager.lockReg(dest_mcv.getReg().?); - defer if (dest_lock) |lock| self.register_manager.unlockReg(lock); + const bit_count = dest_ty.intInfo(mod).bits; - const dest_reg = registerAlias(dest_mcv.getReg().?, abi_size); - const lhs_reg = if (lhs_copied_to_dest) dest_reg else registerAlias(if (lhs_mcv.getReg()) |reg| reg else try self.copyToTmpRegister(dest_ty, lhs_mcv), abi_size); + const dest_mcv = try self.allocRegOrMemAdvanced(dest_ty, inst, false); + const lhs_mcv = try self.resolveInst(lhs); + const rhs_mcv = try 
self.resolveInst(rhs); - if (rhs_mcv.isMemory()) { - try self.asmRegisterRegisterMemory( - .{ ._, .pdep }, - dest_reg, - lhs_reg, - try rhs_mcv.mem(self, Memory.Size.fromSize(abi_size)), - ); - } else { - const rhs_reg = registerAlias( - if (rhs_mcv.getReg()) |reg| reg else try self.copyToTmpRegister(dest_ty, rhs_mcv), - abi_size, - ); + const manyptr_u32_ty = try mod.ptrType(.{ + .child = .u32_type, + .flags = .{ + .size = .Many, + }, + }); + const manyptr_const_u32_ty = try mod.ptrType(.{ + .child = .u32_type, + .flags = .{ + .size = .Many, + .is_const = true, + }, + }); - try self.asmRegisterRegisterRegister( - .{ ._, .pdep }, - dest_reg, - lhs_reg, - rhs_reg, - ); - } + _ = try self.genCall(.{ .lib = .{ + .return_type = .void_type, + .param_types = &.{ + manyptr_u32_ty.toIntern(), + manyptr_const_u32_ty.toIntern(), + manyptr_const_u32_ty.toIntern(), + .usize_type, + }, + .callee = callee, + } }, &.{ + manyptr_u32_ty, + manyptr_const_u32_ty, + manyptr_const_u32_ty, + Type.usize, + }, &.{ + dest_mcv.address(), + lhs_mcv.address(), + rhs_mcv.address(), + .{ .immediate = bit_count }, + }); - return self.finishAir(inst, .{ .register = dest_reg }, .{ bin_op.lhs, bin_op.rhs, .none }); + return dest_mcv; } -fn airExtractBits(self: *Self, inst: Air.Inst.Index) !void { - const mod = self.bin_file.comp.module.?; - - const bin_op = self.air.instructions.items(.data)[@intFromEnum(inst)].bin_op; - const lhs_mcv = try self.resolveInst(bin_op.lhs); - const rhs_mcv = try self.resolveInst(bin_op.rhs); - const dest_ty = self.typeOfIndex(inst); - - const abi_size: u32 = @intCast(@max(dest_ty.abiSize(mod), 4)); - - if (!self.hasFeature(.bmi2) or abi_size > 8) - return self.fail("TODO implement extractBits without bmi2", .{}); - - var lhs_copied_to_dest = false; - const dest_mcv: MCValue = dest: { - if (rhs_mcv.isRegister() and self.reuseOperand(inst, bin_op.rhs, 1, rhs_mcv)) - break :dest rhs_mcv; - - if (lhs_mcv.isRegister() and self.reuseOperand(inst, bin_op.lhs, 0, lhs_mcv)) - 
break :dest lhs_mcv; +fn genDepositExtractBitsNative( + self: *Self, + inst: Air.Inst.Index, + tag: Air.Inst.Tag, + lhs: Air.Inst.Ref, + rhs: Air.Inst.Ref, + dest_ty: Type, + abi_size: u32, +) !MCValue { + assert(self.hasFeature(.bmi2)); // BMI2 must be present for PEXT/PDEP instructions + assert(abi_size <= 8); // PEXT/PDEP only exist for 64-bit and below - lhs_copied_to_dest = true; - break :dest try self.copyToRegisterWithInstTracking(inst, dest_ty, lhs_mcv); - }; + const lhs_mcv = try self.resolveInst(lhs); + const rhs_mcv = try self.resolveInst(rhs); const lhs_lock: ?RegisterLock = switch (lhs_mcv) { .register => |reg| self.register_manager.lockRegAssumeUnused(reg), @@ -5676,34 +5692,47 @@ fn airExtractBits(self: *Self, inst: Air.Inst.Index) !void { }; defer if (rhs_lock) |lock| self.register_manager.unlockReg(lock); - const dest_lock = self.register_manager.lockReg(dest_mcv.getReg().?); + const dest_mcv: MCValue, const dest_is_lhs = dest: { + if (rhs_mcv.isRegister() and self.reuseOperand(inst, rhs, 1, rhs_mcv)) + break :dest .{ rhs_mcv, false }; + + if (lhs_mcv.isRegister() and self.reuseOperand(inst, lhs, 0, lhs_mcv)) + break :dest .{ lhs_mcv, false }; + + break :dest .{ try self.copyToRegisterWithInstTracking(inst, dest_ty, lhs_mcv), true }; + }; + + const dest_reg = dest_mcv.getReg().?; + const dest_lock = self.register_manager.lockReg(dest_reg); defer if (dest_lock) |lock| self.register_manager.unlockReg(lock); - const dest_reg = registerAlias(dest_mcv.getReg().?, abi_size); - const lhs_reg = if (lhs_copied_to_dest) dest_reg else registerAlias(if (lhs_mcv.getReg()) |reg| reg else try self.copyToTmpRegister(dest_ty, lhs_mcv), abi_size); + const lhs_reg = if (dest_is_lhs) dest_reg else if (lhs_mcv.getReg()) |reg| reg else try self.copyToTmpRegister(dest_ty, lhs_mcv); + + const mir_tag = Mir.Inst.FixedTag{ ._, switch (tag) { + .deposit_bits => .pdep, + .extract_bits => .pext, + else => unreachable, + } }; if (rhs_mcv.isMemory()) { try 
self.asmRegisterRegisterMemory( - .{ ._, .pext }, - dest_reg, - lhs_reg, + mir_tag, + registerAlias(dest_reg, abi_size), + registerAlias(lhs_reg, abi_size), try rhs_mcv.mem(self, Memory.Size.fromSize(abi_size)), ); } else { - const rhs_reg = registerAlias( - if (rhs_mcv.getReg()) |reg| reg else try self.copyToTmpRegister(dest_ty, rhs_mcv), - abi_size, - ); + const rhs_reg = if (rhs_mcv.getReg()) |reg| reg else try self.copyToTmpRegister(dest_ty, rhs_mcv); try self.asmRegisterRegisterRegister( - .{ ._, .pext }, - dest_reg, - lhs_reg, - rhs_reg, + mir_tag, + registerAlias(dest_reg, abi_size), + registerAlias(lhs_reg, abi_size), + registerAlias(rhs_reg, abi_size), ); } - return self.finishAir(inst, .{ .register = dest_reg }, .{ bin_op.lhs, bin_op.rhs, .none }); + return dest_mcv; } fn elemOffset(self: *Self, index_ty: Type, index: MCValue, elem_size: u64) !Register { From e80a4b2113023dc586c8a5b26478974932e1878c Mon Sep 17 00:00:00 2001 From: ominitay <37453713+ominitay@users.noreply.github.com> Date: Thu, 28 Mar 2024 22:19:10 +0000 Subject: [PATCH 26/28] Update behaviour tests for deposit/extractBits Adds a test for u256 to provide some coverage for codegen of __pdep_bigint and __pext_bigint. Also stops skipping tests on the x86 backend. 
--- test/behavior/deposit_extract_bits.zig | 78 ++++++++++++++++++-------- 1 file changed, 55 insertions(+), 23 deletions(-) diff --git a/test/behavior/deposit_extract_bits.zig b/test/behavior/deposit_extract_bits.zig index e6b364d77eec..0f0f0663b191 100644 --- a/test/behavior/deposit_extract_bits.zig +++ b/test/behavior/deposit_extract_bits.zig @@ -4,20 +4,14 @@ const mem = std.mem; const expect = std.testing.expect; const expectEqual = std.testing.expectEqual; -fn runnerSupportsPextPdep(int_width: u16) bool { - switch (builtin.zig_backend) { - .stage2_llvm => return true, - .stage2_x86_64 => { - if (int_width > 64) return false; - if (!builtin.cpu.features.isEnabled(@intFromEnum(std.Target.x86.Feature.bmi2))) return false; - return true; - }, - else => return false, - } -} +const supports_pext_pdep = switch (builtin.zig_backend) { + .stage2_llvm => true, + .stage2_x86_64 => true, + else => false, +}; test "@depositBits" { - if (comptime !runnerSupportsPextPdep(64)) return error.SkipZigTest; // TODO + if (!supports_pext_pdep) return error.SkipZigTest; // TODO const S = struct { pub fn doTheTest() !void { @@ -46,12 +40,12 @@ test "@depositBits" { } test "@depositBits u128" { - if (comptime !runnerSupportsPextPdep(128)) return error.SkipZigTest; // TODO + if (!supports_pext_pdep) return error.SkipZigTest; // TODO const S = struct { pub fn doTheTest() !void { - const a: u64 = 0x1234_5678_9012_3456; - const b: u128 = 0x00F0_FF00_F00F_00FF << 64; + var a: u64 = 0x1234_5678_9012_3456; + var b: u128 = 0x00F0_FF00_F00F_00FF << 64; _ = &a; _ = &b; @@ -64,15 +58,34 @@ test "@depositBits u128" { try comptime S.doTheTest(); } +test "@depositBits u256" { + if (!supports_pext_pdep) return error.SkipZigTest; // TODO + + const S = struct { + pub fn doTheTest() !void { + var a: u64 = 0x1234_5678_9ABC_DEF0; + var b: u256 = 0x0F00_0FF0_0F0F_FF00 << 174; + + _ = &a; + _ = &b; + + try expect(@depositBits(a, b) == 0x0A00_0BC0_0D0E_F000 << 174); + } + }; + + try S.doTheTest(); + try 
comptime S.doTheTest(); +} + test "@extractBits" { - if (comptime !runnerSupportsPextPdep(64)) return error.SkipZigTest; // TODO + if (!supports_pext_pdep) return error.SkipZigTest; // TODO const S = struct { pub fn doTheTest() !void { - const a: u64 = 0; - const b: u64 = 0xFFFF_FFFF_FFFF_FFFF; - const c: u64 = 0x1234_5678_9012_3456; - const d: u64 = 0x00F0_FF00_F00F_00FF; + var a: u64 = 0; + var b: u64 = 0xFFFF_FFFF_FFFF_FFFF; + var c: u64 = 0x1234_5678_9012_3456; + var d: u64 = 0x00F0_FF00_F00F_00FF; _ = &a; _ = &b; @@ -94,12 +107,12 @@ test "@extractBits" { } test "@extractBits u128" { - if (comptime !runnerSupportsPextPdep(128)) return error.SkipZigTest; // TODO + if (!supports_pext_pdep) return error.SkipZigTest; // TODO const S = struct { pub fn doTheTest() !void { - const a: u128 = 0x1234_5678_9012_3456 << 64; - const b: u128 = 0x00F0_FF00_F00F_00FF << 64; + var a: u128 = 0x1234_5678_9012_3456 << 64; + var b: u128 = 0x00F0_FF00_F00F_00FF << 64; _ = &a; _ = &b; @@ -111,3 +124,22 @@ test "@extractBits u128" { try S.doTheTest(); try comptime S.doTheTest(); } + +test "@extractBits u256" { + if (!supports_pext_pdep) return error.SkipZigTest; // TODO + + const S = struct { + pub fn doTheTest() !void { + var a: u256 = 0x1234_5678_9ABC_DEF0 << 96; + var b: u256 = 0x0F00_0FF0_0F0F_FF00 << 96; + + _ = &a; + _ = &b; + + try expect(@extractBits(a, b) == 0x0267_ACDE); + } + }; + + try S.doTheTest(); + try comptime S.doTheTest(); +} From b87e549762517b2f8c6d7c7b929384029b95f6b2 Mon Sep 17 00:00:00 2001 From: ominitay <37453713+ominitay@users.noreply.github.com> Date: Fri, 29 Mar 2024 19:24:56 +0000 Subject: [PATCH 27/28] Bring fork up-to-date with master --- src/Sema.zig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Sema.zig b/src/Sema.zig index 31b456cb6909..9a39773f0709 100644 --- a/src/Sema.zig +++ b/src/Sema.zig @@ -26466,7 +26466,7 @@ fn zirDepositExtractBits( if (dest_ty.zigTypeTag(mod) == .ComptimeInt) { if (maybe_lhs_val) |lhs_val| { if 
(!lhs_val.isUndef(mod) and lhs_val.orderAgainstZero(mod) == .lt) { - const err = try sema.errMsg(block, lhs_src, "use of negative value '{}'", .{lhs_val.fmtValue(lhs_ty, sema.mod)}); + const err = try sema.errMsg(block, lhs_src, "use of negative value '{}'", .{lhs_val.fmtValue(sema.mod)}); try sema.errNote(block, src, err, "parameters to {s} must be positive", .{builtin_name}); return sema.failWithOwnedErrorMsg(block, err); } @@ -26474,7 +26474,7 @@ fn zirDepositExtractBits( if (maybe_rhs_val) |rhs_val| { if (!rhs_val.isUndef(mod) and rhs_val.orderAgainstZero(mod) == .lt) { - const err = try sema.errMsg(block, rhs_src, "use of negative value '{}'", .{rhs_val.fmtValue(rhs_ty, sema.mod)}); + const err = try sema.errMsg(block, rhs_src, "use of negative value '{}'", .{rhs_val.fmtValue(sema.mod)}); try sema.errNote(block, src, err, "parameters to {s} must be positive", .{builtin_name}); return sema.failWithOwnedErrorMsg(block, err); } From 726b43619feb4ecfe9c6f7aaa40ca1ccd31702dc Mon Sep 17 00:00:00 2001 From: ominitay <37453713+ominitay@users.noreply.github.com> Date: Wed, 17 Apr 2024 01:50:21 +0100 Subject: [PATCH 28/28] Skip failing behaviour tests Disables the two behaviour tests which are caused to fail on the x86_64 backend by #19498. Fixing the underlying issue is not within the scope of this pull request. 
--- test/behavior/deposit_extract_bits.zig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/behavior/deposit_extract_bits.zig b/test/behavior/deposit_extract_bits.zig index 0f0f0663b191..fb393866be08 100644 --- a/test/behavior/deposit_extract_bits.zig +++ b/test/behavior/deposit_extract_bits.zig @@ -41,6 +41,7 @@ test "@depositBits" { test "@depositBits u128" { if (!supports_pext_pdep) return error.SkipZigTest; // TODO + if (builtin.zig_backend == .stage2_x86_64 and builtin.target.os.tag == .windows) return error.SkipZigTest; // TODO #19498 const S = struct { pub fn doTheTest() !void { @@ -108,6 +109,7 @@ test "@extractBits" { test "@extractBits u128" { if (!supports_pext_pdep) return error.SkipZigTest; // TODO + if (builtin.zig_backend == .stage2_x86_64 and builtin.target.os.tag == .windows) return error.SkipZigTest; // TODO #19498 const S = struct { pub fn doTheTest() !void {