diff --git a/doc/langref.html.in b/doc/langref.html.in index 6f3e9961c389..3728b9a71637 100644 --- a/doc/langref.html.in +++ b/doc/langref.html.in @@ -8558,6 +8558,33 @@ test "main" { {#see_also|@cVaArg|@cVaCopy|@cVaEnd#} {#header_close#} + {#header_open|@depositBits#} +
{#syntax#}@depositBits(source: T, mask: T) T{#endsyntax#}
+

+ {#syntax#}T{#endsyntax#} must be an unsigned integer type, or a {#syntax#}comptime_int{#endsyntax#} (for which both parameters must be positive). {#syntax#}T{#endsyntax#} is determined by peer-type resolution. +

+

+ Uses a mask to transfer contiguous lower bits of the {#syntax#}source{#endsyntax#} operand to the destination, depositing them at the bit positions that are set in the mask. All other bits in the destination are zeroed. +

+

+ Currently, only x86 processors with BMI2 enabled support this in hardware. On processors without support for the instruction, it will be emulated. AMD processors before Zen 3 implement the corresponding instruction (PDEP) in microcode. It may be faster to use an alternative method in both of these cases. +

+

+ Example: +

+ + {#code_begin|test|test_depositbits_builtin#} +const std = @import("std"); + +test "deposit bits" { + comptime { + try std.testing.expectEqual(@depositBits(0x00001234, 0xf0f0f0f0), 0x10203040); + } +} + {#code_end#} + {#see_also|@extractBits#} + {#header_close#} + {#header_open|@divExact#}
{#syntax#}@divExact(numerator: T, denominator: T) T{#endsyntax#}

@@ -8726,6 +8753,33 @@ export fn @"A function name that is a complete sentence."() void {} {#see_also|@export#} {#header_close#} + {#header_open|@extractBits#} +

{#syntax#}@extractBits(source: T, mask: T) T{#endsyntax#}
+

+ {#syntax#}T{#endsyntax#} must be an unsigned integer type, or a {#syntax#}comptime_int{#endsyntax#} (for which both parameters must be positive). {#syntax#}T{#endsyntax#} is determined by peer-type resolution. +

+

+ Uses a mask to extract the bits of the {#syntax#}source{#endsyntax#} operand at the bit positions that are set in the mask, packing them into the contiguous lower bits of the destination. The upper bits of the destination are zeroed. +

+

+ Currently, only x86 processors with BMI2 enabled support this in hardware. On processors without support for the instruction, it will be emulated. AMD processors before Zen 3 implement the corresponding instruction (PEXT) in microcode. It may be faster to use an alternative method in both of these cases. +

+

+ Example: +

+ + {#code_begin|test|test_extractbits_builtin#} +const std = @import("std"); + +test "extract bits" { + comptime { + try std.testing.expectEqual(@extractBits(0x12345678, 0xf0f0f0f0), 0x00001357); + } +} + {#code_end#} + {#see_also|@depositBits#} + {#header_close#} + {#header_open|@fence#}
{#syntax#}@fence(order: AtomicOrder) void{#endsyntax#}

diff --git a/lib/compiler_rt.zig b/lib/compiler_rt.zig index 173e6af85a5e..5e1cc86abebc 100644 --- a/lib/compiler_rt.zig +++ b/lib/compiler_rt.zig @@ -9,6 +9,7 @@ comptime { _ = @import("compiler_rt/popcount.zig"); _ = @import("compiler_rt/bswap.zig"); _ = @import("compiler_rt/cmp.zig"); + _ = @import("compiler_rt/pdeppext.zig"); _ = @import("compiler_rt/shift.zig"); _ = @import("compiler_rt/negXi2.zig"); diff --git a/lib/compiler_rt/pdeppext.zig b/lib/compiler_rt/pdeppext.zig new file mode 100644 index 000000000000..c9784f946b23 --- /dev/null +++ b/lib/compiler_rt/pdeppext.zig @@ -0,0 +1,177 @@ +const std = @import("std"); +const builtin = @import("builtin"); +const common = @import("common.zig"); + +const Limb = u32; +const Log2Limb = u5; + +comptime { + @export(__pdep_bigint, .{ .name = "__pdep_bigint", .linkage = common.linkage, .visibility = common.visibility }); + @export(__pdep_u32, .{ .name = "__pdep_u32", .linkage = common.linkage, .visibility = common.visibility }); + @export(__pdep_u64, .{ .name = "__pdep_u64", .linkage = common.linkage, .visibility = common.visibility }); + @export(__pdep_u128, .{ .name = "__pdep_u128", .linkage = common.linkage, .visibility = common.visibility }); + + @export(__pext_bigint, .{ .name = "__pext_bigint", .linkage = common.linkage, .visibility = common.visibility }); + @export(__pext_u32, .{ .name = "__pext_u32", .linkage = common.linkage, .visibility = common.visibility }); + @export(__pext_u64, .{ .name = "__pext_u64", .linkage = common.linkage, .visibility = common.visibility }); + @export(__pext_u128, .{ .name = "__pext_u128", .linkage = common.linkage, .visibility = common.visibility }); +} + +const endian = builtin.cpu.arch.endian(); + +inline fn limb(x: []const Limb, i: usize) Limb { + return if (endian == .little) x[i] else x[x.len - 1 - i]; +} + +inline fn limb_ptr(x: []Limb, i: usize) *Limb { + return if (endian == .little) &x[i] else &x[x.len - 1 - i]; +} + +inline fn limb_set(x: []Limb, i: usize, v: Limb) void 
{ + if (endian == .little) { + x[i] = v; + } else { + x[x.len - 1 - i] = v; + } +} + +// Code for bigint pdep and pext largely taken from std.math.big.int.depositBits and extractBits + +inline fn pdep_bigint(result: []Limb, source: []const Limb, mask: []const Limb) void { + @memset(result, 0); + + var mask_limb: Limb = limb(mask, 0); + var mask_limb_index: usize = 0; + var i: usize = 0; + + outer: while (true) : (i += 1) { + // Find the lowest set bit in mask + const mask_limb_bit: Log2Limb = limb_bit: while (true) { + const mask_limb_tz = @ctz(mask_limb); + if (mask_limb_tz != @bitSizeOf(Limb)) { + const cast_limb_bit: Log2Limb = @intCast(mask_limb_tz); + mask_limb ^= @as(Limb, 1) << cast_limb_bit; + break :limb_bit cast_limb_bit; + } + + mask_limb_index += 1; + if (mask_limb_index >= mask.len) break :outer; + + mask_limb = limb(mask, mask_limb_index); + }; + + const i_limb_index = i / 32; + const i_limb_bit: Log2Limb = @truncate(i); + + if (i_limb_index >= source.len) break; + + const source_bit_set = limb(source, i_limb_index) & (@as(Limb, 1) << i_limb_bit) != 0; + + limb_ptr(result, mask_limb_index).* |= @as(Limb, @intFromBool(source_bit_set)) << mask_limb_bit; + } +} + +pub fn __pdep_bigint(r: [*]Limb, s: [*]const Limb, m: [*]const Limb, bits: usize) callconv(.C) void { + const result = r[0 .. std.math.divCeil(usize, bits, 32) catch unreachable]; + const source = s[0 .. std.math.divCeil(usize, bits, 32) catch unreachable]; + const mask = m[0 .. 
std.math.divCeil(usize, bits, 32) catch unreachable]; + + pdep_bigint(result, source, mask); +} + +inline fn pext_bigint(result: []Limb, source: []const Limb, mask: []const Limb) void { + @memset(result, 0); + + var mask_limb: Limb = limb(mask, 0); + var mask_limb_index: usize = 0; + var i: usize = 0; + + outer: while (true) : (i += 1) { + const mask_limb_bit: Log2Limb = limb_bit: while (true) { + const mask_limb_tz = @ctz(mask_limb); + if (mask_limb_tz != @bitSizeOf(Limb)) { + const cast_limb_bit: Log2Limb = @intCast(mask_limb_tz); + mask_limb ^= @as(Limb, 1) << cast_limb_bit; + break :limb_bit cast_limb_bit; + } + + mask_limb_index += 1; + if (mask_limb_index >= mask.len) break :outer; + + mask_limb = limb(mask, mask_limb_index); + }; + + const i_limb_index = i / 32; + const i_limb_bit: Log2Limb = @truncate(i); + + if (mask_limb_index >= source.len) break; + + const source_bit_set = limb(source, mask_limb_index) & (@as(Limb, 1) << mask_limb_bit) != 0; + + limb_ptr(result, i_limb_index).* |= @as(Limb, @intFromBool(source_bit_set)) << i_limb_bit; + } +} + +pub fn __pext_bigint(r: [*]Limb, s: [*]const Limb, m: [*]const Limb, bits: usize) callconv(.C) void { + const result = r[0 .. std.math.divCeil(usize, bits, 32) catch unreachable]; + const source = s[0 .. std.math.divCeil(usize, bits, 32) catch unreachable]; + const mask = m[0 .. 
std.math.divCeil(usize, bits, 32) catch unreachable]; + + pext_bigint(result, source, mask); +} + +inline fn pdep_uX(comptime T: type, source: T, mask_: T) T { + var bb: T = 1; + var result: T = 0; + var mask = mask_; + + while (mask != 0) { + const bit = mask & ~(mask - 1); + mask &= ~bit; + const source_bit = source & bb; + if (source_bit != 0) result |= bit; + bb += bb; + } + + return result; +} + +pub fn __pdep_u32(source: u32, mask: u32) callconv(.C) u32 { + return pdep_uX(u32, source, mask); +} + +pub fn __pdep_u64(source: u64, mask: u64) callconv(.C) u64 { + return pdep_uX(u64, source, mask); +} + +pub fn __pdep_u128(source: u128, mask: u128) callconv(.C) u128 { + return pdep_uX(u128, source, mask); +} + +inline fn pext_uX(comptime T: type, source: T, mask_: T) T { + var bb: T = 1; + var result: T = 0; + var mask = mask_; + + while (mask != 0) { + const bit = mask & ~(mask - 1); + mask &= ~bit; + const source_bit = source & bit; + if (source_bit != 0) result |= bb; + bb += bb; + } + + return result; +} + +pub fn __pext_u32(source: u32, mask: u32) callconv(.C) u32 { + return pext_uX(u32, source, mask); +} + +pub fn __pext_u64(source: u64, mask: u64) callconv(.C) u64 { + return pext_uX(u64, source, mask); +} + +pub fn __pext_u128(source: u128, mask: u128) callconv(.C) u128 { + return pext_uX(u128, source, mask); +} diff --git a/lib/std/math/big/int.zig b/lib/std/math/big/int.zig index 93ad1ccbe26a..357d2c93e785 100644 --- a/lib/std/math/big/int.zig +++ b/lib/std/math/big/int.zig @@ -1735,6 +1735,98 @@ pub const Mutable = struct { y.shiftRight(y.toConst(), norm_shift); } + // TODO this function is quite inefficient and could be optimised + /// r = @depositBits(source, mask) + /// + /// Asserts that `source` and `mask` are positive + pub fn depositBits(r: *Mutable, source: Const, mask: Const) void { + assert(source.positive); + assert(mask.positive); + + r.positive = true; + @memset(r.limbs, 0); + + var mask_limb: Limb = mask.limbs[0]; + var mask_limb_index: 
Limb = 0; + var i: usize = 0; + outer: while (true) : (i += 1) { + // Find next bit in mask + const mask_limb_bit: Log2Limb = limb_bit: while (true) { + const mask_limb_tz = @ctz(mask_limb); + if (mask_limb_tz != @sizeOf(Limb) * 8) { + const cast_limb_bit: Log2Limb = @intCast(mask_limb_tz); + mask_limb ^= @as(Limb, 1) << cast_limb_bit; + break :limb_bit cast_limb_bit; + } + + mask_limb_index += 1; + // No more limbs, we've finished iterating the mask + if (mask_limb_index >= mask.limbs.len) { + break :outer; + } + + mask_limb = mask.limbs[mask_limb_index]; + }; + + const i_limb_index = i / limb_bits; + const i_limb_bit: Log2Limb = @truncate(i); + + if (i_limb_index >= source.limbs.len) break; // Stop when we reach the end of `source` (we can treat the rest as zeroes) + + const source_bit_set = source.limbs[i_limb_index] & (@as(Limb, 1) << i_limb_bit) != 0; + + r.limbs[mask_limb_index] |= @as(Limb, @intFromBool(source_bit_set)) << mask_limb_bit; + } + + r.normalize(r.limbs.len); + } + + // TODO this function is quite inefficient and could be optimised + /// r = @extractBits(source, mask) + /// + /// Asserts that `source` and `mask` are positive + pub fn extractBits(r: *Mutable, source: Const, mask: Const) void { + assert(source.positive); + assert(mask.positive); + + r.positive = true; + @memset(r.limbs, 0); + + var mask_limb: Limb = mask.limbs[0]; + var mask_limb_index: Limb = 0; + var i: usize = 0; + outer: while (true) : (i += 1) { + // Find next bit in mask + const mask_limb_bit: Log2Limb = limb_bit: while (true) { + const mask_limb_tz = @ctz(mask_limb); + if (mask_limb_tz != @sizeOf(Limb) * 8) { + const cast_limb_bit: Log2Limb = @intCast(mask_limb_tz); + mask_limb ^= @as(Limb, 1) << cast_limb_bit; + break :limb_bit cast_limb_bit; + } + + mask_limb_index += 1; + // No more limbs, we've finished iterating the mask + if (mask_limb_index >= mask.limbs.len) { + break :outer; + } + + mask_limb = mask.limbs[mask_limb_index]; + }; + + const i_limb_index = i / 
limb_bits; + const i_limb_bit: Log2Limb = @truncate(i); + + if (mask_limb_index >= source.limbs.len) break; // Stop when we reach the end of `source` (we can treat the rest as zeroes) + + const source_bit_set = source.limbs[mask_limb_index] & (@as(Limb, 1) << mask_limb_bit) != 0; + + r.limbs[i_limb_index] |= @as(Limb, @intFromBool(source_bit_set)) << i_limb_bit; + } + + r.normalize(r.limbs.len); + } + /// If a is positive, this passes through to truncate. /// If a is negative, then r is set to positive with the bit pattern ~(a - 1). /// r may alias a. diff --git a/lib/std/math/big/int_test.zig b/lib/std/math/big/int_test.zig index 624bdc0b83af..c9dabaa31e30 100644 --- a/lib/std/math/big/int_test.zig +++ b/lib/std/math/big/int_test.zig @@ -2800,6 +2800,54 @@ fn popCountTest(val: *const Managed, bit_count: usize, expected: usize) !void { try testing.expectEqual(expected, val.toConst().popCount(bit_count)); } +test "big int extractBits" { + try extractBitsTest(0x12345678, 0x0, 0x0); + try extractBitsTest(0x12345678, 0xf0f0f0f0, 0x1357); + try extractBitsTest(0x12345678, 0xff00ff00, 0x1256); + try extractBitsTest(0x12345678, 0xffff, 0x5678); + + try extractBitsTest(0x12345678_90123456_78901234_56789012, 0xff << 64, 0x56); + try extractBitsTest(0x12345678_90123456_78901234_56789012, (0xff << 64) | 0xff00f, 0x56892); +} + +fn extractBitsTest(comptime source: comptime_int, comptime mask: comptime_int, comptime expected: comptime_int) !void { + var source_bigint = try Managed.initSet(testing.allocator, source); + defer source_bigint.deinit(); + var mask_bigint = try Managed.initSet(testing.allocator, mask); + defer mask_bigint.deinit(); + const limbs = try testing.allocator.alloc(Limb, mask_bigint.limbs.len); + defer testing.allocator.free(limbs); + var result = Mutable{ .limbs = limbs, .positive = undefined, .len = undefined }; + + result.extractBits(source_bigint.toConst(), mask_bigint.toConst()); + + try testing.expectEqual(std.math.Order.eq, 
result.toConst().orderAgainstScalar(expected)); +} + +test "big int depositBits" { + try depositBitsTest(0x12345678, 0x0, 0x0); + try depositBitsTest(0x12345678, 0xf0f0f0f0, 0x50607080); + try depositBitsTest(0x12345678, 0xff00ff00, 0x56007800); + try depositBitsTest(0x12345678, 0xffff, 0x5678); + + try depositBitsTest(0x1234, 0xff << 64, 0x34_00000000_00000000); + try depositBitsTest(0x12345678, (0xff << 64) | 0xff00f, 0x45_00000000_00067008); +} + +fn depositBitsTest(comptime source: comptime_int, comptime mask: comptime_int, comptime expected: comptime_int) !void { + var source_bigint = try Managed.initSet(testing.allocator, source); + defer source_bigint.deinit(); + var mask_bigint = try Managed.initSet(testing.allocator, mask); + defer mask_bigint.deinit(); + const limbs = try testing.allocator.alloc(Limb, mask_bigint.limbs.len); + defer testing.allocator.free(limbs); + var result = Mutable{ .limbs = limbs, .positive = undefined, .len = undefined }; + + result.depositBits(source_bigint.toConst(), mask_bigint.toConst()); + + try testing.expectEqual(std.math.Order.eq, result.toConst().orderAgainstScalar(expected)); +} + test "big int conversion read/write twos complement" { var a = try Managed.initSet(testing.allocator, (1 << 493) - 1); defer a.deinit(); diff --git a/lib/std/zig/AstGen.zig b/lib/std/zig/AstGen.zig index a52007eabf24..e483a9e890e6 100644 --- a/lib/std/zig/AstGen.zig +++ b/lib/std/zig/AstGen.zig @@ -9691,6 +9691,9 @@ fn builtinCall( }); return rvalue(gz, ri, result, node); }, + + .deposit_bits => return depositExtractBits(gz, scope, ri, node, params, .deposit_bits), + .extract_bits => return depositExtractBits(gz, scope, ri, node, params, .extract_bits), } } @@ -9958,6 +9961,24 @@ fn overflowArithmetic( return rvalue(gz, ri, result, node); } +fn depositExtractBits( + gz: *GenZir, + scope: *Scope, + ri: ResultInfo, + node: Ast.Node.Index, + params: []const Ast.Node.Index, + tag: Zir.Inst.Extended, +) InnerError!Zir.Inst.Ref { + const lhs = try 
expr(gz, scope, .{ .rl = .none }, params[0]); + const rhs = try expr(gz, scope, .{ .rl = .none }, params[1]); + const result = try gz.addExtendedPayload(tag, Zir.Inst.BinNode{ + .node = gz.nodeIndexToRelative(node), + .lhs = lhs, + .rhs = rhs, + }); + return rvalue(gz, ri, result, node); +} + fn callExpr( gz: *GenZir, scope: *Scope, diff --git a/lib/std/zig/AstRlAnnotate.zig b/lib/std/zig/AstRlAnnotate.zig index 4a1203ca09fc..edf221caa103 100644 --- a/lib/std/zig/AstRlAnnotate.zig +++ b/lib/std/zig/AstRlAnnotate.zig @@ -1100,5 +1100,10 @@ fn builtinCall(astrl: *AstRlAnnotate, block: ?*Block, ri: ResultInfo, node: Ast. _ = try astrl.expr(args[4], block, ResultInfo.type_only); return false; }, + .deposit_bits, .extract_bits => { + _ = try astrl.expr(args[0], block, ResultInfo.none); + _ = try astrl.expr(args[1], block, ResultInfo.none); + return false; + }, } } diff --git a/lib/std/zig/BuiltinFn.zig b/lib/std/zig/BuiltinFn.zig index 11d6a17303c8..37f648893da2 100644 --- a/lib/std/zig/BuiltinFn.zig +++ b/lib/std/zig/BuiltinFn.zig @@ -35,6 +35,7 @@ pub const Tag = enum { c_va_copy, c_va_end, c_va_start, + deposit_bits, div_exact, div_floor, div_trunc, @@ -46,6 +47,7 @@ pub const Tag = enum { error_cast, @"export", @"extern", + extract_bits, fence, field, field_parent_ptr, @@ -405,6 +407,12 @@ pub const list = list: { .illegal_outside_function = true, }, }, + .{ + "@depositBits", .{ + .tag = .deposit_bits, + .param_count = 2, + }, + }, .{ "@divExact", .{ @@ -483,6 +491,13 @@ pub const list = list: { .param_count = 2, }, }, + .{ + "@extractBits", + .{ + .tag = .extract_bits, + .param_count = 2, + }, + }, .{ "@fence", .{ diff --git a/lib/std/zig/Zir.zig b/lib/std/zig/Zir.zig index 64e8a1c8050f..b9f3e10dd007 100644 --- a/lib/std/zig/Zir.zig +++ b/lib/std/zig/Zir.zig @@ -2060,6 +2060,12 @@ pub const Inst = struct { /// Guaranteed to not have the `ptr_cast` flag. /// Uses the `pl_node` union field with payload `FieldParentPtr`. 
field_parent_ptr, + /// Implements the `@depositBits` builtin. + /// `operand` is payload index to `BinNode`. + deposit_bits, + /// Implements the `@extractBits` builtin. + /// `operand` is payload index to `BinNode`. + extract_bits, pub const InstData = struct { opcode: Extended, diff --git a/src/Air.zig b/src/Air.zig index 9554c55561a5..f607f3c29447 100644 --- a/src/Air.zig +++ b/src/Air.zig @@ -848,6 +848,13 @@ pub const Inst = struct { /// Operand is unused and set to Ref.none work_group_id, + /// Implements @depositBits builtin. + /// Uses the `bin_op` field. + deposit_bits, + /// Implements @extractBits builtin. + /// Uses the `bin_op` field. + extract_bits, + pub fn fromCmpOp(op: std.math.CompareOperator, optimized: bool) Tag { switch (op) { .lt => return if (optimized) .cmp_lt_optimized else .cmp_lt, @@ -1318,6 +1325,8 @@ pub fn typeOfIndex(air: *const Air, inst: Air.Inst.Index, ip: *const InternPool) .div_exact_optimized, .rem_optimized, .mod_optimized, + .deposit_bits, + .extract_bits, => return air.typeOf(datas[@intFromEnum(inst)].bin_op.lhs, ip), .sqrt, @@ -1790,6 +1799,8 @@ pub fn mustLower(air: Air, inst: Air.Inst.Index, ip: *const InternPool) bool { .work_item_id, .work_group_size, .work_group_id, + .deposit_bits, + .extract_bits, => false, .assembly => { diff --git a/src/Liveness.zig b/src/Liveness.zig index 4ca28758e222..dd9f4fe24776 100644 --- a/src/Liveness.zig +++ b/src/Liveness.zig @@ -286,6 +286,8 @@ pub fn categorizeOperand( .cmp_gte_optimized, .cmp_gt_optimized, .cmp_neq_optimized, + .deposit_bits, + .extract_bits, => { const o = air_datas[@intFromEnum(inst)].bin_op; if (o.lhs == operand_ref) return matchOperandSmallIndex(l, inst, 0, .none); @@ -955,6 +957,8 @@ fn analyzeInst( .memset, .memset_safe, .memcpy, + .deposit_bits, + .extract_bits, => { const o = inst_datas[@intFromEnum(inst)].bin_op; return analyzeOperands(a, pass, data, inst, .{ o.lhs, o.rhs, .none }); diff --git a/src/Liveness/Verify.zig b/src/Liveness/Verify.zig index 
4392f25e101d..f43b498f3a46 100644 --- a/src/Liveness/Verify.zig +++ b/src/Liveness/Verify.zig @@ -257,6 +257,8 @@ fn verifyBody(self: *Verify, body: []const Air.Inst.Index) Error!void { .memset, .memset_safe, .memcpy, + .deposit_bits, + .extract_bits, => { const bin_op = data[@intFromEnum(inst)].bin_op; try self.verifyInstOperands(inst, .{ bin_op.lhs, bin_op.rhs, .none }); diff --git a/src/Sema.zig b/src/Sema.zig index d3989f630cb5..9a39773f0709 100644 --- a/src/Sema.zig +++ b/src/Sema.zig @@ -1260,6 +1260,8 @@ fn analyzeBodyInner( .work_group_id => try sema.zirWorkItem( block, extended, extended.opcode), .in_comptime => try sema.zirInComptime( block), .closure_get => try sema.zirClosureGet( block, extended), + .deposit_bits => try sema.zirDepositExtractBits(block, extended, .deposit_bits), + .extract_bits => try sema.zirDepositExtractBits(block, extended, .extract_bits), // zig fmt: on .fence => { @@ -26390,6 +26392,130 @@ fn zirInComptime( return if (block.is_comptime) .bool_true else .bool_false; } +fn zirDepositExtractBits( + sema: *Sema, + block: *Block, + extended: Zir.Inst.Extended.InstData, + air_tag: Air.Inst.Tag, +) CompileError!Air.Inst.Ref { + const mod = sema.mod; + + const target = sema.mod.getTarget(); + _ = target; + const extra = sema.code.extraData(Zir.Inst.BinNode, extended.operand).data; + const src = LazySrcLoc.nodeOffset(extra.node); + + const lhs_src: LazySrcLoc = .{ .node_offset_builtin_call_arg0 = extra.node }; + const rhs_src: LazySrcLoc = .{ .node_offset_builtin_call_arg1 = extra.node }; + + const uncasted_lhs = try sema.resolveInst(extra.lhs); + const uncasted_rhs = try sema.resolveInst(extra.rhs); + + const lhs_ty = sema.typeOf(uncasted_lhs); + const rhs_ty = sema.typeOf(uncasted_rhs); + + if (!lhs_ty.isUnsignedInt(mod) and lhs_ty.zigTypeTag(mod) != .ComptimeInt) { + return sema.fail(block, lhs_src, "expected unsigned integer or 'comptime_int', found '{}'", .{lhs_ty.fmt(sema.mod)}); + } + + if (!rhs_ty.isUnsignedInt(mod) and 
rhs_ty.zigTypeTag(mod) != .ComptimeInt) { + return sema.fail(block, rhs_src, "expected unsigned integer or 'comptime_int', found '{}'", .{rhs_ty.fmt(sema.mod)}); + } + + const instructions = &[_]Air.Inst.Ref{ uncasted_lhs, uncasted_rhs }; + const dest_ty = try sema.resolvePeerTypes(block, src, instructions, .{ + .override = &[_]?LazySrcLoc{ lhs_src, rhs_src }, + }); + + const builtin_name = switch (air_tag) { + .deposit_bits => "@depositBits", + .extract_bits => "@extractBits", + else => unreachable, + }; + + // Coercion errors are intercepted to add a note if the caller is attempting to pass a negative comptime_int + const lhs = sema.coerce(block, dest_ty, uncasted_lhs, lhs_src) catch |err| switch (err) { + error.AnalysisFail => { + const msg = sema.err orelse return err; + const val = (try sema.resolveValue(uncasted_lhs)).?; + if (val.orderAgainstZero(mod) == .lt) { + try sema.errNote(block, src, msg, "parameters to {s} must be positive", .{builtin_name}); + } + return err; + }, + else => return err, + }; + + const rhs = sema.coerce(block, dest_ty, uncasted_rhs, rhs_src) catch |err| switch (err) { + error.AnalysisFail => { + const msg = sema.err orelse return err; + const val = (try sema.resolveValue(uncasted_rhs)).?; + if (val.orderAgainstZero(mod) == .lt) { + try sema.errNote(block, src, msg, "parameters to {s} must be positive", .{builtin_name}); + } + return err; + }, + else => return err, + }; + + const maybe_lhs_val = try sema.resolveValue(lhs); + const maybe_rhs_val = try sema.resolveValue(rhs); + + // We check for negative values here only if the type is a comptime_int, as negative values + // would have otherwise been filtered out by coercion and the unsigned type restriction + if (dest_ty.zigTypeTag(mod) == .ComptimeInt) { + if (maybe_lhs_val) |lhs_val| { + if (!lhs_val.isUndef(mod) and lhs_val.orderAgainstZero(mod) == .lt) { + const err = try sema.errMsg(block, lhs_src, "use of negative value '{}'", .{lhs_val.fmtValue(sema.mod)}); + try 
sema.errNote(block, src, err, "parameters to {s} must be positive", .{builtin_name}); + return sema.failWithOwnedErrorMsg(block, err); + } + } + + if (maybe_rhs_val) |rhs_val| { + if (!rhs_val.isUndef(mod) and rhs_val.orderAgainstZero(mod) == .lt) { + const err = try sema.errMsg(block, rhs_src, "use of negative value '{}'", .{rhs_val.fmtValue(sema.mod)}); + try sema.errNote(block, src, err, "parameters to {s} must be positive", .{builtin_name}); + return sema.failWithOwnedErrorMsg(block, err); + } + } + } + + // If either of the operands are zero, the result is zero + // If either of the operands are undefined, the result is undefined + if (maybe_lhs_val) |lhs_val| { + if (lhs_val.orderAgainstZero(mod) == .eq) return Air.internedToRef((try mod.intValue(dest_ty, 0)).toIntern()); + if (lhs_val.isUndef(mod)) return try mod.undefRef(dest_ty); + } + if (maybe_rhs_val) |rhs_val| { + if (rhs_val.orderAgainstZero(mod) == .eq) return Air.internedToRef((try mod.intValue(dest_ty, 0)).toIntern()); + if (rhs_val.isUndef(mod)) return mod.undefRef(dest_ty); + } + + if (maybe_lhs_val) |lhs_val| { + if (maybe_rhs_val) |rhs_val| { + const dest_val = switch (air_tag) { + .deposit_bits => try sema.intDepositBits(lhs_val, rhs_val, dest_ty), + .extract_bits => try sema.intExtractBits(lhs_val, rhs_val, dest_ty), + else => unreachable, + }; + + return Air.internedToRef(dest_val.toIntern()); + } + } + + const runtime_src = if (maybe_lhs_val == null) lhs_src else rhs_src; + try sema.requireRuntimeBlock(block, src, runtime_src); + + return block.addInst(.{ + .tag = air_tag, + .data = .{ .bin_op = .{ + .lhs = lhs, + .rhs = rhs, + } }, + }); +} + fn requireRuntimeBlock(sema: *Sema, block: *Block, src: LazySrcLoc, runtime_src: ?LazySrcLoc) !void { if (block.is_comptime) { const msg = msg: { @@ -38992,6 +39118,62 @@ fn intAddWithOverflowScalar( }; } +/// Asserts that the values are positive +fn intDepositBits( + sema: *Sema, + lhs: Value, + rhs: Value, + ty: Type, +) !Value { + // TODO is this a 
performance issue? maybe we should try the operation without + // resorting to BigInt first. For non-bigints, @intDeposit could be used? + const mod = sema.mod; + const arena = sema.arena; + + var lhs_space: Value.BigIntSpace = undefined; + var rhs_space: Value.BigIntSpace = undefined; + const source = lhs.toBigInt(&lhs_space, mod); + const mask = rhs.toBigInt(&rhs_space, mod); + + const result_limbs = try arena.alloc( + std.math.big.Limb, + mask.limbs.len, + ); + + var result = std.math.big.int.Mutable{ .limbs = result_limbs, .positive = undefined, .len = undefined }; + + result.depositBits(source, mask); + return try mod.intValue_big(ty, result.toConst()); +} + +/// Asserts that the values are positive +fn intExtractBits( + sema: *Sema, + lhs: Value, + rhs: Value, + ty: Type, +) !Value { + // TODO is this a performance issue? maybe we should try the operation without + // resorting to BigInt first. For non-bigints, @intExtract could be used? + const mod = sema.mod; + const arena = sema.arena; + + var lhs_space: Value.BigIntSpace = undefined; + var rhs_space: Value.BigIntSpace = undefined; + const source = lhs.toBigInt(&lhs_space, mod); + const mask = rhs.toBigInt(&rhs_space, mod); + + const result_limbs = try arena.alloc( + std.math.big.Limb, + mask.limbs.len, + ); + + var result = std.math.big.int.Mutable{ .limbs = result_limbs, .positive = undefined, .len = undefined }; + + result.extractBits(source, mask); + return try mod.intValue_big(ty, result.toConst()); +} + /// Asserts the values are comparable. Both operands have type `ty`. /// For vectors, returns true if the comparison is true for ALL elements. 
/// diff --git a/src/arch/aarch64/CodeGen.zig b/src/arch/aarch64/CodeGen.zig index ddde72345efe..5a327f1a0ae6 100644 --- a/src/arch/aarch64/CodeGen.zig +++ b/src/arch/aarch64/CodeGen.zig @@ -899,6 +899,9 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void { .work_item_id => unreachable, .work_group_size => unreachable, .work_group_id => unreachable, + + .deposit_bits => return self.fail("TODO implement deposit_bits", .{}), + .extract_bits => return self.fail("TODO implement extract_bits", .{}), // zig fmt: on } diff --git a/src/arch/arm/CodeGen.zig b/src/arch/arm/CodeGen.zig index 86d4e8f7fdd6..d55c69d48a7a 100644 --- a/src/arch/arm/CodeGen.zig +++ b/src/arch/arm/CodeGen.zig @@ -885,6 +885,9 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void { .work_item_id => unreachable, .work_group_size => unreachable, .work_group_id => unreachable, + + .deposit_bits => return self.fail("TODO implement deposit_bits", .{}), + .extract_bits => return self.fail("TODO implement extract_bits", .{}), // zig fmt: on } diff --git a/src/arch/riscv64/CodeGen.zig b/src/arch/riscv64/CodeGen.zig index 5abe3afcfd2a..d45904d7e98c 100644 --- a/src/arch/riscv64/CodeGen.zig +++ b/src/arch/riscv64/CodeGen.zig @@ -713,6 +713,9 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void { .work_item_id => unreachable, .work_group_size => unreachable, .work_group_id => unreachable, + + .deposit_bits => return self.fail("TODO implement deposit_bits", .{}), + .extract_bits => return self.fail("TODO implement extract_bits", .{}), // zig fmt: on } if (std.debug.runtime_safety) { diff --git a/src/arch/sparc64/CodeGen.zig b/src/arch/sparc64/CodeGen.zig index 19c18ec4a6b0..f2fbb813affd 100644 --- a/src/arch/sparc64/CodeGen.zig +++ b/src/arch/sparc64/CodeGen.zig @@ -732,6 +732,9 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void { .work_item_id => unreachable, .work_group_size => unreachable, .work_group_id => unreachable, + + 
.deposit_bits => @panic("TODO implement deposit_bits"), + .extract_bits => @panic("TODO implement extract_bits"), // zig fmt: on } diff --git a/src/arch/wasm/CodeGen.zig b/src/arch/wasm/CodeGen.zig index 83159ec80e7d..fcf8bd362a0d 100644 --- a/src/arch/wasm/CodeGen.zig +++ b/src/arch/wasm/CodeGen.zig @@ -2058,6 +2058,10 @@ fn genInst(func: *CodeGen, inst: Air.Inst.Index) InnerError!void { .work_group_size, .work_group_id, => unreachable, + + .deposit_bits, + .extract_bits, + => |tag| return func.fail("TODO implement {s}", .{@tagName(tag)}), }; } diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index c165baf7e885..2c217f3f4c33 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig @@ -2195,6 +2195,10 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void { .work_item_id => unreachable, .work_group_size => unreachable, .work_group_id => unreachable, + + .deposit_bits, + .extract_bits, + => |tag| try self.airDepositExtractBits(inst, tag), // zig fmt: on } @@ -5569,6 +5573,168 @@ fn airPtrSlicePtrPtr(self: *Self, inst: Air.Inst.Index) !void { return self.finishAir(inst, dst_mcv, .{ ty_op.operand, .none, .none }); } +fn airDepositExtractBits(self: *Self, inst: Air.Inst.Index, tag: Air.Inst.Tag) !void { + const mod = self.bin_file.comp.module.?; + + const bin_op = self.air.instructions.items(.data)[@intFromEnum(inst)].bin_op; + + const dest_ty = self.typeOfIndex(inst); + const abi_size: u32 = @intCast(@max(dest_ty.abiSize(mod), 4)); + + const result = if (!self.hasFeature(.bmi2) or abi_size > 8) + try genDepositExtractBitsEmulated(self, inst, tag, bin_op.lhs, bin_op.rhs, dest_ty, abi_size) + else + try genDepositExtractBitsNative(self, inst, tag, bin_op.lhs, bin_op.rhs, dest_ty, abi_size); + + return self.finishAir(inst, result, .{ bin_op.lhs, bin_op.rhs, .none }); +} + +fn genDepositExtractBitsEmulated( + self: *Self, + inst: Air.Inst.Index, + tag: Air.Inst.Tag, + lhs: Air.Inst.Ref, + rhs: Air.Inst.Ref, + 
dest_ty: Type, + abi_size: u32, +) !MCValue { + const mod = self.bin_file.comp.module.?; + + var callee_buf: ["__pdep_bigint".len]u8 = undefined; + const callee = std.fmt.bufPrint(&callee_buf, "__{s}_{s}", .{ + switch (tag) { + .deposit_bits => "pdep", + .extract_bits => "pext", + else => unreachable, + }, + switch (abi_size) { + 0...4 => "u32", + 5...8 => "u64", + 9...16 => "u128", + else => "bigint", + }, + }) catch unreachable; + + if (abi_size <= 16) return try self.genCall(.{ .lib = .{ + .return_type = dest_ty.toIntern(), + .param_types = &.{ dest_ty.toIntern(), dest_ty.toIntern() }, + .callee = callee, + } }, &.{ dest_ty, dest_ty }, &.{ .{ .air_ref = lhs }, .{ .air_ref = rhs } }); + + const bit_count = dest_ty.intInfo(mod).bits; + + const dest_mcv = try self.allocRegOrMemAdvanced(dest_ty, inst, false); + const lhs_mcv = try self.resolveInst(lhs); + const rhs_mcv = try self.resolveInst(rhs); + + const manyptr_u32_ty = try mod.ptrType(.{ + .child = .u32_type, + .flags = .{ + .size = .Many, + }, + }); + const manyptr_const_u32_ty = try mod.ptrType(.{ + .child = .u32_type, + .flags = .{ + .size = .Many, + .is_const = true, + }, + }); + + _ = try self.genCall(.{ .lib = .{ + .return_type = .void_type, + .param_types = &.{ + manyptr_u32_ty.toIntern(), + manyptr_const_u32_ty.toIntern(), + manyptr_const_u32_ty.toIntern(), + .usize_type, + }, + .callee = callee, + } }, &.{ + manyptr_u32_ty, + manyptr_const_u32_ty, + manyptr_const_u32_ty, + Type.usize, + }, &.{ + dest_mcv.address(), + lhs_mcv.address(), + rhs_mcv.address(), + .{ .immediate = bit_count }, + }); + + return dest_mcv; +} + +fn genDepositExtractBitsNative( + self: *Self, + inst: Air.Inst.Index, + tag: Air.Inst.Tag, + lhs: Air.Inst.Ref, + rhs: Air.Inst.Ref, + dest_ty: Type, + abi_size: u32, +) !MCValue { + assert(self.hasFeature(.bmi2)); // BMI2 must be present for PEXT/PDEP instructions + assert(abi_size <= 8); // PEXT/PDEP only exist for 64-bit and below + + const lhs_mcv = try self.resolveInst(lhs); + 
const rhs_mcv = try self.resolveInst(rhs); + + const lhs_lock: ?RegisterLock = switch (lhs_mcv) { + .register => |reg| self.register_manager.lockRegAssumeUnused(reg), + else => null, + }; + defer if (lhs_lock) |lock| self.register_manager.unlockReg(lock); + + const rhs_lock: ?RegisterLock = switch (rhs_mcv) { + .register => |reg| self.register_manager.lockReg(reg), + else => null, + }; + defer if (rhs_lock) |lock| self.register_manager.unlockReg(lock); + + const dest_mcv: MCValue, const dest_is_lhs = dest: { + if (rhs_mcv.isRegister() and self.reuseOperand(inst, rhs, 1, rhs_mcv)) + break :dest .{ rhs_mcv, false }; + + if (lhs_mcv.isRegister() and self.reuseOperand(inst, lhs, 0, lhs_mcv)) + break :dest .{ lhs_mcv, false }; + + break :dest .{ try self.copyToRegisterWithInstTracking(inst, dest_ty, lhs_mcv), true }; + }; + + const dest_reg = dest_mcv.getReg().?; + const dest_lock = self.register_manager.lockReg(dest_reg); + defer if (dest_lock) |lock| self.register_manager.unlockReg(lock); + + const lhs_reg = if (dest_is_lhs) dest_reg else if (lhs_mcv.getReg()) |reg| reg else try self.copyToTmpRegister(dest_ty, lhs_mcv); + + const mir_tag = Mir.Inst.FixedTag{ ._, switch (tag) { + .deposit_bits => .pdep, + .extract_bits => .pext, + else => unreachable, + } }; + + if (rhs_mcv.isMemory()) { + try self.asmRegisterRegisterMemory( + mir_tag, + registerAlias(dest_reg, abi_size), + registerAlias(lhs_reg, abi_size), + try rhs_mcv.mem(self, Memory.Size.fromSize(abi_size)), + ); + } else { + const rhs_reg = if (rhs_mcv.getReg()) |reg| reg else try self.copyToTmpRegister(dest_ty, rhs_mcv); + + try self.asmRegisterRegisterRegister( + mir_tag, + registerAlias(dest_reg, abi_size), + registerAlias(lhs_reg, abi_size), + registerAlias(rhs_reg, abi_size), + ); + } + + return dest_mcv; +} + fn elemOffset(self: *Self, index_ty: Type, index: MCValue, elem_size: u64) !Register { const reg: Register = blk: { switch (index) { diff --git a/src/arch/x86_64/Encoding.zig 
b/src/arch/x86_64/Encoding.zig index e4c2a39d18c4..43a7fbbbfb94 100644 --- a/src/arch/x86_64/Encoding.zig +++ b/src/arch/x86_64/Encoding.zig @@ -245,6 +245,7 @@ pub const Mnemonic = enum { neg, nop, not, @"or", pause, pop, popcnt, popfq, push, pushfq, + pdep, pext, rcl, rcr, ret, rol, ror, sal, sar, sbb, scas, scasb, scasd, scasq, scasw, @@ -782,6 +783,7 @@ pub const Feature = enum { avx, avx2, bmi, + bmi2, f16c, fma, lzcnt, diff --git a/src/arch/x86_64/Mir.zig b/src/arch/x86_64/Mir.zig index d2dd6237a5e6..9b9126e26249 100644 --- a/src/arch/x86_64/Mir.zig +++ b/src/arch/x86_64/Mir.zig @@ -384,6 +384,10 @@ pub const Inst = struct { @"or", /// Spin loop hint pause, + /// Parallel bits deposit + pdep, + /// Parallel bits extract + pext, /// Pop pop, /// Return the count of number of bits set to 1 diff --git a/src/arch/x86_64/encodings.zig b/src/arch/x86_64/encodings.zig index d4a7dcafe7bd..dd36cf1e68ea 100644 --- a/src/arch/x86_64/encodings.zig +++ b/src/arch/x86_64/encodings.zig @@ -486,6 +486,11 @@ pub const table = [_]Entry{ .{ .pause, .zo, &.{}, &.{ 0xf3, 0x90 }, 0, .none, .none }, + .{ .pdep, .rvm, &.{ .r32, .r32, .rm32 }, &.{ 0xf2, 0x0f, 0x38, 0xf5 }, 0, .vex_lz_w0, .bmi2 }, + .{ .pdep, .rvm, &.{ .r64, .r64, .rm64 }, &.{ 0xf2, 0x0f, 0x38, 0xf5 }, 0, .vex_lz_w1, .bmi2 }, + .{ .pext, .rvm, &.{ .r32, .r32, .rm32 }, &.{ 0xf3, 0x0f, 0x38, 0xf5 }, 0, .vex_lz_w0, .bmi2 }, + .{ .pext, .rvm, &.{ .r64, .r64, .rm64 }, &.{ 0xf3, 0x0f, 0x38, 0xf5 }, 0, .vex_lz_w1, .bmi2 }, + .{ .pop, .o, &.{ .r16 }, &.{ 0x58 }, 0, .short, .none }, .{ .pop, .o, &.{ .r64 }, &.{ 0x58 }, 0, .none, .none }, .{ .pop, .m, &.{ .rm16 }, &.{ 0x8f }, 0, .short, .none }, diff --git a/src/codegen/c.zig b/src/codegen/c.zig index 818267a8b819..80da6ff96482 100644 --- a/src/codegen/c.zig +++ b/src/codegen/c.zig @@ -3466,6 +3466,9 @@ fn genBodyInner(f: *Function, body: []const Air.Inst.Index) error{ AnalysisFail, .work_group_size, .work_group_id, => unreachable, + + .deposit_bits => return f.fail("TODO: C 
backend: implement deposit_bits", .{}), + .extract_bits => return f.fail("TODO: C backend: implement extract_bits", .{}), // zig fmt: on }; if (result_value == .new_local) { diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig index db0eaa3ce5e6..da9da70faf40 100644 --- a/src/codegen/llvm.zig +++ b/src/codegen/llvm.zig @@ -5103,6 +5103,9 @@ pub const FuncGen = struct { .work_item_id => try self.airWorkItemId(inst), .work_group_size => try self.airWorkGroupSize(inst), .work_group_id => try self.airWorkGroupId(inst), + + .deposit_bits, + .extract_bits => |tag| try self.airDepositExtractBits(inst, tag), // zig fmt: on }; if (val != .none) try self.func_inst_table.putNoClobber(self.gpa, inst.toRef(), val); @@ -10295,6 +10298,157 @@ pub const FuncGen = struct { return self.amdgcnWorkIntrinsic(dimension, 0, "amdgcn.workgroup.id"); } + fn airDepositExtractBits(self: *FuncGen, inst: Air.Inst.Index, tag: Air.Inst.Tag) !Builder.Value { + if (self.liveness.isUnused(inst)) return .none; + + const o = self.dg.object; + + const bin_op = self.air.instructions.items(.data)[@intFromEnum(inst)].bin_op; + const source = try self.resolveInst(bin_op.lhs); + const mask = try self.resolveInst(bin_op.rhs); + const inst_ty = self.typeOfIndex(inst); + + const target = o.module.getTarget(); + + const llvm_ty = try o.lowerType(inst_ty); + const bits: u16 = @intCast(llvm_ty.scalarBits(&o.builder)); + + switch (target.cpu.arch) { + .x86, .x86_64 => |arch| blk: { + // Doesn't have pdep + if (!std.Target.x86.featureSetHas(target.cpu.features, .bmi2)) break :blk; + + const supports_64 = arch == .x86_64; + // Integer size doesn't match the available instruction(s) + if (!(bits <= 32 or (bits <= 64 and supports_64))) break :blk; + + const compiler_rt_bits = compilerRtIntBits(bits); + + var buf: ["x86.bmi.pdep.32".len]u8 = undefined; + const intrinsic = std.meta.stringToEnum(Builder.Intrinsic, std.fmt.bufPrint(&buf, "x86.bmi.{s}.{d}", .{ + switch (tag) { + .deposit_bits => "pdep", + .extract_bits 
=> "pext", + else => unreachable, + }, + compiler_rt_bits, + }) catch unreachable).?; + + const needs_extend = bits != compiler_rt_bits; + const extended_ty = if (needs_extend) try o.builder.intType(compiler_rt_bits) else llvm_ty; + + const params = .{ + if (needs_extend) try self.wip.cast(.zext, source, extended_ty, "") else source, + if (needs_extend) try self.wip.cast(.zext, mask, extended_ty, "") else mask, + }; + + const result = try self.wip.callIntrinsic( + .normal, + .none, + intrinsic, + &.{}, + ¶ms, + "", + ); + + return if (needs_extend) try self.wip.cast(.trunc, result, llvm_ty, "") else result; + }, + else => {}, + } + + return try self.genDepositExtractBitsEmulated(tag, bits, source, mask, llvm_ty); + } + + fn genDepositExtractBitsEmulated(self: *FuncGen, tag: Air.Inst.Tag, bits: u16, source: Builder.Value, mask: Builder.Value, ty: Builder.Type) !Builder.Value { + const o = self.dg.object; + const mod = o.module; + + if (bits <= 128) { + const compiler_rt_bits = compilerRtIntBits(bits); + const needs_extend = bits != compiler_rt_bits; + const extended_ty = if (needs_extend) try o.builder.intType(compiler_rt_bits) else ty; + + const fn_name = try o.builder.strtabStringFmt("__{s}_u{d}", .{ + switch (tag) { + .deposit_bits => "pdep", + .extract_bits => "pext", + else => unreachable, + }, + compiler_rt_bits, + }); + + const params = .{ + if (needs_extend) try self.wip.cast(.zext, source, extended_ty, "") else source, + if (needs_extend) try self.wip.cast(.zext, mask, extended_ty, "") else mask, + }; + + const libc_fn = try self.getLibcFunction(fn_name, &.{ extended_ty, extended_ty }, extended_ty); + const result = try self.wip.call( + .normal, + .ccc, + .none, + libc_fn.typeOf(&o.builder), + libc_fn.toValue(&o.builder), + ¶ms, + "", + ); + + return if (needs_extend) try self.wip.cast(.trunc, result, ty, "") else result; + } + + // Rounded bits to the nearest 32, as limb size is 32. 
+ const extended_bits = (((bits - 1) / 32) + 1) * 32; + const needs_extend = bits != extended_bits; + const extended_ty = if (needs_extend) try o.builder.intType(extended_bits) else ty; + + const source_extended = if (needs_extend) try self.wip.cast(.zext, source, extended_ty, "") else source; + const mask_extended = if (needs_extend) try self.wip.cast(.zext, mask, extended_ty, "") else mask; + const zeroes_extended = try o.builder.intValue(extended_ty, 0); + + const alignment = Type.u32.abiAlignment(mod).toLlvm(); + + const source_pointer = try self.buildAlloca(extended_ty, alignment); + const mask_pointer = try self.buildAlloca(extended_ty, alignment); + const result_pointer = try self.buildAlloca(extended_ty, alignment); + + _ = try self.wip.store(.normal, source_extended, source_pointer, alignment); + _ = try self.wip.store(.normal, mask_extended, mask_pointer, alignment); + _ = try self.wip.store(.normal, zeroes_extended, result_pointer, alignment); + + const fn_name = try o.builder.strtabStringFmt("__{s}_bigint", .{switch (tag) { + .deposit_bits => "pdep", + .extract_bits => "pext", + else => unreachable, + }}); + + const pointer_ty = source_pointer.typeOfWip(&self.wip); + const usize_ty = try o.lowerType(Type.usize); + const void_ty = try o.lowerType(Type.void); + + const bits_value = try o.builder.intValue(usize_ty, bits); + + const params = .{ + result_pointer, + source_pointer, + mask_pointer, + bits_value, + }; + + const libc_fn = try self.getLibcFunction(fn_name, &.{ pointer_ty, pointer_ty, pointer_ty, usize_ty }, void_ty); + _ = try self.wip.call( + .normal, + .ccc, + .none, + libc_fn.typeOf(&o.builder), + libc_fn.toValue(&o.builder), + ¶ms, + "", + ); + + const result = try self.wip.load(.normal, extended_ty, result_pointer, alignment, ""); + return if (needs_extend) try self.wip.cast(.trunc, result, ty, "") else result; + } + fn getErrorNameTable(self: *FuncGen) Allocator.Error!Builder.Variable.Index { const o = self.dg.object; const mod = o.module; 
diff --git a/src/codegen/llvm/Builder.zig b/src/codegen/llvm/Builder.zig index 000223499b6f..30cb86e69432 100644 --- a/src/codegen/llvm/Builder.zig +++ b/src/codegen/llvm/Builder.zig @@ -2733,6 +2733,12 @@ pub const Intrinsic = enum { @"wasm.memory.size", @"wasm.memory.grow", + // x86 PDEP/PEXT + @"x86.bmi.pdep.32", + @"x86.bmi.pdep.64", + @"x86.bmi.pext.32", + @"x86.bmi.pext.64", + const Signature = struct { ret_len: u8, params: []const Parameter, @@ -3903,6 +3909,43 @@ pub const Intrinsic = enum { }, .attrs = &.{ .nocallback, .nofree, .nosync, .nounwind, .willreturn }, }, + + .@"x86.bmi.pext.32" = .{ + .ret_len = 1, + .params = &.{ + .{ .kind = .{ .type = .i32 } }, + .{ .kind = .{ .type = .i32 } }, + .{ .kind = .{ .type = .i32 } }, + }, + .attrs = &.{ .nocallback, .nofree, .nosync, .nounwind, .{ .memory = Attribute.Memory.all(.none) } }, + }, + .@"x86.bmi.pext.64" = .{ + .ret_len = 1, + .params = &.{ + .{ .kind = .{ .type = .i64 } }, + .{ .kind = .{ .type = .i64 } }, + .{ .kind = .{ .type = .i64 } }, + }, + .attrs = &.{ .nocallback, .nofree, .nosync, .nounwind, .{ .memory = Attribute.Memory.all(.none) } }, + }, + .@"x86.bmi.pdep.32" = .{ + .ret_len = 1, + .params = &.{ + .{ .kind = .{ .type = .i32 } }, + .{ .kind = .{ .type = .i32 } }, + .{ .kind = .{ .type = .i32 } }, + }, + .attrs = &.{ .nocallback, .nofree, .nosync, .nounwind, .{ .memory = Attribute.Memory.all(.none) } }, + }, + .@"x86.bmi.pdep.64" = .{ + .ret_len = 1, + .params = &.{ + .{ .kind = .{ .type = .i64 } }, + .{ .kind = .{ .type = .i64 } }, + .{ .kind = .{ .type = .i64 } }, + }, + .attrs = &.{ .nocallback, .nofree, .nosync, .nounwind, .{ .memory = Attribute.Memory.all(.none) } }, + }, }); }; diff --git a/src/print_air.zig b/src/print_air.zig index 12e2825d4ef0..e1a8a4ceeeb7 100644 --- a/src/print_air.zig +++ b/src/print_air.zig @@ -162,6 +162,8 @@ const Writer = struct { .memcpy, .memset, .memset_safe, + .deposit_bits, + .extract_bits, => try w.writeBinOp(s, inst), .is_null, diff --git 
a/src/print_zir.zig b/src/print_zir.zig index dfe94d397097..311d1d1c2240 100644 --- a/src/print_zir.zig +++ b/src/print_zir.zig @@ -591,6 +591,8 @@ const Writer = struct { .wasm_memory_grow, .prefetch, .c_va_arg, + .deposit_bits, + .extract_bits, => { const inst_data = self.code.extraData(Zir.Inst.BinNode, extended.operand).data; const src = LazySrcLoc.nodeOffset(inst_data.node); diff --git a/test/behavior.zig b/test/behavior.zig index 3081f6c9f969..d131b498e9b0 100644 --- a/test/behavior.zig +++ b/test/behavior.zig @@ -21,9 +21,10 @@ test { _ = @import("behavior/comptime_memory.zig"); _ = @import("behavior/const_slice_child.zig"); _ = @import("behavior/decltest.zig"); - _ = @import("behavior/duplicated_test_names.zig"); _ = @import("behavior/defer.zig"); + _ = @import("behavior/deposit_extract_bits.zig"); _ = @import("behavior/destructure.zig"); + _ = @import("behavior/duplicated_test_names.zig"); _ = @import("behavior/empty_tuple_fields.zig"); _ = @import("behavior/empty_union.zig"); _ = @import("behavior/enum.zig"); diff --git a/test/behavior/deposit_extract_bits.zig b/test/behavior/deposit_extract_bits.zig new file mode 100644 index 000000000000..fb393866be08 --- /dev/null +++ b/test/behavior/deposit_extract_bits.zig @@ -0,0 +1,147 @@ +const std = @import("std"); +const builtin = @import("builtin"); +const mem = std.mem; +const expect = std.testing.expect; +const expectEqual = std.testing.expectEqual; + +const supports_pext_pdep = switch (builtin.zig_backend) { + .stage2_llvm => true, + .stage2_x86_64 => true, + else => false, +}; + +test "@depositBits" { + if (!supports_pext_pdep) return error.SkipZigTest; // TODO + + const S = struct { + pub fn doTheTest() !void { + var a: u64 = 0; + var b: u64 = 0xFFFF_FFFF_FFFF_FFFF; + var c: u64 = 0x1234_5678_9012_3456; + var d: u64 = 0x00F0_FF00_F00F_00FF; + + _ = &a; + _ = &b; + _ = &c; + _ = &d; + + try expect(@depositBits(b, a) == 0); + try expect(@depositBits(a, b) == 0); + + try expect(@depositBits(b, c) == c); + try 
expect(@depositBits(b, d) == d); + + try expect(@depositBits(c, d) == 0x0000_1200_3004_0056); + } + }; + + try S.doTheTest(); + try comptime S.doTheTest(); +} + +test "@depositBits u128" { + if (!supports_pext_pdep) return error.SkipZigTest; // TODO + if (builtin.zig_backend == .stage2_x86_64 and builtin.target.os.tag == .windows) return error.SkipZigTest; // TODO #19498 + + const S = struct { + pub fn doTheTest() !void { + var a: u64 = 0x1234_5678_9012_3456; + var b: u128 = 0x00F0_FF00_F00F_00FF << 64; + + _ = &a; + _ = &b; + + try expect(@depositBits(a, b) == 0x0000_1200_3004_0056 << 64); + } + }; + + try S.doTheTest(); + try comptime S.doTheTest(); +} + +test "@depositBits u256" { + if (!supports_pext_pdep) return error.SkipZigTest; // TODO + + const S = struct { + pub fn doTheTest() !void { + var a: u64 = 0x1234_5678_9ABC_DEF0; + var b: u256 = 0x0F00_0FF0_0F0F_FF00 << 174; + + _ = &a; + _ = &b; + + try expect(@depositBits(a, b) == 0x0A00_0BC0_0D0E_F000 << 174); + } + }; + + try S.doTheTest(); + try comptime S.doTheTest(); +} + +test "@extractBits" { + if (!supports_pext_pdep) return error.SkipZigTest; // TODO + + const S = struct { + pub fn doTheTest() !void { + var a: u64 = 0; + var b: u64 = 0xFFFF_FFFF_FFFF_FFFF; + var c: u64 = 0x1234_5678_9012_3456; + var d: u64 = 0x00F0_FF00_F00F_00FF; + + _ = &a; + _ = &b; + _ = &c; + _ = &d; + + try expect(@extractBits(b, a) == 0); + try expect(@extractBits(a, b) == 0); + + try expect(@extractBits(c, b) == c); + try expect(@extractBits(d, b) == d); + + try expect(@extractBits(c, d) == 0x0356_9256); + } + }; + + try S.doTheTest(); + try comptime S.doTheTest(); +} + +test "@extractBits u128" { + if (!supports_pext_pdep) return error.SkipZigTest; // TODO + if (builtin.zig_backend == .stage2_x86_64 and builtin.target.os.tag == .windows) return error.SkipZigTest; // TODO #19498 + + const S = struct { + pub fn doTheTest() !void { + var a: u128 = 0x1234_5678_9012_3456 << 64; + var b: u128 = 0x00F0_FF00_F00F_00FF << 64; + + _ = 
&a; + _ = &b; + + try expect(@extractBits(a, b) == 0x0356_9256); + } + }; + + try S.doTheTest(); + try comptime S.doTheTest(); +} + +test "@extractBits u256" { + if (!supports_pext_pdep) return error.SkipZigTest; // TODO + + const S = struct { + pub fn doTheTest() !void { + var a: u256 = 0x1234_5678_9ABC_DEF0 << 96; + var b: u256 = 0x0F00_0FF0_0F0F_FF00 << 96; + + _ = &a; + _ = &b; + + try expect(@extractBits(a, b) == 0x0267_ACDE); + } + }; + + try S.doTheTest(); + try comptime S.doTheTest(); +}