diff --git a/doc/langref.html.in b/doc/langref.html.in index 7c184c8f3688..e1c1aa268c43 100644 --- a/doc/langref.html.in +++ b/doc/langref.html.in @@ -8292,6 +8292,33 @@ test "main" { {#see_also|@cVaArg|@cVaCopy|@cVaEnd#} {#header_close#} + {#header_open|@depositBits#} +
{#syntax#}@depositBits(source: T, mask: T) T{#endsyntax#}
+

+ {#syntax#}T{#endsyntax#} must be an unsigned integer type or {#syntax#}comptime_int{#endsyntax#} (in which case both operands must be non-negative). {#syntax#}T{#endsyntax#} is determined by peer type resolution of the two operands. +

+

+ Copies contiguous low-order bits of the {#syntax#}source{#endsyntax#} operand into the result at the bit positions that are set in {#syntax#}mask{#endsyntax#}, from least to most significant; one source bit is consumed per set mask bit. All other bits of the result are zeroed. +

+

+ Currently, only x86 processors with the BMI2 feature support this operation in hardware; on all other processors it is emulated in software. AMD processors before Zen 3 implement the corresponding instruction (PDEP) in microcode, so in both of these cases an alternative algorithm may be faster. +

+

+ Example: +

+ + {#code_begin|test|test_depositbits_builtin#} +const std = @import("std"); + +test "deposit bits" { + comptime { + try std.testing.expectEqual(@depositBits(0x00001234, 0xf0f0f0f0), 0x10203040); + } +} + {#code_end#} + {#see_also|@extractBits#} + {#header_close#} + {#header_open|@divExact#}
{#syntax#}@divExact(numerator: T, denominator: T) T{#endsyntax#}

@@ -8462,6 +8489,33 @@ export fn @"A function name that is a complete sentence."() void {} {#see_also|@export#} {#header_close#} + {#header_open|@extractBits#} +

{#syntax#}@extractBits(source: T, mask: T) T{#endsyntax#}
+

+ {#syntax#}T{#endsyntax#} must be an unsigned integer type or {#syntax#}comptime_int{#endsyntax#} (in which case both operands must be non-negative). {#syntax#}T{#endsyntax#} is determined by peer type resolution of the two operands. +

+

+ Copies the bits of the {#syntax#}source{#endsyntax#} operand selected by {#syntax#}mask{#endsyntax#} into the contiguous low-order bits of the result, preserving their order; one result bit is produced per set mask bit. The remaining upper bits of the result are zeroed. +

+

+ Currently, only x86 processors with the BMI2 feature support this operation in hardware; on all other processors it is emulated in software. AMD processors before Zen 3 implement the corresponding instruction (PEXT) in microcode, so in both of these cases an alternative algorithm may be faster. +

+

+ Example: +

+ + {#code_begin|test|test_extractbits_builtin#} +const std = @import("std"); + +test "extract bits" { + comptime { + try std.testing.expectEqual(@extractBits(0x12345678, 0xf0f0f0f0), 0x00001357); + } +} + {#code_end#} + {#see_also|@depositBits#} + {#header_close#} + + {#header_open|@fence#}
{#syntax#}@fence(order: AtomicOrder) void{#endsyntax#}

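For intuition, the semantics documented above can be written as plain Zig over a fixed-width unsigned integer. This is only an illustrative sketch (the function names are invented for this note and are not part of the change); it mirrors the software-fallback loop that the LLVM backend changes further down implement:

const std = @import("std");

// Reference semantics of the two builtins for a concrete unsigned type.
fn depositBitsRef(source: u64, mask: u64) u64 {
    var m = mask;
    var bb: u64 = 1; // walks the contiguous low-order bits of `source`
    var result: u64 = 0;
    while (m != 0) {
        const bit = m & (~m +% 1); // lowest set bit of the mask
        m &= ~bit;
        if (source & bb != 0) result |= bit;
        bb +%= bb;
    }
    return result;
}

fn extractBitsRef(source: u64, mask: u64) u64 {
    var m = mask;
    var bb: u64 = 1; // next contiguous low-order bit of the result
    var result: u64 = 0;
    while (m != 0) {
        const bit = m & (~m +% 1); // lowest set bit of the mask
        m &= ~bit;
        if (source & bit != 0) result |= bb;
        bb +%= bb;
    }
    return result;
}

test "reference semantics match the langref examples" {
    try std.testing.expectEqual(@as(u64, 0x10203040), depositBitsRef(0x0000_1234, 0xf0f0_f0f0));
    try std.testing.expectEqual(@as(u64, 0x0000_1357), extractBitsRef(0x1234_5678, 0xf0f0_f0f0));
}

The real builtins additionally accept comptime_int operands, which is why the Sema changes below evaluate them with std.math.big rather than with a fixed-width loop.
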
diff --git a/lib/std/math/big/int.zig b/lib/std/math/big/int.zig index 846a809e0565..2e973f42be4e 100644 --- a/lib/std/math/big/int.zig +++ b/lib/std/math/big/int.zig @@ -1732,6 +1732,98 @@ pub const Mutable = struct { y.shiftRight(y.toConst(), norm_shift); } + // TODO this function is quite inefficient and could be optimised + /// r = @depositBits(source, mask) + /// + /// Asserts that `source` and `mask` are positive + pub fn depositBits(r: *Mutable, source: Const, mask: Const) void { + assert(source.positive); + assert(mask.positive); + + r.positive = true; + @memset(r.limbs, 0); + + var mask_limb: Limb = mask.limbs[0]; + var mask_limb_index: Limb = 0; + var i: usize = 0; + outer: while (true) : (i += 1) { + // Find next bit in mask + const mask_limb_bit: Log2Limb = limb_bit: while (true) { + const mask_limb_tz = @ctz(mask_limb); + if (mask_limb_tz != @sizeOf(Limb) * 8) { + const cast_limb_bit = @intCast(Log2Limb, mask_limb_tz); + mask_limb ^= @as(Limb, 1) << cast_limb_bit; + break :limb_bit cast_limb_bit; + } + + mask_limb_index += 1; + // No more limbs, we've finished iterating the mask + if (mask_limb_index >= mask.limbs.len) { + break :outer; + } + + mask_limb = mask.limbs[mask_limb_index]; + }; + + const i_limb_index = i / limb_bits; + const i_limb_bit = @truncate(Log2Limb, i); + + if (i_limb_index >= source.limbs.len) break; // Stop when we reach the end of `source` (we can treat the rest as zeroes) + + const source_bit_set = source.limbs[i_limb_index] & (@as(Limb, 1) << i_limb_bit) != 0; + + r.limbs[mask_limb_index] |= @as(Limb, @intFromBool(source_bit_set)) << mask_limb_bit; + } + + r.normalize(r.limbs.len); + } + + // TODO this function is quite inefficient and could be optimised + /// r = @extractBits(source, mask) + /// + /// Asserts that `source` and `mask` are positive + pub fn extractBits(r: *Mutable, source: Const, mask: Const) void { + assert(source.positive); + assert(mask.positive); + + r.positive = true; + @memset(r.limbs, 0); + + var mask_limb: Limb = mask.limbs[0]; + var mask_limb_index: Limb = 0; + var i: usize = 0; + outer: while (true) : (i += 1) { + // Find next bit in mask + const mask_limb_bit: Log2Limb = limb_bit: while (true) { + const mask_limb_tz = @ctz(mask_limb); + if (mask_limb_tz != @sizeOf(Limb) * 8) { + const cast_limb_bit = @intCast(Log2Limb, mask_limb_tz); + mask_limb ^= @as(Limb, 1) << cast_limb_bit; + break :limb_bit cast_limb_bit; + } + + mask_limb_index += 1; + // No more limbs, we've finished iterating the mask + if (mask_limb_index >= mask.limbs.len) { + break :outer; + } + + mask_limb = mask.limbs[mask_limb_index]; + }; + + const i_limb_index = i / limb_bits; + const i_limb_bit = @truncate(Log2Limb, i); + + if (mask_limb_index >= source.limbs.len) break; // Stop when we reach the end of `source` (we can treat the rest as zeroes) + + const source_bit_set = source.limbs[mask_limb_index] & (@as(Limb, 1) << mask_limb_bit) != 0; + + r.limbs[i_limb_index] |= @as(Limb, @intFromBool(source_bit_set)) << i_limb_bit; + } + + r.normalize(r.limbs.len); + } + /// If a is positive, this passes through to truncate. /// If a is negative, then r is set to positive with the bit pattern ~(a - 1). /// r may alias a. 
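A usage sketch for the new big-int methods (illustrative only, not part of the change): the caller supplies the result limbs, and allocating mask.limbs.len limbs is always sufficient because bits can only be written at positions covered by the mask. This is the sizing used both by the test helpers below and by Sema's intDepositBits/intExtractBits.

const std = @import("std");
const big = std.math.big;

fn depositBitsBigSketch(allocator: std.mem.Allocator) !void {
    var source = try big.int.Managed.initSet(allocator, 0x12345678);
    defer source.deinit();
    var mask = try big.int.Managed.initSet(allocator, 0xf0f0f0f0);
    defer mask.deinit();

    // Result storage: `mask.limbs.len` limbs are always enough.
    const limbs = try allocator.alloc(big.Limb, mask.limbs.len);
    defer allocator.free(limbs);
    var result = big.int.Mutable{ .limbs = limbs, .positive = undefined, .len = undefined };

    result.depositBits(source.toConst(), mask.toConst());
    // The low 16 source bits 0x5678 are scattered into the high nibble of each
    // mask byte, giving 0x50607080.
    std.debug.assert(result.toConst().orderAgainstScalar(0x50607080) == .eq);
}

As the TODO above notes, the loop runs once per set bit in the mask, so it is linear in @popCount(mask); a limb-at-a-time implementation could be faster.
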
diff --git a/lib/std/math/big/int_test.zig b/lib/std/math/big/int_test.zig index 9c3c1b68815f..25705bdcf985 100644 --- a/lib/std/math/big/int_test.zig +++ b/lib/std/math/big/int_test.zig @@ -2762,6 +2762,54 @@ fn popCountTest(val: *const Managed, bit_count: usize, expected: usize) !void { try testing.expectEqual(expected, val.toConst().popCount(bit_count)); } +test "big int extractBits" { + try extractBitsTest(0x12345678, 0x0, 0x0); + try extractBitsTest(0x12345678, 0xf0f0f0f0, 0x1357); + try extractBitsTest(0x12345678, 0xff00ff00, 0x1256); + try extractBitsTest(0x12345678, 0xffff, 0x5678); + + try extractBitsTest(0x12345678_90123456_78901234_56789012, 0xff << 64, 0x56); + try extractBitsTest(0x12345678_90123456_78901234_56789012, (0xff << 64) | 0xff00f, 0x56892); +} + +fn extractBitsTest(comptime source: comptime_int, comptime mask: comptime_int, comptime expected: comptime_int) !void { + var source_bigint = try Managed.initSet(testing.allocator, source); + defer source_bigint.deinit(); + var mask_bigint = try Managed.initSet(testing.allocator, mask); + defer mask_bigint.deinit(); + const limbs = try testing.allocator.alloc(Limb, mask_bigint.limbs.len); + defer testing.allocator.free(limbs); + var result = Mutable{ .limbs = limbs, .positive = undefined, .len = undefined }; + + result.extractBits(source_bigint.toConst(), mask_bigint.toConst()); + + try testing.expectEqual(std.math.Order.eq, result.toConst().orderAgainstScalar(expected)); +} + +test "big int depositBits" { + try depositBitsTest(0x12345678, 0x0, 0x0); + try depositBitsTest(0x12345678, 0xf0f0f0f0, 0x50607080); + try depositBitsTest(0x12345678, 0xff00ff00, 0x56007800); + try depositBitsTest(0x12345678, 0xffff, 0x5678); + + try depositBitsTest(0x1234, 0xff << 64, 0x34_00000000_00000000); + try depositBitsTest(0x12345678, (0xff << 64) | 0xff00f, 0x45_00000000_00067008); +} + +fn depositBitsTest(comptime source: comptime_int, comptime mask: comptime_int, comptime expected: comptime_int) !void { + var source_bigint = try Managed.initSet(testing.allocator, source); + defer source_bigint.deinit(); + var mask_bigint = try Managed.initSet(testing.allocator, mask); + defer mask_bigint.deinit(); + const limbs = try testing.allocator.alloc(Limb, mask_bigint.limbs.len); + defer testing.allocator.free(limbs); + var result = Mutable{ .limbs = limbs, .positive = undefined, .len = undefined }; + + result.depositBits(source_bigint.toConst(), mask_bigint.toConst()); + + try testing.expectEqual(std.math.Order.eq, result.toConst().orderAgainstScalar(expected)); +} + test "big int conversion read/write twos complement" { var a = try Managed.initSet(testing.allocator, (1 << 493) - 1); defer a.deinit(); diff --git a/src/Air.zig b/src/Air.zig index d4d4de07f28c..8a080efb3576 100644 --- a/src/Air.zig +++ b/src/Air.zig @@ -822,6 +822,13 @@ pub const Inst = struct { /// Operand is unused and set to Ref.none work_group_id, + /// Implements @depositBits builtin. + /// Uses the `bin_op` field. + deposit_bits, + /// Implements @extractBits builtin. + /// Uses the `bin_op` field. 
+ extract_bits, + pub fn fromCmpOp(op: std.math.CompareOperator, optimized: bool) Tag { switch (op) { .lt => return if (optimized) .cmp_lt_optimized else .cmp_lt, @@ -1232,6 +1239,8 @@ pub fn typeOfIndex(air: Air, inst: Air.Inst.Index, ip: *const InternPool) Type { .div_exact_optimized, .rem_optimized, .mod_optimized, + .deposit_bits, + .extract_bits, => return air.typeOf(datas[inst].bin_op.lhs, ip), .sqrt, @@ -1742,6 +1751,8 @@ pub fn mustLower(air: Air, inst: Air.Inst.Index, ip: *const InternPool) bool { .work_item_id, .work_group_size, .work_group_id, + .deposit_bits, + .extract_bits, => false, .assembly => @truncate(u1, air.extraData(Air.Asm, data.ty_pl.payload).data.flags >> 31) != 0, diff --git a/src/AstGen.zig b/src/AstGen.zig index f1acd7e3e3c2..3c81526dac8b 100644 --- a/src/AstGen.zig +++ b/src/AstGen.zig @@ -8699,6 +8699,9 @@ fn builtinCall( }); return rvalue(gz, ri, result, node); }, + + .deposit_bits => return depositExtractBits(gz, scope, ri, node, params, .deposit_bits), + .extract_bits => return depositExtractBits(gz, scope, ri, node, params, .extract_bits), } } @@ -8966,6 +8969,24 @@ fn overflowArithmetic( return rvalue(gz, ri, result, node); } +fn depositExtractBits( + gz: *GenZir, + scope: *Scope, + ri: ResultInfo, + node: Ast.Node.Index, + params: []const Ast.Node.Index, + tag: Zir.Inst.Extended, +) InnerError!Zir.Inst.Ref { + const lhs = try expr(gz, scope, .{ .rl = .none }, params[0]); + const rhs = try expr(gz, scope, .{ .rl = .none }, params[1]); + const result = try gz.addExtendedPayload(tag, Zir.Inst.BinNode{ + .node = gz.nodeIndexToRelative(node), + .lhs = lhs, + .rhs = rhs, + }); + return rvalue(gz, ri, result, node); +} + fn callExpr( gz: *GenZir, scope: *Scope, diff --git a/src/BuiltinFn.zig b/src/BuiltinFn.zig index 27b963f52871..769f191c78c7 100644 --- a/src/BuiltinFn.zig +++ b/src/BuiltinFn.zig @@ -35,6 +35,7 @@ pub const Tag = enum { c_va_copy, c_va_end, c_va_start, + deposit_bits, div_exact, div_floor, div_trunc, @@ -46,6 +47,7 @@ pub const Tag = enum { err_set_cast, @"export", @"extern", + extract_bits, fence, field, field_parent_ptr, @@ -396,6 +398,12 @@ pub const list = list: { .param_count = 0, }, }, + .{ + "@depositBits", .{ + .tag = .deposit_bits, + .param_count = 2, + }, + }, .{ "@divExact", .{ @@ -474,6 +482,13 @@ pub const list = list: { .param_count = 2, }, }, + .{ + "@extractBits", + .{ + .tag = .extract_bits, + .param_count = 2, + }, + }, .{ "@fence", .{ diff --git a/src/Liveness.zig b/src/Liveness.zig index 2ba029136406..5ae53c575bdb 100644 --- a/src/Liveness.zig +++ b/src/Liveness.zig @@ -286,6 +286,8 @@ pub fn categorizeOperand( .cmp_gte_optimized, .cmp_gt_optimized, .cmp_neq_optimized, + .deposit_bits, + .extract_bits, => { const o = air_datas[inst].bin_op; if (o.lhs == operand_ref) return matchOperandSmallIndex(l, inst, 0, .none); @@ -942,6 +944,8 @@ fn analyzeInst( .memset, .memset_safe, .memcpy, + .deposit_bits, + .extract_bits, => { const o = inst_datas[inst].bin_op; return analyzeOperands(a, pass, data, inst, .{ o.lhs, o.rhs, .none }); diff --git a/src/Liveness/Verify.zig b/src/Liveness/Verify.zig index 904e38007353..f81f62ad0a3d 100644 --- a/src/Liveness/Verify.zig +++ b/src/Liveness/Verify.zig @@ -261,6 +261,8 @@ fn verifyBody(self: *Verify, body: []const Air.Inst.Index) Error!void { .memset, .memset_safe, .memcpy, + .deposit_bits, + .extract_bits, => { const bin_op = data[inst].bin_op; try self.verifyInstOperands(inst, .{ bin_op.lhs, bin_op.rhs, .none }); diff --git a/src/Sema.zig b/src/Sema.zig index bb2ef22ca560..a7baab078f63 
100644 --- a/src/Sema.zig +++ b/src/Sema.zig @@ -1183,6 +1183,8 @@ fn analyzeBodyInner( .work_group_size => try sema.zirWorkItem( block, extended, extended.opcode), .work_group_id => try sema.zirWorkItem( block, extended, extended.opcode), .in_comptime => try sema.zirInComptime( block), + .deposit_bits => try sema.zirDepositExtractBits(block, extended, .deposit_bits), + .extract_bits => try sema.zirDepositExtractBits(block, extended, .extract_bits), // zig fmt: on .fence => { @@ -24145,6 +24147,127 @@ fn zirInComptime( } } +fn zirDepositExtractBits( + sema: *Sema, + block: *Block, + extended: Zir.Inst.Extended.InstData, + air_tag: Air.Inst.Tag, +) CompileError!Air.Inst.Ref { + const mod = sema.mod; + const extra = sema.code.extraData(Zir.Inst.BinNode, extended.operand).data; + const src = LazySrcLoc.nodeOffset(extra.node); + + const lhs_src: LazySrcLoc = .{ .node_offset_builtin_call_arg0 = extra.node }; + const rhs_src: LazySrcLoc = .{ .node_offset_builtin_call_arg1 = extra.node }; + + const uncasted_lhs = try sema.resolveInst(extra.lhs); + const uncasted_rhs = try sema.resolveInst(extra.rhs); + + const lhs_ty = sema.typeOf(uncasted_lhs); + const rhs_ty = sema.typeOf(uncasted_rhs); + + if (!lhs_ty.isUnsignedInt(mod) and lhs_ty.zigTypeTag(mod) != .ComptimeInt) { + return sema.fail(block, lhs_src, "expected unsigned integer or 'comptime_int', found '{}'", .{lhs_ty.fmt(mod)}); + } + + if (!rhs_ty.isUnsignedInt(mod) and rhs_ty.zigTypeTag(mod) != .ComptimeInt) { + return sema.fail(block, rhs_src, "expected unsigned integer or 'comptime_int', found '{}'", .{rhs_ty.fmt(mod)}); + } + + const instructions = &[_]Air.Inst.Ref{ uncasted_lhs, uncasted_rhs }; + const dest_ty = try sema.resolvePeerTypes(block, src, instructions, .{ + .override = &[_]?LazySrcLoc{ lhs_src, rhs_src }, + }); + + const builtin_name = switch (air_tag) { + .deposit_bits => "@depositBits", + .extract_bits => "@extractBits", + else => unreachable, + }; + + // Coercion errors are intercepted to add a note if the caller is attempting to pass a negative comptime_int + const lhs = sema.coerce(block, dest_ty, uncasted_lhs, lhs_src) catch |err| switch (err) { + error.AnalysisFail => { + const msg = sema.err orelse return err; + const val = (try sema.resolveMaybeUndefVal(uncasted_lhs)).?; + if (val.orderAgainstZero(mod) == .lt) { + try sema.errNote(block, src, msg, "parameters to {s} must be positive", .{builtin_name}); + } + return err; + }, + else => return err, + }; + + const rhs = sema.coerce(block, dest_ty, uncasted_rhs, rhs_src) catch |err| switch (err) { + error.AnalysisFail => { + const msg = sema.err orelse return err; + const val = (try sema.resolveMaybeUndefVal(uncasted_rhs)).?; + if (val.orderAgainstZero(mod) == .lt) { + try sema.errNote(block, src, msg, "parameters to {s} must be positive", .{builtin_name}); + } + return err; + }, + else => return err, + }; + + const maybe_lhs_val = try sema.resolveMaybeUndefVal(lhs); + const maybe_rhs_val = try sema.resolveMaybeUndefVal(rhs); + + // We check for negative values here only if the type is a comptime_int, as negative values + // would have otherwise been filtered out by coercion and the unsigned type restriction + if (dest_ty.zigTypeTag(mod) == .ComptimeInt) { + if (maybe_lhs_val) |lhs_val| { + if (!lhs_val.isUndef(mod) and lhs_val.orderAgainstZero(mod) == .lt) { + const err = try sema.errMsg(block, lhs_src, "use of negative value '{}'", .{lhs_val.fmtValue(lhs_ty, mod)}); + try sema.errNote(block, src, err, "parameters to {s} must be positive", .{builtin_name}); + return 
sema.failWithOwnedErrorMsg(err); + } + } + + if (maybe_rhs_val) |rhs_val| { + if (!rhs_val.isUndef(mod) and rhs_val.orderAgainstZero(mod) == .lt) { + const err = try sema.errMsg(block, rhs_src, "use of negative value '{}'", .{rhs_val.fmtValue(rhs_ty, mod)}); + try sema.errNote(block, src, err, "parameters to {s} must be positive", .{builtin_name}); + return sema.failWithOwnedErrorMsg(err); + } + } + } + + // If either of the operands are zero, the result is zero + // If either of the operands are undefined, the result is undefined + if (maybe_lhs_val) |lhs_val| { + if (lhs_val.orderAgainstZero(mod) == .eq) return sema.addConstant(dest_ty, try mod.intValue(dest_ty, 0)); + if (lhs_val.isUndef(mod)) return sema.addConstUndef(dest_ty); + } + if (maybe_rhs_val) |rhs_val| { + if (rhs_val.orderAgainstZero(mod) == .lt) return sema.addConstant(dest_ty, try mod.intValue(dest_ty, 0)); + if (rhs_val.isUndef(mod)) return sema.addConstUndef(dest_ty); + } + + if (maybe_lhs_val) |lhs_val| { + if (maybe_rhs_val) |rhs_val| { + const dest_val = switch (air_tag) { + .deposit_bits => try sema.intDepositBits(lhs_val, rhs_val, dest_ty), + .extract_bits => try sema.intExtractBits(lhs_val, rhs_val, dest_ty), + else => unreachable, + }; + + return sema.addConstant(dest_ty, dest_val); + } + } + + const runtime_src = if (maybe_lhs_val == null) lhs_src else rhs_src; + try sema.requireRuntimeBlock(block, src, runtime_src); + + return block.addInst(.{ + .tag = air_tag, + .data = .{ .bin_op = .{ + .lhs = lhs, + .rhs = rhs, + } }, + }); +} + fn requireRuntimeBlock(sema: *Sema, block: *Block, src: LazySrcLoc, runtime_src: ?LazySrcLoc) !void { if (block.is_comptime) { const msg = msg: { @@ -36247,6 +36370,62 @@ fn intAddWithOverflowScalar( }; } +/// Asserts that the values are positive +fn intDepositBits( + sema: *Sema, + lhs: Value, + rhs: Value, + ty: Type, +) !Value { + // TODO is this a performance issue? maybe we should try the operation without + // resorting to BigInt first. For non-bigints, @intDeposit could be used? + const mod = sema.mod; + const arena = sema.arena; + + var lhs_space: Value.BigIntSpace = undefined; + var rhs_space: Value.BigIntSpace = undefined; + const source = lhs.toBigInt(&lhs_space, mod); + const mask = rhs.toBigInt(&rhs_space, mod); + + const result_limbs = try arena.alloc( + std.math.big.Limb, + mask.limbs.len, + ); + + var result = std.math.big.int.Mutable{ .limbs = result_limbs, .positive = undefined, .len = undefined }; + + result.depositBits(source, mask); + return mod.intValue_big(ty, result.toConst()); +} + +/// Asserts that the values are positive +fn intExtractBits( + sema: *Sema, + lhs: Value, + rhs: Value, + ty: Type, +) !Value { + // TODO is this a performance issue? maybe we should try the operation without + // resorting to BigInt first. For non-bigints, @intExtract could be used? + const mod = sema.mod; + const arena = sema.arena; + + var lhs_space: Value.BigIntSpace = undefined; + var rhs_space: Value.BigIntSpace = undefined; + const source = lhs.toBigInt(&lhs_space, mod); + const mask = rhs.toBigInt(&rhs_space, mod); + + const result_limbs = try arena.alloc( + std.math.big.Limb, + mask.limbs.len, + ); + + var result = std.math.big.int.Mutable{ .limbs = result_limbs, .positive = undefined, .len = undefined }; + + result.extractBits(source, mask); + return mod.intValue_big(ty, result.toConst()); +} + /// Asserts the values are comparable. Both operands have type `ty`. /// For vectors, returns true if the comparison is true for ALL elements. 
/// diff --git a/src/Zir.zig b/src/Zir.zig index 4a0fdde24f95..7b6284d26412 100644 --- a/src/Zir.zig +++ b/src/Zir.zig @@ -1985,6 +1985,12 @@ pub const Inst = struct { /// with a specific value. For instance, this is used for the capture of an `errdefer`. /// This should never appear in a body. value_placeholder, + /// Implements the `@depositBits` builtin. + /// `operand` is payload index to `BinNode`. + deposit_bits, + /// Implements the `@extractBits` builtin. + /// `operand` is payload index to `BinNode`. + extract_bits, pub const InstData = struct { opcode: Extended, diff --git a/src/arch/aarch64/CodeGen.zig b/src/arch/aarch64/CodeGen.zig index 5080a0451a7c..076ce8764b89 100644 --- a/src/arch/aarch64/CodeGen.zig +++ b/src/arch/aarch64/CodeGen.zig @@ -903,6 +903,9 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void { .work_item_id => unreachable, .work_group_size => unreachable, .work_group_id => unreachable, + + .deposit_bits => return self.fail("TODO implement deposit_bits", .{}), + .extract_bits => return self.fail("TODO implement extract_bits", .{}), // zig fmt: on } diff --git a/src/arch/arm/CodeGen.zig b/src/arch/arm/CodeGen.zig index 7ece4ba2e3a3..bcc651e99481 100644 --- a/src/arch/arm/CodeGen.zig +++ b/src/arch/arm/CodeGen.zig @@ -887,6 +887,9 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void { .work_item_id => unreachable, .work_group_size => unreachable, .work_group_id => unreachable, + + .deposit_bits => return self.fail("TODO implement deposit_bits", .{}), + .extract_bits => return self.fail("TODO implement extract_bits", .{}), // zig fmt: on } diff --git a/src/arch/riscv64/CodeGen.zig b/src/arch/riscv64/CodeGen.zig index cba1de92c1ed..65b1cfe6b0f4 100644 --- a/src/arch/riscv64/CodeGen.zig +++ b/src/arch/riscv64/CodeGen.zig @@ -717,6 +717,9 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void { .work_item_id => unreachable, .work_group_size => unreachable, .work_group_id => unreachable, + + .deposit_bits => return self.fail("TODO implement deposit_bits", .{}), + .extract_bits => return self.fail("TODO implement extract_bits", .{}), // zig fmt: on } if (std.debug.runtime_safety) { diff --git a/src/arch/sparc64/CodeGen.zig b/src/arch/sparc64/CodeGen.zig index f210f8e14461..4aa5e87311dd 100644 --- a/src/arch/sparc64/CodeGen.zig +++ b/src/arch/sparc64/CodeGen.zig @@ -735,6 +735,9 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void { .work_item_id => unreachable, .work_group_size => unreachable, .work_group_id => unreachable, + + .deposit_bits => @panic("TODO implement deposit_bits"), + .extract_bits => @panic("TODO implement extract_bits"), // zig fmt: on } diff --git a/src/arch/wasm/CodeGen.zig b/src/arch/wasm/CodeGen.zig index efd5ea6642f9..d9be132d4fa4 100644 --- a/src/arch/wasm/CodeGen.zig +++ b/src/arch/wasm/CodeGen.zig @@ -2068,6 +2068,10 @@ fn genInst(func: *CodeGen, inst: Air.Inst.Index) InnerError!void { .work_group_size, .work_group_id, => unreachable, + + .deposit_bits, + .extract_bits, + => |tag| return func.fail("TODO implement {s}", .{@tagName(tag)}), }; } diff --git a/src/arch/x86_64/CodeGen.zig b/src/arch/x86_64/CodeGen.zig index b4ef42b953c5..fabfc1877666 100644 --- a/src/arch/x86_64/CodeGen.zig +++ b/src/arch/x86_64/CodeGen.zig @@ -1985,6 +1985,9 @@ fn genBody(self: *Self, body: []const Air.Inst.Index) InnerError!void { .work_item_id => unreachable, .work_group_size => unreachable, .work_group_id => unreachable, + + .deposit_bits => return self.fail("TODO implement 
deposit_bits", .{}), + .extract_bits => return self.fail("TODO implement extract_bits", .{}), // zig fmt: on } diff --git a/src/codegen/c.zig b/src/codegen/c.zig index 498eca4ce28b..a5c824669ed9 100644 --- a/src/codegen/c.zig +++ b/src/codegen/c.zig @@ -3084,6 +3084,9 @@ fn genBodyInner(f: *Function, body: []const Air.Inst.Index) error{ AnalysisFail, .work_group_size, .work_group_id, => unreachable, + + .deposit_bits => return f.fail("TODO: C backend: implement deposit_bits", .{}), + .extract_bits => return f.fail("TODO: C backend: implement extract_bits", .{}), // zig fmt: on }; if (result_value == .new_local) { diff --git a/src/codegen/llvm.zig b/src/codegen/llvm.zig index 1f390edc0210..cdd5277570e9 100644 --- a/src/codegen/llvm.zig +++ b/src/codegen/llvm.zig @@ -4550,6 +4550,9 @@ pub const FuncGen = struct { .work_item_id => try self.airWorkItemId(inst), .work_group_size => try self.airWorkGroupSize(inst), .work_group_id => try self.airWorkGroupId(inst), + + .deposit_bits => try self.airDepositBits(inst), + .extract_bits => try self.airExtractBits(inst), // zig fmt: on }; if (opt_value) |val| { @@ -9447,6 +9450,302 @@ pub const FuncGen = struct { return self.amdgcnWorkIntrinsic(dimension, 0, "llvm.amdgcn.workgroup.id"); } + fn airDepositBits(self: *FuncGen, inst: Air.Inst.Index) !?*llvm.Value { + if (self.liveness.isUnused(inst)) return null; + + const bin_op = self.air.instructions.items(.data)[inst].bin_op; + const lhs = try self.resolveInst(bin_op.lhs); + const rhs = try self.resolveInst(bin_op.rhs); + const inst_ty = self.typeOfIndex(inst); + + const target = self.dg.module.getTarget(); + const params = [2]*llvm.Value{ lhs, rhs }; + switch (target.cpu.arch) { + .x86, .x86_64 => |tag| blk: { + // Doesn't have pdep + if (!std.Target.x86.featureSetHas(target.cpu.features, .bmi2)) break :blk; + + const bits = inst_ty.intInfo(self.dg.module).bits; + const supports_64 = tag == .x86_64; + // Integer size doesn't match the available instruction(s) + if (!(bits <= 32 or (bits <= 64 and supports_64))) break :blk; + + return self.buildDepositBitsNative(inst_ty, params); + }, + else => {}, + } + + return self.buildDepositBitsEmulated(inst_ty, params); + } + + fn buildDepositBitsNative( + self: *FuncGen, + ty: Type, + params: [2]*llvm.Value, + ) !*llvm.Value { + const target = self.dg.module.getTarget(); + + assert(target.cpu.arch.isX86()); + assert(std.Target.x86.featureSetHas(target.cpu.features, .bmi2)); + + const bits = ty.intInfo(self.dg.module).bits; + const intrinsic_name = switch (bits) { + 1...32 => "llvm.x86.bmi.pdep.32", + 33...64 => "llvm.x86.bmi.pdep.64", + else => unreachable, + }; + const needs_extend = bits != 32 and bits != 64; + + var params_cast = params; + + // Cast to either a 32 or 64-bit integer + if (needs_extend) { + const llvm_extend_ty = self.context.intType(if (bits <= 32) 32 else 64); + params_cast = .{ + self.builder.buildZExt(params[0], llvm_extend_ty, ""), + self.builder.buildZExt(params[1], llvm_extend_ty, ""), + }; + } + + const llvm_fn = self.getIntrinsic(intrinsic_name, &.{}); + const result = self.builder.buildCall(llvm_fn.globalGetValueType(), llvm_fn, ¶ms_cast, 2, .Fast, .Auto, ""); + + // No cast needed! + if (!needs_extend) return result; + + // Cast back to the original integer size + const llvm_trunc_ty = try self.dg.lowerType(ty); + return self.builder.buildTrunc(result, llvm_trunc_ty, ""); + } + + // TODO Move this to compiler-rt (see #14609) + // + // Implements @depositBits(source, mask) in software + // (i.e. 
without platform-specific instructions) + // + // var bb = 1; + // var result = 0; + // do { + // const bit = mask & -mask; + // mask &= ~bit; + // const source_bit = source & bb; + // if (source_bit) result |= bit; + // bb += bb; + // } while (mask) + // + // return result; + fn buildDepositBitsEmulated( + self: *FuncGen, + ty: Type, + params: [2]*llvm.Value, + ) !*llvm.Value { + const llvm_ty = try self.dg.lowerType(ty); + + const source = params[0]; + const mask_start = params[1]; + const zero = llvm_ty.constNull(); + const one = llvm_ty.constInt(1, .False); + const minus_one = llvm_ty.constInt(@bitCast(c_ulonglong, @as(c_longlong, -1)), .True); + + const prev_block = self.builder.getInsertBlock(); + const loop_block = self.context.appendBasicBlock(self.llvm_func, "Loop"); + const after_block = self.context.appendBasicBlock(self.llvm_func, "After"); + + _ = self.builder.buildBr(loop_block); + self.builder.positionBuilderAtEnd(loop_block); + const mask_phi = self.builder.buildPhi(llvm_ty, ""); + const result_phi = self.builder.buildPhi(llvm_ty, ""); + const bb_phi = self.builder.buildPhi(llvm_ty, ""); + const minus_mask = self.builder.buildSub(zero, mask_phi, ""); + const bit = self.builder.buildAnd(mask_phi, minus_mask, ""); + const not_bit = self.builder.buildXor(bit, minus_one, ""); + const new_mask = self.builder.buildAnd(mask_phi, not_bit, ""); + const source_bit = self.builder.buildAnd(source, bb_phi, ""); + const source_bit_set = self.builder.buildICmp(.NE, source_bit, zero, ""); + const bit_or_zero = self.builder.buildSelect(source_bit_set, bit, zero, ""); // avoid using control flow + const new_result = self.builder.buildOr(result_phi, bit_or_zero, ""); + const new_bb = self.builder.buildAdd(bb_phi, bb_phi, ""); + const while_cond = self.builder.buildICmp(.NE, new_mask, zero, ""); + _ = self.builder.buildCondBr(while_cond, loop_block, after_block); + + mask_phi.addIncoming( + &[2]*llvm.Value{ mask_start, new_mask }, + &[2]*llvm.BasicBlock{ prev_block, loop_block }, + 2, + ); + + result_phi.addIncoming( + &[2]*llvm.Value{ zero, new_result }, + &[2]*llvm.BasicBlock{ prev_block, loop_block }, + 2, + ); + + bb_phi.addIncoming( + &[2]*llvm.Value{ one, new_bb }, + &[2]*llvm.BasicBlock{ prev_block, loop_block }, + 2, + ); + + self.builder.positionBuilderAtEnd(after_block); + const final_result = self.builder.buildPhi(llvm_ty, ""); + final_result.addIncoming( + &[1]*llvm.Value{new_result}, + &[1]*llvm.BasicBlock{loop_block}, + 1, + ); + + return final_result; + } + + fn airExtractBits(self: *FuncGen, inst: Air.Inst.Index) !?*llvm.Value { + if (self.liveness.isUnused(inst)) return null; + + const bin_op = self.air.instructions.items(.data)[inst].bin_op; + const lhs = try self.resolveInst(bin_op.lhs); + const rhs = try self.resolveInst(bin_op.rhs); + const inst_ty = self.typeOfIndex(inst); + + const target = self.dg.module.getTarget(); + const params = [2]*llvm.Value{ lhs, rhs }; + switch (target.cpu.arch) { + .x86, .x86_64 => |tag| blk: { + // Doesn't have pext + if (!std.Target.x86.featureSetHas(target.cpu.features, .bmi2)) break :blk; + + const bits = inst_ty.intInfo(self.dg.module).bits; + const supports_64 = tag == .x86_64; + // Integer size doesn't match the available instruction(s) + if (!(bits <= 32 or (bits <= 64 and supports_64))) break :blk; + + return self.buildExtractBitsNative(inst_ty, params); + }, + else => {}, + } + + return self.buildExtractBitsEmulated(inst_ty, params); + } + + fn buildExtractBitsNative( + self: *FuncGen, + ty: Type, + params: [2]*llvm.Value, + ) 
!*llvm.Value { + const target = self.dg.module.getTarget(); + + assert(target.cpu.arch.isX86()); + assert(std.Target.x86.featureSetHas(target.cpu.features, .bmi2)); + + const bits = ty.intInfo(self.dg.module).bits; + const intrinsic_name = switch (bits) { + 1...32 => "llvm.x86.bmi.pext.32", + 33...64 => "llvm.x86.bmi.pext.64", + else => unreachable, + }; + const needs_extend = bits != 32 and bits != 64; + + var params_cast = params; + + // Cast to either a 32 or 64-bit integer + if (needs_extend) { + const llvm_extend_ty = self.context.intType(if (bits <= 32) 32 else 64); + params_cast = .{ + self.builder.buildZExt(params[0], llvm_extend_ty, ""), + self.builder.buildZExt(params[1], llvm_extend_ty, ""), + }; + } + + const llvm_fn = self.getIntrinsic(intrinsic_name, &.{}); + const result = self.builder.buildCall(llvm_fn.globalGetValueType(), llvm_fn, ¶ms_cast, 2, .Fast, .Auto, ""); + + // No cast needed! + if (!needs_extend) return result; + + // Cast back to the original integer size + const llvm_trunc_ty = try self.dg.lowerType(ty); + return self.builder.buildTrunc(result, llvm_trunc_ty, ""); + } + + // TODO Move this to compiler-rt (see #14609) + // + // Implements @extractBits(source, mask) in software + // (i.e. without platform-specific instructions) + // + // var bb = 1; + // var result = 0; + // do { + // const bit = mask & -mask; + // mask &= ~bit; + // const source_bit = source & bit; + // if (source_bit != 0) result |= bb; + // bb += bb; + // } while (mask) + // + // return result; + fn buildExtractBitsEmulated( + self: *FuncGen, + ty: Type, + params: [2]*llvm.Value, + ) !*llvm.Value { + const llvm_ty = try self.dg.lowerType(ty); + + const zero = llvm_ty.constNull(); + const one = llvm_ty.constInt(1, .False); + const minus_one = llvm_ty.constInt(@bitCast(c_ulonglong, @as(c_longlong, -1)), .True); + const source = params[0]; + const start_mask = params[1]; + const start_result = zero; + const start_bb = one; + + const prev_block = self.builder.getInsertBlock(); + const loop_block = self.context.appendBasicBlock(self.llvm_func, "Loop"); + const after_block = self.context.appendBasicBlock(self.llvm_func, "After"); + + _ = self.builder.buildBr(loop_block); + self.builder.positionBuilderAtEnd(loop_block); + const mask_phi = self.builder.buildPhi(llvm_ty, ""); + const result_phi = self.builder.buildPhi(llvm_ty, ""); + const bb_phi = self.builder.buildPhi(llvm_ty, ""); + const minus_mask = self.builder.buildSub(zero, mask_phi, ""); + const bit = self.builder.buildAnd(mask_phi, minus_mask, ""); + const not_bit = self.builder.buildXor(bit, minus_one, ""); + const new_mask = self.builder.buildAnd(mask_phi, not_bit, ""); + const source_bit = self.builder.buildAnd(source, bit, ""); + const source_bit_set = self.builder.buildICmp(.NE, source_bit, zero, ""); + const bb_or_zero = self.builder.buildSelect(source_bit_set, bb_phi, zero, ""); // avoid using control flow + const new_result = self.builder.buildOr(result_phi, bb_or_zero, ""); + const new_bb = self.builder.buildAdd(bb_phi, bb_phi, ""); + const while_cond = self.builder.buildICmp(.NE, new_mask, zero, ""); + _ = self.builder.buildCondBr(while_cond, loop_block, after_block); + + mask_phi.addIncoming( + &[2]*llvm.Value{ start_mask, new_mask }, + &[2]*llvm.BasicBlock{ prev_block, loop_block }, + 2, + ); + + result_phi.addIncoming( + &[2]*llvm.Value{ start_result, new_result }, + &[2]*llvm.BasicBlock{ prev_block, loop_block }, + 2, + ); + + bb_phi.addIncoming( + &[2]*llvm.Value{ start_bb, new_bb }, + &[2]*llvm.BasicBlock{ prev_block, 
loop_block }, + 2, + ); + + self.builder.positionBuilderAtEnd(after_block); + const final_result = self.builder.buildPhi(llvm_ty, ""); + final_result.addIncoming( + &[1]*llvm.Value{new_result}, + &[1]*llvm.BasicBlock{loop_block}, + 1, + ); + + return final_result; + } + fn getErrorNameTable(self: *FuncGen) !*llvm.Value { if (self.dg.object.error_name_table) |table| { return table; diff --git a/src/print_air.zig b/src/print_air.zig index d73ec308917f..700fdbffadef 100644 --- a/src/print_air.zig +++ b/src/print_air.zig @@ -173,6 +173,8 @@ const Writer = struct { .memcpy, .memset, .memset_safe, + .deposit_bits, + .extract_bits, => try w.writeBinOp(s, inst), .is_null, diff --git a/src/print_zir.zig b/src/print_zir.zig index 029157818957..b85cf3e2eff7 100644 --- a/src/print_zir.zig +++ b/src/print_zir.zig @@ -527,6 +527,8 @@ const Writer = struct { .prefetch, .addrspace_cast, .c_va_arg, + .deposit_bits, + .extract_bits, => { const inst_data = self.code.extraData(Zir.Inst.BinNode, extended.operand).data; const src = LazySrcLoc.nodeOffset(inst_data.node); diff --git a/test/behavior.zig b/test/behavior.zig index 6e9435c49ef4..3006a7602a3c 100644 --- a/test/behavior.zig +++ b/test/behavior.zig @@ -152,6 +152,7 @@ test { _ = @import("behavior/const_slice_child.zig"); _ = @import("behavior/decltest.zig"); _ = @import("behavior/duplicated_test_names.zig"); + _ = @import("behavior/deposit_extract_bits.zig"); _ = @import("behavior/defer.zig"); _ = @import("behavior/empty_tuple_fields.zig"); _ = @import("behavior/empty_union.zig"); diff --git a/test/behavior/deposit_extract_bits.zig b/test/behavior/deposit_extract_bits.zig new file mode 100644 index 000000000000..9f2bafe22560 --- /dev/null +++ b/test/behavior/deposit_extract_bits.zig @@ -0,0 +1,58 @@ +const std = @import("std"); +const builtin = @import("builtin"); +const mem = std.mem; +const expect = std.testing.expect; +const expectEqual = std.testing.expectEqual; + +test "@depositBits" { + if (builtin.zig_backend != .stage2_llvm) return error.SkipZigTest; // TODO + + const S = struct { + pub fn doTheTest() !void { + var a: u64 = 0; + var b: u64 = 0xFFFF_FFFF_FFFF_FFFF; + var c: u64 = 0x1234_5678_9012_3456; + var d: u64 = 0x00F0_FF00_F00F_00FF; + var e: u128 = @as(u128, d) << 64; + + try expect(@depositBits(b, a) == 0); + try expect(@depositBits(a, b) == 0); + + try expect(@depositBits(b, c) == c); + try expect(@depositBits(b, d) == d); + + try expect(@depositBits(c, d) == 0x0000_1200_3004_0056); + try expect(@depositBits(c, e) == 0x0000_1200_3004_0056 << 64); + } + }; + + try S.doTheTest(); + try comptime S.doTheTest(); +} + +test "@extractBits" { + if (builtin.zig_backend != .stage2_llvm) return error.SkipZigTest; // TODO + + const S = struct { + pub fn doTheTest() !void { + var a: u64 = 0; + var b: u64 = 0xFFFF_FFFF_FFFF_FFFF; + var c: u64 = 0x1234_5678_9012_3456; + var d: u64 = 0x00F0_FF00_F00F_00FF; + var e: u128 = @as(u128, c) << 64; + var f: u128 = @as(u128, d) << 64; + + try expect(@extractBits(b, a) == 0); + try expect(@extractBits(a, b) == 0); + + try expect(@extractBits(c, b) == c); + try expect(@extractBits(d, b) == d); + + try expect(@extractBits(c, d) == 0x0356_9256); + try expect(@extractBits(e, f) == 0x0356_9256); + } + }; + + try S.doTheTest(); + try comptime S.doTheTest(); +}
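
A property-style check that could complement the behavior tests above (hypothetical, not included in this change): extracting with a mask and then depositing with the same mask must recover exactly the masked bits.

const std = @import("std");

test "@extractBits then @depositBits round-trips the masked bits" {
    comptime {
        const x: u64 = 0x1234_5678_9012_3456;
        const mask: u64 = 0x00F0_FF00_F00F_00FF;
        // pdep(pext(x, m), m) == x & m for any x and m.
        try std.testing.expect(@depositBits(@extractBits(x, mask), mask) == x & mask);
    }
}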