Skip to content

Commit cfa0227

Browse files
committed
stage2: store Cache files in a map data structure
Cache now stores files as an StringArrayListHashMap instead of an ArrayList. The key is the resolved file path, to avoid adding the same file multiple times into the cache system when it is depended on multiple times. This avoids re-hashing file contents that have already been inserted into the cache. This was an attempt to fix #7308, but it is solving a different problem. I do think it is worth considering this improvement regardless. On the other hand, it might be duplicated effort since the layer of code above this one probably wants to do its own de-duplication as well. This implementation would be much nicer with #7391.
1 parent 0d00938 commit cfa0227

File tree

2 files changed

+86
-49
lines changed

2 files changed

+86
-49
lines changed

src/Cache.zig

Lines changed: 84 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,8 @@ pub const Manifest = struct {
175175
hash: HashHelper,
176176
manifest_file: ?fs.File,
177177
manifest_dirty: bool,
178-
files: std.ArrayListUnmanaged(File) = .{},
178+
/// The key is the resolved_path.
179+
files: std.StringArrayHashMapUnmanaged(File) = .{},
179180
hex_digest: [hex_digest_len]u8,
180181

181182
/// Add a file as a dependency of process being cached. When `hit` is
@@ -186,30 +187,36 @@ pub const Manifest = struct {
186187
/// are allowed to take up in memory. If max_file_size is null, then the contents
187188
/// will not be loaded into memory.
188189
///
189-
/// Returns the index of the entry in the `files` array list. You can use it
190+
/// Returns the index of the entry in `files`. You can use it
190191
/// to access the contents of the file after calling `hit()` like so:
191192
///
192193
/// ```
193-
/// var file_contents = cache_hash.files.items[file_index].contents.?;
194+
/// const file_contents = cache_hash.files.entries.items[index].contents.?;
194195
/// ```
195196
pub fn addFile(self: *Manifest, file_path: []const u8, max_file_size: ?usize) !usize {
196197
assert(self.manifest_file == null);
197198

198-
try self.files.ensureCapacity(self.cache.gpa, self.files.items.len + 1);
199-
const resolved_path = try fs.path.resolve(self.cache.gpa, &[_][]const u8{file_path});
200-
201-
const idx = self.files.items.len;
202-
self.files.addOneAssumeCapacity().* = .{
203-
.path = resolved_path,
204-
.contents = null,
205-
.max_file_size = max_file_size,
206-
.stat = undefined,
207-
.bin_digest = undefined,
208-
};
209-
199+
var index = self.files.count();
200+
try self.files.ensureCapacity(self.cache.gpa, index + 1);
201+
var resolved_path: []const u8 = try fs.path.resolve(self.cache.gpa, &[_][]const u8{file_path});
202+
const gop = self.files.getOrPutAssumeCapacity(resolved_path);
203+
if (gop.found_existing) {
204+
self.cache.gpa.free(resolved_path);
205+
resolved_path = gop.entry.key;
206+
// TODO getOrPut should give us the index
207+
// https://github.com/ziglang/zig/issues/7391
208+
index = self.files.getIndex(resolved_path).?;
209+
} else {
210+
gop.entry.value = .{
211+
.path = resolved_path,
212+
.contents = null,
213+
.max_file_size = max_file_size,
214+
.stat = undefined,
215+
.bin_digest = undefined,
216+
};
217+
}
210218
self.hash.addBytes(resolved_path);
211-
212-
return idx;
219+
return index;
213220
}
214221

215222
pub fn addOptionalFile(self: *Manifest, optional_file_path: ?[]const u8) !void {
@@ -252,7 +259,7 @@ pub const Manifest = struct {
252259
mem.copy(u8, &manifest_file_path, &self.hex_digest);
253260
manifest_file_path[self.hex_digest.len..][0..ext.len].* = ext.*;
254261

255-
if (self.files.items.len != 0) {
262+
if (self.files.count() != 0) {
256263
self.manifest_file = try self.cache.manifest_dir.createFile(&manifest_file_path, .{
257264
.read = true,
258265
.truncate = false,
@@ -282,23 +289,26 @@ pub const Manifest = struct {
282289
const file_contents = try self.manifest_file.?.inStream().readAllAlloc(self.cache.gpa, manifest_file_size_max);
283290
defer self.cache.gpa.free(file_contents);
284291

285-
const input_file_count = self.files.items.len;
292+
const input_file_count = self.files.count();
286293
var any_file_changed = false;
287294
var line_iter = mem.tokenize(file_contents, "\n");
288295
var idx: usize = 0;
289296
while (line_iter.next()) |line| {
290-
defer idx += 1;
291-
292-
const cache_hash_file = if (idx < input_file_count) &self.files.items[idx] else blk: {
293-
const new = try self.files.addOne(self.cache.gpa);
294-
new.* = .{
297+
var new_file: File = undefined;
298+
var is_new = false;
299+
300+
const cache_hash_file = if (idx < input_file_count)
301+
&self.files.entries.items[idx].value
302+
else blk: {
303+
is_new = true;
304+
new_file = .{
295305
.path = null,
296306
.contents = null,
297307
.max_file_size = null,
298308
.stat = undefined,
299309
.bin_digest = undefined,
300310
};
301-
break :blk new;
311+
break :blk &new_file;
302312
};
303313

304314
var iter = mem.tokenize(line, " ");
@@ -360,6 +370,12 @@ pub const Manifest = struct {
360370
if (!any_file_changed) {
361371
self.hash.hasher.update(&cache_hash_file.bin_digest);
362372
}
373+
374+
if (is_new) {
375+
try self.files.putNoClobber(self.cache.gpa, new_file.path.?, new_file);
376+
}
377+
378+
idx += 1;
363379
}
364380

365381
if (any_file_changed) {
@@ -372,7 +388,7 @@ pub const Manifest = struct {
372388
if (idx < input_file_count) {
373389
self.manifest_dirty = true;
374390
while (idx < input_file_count) : (idx += 1) {
375-
const ch_file = &self.files.items[idx];
391+
const ch_file = &self.files.entries.items[idx].value;
376392
try self.populateFileHash(ch_file);
377393
}
378394
return false;
@@ -387,13 +403,16 @@ pub const Manifest = struct {
387403
self.hash.hasher.update(&bin_digest);
388404

389405
// Remove files not in the initial hash.
390-
for (self.files.items[input_file_count..]) |*file| {
391-
file.deinit(self.cache.gpa);
406+
// TODO this code can get cleaner and more efficient after
407+
// https://github.com/ziglang/zig/issues/7391
408+
while (self.files.count() > input_file_count) {
409+
const file_path = self.files.entries.items[self.files.entries.items.len - 1].value.path.?;
410+
var entry = self.files.remove(file_path).?;
411+
entry.value.deinit(self.cache.gpa);
392412
}
393-
self.files.shrinkRetainingCapacity(input_file_count);
394413

395-
for (self.files.items) |file| {
396-
self.hash.hasher.update(&file.bin_digest);
414+
for (self.files.entries.items) |*entry| {
415+
self.hash.hasher.update(&entry.value.bin_digest);
397416
}
398417
}
399418

@@ -445,21 +464,27 @@ pub const Manifest = struct {
445464
assert(self.manifest_file != null);
446465

447466
const resolved_path = try fs.path.resolve(self.cache.gpa, &[_][]const u8{file_path});
448-
errdefer self.cache.gpa.free(resolved_path);
467+
var keep_resolved_path = false;
468+
defer if (!keep_resolved_path) self.cache.gpa.free(resolved_path);
449469

450-
const new_ch_file = try self.files.addOne(self.cache.gpa);
451-
new_ch_file.* = .{
470+
const gop = try self.files.getOrPut(self.cache.gpa, resolved_path);
471+
if (gop.found_existing) {
472+
return gop.entry.value.contents.?;
473+
}
474+
gop.entry.value = .{
452475
.path = resolved_path,
453476
.max_file_size = max_file_size,
454477
.stat = undefined,
455478
.bin_digest = undefined,
456479
.contents = null,
457480
};
458-
errdefer self.files.shrinkRetainingCapacity(self.files.items.len - 1);
481+
// TODO after https://github.com/ziglang/zig/issues/7391 this can be replaced with pop()
482+
errdefer self.files.removeAssertDiscard(resolved_path);
459483

460-
try self.populateFileHash(new_ch_file);
484+
try self.populateFileHash(&gop.entry.value);
461485

462-
return new_ch_file.contents.?;
486+
keep_resolved_path = true;
487+
return gop.entry.value.contents.?;
463488
}
464489

465490
/// Add a file as a dependency of process being cached, after the initial hash has been
@@ -470,19 +495,26 @@ pub const Manifest = struct {
470495
assert(self.manifest_file != null);
471496

472497
const resolved_path = try fs.path.resolve(self.cache.gpa, &[_][]const u8{file_path});
473-
errdefer self.cache.gpa.free(resolved_path);
498+
var keep_resolved_path = false;
499+
defer if (!keep_resolved_path) self.cache.gpa.free(resolved_path);
474500

475-
const new_ch_file = try self.files.addOne(self.cache.gpa);
476-
new_ch_file.* = .{
501+
const gop = try self.files.getOrPut(self.cache.gpa, resolved_path);
502+
if (gop.found_existing) {
503+
return;
504+
}
505+
gop.entry.value = .{
477506
.path = resolved_path,
478507
.max_file_size = null,
479508
.stat = undefined,
480509
.bin_digest = undefined,
481510
.contents = null,
482511
};
483-
errdefer self.files.shrinkRetainingCapacity(self.files.items.len - 1);
512+
// TODO after https://github.com/ziglang/zig/issues/7391 this can be replaced with pop()
513+
errdefer self.files.removeAssertDiscard(resolved_path);
514+
515+
try self.populateFileHash(&gop.entry.value);
484516

485-
try self.populateFileHash(new_ch_file);
517+
keep_resolved_path = true;
486518
}
487519

488520
pub fn addDepFilePost(self: *Manifest, dir: fs.Dir, dep_file_basename: []const u8) !void {
@@ -549,7 +581,8 @@ pub const Manifest = struct {
549581
const writer = contents.writer();
550582
var encoded_digest: [hex_digest_len]u8 = undefined;
551583

552-
for (self.files.items) |file| {
584+
for (self.files.entries.items) |entry| {
585+
const file = &entry.value;
553586
_ = std.fmt.bufPrint(&encoded_digest, "{x}", .{file.bin_digest}) catch unreachable;
554587
try writer.print("{d} {d} {d} {s} {s}\n", .{
555588
file.stat.size,
@@ -581,8 +614,8 @@ pub const Manifest = struct {
581614
if (self.manifest_file) |file| {
582615
file.close();
583616
}
584-
for (self.files.items) |*file| {
585-
file.deinit(self.cache.gpa);
617+
for (self.files.entries.items) |*file_entry| {
618+
file_entry.value.deinit(self.cache.gpa);
586619
}
587620
self.files.deinit(self.cache.gpa);
588621
}
@@ -775,7 +808,11 @@ test "check that changing a file makes cache fail" {
775808
// There should be nothing in the cache
776809
testing.expectEqual(false, try ch.hit());
777810

778-
testing.expect(mem.eql(u8, original_temp_file_contents, ch.files.items[temp_file_idx].contents.?));
811+
testing.expect(mem.eql(
812+
u8,
813+
original_temp_file_contents,
814+
ch.files.entries.items[temp_file_idx].value.contents.?,
815+
));
779816

780817
digest1 = ch.final();
781818

@@ -795,7 +832,7 @@ test "check that changing a file makes cache fail" {
795832
testing.expectEqual(false, try ch.hit());
796833

797834
// The cache system does not keep the contents of re-hashed input files.
798-
testing.expect(ch.files.items[temp_file_idx].contents == null);
835+
testing.expect(ch.files.entries.items[temp_file_idx].value.contents == null);
799836

800837
digest2 = ch.final();
801838

src/Compilation.zig

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1584,7 +1584,7 @@ pub fn cImport(comp: *Compilation, c_src: []const u8) !CImportResult {
15841584
const prev_hash_state = man.hash.peekBin();
15851585
const actual_hit = hit: {
15861586
const is_hit = try man.hit();
1587-
if (man.files.items.len == 0) {
1587+
if (man.files.count() == 0) {
15881588
man.unhit(prev_hash_state, 0);
15891589
break :hit false;
15901590
}
@@ -2842,7 +2842,7 @@ fn updateStage1Module(comp: *Compilation, main_progress_node: *std.Progress.Node
28422842

28432843
// Capture the state in case we come back from this branch where the hash doesn't match.
28442844
const prev_hash_state = man.hash.peekBin();
2845-
const input_file_count = man.files.items.len;
2845+
const input_file_count = man.files.count();
28462846

28472847
if (try man.hit()) {
28482848
const digest = man.final();

0 commit comments

Comments
 (0)