Skip to content

Commit 9dbf837

Browse files
committed
Package.Fetch: retry fetch on error
As noted in ziglang#17472, this should probably be configurable; however, I'm not sure what the best approach for this is, and it's not necessary for an MVP. For now, this is hardcoded to make 4 total attempts, with delays of 200, 800, and 4000 ms respectively between attempts. I've erred on the side of retrying unnecessarily here; there are definitely errors we retry on which really ought to trigger the error immediately. We can work to tighten this up a little in future. Resolves: ziglang#17472
1 parent 7abf9b3 commit 9dbf837

File tree

1 file changed

+111
-39
lines changed

1 file changed

+111
-39
lines changed

src/Package/Fetch.zig

Lines changed: 111 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -234,7 +234,40 @@ pub const RunError = error{
234234
FetchFailed,
235235
};
236236

237+
const RetryableRunError = RunError || error{
238+
/// An error occurred, but it may be temporary (e.g. due to network conditions).
239+
/// Also, we have not yet retried this fetch `retry_delays.len` many times.
240+
/// No error was added to the bundle. Attempt the fetch again after a short delay.
241+
FetchFailedRetryable,
242+
};
243+
244+
/// The delay between fetch attempts (after retryable errors) in ns.
245+
/// The length of this list is the number of times a fetch will be retried.
246+
const retry_delays = [_]u32{
247+
200 * std.time.ns_per_ms,
248+
800 * std.time.ns_per_ms,
249+
4000 * std.time.ns_per_ms,
250+
};
251+
237252
pub fn run(f: *Fetch) RunError!void {
253+
var attempt_count: u32 = 0;
254+
while (true) {
255+
if (f.runInner(&attempt_count)) |_| {
256+
return;
257+
} else |err| switch (err) {
258+
error.FetchFailedRetryable => {
259+
// Since we'll be sleeping, make sure the progress is up-to-date.
260+
f.prog_node.context.refresh();
261+
// The function which triggered this error has checked and updated attempt_count.
262+
std.time.sleep(retry_delays[attempt_count - 1]);
263+
continue;
264+
},
265+
else => |e| return e,
266+
}
267+
}
268+
}
269+
270+
fn runInner(f: *Fetch, attempt_count: *u32) RetryableRunError!void {
238271
const eb = &f.error_bundle;
239272
const arena = f.arena.allocator();
240273
const gpa = f.arena.child_allocator;
@@ -278,12 +311,12 @@ pub fn run(f: *Fetch) RunError!void {
278311
.path_or_url => |path_or_url| {
279312
if (fs.cwd().openIterableDir(path_or_url, .{})) |dir| {
280313
var resource: Resource = .{ .dir = dir };
281-
return runResource(f, path_or_url, &resource, null);
314+
return runResource(f, path_or_url, &resource, null, attempt_count);
282315
} else |dir_err| {
283316
const file_err = if (dir_err == error.NotDir) e: {
284317
if (fs.cwd().openFile(path_or_url, .{})) |file| {
285318
var resource: Resource = .{ .file = file };
286-
return runResource(f, path_or_url, &resource, null);
319+
return runResource(f, path_or_url, &resource, null, attempt_count);
287320
} else |err| break :e err;
288321
} else dir_err;
289322

@@ -293,8 +326,9 @@ pub fn run(f: *Fetch) RunError!void {
293326
.{ path_or_url, @errorName(file_err), @errorName(uri_err) },
294327
));
295328
};
296-
var resource = try f.initResource(uri);
297-
return runResource(f, uri.path, &resource, null);
329+
330+
var resource = try f.initResource(uri, attempt_count);
331+
return runResource(f, uri.path, &resource, null, attempt_count);
298332
}
299333
},
300334
};
@@ -330,8 +364,8 @@ pub fn run(f: *Fetch) RunError!void {
330364
f.location_tok,
331365
try eb.printString("invalid URI: {s}", .{@errorName(err)}),
332366
);
333-
var resource = try f.initResource(uri);
334-
return runResource(f, uri.path, &resource, remote.hash);
367+
var resource = try f.initResource(uri, attempt_count);
368+
return runResource(f, uri.path, &resource, remote.hash, attempt_count);
335369
}
336370

337371
pub fn deinit(f: *Fetch) void {
@@ -345,7 +379,8 @@ fn runResource(
345379
uri_path: []const u8,
346380
resource: *Resource,
347381
remote_hash: ?Manifest.MultiHashHexDigest,
348-
) RunError!void {
382+
attempt_count: *u32,
383+
) RetryableRunError!void {
349384
defer resource.deinit();
350385
const arena = f.arena.allocator();
351386
const eb = &f.error_bundle;
@@ -371,7 +406,7 @@ fn runResource(
371406
};
372407
defer tmp_directory.handle.close();
373408

374-
try unpackResource(f, resource, uri_path, tmp_directory);
409+
try unpackResource(f, resource, uri_path, tmp_directory, attempt_count);
375410

376411
// Load, parse, and validate the unpacked build.zig.zon file. It is allowed
377412
// for the file to be missing, in which case this fetched package is
@@ -707,6 +742,14 @@ fn fail(f: *Fetch, msg_tok: std.zig.Ast.TokenIndex, msg_str: u32) RunError {
707742
return error.FetchFailed;
708743
}
709744

745+
fn failRetryable(f: *Fetch, msg_tok: std.zig.Ast.TokenIndex, comptime msg_format: []const u8, msg_args: anytype, attempt_count: *u32) RetryableRunError {
746+
if (attempt_count.* == retry_delays.len) {
747+
return f.fail(msg_tok, try f.error_bundle.printString(msg_format, msg_args));
748+
}
749+
attempt_count.* += 1;
750+
return error.FetchFailedRetryable;
751+
}
752+
710753
const Resource = union(enum) {
711754
file: fs.File,
712755
http_request: std.http.Client.Request,
@@ -797,7 +840,7 @@ const FileType = enum {
797840
}
798841
};
799842

800-
fn initResource(f: *Fetch, uri: std.Uri) RunError!Resource {
843+
fn initResource(f: *Fetch, uri: std.Uri, attempt_count: *u32) RetryableRunError!Resource {
801844
const gpa = f.arena.child_allocator;
802845
const arena = f.arena.allocator();
803846
const eb = &f.error_bundle;
@@ -819,31 +862,39 @@ fn initResource(f: *Fetch, uri: std.Uri) RunError!Resource {
819862
defer h.deinit();
820863

821864
var req = http_client.request(.GET, uri, h, .{}) catch |err| {
822-
return f.fail(f.location_tok, try eb.printString(
865+
return f.failRetryable(
866+
f.location_tok,
823867
"unable to connect to server: {s}",
824868
.{@errorName(err)},
825-
));
869+
attempt_count,
870+
);
826871
};
827872
errdefer req.deinit(); // releases more than memory
828873

829874
req.start(.{}) catch |err| {
830-
return f.fail(f.location_tok, try eb.printString(
875+
return f.failRetryable(
876+
f.location_tok,
831877
"HTTP request failed: {s}",
832878
.{@errorName(err)},
833-
));
879+
attempt_count,
880+
);
834881
};
835882
req.wait() catch |err| {
836-
return f.fail(f.location_tok, try eb.printString(
883+
return f.failRetryable(
884+
f.location_tok,
837885
"invalid HTTP response: {s}",
838886
.{@errorName(err)},
839-
));
887+
attempt_count,
888+
);
840889
};
841890

842891
if (req.response.status != .ok) {
843-
return f.fail(f.location_tok, try eb.printString(
892+
return f.failRetryable(
893+
f.location_tok,
844894
"bad HTTP response code: '{d} {s}'",
845895
.{ @intFromEnum(req.response.status), req.response.status.phrase() orelse "" },
846-
));
896+
attempt_count,
897+
);
847898
}
848899

849900
return .{ .http_request = req };
@@ -859,16 +910,19 @@ fn initResource(f: *Fetch, uri: std.Uri) RunError!Resource {
859910
session.discoverCapabilities(gpa, &redirect_uri) catch |err| switch (err) {
860911
error.Redirected => {
861912
defer gpa.free(redirect_uri);
913+
// We got a valid response, so this wasn't a network issue, so this is not retryable.
862914
return f.fail(f.location_tok, try eb.printString(
863915
"repository moved to {s}",
864916
.{redirect_uri},
865917
));
866918
},
867919
else => |e| {
868-
return f.fail(f.location_tok, try eb.printString(
920+
return f.failRetryable(
921+
f.location_tok,
869922
"unable to discover remote git server capabilities: {s}",
870923
.{@errorName(e)},
871-
));
924+
attempt_count,
925+
);
872926
},
873927
};
874928

@@ -883,17 +937,21 @@ fn initResource(f: *Fetch, uri: std.Uri) RunError!Resource {
883937
.ref_prefixes = &.{ want_ref, want_ref_head, want_ref_tag },
884938
.include_peeled = true,
885939
}) catch |err| {
886-
return f.fail(f.location_tok, try eb.printString(
940+
return f.failRetryable(
941+
f.location_tok,
887942
"unable to list refs: {s}",
888943
.{@errorName(err)},
889-
));
944+
attempt_count,
945+
);
890946
};
891947
defer ref_iterator.deinit();
892948
while (ref_iterator.next() catch |err| {
893-
return f.fail(f.location_tok, try eb.printString(
949+
return f.failRetryable(
950+
f.location_tok,
894951
"unable to iterate refs: {s}",
895952
.{@errorName(err)},
896-
));
953+
attempt_count,
954+
);
897955
}) |ref| {
898956
if (std.mem.eql(u8, ref.name, want_ref) or
899957
std.mem.eql(u8, ref.name, want_ref_head) or
@@ -902,6 +960,7 @@ fn initResource(f: *Fetch, uri: std.Uri) RunError!Resource {
902960
break :want_oid ref.peeled orelse ref.oid;
903961
}
904962
}
963+
// We successfully iterated the refs, so this wasn't a network issue, so this is not retryable.
905964
return f.fail(f.location_tok, try eb.printString("ref not found: {s}", .{want_ref}));
906965
};
907966
if (uri.fragment == null) {
@@ -925,10 +984,12 @@ fn initResource(f: *Fetch, uri: std.Uri) RunError!Resource {
925984
std.fmt.fmtSliceHexLower(&want_oid),
926985
}) catch unreachable;
927986
var fetch_stream = session.fetch(gpa, &.{&want_oid_buf}) catch |err| {
928-
return f.fail(f.location_tok, try eb.printString(
987+
return f.failRetryable(
988+
f.location_tok,
929989
"unable to create fetch stream: {s}",
930990
.{@errorName(err)},
931-
));
991+
attempt_count,
992+
);
932993
};
933994
errdefer fetch_stream.deinit();
934995

@@ -949,7 +1010,8 @@ fn unpackResource(
9491010
resource: *Resource,
9501011
uri_path: []const u8,
9511012
tmp_directory: Cache.Directory,
952-
) RunError!void {
1013+
attempt_count: *u32,
1014+
) RetryableRunError!void {
9531015
const eb = &f.error_bundle;
9541016
const file_type = switch (resource.*) {
9551017
.file => FileType.fromPath(uri_path) orelse
@@ -1010,16 +1072,19 @@ fn unpackResource(
10101072
};
10111073

10121074
switch (file_type) {
1013-
.tar => try unpackTarball(f, tmp_directory.handle, resource.reader()),
1014-
.@"tar.gz" => try unpackTarballCompressed(f, tmp_directory.handle, resource, std.compress.gzip),
1015-
.@"tar.xz" => try unpackTarballCompressed(f, tmp_directory.handle, resource, std.compress.xz),
1075+
.tar => try unpackTarball(f, tmp_directory.handle, resource.reader(), attempt_count),
1076+
.@"tar.gz" => try unpackTarballCompressed(f, tmp_directory.handle, resource, std.compress.gzip, attempt_count),
1077+
.@"tar.xz" => try unpackTarballCompressed(f, tmp_directory.handle, resource, std.compress.xz, attempt_count),
10161078
.git_pack => unpackGitPack(f, tmp_directory.handle, resource) catch |err| switch (err) {
10171079
error.FetchFailed => return error.FetchFailed,
10181080
error.OutOfMemory => return error.OutOfMemory,
1019-
else => |e| return f.fail(f.location_tok, try eb.printString(
1081+
// TODO: don't mark actual git errors as retryable. We're only interested in reader errors.
1082+
else => |e| return f.failRetryable(
1083+
f.location_tok,
10201084
"unable to unpack git files: {s}",
10211085
.{@errorName(e)},
1022-
)),
1086+
attempt_count,
1087+
),
10231088
},
10241089
}
10251090
}
@@ -1029,7 +1094,8 @@ fn unpackTarballCompressed(
10291094
out_dir: fs.Dir,
10301095
resource: *Resource,
10311096
comptime Compression: type,
1032-
) RunError!void {
1097+
attempt_count: *u32,
1098+
) RetryableRunError!void {
10331099
const gpa = f.arena.child_allocator;
10341100
const eb = &f.error_bundle;
10351101
const reader = resource.reader();
@@ -1043,10 +1109,10 @@ fn unpackTarballCompressed(
10431109
};
10441110
defer decompress.deinit();
10451111

1046-
return unpackTarball(f, out_dir, decompress.reader());
1112+
return unpackTarball(f, out_dir, decompress.reader(), attempt_count);
10471113
}
10481114

1049-
fn unpackTarball(f: *Fetch, out_dir: fs.Dir, reader: anytype) RunError!void {
1115+
fn unpackTarball(f: *Fetch, out_dir: fs.Dir, reader: anytype, attempt_count: *u32) RetryableRunError!void {
10501116
const eb = &f.error_bundle;
10511117
const gpa = f.arena.child_allocator;
10521118

@@ -1063,10 +1129,15 @@ fn unpackTarball(f: *Fetch, out_dir: fs.Dir, reader: anytype) RunError!void {
10631129
// bit on Windows from the ACLs (see the isExecutable function).
10641130
.mode_mode = .ignore,
10651131
.exclude_empty_directories = true,
1066-
}) catch |err| return f.fail(f.location_tok, try eb.printString(
1067-
"unable to unpack tarball to temporary directory: {s}",
1068-
.{@errorName(err)},
1069-
));
1132+
}) catch |err| {
1133+
// TODO: don't mark actual tar errors as retryable. We're only interested in reader errors.
1134+
return f.failRetryable(
1135+
f.location_tok,
1136+
"unable to unpack tarball to temporary directory: {s}",
1137+
.{@errorName(err)},
1138+
attempt_count,
1139+
);
1140+
};
10701141

10711142
if (diagnostics.errors.items.len > 0) {
10721143
const notes_len: u32 = @intCast(diagnostics.errors.items.len);
@@ -1163,7 +1234,8 @@ fn unpackGitPack(f: *Fetch, out_dir: fs.Dir, resource: *Resource) anyerror!void
11631234
},
11641235
}
11651236
}
1166-
return error.InvalidGitPack;
1237+
// Use FetchFailed since we already added an error.
1238+
return error.FetchFailed;
11671239
}
11681240
}
11691241
}

0 commit comments

Comments
 (0)