From 9400c03905f2ac7b0743165a1ab8b765bcfaf772 Mon Sep 17 00:00:00 2001 From: Jon Jensen Date: Tue, 16 Sep 2025 11:39:11 -0600 Subject: [PATCH] fix(libnpmexec): improve withLock stability various improvements to withLock and its tests - fix windows race conditions/errors - use second-level granularity when detecting lock compromise; this resolves a sporadic floating point issue under APFS, and makes this generally more robust no matter the underlying file system - improve touchLock logic so that it doesn't compromise the lock for the next holder or keep running the interval when cleanup fails --- workspaces/libnpmexec/lib/with-lock.js | 41 ++++++---- workspaces/libnpmexec/test/with-lock.js | 102 ++++++++++++++++++++++-- 2 files changed, 122 insertions(+), 21 deletions(-) diff --git a/workspaces/libnpmexec/lib/with-lock.js b/workspaces/libnpmexec/lib/with-lock.js index bc8b6b6529789..897046adedb8a 100644 --- a/workspaces/libnpmexec/lib/with-lock.js +++ b/workspaces/libnpmexec/lib/with-lock.js @@ -18,7 +18,7 @@ const { onExit } = require('signal-exit') // - more ergonomic compromised lock handling (i.e. withLock will reject, and callbacks have access to an AbortSignal) // - uses a more recent version of signal-exit -const touchInterval = 100 +const touchInterval = 1_000 // mtime precision is platform dependent, so use a reasonably large threshold const staleThreshold = 5_000 @@ -75,7 +75,7 @@ function acquireLock (lockPath) { try { await fs.mkdir(lockPath) } catch (err) { - if (err.code !== 'EEXIST') { + if (err.code !== 'EEXIST' && err.code !== 'EBUSY' && err.code !== 'EPERM') { throw err } @@ -86,9 +86,18 @@ function acquireLock (lockPath) { return retry(err) } if (status === 'stale') { - // there is a very tiny window where another process could also release the stale lock and acquire it before we release it here; the lock compromise checker should detect this and throw an error - deleteLock(lockPath, ['ENOENT', 'EBUSY']) // on windows, EBUSY can happen if another process is creating the lock; we'll just retry + try { + // there is a very tiny window where another process could also release the stale lock and acquire it before we release it here; the lock compromise checker should detect this and throw an error + deleteLock(lockPath) + } catch (e) { + // on windows, EBUSY/EPERM can happen if another process is (re)creating the lock; maybe we can acquire it on a subsequent attempt 🤞 + if (e.code === 'EBUSY' || e.code === 'EPERM') { + return retry(e) + } + throw e + } } + // immediately attempt to acquire the lock (no backoff) return await acquireLock(lockPath) } try { @@ -100,12 +109,12 @@ function acquireLock (lockPath) { }) } -function deleteLock (lockPath, ignoreCodes = ['ENOENT']) { +function deleteLock (lockPath) { try { // synchronous, so we can call in an exit handler rmdirSync(lockPath) } catch (err) { - if (!ignoreCodes.includes(err.code)) { + if (err.code !== 'ENOENT') { throw err } } @@ -131,31 +140,33 @@ async function getLockStatus (lockPath) { async function maintainLock (lockPath) { const controller = new AbortController() const stats = await fs.stat(lockPath) - let mtimeMs = stats.mtimeMs + // fs.utimes operates on floating points seconds (directly, or via strings/Date objects), which may not match the underlying filesystem's mtime precision, meaning that we might read a slightly different mtime than we write. always round to the nearest second, since all filesystems support at least second precision + let mtime = Math.round(stats.mtimeMs / 1000) const signal = controller.signal async function touchLock () { try { const currentStats = (await fs.stat(lockPath)) - if (currentStats.ino !== stats.ino || currentStats.mtimeMs !== mtimeMs) { + const currentMtime = Math.round(currentStats.mtimeMs / 1000) + if (currentStats.ino !== stats.ino || currentMtime !== mtime) { throw new Error('Lock compromised') } - mtimeMs = Date.now() - const mtime = new Date(mtimeMs) - await fs.utimes(lockPath, mtime, mtime) - } catch (err) { - // stats mismatch or other fs error means the lock was compromised, unless we just released the lock during this iteration + mtime = Math.round(Date.now() / 1000) + // touch the lock, unless we just released it during this iteration if (currentLocks.has(lockPath)) { - controller.abort() + await fs.utimes(lockPath, mtime, mtime) } + } catch (err) { + // stats mismatch or other fs error means the lock was compromised + controller.abort() } } const timeout = setInterval(touchLock, touchInterval) timeout.unref() function cleanup () { - deleteLock(lockPath) clearInterval(timeout) + deleteLock(lockPath) } currentLocks.set(lockPath, cleanup) return signal diff --git a/workspaces/libnpmexec/test/with-lock.js b/workspaces/libnpmexec/test/with-lock.js index 348f9753fd125..bde4c2b764cae 100644 --- a/workspaces/libnpmexec/test/with-lock.js +++ b/workspaces/libnpmexec/test/with-lock.js @@ -102,7 +102,7 @@ t.test('stale lock takeover', async (t) => { } } let statCalls = 0 - const mtimeMs = Date.now() + const mtimeMs = Math.round(Date.now() / 1000) * 1000 mockStat = async () => { if (++statCalls === 1) { return { mtimeMs: mtimeMs - 10_000 } @@ -122,6 +122,48 @@ t.test('stale lock takeover', async (t) => { t.equal(mkdirCalls, 2, 'should make two mkdir calls') }) +t.test('EBUSY during lock acquisition', async (t) => { + let mkdirCalls = 0 + mockMkdir = async (...args) => { + if (++mkdirCalls === 1) { + throw Object.assign(new Error(), { code: 'EBUSY' }) + } + return fs.promises.mkdir(...args) + } + const lockPath = path.join(fs.mkdtempSync(path.join(getTempDir(), 'test-')), 'concurrency.lock') + t.ok(await withLock(lockPath, async () => true)) + t.equal(mkdirCalls, 2, 'should make two mkdir calls') +}) + +t.test('EBUSY during stale lock takeover', async (t) => { + let mkdirCalls = 0 + mockMkdir = async () => { + if (++mkdirCalls === 1) { + throw Object.assign(new Error(), { code: 'EEXIST' }) + } + } + let statCalls = 0 + const mtimeMs = Math.round(Date.now() / 1000) * 1000 + mockStat = async () => { + if (++statCalls === 1) { + return { mtimeMs: mtimeMs - 10_000 } + } else { + return { mtimeMs, ino: 1 } + } + } + let rmdirSyncCalls = 0 + mockRmdirSync = () => { + if (++rmdirSyncCalls === 1) { + throw Object.assign(new Error(), { code: 'EBUSY' }) + } + } + + const lockPath = path.join(fs.mkdtempSync(path.join(getTempDir(), 'test-')), 'concurrency.lock') + const lockPromise = withLock(lockPath, async () => 'test value') + t.equal(await lockPromise, 'test value', 'should take over the lock') + t.equal(mkdirCalls, 2, 'should make two mkdir calls') +}) + t.test('concurrent stale lock takeover', async (t) => { const lockPath = path.join(fs.mkdtempSync(path.join(getTempDir(), 'test-')), 'concurrency.lock') // make a stale lock @@ -147,7 +189,7 @@ t.test('mkdir -> getLockStatus race', async (t) => { } } let statCalls = 0 - const mtimeMs = Date.now() + const mtimeMs = Math.round(Date.now() / 1000) * 1000 mockStat = async () => { if (++statCalls === 1) { throw Object.assign(new Error(), { code: 'ENOENT' }) @@ -167,6 +209,22 @@ t.test('mkdir -> getLockStatus race', async (t) => { t.equal(mkdirCalls, 2, 'should make two mkdir calls') }) +t.test('mtime floating point mismatch', async (t) => { + let mtimeMs = Math.round(Date.now() / 1000) * 1000 + mockStat = async () => { + return { mtimeMs, ino: 1 } + } + mockUtimes = async (_, nextMtimeSeconds) => { + mtimeMs = nextMtimeSeconds * 1000 - 0.001 + } + + const lockPath = path.join(fs.mkdtempSync(path.join(getTempDir(), 'test-')), 'concurrency.lock') + t.ok(await withLock(lockPath, async () => { + await setTimeout(2000) + return true + }), 'should handle mtime floating point mismatches') +}) + t.test('unexpected errors', async (t) => { t.test('can\'t create lock', async (t) => { const lockPath = '/these/parent/directories/do/not/exist/so/it/should/fail.lock' @@ -217,7 +275,7 @@ t.test('unexpected errors', async (t) => { mockStat = async () => { return { mtimeMs: Date.now(), ino: Math.floor(Math.random() * 1000000) } } - await t.rejects(withLock(lockPath, () => setTimeout(1000)), { code: 'ECOMPROMISED' }) + await t.rejects(withLock(lockPath, () => setTimeout(2000)), { code: 'ECOMPROMISED' }) }) t.test('lock compromised (deleted)', async (t) => { @@ -226,10 +284,42 @@ t.test('unexpected errors', async (t) => { mockStat = async () => { throw Object.assign(new Error(), { code: 'ENOENT' }) } - await t.rejects(withLock(lockPath, () => setTimeout(1000)), { code: 'ECOMPROMISED' }) + await t.rejects(withLock(lockPath, () => setTimeout(2000)), { code: 'ECOMPROMISED' }) }) }) +t.test('lock released during maintenance', async (t) => { + // this test validates that if we release the lock while touchLock is running, it doesn't interfere with subsequent locks + const lockPath = path.join(fs.mkdtempSync(path.join(getTempDir(), 'test-')), 'concurrency.lock') + + let releaseLock + const releaseLockPromise = new Promise((resolve) => { + releaseLock = resolve + }) + + let statCalls = 0 + mockStat = async (...args) => { + const value = await fs.promises.stat(...args) + if (++statCalls > 1) { + // this runs during the setInterval; release the lock so that we no longer hold it + await releaseLock('test value') + await setTimeout() + } + return value + } + + let utimesCalls = 0 + mockUtimes = async () => { + utimesCalls++ + } + + const lockPromise = withLock(lockPath, () => releaseLockPromise) + // since we unref the interval timeout, we need to wait to ensure it actually runs + await setTimeout(2000) + t.equal(await lockPromise, 'test value', 'should acquire the lock') + t.equal(utimesCalls, 0, 'should never call utimes') +}) + t.test('onExit handler', async (t) => { t.ok(onExitHandler, 'should be registered') let rmdirSyncCalls = 0 @@ -241,8 +331,8 @@ t.test('onExit handler', async (t) => { const lockPath = path.join(fs.mkdtempSync(path.join(getTempDir(), 'test-')), 'concurrency.lock') // don't await it since the promise never resolves withLock(lockPath, () => new Promise(() => {})).catch(() => {}) - // ensure the lock is acquired - await setTimeout(0) + // since we unref the interval timeout, we need to wait to ensure it actually runs + await setTimeout(2000) onExitHandler() t.ok(rmdirSyncCalls > 0, 'should have removed outstanding locks') })