Skip to content

Commit a069704

Browse files
adonovangopherbot
authored andcommitted
gopls/internal/lsp/filecache: avoid flock
This CL changes the implementation of the filecache to use a scheme similar to that used by the go command's cache. Instead of atomic rename(2), or flock(2), it instead relies on the atomicity in practice of writes to small files (in our case 32 bytes, the size of a SHA256 hash). A cache entry now consists of two files, a kind="cas" file that holds the cache value, keyed by its SHA256 hash, and an index file, whose name is formed from the user-provided kind and key, and whose content is the SHA256 hash that is a key into the CAS. Writes to the CAS may race, so we check the integrity of everything we read back from it using SHA256. Writes to the index files may also race, but we assume that small writes are in practice atomic. The memory-based LRU cache has beeen temporarily disabled so that we can soak test the new implementation for a while. We expect this to be significantly more reliable, and also faster. Change-Id: I25cf341b90c985dcab015df770be579ea786bd06 Reviewed-on: https://go-review.googlesource.com/c/tools/+/495800 Run-TryBot: Alan Donovan <[email protected]> Reviewed-by: Robert Findley <[email protected]> TryBot-Result: Gopher Robot <[email protected]> gopls-CI: kokoro <[email protected]> Auto-Submit: Alan Donovan <[email protected]>
1 parent 3d53c2d commit a069704

File tree

14 files changed

+123
-1570
lines changed

14 files changed

+123
-1570
lines changed

gopls/internal/lsp/filecache/filecache.go

Lines changed: 123 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -23,25 +23,21 @@ package filecache
2323
import (
2424
"bytes"
2525
"crypto/sha256"
26-
"encoding/binary"
2726
"encoding/hex"
2827
"errors"
2928
"fmt"
30-
"hash/crc32"
3129
"io"
3230
"io/fs"
3331
"log"
3432
"os"
3533
"path/filepath"
36-
"runtime"
3734
"sort"
3835
"sync"
3936
"sync/atomic"
4037
"time"
4138

4239
"golang.org/x/tools/gopls/internal/bug"
4340
"golang.org/x/tools/gopls/internal/lsp/lru"
44-
"golang.org/x/tools/internal/lockedfile"
4541
)
4642

4743
// Start causes the filecache to initialize and start garbage gollection.
@@ -62,6 +58,8 @@ type memKey struct {
6258
key [32]byte
6359
}
6460

61+
const useMemCache = false // disabled for now while we debug the new file-based implementation
62+
6563
// Get retrieves from the cache and returns a newly allocated
6664
// copy of the value most recently supplied to Set(kind, key),
6765
// possibly by another process.
@@ -70,67 +68,71 @@ func Get(kind string, key [32]byte) ([]byte, error) {
7068
// First consult the read-through memory cache.
7169
// Note that memory cache hits do not update the times
7270
// used for LRU eviction of the file-based cache.
73-
if value := memCache.Get(memKey{kind, key}); value != nil {
74-
return value.([]byte), nil
71+
if useMemCache {
72+
if value := memCache.Get(memKey{kind, key}); value != nil {
73+
return value.([]byte), nil
74+
}
7575
}
7676

7777
iolimit <- struct{}{} // acquire a token
7878
defer func() { <-iolimit }() // release a token
7979

80-
name, err := filename(kind, key)
80+
// Read the index file, which provides the name of the CAS file.
81+
indexName, err := filename(kind, key)
8182
if err != nil {
8283
return nil, err
8384
}
84-
data, err := lockedfile.Read(name)
85+
indexData, err := os.ReadFile(indexName)
8586
if err != nil {
8687
if errors.Is(err, os.ErrNotExist) {
8788
return nil, ErrNotFound
8889
}
8990
return nil, err
9091
}
91-
92-
// Verify that the Write was complete
93-
// by checking the recorded length.
94-
if len(data) < 8+4 {
95-
return nil, ErrNotFound // cache entry is incomplete
96-
}
97-
length, value, checksum := data[:8], data[8:len(data)-4], data[len(data)-4:]
98-
if binary.LittleEndian.Uint64(length) != uint64(len(value)) {
99-
return nil, ErrNotFound // cache entry is incomplete (or too long!)
92+
var valueHash [32]byte
93+
if copy(valueHash[:], indexData) != len(valueHash) {
94+
return nil, ErrNotFound // index entry has wrong length
10095
}
10196

102-
// Check for corruption and print the entire file content; see
103-
// issue #59289. TODO(adonovan): stop printing the entire file
104-
// once we've seen enough reports to understand the pattern.
105-
if binary.LittleEndian.Uint32(checksum) != crc32.ChecksumIEEE(value) {
106-
// Darwin has repeatedly displayed a problem (#59895)
107-
// whereby the checksum portion (and only it) is zero,
108-
// which suggests a bug in its file system . Don't
109-
// panic, but keep an eye on other failures for now.
110-
errorf := bug.Errorf
111-
if binary.LittleEndian.Uint32(checksum) == 0 && runtime.GOOS == "darwin" {
112-
errorf = fmt.Errorf
113-
}
114-
115-
return nil, errorf("internal error in filecache.Get(%q, %x): invalid checksum at end of %d-byte file %s:\n%q",
116-
kind, key, len(data), name, data)
97+
// Read the CAS file and check its contents match.
98+
//
99+
// This ensures integrity in all cases (corrupt or truncated
100+
// file, short read, I/O error, wrong length, etc) except an
101+
// engineered hash collision, which is infeasible.
102+
casName, err := filename(casKind, valueHash)
103+
if err != nil {
104+
return nil, err
105+
}
106+
value, _ := os.ReadFile(casName) // ignore error
107+
if sha256.Sum256(value) != valueHash {
108+
return nil, ErrNotFound // CAS file is missing or has wrong contents
117109
}
118110

119-
// Update file time for use by LRU eviction.
120-
// (This turns every read into a write operation.
111+
// Update file times used by LRU eviction.
112+
//
113+
// This turns every read into a write operation.
121114
// If this is a performance problem, we should
122-
// touch the files aynchronously.)
115+
// touch the files asynchronously, or, follow
116+
// the approach used in the go command's cache
117+
// and update only if the existing timestamp is
118+
// older than, say, one hour.
123119
//
124120
// (Traditionally the access time would be updated
125121
// automatically, but for efficiency most POSIX systems have
126122
// for many years set the noatime mount option to avoid every
127123
// open or read operation entailing a metadata write.)
128124
now := time.Now()
129-
if err := os.Chtimes(name, now, now); err != nil {
130-
return nil, fmt.Errorf("failed to update access time: %w", err)
125+
if err := os.Chtimes(indexName, now, now); err != nil {
126+
return nil, fmt.Errorf("failed to update access time of index file: %w", err)
127+
}
128+
if err := os.Chtimes(casName, now, now); err != nil {
129+
return nil, fmt.Errorf("failed to update access time of CAS file: %w", err)
130+
}
131+
132+
if useMemCache {
133+
memCache.Set(memKey{kind, key}, value, len(value))
131134
}
132135

133-
memCache.Set(memKey{kind, key}, value, len(value))
134136
return value, nil
135137
}
136138

@@ -140,50 +142,69 @@ var ErrNotFound = fmt.Errorf("not found")
140142

141143
// Set updates the value in the cache.
142144
func Set(kind string, key [32]byte, value []byte) error {
143-
memCache.Set(memKey{kind, key}, value, len(value))
145+
if useMemCache {
146+
memCache.Set(memKey{kind, key}, value, len(value))
147+
}
144148

145149
iolimit <- struct{}{} // acquire a token
146150
defer func() { <-iolimit }() // release a token
147151

148-
name, err := filename(kind, key)
152+
// First, add the value to the content-
153+
// addressable store (CAS), if not present.
154+
hash := sha256.Sum256(value)
155+
casName, err := filename(casKind, hash)
149156
if err != nil {
150157
return err
151158
}
152-
if err := os.MkdirAll(filepath.Dir(name), 0700); err != nil {
159+
// Does CAS file exist and have correct (complete) content?
160+
// TODO(adonovan): opt: use mmap for this check.
161+
if prev, _ := os.ReadFile(casName); !bytes.Equal(prev, value) {
162+
if err := os.MkdirAll(filepath.Dir(casName), 0700); err != nil {
163+
return err
164+
}
165+
// Avoiding O_TRUNC here is merely an optimization to avoid
166+
// cache misses when two threads race to write the same file.
167+
if err := writeFileNoTrunc(casName, value, 0666); err != nil {
168+
os.Remove(casName) // ignore error
169+
return err // e.g. disk full
170+
}
171+
}
172+
173+
// Now write an index entry that refers to the CAS file.
174+
indexName, err := filename(kind, key)
175+
if err != nil {
153176
return err
154177
}
178+
if err := os.MkdirAll(filepath.Dir(indexName), 0700); err != nil {
179+
return err
180+
}
181+
if err := writeFileNoTrunc(indexName, hash[:], 0666); err != nil {
182+
os.Remove(indexName) // ignore error
183+
return err // e.g. disk full
184+
}
155185

156-
// In the unlikely event of a short write (e.g. ENOSPC)
157-
// followed by process termination (e.g. a power cut), we
158-
// don't want a reader to see a short file, so we record
159-
// the expected length first and verify it in Get.
160-
var length [8]byte
161-
binary.LittleEndian.PutUint64(length[:], uint64(len(value)))
162-
163-
// Occasional file corruption (presence of zero bytes in JSON
164-
// files) has been reported on macOS (see issue #59289),
165-
// assumed due to a nonatomicity problem in the file system.
166-
// Ideally the macOS kernel would be fixed, or lockedfile
167-
// would implement a workaround (since its job is to provide
168-
// reliable the mutual exclusion primitive that allows
169-
// cooperating gopls processes to implement transactional
170-
// file replacement), but for now we add an extra integrity
171-
// check: a 32-bit checksum at the end.
172-
var checksum [4]byte
173-
binary.LittleEndian.PutUint32(checksum[:], crc32.ChecksumIEEE(value))
174-
175-
// Windows doesn't support atomic rename--we tried MoveFile,
176-
// MoveFileEx, ReplaceFileEx, and SetFileInformationByHandle
177-
// of RenameFileInfo, all to no avail--so instead we use
178-
// advisory file locking, which is only about 2x slower even
179-
// on POSIX platforms with atomic rename.
180-
return lockedfile.Write(name, io.MultiReader(
181-
bytes.NewReader(length[:]),
182-
bytes.NewReader(value),
183-
bytes.NewReader(checksum[:])),
184-
0600)
186+
return nil
185187
}
186188

189+
// writeFileNoTrunc is like os.WriteFile but doesn't truncate until
190+
// after the write, so that racing writes of the same data are idempotent.
191+
func writeFileNoTrunc(filename string, data []byte, perm os.FileMode) error {
192+
f, err := os.OpenFile(filename, os.O_WRONLY|os.O_CREATE, perm)
193+
if err != nil {
194+
return err
195+
}
196+
_, err = f.Write(data)
197+
if err == nil {
198+
err = f.Truncate(int64(len(data)))
199+
}
200+
if closeErr := f.Close(); err == nil {
201+
err = closeErr
202+
}
203+
return err
204+
}
205+
206+
const casKind = "cas"
207+
187208
var iolimit = make(chan struct{}, 128) // counting semaphore to limit I/O concurrency in Set.
188209

189210
var budget int64 = 1e9 // 1GB
@@ -204,9 +225,9 @@ func SetBudget(new int64) (old int64) {
204225

205226
// --- implementation ----
206227

207-
// filename returns the cache entry of the specified kind and key.
228+
// filename returns the name of the cache file of the specified kind and key.
208229
//
209-
// A typical cache entry is a file name such as:
230+
// A typical cache file has a name such as:
210231
//
211232
// $HOME/Library/Caches / gopls / VVVVVVVV / kind / KK / KKKK...KKKK
212233
//
@@ -218,8 +239,33 @@ func SetBudget(new int64) (old int64) {
218239
// - The first 8 bits of the key, to avoid huge directories.
219240
// - The full 256 bits of the key.
220241
//
221-
// Once a file is written its contents are never modified, though it
222-
// may be atomically replaced or removed.
242+
// Previous iterations of the design aimed for the invariant that once
243+
// a file is written, its contents are never modified, though it may
244+
// be atomically replaced or removed. However, not all platforms have
245+
// an atomic rename operation (our first approach), and file locking
246+
// (our second) is a notoriously fickle mechanism.
247+
//
248+
// The current design instead exploits a trick from the cache
249+
// implementation used by the go command: writes of small files are in
250+
// practice atomic (all or nothing) on all platforms.
251+
// (See GOROOT/src/cmd/go/internal/cache/cache.go.)
252+
//
253+
// We use a two-level scheme consisting of an index and a
254+
// content-addressable store (CAS). A single cache entry consists of
255+
// two files. The value of a cache entry is written into the file at
256+
// filename("cas", sha256(value)). Since the value may be arbitrarily
257+
// large, this write is not atomic. That means we must check the
258+
// integrity of the contents read back from the CAS to make sure they
259+
// hash to the expected key. If the CAS file is incomplete or
260+
// inconsistent, we proceed as if it were missing.
261+
//
262+
// Once the CAS file has been written, we write a small fixed-size
263+
// index file at filename(kind, key), using the values supplied by the
264+
// caller. The index file contains the hash that identifies the value
265+
// file in the CAS. (We could add a small amount of extra metadata to
266+
// this file if later desired.) Because the index file is small,
267+
// concurrent writes to it are atomic in practice, even though this is
268+
// not guaranteed by any OS.
223269
//
224270
// New versions of gopls are free to reorganize the contents of the
225271
// version directory as needs evolve. But all versions of gopls must
@@ -229,6 +275,9 @@ func SetBudget(new int64) (old int64) {
229275
// the entire gopls directory so that newer binaries can clean up
230276
// after older ones: in the development cycle especially, new
231277
// new versions may be created frequently.
278+
279+
// TODO(adonovan): opt: use "VVVVVVVV / KK / KKKK...KKKK-kind" to
280+
// avoid creating 256 directories per distinct kind (+ cas).
232281
func filename(kind string, key [32]byte) (string, error) {
233282
hex := fmt.Sprintf("%x", key)
234283
dir, err := getCacheDir()

0 commit comments

Comments
 (0)