|
| 1 | +// Copyright 2022 The Gitea Authors. All rights reserved. |
| 2 | +// SPDX-License-Identifier: MIT |
| 3 | + |
| 4 | +// Unlike modules/lfs, which operates mainly on the content of git.Blobs, this package also needs |
| 5 | +// the on-disk path of the repo each blob belongs to (read below via blob.Repo().Path). |
| 6 | +// Blob content alone is not enough (in fact, if building with TAGS=gogit, blobs might exist only in a mock |
| 7 | +// filesystem, living only in process RAM). We must have the on-disk path to do anything |
| 8 | +// useful with git-annex because all of its interesting data is on-disk under .git/annex/. |
| 9 | + |
| 10 | +package annex |
| 11 | + |
| 12 | +import ( |
| 13 | + "errors" |
| 14 | + "fmt" |
| 15 | + "os" |
| 16 | + "path" |
| 17 | + "strings" |
| 18 | + |
| 19 | + "code.gitea.io/gitea/modules/git" |
| 20 | + "code.gitea.io/gitea/modules/setting" |
| 21 | + "code.gitea.io/gitea/modules/util" |
| 22 | +) |
| 23 | + |
// blobSizeCutoff is the largest blob size, in bytes, that is still treated as a
// candidate git-annex pointer.
//
// > The maximum size of a pointer file is 32 kb.
// - https://git-annex.branchable.com/internals/pointer_file/
//
// The spec does not say whether that means kilobytes or kibibytes; kibibytes
// are assumed here.
const blobSizeCutoff = 32 * 1024
| 30 | + |
// ErrInvalidPointer is the sentinel error returned when a blob's content does
// not parse as a git-annex pointer. Compare against it with errors.Is.
var ErrInvalidPointer = errors.New("Not a git-annex pointer")
| 33 | + |
| 34 | +// Gets the content of the blob as raw text, up to n bytes. |
| 35 | +// (the pre-existing blob.GetBlobContent() has a hardcoded 1024-byte limit) |
| 36 | +func getBlobContent(b *git.Blob, n int) (string, error) { |
| 37 | + dataRc, err := b.DataAsync() |
| 38 | + if err != nil { |
| 39 | + return "", err |
| 40 | + } |
| 41 | + defer dataRc.Close() |
| 42 | + buf := make([]byte, n) |
| 43 | + n, _ = util.ReadAtMost(dataRc, buf) |
| 44 | + buf = buf[:n] |
| 45 | + return string(buf), nil |
| 46 | +} |
| 47 | + |
| 48 | +func Pointer(blob *git.Blob) (string, error) { |
| 49 | + // git-annex doesn't seem fully spec what its pointer are, but |
| 50 | + // the fullest description is here: |
| 51 | + // https://git-annex.branchable.com/internals/pointer_file/ |
| 52 | + |
| 53 | + // a pointer can be: |
| 54 | + // the original format, generated by `git annex add`: a symlink to '.git/annex/objects/$HASHDIR/$HASHDIR2/$KEY/$KEY' |
| 55 | + // the newer, git-lfs influenced, format, generated by `git annex smudge`: a text file containing '/annex/objects/$KEY' |
| 56 | + // |
| 57 | + // in either case we can extract the $KEY the same way, and we need not actually know if it's a symlink or not because |
| 58 | + // git.Blob.DataAsync() works like open() + readlink(), handling both cases in one. |
| 59 | + |
| 60 | + if blob.Size() > blobSizeCutoff { |
| 61 | + // > The maximum size of a pointer file is 32 kb. If it is any longer, it is not considered to be a valid pointer file. |
| 62 | + // https://git-annex.branchable.com/internals/pointer_file/ |
| 63 | + |
| 64 | + // It's unclear to me whether the same size limit applies to symlink-pointers, but it seems sensible to limit them too. |
| 65 | + return "", ErrInvalidPointer |
| 66 | + } |
| 67 | + |
| 68 | + pointer, err := getBlobContent(blob, blobSizeCutoff) |
| 69 | + if err != nil { |
| 70 | + return "", fmt.Errorf("error reading %s: %w", blob.Name(), err) |
| 71 | + } |
| 72 | + |
| 73 | + // the spec says a pointer file can contain multiple lines each with a pointer in them |
| 74 | + // but that makes no sense to me, so I'm just ignoring all but the first |
| 75 | + lines := strings.Split(pointer, "\n") |
| 76 | + if len(lines) < 1 { |
| 77 | + return "", ErrInvalidPointer |
| 78 | + } |
| 79 | + pointer = lines[0] |
| 80 | + |
| 81 | + // in both the symlink and pointer-file formats, the pointer must have "/annex/" somewhere in it |
| 82 | + if !strings.Contains(pointer, "/annex/") { |
| 83 | + return "", ErrInvalidPointer |
| 84 | + } |
| 85 | + |
| 86 | + // extract $KEY |
| 87 | + pointer = path.Base(strings.TrimSpace(pointer)) |
| 88 | + |
| 89 | + // ask git-annex's opinion on $KEY |
| 90 | + // XXX: this is probably a bit slow, especially if this operation gets run often |
| 91 | + // and examinekey is not that strict: |
| 92 | + // - it doesn't enforce that the "BACKEND" tag is one it knows, |
| 93 | + // - it doesn't enforce that the fields and their format fit the "BACKEND" tag |
| 94 | + // so maybe this is a wasteful step |
| 95 | + _, examineStderr, err := git.NewCommandContextNoGlobals(git.DefaultContext, "annex", "examinekey").AddDynamicArguments(pointer).RunStdString(&git.RunOpts{Dir: blob.Repo().Path}) |
| 96 | + if err != nil { |
| 97 | + // TODO: make ErrInvalidPointer into a type capable of wrapping err |
| 98 | + if strings.TrimSpace(examineStderr) == "git-annex: bad key" { |
| 99 | + return "", ErrInvalidPointer |
| 100 | + } |
| 101 | + return "", err |
| 102 | + } |
| 103 | + |
| 104 | + return pointer, nil |
| 105 | +} |
| 106 | + |
| 107 | +// return the absolute path of the content pointed to by the annex pointer stored in the git object |
| 108 | +// errors if the content is not found in this repo |
| 109 | +func ContentLocation(blob *git.Blob) (string, error) { |
| 110 | + pointer, err := Pointer(blob) |
| 111 | + if err != nil { |
| 112 | + return "", err |
| 113 | + } |
| 114 | + |
| 115 | + contentLocation, _, err := git.NewCommandContextNoGlobals(git.DefaultContext, "annex", "contentlocation").AddDynamicArguments(pointer).RunStdString(&git.RunOpts{Dir: blob.Repo().Path}) |
| 116 | + if err != nil { |
| 117 | + return "", fmt.Errorf("in %s: %s does not seem to be a valid annexed file: %w", blob.Repo().Path, pointer, err) |
| 118 | + } |
| 119 | + contentLocation = strings.TrimSpace(contentLocation) |
| 120 | + contentLocation = path.Clean("/" + contentLocation)[1:] // prevent directory traversals |
| 121 | + contentLocation = path.Join(blob.Repo().Path, contentLocation) |
| 122 | + |
| 123 | + return contentLocation, nil |
| 124 | +} |
| 125 | + |
| 126 | +// returns a stream open to the annex content |
| 127 | +func Content(blob *git.Blob) (*os.File, error) { |
| 128 | + contentLocation, err := ContentLocation(blob) |
| 129 | + if err != nil { |
| 130 | + return nil, err |
| 131 | + } |
| 132 | + |
| 133 | + return os.Open(contentLocation) |
| 134 | +} |
| 135 | + |
| 136 | +// whether the object appears to be a valid annex pointer |
| 137 | +// does *not* verify if the content is actually in this repo; |
| 138 | +// for that, use ContentLocation() |
| 139 | +func IsAnnexed(blob *git.Blob) (bool, error) { |
| 140 | + if !setting.Annex.Enabled { |
| 141 | + return false, nil |
| 142 | + } |
| 143 | + |
| 144 | + // Pointer() is written to only return well-formed pointers |
| 145 | + // so the test is just to see if it errors |
| 146 | + _, err := Pointer(blob) |
| 147 | + if err != nil { |
| 148 | + if errors.Is(err, ErrInvalidPointer) { |
| 149 | + return false, nil |
| 150 | + } |
| 151 | + return false, err |
| 152 | + } |
| 153 | + return true, nil |
| 154 | +} |
0 commit comments