Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit f46585c

Browse files
committed Nov 27, 2022
git-annex: create modules/annex (#21)
This moves the `annexObjectPath()` helper out of the tests and into a dedicated sub-package as `annex.ContentLocation()`, and expands it with `.Pointer()` (which validates using `git annex examinekey`), `.IsAnnexed()` and `.Content()` to make it a more useful module. The tests retain their own wrapper version of `ContentLocation()` because I tried to stay close to the API that modules/lfs uses, which is written in terms of abstract `git.Blob` and `git.TreeEntry` objects, not in terms of the `repoPath string`s that are more convenient for the tests.
1 parent fbe8189 commit f46585c

File tree

3 files changed

+188
-17
lines changed

3 files changed

+188
-17
lines changed
 

‎modules/annex/annex.go

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
// Copyright 2021 The Gitea Authors. All rights reserved.
2+
// Use of this source code is governed by a MIT-style
3+
// license that can be found in the LICENSE file.
4+
5+
// Unlike modules/lfs, which operates mainly on git.Blobs, this operates on git.TreeEntrys.
6+
// The motivation for this is that TreeEntrys have an easy pointer to the on-disk repo path,
7+
// while blobs do not (in fact, if building with TAGS=gogit, blobs might exist only in a mock
8+
// filesystem, living only in process RAM). We must have the on-disk path to do anything
9+
// useful with git-annex because all of its interesting data is on-disk under .git/annex/.
10+
11+
package annex
12+
13+
import (
14+
"errors"
15+
"fmt"
16+
"os"
17+
"path"
18+
"strings"
19+
20+
"code.gitea.io/gitea/modules/git"
21+
"code.gitea.io/gitea/modules/setting"
22+
"code.gitea.io/gitea/modules/util"
23+
)
24+
25+
const (
26+
// > The maximum size of a pointer file is 32 kb.
27+
// - https://git-annex.branchable.com/internals/pointer_file/
28+
// It's unclear if that's kilobytes or kibibytes; assuming kibibytes:
29+
blobSizeCutoff = 32 * 1024
30+
)
31+
32+
var (
33+
// ErrInvalidPointer occurs if the content is not a valid git-annex pointer
34+
ErrInvalidPointer = errors.New("Not a git-annex pointer")
35+
)
36+
37+
// Gets the content of the blob as raw text, up to n bytes.
38+
// (the pre-existing blob.GetBlobContent() has a hardcoded 1024-byte limit)
39+
func getBlobContent(b *git.Blob, n int) (string, error) {
40+
dataRc, err := b.DataAsync()
41+
if err != nil {
42+
return "", err
43+
}
44+
defer dataRc.Close()
45+
buf := make([]byte, n)
46+
n, _ = util.ReadAtMost(dataRc, buf)
47+
buf = buf[:n]
48+
return string(buf), nil
49+
}
50+
51+
func Pointer(te *git.TreeEntry) (string, error) {
52+
blob := te.Blob()
53+
54+
// git-annex doesn't seem fully spec what its pointer are, but
55+
// the fullest description is here:
56+
// https://git-annex.branchable.com/internals/pointer_file/
57+
58+
// a pointer can be:
59+
// the original format, generated by `git annex add`: a symlink to '.git/annex/objects/$HASHDIR/$HASHDIR2/$KEY/$KEY'
60+
// the newer, git-lfs influenced, format, generated by `git annex smudge`: a text file containing '/annex/objects/$KEY'
61+
//
62+
// in either case we can extract the $KEY the same way, and we need not actually know if it's a symlink or not because
63+
// git.Blob.DataAsync() works like open() + readlink(), handling both cases in one.
64+
65+
if blob.Size() > blobSizeCutoff {
66+
// > The maximum size of a pointer file is 32 kb. If it is any longer, it is not considered to be a valid pointer file.
67+
// https://git-annex.branchable.com/internals/pointer_file/
68+
69+
// It's unclear to me whether the same size limit applies to symlink-pointers, but it seems sensible to limit them too.
70+
return "", ErrInvalidPointer
71+
}
72+
73+
//if pointer, err := getBlobContent(blob, blobSizeCutoff); err != nil {
74+
pointer, err := getBlobContent(blob, blobSizeCutoff)
75+
if err != nil {
76+
return "", fmt.Errorf("error reading %s: %w", blob.Name(), err)
77+
}
78+
79+
// the spec says a pointer file can contain multiple lines each with a pointer in them
80+
// but that makes no sense to me, so I'm just ignoring all but the first
81+
lines := strings.Split(pointer, "\n")
82+
if len(lines) < 1 {
83+
return "", ErrInvalidPointer
84+
}
85+
pointer = lines[0]
86+
87+
// in both the symlink and pointer-file formats, the pointer must have "/annex/" somewhere in it
88+
if !strings.Contains(pointer, "/annex/") {
89+
return "", ErrInvalidPointer
90+
}
91+
92+
// extract $KEY
93+
pointer = path.Base(strings.TrimSpace(pointer))
94+
95+
// ask git-annex's opinion on $KEY
96+
// XXX: this is probably a bit slow, especially if this operation gets run often
97+
// and examinekey is not that strict:
98+
// - it doesn't enforce that the "BACKEND" tag is one it knows,
99+
// - it doesn't enforce that the fields and their format fit the "BACKEND" tag
100+
// so maybe this is a wasteful step
101+
_, examineStderr, err := git.NewCommandNoGlobals("annex", "examinekey", git.CmdArg(pointer)).RunStdString(&git.RunOpts{Dir: te.Repo().Path})
102+
if err != nil {
103+
// TODO: make ErrInvalidPointer into a type capable of wrapping err
104+
if strings.TrimSpace(examineStderr) == "git-annex: bad key" {
105+
return "", ErrInvalidPointer
106+
}
107+
return "", err
108+
}
109+
110+
return pointer, nil
111+
}
112+
113+
// return the absolute path of the content pointed to by the annex pointer stored in the git object
114+
// errors if the content is not found in this repo
115+
func ContentLocation(te *git.TreeEntry) (string, error) {
116+
repoPath := te.Repo().Path
117+
118+
pointer, err := Pointer(te)
119+
if err != nil {
120+
return "", err
121+
}
122+
123+
contentLocation, _, err := git.NewCommandNoGlobals("annex", "contentlocation", git.CmdArg(pointer)).RunStdString(&git.RunOpts{Dir: repoPath})
124+
if err != nil {
125+
return "", fmt.Errorf("in %s: %s does not seem to be a valid annexed file: %w", repoPath, pointer, err)
126+
}
127+
contentLocation = strings.TrimSpace(contentLocation)
128+
contentLocation = path.Clean("/" + contentLocation)[1:] // prevent directory traversals
129+
contentLocation = path.Join(repoPath, contentLocation)
130+
131+
return contentLocation, nil
132+
}
133+
134+
// returns a stream open to the annex content
135+
func Content(te *git.TreeEntry) (*os.File, error) {
136+
contentLocation, err := ContentLocation(te)
137+
if err != nil {
138+
return nil, err
139+
}
140+
141+
return os.Open(contentLocation)
142+
}
143+
144+
// whether the object appears to be a valid annex pointer
145+
// does *not* verify if the content is actually in this repo;
146+
// for that, use ContentLocation()
147+
func IsAnnexed(te *git.TreeEntry) (bool, error) {
148+
if !setting.Annex.Enabled {
149+
return false, nil
150+
}
151+
152+
// Pointer() is written to only return well-formed pointers
153+
// so the test is just to see if it errors
154+
_, err := Pointer(te)
155+
if err != nil {
156+
if errors.Is(err, ErrInvalidPointer) {
157+
return false, nil
158+
}
159+
return false, err
160+
}
161+
return true, nil
162+
}

‎modules/git/tree_entry.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,10 @@ import (
1111
"strings"
1212
)
1313

14+
func (te *TreeEntry) Repo() *Repository {
15+
return te.ptree.repo
16+
}
17+
1418
// Type returns the type of the entry (commit, tree, blob)
1519
func (te *TreeEntry) Type() string {
1620
switch te.Mode() {

‎tests/integration/git_annex_test.go

Lines changed: 22 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020

2121
"code.gitea.io/gitea/models/perm"
2222
repo_model "code.gitea.io/gitea/models/repo"
23+
"code.gitea.io/gitea/modules/annex"
2324
"code.gitea.io/gitea/modules/git"
2425
"code.gitea.io/gitea/modules/setting"
2526
api "code.gitea.io/gitea/modules/structs"
@@ -781,13 +782,13 @@ func doAnnexDownloadTest(remoteRepoPath string, repoPath string) (err error) {
781782
}
782783

783784
// verify the file was downloaded
784-
localObjectPath, err := annexObjectPath(repoPath, "large.bin")
785+
localObjectPath, err := contentLocation(repoPath, "large.bin")
785786
if err != nil {
786787
return err
787788
}
788789
//localObjectPath := path.Join(repoPath, "large.bin") // or, just compare against the checked-out file
789790

790-
remoteObjectPath, err := annexObjectPath(remoteRepoPath, "large.bin")
791+
remoteObjectPath, err := contentLocation(remoteRepoPath, "large.bin")
791792
if err != nil {
792793
return err
793794
}
@@ -834,13 +835,13 @@ func doAnnexUploadTest(remoteRepoPath string, repoPath string) (err error) {
834835
}
835836

836837
// verify the file was uploaded
837-
localObjectPath, err := annexObjectPath(repoPath, "contribution.bin")
838+
localObjectPath, err := contentLocation(repoPath, "contribution.bin")
838839
if err != nil {
839840
return err
840841
}
841842
//localObjectPath := path.Join(repoPath, "contribution.bin") // or, just compare against the checked-out file
842843

843-
remoteObjectPath, err := annexObjectPath(remoteRepoPath, "contribution.bin")
844+
remoteObjectPath, err := contentLocation(remoteRepoPath, "contribution.bin")
844845
if err != nil {
845846
return err
846847
}
@@ -999,26 +1000,30 @@ func doInitRemoteAnnexRepository(t *testing.T, repoURL *url.URL) error {
9991000
10001001
TODO: pass a parameter to allow examining non-HEAD branches
10011002
*/
1002-
func annexObjectPath(repoPath string, file string) (string, error) {
1003-
// NB: `git annex lookupkey` is more reliable, but doesn't work in bare repos.
1004-
annexKey, _, err := git.NewCommandNoGlobals("show", git.CmdArg("HEAD:"+file)).RunStdString(&git.RunOpts{Dir: repoPath})
1003+
func contentLocation(repoPath string, file string) (path string, err error) {
1004+
path = ""
1005+
1006+
repo, err := git.OpenRepository(git.DefaultContext, repoPath)
1007+
if err != nil {
1008+
return
1009+
}
1010+
1011+
commitID, err := repo.GetRefCommitID("HEAD") // NB: to examine a *branch*, prefix with "refs/branch/", or call repo.GetBranchCommitID(); ditto for tags
10051012
if err != nil {
1006-
return "", fmt.Errorf("in %s: %w", repoPath, err) // the error from git prints the filename but not repo
1013+
return
10071014
}
10081015

1009-
// There are two formats an annexed file pointer might be:
1010-
// * a symlink to .git/annex/objects/$HASHDIR/$ANNEX_KEY/$ANNEX_KEY - used by files created with 'git annex add'
1011-
// * a text file containing /annex/objects/$ANNEX_KEY - used by files for which 'git add' was configured to run git-annex-smudge
1012-
// This recovers $ANNEX_KEY from either case:
1013-
annexKey = path.Base(strings.TrimSpace(annexKey))
1016+
commit, err := repo.GetCommit(commitID)
1017+
if err != nil {
1018+
return
1019+
}
10141020

1015-
contentPath, _, err := git.NewCommandNoGlobals("annex", "contentlocation", git.CmdArg(annexKey)).RunStdString(&git.RunOpts{Dir: repoPath})
1021+
treeEntry, err := commit.GetTreeEntryByPath(file)
10161022
if err != nil {
1017-
return "", fmt.Errorf("in %s: %s does not seem to be annexed: %w", repoPath, file, err)
1023+
return
10181024
}
1019-
contentPath = strings.TrimSpace(contentPath)
10201025

1021-
return path.Join(repoPath, contentPath), nil
1026+
return annex.ContentLocation(treeEntry)
10221027
}
10231028

10241029
/* like withKeyFile(), but automatically sets it the account given in ctx for use by git-annex */

0 commit comments

Comments
 (0)
Please sign in to comment.