Skip to content

Commit 60da8b5

Browse files
kousuactions-user
authored and committed
git-annex: create modules/annex (#21)
This moves the `annexObjectPath()` helper out of the tests and into a dedicated sub-package as `annex.ContentLocation()`, and expands it with `.Pointer()` (which validates using `git annex examinekey`), `.IsAnnexed()` and `.Content()` to make it a more useful module. The tests retain their own wrapper version of `ContentLocation()` because I tried to closely follow the API that modules/lfs uses, which is expressed in terms of abstract `git.Blob` and `git.TreeEntry` objects, not in terms of `repoPath string`s, which are more convenient for the tests.
1 parent 48d8144 commit 60da8b5

File tree

6 files changed

+184
-18
lines changed

6 files changed

+184
-18
lines changed

modules/annex/annex.go

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
// Copyright 2022 The Gitea Authors. All rights reserved.
2+
// SPDX-License-Identifier: MIT
3+
4+
// Unlike modules/lfs, which operates mainly on git.Blobs, this operates on git.TreeEntrys.
5+
// The motivation for this is that TreeEntrys have an easy pointer to the on-disk repo path,
6+
// while blobs do not (in fact, if building with TAGS=gogit, blobs might exist only in a mock
7+
// filesystem, living only in process RAM). We must have the on-disk path to do anything
8+
// useful with git-annex because all of its interesting data is on-disk under .git/annex/.
9+
10+
package annex
11+
12+
import (
13+
"errors"
14+
"fmt"
15+
"os"
16+
"path"
17+
"strings"
18+
19+
"code.gitea.io/gitea/modules/git"
20+
"code.gitea.io/gitea/modules/setting"
21+
"code.gitea.io/gitea/modules/util"
22+
)
23+
24+
const (
25+
// > The maximum size of a pointer file is 32 kb.
26+
// - https://git-annex.branchable.com/internals/pointer_file/
27+
// It's unclear if that's kilobytes or kibibytes; assuming kibibytes:
28+
blobSizeCutoff = 32 * 1024
29+
)
30+
31+
// ErrInvalidPointer occurs if the pointer's value doesn't parse
32+
var ErrInvalidPointer = errors.New("Not a git-annex pointer")
33+
34+
// Gets the content of the blob as raw text, up to n bytes.
35+
// (the pre-existing blob.GetBlobContent() has a hardcoded 1024-byte limit)
36+
func getBlobContent(b *git.Blob, n int) (string, error) {
37+
dataRc, err := b.DataAsync()
38+
if err != nil {
39+
return "", err
40+
}
41+
defer dataRc.Close()
42+
buf := make([]byte, n)
43+
n, _ = util.ReadAtMost(dataRc, buf)
44+
buf = buf[:n]
45+
return string(buf), nil
46+
}
47+
48+
func Pointer(blob *git.Blob) (string, error) {
49+
// git-annex doesn't seem fully spec what its pointer are, but
50+
// the fullest description is here:
51+
// https://git-annex.branchable.com/internals/pointer_file/
52+
53+
// a pointer can be:
54+
// the original format, generated by `git annex add`: a symlink to '.git/annex/objects/$HASHDIR/$HASHDIR2/$KEY/$KEY'
55+
// the newer, git-lfs influenced, format, generated by `git annex smudge`: a text file containing '/annex/objects/$KEY'
56+
//
57+
// in either case we can extract the $KEY the same way, and we need not actually know if it's a symlink or not because
58+
// git.Blob.DataAsync() works like open() + readlink(), handling both cases in one.
59+
60+
if blob.Size() > blobSizeCutoff {
61+
// > The maximum size of a pointer file is 32 kb. If it is any longer, it is not considered to be a valid pointer file.
62+
// https://git-annex.branchable.com/internals/pointer_file/
63+
64+
// It's unclear to me whether the same size limit applies to symlink-pointers, but it seems sensible to limit them too.
65+
return "", ErrInvalidPointer
66+
}
67+
68+
pointer, err := getBlobContent(blob, blobSizeCutoff)
69+
if err != nil {
70+
return "", fmt.Errorf("error reading %s: %w", blob.Name(), err)
71+
}
72+
73+
// the spec says a pointer file can contain multiple lines each with a pointer in them
74+
// but that makes no sense to me, so I'm just ignoring all but the first
75+
lines := strings.Split(pointer, "\n")
76+
if len(lines) < 1 {
77+
return "", ErrInvalidPointer
78+
}
79+
pointer = lines[0]
80+
81+
// in both the symlink and pointer-file formats, the pointer must have "/annex/" somewhere in it
82+
if !strings.Contains(pointer, "/annex/") {
83+
return "", ErrInvalidPointer
84+
}
85+
86+
// extract $KEY
87+
pointer = path.Base(strings.TrimSpace(pointer))
88+
89+
// ask git-annex's opinion on $KEY
90+
// XXX: this is probably a bit slow, especially if this operation gets run often
91+
// and examinekey is not that strict:
92+
// - it doesn't enforce that the "BACKEND" tag is one it knows,
93+
// - it doesn't enforce that the fields and their format fit the "BACKEND" tag
94+
// so maybe this is a wasteful step
95+
_, examineStderr, err := git.NewCommandContextNoGlobals(git.DefaultContext, "annex", "examinekey").AddDynamicArguments(pointer).RunStdString(&git.RunOpts{Dir: blob.Repo().Path})
96+
if err != nil {
97+
// TODO: make ErrInvalidPointer into a type capable of wrapping err
98+
if strings.TrimSpace(examineStderr) == "git-annex: bad key" {
99+
return "", ErrInvalidPointer
100+
}
101+
return "", err
102+
}
103+
104+
return pointer, nil
105+
}
106+
107+
// return the absolute path of the content pointed to by the annex pointer stored in the git object
108+
// errors if the content is not found in this repo
109+
func ContentLocation(blob *git.Blob) (string, error) {
110+
pointer, err := Pointer(blob)
111+
if err != nil {
112+
return "", err
113+
}
114+
115+
contentLocation, _, err := git.NewCommandContextNoGlobals(git.DefaultContext, "annex", "contentlocation").AddDynamicArguments(pointer).RunStdString(&git.RunOpts{Dir: blob.Repo().Path})
116+
if err != nil {
117+
return "", fmt.Errorf("in %s: %s does not seem to be a valid annexed file: %w", blob.Repo().Path, pointer, err)
118+
}
119+
contentLocation = strings.TrimSpace(contentLocation)
120+
contentLocation = path.Clean("/" + contentLocation)[1:] // prevent directory traversals
121+
contentLocation = path.Join(blob.Repo().Path, contentLocation)
122+
123+
return contentLocation, nil
124+
}
125+
126+
// returns a stream open to the annex content
127+
func Content(blob *git.Blob) (*os.File, error) {
128+
contentLocation, err := ContentLocation(blob)
129+
if err != nil {
130+
return nil, err
131+
}
132+
133+
return os.Open(contentLocation)
134+
}
135+
136+
// whether the object appears to be a valid annex pointer
137+
// does *not* verify if the content is actually in this repo;
138+
// for that, use ContentLocation()
139+
func IsAnnexed(blob *git.Blob) (bool, error) {
140+
if !setting.Annex.Enabled {
141+
return false, nil
142+
}
143+
144+
// Pointer() is written to only return well-formed pointers
145+
// so the test is just to see if it errors
146+
_, err := Pointer(blob)
147+
if err != nil {
148+
if errors.Is(err, ErrInvalidPointer) {
149+
return false, nil
150+
}
151+
return false, err
152+
}
153+
return true, nil
154+
}

modules/git/blob.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,10 @@ import (
1515

1616
// This file contains common functions between the gogit and !gogit variants for git Blobs
1717

18+
// Repo returns the repository stored in the blob's repo field.
func (b *Blob) Repo() *Repository {
19+
return b.repo
20+
}
21+
1822
// Name returns name of the tree entry this blob object was created from (or empty string)
1923
func (b *Blob) Name() string {
2024
return b.name

modules/git/blob_gogit.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@ import (
1414

1515
// Blob represents a Git object.
1616
type Blob struct {
17-
ID SHA1
17+
ID SHA1
18+
repo *Repository
1819

1920
gogitEncodedObj plumbing.EncodedObject
2021
name string

modules/git/repo_blob_gogit.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ func (repo *Repository) getBlob(id SHA1) (*Blob, error) {
1717

1818
return &Blob{
1919
ID: id,
20+
repo: repo,
2021
gogitEncodedObj: encodedObj,
2122
}, nil
2223
}

modules/git/tree_entry_gogit.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ func (te *TreeEntry) Blob() *Blob {
8989

9090
return &Blob{
9191
ID: te.gogitTreeEntry.Hash,
92+
repo: te.ptree.repo,
9293
gogitEncodedObj: encodedObj,
9394
name: te.Name(),
9495
}

tests/integration/git_annex_test.go

Lines changed: 22 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ import (
1919
"code.gitea.io/gitea/models/db"
2020
"code.gitea.io/gitea/models/perm"
2121
repo_model "code.gitea.io/gitea/models/repo"
22+
"code.gitea.io/gitea/modules/annex"
2223
"code.gitea.io/gitea/modules/git"
2324
"code.gitea.io/gitea/modules/setting"
2425
api "code.gitea.io/gitea/modules/structs"
@@ -788,13 +789,13 @@ func doAnnexDownloadTest(remoteRepoPath, repoPath string) (err error) {
788789
}
789790

790791
// verify the file was downloaded
791-
localObjectPath, err := annexObjectPath(repoPath, "large.bin")
792+
localObjectPath, err := contentLocation(repoPath, "large.bin")
792793
if err != nil {
793794
return err
794795
}
795796
// localObjectPath := path.Join(repoPath, "large.bin") // or, just compare against the checked-out file
796797

797-
remoteObjectPath, err := annexObjectPath(remoteRepoPath, "large.bin")
798+
remoteObjectPath, err := contentLocation(remoteRepoPath, "large.bin")
798799
if err != nil {
799800
return err
800801
}
@@ -841,13 +842,13 @@ func doAnnexUploadTest(remoteRepoPath, repoPath string) (err error) {
841842
}
842843

843844
// verify the file was uploaded
844-
localObjectPath, err := annexObjectPath(repoPath, "contribution.bin")
845+
localObjectPath, err := contentLocation(repoPath, "contribution.bin")
845846
if err != nil {
846847
return err
847848
}
848849
// localObjectPath := path.Join(repoPath, "contribution.bin") // or, just compare against the checked-out file
849850

850-
remoteObjectPath, err := annexObjectPath(remoteRepoPath, "contribution.bin")
851+
remoteObjectPath, err := contentLocation(remoteRepoPath, "contribution.bin")
851852
if err != nil {
852853
return err
853854
}
@@ -1001,26 +1002,30 @@ Find the path in .git/annex/objects/ of the contents for a given annexed file.
10011002
10021003
TODO: pass a parameter to allow examining non-HEAD branches
10031004
*/
1004-
func annexObjectPath(repoPath, file string) (string, error) {
1005-
// NB: `git annex lookupkey` is more reliable, but doesn't work in bare repos.
1006-
annexKey, _, err := git.NewCommandContextNoGlobals(git.DefaultContext, "show").AddDynamicArguments("HEAD:" + file).RunStdString(&git.RunOpts{Dir: repoPath})
1005+
func contentLocation(repoPath, file string) (path string, err error) {
1006+
path = ""
1007+
1008+
repo, err := git.OpenRepository(git.DefaultContext, repoPath)
1009+
if err != nil {
1010+
return path, nil
1011+
}
1012+
1013+
commitID, err := repo.GetRefCommitID("HEAD") // NB: to examine a *branch*, prefix with "refs/branch/", or call repo.GetBranchCommitID(); ditto for tags
10071014
if err != nil {
1008-
return "", fmt.Errorf("in %s: %w", repoPath, err) // the error from git prints the filename but not repo
1015+
return path, nil
10091016
}
10101017

1011-
// There are two formats an annexed file pointer might be:
1012-
// * a symlink to .git/annex/objects/$HASHDIR/$ANNEX_KEY/$ANNEX_KEY - used by files created with 'git annex add'
1013-
// * a text file containing /annex/objects/$ANNEX_KEY - used by files for which 'git add' was configured to run git-annex-smudge
1014-
// This recovers $ANNEX_KEY from either case:
1015-
annexKey = path.Base(strings.TrimSpace(annexKey))
1018+
commit, err := repo.GetCommit(commitID)
1019+
if err != nil {
1020+
return path, nil
1021+
}
10161022

1017-
contentPath, _, err := git.NewCommandContextNoGlobals(git.DefaultContext, "annex", "contentlocation").AddDynamicArguments(annexKey).RunStdString(&git.RunOpts{Dir: repoPath})
1023+
treeEntry, err := commit.GetTreeEntryByPath(file)
10181024
if err != nil {
1019-
return "", fmt.Errorf("in %s: %s does not seem to be annexed: %w", repoPath, file, err)
1025+
return path, nil
10201026
}
1021-
contentPath = strings.TrimSpace(contentPath)
10221027

1023-
return path.Join(repoPath, contentPath), nil
1028+
return annex.ContentLocation(treeEntry.Blob())
10241029
}
10251030

10261031
/* like withKeyFile(), but automatically sets it the account given in ctx for use by git-annex */

0 commit comments

Comments
 (0)