Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

git-annex: create helper functions in modules/annex #21

Merged
merged 3 commits into from
Nov 27, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
162 changes: 162 additions & 0 deletions modules/annex/annex.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
// Copyright 2021 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

// Unlike modules/lfs, which operates mainly on git.Blobs, this operates on git.TreeEntrys.
// The motivation for this is that TreeEntrys have an easy pointer to the on-disk repo path,
// while blobs do not (in fact, if building with TAGS=gogit, blobs might exist only in a mock
// filesystem, living only in process RAM). We must have the on-disk path to do anything
// useful with git-annex because all of its interesting data is on-disk under .git/annex/.

package annex

import (
"errors"
"fmt"
"os"
"path"
"strings"

"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/util"
)

// blobSizeCutoff is the size limit, in bytes, used when deciding whether a blob
// can be a git-annex pointer.
//
// > The maximum size of a pointer file is 32 kb.
// - https://git-annex.branchable.com/internals/pointer_file/
//
// It's unclear if that's kilobytes or kibibytes; kibibytes are assumed here.
const blobSizeCutoff = 32 * 1024

// ErrInvalidPointer is the sentinel error reported when the examined content
// is not a git-annex pointer. Compare against it with errors.Is.
var ErrInvalidPointer = errors.New("Not a git-annex pointer")

// getBlobContent returns the content of blob b as raw text, reading at most n bytes.
// (The pre-existing blob.GetBlobContent() has a hardcoded 1024-byte limit.)
//
// It returns an error if the blob's data stream cannot be opened or if reading
// from it fails; in both cases the returned string is empty.
func getBlobContent(b *git.Blob, n int) (string, error) {
	dataRc, err := b.DataAsync()
	if err != nil {
		return "", err
	}
	defer dataRc.Close()

	buf := make([]byte, n)
	read, err := util.ReadAtMost(dataRc, buf)
	if err != nil {
		// Never hand back partially-read content as though it were the whole blob.
		// NOTE(review): this relies on util.ReadAtMost reporting a short read at
		// end-of-stream as success rather than as an error — confirm.
		return "", err
	}
	return string(buf[:read]), nil
}

// Pointer returns the git-annex key ($KEY) referenced by the tree entry te.
//
// git-annex doesn't seem to fully specify what its pointers are, but the
// fullest description is here:
// https://git-annex.branchable.com/internals/pointer_file/
//
// A pointer can be:
//   - the original format, generated by `git annex add`:
//     a symlink to '.git/annex/objects/$HASHDIR/$HASHDIR2/$KEY/$KEY'
//   - the newer, git-lfs influenced, format, generated by `git annex smudge`:
//     a text file containing '/annex/objects/$KEY'
//
// In either case $KEY is extracted the same way, and there is no need to know
// whether the entry is a symlink or not because git.Blob.DataAsync() works like
// open() + readlink(), handling both cases in one.
//
// It returns ErrInvalidPointer when the blob is larger than blobSizeCutoff,
// when its first line does not contain "/annex/", when the extracted key starts
// with "-", or when `git annex examinekey` reports "bad key". Failures to read
// the blob or to run git-annex are returned as ordinary errors.
func Pointer(te *git.TreeEntry) (string, error) {
	blob := te.Blob()

	if blob.Size() > blobSizeCutoff {
		// > The maximum size of a pointer file is 32 kb. If it is any longer, it is not considered to be a valid pointer file.
		// https://git-annex.branchable.com/internals/pointer_file/

		// It's unclear whether the same size limit applies to symlink-pointers, but it seems sensible to limit them too.
		return "", ErrInvalidPointer
	}

	content, err := getBlobContent(blob, blobSizeCutoff)
	if err != nil {
		return "", fmt.Errorf("error reading %s: %w", blob.Name(), err)
	}

	// The spec says a pointer file can contain multiple lines, each with a
	// pointer in them; only the first line is considered here.
	firstLine := content
	if i := strings.IndexByte(content, '\n'); i >= 0 {
		firstLine = content[:i]
	}

	// in both the symlink and pointer-file formats, the pointer must have "/annex/" somewhere in it
	if !strings.Contains(firstLine, "/annex/") {
		return "", ErrInvalidPointer
	}

	// extract $KEY
	key := path.Base(strings.TrimSpace(firstLine))

	// The key comes from repository content, i.e. it is untrusted, and it is
	// about to be placed on a command line. A key starts with its backend name,
	// so one that starts with "-" can never be valid; reject it here so that it
	// is not interpreted by git-annex as an option (e.g. "--help").
	if strings.HasPrefix(key, "-") {
		return "", ErrInvalidPointer
	}

	// ask git-annex's opinion on $KEY
	// XXX: this is probably a bit slow, especially if this operation gets run often
	// and examinekey is not that strict:
	// - it doesn't enforce that the "BACKEND" tag is one it knows,
	// - it doesn't enforce that the fields and their format fit the "BACKEND" tag
	// so maybe this is a wasteful step
	_, examineStderr, err := git.NewCommandNoGlobals("annex", "examinekey", git.CmdArg(key)).RunStdString(&git.RunOpts{Dir: te.Repo().Path})
	if err != nil {
		// TODO: make ErrInvalidPointer into a type capable of wrapping err
		if strings.TrimSpace(examineStderr) == "git-annex: bad key" {
			return "", ErrInvalidPointer
		}
		return "", err
	}

	return key, nil
}

// ContentLocation returns the on-disk path, joined onto the repository path of
// te, of the content pointed to by the annex pointer stored in the git object.
//
// It errors if te is not a valid pointer (see Pointer) or if
// `git annex contentlocation` fails for the key, e.g. because the content is
// not found in this repo.
func ContentLocation(te *git.TreeEntry) (string, error) {
	repoPath := te.Repo().Path

	key, err := Pointer(te)
	if err != nil {
		return "", err
	}

	cmd := git.NewCommandNoGlobals("annex", "contentlocation", git.CmdArg(key))
	stdout, _, err := cmd.RunStdString(&git.RunOpts{Dir: repoPath})
	if err != nil {
		return "", fmt.Errorf("in %s: %s does not seem to be a valid annexed file: %w", repoPath, key, err)
	}

	// Prevent directory traversals: cleaning the output as if it were rooted at
	// "/" resolves any ".." elements before the leading slash is dropped again.
	relative := path.Clean("/" + strings.TrimSpace(stdout))[1:]

	return path.Join(repoPath, relative), nil
}

// Content opens the annexed content that te points to and returns the open
// file. The location is resolved with ContentLocation, whose errors are
// returned unchanged.
func Content(te *git.TreeEntry) (*os.File, error) {
	location, err := ContentLocation(te)
	if err != nil {
		return nil, err
	}

	file, err := os.Open(location)
	return file, err
}

// IsAnnexed reports whether the object appears to be a valid annex pointer.
// It always reports false when setting.Annex.Enabled is off.
//
// It does *not* verify if the content is actually in this repo;
// for that, use ContentLocation().
func IsAnnexed(te *git.TreeEntry) (bool, error) {
	if !setting.Annex.Enabled {
		return false, nil
	}

	// Pointer() is written to only return well-formed pointers,
	// so the outcome is decided entirely by the error it returns.
	_, err := Pointer(te)
	switch {
	case err == nil:
		return true, nil
	case errors.Is(err, ErrInvalidPointer):
		// simply not an annex pointer; that is an answer, not a failure
		return false, nil
	default:
		return false, err
	}
}
4 changes: 4 additions & 0 deletions modules/git/tree_entry.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ import (
"strings"
)

// Repo returns the Repository recorded on this entry's ptree (te.ptree.repo).
// NOTE(review): ptree is presumably the tree that contains this entry — confirm.
func (te *TreeEntry) Repo() *Repository {
return te.ptree.repo
}

// Type returns the type of the entry (commit, tree, blob)
func (te *TreeEntry) Type() string {
switch te.Mode() {
Expand Down
39 changes: 22 additions & 17 deletions tests/integration/git_annex_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (

"code.gitea.io/gitea/models/perm"
repo_model "code.gitea.io/gitea/models/repo"
"code.gitea.io/gitea/modules/annex"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/setting"
api "code.gitea.io/gitea/modules/structs"
Expand Down Expand Up @@ -781,13 +782,13 @@ func doAnnexDownloadTest(remoteRepoPath string, repoPath string) (err error) {
}

// verify the file was downloaded
localObjectPath, err := annexObjectPath(repoPath, "large.bin")
localObjectPath, err := contentLocation(repoPath, "large.bin")
if err != nil {
return err
}
//localObjectPath := path.Join(repoPath, "large.bin") // or, just compare against the checked-out file

remoteObjectPath, err := annexObjectPath(remoteRepoPath, "large.bin")
remoteObjectPath, err := contentLocation(remoteRepoPath, "large.bin")
if err != nil {
return err
}
Expand Down Expand Up @@ -834,13 +835,13 @@ func doAnnexUploadTest(remoteRepoPath string, repoPath string) (err error) {
}

// verify the file was uploaded
localObjectPath, err := annexObjectPath(repoPath, "contribution.bin")
localObjectPath, err := contentLocation(repoPath, "contribution.bin")
if err != nil {
return err
}
//localObjectPath := path.Join(repoPath, "contribution.bin") // or, just compare against the checked-out file

remoteObjectPath, err := annexObjectPath(remoteRepoPath, "contribution.bin")
remoteObjectPath, err := contentLocation(remoteRepoPath, "contribution.bin")
if err != nil {
return err
}
Expand Down Expand Up @@ -999,26 +1000,30 @@ func doInitRemoteAnnexRepository(t *testing.T, repoURL *url.URL) error {

TODO: pass a parameter to allow examining non-HEAD branches
*/
func annexObjectPath(repoPath string, file string) (string, error) {
// NB: `git annex lookupkey` is more reliable, but doesn't work in bare repos.
annexKey, _, err := git.NewCommandNoGlobals("show", git.CmdArg("HEAD:"+file)).RunStdString(&git.RunOpts{Dir: repoPath})
func contentLocation(repoPath string, file string) (path string, err error) {
path = ""

repo, err := git.OpenRepository(git.DefaultContext, repoPath)
if err != nil {
return
}

commitID, err := repo.GetRefCommitID("HEAD") // NB: to examine a *branch*, prefix its name with "refs/heads/", or call repo.GetBranchCommitID(); ditto for tags (under "refs/tags/")
if err != nil {
return "", fmt.Errorf("in %s: %w", repoPath, err) // the error from git prints the filename but not repo
return
}

// There are two formats an annexed file pointer might be:
// * a symlink to .git/annex/objects/$HASHDIR/$ANNEX_KEY/$ANNEX_KEY - used by files created with 'git annex add'
// * a text file containing /annex/objects/$ANNEX_KEY - used by files for which 'git add' was configured to run git-annex-smudge
// This recovers $ANNEX_KEY from either case:
annexKey = path.Base(strings.TrimSpace(annexKey))
commit, err := repo.GetCommit(commitID)
if err != nil {
return
}

contentPath, _, err := git.NewCommandNoGlobals("annex", "contentlocation", git.CmdArg(annexKey)).RunStdString(&git.RunOpts{Dir: repoPath})
treeEntry, err := commit.GetTreeEntryByPath(file)
if err != nil {
return "", fmt.Errorf("in %s: %s does not seem to be annexed: %w", repoPath, file, err)
return
}
contentPath = strings.TrimSpace(contentPath)

return path.Join(repoPath, contentPath), nil
return annex.ContentLocation(treeEntry)
}

/* like withKeyFile(), but automatically sets it the account given in ctx for use by git-annex */
Expand Down