Skip to content

Commit

Permalink
git-annex: create modules/annex (#21)
Browse files Browse the repository at this point in the history
This moves the `annexObjectPath()` helper out of the tests and into a dedicated sub-package as `annex.ContentLocation()`, and expands it with `.Pointer()` (which validates using `git annex examinekey`), `.IsAnnexed()` and `.Content()` to make it a more useful module.

The tests retain their own wrapper version of `ContentLocation()` because I tried to stay close to the API that modules/lfs uses, which works in terms of abstract `git.Blob` and `git.TreeEntry` objects rather than the `repoPath string`s that are more convenient for the tests.
  • Loading branch information
kousu committed Nov 27, 2022
1 parent fbe8189 commit f46585c
Show file tree
Hide file tree
Showing 3 changed files with 188 additions and 17 deletions.
162 changes: 162 additions & 0 deletions modules/annex/annex.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
// Copyright 2021 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.

// Unlike modules/lfs, which operates mainly on git.Blobs, this operates on git.TreeEntrys.
// The motivation for this is that TreeEntrys have an easy pointer to the on-disk repo path,
// while blobs do not (in fact, if building with TAGS=gogit, blobs might exist only in a mock
// filesystem, living only in process RAM). We must have the on-disk path to do anything
// useful with git-annex because all of its interesting data is on-disk under .git/annex/.

package annex

import (
"errors"
"fmt"
"os"
"path"
"strings"

"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/setting"
"code.gitea.io/gitea/modules/util"
)

const (
// blobSizeCutoff is the largest blob size, in bytes, that is still considered
// a candidate git-annex pointer; Pointer rejects anything larger.
//
// > The maximum size of a pointer file is 32 kb.
// - https://git-annex.branchable.com/internals/pointer_file/
// It's unclear if that's kilobytes or kibibytes; assuming kibibytes:
blobSizeCutoff = 32 * 1024
)

var (
// ErrInvalidPointer is returned when the examined content is not a valid git-annex pointer.
ErrInvalidPointer = errors.New("Not a git-annex pointer")
)

// getBlobContent returns the content of the blob as raw text, up to n bytes.
// (the pre-existing blob.GetBlobContent() has a hardcoded 1024-byte limit)
//
// It returns an error if the blob's data stream cannot be opened or if reading
// from it fails; in that case the returned string is empty.
func getBlobContent(b *git.Blob, n int) (string, error) {
	dataRc, err := b.DataAsync()
	if err != nil {
		return "", err
	}
	defer dataRc.Close()

	buf := make([]byte, n)
	// Do not discard the read error: a failed read must not be mistaken
	// for a (shorter) valid blob.
	read, err := util.ReadAtMost(dataRc, buf)
	if err != nil {
		return "", err
	}
	return string(buf[:read]), nil
}

// Pointer returns the git-annex key referenced by the given tree entry.
//
// The key is taken from the last path component of the first line of the
// entry's blob. ErrInvalidPointer is returned if the blob is larger than
// blobSizeCutoff, if that first line does not contain "/annex/", if the
// extracted key starts with "-", or if `git annex examinekey` reports
// "bad key". A failure to read the blob is returned wrapped with the blob
// name; any other failure of the git-annex command is returned as-is.
func Pointer(te *git.TreeEntry) (string, error) {
	blob := te.Blob()

	// git-annex doesn't seem to fully specify what its pointers are, but
	// the fullest description is here:
	// https://git-annex.branchable.com/internals/pointer_file/

	// a pointer can be:
	// the original format, generated by `git annex add`: a symlink to '.git/annex/objects/$HASHDIR/$HASHDIR2/$KEY/$KEY'
	// the newer, git-lfs influenced, format, generated by `git annex smudge`: a text file containing '/annex/objects/$KEY'
	//
	// in either case we can extract the $KEY the same way, and we need not actually know if it's a symlink or not because
	// git.Blob.DataAsync() works like open() + readlink(), handling both cases in one.

	if blob.Size() > blobSizeCutoff {
		// > The maximum size of a pointer file is 32 kb. If it is any longer, it is not considered to be a valid pointer file.
		// https://git-annex.branchable.com/internals/pointer_file/

		// It's unclear to me whether the same size limit applies to symlink-pointers, but it seems sensible to limit them too.
		return "", ErrInvalidPointer
	}

	pointer, err := getBlobContent(blob, blobSizeCutoff)
	if err != nil {
		return "", fmt.Errorf("error reading %s: %w", blob.Name(), err)
	}

	// the spec says a pointer file can contain multiple lines each with a pointer in them
	// but that makes no sense to me, so I'm just ignoring all but the first
	if i := strings.IndexByte(pointer, '\n'); i >= 0 {
		pointer = pointer[:i]
	}

	// in both the symlink and pointer-file formats, the pointer must have "/annex/" somewhere in it
	if !strings.Contains(pointer, "/annex/") {
		return "", ErrInvalidPointer
	}

	// extract $KEY
	pointer = path.Base(strings.TrimSpace(pointer))

	// The key comes from repository content and is handed to git-annex as a
	// positional argument below. Anything starting with "-" would be parsed
	// as a command-line option instead of a key, so reject it up front.
	if strings.HasPrefix(pointer, "-") {
		return "", ErrInvalidPointer
	}

	// ask git-annex's opinion on $KEY
	// XXX: this is probably a bit slow, especially if this operation gets run often
	// and examinekey is not that strict:
	// - it doesn't enforce that the "BACKEND" tag is one it knows,
	// - it doesn't enforce that the fields and their format fit the "BACKEND" tag
	// so maybe this is a wasteful step
	_, examineStderr, err := git.NewCommandNoGlobals("annex", "examinekey", git.CmdArg(pointer)).RunStdString(&git.RunOpts{Dir: te.Repo().Path})
	if err != nil {
		// TODO: make ErrInvalidPointer into a type capable of wrapping err
		if strings.TrimSpace(examineStderr) == "git-annex: bad key" {
			return "", ErrInvalidPointer
		}
		return "", err
	}

	return pointer, nil
}

// ContentLocation returns the path, under the repository's path, of the content
// referenced by the annex pointer stored in the given tree entry.
//
// It fails if the entry is not a valid pointer (see Pointer) or if
// `git annex contentlocation` fails for the key in this repo.
func ContentLocation(te *git.TreeEntry) (string, error) {
	repoDir := te.Repo().Path

	key, err := Pointer(te)
	if err != nil {
		return "", err
	}

	cmd := git.NewCommandNoGlobals("annex", "contentlocation", git.CmdArg(key))
	stdout, _, err := cmd.RunStdString(&git.RunOpts{Dir: repoDir})
	if err != nil {
		return "", fmt.Errorf("in %s: %s does not seem to be a valid annexed file: %w", repoDir, key, err)
	}

	// Root the reported location at "/" before cleaning so that any ".."
	// components cannot climb out, then drop that leading slash again.
	relative := path.Clean("/" + strings.TrimSpace(stdout))[1:]

	return path.Join(repoDir, relative), nil
}

// Content opens the annexed content that the given tree entry points to and
// returns the open file. Any error from ContentLocation or from opening the
// file is returned unchanged.
func Content(te *git.TreeEntry) (*os.File, error) {
	location, err := ContentLocation(te)
	if err != nil {
		return nil, err
	}

	file, err := os.Open(location)
	return file, err
}

// IsAnnexed reports whether the given tree entry appears to be a valid annex pointer.
// It always reports false when setting.Annex.Enabled is off.
//
// It does *not* verify that the content is actually present in this repo;
// use ContentLocation() for that.
func IsAnnexed(te *git.TreeEntry) (bool, error) {
	if !setting.Annex.Enabled {
		return false, nil
	}

	// Pointer() only ever returns well-formed pointers, so the outcome of
	// calling it is the whole test.
	switch _, err := Pointer(te); {
	case err == nil:
		return true, nil
	case errors.Is(err, ErrInvalidPointer):
		// simply not an annex pointer; that is an answer, not a failure
		return false, nil
	default:
		return false, err
	}
}
4 changes: 4 additions & 0 deletions modules/git/tree_entry.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ import (
"strings"
)

// Repo returns the repository of the tree that this entry belongs to.
func (te *TreeEntry) Repo() *Repository {
return te.ptree.repo
}

// Type returns the type of the entry (commit, tree, blob)
func (te *TreeEntry) Type() string {
switch te.Mode() {
Expand Down
39 changes: 22 additions & 17 deletions tests/integration/git_annex_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (

"code.gitea.io/gitea/models/perm"
repo_model "code.gitea.io/gitea/models/repo"
"code.gitea.io/gitea/modules/annex"
"code.gitea.io/gitea/modules/git"
"code.gitea.io/gitea/modules/setting"
api "code.gitea.io/gitea/modules/structs"
Expand Down Expand Up @@ -781,13 +782,13 @@ func doAnnexDownloadTest(remoteRepoPath string, repoPath string) (err error) {
}

// verify the file was downloaded
localObjectPath, err := annexObjectPath(repoPath, "large.bin")
localObjectPath, err := contentLocation(repoPath, "large.bin")
if err != nil {
return err
}
//localObjectPath := path.Join(repoPath, "large.bin") // or, just compare against the checked-out file

remoteObjectPath, err := annexObjectPath(remoteRepoPath, "large.bin")
remoteObjectPath, err := contentLocation(remoteRepoPath, "large.bin")
if err != nil {
return err
}
Expand Down Expand Up @@ -834,13 +835,13 @@ func doAnnexUploadTest(remoteRepoPath string, repoPath string) (err error) {
}

// verify the file was uploaded
localObjectPath, err := annexObjectPath(repoPath, "contribution.bin")
localObjectPath, err := contentLocation(repoPath, "contribution.bin")
if err != nil {
return err
}
//localObjectPath := path.Join(repoPath, "contribution.bin") // or, just compare against the checked-out file

remoteObjectPath, err := annexObjectPath(remoteRepoPath, "contribution.bin")
remoteObjectPath, err := contentLocation(remoteRepoPath, "contribution.bin")
if err != nil {
return err
}
Expand Down Expand Up @@ -999,26 +1000,30 @@ func doInitRemoteAnnexRepository(t *testing.T, repoURL *url.URL) error {
TODO: pass a parameter to allow examining non-HEAD branches
*/
func annexObjectPath(repoPath string, file string) (string, error) {
// NB: `git annex lookupkey` is more reliable, but doesn't work in bare repos.
annexKey, _, err := git.NewCommandNoGlobals("show", git.CmdArg("HEAD:"+file)).RunStdString(&git.RunOpts{Dir: repoPath})
func contentLocation(repoPath string, file string) (path string, err error) {
path = ""

repo, err := git.OpenRepository(git.DefaultContext, repoPath)
if err != nil {
return
}

commitID, err := repo.GetRefCommitID("HEAD") // NB: to examine a *branch*, prefix with "refs/branch/", or call repo.GetBranchCommitID(); ditto for tags
if err != nil {
return "", fmt.Errorf("in %s: %w", repoPath, err) // the error from git prints the filename but not repo
return
}

// There are two formats an annexed file pointer might be:
// * a symlink to .git/annex/objects/$HASHDIR/$ANNEX_KEY/$ANNEX_KEY - used by files created with 'git annex add'
// * a text file containing /annex/objects/$ANNEX_KEY - used by files for which 'git add' was configured to run git-annex-smudge
// This recovers $ANNEX_KEY from either case:
annexKey = path.Base(strings.TrimSpace(annexKey))
commit, err := repo.GetCommit(commitID)
if err != nil {
return
}

contentPath, _, err := git.NewCommandNoGlobals("annex", "contentlocation", git.CmdArg(annexKey)).RunStdString(&git.RunOpts{Dir: repoPath})
treeEntry, err := commit.GetTreeEntryByPath(file)
if err != nil {
return "", fmt.Errorf("in %s: %s does not seem to be annexed: %w", repoPath, file, err)
return
}
contentPath = strings.TrimSpace(contentPath)

return path.Join(repoPath, contentPath), nil
return annex.ContentLocation(treeEntry)
}

/* like withKeyFile(), but automatically sets it the account given in ctx for use by git-annex */
Expand Down

0 comments on commit f46585c

Please sign in to comment.