From fca84ac2c9feea27b23c1a87e8c6b13a60cf6844 Mon Sep 17 00:00:00 2001
From: wxiaoguang
Date: Fri, 14 Feb 2025 22:05:12 +0800
Subject: [PATCH] fix elasticsearch, refactor gitgrep

---
 modules/indexer/code/bleve/bleve.go        |  9 ++-
 .../code/elasticsearch/elasticsearch.go    | 17 ++++--
 modules/indexer/code/gitgrep/gitgrep.go    | 59 +++++++++++++++++++
 .../indexer/code/gitgrep/gitgrep_test.go   |  2 +-
 modules/indexer/code/indexer.go            |  6 +-
 modules/indexer/code/internal/util.go      | 10 +++-
 modules/indexer/code/internal/util_test.go | 24 ++++++++
 routers/web/repo/search.go                 | 46 ++-------------
 8 files changed, 119 insertions(+), 54 deletions(-)
 create mode 100644 modules/indexer/code/gitgrep/gitgrep.go
 rename routers/web/repo/search_test.go => modules/indexer/code/gitgrep/gitgrep_test.go (97%)
 create mode 100644 modules/indexer/code/internal/util_test.go

diff --git a/modules/indexer/code/bleve/bleve.go b/modules/indexer/code/bleve/bleve.go
index c7eb6ead7b103..963c151a05662 100644
--- a/modules/indexer/code/bleve/bleve.go
+++ b/modules/indexer/code/bleve/bleve.go
@@ -267,10 +267,13 @@ func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int
 	pathQuery.FieldVal = "Filename"
 	pathQuery.SetBoost(10)
 
-	if strings.HasPrefix(opts.Keyword, "\"") && strings.HasSuffix(opts.Keyword, "\"") {
-		opts.Keyword = strings.Trim(opts.Keyword, "\"")
-		q := bleve.NewMatchPhraseQuery(opts.Keyword)
+	keywordAsPhrase, isPhrase := internal.ParseKeywordAsPhrase(opts.Keyword)
+	if isPhrase {
+		q := bleve.NewMatchPhraseQuery(keywordAsPhrase)
 		q.FieldVal = "Content"
+		if opts.IsKeywordFuzzy {
+			q.Fuzziness = inner_bleve.GuessFuzzinessByKeyword(keywordAsPhrase)
+		}
 		contentQuery = q
 	} else {
 		q := bleve.NewMatchQuery(opts.Keyword)
diff --git a/modules/indexer/code/elasticsearch/elasticsearch.go b/modules/indexer/code/elasticsearch/elasticsearch.go
index 1c4dd39eff0be..5e4b2c56f2d0a 100644
--- a/modules/indexer/code/elasticsearch/elasticsearch.go
+++ b/modules/indexer/code/elasticsearch/elasticsearch.go
@@ -24,6 +24,7 @@ import (
 	"code.gitea.io/gitea/modules/setting"
 	"code.gitea.io/gitea/modules/timeutil"
 	"code.gitea.io/gitea/modules/typesniffer"
+	"code.gitea.io/gitea/modules/util"
 
 	"github.com/go-enry/go-enry/v2"
 	"github.com/olivere/elastic/v7"
@@ -359,13 +360,19 @@ func extractAggs(searchResult *elastic.SearchResult) []*internal.SearchResultLan
 
 // Search searches for codes and language stats by given conditions.
 func (b *Indexer) Search(ctx context.Context, opts *internal.SearchOptions) (int64, []*internal.SearchResult, []*internal.SearchResultLanguages, error) {
-	searchType := esMultiMatchTypePhrasePrefix
-	if opts.IsKeywordFuzzy {
-		searchType = esMultiMatchTypeBestFields
+	var contentQuery elastic.Query
+	keywordAsPhrase, isPhrase := internal.ParseKeywordAsPhrase(opts.Keyword)
+	if isPhrase {
+		contentQuery = elastic.NewMatchPhraseQuery("content", keywordAsPhrase)
+	} else {
+		// TODO: this is the old logic, but not really using "fuzziness"
+		// * IsKeywordFuzzy=true: "best_fields"
+		// * IsKeywordFuzzy=false: "phrase_prefix"
+		contentQuery = elastic.NewMultiMatchQuery(opts.Keyword, "content").
+			Type(util.Iif(opts.IsKeywordFuzzy, esMultiMatchTypeBestFields, esMultiMatchTypePhrasePrefix))
 	}
-
 	kwQuery := elastic.NewBoolQuery().Should(
-		elastic.NewMultiMatchQuery(opts.Keyword, "content").Type(searchType),
+		contentQuery,
 		elastic.NewMultiMatchQuery(opts.Keyword, "filename^10").Type(esMultiMatchTypePhrasePrefix),
 	)
 	query := elastic.NewBoolQuery()
diff --git a/modules/indexer/code/gitgrep/gitgrep.go b/modules/indexer/code/gitgrep/gitgrep.go
new file mode 100644
index 0000000000000..a85c9d02a5e51
--- /dev/null
+++ b/modules/indexer/code/gitgrep/gitgrep.go
@@ -0,0 +1,59 @@
+// Copyright 2025 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package gitgrep
+
+import (
+	"context"
+	"fmt"
+	"strings"
+
+	"code.gitea.io/gitea/modules/git"
+	code_indexer "code.gitea.io/gitea/modules/indexer/code"
+	"code.gitea.io/gitea/modules/setting"
+)
+
+func indexSettingToGitGrepPathspecList() (list []string) {
+	for _, expr := range setting.Indexer.IncludePatterns {
+		list = append(list, ":(glob)"+expr.PatternString())
+	}
+	for _, expr := range setting.Indexer.ExcludePatterns {
+		list = append(list, ":(glob,exclude)"+expr.PatternString())
+	}
+	return list
+}
+
+func PerformSearch(ctx context.Context, page int, repoID int64, gitRepo *git.Repository, ref git.RefName, keyword string, isFuzzy bool) (searchResults []*code_indexer.Result, total int, err error) {
+	// TODO: it should also respect ParseKeywordAsPhrase and clarify the "fuzzy" behavior
+	res, err := git.GrepSearch(ctx, gitRepo, keyword, git.GrepOptions{
+		ContextLineNumber: 1,
+		IsFuzzy:           isFuzzy,
+		RefName:           ref.String(),
+		PathspecList:      indexSettingToGitGrepPathspecList(),
+	})
+	if err != nil {
+		// TODO: if no branch exists, it reports: exit status 128, fatal: this operation must be run in a work tree.
+		return nil, 0, fmt.Errorf("git.GrepSearch: %w", err)
+	}
+	commitID, err := gitRepo.GetRefCommitID(ref.String())
+	if err != nil {
+		return nil, 0, fmt.Errorf("gitRepo.GetRefCommitID: %w", err)
+	}
+
+	total = len(res)
+	pageStart := min((page-1)*setting.UI.RepoSearchPagingNum, len(res))
+	pageEnd := min(page*setting.UI.RepoSearchPagingNum, len(res))
+	res = res[pageStart:pageEnd]
+	for _, r := range res {
+		searchResults = append(searchResults, &code_indexer.Result{
+			RepoID:   repoID,
+			Filename: r.Filename,
+			CommitID: commitID,
+			// UpdatedUnix: not supported yet
+			// Language:    not supported yet
+			// Color:       not supported yet
+			Lines: code_indexer.HighlightSearchResultCode(r.Filename, "", r.LineNumbers, strings.Join(r.LineCodes, "\n")),
+		})
+	}
+	return searchResults, total, nil
+}
diff --git a/routers/web/repo/search_test.go b/modules/indexer/code/gitgrep/gitgrep_test.go
similarity index 97%
rename from routers/web/repo/search_test.go
rename to modules/indexer/code/gitgrep/gitgrep_test.go
index 33a161038449c..97dda9d9661fd 100644
--- a/routers/web/repo/search_test.go
+++ b/modules/indexer/code/gitgrep/gitgrep_test.go
@@ -1,7 +1,7 @@
 // Copyright 2024 The Gitea Authors. All rights reserved.
 // SPDX-License-Identifier: MIT
 
-package repo
+package gitgrep
 
 import (
 	"testing"
diff --git a/modules/indexer/code/indexer.go b/modules/indexer/code/indexer.go
index 728b37fab6ecd..38fd10dae7fa0 100644
--- a/modules/indexer/code/indexer.go
+++ b/modules/indexer/code/indexer.go
@@ -29,13 +29,11 @@ var (
 	// When the real indexer is not ready, it will be a dummy indexer which will return error to explain it's not ready.
 	// So it's always safe use it as *globalIndexer.Load() and call its methods.
 	globalIndexer atomic.Pointer[internal.Indexer]
-	dummyIndexer  *internal.Indexer
 )
 
 func init() {
-	i := internal.NewDummyIndexer()
-	dummyIndexer = &i
-	globalIndexer.Store(dummyIndexer)
+	dummyIndexer := internal.NewDummyIndexer()
+	globalIndexer.Store(&dummyIndexer)
 }
 
 func index(ctx context.Context, indexer internal.Indexer, repoID int64) error {
diff --git a/modules/indexer/code/internal/util.go b/modules/indexer/code/internal/util.go
index 5b95783d9fcfe..df3dfc81f83bc 100644
--- a/modules/indexer/code/internal/util.go
+++ b/modules/indexer/code/internal/util.go
@@ -35,7 +35,7 @@ func FilenameOfIndexerID(indexerID string) string {
 	return indexerID[index+1:]
 }
 
-// Given the contents of file, returns the boundaries of its first seven lines.
+// FilenameMatchIndexPos returns the boundaries of the first seven lines of the given content.
 func FilenameMatchIndexPos(content string) (int, int) {
 	count := 1
 	for i, c := range content {
@@ -48,3 +48,11 @@ func FilenameMatchIndexPos(content string) (int, int) {
 	}
 	return 0, len(content)
 }
+
+func ParseKeywordAsPhrase(keyword string) (string, bool) {
+	if len(keyword) >= 2 && strings.HasPrefix(keyword, `"`) && strings.HasSuffix(keyword, `"`) {
+		// only strip the outer quotes (no decoding of the content for now); len >= 2 keeps a lone `"` from slicing out of range
+		return keyword[1 : len(keyword)-1], true
+	}
+	return "", false
+}
diff --git a/modules/indexer/code/internal/util_test.go b/modules/indexer/code/internal/util_test.go
new file mode 100644
index 0000000000000..a3bb56afa2aa0
--- /dev/null
+++ b/modules/indexer/code/internal/util_test.go
@@ -0,0 +1,24 @@
+// Copyright 2025 The Gitea Authors. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+package internal
+
+import (
+	"testing"
+
+	"github.com/stretchr/testify/assert"
+)
+
+func TestParseKeywordAsPhrase(t *testing.T) {
+	phrase, isPhrase := ParseKeywordAsPhrase(`a`)
+	assert.Empty(t, phrase)
+	assert.False(t, isPhrase)
+
+	phrase, isPhrase = ParseKeywordAsPhrase(`"a"`)
+	assert.Equal(t, "a", phrase)
+	assert.True(t, isPhrase)
+
+	phrase, isPhrase = ParseKeywordAsPhrase(`""\"""`)
+	assert.Equal(t, `"\""`, phrase)
+	assert.True(t, isPhrase)
+}
diff --git a/routers/web/repo/search.go b/routers/web/repo/search.go
index bbbe5c1081cce..ea40e64bbbb16 100644
--- a/routers/web/repo/search.go
+++ b/routers/web/repo/search.go
@@ -5,11 +5,11 @@ package repo
 
 import (
 	"net/http"
-	"strings"
 
 	"code.gitea.io/gitea/models/db"
 	"code.gitea.io/gitea/modules/git"
 	code_indexer "code.gitea.io/gitea/modules/indexer/code"
+	"code.gitea.io/gitea/modules/indexer/code/gitgrep"
 	"code.gitea.io/gitea/modules/setting"
 	"code.gitea.io/gitea/modules/templates"
 	"code.gitea.io/gitea/routers/common"
@@ -18,16 +18,6 @@ import (
 
 const tplSearch templates.TplName = "repo/search"
 
-func indexSettingToGitGrepPathspecList() (list []string) {
-	for _, expr := range setting.Indexer.IncludePatterns {
-		list = append(list, ":(glob)"+expr.PatternString())
-	}
-	for _, expr := range setting.Indexer.ExcludePatterns {
-		list = append(list, ":(glob,exclude)"+expr.PatternString())
-	}
-	return list
-}
-
 // Search render repository search page
 func Search(ctx *context.Context) {
 	ctx.Data["PageIsViewCode"] = true
@@ -67,38 +57,14 @@ func Search(ctx *context.Context) {
 			ctx.Data["CodeIndexerUnavailable"] = !code_indexer.IsAvailable(ctx)
 		}
 	} else {
-		searchRefName := git.RefNameFromBranch(ctx.Repo.Repository.DefaultBranch) // BranchName should be default branch or the first existing branch
-		res, err := git.GrepSearch(ctx, ctx.Repo.GitRepo, prepareSearch.Keyword, git.GrepOptions{
-			ContextLineNumber: 1,
-			IsFuzzy:           prepareSearch.IsFuzzy,
-			RefName:           searchRefName.String(),
-			PathspecList:      indexSettingToGitGrepPathspecList(),
-		})
-		if err != nil {
-			// TODO: if no branch exists, it reports: exit status 128, fatal: this operation must be run in a work tree.
-			ctx.ServerError("GrepSearch", err)
-			return
-		}
-		commitID, err := ctx.Repo.GitRepo.GetRefCommitID(searchRefName.String())
+		var err error
+		// ref should be default branch or the first existing branch
+		searchRef := git.RefNameFromBranch(ctx.Repo.Repository.DefaultBranch)
+		searchResults, total, err = gitgrep.PerformSearch(ctx, page, ctx.Repo.Repository.ID, ctx.Repo.GitRepo, searchRef, prepareSearch.Keyword, prepareSearch.IsFuzzy)
 		if err != nil {
-			ctx.ServerError("GetRefCommitID", err)
+			ctx.ServerError("gitgrep.PerformSearch", err)
 			return
 		}
-		total = len(res)
-		pageStart := min((page-1)*setting.UI.RepoSearchPagingNum, len(res))
-		pageEnd := min(page*setting.UI.RepoSearchPagingNum, len(res))
-		res = res[pageStart:pageEnd]
-		for _, r := range res {
-			searchResults = append(searchResults, &code_indexer.Result{
-				RepoID:   ctx.Repo.Repository.ID,
-				Filename: r.Filename,
-				CommitID: commitID,
-				// UpdatedUnix: not supported yet
-				// Language:    not supported yet
-				// Color:       not supported yet
-				Lines: code_indexer.HighlightSearchResultCode(r.Filename, "", r.LineNumbers, strings.Join(r.LineCodes, "\n")),
-			})
-		}
 	}
 
 	ctx.Data["Repo"] = ctx.Repo.Repository