From e87fc276d856f7148439189a7fc882d6d0d1ac32 Mon Sep 17 00:00:00 2001 From: hiddenMedic <124312252+hiddenMedic@users.noreply.github.com> Date: Sat, 30 Sep 2023 19:12:06 +0200 Subject: [PATCH 1/3] improved ranking --- src/bucket/bucket.go | 13 +----- src/config/load.go | 1 - src/engines/bing/bing.go | 2 +- src/engines/brave/brave.go | 2 +- src/engines/duckduckgo/duckduckgo.go | 2 +- src/engines/etools/etools.go | 13 ++---- src/engines/google/google.go | 2 +- src/engines/mojeek/mojeek.go | 2 +- src/engines/presearch/presearch.go | 2 +- src/engines/qwant/qwant.go | 4 +- src/engines/startpage/startpage.go | 2 +- src/engines/swisscows/swisscows.go | 4 +- src/engines/yahoo/yahoo.go | 2 +- src/engines/yep/yep.go | 4 +- src/rank/byrank.go | 21 +++++++++- src/rank/rank.go | 62 ++++++++++++++++++++-------- src/search/search.go | 18 ++++---- 17 files changed, 94 insertions(+), 62 deletions(-) diff --git a/src/bucket/bucket.go b/src/bucket/bucket.go index 3d4b9071..9f4f712b 100644 --- a/src/bucket/bucket.go +++ b/src/bucket/bucket.go @@ -8,7 +8,6 @@ import ( "github.com/tminaorg/brzaguza/src/bucket/result" "github.com/tminaorg/brzaguza/src/config" "github.com/tminaorg/brzaguza/src/engines" - "github.com/tminaorg/brzaguza/src/rank" ) type Relay struct { @@ -36,10 +35,6 @@ func AddSEResult(seResult *engines.RetrievedResult, seName engines.Name, relay * Response: nil, } - if config.InsertDefaultRank { - result.Rank = rank.DefaultRank(seResult.Rank.Rank, seResult.Rank.Page, seResult.Rank.OnPageRank) - } - relay.Mutex.Lock() relay.ResultMap[result.URL] = &result relay.Mutex.Unlock() @@ -83,17 +78,13 @@ func SetResultResponse(link string, response *colly.Response, relay *Relay, seNa } mapRes.Response = response - - resCopy := *mapRes - rankAddr := &(mapRes.Rank) relay.Mutex.Unlock() - rank.SetRank(&resCopy, rankAddr, &(relay.Mutex)) //copy contains pointer to response } -func MakeSEResult(urll string, title string, description string, searchEngineName engines.Name, seRank int, sePage int, seOnPageRank int) *engines.RetrievedResult { +func MakeSEResult(urll string, title string, description string, searchEngineName engines.Name, sePage int, seOnPageRank int) *engines.RetrievedResult { ser := engines.RetrievedRank{ SearchEngine: searchEngineName, - Rank: seRank, + Rank: -1, Page: sePage, OnPageRank: seOnPageRank, } diff --git a/src/config/load.go b/src/config/load.go index 18b53b60..65d08844 100644 --- a/src/config/load.go +++ b/src/config/load.go @@ -62,5 +62,4 @@ func (c *Config) Load(path string) { } } -const InsertDefaultRank bool = true // this should be moved to config const LogDumpLocation string = "logdump/" // this should be moved to config diff --git a/src/engines/bing/bing.go b/src/engines/bing/bing.go index b9b45802..571acf9a 100644 --- a/src/engines/bing/bing.go +++ b/src/engines/bing/bing.go @@ -56,7 +56,7 @@ func Search(ctx context.Context, query string, relay *bucket.Relay, options engi var pageStr string = e.Request.Ctx.Get("page") page, _ := strconv.Atoi(pageStr) - res := bucket.MakeSEResult(linkText, titleText, descText, Info.Name, -1, page, pageRankCounter[page]+1) + res := bucket.MakeSEResult(linkText, titleText, descText, Info.Name, page, pageRankCounter[page]+1) bucket.AddSEResult(res, Info.Name, relay, &options, pagesCol) pageRankCounter[page]++ } else { diff --git a/src/engines/brave/brave.go b/src/engines/brave/brave.go index b29775bd..a58c5cfc 100644 --- a/src/engines/brave/brave.go +++ b/src/engines/brave/brave.go @@ -52,7 +52,7 @@ func Search(ctx context.Context, query string, relay *bucket.Relay, options engi var pageStr string = e.Request.Ctx.Get("page") page, _ := strconv.Atoi(pageStr) - res := bucket.MakeSEResult(linkText, titleText, descText, Info.Name, -1, page, pageRankCounter[page]+1) + res := bucket.MakeSEResult(linkText, titleText, descText, Info.Name, page, pageRankCounter[page]+1) bucket.AddSEResult(res, Info.Name, relay, &options, pagesCol) pageRankCounter[page]++ } diff --git a/src/engines/duckduckgo/duckduckgo.go b/src/engines/duckduckgo/duckduckgo.go index aa5929c5..06509ad9 100644 --- a/src/engines/duckduckgo/duckduckgo.go +++ b/src/engines/duckduckgo/duckduckgo.go @@ -62,7 +62,7 @@ func Search(ctx context.Context, query string, relay *bucket.Relay, options engi linkText = parse.ParseURL(rawURL) case 3: if linkText != "" && linkText != "#" && titleText != "" { - res := bucket.MakeSEResult(linkText, titleText, descText, Info.Name, rrank, page, (i/4 + 1)) + res := bucket.MakeSEResult(linkText, titleText, descText, Info.Name, page, (i/4 + 1)) bucket.AddSEResult(res, Info.Name, relay, &options, pagesCol) } } diff --git a/src/engines/etools/etools.go b/src/engines/etools/etools.go index 2a849702..e9b8bb24 100644 --- a/src/engines/etools/etools.go +++ b/src/engines/etools/etools.go @@ -32,6 +32,8 @@ func Search(ctx context.Context, query string, relay *bucket.Relay, options engi sedefaults.ColRequest(Info.Name, col, &ctx, &retError) sedefaults.ColError(Info.Name, col, &retError) + var pageRankCounter []int = make([]int, options.MaxPages*Info.ResultsPerPage) + col.OnHTML(dompaths.Result, func(e *colly.HTMLElement) { dom := e.DOM @@ -53,17 +55,10 @@ func Search(ctx context.Context, query string, relay *bucket.Relay, options engi if linkText != "" && linkText != "#" && titleText != "" { var pageStr string = e.Request.Ctx.Get("page") page, _ := strconv.Atoi(pageStr) - seRankString := strings.TrimSpace(dom.Find("td[class=\"count help\"]").Text()) - seRank, convErr := strconv.Atoi(seRankString) - if convErr != nil { - log.Error().Err(convErr).Msgf("%v: SERank string to int conversion error. URL: %v, SERank string: %v", Info.Name, linkText, seRankString) - } - - //var onPageRank int = e.Index // this should also work, but is a bit more volatile - var onPageRank int = (seRank-1)%Info.ResultsPerPage + 1 - res := bucket.MakeSEResult(linkText, titleText, descText, Info.Name, seRank, page, onPageRank) + res := bucket.MakeSEResult(linkText, titleText, descText, Info.Name, page, pageRankCounter[page]+1) bucket.AddSEResult(res, Info.Name, relay, &options, pagesCol) + pageRankCounter[page]++ } }) diff --git a/src/engines/google/google.go b/src/engines/google/google.go index b418b0c4..80f43836 100644 --- a/src/engines/google/google.go +++ b/src/engines/google/google.go @@ -45,7 +45,7 @@ func Search(ctx context.Context, query string, relay *bucket.Relay, options engi var pageStr string = e.Request.Ctx.Get("page") page, _ := strconv.Atoi(pageStr) - res := bucket.MakeSEResult(linkText, titleText, descText, Info.Name, -1, page, pageRankCounter[page]+1) + res := bucket.MakeSEResult(linkText, titleText, descText, Info.Name, page, pageRankCounter[page]+1) bucket.AddSEResult(res, Info.Name, relay, &options, pagesCol) pageRankCounter[page]++ } diff --git a/src/engines/mojeek/mojeek.go b/src/engines/mojeek/mojeek.go index bd3eb497..eb5b69b7 100644 --- a/src/engines/mojeek/mojeek.go +++ b/src/engines/mojeek/mojeek.go @@ -46,7 +46,7 @@ func Search(ctx context.Context, query string, relay *bucket.Relay, options engi var pageStr string = e.Request.Ctx.Get("page") page, _ := strconv.Atoi(pageStr) - res := bucket.MakeSEResult(linkText, titleText, descText, Info.Name, (page-1)*Info.ResultsPerPage+pageRankCounter[page]+1, page, pageRankCounter[page]+1) + res := bucket.MakeSEResult(linkText, titleText, descText, Info.Name, page, pageRankCounter[page]+1) bucket.AddSEResult(res, Info.Name, relay, &options, pagesCol) pageRankCounter[page]++ } diff --git a/src/engines/presearch/presearch.go b/src/engines/presearch/presearch.go index 9e6fd9e7..13433551 100644 --- a/src/engines/presearch/presearch.go +++ b/src/engines/presearch/presearch.go @@ -68,7 +68,7 @@ func Search(ctx context.Context, query string, relay *bucket.Relay, options engi goodTitle := parse.ParseTextWithHTML(result.Title) goodDesc := parse.ParseTextWithHTML(result.Desc) - res := bucket.MakeSEResult(goodURL, goodTitle, goodDesc, Info.Name, -1, page, counter) + res := bucket.MakeSEResult(goodURL, goodTitle, goodDesc, Info.Name, page, counter) bucket.AddSEResult(res, Info.Name, relay, &options, pagesCol) counter += 1 } diff --git a/src/engines/qwant/qwant.go b/src/engines/qwant/qwant.go index 7bd9f336..9dd8d873 100644 --- a/src/engines/qwant/qwant.go +++ b/src/engines/qwant/qwant.go @@ -49,7 +49,7 @@ func Search(ctx context.Context, query string, relay *bucket.Relay, options engi } mainline := parsedResponse.Data.Res.Items.Mainline - counter := 0 + counter := 1 for _, group := range mainline { if group.Type != "web" { continue @@ -57,7 +57,7 @@ func Search(ctx context.Context, query string, relay *bucket.Relay, options engi for _, result := range group.Items { goodURL := parse.ParseURL(result.URL) - res := bucket.MakeSEResult(goodURL, result.Title, result.Description, Info.Name, (page-1)*Info.ResultsPerPage+counter, page, counter%Info.ResultsPerPage+1) + res := bucket.MakeSEResult(goodURL, result.Title, result.Description, Info.Name, page, counter) bucket.AddSEResult(res, Info.Name, relay, &options, pagesCol) counter += 1 } diff --git a/src/engines/startpage/startpage.go b/src/engines/startpage/startpage.go index 1e6a8ce9..9d2dfc91 100644 --- a/src/engines/startpage/startpage.go +++ b/src/engines/startpage/startpage.go @@ -46,7 +46,7 @@ func Search(ctx context.Context, query string, relay *bucket.Relay, options engi var pageStr string = e.Request.Ctx.Get("page") page, _ := strconv.Atoi(pageStr) - res := bucket.MakeSEResult(linkText, titleText, descText, Info.Name, -1, page, pageRankCounter[page]+1) + res := bucket.MakeSEResult(linkText, titleText, descText, Info.Name, page, pageRankCounter[page]+1) bucket.AddSEResult(res, Info.Name, relay, &options, pagesCol) pageRankCounter[page]++ } else { diff --git a/src/engines/swisscows/swisscows.go b/src/engines/swisscows/swisscows.go index 8dcc0935..68bec7ec 100644 --- a/src/engines/swisscows/swisscows.go +++ b/src/engines/swisscows/swisscows.go @@ -74,13 +74,13 @@ func Search(ctx context.Context, query string, relay *bucket.Relay, options engi log.Error().Err(err).Msgf("%v: Failed body unmarshall to json:\n%v", Info.Name, string(r.Body)) } - counter := 0 + counter := 1 for _, result := range parsedResponse.Items { goodURL := parse.ParseURL(result.URL) title := parse.ParseTextWithHTML(result.Title) desc := parse.ParseTextWithHTML(result.Desc) - res := bucket.MakeSEResult(goodURL, title, desc, Info.Name, -1, page, counter%Info.ResultsPerPage+1) + res := bucket.MakeSEResult(goodURL, title, desc, Info.Name, page, counter) bucket.AddSEResult(res, Info.Name, relay, &options, pagesCol) counter += 1 } diff --git a/src/engines/yahoo/yahoo.go b/src/engines/yahoo/yahoo.go index 752a7315..215cc63e 100644 --- a/src/engines/yahoo/yahoo.go +++ b/src/engines/yahoo/yahoo.go @@ -48,7 +48,7 @@ func Search(ctx context.Context, query string, relay *bucket.Relay, options engi var pageStr string = e.Request.Ctx.Get("page") page, _ := strconv.Atoi(pageStr) - res := bucket.MakeSEResult(linkText, titleText, descText, Info.Name, -1, page, pageRankCounter[page]+1) + res := bucket.MakeSEResult(linkText, titleText, descText, Info.Name, page, pageRankCounter[page]+1) bucket.AddSEResult(res, Info.Name, relay, &options, pagesCol) pageRankCounter[page]++ } diff --git a/src/engines/yep/yep.go b/src/engines/yep/yep.go index 12150b4e..fa919127 100644 --- a/src/engines/yep/yep.go +++ b/src/engines/yep/yep.go @@ -34,7 +34,7 @@ func Search(ctx context.Context, query string, relay *bucket.Relay, options engi col.OnResponse(func(r *colly.Response) { content := parseJSON(r.Body) - counter := 0 + counter := 1 for _, result := range content.Results { if result.TType != "Organic" { continue @@ -44,7 +44,7 @@ func Search(ctx context.Context, query string, relay *bucket.Relay, options engi goodTitle := parse.ParseTextWithHTML(result.Title) goodDescription := parse.ParseTextWithHTML(result.Snippet) - res := bucket.MakeSEResult(goodURL, goodTitle, goodDescription, Info.Name, counter, counter/Info.ResultsPerPage+1, counter%Info.ResultsPerPage+1) + res := bucket.MakeSEResult(goodURL, goodTitle, goodDescription, Info.Name, 1, counter) bucket.AddSEResult(res, Info.Name, relay, &options, pagesCol) counter += 1 } diff --git a/src/rank/byrank.go b/src/rank/byrank.go index 0ac83d1f..75ced529 100644 --- a/src/rank/byrank.go +++ b/src/rank/byrank.go @@ -1,9 +1,28 @@ package rank -import "github.com/tminaorg/brzaguza/src/bucket/result" +import ( + "github.com/rs/zerolog/log" + "github.com/tminaorg/brzaguza/src/bucket/result" +) type ByRank []result.Result func (r ByRank) Len() int { return len(r) } func (r ByRank) Swap(i, j int) { r[i], r[j] = r[j], r[i] } func (r ByRank) Less(i, j int) bool { return r[i].Rank < r[j].Rank } + +type ByRetrievedRank []RankFiller + +func (r ByRetrievedRank) Len() int { return len(r) } +func (r ByRetrievedRank) Swap(i, j int) { r[i], r[j] = r[j], r[i] } +func (r ByRetrievedRank) Less(i, j int) bool { + if r[i].RetRank.Page != r[j].RetRank.Page { + return r[i].RetRank.Page < r[j].RetRank.Page + } + if r[i].RetRank.OnPageRank != r[j].RetRank.OnPageRank { + return r[i].RetRank.OnPageRank < r[j].RetRank.OnPageRank + } + + log.Error().Msgf("failed at ranking: %v, %v", r[i], r[j]) + return true +} diff --git a/src/rank/rank.go b/src/rank/rank.go index db916a19..9f0ac3d7 100644 --- a/src/rank/rank.go +++ b/src/rank/rank.go @@ -1,33 +1,61 @@ package rank import ( - "sync" + "sort" - "github.com/rs/zerolog/log" "github.com/tminaorg/brzaguza/src/bucket/result" + "github.com/tminaorg/brzaguza/src/engines" ) -// TLDR: you must mutex.Lock when changing *rankAddr, you probably dont need to mutex.RLock() when reading result -// (in reality even *rankAddr shouldnt need a lock, but go would definately complain about simultanious read/write because of it) -func SetRank(result *result.Result, rankAddr *int, mutex *sync.RWMutex) { +func SetRank(result *result.Result) { + result.Rank = result.EngineRanks[0].Rank +} + +func Rank(resMap map[string]*result.Result) []result.Result { + results := make([]result.Result, 0, len(resMap)) + for _, res := range resMap { + results = append(results, *res) + } - //mutex.RLock() - reqUrl := result.Response.Request.URL.String() //dummy code, if error here, uncomment lock - //mutex.RUnlock() + //setup retrieved rank here + FillRetrievedRank(results) - if reqUrl != result.URL { //dummy code - log.Trace().Msgf("(This is ok) Request URL not same as result.URL \\/ %v | %v", reqUrl, result.URL) + for ind := range results { + SetRank(&(results[ind])) } - rrank := result.EngineRanks[0].Page*100 + result.EngineRanks[0].OnPageRank + sort.Sort(ByRank(results)) - mutex.Lock() - *rankAddr = rrank - mutex.Unlock() + return results +} - log.Trace().Msgf("Set rank to %v for %v: %v", rrank, result.Title, result.URL) +type RankFiller struct { + ArrInd int + RetRank engines.RetrievedRank + RRInd int } -func DefaultRank(seRank int, sePage int, seOnPageRank int) int { - return sePage*100 + seOnPageRank +func FillRetrievedRank(results []result.Result) { + engResults := make([][]RankFiller, 100) //TODO. need as many elements as there are implemented engines. the value used for len in searcher/buildOneRun + for arrind, res := range results { + for rrind, er := range res.EngineRanks { + rf := RankFiller{ + ArrInd: arrind, + RetRank: er, + RRInd: rrind, + } + if er.SearchEngine == engines.Undefined { //this should be fixed. TODO + continue + } + engResults[er.SearchEngine] = append(engResults[er.SearchEngine], rf) + } + } + + for _, engRes := range engResults { + sort.Sort(ByRetrievedRank(engRes)) + + for rnk, el := range engRes { + results[el.ArrInd].EngineRanks[el.RRInd].Rank = rnk + 1 + } + } } diff --git a/src/search/search.go b/src/search/search.go index 6e5af9dd..8cc01777 100644 --- a/src/search/search.go +++ b/src/search/search.go @@ -3,7 +3,7 @@ package search import ( "context" "net/url" - "sort" + "time" "github.com/rs/zerolog/log" "github.com/sourcegraph/conc" @@ -21,19 +21,19 @@ func PerformSearch(query string, options engines.Options, config *config.Config) query = url.QueryEscape(query) + resTiming := time.Now() + log.Debug().Msg("Waiting for results from engines...") var worker conc.WaitGroup runEngines(config.Engines, query, &worker, &relay, options) - log.Debug().Msg("Waiting for results from engines...") worker.Wait() + log.Debug().Msgf("Got results in %v", time.Since(resTiming).Milliseconds()) - results := make([]result.Result, 0, len(relay.ResultMap)) - for _, res := range relay.ResultMap { - results = append(results, *res) - } - - sort.Sort(rank.ByRank(results)) + rankTiming := time.Now() + log.Debug().Msg("Ranking...") + results := rank.Rank(relay.ResultMap) + log.Debug().Msgf("Finished ranking in %v", time.Since(rankTiming).Milliseconds()) - log.Debug().Msg("All processing done!") + log.Debug().Msg("Search Done!") return results } From 4ec574bd31638f71b8b11f65af728ea8ab634fa2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleksa=20Siri=C5=A1ki?= <31509435+aleksasiriski@users.noreply.github.com> Date: Sun, 1 Oct 2023 17:41:22 +0200 Subject: [PATCH 2/3] Generated logdump path, correct number of elem in rank --- src/cli.go | 2 +- src/config/load.go | 10 ++++++++-- src/logger/logger.go | 2 +- src/main.go | 2 +- src/rank/rank.go | 2 +- 5 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/cli.go b/src/cli.go index 18acff2b..53d9d4d2 100644 --- a/src/cli.go +++ b/src/cli.go @@ -55,7 +55,7 @@ func setupCli() { kong.Vars{ "version": fmt.Sprintf("%v (%v@%v)", Version, GitCommit, Timestamp), "config_path": ".", - "log_path": ".", + "log_path": "./log", "query_string": "banana death", }, ) diff --git a/src/config/load.go b/src/config/load.go index 65d08844..371394d0 100644 --- a/src/config/load.go +++ b/src/config/load.go @@ -14,8 +14,12 @@ import ( ) var EnabledEngines []engines.Name = make([]engines.Name, 0) +var LogDumpLocation string = "dump/" + +func (c *Config) Load(path string, logPath string) { + // Load vars + loadVars(logPath) -func (c *Config) Load(path string) { // Use "." as the key path delimiter. This can be "/" or any character. k := koanf.New(".") @@ -62,4 +66,6 @@ func (c *Config) Load(path string) { } } -const LogDumpLocation string = "logdump/" // this should be moved to config +func loadVars(logPath string) { + LogDumpLocation = logPath + "/" + LogDumpLocation +} diff --git a/src/logger/logger.go b/src/logger/logger.go index d89392e8..89c131e0 100644 --- a/src/logger/logger.go +++ b/src/logger/logger.go @@ -18,7 +18,7 @@ func DateString() string { func Setup(path string, verbosity int) { // Generate logfile name datetime := DateString() - filepath := fmt.Sprintf("%v/log/brzaguza_%v.log", path, datetime) + filepath := fmt.Sprintf("%v/brzaguza_%v.log", path, datetime) // Setup logger logger := log.Output(io.MultiWriter(zerolog.ConsoleWriter{ diff --git a/src/main.go b/src/main.go index b90b417b..89e07ea1 100644 --- a/src/main.go +++ b/src/main.go @@ -36,7 +36,7 @@ func main() { // load config file config := config.New() - config.Load(cli.Config) + config.Load(cli.Config, cli.Log) if cli.Cli { log.Info(). diff --git a/src/rank/rank.go b/src/rank/rank.go index 9f0ac3d7..0ab3cdf8 100644 --- a/src/rank/rank.go +++ b/src/rank/rank.go @@ -36,7 +36,7 @@ type RankFiller struct { } func FillRetrievedRank(results []result.Result) { - engResults := make([][]RankFiller, 100) //TODO. need as many elements as there are implemented engines. the value used for len in searcher/buildOneRun + engResults := make([][]RankFiller, len(engines.NameValues())) for arrind, res := range results { for rrind, er := range res.EngineRanks { rf := RankFiller{ From adcb8650ec05df3dea48b3a5f538c1014ee49e29 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aleksa=20Siri=C5=A1ki?= <31509435+aleksasiriski@users.noreply.github.com> Date: Sun, 1 Oct 2023 17:57:03 +0200 Subject: [PATCH 3/3] Formatting --- src/search/search.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/search/search.go b/src/search/search.go index 8cc01777..7c388e05 100644 --- a/src/search/search.go +++ b/src/search/search.go @@ -33,7 +33,7 @@ func PerformSearch(query string, options engines.Options, config *config.Config) results := rank.Rank(relay.ResultMap) log.Debug().Msgf("Finished ranking in %v", time.Since(rankTiming).Milliseconds()) - log.Debug().Msg("Search Done!") + log.Debug().Msg("Search done!") return results }