From f4409024fadba48fcbf6eb688605e24304db68b6 Mon Sep 17 00:00:00 2001 From: Boris Nagaev Date: Wed, 3 Apr 2024 14:55:53 -0300 Subject: [PATCH 01/21] fix typos in dates in JSON They prevented parsing in Go. --- crawler-user-agents.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crawler-user-agents.json b/crawler-user-agents.json index 6c6dd7b..5d2110c 100644 --- a/crawler-user-agents.json +++ b/crawler-user-agents.json @@ -220,7 +220,7 @@ , { "pattern": "httpx", - "addition_date":" 2019/12/23", + "addition_date": "2019/12/23", "instances": [ "python-httpx/0.16.1", "python-httpx/0.13.0.dev1" @@ -265,7 +265,7 @@ , { "pattern": "phpcrawl", - "addition_date": "2012-09/17", + "addition_date": "2012/09/17", "url": "http://phpcrawl.cuab.de/", "instances": [ "phpcrawl" From fb4ca376539cb058b8e259f0118d3bed288b3278 Mon Sep 17 00:00:00 2001 From: Boris Nagaev Date: Wed, 3 Apr 2024 15:01:17 -0300 Subject: [PATCH 02/21] add Go package --- go.mod | 11 +++++ go.sum | 9 ++++ validate.go | 117 +++++++++++++++++++++++++++++++++++++++++++++++ validate_test.go | 18 ++++++++ 4 files changed, 155 insertions(+) create mode 100644 go.mod create mode 100644 go.sum create mode 100644 validate.go create mode 100644 validate_test.go diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..30af075 --- /dev/null +++ b/go.mod @@ -0,0 +1,11 @@ +module github.com/monperrus/crawler-user-agents + +go 1.19 + +require github.com/stretchr/testify v1.9.0 + +require ( + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect +) diff --git a/go.sum b/go.sum new file mode 100644 index 0000000..e20fa14 --- /dev/null +++ b/go.sum @@ -0,0 +1,9 @@ +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/pmezard/go-difflib v1.0.0 
h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= +github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= +gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= +gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/validate.go b/validate.go new file mode 100644 index 0000000..b9977ad --- /dev/null +++ b/validate.go @@ -0,0 +1,117 @@ +package agents + +import ( + _ "embed" + "encoding/json" + "fmt" + "regexp" + "strings" + "time" +) + +//go:embed crawler-user-agents.json +var crawlersJson []byte + +// Crawler contains information about one crawler. +type Crawler struct { + // Regexp of User Agent of the crawler. + Pattern string `json:"pattern"` + + // Discovery date. + AdditionDate time.Time `json:"addition_date"` + + // Official url of the robot. + URL string `json:"url"` + + // Examples of full User Agent strings. + Instances []string `json:"instances"` +} + +// Private time needed to convert addition_date from/to the format used in JSON. 
+type jsonCrawler struct { + Pattern string `json:"pattern"` + AdditionDate string `json:"addition_date"` + URL string `json:"url"` + Instances []string `json:"instances"` +} + +const timeLayout = "2006/01/02" + +func (c Crawler) MarshalJSON() ([]byte, error) { + jc := jsonCrawler{ + Pattern: c.Pattern, + AdditionDate: c.AdditionDate.Format(timeLayout), + URL: c.URL, + Instances: c.Instances, + } + return json.Marshal(jc) +} + +func (c *Crawler) UnmarshalJSON(b []byte) error { + var jc jsonCrawler + if err := json.Unmarshal(b, &jc); err != nil { + return err + } + + c.Pattern = jc.Pattern + c.URL = jc.URL + c.Instances = jc.Instances + + if c.Pattern == "" { + return fmt.Errorf("empty pattern in record %s", string(b)) + } + + if jc.AdditionDate != "" { + tim, err := time.ParseInLocation(timeLayout, jc.AdditionDate, time.UTC) + if err != nil { + return err + } + c.AdditionDate = tim + } + + return nil +} + +// The list of crawlers, built from contents of crawler-user-agents.json. +var Crawlers = func() []Crawler { + var crawlers []Crawler + if err := json.Unmarshal(crawlersJson, &crawlers); err != nil { + panic(err) + } + return crawlers +}() + +var allRegexps = func() string { + regexps := make([]string, 0, len(Crawlers)) + for _, crawler := range Crawlers { + regexps = append(regexps, "("+crawler.Pattern+")") + } + return strings.Join(regexps, "|") +}() + +var allRegexpsRe = regexp.MustCompile(allRegexps) + +// Returns if User Agent string matches any of crawler patterns. +func IsCrawler(userAgent string) bool { + return allRegexpsRe.MatchString(userAgent) +} + +var individualRegexps = func() []*regexp.Regexp { + regexps := make([]*regexp.Regexp, len(Crawlers)) + for i, crawler := range Crawlers { + regexps[i] = regexp.MustCompile(crawler.Pattern) + } + return regexps +}() + +// Finds all crawlers matching the User Agent and returns the list of their indices in Crawlers. 
+func MatchingCrawlers(userAgent string) []int { + indices := []int{} + for i, regexp := range individualRegexps { + if regexp.MatchString(userAgent) { + indices = append(indices, i) + } + } + + return indices +} diff --git a/validate_test.go b/validate_test.go new file mode 100644 index 0000000..d8fd1b8 --- /dev/null +++ b/validate_test.go @@ -0,0 +1,18 @@ +package agents + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestPatterns(t *testing.T) { + for i, crawler := range Crawlers { + t.Run(crawler.URL, func(t *testing.T) { + for _, instance := range crawler.Instances { + require.True(t, IsCrawler(instance), instance) + require.Contains(t, MatchingCrawlers(instance), i, instance) + } + }) + } +} From 1e03d93d22861f258b946d833070c1577ea7485d Mon Sep 17 00:00:00 2001 From: Boris Nagaev Date: Wed, 3 Apr 2024 15:08:28 -0300 Subject: [PATCH 03/21] README: update Go instructions --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b7f050c..a71777e 100644 --- a/README.md +++ b/README.md @@ -34,6 +34,9 @@ Each `pattern` is a regular expression. It should work out-of-the-box wih your f * JavaScript: `if (RegExp(entry.pattern).test(req.headers['user-agent']) { ... }` * PHP: add a slash before and after the pattern: `if (preg_match('/'.$entry['pattern'].'/', $_SERVER['HTTP_USER_AGENT'])): ...` * Python: `if re.search(entry['pattern'], ua): ...` +* Go: use [this package](https://pkg.go.dev/github.com/monperrus/crawler-user-agent), + it provides global variable `Crawlers` (it is synchronized with `crawler-user-agents.json`), + functions `IsCrawler` and `MatchingCrawlers`. 
## Contributing @@ -66,7 +69,6 @@ There are a few wrapper libraries that use this data to detect bots: * [Voight-Kampff](https://github.com/biola/Voight-Kampff) (Ruby) * [isbot](https://github.com/Hentioe/isbot) (Ruby) * [crawlers](https://github.com/Olical/crawlers) (Clojure) - * [crawlerflagger](https://godoc.org/go.kelfa.io/kelfa/pkg/crawlerflagger) (Go) * [isBot](https://github.com/omrilotan/isbot) (Node.JS) Other systems for spotting robots, crawlers, and spiders that you may want to consider are: From 099a79dc1e60e3ca0e3a96cfd04db456345b8a2e Mon Sep 17 00:00:00 2001 From: Boris Nagaev Date: Wed, 3 Apr 2024 15:27:14 -0300 Subject: [PATCH 04/21] golang: add benchmark --- validate_test.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/validate_test.go b/validate_test.go index d8fd1b8..f2555e4 100644 --- a/validate_test.go +++ b/validate_test.go @@ -16,3 +16,11 @@ func TestPatterns(t *testing.T) { }) } } + +func BenchmarkPatterns(b *testing.B) { + userAgent := "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36 Google-PageRenderer Google (+https://developers.google.com/+/web/snippet/)" + b.SetBytes(int64(len(userAgent))) + for n := 0; n < b.N; n++ { + IsCrawler(userAgent) + } +} From eb84428ca1b33eb1c128f567a3fc3a8b92f05247 Mon Sep 17 00:00:00 2001 From: Boris Nagaev Date: Wed, 3 Apr 2024 15:27:33 -0300 Subject: [PATCH 05/21] golang: use go-re2 for regular expression matching Performance increase is huge! Go regexp: 0.05 MB/s go-re2 in pure Go mode: 77.84 MB/s go-re2 using C++ Re2 (-tags re2_cgo): 213.85 MB/s To enable C++ Re2, install it: sudo apt-get install libre2-dev and pass -tags re2_cgo build tag. 
--- go.mod | 7 ++++++- go.sum | 6 ++++++ validate.go | 3 ++- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/go.mod b/go.mod index 30af075..cbc7e79 100644 --- a/go.mod +++ b/go.mod @@ -2,10 +2,15 @@ module github.com/monperrus/crawler-user-agents go 1.19 -require github.com/stretchr/testify v1.9.0 +require ( + github.com/stretchr/testify v1.9.0 + github.com/wasilibs/go-re2 v1.5.1 +) require ( github.com/davecgh/go-spew v1.1.1 // indirect + github.com/magefile/mage v1.14.0 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect + github.com/tetratelabs/wazero v1.7.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index e20fa14..db539bf 100644 --- a/go.sum +++ b/go.sum @@ -1,9 +1,15 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/magefile/mage v1.14.0 h1:6QDX3g6z1YvJ4olPhT1wksUcSa/V0a1B+pJb73fBjyo= +github.com/magefile/mage v1.14.0/go.mod h1:z5UZb/iS3GoOSn0JgWuiw7dxlurVYTu+/jHXqQg881A= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= +github.com/tetratelabs/wazero v1.7.0 h1:jg5qPydno59wqjpGrHph81lbtHzTrWzwwtD4cD88+hQ= +github.com/tetratelabs/wazero v1.7.0/go.mod h1:ytl6Zuh20R/eROuyDaGPkp82O9C/DJfXAwJfQ3X6/7Y= +github.com/wasilibs/go-re2 v1.5.1 h1:a+Gb1mx6Q7MmU4d+3BCnnN28U2/cnADmY1oRRanQi10= +github.com/wasilibs/go-re2 v1.5.1/go.mod h1:UqqxQ1O99boQUm1r61H/IYGiGQOS/P88K7hU5nLNkEg= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod 
h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/validate.go b/validate.go index b9977ad..eb7718a 100644 --- a/validate.go +++ b/validate.go @@ -4,9 +4,10 @@ import ( _ "embed" "encoding/json" "fmt" - "regexp" "strings" "time" + + regexp "github.com/wasilibs/go-re2" ) //go:embed crawler-user-agents.json From b741c0bff53d191d8c280832d043669ae3bcd0ba Mon Sep 17 00:00:00 2001 From: Boris Nagaev Date: Wed, 3 Apr 2024 15:31:08 -0300 Subject: [PATCH 06/21] add github workflow for Go --- .github/workflows/golang.yml | 41 ++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 .github/workflows/golang.yml diff --git a/.github/workflows/golang.yml b/.github/workflows/golang.yml new file mode 100644 index 0000000..ff80658 --- /dev/null +++ b/.github/workflows/golang.yml @@ -0,0 +1,41 @@ +name: Test Go package + +on: [push] + +jobs: + test: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: + - ubuntu-latest + - macos-latest + - windows-latest + go-version: + - 1.19 + - 1.22 + + steps: + - uses: actions/checkout@v3 + + - name: Set up Go ${{ matrix.go-version }} + uses: actions/setup-go@v4 + with: + go-version: ${{ matrix.go-version }} + + - name: Test + run: | + go version + go test -v ./... + + - name: Benchmark tests + run: | + go test -v ./... -bench . -run Benchmark + + - name: Benchmark tests with CGo Re2 + if: runner.os == 'Linux' + run: | + sudo apt-get install libre2-dev + go test -v ./... -tags re2_cgo + go test -v ./... -tags re2_cgo -bench . 
-run Benchmark From eacd6fd0fa29e0d233fca842fc073aa7356dfb58 Mon Sep 17 00:00:00 2001 From: Boris Nagaev Date: Wed, 3 Apr 2024 16:15:33 -0300 Subject: [PATCH 07/21] golang: add benchmark for MatchingCrawlers --- validate_test.go | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/validate_test.go b/validate_test.go index f2555e4..16f8ecf 100644 --- a/validate_test.go +++ b/validate_test.go @@ -17,10 +17,18 @@ func TestPatterns(t *testing.T) { } } -func BenchmarkPatterns(b *testing.B) { +func BenchmarkIsCrawler(b *testing.B) { userAgent := "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36 Google-PageRenderer Google (+https://developers.google.com/+/web/snippet/)" b.SetBytes(int64(len(userAgent))) for n := 0; n < b.N; n++ { IsCrawler(userAgent) } } + +func BenchmarkMatchingCrawlers(b *testing.B) { + userAgent := "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36 Google-PageRenderer Google (+https://developers.google.com/+/web/snippet/)" + b.SetBytes(int64(len(userAgent))) + for n := 0; n < b.N; n++ { + MatchingCrawlers(userAgent) + } +} From 65e66d185baaa497a95362278850f463497f9164 Mon Sep 17 00:00:00 2001 From: Boris Nagaev Date: Wed, 3 Apr 2024 16:15:57 -0300 Subject: [PATCH 08/21] golang: speed-up MatchingCrawlers Re2 is fast on large Regexps (faster than when running individually on each RE, including with Go regexp). I used this fact to find matching regexps using tree of regexps of concatenated parts patterns. The individual regexps are found by going from root node of the tree to down. 
Benchmark results BenchmarkMatchingCrawlers: Before this commit (Re2 individually, pure Go): 0.32 MB/s Before this commit (Re2 individually, -tags re2_cgo): 1.32 MB/s If Go regexp is used individually: 2.31 MB/s With this commit (Re2, pure Go): 5.90 MB/s With this commit (Re2, -tags re2_cgo): 18.24 MB/s Maybe it can be improved even better with hyperscan, but I don't want to bring another dependency. --- validate.go | 85 ++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 75 insertions(+), 10 deletions(-) diff --git a/validate.go b/validate.go index eb7718a..3df52ec 100644 --- a/validate.go +++ b/validate.go @@ -82,13 +82,15 @@ var Crawlers = func() []Crawler { return crawlers }() -var allRegexps = func() string { +func joinRes(begin, end int) string { regexps := make([]string, 0, len(Crawlers)) - for _, crawler := range Crawlers { + for _, crawler := range Crawlers[begin:end] { regexps = append(regexps, "("+crawler.Pattern+")") } return strings.Join(regexps, "|") -}() +} + +var allRegexps = joinRes(0, len(Crawlers)) var allRegexpsRe = regexp.MustCompile(allRegexps) @@ -97,22 +99,85 @@ func IsCrawler(userAgent string) bool { return allRegexpsRe.MatchString(userAgent) } -var individualRegexps = func() []*regexp.Regexp { - regexps := make([]*regexp.Regexp, len(Crawlers)) +// With RE2 it is fast to check the text against a large regexp. +// To find matching regexps faster, built a binary tree of regexps. + +type regexpNode struct { + re *regexp.Regexp + left *regexpNode + right *regexpNode + index int +} + +var regexpsTree = func() *regexpNode { + nodes := make([]*regexpNode, len(Crawlers)) + starts := make([]int, len(Crawlers)+1) for i, crawler := range Crawlers { - regexps[i] = regexp.MustCompile(crawler.Pattern) + nodes[i] = ®expNode{ + re: regexp.MustCompile(crawler.Pattern), + index: i, + } + starts[i] = i } - return regexps + starts[len(Crawlers)] = len(Crawlers) // To get end of interval. + + for len(nodes) > 1 { + // Join into pairs. 
+ nodes2 := make([]*regexpNode, (len(nodes)+1)/2) + starts2 := make([]int, 0, len(nodes2)+1) + for i := 0; i < len(nodes)/2; i++ { + leftIndex := 2 * i + rightIndex := 2*i + 1 + nodes2[i] = ®expNode{ + left: nodes[leftIndex], + right: nodes[rightIndex], + } + if len(nodes2) != 1 { + // Skip regexp for root node, it is not used. + joinedRe := joinRes(starts[leftIndex], starts[rightIndex+1]) + nodes2[i].re = regexp.MustCompile(joinedRe) + } + starts2 = append(starts2, starts[leftIndex]) + } + if len(nodes)%2 == 1 { + nodes2[len(nodes2)-1] = nodes[len(nodes)-1] + starts2 = append(starts2, starts[len(starts)-2]) + } + starts2 = append(starts2, starts[len(starts)-1]) + + nodes = nodes2 + starts = starts2 + } + + root := nodes[0] + + if root.left == nil { + panic("the algoriths does not work with just one regexp") + } + + return root }() // Finds all crawlers matching the User Agent and returns the list of their indices in Crawlers. func MatchingCrawlers(userAgent string) []int { indices := []int{} - for i, regexp := range individualRegexps { - if regexp.MatchString(userAgent) { - indices = append(indices, i) + + var visit func(node *regexpNode) + visit = func(node *regexpNode) { + if node.left != nil { + if node.left.re.MatchString(userAgent) { + visit(node.left) + } + if node.right.re.MatchString(userAgent) { + visit(node.right) + } + } else { + // Leaf. + indices = append(indices, node.index) } } + visit(regexpsTree) + return indices } From 51acf89c4ce518247c777bb29ffae3ed92f9e58c Mon Sep 17 00:00:00 2001 From: Boris Nagaev Date: Wed, 3 Apr 2024 16:27:45 -0300 Subject: [PATCH 09/21] README: instruct to install C++ RE2 --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index a71777e..7aced36 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,9 @@ Each `pattern` is a regular expression. 
It should work out-of-the-box wih your f * Python: `if re.search(entry['pattern'], ua): ...` * Go: use [this package](https://pkg.go.dev/github.com/monperrus/crawler-user-agent), it provides global variable `Crawlers` (it is synchronized with `crawler-user-agents.json`), - functions `IsCrawler` and `MatchingCrawlers`. + functions `IsCrawler` and `MatchingCrawlers`. To achieve the best performance possible in functions + `IsCrawler` and `MatchingCrawlers`, install C++ RE2 into your system: `sudo apt-get install libre2-dev` + and pass tag: `-tags re2_cgo`. ## Contributing From 3f3adbf9aa6ffb3b0ec1ab5de73431adc4f2952c Mon Sep 17 00:00:00 2001 From: Boris Nagaev Date: Wed, 3 Apr 2024 17:56:16 -0300 Subject: [PATCH 10/21] fix link in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7aced36..ed75f41 100644 --- a/README.md +++ b/README.md @@ -34,7 +34,7 @@ Each `pattern` is a regular expression. It should work out-of-the-box wih your f * JavaScript: `if (RegExp(entry.pattern).test(req.headers['user-agent']) { ... }` * PHP: add a slash before and after the pattern: `if (preg_match('/'.$entry['pattern'].'/', $_SERVER['HTTP_USER_AGENT'])): ...` * Python: `if re.search(entry['pattern'], ua): ...` -* Go: use [this package](https://pkg.go.dev/github.com/monperrus/crawler-user-agent), +* Go: use [this package](https://pkg.go.dev/github.com/monperrus/crawler-user-agents), it provides global variable `Crawlers` (it is synchronized with `crawler-user-agents.json`), functions `IsCrawler` and `MatchingCrawlers`. 
To achieve the best performance possible in functions `IsCrawler` and `MatchingCrawlers`, install C++ RE2 into your system: `sudo apt-get install libre2-dev` From 9262954c4c8f7ab38133fcc98237b438738bc0ec Mon Sep 17 00:00:00 2001 From: Martin Monperrus Date: Thu, 4 Apr 2024 04:10:37 +0200 Subject: [PATCH 11/21] simplify CI --- .github/workflows/golang.yml | 41 ------------------------------------ 1 file changed, 41 deletions(-) delete mode 100644 .github/workflows/golang.yml diff --git a/.github/workflows/golang.yml b/.github/workflows/golang.yml deleted file mode 100644 index ff80658..0000000 --- a/.github/workflows/golang.yml +++ /dev/null @@ -1,41 +0,0 @@ -name: Test Go package - -on: [push] - -jobs: - test: - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - os: - - ubuntu-latest - - macos-latest - - windows-latest - go-version: - - 1.19 - - 1.22 - - steps: - - uses: actions/checkout@v3 - - - name: Set up Go ${{ matrix.go-version }} - uses: actions/setup-go@v4 - with: - go-version: ${{ matrix.go-version }} - - - name: Test - run: | - go version - go test -v ./... - - - name: Benchmark tests - run: | - go test -v ./... -bench . -run Benchmark - - - name: Benchmark tests with CGo Re2 - if: runner.os == 'Linux' - run: | - sudo apt-get install libre2-dev - go test -v ./... -tags re2_cgo - go test -v ./... -tags re2_cgo -bench . 
-run Benchmark From 473be9d8aadb848fcb878309d6b94e44f63f37b5 Mon Sep 17 00:00:00 2001 From: Martin Monperrus Date: Thu, 4 Apr 2024 04:11:31 +0200 Subject: [PATCH 12/21] simplify CI --- .github/workflows/ci-validation.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci-validation.yml b/.github/workflows/ci-validation.yml index 0888568..38c5b50 100644 --- a/.github/workflows/ci-validation.yml +++ b/.github/workflows/ci-validation.yml @@ -23,3 +23,4 @@ jobs: - run: py.test -vv - run: python3 validate.py - run: php validate.php + - run: go test From ee4872bf1c1c46abb742372bb80c75da7e75b889 Mon Sep 17 00:00:00 2001 From: Martin Monperrus Date: Thu, 4 Apr 2024 04:27:08 +0200 Subject: [PATCH 13/21] explicit validation --- validate_test.go | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/validate_test.go b/validate_test.go index 16f8ecf..17a3262 100644 --- a/validate_test.go +++ b/validate_test.go @@ -1,14 +1,25 @@ package agents import ( + "fmt" "testing" "github.com/stretchr/testify/require" ) func TestPatterns(t *testing.T) { - for i, crawler := range Crawlers { + // loading all crawlers wwith go:embed + // some validation happens in UnmarshalJSON + allCrawlers := Crawlers + + // there is at least 10 crawlers + require.GreaterOrEqual(t, len(allCrawlers), 10) + + for i, crawler := range allCrawlers { t.Run(crawler.URL, func(t *testing.T) { + // print pattern to console for quickcheck in CI + fmt.Print(crawler.Pattern) + for _, instance := range crawler.Instances { require.True(t, IsCrawler(instance), instance) require.Contains(t, MatchingCrawlers(instance), i, instance) From cc59a03e1f7c1b2321dec29b53ba6c0daa7fa5fd Mon Sep 17 00:00:00 2001 From: Boris Nagaev Date: Thu, 4 Apr 2024 09:22:45 -0300 Subject: [PATCH 14/21] golang: println(pattern), use pattern as subtest --- validate_test.go | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/validate_test.go b/validate_test.go index 
17a3262..d000363 100644 --- a/validate_test.go +++ b/validate_test.go @@ -8,17 +8,16 @@ import ( ) func TestPatterns(t *testing.T) { - // loading all crawlers wwith go:embed - // some validation happens in UnmarshalJSON + // Loading all crawlers with go:embed + // some validation happens in UnmarshalJSON. allCrawlers := Crawlers - // there is at least 10 crawlers + // There are at least 10 crawlers. require.GreaterOrEqual(t, len(allCrawlers), 10) for i, crawler := range allCrawlers { - t.Run(crawler.URL, func(t *testing.T) { - // print pattern to console for quickcheck in CI - fmt.Print(crawler.Pattern) + t.Run(crawler.Pattern, func(t *testing.T) { + fmt.Println(crawler.Pattern) for _, instance := range crawler.Instances { require.True(t, IsCrawler(instance), instance) From c402815e84937b219b77abb9d0fd407ed923467a Mon Sep 17 00:00:00 2001 From: Boris Nagaev Date: Thu, 4 Apr 2024 12:01:49 -0300 Subject: [PATCH 15/21] README: add example of Go program It will work after PR is merged https://github.com/monperrus/crawler-user-agents/pull/348/ --- README.md | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/README.md b/README.md index ed75f41..2d991ce 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,37 @@ Each `pattern` is a regular expression. It should work out-of-the-box wih your f `IsCrawler` and `MatchingCrawlers`, install C++ RE2 into your system: `sudo apt-get install libre2-dev` and pass tag: `-tags re2_cgo`. 
+Example of Go program: + +```go +package main + +import ( + "fmt" + + "github.com/monperrus/crawler-user-agents" +) + +func main() { + userAgent := "Mozilla/5.0 (compatible; Discordbot/2.0; +https://discordapp.com)" + + isCrawler := agents.IsCrawler(userAgent) + fmt.Println("isCrawler:", isCrawler) + + indices := agents.MatchingCrawlers(userAgent) + fmt.Println("crawlers' indices:", indices) + fmt.Println("crawler' URL:", agents.Crawlers[indices[0]].URL) +} +``` + +Output: + +``` +isCrawler: true +crawlers' indices: [237] +crawler' URL: https://discordapp.com +``` + ## Contributing I do welcome additions contributed as pull requests. From 0bc397e0b916818dbb9d06e21c7eb9b4af0cf636 Mon Sep 17 00:00:00 2001 From: Boris Nagaev Date: Thu, 4 Apr 2024 12:03:59 -0300 Subject: [PATCH 16/21] golang: remove copy-paste from benchmark test --- validate_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/validate_test.go b/validate_test.go index d000363..1744069 100644 --- a/validate_test.go +++ b/validate_test.go @@ -27,8 +27,9 @@ func TestPatterns(t *testing.T) { } } +const userAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36 Google-PageRenderer Google (+https://developers.google.com/+/web/snippet/)" + func BenchmarkIsCrawler(b *testing.B) { - userAgent := "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36 Google-PageRenderer Google (+https://developers.google.com/+/web/snippet/)" b.SetBytes(int64(len(userAgent))) for n := 0; n < b.N; n++ { IsCrawler(userAgent) @@ -36,7 +37,6 @@ func BenchmarkIsCrawler(b *testing.B) { } func BenchmarkMatchingCrawlers(b *testing.B) { - userAgent := "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36 Google-PageRenderer Google (+https://developers.google.com/+/web/snippet/)" b.SetBytes(int64(len(userAgent))) for n := 0; n < b.N; n++ { 
MatchingCrawlers(userAgent) From 9fc7a2e288965e66c26f84768fefea2cc25aa4a1 Mon Sep 17 00:00:00 2001 From: Boris Nagaev Date: Thu, 4 Apr 2024 12:40:47 -0300 Subject: [PATCH 17/21] golang: benchmark on browser UA --- validate_test.go | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/validate_test.go b/validate_test.go index 1744069..06ef12d 100644 --- a/validate_test.go +++ b/validate_test.go @@ -27,18 +27,35 @@ func TestPatterns(t *testing.T) { } } -const userAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36 Google-PageRenderer Google (+https://developers.google.com/+/web/snippet/)" +const ( + crawlerUA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36 Google (+https://developers.google.com/+/web/snippet/" + browserUA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) obsidian/1.5.3 Chrome/114.0.5735.289 Electron/25.8.1 Safari/537.36" +) + +func BenchmarkIsCrawlerPositive(b *testing.B) { + b.SetBytes(int64(len(crawlerUA))) + for n := 0; n < b.N; n++ { + IsCrawler(crawlerUA) + } +} + +func BenchmarkMatchingCrawlersPositive(b *testing.B) { + b.SetBytes(int64(len(crawlerUA))) + for n := 0; n < b.N; n++ { + MatchingCrawlers(crawlerUA) + } +} -func BenchmarkIsCrawler(b *testing.B) { - b.SetBytes(int64(len(userAgent))) +func BenchmarkIsCrawlerNegative(b *testing.B) { + b.SetBytes(int64(len(browserUA))) for n := 0; n < b.N; n++ { - IsCrawler(userAgent) + IsCrawler(browserUA) } } -func BenchmarkMatchingCrawlers(b *testing.B) { - b.SetBytes(int64(len(userAgent))) +func BenchmarkMatchingCrawlersNegative(b *testing.B) { + b.SetBytes(int64(len(browserUA))) for n := 0; n < b.N; n++ { - MatchingCrawlers(userAgent) + MatchingCrawlers(browserUA) } } From 266245ee5f9dc24e888a0cfc6d676225ba134717 Mon Sep 17 00:00:00 2001 From: Boris Nagaev Date: Thu, 4 Apr 2024 12:54:19 -0300 
Subject: [PATCH 18/21] golang: don't use stretchr/testify --- go.mod | 8 +------- go.sum | 10 +--------- validate_test.go | 24 +++++++++++++++++++----- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/go.mod b/go.mod index cbc7e79..cfc2c1c 100644 --- a/go.mod +++ b/go.mod @@ -2,15 +2,9 @@ module github.com/monperrus/crawler-user-agents go 1.19 -require ( - github.com/stretchr/testify v1.9.0 - github.com/wasilibs/go-re2 v1.5.1 -) +require github.com/wasilibs/go-re2 v1.5.1 require ( - github.com/davecgh/go-spew v1.1.1 // indirect github.com/magefile/mage v1.14.0 // indirect - github.com/pmezard/go-difflib v1.0.0 // indirect github.com/tetratelabs/wazero v1.7.0 // indirect - gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/go.sum b/go.sum index db539bf..cbc2578 100644 --- a/go.sum +++ b/go.sum @@ -1,15 +1,7 @@ -github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= -github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/magefile/mage v1.14.0 h1:6QDX3g6z1YvJ4olPhT1wksUcSa/V0a1B+pJb73fBjyo= github.com/magefile/mage v1.14.0/go.mod h1:z5UZb/iS3GoOSn0JgWuiw7dxlurVYTu+/jHXqQg881A= -github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= -github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= -github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= -github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/tetratelabs/wazero v1.7.0 h1:jg5qPydno59wqjpGrHph81lbtHzTrWzwwtD4cD88+hQ= github.com/tetratelabs/wazero v1.7.0/go.mod h1:ytl6Zuh20R/eROuyDaGPkp82O9C/DJfXAwJfQ3X6/7Y= github.com/wasilibs/go-re2 v1.5.1 h1:a+Gb1mx6Q7MmU4d+3BCnnN28U2/cnADmY1oRRanQi10= github.com/wasilibs/go-re2 v1.5.1/go.mod h1:UqqxQ1O99boQUm1r61H/IYGiGQOS/P88K7hU5nLNkEg= -gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 
-gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= -gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= +github.com/wasilibs/nottinygc v0.4.0 h1:h1TJMihMC4neN6Zq+WKpLxgd9xCFMw7O9ETLwY2exJQ= diff --git a/validate_test.go b/validate_test.go index 06ef12d..0a83de6 100644 --- a/validate_test.go +++ b/validate_test.go @@ -3,25 +3,39 @@ package agents import ( "fmt" "testing" - - "github.com/stretchr/testify/require" ) +func contains(list []int, value int) bool { + for _, elem := range list { + if elem == value { + return true + } + } + return false +} + func TestPatterns(t *testing.T) { // Loading all crawlers with go:embed // some validation happens in UnmarshalJSON. allCrawlers := Crawlers // There are at least 10 crawlers. - require.GreaterOrEqual(t, len(allCrawlers), 10) + if len(allCrawlers) < 10 { + t.Errorf("Number of crawlers must be at least 10, got %d.", len(allCrawlers)) + } for i, crawler := range allCrawlers { t.Run(crawler.Pattern, func(t *testing.T) { fmt.Println(crawler.Pattern) for _, instance := range crawler.Instances { - require.True(t, IsCrawler(instance), instance) - require.Contains(t, MatchingCrawlers(instance), i, instance) + if !IsCrawler(instance) { + t.Errorf("Instance %q is not detected as a crawler.", instance) + } + hits := MatchingCrawlers(instance) + if !contains(hits, i) { + t.Errorf("Crawler with index %d (pattern %q) is not in the list returned by MatchingCrawlers(%q): %v.", i, crawler.Pattern, instance, hits) + } } }) } From f08c3def4691f87dfadf996dc68412ab615d4e5d Mon Sep 17 00:00:00 2001 From: Boris Nagaev Date: Thu, 4 Apr 2024 13:23:17 -0300 Subject: [PATCH 19/21] golang: check results in benchmark, test FP --- validate_test.go | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/validate_test.go b/validate_test.go index 0a83de6..9eba6aa 100644 --- a/validate_test.go +++ b/validate_test.go @@ -24,6 +24,13 @@ func TestPatterns(t *testing.T) { 
t.Errorf("Number of crawlers must be at least 10, got %d.", len(allCrawlers)) } + if IsCrawler(browserUA) { + t.Errorf("Browser UA %q was detected as a crawler.", browserUA) + } + if len(MatchingCrawlers(browserUA)) != 0 { + t.Errorf("MatchingCrawlers found crawlers matching Browser UA %q.", browserUA) + } + for i, crawler := range allCrawlers { t.Run(crawler.Pattern, func(t *testing.T) { fmt.Println(crawler.Pattern) @@ -49,27 +56,35 @@ const ( func BenchmarkIsCrawlerPositive(b *testing.B) { b.SetBytes(int64(len(crawlerUA))) for n := 0; n < b.N; n++ { - IsCrawler(crawlerUA) + if !IsCrawler(crawlerUA) { + b.Fail() + } } } func BenchmarkMatchingCrawlersPositive(b *testing.B) { b.SetBytes(int64(len(crawlerUA))) for n := 0; n < b.N; n++ { - MatchingCrawlers(crawlerUA) + if len(MatchingCrawlers(crawlerUA)) == 0 { + b.Fail() + } } } func BenchmarkIsCrawlerNegative(b *testing.B) { b.SetBytes(int64(len(browserUA))) for n := 0; n < b.N; n++ { - IsCrawler(browserUA) + if IsCrawler(browserUA) { + b.Fail() + } } } func BenchmarkMatchingCrawlersNegative(b *testing.B) { b.SetBytes(int64(len(browserUA))) for n := 0; n < b.N; n++ { - MatchingCrawlers(browserUA) + if len(MatchingCrawlers(browserUA)) != 0 { + b.Fail() + } } } From a32a14939d9a9be02b098e2bd8f6747493ca4de2 Mon Sep 17 00:00:00 2001 From: Boris Nagaev Date: Thu, 4 Apr 2024 13:24:15 -0300 Subject: [PATCH 20/21] golang: switch back to standard Go Regexp See https://github.com/monperrus/crawler-user-agents/pull/348#issuecomment-2037492698 Also, it turned out to be faster if regexps are checked individually, not as one large |-concatenation of regexps. One regexp check consumes 66 microseconds on Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz. 
--- README.md | 4 +- go.mod | 7 ---- go.sum | 7 ---- validate.go | 103 ++++++++-------------------------------------------- 4 files changed, 16 insertions(+), 105 deletions(-) diff --git a/README.md b/README.md index 2d991ce..49e7834 100644 --- a/README.md +++ b/README.md @@ -36,9 +36,7 @@ Each `pattern` is a regular expression. It should work out-of-the-box wih your f * Python: `if re.search(entry['pattern'], ua): ...` * Go: use [this package](https://pkg.go.dev/github.com/monperrus/crawler-user-agents), it provides global variable `Crawlers` (it is synchronized with `crawler-user-agents.json`), - functions `IsCrawler` and `MatchingCrawlers`. To achieve the best performance possible in functions - `IsCrawler` and `MatchingCrawlers`, install C++ RE2 into your system: `sudo apt-get install libre2-dev` - and pass tag: `-tags re2_cgo`. + functions `IsCrawler` and `MatchingCrawlers`. Example of Go program: diff --git a/go.mod b/go.mod index cfc2c1c..54db180 100644 --- a/go.mod +++ b/go.mod @@ -1,10 +1,3 @@ module github.com/monperrus/crawler-user-agents go 1.19 - -require github.com/wasilibs/go-re2 v1.5.1 - -require ( - github.com/magefile/mage v1.14.0 // indirect - github.com/tetratelabs/wazero v1.7.0 // indirect -) diff --git a/go.sum b/go.sum index cbc2578..e69de29 100644 --- a/go.sum +++ b/go.sum @@ -1,7 +0,0 @@ -github.com/magefile/mage v1.14.0 h1:6QDX3g6z1YvJ4olPhT1wksUcSa/V0a1B+pJb73fBjyo= -github.com/magefile/mage v1.14.0/go.mod h1:z5UZb/iS3GoOSn0JgWuiw7dxlurVYTu+/jHXqQg881A= -github.com/tetratelabs/wazero v1.7.0 h1:jg5qPydno59wqjpGrHph81lbtHzTrWzwwtD4cD88+hQ= -github.com/tetratelabs/wazero v1.7.0/go.mod h1:ytl6Zuh20R/eROuyDaGPkp82O9C/DJfXAwJfQ3X6/7Y= -github.com/wasilibs/go-re2 v1.5.1 h1:a+Gb1mx6Q7MmU4d+3BCnnN28U2/cnADmY1oRRanQi10= -github.com/wasilibs/go-re2 v1.5.1/go.mod h1:UqqxQ1O99boQUm1r61H/IYGiGQOS/P88K7hU5nLNkEg= -github.com/wasilibs/nottinygc v0.4.0 h1:h1TJMihMC4neN6Zq+WKpLxgd9xCFMw7O9ETLwY2exJQ= diff --git a/validate.go b/validate.go index 
3df52ec..41ab9d1 100644 --- a/validate.go +++ b/validate.go @@ -4,10 +4,8 @@ import ( _ "embed" "encoding/json" "fmt" - "strings" + "regexp" "time" - - regexp "github.com/wasilibs/go-re2" ) //go:embed crawler-user-agents.json @@ -82,102 +80,31 @@ var Crawlers = func() []Crawler { return crawlers }() -func joinRes(begin, end int) string { - regexps := make([]string, 0, len(Crawlers)) - for _, crawler := range Crawlers[begin:end] { - regexps = append(regexps, "("+crawler.Pattern+")") +var regexps = func() []*regexp.Regexp { + regexps := make([]*regexp.Regexp, len(Crawlers)) + for i, crawler := range Crawlers { + regexps[i] = regexp.MustCompile(crawler.Pattern) } - return strings.Join(regexps, "|") -} - -var allRegexps = joinRes(0, len(Crawlers)) - -var allRegexpsRe = regexp.MustCompile(allRegexps) + return regexps +}() // Returns if User Agent string matches any of crawler patterns. func IsCrawler(userAgent string) bool { - return allRegexpsRe.MatchString(userAgent) -} - -// With RE2 it is fast to check the text against a large regexp. -// To find matching regexps faster, built a binary tree of regexps. - -type regexpNode struct { - re *regexp.Regexp - left *regexpNode - right *regexpNode - index int -} - -var regexpsTree = func() *regexpNode { - nodes := make([]*regexpNode, len(Crawlers)) - starts := make([]int, len(Crawlers)+1) - for i, crawler := range Crawlers { - nodes[i] = ®expNode{ - re: regexp.MustCompile(crawler.Pattern), - index: i, + for _, re := range regexps { + if re.MatchString(userAgent) { + return true } - starts[i] = i } - starts[len(Crawlers)] = len(Crawlers) // To get end of interval. - - for len(nodes) > 1 { - // Join into pairs. 
- nodes2 := make([]*regexpNode, (len(nodes)+1)/2) - starts2 := make([]int, 0, len(nodes2)+1) - for i := 0; i < len(nodes)/2; i++ { - leftIndex := 2 * i - rightIndex := 2*i + 1 - nodes2[i] = ®expNode{ - left: nodes[leftIndex], - right: nodes[rightIndex], - } - if len(nodes2) != 1 { - // Skip regexp for root node, it is not used. - joinedRe := joinRes(starts[leftIndex], starts[rightIndex+1]) - nodes2[i].re = regexp.MustCompile(joinedRe) - } - starts2 = append(starts2, starts[leftIndex]) - } - if len(nodes)%2 == 1 { - nodes2[len(nodes2)-1] = nodes[len(nodes)-1] - starts2 = append(starts2, starts[len(starts)-2]) - } - starts2 = append(starts2, starts[len(starts)-1]) - - nodes = nodes2 - starts = starts2 - } - - root := nodes[0] - - if root.left == nil { - panic("the algoriths does not work with just one regexp") - } - - return root -}() + return false +} // Finds all crawlers matching the User Agent and returns the list of their indices in Crawlers. func MatchingCrawlers(userAgent string) []int { indices := []int{} - - var visit func(node *regexpNode) - visit = func(node *regexpNode) { - if node.left != nil { - if node.left.re.MatchString(userAgent) { - visit(node.left) - } - if node.right.re.MatchString(userAgent) { - visit(node.right) - } - } else { - // Leaf. 
- indices = append(indices, node.index) + for i, re := range regexps { + if re.MatchString(userAgent) { + indices = append(indices, i) } } - - visit(regexpsTree) - return indices } From 2945d308a9da89bd5941ee15c78d4086b951d70e Mon Sep 17 00:00:00 2001 From: Boris Nagaev Date: Thu, 4 Apr 2024 22:09:52 -0300 Subject: [PATCH 21/21] golang: add test against false negatives Check against the list from https://github.com/microlinkhq/top-user-agents Fix https://github.com/monperrus/crawler-user-agents/issues/350 --- validate_test.go | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/validate_test.go b/validate_test.go index 9eba6aa..6812ad2 100644 --- a/validate_test.go +++ b/validate_test.go @@ -1,7 +1,9 @@ package agents import ( + "encoding/json" "fmt" + "net/http" "testing" ) @@ -48,6 +50,35 @@ func TestPatterns(t *testing.T) { } } +func TestFalseNegatives(t *testing.T) { + const browsersURL = "https://raw.githubusercontent.com/microlinkhq/top-user-agents/master/src/index.json" + resp, err := http.Get(browsersURL) + if err != nil { + t.Fatalf("Failed to fetch the list of browser User Agents from %s: %v.", browsersURL, err) + } + + t.Cleanup(func() { + if err := resp.Body.Close(); err != nil { + t.Fatal(err) + } + }) + + var browsers []string + if err := json.NewDecoder(resp.Body).Decode(&browsers); err != nil { + t.Fatalf("Failed to parse the list of browser User Agents: %v.", err) + } + + for _, userAgent := range browsers { + if IsCrawler(userAgent) { + t.Errorf("Browser User Agent %q is recognized as a crawler.", userAgent) + } + indices := MatchingCrawlers(userAgent) + if len(indices) != 0 { + t.Errorf("Browser User Agent %q matches with crawlers %v.", userAgent, indices) + } + } +} + const ( crawlerUA = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36 Google (+https://developers.google.com/+/web/snippet/" browserUA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) 
AppleWebKit/537.36 (KHTML, like Gecko) obsidian/1.5.3 Chrome/114.0.5735.289 Electron/25.8.1 Safari/537.36"