Make IsVendor quicker

Although iterating across the regexps is quicker than naively concatenating them, it is still quite slow. This PR proposes a slightly cleverer solution. First, instead of just concatenating with capturing groups, this PR uses non-capturing groups, which speeds up the regexp processing. Secondly, we split the regexps into three groups: those that have to match at the start of the path, those that match at the start of the path or of a path segment, and the rest. This makes a considerable speed improvement. Thirdly, the regexps are sorted within those groups, which also speeds things up. All in all, for a non-vendored file this makes IsVendor around twice as fast. Signed-off-by: Andrew Thornton <art27@cantab.net>
go-enry · Apr 23, 2021 · 20726a1 · 20726a1
1 parent d2d4c32
commit 20726a1
Showing 1 changed file with 114 additions and 1 deletion.
diff --git a/utils.go b/utils.go
@@ -3,6 +3,8 @@ package enry
 import (
 	"bytes"
 	"path/filepath"
+	"regexp"
+	"sort"
 	"strings"
 
 	"github.com/go-enry/go-enry/v2/data"
@@ -61,9 +63,11 @@ func IsDotFile(path string) bool {
 	return strings.HasPrefix(base, ".") && base != "."
 }
 
+var isVendorRegExp *regexp.Regexp // collated from data.VendorMatchers and compiled once in init
+
 // IsVendor returns whether or not path is a vendor path.
 func IsVendor(path string) bool {
-	return matchRegexSlice(data.VendorMatchers, path)
+	return isVendorRegExp.MatchString(path) // one match against the pattern init collates from data.VendorMatchers
 }
 
 // IsTest returns whether or not path is a test path.
@@ -131,3 +135,112 @@ func IsGenerated(path string, content []byte) bool {
 
 	return false
 }
+
+func init() {
+	// We now collate the individual regexps that make up the VendorMatchers to
+	// produce a single large regexp which is around twice as fast to test than
+	// simply iterating through all the regexps or naïvely collating the
+	// regexps.
+	//
+	// ---
+	//
+	// data.VendorMatchers here is a slice containing individual regexps that
+	// match a vendor file therefore if we want to test if a filename is a
+	// Vendor we need to test whether that filename matches one or more of
+	// those regexps.
+	//
+	// Now we could test each matcher in turn using a shortcircuiting test i.e.
+	//
+	//  	func IsVendor(filename string) bool {
+	// 			for _, matcher := range data.VendorMatchers {
+	// 				if matcher.Match(filename) {
+	//					return true
+	//				}
+	//			}
+	//			return false
+	//		}
+	//
+	// Or concatentate all these regexps using groups i.e.
+	//
+	//		`(regexp1)|(regexp2)|(regexp3)|...`
+	//
+	// However both of these are relatively slow and they don't take advantage
+	// of the inherent structure within our regexps...
+	//
+	// If we look at our regexps there are essentially three types of regexp:
+	//
+	// 1. Those that start with `^`
+	// 2. Those that start with `(^|/)`
+	// 3. Others
+	//
+	// If we collate our regexps into these groups that will significantly
+	// reduce the likelihood of backtracking within the regexp trie matcher.
+	//
+	// A further improvement is to use non-capturing groups as otherwise the
+	// regexp parser, whilst matching, will have to allocate slices for
+	// matching positions. (A future improvement here could be in the use of
+	// enforcing non-capturing groups within the sub-regexps too.)
+	//
+	// Finally if we sort the segments we can help the matcher build a more
+	// efficient matcher and trie.
+
+	// alias the VendorMatchers to simplify things
+	matchers := data.VendorMatchers
+
+	// Create three temporary string slices for our three groups above - prefixes removed
+	caretStrings := make([]string, 0, 10)
+	caretSegmentStrings := make([]string, 0, 10)
+	matcherStrings := make([]string, 0, len(matchers))
+
+	// Walk the matchers and check their string representation for each group prefix, remove it and add to the respective group slices
+	for _, matcher := range matchers {
+		str := matcher.String()
+		if str[0] == '^' {
+			caretStrings = append(caretStrings, str[1:])
+		} else if str[0:5] == "(^|/)" {
+			caretSegmentStrings = append(caretSegmentStrings, str[5:])
+		} else {
+			matcherStrings = append(matcherStrings, str)
+		}
+	}
+
+	// Sort the strings within each group - a potential further improvement could be in simplifying within these groups
+	sort.Strings(caretSegmentStrings)
+	sort.Strings(caretStrings)
+	sort.Strings(matcherStrings)
+
+	// Now build the collated regexp
+	sb := &strings.Builder{}
+
+	// Start with group 1 - those that started with `^`
+	sb.WriteString("(?:^(?:")
+	sb.WriteString(caretStrings[0])
+	for _, matcher := range caretStrings[1:] {
+		sb.WriteString(")|(?:")
+		sb.WriteString(matcher)
+	}
+	sb.WriteString("))")
+	sb.WriteString("|")
+
+	// Now add group 2 - those that started with `(^|/)`
+	sb.WriteString("(?:(?:^|/)(?:")
+	sb.WriteString(caretSegmentStrings[0])
+	for _, matcher := range caretSegmentStrings[1:] {
+		sb.WriteString(")|(?:")
+		sb.WriteString(matcher)
+	}
+	sb.WriteString("))")
+	sb.WriteString("|")
+
+	// Finally add the rest
+	sb.WriteString("(?:")
+	sb.WriteString(matcherStrings[0])
+	for _, matcher := range matcherStrings[1:] {
+		sb.WriteString(")|(?:")
+		sb.WriteString(matcher)
+	}
+	sb.WriteString(")")
+
+	// Compile the whole thing as the isVendorRegExp
+	isVendorRegExp = regexp.MustCompile(sb.String())
+}