From 544b75fd7f36e430df99caa85ae33641055e67bd Mon Sep 17 00:00:00 2001 From: Kirill Date: Mon, 5 Apr 2021 15:02:58 +0300 Subject: [PATCH 1/2] Added GitHub Actions workflow to reproduce races For #9 - reproducing data races in `licensedb.Detect()`: added a workflow that runs the tests with the `-race` flag enabled. --- .github/workflows/race.yml | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 .github/workflows/race.yml diff --git a/.github/workflows/race.yml b/.github/workflows/race.yml new file mode 100644 index 0000000..6d3d8e2 --- /dev/null +++ b/.github/workflows/race.yml @@ -0,0 +1,14 @@ +on: [push, pull_request] +name: Race +jobs: + race: + runs-on: ubuntu-latest + steps: + - name: Install Go + uses: actions/setup-go@v1 + with: + go-version: 1.14.x + - name: Checkout code + uses: actions/checkout@v2 + - name: Test race + run: go test -v -race ./... From b748bccfe75ed456b9da9fa43cd46f6aa43539cd Mon Sep 17 00:00:00 2001 From: Kirill Date: Thu, 15 Apr 2021 18:44:13 +0300 Subject: [PATCH 2/2] Fixes #9 - added two mutexes for problematic calls Added a mutex for the prose chunk processor, which uses a single regex instance in a non-thread-safe way, by wrapping the chunk.Chunk() call with the mutex; added a mutex for the RST-to-Markdown processor, whose parser is also reported by the race detector. 
--- licensedb/internal/nlp.go | 13 +++++++++++-- licensedb/internal/processors/markup.go | 7 +++++++ 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/licensedb/internal/nlp.go b/licensedb/internal/nlp.go index 11afd62..6162ad8 100644 --- a/licensedb/internal/nlp.go +++ b/licensedb/internal/nlp.go @@ -4,6 +4,7 @@ import ( "regexp" "sort" "strings" + "sync" "github.com/jdkato/prose/chunk" "github.com/jdkato/prose/tag" @@ -18,7 +19,8 @@ var ( digitsRe = regexp.MustCompile(`[0-9]+`) disabledNamePartsRe = regexp.MustCompile(`clause|or|only|deprecated|later`) - tagger = tag.NewPerceptronTagger() + tagger = tag.NewPerceptronTagger() + chunkLock sync.Mutex ) // investigateReadmeFile uses NER to match license name mentions. @@ -64,7 +66,8 @@ func investigateReadmeFile( } suspectedText := text[beginIndex:endIndex] suspectedWords := tokenize.TextToWords(suspectedText) - for _, entity := range chunk.Chunk(tagger.Tag(suspectedWords), chunk.TreebankNamedEntities) { + chunks := readmeChunks(tagger.Tag(suspectedWords)) + for _, entity := range chunks { if garbageReadmeRe.MatchString(entity) { continue } @@ -113,6 +116,12 @@ func investigateReadmeFile( return candidates } +func readmeChunks(tokens []tag.Token) []string { + chunkLock.Lock() + defer chunkLock.Unlock() + return chunk.Chunk(tokens, chunk.TreebankNamedEntities) +} + func splitLicenseName(name string) []substring { counts := map[string]int{} parts := licenseNamePartRe.FindAllString(strings.ToLower(name), -1) diff --git a/licensedb/internal/processors/markup.go b/licensedb/internal/processors/markup.go index a863578..691afef 100644 --- a/licensedb/internal/processors/markup.go +++ b/licensedb/internal/processors/markup.go @@ -2,11 +2,16 @@ package processors import ( "bytes" + "sync" rst "github.com/hhatto/gorst" "github.com/russross/blackfriday/v2" ) +var ( + parserLock sync.Mutex +) + // Markdown converts Markdown to plain text. It tries to revert all the decorations. 
func Markdown(text []byte) []byte { html := blackfriday.Run(text) @@ -17,6 +22,8 @@ func Markdown(text []byte) []byte { // RestructuredText converts ReStructuredText to plain text. // It tries to revert all the decorations. func RestructuredText(text []byte) []byte { + parserLock.Lock() + defer parserLock.Unlock() parser := rst.NewParser(nil) input := bytes.NewBuffer(text) output := &bytes.Buffer{}