Skip to content

Commit

Permalink
fix: improve host name detection
Browse files Browse the repository at this point in the history
  • Loading branch information
ldez committed Apr 19, 2024
1 parent b5ecf9a commit f33d7fd
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 5 deletions.
25 changes: 21 additions & 4 deletions notwords.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,17 @@ import (
"bytes"
"regexp"
"strings"
"unicode"
)

var (
	// reEmail matches email-like strings such as "nickg+junk@xfoobar.com".
	// The match includes the single non-letter character that must follow
	// the 2-6 letter TLD, so an address at the very end of the input is not matched.
	reEmail = regexp.MustCompile(`[[:alnum:]_.%+-]+@[[:alnum:]-.]+\.[[:alpha:]]{2,6}[^[:alpha:]]`)

	// reBackslash matches a backslash followed by one lowercase ASCII letter, e.g. `\n`.
	reBackslash = regexp.MustCompile(`\\[[:lower:]]`)

	// reHost matches host-name-like strings: one or more dot-terminated labels
	// made of ASCII letters, digits and hyphens, followed by a TLD.
	// A TLD has at least 2 letters; its upper bound of 63 comes from the maximum
	// length of a single label (https://www.ietf.org/rfc/rfc2181.txt).
	// The leading labels are not length-limited by this pattern.
	reHost = regexp.MustCompile(`([[:alnum:]-]+\.)+[[:alpha:]]{2,63}`)
)

// RemovePath attempts to strip away embedded file system paths, e.g.
Expand Down Expand Up @@ -62,14 +67,26 @@ func replaceWithBlanks(s string) string {
return strings.Repeat(" ", len(s))
}

// replaceHost blanks out a host-like match the same way replaceWithBlanks does,
// except that a match containing at least one uppercase letter is returned untouched.
// Domain names are case-insensitive, but browsers and DNS convert uppercase to
// lower case (https://www.ietf.org/rfc/rfc4343.txt), so an uppercase letter is
// taken as a sign that the match is not a host name.
func replaceHost(s string) string {
	// IndexFunc reports -1 when no rune satisfies unicode.IsUpper.
	if strings.IndexFunc(s, unicode.IsUpper) >= 0 {
		return s
	}

	return replaceWithBlanks(s)
}

// RemoveEmail removes email-like strings, e.g. "nickg+junk@xfoobar.com", "nickg@xyz.abc123.biz".
// Every reEmail match is passed through replaceWithBlanks; text outside the matches is kept as is.
func RemoveEmail(s string) string {
	blanked := reEmail.ReplaceAllStringFunc(s, replaceWithBlanks)

	return blanked
}

// RemoveHost removes host-like strings such as "foobar.com" or "abc123.fo1231.biz".
// Every reHost match is passed through replaceHost, so a match that contains an
// uppercase letter (e.g. "s.svc.GetObject") is left as is.
func RemoveHost(s string) string {
	return reHost.ReplaceAllStringFunc(s, replaceHost)
}

// RemoveBackslashEscapes removes characters that are preceded by a backslash.
Expand Down
14 changes: 13 additions & 1 deletion notwords_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,20 @@ func TestNotWords(t *testing.T) {
{word: "[/foo/bar] abc", want: "[ ] abc"},
{word: "/", want: "/"},
{word: "x nickg@client9.xxx y", want: "x y"},
{word: "x fqdn.example.org. y", want: "x . y"},
{word: "x infinitie.net y", want: "x y"},
{word: "(s.svc.GetObject(", want: "( ("},
{word: "x infinitie.net ", want: "x "},
{word: "x infinitie.net", want: "x "},
{word: "x foo.example.com y", want: "x y"},
{word: "x foo.example.com ", want: "x "},
{word: "x foo.example.com", want: "x "},
{word: "foo.example.com y", want: " y"},
{word: "foo.example.com", want: " "},
{word: "(s.svc.GetObject(", want: "(s.svc.GetObject("},
{word: "defer file.Close()", want: "defer file.Close()"},
{word: "defer file.c()", want: "defer file.c()"},
{word: "defer file.cl()", want: "defer ()"}, // false negative
{word: "defer file.close()", want: "defer ()"}, // false negative
{word: "\\nto", want: " to"},
}

Expand Down

0 comments on commit f33d7fd

Please sign in to comment.