Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update Parser Regex #43

Merged
merged 5 commits into from
Jul 28, 2022
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 24 additions & 11 deletions email/parser.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,21 @@ const (
)

var (
// extractSkylink64RE is a regex that is capable of extracting base-64
// encoded skylinks from text
extractSkylink64RE = regexp.MustCompile(".+?://.+?\\..+?/([a-zA-Z0-9-_]{46})")
// extractSkylink64RE and extractSkylink64RE_2 are regexes capable of
// extracting base-64 encoded skylinks from text
extractSkylink64RE = regexp.MustCompile(`.+?://.+?\..+?/([a-zA-Z0-9-_]{46})`)
extractSkylink64RE_2 = regexp.MustCompile(`(http.+|hxxp.+|\..+|://.+|^)([a-zA-Z0-9-_]{46})(\?.*)?$`)

// extractSkylink32RE is a regex that is capable of extracting base-32
// encoded skylinks from text
extractSkylink32RE = regexp.MustCompile(".+?://.*?([a-zA-Z0-9-_]{55})")
// extractSkylink32RE and extractSkylink32RE_2 are regexes capable of
// extracting base-32 encoded skylinks from text
extractSkylink32RE = regexp.MustCompile(`(?i).+?://.*?([a-z0-9]{55})`)
extractSkylink32RE_2 = regexp.MustCompile(`(?i)(http.+|hxxp.+|\..+|://.+|^)([a-z0-9]{55})(\?.*)?$`)

validateSkylink64RE = regexp.MustCompile("^([a-zA-Z0-9-_]{46})$")
validateSkylink32RE = regexp.MustCompile("^([a-zA-Z0-9-_]{55})$")
// space matches all whitespace
space = regexp.MustCompile(`\s+`)

validateSkylink64RE = regexp.MustCompile(`^([a-zA-Z0-9-_]{46})$`)
validateSkylink32RE = regexp.MustCompile(`(?i)^([a-z0-9]{55})$`)
)

type (
Expand Down Expand Up @@ -305,11 +310,19 @@ func extractSkylinks(input []byte) []string {
for sc.Scan() {
for _, line := range []string{
sc.Text(),
strings.ReplaceAll(sc.Text(), " ", ""),
space.ReplaceAllString(sc.Text(), ""),
} {
for _, matches := range append(
base64matches := append(
extractSkylink64RE.FindAllStringSubmatch(line, -1),
extractSkylink32RE.FindAllStringSubmatch(line, -1)...,
extractSkylink64RE_2.FindAllStringSubmatch(line, -1)...,
)
base32matches := append(
extractSkylink32RE.FindAllStringSubmatch(line, -1),
extractSkylink32RE_2.FindAllStringSubmatch(line, -1)...,
)
for _, matches := range append(
base64matches,
base32matches...,
) {
for _, match := range matches {
if validateSkylink64RE.Match([]byte(match)) {
Expand Down
27 changes: 19 additions & 8 deletions email/parser_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ var (
hxxps:// siasky [.] net/CADEnmNNR6arnyDSH60MlGjQK5O3Sv-ecK1PGt3MNmQUhA#apg@franklinbank [.] com
hxxps:// siasky [.] net/GABJJhT8AlfNh-XS-6YVH8en7O-t377ej9XS2eclnv2yFg

https:// siasky [.]netAAAg4mZrsNcedNPazZ4kSFAYBzf7f8ZgHO1Tu1L-NN8Gjg
BBBg4mZrsNcedNPazZ4kSFAYBzf7f8ZgHO1Tu1L-NN8Gjg

As a reminder, phishing is expressly prohibited by our Universal Terms of Service Agreement, paragraph 7. "Acceptable Use Policy (AUP)"
`)

Expand Down Expand Up @@ -254,17 +257,21 @@ func testExtractSkylinks(t *testing.T) {

// extract skylinks
skylinks = extractSkylinks(exampleBody)
if len(skylinks) != 4 {
t.Fatalf("unexpected amount of skylinks found, %v != 4", len(skylinks))
sort.Strings(skylinks)
if len(skylinks) != 6 {
t.Fatalf("unexpected amount of skylinks found, %v != 6, skylinks %+v", len(skylinks), skylinks)
}

// assert we have extracted the correct skylinks
//
// NOTE: IGzqsAmjjLJjN3Or8ZFb9AGX4Km12EJu5AVmgaX8HWNy7Q was not discovered;
// because it is a valid skylink, discovering it could have been a false
// positive
sort.Strings(skylinks)
if skylinks[0] != "CADEnmNNR6arnyDSH60MlGjQK5O3Sv-ecK1PGt3MNmQUhA" ||
skylinks[1] != "GABJJhT8AlfNh-XS-6YVH8en7O-t377ej9XS2eclnv2yFg" || skylinks[2] != "GAEE7l0IkIVcVEHDgRCcNkRYS8keZKr9v_ffxf9_614m6g" || skylinks[3] != "nAA_hbtNaOYyR2WrM9UNIc5jRu4WfGy5QK_iTGosDgLmSA" {
if skylinks[0] != "AAAg4mZrsNcedNPazZ4kSFAYBzf7f8ZgHO1Tu1L-NN8Gjg" ||
skylinks[1] != "BBBg4mZrsNcedNPazZ4kSFAYBzf7f8ZgHO1Tu1L-NN8Gjg" ||
skylinks[2] != "CADEnmNNR6arnyDSH60MlGjQK5O3Sv-ecK1PGt3MNmQUhA" ||
skylinks[3] != "GABJJhT8AlfNh-XS-6YVH8en7O-t377ej9XS2eclnv2yFg" ||
skylinks[4] != "GAEE7l0IkIVcVEHDgRCcNkRYS8keZKr9v_ffxf9_614m6g" ||
skylinks[5] != "nAA_hbtNaOYyR2WrM9UNIc5jRu4WfGy5QK_iTGosDgLmSA" {
t.Fatal("unexpected skylinks", skylinks)
}

Expand All @@ -277,9 +284,13 @@ func testExtractSkylinks(t *testing.T) {
hxxps:// 7g01n1fmusamd3k4c5l7ahb39356rfhfs92e9mjshj1vq93vk891m2o [.] siasky [.] net

hxxps:// [.] eu-ger-1 [.] siasky [.] net / 1005m6ki628f5t2o74h1qirph34lcavbn52oj7e2oan533sj3cgbr1o

hxxps:// [.] eu-ger-1 [.] siasky [.] net2005m6KI628f5t2o74h1qirph34lcavbn52oj7e2oan533sj3cgbr2b

3005m6ki628f5t2o74h1qirph34lcavbn52oj7e2oan533sj3cgbr2b
`))
if len(skylinks) != 2 {
t.Fatalf("unexpected amount of skylinks found, %v != 2", len(skylinks))
if len(skylinks) != 4 {
t.Fatalf("unexpected amount of skylinks found, %v != 4, skylinks: %v", len(skylinks), skylinks)
}

// NOTE: it will have loaded the base32 encoded version Skylink and output
Expand Down Expand Up @@ -474,7 +485,7 @@ func testBuildAbuseReport(t *testing.T) {
// because we use the example email body we can rest assured it's correct,
// as the unit tests cover that as well
pr := updated.ParseResult
if len(pr.Skylinks) != 4 {
if len(pr.Skylinks) != 6 {
t.Fatal("unexpected amount of skylinks", pr.Skylinks)
}
if len(pr.Tags) != 1 {
Expand Down