-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtokenizer_utils.go
135 lines (116 loc) · 3.71 KB
/
tokenizer_utils.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
package anitogo
import "strconv"
// newTokenizer returns a freshly allocated tokenizer for filename that uses
// the given options and the caller-supplied token and keyword managers.
// The managers are stored as passed in; no copy is made.
func newTokenizer(filename string, options TokenizerOptions, tkman *tokenManager, keyman *keywordManager) *tokenizer {
	var tz tokenizer
	tz.tokenManager, tz.keywordManager = tkman, keyman
	tz.Filename, tz.Options = filename, options
	return &tz
}
// newTokenizerWithManagers builds a tokenizer for filename with the given
// options, creating a new keyword manager and a new token manager for it.
func newTokenizerWithManagers(filename string, options TokenizerOptions) *tokenizer {
	// The keyword manager is created first, then the token manager.
	var (
		keywords = newKeywordManager()
		tokenMgr = newTokenManager()
	)
	return newTokenizer(filename, options, tokenMgr, keywords)
}
// newTokenizerWithDefaults builds a tokenizer for filename using
// TokenizerDefaultOptions and freshly created managers.
func newTokenizerWithDefaults(filename string) *tokenizer {
	defaults := TokenizerDefaultOptions
	return newTokenizerWithManagers(filename, defaults)
}
// FindPreviousValidToken looks up the token that precedes tkn in the
// tokenizer's token list, restricting the search with tokenFlagsValid.
// The result and error of tokens.FindPrevious are returned unchanged.
func (t tokenizer) FindPreviousValidToken(tkn token) (*token, error) {
	mgr := t.tokenManager
	return mgr.tokens.FindPrevious(tkn, tokenFlagsValid)
}
// FindNextValidToken looks up the token that follows tkn in the tokenizer's
// token list, restricting the search with tokenFlagsValid. The result and
// error of tokens.FindNext are returned unchanged.
func (t tokenizer) FindNextValidToken(tkn token) (*token, error) {
	mgr := t.tokenManager
	return mgr.tokens.FindNext(tkn, tokenFlagsValid)
}
// isDelimiterToken reports whether tkn is a non-empty token whose category
// is tokenCategoryDelimiter.
func isDelimiterToken(tkn token) bool {
	if tkn.Empty() {
		return false
	}
	return tkn.Category == tokenCategoryDelimiter
}
// isUnknownToken reports whether tkn is a non-empty token whose category is
// tokenCategoryUnknown.
func isUnknownToken(tkn token) bool {
	if tkn.Empty() {
		return false
	}
	return tkn.Category == tokenCategoryUnknown
}
// isSingleCharacterToken reports whether tkn is an unknown token whose
// content is exactly one character other than "-".
//
// The length is measured in runes rather than bytes so that a single
// non-ASCII character (for example "é" or "日", which occupy several bytes
// in UTF-8) is still recognised as one character.
func isSingleCharacterToken(tkn token) bool {
	if !isUnknownToken(tkn) {
		return false
	}
	// len([]rune(s)) is compiled to a rune count without allocating.
	return len([]rune(tkn.Content)) == 1 && tkn.Content != "-"
}
// AppendToken concatenates the content of tkn onto the receiver and then
// marks tkn as tokenCategoryInvalid, signalling that it has been absorbed.
func (t *token) AppendToken(tkn *token) {
	merged := t.Content + tkn.Content
	t.Content = merged
	tkn.Category = tokenCategoryInvalid
}
// ValidateDelimiterTokens inspects every delimiter token together with its
// nearest valid neighbours and decides whether the delimiter really separates
// two elements or is part of the surrounding text. Tokens that get merged
// into a neighbour are marked invalid by AppendToken and are removed from the
// token list once all delimiters have been examined.
//
// For each delimiter the following checks run in order:
//
//  1. Single-character neighbours (delimiters other than " " and "_"): a
//     single-character token before the delimiter absorbs the delimiter and
//     the whole run of unknown tokens joined by that same delimiter
//     (e.g. "a.b.c"); a single-character token after it is merged as well.
//  2. Adjacent delimiters: a delimiter directly followed by " " or "_" is
//     attached to the preceding unknown token; a delimiter enclosed by two
//     identical, different delimiters becomes an unknown token
//     (e.g. "&" in "_&_").
//  3. "&" and "+" between two numeric tokens are merged into one token
//     (e.g. "01+02").
//
// A delimiter that has no valid token on one of its sides is left untouched.
func (t *tokenizer) ValidateDelimiterTokens() {
	for _, tkn := range t.tokenManager.tokens {
		if tkn.Category != tokenCategoryDelimiter {
			continue
		}
		delimiter := tkn.Content

		// Every check below needs both neighbours, so a delimiter at either
		// edge of the token list is skipped.
		prevToken, err := t.FindPreviousValidToken(*tkn)
		if err != nil {
			continue
		}
		nextToken, err := t.FindNextValidToken(*tkn)
		if err != nil {
			continue
		}

		// Check for single-character tokens to prevent splitting up group
		// names, keywords, episode numbers, etc.
		if delimiter != " " && delimiter != "_" {
			if isSingleCharacterToken(*prevToken) {
				prevToken.AppendToken(tkn)
				for isUnknownToken(*nextToken) {
					prevToken.AppendToken(nextToken)
					// Advance the loop variable itself (no shadowing) and
					// stop as soon as the token list is exhausted.
					following, findErr := t.FindNextValidToken(*nextToken)
					if findErr != nil {
						break
					}
					nextToken = following
					if isDelimiterToken(*nextToken) && nextToken.Content == delimiter {
						prevToken.AppendToken(nextToken)
						following, findErr = t.FindNextValidToken(*nextToken)
						if findErr != nil {
							break
						}
						nextToken = following
					}
				}
				continue
			}
			if isSingleCharacterToken(*nextToken) {
				prevToken.AppendToken(tkn)
				prevToken.AppendToken(nextToken)
				continue
			}
		}

		// Check for adjacent delimiters.
		switch {
		case isUnknownToken(*prevToken) && isDelimiterToken(*nextToken):
			nextDelimiter := nextToken.Content
			if delimiter != nextDelimiter && delimiter != "," &&
				(nextDelimiter == " " || nextDelimiter == "_") {
				prevToken.AppendToken(tkn)
			}
		case isDelimiterToken(*prevToken) && isDelimiterToken(*nextToken):
			prevDelimiter := prevToken.Content
			nextDelimiter := nextToken.Content
			if prevDelimiter == nextDelimiter && prevDelimiter != delimiter {
				tkn.Category = tokenCategoryUnknown // e.g. "&" in "_&_"
			}
		}

		// Check for other special cases.
		if (delimiter == "&" || delimiter == "+") &&
			isUnknownToken(*prevToken) && isUnknownToken(*nextToken) {
			// Both sides must parse as base-10 integers that fit in 16 bits.
			_, prevErr := strconv.ParseInt(prevToken.Content, 10, 16)
			_, nextErr := strconv.ParseInt(nextToken.Content, 10, 16)
			if prevErr == nil && nextErr == nil {
				prevToken.AppendToken(tkn)
				prevToken.AppendToken(nextToken) // e.g. "01+02"
			}
		}
	}

	// Drop every token that was absorbed into a neighbour above.
	var newTokens tokens
	for _, tk := range t.tokenManager.tokens {
		if tk.Category != tokenCategoryInvalid {
			newTokens = append(newTokens, tk)
		}
	}
	t.tokenManager.tokens = newTokens
}