diff --git a/README.md b/README.md
index 5ec3111..44e5a6f 100644
--- a/README.md
+++ b/README.md
@@ -47,18 +47,18 @@ If you want to make sure that the lexical specification behaves as expected, you
 ⚠️ An encoding that `maleeni lex` and the driver can handle is only UTF-8.
 
 ```sh
-$ echo -n 'The truth is out there.' | maleeni lex clexspec.json | jq -r '[.kind_id, .kind_name, .text, .eof] | @csv'
-2,"word","The",false
-1,"whitespace"," ",false
-2,"word","truth",false
-1,"whitespace"," ",false
-2,"word","is",false
-1,"whitespace"," ",false
-2,"word","out",false
-1,"whitespace"," ",false
-2,"word","there",false
-3,"punctuation",".",false
-0,"","",true
+$ echo -n 'The truth is out there.' | maleeni lex clexspec.json | jq -r '[.kind_name, .lexeme, .eof] | @csv'
+"word","The",false
+"whitespace"," ",false
+"word","truth",false
+"whitespace"," ",false
+"word","is",false
+"whitespace"," ",false
+"word","out",false
+"whitespace"," ",false
+"word","there",false
+"punctuation",".",false
+"","",true
 ```
 
 The JSON format of tokens that `maleeni lex` command prints is as follows:
@@ -72,8 +72,7 @@ The JSON format of tokens that `maleeni lex` command prints is as follows:
 | kind_name | string | A name of a lexical kind. |
 | row | integer | A row number where a lexeme appears. |
 | col | integer | A column number where a lexeme appears. Note that `col` is counted in code points, not bytes. |
-| match | array of integers | A byte sequense of a lexeme. |
-| text | string | A string representation of a lexeme. |
+| lexeme | string | A string representation of a lexeme. |
 | eof | bool | When this field is `true`, it means the token is the EOF token. |
 | invalid | bool | When this field is `true`, it means the token is an error token. |
 
@@ -336,7 +335,7 @@ For instance, you can define a subset of [the string literal of golang](https://
 In the above specification, when the `"` mark appears in default mode (it's the initial mode), the driver transitions to the `string` mode and interprets character sequences (`char_seq`) and escape sequences (`escaped_char`). When the `"` mark appears the next time, the driver returns to the `default` mode.
 
 ```sh
-$ echo -n '"foo\nbar"foo' | maleeni lex go-string-cspec.json | jq -r '[.mode_name, .kind_name, .text, .eof] | @csv'
+$ echo -n '"foo\nbar"foo' | maleeni lex go-string-cspec.json | jq -r '[.mode_name, .kind_name, .lexeme, .eof] | @csv'
 "default","string_open","""",false
 "string","char_seq","foo",false
 "string","escaped_char","\n",false
diff --git a/cmd/maleeni/lex.go b/cmd/maleeni/lex.go
index d31ab17..7bc7126 100644
--- a/cmd/maleeni/lex.go
+++ b/cmd/maleeni/lex.go
@@ -53,7 +53,7 @@ func runLex(cmd *cobra.Command, args []string) (retErr error) {
 		defer f.Close()
 		src = f
 	}
-	lex, err = driver.NewLexer(clspec, src)
+	lex, err = driver.NewLexer(driver.NewLexSpec(clspec), src)
 	if err != nil {
 		return err
 	}
diff --git a/driver/lexer.go b/driver/lexer.go
index bce8d8c..d8230d2 100644
--- a/driver/lexer.go
+++ b/driver/lexer.go
@@ -5,71 +5,60 @@ import (
 	"fmt"
 	"io"
 	"io/ioutil"
-	"strings"
-
-	"github.com/nihei9/maleeni/spec"
 )
 
-type byteSequence []byte
+type ModeID int
 
-func newByteSequence(b []byte) byteSequence {
-	return byteSequence(b)
+func (id ModeID) Int() int {
+	return int(id)
 }
 
-func (s byteSequence) ByteSlice() []byte {
-	return []byte(s)
-}
+type StateID int
 
-func (s byteSequence) String() string {
-	if len(s) <= 0 {
-		return ""
-	}
-	var b strings.Builder
-	fmt.Fprintf(&b, "%X", s[0])
-	for _, d := range s[1:] {
-		fmt.Fprintf(&b, " %X", d)
-	}
-	return b.String()
+func (id StateID) Int() int {
+	return int(id)
 }
 
-func (s byteSequence) GoString() string {
-	return fmt.Sprintf("\"%v\"", s.String())
+type KindID int
+
+func (id KindID) Int() int {
+	return int(id)
 }
 
-func (s byteSequence) MarshalJSON() ([]byte, error) {
-	if len(s) <= 0 {
-		return []byte("[]"), nil
-	}
-	var b strings.Builder
-	fmt.Fprintf(&b, "[%v", uint8(s[0]))
-	for _, e := range s[1:] {
-		fmt.Fprintf(&b, ", %v", uint8(e))
-	}
-	fmt.Fprintf(&b, "]")
-	return []byte(b.String()), nil
+type ModeKindID int
+
+func (id ModeKindID) Int() int {
+	return int(id)
 }
 
-func (s byteSequence) merge(a byteSequence) byteSequence {
-	return append([]byte(s), []byte(a)...)
+type LexSpec interface {
+	InitialMode() ModeID
+	Pop(mode ModeID, modeKind ModeKindID) bool
+	Push(mode ModeID, modeKind ModeKindID) (ModeID, bool)
+	ModeName(mode ModeID) string
+	InitialState(mode ModeID) StateID
+	NextState(mode ModeID, state StateID, v int) (StateID, bool)
+	Accept(mode ModeID, state StateID) (ModeKindID, bool)
+	KindIDAndName(mode ModeID, modeKind ModeKindID) (KindID, string)
 }
 
 // Token representes a token.
 type Token struct {
 	// ModeID is an ID of a lex mode.
-	ModeID spec.LexModeID
+	ModeID ModeID
 
 	// ModeName is a name of a lex mode.
-	ModeName spec.LexModeName
+	ModeName string
 
 	// KindID is an ID of a kind. This is unique among all modes.
-	KindID spec.LexKindID
+	KindID KindID
 
 	// ModeKindID is an ID of a lexical kind. This is unique only within a mode.
 	// Note that you need to use KindID field if you want to identify a kind across all modes.
-	ModeKindID spec.LexModeKindID
+	ModeKindID ModeKindID
 
 	// KindName is a name of a lexical kind.
-	KindName spec.LexKindName
+	KindName string
 
 	// Row is a row number where a lexeme appears.
 	Row int
@@ -78,59 +67,37 @@ type Token struct {
 	// Note that Col is counted in code points, not bytes.
 	Col int
 
+	// Lexeme is a byte sequence that matched a pattern of a lexical specification.
+	Lexeme []byte
+
 	// When this field is true, it means the token is the EOF token.
 	EOF bool
 
 	// When this field is true, it means the token is an error token.
Invalid bool - - // match is a byte sequence matched a pattern of a lexical specification. - match byteSequence -} - -func (t *Token) String() string { - if t.Invalid { - return fmt.Sprintf("!{mode id: %v, mode name: %v, row: %v, col: %v, text: %v, byte: %v}", t.ModeID, t.ModeName, t.Row, t.Col, t.Text(), t.Match()) - } - if t.EOF { - return fmt.Sprintf("{kind name: eof, row: %v, col: %v}", t.Row, t.Col) - } - return fmt.Sprintf("{mode id: %v, mode name: %v, kind id: %v, mode kind id: %v, kind name: %v, row: %v, col: %v, text: %v, byte: %v}", t.ModeID, t.ModeName, t.KindID, t.ModeKindID, t.KindName, t.Row, t.Col, t.Text(), t.Match()) -} - -// Match returns a byte slice matched a pattern of a lexical specification. -func (t *Token) Match() []byte { - return t.match.ByteSlice() -} - -// Text returns a string representation of a matched byte sequence. -func (t *Token) Text() string { - return string(t.Match()) } func (t *Token) MarshalJSON() ([]byte, error) { return json.Marshal(struct { - ModeID int `json:"mode_id"` - ModeName string `json:"mode_name"` - KindID int `json:"kind_id"` - ModeKindID int `json:"mode_kind_id"` - KindName string `json:"kind_name"` - Row int `json:"row"` - Col int `json:"col"` - Match byteSequence `json:"match"` - Text string `json:"text"` - EOF bool `json:"eof"` - Invalid bool `json:"invalid"` + ModeID int `json:"mode_id"` + ModeName string `json:"mode_name"` + KindID int `json:"kind_id"` + ModeKindID int `json:"mode_kind_id"` + KindName string `json:"kind_name"` + Row int `json:"row"` + Col int `json:"col"` + Lexeme string `json:"lexeme"` + EOF bool `json:"eof"` + Invalid bool `json:"invalid"` }{ ModeID: t.ModeID.Int(), - ModeName: t.ModeName.String(), + ModeName: t.ModeName, KindID: t.KindID.Int(), ModeKindID: t.ModeKindID.Int(), - KindName: t.KindName.String(), + KindName: t.KindName, Row: t.Row, Col: t.Col, - Match: t.match, - Text: t.Text(), + Lexeme: string(t.Lexeme), EOF: t.EOF, Invalid: t.Invalid, }) @@ -146,7 +113,7 @@ func DisableModeTransition() LexerOption { } type Lexer struct { - clspec *spec.CompiledLexSpec + spec LexSpec src []byte srcPtr int row int @@ -154,23 +121,23 @@ type Lexer struct { prevRow int prevCol int tokBuf []*Token - modeStack []spec.LexModeID + modeStack []ModeID passiveModeTran bool } -func NewLexer(clspec *spec.CompiledLexSpec, src io.Reader, opts ...LexerOption) (*Lexer, error) { +func NewLexer(spec LexSpec, src io.Reader, opts ...LexerOption) (*Lexer, error) { b, err := ioutil.ReadAll(src) if err != nil { return nil, err } l := &Lexer{ - clspec: clspec, + spec: spec, src: b, srcPtr: 0, row: 0, col: 0, - modeStack: []spec.LexModeID{ - clspec.InitialModeID, + modeStack: []ModeID{ + spec.InitialMode(), }, passiveModeTran: false, } @@ -207,7 +174,7 @@ func (l *Lexer) Next() (*Token, error) { if !tok.Invalid { break } - errTok.match = errTok.match.merge(tok.match) + errTok.Lexeme = append(errTok.Lexeme, tok.Lexeme...) 
} l.tokBuf = append(l.tokBuf, tok) @@ -225,15 +192,14 @@ func (l *Lexer) nextAndTransition() (*Token, error) { if l.passiveModeTran { return tok, nil } - spec := l.clspec.Specs[l.Mode()] - if spec.Pop[tok.ModeKindID] == 1 { + mode := l.Mode() + if l.spec.Pop(mode, tok.ModeKindID) { err := l.PopMode() if err != nil { return nil, err } } - mode := spec.Push[tok.ModeKindID] - if !mode.IsNil() { + if mode, ok := l.spec.Push(mode, tok.ModeKindID); ok { l.PushMode(mode) } // The checking length of the mode stack must be at after pop and push operations @@ -249,9 +215,8 @@ func (l *Lexer) nextAndTransition() (*Token, error) { func (l *Lexer) next() (*Token, error) { mode := l.Mode() - modeName := l.clspec.ModeNames[mode] - spec := l.clspec.Specs[mode] - state := spec.DFA.InitialStateID + modeName := l.spec.ModeName(mode) + state := l.spec.InitialState(mode) buf := []byte{} unfixedBufLen := 0 row := l.row @@ -271,9 +236,9 @@ func (l *Lexer) next() (*Token, error) { ModeID: mode, ModeName: modeName, ModeKindID: 0, + Lexeme: buf, Row: row, Col: col, - match: newByteSequence(buf), Invalid: true, }, nil } @@ -288,7 +253,7 @@ func (l *Lexer) next() (*Token, error) { } buf = append(buf, v) unfixedBufLen++ - nextState, ok := l.lookupNextState(mode, state, int(v)) + nextState, ok := l.spec.NextState(mode, state, int(v)) if !ok { if tok != nil { l.unread(unfixedBufLen) @@ -298,62 +263,35 @@ func (l *Lexer) next() (*Token, error) { ModeID: mode, ModeName: modeName, ModeKindID: 0, + Lexeme: buf, Row: row, Col: col, - match: newByteSequence(buf), Invalid: true, }, nil } state = nextState - modeKindID := spec.DFA.AcceptingStates[state] - if modeKindID != 0 { - kindID := l.clspec.KindIDs[mode][modeKindID] + if modeKindID, ok := l.spec.Accept(mode, state); ok { + kindID, kindName := l.spec.KindIDAndName(mode, modeKindID) tok = &Token{ ModeID: mode, ModeName: modeName, KindID: kindID, ModeKindID: modeKindID, - KindName: spec.KindNames[modeKindID], + KindName: kindName, + Lexeme: buf, Row: row, Col: col, - match: newByteSequence(buf), } unfixedBufLen = 0 } } } -func (l *Lexer) lookupNextState(mode spec.LexModeID, state spec.StateID, v int) (spec.StateID, bool) { - switch l.clspec.CompressionLevel { - case 2: - tab := l.clspec.Specs[mode].DFA.Transition - rowNum := tab.RowNums[state] - d := tab.UniqueEntries.RowDisplacement[rowNum] - if tab.UniqueEntries.Bounds[d+v] != rowNum { - return tab.UniqueEntries.EmptyValue, false - } - return tab.UniqueEntries.Entries[d+v], true - case 1: - tab := l.clspec.Specs[mode].DFA.Transition - next := tab.UncompressedUniqueEntries[tab.RowNums[state]*tab.OriginalColCount+v] - if next == spec.StateIDNil { - return spec.StateIDNil, false - } - return next, true - } - modeSpec := l.clspec.Specs[mode] - next := modeSpec.DFA.UncompressedTransition[state.Int()*modeSpec.DFA.ColCount+v] - if next == spec.StateIDNil { - return spec.StateIDNil, false - } - return next, true -} - -func (l *Lexer) Mode() spec.LexModeID { +func (l *Lexer) Mode() ModeID { return l.modeStack[len(l.modeStack)-1] } -func (l *Lexer) PushMode(mode spec.LexModeID) { +func (l *Lexer) PushMode(mode ModeID) { l.modeStack = append(l.modeStack, mode) } diff --git a/driver/lexer_test.go b/driver/lexer_test.go index 33b206f..ebb4aad 100644 --- a/driver/lexer_test.go +++ b/driver/lexer_test.go @@ -42,22 +42,29 @@ func newLexEntryFragment(kind string, pattern string) *spec.LexEntry { } } -func newToken(modeID spec.LexModeID, modeName spec.LexModeName, kindID spec.LexKindID, modeKindID spec.LexModeKindID, kindName 
spec.LexKindName, match byteSequence) *Token { +func newToken(modeID ModeID, modeName string, kindID KindID, modeKindID ModeKindID, kindName string, lexeme []byte) *Token { return &Token{ ModeID: modeID, ModeName: modeName, KindID: kindID, ModeKindID: modeKindID, KindName: kindName, - match: match, + Lexeme: lexeme, } } -func newTokenDefault(kindID int, modeKindID int, kindName string, match byteSequence) *Token { - return newToken(spec.LexModeIDDefault, spec.LexModeNameDefault, spec.LexKindID(kindID), spec.LexModeKindID(modeKindID), spec.LexKindName(kindName), match) +func newTokenDefault(kindID int, modeKindID int, kindName string, lexeme []byte) *Token { + return newToken( + ModeID(spec.LexModeIDDefault.Int()), + spec.LexModeNameDefault.String(), + KindID(spec.LexKindID(kindID).Int()), + ModeKindID(spec.LexModeKindID(modeKindID).Int()), + spec.LexKindName(kindName).String(), + lexeme, + ) } -func newEOFToken(modeID spec.LexModeID, modeName spec.LexModeName) *Token { +func newEOFToken(modeID ModeID, modeName string) *Token { return &Token{ ModeID: modeID, ModeName: modeName, @@ -67,15 +74,15 @@ func newEOFToken(modeID spec.LexModeID, modeName spec.LexModeName) *Token { } func newEOFTokenDefault() *Token { - return newEOFToken(spec.LexModeIDDefault, spec.LexModeNameDefault) + return newEOFToken(ModeID(spec.LexModeIDDefault.Int()), spec.LexModeNameDefault.String()) } -func newInvalidToken(modeID spec.LexModeID, modeName spec.LexModeName, match byteSequence) *Token { +func newInvalidToken(modeID ModeID, modeName string, lexeme []byte) *Token { return &Token{ ModeID: modeID, ModeName: modeName, ModeKindID: 0, - match: match, + Lexeme: lexeme, Invalid: true, } } @@ -103,17 +110,17 @@ func TestLexer_Next(t *testing.T) { }, src: "abb aabb aaabb babb bbabb abbbabb", tokens: []*Token{ - newTokenDefault(1, 1, "t1", newByteSequence([]byte("abb"))), - newTokenDefault(2, 2, "t2", newByteSequence([]byte(" "))), - newTokenDefault(1, 1, "t1", newByteSequence([]byte("aabb"))), - newTokenDefault(2, 2, "t2", newByteSequence([]byte(" "))), - newTokenDefault(1, 1, "t1", newByteSequence([]byte("aaabb"))), - newTokenDefault(2, 2, "t2", newByteSequence([]byte(" "))), - newTokenDefault(1, 1, "t1", newByteSequence([]byte("babb"))), - newTokenDefault(2, 2, "t2", newByteSequence([]byte(" "))), - newTokenDefault(1, 1, "t1", newByteSequence([]byte("bbabb"))), - newTokenDefault(2, 2, "t2", newByteSequence([]byte(" "))), - newTokenDefault(1, 1, "t1", newByteSequence([]byte("abbbabb"))), + newTokenDefault(1, 1, "t1", []byte("abb")), + newTokenDefault(2, 2, "t2", []byte(" ")), + newTokenDefault(1, 1, "t1", []byte("aabb")), + newTokenDefault(2, 2, "t2", []byte(" ")), + newTokenDefault(1, 1, "t1", []byte("aaabb")), + newTokenDefault(2, 2, "t2", []byte(" ")), + newTokenDefault(1, 1, "t1", []byte("babb")), + newTokenDefault(2, 2, "t2", []byte(" ")), + newTokenDefault(1, 1, "t1", []byte("bbabb")), + newTokenDefault(2, 2, "t2", []byte(" ")), + newTokenDefault(1, 1, "t1", []byte("abbbabb")), newEOFTokenDefault(), }, }, @@ -127,21 +134,21 @@ func TestLexer_Next(t *testing.T) { }, src: "ba baaa a aaa abcd abcdcdcd cd cdcdcd", tokens: []*Token{ - newTokenDefault(1, 1, "t1", newByteSequence([]byte("ba"))), - newTokenDefault(3, 3, "t3", newByteSequence([]byte(" "))), - newTokenDefault(1, 1, "t1", newByteSequence([]byte("baaa"))), - newTokenDefault(3, 3, "t3", newByteSequence([]byte(" "))), - newTokenDefault(1, 1, "t1", newByteSequence([]byte("a"))), - newTokenDefault(3, 3, "t3", newByteSequence([]byte(" "))), - newTokenDefault(1, 1, 
"t1", newByteSequence([]byte("aaa"))), - newTokenDefault(3, 3, "t3", newByteSequence([]byte(" "))), - newTokenDefault(2, 2, "t2", newByteSequence([]byte("abcd"))), - newTokenDefault(3, 3, "t3", newByteSequence([]byte(" "))), - newTokenDefault(2, 2, "t2", newByteSequence([]byte("abcdcdcd"))), - newTokenDefault(3, 3, "t3", newByteSequence([]byte(" "))), - newTokenDefault(2, 2, "t2", newByteSequence([]byte("cd"))), - newTokenDefault(3, 3, "t3", newByteSequence([]byte(" "))), - newTokenDefault(2, 2, "t2", newByteSequence([]byte("cdcdcd"))), + newTokenDefault(1, 1, "t1", []byte("ba")), + newTokenDefault(3, 3, "t3", []byte(" ")), + newTokenDefault(1, 1, "t1", []byte("baaa")), + newTokenDefault(3, 3, "t3", []byte(" ")), + newTokenDefault(1, 1, "t1", []byte("a")), + newTokenDefault(3, 3, "t3", []byte(" ")), + newTokenDefault(1, 1, "t1", []byte("aaa")), + newTokenDefault(3, 3, "t3", []byte(" ")), + newTokenDefault(2, 2, "t2", []byte("abcd")), + newTokenDefault(3, 3, "t3", []byte(" ")), + newTokenDefault(2, 2, "t2", []byte("abcdcdcd")), + newTokenDefault(3, 3, "t3", []byte(" ")), + newTokenDefault(2, 2, "t2", []byte("cd")), + newTokenDefault(3, 3, "t3", []byte(" ")), + newTokenDefault(2, 2, "t2", []byte("cdcdcd")), newEOFTokenDefault(), }, }, @@ -170,22 +177,22 @@ func TestLexer_Next(t *testing.T) { 0xf4, 0x8f, 0xbf, 0xbf, }), tokens: []*Token{ - newTokenDefault(1, 1, "t1", newByteSequence([]byte{0x00})), - newTokenDefault(1, 1, "t1", newByteSequence([]byte{0x7f})), - newTokenDefault(1, 1, "t1", newByteSequence([]byte{0xc2, 0x80})), - newTokenDefault(1, 1, "t1", newByteSequence([]byte{0xdf, 0xbf})), - newTokenDefault(1, 1, "t1", newByteSequence([]byte{0xe1, 0x80, 0x80})), - newTokenDefault(1, 1, "t1", newByteSequence([]byte{0xec, 0xbf, 0xbf})), - newTokenDefault(1, 1, "t1", newByteSequence([]byte{0xed, 0x80, 0x80})), - newTokenDefault(1, 1, "t1", newByteSequence([]byte{0xed, 0x9f, 0xbf})), - newTokenDefault(1, 1, "t1", newByteSequence([]byte{0xee, 0x80, 0x80})), - newTokenDefault(1, 1, "t1", newByteSequence([]byte{0xef, 0xbf, 0xbf})), - newTokenDefault(1, 1, "t1", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})), - newTokenDefault(1, 1, "t1", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})), - newTokenDefault(1, 1, "t1", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x80})), - newTokenDefault(1, 1, "t1", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbf})), - newTokenDefault(1, 1, "t1", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x80})), - newTokenDefault(1, 1, "t1", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbf})), + newTokenDefault(1, 1, "t1", []byte{0x00}), + newTokenDefault(1, 1, "t1", []byte{0x7f}), + newTokenDefault(1, 1, "t1", []byte{0xc2, 0x80}), + newTokenDefault(1, 1, "t1", []byte{0xdf, 0xbf}), + newTokenDefault(1, 1, "t1", []byte{0xe1, 0x80, 0x80}), + newTokenDefault(1, 1, "t1", []byte{0xec, 0xbf, 0xbf}), + newTokenDefault(1, 1, "t1", []byte{0xed, 0x80, 0x80}), + newTokenDefault(1, 1, "t1", []byte{0xed, 0x9f, 0xbf}), + newTokenDefault(1, 1, "t1", []byte{0xee, 0x80, 0x80}), + newTokenDefault(1, 1, "t1", []byte{0xef, 0xbf, 0xbf}), + newTokenDefault(1, 1, "t1", []byte{0xf0, 0x90, 0x80, 0x80}), + newTokenDefault(1, 1, "t1", []byte{0xf0, 0xbf, 0xbf, 0xbf}), + newTokenDefault(1, 1, "t1", []byte{0xf1, 0x80, 0x80, 0x80}), + newTokenDefault(1, 1, "t1", []byte{0xf3, 0xbf, 0xbf, 0xbf}), + newTokenDefault(1, 1, "t1", []byte{0xf4, 0x80, 0x80, 0x80}), + newTokenDefault(1, 1, "t1", []byte{0xf4, 0x8f, 0xbf, 0xbf}), newEOFTokenDefault(), }, }, @@ -197,17 +204,17 @@ func TestLexer_Next(t *testing.T) { }, src: 
"ab.*+?|()[]", tokens: []*Token{ - newTokenDefault(1, 1, "t1", newByteSequence([]byte("a"))), - newTokenDefault(1, 1, "t1", newByteSequence([]byte("b"))), - newTokenDefault(1, 1, "t1", newByteSequence([]byte("."))), - newTokenDefault(1, 1, "t1", newByteSequence([]byte("*"))), - newTokenDefault(1, 1, "t1", newByteSequence([]byte("+"))), - newTokenDefault(1, 1, "t1", newByteSequence([]byte("?"))), - newTokenDefault(1, 1, "t1", newByteSequence([]byte("|"))), - newTokenDefault(1, 1, "t1", newByteSequence([]byte("("))), - newTokenDefault(1, 1, "t1", newByteSequence([]byte(")"))), - newTokenDefault(1, 1, "t1", newByteSequence([]byte("["))), - newTokenDefault(1, 1, "t1", newByteSequence([]byte("]"))), + newTokenDefault(1, 1, "t1", []byte("a")), + newTokenDefault(1, 1, "t1", []byte("b")), + newTokenDefault(1, 1, "t1", []byte(".")), + newTokenDefault(1, 1, "t1", []byte("*")), + newTokenDefault(1, 1, "t1", []byte("+")), + newTokenDefault(1, 1, "t1", []byte("?")), + newTokenDefault(1, 1, "t1", []byte("|")), + newTokenDefault(1, 1, "t1", []byte("(")), + newTokenDefault(1, 1, "t1", []byte(")")), + newTokenDefault(1, 1, "t1", []byte("[")), + newTokenDefault(1, 1, "t1", []byte("]")), newEOFTokenDefault(), }, }, @@ -230,10 +237,10 @@ func TestLexer_Next(t *testing.T) { 0x7f, }), tokens: []*Token{ - newTokenDefault(1, 1, "1ByteChar", newByteSequence([]byte{0x01})), - newTokenDefault(1, 1, "1ByteChar", newByteSequence([]byte{0x02})), - newTokenDefault(1, 1, "1ByteChar", newByteSequence([]byte{0x7e})), - newTokenDefault(1, 1, "1ByteChar", newByteSequence([]byte{0x7f})), + newTokenDefault(1, 1, "1ByteChar", []byte{0x01}), + newTokenDefault(1, 1, "1ByteChar", []byte{0x02}), + newTokenDefault(1, 1, "1ByteChar", []byte{0x7e}), + newTokenDefault(1, 1, "1ByteChar", []byte{0x7f}), newEOFTokenDefault(), }, }, @@ -251,10 +258,10 @@ func TestLexer_Next(t *testing.T) { 0xdf, 0xbf, }), tokens: []*Token{ - newTokenDefault(1, 1, "2ByteChar", newByteSequence([]byte{0xc2, 0x80})), - newTokenDefault(1, 1, "2ByteChar", newByteSequence([]byte{0xc2, 0x81})), - newTokenDefault(1, 1, "2ByteChar", newByteSequence([]byte{0xdf, 0xbe})), - newTokenDefault(1, 1, "2ByteChar", newByteSequence([]byte{0xdf, 0xbf})), + newTokenDefault(1, 1, "2ByteChar", []byte{0xc2, 0x80}), + newTokenDefault(1, 1, "2ByteChar", []byte{0xc2, 0x81}), + newTokenDefault(1, 1, "2ByteChar", []byte{0xdf, 0xbe}), + newTokenDefault(1, 1, "2ByteChar", []byte{0xdf, 0xbf}), newEOFTokenDefault(), }, }, @@ -269,7 +276,7 @@ func TestLexer_Next(t *testing.T) { 0xe0, 0xa0, 0x80, }), tokens: []*Token{ - newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})), + newTokenDefault(1, 1, "3ByteChar", []byte{0xe0, 0xa0, 0x80}), newEOFTokenDefault(), }, }, @@ -287,10 +294,10 @@ func TestLexer_Next(t *testing.T) { 0xe0, 0xa0, 0xbf, }), tokens: []*Token{ - newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})), - newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})), - newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0xbe})), - newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0xbf})), + newTokenDefault(1, 1, "3ByteChar", []byte{0xe0, 0xa0, 0x80}), + newTokenDefault(1, 1, "3ByteChar", []byte{0xe0, 0xa0, 0x81}), + newTokenDefault(1, 1, "3ByteChar", []byte{0xe0, 0xa0, 0xbe}), + newTokenDefault(1, 1, "3ByteChar", []byte{0xe0, 0xa0, 0xbf}), newEOFTokenDefault(), }, }, @@ -308,10 +315,10 @@ func TestLexer_Next(t *testing.T) { 0xe0, 0xbf, 0xbf, }), tokens: []*Token{ - 
newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})), - newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})), - newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbe})), - newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbf})), + newTokenDefault(1, 1, "3ByteChar", []byte{0xe0, 0xa0, 0x80}), + newTokenDefault(1, 1, "3ByteChar", []byte{0xe0, 0xa0, 0x81}), + newTokenDefault(1, 1, "3ByteChar", []byte{0xe0, 0xbf, 0xbe}), + newTokenDefault(1, 1, "3ByteChar", []byte{0xe0, 0xbf, 0xbf}), newEOFTokenDefault(), }, }, @@ -341,22 +348,22 @@ func TestLexer_Next(t *testing.T) { 0xef, 0xbf, 0xbf, }), tokens: []*Token{ - newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x80})), - newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xe0, 0xa0, 0x81})), - newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbe})), - newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xe0, 0xbf, 0xbf})), - newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xe1, 0x80, 0x80})), - newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xe1, 0x80, 0x81})), - newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xec, 0xbf, 0xbe})), - newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xec, 0xbf, 0xbf})), - newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xed, 0x80, 0x80})), - newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xed, 0x80, 0x81})), - newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xed, 0x9f, 0xbe})), - newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xed, 0x9f, 0xbf})), - newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xee, 0x80, 0x80})), - newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xee, 0x80, 0x81})), - newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xef, 0xbf, 0xbe})), - newTokenDefault(1, 1, "3ByteChar", newByteSequence([]byte{0xef, 0xbf, 0xbf})), + newTokenDefault(1, 1, "3ByteChar", []byte{0xe0, 0xa0, 0x80}), + newTokenDefault(1, 1, "3ByteChar", []byte{0xe0, 0xa0, 0x81}), + newTokenDefault(1, 1, "3ByteChar", []byte{0xe0, 0xbf, 0xbe}), + newTokenDefault(1, 1, "3ByteChar", []byte{0xe0, 0xbf, 0xbf}), + newTokenDefault(1, 1, "3ByteChar", []byte{0xe1, 0x80, 0x80}), + newTokenDefault(1, 1, "3ByteChar", []byte{0xe1, 0x80, 0x81}), + newTokenDefault(1, 1, "3ByteChar", []byte{0xec, 0xbf, 0xbe}), + newTokenDefault(1, 1, "3ByteChar", []byte{0xec, 0xbf, 0xbf}), + newTokenDefault(1, 1, "3ByteChar", []byte{0xed, 0x80, 0x80}), + newTokenDefault(1, 1, "3ByteChar", []byte{0xed, 0x80, 0x81}), + newTokenDefault(1, 1, "3ByteChar", []byte{0xed, 0x9f, 0xbe}), + newTokenDefault(1, 1, "3ByteChar", []byte{0xed, 0x9f, 0xbf}), + newTokenDefault(1, 1, "3ByteChar", []byte{0xee, 0x80, 0x80}), + newTokenDefault(1, 1, "3ByteChar", []byte{0xee, 0x80, 0x81}), + newTokenDefault(1, 1, "3ByteChar", []byte{0xef, 0xbf, 0xbe}), + newTokenDefault(1, 1, "3ByteChar", []byte{0xef, 0xbf, 0xbf}), newEOFTokenDefault(), }, }, @@ -371,7 +378,7 @@ func TestLexer_Next(t *testing.T) { 0xf0, 0x90, 0x80, 0x80, }), tokens: []*Token{ - newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})), + newTokenDefault(1, 1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x80}), newEOFTokenDefault(), }, }, @@ -389,10 +396,10 @@ func TestLexer_Next(t *testing.T) { 0xf0, 0x90, 0x80, 0xbf, }), tokens: []*Token{ - newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})), - newTokenDefault(1, 1, 
"4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})), - newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0xbe})), - newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0xbf})), + newTokenDefault(1, 1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x80}), + newTokenDefault(1, 1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x81}), + newTokenDefault(1, 1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0xbe}), + newTokenDefault(1, 1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0xbf}), newEOFTokenDefault(), }, }, @@ -410,10 +417,10 @@ func TestLexer_Next(t *testing.T) { 0xf0, 0x90, 0xbf, 0xbf, }), tokens: []*Token{ - newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})), - newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})), - newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0xbf, 0xbe})), - newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0xbf, 0xbf})), + newTokenDefault(1, 1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x80}), + newTokenDefault(1, 1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x81}), + newTokenDefault(1, 1, "4ByteChar", []byte{0xf0, 0x90, 0xbf, 0xbe}), + newTokenDefault(1, 1, "4ByteChar", []byte{0xf0, 0x90, 0xbf, 0xbf}), newEOFTokenDefault(), }, }, @@ -431,10 +438,10 @@ func TestLexer_Next(t *testing.T) { 0xf0, 0xbf, 0xbf, 0xbf, }), tokens: []*Token{ - newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})), - newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})), - newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbe})), - newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})), + newTokenDefault(1, 1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x80}), + newTokenDefault(1, 1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x81}), + newTokenDefault(1, 1, "4ByteChar", []byte{0xf0, 0xbf, 0xbf, 0xbe}), + newTokenDefault(1, 1, "4ByteChar", []byte{0xf0, 0xbf, 0xbf, 0xbf}), newEOFTokenDefault(), }, }, @@ -460,18 +467,18 @@ func TestLexer_Next(t *testing.T) { 0xf4, 0x8f, 0xbf, 0xbf, }), tokens: []*Token{ - newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x80})), - newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf0, 0x90, 0x80, 0x81})), - newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbe})), - newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf0, 0xbf, 0xbf, 0xbf})), - newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x80})), - newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf1, 0x80, 0x80, 0x81})), - newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbe})), - newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf3, 0xbf, 0xbf, 0xbf})), - newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x80})), - newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf4, 0x80, 0x80, 0x81})), - newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbe})), - newTokenDefault(1, 1, "4ByteChar", newByteSequence([]byte{0xf4, 0x8f, 0xbf, 0xbf})), + newTokenDefault(1, 1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x80}), + newTokenDefault(1, 1, "4ByteChar", []byte{0xf0, 0x90, 0x80, 0x81}), + newTokenDefault(1, 1, "4ByteChar", []byte{0xf0, 0xbf, 0xbf, 0xbe}), + newTokenDefault(1, 1, "4ByteChar", []byte{0xf0, 0xbf, 0xbf, 0xbf}), + newTokenDefault(1, 1, "4ByteChar", []byte{0xf1, 0x80, 0x80, 
0x80}), + newTokenDefault(1, 1, "4ByteChar", []byte{0xf1, 0x80, 0x80, 0x81}), + newTokenDefault(1, 1, "4ByteChar", []byte{0xf3, 0xbf, 0xbf, 0xbe}), + newTokenDefault(1, 1, "4ByteChar", []byte{0xf3, 0xbf, 0xbf, 0xbf}), + newTokenDefault(1, 1, "4ByteChar", []byte{0xf4, 0x80, 0x80, 0x80}), + newTokenDefault(1, 1, "4ByteChar", []byte{0xf4, 0x80, 0x80, 0x81}), + newTokenDefault(1, 1, "4ByteChar", []byte{0xf4, 0x8f, 0xbf, 0xbe}), + newTokenDefault(1, 1, "4ByteChar", []byte{0xf4, 0x8f, 0xbf, 0xbf}), newEOFTokenDefault(), }, }, @@ -483,7 +490,7 @@ func TestLexer_Next(t *testing.T) { }, src: "foo9", tokens: []*Token{ - newTokenDefault(1, 1, "NonNumber", newByteSequence([]byte("foo9"))), + newTokenDefault(1, 1, "NonNumber", []byte("foo9")), newEOFTokenDefault(), }, }, @@ -498,10 +505,10 @@ func TestLexer_Next(t *testing.T) { }, src: "nνに😸", tokens: []*Token{ - newTokenDefault(1, 1, "1ByteChar", newByteSequence([]byte{0x6E})), - newTokenDefault(2, 2, "2ByteChar", newByteSequence([]byte{0xCE, 0xBD})), - newTokenDefault(3, 3, "3ByteChar", newByteSequence([]byte{0xE3, 0x81, 0xAB})), - newTokenDefault(4, 4, "4ByteChar", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})), + newTokenDefault(1, 1, "1ByteChar", []byte{0x6E}), + newTokenDefault(2, 2, "2ByteChar", []byte{0xCE, 0xBD}), + newTokenDefault(3, 3, "3ByteChar", []byte{0xE3, 0x81, 0xAB}), + newTokenDefault(4, 4, "4ByteChar", []byte{0xF0, 0x9F, 0x98, 0xB8}), newEOFTokenDefault(), }, }, @@ -513,10 +520,10 @@ func TestLexer_Next(t *testing.T) { }, src: "nνに😸", tokens: []*Token{ - newTokenDefault(1, 1, "codePointsAlt", newByteSequence([]byte{0x6E})), - newTokenDefault(1, 1, "codePointsAlt", newByteSequence([]byte{0xCE, 0xBD})), - newTokenDefault(1, 1, "codePointsAlt", newByteSequence([]byte{0xE3, 0x81, 0xAB})), - newTokenDefault(1, 1, "codePointsAlt", newByteSequence([]byte{0xF0, 0x9F, 0x98, 0xB8})), + newTokenDefault(1, 1, "codePointsAlt", []byte{0x6E}), + newTokenDefault(1, 1, "codePointsAlt", []byte{0xCE, 0xBD}), + newTokenDefault(1, 1, "codePointsAlt", []byte{0xE3, 0x81, 0xAB}), + newTokenDefault(1, 1, "codePointsAlt", []byte{0xF0, 0x9F, 0x98, 0xB8}), newEOFTokenDefault(), }, }, @@ -530,8 +537,8 @@ func TestLexer_Next(t *testing.T) { }, src: "abcdefdefabcdef", tokens: []*Token{ - newTokenDefault(1, 1, "t1", newByteSequence([]byte("abcdefdef"))), - newTokenDefault(1, 1, "t1", newByteSequence([]byte("abcdef"))), + newTokenDefault(1, 1, "t1", []byte("abcdefdef")), + newTokenDefault(1, 1, "t1", []byte("abcdef")), newEOFTokenDefault(), }, }, @@ -545,7 +552,7 @@ func TestLexer_Next(t *testing.T) { }, src: "abcdefdefabc", tokens: []*Token{ - newTokenDefault(1, 1, "t1", newByteSequence([]byte("abcdefdefabc"))), + newTokenDefault(1, 1, "t1", []byte("abcdefdefabc")), newEOFTokenDefault(), }, }, @@ -560,7 +567,7 @@ func TestLexer_Next(t *testing.T) { }, src: "abcdefdefabc", tokens: []*Token{ - newTokenDefault(1, 1, "t1", newByteSequence([]byte("abcdefdefabc"))), + newTokenDefault(1, 1, "t1", []byte("abcdefdefabc")), newEOFTokenDefault(), }, }, @@ -576,16 +583,16 @@ func TestLexer_Next(t *testing.T) { }, src: `"" "Hello world.\n\"Hello world.\""`, tokens: []*Token{ - newToken(1, "default", 2, 2, "string_open", newByteSequence([]byte(`"`))), - newToken(2, "string", 5, 3, "string_close", newByteSequence([]byte(`"`))), - newToken(1, "default", 1, 1, "white_space", newByteSequence([]byte(` `))), - newToken(1, "default", 2, 2, "string_open", newByteSequence([]byte(`"`))), - newToken(2, "string", 4, 2, "char_sequence", newByteSequence([]byte(`Hello world.`))), - 
newToken(2, "string", 3, 1, "escape_sequence", newByteSequence([]byte(`\n`))), - newToken(2, "string", 3, 1, "escape_sequence", newByteSequence([]byte(`\"`))), - newToken(2, "string", 4, 2, "char_sequence", newByteSequence([]byte(`Hello world.`))), - newToken(2, "string", 3, 1, "escape_sequence", newByteSequence([]byte(`\"`))), - newToken(2, "string", 5, 3, "string_close", newByteSequence([]byte(`"`))), + newToken(1, "default", 2, 2, "string_open", []byte(`"`)), + newToken(2, "string", 5, 3, "string_close", []byte(`"`)), + newToken(1, "default", 1, 1, "white_space", []byte(` `)), + newToken(1, "default", 2, 2, "string_open", []byte(`"`)), + newToken(2, "string", 4, 2, "char_sequence", []byte(`Hello world.`)), + newToken(2, "string", 3, 1, "escape_sequence", []byte(`\n`)), + newToken(2, "string", 3, 1, "escape_sequence", []byte(`\"`)), + newToken(2, "string", 4, 2, "char_sequence", []byte(`Hello world.`)), + newToken(2, "string", 3, 1, "escape_sequence", []byte(`\"`)), + newToken(2, "string", 5, 3, "string_close", []byte(`"`)), newEOFTokenDefault(), }, }, @@ -602,15 +609,15 @@ func TestLexer_Next(t *testing.T) { }, src: ` a b < < `, tokens: []*Token{ - newToken(1, "default", 1, 1, "white_space", newByteSequence([]byte(` `))), - newToken(1, "default", 2, 2, "char_a", newByteSequence([]byte(`a`))), - newToken(2, "state_a", 1, 1, "white_space", newByteSequence([]byte(` `))), - newToken(2, "state_a", 3, 2, "char_b", newByteSequence([]byte(`b`))), - newToken(3, "state_b", 1, 1, "white_space", newByteSequence([]byte(` `))), - newToken(3, "state_b", 5, 2, "back_from_b", newByteSequence([]byte(`<`))), - newToken(2, "state_a", 1, 1, "white_space", newByteSequence([]byte(` `))), - newToken(2, "state_a", 4, 3, "back_from_a", newByteSequence([]byte(`<`))), - newToken(1, "default", 1, 1, "white_space", newByteSequence([]byte(` `))), + newToken(1, "default", 1, 1, "white_space", []byte(` `)), + newToken(1, "default", 2, 2, "char_a", []byte(`a`)), + newToken(2, "state_a", 1, 1, "white_space", []byte(` `)), + newToken(2, "state_a", 3, 2, "char_b", []byte(`b`)), + newToken(3, "state_b", 1, 1, "white_space", []byte(` `)), + newToken(3, "state_b", 5, 2, "back_from_b", []byte(`<`)), + newToken(2, "state_a", 1, 1, "white_space", []byte(` `)), + newToken(2, "state_a", 4, 3, "back_from_a", []byte(`<`)), + newToken(1, "default", 1, 1, "white_space", []byte(` `)), newEOFTokenDefault(), }, }, @@ -627,20 +634,20 @@ func TestLexer_Next(t *testing.T) { }, src: `-> 1 -> 2 <- <- a`, tokens: []*Token{ - newToken(1, "default", 3, 3, "push_1", newByteSequence([]byte(`-> 1`))), - newToken(2, "mode_1", 1, 1, "white_space", newByteSequence([]byte(` `))), - newToken(2, "mode_1", 4, 2, "push_2", newByteSequence([]byte(`-> 2`))), - newToken(3, "mode_2", 1, 1, "white_space", newByteSequence([]byte(` `))), - newToken(3, "mode_2", 6, 2, "pop_2", newByteSequence([]byte(`<-`))), - newToken(2, "mode_1", 1, 1, "white_space", newByteSequence([]byte(` `))), - newToken(2, "mode_1", 5, 3, "pop_1", newByteSequence([]byte(`<-`))), - newToken(1, "default", 1, 1, "white_space", newByteSequence([]byte(` `))), - newToken(1, "default", 2, 2, "char", newByteSequence([]byte(`a`))), + newToken(1, "default", 3, 3, "push_1", []byte(`-> 1`)), + newToken(2, "mode_1", 1, 1, "white_space", []byte(` `)), + newToken(2, "mode_1", 4, 2, "push_2", []byte(`-> 2`)), + newToken(3, "mode_2", 1, 1, "white_space", []byte(` `)), + newToken(3, "mode_2", 6, 2, "pop_2", []byte(`<-`)), + newToken(2, "mode_1", 1, 1, "white_space", []byte(` `)), + newToken(2, "mode_1", 5, 
3, "pop_1", []byte(`<-`)), + newToken(1, "default", 1, 1, "white_space", []byte(` `)), + newToken(1, "default", 2, 2, "char", []byte(`a`)), newEOFTokenDefault(), }, passiveModeTran: true, tran: func(l *Lexer, tok *Token) error { - switch l.clspec.ModeNames[l.Mode()] { + switch l.spec.ModeName(l.Mode()) { case "default": switch tok.KindName { case "push_1": @@ -675,21 +682,21 @@ func TestLexer_Next(t *testing.T) { }, src: `-> 1 -> 2 <- <- a`, tokens: []*Token{ - newToken(1, "default", 3, 3, "push_1", newByteSequence([]byte(`-> 1`))), - newToken(2, "mode_1", 1, 1, "white_space", newByteSequence([]byte(` `))), - newToken(2, "mode_1", 4, 2, "push_2", newByteSequence([]byte(`-> 2`))), - newToken(3, "mode_2", 1, 1, "white_space", newByteSequence([]byte(` `))), - newToken(3, "mode_2", 6, 2, "pop_2", newByteSequence([]byte(`<-`))), - newToken(2, "mode_1", 1, 1, "white_space", newByteSequence([]byte(` `))), - newToken(2, "mode_1", 5, 3, "pop_1", newByteSequence([]byte(`<-`))), - newToken(1, "default", 1, 1, "white_space", newByteSequence([]byte(` `))), - newToken(1, "default", 2, 2, "char", newByteSequence([]byte(`a`))), + newToken(1, "default", 3, 3, "push_1", []byte(`-> 1`)), + newToken(2, "mode_1", 1, 1, "white_space", []byte(` `)), + newToken(2, "mode_1", 4, 2, "push_2", []byte(`-> 2`)), + newToken(3, "mode_2", 1, 1, "white_space", []byte(` `)), + newToken(3, "mode_2", 6, 2, "pop_2", []byte(`<-`)), + newToken(2, "mode_1", 1, 1, "white_space", []byte(` `)), + newToken(2, "mode_1", 5, 3, "pop_1", []byte(`<-`)), + newToken(1, "default", 1, 1, "white_space", []byte(` `)), + newToken(1, "default", 2, 2, "char", []byte(`a`)), newEOFTokenDefault(), }, // Active mode transition and an external transition function can be used together. passiveModeTran: false, tran: func(l *Lexer, tok *Token) error { - switch l.clspec.ModeNames[l.Mode()] { + switch l.spec.ModeName(l.Mode()) { case "mode_1": switch tok.KindName { case "push_2": @@ -717,15 +724,15 @@ func TestLexer_Next(t *testing.T) { }, src: `.*+?|()[\`, tokens: []*Token{ - newTokenDefault(1, 1, "dot", newByteSequence([]byte(`.`))), - newTokenDefault(2, 2, "star", newByteSequence([]byte(`*`))), - newTokenDefault(3, 3, "plus", newByteSequence([]byte(`+`))), - newTokenDefault(4, 4, "question", newByteSequence([]byte(`?`))), - newTokenDefault(5, 5, "vbar", newByteSequence([]byte(`|`))), - newTokenDefault(6, 6, "lparen", newByteSequence([]byte(`(`))), - newTokenDefault(7, 7, "rparen", newByteSequence([]byte(`)`))), - newTokenDefault(8, 8, "lbrace", newByteSequence([]byte(`[`))), - newTokenDefault(9, 9, "backslash", newByteSequence([]byte(`\`))), + newTokenDefault(1, 1, "dot", []byte(`.`)), + newTokenDefault(2, 2, "star", []byte(`*`)), + newTokenDefault(3, 3, "plus", []byte(`+`)), + newTokenDefault(4, 4, "question", []byte(`?`)), + newTokenDefault(5, 5, "vbar", []byte(`|`)), + newTokenDefault(6, 6, "lparen", []byte(`(`)), + newTokenDefault(7, 7, "rparen", []byte(`)`)), + newTokenDefault(8, 8, "lbrace", []byte(`[`)), + newTokenDefault(9, 9, "backslash", []byte(`\`)), newEOFTokenDefault(), }, }, @@ -741,7 +748,7 @@ func TestLexer_Next(t *testing.T) { if tt.passiveModeTran { opts = append(opts, DisableModeTransition()) } - lexer, err := NewLexer(clspec, strings.NewReader(tt.src), opts...) + lexer, err := NewLexer(NewLexSpec(clspec), strings.NewReader(tt.src), opts...) 
if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -752,7 +759,7 @@ func TestLexer_Next(t *testing.T) { break } testToken(t, eTok, tok, false) - // t.Logf("token: ID: %v, Match: %+v Text: \"%v\", EOF: %v, Invalid: %v", tok.ID, tok.Match(), tok.Text(), tok.EOF, tok.Invalid) + if tok.EOF { break } @@ -813,39 +820,39 @@ func TestLexer_Next_WithPosition(t *testing.T) { }) expected := []*Token{ - withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0x00})), 0, 0), - withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0x7F})), 0, 1), - withPos(newTokenDefault(1, 1, "newline", newByteSequence([]byte{0x0A})), 0, 2), - - withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xC2, 0x80})), 1, 0), - withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xDF, 0xBF})), 1, 1), - withPos(newTokenDefault(1, 1, "newline", newByteSequence([]byte{0x0A})), 1, 2), - - withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xE0, 0xA0, 0x80})), 2, 0), - withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xE0, 0xBF, 0xBF})), 2, 1), - withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xE1, 0x80, 0x80})), 2, 2), - withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xEC, 0xBF, 0xBF})), 2, 3), - withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xED, 0x80, 0x80})), 2, 4), - withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xED, 0x9F, 0xBF})), 2, 5), - withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xEE, 0x80, 0x80})), 2, 6), - withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xEF, 0xBF, 0xBF})), 2, 7), - withPos(newTokenDefault(1, 1, "newline", newByteSequence([]byte{0x0A})), 2, 8), - - withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xF0, 0x90, 0x80, 0x80})), 3, 0), - withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xF0, 0xBF, 0xBF, 0xBF})), 3, 1), - withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xF1, 0x80, 0x80, 0x80})), 3, 2), - withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xF3, 0xBF, 0xBF, 0xBF})), 3, 3), - withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xF4, 0x80, 0x80, 0x80})), 3, 4), - withPos(newTokenDefault(2, 2, "any", newByteSequence([]byte{0xF4, 0x8F, 0xBF, 0xBF})), 3, 5), + withPos(newTokenDefault(2, 2, "any", []byte{0x00}), 0, 0), + withPos(newTokenDefault(2, 2, "any", []byte{0x7F}), 0, 1), + withPos(newTokenDefault(1, 1, "newline", []byte{0x0A}), 0, 2), + + withPos(newTokenDefault(2, 2, "any", []byte{0xC2, 0x80}), 1, 0), + withPos(newTokenDefault(2, 2, "any", []byte{0xDF, 0xBF}), 1, 1), + withPos(newTokenDefault(1, 1, "newline", []byte{0x0A}), 1, 2), + + withPos(newTokenDefault(2, 2, "any", []byte{0xE0, 0xA0, 0x80}), 2, 0), + withPos(newTokenDefault(2, 2, "any", []byte{0xE0, 0xBF, 0xBF}), 2, 1), + withPos(newTokenDefault(2, 2, "any", []byte{0xE1, 0x80, 0x80}), 2, 2), + withPos(newTokenDefault(2, 2, "any", []byte{0xEC, 0xBF, 0xBF}), 2, 3), + withPos(newTokenDefault(2, 2, "any", []byte{0xED, 0x80, 0x80}), 2, 4), + withPos(newTokenDefault(2, 2, "any", []byte{0xED, 0x9F, 0xBF}), 2, 5), + withPos(newTokenDefault(2, 2, "any", []byte{0xEE, 0x80, 0x80}), 2, 6), + withPos(newTokenDefault(2, 2, "any", []byte{0xEF, 0xBF, 0xBF}), 2, 7), + withPos(newTokenDefault(1, 1, "newline", []byte{0x0A}), 2, 8), + + withPos(newTokenDefault(2, 2, "any", []byte{0xF0, 0x90, 0x80, 0x80}), 3, 0), + withPos(newTokenDefault(2, 2, "any", []byte{0xF0, 0xBF, 0xBF, 0xBF}), 3, 1), + withPos(newTokenDefault(2, 2, "any", []byte{0xF1, 0x80, 0x80, 0x80}), 3, 2), + 
withPos(newTokenDefault(2, 2, "any", []byte{0xF3, 0xBF, 0xBF, 0xBF}), 3, 3), + withPos(newTokenDefault(2, 2, "any", []byte{0xF4, 0x80, 0x80, 0x80}), 3, 4), + withPos(newTokenDefault(2, 2, "any", []byte{0xF4, 0x8F, 0xBF, 0xBF}), 3, 5), // When a token contains multiple line breaks, the driver sets the token position to // the line number where a lexeme first appears. - withPos(newTokenDefault(1, 1, "newline", newByteSequence([]byte{0x0A, 0x0A, 0x0A})), 3, 6), + withPos(newTokenDefault(1, 1, "newline", []byte{0x0A, 0x0A, 0x0A}), 3, 6), withPos(newEOFTokenDefault(), 0, 0), } - lexer, err := NewLexer(clspec, strings.NewReader(src)) + lexer, err := NewLexer(NewLexSpec(clspec), strings.NewReader(src)) if err != nil { t.Fatalf("unexpected error: %v", err) } @@ -872,15 +879,15 @@ func testToken(t *testing.T, expected, actual *Token, checkPosition bool) { actual.KindID != expected.KindID || actual.ModeKindID != expected.ModeKindID || actual.KindName != expected.KindName || - !bytes.Equal(actual.Match(), expected.Match()) || + !bytes.Equal(actual.Lexeme, expected.Lexeme) || actual.EOF != expected.EOF || actual.Invalid != expected.Invalid { - t.Fatalf(`unexpected token; want: %v ("%v"), got: %v ("%v")`, expected, expected.Text(), actual, actual.Text()) + t.Fatalf(`unexpected token; want: %v ("%#v"), got: %v ("%#v")`, expected, string(expected.Lexeme), actual, string(actual.Lexeme)) } if checkPosition { if actual.Row != expected.Row || actual.Col != expected.Col { - t.Fatalf(`unexpected token; want: %v ("%v"), got: %v ("%v")`, expected, expected.Text(), actual, actual.Text()) + t.Fatalf(`unexpected token; want: %v ("%#v"), got: %v ("%#v")`, expected, string(expected.Lexeme), actual, string(actual.Lexeme)) } } } diff --git a/driver/spec.go b/driver/spec.go new file mode 100644 index 0000000..149d922 --- /dev/null +++ b/driver/spec.go @@ -0,0 +1,73 @@ +package driver + +import ( + "github.com/nihei9/maleeni/spec" +) + +type lexSpec struct { + spec *spec.CompiledLexSpec +} + +func NewLexSpec(spec *spec.CompiledLexSpec) *lexSpec { + return &lexSpec{ + spec: spec, + } +} + +func (s *lexSpec) InitialMode() ModeID { + return ModeID(s.spec.InitialModeID.Int()) +} + +func (s *lexSpec) Pop(mode ModeID, modeKind ModeKindID) bool { + return s.spec.Specs[mode].Pop[modeKind] == 1 +} + +func (s *lexSpec) Push(mode ModeID, modeKind ModeKindID) (ModeID, bool) { + modeID := s.spec.Specs[mode].Push[modeKind] + return ModeID(modeID.Int()), !modeID.IsNil() +} + +func (s *lexSpec) ModeName(mode ModeID) string { + return s.spec.ModeNames[mode].String() +} + +func (s *lexSpec) InitialState(mode ModeID) StateID { + return StateID(s.spec.Specs[mode].DFA.InitialStateID.Int()) +} + +func (s *lexSpec) NextState(mode ModeID, state StateID, v int) (StateID, bool) { + switch s.spec.CompressionLevel { + case 2: + tran := s.spec.Specs[mode].DFA.Transition + rowNum := tran.RowNums[state] + d := tran.UniqueEntries.RowDisplacement[rowNum] + if tran.UniqueEntries.Bounds[d+v] != rowNum { + return StateID(tran.UniqueEntries.EmptyValue.Int()), false + } + return StateID(tran.UniqueEntries.Entries[d+v].Int()), true + case 1: + tran := s.spec.Specs[mode].DFA.Transition + next := tran.UncompressedUniqueEntries[tran.RowNums[state]*tran.OriginalColCount+v] + if next == spec.StateIDNil { + return StateID(spec.StateIDNil.Int()), false + } + return StateID(next.Int()), true + } + + modeSpec := s.spec.Specs[mode] + next := modeSpec.DFA.UncompressedTransition[state.Int()*modeSpec.DFA.ColCount+v] + if next == spec.StateIDNil { + return 
StateID(spec.StateIDNil.Int()), false
+	}
+	return StateID(next.Int()), true
+}
+
+func (s *lexSpec) Accept(mode ModeID, state StateID) (ModeKindID, bool) {
+	modeKindID := s.spec.Specs[mode].DFA.AcceptingStates[state]
+	return ModeKindID(modeKindID.Int()), modeKindID != spec.LexModeKindIDNil
+}
+
+func (s *lexSpec) KindIDAndName(mode ModeID, modeKind ModeKindID) (KindID, string) {
+	kindID := s.spec.KindIDs[mode][modeKind]
+	return KindID(kindID.Int()), s.spec.KindNames[kindID].String()
+}
diff --git a/spec/spec.go b/spec/spec.go
index d4f6346..9ac5f4b 100644
--- a/spec/spec.go
+++ b/spec/spec.go
@@ -24,8 +24,8 @@ func (id LexKindID) Int() int {
 type LexModeKindID int
 
 const (
-	LexModeKindIDNil = LexKindID(0)
-	LexModeKindIDMin = LexKindID(1)
+	LexModeKindIDNil = LexModeKindID(0)
+	LexModeKindIDMin = LexModeKindID(1)
 )
 
 func (id LexModeKindID) Int() int {
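
For callers of the driver, the shape of the new API is: adapt a compiled spec to the `LexSpec` interface with `driver.NewLexSpec`, hand it to `driver.NewLexer`, and read lexemes from the exported `Lexeme` field (the `Match`/`Text` methods are gone). A minimal sketch, assuming a compiled lexical specification saved as `clexspec.json` (the file name, input string, and error handling are illustrative only):

```go
package main

import (
	"encoding/json"
	"fmt"
	"log"
	"os"
	"strings"

	"github.com/nihei9/maleeni/driver"
	"github.com/nihei9/maleeni/spec"
)

func main() {
	// Load a compiled lexical specification, as `maleeni lex` does.
	f, err := os.Open("clexspec.json")
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()
	clspec := &spec.CompiledLexSpec{}
	if err := json.NewDecoder(f).Decode(clspec); err != nil {
		log.Fatal(err)
	}

	// NewLexer now takes the LexSpec interface instead of a
	// *spec.CompiledLexSpec; NewLexSpec adapts the compiled spec.
	lex, err := driver.NewLexer(driver.NewLexSpec(clspec), strings.NewReader("The truth is out there."))
	if err != nil {
		log.Fatal(err)
	}
	for {
		tok, err := lex.Next()
		if err != nil {
			log.Fatal(err)
		}
		if tok.EOF {
			break
		}
		// Lexeme is an exported []byte field on Token after this change.
		fmt.Printf("%v: %q\n", tok.KindName, tok.Lexeme)
	}
}
```

Because the driver now depends only on the `LexSpec` interface, alternative implementations (for example, generated transition tables) could drive the lexer without going through `spec.CompiledLexSpec` at all.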