From c7e2aa160735f39e290450cf11f4ee0a415159d5 Mon Sep 17 00:00:00 2001 From: Frederic Lemoine Date: Thu, 14 Sep 2023 14:38:09 +0200 Subject: [PATCH] Updated newick parser to allow ';' in comments --- io/newick/newick_lexer.go | 23 +++++++++++++++++------ io/newick/newick_parser.go | 18 ++++++++++-------- io/newick/newick_token.go | 12 ++++++++++-- 3 files changed, 37 insertions(+), 16 deletions(-) diff --git a/io/newick/newick_lexer.go b/io/newick/newick_lexer.go index 0f001e7..82d52d1 100644 --- a/io/newick/newick_lexer.go +++ b/io/newick/newick_lexer.go @@ -33,7 +33,9 @@ func (s *Scanner) unread() { } // Scan returns the next token and literal value. -func (s *Scanner) Scan() (tok Token, lit string) { +// ignoreSemiColumn allows parsing identifiers that contain ";" +// such as comments [...;...] +func (s *Scanner) Scan(ignoreSemiColumn bool) (tok Token, lit string) { // Read the next rune. ch := s.read() @@ -60,13 +62,15 @@ func (s *Scanner) Scan() (tok Token, lit string) { case ',': return NEWSIBLING, string(ch) case ';': - return EOT, string(ch) + if !ignoreSemiColumn { + return EOT, string(ch) + } case ':': return STARTLEN, string(ch) } s.unread() - return s.scanIdent() + return s.scanIdent(ignoreSemiColumn) } // scanWhitespace consumes the current rune and all contiguous whitespace. @@ -91,8 +95,15 @@ func (s *Scanner) scanWhitespace() (tok Token, lit string) { return WS, buf.String() } -// scanIdent consumes the current rune and all contiguous ident runes. -func (s *Scanner) scanIdent() (tok Token, lit string) { +// scanIdent consumes the current rune and all contiguous identifier runes. +// An identifier can be: +// - tip, node and branch name +// - comments +// - branch length +// - branch support +// without newick keywords. If ignoreSemiColumn is true, then ";" is not +// considered as a newick keyword. 
(useful for parsing comments [...;...]) +func (s *Scanner) scanIdent(ignoreSemiColumn bool) (tok Token, lit string) { // Create a buffer and read the current character into it. var buf bytes.Buffer buf.WriteRune(s.read()) @@ -102,7 +113,7 @@ func (s *Scanner) scanIdent() (tok Token, lit string) { for { if ch := s.read(); ch == eof { break - } else if !isIdent(ch) { + } else if !isIdent(ch, ignoreSemiColumn) { s.unread() break } else { diff --git a/io/newick/newick_parser.go b/io/newick/newick_parser.go index 5c85150..989d479 100644 --- a/io/newick/newick_parser.go +++ b/io/newick/newick_parser.go @@ -28,7 +28,9 @@ func NewParser(r io.Reader) *Parser { // scan returns the next token from the underlying scanner. // If a token has been unscanned then read that instead. -func (p *Parser) scan() (tok Token, lit string) { +// ignoreSemiColumn allows to parse identifiers that contain ";" +// such as comments [...;...] +func (p *Parser) scan(ignoreSemiColumn bool) (tok Token, lit string) { // If we have a token on the buffer, then return it. if p.buf.n != 0 { p.buf.n = 0 @@ -36,7 +38,7 @@ func (p *Parser) scan() (tok Token, lit string) { } // Otherwise read the next token from the scanner. - tok, lit = p.s.Scan() + tok, lit = p.s.Scan(ignoreSemiColumn) // Save it to the buffer in case we unscan later. p.buf.tok, p.buf.lit = tok, lit @@ -49,9 +51,9 @@ func (p *Parser) unscan() { p.buf.n = 1 } // scanIgnoreWhitespace scans the next non-whitespace token. func (p *Parser) scanIgnoreWhitespace() (tok Token, lit string) { - tok, lit = p.scan() + tok, lit = p.scan(false) if tok == WS { - tok, lit = p.scan() + tok, lit = p.scan(false) } return } @@ -287,20 +289,20 @@ func (p *Parser) parseIter(t *tree.Tree, level *int) (prevTok Token, err error) } } -// Consumes comment inside brakets [comment] if the given current token is a [. +// Consumes comment inside brackets [comment] if the given current token is a [. // At the end returns the matching ] token and lit. 
// If the given token is not a [, then returns an error func (p *Parser) consumeComment(curtoken Token, curlit string) (comment string, err error) { if curtoken == OPENBRACK || curtoken == LABEL { - commenttoken, commentlit := p.scanIgnoreWhitespace() + commenttoken, commentlit := p.scan(true) for (curtoken == LABEL && commenttoken != LABEL) || (curtoken == OPENBRACK && commenttoken != CLOSEBRACK) { if commenttoken == EOF || commenttoken == ILLEGAL { - err = fmt.Errorf("unmatched bracket") + err = fmt.Errorf("unmatched bracket: %s (%s)", comment, commentlit) return } else { comment += commentlit } - commenttoken, commentlit = p.scanIgnoreWhitespace() + commenttoken, commentlit = p.scan(true) } } else { err = fmt.Errorf("a comment must start with [") diff --git a/io/newick/newick_token.go b/io/newick/newick_token.go index 4cc1ffe..be2dbd5 100644 --- a/io/newick/newick_token.go +++ b/io/newick/newick_token.go @@ -24,9 +24,17 @@ func isWhitespace(ch rune) bool { return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r' } -func isIdent(ch rune) bool { +// isIdent checks whether the given rune is part of an identifier, such as: +// - tip, node and branch name +// - comments +// - branch length +// - branch support +// If it corresponds to a newick keyword, then returns false. +// If ignoreSemiColumn is true, then ";" is not considered as +// a newick keyword. (useful for parsing comments [...;...]) +func isIdent(ch rune, ignoreSemiColumn bool) bool { return ch != '[' && ch != ']' && ch != '(' && ch != ')' && ch != ',' && ch != ':' && - ch != ';' && ch != '\'' + (ignoreSemiColumn || ch != ';') && ch != '\'' }