From c7e2aa160735f39e290450cf11f4ee0a415159d5 Mon Sep 17 00:00:00 2001
From: Frederic Lemoine <fredericlemoine@users.noreply.github.com>
Date: Thu, 14 Sep 2023 14:38:09 +0200
Subject: [PATCH] Updated newick parser to allow ';' in comments

---
 io/newick/newick_lexer.go  | 23 +++++++++++++++++------
 io/newick/newick_parser.go | 18 ++++++++++--------
 io/newick/newick_token.go  | 12 ++++++++++--
 3 files changed, 37 insertions(+), 16 deletions(-)

diff --git a/io/newick/newick_lexer.go b/io/newick/newick_lexer.go
index 0f001e7..82d52d1 100644
--- a/io/newick/newick_lexer.go
+++ b/io/newick/newick_lexer.go
@@ -33,7 +33,9 @@ func (s *Scanner) unread() {
 }
 
 // Scan returns the next token and literal value.
-func (s *Scanner) Scan() (tok Token, lit string) {
+// If ignoreSemiColumn is true, ";" is not returned as the end-of-tree token
+// but is scanned as part of an identifier, e.g. inside comments [...;...].
+func (s *Scanner) Scan(ignoreSemiColumn bool) (tok Token, lit string) {
 	// Read the next rune.
 	ch := s.read()
 
@@ -60,13 +62,15 @@ func (s *Scanner) Scan() (tok Token, lit string) {
 	case ',':
 		return NEWSIBLING, string(ch)
 	case ';':
-		return EOT, string(ch)
+		if !ignoreSemiColumn {
+			return EOT, string(ch)
+		}
 	case ':':
 		return STARTLEN, string(ch)
 	}
 
 	s.unread()
-	return s.scanIdent()
+	return s.scanIdent(ignoreSemiColumn)
 }
 
 // scanWhitespace consumes the current rune and all contiguous whitespace.
@@ -91,8 +95,15 @@ func (s *Scanner) scanWhitespace() (tok Token, lit string) {
 	return WS, buf.String()
 }
 
-// scanIdent consumes the current rune and all contiguous ident runes.
-func (s *Scanner) scanIdent() (tok Token, lit string) {
+// scanIdent consumes the current rune and all contiguous identifier runes.
+// An identifier can be:
+// - a tip, node or branch name
+// - a comment
+// - a branch length
+// - a branch support
+// and contains no newick keyword. If ignoreSemiColumn is true, then ";" is
+// not considered a newick keyword (useful for parsing comments [...;...]).
+func (s *Scanner) scanIdent(ignoreSemiColumn bool) (tok Token, lit string) {
 	// Create a buffer and read the current character into it.
 	var buf bytes.Buffer
 	buf.WriteRune(s.read())
@@ -102,7 +113,7 @@ func (s *Scanner) scanIdent() (tok Token, lit string) {
 	for {
 		if ch := s.read(); ch == eof {
 			break
-		} else if !isIdent(ch) {
+		} else if !isIdent(ch, ignoreSemiColumn) {
 			s.unread()
 			break
 		} else {
diff --git a/io/newick/newick_parser.go b/io/newick/newick_parser.go
index 5c85150..989d479 100644
--- a/io/newick/newick_parser.go
+++ b/io/newick/newick_parser.go
@@ -28,7 +28,9 @@ func NewParser(r io.Reader) *Parser {
 
 // scan returns the next token from the underlying scanner.
 // If a token has been unscanned then read that instead.
-func (p *Parser) scan() (tok Token, lit string) {
+// ignoreSemiColumn is passed to the scanner when a new token is read, so that
+// ";" can be part of an identifier, e.g. inside comments [...;...].
+func (p *Parser) scan(ignoreSemiColumn bool) (tok Token, lit string) {
 	// If we have a token on the buffer, then return it.
 	if p.buf.n != 0 {
 		p.buf.n = 0
@@ -36,7 +38,7 @@ func (p *Parser) scan() (tok Token, lit string) {
 	}
 
 	// Otherwise read the next token from the scanner.
-	tok, lit = p.s.Scan()
+	tok, lit = p.s.Scan(ignoreSemiColumn)
 
 	// Save it to the buffer in case we unscan later.
 	p.buf.tok, p.buf.lit = tok, lit
@@ -49,9 +51,9 @@ func (p *Parser) unscan() { p.buf.n = 1 }
 
 // scanIgnoreWhitespace scans the next non-whitespace token.
 func (p *Parser) scanIgnoreWhitespace() (tok Token, lit string) {
-	tok, lit = p.scan()
+	tok, lit = p.scan(false)
 	if tok == WS {
-		tok, lit = p.scan()
+		tok, lit = p.scan(false)
 	}
 	return
 }
@@ -287,20 +289,20 @@ func (p *Parser) parseIter(t *tree.Tree, level *int) (prevTok Token, err error)
 	}
 }
 
-// Consumes comment inside brakets [comment] if the given current token is a [.
+// Consumes comment inside brackets [comment] if the given current token is a [.
 // At the end returns the matching ] token and lit.
 // If the given token is not a [, then returns an error
 func (p *Parser) consumeComment(curtoken Token, curlit string) (comment string, err error) {
 	if curtoken == OPENBRACK || curtoken == LABEL {
-		commenttoken, commentlit := p.scanIgnoreWhitespace()
+		commenttoken, commentlit := p.scan(true)
 		for (curtoken == LABEL && commenttoken != LABEL) || (curtoken == OPENBRACK && commenttoken != CLOSEBRACK) {
 			if commenttoken == EOF || commenttoken == ILLEGAL {
-				err = fmt.Errorf("unmatched bracket")
+				err = fmt.Errorf("unmatched bracket: %s (%s)", comment, commentlit)
 				return
 			} else {
 				comment += commentlit
 			}
-			commenttoken, commentlit = p.scanIgnoreWhitespace()
+			commenttoken, commentlit = p.scan(true)
 		}
 	} else {
 		err = fmt.Errorf("a comment must start with [")
diff --git a/io/newick/newick_token.go b/io/newick/newick_token.go
index 4cc1ffe..be2dbd5 100644
--- a/io/newick/newick_token.go
+++ b/io/newick/newick_token.go
@@ -24,9 +24,17 @@ func isWhitespace(ch rune) bool {
 	return ch == ' ' || ch == '\t' || ch == '\n' || ch == '\r'
 }
 
-func isIdent(ch rune) bool {
+// isIdent checks whether the given rune is part of an identifier, such as:
+// - a tip, node or branch name
+// - a comment
+// - a branch length
+// - a branch support
+// It returns false if the rune corresponds to a newick keyword.
+// If ignoreSemiColumn is true, then ";" is not considered
+// a newick keyword (useful for parsing comments [...;...]).
+func isIdent(ch rune, ignoreSemiColumn bool) bool {
 	return ch != '[' && ch != ']' &&
 		ch != '(' && ch != ')' &&
 		ch != ',' && ch != ':' &&
-		ch != ';' && ch != '\''
+		(ignoreSemiColumn || ch != ';') && ch != '\''
 }