From 4a5c82b105e2160278b800e2ee0212fb4f673065 Mon Sep 17 00:00:00 2001
From: Andreas Zeller <zeller@cispa.de>
Date: Tue, 4 Feb 2025 10:09:05 +0100
Subject: [PATCH 01/28] New: a chapter on regexes

---
 docs/Regexes.md   | 188 ++++++++++++++++++++++++++++++++++++++++++++++
 docs/_toc.yml     |   1 +
 docs/finity.fan   |   1 +
 docs/infinity.fan |   1 +
 4 files changed, 191 insertions(+)
 create mode 100644 docs/Regexes.md
 create mode 100644 docs/finity.fan
 create mode 100644 docs/infinity.fan
diff --git a/docs/Regexes.md b/docs/Regexes.md
new file mode 100644
index 00000000..870a62f4
--- /dev/null
+++ b/docs/Regexes.md
@@ -0,0 +1,188 @@
+---
+jupytext:
+  formats: md:myst
+  text_representation:
+    extension: .md
+    format_name: myst
+kernelspec:
+  display_name: Python 3
+  language: python
+  name: python3
+---
+
+(sec:regexes)=
+# Regular Expressions
+
+Although the Fandango grammars cover a wide range of input language features, there are situations where they may be a bit cumbersome to work with.
+Consider specifying _every digit except for zeros_: this requires you to enumerate all the other digits `1`, `2`, and so on.
+This is why Fandango also supports _regular expressions_, which allow you to use a concise syntax for character ranges, repeated characters and more.
+Specifying all digits from `1` to `9`, for instance, becomes the short regular expression `r'[1-9]'`.
+
+
+## About Regular Expressions
+
+Regular expressions form a language on their own and come with several useful features.
+To get an introduction to the regular expressions Fandango uses, read the Python [Regular Expression HOWTO](https://docs.python.org/3/howto/regex.html) and check out the Python [Regular Expression Syntax](https://docs.python.org/3/library/re.html#regular-expression-syntax) for a complete reference.
+
+In Fandango, regular expressions are used for two purposes:
+
+* When _producing_ inputs, a regular expression is instantiated into a random string that matches the expression.
+* When _parsing_ inputs, a regular expression is used to _parse_ and _match_ inputs.
+
+
+## Writing Regular Expressions
+
+:::{margin}
+For Python aficionados: this is actually a Python "raw string"
+:::
+
+In Fandango, a regular expression comes as a string, prefixed with an `r` character.
+To express that a digit can have the values `0` to `9`, instead of
+
+```
+<digit> ::= "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"
+```
+
+you can write
+
+```
+<digit> ::= r'[0-9]'
+```
+
+which is much more concise.
+
+Likewise, to match a sequence of characters that ends in `;`, you can write
+
+```
+<some_sequence> ::= r'[^;]+;'
+```
+
+Besides indicating a regular expression, the `r` prefix also makes the string a _raw_ string.
+This means that backslashes are treated as _literal characters_.
+The regular expression `\d`, for instance, matches a Unicode digit, which includes `[0-9]`, and also [many other digit characters](https://en.wikipedia.org/wiki/Numerals_in_Unicode).
+To include `\d` in a regular expression, write it _as is_; do not escape the backslash with another backslash (as you would do in a regular string):
+
+:::{margin}
+The expression `r'\\d'` would actually match a backslash, followed by a `d` character.
+:::
+
+```
+<any_digit> ::= r'\d'
+```
+
+:::{warning}
+Be aware of the specific syntax of `r`-strings as it comes to backslashes.
+:::
+
+
+## Fine Points about Regular Expressions
+
+For parsing inputs, Fandango uses the Python [`re`](https://docs.python.org/3/library/re.html) module for matching strings against regular expressions;
+for producing inputs, Fandango uses the Python [`exrex`](https://github.com/asciimoo/exrex) module for generating strings that match regular expressions.
+All the `re` and `exrex` capabilities and limitations thus extend to Fandango.
+
+Most notably, `exrex` imposes a _repetition limit_ of 20 on generated strings that in principle can have arbitrary length; a `+` or `*` operator will not expand to more than 20 repetitions.
+Thus, a grammar [`infinity.fan`](infinity.fan)
+
+```{code-cell}
+:tags: ["remove-input"]
+!cat infinity.fan
+```
+
+that, in principle, could produce arbitrarily long sequences `abcabcabcabc...` will be limited to 20 repetitions at most:
+
+```shell
+$ fandango fuzz -f infinity.fan -n 10
+```
+
+```{code-cell}
+:tags: ["remove-input"]
+!fandango fuzz -f infinity.fan -n 10
+assert _exit_code == 0
+```
+
+To precisely control the number of repetitions, use the regular expression `{m,n}` construct, limiting the number of repetitions from `m` to `n`.
+Let us limit the number of repetitions to the range 1..5:
+
+```{code-cell}
+:tags: ["remove-input"]
+!cat finity.fan
+```
+
+This is what we get:
+
+```shell
+$ fandango fuzz -f finity.fan -n 10
+```
+
+```{code-cell}
+:tags: ["remove-input"]
+!fandango fuzz -f finity.fan -n 10
+assert _exit_code == 0
+```
+
+:::{tip}
+Remember that _grammars_ also have operators `+`, `*`, `?`, and `{N,M}` which apply to the preceding grammar element, and work like their _regular expression_ counterparts.
+Using these, we could also write the above as
+```
+<start> ::= "abc"+
+```
+and
+```
+<start> ::= "abc"{1,5}
+```
+respectively.
+:::
+
+
+## Regular Expressions vs. Grammars
+
+:::{margin}
+If it weren't for some regular expression features such as _backreferences_ (check out what `(?P=name)` does), the context-free grammars would be a strict superset of regular expressions - anything that can be expressed in a regular expression can also be expressed in an equivalent grammar.
+:::
+
+In many cases, a grammar can be replaced by a regular expression and vice versa.
+This raises the question: When should one use a regular expression, and when a grammar?
+Here are some points to help you decide.
+
+* Regular expressions are often more _concise_ (but arguably harder to read) than grammars.
+* If you want to _reference_ individual elements of a string (say, as part of a constraint now or in the future), use a _grammar_.
+* Since their underlying model is simpler, regular expressions are _faster_ to generate, and _much faster_ to [parse](Parsing.md) than grammars.
+* If your underlying language separates lexical and syntactical processing, use
+    - _regular expressions_ for specifying _lexical_ parts such as tokens and fragments;
+    - a _grammar_ for the _syntax_; and
+    - [constraints](Constraints.md) for _semantic_ properties.
+
+
+:::{warning}
+Do not use regular expressions for inputs that are [recursive](Recursive.md).
+Languages like HTML, XML, even e-mail addresses or URLs, are much easier to capture as grammars.
+:::
+
+
+## Regular Expressions as Equivalence Classes
+
+The choice of grammars vs. regular expressions also affects the Fandango generation algorithm.
+Generally speaking, Fandango attempts to cover all alternatives of a grammar.
+If, say, `<digit>` is specified as
+
+```
+<digit> ::= "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9"
+```
+
+then Fandango will attempt to produce every digit at least once, and also try to cover digit _combinations_ up to a certain depth.
+This is useful if you want to specifically test digit processing, or if each of the digits causes a different behavior that needs to be covered.
+
+If, however, you specify `<digit>` as
+
+```
+<digit> ::= r'[0-9]'
+```
+
+then Fandango will treat this as a _single_ alternative (with all expansions considered semantically equivalent), which once expanded into (some) digit will be considered as covered.
+
+:::{tip}
+* If you do want or need to _differentiate_ between individual elements of a set (because they would be treated differently), consider _grammar alternatives_.
+* If you do _not_ want or need to differentiate between individual elements of a set (because they would all be treated the same), consider a _regular expression_.
+:::
+
diff --git a/docs/_toc.yml b/docs/_toc.yml
index 41d9f97b..6a7a1ac8 100644
--- a/docs/_toc.yml
+++ b/docs/_toc.yml
@@ -15,6 +15,7 @@ parts:
     - file: Constraints
     - file: Shell
     - file: Generators
+    - file: Regexes
     - file: Recursive
     - file: Paths
     - file: ISO8601
diff --git a/docs/finity.fan b/docs/finity.fan
new file mode 100644
index 00000000..c2352778
--- /dev/null
+++ b/docs/finity.fan
@@ -0,0 +1 @@
+<start> ::= r"(abc){1,5}"
\ No newline at end of file
diff --git a/docs/infinity.fan b/docs/infinity.fan
new file mode 100644
index 00000000..50632180
--- /dev/null
+++ b/docs/infinity.fan
@@ -0,0 +1 @@
+<start> ::= r"(abc)+"
\ No newline at end of file

From 8c35f707803ab3f208eb106e53631afad91ff2ef Mon Sep 17 00:00:00 2001
From: Andreas Zeller <zeller@cispa.de>
Date: Tue, 4 Feb 2025 10:22:16 +0100
Subject: [PATCH 02/28] Doc fix

---
 docs/Regexes.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/Regexes.md b/docs/Regexes.md
index 870a62f4..24eec758 100644
--- a/docs/Regexes.md
+++ b/docs/Regexes.md
@@ -138,7 +138,8 @@ respectively.
 ## Regular Expressions vs. Grammars
 
 :::{margin}
-If it weren't for some regular expression features such as _backreferences_ (check out what `(?P=name)` does), the context-free grammars would be a strict superset of regular expressions - anything that can be expressed in a regular expression can also be expressed in an equivalent grammar.
+In theory, context-free grammars are a strict _superset_ of regular expressions - any language that can be expressed in a regular expression can also be expressed in an equivalent grammar.
+Practical implementations of regular expressions break this hierarchy by introducing some features such as _backreferences_ (check out what `(?P=name)` does), which cannot be expressed in grammars.
 :::
 
 In many cases, a grammar can be replaced by a regular expression and vice versa.
@@ -151,7 +152,8 @@ Here are some points to help you decide.
 * If your underlying language separates lexical and syntactical processing, use
     - _regular expressions_ for specifying _lexical_ parts such as tokens and fragments;
     - a _grammar_ for the _syntax_; and
-    - [constraints](Constraints.md) for _semantic_ properties.
+    - [constraints](Constraints.md) for any semantic properties.
+* Prefer grammars and constraints over overly complex regular expressions.
 
 
 :::{warning}

From ee20f1a8163f3321e029ced98f0a36961276a283 Mon Sep 17 00:00:00 2001
From: Andreas Zeller <zeller@cispa.de>
Date: Tue, 4 Feb 2025 10:28:15 +0100
Subject: [PATCH 03/28] Fix: bad admonition

---
 docs/Parsing.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/Parsing.md b/docs/Parsing.md
index 3750c1a3..ec4f72ae 100644
--- a/docs/Parsing.md
+++ b/docs/Parsing.md
@@ -93,7 +93,7 @@ assert _exit_code == 0
 
 We see that input and output are identical (as should always be with parsing and unparsing).
 
-:::{info}
+:::{tip}
 As it comes to producing and storing outputs, the `parse` command has the same options as the `fuzz` command.
 :::
 

From 0e148a2bb56a182b3450d00093522aca0f929e8f Mon Sep 17 00:00:00 2001
From: Andreas Zeller <zeller@cispa.de>
Date: Tue, 4 Feb 2025 10:28:47 +0100
Subject: [PATCH 04/28] Doc update

---
 docs/Stdlib.md | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/docs/Stdlib.md b/docs/Stdlib.md
index afa5182f..23f89f9c 100644
--- a/docs/Stdlib.md
+++ b/docs/Stdlib.md
@@ -37,6 +37,15 @@ Symbols starting with an underscore must _not_ be redefined.
 from fandango.language import stdlib
 ```
 
+## Characters
+
+A `<char>` represents any Unicode character, including newline.
+
+```{code-cell}
+:tags: ["remove-input"]
+print(stdlib.any_char)
+```
+
 ## Printable Characters
 
 These symbols mimic the [string constants from the Python `string` module](https://docs.python.org/3/library/string.html).
@@ -89,17 +98,6 @@ print(stdlib.bytes)
 ```
 
 
-## Characters
-
-A `<char>` is any Unicode character.
-
-```{error}
-`<char>` is currently not defined.
-Use `<ascii_char>`, `<byte>`, or `<utf8_char>` instead.
-```
-% We need charset or regex specs for this: <char> ::= /./ | '\n'
-
-
 ## UTF-8 characters
 
 A `<utf8_char>` is a UTF-8 encoding of a character, occupying one (`<utf8_char1>`) to four (`<utf8_char4`) bytes.

From fb246da65cc2403020802b39ae8415cbbc0fb64b Mon Sep 17 00:00:00 2001
From: Andreas Zeller <zeller@cispa.de>
Date: Tue, 4 Feb 2025 15:51:59 +0100
Subject: [PATCH 05/28] New: support binary regexes

---
 docs/Binary.md                   | 31 ++++++++++++++++++++++++++++---
 docs/Regexes.md                  |  8 ++++++++
 docs/binfinity.fan               |  1 +
 src/fandango/language/grammar.py |  5 +++++
 4 files changed, 42 insertions(+), 3 deletions(-)
 create mode 100644 docs/binfinity.fan

diff --git a/docs/Binary.md b/docs/Binary.md
index 13ddbaa1..c15670d6 100644
--- a/docs/Binary.md
+++ b/docs/Binary.md
@@ -164,6 +164,31 @@ The default is `fuzz --file=mode=auto` (default), which will use `binary` or `te
 Avoid mixing non-ASCII strings with bits and bytes in a single grammar.
 :::
 
+(sec:byte-regexes)=
+### Bytes and Regular Expressions
+
+Fandango also supports [regular expressions](Regexes.md) over bytes.
+To obtain a regular expression over a byte string, use both `r` and `b` prefixes.
+This is especially useful for character classes.
+
+Here is an example: [`binfinity.fan`](binfinity.fan) produces strings of five bytes _outside_ the range `\x80-\xff`:
+
+```{code-cell}
+:tags: ["remove-input"]
+!cat binfinity.fan
+```
+
+This is what we get:
+
+```shell
+$ fandango fuzz -f binfinity.fan -n 10
+```
+
+```{code-cell}
+:tags: ["remove-input"]
+!fandango fuzz -f binfinity.fan -n 10
+assert _exit_code == 0
+```
 
 
 ## Length Encodings
@@ -221,12 +246,12 @@ Again, all of this goes into a single `.fan` file: [`binary.fan`](binary.fan) ho
 Let us produce a single output using `binary.fan` and view its (binary) contents, using `od -c`:
 
 ```shell
-$ fandango fuzz -n 1 -f binary.fan | od -c
+$ fandango fuzz -n 1 -f binary.fan -o - | od -c
 ```
 
 ```{code-cell}
 :tags: ["remove-input"]
-! fandango fuzz -n 1 -f binary.fan | od -c
+! fandango fuzz -n 1 -f binary.fan -o - | od -c
 ```
 
 The hexadecimal dump shows that the first two bytes encode the length of the string of digits that follows.
@@ -248,7 +273,7 @@ and obtain the same result:
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz -n 1 -f binary-pack.fan | od -c
+!fandango fuzz -n 1 -f binary-pack.fan -o - | od -c
 assert _exit_code == 0
 ```
 
diff --git a/docs/Regexes.md b/docs/Regexes.md
index 24eec758..24c0023e 100644
--- a/docs/Regexes.md
+++ b/docs/Regexes.md
@@ -81,6 +81,9 @@ For parsing inputs, Fandango uses the Python [`re`](https://docs.python.org/3/li
 for producing inputs, Fandango uses the Python [`exrex`](https://github.com/asciimoo/exrex) module for generating strings that match regular expressions.
 All the `re` and `exrex` capabilities and limitations thus extend to Fandango.
 
+
+### Repetition Limits
+
 Most notably, `exrex` imposes a _repetition limit_ of 20 on generated strings that in principle can have arbitrary length; a `+` or `*` operator will not expand to more than 20 repetitions.
 Thus, a grammar [`infinity.fan`](infinity.fan)
 
@@ -134,6 +137,11 @@ and
 respectively.
 :::
 
+### Regular Expressions over Bytes
+
+Regular expressions can also be formed over bytes.
+See [Bytes and Regular Expressions](sec:byte-regexes) for details.
+
 
 ## Regular Expressions vs. Grammars
 
diff --git a/docs/binfinity.fan b/docs/binfinity.fan
new file mode 100644
index 00000000..132efd0b
--- /dev/null
+++ b/docs/binfinity.fan
@@ -0,0 +1 @@
+<start> ::= rb"[^\x80-\xff]{5}"
\ No newline at end of file
diff --git a/src/fandango/language/grammar.py b/src/fandango/language/grammar.py
index 9ded398d..6d836ec7 100644
--- a/src/fandango/language/grammar.py
+++ b/src/fandango/language/grammar.py
@@ -256,6 +256,11 @@ def __init__(self, symbol: Terminal):
 
     def fuzz(self, grammar: "Grammar", max_nodes: int = 100) -> List[DerivationTree]:
         if self.symbol.is_regex:
+            if isinstance(self.symbol.symbol, bytes):
+                # Exrex can't do bytes, so we decode to str and back
+                instance = exrex.getone(self.symbol.symbol.decode('iso-8859-1'))
+                return [DerivationTree(Terminal(instance.encode('iso-8859-1')))]
+
             instance = exrex.getone(self.symbol.symbol)
             return [DerivationTree(Terminal(instance))]
 

From 3a040885a15dee78065c23458d7f9457dfd985e7 Mon Sep 17 00:00:00 2001
From: Andreas Zeller <zeller@cispa.de>
Date: Wed, 5 Feb 2025 08:41:02 +0100
Subject: [PATCH 06/28] Fix: bad refs

---
 docs/ISO8601.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/ISO8601.md b/docs/ISO8601.md
index 8409c53f..20773a4d 100644
--- a/docs/ISO8601.md
+++ b/docs/ISO8601.md
@@ -370,7 +370,7 @@ Let us write it into a `.fan` file, so we can use it for fuzzing:
 open('ISO8601.fan', 'w').write(iso8601lib);
 ```
 
-Here comes [`iso9601.fan`](iso9601.fan) in all its glory:
+Here comes [`iso8601.fan`](iso8601.fan) in all its glory:
 
 ```{code-cell}
 :tags: ["remove-input"]
@@ -412,4 +412,4 @@ assert _exit_code == 0
 ```
 
 Try out more constraints for yourself!
-The generated [`ISO9601.fan`](ISO9601.fan) file is available for download.
\ No newline at end of file
+The generated [`ISO8601.fan`](ISO8601.fan) file is available for download.
\ No newline at end of file

From f6151a122864760107f7ec3fc33a980292defa0b Mon Sep 17 00:00:00 2001
From: Andreas Zeller <zeller@cispa.de>
Date: Wed, 5 Feb 2025 08:41:22 +0100
Subject: [PATCH 07/28] New: for future citations

---
 docs/fandango.bib | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 docs/fandango.bib

diff --git a/docs/fandango.bib b/docs/fandango.bib
new file mode 100644
index 00000000..e69de29b

From 0846944c54257cafc85b492164fafd997b6c136e Mon Sep 17 00:00:00 2001
From: Andreas Zeller <zeller@cispa.de>
Date: Wed, 5 Feb 2025 08:41:31 +0100
Subject: [PATCH 08/28] Doc update

---
 docs/Regexes.md | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/docs/Regexes.md b/docs/Regexes.md
index 24c0023e..dd1acbe1 100644
--- a/docs/Regexes.md
+++ b/docs/Regexes.md
@@ -81,6 +81,17 @@ For parsing inputs, Fandango uses the Python [`re`](https://docs.python.org/3/li
 for producing inputs, Fandango uses the Python [`exrex`](https://github.com/asciimoo/exrex) module for generating strings that match regular expressions.
 All the `re` and `exrex` capabilities and limitations thus extend to Fandango.
 
+:::{tip}
+For regex shortcuts, the `exrex` producer only produces characters in the range `\x00` to `\xff`:
+
+* for digits (`\d`), the characters `[0-9]`
+* for whitespace (`\s`), the characters `[ \t\n\r\f\v]`
+* for words (`\w`), the characters `[a-zA-Z0-9_]`
+* for non-words (`\W`), the character range `[^a-zA-Z0-9_]`
+
+To produce Unicode characters, make them part of an explicit range (e.g. `[äöüÄÖÜß]`).
+:::
+
 
 ### Repetition Limits
 

From 27278f62aee645a8497f89fad83f323ddcaa1885 Mon Sep 17 00:00:00 2001
From: Andreas Zeller <zeller@cispa.de>
Date: Wed, 5 Feb 2025 08:41:48 +0100
Subject: [PATCH 09/28] Improved diagnostics when checking for failed
 assertions

---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 6da241a7..216dca3a 100644
--- a/Makefile
+++ b/Makefile
@@ -90,7 +90,7 @@ osascript -e 'tell application "Safari" to set URL of document of window 1 to UR
 VIEW_PDF = open $(PDF_TARGET)
 
 # Command to check docs for failed assertions
-CHECK_DOCS = grep -l AssertionError $(DOCS)/_build/html/*.html; if [ $$? == 0 ]; then false; else true; fi
+CHECK_DOCS = grep -l AssertionError $(DOCS)/_build/html/*.html; if [ $$? == 0 ]; then echo 'Check the above files for failed assertions'; false; else true; fi
 
 
 # Targets.

From c062a121b5aae35a51fd94d2f64573dfbc1a584a Mon Sep 17 00:00:00 2001
From: Andreas Zeller <zeller@cispa.de>
Date: Wed, 5 Feb 2025 08:54:49 +0100
Subject: [PATCH 10/28] Fix: be sure to have only bytes

---
 docs/gif89a.fan | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/gif89a.fan b/docs/gif89a.fan
index 26489b14..d9c58720 100644
--- a/docs/gif89a.fan
+++ b/docs/gif89a.fan
@@ -36,7 +36,7 @@ where <GifHeader>..<Version> == "89a"
 <SizeOfLocalColorTable> ::= 0 0 0
 <SizeOfGlobalColorTable> ::= 0 0 0
 
-<GlobalColorTable> ::= <RGB> '\x00' '\x00' '\x00'  
+<GlobalColorTable> ::= <RGB> b'\x00' b'\x00' b'\x00'
 <LocalColorTable> ::= b'\x02' b'\x02' b'L'
 
 <PixelAspectRatio> ::= b'\x00'

From 49fe5c56d5491f4c92441169c22a3e8a55eaf87f Mon Sep 17 00:00:00 2001
From: Andreas Zeller <zeller@cispa.de>
Date: Wed, 5 Feb 2025 08:55:41 +0100
Subject: [PATCH 11/28] Improved diagnostics for mismatched bytes

---
 src/fandango/cli/__init__.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/fandango/cli/__init__.py b/src/fandango/cli/__init__.py
index af611850..2768f135 100644
--- a/src/fandango/cli/__init__.py
+++ b/src/fandango/cli/__init__.py
@@ -791,9 +791,10 @@ def report_syntax_error(
     if position >= len(individual):
         return f"{filename!r}: missing input at end of file"
 
-    mismatch = repr(individual[position])
+    mismatch = individual[position]
     if binary:
-        return f"{filename!r}, position {hex(position)} ({position}): mismatched input 0x{mismatch}"
+        assert isinstance(mismatch, int)
+        return f"{filename!r}, position {hex(position)} ({position}): mismatched input {mismatch.to_bytes()!r}"
 
     line = 1
     column = 1
@@ -803,7 +804,7 @@ def report_syntax_error(
             column = 1
         else:
             column += 1
-    return f"{filename!r}, line {line}, column {column}: mismatched input {mismatch}"
+    return f"{filename!r}, line {line}, column {column}: mismatched input {mismatch!r}"
 
 
 def validate(individual, tree, *, filename="<file>"):

From 6ea232babda9aa64809208a1682720649cd4ee23 Mon Sep 17 00:00:00 2001
From: Andreas Zeller <zeller@cispa.de>
Date: Wed, 5 Feb 2025 08:56:15 +0100
Subject: [PATCH 12/28] Fix: better check of bytes against bytes

---
 src/fandango/language/symbol.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/fandango/language/symbol.py b/src/fandango/language/symbol.py
index 099067b8..206066f2 100644
--- a/src/fandango/language/symbol.py
+++ b/src/fandango/language/symbol.py
@@ -108,15 +108,15 @@ def check(self, word: str | int) -> tuple[bool, int]:
         # LOGGER.debug(f"Checking {self.symbol!r} against {word!r}")
         symbol = self.symbol
 
-        if isinstance(self.symbol, bytes) and isinstance(word, str):
+        if isinstance(symbol, bytes) and isinstance(word, str):
             assert isinstance(symbol, bytes)
             symbol = symbol.decode("iso-8859-1")
-        if isinstance(self.symbol, str) and isinstance(word, bytes):
+        if isinstance(symbol, str) and isinstance(word, bytes):
             assert isinstance(word, bytes)
             word = word.decode("iso-8859-1")
 
-        assert isinstance(symbol, str)
-        assert isinstance(word, str)
+        assert ((isinstance(symbol, str) and isinstance(word, str))
+                or (isinstance(symbol, bytes) and isinstance(word, bytes)))
 
         if self.is_regex:
             match = re.match(symbol, word)

From 7fe4b9b4ab3c87098263b872e795893cb5dc3b4a Mon Sep 17 00:00:00 2001
From: Andreas Zeller <zeller@cispa.de>
Date: Wed, 5 Feb 2025 09:07:29 +0100
Subject: [PATCH 13/28] New: use regexes by default

---
 docs/gif.fan           |  4 ++--
 utils/bt2fan/bt2fan.py | 11 +++++------
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/docs/gif.fan b/docs/gif.fan
index b94e7158..95d0474d 100644
--- a/docs/gif.fan
+++ b/docs/gif.fan
@@ -53,7 +53,7 @@
             <DATASUBBLOCKS> ::= (<DataSubBlock>)* <BlockTerminator_1>
               <DataSubBlock> ::= <DATASUBBLOCK>
                 <DATASUBBLOCK> ::= <Size_1> <Data>
-                  <Size_1> ::= (b'\x01' | b'\x02' | b'\x03' | b'\x04' | b'\x05' | b'\x06' | b'\x07' | b'\x08' | b'\t' | b'\n' | b'\x0b' | b'\x0c' | b'\r' | b'\x0e' | b'\x0f' | b'\x10' | b'\x11' | b'\x12' | b'\x13' | b'\x14' | b'\x15' | b'\x16' | b'\x17' | b'\x18' | b'\x19' | b'\x1a' | b'\x1b' | b'\x1c' | b'\x1d' | b'\x1e' | b'\x1f' | b' ' | b'!' | b'"' | b'#' | b'$' | b'%' | b'&' | b"'" | b'(' | b')' | b'*' | b'+' | b',' | b'-' | b'.' | b'/' | b'0' | b'1' | b'2' | b'3' | b'4' | b'5' | b'6' | b'7' | b'8' | b'9' | b':' | b';' | b'<' | b'=' | b'>' | b'?' | b'@' | b'A' | b'B' | b'C' | b'D' | b'E' | b'F' | b'G' | b'H' | b'I' | b'J' | b'K' | b'L' | b'M' | b'N' | b'O' | b'P' | b'Q' | b'R' | b'S' | b'T' | b'U' | b'V' | b'W' | b'X' | b'Y' | b'Z' | b'[' | b'\\' | b']' | b'^' | b'_' | b'`' | b'a' | b'b' | b'c' | b'd' | b'e' | b'f' | b'g' | b'h' | b'i' | b'j' | b'k' | b'l' | b'm' | b'n' | b'o' | b'p' | b'q' | b'r' | b's' | b't' | b'u' | b'v' | b'w' | b'x' | b'y' | b'z' | b'{' | b'|' | b'}' | b'~' | b'\x7f' | b'\x80' | b'\x81' | b'\x82' | b'\x83' | b'\x84' | b'\x85' | b'\x86' | b'\x87' | b'\x88' | b'\x89' | b'\x8a' | b'\x8b' | b'\x8c' | b'\x8d' | b'\x8e' | b'\x8f' | b'\x90' | b'\x91' | b'\x92' | b'\x93' | b'\x94' | b'\x95' | b'\x96' | b'\x97' | b'\x98' | b'\x99' | b'\x9a' | b'\x9b' | b'\x9c' | b'\x9d' | b'\x9e' | b'\x9f' | b'\xa0' | b'\xa1' | b'\xa2' | b'\xa3' | b'\xa4' | b'\xa5' | b'\xa6' | b'\xa7' | b'\xa8' | b'\xa9' | b'\xaa' | b'\xab' | b'\xac' | b'\xad' | b'\xae' | b'\xaf' | b'\xb0' | b'\xb1' | b'\xb2' | b'\xb3' | b'\xb4' | b'\xb5' | b'\xb6' | b'\xb7' | b'\xb8' | b'\xb9' | b'\xba' | b'\xbb' | b'\xbc' | b'\xbd' | b'\xbe' | b'\xbf' | b'\xc0' | b'\xc1' | b'\xc2' | b'\xc3' | b'\xc4' | b'\xc5' | b'\xc6' | b'\xc7' | b'\xc8' | b'\xc9' | b'\xca' | b'\xcb' | b'\xcc' | b'\xcd' | b'\xce' | b'\xcf' | b'\xd0' | b'\xd1' | b'\xd2' | b'\xd3' | b'\xd4' | b'\xd5' | b'\xd6' | b'\xd7' | b'\xd8' | b'\xd9' | b'\xda' | b'\xdb' | b'\xdc' | b'\xdd' | b'\xde' | b'\xdf' | b'\xe0' | b'\xe1' | 
b'\xe2' | b'\xe3' | b'\xe4' | b'\xe5' | b'\xe6' | b'\xe7' | b'\xe8' | b'\xe9' | b'\xea' | b'\xeb' | b'\xec' | b'\xed' | b'\xee' | b'\xef' | b'\xf0' | b'\xf1' | b'\xf2' | b'\xf3' | b'\xf4' | b'\xf5' | b'\xf6' | b'\xf7' | b'\xf8' | b'\xf9' | b'\xfa' | b'\xfb' | b'\xfc' | b'\xfd' | b'\xfe' | b'\xff')  # not b'\x00'
+                  <Size_1> ::= br'[^\x00]'
                   <Data> ::= <char>*  # len(<Data>) == ord(str(<Size_1>)); see below
               <BlockTerminator_1> ::= b'\x00'
       <GraphicControlExtension> ::= <GRAPHICCONTROLEXTENSION>
@@ -105,7 +105,7 @@
           <ApplicationData> ::= <DATASUBBLOCKS>
       <UndefinedData> ::= <UNDEFINEDDATA>
         <UNDEFINEDDATA> ::= <ExtensionIntroducer_9> <Label> <DataSubBlocks_1>
-          <ExtensionIntroducer_9> ::= (b'\x00' | b'\x01' | b'\x02' | b'\x03' | b'\x04' | b'\x05' | b'\x06' | b'\x07' | b'\x08' | b'\t' | b'\n' | b'\x0b' | b'\x0c' | b'\r' | b'\x0e' | b'\x0f' | b'\x10' | b'\x11' | b'\x12' | b'\x13' | b'\x14' | b'\x15' | b'\x16' | b'\x17' | b'\x18' | b'\x19' | b'\x1a' | b'\x1b' | b'\x1c' | b'\x1d' | b'\x1e' | b'\x1f' | b' ' | b'"' | b'#' | b'$' | b'%' | b'&' | b"'" | b'(' | b')' | b'*' | b'+' | b',' | b'-' | b'.' | b'/' | b'0' | b'1' | b'2' | b'3' | b'4' | b'5' | b'6' | b'7' | b'8' | b'9' | b':' | b';' | b'<' | b'=' | b'>' | b'?' | b'@' | b'A' | b'B' | b'C' | b'D' | b'E' | b'F' | b'G' | b'H' | b'I' | b'J' | b'K' | b'L' | b'M' | b'N' | b'O' | b'P' | b'Q' | b'R' | b'S' | b'T' | b'U' | b'V' | b'W' | b'X' | b'Y' | b'Z' | b'[' | b'\\' | b']' | b'^' | b'_' | b'`' | b'a' | b'b' | b'c' | b'd' | b'e' | b'f' | b'g' | b'h' | b'i' | b'j' | b'k' | b'l' | b'm' | b'n' | b'o' | b'p' | b'q' | b'r' | b's' | b't' | b'u' | b'v' | b'w' | b'x' | b'y' | b'z' | b'{' | b'|' | b'}' | b'~' | b'\x7f' | b'\x80' | b'\x81' | b'\x82' | b'\x83' | b'\x84' | b'\x85' | b'\x86' | b'\x87' | b'\x88' | b'\x89' | b'\x8a' | b'\x8b' | b'\x8c' | b'\x8d' | b'\x8e' | b'\x8f' | b'\x90' | b'\x91' | b'\x92' | b'\x93' | b'\x94' | b'\x95' | b'\x96' | b'\x97' | b'\x98' | b'\x99' | b'\x9a' | b'\x9b' | b'\x9c' | b'\x9d' | b'\x9e' | b'\x9f' | b'\xa0' | b'\xa1' | b'\xa2' | b'\xa3' | b'\xa4' | b'\xa5' | b'\xa6' | b'\xa7' | b'\xa8' | b'\xa9' | b'\xaa' | b'\xab' | b'\xac' | b'\xad' | b'\xae' | b'\xaf' | b'\xb0' | b'\xb1' | b'\xb2' | b'\xb3' | b'\xb4' | b'\xb5' | b'\xb6' | b'\xb7' | b'\xb8' | b'\xb9' | b'\xba' | b'\xbb' | b'\xbc' | b'\xbd' | b'\xbe' | b'\xbf' | b'\xc0' | b'\xc1' | b'\xc2' | b'\xc3' | b'\xc4' | b'\xc5' | b'\xc6' | b'\xc7' | b'\xc8' | b'\xc9' | b'\xca' | b'\xcb' | b'\xcc' | b'\xcd' | b'\xce' | b'\xcf' | b'\xd0' | b'\xd1' | b'\xd2' | b'\xd3' | b'\xd4' | b'\xd5' | b'\xd6' | b'\xd7' | b'\xd8' | b'\xd9' | b'\xda' | b'\xdb' | b'\xdc' | b'\xdd' | b'\xde' | b'\xdf' | b'\xe0' | 
b'\xe1' | b'\xe2' | b'\xe3' | b'\xe4' | b'\xe5' | b'\xe6' | b'\xe7' | b'\xe8' | b'\xe9' | b'\xea' | b'\xeb' | b'\xec' | b'\xed' | b'\xee' | b'\xef' | b'\xf0' | b'\xf1' | b'\xf2' | b'\xf3' | b'\xf4' | b'\xf5' | b'\xf6' | b'\xf7' | b'\xf8' | b'\xf9' | b'\xfa' | b'\xfb' | b'\xfc' | b'\xfd' | b'\xfe' | b'\xff')  # not b'!'
+          <ExtensionIntroducer_9> ::= br'[^!]'
           <Label> ::= <UBYTE>
           <DataSubBlocks_1> ::= <DATASUBBLOCKS>
   <Trailer> ::= <TRAILER>
diff --git a/utils/bt2fan/bt2fan.py b/utils/bt2fan/bt2fan.py
index 452d3847..99a8e9ae 100755
--- a/utils/bt2fan/bt2fan.py
+++ b/utils/bt2fan/bt2fan.py
@@ -62,8 +62,7 @@ def char(self, c):
 
     def not_char(self, c):
         if self.use_regexes:
-            # Fandango does not support regexes yet
-            return f"r'[^{self._char(c)}]'"
+            return f"br'[^{self._char(c)}]'"
         chars = []
         for i in range(256):
             if chr(i) != c:
@@ -485,11 +484,11 @@ def visit_If(self, node: c_ast.If) -> str:
         description="Convert a binary template to a Fandango specification"
     )
     parser.add_argument(
-        "--regexes",
-        action="store_true",
+        "--no-regexes",
+        action="store_false",
         dest="use_regexes",
-        default=False,
-        help="use regexes",
+        default=True,
+        help="do not use regexes",
     )
     parser.add_argument(
         "--endianness", choices=["little", "big"], help="set endianness"

From aaa8a570496d48c0154d6749c229e58486f626ed Mon Sep 17 00:00:00 2001
From: Andreas Zeller <zeller@cispa.de>
Date: Wed, 5 Feb 2025 09:13:49 +0100
Subject: [PATCH 14/28] New: redefined `<byte>` as regexp, dramatically
 speeding up parsing

---
 src/fandango/language/stdlib.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fandango/language/stdlib.py b/src/fandango/language/stdlib.py
index bf5ad1ae..1a70efda 100755
--- a/src/fandango/language/stdlib.py
+++ b/src/fandango/language/stdlib.py
@@ -137,7 +137,7 @@ def make_comment(comment: str) -> str:
 stdlib += bits
 
 stdlib += make_header("Bytes")
-bytes = make_def("byte", "".join(chr(c) for c in range(0, 256)), force_binary=True)
+bytes = make_rule("byte", [r"rb'[\x00-\xff]'"])
 stdlib += bytes
 
 

From 3c97cb744d6d8b02ef8a32fa1cc5babfce95fa20 Mon Sep 17 00:00:00 2001
From: Andreas Zeller <zeller@cispa.de>
Date: Wed, 5 Feb 2025 09:19:59 +0100
Subject: [PATCH 15/28] Improved diagnostics when printing parser states

---
 src/fandango/language/grammar.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/fandango/language/grammar.py b/src/fandango/language/grammar.py
index 6d836ec7..409cfa59 100644
--- a/src/fandango/language/grammar.py
+++ b/src/fandango/language/grammar.py
@@ -487,7 +487,7 @@ def __repr__(self):
             f"({self.nonterminal} -> "
             + "".join(
                 [
-                    f"{'•' if i == self._dot else ''}{s.symbol}"
+                    f"{'•' if i == self._dot else ''}{s!r}"
                     for i, s in enumerate(self.symbols)
                 ]
             )
@@ -853,6 +853,7 @@ def _parse_forest(
                                     # LOGGER.warning(f"Position {hex(w)} ({w}): Parsing a byte while expecting bit {bit_count}. Check if bits come in multiples of eight")
                                     bit_count = -1
 
+                                # LOGGER.debug(f"Checking byte(s) {state} at position {hex(w)} ({w}) {word[w:]!r}")
                                 match, match_length = \
                                     self.scan_bytes(state, word, table, k, w)
                                 if match:

From d744d0edef95ab9ac04b2efeece07a1b67e9c264 Mon Sep 17 00:00:00 2001
From: Andreas Zeller <zeller@cispa.de>
Date: Wed, 5 Feb 2025 09:58:05 +0100
Subject: [PATCH 16/28] Doc update

---
 docs/Regexes.md | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/docs/Regexes.md b/docs/Regexes.md
index dd1acbe1..6d9b17ce 100644
--- a/docs/Regexes.md
+++ b/docs/Regexes.md
@@ -74,6 +74,21 @@ The expression `r'\\d'` would actually match a backslash, followed by a `d` char
 Be aware of the specific syntax of `r`-strings as it comes to backslashes.
 :::
 
+One consequence of backslashes being interpreted literally is that you cannot escape quote characters in a regular expression.
+This causes a problem if you need two kinds of quotes (`"` and `'`) in the same regular expression – say, a rule that checks for forbidden characters.
+
+However, escapes of the form `\xNN` (where `NN` is a two-digit hexadecimal character code) are interpreted by the regular expression engine itself.
+Hence, if you need quotes, you can use
+
+* `\x22` instead of `"`
+* `\x27` instead of `'`
+
+Here is an example:
+
+```
+<forbidden_characters> ::= r'[\x22\x27;]'
+```
+
 
 ## Fine Points about Regular Expressions
 

From 3d480c7351c1badb4afadd6056b1cb4981863f8e Mon Sep 17 00:00:00 2001
From: Andreas Zeller <zeller@cispa.de>
Date: Wed, 5 Feb 2025 10:29:13 +0100
Subject: [PATCH 17/28] Fix: `fuzz` command failed validation

---
 src/fandango/cli/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/fandango/cli/__init__.py b/src/fandango/cli/__init__.py
index 2768f135..a296ec99 100644
--- a/src/fandango/cli/__init__.py
+++ b/src/fandango/cli/__init__.py
@@ -952,7 +952,7 @@ def fuzz_command(args):
             try:
                 with open_file(generated_file, file_mode, mode="r") as fd:
                     tree = parse_file(fd, args, grammar, constraints, settings)
-                    validate(individual, tree, filename=fd.n)
+                    validate(individual, tree, filename=fd.name)
 
             except Exception as e:
                 print_exception(e)

From 2278cfe9624c9ca8751f8da4db0132a897bd6e48 Mon Sep 17 00:00:00 2001
From: Andreas Zeller <zeller@cispa.de>
Date: Wed, 5 Feb 2025 10:30:50 +0100
Subject: [PATCH 18/28] New: also use regexes for printable, punctuation, ascii
 and utf8 chars

---
 src/fandango/language/stdlib.py | 45 ++++++++++++++-------------------
 1 file changed, 19 insertions(+), 26 deletions(-)

diff --git a/src/fandango/language/stdlib.py b/src/fandango/language/stdlib.py
index 1a70efda..e9cf01fa 100755
--- a/src/fandango/language/stdlib.py
+++ b/src/fandango/language/stdlib.py
@@ -50,8 +50,12 @@ def make_comment(comment: str) -> str:
 
 
 stdlib += make_header("Printable characters")
-printable = make_def("printable", string.printable)
-printable += make_def("whitespace", string.whitespace)
+# printable = make_def("printable", string.printable)
+printable = make_rule("printable",  # regex needs `\\` for a literal backslash; a bare `\]` only escapes `]`
+                       ["r'[0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!\\x22#$%&\\x27()*+,-./:;<=>?@[\\\\\\]^_`{|}~ \\t\\n\\r\\x0b\\x0c]'"])
+
+# printable += make_def("whitespace", string.whitespace)
+printable += make_rule("whitespace", [r"r'[ \t\n\r\x0b\x0c]'"])
 
 # printable += make_def("digit", string.digits)
 printable += make_rule("digit", ["r'[0-9]'"])
@@ -71,7 +75,9 @@ def make_comment(comment: str) -> str:
 # printable += make_def("ascii_uppercase_letter", string.ascii_uppercase)
 printable += make_rule("ascii_uppercase_letter", ["r'[A-Z]'"])
 
-printable += make_def("punctuation", string.punctuation)
+# printable += make_def("punctuation", string.punctuation)
+printable += make_rule("punctuation",  # regex needs `\\` for a literal backslash; a bare `\]` only escapes `]`
+                      ["r'[!\\x22#$%&\\x27()*+,-./:;<=>?@[\\\\\\]^_`{|}~]'"])
 
 # printable += make_def("alphanum", string.ascii_letters + string.digits)
 printable += make_rule("alphanum", ["r'[a-zA-Z0-9]'"])
@@ -82,9 +88,7 @@ def make_comment(comment: str) -> str:
 stdlib += printable
 
 stdlib += make_header("ASCII characters")
-ascii_char = make_def(
-    "ascii_char", "".join(chr(c) for c in range(0, 128)), force_binary=False
-)
+ascii_char = make_rule("ascii_char", ["rb'[\\x00-\\x7f]'"])
 stdlib += ascii_char
 
 stdlib += make_header("ASCII control characters")
@@ -144,26 +148,15 @@ def make_comment(comment: str) -> str:
 stdlib += make_header("UTF-8 characters, read and processed as bytes")
 
 
-def make_utf8_rule(symbol: str, chars: list[int], suffix: str = "") -> str:
-    return make_rule(
-        symbol, ["(" + " | ".join(f"b'\\x{c:02x}'" for c in chars) + ")" + suffix]
-    )
-
 
-utf8 = make_def(
-    "utf8_char1", "".join(chr(c) for c in range(0, 128)), force_binary=False
-)
-utf8 += make_def(
-    "utf8_continuation_byte",
-    "".join(chr(c) for c in range(0x80, 0xC0)),
-    force_binary=True,
-)
-utf8 += make_utf8_rule("utf8_char2", range(0xC2, 0xE0), " <utf8_continuation_byte>")
-utf8 += make_utf8_rule("utf8_char3", range(0xE0, 0xF0), " <utf8_continuation_byte>{2}")
-utf8 += make_utf8_rule("utf8_char4", range(0xF0, 0xF6), " <utf8_continuation_byte>{3}")
-utf8 += make_rule(
-    "utf8_char", ["<utf8_char1>", "<utf8_char2>", "<utf8_char3>", "<utf8_char4>"]
-)  # UTF-8 character
+utf8 = make_rule("utf8_char1", [r"rb'[\x00-\x7f]'"])
+utf8 += make_rule("utf8_continuation_byte", [r"rb'[\x80-\xbf]'"])
+utf8 += make_rule("utf8_char2", [r"rb'[\xc2-\xdf]' <utf8_continuation_byte>"])
+utf8 += make_rule("utf8_char3", [r"rb'[\xe0-\xef]' <utf8_continuation_byte>{2}"])
+utf8 += make_rule("utf8_char4", [r"rb'[\xf0-\xf5]' <utf8_continuation_byte>{3}"])
+utf8 += make_rule("utf8_char",
+                  ["<utf8_char1>", "<utf8_char2>",
+                   "<utf8_char3>", "<utf8_char4>"])  # UTF-8 character
 stdlib += utf8
 
 
@@ -185,7 +178,7 @@ def make_utf8_rule(symbol: str, chars: list[int], suffix: str = "") -> str:
 # stdlib += make_rule("uuid", ["<hexdigit>{8} '-' <hexdigit>{4} '-' <hexdigit>{4}'-' <hexdigit>{4} '-' <hexdigit>{12}"])
 
 stdlib += make_header("Fandango dancer")
-stdlib += make_comment("We use this to test UTF-8 compatibility")
+stdlib += make_comment("We use this to test Unicode compatibility")
 dancer = make_rule("fandango_dancer", ["'💃'"])
 stdlib += dancer
 

From dc556cbf2ca6867dd2b289af7932245034fc6983 Mon Sep 17 00:00:00 2001
From: Andreas Zeller <zeller@cispa.de>
Date: Wed, 5 Feb 2025 13:44:18 +0100
Subject: [PATCH 19/28] Always report file positions with four hex digits

---
 src/fandango/cli/__init__.py     |  2 +-
 src/fandango/language/grammar.py | 14 +++++++-------
 src/fandango/language/tree.py    |  2 +-
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/fandango/cli/__init__.py b/src/fandango/cli/__init__.py
index a296ec99..6398c94a 100644
--- a/src/fandango/cli/__init__.py
+++ b/src/fandango/cli/__init__.py
@@ -794,7 +794,7 @@ def report_syntax_error(
     mismatch = individual[position]
     if binary:
         assert isinstance(mismatch, int)
-        return f"{filename!r}, position {hex(position)} ({position}): mismatched input {mismatch.to_bytes()!r}"
+        return f"{filename!r}, position {position:#06x} ({position}): mismatched input {mismatch.to_bytes()!r}"
 
     line = 1
     column = 1
diff --git a/src/fandango/language/grammar.py b/src/fandango/language/grammar.py
index 409cfa59..d64d7f9b 100644
--- a/src/fandango/language/grammar.py
+++ b/src/fandango/language/grammar.py
@@ -718,10 +718,10 @@ def scan_bytes(
             match, match_length = state.dot.check(word[w:])
             if match:
                 # Found a match
-                # LOGGER.debug(f"Matched {state.dot!r} at position {hex(w)} ({w}) (len = {match_length}) {word[w:w+match_length]!r}")
+                # LOGGER.debug(f"Matched {state.dot!r} at position {w:#06x} ({w}) (len = {match_length}) {word[w:w+match_length]!r}")
                 next_state = state.next()
                 next_state.children.append(
-                    DerivationTree(Terminal(word[w : w + match_length]))
+                    DerivationTree(Terminal(word[w:w+match_length]))
                 )
                 table[k + match_length].add(next_state)
                 self._max_position = max(self._max_position, w)
@@ -826,7 +826,7 @@ def _parse_forest(
                     elif not state.is_incomplete:
                         if state.next_symbol_is_nonterminal():
                             self.predict(state, table, k)
-                            # LOGGER.debug(f"Predicted {state} at position {hex(w)} ({w}) {word[w:]!r}")
+                            # LOGGER.debug(f"Predicted {state} at position {w:#06x} ({w}) {word[w:]!r}")
                         else:
                             if isinstance(state.dot.symbol, int):
                                 # Scan a bit
@@ -836,7 +836,7 @@ def _parse_forest(
                                     state, word, table, k, w, bit_count
                                 )
                                 if match:
-                                    # LOGGER.debug(f"Scanned bit {state} at position {hex(w)} ({w}) {word[w:]!r}")
+                                    # LOGGER.debug(f"Scanned bit {state} at position {w:#06x} ({w}) {word[w:]!r}")
                                     scanned = 1
                             else:
                                 # Scan a byte
@@ -850,14 +850,14 @@ def _parse_forest(
                                     #
                                     # In either case, we need to skip back
                                     # to scanning bytes here.
-                                    # LOGGER.warning(f"Position {hex(w)} ({w}): Parsing a byte while expecting bit {bit_count}. Check if bits come in multiples of eight")
+                                    # LOGGER.warning(f"Position {w:#06x} ({w}): Parsing a byte while expecting bit {bit_count}. Check if bits come in multiples of eight")
                                     bit_count = -1
 
-                                # LOGGER.debug(f"Checking byte(s) {state} at position {hex(w)} ({w}) {word[w:]!r}")
+                                # LOGGER.debug(f"Checking byte(s) {state} at position {w:#06x} ({w}) {word[w:]!r}")
                                 match, match_length = \
                                     self.scan_bytes(state, word, table, k, w)
                                 if match:
-                                    # LOGGER.debug(f"Scanned {match_length} byte(s) {state} at position {hex(w)} ({w}) {word[w:]!r}")
+                                    # LOGGER.debug(f"Scanned {match_length} byte(s) {state} at position {w:#06x} ({w}) {word[w:]!r}")
                                     scanned = max(scanned, match_length)
 
                 if scanned > 0:
diff --git a/src/fandango/language/tree.py b/src/fandango/language/tree.py
index fe6ed851..220bef5a 100644
--- a/src/fandango/language/tree.py
+++ b/src/fandango/language/tree.py
@@ -263,7 +263,7 @@ def _to_grammar(node, indent=0, start_indent=0) -> str:
             s = "  " * start_indent + f"{node.symbol.symbol} ::="
             have_nonterminal = False
 
-            position = f"  # Position {hex(byte_count)} ({byte_count})"
+            position = f"  # Position {byte_count:#06x} ({byte_count})"
             max_bit_count = bit_count - 1
 
             for child in node._children:

From 1b9d5fefcc2af610c2d5ec4b5fc5d829a1b1d2e1 Mon Sep 17 00:00:00 2001
From: Andreas Zeller <zeller@cispa.de>
Date: Wed, 5 Feb 2025 13:55:57 +0100
Subject: [PATCH 20/28] Fix: enable parsing GIF again (but still no validation)

---
 src/fandango/language/grammar.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/fandango/language/grammar.py b/src/fandango/language/grammar.py
index d64d7f9b..02b5bebf 100644
--- a/src/fandango/language/grammar.py
+++ b/src/fandango/language/grammar.py
@@ -841,6 +841,8 @@ def _parse_forest(
                             else:
                                 # Scan a byte
                                 if 0 <= bit_count <= 7:
+                                    # LOGGER.warning(f"Position {w:#06x} ({w}): Parsing a byte while expecting bit {bit_count}. Check if bits come in multiples of eight")
+
                                     # We are still expecting bits here:
                                     #
                                     # * we may have _peeked_ at a bit,
@@ -850,8 +852,8 @@ def _parse_forest(
                                     #
                                     # In either case, we need to skip back
                                     # to scanning bytes here.
-                                    # LOGGER.warning(f"Position {w:#06x} ({w}): Parsing a byte while expecting bit {bit_count}. Check if bits come in multiples of eight")
                                     bit_count = -1
+                                    scanned = 1
 
                                 # LOGGER.debug(f"Checking byte(s) {state} at position {w:#06x} ({w}) {word[w:]!r}")
                                 match, match_length = \
@@ -861,6 +863,7 @@ def _parse_forest(
                                     scanned = max(scanned, match_length)
 
                 if scanned > 0:
+                    # LOGGER.debug(f"Scanned {scanned} byte(s) at position {w:#06x} ({w}); bit_count = {bit_count}")
                     if bit_count >= 0:
                         # Advance by one bit
                         bit_count -= 1

From c0cbb2a34ccb8a4d3a40793d7779762841c4d9b7 Mon Sep 17 00:00:00 2001
From: Andreas Zeller <zeller@cispa.de>
Date: Wed, 5 Feb 2025 15:34:00 +0100
Subject: [PATCH 21/28] Fix: parsing bits yielded random results

---
 src/fandango/language/grammar.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/fandango/language/grammar.py b/src/fandango/language/grammar.py
index 5f973cae..1c9554d1 100644
--- a/src/fandango/language/grammar.py
+++ b/src/fandango/language/grammar.py
@@ -682,9 +682,11 @@ def scan_bit(
                 return False
 
             # Found a match
+            # LOGGER.debug(f"Found bit {bit}")
             next_state = state.next()
-            next_state.children.append(DerivationTree(state.dot))
-
+            tree = DerivationTree(Terminal(bit))
+            next_state.children.append(tree)
+            # LOGGER.debug(f"Added tree {tree.to_string()!r} to state {next_state!r}")
             # Insert a new table entry with next state
             # This is necessary, as our initial table holds one entry
             # per input byte, yet needs to be expanded to hold the bits, too.

From 9071a075d4b3c15717b62b4ecb34678c1bdd2450 Mon Sep 17 00:00:00 2001
From: Andreas Zeller <zeller@cispa.de>
Date: Wed, 5 Feb 2025 16:20:11 +0100
Subject: [PATCH 22/28] Added `--validate` flag for better consistency checking

---
 docs/Binary.md      |  6 +++---
 docs/Bits.md        | 10 +++++-----
 docs/Constraints.md | 10 +++++-----
 docs/Fuzzing.md     |  2 +-
 docs/Generators.md  | 16 ++++++++--------
 docs/Gif.md         |  3 ++-
 docs/ISO8601.md     | 12 ++++++------
 docs/Invoking.md    |  2 +-
 docs/Paths.md       | 16 ++++++++--------
 docs/Recursive.md   |  8 ++++----
 docs/Regexes.md     |  4 ++--
 docs/Strategies.md  |  6 +++---
 docs/empty.fan      |  1 +
 docs/iso8601.fan    |  1 -
 14 files changed, 49 insertions(+), 48 deletions(-)
 create mode 100644 docs/empty.fan

diff --git a/docs/Binary.md b/docs/Binary.md
index c15670d6..e923386c 100644
--- a/docs/Binary.md
+++ b/docs/Binary.md
@@ -83,7 +83,7 @@ $ fandango fuzz -f credit_card.fan -n 10
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz -f credit_card.fan -n 10
+!fandango fuzz -f credit_card.fan -n 10 --validate
 assert _exit_code == 0
 ```
 
@@ -186,7 +186,7 @@ $ fandango fuzz -f binfinity.fan -n 10
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz -f binfinity.fan -n 10
+!fandango fuzz -f binfinity.fan -n 10 --validate
 assert _exit_code == 0
 ```
 
@@ -273,7 +273,7 @@ and obtain the same result:
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz -n 1 -f binary-pack.fan -o - | od -c
+!fandango fuzz -n 1 -f binary-pack.fan -o - --validate | od -c
 assert _exit_code == 0
 ```
 
diff --git a/docs/Bits.md b/docs/Bits.md
index 23ee7a4e..2126c18f 100644
--- a/docs/Bits.md
+++ b/docs/Bits.md
@@ -54,7 +54,7 @@ $ fandango fuzz --format=bits -f bits.fan -n 1 --start-symbol='<format_flag>'
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz --format=bits -f bits.fan -n 1 --start-symbol='<format_flag>'
+!fandango fuzz --format=bits -f bits.fan -n 1 --start-symbol='<format_flag>' --validate
 assert _exit_code == 0
 ```
 
@@ -71,7 +71,7 @@ $ fandango fuzz --format=bits -f bits.fan -n 10 -c '<italic> == "\x01" and <bold
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz --format=bits -f bits.fan -n 10 -c '<italic> == "\x01" and <bold> == "\x00"'
+!fandango fuzz --format=bits -f bits.fan -n 10 -c '<italic> == "\x01" and <bold> == "\x00"' --validate
 assert _exit_code == 0
 ```
 
@@ -83,7 +83,7 @@ $ fandango fuzz --format=bits -f bits.fan -n 1 -c '<italic> == chr(1) and <bold>
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz --format=bits -f bits.fan -n 1 -c '<italic> == chr(1) and <bold> == chr(0)'
+!fandango fuzz --format=bits -f bits.fan -n 1 -c '<italic> == chr(1) and <bold> == chr(0)' --validate
 assert _exit_code == 0
 ```
 
@@ -95,7 +95,7 @@ $ fandango fuzz --format=bits -f bits.fan -n 1 -c '<format_flag> == chr(0b111100
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz --format=bits -f bits.fan -n 1 -c '<format_flag> == chr(0b11110000)'
+!fandango fuzz --format=bits -f bits.fan -n 1 -c '<format_flag> == chr(0b11110000)' --validate
 assert _exit_code == 0
 ```
 
@@ -115,7 +115,7 @@ $ fandango fuzz --format=bits -f bits.fan -n 1 -c 'ord(str(<brightness>)) > 10'
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz --format=bits -f bits.fan -n 10 -c 'ord(str(<brightness>)) > 10'
+!fandango fuzz --format=bits -f bits.fan -n 10 -c 'ord(str(<brightness>)) > 10' --validate
 assert _exit_code == 0
 ```
 
diff --git a/docs/Constraints.md b/docs/Constraints.md
index 63b078e6..e99e16a0 100644
--- a/docs/Constraints.md
+++ b/docs/Constraints.md
@@ -69,7 +69,7 @@ $ fandango fuzz -f persons.fan -n 10
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz -f persons.fan -n 10 -c 'int(<age>) < 50'
+!fandango fuzz -f persons.fan -n 10 -c 'int(<age>) < 50' --validate
 assert _exit_code == 0
 ```
 
@@ -92,7 +92,7 @@ and we obtain these inputs:
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz -f persons.fan -n 10 -c '25 <= int(<age>) and int(<age>) <= 45'
+!fandango fuzz -f persons.fan -n 10 -c '25 <= int(<age>) and int(<age>) <= 45' --validate
 assert _exit_code == 0
 ```
 
@@ -100,7 +100,7 @@ Start with [`persons.fan`](persons.fan) and add a constraint such that we genera
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz -f persons.fan -n 10 -c 'int(<age>) % 7 == 0'
+!fandango fuzz -f persons.fan -n 10 -c 'int(<age>) % 7 == 0' --validate
 assert _exit_code == 0
 ```
 (Hint: The modulo operator in Python is `%`).
@@ -201,7 +201,7 @@ $ fandango -v fuzz -f persons.fan -n 10 -c 'int(<age>) % 7 == 0'
 
 ```{code-cell}
 :tags: ["remove-input", "scroll-output"]
-!fandango -v fuzz -f persons.fan -n 10 -c 'int(<age>) % 7 == 0'
+!fandango -v fuzz -f persons.fan -n 10 -c 'int(<age>) % 7 == 0' --validate
 assert _exit_code == 0
 ```
 
@@ -226,7 +226,7 @@ $ fandango -v fuzz -f persons.fan -n 10 -c 'False' -N 50
 
 ```{code-cell}
 :tags: ["remove-input", "scroll-output"]
-!fandango -v fuzz -f persons.fan -n 10 -c 'False' -N 50
+!fandango -v fuzz -f persons.fan -n 10 -c 'False' -N 50 --validate
 assert _exit_code == 0
 ```
 
diff --git a/docs/Fuzzing.md b/docs/Fuzzing.md
index 44819b3a..1c855746 100644
--- a/docs/Fuzzing.md
+++ b/docs/Fuzzing.md
@@ -66,7 +66,7 @@ Your output will look like this:
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz -f persons.fan -n 10
+!fandango fuzz -f persons.fan -n 10 --validate
 assert _exit_code == 0
 ```
 
diff --git a/docs/Generators.md b/docs/Generators.md
index fee8f4f1..d767b67b 100644
--- a/docs/Generators.md
+++ b/docs/Generators.md
@@ -52,7 +52,7 @@ then we can have Fandango create names such as
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz -f persons-nat.fan -n 10
+!fandango fuzz -f persons-nat.fan -n 10 --validate
 assert _exit_code == 0
 ```
 
@@ -127,7 +127,7 @@ This is what the output of the above spec looks like:
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz -f persons-faker.fan -n 10
+!fandango fuzz -f persons-faker.fan -n 10 --validate
 assert _exit_code == 0
 ```
 
@@ -161,7 +161,7 @@ The resulting [Fandango spec file](persons-faker-age.fan) produces the desired r
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz -f persons-faker-age.fan -n 10
+!fandango fuzz -f persons-faker-age.fan -n 10 --validate
 assert _exit_code == 0
 ```
 
@@ -178,7 +178,7 @@ These are the ages we get this way:
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz -f persons-faker-gauss.fan -n 10
+!fandango fuzz -f persons-faker-gauss.fan -n 10 --validate
 assert _exit_code == 0
 ```
 
@@ -203,7 +203,7 @@ With this, both random names (`<name>`) and natural names (`<natural_name>`) wil
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz -f persons-faker50.fan -n 10
+!fandango fuzz -f persons-faker50.fan -n 10 --validate
 assert _exit_code == 0
 ```
 
@@ -236,7 +236,7 @@ and we get
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz -f persons-faker.fan -c '<last_name>.startswith("S")' -n 10
+!fandango fuzz -f persons-faker.fan -c '<last_name>.startswith("S")' -n 10 --validate
 assert _exit_code == 0
 ```
 
@@ -259,7 +259,7 @@ In case this should work, this is only through some internal Fandango optimizati
 Unfortunately, this does not work.
 % ```{code-cell}
 % :tags: ["remove-input"]
-% !fandango fuzz -f persons-faker.fan -c '<first_name> == fake.first_name()' -n 10
+% !fandango fuzz -f persons-faker.fan -c '<first_name> == fake.first_name()' -n 10 --validate
 % assert _exit_code == 0
 % ```
 The reason is that the faker returns _a different value_ every time it is invoked, making it hard for Fandango to solve the constraint.
@@ -275,7 +275,7 @@ $ fandango fuzz -f persons-faker.fan -c 'str(<last_name>).startswith("S")' -c 'i
 This would work:
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz -f persons-faker.fan -c 'str(<last_name>).startswith("S")' -c 'int(<age>) >= 25 and int(<age>) <= 35' -n 10
+!fandango fuzz -f persons-faker.fan -c 'str(<last_name>).startswith("S")' -c 'int(<age>) >= 25 and int(<age>) <= 35' -n 10 --validate
 assert _exit_code == 0
 ```
 
diff --git a/docs/Gif.md b/docs/Gif.md
index d79c8767..010ce848 100644
--- a/docs/Gif.md
+++ b/docs/Gif.md
@@ -24,5 +24,6 @@ We start with a very short GIF to keep things simple ([source](http://probablypr
 We can parse this file using Fandango:
 
 ```{code-cell}
-!fandango parse -f gif89a.fan tinytrans.gif -o - --format=grammar
+!fandango parse -f gif89a.fan tinytrans.gif -o - --format=grammar --validate
+assert _exit_code == 0
 ```
\ No newline at end of file
diff --git a/docs/ISO8601.md b/docs/ISO8601.md
index 20773a4d..b64d33c6 100644
--- a/docs/ISO8601.md
+++ b/docs/ISO8601.md
@@ -112,10 +112,10 @@ iso8601lib += make_rule("iso8601calendardate",
 iso8601lib += make_rule("iso8601year", ["('+'|'-')? <digit>{4}"])
 ```
 
-And yes, we need digits for specifying a year:
-```{code-cell}
-iso8601lib += make_rule("digit", [f"'{digit}'" for digit in range(0, 10)])
-```
+% And yes, we need digits for specifying a year:
+% ```{code-cell}
+% iso8601lib += make_rule("digit", [f"'{digit}'" for digit in range(0, 10)])
+% ```
 
 
 ### Months
@@ -385,7 +385,7 @@ $ fandango fuzz -f iso8601.fan -n 10
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz -f ISO8601.fan -n 10
+!fandango fuzz -f ISO8601.fan -n 10 --validate
 assert _exit_code == 0
 ```
 
@@ -396,7 +396,7 @@ $ fandango fuzz -f ISO8601.fan -n 10 -c 'int(<iso8601year>) > 1950 and int(<iso8
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz -f ISO8601.fan -n 10 -c 'int(<iso8601year>) > 1950 and int(<iso8601year>) < 2000'
+!fandango fuzz -f ISO8601.fan -n 10 -c 'int(<iso8601year>) > 1950 and int(<iso8601year>) < 2000' --validate
 assert _exit_code == 0
 ```
 
diff --git a/docs/Invoking.md b/docs/Invoking.md
index 7a976d4a..acb6331f 100644
--- a/docs/Invoking.md
+++ b/docs/Invoking.md
@@ -38,7 +38,7 @@ And this is what we get:
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz -f digits.fan -n 10    
+!fandango fuzz -f digits.fan -n 10 --validate
 ```
 
 Success! We have created 10 random sequences of digits.
diff --git a/docs/Paths.md b/docs/Paths.md
index c6390c89..0937fc1b 100644
--- a/docs/Paths.md
+++ b/docs/Paths.md
@@ -235,7 +235,7 @@ $ fandango fuzz -f persons.fan -n 10 -c '<first_name>[0].endswith("x")'
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz -f persons.fan -n 10 -c '<first_name>[0].endswith("x")'
+!fandango fuzz -f persons.fan -n 10 -c '<first_name>[0].endswith("x")' --validate
 assert _exit_code == 0
 ```
 
@@ -274,7 +274,7 @@ $ fandango fuzz -f persons.fan -n 10 -c '<first_name>.<name>.endswith("x")'
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz -f persons.fan -n 10 -c '<first_name>.<name>.endswith("x")'
+!fandango fuzz -f persons.fan -n 10 -c '<first_name>.<name>.endswith("x")' --validate
 assert _exit_code == 0
 ```
 
@@ -324,7 +324,7 @@ $ fandango fuzz -f persons.fan -n 10 -c '<first_name>..<ascii_uppercase_letter>
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz -f persons.fan -n 10 -c '<first_name>..<ascii_uppercase_letter> == "X"'
+!fandango fuzz -f persons.fan -n 10 -c '<first_name>..<ascii_uppercase_letter> == "X"' --validate
 assert _exit_code == 0
 ```
 
@@ -350,7 +350,7 @@ $ fandango fuzz -f persons.fan -n 10 -c '<start>[0].<last_name>..<ascii_lowercas
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz -f persons.fan -n 10 -c '<start>[0].<last_name>..<ascii_lowercase_letter> == "x"'
+!fandango fuzz -f persons.fan -n 10 -c '<start>[0].<last_name>..<ascii_lowercase_letter> == "x"' --validate
 assert _exit_code == 0
 ```
 
@@ -420,7 +420,7 @@ $ fandango fuzz -f persons.fan -n 10 -c 'any(n.startswith("A") for n in *<name>)
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz -f persons.fan -n 10 -c 'exists <name> in <start>: <name>.startswith("A")'
+!fandango fuzz -f persons.fan -n 10 -c 'exists <name> in <start>: <name>.startswith("A")' --validate
 assert _exit_code == 0
 ```
 
@@ -457,7 +457,7 @@ $ fandango fuzz -f persons.fan -n 10 -c 'all(c == "a" for c in *<first_name>..<a
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz -f persons.fan -n 10 -c '<first_name>..<ascii_lowercase_letter> == "a"'
+!fandango fuzz -f persons.fan -n 10 -c '<first_name>..<ascii_lowercase_letter> == "a"' --validate
 assert _exit_code == 0
 ```
 
@@ -503,7 +503,7 @@ $ fandango fuzz -f persons.fan -n 10 -c 'int(<age>) > 30 -> <first_name>.startsw
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz -f persons.fan -n 10 -c 'int(<age>) > 30 -> <first_name>.startswith("A")'
+!fandango fuzz -f persons.fan -n 10 -c 'int(<age>) > 30 -> <first_name>.startswith("A")' --validate
 assert _exit_code == 0
 ```
 
@@ -516,7 +516,7 @@ $ fandango fuzz -f persons-faker-gauss.fan -n 10 -c 'int(<age>) > 30 -> <first_n
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz -f persons-faker-gauss.fan -n 10 -c 'int(<age>) > 30 -> <first_name>.startswith("A")'
+!fandango fuzz -f persons-faker-gauss.fan -n 10 -c 'int(<age>) > 30 -> <first_name>.startswith("A")' --validate
 assert _exit_code == 0
 ```
 
diff --git a/docs/Recursive.md b/docs/Recursive.md
index a9c7b93b..5feb530f 100644
--- a/docs/Recursive.md
+++ b/docs/Recursive.md
@@ -82,7 +82,7 @@ $ fandango fuzz -f additions.fan -n 10
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz -f additions.fan -n 10
+!fandango fuzz -f additions.fan -n 10 --validate
 assert _exit_code == 0
 ```
 
@@ -148,7 +148,7 @@ $ fandango fuzz -f expr.fan -n 10
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz -f expr.fan -n 10
+!fandango fuzz -f expr.fan -n 10 --validate
 assert _exit_code == 0
 ```
 
@@ -192,7 +192,7 @@ $ fandango fuzz -f expr-float.fan -n 10
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz -f expr-float.fan -n 10
+!fandango fuzz -f expr-float.fan -n 10 --validate
 assert _exit_code == 0
 ```
 
@@ -204,7 +204,7 @@ $ fandango fuzz -f expr-float.fan -n 10 -c 'eval(str(<start>)) > 1000'
 
 ```{code-cell}
 :tags: ["remove-input", "remove-stderr"]
-!fandango fuzz -f expr-float.fan -n 10 -c 'eval(str(<start>)) > 1000' 2> /dev/null
+!fandango fuzz -f expr-float.fan -n 10 -c 'eval(str(<start>)) > 1000' --validate 2> /dev/null
 assert _exit_code == 0
 ```
 
diff --git a/docs/Regexes.md b/docs/Regexes.md
index 6d9b17ce..7b05b16a 100644
--- a/docs/Regexes.md
+++ b/docs/Regexes.md
@@ -126,7 +126,7 @@ $ fandango fuzz -f infinity.fan -n 10
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz -f infinity.fan -n 10
+!fandango fuzz -f infinity.fan -n 10 --validate
 assert _exit_code == 0
 ```
 
@@ -146,7 +146,7 @@ $ fandango fuzz -f finity.fan -n 10
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz -f finity.fan -n 10
+!fandango fuzz -f finity.fan -n 10 --validate
 assert _exit_code == 0
 ```
 
diff --git a/docs/Strategies.md b/docs/Strategies.md
index 35963a11..a0f9f295 100644
--- a/docs/Strategies.md
+++ b/docs/Strategies.md
@@ -22,7 +22,7 @@ $ fandango fuzz -f persons.fan -n 10
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz -f persons.fan -n 10
+!fandango fuzz -f persons.fan -n 10 --validate
 assert _exit_code == 0
 ```
 
@@ -66,7 +66,7 @@ This is the effect of this rule:
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz -f persons100.fan -n 10
+!fandango fuzz -f persons100.fan -n 10 --validate
 assert _exit_code == 0
 ```
 
@@ -96,7 +96,7 @@ Try it yourself and modify [`persons.fan`](persons.fan) such that it can also pr
 
 ```{code-cell}
 :tags: ["remove-input"]
-!fandango fuzz -f persons-neg.fan -n 10
+!fandango fuzz -f persons-neg.fan -n 10 --validate
 assert _exit_code == 0
 ```
 
diff --git a/docs/empty.fan b/docs/empty.fan
new file mode 100644
index 00000000..eb3369dc
--- /dev/null
+++ b/docs/empty.fan
@@ -0,0 +1 @@
+<start> ::= '123' <digit> | '12345' '' <digit>
\ No newline at end of file
diff --git a/docs/iso8601.fan b/docs/iso8601.fan
index b17475af..04cab0d3 100644
--- a/docs/iso8601.fan
+++ b/docs/iso8601.fan
@@ -11,7 +11,6 @@ import dateutil  # See https://dateutil.readthedocs.io
 
 <iso8601calendardate> ::= <iso8601year> '-' <iso8601month> ('-' <iso8601day>)? | <iso8601year> <iso8601month> <iso8601day>
 <iso8601year> ::= ('+'|'-')? <digit>{4}
-<digit> ::= '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9'
 <iso8601month> ::= '01' | '02' | '03' | '04' | '05' | '06' | '07' | '08' | '09' | '10' | '11' | '12'
 <iso8601day> ::= '01' | '02' | '03' | '04' | '05' | '06' | '07' | '08' | '09' | '10' | '11' | '12' | '13' | '14' | '15' | '16' | '17' | '18' | '19' | '20' | '21' | '22' | '23' | '24' | '25' | '26' | '27' | '28' | '29' | '30' | '31'
 

From dbd64c011c6a8d9d4765fcfdf511f628b05dd6fe Mon Sep 17 00:00:00 2001
From: Andreas Zeller <zeller@cispa.de>
Date: Wed, 5 Feb 2025 16:41:04 +0100
Subject: [PATCH 23/28] Added more tests

---
 tests/resources/empty.fan |  2 ++
 tests/test_parsing.py     | 39 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+)
 create mode 100644 tests/resources/empty.fan

diff --git a/tests/resources/empty.fan b/tests/resources/empty.fan
new file mode 100644
index 00000000..c1aa6028
--- /dev/null
+++ b/tests/resources/empty.fan
@@ -0,0 +1,2 @@
+<start> ::= '123' <digit> | '12345' '' <digit>
+<digit> ::= r'[0-9]'
\ No newline at end of file
diff --git a/tests/test_parsing.py b/tests/test_parsing.py
index 4911a490..aeb8db64 100644
--- a/tests/test_parsing.py
+++ b/tests/test_parsing.py
@@ -182,3 +182,42 @@ def test_a(self):
                 ],
             ),
         )
+
+
+class TestEmptyParsing(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):  # Parse empty.fan once; the tests share cls.grammar.
+        with open("tests/resources/empty.fan", "r") as cls.file:  # closed on exit
+            cls.grammar, _ = parse(cls.file, use_stdlib=False, use_cache=False)
+
+    def _test(self, example, tree):
+        actual_tree = self.grammar.parse(example)
+        self.assertEqual(tree, actual_tree)
+
+    def test_a(self):
+        self._test(
+            "1234",
+            DerivationTree(
+                NonTerminal("<start>"),
+                [
+                    DerivationTree(Terminal("123")),
+                    DerivationTree(NonTerminal("<digit>"),
+                            [DerivationTree(Terminal("4"))]
+                    ),
+                ],
+            ),
+        )
+
+    def test_b(self):
+        self._test(
+            "123456",
+            DerivationTree(
+                NonTerminal("<start>"),
+                [
+                    DerivationTree(Terminal("12345")),
+                    DerivationTree(NonTerminal("<digit>"),
+                            [DerivationTree(Terminal("6"))]
+                    ),
+                ],
+            ),
+        )

From 76b1566b91abe2e1181f4cb6b84576205661c864 Mon Sep 17 00:00:00 2001
From: Andreas Zeller <zeller@cispa.de>
Date: Wed, 5 Feb 2025 16:45:08 +0100
Subject: [PATCH 24/28] Added check for `rgb.fan` parse test

---
 src/fandango/language/grammar.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/fandango/language/grammar.py b/src/fandango/language/grammar.py
index 1c9554d1..5270b55f 100644
--- a/src/fandango/language/grammar.py
+++ b/src/fandango/language/grammar.py
@@ -674,6 +674,9 @@ def scan_bit(
             assert isinstance(state.dot.symbol, int)
             assert 0 <= bit_count <= 7
 
+            if w >= len(word):
+                return False
+
             # Get the highest bit. If `word` is bytes, word[w] is an integer.
             byte = ord(word[w]) if isinstance(word, str) else word[w]
             bit = (byte >> bit_count) & 1

From 5a624a3a1e12f13b7a1829fb389cb429b5202680 Mon Sep 17 00:00:00 2001
From: Andreas Zeller <zeller@cispa.de>
Date: Wed, 5 Feb 2025 17:03:26 +0100
Subject: [PATCH 25/28] Fix: better encoding of mixed quotes

---
 src/fandango/language/symbol.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/fandango/language/symbol.py b/src/fandango/language/symbol.py
index 1e3d8b80..d6c30cf1 100644
--- a/src/fandango/language/symbol.py
+++ b/src/fandango/language/symbol.py
@@ -143,8 +143,8 @@ def __repr__(self):
             if '"' not in self.symbol:
                 return 'r"' + str(self.symbol) + '"'
 
-            # Mixed quotes: escape single quotes
-            symbol = self.symbol.replace("'", r"\'")
+            # Mixed quotes: encode single quotes
+            symbol = self.symbol.replace("'", r"\x27")
             return "r'" + str(symbol) + "'"
 
         # Not a regex

From b26b2daf9a4616de097a9e86bdeabf6b1e2fa045 Mon Sep 17 00:00:00 2001
From: Andreas Zeller <zeller@cispa.de>
Date: Wed, 5 Feb 2025 17:03:57 +0100
Subject: [PATCH 26/28] Fix: skip alternatives only if regexes are used

---
 src/fandango/language/grammar.py | 35 ++++++++------------------------
 1 file changed, 9 insertions(+), 26 deletions(-)

diff --git a/src/fandango/language/grammar.py b/src/fandango/language/grammar.py
index 5270b55f..e50ff1db 100644
--- a/src/fandango/language/grammar.py
+++ b/src/fandango/language/grammar.py
@@ -731,6 +731,11 @@ def scan_bytes(
                 table[k + match_length].add(next_state)
                 self._max_position = max(self._max_position, w)
 
+                if not state.dot.is_regex:
+                    # We only advance by more than 1 if we have regexes.
+                    # Otherwise, we may skip alternatives.
+                    match_length = 1
+
             return match, match_length
 
         def complete(
@@ -758,28 +763,6 @@ def complete(
                         else:
                             s.children.extend(state.children)
 
-        # Commented this out, as
-        # (a) it is not adapted to bits yet, and (b) not used -- AZ
-        #
-        # def parse_table(self, word, start: str | NonTerminal = "<start>"):
-        #     if isinstance(start, str):
-        #         start = NonTerminal(start)
-        #     table = [Column() for _ in range(len(word) + 1)]
-        #     table[0].add(ParseState(NonTerminal("<*start*>"), 0, (start,)))
-        #     self._max_position = -1
-        #
-        #     for k in range(len(word) + 1):
-        #         for state in table[k]:
-        #             if state.finished():
-        #                 self.complete(state, table, k)
-        #             else:
-        #                 if state.next_symbol_is_nonterminal():
-        #                     self.predict(state, table, k)
-        #                 else:
-        #                     # No bit parsing support yet
-        #                     self.scan_byte(state, word, table, k, k)
-        #     return table
-
         def _parse_forest(
             self,
             word: str,
@@ -811,7 +794,7 @@ def _parse_forest(
             bit_count = -1  # If > 0, indicates the next bit to be scanned (7-0)
 
             while k < len(table) and w <= len(word):
-                scanned = 0
+                scanned = 1
 
                 for state in table[k]:
                     if w >= len(word):
@@ -841,7 +824,7 @@ def _parse_forest(
                                     state, word, table, k, w, bit_count
                                 )
                                 if match:
-                                    # LOGGER.debug(f"Scanned bit {state} at position {w:#06x} ({w}) {word[w:]!r}")
+                                    LOGGER.debug(f"Matched bit {state} at position {w:#06x} ({w}) {word[w:]!r}")
                                     scanned = 1
                             else:
                                 # Scan a byte
@@ -864,11 +847,11 @@ def _parse_forest(
                                 match, match_length = \
                                     self.scan_bytes(state, word, table, k, w)
                                 if match:
-                                    # LOGGER.debug(f"Scanned {match_length} byte(s) {state} at position {w:#06x} ({w}) {word[w:]!r}")
+                                    LOGGER.debug(f"Matched {match_length} byte(s) {state} at position {w:#06x} ({w}) {word[w:]!r}")
                                     scanned = max(scanned, match_length)
 
                 if scanned > 0:
-                    # LOGGER.debug(f"Scanned {scanned} byte(s) at position {w:#06x} ({w}); bit_count = {bit_count}")
+                    LOGGER.debug(f"Scanned {scanned} byte(s) at position {w:#06x} ({w}); bit_count = {bit_count}")
                     if bit_count >= 0:
                         # Advance by one bit
                         bit_count -= 1

From d6fd9021655567671710202e732c4c3cb7ad578e Mon Sep 17 00:00:00 2001
From: Andreas Zeller <zeller@cispa.de>
Date: Wed, 5 Feb 2025 17:27:49 +0100
Subject: [PATCH 27/28] New: `--format=repr` outputs tree in internal
 representation (for unit tests)

---
 src/fandango/cli/__init__.py    |  6 ++++--
 src/fandango/language/symbol.py |  9 +++++++++
 src/fandango/language/tree.py   | 17 +++++++++++++++++
 3 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/src/fandango/cli/__init__.py b/src/fandango/cli/__init__.py
index 6398c94a..870df819 100644
--- a/src/fandango/cli/__init__.py
+++ b/src/fandango/cli/__init__.py
@@ -261,9 +261,9 @@ def get_parser(in_command_line=True):
     )
     file_parser.add_argument(
         "--format",
-        choices=["string", "bits", "tree", "grammar", "none"],
+        choices=["string", "bits", "tree", "repr", "grammar", "none"],
         default="string",
-        help="produce output(s) as string (default), as a bit string, as a derivation tree, as a grammar, or none",
+        help="produce output(s) as string (default), as a bit string, as a derivation tree, in internal representation, as a grammar, or none",
     )
     file_parser.add_argument(
         "--file-mode",
@@ -676,6 +676,8 @@ def convert(s: str) -> str | bytes:
 
     if args.format == "tree":
         return convert(tree.to_tree())
+    if args.format == "repr":
+        return convert(tree.to_repr())
     if args.format == "bits":
         return convert(tree.to_bits())
     if args.format == "grammar":
diff --git a/src/fandango/language/symbol.py b/src/fandango/language/symbol.py
index d6c30cf1..efa2adc8 100644
--- a/src/fandango/language/symbol.py
+++ b/src/fandango/language/symbol.py
@@ -42,6 +42,9 @@ def is_regex(self):
     def __hash__(self):
         return NotImplemented
 
+    def to_repr(self):  # constructor-style text built from repr(self.symbol)
+        return f"Symbol({self.symbol!r})"
+
 
 class NonTerminal(Symbol):
     def __init__(self, symbol: str):
@@ -57,6 +60,9 @@ def __eq__(self, other):
     def __hash__(self):
         return hash((self.symbol, self.type))
 
+    def to_repr(self):  # constructor-style text built from repr(self.symbol)
+        return f"NonTerminal({self.symbol!r})"
+
 
 class Terminal(Symbol):
     def __init__(self, symbol: str | bytes | int):
@@ -158,3 +164,6 @@ def __str__(self):
 
     def __hash__(self):
         return hash((self.symbol, self.type))
+
+    def to_repr(self):  # constructor-style text built from repr(self.symbol)
+        return f"Terminal({self.symbol!r})"
diff --git a/src/fandango/language/tree.py b/src/fandango/language/tree.py
index 3dbad911..43d6793b 100644
--- a/src/fandango/language/tree.py
+++ b/src/fandango/language/tree.py
@@ -270,6 +270,23 @@ def to_tree(self, indent=0, start_indent=0) -> str:
         s += ")"
         return s
 
+    def to_repr(self, indent=0, start_indent=0) -> str:
+        """
+        Render the tree as nested `DerivationTree(...)` constructor text.
+        """
+        pad = "  " * indent
+        head = "  " * start_indent + "DerivationTree(" + self.symbol.to_repr()
+        count = len(self._children)
+        if count == 0:
+            return head + ")"
+        if count == 1:
+            return head + ", [" + self._children[0].to_repr(indent, start_indent=0) + "])"
+        rows = "".join(
+            kid.to_repr(indent + 2, start_indent=indent + 2) + ",\n"
+            for kid in self._children
+        )
+        return head + ",\n" + pad + "  [\n" + rows + pad + "  ]\n" + pad + ")"
+
     def to_grammar(self, include_position=True) -> str:
         """
         Output the derivation tree as (specialized) grammar

From e48b2724937d98939a686226ce392de567958925 Mon Sep 17 00:00:00 2001
From: Andreas Zeller <zeller@cispa.de>
Date: Wed, 5 Feb 2025 17:32:32 +0100
Subject: [PATCH 28/28] New: more parsing tests

---
 docs/empty.fan        |  1 -
 tests/test_parsing.py | 14 +++++++-------
 2 files changed, 7 insertions(+), 8 deletions(-)
 delete mode 100644 docs/empty.fan

diff --git a/docs/empty.fan b/docs/empty.fan
deleted file mode 100644
index eb3369dc..00000000
--- a/docs/empty.fan
+++ /dev/null
@@ -1 +0,0 @@
-<start> ::= '123' <digit> | '12345' '' <digit>
\ No newline at end of file
diff --git a/tests/test_parsing.py b/tests/test_parsing.py
index aeb8db64..2578a514 100644
--- a/tests/test_parsing.py
+++ b/tests/test_parsing.py
@@ -212,12 +212,12 @@ def test_b(self):
         self._test(
             "123456",
             DerivationTree(
-                NonTerminal("<start>"),
+                NonTerminal('<start>'),
                 [
-                    DerivationTree(Terminal("12345")),
-                    DerivationTree(NonTerminal("<digit>"),
-                            [DerivationTree(Terminal("6"))]
-                    ),
-                ],
-            ),
+                    DerivationTree(Terminal('12345')),
+                    DerivationTree(Terminal('')),
+                    DerivationTree(NonTerminal('<digit>'),
+                                    [DerivationTree(Terminal('6'))]),
+                ]
+            )
         )