diff --git a/Makefile b/Makefile index 763bd9c4..9be53ed4 100644 --- a/Makefile +++ b/Makefile @@ -90,7 +90,7 @@ osascript -e 'tell application "Safari" to set URL of document of window 1 to UR VIEW_PDF = open $(PDF_TARGET) # Command to check docs for failed assertions -CHECK_DOCS = grep -l AssertionError $(DOCS)/_build/html/*.html; if [ $$? == 0 ]; then false; else true; fi +CHECK_DOCS = grep -l AssertionError $(DOCS)/_build/html/*.html; if [ $$? == 0 ]; then echo 'Check the above files for failed assertions'; false; else true; fi # Targets. diff --git a/docs/Binary.md b/docs/Binary.md index 13ddbaa1..e923386c 100644 --- a/docs/Binary.md +++ b/docs/Binary.md @@ -83,7 +83,7 @@ $ fandango fuzz -f credit_card.fan -n 10 ```{code-cell} :tags: ["remove-input"] -!fandango fuzz -f credit_card.fan -n 10 +!fandango fuzz -f credit_card.fan -n 10 --validate assert _exit_code == 0 ``` @@ -164,6 +164,31 @@ The default is `fuzz --file=mode=auto` (default), which will use `binary` or `te Avoid mixing non-ASCII strings with bits and bytes in a single grammar. ::: +(sec:byte-regexes)= +### Bytes and Regular Expressions + +Fandango also supports [regular expressions](Regexes.md) over bytes. +To obtain a regular expression over a byte string, use both `r` and `b` prefixes. +This is especially useful for character classes. 
+ +Here is an example: [`binfinity.fan`](binfinity.fan) produces strings of five bytes _outside_ the range `\x80-\xff`: + +```{code-cell} +:tags: ["remove-input"] +!cat binfinity.fan +``` + +This is what we get: + +```shell +$ fandango fuzz -f binfinity.fan -n 10 +``` + +```{code-cell} +:tags: ["remove-input"] +!fandango fuzz -f binfinity.fan -n 10 --validate +assert _exit_code == 0 +``` ## Length Encodings @@ -221,12 +246,12 @@ Again, all of this goes into a single `.fan` file: [`binary.fan`](binary.fan) ho Let us produce a single output using `binary.fan` and view its (binary) contents, using `od -c`: ```shell -$ fandango fuzz -n 1 -f binary.fan | od -c +$ fandango fuzz -n 1 -f binary.fan -o - | od -c ``` ```{code-cell} :tags: ["remove-input"] -! fandango fuzz -n 1 -f binary.fan | od -c +! fandango fuzz -n 1 -f binary.fan -o - | od -c ``` The hexadecimal dump shows that the first two bytes encode the length of the string of digits that follows. @@ -248,7 +273,7 @@ and obtain the same result: ```{code-cell} :tags: ["remove-input"] -!fandango fuzz -n 1 -f binary-pack.fan | od -c +!fandango fuzz -n 1 -f binary-pack.fan -o - --validate | od -c assert _exit_code == 0 ``` diff --git a/docs/Bits.md b/docs/Bits.md index 23ee7a4e..2126c18f 100644 --- a/docs/Bits.md +++ b/docs/Bits.md @@ -54,7 +54,7 @@ $ fandango fuzz --format=bits -f bits.fan -n 1 --start-symbol='' ```{code-cell} :tags: ["remove-input"] -!fandango fuzz --format=bits -f bits.fan -n 1 --start-symbol='' +!fandango fuzz --format=bits -f bits.fan -n 1 --start-symbol='' --validate assert _exit_code == 0 ``` @@ -71,7 +71,7 @@ $ fandango fuzz --format=bits -f bits.fan -n 10 -c ' == "\x01" and == "\x01" and == "\x00"' +!fandango fuzz --format=bits -f bits.fan -n 10 -c ' == "\x01" and == "\x00"' --validate assert _exit_code == 0 ``` @@ -83,7 +83,7 @@ $ fandango fuzz --format=bits -f bits.fan -n 1 -c ' == chr(1) and ```{code-cell} :tags: ["remove-input"] -!fandango fuzz --format=bits -f bits.fan -n 1 -c ' == chr(1) 
and == chr(0)' +!fandango fuzz --format=bits -f bits.fan -n 1 -c ' == chr(1) and == chr(0)' --validate assert _exit_code == 0 ``` @@ -95,7 +95,7 @@ $ fandango fuzz --format=bits -f bits.fan -n 1 -c ' == chr(0b111100 ```{code-cell} :tags: ["remove-input"] -!fandango fuzz --format=bits -f bits.fan -n 1 -c ' == chr(0b11110000)' +!fandango fuzz --format=bits -f bits.fan -n 1 -c ' == chr(0b11110000)' --validate assert _exit_code == 0 ``` @@ -115,7 +115,7 @@ $ fandango fuzz --format=bits -f bits.fan -n 1 -c 'ord(str()) > 10' ```{code-cell} :tags: ["remove-input"] -!fandango fuzz --format=bits -f bits.fan -n 10 -c 'ord(str()) > 10' +!fandango fuzz --format=bits -f bits.fan -n 10 -c 'ord(str()) > 10' --validate assert _exit_code == 0 ``` diff --git a/docs/Constraints.md b/docs/Constraints.md index 63b078e6..e99e16a0 100644 --- a/docs/Constraints.md +++ b/docs/Constraints.md @@ -69,7 +69,7 @@ $ fandango fuzz -f persons.fan -n 10 ```{code-cell} :tags: ["remove-input"] -!fandango fuzz -f persons.fan -n 10 -c 'int() < 50' +!fandango fuzz -f persons.fan -n 10 -c 'int() < 50' --validate assert _exit_code == 0 ``` @@ -92,7 +92,7 @@ and we obtain these inputs: ```{code-cell} :tags: ["remove-input"] -!fandango fuzz -f persons.fan -n 10 -c '25 <= int() and int() <= 45' +!fandango fuzz -f persons.fan -n 10 -c '25 <= int() and int() <= 45' --validate assert _exit_code == 0 ``` @@ -100,7 +100,7 @@ Start with [`persons.fan`](persons.fan) and add a constraint such that we genera ```{code-cell} :tags: ["remove-input"] -!fandango fuzz -f persons.fan -n 10 -c 'int() % 7 == 0' +!fandango fuzz -f persons.fan -n 10 -c 'int() % 7 == 0' --validate assert _exit_code == 0 ``` (Hint: The modulo operator in Python is `%`). 
@@ -201,7 +201,7 @@ $ fandango -v fuzz -f persons.fan -n 10 -c 'int() % 7 == 0' ```{code-cell} :tags: ["remove-input", "scroll-output"] -!fandango -v fuzz -f persons.fan -n 10 -c 'int() % 7 == 0' +!fandango -v fuzz -f persons.fan -n 10 -c 'int() % 7 == 0' --validate assert _exit_code == 0 ``` @@ -226,7 +226,7 @@ $ fandango -v fuzz -f persons.fan -n 10 -c 'False' -N 50 ```{code-cell} :tags: ["remove-input", "scroll-output"] -!fandango -v fuzz -f persons.fan -n 10 -c 'False' -N 50 +!fandango -v fuzz -f persons.fan -n 10 -c 'False' -N 50 --validate assert _exit_code == 0 ``` diff --git a/docs/Fuzzing.md b/docs/Fuzzing.md index 44819b3a..1c855746 100644 --- a/docs/Fuzzing.md +++ b/docs/Fuzzing.md @@ -66,7 +66,7 @@ Your output will look like this: ```{code-cell} :tags: ["remove-input"] -!fandango fuzz -f persons.fan -n 10 +!fandango fuzz -f persons.fan -n 10 --validate assert _exit_code == 0 ``` diff --git a/docs/Generators.md b/docs/Generators.md index fee8f4f1..d767b67b 100644 --- a/docs/Generators.md +++ b/docs/Generators.md @@ -52,7 +52,7 @@ then we can have Fandango create names such as ```{code-cell} :tags: ["remove-input"] -!fandango fuzz -f persons-nat.fan -n 10 +!fandango fuzz -f persons-nat.fan -n 10 --validate assert _exit_code == 0 ``` @@ -127,7 +127,7 @@ This is what the output of the above spec looks like: ```{code-cell} :tags: ["remove-input"] -!fandango fuzz -f persons-faker.fan -n 10 +!fandango fuzz -f persons-faker.fan -n 10 --validate assert _exit_code == 0 ``` @@ -161,7 +161,7 @@ The resulting [Fandango spec file](persons-faker-age.fan) produces the desired r ```{code-cell} :tags: ["remove-input"] -!fandango fuzz -f persons-faker-age.fan -n 10 +!fandango fuzz -f persons-faker-age.fan -n 10 --validate assert _exit_code == 0 ``` @@ -178,7 +178,7 @@ These are the ages we get this way: ```{code-cell} :tags: ["remove-input"] -!fandango fuzz -f persons-faker-gauss.fan -n 10 +!fandango fuzz -f persons-faker-gauss.fan -n 10 --validate assert _exit_code == 0 
``` @@ -203,7 +203,7 @@ With this, both random names (``) and natural names (``) wil ```{code-cell} :tags: ["remove-input"] -!fandango fuzz -f persons-faker50.fan -n 10 +!fandango fuzz -f persons-faker50.fan -n 10 --validate assert _exit_code == 0 ``` @@ -236,7 +236,7 @@ and we get ```{code-cell} :tags: ["remove-input"] -!fandango fuzz -f persons-faker.fan -c '.startswith("S")' -n 10 +!fandango fuzz -f persons-faker.fan -c '.startswith("S")' -n 10 --validate assert _exit_code == 0 ``` @@ -259,7 +259,7 @@ In case this should work, this is only through some internal Fandango optimizati Unfortunately, this does not work. % ```{code-cell} % :tags: ["remove-input"] -% !fandango fuzz -f persons-faker.fan -c ' == fake.first_name()' -n 10 +% !fandango fuzz -f persons-faker.fan -c ' == fake.first_name()' -n 10 --validate % assert _exit_code == 0 % ``` The reason is that the faker returns _a different value_ every time it is invoked, making it hard for Fandango to solve the constraint. @@ -275,7 +275,7 @@ $ fandango fuzz -f persons-faker.fan -c 'str().startswith("S")' -c 'i This would work: ```{code-cell} :tags: ["remove-input"] -!fandango fuzz -f persons-faker.fan -c 'str().startswith("S")' -c 'int() >= 25 and int() <= 35' -n 10 +!fandango fuzz -f persons-faker.fan -c 'str().startswith("S")' -c 'int() >= 25 and int() <= 35' -n 10 --validate assert _exit_code == 0 ``` diff --git a/docs/Gif.md b/docs/Gif.md index d79c8767..010ce848 100644 --- a/docs/Gif.md +++ b/docs/Gif.md @@ -24,5 +24,6 @@ We start with a very short GIF to keep things simple ([source](http://probablypr We can parse this file using Fandango: ```{code-cell} -!fandango parse -f gif89a.fan tinytrans.gif -o - --format=grammar +!fandango parse -f gif89a.fan tinytrans.gif -o - --format=grammar --validate +assert _exit_code == 0 ``` \ No newline at end of file diff --git a/docs/ISO8601.md b/docs/ISO8601.md index 8409c53f..b64d33c6 100644 --- a/docs/ISO8601.md +++ b/docs/ISO8601.md @@ -112,10 +112,10 @@ iso8601lib 
+= make_rule("iso8601calendardate", iso8601lib += make_rule("iso8601year", ["('+'|'-')? {4}"]) ``` -And yes, we need digits for specifying a year: -```{code-cell} -iso8601lib += make_rule("digit", [f"'{digit}'" for digit in range(0, 10)]) -``` +% And yes, we need digits for specifying a year: +% ```{code-cell} +% iso8601lib += make_rule("digit", [f"'{digit}'" for digit in range(0, 10)]) +% ``` ### Months @@ -370,7 +370,7 @@ Let us write it into a `.fan` file, so we can use it for fuzzing: open('ISO8601.fan', 'w').write(iso8601lib); ``` -Here comes [`iso9601.fan`](iso9601.fan) in all its glory: +Here comes [`iso8601.fan`](iso8601.fan) in all its glory: ```{code-cell} :tags: ["remove-input"] @@ -385,7 +385,7 @@ $ fandango fuzz -f iso8601.fan -n 10 ```{code-cell} :tags: ["remove-input"] -!fandango fuzz -f ISO8601.fan -n 10 +!fandango fuzz -f ISO8601.fan -n 10 --validate assert _exit_code == 0 ``` @@ -396,7 +396,7 @@ $ fandango fuzz -f ISO8601.fan -n 10 -c 'int() > 1950 and int() > 1950 and int() < 2000' +!fandango fuzz -f ISO8601.fan -n 10 -c 'int() > 1950 and int() < 2000' --validate assert _exit_code == 0 ``` @@ -412,4 +412,4 @@ assert _exit_code == 0 ``` Try out more constraints for yourself! -The generated [`ISO9601.fan`](ISO9601.fan) file is available for download. \ No newline at end of file +The generated [`ISO8601.fan`](ISO8601.fan) file is available for download. \ No newline at end of file diff --git a/docs/Invoking.md b/docs/Invoking.md index 7a976d4a..acb6331f 100644 --- a/docs/Invoking.md +++ b/docs/Invoking.md @@ -38,7 +38,7 @@ And this is what we get: ```{code-cell} :tags: ["remove-input"] -!fandango fuzz -f digits.fan -n 10 +!fandango fuzz -f digits.fan -n 10 --validate ``` Success! We have created 10 random sequences of digits. 
diff --git a/docs/Parsing.md b/docs/Parsing.md index 3750c1a3..ec4f72ae 100644 --- a/docs/Parsing.md +++ b/docs/Parsing.md @@ -93,7 +93,7 @@ assert _exit_code == 0 We see that input and output are identical (as should always be with parsing and unparsing). -:::{info} +:::{tip} As it comes to producing and storing outputs, the `parse` command has the same options as the `fuzz` command. ::: diff --git a/docs/Paths.md b/docs/Paths.md index c6390c89..0937fc1b 100644 --- a/docs/Paths.md +++ b/docs/Paths.md @@ -235,7 +235,7 @@ $ fandango fuzz -f persons.fan -n 10 -c '[0].endswith("x")' ```{code-cell} :tags: ["remove-input"] -!fandango fuzz -f persons.fan -n 10 -c '[0].endswith("x")' +!fandango fuzz -f persons.fan -n 10 -c '[0].endswith("x")' --validate assert _exit_code == 0 ``` @@ -274,7 +274,7 @@ $ fandango fuzz -f persons.fan -n 10 -c '..endswith("x")' ```{code-cell} :tags: ["remove-input"] -!fandango fuzz -f persons.fan -n 10 -c '..endswith("x")' +!fandango fuzz -f persons.fan -n 10 -c '..endswith("x")' --validate assert _exit_code == 0 ``` @@ -324,7 +324,7 @@ $ fandango fuzz -f persons.fan -n 10 -c '.. ```{code-cell} :tags: ["remove-input"] -!fandango fuzz -f persons.fan -n 10 -c '.. == "X"' +!fandango fuzz -f persons.fan -n 10 -c '.. == "X"' --validate assert _exit_code == 0 ``` @@ -350,7 +350,7 @@ $ fandango fuzz -f persons.fan -n 10 -c '[0]...[0]... == "x"' +!fandango fuzz -f persons.fan -n 10 -c '[0]... == "x"' --validate assert _exit_code == 0 ``` @@ -420,7 +420,7 @@ $ fandango fuzz -f persons.fan -n 10 -c 'any(n.startswith("A") for n in *) ```{code-cell} :tags: ["remove-input"] -!fandango fuzz -f persons.fan -n 10 -c 'exists in : .startswith("A")' +!fandango fuzz -f persons.fan -n 10 -c 'exists in : .startswith("A")' --validate assert _exit_code == 0 ``` @@ -457,7 +457,7 @@ $ fandango fuzz -f persons.fan -n 10 -c 'all(c == "a" for c in *.... == "a"' +!fandango fuzz -f persons.fan -n 10 -c '.. 
== "a"' --validate assert _exit_code == 0 ``` @@ -503,7 +503,7 @@ $ fandango fuzz -f persons.fan -n 10 -c 'int() > 30 -> .startsw ```{code-cell} :tags: ["remove-input"] -!fandango fuzz -f persons.fan -n 10 -c 'int() > 30 -> .startswith("A")' +!fandango fuzz -f persons.fan -n 10 -c 'int() > 30 -> .startswith("A")' --validate assert _exit_code == 0 ``` @@ -516,7 +516,7 @@ $ fandango fuzz -f persons-faker-gauss.fan -n 10 -c 'int() > 30 -> ) > 30 -> .startswith("A")' +!fandango fuzz -f persons-faker-gauss.fan -n 10 -c 'int() > 30 -> .startswith("A")' --validate assert _exit_code == 0 ``` diff --git a/docs/Recursive.md b/docs/Recursive.md index a9c7b93b..5feb530f 100644 --- a/docs/Recursive.md +++ b/docs/Recursive.md @@ -82,7 +82,7 @@ $ fandango fuzz -f additions.fan -n 10 ```{code-cell} :tags: ["remove-input"] -!fandango fuzz -f additions.fan -n 10 +!fandango fuzz -f additions.fan -n 10 --validate assert _exit_code == 0 ``` @@ -148,7 +148,7 @@ $ fandango fuzz -f expr.fan -n 10 ```{code-cell} :tags: ["remove-input"] -!fandango fuzz -f expr.fan -n 10 +!fandango fuzz -f expr.fan -n 10 --validate assert _exit_code == 0 ``` @@ -192,7 +192,7 @@ $ fandango fuzz -f expr-float.fan -n 10 ```{code-cell} :tags: ["remove-input"] -!fandango fuzz -f expr-float.fan -n 10 +!fandango fuzz -f expr-float.fan -n 10 --validate assert _exit_code == 0 ``` @@ -204,7 +204,7 @@ $ fandango fuzz -f expr-float.fan -n 10 -c 'eval(str()) > 1000' ```{code-cell} :tags: ["remove-input", "remove-stderr"] -!fandango fuzz -f expr-float.fan -n 10 -c 'eval(str()) > 1000' 2> /dev/null +!fandango fuzz -f expr-float.fan -n 10 -c 'eval(str()) > 1000' --validate 2> /dev/null assert _exit_code == 0 ``` diff --git a/docs/Regexes.md b/docs/Regexes.md new file mode 100644 index 00000000..7b05b16a --- /dev/null +++ b/docs/Regexes.md @@ -0,0 +1,224 @@ +--- +jupytext: + formats: md:myst + text_representation: + extension: .md + format_name: myst +kernelspec: + display_name: Python 3 + language: python + name: python3 
+--- + +(sec:regexes)= +# Regular Expressions + +Although the Fandango grammars cover a wide range of input language features, there are situations where they may be a bit cumbersome to work with. +Consider specifying _every digit except for zeros_: this requires you to enumerate all the other digits `1`, `2`, and so on. +This is why Fandango also supports _regular expressions_, which allow you to use a concise syntax for character ranges, repeated characters and more. +Specifying all digits from `1` to `9`, for instance, becomes the short regular expression `r'[1-9]'`. + + +## About Regular Expressions + +Regular expressions form a language of their own and come with several useful features. +To get an introduction to the regular expressions Fandango uses, read the Python [Regular Expression HOWTO](https://docs.python.org/3/howto/regex.html) and check out the Python [Regular Expression Syntax](https://docs.python.org/3/library/re.html#regular-expression-syntax) for a complete reference. + +In Fandango, regular expressions are used for two purposes: + +* When _producing_ inputs, a regular expression is instantiated into a random string that matches the expression. +* When _parsing_ inputs, a regular expression is used to _parse_ and _match_ inputs. + + +## Writing Regular Expressions + +:::{margin} +For Python aficionados: this is actually a Python "raw string" +::: + +In Fandango, a regular expression comes as a string, prefixed with an `r` character. +To express that a digit can have the values `0` to `9`, instead of + +``` + ::= "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" +``` + +you can write + +``` + ::= r'[0-9]' +``` + +which is much more concise. + +Likewise, to match a sequence of characters that ends in `;`, you can write + +``` + ::= r'[^;]+;' +``` + +Besides indicating a regular expression, the `r` prefix also makes the string a _raw_ string. +This means that backslashes are treated as _literal characters_. 
+The regular expression `\d`, for instance, matches a Unicode digit, which includes `[0-9]`, and also [many other digit characters](https://en.wikipedia.org/wiki/Numerals_in_Unicode). +To include `\d` in a regular expression, write it _as is_; do not escape the backslash with another backslash (as you would do in a regular string): + +:::{margin} +The expression `r'\\d'` would actually match a backslash, followed by a `d` character. +::: + +``` + ::= r'\d' +``` + +:::{warning} +Be aware of the specific syntax of `r`-strings when it comes to backslashes. +::: + +One consequence of backslashes being interpreted literally is that you cannot escape quote characters in a regular expression. +This causes a problem if you need two kinds of quotes (`"` and `'`) in the same regular expression – say, a rule that checks for forbidden characters. + +However, encodings of the form `\xNN` are also interpreted by regular expressions. +Hence, if you need quotes, you can use + +* `\x22` instead of `"` +* `\x27` instead of `'` + +Here is an example: + +``` + ::= r'[\x22\x27;]' +``` + + +## Fine Points about Regular Expressions + +For parsing inputs, Fandango uses the Python [`re`](https://docs.python.org/3/library/re.html) module for matching strings against regular expressions; +for producing inputs, Fandango uses the Python [`exrex`](https://github.com/asciimoo/exrex) module for generating strings that match regular expressions. +All the `re` and `exrex` capabilities and limitations thus extend to Fandango. + +:::{tip} +For regex shortcuts, the `exrex` producer only produces characters in the range `\x00` to `\xff`: + +* for digits (`\d`), the characters `[0-9]` +* for whitespace (`\s`), the characters `[ \t\n\r\f\v]` +* for words (`\w`), the characters `[a-zA-Z0-9_]` +* for non-words (`\W`), the character range `[^a-zA-Z0-9_]` + +To produce Unicode characters, make them part of an explicit range (e.g. `[äöüÄÖÜß]`). 
+::: + + +### Repetition Limits + +Most notably, `exrex` imposes a _repetition limit_ of 20 on generated strings that in principle can have arbitrary length; a `+` or `*` operator will not expand to more than 20 repetitions. +Thus, a grammar [`infinity.fan`](infinity.fan) + +```{code-cell} +:tags: ["remove-input"] +!cat infinity.fan +``` + +that, in principle, could produce arbitrarily long sequences `abcabcabcabc...` will be limited to 20 repetitions at most: + +```shell +$ fandango fuzz -f infinity.fan -n 10 +``` + +```{code-cell} +:tags: ["remove-input"] +!fandango fuzz -f infinity.fan -n 10 --validate +assert _exit_code == 0 +``` + +To precisely control the number of repetitions, use the regular expression `{m,n}` construct, limiting the number of repetitions from `m` to `n`. +Let us limit the number of repetitions to the range 1..5: + +```{code-cell} +:tags: ["remove-input"] +!cat finity.fan +``` + +This is what we get: + +```shell +$ fandango fuzz -f finity.fan -n 10 +``` + +```{code-cell} +:tags: ["remove-input"] +!fandango fuzz -f finity.fan -n 10 --validate +assert _exit_code == 0 +``` + +:::{tip} +Remember that _grammars_ also have operators `+`, `*`, `?`, and `{N,M}` which apply to the preceding grammar element, and work like their _regular expression_ counterparts. +Using these, we could also write the above as +``` + ::= "abc"+ +``` +and +``` + ::= "abc"{1,5} +``` +respectively. +::: + +### Regular Expressions over Bytes + +Regular expressions can also be formed over bytes. +See [Bytes and Regular Expressions](sec:byte-regexes) for details. + + +## Regular Expressions vs. Grammars + +:::{margin} +In theory, context-free grammars are a strict _superset_ of regular expressions - any language that can be expressed in a regular expression can also be expressed in an equivalent grammar. 
+Practical implementations of regular expressions break this hierarchy by introducing some features such as _backreferences_ (check out what `(?P=name)` does), which cannot be expressed in grammars. +::: + +In many cases, a grammar can be replaced by a regular expression and vice versa. +This raises the question: When should one use a regular expression, and when a grammar? +Here are some points to help you decide. + +* Regular expressions are often more _concise_ (but arguably harder to read) than grammars. +* If you want to _reference_ individual elements of a string (say, as part of a constraint now or in the future), use a _grammar_. +* Since their underlying model is simpler, regular expressions are _faster_ to generate, and _much faster_ to [parse](Parsing.md) than grammars. +* If your underlying language separates lexical and syntactical processing, use + - _regular expressions_ for specifying _lexical_ parts such as tokens and fragments; + - a _grammar_ for the _syntax_; and + - [constraints](Constraints.md) for any semantic properties. +* Prefer grammars and constraints over overly complex regular expressions. + + +:::{warning} +Do not use regular expressions for inputs that are [recursive](Recursive.md). +Languages like HTML, XML, even e-mail addresses or URLs, are much easier to capture as grammars. +::: + + +## Regular Expressions as Equivalence Classes + +The choice of grammars vs. regular expressions also affects the Fandango generation algorithm. +Generally speaking, Fandango attempts to cover all alternatives of a grammar. +If, say, `` is specified as + +``` + ::= "0" | "1" | "2" | "3" | "4" | "5" | "6" | "7" | "8" | "9" +``` + +then Fandango will attempt to produce every digit at least once, and also try to cover digit _combinations_ up to a certain depth. +This is useful if you want to specifically test digit processing, or if each of the digits causes a different behavior that needs to be covered. 
+ +If, however, you specify `` as + +``` + ::= r'[0-9]' +``` + +then Fandango will treat this as a _single_ alternative (with all expansions considered semantically equivalent), which once expanded into (some) digit will be considered as covered. + +:::{tip} +* If you do want or need to _differentiate_ between individual elements of a set (because they would be treated differently), consider _grammar alternatives_. +* If you do _not_ want or need to differentiate between individual elements of a set (because they would all be treated the same), consider a _regular expression_. +::: + diff --git a/docs/Stdlib.md b/docs/Stdlib.md index afa5182f..23f89f9c 100644 --- a/docs/Stdlib.md +++ b/docs/Stdlib.md @@ -37,6 +37,15 @@ Symbols starting with an underscore must _not_ be redefined. from fandango.language import stdlib ``` +## Characters + +A `` represents any Unicode character, including newline. + +```{code-cell} +:tags: ["remove-input"] +print(stdlib.any_char) +``` + ## Printable Characters These symbols mimic the [string constants from the Python `string` module](https://docs.python.org/3/library/string.html). @@ -89,17 +98,6 @@ print(stdlib.bytes) ``` -## Characters - -A `` is any Unicode character. - -```{error} -`` is currently not defined. -Use ``, ``, or `` instead. 
-``` -% We need charset or regex specs for this: ::= /./ | '\n' - - ## UTF-8 characters A `` is a UTF-8 encoding of a character, occupying one (``) to four (` ::= rb"[^\x80-\xff]{5}" \ No newline at end of file diff --git a/docs/fandango.bib b/docs/fandango.bib new file mode 100644 index 00000000..e69de29b diff --git a/docs/finity.fan b/docs/finity.fan new file mode 100644 index 00000000..c2352778 --- /dev/null +++ b/docs/finity.fan @@ -0,0 +1 @@ + ::= r"(abc){1,5}" \ No newline at end of file diff --git a/docs/gif.fan b/docs/gif.fan index b94e7158..95d0474d 100644 --- a/docs/gif.fan +++ b/docs/gif.fan @@ -53,7 +53,7 @@ ::= ()* ::= ::= - ::= (b'\x01' | b'\x02' | b'\x03' | b'\x04' | b'\x05' | b'\x06' | b'\x07' | b'\x08' | b'\t' | b'\n' | b'\x0b' | b'\x0c' | b'\r' | b'\x0e' | b'\x0f' | b'\x10' | b'\x11' | b'\x12' | b'\x13' | b'\x14' | b'\x15' | b'\x16' | b'\x17' | b'\x18' | b'\x19' | b'\x1a' | b'\x1b' | b'\x1c' | b'\x1d' | b'\x1e' | b'\x1f' | b' ' | b'!' | b'"' | b'#' | b'$' | b'%' | b'&' | b"'" | b'(' | b')' | b'*' | b'+' | b',' | b'-' | b'.' | b'/' | b'0' | b'1' | b'2' | b'3' | b'4' | b'5' | b'6' | b'7' | b'8' | b'9' | b':' | b';' | b'<' | b'=' | b'>' | b'?' 
| b'@' | b'A' | b'B' | b'C' | b'D' | b'E' | b'F' | b'G' | b'H' | b'I' | b'J' | b'K' | b'L' | b'M' | b'N' | b'O' | b'P' | b'Q' | b'R' | b'S' | b'T' | b'U' | b'V' | b'W' | b'X' | b'Y' | b'Z' | b'[' | b'\\' | b']' | b'^' | b'_' | b'`' | b'a' | b'b' | b'c' | b'd' | b'e' | b'f' | b'g' | b'h' | b'i' | b'j' | b'k' | b'l' | b'm' | b'n' | b'o' | b'p' | b'q' | b'r' | b's' | b't' | b'u' | b'v' | b'w' | b'x' | b'y' | b'z' | b'{' | b'|' | b'}' | b'~' | b'\x7f' | b'\x80' | b'\x81' | b'\x82' | b'\x83' | b'\x84' | b'\x85' | b'\x86' | b'\x87' | b'\x88' | b'\x89' | b'\x8a' | b'\x8b' | b'\x8c' | b'\x8d' | b'\x8e' | b'\x8f' | b'\x90' | b'\x91' | b'\x92' | b'\x93' | b'\x94' | b'\x95' | b'\x96' | b'\x97' | b'\x98' | b'\x99' | b'\x9a' | b'\x9b' | b'\x9c' | b'\x9d' | b'\x9e' | b'\x9f' | b'\xa0' | b'\xa1' | b'\xa2' | b'\xa3' | b'\xa4' | b'\xa5' | b'\xa6' | b'\xa7' | b'\xa8' | b'\xa9' | b'\xaa' | b'\xab' | b'\xac' | b'\xad' | b'\xae' | b'\xaf' | b'\xb0' | b'\xb1' | b'\xb2' | b'\xb3' | b'\xb4' | b'\xb5' | b'\xb6' | b'\xb7' | b'\xb8' | b'\xb9' | b'\xba' | b'\xbb' | b'\xbc' | b'\xbd' | b'\xbe' | b'\xbf' | b'\xc0' | b'\xc1' | b'\xc2' | b'\xc3' | b'\xc4' | b'\xc5' | b'\xc6' | b'\xc7' | b'\xc8' | b'\xc9' | b'\xca' | b'\xcb' | b'\xcc' | b'\xcd' | b'\xce' | b'\xcf' | b'\xd0' | b'\xd1' | b'\xd2' | b'\xd3' | b'\xd4' | b'\xd5' | b'\xd6' | b'\xd7' | b'\xd8' | b'\xd9' | b'\xda' | b'\xdb' | b'\xdc' | b'\xdd' | b'\xde' | b'\xdf' | b'\xe0' | b'\xe1' | b'\xe2' | b'\xe3' | b'\xe4' | b'\xe5' | b'\xe6' | b'\xe7' | b'\xe8' | b'\xe9' | b'\xea' | b'\xeb' | b'\xec' | b'\xed' | b'\xee' | b'\xef' | b'\xf0' | b'\xf1' | b'\xf2' | b'\xf3' | b'\xf4' | b'\xf5' | b'\xf6' | b'\xf7' | b'\xf8' | b'\xf9' | b'\xfa' | b'\xfb' | b'\xfc' | b'\xfd' | b'\xfe' | b'\xff') # not b'\x00' + ::= br'[^\x00]' ::= * # len() == ord(str()); see below ::= b'\x00' ::= @@ -105,7 +105,7 @@ ::= ::= ::=