From a6adc0cf9657c47acfda3eff61daf4d6b078308a Mon Sep 17 00:00:00 2001 From: Tim O'Callaghan Date: Sun, 26 May 2019 13:31:23 +0200 Subject: [PATCH] update to release 7.0.0.12 - removes .gitignores, seems to remove unit tests and issue files from build. updated and kept .gitignores in --- COPYING | 2 +- CREDITS | 72 +- ChangeLog | 67 + aapl/Makefile.am | 6 + autogen.sh | 10 - configure.ac | 24 +- contrib/Makefile.am | 2 - contrib/ragel.m4 | 53 - contrib/ragel.make | 6 - contrib/unicode2ragel.rb | 305 - doc/Makefile.am | 3 + doc/RELEASE_NOTES_V2 | 86 - doc/RELEASE_NOTES_V3 | 8 - doc/RELEASE_NOTES_V4 | 361 - doc/RELEASE_NOTES_V5 | 112 - doc/RELEASE_NOTES_V6 | 95 - doc/extract.awk | 41 - doc/fixbackbox.awk | 10 - doc/generate.lm | 547 - doc/genfigs.sh | 18 - doc/ragel-guide.tex | 3561 ---- doc/ragel-guide.txt | 8 +- doc/ragel.1.in | 2 +- examples/Makefile.am | 72 - examples/README | 40 - examples/atoi.rl | 59 - examples/awkemu.rl | 116 - examples/awkequiv.awk | 10 - examples/clang.rl | 150 - examples/concurrent.rl | 126 - examples/cppscan.lex | 143 - examples/cppscan.rec | 183 - examples/cppscan.rl | 208 - examples/format.rl | 191 - examples/go/Makefile | 32 - examples/go/README | 36 - examples/go/atoi.rl | 89 - examples/go/rpn.rl | 159 - examples/go/url.rl | 414 - examples/go/url_authority.rl | 165 - examples/gotocallret.rl | 96 - examples/mailbox.rl | 207 - examples/params.rl | 102 - examples/pullscan.rl | 170 - examples/rlscan.rl | 300 - examples/statechart.rl | 116 - examples/uri.rl | 31 - issues/1.txt | 9 - issues/11.txt | 9 - issues/12.txt | 12 - issues/2.txt | 9 - issues/252.txt | 13 - issues/254.txt | 9 - issues/255.txt | 9 - issues/256.txt | 9 - issues/291.txt | 9 - issues/292.txt | 9 - issues/293.txt | 9 - issues/3.txt | 9 - issues/301.txt | 9 - issues/302.txt | 9 - issues/306.txt | 11 - issues/307.txt | 19 - issues/311.txt | 9 - issues/319.txt | 9 - issues/320.txt | 37 - issues/321.txt | 9 - issues/338.txt | 9 - issues/339.txt | 9 - issues/4.txt | 14 - 
issues/5.txt | 9 - issues/7.txt | 9 - issues/8.txt | 53 - issues/9.txt | 11 - issues/97.txt | 9 - issues/98.txt | 35 - issues/99.txt | 54 - issues/closed/10.txt | 9 - issues/closed/100.txt | 11 - issues/closed/101.txt | 11 - issues/closed/155.txt | 11 - issues/closed/244.txt | 60 - issues/closed/250.txt | 9 - issues/closed/251.txt | 44 - issues/closed/259.txt | 29 - issues/closed/263.txt | 11 - issues/closed/264.txt | 9 - issues/closed/265.txt | 9 - issues/closed/266.txt | 9 - issues/closed/267.txt | 9 - issues/closed/268.txt | 9 - issues/closed/334.txt | 11 - issues/closed/6.txt | 9 - package/control | 5 - package/postinst | 6 - src/Makefile.am | 142 +- src/actexp.cc | 218 + src/{bingotoexp.h => actexp.h} | 61 +- src/action.h | 4 +- src/actloop.cc | 229 + src/{flatgotoexp.h => actloop.h} | 66 +- src/allocgen.cc | 180 +- src/asm.cc | 158 +- src/asm.h | 15 +- src/binary.cc | 688 +- src/binary.h | 78 +- src/binbreak.cc | 132 + src/{bingotoloop.h => binbreak.h} | 82 +- src/bingoto.cc | 131 + src/bingoto.h | 64 +- src/bingotoexp.cc | 488 - src/bingotoloop.cc | 518 - src/binvar.cc | 383 +- src/binvar.h | 71 +- src/binvarexp.cc | 480 - src/binvarexp.h | 75 - src/binvarloop.cc | 517 - src/binvarloop.h | 76 - src/buffer.h | 2 +- src/codegen.cc | 224 +- src/codegen.h | 98 +- src/common.cc | 383 +- src/common.h | 119 +- src/dot.cc | 31 +- src/dot.h | 3 +- src/dotcodegen-orig.cc | 322 - src/flat.cc | 613 +- src/flat.h | 70 +- src/flatbreak.cc | 118 + src/{flatgotoloop.h => flatbreak.h} | 84 +- src/flatgoto.cc | 118 + src/flatgoto.h | 72 + src/flatgotoexp.cc | 431 - src/flatgotoloop.cc | 471 - src/flatvar.cc | 396 +- src/flatvar.h | 66 +- src/flatvarexp.cc | 473 - src/flatvarexp.h | 75 - src/flatvarloop.cc | 564 - src/flatvarloop.h | 76 - src/fsmap.cc | 11 +- src/fsmattach.cc | 2 +- src/fsmbase.cc | 22 +- src/fsmcond.cc | 3 +- src/fsmgraph.cc | 44 +- src/fsmgraph.h | 57 +- src/fsmmin.cc | 2 +- src/fsmnfa.cc | 234 +- src/fsmstate.cc | 8 +- src/gendata.cc | 177 +- src/gendata.h 
| 34 +- src/goto.cc | 628 +- src/goto.h | 136 +- src/gotoexp.cc | 212 +- src/gotoexp.h | 37 +- src/gotoloop.cc | 295 +- src/gotoloop.h | 36 +- src/host-asm/Makefile.am | 25 + src/host-asm/Makefile.in | 740 + src/{libragel.h => host-asm/main.cc} | 19 +- src/host-asm/rlparse.lm | 204 + src/host-c/Makefile.am | 27 + src/host-c/Makefile.in | 757 + src/host-c/main.cc | 47 + src/host-c/rlhc.c | 16516 ++++++++++++++++ src/host-c/rlhc.lm | 459 + src/host-c/rlparse.lm | 203 + src/host-crack/Makefile.am | 27 + src/host-crack/Makefile.in | 759 + src/host-crack/main.cc | 63 + src/host-crack/rlhc.c | 16714 ++++++++++++++++ src/{rlhc-crack.lm => host-crack/rlhc.lm} | 32 +- src/host-crack/rlparse.lm | 202 + src/host-csharp/Makefile.am | 27 + src/host-csharp/Makefile.in | 759 + src/host-csharp/main.cc | 72 + src/host-csharp/rlhc.c | 16176 ++++++++++++++++ src/{rlhc-csharp.lm => host-csharp/rlhc.lm} | 30 +- src/host-csharp/rlparse.lm | 202 + src/host-d/Makefile.am | 27 + src/host-d/Makefile.in | 757 + src/host-d/main.cc | 72 + src/host-d/rlhc.c | 16489 ++++++++++++++++ src/{rlhc-d.lm => host-d/rlhc.lm} | 34 +- src/host-d/rlparse.lm | 211 + src/host-go/Makefile.am | 27 + src/host-go/Makefile.in | 758 + src/host-go/main.cc | 70 + src/host-go/rlhc.c | 16312 ++++++++++++++++ src/host-go/rlhc.lm | 432 + src/host-go/rlparse.lm | 202 + src/host-java/Makefile.am | 27 + src/host-java/Makefile.in | 759 + src/host-java/main.cc | 64 + src/host-java/rlhc.c | 16279 ++++++++++++++++ src/{rlhc-java.lm => host-java/rlhc.lm} | 59 +- src/host-java/rlparse.lm | 202 + src/host-js/Makefile.am | 27 + src/host-js/Makefile.in | 758 + src/host-js/main.cc | 66 + src/host-js/rlhc.c | 16257 ++++++++++++++++ src/{rlhc-js.lm => host-js/rlhc.lm} | 50 +- src/host-js/rlparse.lm | 202 + src/host-julia/Makefile.am | 27 + src/host-julia/Makefile.in | 759 + src/host-julia/main.cc | 60 + src/host-julia/rlhc.c | 18033 +++++++++++++++++ src/{rlhc-julia.lm => host-julia/rlhc.lm} | 42 +- src/host-julia/rlparse.lm | 202 + 
src/host-ocaml/Makefile.am | 32 + src/host-ocaml/Makefile.in | 762 + src/host-ocaml/main.cc | 59 + src/host-ocaml/rlhc.c | 18160 ++++++++++++++++++ src/{rlhc-ocaml.lm => host-ocaml/rlhc.lm} | 41 +- src/host-ocaml/rlparse.lm | 204 + src/host-ruby/Makefile.am | 27 +- src/host-ruby/driver.sh | 47 - src/host-ruby/main.cc | 58 + src/host-ruby/rlhc-ruby.lm | 496 - src/host-ruby/rlhc.lm | 596 +- src/host-ruby/rlparse.lm | 203 + src/host-ruby/ruby.dsc | 2 - src/host-ruby/ruby.lm | 26 - src/host-rust/Makefile.am | 27 + src/host-rust/Makefile.in | 759 + src/host-rust/main.cc | 60 + src/host-rust/rlhc.c | 16410 ++++++++++++++++ src/{rlhc-rust.lm => host-rust/rlhc.lm} | 35 +- src/host-rust/rlparse.lm | 202 + src/host.lm | 207 - src/idbase.cc | 13 +- src/inputdata.cc | 802 +- src/inputdata.h | 51 +- src/ipgoto.cc | 417 +- src/ipgoto.h | 30 +- src/langdesc.lm | 35 - src/load.cc | 2 +- src/load.h | 2 +- src/longest.cc | 571 + src/main.cc | 755 +- src/parsedata.cc | 32 +- src/parsedata.h | 5 +- src/parsetree.cc | 401 +- src/parsetree.h | 52 +- src/pcheck.h | 2 +- src/ragel.h | 4 +- src/ragel.lm | 116 +- src/redfsm.cc | 25 +- src/redfsm.h | 11 +- src/reducer.cc | 31 +- src/reducer.h | 13 +- src/{host-ruby/rlhc-host.lm => ril.lm} | 97 +- src/rlhc-c.lm | 504 - src/rlhc-go.lm | 491 - src/rlhc-main.lm | 19 + src/rlparse.lm | 210 +- src/{reducer.lm => rlreduce.lm} | 1048 +- src/rlscan.h | 2 +- src/switch.cc | 1036 + src/switch.h | 106 + src/switchbreak.cc | 75 + src/switchbreak.h | 70 + src/switchgoto.cc | 73 + src/switchgoto.h | 70 + src/switchvar.cc | 77 + src/switchvar.h | 72 + src/tabbreak.cc | 378 + src/tabgoto.cc | 330 + src/tables.cc | 81 + src/tables.h | 265 + src/tabvar.cc | 332 + src/xml.cc | 786 - src/xml.h | 81 - src/xmlparse.kh | 211 - src/xmlparse.kl | 1006 - src/xmlscan.rl | 315 - src/xmltags.gperf | 95 - test/Makefile.am | 68 - test/README | 11 - test/any1.rl | 17 - test/args1.rl | 95 - test/args2.rl | 95 - test/argsinc.rl | 11 - test/atoi1.rl | 67 - test/atoi2.rl | 81 - 
test/atoi3.rl | 74 - test/atoi4.rl | 73 - test/atoi5.rl | 266 - test/autogen.sh | 9 - test/awkemu.rl | 154 - test/buffer.h | 57 - test/builtin.rl | 1208 -- test/call1.rl | 101 - test/call2.rl | 116 - test/call3.rl | 122 - test/call4.rl | 39 - test/caseindep.rl | 54 - test/clang1.rl | 282 - test/clang2.rl | 323 - test/clang3.rl | 318 - test/clang4.rl | 187 - test/clang5.rl | 151 - test/cond1.rl | 84 - test/cond10.rl | 79 - test/cond2.rl | 90 - test/cond3.rl | 58 - test/cond4.rl | 53 - test/cond5.rl | 58 - test/cond6.rl | 60 - test/cond7.rl | 80 - test/cond8.rl | 55 - test/cond9.rl | 292 - test/conderr1.rl | 61 - test/conderr2.rl | 36 - test/condrep1.rl | 117 - test/condrep2.rl | 117 - test/condrep3.rl | 115 - test/condrep4.rl | 115 - test/configure.ac | 116 - test/cppscan1.h | 112 - test/cppscan1.rl | 311 - test/cppscan2.rl | 403 - test/cppscan3.rl | 284 - test/cppscan4.rl | 300 - test/cppscan5.rl | 273 - test/cppscan6.rl | 359 - test/crack1.rl | 39 - test/curs1.rl | 34 - test/element1.rl | 107 - test/element2.rl | 82 - test/element3.rl | 143 - test/empty1.rl | 15 - test/eofact.h | 9 - test/eofact.rl | 51 - test/eofcall1.rl | 103 - test/eofcall2.rl | 103 - test/eofgoto1.rl | 101 - test/eofgoto2.rl | 100 - test/eofret1.rl | 104 - test/erract1.rl | 144 - test/erract2.rl | 90 - test/erract3.rl | 103 - test/erract4.rl | 134 - test/erract5.rl | 144 - test/erract6.rl | 81 - test/erract7.rl | 41 - test/erract8.rl | 43 - test/erract9.rl | 42 - test/export1.rl | 58 - test/export2.rl | 56 - test/export3.rl | 52 - test/export4.rl | 58 - test/fnext1.rl | 81 - test/fnext2.rl | 81 - test/fnext3.rl | 81 - test/forder1.rl | 97 - test/forder2.rl | 132 - test/forder3.rl | 106 - test/genrep1.rl | 111 - test/genrep2.rl | 125 - test/genrep3.rl | 623 - test/genrep4.rl | 188 - test/genrep5.rl | 122 - test/genrep6.rl | 177 - test/goto1.rl | 37 - test/gotocallret1.rl | 117 - test/gotocallret2.rl | 78 - test/gotocallret3.rl | 121 - test/high1.rl | 179 - test/high2.rl | 102 - test/high3.rl | 
111 - test/import1.rl | 73 - test/import2.h | 13 - test/import2.rl | 24 - test/include1.rl | 28 - test/include2.rl | 51 - test/include3.rl | 6 - test/include3/smtp_addr_parser.rl | 17 - test/include3/smtp_address.rl | 8 - test/include3/smtp_ip.rl | 4 - test/include3/smtp_whitespace.rl | 4 - test/java1.rl | 47 - test/java2.rl | 50 - test/julia1.rl | 33 - test/keller1.rl | 1074 -- test/lmgoto.rl | 197 - test/mailbox1.h | 33 - test/mailbox1.rl | 266 - test/mailbox2.rl | 172 - test/mailbox3.rl | 246 - test/main.c | 16 - test/minimize1.rl | 80 - test/ncall1.rl | 38 - test/next1.rl | 36 - test/next2.rl | 64 - test/nfa1.rl | 141 - test/nfa2.rl | 295 - test/nfa3.rl | 109 - test/noignore.rl | 68 - test/patact.rl | 99 - test/perftest | 38 - test/range.rl | 73 - test/rangei.rl | 28 - test/recdescent1.rl | 129 - test/recdescent2.rl | 116 - test/recdescent4.rl | 129 - test/recdescent5.rl | 116 - test/repetition.rl | 292 - test/rlscan.rl | 353 - test/rpn1.rl | 110 - test/ruby1.rl | 55 - test/runtests.sh | 611 - test/rust1.rl | 37 - test/scan1.rl | 70 - test/scan2.rl | 34 - test/scan3.rl | 33 - test/scan4.rl | 34 - test/scan5.rl | 88 - test/scan6.rl | 62 - test/sedsubst | 12 - test/stateact1.rl | 47 - test/statechart1.rl | 99 - test/strings1.rl | 200 - test/strings2.h | 9 - test/strings2.rl | 1369 -- test/strings3.rl | 1434 -- test/subject.mk.in | 4 - test/subject.sh.in | 4 - test/targs1.rl | 35 - test/tofrom1.rl | 351 - test/tofrom2.rl | 178 - test/tokstart1.rl | 237 - test/trans-asm.lm | 616 - test/trans-c.lm | 351 - test/trans.lm | 364 - test/union.rl | 196 - test/url1.rl | 540 - test/xml.rl | 105 - test/xmlcommon.rl | 205 - test/zlen1.rl | 15 - 440 files changed, 190088 insertions(+), 48160 deletions(-) delete mode 100755 autogen.sh delete mode 100644 contrib/Makefile.am delete mode 100644 contrib/ragel.m4 delete mode 100644 contrib/ragel.make delete mode 100644 contrib/unicode2ragel.rb delete mode 100644 doc/RELEASE_NOTES_V2 delete mode 100644 doc/RELEASE_NOTES_V3 delete 
mode 100644 doc/RELEASE_NOTES_V4 delete mode 100644 doc/RELEASE_NOTES_V5 delete mode 100644 doc/RELEASE_NOTES_V6 delete mode 100644 doc/extract.awk delete mode 100644 doc/fixbackbox.awk delete mode 100644 doc/generate.lm delete mode 100755 doc/genfigs.sh delete mode 100644 doc/ragel-guide.tex delete mode 100644 examples/Makefile.am delete mode 100644 examples/README delete mode 100644 examples/atoi.rl delete mode 100644 examples/awkemu.rl delete mode 100755 examples/awkequiv.awk delete mode 100644 examples/clang.rl delete mode 100644 examples/concurrent.rl delete mode 100644 examples/cppscan.lex delete mode 100644 examples/cppscan.rec delete mode 100644 examples/cppscan.rl delete mode 100644 examples/format.rl delete mode 100644 examples/go/Makefile delete mode 100644 examples/go/README delete mode 100644 examples/go/atoi.rl delete mode 100644 examples/go/rpn.rl delete mode 100644 examples/go/url.rl delete mode 100644 examples/go/url_authority.rl delete mode 100644 examples/gotocallret.rl delete mode 100644 examples/mailbox.rl delete mode 100644 examples/params.rl delete mode 100644 examples/pullscan.rl delete mode 100644 examples/rlscan.rl delete mode 100644 examples/statechart.rl delete mode 100644 examples/uri.rl delete mode 100644 issues/1.txt delete mode 100644 issues/11.txt delete mode 100644 issues/12.txt delete mode 100644 issues/2.txt delete mode 100644 issues/252.txt delete mode 100644 issues/254.txt delete mode 100644 issues/255.txt delete mode 100644 issues/256.txt delete mode 100644 issues/291.txt delete mode 100644 issues/292.txt delete mode 100644 issues/293.txt delete mode 100644 issues/3.txt delete mode 100644 issues/301.txt delete mode 100644 issues/302.txt delete mode 100644 issues/306.txt delete mode 100644 issues/307.txt delete mode 100644 issues/311.txt delete mode 100644 issues/319.txt delete mode 100644 issues/320.txt delete mode 100644 issues/321.txt delete mode 100644 issues/338.txt delete mode 100644 issues/339.txt delete mode 100644 
issues/4.txt delete mode 100644 issues/5.txt delete mode 100644 issues/7.txt delete mode 100644 issues/8.txt delete mode 100644 issues/9.txt delete mode 100644 issues/97.txt delete mode 100644 issues/98.txt delete mode 100644 issues/99.txt delete mode 100644 issues/closed/10.txt delete mode 100644 issues/closed/100.txt delete mode 100644 issues/closed/101.txt delete mode 100644 issues/closed/155.txt delete mode 100644 issues/closed/244.txt delete mode 100644 issues/closed/250.txt delete mode 100644 issues/closed/251.txt delete mode 100644 issues/closed/259.txt delete mode 100644 issues/closed/263.txt delete mode 100644 issues/closed/264.txt delete mode 100644 issues/closed/265.txt delete mode 100644 issues/closed/266.txt delete mode 100644 issues/closed/267.txt delete mode 100644 issues/closed/268.txt delete mode 100644 issues/closed/334.txt delete mode 100644 issues/closed/6.txt delete mode 100644 package/control delete mode 100644 package/postinst create mode 100644 src/actexp.cc rename src/{bingotoexp.h => actexp.h} (68%) create mode 100644 src/actloop.cc rename src/{flatgotoexp.h => actloop.h} (67%) create mode 100644 src/binbreak.cc rename src/{bingotoloop.h => binbreak.h} (50%) create mode 100644 src/bingoto.cc delete mode 100644 src/bingotoexp.cc delete mode 100644 src/bingotoloop.cc delete mode 100644 src/binvarexp.cc delete mode 100644 src/binvarexp.h delete mode 100644 src/binvarloop.cc delete mode 100644 src/binvarloop.h delete mode 100644 src/dotcodegen-orig.cc create mode 100644 src/flatbreak.cc rename src/{flatgotoloop.h => flatbreak.h} (51%) create mode 100644 src/flatgoto.cc create mode 100644 src/flatgoto.h delete mode 100644 src/flatgotoexp.cc delete mode 100644 src/flatgotoloop.cc delete mode 100644 src/flatvarexp.cc delete mode 100644 src/flatvarexp.h delete mode 100644 src/flatvarloop.cc delete mode 100644 src/flatvarloop.h create mode 100644 src/host-asm/Makefile.am create mode 100644 src/host-asm/Makefile.in rename src/{libragel.h => 
host-asm/main.cc} (80%) create mode 100644 src/host-asm/rlparse.lm create mode 100644 src/host-c/Makefile.am create mode 100644 src/host-c/Makefile.in create mode 100644 src/host-c/main.cc create mode 100644 src/host-c/rlhc.c create mode 100644 src/host-c/rlhc.lm create mode 100644 src/host-c/rlparse.lm create mode 100644 src/host-crack/Makefile.am create mode 100644 src/host-crack/Makefile.in create mode 100644 src/host-crack/main.cc create mode 100644 src/host-crack/rlhc.c rename src/{rlhc-crack.lm => host-crack/rlhc.lm} (96%) create mode 100644 src/host-crack/rlparse.lm create mode 100644 src/host-csharp/Makefile.am create mode 100644 src/host-csharp/Makefile.in create mode 100644 src/host-csharp/main.cc create mode 100644 src/host-csharp/rlhc.c rename src/{rlhc-csharp.lm => host-csharp/rlhc.lm} (96%) create mode 100644 src/host-csharp/rlparse.lm create mode 100644 src/host-d/Makefile.am create mode 100644 src/host-d/Makefile.in create mode 100644 src/host-d/main.cc create mode 100644 src/host-d/rlhc.c rename src/{rlhc-d.lm => host-d/rlhc.lm} (95%) create mode 100644 src/host-d/rlparse.lm create mode 100644 src/host-go/Makefile.am create mode 100644 src/host-go/Makefile.in create mode 100644 src/host-go/main.cc create mode 100644 src/host-go/rlhc.c create mode 100644 src/host-go/rlhc.lm create mode 100644 src/host-go/rlparse.lm create mode 100644 src/host-java/Makefile.am create mode 100644 src/host-java/Makefile.in create mode 100644 src/host-java/main.cc create mode 100644 src/host-java/rlhc.c rename src/{rlhc-java.lm => host-java/rlhc.lm} (93%) create mode 100644 src/host-java/rlparse.lm create mode 100644 src/host-js/Makefile.am create mode 100644 src/host-js/Makefile.in create mode 100644 src/host-js/main.cc create mode 100644 src/host-js/rlhc.c rename src/{rlhc-js.lm => host-js/rlhc.lm} (94%) create mode 100644 src/host-js/rlparse.lm create mode 100644 src/host-julia/Makefile.am create mode 100644 src/host-julia/Makefile.in create mode 100644 
src/host-julia/main.cc create mode 100644 src/host-julia/rlhc.c rename src/{rlhc-julia.lm => host-julia/rlhc.lm} (96%) create mode 100644 src/host-julia/rlparse.lm create mode 100644 src/host-ocaml/Makefile.am create mode 100644 src/host-ocaml/Makefile.in create mode 100644 src/host-ocaml/main.cc create mode 100644 src/host-ocaml/rlhc.c rename src/{rlhc-ocaml.lm => host-ocaml/rlhc.lm} (96%) create mode 100644 src/host-ocaml/rlparse.lm delete mode 100644 src/host-ruby/driver.sh create mode 100644 src/host-ruby/main.cc delete mode 100644 src/host-ruby/rlhc-ruby.lm create mode 100644 src/host-ruby/rlparse.lm delete mode 100644 src/host-ruby/ruby.dsc delete mode 100644 src/host-ruby/ruby.lm create mode 100644 src/host-rust/Makefile.am create mode 100644 src/host-rust/Makefile.in create mode 100644 src/host-rust/main.cc create mode 100644 src/host-rust/rlhc.c rename src/{rlhc-rust.lm => host-rust/rlhc.lm} (96%) create mode 100644 src/host-rust/rlparse.lm delete mode 100644 src/host.lm delete mode 100644 src/langdesc.lm create mode 100644 src/longest.cc rename src/{host-ruby/rlhc-host.lm => ril.lm} (72%) delete mode 100644 src/rlhc-c.lm delete mode 100644 src/rlhc-go.lm create mode 100644 src/rlhc-main.lm rename src/{reducer.lm => rlreduce.lm} (93%) create mode 100644 src/switch.cc create mode 100644 src/switch.h create mode 100644 src/switchbreak.cc create mode 100644 src/switchbreak.h create mode 100644 src/switchgoto.cc create mode 100644 src/switchgoto.h create mode 100644 src/switchvar.cc create mode 100644 src/switchvar.h create mode 100644 src/tabbreak.cc create mode 100644 src/tabgoto.cc create mode 100644 src/tables.cc create mode 100644 src/tables.h create mode 100644 src/tabvar.cc delete mode 100644 src/xml.cc delete mode 100644 src/xml.h delete mode 100644 src/xmlparse.kh delete mode 100644 src/xmlparse.kl delete mode 100644 src/xmlscan.rl delete mode 100644 src/xmltags.gperf delete mode 100644 test/Makefile.am delete mode 100644 test/README delete mode 
100644 test/any1.rl delete mode 100644 test/args1.rl delete mode 100644 test/args2.rl delete mode 100644 test/argsinc.rl delete mode 100644 test/atoi1.rl delete mode 100644 test/atoi2.rl delete mode 100644 test/atoi3.rl delete mode 100644 test/atoi4.rl delete mode 100644 test/atoi5.rl delete mode 100755 test/autogen.sh delete mode 100644 test/awkemu.rl delete mode 100644 test/buffer.h delete mode 100644 test/builtin.rl delete mode 100644 test/call1.rl delete mode 100644 test/call2.rl delete mode 100644 test/call3.rl delete mode 100644 test/call4.rl delete mode 100644 test/caseindep.rl delete mode 100644 test/clang1.rl delete mode 100644 test/clang2.rl delete mode 100644 test/clang3.rl delete mode 100644 test/clang4.rl delete mode 100644 test/clang5.rl delete mode 100644 test/cond1.rl delete mode 100644 test/cond10.rl delete mode 100644 test/cond2.rl delete mode 100644 test/cond3.rl delete mode 100644 test/cond4.rl delete mode 100644 test/cond5.rl delete mode 100644 test/cond6.rl delete mode 100644 test/cond7.rl delete mode 100644 test/cond8.rl delete mode 100644 test/cond9.rl delete mode 100644 test/conderr1.rl delete mode 100644 test/conderr2.rl delete mode 100644 test/condrep1.rl delete mode 100644 test/condrep2.rl delete mode 100644 test/condrep3.rl delete mode 100644 test/condrep4.rl delete mode 100644 test/configure.ac delete mode 100644 test/cppscan1.h delete mode 100644 test/cppscan1.rl delete mode 100644 test/cppscan2.rl delete mode 100644 test/cppscan3.rl delete mode 100644 test/cppscan4.rl delete mode 100644 test/cppscan5.rl delete mode 100644 test/cppscan6.rl delete mode 100644 test/crack1.rl delete mode 100644 test/curs1.rl delete mode 100644 test/element1.rl delete mode 100644 test/element2.rl delete mode 100644 test/element3.rl delete mode 100644 test/empty1.rl delete mode 100644 test/eofact.h delete mode 100644 test/eofact.rl delete mode 100644 test/eofcall1.rl delete mode 100644 test/eofcall2.rl delete mode 100644 test/eofgoto1.rl delete mode 100644 
test/eofgoto2.rl delete mode 100644 test/eofret1.rl delete mode 100644 test/erract1.rl delete mode 100644 test/erract2.rl delete mode 100644 test/erract3.rl delete mode 100644 test/erract4.rl delete mode 100644 test/erract5.rl delete mode 100644 test/erract6.rl delete mode 100644 test/erract7.rl delete mode 100644 test/erract8.rl delete mode 100644 test/erract9.rl delete mode 100644 test/export1.rl delete mode 100644 test/export2.rl delete mode 100644 test/export3.rl delete mode 100644 test/export4.rl delete mode 100644 test/fnext1.rl delete mode 100644 test/fnext2.rl delete mode 100644 test/fnext3.rl delete mode 100644 test/forder1.rl delete mode 100644 test/forder2.rl delete mode 100644 test/forder3.rl delete mode 100644 test/genrep1.rl delete mode 100644 test/genrep2.rl delete mode 100644 test/genrep3.rl delete mode 100644 test/genrep4.rl delete mode 100644 test/genrep5.rl delete mode 100644 test/genrep6.rl delete mode 100644 test/goto1.rl delete mode 100644 test/gotocallret1.rl delete mode 100644 test/gotocallret2.rl delete mode 100644 test/gotocallret3.rl delete mode 100644 test/high1.rl delete mode 100644 test/high2.rl delete mode 100644 test/high3.rl delete mode 100644 test/import1.rl delete mode 100644 test/import2.h delete mode 100644 test/import2.rl delete mode 100644 test/include1.rl delete mode 100644 test/include2.rl delete mode 100644 test/include3.rl delete mode 100644 test/include3/smtp_addr_parser.rl delete mode 100644 test/include3/smtp_address.rl delete mode 100644 test/include3/smtp_ip.rl delete mode 100644 test/include3/smtp_whitespace.rl delete mode 100644 test/java1.rl delete mode 100644 test/java2.rl delete mode 100644 test/julia1.rl delete mode 100644 test/keller1.rl delete mode 100644 test/lmgoto.rl delete mode 100644 test/mailbox1.h delete mode 100644 test/mailbox1.rl delete mode 100644 test/mailbox2.rl delete mode 100644 test/mailbox3.rl delete mode 100644 test/main.c delete mode 100644 test/minimize1.rl delete mode 100644 test/ncall1.rl 
delete mode 100644 test/next1.rl delete mode 100644 test/next2.rl delete mode 100644 test/nfa1.rl delete mode 100644 test/nfa2.rl delete mode 100644 test/nfa3.rl delete mode 100644 test/noignore.rl delete mode 100644 test/patact.rl delete mode 100644 test/perftest delete mode 100644 test/range.rl delete mode 100644 test/rangei.rl delete mode 100644 test/recdescent1.rl delete mode 100644 test/recdescent2.rl delete mode 100644 test/recdescent4.rl delete mode 100644 test/recdescent5.rl delete mode 100644 test/repetition.rl delete mode 100644 test/rlscan.rl delete mode 100644 test/rpn1.rl delete mode 100644 test/ruby1.rl delete mode 100644 test/runtests.sh delete mode 100644 test/rust1.rl delete mode 100644 test/scan1.rl delete mode 100644 test/scan2.rl delete mode 100644 test/scan3.rl delete mode 100644 test/scan4.rl delete mode 100644 test/scan5.rl delete mode 100644 test/scan6.rl delete mode 100755 test/sedsubst delete mode 100644 test/stateact1.rl delete mode 100644 test/statechart1.rl delete mode 100644 test/strings1.rl delete mode 100644 test/strings2.h delete mode 100644 test/strings2.rl delete mode 100644 test/strings3.rl delete mode 100644 test/subject.mk.in delete mode 100644 test/subject.sh.in delete mode 100644 test/targs1.rl delete mode 100644 test/tofrom1.rl delete mode 100644 test/tofrom2.rl delete mode 100644 test/tokstart1.rl delete mode 100644 test/trans-asm.lm delete mode 100644 test/trans-c.lm delete mode 100644 test/trans.lm delete mode 100644 test/union.rl delete mode 100644 test/url1.rl delete mode 100644 test/xml.rl delete mode 100644 test/xmlcommon.rl delete mode 100644 test/zlen1.rl diff --git a/COPYING b/COPYING index e246673..7fa1cdf 100644 --- a/COPYING +++ b/COPYING @@ -1,5 +1,5 @@ -Copyright (c) 2001-2016 Adrian Thurston et al. +Copyright (c) 2001-2018 Adrian Thurston et al. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in diff --git a/CREDITS b/CREDITS index ad7ad5c..32a3a1b 100644 --- a/CREDITS +++ b/CREDITS @@ -1,25 +1,53 @@ Ragel State Machine Compiler -- CREDITS ======================================= -* Ragel was designed and written by Adrian Thurston . - -* Many others have helped out along the way. Feedback, Packaging, and Fixes - provided by: - - Bob Tennent, Robert Lemmen, Tobias Jahn, Cris Bailiff, Buddy Betts, - Scott Dixon, Steven Handerson, Michael Somos, Bob Paddock, Istvan Buki, - David Drai, Matthias Rahlf, Zinx Verituse, Markus W. Weissmann, - Marc Liyanage, Erich Ocean, Alan West, Steven Kibbler, Laurent Boulard, - Jon Oberheide, David Helder, Lexington Luthor, Jason Jobe, Colin Fleming, - Carlos Antunes, Steve Horne, Matt Mower, Josef Goettgens, Zed Shaw, - Marcus Rueckert, Jeremy Hinegardner, Aaron Campbell, Josh Purinton, - Judson Lester, Barry Arthur, Tim Potter, Ryan Phelps, David Waite, - Kenny MacDermid, MenTaLguY, Manoj Rajagopalan, Tim Chklovski, - Mikkel Fahnøe Jørgensen, Andrei Polushin, Evan Phoenix, David Balmain, - Ross Thomas, Mitchell Foral, John D. Mitchell, Diego 'Flameeyes' Pettenò, - Jose Quinteiro, William Morgan, _why, Iñaki Baz Castillo, Attila Sztupák, - Graham Miller, Ismael Luceno, Josh Stern, Denis Naumov, Arbor Networks, - Victor Hugo Borja, Daniel Tang, Justine Tunney, Johannes Pfau, ygrek, - Victor Khimenko, David James, Anton Ageev, Daniel Salzman, Jungshik Shin, - Steven R. Loomis, Ingvar Stepanyan, Kelvin Sherlock, Conrad Steenberg, - Jan Engelhardt, Kenta Sato, Harald Grossauer +Ragel was designed and written by Adrian Thurston . Many +others have helped out along the way. + +* Financial support provided by Arbor Networks and Barracuda + Networks. + +* Objective-C output and valuable feedback contributed by Erich Ocean. 
+ +* D output and many great ideas contributed by Alan West. + +* Conditionals inspired by David Helder. + +* Java code generation contributions, bug reports, fixes, test cases + and suggestions from Colin Fleming. + +* Useful discussions and bug reports from Carlos Antunes. + +* Ruby code generation contributed by Victor Hugo Borja. + +* C# code generation contributed by Daniel Tang. + +* Go code generation contributed by Justine Tunney. Significantly expanded by + Anton Ageev + +* D2 patch from Johannes Pfau. + +* OCaml patch from ygrek. + +* Crack language support from XXXXXXXXXXXXXXX + +* Various feedback, packaging, and fixes provided by: + + Bob Tennent, Robert Lemmen, Tobias Jahn, Cris Bailiff, Buddy Betts, Scott + Dixon, Steven Handerson, Michael Somos, Bob Paddock, Istvan Buki, David + Drai, Matthias Rahlf, Zinx Verituse, Markus W. Weissmann, Marc Liyanage, + Erich Ocean, Alan West, Steven Kibbler, Laurent Boulard, Jon Oberheide, + David Helder, Lexington Luthor, Jason Jobe, Colin Fleming, Carlos Antunes, + Steve Horne, Matt Mower, Josef Goettgens, Zed Shaw, Marcus Rueckert, Jeremy + Hinegardner, Aaron Campbell, Josh Purinton, Judson Lester, Barry Arthur, + Tim Potter, Ryan Phelps, David Waite, Kenny MacDermid, MenTaLguY, Manoj + Rajagopalan, Tim Chklovski, Mikkel Fahnøe Jørgensen, Andrei Polushin, Evan + Phoenix, David Balmain, Ross Thomas, Mitchell Foral, John D. Mitchell, + Diego 'Flameeyes' Pettenò, Jose Quinteiro, William Morgan, Why the Lucky + Stiff, Iñaki Baz Castillo, Attila Sztupák, Graham Miller, Ismael Luceno, + Josh Stern, Denis Naumov, Victor Hugo Borja, Daniel Tang, Justine Tunney, + Johannes Pfau, ygrek, Victor Khimenko, David James, Anton Ageev, Daniel + Salzman, Jungshik Shin, Steven R.
Loomis, Ingvar Stepanyan, Kelvin + Sherlock, Conrad Steenberg, Jan Engelhardt, Kenta Sato, Harald Grossauer, + Kamil Klimkiewicz, Hesham Wahba, Phil Carmody, Yang Hong + diff --git a/ChangeLog b/ChangeLog index 04cbe9c..e7fbc80 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,70 @@ +Ragel 7.0.1 - Aug XXX, 2018 +=========================== + -Condition implementation rewritten. Previously utilized an extension of the + alphabet space to encode "character when A". The character was looked up, the + applicable condition set determined, a constant value was added to the + character value, then the real transition looked up. Now a more natural + implementation has the transition encoded in two levels of lists. The first + level is indexed by the alphabet character, which is unmodified. This tells + us which condition set to execute. The result of the condition execution is + an integer that is looked up in a second level list. This new condition + implementation allows for a much less complicated implementation, and does not + rely on available bits in the alphabet space. + -Conditions now properly execute on EOF. + -Added a Condition-based repetition operator. + -Ragel frontend is now colm-based. The grammar is separated into a core ragel + grammar that can then be extended for different host languages. + -Intermediate codegen language was added and non-C/ASM code generators are now + based on the intermediate language. Separate ragel executable files are used + to implement the different host languages. + -Restrictions on action-based jumping to and calling of state machines in + languages where it cannot be implemented properly (no goto in host language). + These statements must be replaced with the "next" version, which does not + jump out of the action, but instead causes the jump/call after the action + list. + -Added NFA features. This includes repetition and support for large unions of + expressions.
The operator can create a deterministic prefix, the depth of + which is configurable, before NFA alternation begins. + -Consolidating code in the different code generation styles. + +Ragel 6.10 - Mar 24, 2017 +========================= + -C codegen: test P vs PE in goto/call/ret statements in EOF actions, just + before re-entering. If at the end of the input block then the EOF check is + jumped to. This change prevents overrunning the buffer if control flow is + issued in an EOF action without fixing the input pointer first. If a program + properly issues an fhold before the control flow the program won't be + affected. + -Updated action label generation. The previous set of conditions for + generating the label didn't cover actions coming from the eofAction pointer + (eof trans covered since it points into the set of transitions). + -Use separate signed/unsigned values for host type min/max. Using separate + values avoids the need to type cast before the data goes into FsmCtx structs. + Keep it in native types until it is used. + -Optionally do not generate entry point variables. Adds noentry write option + for data. + -Various warning elimination and build updates. + +Ragel 6.9 - Oct 13, 2014 +======================== + -updated command-line synopsis + -ocaml: fix missing semicolon + -ocaml: support -G1 + -ocaml: choose a unique name for type state + -ruby: reduce the amount of calls to GET_WIDE_KEY() + -union test case: warning fix + -omit line directives around expression-oriented write statements + -use AS_IF and test command to check if the DIST file is present + -added missing std:: using + -go: added '//line' directive support + +Ragel 6.8 - Feb 11, 2013 +======================== + + -The -G2 code generator for Go1 was rewritten. Table, flat and switch-based + code generators were added. (Anton Ageev) + -The CXXFLAGS variable is no longer set in the configure script.
+ Ragel 6.7 - May 22, 2011 ======================== -The C vim file now supports L,l on the end of literal numbers, other syntax diff --git a/aapl/Makefile.am b/aapl/Makefile.am index 80b972f..ca4993b 100644 --- a/aapl/Makefile.am +++ b/aapl/Makefile.am @@ -8,3 +8,9 @@ pkginclude_HEADERS = \ EXTRA_DIST = README COPYING +BUILT_SOURCES = \ + include/ragel + +include/ragel: + mkdir -p include + ln -s .. include/ragel diff --git a/autogen.sh b/autogen.sh deleted file mode 100755 index c0471dc..0000000 --- a/autogen.sh +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/bash -# - -set -x - -libtoolize --copy --force -aclocal -autoheader -automake --foreign --add-missing -autoconf diff --git a/configure.ac b/configure.ac index fb04bb3..963130b 100644 --- a/configure.ac +++ b/configure.ac @@ -1,5 +1,5 @@ dnl -dnl Copyright 2001-2016 Adrian Thurston +dnl Copyright 2001-2018 Adrian Thurston dnl dnl Permission is hereby granted, free of charge, to any person obtaining a copy dnl of this software and associated documentation files (the "Software"), to @@ -20,10 +20,10 @@ dnl OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN TH dnl SOFTWARE. 
dnl -AC_INIT(ragel, 7.0.0.10) -PUBDATE="May 2017" +AC_INIT(ragel, 7.0.0.12) +PUBDATE="May 2019" -EXPECTED_COLM=0.13.0.5 +EXPECTED_COLM=0.13.0.7 AM_INIT_AUTOMAKE([foreign]) AC_SUBST(PUBDATE) @@ -126,7 +126,7 @@ if [test "x$build_program" = "xyes"]; then CPPFLAGS="${CPPFLAGS} -I$withval/src/include" LDFLAGS="${LDFLAGS} -L$withval/src" COLM="$withval/src/colm" - COLM_LD="$withval/src/.libs/libcolm.a" + COLM_LD="$withval/src/libcolm.la" COLM_LIBDEP="$COLM_LD" COLM_BINDEP="$COLM" else @@ -196,11 +196,25 @@ AM_CONDITIONAL([BUILD_PROGRAM], [test "x$build_program" = "xyes"]) AM_CONDITIONAL([BUILD_MANUAL], [test "x$build_manual" = "xyes"]) AM_CONDITIONAL([WITH_RAGEL_KELBT], [test "x$RAGEL_KELBT" = "xyes"]) +AC_CANONICAL_HOST() +AM_CONDITIONAL([LINKER_NO_UNDEFINED], [test "x$host_os" = "xlinux-gnu"]) + dnl write output files AC_OUTPUT([ Makefile src/Makefile src/host-ruby/Makefile + src/host-asm/Makefile + src/host-julia/Makefile + src/host-ocaml/Makefile + src/host-c/Makefile + src/host-d/Makefile + src/host-csharp/Makefile + src/host-go/Makefile + src/host-java/Makefile + src/host-rust/Makefile + src/host-crack/Makefile + src/host-js/Makefile aapl/Makefile doc/Makefile doc/ragel.1 diff --git a/contrib/Makefile.am b/contrib/Makefile.am deleted file mode 100644 index 7ef7e8d..0000000 --- a/contrib/Makefile.am +++ /dev/null @@ -1,2 +0,0 @@ - -EXTRA_DIST = ragel.make ragel.m4 unicode2ragel.rb diff --git a/contrib/ragel.m4 b/contrib/ragel.m4 deleted file mode 100644 index 3629d33..0000000 --- a/contrib/ragel.m4 +++ /dev/null @@ -1,53 +0,0 @@ -dnl Check for presence of the Ragel State Machine generator. -dnl -dnl This macro checks for the presence of the ragel tool in the system, -dnl and whether the ragel tool is absolutely needed for a complete -dnl build. 
-dnl -dnl To check for the need for Ragel, you have to provide the relative -dnl path of a source file generated through Ragel: if the file is -dnl present in the source tree, a missing ragel command will not cause -dnl the configure to abort. - -AC_DEFUN([_RAGEL_VARS], [ - AC_ARG_VAR([RAGEL], [Ragel generator command]) - AC_ARG_VAR([RAGELFLAGS], [Ragel generator flags]) -]) - -AC_DEFUN([CHECK_RAGEL], [ - AC_REQUIRE([_RAGEL_VARS]) - AC_CHECK_PROG([RAGEL], [ragel], [ragel], [no]) - - dnl We set RAGEL to false so that it would execute the "false" - dnl command if needed. - AS_IF([test x"$RAGEL" = x"no"], - [RAGEL=false], - AS_IF([test x"$2" != "x"], - [ragel_version=`$RAGEL --version | sed -n -e '1s:.*version \(@<:@0-9@:>@\.@<:@0-9@:>@\) .*:\1:p'` - ragel_version_compare=`echo $ragel_version | tr -d .` - ragel_wanted_version=`echo $2 | tr -d .` - AS_IF([test $ragel_version_compare -lt $ragel_wanted_version], - [AC_MSG_WARN([Found Ragel $ragel_version but Ragel $2 requested]) - RAGEL=false - ]) - ])) - - dnl Only test the need if not found - AS_IF([test x"$RAGEL" = x"false"], [ - AC_MSG_CHECKING([whether we need ragel to regenerate sources]) - AS_IF([test -a "${srcdir}/$1"], [ragel_needed=no], [ragel_needed=yes]) - AC_MSG_RESULT([$ragel_needed]) - - AS_IF([test x"$ragel_needed" = x"yes"], - [AC_MSG_ERROR([dnl -You need Ragel to build from development sources. 
-You can find Ragel at http://www.colm.net/open-source/ragel/dnl - ])]) - ]) -]) - -AC_DEFUN([CHECK_RAGEL_AM], [ - CHECK_RAGEL([$1], [$2]) - - AM_CONDITIONAL([HAVE_RAGEL], [test x"$RAGEL" != x"false"]) -]) diff --git a/contrib/ragel.make b/contrib/ragel.make deleted file mode 100644 index f7a71b5..0000000 --- a/contrib/ragel.make +++ /dev/null @@ -1,6 +0,0 @@ -# -*- Makefile -*- - -SUFFIXES = .rl - -.rl.c: - $(RAGEL) $(RAGELFLAGS) -C $< -o $@ diff --git a/contrib/unicode2ragel.rb b/contrib/unicode2ragel.rb deleted file mode 100644 index d64e601..0000000 --- a/contrib/unicode2ragel.rb +++ /dev/null @@ -1,305 +0,0 @@ -#!/usr/bin/env ruby -# -# This script uses the unicode spec to generate a Ragel state machine -# that recognizes unicode alphanumeric characters. It generates 5 -# character classes: uupper, ulower, ualpha, udigit, and ualnum. -# Currently supported encodings are UTF-8 [default] and UCS-4. -# -# Usage: unicode2ragel.rb [options] -# -e, --encoding [ucs4 | utf8] Data encoding -# -h, --help Show this message -# -# This script was originally written as part of the Ferret search -# engine library. -# -# Author: Rakan El-Khalil - -require 'optparse' -require 'open-uri' - -ENCODINGS = [ :utf8, :ucs4 ] -ALPHTYPES = { :utf8 => "unsigned char", :ucs4 => "unsigned int" } -CHART_URL = "http://www.unicode.org/Public/5.1.0/ucd/DerivedCoreProperties.txt" - -### -# Display vars & default option - -TOTAL_WIDTH = 80 -RANGE_WIDTH = 23 -@encoding = :utf8 - -### -# Option parsing - -cli_opts = OptionParser.new do |opts| - opts.on("-e", "--encoding [ucs4 | utf8]", "Data encoding") do |o| - @encoding = o.downcase.to_sym - end - opts.on("-h", "--help", "Show this message") do - puts opts - exit - end -end - -cli_opts.parse(ARGV) -unless ENCODINGS.member? @encoding - puts "Invalid encoding: #{@encoding}" - puts cli_opts - exit -end - -## -# Downloads the document at url and yields every alpha line's hex -# range and description. 
- -def each_alpha( url, property ) - open( url ) do |file| - file.each_line do |line| - next if line =~ /^#/; - next if line !~ /; #{property} #/; - - range, description = line.split(/;/) - range.strip! - description.gsub!(/.*#/, '').strip! - - if range =~ /\.\./ - start, stop = range.split '..' - else start = stop = range - end - - yield start.hex .. stop.hex, description - end - end -end - -### -# Formats to hex at minimum width - -def to_hex( n ) - r = "%0X" % n - r = "0#{r}" unless (r.length % 2).zero? - r -end - -### -# UCS4 is just a straight hex conversion of the unicode codepoint. - -def to_ucs4( range ) - rangestr = "0x" + to_hex(range.begin) - rangestr << "..0x" + to_hex(range.end) if range.begin != range.end - [ rangestr ] -end - -## -# 0x00 - 0x7f -> 0zzzzzzz[7] -# 0x80 - 0x7ff -> 110yyyyy[5] 10zzzzzz[6] -# 0x800 - 0xffff -> 1110xxxx[4] 10yyyyyy[6] 10zzzzzz[6] -# 0x010000 - 0x10ffff -> 11110www[3] 10xxxxxx[6] 10yyyyyy[6] 10zzzzzz[6] - -UTF8_BOUNDARIES = [0x7f, 0x7ff, 0xffff, 0x10ffff] - -def to_utf8_enc( n ) - r = 0 - if n <= 0x7f - r = n - elsif n <= 0x7ff - y = 0xc0 | (n >> 6) - z = 0x80 | (n & 0x3f) - r = y << 8 | z - elsif n <= 0xffff - x = 0xe0 | (n >> 12) - y = 0x80 | (n >> 6) & 0x3f - z = 0x80 | n & 0x3f - r = x << 16 | y << 8 | z - elsif n <= 0x10ffff - w = 0xf0 | (n >> 18) - x = 0x80 | (n >> 12) & 0x3f - y = 0x80 | (n >> 6) & 0x3f - z = 0x80 | n & 0x3f - r = w << 24 | x << 16 | y << 8 | z - end - - to_hex(r) -end - -def from_utf8_enc( n ) - n = n.hex - r = 0 - if n <= 0x7f - r = n - elsif n <= 0xdfff - y = (n >> 8) & 0x1f - z = n & 0x3f - r = y << 6 | z - elsif n <= 0xefffff - x = (n >> 16) & 0x0f - y = (n >> 8) & 0x3f - z = n & 0x3f - r = x << 10 | y << 6 | z - elsif n <= 0xf7ffffff - w = (n >> 24) & 0x07 - x = (n >> 16) & 0x3f - y = (n >> 8) & 0x3f - z = n & 0x3f - r = w << 18 | x << 12 | y << 6 | z - end - r -end - -### -# Given a range, splits it up into ranges that can be continuously -# encoded into utf8. Eg: 0x00 .. 
0xff => [0x00..0x7f, 0x80..0xff] -# This is not strictly needed since the current [5.1] unicode standard -# doesn't have ranges that straddle utf8 boundaries. This is included -# for completeness as there is no telling if that will ever change. - -def utf8_ranges( range ) - ranges = [] - UTF8_BOUNDARIES.each do |max| - if range.begin <= max - return ranges << range if range.end <= max - - ranges << range.begin .. max - range = (max + 1) .. range.end - end - end - ranges -end - -def build_range( start, stop ) - size = start.size/2 - left = size - 1 - return [""] if size < 1 - - a = start[0..1] - b = stop[0..1] - - ### - # Shared prefix - - if a == b - return build_range(start[2..-1], stop[2..-1]).map do |elt| - "0x#{a} " + elt - end - end - - ### - # Unshared prefix, end of run - - return ["0x#{a}..0x#{b} "] if left.zero? - - ### - # Unshared prefix, not end of run - # Range can be 0x123456..0x56789A - # Which is equivalent to: - # 0x123456 .. 0x12FFFF - # 0x130000 .. 0x55FFFF - # 0x560000 .. 0x56789A - - ret = [] - ret << build_range(start, a + "FF" * left) - - ### - # Only generate middle range if need be. - - if a.hex+1 != b.hex - max = to_hex(b.hex - 1) - max = "FF" if b == "FF" - ret << "0x#{to_hex(a.hex+1)}..0x#{max} " + "0x00..0xFF " * left - end - - ### - # Don't generate last range if it is covered by first range - - ret << build_range(b + "00" * left, stop) unless b == "FF" - ret.flatten! -end - -def to_utf8( range ) - utf8_ranges( range ).map do |r| - build_range to_utf8_enc(r.begin), to_utf8_enc(r.end) - end.flatten! -end - -## -# Perform a 3-way comparison of the number of codepoints advertised by -# the unicode spec for the given range, the originally parsed range, -# and the resulting utf8 encoded range. 
- -def count_codepoints( code ) - code.split(' ').inject(1) do |acc, elt| - if elt =~ /0x(.+)\.\.0x(.+)/ - if @encoding == :utf8 - acc * (from_utf8_enc($2) - from_utf8_enc($1) + 1) - else - acc * ($2.hex - $1.hex + 1) - end - else - acc - end - end -end - -def is_valid?( range, desc, codes ) - spec_count = 1 - spec_count = $1.to_i if desc =~ /\[(\d+)\]/ - range_count = range.end - range.begin + 1 - - sum = codes.inject(0) { |acc, elt| acc + count_codepoints(elt) } - sum == spec_count and sum == range_count -end - -## -# Generate the state maching to stdout - -def generate_machine( name, property ) - pipe = " " - puts " #{name} = " - each_alpha( CHART_URL, property ) do |range, desc| - - codes = (@encoding == :ucs4) ? to_ucs4(range) : to_utf8(range) - - raise "Invalid encoding of range #{range}: #{codes.inspect}" unless - is_valid? range, desc, codes - - range_width = codes.map { |a| a.size }.max - range_width = RANGE_WIDTH if range_width < RANGE_WIDTH - - desc_width = TOTAL_WIDTH - RANGE_WIDTH - 11 - desc_width -= (range_width - RANGE_WIDTH) if range_width > RANGE_WIDTH - - if desc.size > desc_width - desc = desc[0..desc_width - 4] + "..." - end - - codes.each_with_index do |r, idx| - desc = "" unless idx.zero? - code = "%-#{range_width}s" % r - puts " #{pipe} #{code} ##{desc}" - pipe = "|" - end - end - puts " ;" - puts "" -end - -puts < 0 ) - cout << "ACCEPT" << endl; diff --git a/doc/RELEASE_NOTES_V3 b/doc/RELEASE_NOTES_V3 deleted file mode 100644 index 64dd2f1..0000000 --- a/doc/RELEASE_NOTES_V3 +++ /dev/null @@ -1,8 +0,0 @@ - Porting Ragel Version 2 Programs to Version 3 - ============================================= - -1. Replace all instances of *p in action code with the keyword fc. - -2. Replace all instances of : used to set actions or priorities with @. - -3. Wrap named priorities in parentheses so they are of the form @(name,1). 
diff --git a/doc/RELEASE_NOTES_V4 b/doc/RELEASE_NOTES_V4 deleted file mode 100644 index a142f36..0000000 --- a/doc/RELEASE_NOTES_V4 +++ /dev/null @@ -1,361 +0,0 @@ - - RELEASE NOTES Ragel 4.X - - -To-State and From-State Action Embedding Operators Added (4.2) -============================================================== - -Added operators for embedding actions into all transitions into a state and all -transitions out of a state. These embeddings stay with the state, and are -irrespective of what the current transitions are and any future transitions -that may be added into or out of the state. - -In the following example act is executed on the transitions for 't' and 'y'. -Even though it is only embedded in the context of the first alternative. This -is because after matching 'hi ', the machine has not yet distinguished beween -the two threads. The machine is simultaneously in the state expecting 'there' -and the state expecting 'you'. - - action act {} - main := - 'hi ' %*act 'there' | - 'hi you'; - -The to-state action embedding operators embed into transitions that go into: ->~ the start state -$~ all states -%~ final states -<~ states that are not the start -@~ states that are not final -<@~ states that are not the start AND not final - -The from-state action embedding operators embed into transitions that leave: ->* the start state -$* all states -%* final states -<* states that are not the start -@* states that are not final -<@* states that are not the start AND not final - -Changed Operators for Embedding Context/Actions Into States (4.2) -================================================================= - -The operators used to embed context and actions into states have been modified. -The purpose of the modification is to make it easier to distribute actions to -take among the states in a chain of concatenations such that each state has -only a single action embedded. An example follows below. - -Now Gone: - -1. 
The use of >@ for selecting the states to modfiy (as in >@/ to embed eof - actions, etc) has been removed. This prefix meant start state OR not start AND - not final. - -2. The use of @% for selecting states to modify (as in @%/ to embed eof - actions, etc) has been removed. This prefix previously meant not start AND not - final OR final. - -Now Added: - -1. The prefix < which means not start. -2. The prefix @ which means not final. -3. The prefix <@ which means not start & not final" - -The new matrix of operators used to embed into states is: - ->: $: %: <: @: <@: - context ->~ $~ %~ <~ @~ <@~ - to state action ->* $* %* <* @* <@* - from state action ->/ $/ %/ ! $! %! ^ $^ %^ <^ @^ <@^ - local error action - -| | | | | | -| | | | | *- not start & not final -| | | | | -| | | | *- not final -| | | | -| | | *- not start -| | | -| | *- final -| | -| *- all states -| -*- start state - -This example shows one way to use the new operators to cover all the states -with a single action. The embedding of eof2 covers all the states in m2. The -embeddings of eof1 and eof3 avoid the boundaries that m1 and m3 both share with -m2. - - action eof1 {} - action eof2 {} - action eof3 {} - m1 = 'm1'; - m2 = ' '+; - m3 = 'm3'; - - main := m1 @/eof1 . m2 $/eof2 . m3 s $a %l; - main := - ( word ' ' word ) | - ( word '\t' word ); - -This machine needed to be rewritten as the following to avoid duplicate -actions. This is essentially a refactoring of the machine. - - main := word ( ' ' | '\t' ) word; - -An alternative was to specialize the machines: - - word1 = [a-z]+ >s $a %l; - word2 = [a-z]+; - main := - ( word1 ' ' word1 ) | - ( word2 '\t' word1 ); - -Since duplicating an action on a transition is never (in my experience) desired -and must be manually avoided, sometimes to the point of obscuring the machine -specification, it is now done automatically by Ragel. 
This change should have -no effect on existing code that is properly written and will allow the -programmer more freedom when writing new code. - -New Frontend (4.0) -================== - -The syntax for embedding Ragel statements into the host language has changed. -The primary motivation is a better interaction with Objective-C. Under the -previous scheme Ragel generated the opening and closing of the structure and -the interface. The user could inject user defined declarations into the struct -using the struct {}; statement, however there was no way to inject interface -declarations. Under this scheme it was also awkward to give the machine a base -class. Rather then add another statement similar to struct for including -declarations in the interface we take the reverse approach, the user now writes -the struct and interface and Ragel statements are injected as needed. - -Machine specifications now begin with %% and are followed with an optional name -and either a single ragel statement or a sequence of statements enclosed in {}. -If a machine specification does not have a name then Ragel tries to find a name -for it by first checking if the specification is inside a struct or class or -interface. If it is not then it uses the name of the previous machine -specification. If still no name is found then an error is raised. - -Since the user now specifies the fsm struct directly and since the current -state and stack variables are now of type integer in all code styles, it is -more appropriate for the user to manage the declarations of these variables. -Ragel no longer generates the current state and the stack data variables. This -also gives the user more freedom in deciding how the stack is to be allocated, -and also permits it to be grown as necessary, rather than allowing only a fixed -stack size. 
- -FSM specifications now persist in memory, so the second time a specification of -any particular name is seen the statements will be added to the previous -specification. Due to this it is no longer necessary to give the element or -alphabet type in the header portion and in the code portion. In addition there -is now an include statement that allows the inclusion of the header portion of -a machine it it resides in a different file, as well as allowing the inclusion -of a machine spec of a different name from the any file at all. - -Ragel is still able to generate the machine's function declarations. This may -not be required for C code, however this will be necessary for C++ and -Objective-C code. This is now accomplished with the interface statement. - -Ragel now has different criteria for deciding what to generate. If the spec -contains the interface statement then the machine's interface is generated. If -the spec contains the definition of a main machine, then the code is generated. -It is now possible to put common machine definitions into a separate library -file and to include them in other machine specifications. - -To port Ragel 3.x programs to 4.x, the FSM's structure must be explicitly coded -in the host language and it must include the declaration of current state. This -should be called 'curs' and be of type int. If the machine uses the fcall -and fret directives, the structure must also include the stack variables. The -stack should be named 'stack' and be of type int*. The stack top should be -named 'top' and be of type int. - -In Objective-C, the both the interface and implementation directives must also -be explicitly coded by the user. Examples can be found in the section "New -Interface Examples". - -Action and Priority Embedding Operators (4.0) -============================================= - -In the interest of simplifying the language, operators now embed strictly -either on characters or on EOF, but never both. 
Operators should be doing one -well-defined thing, rather than have multiple effects. This also enables the -detection of FSM commands that do not make sense in EOF actions. - -This change is summarized by: - -'%' operator embeds only into leaving characters. - -All global and local error operators only embed on error character - transitions, their action will not be triggerend on EOF in non-final states. - -Addition of EOF action embedding operators for all classes of states to make - up for functionality removed from other operators. These are >/ $/ @/ %/. - -Start transition operator '>' does not imply leaving transtions when start - state is final. - -This change results in a simpler and more direct relationship between the -operators and the physical state machine entities they operate on. It removes -the special cases within the operators that require you to stop and think as -you program in Ragel. - -Previously, the pending out transition operator % simultaneously served two -purposes. First, to embed actions to that are to get transfered to transitions -made going out of the machine. These transitions are created by the -concatentation and kleene star operators. Second, to specify actions that get -executed on EOF should the final state in the machine to which the operator is -applied remain final. - -To convert Ragel 3.x programs: Any place where there is an embedding of an -action into pending out transitions using the % operator and the final states -remain final in the end result machine, add an embedding of the same action -using the EOF operator %/action. - -Also note that when generating dot file output of a specific component of a -machine that has leaving transitions embedded in the final states, these -transitions will no longer show up since leaving transtion operator no longer -causes actions to be moved into the the EOF event when the state they are -embeeded into becomes a final state of the final machine. 
- -Const Element Type (4.0) -======================== - -If the element type has not been defined, the previous behaviour was to default -to the alphabet type. The element type however is usually not specified as -const and in most cases the data pointer in the machine's execute function -should be a const pointer. Therefore ragel now makes the element type default -to a constant version of the alphabet type. This can always be changed by using -the element statment. For example 'element char;' will result in a non-const -data pointer. - -New Interface Examples (4.0) -============================ - ----------- C ---------- - -struct fsm -{ - int curs; -}; - -%% fsm -{ - main := 'hello world'; -} - ---------- C++ --------- - -struct fsm -{ - int curs; - %% interface; -}; - -%% main := 'hello world'; - ------ Objective-C ----- - -@interface Clang : Object -{ -@public - int curs; -}; - -%% interface; - -@end - -@implementation Clang - -%% main := 'hello world'; - -@end - diff --git a/doc/RELEASE_NOTES_V5 b/doc/RELEASE_NOTES_V5 deleted file mode 100644 index 15147d8..0000000 --- a/doc/RELEASE_NOTES_V5 +++ /dev/null @@ -1,112 +0,0 @@ - - RELEASE NOTES Ragel 5.X - -This file describes the changes in Ragel version 5.X that are not backwards -compatible. For a list of all the changes see the ChangeLog file. - - -Interface to Host Programming Language -====================================== - -In version 5.0 there is a new interface to the host programming language. -There are two major changes: the way Ragel specifications are embedded in the -host program text, and the way that the host program interfaces with the -generated code. - -Multiline Ragel specifications begin with '%%{' and end with '}%%'. Single line -specifications start with '%%' and end at the first newline. Machine names are -given with the machine statement at the very beginning of a machine spec. 
This -change was made in order to make the task of separating Ragel code from the -host code as straightforward as possible. This will ease the addition of more -supported host languages. - -Ragel no longer parses structure and class names in order to infer machine -names. Parsing structures and clases requires knowledge of the host language -hardcoded into Ragel. Since Ragel is moving towards language independence, this -feature has been removed. - -If a machine spec does not have a name then the previous spec name is used. If -there is no previous specification then this is an error. - -The second major frontend change in 5.0 is doing away with the init(), -execute() and finish() routines. Instead of generating these functions Ragel -now only generates their contents. This scheme is more flexible, allowing the -user to use a single function to drive the machine or separate out the -different tasks if desired. It also frees the user from having to build the -machine around a structure or a class. - -An example machine is: - --------------------------- - -%%{ - machine fsm; - main := 'hello world'; -}%% - -%% write data; - -int parse( char *p ) -{ - int cs; - char *pe = p + strlen(p); - %%{ - write init; - write exec; - }%% - return cs; -}; - --------------------------- - -The generated code expects certain variables to be available. In some cases -only if the corresponding features are used. - - el* p: A pointer to the data to parse. - el* pe: A pointer to one past the last item. - int cs: The current state. - el* tokstart: The beginning of current match of longest match machines. - el* tokend: The end of the current match. - int act: The longest match pattern that has been matched. - int stack[n]: The stack for machine call statements - int top: The top of the stack for machine call statements - -It is possible to specify to Ragel how the generated code should access all the -variables except p and pe by using the access statement. 
- - access some_pointer->; - access variable_name_prefix; - -The writing statments are: - - write data; - write init; - write exec; - write eof; - -There are some options available: - - write data noerror nofinal noprefix; - write exec noend - - noerror: Do not write the id of the error state. - nofinal: Do not write the id of the first_final state. - noprefix: Do not prefix the variable with the name of the machine - noend: Do not test if the current character has reached pe. This is - useful if one wishes to break out of the machine using fbreak - when hitting some marker, such as the null character. - -The fexec Action Statement Changed -================================== - -The fexec action statement has been changed to take only the new position to -move to. This statement is more useful for moving backwards and reparsing input -than for specifying a whole new buffer entirely and has been shifted to this -new use. Also, using only a single argument simplifies the parsing of Ragel -input files and will ease the addition of other host languages. - -Context Embedding Has Been Dropped -================================== - -The context embedding operators were not carried over from version 4.X. Though -interesting, they have not found any real practical use. diff --git a/doc/RELEASE_NOTES_V6 b/doc/RELEASE_NOTES_V6 deleted file mode 100644 index b08b8a3..0000000 --- a/doc/RELEASE_NOTES_V6 +++ /dev/null @@ -1,95 +0,0 @@ - - RELEASE NOTES Ragel 6.X - -This file describes the changes in Ragel version 6.X that are not backwards -compatible. For a list of all the changes see the ChangeLog file. - -Leaving Actions in Scanners (new in 6.1) -======================================== - -Scanners now ensure that any leaving actions at the end of a pattern are -executed. They are always executed before the pattern action. - -The EOF Event -============= - -There is a new execution variable called "eof". This should be set to pe on the -execution of the last buffer block. 
When p == eof the state machine's EOF -actions are executed. The variable is required only when EOF actions have been -embedded. - -The advantage of this over "write eof" is that EOF actions are now executed in -the same context as regular actions. They are free to manipulate p, and jump to -a new portion of the machine to reprocess input. This was not possible with -"write eof". - -The "write eof" directive was consequently removed. - -Scanners now use EOF actions to to flush out the last token, if needed. This -eliminates the need to manually flush the last token. - -Semantics of > % and Error Actions -================================== - -Ragel has gone back to the 3.X semantics for >, % and error actions. - -Those that have been using Ragel since the 3.X days will remember that the -entering operator > embedded a leaving action/priority into the start state -when it was final. The leaving operator % would embed EOF actions when the -final states stayed final all the way to the end of compilation. Also, error -actions would embed EOF actions when at the end of compilation the states the -error actions were embedded into were not final. - -The problem before was that EOF actions and regular actions were executed in -different contexts ("write exec" and "write eof"), and a single action block -could easily end up in two different functions. This could lead to compile -errors and other subtle errors. Now that all actions are executed in the same -context ("write exec") these problems go away. The original semantics has been -restored. - -Backend Automatically Executed -============================== - -The "ragel" program now automatically executes the appropriate backend. If you -need the intermediate XML format you can use the -x option. - -The fbreak Statement -==================== - -The fbreak statement now advances p. It is now possible to break out of the -machine and restart it without having to fix p first. 
Originally, fbreak did -not advance p because it was intended to be used to terminate processing. -Advancing p was more work than necessary in that case. But fbreak turns out to -be useful for stopping to return a token as well. In this case the failure to -advance p is an inconvenience. - -Guarded Concatenation Operators are Stronger -============================================ - -The :> :>> and <: guarded concatenation operators have been strengthened. In -the previous version of Ragel is was possible for the priority assignments to -be bypassed via the the zero length string. Running the following examples -through 5.25 you will see that the a and b actions are executed on a single -transition, showing the guard fails. This happens because the operators did not -consider that the middle machine might have a start state that is final. In 6.0 -these cases have been fixed. - - (' '@a)* <: 'x'* . ' '@b; - (' '@a)* :> 'x'? . ' '@b; - (' '@a)* :>> 'xyz'? . ' '@b; - -The tokstart and tokend Variables Renamed -========================================= - -The "tokstart" and "tokend" variables were changed to "ts" and "te". These -variables get referenced a lot in scanner actions. They should be shorter. - -To update your code simply search and replace: - tokstart => ts - tokend => te - -Options -======= - -The -l option in rlgen-cd was changed to -L because -l is used in the frontend, -which now must pass options through. 
diff --git a/doc/extract.awk b/doc/extract.awk deleted file mode 100644 index 2874456..0000000 --- a/doc/extract.awk +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/awk -# - -BEGIN { - in_generate = 0; - in_verbatim = 0; - return_val = 1; -} - -/^% GENERATE: *[a-z0-9A-Z_\.\-]+ *$/ && $3 == exname { - in_generate = 1; - return_val = 0; - next; -} - -/^% END GENERATE$/ { - in_generate = 0; - next; -} - -in_generate && /\\begin\{verbatim\}/ { - in_generate = 0; - in_verbatim = 1; - next; -} - -in_verbatim && /\\end\{verbatim\}/ { - in_generate = 1; - in_verbatim = 0; - next; -} - -in_generate && /^%/ { - print substr( $0, 2 ); -} - -in_verbatim { - print $0; -} - -END { exit return_val; } diff --git a/doc/fixbackbox.awk b/doc/fixbackbox.awk deleted file mode 100644 index 434fd20..0000000 --- a/doc/fixbackbox.awk +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/awk -# - -NF == 16 && $16 == 5 { - $7 = 1 - print $0 - next; -} - -{ print $0; } diff --git a/doc/generate.lm b/doc/generate.lm deleted file mode 100644 index bd4faef..0000000 --- a/doc/generate.lm +++ /dev/null @@ -1,547 +0,0 @@ -# -# Copyright 2012 Adrian Thurston -# -# Permission is hereby granted, free of charge, to any person obtaining a copy -# of this software and associated documentation files (the "Software"), to -# deal in the Software without restriction, including without limitation the -# rights to use, copy, modify, merge, publish, distribute, sublicense, and/or -# sell copies of the Software, and to permit persons to whom the Software is -# furnished to do so, subject to the following conditions: -# -# The above copyright notice and this permission notice shall be included in all -# copies or substantial portions of the Software. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -# SOFTWARE. -# - -lex - token word /( [^. \t\n]+ | '.' )/ - token lws /[ \t]+/ - token nl / '\n'/ - - token cmd_verb1 /'.verb|'/ - token cmd_verb2 /'.verb/'/ - token cmd_label /'.label{'/ - token cmd_ref /'.ref{'/ - token cmd_em /'.em{'/ - token cmd_tt /'.tt{'/ - - token cmd_title /'.title' lws/ - token cmd_sub_title /'.subtitle' lws/ - token cmd_author /'.author' lws/ - - token cmd_chapter /'.chapter' lws/ - token cmd_section /'.section' lws/ - token cmd_sub_section /'.subsection' lws/ - token cmd_sub_sub_section /'.subsubsection' lws/ - - token cmd_graphic /'.graphic' lws/ - token cmd_comment /'.comment' lws? '\n'/ - token cmd_verbatim /'.verbatim' lws? '\n'/ - token cmd_code /'.code' lws? '\n'/ - - token cmd_itemize /'.itemize' lws? '\n'/ - token end_itemize /'.end' lws 'itemize' lws? '\n'/ - token cmd_item /'.item' lws/ - - token cmd_center /'.center' lws? '\n'/ - token end_center /'.end' lws 'center' lws? '\n'/ - - token cmd_tabular /'.tabular' lws? '\n'/ - token cmd_row /'.row' lws/ - token end_tabular /'.end' lws 'tabular' lws? '\n'/ - - token cmd_multicols /'.multicols' lws? '\n'/ - token cmd_columnbreak /'.columnbreak' lws? '\n'/ - token end_multicols /'.end' lws 'multicols' lws? '\n'/ - - token cmd_figure / '.figure' lws?/ - token cmd_caption / '.caption' lws/ - token end_figure / '.end' lws 'figure' lws? '\n'/ - - token cmd_list /'.list' lws? '\n'/ - token end_list /'.end' lws 'list' lws? '\n'/ - token cmd_li /'.li' lws/ - - token cmd_license /'.license' lws? 
'\n'/ -end - -lex - token bar_data /[^|]*/ - token end_bar /'|'/ -end - -lex - token slash_data /[^/]*/ - token end_slash /'/'/ -end - -lex - token curly_data /[^}]*/ - token end_curly /'}'/ -end - -def cmd_il - [cmd_verb1 bar_data end_bar] -| [cmd_verb2 slash_data end_slash] -| [cmd_label curly_data end_curly] -| [cmd_ref curly_data end_curly] -| [cmd_em curly_data end_curly] -| [cmd_tt curly_data end_curly] - -def text - [word] -| [lws] -| [cmd_il] - -lex - token end_verbatim /lws? '.' lws? 'end' lws 'verbatim' lws? '\n'/ - token verbatim_line /[^\n]* '\n'/ -end - -def verbatim - [cmd_verbatim verbatim_line* end_verbatim] - -lex - token end_code /lws? '.' lws? 'end' lws 'code' lws? '\n'/ - token code_line /[^\n]* '\n'/ -end - -def code - [cmd_code code_line* end_code] - -lex - token end_comment /lws? '.' lws? 'end' lws 'comment' lws? '\n'/ - token comment_line /[^\n]* '\n'/ -end - -def comment - [cmd_comment comment_line* end_comment] - -def figure - [cmd_figure text nl line* caption? end_figure] - -def li - [cmd_li text* nl] - -def _list - [cmd_list li* end_list] - -def scale - [lws word word*] - -def graphic - [cmd_graphic word scale? 
nl] - -def itemize - [cmd_itemize line* item* end_itemize] - -def center - [cmd_center line* end_center] - -def row - [cmd_row text* nl] - -def tabular - [cmd_tabular row* end_tabular] - -def multicols_line - [cmd_columnbreak] -| [line] - -def multicols - [cmd_multicols multicols_line* end_multicols] - -def item - [cmd_item line*] - -def caption - [cmd_caption line*] - -def line - [text] -| [nl] -| [comment] -| [verbatim] -| [code] -| [graphic] -| [itemize] -| [center] -| [tabular] -| [multicols] -| [figure] -| [_list] - -def sub_sub_section - [cmd_sub_sub_section text* nl line*] - -def sub_section - [cmd_sub_section text* nl line* sub_sub_section*] - -def section - [cmd_section text* nl line* sub_section*] - -def chapter - [cmd_chapter text* nl line* section*] - -def title - [cmd_title text* nl] - -def subtitle - [cmd_sub_title text* nl] - -def author - [cmd_author text* nl] - -# -# Paragraphs. -# - -def pline - [text text* nl] - -def paragraph - [pline pline*] - -def pextra - [nl paragraph] - -def block - [paragraph pextra*] - -def license - [cmd_license nl* block nl*] - -# -# Preamble. -# - -def preamble_item - [text] -| [nl] -| [title] -| [subtitle] -| [author] - -def preamble - [preamble_item* license] - -def start - [preamble chapter*] - -parse Start: start[ stdin ] -if ( ! 
Start ) { - print( error, '\n' ) - exit( 1 ) -} - -int printPlData( Pld: cmd_il ) -{ - if match Pld [ cmd_verb1 V: bar_data end_bar] { - print( '\\verb|' ) - print( V ) - print( '|' ) - } - else if match Pld [cmd_verb2 V: slash_data end_slash] { - print( '\\verb/' ) - print( V ) - print( '/' ) - } - else if match Pld [cmd_label L: curly_data end_curly] { - print( '\\label{' ) - print( L ) - print( '}' ) - } - else if match Pld [cmd_ref L: curly_data end_curly] { - print( '\\ref{' ) - print( L ) - print( '}' ) - } - else if match Pld [cmd_em L: curly_data end_curly] { - print( '{\\em ' ) - print( L ) - print( '}' ) - } - else if match Pld [cmd_tt L: curly_data end_curly] { - print( '{\\tt ' ) - print( L ) - print( '}' ) - } - else { - print( Pld ) - } -} - -int printText( Lines: text* ) -{ - for L: text in repeat(Lines) { - if match L [PlData: cmd_il] { - printPlData( PlData ) - } - else { - print( L ) - } - } -} - -int printLines( Lines: line* ) -{ - for L: line in repeat(Lines) { - if match L [word] { - print( L ) - } - if match L [lws] { - print( L ) - } - if match L [nl] { - print( L ) - } - if match L [PlData: cmd_il] { - printPlData( PlData ) - } - if match L [cmd_verbatim Lines: verbatim_line* end_verbatim] { - print( '\\begin{verbatim}\n' ) - print( Lines ) - print( '\\end{verbatim}\n' ) - print( '\\verbspace\n' ) - } - if match L [cmd_code Lines: code_line* end_code] { - print( '\\begin{inline_code}\n' ) - print( '\\begin{verbatim}\n' ) - print( Lines ) - print( '\\end{verbatim}\n' ) - print( '\\end{inline_code}\n' ) - print( '\\verbspace\n' ) - } - if match L [cmd_graphic Name: word Scale: scale? 
nl] { - print( '\\graphspace\n' ) - print( '\\begin{center}\n' ) - print( '\\includegraphics' ) - if match Scale [lws Spd: word Spd2: word*] - print( '[scale=', Spd, Spd2, ']' ) - else - print( '[scale=0.55]' ) - print( '{', Name, '}\n' ) - print( '\\end{center}\n' ) - print( '\\graphspace\n' ) - } - if match L [cmd_itemize Lines: line* Items: item* end_itemize] { - print( '\\begin{itemize}\n' ) - printLines( Lines ) - for Item: item in repeat(Items) { - match Item [cmd_item Lines: line*] - print( '\\item ' ) - printLines( Lines ) - } - print( '\\end{itemize}\n' ) - } - if match L [cmd_figure DirData: text nl Lines: line* Caption: caption? end_figure] { - print( '\\begin{figure}\n' ) - print( '\\small\n' ) - printLines( Lines ) - if match Caption [cmd_caption CL: line*] { - print( '\\caption{' ) - printLines( CL ) - print( '}\n' ) - } - print( '\\label{', DirData, '}\n' ) - print( '\\end{figure}\n' ) - } - if match L [cmd_list LiList: li* end_list] { - for Li: li* in LiList { - if match Li [cmd_li Lines: text* nl Rest: li*] { - print( '\\noindent\\\hspace*{24pt}' ) - printText( Lines ) - if match Rest [ li li* ] - print( '\\\\' ) - print( '\n' ) - } - } - print( '\\vspace{12pt}\n' ) - } - if match L [cmd_center Lines: line* end_center] { - print( '\\begin{center}\n' ) - printLines( Lines ) - print( '\\end{center}\n' ) - } - if match L [cmd_tabular Rows: row* end_tabular] { - print( '\\begin{tabular}{|c|c|c|}\n' ) - print( '\\hline\n' ) - for Row: row in repeat(Rows) { - if match Row [cmd_row Lines: text* nl ] { - printText( Lines ) - print( '\\\\' '\n' ) - print( '\\hline\n' ) - } - } - print( '\\end{tabular}\n' ) - } - if match L [cmd_multicols Lines: multicols_line* end_multicols] { - print( '\\begin{multicols}{2}\n' ) - for McLine: multicols_line in repeat( Lines ) { - if match McLine [Line: line] - printLines( cons line* [Line] ) - else if match McLine [cmd_columnbreak] { - print( '\\columnbreak\n' ) - } - } - print( '\\end{multicols}\n' ) - } - } -} - -match 
Start - [Preamble: preamble Chapters: chapter*] - -Title: title = title in Preamble -match Title [cmd_title TitleData: text* nl] - -SubTitle: subtitle = subtitle in Preamble -match SubTitle [cmd_sub_title SubTitleData: text* nl] - -Author: author = author in Preamble -match Author [cmd_author AuthorData: text* nl] - -License: license = license in Preamble - -print( - ~\documentclass[letterpaper,11pt,oneside]{book} - ~\usepackage{graphicx} - ~\usepackage{comment} - ~\usepackage{multicol} - ~\usepackage[ - ~ colorlinks=true, - ~ linkcolor=black, - ~ citecolor=green, - ~ filecolor=black, - ~ urlcolor=black]{hyperref} - ~ - ~\topmargin -0.20in - ~\oddsidemargin 0in - ~\textwidth 6.5in - ~\textheight 9in - ~ - ~\setlength{\parskip}{0pt} - ~\setlength{\topsep}{0pt} - ~\setlength{\partopsep}{0pt} - ~\setlength{\itemsep}{0pt} - ~ - ~\input{version} - ~ - ~\newcommand{\verbspace}{\vspace{10pt}} - ~\newcommand{\graphspace}{\vspace{10pt}} - ~ - ~\renewcommand\floatpagefraction{.99} - ~\renewcommand\topfraction{.99} - ~\renewcommand\bottomfraction{.99} - ~\renewcommand\textfraction{.01} - ~\setcounter{totalnumber}{50} - ~\setcounter{topnumber}{50} - ~\setcounter{bottomnumber}{50} - ~ - ~\newenvironment{inline_code}{\def\baselinestretch{1}\vspace{12pt}\small}{} - ~ - ~\begin{document} - ~ - ~\thispagestyle{empty} - ~\begin{center} - ~\vspace*{3in} -) - -print( '{\\huge ', TitleData, '}\\\\\n' ) - -print( '\\vspace*{12pt}\n' ) - -print( '{\\Large ', SubTitleData, '}\\\\\n' ) - -print( - ~\vspace{1in} - ~by\\ - ~\vspace{12pt} -) - -print( '{\\large ', AuthorData, '}\\\\\n' ) - -print( - ~\end{center} - ~\clearpage - ~ - ~\pagenumbering{roman} - ~ - ~\chapter*{License} -) - -print( - ~Ragel version \version, \pubdate\\ - ~Copyright \copyright\ 2003-2012 Adrian D. 
Thurston - ~\vspace{6mm} - ~ -) - -i: int = 0 -for P: paragraph in License { - if ( i != 0 ) { - print( - ~ - ~\vspace{5pt} - ~ - ) - } - print( "{\\bf\\it\\noindent " ) - print( P ) - print( "}\n" ) - i = i + 1 -} - -print( - ~ - ~\clearpage - ~\tableofcontents - ~\clearpage - ~ - ~\pagenumbering{arabic} -) - - -for Chapter: chapter in repeat(Chapters) { - match Chapter - [cmd_chapter DirData: text* nl Lines: line* SectionList: section*] - - print( '\\chapter{', DirData, '}\n' ) - printLines( Lines ) - - for Section: section in repeat(SectionList) { - match Section - [cmd_section DirData: text* nl Lines: line* SubSectionList: sub_section*] - - print( '\\section{', DirData, '}\n' ) - printLines( Lines ) - for SubSection: sub_section in repeat(SubSectionList) { - match SubSection - [cmd_sub_section DirData: text* nl Lines: line* - SubSubSectionList: sub_sub_section*] - - print( '\\subsection{', DirData, '}\n' ) - printLines( Lines ) - - for SubSubSection: sub_sub_section in repeat(SubSubSectionList) { - match SubSubSection - [cmd_sub_sub_section DirData: text* nl Lines: line*] - - print( '\\subsubsection{', DirData, '}\n' ) - printLines( Lines ) - } - } - } -} - -print( - ~ - ~\end{document} -) diff --git a/doc/genfigs.sh b/doc/genfigs.sh deleted file mode 100755 index 8d52107..0000000 --- a/doc/genfigs.sh +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash -# - -input=ragel-guide.tex - -for fig; do - if awk -f extract.awk -vexname=$fig $input > /dev/null; then - echo generating ${fig}.dot - opt=`awk -f extract.awk -vexname=$fig $input | - sed '/^ *OPT:/s/^.*: *//p;d'` - awk -f extract.awk -vexname=$fig $input > ${fig}.rl - ../ragel/ragel -V -p ${fig}.rl > ${fig}.dot - else - echo "$0: internal error: figure $fig not found in $input" >&2 - exit 1 - fi -done - diff --git a/doc/ragel-guide.tex b/doc/ragel-guide.tex deleted file mode 100644 index ac43edb..0000000 --- a/doc/ragel-guide.tex +++ /dev/null @@ -1,3561 +0,0 @@ -\documentclass[letterpaper,11pt,oneside]{book} 
-\usepackage{graphicx} -\usepackage{comment} -\usepackage{multicol} -\usepackage[ - colorlinks=true, - linkcolor=black, - citecolor=green, - filecolor=black, - urlcolor=black]{hyperref} - -\topmargin -0.20in -\oddsidemargin 0in -\textwidth 6.5in -\textheight 9in - -\setlength{\parskip}{0pt} -\setlength{\topsep}{0pt} -\setlength{\partopsep}{0pt} -\setlength{\itemsep}{0pt} - -\input{version} - -\newcommand{\verbspace}{\vspace{10pt}} -\newcommand{\graphspace}{\vspace{10pt}} - -\renewcommand\floatpagefraction{.99} -\renewcommand\topfraction{.99} -\renewcommand\bottomfraction{.99} -\renewcommand\textfraction{.01} -\setcounter{totalnumber}{50} -\setcounter{topnumber}{50} -\setcounter{bottomnumber}{50} - -\newenvironment{inline_code}{\def\baselinestretch{1}\vspace{12pt}\small}{} - -\begin{document} - -\thispagestyle{empty} -\begin{center} -\vspace*{3in} -{\huge Ragel State Machine Compiler}\\ -\vspace*{12pt} -{\Large User Guide}\\ -\vspace{1in} -by\\ -\vspace{12pt} -{\large Adrian Thurston}\\ -\end{center} -\clearpage - -\pagenumbering{roman} - -\chapter*{License} -Ragel version \version, \pubdate\\ -Copyright \copyright\ 2003-2016 Adrian D. 
Thurston -\vspace{6mm} - -{\bf\it\noindent Permission is hereby granted, free of charge, to any person -obtaining a copy of this software and associated documentation files (the -"Software"), to deal in the Software without restriction, including without -limitation the rights to use, copy, modify, merge, publish, distribute, -sublicense, and/or sell copies of the Software, and to permit persons to whom -the Software is furnished to do so, subject to the following conditions:} - -\vspace{5pt} - -{\bf\it\noindent The above copyright notice and this permission notice shall be -included in all copies or substantial portions of the Software.} - -\vspace{5pt} - -{\bf\it\noindent THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY -KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO -EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES -OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE.} - -\clearpage -\tableofcontents -\clearpage - -\pagenumbering{arabic} -\chapter{Introduction} - -\section{Abstract} - -Regular expressions are used heavily in practice for the purpose of specifying -parsers. They are normally used as black boxes linked together with program -logic. User actions are executed in between invocations of the regular -expression engine. Adding actions before a pattern terminates requires patterns -to be broken and pasted back together with program logic. The more user actions -are needed, the less the advantages of regular expressions are seen. 
- -Ragel is a software development tool that allows user actions to be -embedded into the transitions of a regular expression's corresponding state -machine, eliminating the need to switch from the regular expression engine and -user code execution environment and back again. As a result, expressions can be -maximally continuous. One is free to specify an entire parser using a single -regular expression. The single-expression model affords concise and elegant -descriptions of languages and the generation of very simple, fast and robust -code. Ragel compiles executable finite state machines from a high level regular language -notation. Ragel targets C, C++, Objective-C, D, Go, Java, Ruby and OCaml. - -In addition to building state machines from regular expressions, Ragel allows -the programmer to directly specify state machines with state charts. These two -notations may be freely combined. There are also facilities for controlling -nondeterminism in the resulting machines and building scanners using patterns -that themselves have embedded actions. Ragel can produce code that is small and -runs very fast. Ragel can handle integer-sized alphabets and can compile very -large state machines. - -\section{Motivation} - -When a programmer is faced with the task of producing a parser for a -context-free language, there are many tools to choose from. It is quite common -to generate useful and efficient parsers for programming languages from a -formal grammar. It is also quite common for programmers to avoid such tools -when making parsers for simple computer languages, such as file formats and -communication protocols. Such languages are often regular, and tools for -processing the context-free languages are viewed as too heavyweight for the -purpose of parsing regular languages. The extra run-time effort required for -supporting the recursive nature of context-free languages is wasted. 
- -When we turn to the regular expression-based parsing tools, such as Lex, Re2C, -and scripting languages such as Sed, Awk and Perl we find that they are split -into two levels: a regular expression matching engine and some kind of program -logic for linking patterns together. For example, a Lex program is composed of -sets of regular expressions. The implied program logic repeatedly attempts to -match a pattern in the current set. When a match is found, the associated user -code executed. It requires the user to consider a language as a sequence of -independent tokens. Scripting languages and regular expression libraries allow -one to link patterns together using arbitrary program code. This is very -flexible and powerful; however, we can be more concise and clear if we avoid -gluing together regular expressions with if statements and while loops. - -This model of execution, where the runtime alternates between regular -expression matching and user code execution places restrictions on when -action code may be executed. Since action code can only be associated with -complete patterns, any action code that must be executed before an entire -pattern is matched requires that the pattern be broken into smaller units. -Instead of being forced to disrupt the regular expression syntax and write -smaller expressions, it is desirable to retain a single expression and embed -code for performing actions directly into the transitions that move over the -characters. After all, capable programmers are astutely aware of the machinery -underlying their programs, so why not provide them with access to that -machinery? To achieve this, we require an action execution model for associating -code with the sub-expressions of a regular expression in a way that does not -disrupt its syntax. 
- -The primary goal of Ragel is to provide developers with an ability to embed -actions into the transitions and states of a regular expression's state machine -in support of the definition of entire parsers or large sections of parsers -using a single regular expression. From the regular expression we gain a clear -and concise statement of our language. From the state machine we obtain a very -fast and robust executable that lends itself to many kinds of analysis and -visualization. - -\section{Overview} - -Ragel is a language for specifying state machines. The Ragel program is a -compiler that assembles a state machine definition to executable code. Ragel -is based on the principle that any regular language can be converted to a -deterministic finite state automaton. Since every regular language has a state -machine representation and vice versa, the terms regular language and state -machine (or just machine) will be used interchangeably in this document. - -Ragel outputs machines to C, C++, Objective-C, D, Go, Java, Ruby or OCaml code. The output is -designed to be generic and is not bound to any particular input or processing -method. A Ragel machine expects to have data passed to it in buffer blocks. -When there is no more input, the machine can be queried for acceptance. In -this way, a Ragel machine can be used to simply recognize a regular language -like a regular expression library. By embedding code into the regular language, -a Ragel machine can also be used to parse input. - -The Ragel language has many operators for constructing and manipulating -machines. Machines are built up from smaller machines, to bigger ones, to the -final machine representing the language that needs to be recognized or parsed. - -The core state machine construction operators are those found in most theory -of computation textbooks. They date back to the 1950s and are widely studied. -They are based on set operations and permit one to think of languages as a set -of strings. 
They are Union, Intersection, Difference, Concatenation and Kleene -Star. Put together, these operators make up what most people know as regular -expressions. Ragel also provides a scanner construction operator -and provides operators for explicitly constructing machines -using a state chart method. In the state chart method, one joins machines -together without any implied transitions and then explicitly specifies where -epsilon transitions should be drawn. - -The state machine manipulation operators are specific to Ragel. They allow the -programmer to access the states and transitions of regular language's -corresponding machine. There are two uses of the manipulation operators. The -first and primary use is to embed code into transitions and states, allowing -the programmer to specify the actions of the state machine. - -Ragel attempts to make the action embedding facility as intuitive as possible. -To do so, a number of issues need to be addressed. For example, when making a -nondeterministic specification into a DFA using machines that have embedded -actions, new transitions are often made that have the combined actions of -several source transitions. Ragel ensures that multiple actions associated with -a single transition are ordered consistently with respect to the order of -reference and the natural ordering implied by the construction operators. - -The second use of the manipulation operators is to assign priorities to -transitions. Priorities provide a convenient way of controlling any -nondeterminism introduced by the construction operators. Suppose two -transitions leave from the same state and go to distinct target states on the -same character. If these transitions are assigned conflicting priorities, then -during the determinization process the transition with the higher priority will -take precedence over the transition with the lower priority. The lower priority -transition gets abandoned. 
The transitions would otherwise be combined into a new -transition that goes to a new state that is a combination of the original -target states. Priorities are often required for segmenting machines. The most -common uses of priorities have been encoded into a set of simple operators -that should be used instead of priority embeddings whenever possible. - -For the purposes of embedding, Ragel divides transitions and states into -different classes. There are four operators for embedding actions and -priorities into the transitions of a state machine. It is possible to embed -into entering transitions, finishing transitions, all transitions and leaving -transitions. The embedding into leaving transitions is a special case. -These transition embeddings get stored in the final states of a machine. They -are transferred to any transitions that are made going out of the machine by -future concatenation or kleene star operations. - -There are several more operators for embedding actions into states. Like the -transition embeddings, there are various different classes of states that the -embedding operators access. For example, one can access start states, final -states or all states, among others. Unlike the transition embeddings, there are -several different types of state action embeddings. These are executed at -various different times during the processing of input. It is possible to embed -actions that are executed on transitions into a state, on transitions out of a -state, on transitions taken on the error event, or on transitions taken on the -EOF event. - -Within actions, it is possible to influence the behaviour of the state machine. -The user can write action code that jumps or calls to another portion of the -machine, changes the current character being processed, or breaks out of the -processing loop. With the state machine calling feature Ragel can be used to -parse languages that are not regular. 
For example, one can parse balanced -parentheses by calling into a parser when an open parenthesis character is seen -and returning to the state on the top of the stack when the corresponding -closing parenthesis character is seen. More complicated context-free languages -such as expressions in C are out of the scope of Ragel. - -Ragel also provides a scanner construction operator that can be used to build -scanners much the same way that Lex is used. The Ragel generated code, which -relies on user-defined variables for backtracking, repeatedly tries to match -patterns to the input, favouring longer patterns over shorter ones and patterns -that appear ahead of others when the lengths of the possible matches are -identical. When a pattern is matched the associated action is executed. - -The key distinguishing feature between scanners in Ragel and scanners in Lex is -that Ragel patterns may be arbitrary Ragel expressions and can therefore -contain embedded code. With a Ragel-based scanner the user need not wait until -the end of a pattern before user code can be executed. - -Scanners do take Ragel out of the domain of pure state machines and require the -user to maintain the backtracking related variables. However, scanners -integrate well with regular state machine instantiations. They can be called to -or jumped to only when needed, or they can be called out of or jumped out of -when a simpler, pure state machine model is appropriate. - -Two types of output code style are available. Ragel can produce a table-driven -machine or a directly executable machine. The directly executable machine is -much faster than the table-driven. On the other hand, the table-driven machine -is more compact and less demanding on the host language compiler. It is better -suited to compiling large state machines. - -\section{Related Work} - -Lex is perhaps the best-known tool for constructing parsers from regular -expressions. 
In the Lex processing model, generated code attempts to match one -of the user's regular expression patterns, favouring longer matches over -shorter ones. Once a match is made it then executes the code associated with -the pattern and consumes the matching string. This process is repeated until -the input is fully consumed. - -Through the use of start conditions, related sets of patterns may be defined. -The active set may be changed at any time. This allows the user to define -different lexical regions. It also allows the user to link patterns together by -requiring that some patterns come before others. This is quite like a -concatenation operation. However, use of Lex for languages that require a -considerable amount of pattern concatenation is inappropriate. In such cases a -Lex program deteriorates into a manually specified state machine, where start -conditions define the states and pattern actions define the transitions. Lex -is therefore best suited to parsing tasks where the language to be parsed can -be described in terms of regions of tokens. - -Lex is useful in many scenarios and has undoubtedly stood the test of time. -There are, however, several drawbacks to using Lex. Lex can impose too much -overhead for parsing applications where buffering is not required because all -the characters are available in a single string. In these cases there is -structure to the language to be parsed and a parser specification tool can -help, but employing a heavyweight processing loop that imposes a stream -``pull'' model and dynamic input buffer allocation is inappropriate. An -example of this kind of scenario is the conversion of floating point numbers -contained in a string to their corresponding numerical values. - -Another drawback is the very issue that Ragel attempts to solve. -It is not possible to execute a user action while -matching a character contained inside a pattern. 
For example, if scanning a -programming language and string literals can contain newlines which must be -counted, a Lex user must break up a string literal pattern so as to associate -an action with newlines. This forces the definition of a new start condition. -Alternatively the user can reprocess the text of the matched string literal to -count newlines. - - -The Re2C program defines an input processing model similar to that of Lex. -Re2C focuses on making generated state machines run very fast and -integrate easily into any program, free of dependencies. Re2C generates -directly executable code and is able to claim that generated parsers run nearly -as fast as their hand-coded equivalents. This is very important for user -adoption, as programmers are reluctant to use a tool when a faster alternative -exists. A consideration to ease of use is also important because developers -need the freedom to integrate the generated code as they see fit. - -Many scripting languages provide ways of composing parsers by linking regular -expressions using program logic. For example, Sed and Awk are two established -Unix scripting tools that allow the programmer to exploit regular expressions -for the purpose of locating and extracting text of interest. High-level -programming languages such as Perl, Python, PHP and Ruby all provide regular -expression libraries that allow the user to combine regular expressions with -arbitrary code. - -In addition to supporting the linking of regular expressions with arbitrary -program logic, the Perl programming language permits the embedding of code into -regular expressions. Perl embeddings do not translate into the embedding of -code into deterministic state machines. Perl regular expressions are in fact -not fully compiled to deterministic machines when embedded code is involved. -They are instead interpreted and involve backtracking. This is shown by the -following Perl program. 
When it is fed the input \verb|abcd| the interpreter -attempts to match the first alternative, printing \verb|a1 b1|. When this -possibility fails it backtracks and tries the second possibility, printing -\verb|a2 b2|, at which point it succeeds. - -\begin{inline_code} -\begin{verbatim} -print "YES\n" if ( =~ - /( a (?{ print "a1 "; }) b (?{ print "b1 "; }) cX ) | - ( a (?{ print "a2 "; }) b (?{ print "b2 "; }) cd )/x ) -\end{verbatim} -\end{inline_code} -\verbspace - -In Ragel there is no regular expression interpreter. Aside from the scanner -operator, all Ragel expressions are made into deterministic machines and the -run time simply moves from state to state as it consumes input. An equivalent -parser expressed in Ragel would attempt both of the alternatives concurrently, -printing \verb|a1 a2 b1 b2|. - -\section{Development Status} - -Ragel is a relatively new tool and is under continuous development. As a rough -release guide, minor revision number changes are for implementation -improvements and feature additions. Major revision number changes are for -implementation and language changes that do not preserve backwards -compatibility. Though in the past this has not always held true: changes that -break code have crept into minor version number changes. Typically, the -documentation lags behind the development in the interest of documenting only -the lasting features. The latest changes are always documented in the ChangeLog -file. - -\chapter{Constructing State Machines} - -\section{Ragel State Machine Specifications} - -A Ragel input file consists of a program in the host language that contains embedded machine -specifications. Ragel normally passes input straight to output. When it sees -a machine specification it stops to read the Ragel statements and possibly generate -code in place of the specification. -Afterwards it continues to pass input through. There -can be any number of FSM specifications in an input file. 
A multi-line FSM spec -starts with \verb|%%{| and ends with \verb|}%%|. A single-line FSM spec starts -with \verb|%%| and ends at the first newline. - -While Ragel is looking for FSM specifications it does basic lexical analysis on -the surrounding input. It interprets literal strings and comments so a -\verb|%%| sequence in either of those will not trigger the parsing of an FSM -specification. Ragel does not pass the input through any preprocessor nor does it -interpret preprocessor directives itself so includes, defines and ifdef logic -cannot be used to alter the parse of a Ragel input file. It is therefore not -possible to use an \verb|#if 0| directive to comment out a machine as is -commonly done in C code. As an alternative, a machine can be prevented from -causing any generated output by commenting out write statements. - -In Figure \ref{cmd-line-parsing}, a multi-line specification is used to define the -machine and single line specifications are used to trigger the writing of the machine -data and execution code. - -\begin{figure} -\small -\begin{multicols}{2} -\begin{verbatim} -#include -#include - -%%{ - machine foo; - main := - ( 'foo' | 'bar' ) - 0 @{ res = 1; }; -}%% - -%% write data; -\end{verbatim} -\verbspace -\columnbreak -\begin{verbatim} -int main( int argc, char **argv ) -{ - int cs, res = 0; - if ( argc > 1 ) { - char *p = argv[1]; - char *pe = p + strlen(p) + 1; - %% write init; - %% write exec; - } - printf("result = %i\n", res ); - return 0; -} -\end{verbatim} -\verbspace -\end{multicols} -\caption{Parsing a command line argument. -} -\label{cmd-line-parsing} -\end{figure} - -\subsection{Naming Ragel Blocks} - -\begin{verbatim} -machine fsm_name; -\end{verbatim} -\verbspace - -The \verb|machine| statement gives the name of the FSM. If present in a -specification, this statement must appear first. If a machine specification -does not have a name then Ragel uses the previous specification name. 
If no -previous specification name exists then this is an error. Because FSM -specifications persist in memory, a machine's statements can be spread across -multiple machine specifications. This allows one to break up a machine across -several files or draw in statements that are common to multiple machines using -the \verb|include| statement. - -\subsection{Machine Definition} -\label{definition} - -\begin{verbatim} -<name> = <expression>; -\end{verbatim} -\verbspace - -The machine definition statement associates an FSM expression with a name. Machine -expressions assigned to names can later be referenced in other expressions. A -definition statement on its own does not cause any states to be generated. It is simply a -description of a machine to be used later. States are generated only when a definition is -instantiated, which happens when a definition is referenced in an instantiated -expression. - -\subsection{Machine Instantiation} -\label{instantiation} - -\begin{verbatim} -<name> := <expression>; -\end{verbatim} -\verbspace - -The machine instantiation statement generates a set of states representing an -expression. Each instantiation generates a distinct set of states. The starting -state of the instantiation is written in the data section of the generated code -using the instantiation name. If a machine named -\verb|main| is instantiated, its start state is used as the -specification's start state and is assigned to the \verb|cs| variable by the -\verb|write init| command. If no \verb|main| machine is given, the start state -of the last machine instantiation to appear is used as the specification's -start state. - -From outside the execution loop, control may be passed to any machine by -assigning the entry point to the \verb|cs| variable. From inside the execution -loop, control may be passed to any machine instantiation using \verb|fcall|, -\verb|fgoto| or \verb|fnext| statements. 
- -\subsection{Including Ragel Code} - -\begin{verbatim} -include FsmName "inputfile.rl"; -\end{verbatim} -\verbspace - -The \verb|include| statement can be used to draw in the statements of another FSM -specification. Both the name and input file are optional, however at least one -must be given. Without an FSM name, the given input file is searched for an FSM -of the same name as the current specification. Without an input file, the -current file is searched for a machine of the given name. If both are present, -the given input file is searched for a machine of the given name. - -Ragel searches for included files from the location of the current file. -Additional directories can be added to the search path using the \verb|-I| -option. - -\subsection{Importing Definitions} -\label{import} - -\begin{verbatim} -import "inputfile.h"; -\end{verbatim} -\verbspace - -The \verb|import| statement scrapes a file for sequences of tokens that match -the following forms. Ragel treats these forms as state machine definitions. - -\noindent\hspace*{24pt}\verb|name '=' number|\\ -\noindent\hspace*{24pt}\verb|name '=' lit_string|\\ -\noindent\hspace*{24pt}\verb|'define' name number|\\ -\noindent\hspace*{24pt}\verb|'define' name lit_string| -\vspace{12pt} - -If the input file is a Ragel program then tokens inside any Ragel -specifications are ignored. See Section \ref{export} for a description of -exporting machine definitions. - -Ragel searches for imported files from the location of the current file. -Additional directories can be added to the search path using the \verb|-I| -option. - -\section{Lexical Analysis of a Ragel Block} -\label{lexing} - -Within a machine specification the following lexical rules apply to the input. - -\begin{itemize} - -\item The \verb|#| symbol begins a comment that terminates at the next newline. - -\item The symbols \verb|""|, \verb|''|, \verb|//|, \verb|[]| behave as the -delimiters of literal strings. 
Within them, the following escape sequences -are interpreted: - -\verb| \0 \a \b \t \n \v \f \r| - -A backslash at the end of a line joins the following line onto the current. A -backslash preceding any other character removes special meaning. This applies -to terminating characters and to special characters in regular expression -literals. As an exception, regular expression literals do not support escape -sequences as the operands of a range within a list. See the bullet on regular -expressions in Section \ref{basic}. - -\item The symbols \verb|{}| delimit a block of host language code that will be -embedded into the machine as an action. Within the block of host language -code, basic lexical analysis of comments and strings is done in order to -correctly find the closing brace of the block. With the exception of FSM -commands embedded in code blocks, the entire block is preserved as is for -identical reproduction in the output code. - -\item The pattern \verb|[+-]?[0-9]+| denotes an integer in decimal format. -Integers used for specifying machines may be negative only if the alphabet type -is signed. Integers used for specifying priorities may be positive or negative. - -\item The pattern \verb|0x[0-9A-Fa-f]+| denotes an integer in hexadecimal -format. - -\item The keywords are \verb|access|, \verb|action|, \verb|alphtype|, -\verb|getkey|, \verb|write|, \verb|machine| and \verb|include|. - -\item The pattern \verb|[a-zA-Z_][a-zA-Z_0-9]*| denotes an identifier. - - -\item Any amount of whitespace may separate tokens. - -\end{itemize} - - -\section{Basic Machines} -\label{basic} - -The basic machines are the base operands of regular language expressions. They -are the smallest unit to which machine construction and manipulation operators -can be applied. - -\begin{itemize} - -\item \verb|'hello'| -- Concatenation Literal. Produces a machine that matches -the sequence of characters in the quoted string. 
If there are 5 characters -there will be 6 states chained together with the characters in the string. See -Section \ref{lexing} for information on valid escape sequences. - - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{bmconcat} -\end{center} -\graphspace - -It is possible -to make a concatenation literal case-insensitive by appending an \verb|i| to -the string, for example \verb|'cmd'i|. - -\item \verb|"hello"| -- Identical to the single quoted version. - -\item \verb|[hello]| -- Or Expression. Produces a union of characters. There -will be two states with a transition for each unique character between the two states. -The \verb|[]| delimiters behave like the quotes of a literal string. For example, -\verb|[ \t]| means tab or space. The \verb|or| expression supports character ranges -with the \verb|-| symbol as a separator. The meaning of the union can be negated -using an initial \verb|^| character as in standard regular expressions. -See Section \ref{lexing} for information on valid escape sequences -in \verb|or| expressions. - - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{bmor} -\end{center} -\graphspace - -\item \verb|''|, \verb|""|, and \verb|[]| -- Zero Length Machine. Produces a machine -that matches the zero length string. Zero length machines have one state that is both -a start state and a final state. - - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{bmnull} -\end{center} -\graphspace - -% FIXME: More on the range of values here. -\item \verb|42| -- Numerical Literal. Produces a two state machine with one -transition on the given number. The number may be in decimal or hexadecimal -format and should be in the range allowed by the alphabet type. The minimum and -maximum values permitted are defined by the host machine that Ragel is compiled -on. For example, numbers in a \verb|short| alphabet on an i386 machine should -be in the range \verb|-32768| to \verb|32767|. 
- - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{bmnum} -\end{center} -\graphspace - -\item \verb|/simple_regex/| -- Regular Expression. Regular expressions are -parsed as a series of expressions that are concatenated together. Each -concatenated expression -may be a literal character, the ``any'' character specified by the \verb|.| -symbol, or a union of characters specified by the \verb|[]| delimiters. If the -first character of a union is \verb|^| then it matches any character not in the -list. Within a union, a range of characters can be given by separating the first -and last characters of the range with the \verb|-| symbol. Each -concatenated machine may have repetition specified by following it with the -\verb|*| symbol. The standard escape sequences described in Section -\ref{lexing} are supported everywhere in regular expressions except as the -operands of a range within a list. This notation also supports the \verb|i| -trailing option. Use it to produce case-insensitive machines, as in \verb|/GET/i|. - -Ragel does not support very complex regular expressions because the desired -results can always be achieved using the more general machine construction -operators listed in Section \ref{machconst}. The following diagram shows the -result of compiling \verb|/ab*[c-z].*[123]/|. \verb|DEF| represents the default -transition, which is taken if no other transition can be taken. - - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{bmregex} -\end{center} -\graphspace - -\item \verb|'a' .. 'z'| -- Range. Produces a machine that matches any -characters in the specified range. Allowable upper and lower bounds of the -range are concatenation literals of length one and numerical literals. For -example, \verb|0x10..0x20|, \verb|0..63|, and \verb|'a'..'z'| are valid ranges. -The bounds should be in the range allowed by the alphabet type. 
- - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{bmrange} -\end{center} -\graphspace - -\item \verb|variable_name| -- Lookup the machine definition assigned to the -variable name given and use an instance of it. See Section \ref{definition} for -an important note on what it means to reference a variable name. - -\item \verb|builtin_machine| -- There are several built-in machines available -for use. They are all two state machines for the purpose of matching common -classes of characters. They are: - -\begin{itemize} - -\item \verb|any | -- Any character in the alphabet. - -\item \verb|ascii | -- Ascii characters. \verb|0..127| - -\item \verb|extend| -- Ascii extended characters. This is the range -\verb|-128..127| for signed alphabets and the range \verb|0..255| for unsigned -alphabets. - -\item \verb|alpha | -- Alphabetic characters. \verb|[A-Za-z]| - -\item \verb|digit | -- Digits. \verb|[0-9]| - -\item \verb|alnum | -- Alpha numerics. \verb|[0-9A-Za-z]| - -\item \verb|lower | -- Lowercase characters. \verb|[a-z]| - -\item \verb|upper | -- Uppercase characters. \verb|[A-Z]| - -\item \verb|xdigit| -- Hexadecimal digits. \verb|[0-9A-Fa-f]| - -\item \verb|cntrl | -- Control characters. \verb|0..31|, \verb|127| - -\item \verb|graph | -- Graphical characters. \verb|[!-~]| - -\item \verb|print | -- Printable characters. \verb|[ -~]| - -\item \verb|punct | -- Punctuation. Graphical characters that are not alphanumerics. -\verb|[!-/:-@[-`{-~]| - -\item \verb|space | -- Whitespace. \verb|[\t\v\f\n\r ]| - -\item \verb|zlen | -- Zero length string. \verb|""| - -\item \verb|empty | -- Empty set. Matches nothing. \verb|^any| - -\end{itemize} -\end{itemize} - -\section{Operator Precedence} -The following table shows operator precedence from lowest to highest. Operators -in the same precedence group are evaluated from left to right. 
- -\begin{tabular}{|c|c|c|} -\hline -1&\verb| , |&Join\\ -\hline -2&\verb/ | & - --/&Union, Intersection and Subtraction\\ -\hline -3&\verb| . <: :> :>> |&Concatenation\\ -\hline -4&\verb| : |&Label\\ -\hline -5&\verb| -> |&Epsilon Transition\\ -\hline -6&\verb| > @ $ % |&Transition Actions and Priorities\\ -\hline -6&\verb| >/ $/ %/ </ @/ <>/ |&EOF Actions\\ -\hline -6&\verb| >! $! %! <! @! <>! |&Global Error Actions\\ -\hline -6&\verb| >^ $^ %^ <^ @^ <>^ |&Local Error Actions\\ -\hline -6&\verb| >~ $~ %~ <~ @~ <>~ |&To-State Actions\\ -\hline -6&\verb| >* $* %* <* @* <>* |&From-State Actions\\ -\hline -7&\verb| * ** ? + {n} {,n} {n,} {n,m} |&Repetition\\ -\hline -8&\verb| ! ^ |&Negation and Character-Level Negation\\ -\hline -9&\verb| ( ) |&Grouping\\ -\hline -\end{tabular} - -\section{Regular Language Operators} -\label{machconst} - -When using Ragel it is helpful to have a sense of how it constructs machines. -The determinization process can produce results that seem unusual to someone -not familiar with the NFA to DFA conversion algorithm. In this section we -describe Ragel's state machine operators. Though the operators are defined -using epsilon transitions, it should be noted that this is for discussion only. -The epsilon transitions described in this section do not persist, but are -immediately removed by the determinization process which is executed at every -operation. Ragel does not make use of any nondeterministic intermediate state -machines. - -To create an epsilon transition between two states \verb|x| and \verb|y| is to -copy all of the properties of \verb|y| into \verb|x|. This involves drawing in -all of \verb|y|'s to-state actions, EOF actions, etc., in addition to its -transitions. If \verb|x| and \verb|y| both have a transition out on the same -character, then the transitions must be combined. During transition -combination a new transition is made that goes to a new state that is the -combination of both target states. 
The new combination state is created using -the same epsilon transition method. The new state has an epsilon transition -drawn to all the states that compose it. Since the creation of new epsilon -transitions may be triggered every time an epsilon transition is drawn, the -process of drawing epsilon transitions is repeated until there are no more -epsilon transitions to be made. - -A very common error that is made when using Ragel is to make machines that do -too much. That is, to create machines that have unintentional -nondeterministic properties. This usually results from being unaware of the common strings -between machines that are combined together using the regular language -operators. This can involve never leaving a machine, causing its actions to be -propagated through all the following states. Or it can involve an alternation -where both branches are unintentionally taken simultaneously. - -This problem forces one to think hard about the language that needs to be -matched. To guard against this kind of problem one must ensure that the machine -specification is divided up using boundaries that do not allow ambiguities from -one portion of the machine to the next. See Chapter -\ref{controlling-nondeterminism} for more on this problem and how to solve it. - -The Graphviz tool is an immense help when debugging improperly compiled -machines or otherwise learning how to use Ragel. Graphviz Dot files can be -generated from Ragel programs using the \verb|-V| option. See Section -\ref{visualization} for more information. - - -\subsection{Union} - -\verb/expr | expr/ - -The union operation produces a machine that matches any string in machine one -or machine two. The operation first creates a new start state. Epsilon -transitions are drawn from the new start state to the start states of both -input machines. The resulting machine has a final state set equivalent to the -union of the final state sets of both input machines. 
In this operation, there -is the opportunity for nondeterminism among both branches. If there are -strings, or prefixes of strings that are matched by both machines then the new -machine will follow both parts of the alternation at once. The union operation is -shown below. - -\graphspace -\begin{center} -\includegraphics[scale=1.0]{opor} -\end{center} -\graphspace - -The following example demonstrates the union of three machines representing -common tokens. - -% GENERATE: exor -% OPT: -p -% %%{ -% machine exor; -\begin{inline_code} -\begin{verbatim} -# Hex digits, decimal digits, or identifiers -main := '0x' xdigit+ | digit+ | alpha alnum*; -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{exor} -\end{center} -\graphspace - -\subsection{Intersection} - -\verb|expr & expr| - -Intersection produces a machine that matches any -string that is in both machine one and machine two. To achieve intersection, a -union is performed on the two machines. After the result has been made -deterministic, any final state that is not a combination of final states from -both machines has its final state status revoked. To complete the operation, -paths that do not lead to a final state are pruned from the machine. Therefore, -if there are any such paths in either of the expressions they will be removed -by the intersection operator. Intersection can be used to require that two -independent patterns be simultaneously satisfied as in the following example. - -% GENERATE: exinter -% OPT: -p -% %%{ -% machine exinter; -\begin{inline_code} -\begin{verbatim} -# Match lines four characters wide that contain -# words separated by whitespace. 
-main := - /[^\n][^\n][^\n][^\n]\n/* & - (/[a-z][a-z]*/ | [ \n])**; -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{exinter} -\end{center} -\graphspace - -\subsection{Difference} - -\verb|expr - expr| - -The difference operation produces a machine that matches -strings that are in machine one but are not in machine two. To achieve subtraction, -a union is performed on the two machines. After the result has been made -deterministic, any final state that came from machine two or is a combination -of states involving a final state from machine two has its final state status -revoked. As with intersection, the operation is completed by pruning any path -that does not lead to a final state. The following example demonstrates the -use of subtraction to exclude specific cases from a set. - -% GENERATE: exsubtr -% OPT: -p -% %%{ -% machine exsubtr; -\begin{inline_code} -\begin{verbatim} -# Subtract keywords from identifiers. -main := /[a-z][a-z]*/ - ( 'for' | 'int' ); -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{exsubtr} -\end{center} -\graphspace - -\subsection{Strong Difference} -\label{strong_difference} - -\verb|expr -- expr| - -Strong difference produces a machine that matches any string of the first -machine that does not have any string of the second machine as a substring. In -the following example, strong subtraction is used to exclude \verb|CRLF| from -a sequence. In the corresponding visualization, the label \verb|DEF| is short -for default. The default transition is taken if no other transition can be -taken. 
- -% GENERATE: exstrongsubtr -% OPT: -p -% %%{ -% machine exstrongsubtr; -\begin{inline_code} -\begin{verbatim} -crlf = '\r\n'; -main := [a-z]+ ':' ( any* -- crlf ) crlf; -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{exstrongsubtr} -\end{center} -\graphspace - -This operator is equivalent to the following. - -\begin{verbatim} -expr - ( any* expr any* ) -\end{verbatim} -\verbspace - -\subsection{Concatenation} - -\verb|expr . expr| - -Concatenation produces a machine that matches all the strings in machine one followed by all -the strings in machine two. Concatenation draws epsilon transitions from the -final states of the first machine to the start state of the second machine. The -final states of the first machine lose their final state status, unless the -start state of the second machine is final as well. -Concatenation is the default operator. Two machines next to each other with no -operator between them results in concatenation. - -\graphspace -\begin{center} -\includegraphics[scale=1.0]{opconcat} -\end{center} -\graphspace - -The opportunity for nondeterministic behaviour results from the possibility of -the final states of the first machine accepting a string that is also accepted -by the start state of the second machine. -The most common scenario in which this happens is the -concatenation of a machine that repeats some pattern with a machine that gives -a terminating string, but the repetition machine does not exclude the -terminating string. The example in Section \ref{strong_difference} -guards against this. Another example is the expression \verb|("'" any* "'")|. -When executed the thread of control will -never leave the \verb|any*| machine. This is a problem especially if actions -are embedded to process the characters of the \verb|any*| component. - -In the following example, the first machine is always active due to the -nondeterministic nature of concatenation. 
This particular nondeterminism is intended, -however, because we wish to permit EOF strings before the end of the input. - -% GENERATE: exconcat -% OPT: -p -% %%{ -% machine exconcat; -\begin{inline_code} -\begin{verbatim} -# Require an eof marker on the last line. -main := /[^\n]*\n/* . 'EOF\n'; -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{exconcat} -\end{center} -\graphspace - -There is a language -ambiguity involving concatenation and subtraction. Because concatenation is the -default operator for two -adjacent machines there is an ambiguity between subtraction of -a positive numerical literal and concatenation of a negative numerical literal. -For example, \verb|(x-7)| could be interpreted as \verb|(x . -7)| or -\verb|(x - 7)|. In the Ragel language, the subtraction operator always takes precedence -over concatenation of a negative literal. We adhere to the rule that the default -concatenation operator takes effect only when there are no other operators between -two machines. Beware of writing machines such as \verb|(any -1)| when what is -desired is a concatenation of \verb|any| and \verb|-1|. Instead write -\verb|(any . -1)| or \verb|(any (-1))|. If in doubt of the meaning of your program do not -rely on the default concatenation operator; always use the \verb|.| symbol. - - -\subsection{Kleene Star} - -\verb|expr*| - -The machine resulting from the Kleene Star operator will match zero or more -repetitions of the machine it is applied to. -It creates a new start state and an additional final -state. Epsilon transitions are drawn between the new start state and the old start -state, between the new start state and the new final state, and -between the final states of the machine and the new start state. After the -machine is made deterministic, the final states get all the -transitions of the start state. 
- -\graphspace -\begin{center} -\includegraphics[scale=1.0]{opstar} -\end{center} -\graphspace - -The possibility for nondeterministic behaviour arises if the final states have -transitions on any of the same characters as the start state. This is common -when applying kleene star to an alternation of tokens. Like the other problems -arising from nondeterministic behavior, this is discussed in more detail in Chapter -\ref{controlling-nondeterminism}. This particular problem can also be solved -by using the longest-match construction discussed in Section -\ref{generating-scanners} on scanners. - -In this -example, there is no nondeterminism introduced by the exterior kleene star due to -the newline at the end of the regular expression. Without the newline the -exterior kleene star would be redundant and there would be ambiguity between -repeating the inner range of the regular expression and the entire regular -expression. Though it would not cause a problem in this case, unnecessary -nondeterminism in the kleene star operator often causes undesired results for -new Ragel users and must be guarded against. - -% GENERATE: exstar -% OPT: -p -% %%{ -% machine exstar; -\begin{inline_code} -\begin{verbatim} -# Match any number of lines with only lowercase letters. -main := /[a-z]*\n/*; -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{exstar} -\end{center} -\graphspace - -\subsection{One Or More Repetition} - -\verb|expr+| - -This operator produces the concatenation of the machine with the kleene star of -itself. The result will match one or more repetitions of the machine. The plus -operator is equivalent to \verb|(expr . expr*)|. - -% GENERATE: explus -% OPT: -p -% %%{ -% machine explus; -\begin{inline_code} -\begin{verbatim} -# Match alpha-numeric words. 
-main := alnum+; -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{explus} -\end{center} -\graphspace - -\subsection{Optional} - -\verb|expr?| - -The {\em optional} operator produces a machine that accepts the machine -given or the zero length string. The optional operator is equivalent to -\verb/(expr | '' )/. In the following example the optional operator is used to -possibly extend a token. - -% GENERATE: exoption -% OPT: -p -% %%{ -% machine exoption; -\begin{inline_code} -\begin{verbatim} -# Match integers or floats. -main := digit+ ('.' digit+)?; -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{exoption} -\end{center} -\graphspace - -\subsection{Repetition} - -\noindent\hspace*{24pt}\verb|expr {n}| -- Exactly N copies of expr.\\ -\noindent\hspace*{24pt}\verb|expr {,n}| -- Zero to N copies of expr.\\ -\noindent\hspace*{24pt}\verb|expr {n,}| -- N or more copies of expr.\\ -\noindent\hspace*{24pt}\verb|expr {n,m}| -- N to M copies of expr. -\vspace{12pt} - -\subsection{Negation} - -\verb|!expr| - -Negation produces a machine that matches any string not matched by the given -machine. Negation is equivalent to \verb|(any* - expr)|. - -% GENERATE: exnegate -% OPT: -p -% %%{ -% machine exnegate; -\begin{inline_code} -\begin{verbatim} -# Accept anything but a string beginning with a digit. -main := ! ( digit any* ); -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{exnegate} -\end{center} -\graphspace - -\subsection{Character-Level Negation} - -\verb|^expr| - -Character-level negation produces a machine that matches any single character -not matched by the given machine. Character-Level Negation is equivalent to -\verb|(any - expr)|. It must be applied only to machines that match strings of -length one. 
- -\section{State Machine Minimization} - -State machine minimization is the process of finding the minimal equivalent FSM accepting -the language. Minimization reduces the number of states in machines -by merging equivalent states. It does not change the behaviour of the machine -in any way. It will cause some states to be merged into one because they are -functionally equivalent. State minimization is on by default. It can be turned -off with the \verb|-n| option. - -The algorithm implemented is similar to Hopcroft's state minimization -algorithm. Hopcroft's algorithm assumes a finite alphabet that can be listed in -memory, whereas Ragel supports arbitrary integer alphabets that cannot be -listed in memory. Though exact analysis is very difficult, Ragel minimization -runs close to O(n * log(n)) and requires O(n) temporary storage where -$n$ is the number of states. - -\section{Visualization} -\label{visualization} - -%In many cases, practical -%parsing programs will be too large to completely visualize with Graphviz. The -%proper approach is to reduce the language to the smallest subset possible that -%still exhibits the characteristics that one wishes to learn about or to fix. -%This can be done without modifying the source code using the \verb|-M| and -%\verb|-S| options. If a machine cannot be easily reduced, -%embeddings of unique actions can be very useful for tracing a -%particular component of a larger machine specification, since action names are -%written out on transition labels. - -Ragel is able to emit compiled state machines in Graphviz's Dot file format. -This is done using the \verb|-V| option. -Graphviz support allows users to perform -incremental visualization of their parsers. User actions are displayed on -transition labels of the graph. 
- -If the final graph is too large to be -meaningful, or even drawn, the user is able to inspect portions of the parser -by naming particular regular expression definitions with the \verb|-S| and -\verb|-M| options to the \verb|ragel| program. Use of Graphviz greatly -improves the Ragel programming experience. It allows users to learn Ragel by -experimentation and also to track down bugs caused by unintended -nondeterminism. - -Ragel has another option to help debugging. The \verb|-x| option causes Ragel -to emit the compiled machine in an XML format. - -\chapter{User Actions} - -Ragel permits the user to embed actions into the transitions of a regular -expression's corresponding state machine. These actions are executed when the -generated code moves over a transition. Like the regular expression operators, -the action embedding operators are fully compositional. They take a state -machine and an action as input, embed the action and yield a new state machine -that can be used in the construction of other machines. Due to the -compositional nature of embeddings, the user has complete freedom in the -placement of actions. - -A machine's transitions are categorized into four classes. The action embedding -operators access the transitions defined by these classes. The {\em entering -transition} operator \verb|>| isolates the start state, then embeds an action -into all transitions leaving it. The {\em finishing transition} operator -\verb|@| embeds an action into all transitions going into a final state. The -{\em all transition} operator \verb|$| embeds an action into all transitions of -an expression. The {\em leaving transition} operator \verb|%| provides access -to the yet-unmade transitions moving out of the machine via the final states. - -\section{Embedding Actions} - -\begin{verbatim} -action ActionName { - /* Code an action here. */ - count += 1; -} -\end{verbatim} -\verbspace - -The action statement defines a block of code that can be embedded into an FSM. 
-Action names can be referenced by the action embedding operators in -expressions. Though actions need not be named in this way (literal blocks -of code can be embedded directly when building machines), defining reusable -blocks of code whenever possible is good practice because it potentially increases the -degree to which the machine can be minimized. - -Within an action some Ragel expressions and statements are parsed and -translated. These allow the user to interact with the machine from action code. -See Section \ref{vals} for a complete list of statements and values available -in code blocks. - -\subsection{Entering Action} - -\verb|expr > action| - -The entering action operator embeds an action into all transitions -that enter into the machine from the start state. If the start state is final, -then the action is also embedded into the start state as a leaving action. This -means that if a machine accepts the zero-length string and control passes -through the start state then the entering action is executed. Note -that this can happen on both a following character and on the EOF event. - -In some machines, the start state has transitions coming in from within the -machine. In these cases the start state is first isolated from the rest of the -machine ensuring that the entering actions are executed once only. - -% GENERATE: exstact -% OPT: -p -% %%{ -% machine exstact; -\begin{inline_code} -\begin{verbatim} -# Execute A at the beginning of a string of alpha. -action A {} -main := ( lower* >A ) . ' '; -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{exstact} -\end{center} -\graphspace - -\subsection{Finishing Action} - -\verb|expr @ action| - -The finishing action operator embeds an action into any transitions that move -the machine into a final state. Further input may move the machine out of the -final state, but keep it in the machine. 
Therefore, finishing actions may be -executed more than once if a machine has any internal transitions out of a -final state. In the following example, the final state has no transitions out -and the finishing action is executed only once. - -% GENERATE: exdoneact -% OPT: -p -% %%{ -% machine exdoneact; -% action A {} -\begin{inline_code} -\begin{verbatim} -# Execute A when the trailing space is seen. -main := ( lower* ' ' ) @A; -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{exdoneact} -\end{center} -\graphspace - -\subsection{All Transition Action} - -\verb|expr $ action| - -The all transition operator embeds an action into all transitions of a machine. -The action is executed whenever a transition of the machine is taken. In the -following example, A is executed on every character matched. - -% GENERATE: exallact -% OPT: -p -% %%{ -% machine exallact; -% action A {} -\begin{inline_code} -\begin{verbatim} -# Execute A on any characters of the machine. -main := ( 'm1' | 'm2' ) $A; -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{exallact} -\end{center} -\graphspace - -\subsection{Leaving Actions} -\label{out-actions} - -\verb|expr % action| - -The leaving action operator queues an action for embedding into the transitions -that go out of a machine via a final state. The action is first stored in -the machine's final states and is later transferred to any transitions that are -made going out of the machine by a kleene star or concatenation operation. - -If a final state of the machine is still final when compilation is complete -then the leaving action is also embedded as an EOF action. Therefore, leaving -the machine is defined as either leaving on a character or as state machine -acceptance. 
- -This operator allows one to associate an action with the termination of a -sequence, without being concerned about what particular character terminates -the sequence. In the following example, A is executed when leaving the alpha -machine on the newline character. - -% GENERATE: exoutact1 -% OPT: -p -% %%{ -% machine exoutact1; -% action A {} -\begin{inline_code} -\begin{verbatim} -# Match a word followed by a newline. Execute A when -# finishing the word. -main := ( lower+ %A ) . '\n'; -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{exoutact1} -\end{center} -\graphspace - -In the following example, the \verb|term_word| action could be used to register -the appearance of a word and to clear the buffer that the \verb|lower| action used -to store the text of it. - -% GENERATE: exoutact2 -% OPT: -p -% %%{ -% machine exoutact2; -% action lower {} -% action space {} -% action term_word {} -% action newline {} -\begin{inline_code} -\begin{verbatim} -word = ( [a-z] @lower )+ %term_word; -main := word ( ' ' @space word )* '\n' @newline; -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{exoutact2} -\end{center} -\graphspace - -In this final example of the action embedding operators, A is executed upon entering -the alpha machine, B is executed on all transitions of the -alpha machine, C is executed when the alpha machine is exited by moving into the -newline machine and N is executed when the newline machine moves into a final -state. - -% GENERATE: exaction -% OPT: -p -% %%{ -% machine exaction; -% action A {} -% action B {} -% action C {} -% action N {} -\begin{inline_code} -\begin{verbatim} -# Execute A on starting the alpha machine, B on every transition -# moving through it and C upon finishing. Execute N on the newline. -main := ( lower* >A $B %C ) . 
'\n' @N; -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% END GENERATE - -\graphspace -\begin{center} -\includegraphics[scale=0.55]{exaction} -\end{center} -\graphspace - - -\section{State Action Embedding Operators} - -The state embedding operators allow one to embed actions into states. Like the -transition embedding operators, there are several different classes of states -that the operators access. The meanings of the symbols are similar to the -meanings of the symbols used for the transition embedding operators. The design -of the state selections was driven by a need to cover the states of an -expression with exactly one error action. - -Unlike the transition embedding operators, the state embedding operators are -also distinguished by the different kinds of events that embedded actions can -be associated with. Therefore the state embedding operators have two -components. The first, which is the first one or two characters, specifies the -class of states that the action will be embedded into. The second component -specifies the type of event the action will be executed on. The symbols of the -second component also have equivalent keywords. 
- -\begin{multicols}{2} -The different classes of states are: - -\noindent\hspace*{24pt}\verb|> | -- the start state\\ -\noindent\hspace*{24pt}\verb|< | -- any state except the start state\\ -\noindent\hspace*{24pt}\verb|$ | -- all states\\ -\noindent\hspace*{24pt}\verb|% | -- final states\\ -\noindent\hspace*{24pt}\verb|@ | -- any state except final states\\ -\noindent\hspace*{24pt}\verb|<>| -- any except start and final (middle) -\vspace{12pt} - -\columnbreak - -The different kinds of embeddings are: - -\noindent\hspace*{24pt}\verb|~| -- to-state actions (\verb|to|)\\ -\noindent\hspace*{24pt}\verb|*| -- from-state actions (\verb|from|)\\ -\noindent\hspace*{24pt}\verb|/| -- EOF actions (\verb|eof|)\\ -\noindent\hspace*{24pt}\verb|!| -- error actions (\verb|err|)\\ -\noindent\hspace*{24pt}\verb|^| -- local error actions (\verb|lerr|) -\vspace{12pt} - -\end{multicols} - -\subsection{To-State and From-State Actions} - -\subsubsection{To-State Actions} - -\noindent\hspace*{24pt}\verb|>~action >to(name) >to{...} | -- the start state\\ -\noindent\hspace*{24pt}\verb|<~action <to(name) <to{...} | -- any state except the start state\\ -\noindent\hspace*{24pt}\verb|$~action $to(name) $to{...} | -- all states\\ -\noindent\hspace*{24pt}\verb|%~action %to(name) %to{...} | -- final states\\ -\noindent\hspace*{24pt}\verb|@~action @to(name) @to{...} | -- any state except final states\\ -\noindent\hspace*{24pt}\verb|<>~action <>to(name) <>to{...}| -- any except start and final (middle) -\vspace{12pt} - - -To-state actions are executed whenever the state machine moves into the -specified state, either by a natural movement over a transition or by an -action-based transfer of control such as \verb|fgoto|. They are executed after the -in-transition's actions but before the current character is advanced and -tested against the end of the input block. To-state embeddings stay with the -state. They are irrespective of the state's current set of transitions and any -future transitions that may be added in or out of the state. - -Note that the setting of the current state variable \verb|cs| outside of the -execute code is not considered by Ragel as moving into a state and consequently -the to-state actions of the new current state are not executed.
This includes -the initialization of the current state when the machine begins. This is -because the entry point into the machine execution code is after the execution -of to-state actions. - -\subsubsection{From-State Actions} - -\noindent\hspace*{24pt}\verb|>*action >from(name) >from{...} | -- the start state\\ -\noindent\hspace*{24pt}\verb|<*action <from(name) <from{...} | -- any state except the start state\\ -\noindent\hspace*{24pt}\verb|$*action $from(name) $from{...} | -- all states\\ -\noindent\hspace*{24pt}\verb|%*action %from(name) %from{...} | -- final states\\ -\noindent\hspace*{24pt}\verb|@*action @from(name) @from{...} | -- any state except final states\\ -\noindent\hspace*{24pt}\verb|<>*action <>from(name) <>from{...}| -- any except start and final (middle) -\vspace{12pt} - -From-state actions are executed whenever the state machine takes a transition from a -state, either to itself or to some other state. These actions are executed -immediately after the current character is tested against the input block end -marker and before the transition to take is sought based on the current -character. From-state actions are therefore executed even if a transition -cannot be found and the machine moves into the error state. Like to-state -embeddings, from-state embeddings stay with the state. - -\subsection{EOF Actions} - -\noindent\hspace*{24pt}\verb|>/action >eof(name) >eof{...} | -- the start state\\ -\noindent\hspace*{24pt}\verb|</action <eof(name) <eof{...} | -- any state except the start state\\ -\noindent\hspace*{24pt}\verb|$/action $eof(name) $eof{...} | -- all states\\ -\noindent\hspace*{24pt}\verb|%/action %eof(name) %eof{...} | -- final states\\ -\noindent\hspace*{24pt}\verb|@/action @eof(name) @eof{...} | -- any state except final states\\ -\noindent\hspace*{24pt}\verb|<>/action <>eof(name) <>eof{...}| -- any except start and final (middle) -\vspace{12pt} - -The EOF action embedding operators enable the user to embed actions that are -executed at the end of the input stream. EOF actions are stored in states and -generated in the \verb|write exec| block. They are run when \verb|p == pe == eof| -as the execute block is finishing. EOF actions are free to adjust \verb|p| and -jump to another part of the machine to restart execution. - -\subsection{Handling Errors} - -In many applications it is useful to be able to react to parsing errors. The -user may wish to print an error message that depends on the context. It -may also be desirable to consume input in an attempt to return the input stream -to some known state and resume parsing. To support error handling and recovery, -Ragel provides error action embedding operators.
There are two kinds of error -actions: global error actions and local error actions. -Error actions can be used to simply report errors, or by jumping to a machine -instantiation that consumes input, can attempt to recover from errors. - -\subsubsection{Global Error Actions} - -\noindent\hspace*{24pt}\verb|>!action >err(name) >err{...} | -- the start state\\ -\noindent\hspace*{24pt}\verb|<!action <err(name) <err{...} | -- any state except the start state\\ -\noindent\hspace*{24pt}\verb|$!action $err(name) $err{...} | -- all states\\ -\noindent\hspace*{24pt}\verb|%!action %err(name) %err{...} | -- final states\\ -\noindent\hspace*{24pt}\verb|@!action @err(name) @err{...} | -- any state except final states\\ -\noindent\hspace*{24pt}\verb|<>!action <>err(name) <>err{...}| -- any except start and final (middle) -\vspace{12pt} - -Global error actions are stored in the states they are embedded into until -compilation is complete. They are then transferred to the transitions that move -into the error state. These transitions are taken on all input characters that -are not already covered by the state's transitions. If a state with an error -action is not final when compilation is complete, then the action is also -embedded as an EOF action. - -Error actions can be used to recover from errors by jumping back into the -machine with \verb|fgoto| and optionally altering \verb|p|. - -\subsubsection{Local Error Actions} - -\noindent\hspace*{24pt}\verb|>^action >lerr(name) >lerr{...} | -- the start state\\ -\noindent\hspace*{24pt}\verb|<^action <lerr(name) <lerr{...} | -- any state except the start state\\ -\noindent\hspace*{24pt}\verb|$^action $lerr(name) $lerr{...} | -- all states\\ -\noindent\hspace*{24pt}\verb|%^action %lerr(name) %lerr{...} | -- final states\\ -\noindent\hspace*{24pt}\verb|@^action @lerr(name) @lerr{...} | -- any state except final states\\ -\noindent\hspace*{24pt}\verb|<>^action <>lerr(name) <>lerr{...}| -- any except start and final (middle) -\vspace{12pt} - -Like global error actions, local error actions are also stored in the states -they are embedded into until a transfer point. The transfer point is different -however. Each local error action embedding is associated with a name. When a -machine definition has been fully constructed, all local error action -embeddings associated with the same name as the machine definition are -transferred to the error transitions. At this time they are also embedded as -EOF actions in the case of non-final states. - -Local error actions can be used to specify an action to take when a particular -section of a larger state machine fails to match.
A particular machine -definition's ``thread'' may die and the local error actions executed, however -the machine as a whole may continue to match input. - -There are two forms of local error action embeddings. In the first form the -name defaults to the current machine. In the second form the machine name can -be specified. This is useful when it is more convenient to specify the local -error action in a sub-definition that is used to construct the machine -definition that the local error action is associated with. To embed local -error actions and -explicitly state the machine definition on which the transfer is to happen use -\verb|(name, action)| as the action. - -\subsubsection{Example} - -The following example uses error actions to report an error and jump to a -machine that consumes the remainder of the line when parsing fails. After -consuming the line, the error recovery machine returns to the main loop. - -% GENERATE: erract -% %%{ -% machine erract; -% ws = ' '; -% address = 'foo AT bar..com'; -% date = 'Monday May 12'; -\begin{inline_code} -\begin{verbatim} -action cmd_err { - printf( "command error\n" ); - fhold; fgoto line; -} -action from_err { - printf( "from error\n" ); - fhold; fgoto line; -} -action to_err { - printf( "to error\n" ); - fhold; fgoto line; -} - -line := [^\n]* '\n' @{ fgoto main; }; - -main := ( - ( - 'from' @err(cmd_err) - ( ws+ address ws+ date '\n' ) $err(from_err) | - 'to' @err(cmd_err) - ( ws+ address '\n' ) $err(to_err) - ) -)*; -\end{verbatim} -\end{inline_code} -\verbspace -% }%% -% %% write data; -% void f() -% { -% %% write init; -% %% write exec; -% } -% END GENERATE - - - -\section{Action Ordering and Duplicates} - -When combining expressions that have embedded actions it is often the case that -a number of actions must be executed on a single input character. 
For example, -following a concatenation the leaving action of the left expression and the -entering action of the right expression will be embedded into one transition. -This requires a method of ordering actions that is intuitive and -predictable for the user, and repeatable for the compiler. - -We associate with the embedding of each action a unique timestamp that is -used to order actions that appear together on a single transition in the final -state machine. To accomplish this, we recursively traverse the parse tree of -regular expressions and assign timestamps to action embeddings. References to -machine definitions are followed in the traversal. When we visit a -parse tree node, we assign timestamps to all {\em entering} action embeddings, -recurse on the parse tree, then assign timestamps to the remaining {\em all}, -{\em finishing}, and {\em leaving} embeddings in the order in which they -appear. - -By default Ragel does not permit a single action to appear multiple times in an action -list. When the final machine has been created, actions that appear more than -once in a single transition, to-state, from-state or EOF action list have their -duplicates removed. -The first appearance of the action is preserved. This is useful in a number of -scenarios. First, it allows us to union machines with common prefixes without -worrying about the action embeddings in the prefix being duplicated. Second, it -prevents leaving actions from being transferred multiple times. This can -happen when a machine is repeated, then followed with another machine that -begins with a common character. For example: - -\begin{verbatim} -word = [a-z]+ %act; -main := word ( '\n' word )* '\n\n'; -\end{verbatim} -\verbspace - -Note that Ragel does not compare action bodies to determine if they have -identical program text. It simply checks for duplicates using each action -block's unique location in the program. - -The removal of duplicates can be turned off using the \verb|-d| option. 
- -\section{Values and Statements Available in Code Blocks} -\label{vals} - -The following values are available in code blocks: - -\begin{itemize} -\item \verb|fpc| -- A pointer to the current character. This is equivalent to -accessing the \verb|p| variable. - -\item \verb|fc| -- The current character. This is equivalent to the expression \verb|(*p)|. - -\item \verb|fcurs| -- An integer value representing the current state. This -value should only be read from. To move to a different place in the machine -from action code use the \verb|fgoto|, \verb|fnext| or \verb|fcall| statements. -Outside of the machine execution code the \verb|cs| variable may be modified. - -\item \verb|ftargs| -- An integer value representing the target state. This -value should only be read from. Again, \verb|fgoto|, \verb|fnext| and -\verb|fcall| can be used to move to a specific entry point. - -\item \verb|fentry(