diff --git a/README.md b/README.md index 5218427..3d93789 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # jison-lex -A lexical analyzer generator used by [jison](http://jison.org). +A lexical analyzer generator used by [jison](http://jison.org). It takes a lexical grammar definition (either in JSON or Bison's lexical grammar format) and outputs a JavaScript lexer. ## install npm install jison-lex -g @@ -16,5 +16,33 @@ Options: --version print version and exit ``` +## programmatic usage + +``` +var JisonLex = require('jison-lex'); + +var grammar = { + rules: [ + ["x", "return 'X';" ], + ["y", "return 'Y';" ], + ["$", "return 'EOF';" ] + ] +}; + +// or load from a file +// var grammar = fs.readFileSync('mylexer.l', 'utf8'); + +// generate source +var lexerSource = JisonLex.generate(grammar); + +// or create a lexer in memory +var lexer = new JisonLex(grammar); +lexer.setInput('xyxxy'); +lexer.lex(); +// => 'X' +lexer.lex(); +// => 'Y' +``` + ## license MIT diff --git a/cli.js b/cli.js index b78e317..6e4e7f3 100755 --- a/cli.js +++ b/cli.js @@ -33,10 +33,9 @@ var opts = require("nomnom") callback: function() { return version; } - }) - .parse(); + }); -exports.main = function () { +exports.main = function (opts) { if (opts.file) { var raw = fs.readFileSync(path.normalize(opts.file), 'utf8'), name = path.basename((opts.outfile||opts.file)).replace(/\..*$/g,''); @@ -67,8 +66,7 @@ function processGrammar (file, name) { grammar.options = settings; - var lexer = new RegExpLexer(grammar); - return lexer.generate(settings); + return RegExpLexer.generate(grammar); } function readin (cb) { @@ -85,4 +83,4 @@ function readin (cb) { } if (require.main === module) - exports.main(); + exports.main(opts.parse()); diff --git a/examples/lex.l b/examples/lex.l new file mode 100644 index 0000000..515984d --- /dev/null +++ b/examples/lex.l @@ -0,0 +1,90 @@ + +NAME [a-zA-Z_][a-zA-Z0-9_-]* +BR \r\n|\n|\r + +%s indented trail rules +%x code start_condition options conditions action + +%% 
+ +"/*"(.|\n|\r)*?"*/" return 'ACTION_BODY'; +"//".* return 'ACTION_BODY'; +"/"[^ /]*?['"{}'][^ ]*?"/" return 'ACTION_BODY'; // regexp with braces or quotes (and no spaces) +\"("\\\\"|'\"'|[^"])*\" return 'ACTION_BODY'; +"'"("\\\\"|"\'"|[^'])*"'" return 'ACTION_BODY'; +[/"'][^{}/"']+ return 'ACTION_BODY'; +[^{}/"']+ return 'ACTION_BODY'; +"{" yy.depth++; return '{' +"}" yy.depth == 0 ? this.begin('trail') : yy.depth--; return '}' + +{NAME} return 'NAME'; +">" this.popState(); return '>'; +"," return ','; +"*" return '*'; + +{BR}+ /* */ +\s+{BR}+ /* */ +\s+ this.begin('indented') +"%%" this.begin('code'); return '%%' +[a-zA-Z0-9_]+ return 'CHARACTER_LIT' + +{NAME} yy.options[yytext] = true +{BR}+ this.begin('INITIAL') +\s+{BR}+ this.begin('INITIAL') +\s+ /* empty */ + +{NAME} return 'START_COND' +{BR}+ this.begin('INITIAL') +\s+{BR}+ this.begin('INITIAL') +\s+ /* empty */ + +.*{BR}+ this.begin('rules') + +"{" yy.depth = 0; this.begin('action'); return '{' +"%{"(.|{BR})*?"%}" this.begin('trail'); yytext = yytext.substr(2, yytext.length-4);return 'ACTION' +"%{"(.|{BR})*?"%}" yytext = yytext.substr(2, yytext.length-4); return 'ACTION' +.+ this.begin('rules'); return 'ACTION' + +"/*"(.|\n|\r)*?"*/" /* ignore */ +"//".* /* ignore */ + +{BR}+ /* */ +\s+ /* */ +{NAME} return 'NAME'; +\"("\\\\"|'\"'|[^"])*\" yytext = yytext.replace(/\\"/g,'"'); return 'STRING_LIT'; +"'"("\\\\"|"\'"|[^'])*"'" yytext = yytext.replace(/\\'/g,"'"); return 'STRING_LIT'; +"|" return '|'; +"["("\\\\"|"\]"|[^\]])*"]" return 'ANY_GROUP_REGEX'; +"(?:" return 'SPECIAL_GROUP'; +"(?=" return 'SPECIAL_GROUP'; +"(?!" return 'SPECIAL_GROUP'; +"(" return '('; +")" return ')'; +"+" return '+'; +"*" return '*'; +"?" return '?'; +"^" return '^'; +"," return ','; +"<>" return '$'; +"<" this.begin('conditions'); return '<'; +"/!" return '/!'; +"/" return '/'; +"\\"([0-7]{1,3}|[rfntvsSbBwWdD\\*+()${}|[\]\/.^?]|"c"[A-Z]|"x"[0-9A-F]{2}|"u"[a-fA-F0-9]{4}) return 'ESCAPE_CHAR'; +"\\". 
yytext = yytext.replace(/^\\/g,''); return 'ESCAPE_CHAR'; +"$" return '$'; +"." return '.'; +"%options" yy.options = {}; this.begin('options'); +"%s" this.begin('start_condition'); return 'START_INC'; +"%x" this.begin('start_condition'); return 'START_EXC'; +"%%" this.begin('rules'); return '%%'; +"{"\d+(","\s?\d+|",")?"}" return 'RANGE_REGEX'; +"{"{NAME}"}" return 'NAME_BRACE'; +"{" return '{'; +"}" return '}'; +. /* ignore bad characters */ +<*><> return 'EOF'; + +(.|{BR})+ return 'CODE'; + +%% + diff --git a/regexp-lexer.js b/regexp-lexer.js index 3e73a05..7032979 100644 --- a/regexp-lexer.js +++ b/regexp-lexer.js @@ -1,7 +1,6 @@ // Basic Lexer implemented using JavaScript regular expressions // MIT Licensed -var RegExpLexer = (function () { "use strict"; var lexParser = require('lex-parser'); @@ -17,7 +16,7 @@ function prepareRules(rules, macros, actions, tokens, startConditions, caseless) } function tokenNumberReplacement (str, token) { - return "return "+(tokens[token] || "'"+token+"'"); + return "return " + (tokens[token] || "'" + token + "'"); } actions.push('switch($avoiding_name_collisions) {'); @@ -48,10 +47,10 @@ function prepareRules(rules, macros, actions, tokens, startConditions, caseless) if (typeof m === 'string') { for (k in macros) { if (macros.hasOwnProperty(k)) { - m = m.split("{"+k+"}").join('(' + macros[k] + ')'); + m = m.split("{" + k + "}").join('(' + macros[k] + ')'); } } - m = new RegExp("^(?:"+m+")", caseless ? 'i':''); + m = new RegExp("^(?:" + m.replace(/\//g, '\\/') + ")", caseless ? 
'i':''); } newRules.push(m); if (typeof rules[i][1] === 'function') { @@ -61,7 +60,7 @@ function prepareRules(rules, macros, actions, tokens, startConditions, caseless) if (tokens && action.match(/return '[^']+'/)) { action = action.replace(/return '([^']+)'/g, tokenNumberReplacement); } - actions.push('case '+i+':' +action+'\nbreak;'); + actions.push('case ' + i + ':' + action + '\nbreak;'); } actions.push("}"); @@ -77,7 +76,7 @@ function prepareMacros (macros) { for (i in macros) if (macros.hasOwnProperty(i)) { m = macros[i]; for (k in macros) if (macros.hasOwnProperty(k) && i !== k) { - mnew = m.split("{"+k+"}").join('(' + macros[k] + ')'); + mnew = m.split("{" + k + "}").join('(' + macros[k] + ')'); if (mnew !== m) { cont = true; macros[i] = mnew; @@ -113,38 +112,28 @@ function buildActions (dict, tokens) { this.rules = prepareRules(dict.rules, dict.macros, actions, tokens && toks, this.conditions, this.options["case-insensitive"]); var fun = actions.join("\n"); "yytext yyleng yylineno yylloc".split(' ').forEach(function (yy) { - fun = fun.replace(new RegExp("\\b("+yy+")\\b", "g"), "yy_.$1"); + fun = fun.replace(new RegExp("\\b(" + yy + ")\\b", "g"), "yy_.$1"); }); - - // first try to create the performAction function the old way, - // but this will break for some legal constructs in the user action code: - try { - return Function("yy,yy_,$avoiding_name_collisions,YY_START", fun); - } catch (e) { - return "function anonymous(yy,yy_,$avoiding_name_collisions,YY_START) {" + fun + "\n}"; - } + return "function anonymous(yy,yy_,$avoiding_name_collisions,YY_START) {" + fun + "\n}"; } function RegExpLexer (dict, input, tokens) { - if (typeof dict === 'string') { - dict = lexParser.parse(dict); - } - dict = dict || {}; - this.options = dict.options || {}; - - this.conditions = prepareStartConditions(dict.startConditions); - this.conditions.INITIAL = {rules:[],inclusive:true}; + var opts = processGrammar(dict, tokens); + var source = generateModuleBody(opts); + var 
lexer = eval(source); - this.performAction = buildActions.call(this, dict, tokens); - this.conditionStack = ['INITIAL']; - - this.moduleInclude = (dict.moduleInclude || '').trim(); - - this.yy = {}; + lexer.yy = {}; if (input) { - this.setInput(input); + lexer.setInput(input); } + + lexer.generate = function () { return generateFromOpts(opts); }; + lexer.generateModule = function () { return generateModule(opts); }; + lexer.generateCommonJSModule = function () { return generateCommonJSModule(opts); }; + lexer.generateAMDModule = function () { return generateAMDModule(opts); }; + + return lexer; } RegExpLexer.prototype = { @@ -465,107 +454,149 @@ RegExpLexer.prototype = { // return the number of states pushed stateStackSize: function stateStackSize() { return this.conditionStack.length; - }, + } +}; - generate: function generate(opt) { - var code = ""; - if (opt.moduleType === 'commonjs') { - code = this.generateCommonJSModule(opt); - } else if (opt.moduleType === 'amd') { - code = this.generateAMDModule(opt); - } else { - code = this.generateModule(opt); - } - return code; - }, - generateModuleBody: function generateModule() { - var function_descriptions = { - setInput: "resets the lexer, sets new input", - input: "consumes and returns one char from the input", - unput: "unshifts one char (or a string) into the input", - more: "When called from action, caches matched text and appends it on next action", - reject: "When called from action, signals the lexer that this rule fails to match the input, so the next matching rule (regex) should be tested instead.", - less: "retain first n characters of the match", - pastInput: "displays already matched input, i.e. for error messages", - upcomingInput: "displays upcoming input, i.e. for error messages", - showPosition: "displays the character position where the lexing error occurred, i.e. 
for error messages", - test_match: "test the lexed token: return FALSE when not a match, otherwise return token", - next: "return next match in input", - lex: "return next match that has a token", - begin: "activates a new lexer condition state (pushes the new lexer condition state onto the condition stack)", - popState: "pop the previously active lexer condition state off the condition stack", - _currentRules: "produce the lexer rule set which is active for the currently active lexer condition state", - topState: "return the currently active lexer condition state; when an index argument is provided it produces the N-th previous condition state, if available", - pushState: "alias for begin(condition)", - stateStackSize: "return the number of states currently on the stack" - }; - var out = "{\n"; - var p = []; - var descr; - for (var k in RegExpLexer.prototype) { - if (RegExpLexer.prototype.hasOwnProperty(k) && k.indexOf("generate") === -1) { - // copy the function description as a comment before the implementation; supports multi-line descriptions - descr = "\n"; - if (function_descriptions[k]) { - descr += "// " + function_descriptions[k].replace(/\n/g, "\n\/\/ ") + "\n"; - } - p.push(descr + k + ":" + (RegExpLexer.prototype[k].toString() || '""')); +// generate lexer source from a grammar +function generate (dict, tokens) { + var opt = processGrammar(dict, tokens); + + return generateFromOpts(opt); +} + +// process the grammar and build final data structures and functions +function processGrammar(dict, tokens) { + var opts = {}; + if (typeof dict === 'string') { + dict = lexParser.parse(dict); + } + dict = dict || {}; + + opts.options = dict.options || {}; + opts.moduleType = opts.options.moduleType; + opts.moduleName = opts.options.moduleName; + + opts.conditions = prepareStartConditions(dict.startConditions); + opts.conditions.INITIAL = {rules:[],inclusive:true}; + + opts.performAction = buildActions.call(opts, dict, tokens); + opts.conditionStack = 
['INITIAL']; + + opts.moduleInclude = (dict.moduleInclude || '').trim(); + return opts; +} + +// Assemble the final source from the processed grammar +function generateFromOpts (opt) { + var code = ""; + + if (opt.moduleType === 'commonjs') { + code = generateCommonJSModule(opt); + } else if (opt.moduleType === 'amd') { + code = generateAMDModule(opt); + } else { + code = generateModule(opt); + } + + return code; +} + +function generateModuleBody (opt) { + var functionDescriptions = { + setInput: "resets the lexer, sets new input", + input: "consumes and returns one char from the input", + unput: "unshifts one char (or a string) into the input", + more: "When called from action, caches matched text and appends it on next action", + reject: "When called from action, signals the lexer that this rule fails to match the input, so the next matching rule (regex) should be tested instead.", + less: "retain first n characters of the match", + pastInput: "displays already matched input, i.e. for error messages", + upcomingInput: "displays upcoming input, i.e. for error messages", + showPosition: "displays the character position where the lexing error occurred, i.e. 
for error messages", + test_match: "test the lexed token: return FALSE when not a match, otherwise return token", + next: "return next match in input", + lex: "return next match that has a token", + begin: "activates a new lexer condition state (pushes the new lexer condition state onto the condition stack)", + popState: "pop the previously active lexer condition state off the condition stack", + _currentRules: "produce the lexer rule set which is active for the currently active lexer condition state", + topState: "return the currently active lexer condition state; when an index argument is provided it produces the N-th previous condition state, if available", + pushState: "alias for begin(condition)", + stateStackSize: "return the number of states currently on the stack" + }; + var out = "({\n"; + var p = []; + var descr; + for (var k in RegExpLexer.prototype) { + if (RegExpLexer.prototype.hasOwnProperty(k) && k.indexOf("generate") === -1) { + // copy the function description as a comment before the implementation; supports multi-line descriptions + descr = "\n"; + if (functionDescriptions[k]) { + descr += "// " + functionDescriptions[k].replace(/\n/g, "\n\/\/ ") + "\n"; } + p.push(descr + k + ":" + (RegExpLexer.prototype[k].toString() || '""')); } - out += p.join(",\n"); + } + out += p.join(",\n"); - if (this.options) { - out += ",\noptions: " + JSON.stringify(this.options); - } + if (opt.options) { + out += ",\noptions: " + JSON.stringify(opt.options); + } - out += ",\nperformAction: " + String(this.performAction); - out += ",\nrules: [" + this.rules + "]"; - out += ",\nconditions: " + JSON.stringify(this.conditions); - out += "\n}"; + out += ",\nperformAction: " + String(opt.performAction); + out += ",\nrules: [" + opt.rules + "]"; + out += ",\nconditions: " + JSON.stringify(opt.conditions); + out += "\n})"; - return out; - }, - generateModule: function generateModule(opt) { - opt = opt || {}; + return out; +} - var out = "/* generated by jison-lex " + version 
+ " */"; - var moduleName = opt.moduleName || "lexer"; +function generateModule(opt) { + opt = opt || {}; - out += "\nvar " + moduleName + " = (function(){\nvar lexer = " - + this.generateModuleBody(); + var out = "/* generated by jison-lex " + version + " */"; + var moduleName = opt.moduleName || "lexer"; - if (this.moduleInclude) out += ";\n"+this.moduleInclude; - out += ";\nreturn lexer;\n})();"; - return out; - }, - generateAMDModule: function generateAMDModule() { - var out = "/* generated by jison-lex " + version + " */"; + out += "\nvar " + moduleName + " = (function(){\nvar lexer = " + + generateModuleBody(opt); - out += "define([], function(){\nvar lexer = " - + this.generateModuleBody(); + if (opt.moduleInclude) { + out += ";\n" + opt.moduleInclude; + } - if (this.moduleInclude) out += ";\n"+this.moduleInclude; - out += ";\nreturn lexer;" - + "\n})();"; - return out; - }, - generateCommonJSModule: function generateCommonJSModule(opt) { - opt = opt || {}; + out += ";\nreturn lexer;\n})();"; - var out = ""; - var moduleName = opt.moduleName || "lexer"; + return out; +} - out += this.generateModule(opt); - out += "\nexports.lexer = "+moduleName; - out += ";\nexports.lex = function () { return "+moduleName+".lex.apply(lexer, arguments); };"; - return out; +function generateAMDModule(opt) { + var out = "/* generated by jison-lex " + version + " */"; + + out += "define([], function(){\nvar lexer = " + + generateModuleBody(opt); + + if (opt.moduleInclude) { + out += ";\n" + opt.moduleInclude; } -}; -return RegExpLexer; + out += ";\nreturn lexer;" + + "\n});"; + + return out; +} + +function generateCommonJSModule(opt) { + opt = opt || {}; + + var out = ""; + var moduleName = opt.moduleName || "lexer"; + + out += generateModule(opt); + out += "\nexports.lexer = " + moduleName; + out += ";\nexports.lex = function () { return " + moduleName + ".lex.apply(lexer, arguments); };"; + return out; +} -})(); +RegExpLexer.generate = generate; module.exports = RegExpLexer; 
diff --git a/tests/regexplexer.js b/tests/regexplexer.js index afac1b1..c658e33 100644 --- a/tests/regexplexer.js +++ b/tests/regexplexer.js @@ -331,6 +331,28 @@ exports["test defined token returns"] = function() { assert.equal(lexer.lex(), 4); }; +exports["test module generator from constructor"] = function() { + var dict = { + rules: [ + ["x", "return 'X';" ], + ["y", "return 'Y';" ], + ["$", "return 'EOF';" ] + ] + }; + + var input = "xxyx"; + + var lexerSource = RegExpLexer.generate(dict); + eval(lexerSource); + lexer.setInput(input); + + assert.equal(lexer.lex(), "X"); + assert.equal(lexer.lex(), "X"); + assert.equal(lexer.lex(), "Y"); + assert.equal(lexer.lex(), "X"); + assert.equal(lexer.lex(), "EOF"); +}; + exports["test module generator"] = function() { var dict = { rules: [ @@ -419,7 +441,7 @@ exports["test amd module generator"] = function() { var input = "xxyx"; var lexer_ = new RegExpLexer(dict); - var lexerSource = lexer_.generateCommonJSModule(); + var lexerSource = lexer_.generateAMDModule(); var lexer; var define = function (_, fn) {