Skip to content

Commit

Permalink
Process escape sequences in string literals appearing as terminals.
Browse files Browse the repository at this point in the history
The "\\" and "\"" escapes are necessary to make it possible to put arbitrary
text in a string literal at all. We have to support "\t" and "\n" to compile
the `whitespace` example from the book. But the other Rust escape sequences like
"\u{2a}" don't really seem necessary.
  • Loading branch information
jimblandy committed Sep 21, 2018
1 parent 04c40f5 commit e915302
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 1 deletion.
1 change: 1 addition & 0 deletions lalrpop/src/build/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,7 @@ fn parse_and_normalize_grammar(session: &Session, file_text: &FileText) -> io::R
let string = match error.code {
tok::ErrorCode::UnrecognizedToken => "unrecognized token",
tok::ErrorCode::UnterminatedEscape => "unterminated escape; missing '`'?",
tok::ErrorCode::UnrecognizedEscape => "unrecognized escape; only \\n, \\t, \\\" and \\\\ are recognized",
tok::ErrorCode::UnterminatedStringLiteral => {
"unterminated string literal; missing `\"`?"
}
Expand Down
7 changes: 6 additions & 1 deletion lalrpop/src/parser/lrgrammar.lalrpop
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ use grammar::pattern::*;
use std::iter::once;
use tok::{self, Tok};
use util::strip;
use lalrpop_util::ParseError;

grammar<'input>(text: &'input str);

Expand Down Expand Up @@ -394,7 +395,11 @@ QuotedLiteral: TerminalLiteral = {
};

StringLiteral: Atom =
<s:"StringLiteral"> => Atom::from(s);
<lo:@L> <s:"StringLiteral"> =>? {
let text = tok::apply_string_escapes(s, lo + 1)
.map_err(|e| ParseError::User { error: e })?;
Ok(Atom::from(text))
};

RegexLiteral: Atom =
<s:"RegexLiteral"> => Atom::from(s);
Expand Down
30 changes: 30 additions & 0 deletions lalrpop/src/tok/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
//! A tokenizer for use in LALRPOP itself.
use std::borrow::Cow;
use std::str::CharIndices;
use unicode_xid::UnicodeXID;

Expand All @@ -19,6 +20,7 @@ pub struct Error {
pub enum ErrorCode {
UnrecognizedToken,
UnterminatedEscape,
UnrecognizedEscape,
UnterminatedStringLiteral,
UnterminatedCharacterLiteral,
UnterminatedAttribute,
Expand Down Expand Up @@ -736,3 +738,31 @@ fn is_identifier_start(c: char) -> bool {
/// Reports whether `c` may appear after the first character of an identifier:
/// either an underscore or any character with the Unicode `XID_Continue`
/// property.
fn is_identifier_continue(c: char) -> bool {
    match c {
        '_' => true,
        other => UnicodeXID::is_xid_continue(other),
    }
}

/// Expand escape characters in a string literal, converting the source code
/// representation to the text it represents. The `idx0` argument should be the
/// position in the input stream of the first character of `text`, the position
/// after the opening double-quote.
pub fn apply_string_escapes(code: &str, idx0: usize) -> Result<Cow<str>, Error> {
if !code.contains('\\') {
Ok(code.into())
} else {
let mut iter = code.char_indices();
let mut text = String::new();
while let Some((_, mut ch)) = iter.next() {
if ch == '\\' {
// The parser should never have accepted an ill-formed string
// literal, so we know it can't end in a backslash.
let (offset, next_ch) = iter.next().unwrap();
ch = match next_ch {
'\\' | '\"' => next_ch,
'n' => '\n',
't' => '\t',
_ => { return error(UnrecognizedEscape, idx0 + offset); }
}
}
text.push(ch);
}
Ok(text.into())
}
}
21 changes: 21 additions & 0 deletions lalrpop/src/tok/test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -705,3 +705,24 @@ fn char_literals() {
],
);
}

/// Exercises `apply_string_escapes` on escape-free input, on each supported
/// escape, and on escapes that must be rejected.
#[test]
fn string_escapes() {
    use super::apply_string_escapes;
    use std::borrow::Cow;

    // No backslash in the literal: the text comes back unchanged.
    assert_eq!(apply_string_escapes(r#"foo"#, 5), Ok(Cow::Borrowed("foo")));

    // (source form, start position, expanded text) for each supported escape.
    let accepted: [(&str, usize, &str); 4] = [
        (r#"\\"#, 10, r#"\"#),
        (r#"\""#, 15, r#"""#),
        (r#"up\ndown"#, 25, "up\ndown"),
        (r#"left\tright"#, 40, "left\tright"),
    ];
    for &(literal, start, expanded) in accepted.iter() {
        assert_eq!(
            apply_string_escapes(literal, start),
            Ok(Cow::Owned::<str>(String::from(expanded)))
        );
    }

    // (source form, start position, expected error location).
    // The first case is "ƒ\oo": the two-byte `ƒ` checks that locations are
    // byte offsets. The second shows that LALRPOP doesn't support the other
    // Rust escape sequences.
    let rejected: [(&str, usize, usize); 2] = [
        ("\u{192}\\oo", 65, 68),
        (r#"star: \u{2a}"#, 105, 112),
    ];
    for &(literal, start, location) in rejected.iter() {
        assert_eq!(
            apply_string_escapes(literal, start),
            Err(Error { location: location, code: ErrorCode::UnrecognizedEscape })
        );
    }
}

0 comments on commit e915302

Please sign in to comment.