Skip to content

Commit

Permalink
Rollup merge of rust-lang#133070 - nnethercote:lexer-tweaks, r=chenyukang
Browse files Browse the repository at this point in the history

Lexer tweaks

Some cleanups and small performance improvements.

r? ```@chenyukang```
  • Loading branch information
compiler-errors authored Nov 26, 2024
2 parents 5915190 + 16a39bb commit 9d6a11a
Show file tree
Hide file tree
Showing 5 changed files with 121 additions and 125 deletions.
8 changes: 4 additions & 4 deletions compiler/rustc_lexer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -566,19 +566,19 @@ impl Cursor<'_> {

fn c_or_byte_string(
&mut self,
mk_kind: impl FnOnce(bool) -> LiteralKind,
mk_kind_raw: impl FnOnce(Option<u8>) -> LiteralKind,
mk_kind: fn(bool) -> LiteralKind,
mk_kind_raw: fn(Option<u8>) -> LiteralKind,
single_quoted: Option<fn(bool) -> LiteralKind>,
) -> TokenKind {
match (self.first(), self.second(), single_quoted) {
('\'', _, Some(mk_kind)) => {
('\'', _, Some(single_quoted)) => {
self.bump();
let terminated = self.single_quoted_string();
let suffix_start = self.pos_within_token();
if terminated {
self.eat_literal_suffix();
}
let kind = mk_kind(terminated);
let kind = single_quoted(terminated);
Literal { kind, suffix_start }
}
('"', _, _) => {
Expand Down
72 changes: 31 additions & 41 deletions compiler/rustc_lexer/src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,61 +77,51 @@ fn test_too_many_hashes() {
check_raw_str(&s2, Err(RawStrError::TooManyDelimiters { found: u32::from(max_count) + 1 }));
}

// https://github.com/rust-lang/rust/issues/70528
#[test]
fn test_valid_shebang() {
// https://github.com/rust-lang/rust/issues/70528
let input = "#!/usr/bin/rustrun\nlet x = 5;";
assert_eq!(strip_shebang(input), Some(18));
}
let input = "#!/bin/bash";
assert_eq!(strip_shebang(input), Some(input.len()));

#[test]
fn test_invalid_shebang_valid_rust_syntax() {
// https://github.com/rust-lang/rust/issues/70528
let input = "#! [bad_attribute]";
let input = "#![attribute]";
assert_eq!(strip_shebang(input), None);
}

#[test]
fn test_shebang_second_line() {
// Because shebangs are interpreted by the kernel, they must be on the first line
let input = "\n#!/bin/bash";
let input = "#! /bin/bash";
assert_eq!(strip_shebang(input), Some(input.len()));

let input = "#! [attribute]";
assert_eq!(strip_shebang(input), None);
}

#[test]
fn test_shebang_space() {
let input = "#! /bin/bash";
let input = "#! /* blah */ /bin/bash";
assert_eq!(strip_shebang(input), Some(input.len()));
}

#[test]
fn test_shebang_empty_shebang() {
let input = "#! \n[attribute(foo)]";
let input = "#! /* blah */ [attribute]";
assert_eq!(strip_shebang(input), None);
}

#[test]
fn test_invalid_shebang_comment() {
let input = "#!//bin/ami/a/comment\n[";
assert_eq!(strip_shebang(input), None)
}
let input = "#! // blah\n/bin/bash";
assert_eq!(strip_shebang(input), Some(10)); // strip up to the newline

#[test]
fn test_invalid_shebang_another_comment() {
let input = "#!/*bin/ami/a/comment*/\n[attribute";
assert_eq!(strip_shebang(input), None)
}
let input = "#! // blah\n[attribute]";
assert_eq!(strip_shebang(input), None);

#[test]
fn test_shebang_valid_rust_after() {
let input = "#!/*bin/ami/a/comment*/\npub fn main() {}";
assert_eq!(strip_shebang(input), Some(23))
}
let input = "#! /* blah\nblah\nblah */ /bin/bash";
assert_eq!(strip_shebang(input), Some(10));

#[test]
fn test_shebang_followed_by_attrib() {
let input = "#!/bin/rust-scripts\n#![allow_unused(true)]";
assert_eq!(strip_shebang(input), Some(19));
let input = "#! /* blah\nblah\nblah */ [attribute]";
assert_eq!(strip_shebang(input), None);

let input = "#!\n/bin/sh";
assert_eq!(strip_shebang(input), Some(2));

let input = "#!\n[attribute]";
assert_eq!(strip_shebang(input), None);

// Because shebangs are interpreted by the kernel, they must be on the first line
let input = "\n#!/bin/bash";
assert_eq!(strip_shebang(input), None);

let input = "\n#![attribute]";
assert_eq!(strip_shebang(input), None);
}

fn check_lexing(src: &str, expect: Expect) {
Expand Down
75 changes: 44 additions & 31 deletions compiler/rustc_parse/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ use rustc_span::symbol::Symbol;
use rustc_span::{BytePos, Pos, Span};
use tracing::debug;

use crate::lexer::diagnostics::TokenTreeDiagInfo;
use crate::lexer::unicode_chars::UNICODE_ARRAY;
use crate::{errors, make_unclosed_delims_error};

Expand Down Expand Up @@ -56,7 +57,7 @@ pub(crate) fn lex_token_trees<'psess, 'src>(
}

let cursor = Cursor::new(src);
let string_reader = StringReader {
let mut lexer = Lexer {
psess,
start_pos,
pos: start_pos,
Expand All @@ -65,34 +66,31 @@ pub(crate) fn lex_token_trees<'psess, 'src>(
override_span,
nbsp_is_whitespace: false,
last_lifetime: None,
token: Token::dummy(),
diag_info: TokenTreeDiagInfo::default(),
};
let (stream, res, unmatched_delims) =
tokentrees::TokenTreesReader::lex_all_token_trees(string_reader);
match res {
Ok(()) if unmatched_delims.is_empty() => Ok(stream),
_ => {
// Return error if there are unmatched delimiters or unclosed delimiters.
// We emit delimiter mismatch errors first, then emit the unclosing delimiter mismatch
// because the delimiter mismatch is more likely to be the root cause of error

let mut buffer = Vec::with_capacity(1);
for unmatched in unmatched_delims {
if let Some(err) = make_unclosed_delims_error(unmatched, psess) {
buffer.push(err);
}
}
if let Err(errs) = res {
// Add unclosing delimiter or diff marker errors
for err in errs {
buffer.push(err);
}
}
Err(buffer)
let (_open_spacing, stream, res) = lexer.lex_token_trees(/* is_delimited */ false);
let unmatched_delims = lexer.diag_info.unmatched_delims;

if res.is_ok() && unmatched_delims.is_empty() {
Ok(stream)
} else {
// Return an error if there are unmatched or unclosed delimiters.
// We emit delimiter-mismatch errors first, then unclosed-delimiter errors,
// because a delimiter mismatch is more likely to be the root cause.
let mut buffer: Vec<_> = unmatched_delims
.into_iter()
.filter_map(|unmatched_delim| make_unclosed_delims_error(unmatched_delim, psess))
.collect();
if let Err(errs) = res {
// Add unclosed-delimiter or diff-marker errors.
buffer.extend(errs);
}
Err(buffer)
}
}

struct StringReader<'psess, 'src> {
struct Lexer<'psess, 'src> {
psess: &'psess ParseSess,
/// Initial position, read-only.
start_pos: BytePos,
Expand All @@ -111,9 +109,14 @@ struct StringReader<'psess, 'src> {
/// Track the `Span` for the leading `'` of the last lifetime. Used for
/// diagnostics to detect possible typo where `"` was meant.
last_lifetime: Option<Span>,

/// The current token.
token: Token,

diag_info: TokenTreeDiagInfo,
}

impl<'psess, 'src> StringReader<'psess, 'src> {
impl<'psess, 'src> Lexer<'psess, 'src> {
fn dcx(&self) -> DiagCtxtHandle<'psess> {
self.psess.dcx()
}
Expand All @@ -124,7 +127,7 @@ impl<'psess, 'src> StringReader<'psess, 'src> {

/// Returns the next token, paired with a bool indicating if the token was
/// preceded by whitespace.
fn next_token(&mut self) -> (Token, bool) {
fn next_token_from_cursor(&mut self) -> (Token, bool) {
let mut preceded_by_whitespace = false;
let mut swallow_next_invalid = 0;
// Skip trivial (whitespace & comments) tokens
Expand Down Expand Up @@ -231,7 +234,8 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
.push(span);
token::Ident(sym, IdentIsRaw::No)
}
// split up (raw) c string literals to an ident and a string literal when edition < 2021.
// Split up (raw) C string literals into an ident and a string literal when the
// edition is < 2021.
rustc_lexer::TokenKind::Literal {
kind: kind @ (LiteralKind::CStr { .. } | LiteralKind::RawCStr { .. }),
suffix_start: _,
Expand All @@ -252,7 +256,9 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
let prefix_span = self.mk_sp(start, lit_start);
return (Token::new(self.ident(start), prefix_span), preceded_by_whitespace);
}
rustc_lexer::TokenKind::GuardedStrPrefix => self.maybe_report_guarded_str(start, str_before),
rustc_lexer::TokenKind::GuardedStrPrefix => {
self.maybe_report_guarded_str(start, str_before)
}
rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
let suffix_start = start + BytePos(suffix_start);
let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
Expand Down Expand Up @@ -296,13 +302,20 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
if prefix_span.at_least_rust_2021() {
let span = self.mk_sp(start, self.pos);

let lifetime_name_without_tick = Symbol::intern(&self.str_from(ident_start));
let lifetime_name_without_tick =
Symbol::intern(&self.str_from(ident_start));
if !lifetime_name_without_tick.can_be_raw() {
self.dcx().emit_err(errors::CannotBeRawLifetime { span, ident: lifetime_name_without_tick });
self.dcx().emit_err(
errors::CannotBeRawLifetime {
span,
ident: lifetime_name_without_tick
}
);
}

// Put the `'` back onto the lifetime name.
let mut lifetime_name = String::with_capacity(lifetime_name_without_tick.as_str().len() + 1);
let mut lifetime_name =
String::with_capacity(lifetime_name_without_tick.as_str().len() + 1);
lifetime_name.push('\'');
lifetime_name += lifetime_name_without_tick.as_str();
let sym = Symbol::intern(&lifetime_name);
Expand Down
Loading

0 comments on commit 9d6a11a

Please sign in to comment.