diff --git a/crates/ruff_python_parser/resources/invalid/re_lex_logical_token.py b/crates/ruff_python_parser/resources/invalid/re_lex_logical_token.py index 0d0f9ab7cd364..cbcaa26e9194f 100644 --- a/crates/ruff_python_parser/resources/invalid/re_lex_logical_token.py +++ b/crates/ruff_python_parser/resources/invalid/re_lex_logical_token.py @@ -38,7 +38,7 @@ def bar(): # The parser tries to recover from an unclosed `]` when the current token is `)`. This -# test is to make sure it emits a `NonLogicalNewline` token after `c`. +# test is to make sure it emits a `NonLogicalNewline` token after `b`. if call(foo, [a, b ) diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs index 9227d54e54c38..0decf4cb804d0 100644 --- a/crates/ruff_python_parser/src/lexer.rs +++ b/crates/ruff_python_parser/src/lexer.rs @@ -1309,11 +1309,58 @@ impl<'src> Lexer<'src> { /// Re-lex the current token in the context of a logical line. /// - /// Returns a boolean indicating that whether the new current token is different than the - /// previous current token. This also means that the current position of the lexer has changed - /// and the caller is responsible for updating it's state accordingly. + /// Returns a boolean indicating whether the lexer's position has changed. This could result + /// into the new current token being different than the previous current token but is not + /// necessarily true. If the return value is `true` then the caller is responsible for updating + /// it's state accordingly. /// /// This method is a no-op if the lexer isn't in a parenthesized context. + /// + /// ## Explanation + /// + /// The lexer emits two different kinds of newline token based on the context. If it's in a + /// parenthesized context, it'll emit a [`NonLogicalNewline`] token otherwise it'll emit a + /// regular [`Newline`] token. Based on the type of newline token, the lexer will consume and + /// emit the indentation tokens appropriately which affects the structure of the code. + /// + /// For example: + /// ```py + /// if call(foo + /// def bar(): + /// pass + /// ``` + /// + /// Here, the lexer emits a [`NonLogicalNewline`] token after `foo` which means that the lexer + /// doesn't emit an `Indent` token before the `def` keyword. This leads to an AST which + /// considers the function `bar` as part of the module block and the `if` block remains empty. + /// + /// This method is to facilitate the parser if it recovers from these kind of scenarios so that + /// the lexer can then re-lex a [`NonLogicalNewline`] token to a [`Newline`] token which in + /// turn helps the parser to build the correct AST. + /// + /// In the above snippet, it would mean that this method would move the lexer back to the + /// newline character after the `foo` token and emit it as a [`Newline`] token instead of + /// [`NonLogicalNewline`]. This means that the next token emitted by the lexer would be an + /// `Indent` token. + /// + /// There are cases where the lexer's position will change but the re-lexed token will remain + /// the same. This is to help the parser to add the error message at an appropriate location. + /// Consider the following example: + /// + /// ```py + /// if call(foo, [a, b + /// def bar(): + /// pass + /// ``` + /// + /// Here, the parser recovers from two unclosed parenthesis. The inner unclosed `[` will call + /// into the re-lexing logic and reduce the nesting level from 2 to 1. And, the re-lexing logic + /// will move the lexer at the newline after `b` but still emit a [`NonLogicalNewline`] token. + /// Only after the parser recovers from the outer unclosed `(` does the re-lexing logic emit + /// the [`Newline`] token. + /// + /// [`Newline`]: TokenKind::Newline + /// [`NonLogicalNewline`]: TokenKind::NonLogicalNewline pub(crate) fn re_lex_logical_token(&mut self) -> bool { if self.nesting == 0 { return false; @@ -1344,7 +1391,7 @@ impl<'src> Lexer<'src> { if new_position != current_position && has_newline { // Earlier we reduced the nesting level unconditionally. Now that we know the lexer's // position is going to be moved back, the lexer needs to be put back into a - // parenthesized context if the current token was a closing parenthesis. + // parenthesized context if the current token is a closing parenthesis. // // ```py // (a, [b, diff --git a/crates/ruff_python_parser/src/parser/mod.rs b/crates/ruff_python_parser/src/parser/mod.rs index 0ef98424bde12..b58284e2a9a5e 100644 --- a/crates/ruff_python_parser/src/parser/mod.rs +++ b/crates/ruff_python_parser/src/parser/mod.rs @@ -537,10 +537,11 @@ impl<'src> Parser<'src> { if recovery_context_kind.is_list_element(self) { parse_element(self); - // Only unset this when we've completely parsed a single element. - if first_element { - first_element = false; - } + // Only unset this when we've completely parsed a single element. This is mainly to + // raise the correct error in case the first element isn't valid and the current + // token isn't a comma. Without this knowledge, the parser would later expect a + // comma instead of raising the context error. + first_element = false; let maybe_comma_range = self.current_token_range(); if self.eat(TokenKind::Comma) { diff --git a/crates/ruff_python_parser/tests/snapshots/invalid_syntax@re_lex_logical_token.py.snap b/crates/ruff_python_parser/tests/snapshots/invalid_syntax@re_lex_logical_token.py.snap index 95b26a582a0f1..1c23c1e0cc8a6 100644 --- a/crates/ruff_python_parser/tests/snapshots/invalid_syntax@re_lex_logical_token.py.snap +++ b/crates/ruff_python_parser/tests/snapshots/invalid_syntax@re_lex_logical_token.py.snap @@ -582,7 +582,7 @@ Module( | -41 | # test is to make sure it emits a `NonLogicalNewline` token after `c`. +41 | # test is to make sure it emits a `NonLogicalNewline` token after `b`. 42 | if call(foo, [a, 43 | b | ^ Syntax Error: Expected ']', found NonLogicalNewline