From 893cf427c8044f31bc701238cd296451fd811900 Mon Sep 17 00:00:00 2001 From: Carter Snook Date: Wed, 25 Oct 2023 15:36:05 -0500 Subject: [PATCH 1/6] perf(parser): use faster string parsing This makes use of memchr for parsing strings. It sadly does introduce one use of `unsafe` to create a string that is valid to pass into `u32::from_str_radix` because I was unable to find another method that does not require far more code than required with `unsafe`. --- Cargo.lock | 1 + crates/ruff_python_parser/Cargo.toml | 1 + crates/ruff_python_parser/src/string.rs | 226 +++++++++++++----------- 3 files changed, 129 insertions(+), 99 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e4e5a1a51c02ca..b5daeb5aba9b5c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2381,6 +2381,7 @@ dependencies = [ "itertools 0.11.0", "lalrpop", "lalrpop-util", + "memchr", "ruff_python_ast", "ruff_text_size", "rustc-hash", diff --git a/crates/ruff_python_parser/Cargo.toml b/crates/ruff_python_parser/Cargo.toml index a5d4208623392b..46a97eabd43c57 100644 --- a/crates/ruff_python_parser/Cargo.toml +++ b/crates/ruff_python_parser/Cargo.toml @@ -22,6 +22,7 @@ bitflags = { workspace = true } is-macro = { workspace = true } itertools = { workspace = true } lalrpop-util = { version = "0.20.0", default-features = false } +memchr = { workspace = true } unicode-ident = { workspace = true } unicode_names2 = { workspace = true } rustc-hash = { workspace = true } diff --git a/crates/ruff_python_parser/src/string.rs b/crates/ruff_python_parser/src/string.rs index 81cbe2f9a1ce34..87ebfd179bea5e 100644 --- a/crates/ruff_python_parser/src/string.rs +++ b/crates/ruff_python_parser/src/string.rs @@ -6,9 +6,6 @@ use ruff_text_size::{Ranged, TextLen, TextRange, TextSize}; use crate::lexer::{LexicalError, LexicalErrorType}; use crate::token::{StringKind, Tok}; -// unicode_name2 does not expose `MAX_NAME_LENGTH`, so we replicate that constant here, fix #3798 -const MAX_UNICODE_NAME: usize = 88; - pub(crate) struct StringConstantWithRange { value: StringConstant, range: TextRange, @@ -57,7 +54,7 @@ impl StringType { } struct StringParser<'a> { - chars: std::str::Chars<'a>, + rest: &'a str, kind: StringKind, location: TextSize, } @@ -65,22 +62,18 @@ struct StringParser<'a> { impl<'a> StringParser<'a> { fn new(source: &'a str, kind: StringKind, start: TextSize) -> Self { Self { - chars: source.chars(), + rest: source, kind, location: start, } } #[inline] - fn next_char(&mut self) -> Option { - let c = self.chars.next()?; - self.location += c.text_len(); - Some(c) - } - - #[inline] - fn peek(&mut self) -> Option { - self.chars.clone().next() + fn skip_bytes(&mut self, bytes: usize) -> &'a str { + let skipped_str = &self.rest[..bytes]; + self.rest = &self.rest[bytes..]; + self.location += skipped_str.text_len(); + skipped_str } #[inline] @@ -93,6 +86,29 @@ impl<'a> StringParser<'a> { TextRange::new(start_location, self.location) } + #[inline] + fn next_byte(&mut self) -> Option { + self.rest.as_bytes().first().map(|&byte| { + self.rest = &self.rest[1..]; + self.location += TextSize::new(1); + byte + }) + } + + #[inline] + fn next_char(&mut self) -> Option { + self.rest.chars().next().map(|c| { + self.rest = &self.rest[c.len_utf8()..]; + self.location += c.text_len(); + c + }) + } + + #[inline] + fn peek_byte(&self) -> Option { + self.rest.as_bytes().first().copied() + } + fn parse_unicode_literal(&mut self, literal_number: usize) -> Result { let mut p: u32 = 0u32; let unicode_error = LexicalError::new(LexicalErrorType::UnicodeError, self.get_pos()); @@ -110,99 +126,101 @@ impl<'a> StringParser<'a> { _ => std::char::from_u32(p).ok_or(unicode_error), } } + fn parse_octet(&mut self, o: u8) -> char { + let mut radix_bytes = [o, 0, 0]; + let mut len = 1; - fn parse_octet(&mut self, first: char) -> char { - let mut octet_content = String::new(); - octet_content.push(first); - while octet_content.len() < 3 { - if let Some('0'..='7') = self.peek() { - octet_content.push(self.next_char().unwrap()); - } else { + while len < 3 { + let Some(b'0'..=b'8') = self.peek_byte() else { break; - } + }; + + radix_bytes[len] = self.next_byte().unwrap(); + len += 1; } - let value = u32::from_str_radix(&octet_content, 8).unwrap(); + + // SAFETY: radix_bytes is always going to be in the ASCII range. + #[allow(unsafe_code)] + let radix_str = unsafe { std::str::from_utf8_unchecked(&radix_bytes[..len]) }; + + let value = u32::from_str_radix(radix_str, 8).unwrap(); char::from_u32(value).unwrap() } fn parse_unicode_name(&mut self) -> Result { let start_pos = self.get_pos(); - match self.next_char() { - Some('{') => {} - _ => return Err(LexicalError::new(LexicalErrorType::StringError, start_pos)), - } - let start_pos = self.get_pos(); - let mut name = String::new(); - loop { - match self.next_char() { - Some('}') => break, - Some(c) => name.push(c), - None => { - return Err(LexicalError::new( - LexicalErrorType::StringError, - self.get_pos(), - )) - } - } - } - if name.len() > MAX_UNICODE_NAME { + let Some(b'{') = self.next_byte() else { + return Err(LexicalError::new(LexicalErrorType::StringError, start_pos)); + }; + + let start_pos = self.get_pos(); + let Some(close_idx) = memchr::memchr(b'}', self.rest.as_bytes()) else { return Err(LexicalError::new( - LexicalErrorType::UnicodeError, + LexicalErrorType::StringError, self.get_pos(), )); - } + }; + + let name_and_ending = self.skip_bytes(close_idx + 1); + let name = &name_and_ending[..name_and_ending.len() - 1]; - unicode_names2::character(&name) + unicode_names2::character(name) .ok_or_else(|| LexicalError::new(LexicalErrorType::UnicodeError, start_pos)) } - fn parse_escaped_char(&mut self) -> Result { - match self.next_char() { - Some(c) => { - let char = match c { - '\\' => '\\', - '\'' => '\'', - '\"' => '"', - 'a' => '\x07', - 'b' => '\x08', - 'f' => '\x0c', - 'n' => '\n', - 'r' => '\r', - 't' => '\t', - 'v' => '\x0b', - o @ '0'..='7' => self.parse_octet(o), - 'x' => self.parse_unicode_literal(2)?, - 'u' if !self.kind.is_any_bytes() => self.parse_unicode_literal(4)?, - 'U' if !self.kind.is_any_bytes() => self.parse_unicode_literal(8)?, - 'N' if !self.kind.is_any_bytes() => self.parse_unicode_name()?, - // Special cases where the escape sequence is not a single character - '\n' => return Ok(String::new()), - '\r' => { - if self.peek() == Some('\n') { - self.next_char(); - } - return Ok(String::new()); - } - c => { - if self.kind.is_any_bytes() && !c.is_ascii() { - return Err(LexicalError { - error: LexicalErrorType::OtherError( - "bytes can only contain ASCII literal characters".to_owned(), - ), - location: self.get_pos(), - }); - } - return Ok(format!("\\{c}")); - } - }; - Ok(char.to_string()) - } - None => Err(LexicalError { + fn parse_escaped_char(&mut self, string: &mut String) -> Result<(), LexicalError> { + let Some(first_byte) = self.next_byte() else { + return Err(LexicalError { error: LexicalErrorType::StringError, location: self.get_pos(), - }), - } + }); + }; + + let new_char = match first_byte { + b'\\' => '\\', + b'\'' => '\'', + b'\"' => '"', + b'a' => '\x07', + b'b' => '\x08', + b'f' => '\x0c', + b'n' => '\n', + b'r' => '\r', + b't' => '\t', + b'v' => '\x0b', + o @ b'0'..=b'7' => self.parse_octet(o), + b'x' => self.parse_unicode_literal(2)?, + b'u' if !self.kind.is_any_bytes() => self.parse_unicode_literal(4)?, + b'U' if !self.kind.is_any_bytes() => self.parse_unicode_literal(8)?, + b'N' if !self.kind.is_any_bytes() => self.parse_unicode_name()?, + // Special cases where the escape sequence is not a single character + b'\n' => return Ok(()), + b'\r' => { + if self.peek_byte() == Some(b'\n') { + self.next_byte(); + } + + return Ok(()); + } + _ => { + string.push('\\'); + + if self.kind.is_any_bytes() && !first_byte.is_ascii() { + return Err(LexicalError { + error: LexicalErrorType::OtherError( + "bytes can only contain ASCII literal characters".to_owned(), + ), + location: self.get_pos(), + }); + } + + first_byte as char + } + }; + + string.push(new_char); + + Ok(()) } fn parse_fstring_middle(&mut self) -> Result { @@ -230,8 +248,8 @@ impl<'a> StringParser<'a> { // This is still an invalid escape sequence, but we don't want to // raise a syntax error as is done by the CPython parser. It might // be supported in the future, refer to point 3: https://peps.python.org/pep-0701/#rejected-ideas - '\\' if !self.kind.is_raw() && self.peek().is_some() => { - value.push_str(&self.parse_escaped_char()?); + '\\' if !self.kind.is_raw() && self.peek_byte().is_some() => { + self.parse_escaped_char(&mut value)?; } // If there are any curly braces inside a `FStringMiddle` token, // then they were escaped (i.e. `{{` or `}}`). This means that @@ -255,7 +273,7 @@ impl<'a> StringParser<'a> { while let Some(ch) = self.next_char() { match ch { '\\' if !self.kind.is_raw() => { - content.push_str(&self.parse_escaped_char()?); + self.parse_escaped_char(&mut content)?; } ch => { if !ch.is_ascii() { @@ -278,16 +296,26 @@ impl<'a> StringParser<'a> { } fn parse_string(&mut self) -> Result { - let mut value = String::new(); let start_location = self.get_pos(); - while let Some(ch) = self.next_char() { - match ch { - '\\' if !self.kind.is_raw() => { - value.push_str(&self.parse_escaped_char()?); - } - ch => value.push(ch), + let mut value = String::new(); + + if self.kind.is_raw() { + value.push_str(self.skip_bytes(self.rest.len())); + } else { + loop { + let Some(escape_idx) = memchr::memchr(b'\\', self.rest.as_bytes()) else { + value.push_str(self.skip_bytes(self.rest.len())); + break; + }; + + let before_with_slash = self.skip_bytes(escape_idx + 1); + let before = &before_with_slash[..before_with_slash.len() - 1]; + + value.push_str(before); + self.parse_escaped_char(&mut value)?; } } + Ok(StringType::Str(StringConstantWithRange { value: StringConstant { value, From 28eba86b41b445461fce36b6fa5d01db2a87329e Mon Sep 17 00:00:00 2001 From: Carter Snook Date: Wed, 25 Oct 2023 15:55:40 -0500 Subject: [PATCH 2/6] fix escape followed by unicode character --- crates/ruff_python_parser/src/string.rs | 42 ++++++++++++------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/crates/ruff_python_parser/src/string.rs b/crates/ruff_python_parser/src/string.rs index 87ebfd179bea5e..94de8b62fd7cb7 100644 --- a/crates/ruff_python_parser/src/string.rs +++ b/crates/ruff_python_parser/src/string.rs @@ -170,32 +170,32 @@ impl<'a> StringParser<'a> { } fn parse_escaped_char(&mut self, string: &mut String) -> Result<(), LexicalError> { - let Some(first_byte) = self.next_byte() else { + let Some(first_char) = self.next_char() else { return Err(LexicalError { error: LexicalErrorType::StringError, location: self.get_pos(), }); }; - let new_char = match first_byte { - b'\\' => '\\', - b'\'' => '\'', - b'\"' => '"', - b'a' => '\x07', - b'b' => '\x08', - b'f' => '\x0c', - b'n' => '\n', - b'r' => '\r', - b't' => '\t', - b'v' => '\x0b', - o @ b'0'..=b'7' => self.parse_octet(o), - b'x' => self.parse_unicode_literal(2)?, - b'u' if !self.kind.is_any_bytes() => self.parse_unicode_literal(4)?, - b'U' if !self.kind.is_any_bytes() => self.parse_unicode_literal(8)?, - b'N' if !self.kind.is_any_bytes() => self.parse_unicode_name()?, + let new_char = match first_char { + '\\' => '\\', + '\'' => '\'', + '\"' => '"', + 'a' => '\x07', + 'b' => '\x08', + 'f' => '\x0c', + 'n' => '\n', + 'r' => '\r', + 't' => '\t', + 'v' => '\x0b', + o @ '0'..='7' => self.parse_octet(o as u8), + 'x' => self.parse_unicode_literal(2)?, + 'u' if !self.kind.is_any_bytes() => self.parse_unicode_literal(4)?, + 'U' if !self.kind.is_any_bytes() => self.parse_unicode_literal(8)?, + 'N' if !self.kind.is_any_bytes() => self.parse_unicode_name()?, // Special cases where the escape sequence is not a single character - b'\n' => return Ok(()), - b'\r' => { + '\n' => return Ok(()), + '\r' => { if self.peek_byte() == Some(b'\n') { self.next_byte(); } @@ -205,7 +205,7 @@ impl<'a> StringParser<'a> { _ => { string.push('\\'); - if self.kind.is_any_bytes() && !first_byte.is_ascii() { + if self.kind.is_any_bytes() && !first_char.is_ascii() { return Err(LexicalError { error: LexicalErrorType::OtherError( "bytes can only contain ASCII literal characters".to_owned(), @@ -214,7 +214,7 @@ impl<'a> StringParser<'a> { }); } - first_byte as char + first_char } }; From 05acb28874b148c6e43fbc8bf465279b9eacf947 Mon Sep 17 00:00:00 2001 From: Carter Snook Date: Wed, 25 Oct 2023 19:51:58 -0500 Subject: [PATCH 3/6] fix panic when unicode name starts with... multi-byte UTF-8 characters --- crates/ruff_python_parser/src/string.rs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/crates/ruff_python_parser/src/string.rs b/crates/ruff_python_parser/src/string.rs index 94de8b62fd7cb7..2bc1c2e6ddecf7 100644 --- a/crates/ruff_python_parser/src/string.rs +++ b/crates/ruff_python_parser/src/string.rs @@ -86,6 +86,10 @@ impl<'a> StringParser<'a> { TextRange::new(start_location, self.location) } + /// Returns the next byte in the string, if there is one. + /// + /// # Panics + /// - When the next byte is a part of a multi-byte character. #[inline] fn next_byte(&mut self) -> Option { self.rest.as_bytes().first().map(|&byte| { @@ -150,7 +154,7 @@ impl<'a> StringParser<'a> { fn parse_unicode_name(&mut self) -> Result { let start_pos = self.get_pos(); - let Some(b'{') = self.next_byte() else { + let Some('{') = self.next_char() else { return Err(LexicalError::new(LexicalErrorType::StringError, start_pos)); }; From af7ef4eaf723a5454029bd1ccf8cbde837fab1ba Mon Sep 17 00:00:00 2001 From: Carter Snook Date: Thu, 26 Oct 2023 21:36:13 -0500 Subject: [PATCH 4/6] push to string after error handling --- crates/ruff_python_parser/src/string.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/ruff_python_parser/src/string.rs b/crates/ruff_python_parser/src/string.rs index 2bc1c2e6ddecf7..8ecbfdb38e27fc 100644 --- a/crates/ruff_python_parser/src/string.rs +++ b/crates/ruff_python_parser/src/string.rs @@ -207,8 +207,6 @@ impl<'a> StringParser<'a> { return Ok(()); } _ => { - string.push('\\'); - if self.kind.is_any_bytes() && !first_char.is_ascii() { return Err(LexicalError { error: LexicalErrorType::OtherError( @@ -218,6 +216,8 @@ impl<'a> StringParser<'a> { }); } + string.push('\\'); + first_char } }; From c57787997ce2a27f0494694a6274d6f35e40f948 Mon Sep 17 00:00:00 2001 From: Carter Snook Date: Thu, 26 Oct 2023 21:43:00 -0500 Subject: [PATCH 5/6] use find() instead of memchr --- Cargo.lock | 1 - crates/ruff_python_parser/Cargo.toml | 1 - crates/ruff_python_parser/src/string.rs | 4 ++-- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b5daeb5aba9b5c..e4e5a1a51c02ca 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2381,7 +2381,6 @@ dependencies = [ "itertools 0.11.0", "lalrpop", "lalrpop-util", - "memchr", "ruff_python_ast", "ruff_text_size", "rustc-hash", diff --git a/crates/ruff_python_parser/Cargo.toml b/crates/ruff_python_parser/Cargo.toml index 46a97eabd43c57..a5d4208623392b 100644 --- a/crates/ruff_python_parser/Cargo.toml +++ b/crates/ruff_python_parser/Cargo.toml @@ -22,7 +22,6 @@ bitflags = { workspace = true } is-macro = { workspace = true } itertools = { workspace = true } lalrpop-util = { version = "0.20.0", default-features = false } -memchr = { workspace = true } unicode-ident = { workspace = true } unicode_names2 = { workspace = true } rustc-hash = { workspace = true } diff --git a/crates/ruff_python_parser/src/string.rs b/crates/ruff_python_parser/src/string.rs index 8ecbfdb38e27fc..6723e1ef94d960 100644 --- a/crates/ruff_python_parser/src/string.rs +++ b/crates/ruff_python_parser/src/string.rs @@ -159,7 +159,7 @@ impl<'a> StringParser<'a> { }; let start_pos = self.get_pos(); - let Some(close_idx) = memchr::memchr(b'}', self.rest.as_bytes()) else { + let Some(close_idx) = self.rest.find('}') else { return Err(LexicalError::new( LexicalErrorType::StringError, self.get_pos(), @@ -307,7 +307,7 @@ impl<'a> StringParser<'a> { value.push_str(self.skip_bytes(self.rest.len())); } else { loop { - let Some(escape_idx) = memchr::memchr(b'\\', self.rest.as_bytes()) else { + let Some(escape_idx) = self.rest.find('\\') else { value.push_str(self.skip_bytes(self.rest.len())); break; }; From 28b823f7c723bdfc95b3728b3747a25e37028d66 Mon Sep 17 00:00:00 2001 From: Carter Snook Date: Fri, 27 Oct 2023 08:40:08 -0500 Subject: [PATCH 6/6] Update crates/ruff_python_parser/src/string.rs Co-authored-by: Dhruv Manilawala --- crates/ruff_python_parser/src/string.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crates/ruff_python_parser/src/string.rs b/crates/ruff_python_parser/src/string.rs index 6723e1ef94d960..406a04950e245b 100644 --- a/crates/ruff_python_parser/src/string.rs +++ b/crates/ruff_python_parser/src/string.rs @@ -89,7 +89,8 @@ impl<'a> StringParser<'a> { /// Returns the next byte in the string, if there is one. /// /// # Panics - /// - When the next byte is a part of a multi-byte character. + /// + /// When the next byte is a part of a multi-byte character. #[inline] fn next_byte(&mut self) -> Option { self.rest.as_bytes().first().map(|&byte| {