From 893cf427c8044f31bc701238cd296451fd811900 Mon Sep 17 00:00:00 2001
From: Carter Snook <cartersnook04@gmail.com>
Date: Wed, 25 Oct 2023 15:36:05 -0500
Subject: [PATCH 1/6] perf(parser): use faster string parsing

This makes use of memchr for parsing strings. It sadly does introduce
one use of `unsafe` to create a string that is valid to pass into
`u32::from_str_radix` because I was unable to find another method that
does not require far more code than required with `unsafe`.
---
 Cargo.lock                              |   1 +
 crates/ruff_python_parser/Cargo.toml    |   1 +
 crates/ruff_python_parser/src/string.rs | 226 +++++++++++++-----------
 3 files changed, 129 insertions(+), 99 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index e4e5a1a51c02ca..b5daeb5aba9b5c 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2381,6 +2381,7 @@ dependencies = [
  "itertools 0.11.0",
  "lalrpop",
  "lalrpop-util",
+ "memchr",
  "ruff_python_ast",
  "ruff_text_size",
  "rustc-hash",
diff --git a/crates/ruff_python_parser/Cargo.toml b/crates/ruff_python_parser/Cargo.toml
index a5d4208623392b..46a97eabd43c57 100644
--- a/crates/ruff_python_parser/Cargo.toml
+++ b/crates/ruff_python_parser/Cargo.toml
@@ -22,6 +22,7 @@ bitflags = { workspace = true }
 is-macro = { workspace = true }
 itertools = { workspace = true }
 lalrpop-util = { version = "0.20.0", default-features = false }
+memchr = { workspace = true }
 unicode-ident = { workspace = true }
 unicode_names2 = { workspace = true }
 rustc-hash = { workspace = true }
diff --git a/crates/ruff_python_parser/src/string.rs b/crates/ruff_python_parser/src/string.rs
index 81cbe2f9a1ce34..87ebfd179bea5e 100644
--- a/crates/ruff_python_parser/src/string.rs
+++ b/crates/ruff_python_parser/src/string.rs
@@ -6,9 +6,6 @@ use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
 use crate::lexer::{LexicalError, LexicalErrorType};
 use crate::token::{StringKind, Tok};
 
-// unicode_name2 does not expose `MAX_NAME_LENGTH`, so we replicate that constant here, fix #3798
-const MAX_UNICODE_NAME: usize = 88;
-
 pub(crate) struct StringConstantWithRange {
     value: StringConstant,
     range: TextRange,
@@ -57,7 +54,7 @@ impl StringType {
 }
 
 struct StringParser<'a> {
-    chars: std::str::Chars<'a>,
+    rest: &'a str,
     kind: StringKind,
     location: TextSize,
 }
@@ -65,22 +62,18 @@ struct StringParser<'a> {
 impl<'a> StringParser<'a> {
     fn new(source: &'a str, kind: StringKind, start: TextSize) -> Self {
         Self {
-            chars: source.chars(),
+            rest: source,
             kind,
             location: start,
         }
     }
 
     #[inline]
-    fn next_char(&mut self) -> Option<char> {
-        let c = self.chars.next()?;
-        self.location += c.text_len();
-        Some(c)
-    }
-
-    #[inline]
-    fn peek(&mut self) -> Option<char> {
-        self.chars.clone().next()
+    fn skip_bytes(&mut self, bytes: usize) -> &'a str {
+        let skipped_str = &self.rest[..bytes];
+        self.rest = &self.rest[bytes..];
+        self.location += skipped_str.text_len();
+        skipped_str
     }
 
     #[inline]
@@ -93,6 +86,29 @@ impl<'a> StringParser<'a> {
         TextRange::new(start_location, self.location)
     }
 
+    #[inline]
+    fn next_byte(&mut self) -> Option<u8> {
+        self.rest.as_bytes().first().map(|&byte| {
+            self.rest = &self.rest[1..];
+            self.location += TextSize::new(1);
+            byte
+        })
+    }
+
+    #[inline]
+    fn next_char(&mut self) -> Option<char> {
+        self.rest.chars().next().map(|c| {
+            self.rest = &self.rest[c.len_utf8()..];
+            self.location += c.text_len();
+            c
+        })
+    }
+
+    #[inline]
+    fn peek_byte(&self) -> Option<u8> {
+        self.rest.as_bytes().first().copied()
+    }
+
     fn parse_unicode_literal(&mut self, literal_number: usize) -> Result<char, LexicalError> {
         let mut p: u32 = 0u32;
         let unicode_error = LexicalError::new(LexicalErrorType::UnicodeError, self.get_pos());
@@ -110,99 +126,101 @@ impl<'a> StringParser<'a> {
             _ => std::char::from_u32(p).ok_or(unicode_error),
         }
     }
+    fn parse_octet(&mut self, o: u8) -> char {
+        let mut radix_bytes = [o, 0, 0];
+        let mut len = 1;
 
-    fn parse_octet(&mut self, first: char) -> char {
-        let mut octet_content = String::new();
-        octet_content.push(first);
-        while octet_content.len() < 3 {
-            if let Some('0'..='7') = self.peek() {
-                octet_content.push(self.next_char().unwrap());
-            } else {
+        while len < 3 {
+            let Some(b'0'..=b'8') = self.peek_byte() else {
                 break;
-            }
+            };
+
+            radix_bytes[len] = self.next_byte().unwrap();
+            len += 1;
         }
-        let value = u32::from_str_radix(&octet_content, 8).unwrap();
+
+        // SAFETY: radix_bytes is always going to be in the ASCII range.
+        #[allow(unsafe_code)]
+        let radix_str = unsafe { std::str::from_utf8_unchecked(&radix_bytes[..len]) };
+
+        let value = u32::from_str_radix(radix_str, 8).unwrap();
         char::from_u32(value).unwrap()
     }
 
     fn parse_unicode_name(&mut self) -> Result<char, LexicalError> {
         let start_pos = self.get_pos();
-        match self.next_char() {
-            Some('{') => {}
-            _ => return Err(LexicalError::new(LexicalErrorType::StringError, start_pos)),
-        }
-        let start_pos = self.get_pos();
-        let mut name = String::new();
-        loop {
-            match self.next_char() {
-                Some('}') => break,
-                Some(c) => name.push(c),
-                None => {
-                    return Err(LexicalError::new(
-                        LexicalErrorType::StringError,
-                        self.get_pos(),
-                    ))
-                }
-            }
-        }
 
-        if name.len() > MAX_UNICODE_NAME {
+        let Some(b'{') = self.next_byte() else {
+            return Err(LexicalError::new(LexicalErrorType::StringError, start_pos));
+        };
+
+        let start_pos = self.get_pos();
+        let Some(close_idx) = memchr::memchr(b'}', self.rest.as_bytes()) else {
             return Err(LexicalError::new(
-                LexicalErrorType::UnicodeError,
+                LexicalErrorType::StringError,
                 self.get_pos(),
             ));
-        }
+        };
+
+        let name_and_ending = self.skip_bytes(close_idx + 1);
+        let name = &name_and_ending[..name_and_ending.len() - 1];
 
-        unicode_names2::character(&name)
+        unicode_names2::character(name)
             .ok_or_else(|| LexicalError::new(LexicalErrorType::UnicodeError, start_pos))
     }
 
-    fn parse_escaped_char(&mut self) -> Result<String, LexicalError> {
-        match self.next_char() {
-            Some(c) => {
-                let char = match c {
-                    '\\' => '\\',
-                    '\'' => '\'',
-                    '\"' => '"',
-                    'a' => '\x07',
-                    'b' => '\x08',
-                    'f' => '\x0c',
-                    'n' => '\n',
-                    'r' => '\r',
-                    't' => '\t',
-                    'v' => '\x0b',
-                    o @ '0'..='7' => self.parse_octet(o),
-                    'x' => self.parse_unicode_literal(2)?,
-                    'u' if !self.kind.is_any_bytes() => self.parse_unicode_literal(4)?,
-                    'U' if !self.kind.is_any_bytes() => self.parse_unicode_literal(8)?,
-                    'N' if !self.kind.is_any_bytes() => self.parse_unicode_name()?,
-                    // Special cases where the escape sequence is not a single character
-                    '\n' => return Ok(String::new()),
-                    '\r' => {
-                        if self.peek() == Some('\n') {
-                            self.next_char();
-                        }
-                        return Ok(String::new());
-                    }
-                    c => {
-                        if self.kind.is_any_bytes() && !c.is_ascii() {
-                            return Err(LexicalError {
-                                error: LexicalErrorType::OtherError(
-                                    "bytes can only contain ASCII literal characters".to_owned(),
-                                ),
-                                location: self.get_pos(),
-                            });
-                        }
-                        return Ok(format!("\\{c}"));
-                    }
-                };
-                Ok(char.to_string())
-            }
-            None => Err(LexicalError {
+    fn parse_escaped_char(&mut self, string: &mut String) -> Result<(), LexicalError> {
+        let Some(first_byte) = self.next_byte() else {
+            return Err(LexicalError {
                 error: LexicalErrorType::StringError,
                 location: self.get_pos(),
-            }),
-        }
+            });
+        };
+
+        let new_char = match first_byte {
+            b'\\' => '\\',
+            b'\'' => '\'',
+            b'\"' => '"',
+            b'a' => '\x07',
+            b'b' => '\x08',
+            b'f' => '\x0c',
+            b'n' => '\n',
+            b'r' => '\r',
+            b't' => '\t',
+            b'v' => '\x0b',
+            o @ b'0'..=b'7' => self.parse_octet(o),
+            b'x' => self.parse_unicode_literal(2)?,
+            b'u' if !self.kind.is_any_bytes() => self.parse_unicode_literal(4)?,
+            b'U' if !self.kind.is_any_bytes() => self.parse_unicode_literal(8)?,
+            b'N' if !self.kind.is_any_bytes() => self.parse_unicode_name()?,
+            // Special cases where the escape sequence is not a single character
+            b'\n' => return Ok(()),
+            b'\r' => {
+                if self.peek_byte() == Some(b'\n') {
+                    self.next_byte();
+                }
+
+                return Ok(());
+            }
+            _ => {
+                string.push('\\');
+
+                if self.kind.is_any_bytes() && !first_byte.is_ascii() {
+                    return Err(LexicalError {
+                        error: LexicalErrorType::OtherError(
+                            "bytes can only contain ASCII literal characters".to_owned(),
+                        ),
+                        location: self.get_pos(),
+                    });
+                }
+
+                first_byte as char
+            }
+        };
+
+        string.push(new_char);
+
+        Ok(())
     }
 
     fn parse_fstring_middle(&mut self) -> Result<Expr, LexicalError> {
@@ -230,8 +248,8 @@ impl<'a> StringParser<'a> {
                 // This is still an invalid escape sequence, but we don't want to
                 // raise a syntax error as is done by the CPython parser. It might
                 // be supported in the future, refer to point 3: https://peps.python.org/pep-0701/#rejected-ideas
-                '\\' if !self.kind.is_raw() && self.peek().is_some() => {
-                    value.push_str(&self.parse_escaped_char()?);
+                '\\' if !self.kind.is_raw() && self.peek_byte().is_some() => {
+                    self.parse_escaped_char(&mut value)?;
                 }
                 // If there are any curly braces inside a `FStringMiddle` token,
                 // then they were escaped (i.e. `{{` or `}}`). This means that
@@ -255,7 +273,7 @@ impl<'a> StringParser<'a> {
         while let Some(ch) = self.next_char() {
             match ch {
                 '\\' if !self.kind.is_raw() => {
-                    content.push_str(&self.parse_escaped_char()?);
+                    self.parse_escaped_char(&mut content)?;
                 }
                 ch => {
                     if !ch.is_ascii() {
@@ -278,16 +296,26 @@ impl<'a> StringParser<'a> {
     }
 
     fn parse_string(&mut self) -> Result<StringType, LexicalError> {
-        let mut value = String::new();
         let start_location = self.get_pos();
-        while let Some(ch) = self.next_char() {
-            match ch {
-                '\\' if !self.kind.is_raw() => {
-                    value.push_str(&self.parse_escaped_char()?);
-                }
-                ch => value.push(ch),
+        let mut value = String::new();
+
+        if self.kind.is_raw() {
+            value.push_str(self.skip_bytes(self.rest.len()));
+        } else {
+            loop {
+                let Some(escape_idx) = memchr::memchr(b'\\', self.rest.as_bytes()) else {
+                    value.push_str(self.skip_bytes(self.rest.len()));
+                    break;
+                };
+
+                let before_with_slash = self.skip_bytes(escape_idx + 1);
+                let before = &before_with_slash[..before_with_slash.len() - 1];
+
+                value.push_str(before);
+                self.parse_escaped_char(&mut value)?;
             }
         }
+
         Ok(StringType::Str(StringConstantWithRange {
             value: StringConstant {
                 value,

From 28eba86b41b445461fce36b6fa5d01db2a87329e Mon Sep 17 00:00:00 2001
From: Carter Snook <cartersnook04@gmail.com>
Date: Wed, 25 Oct 2023 15:55:40 -0500
Subject: [PATCH 2/6] fix escape followed by unicode character

---
 crates/ruff_python_parser/src/string.rs | 42 ++++++++++++-------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/crates/ruff_python_parser/src/string.rs b/crates/ruff_python_parser/src/string.rs
index 87ebfd179bea5e..94de8b62fd7cb7 100644
--- a/crates/ruff_python_parser/src/string.rs
+++ b/crates/ruff_python_parser/src/string.rs
@@ -170,32 +170,32 @@ impl<'a> StringParser<'a> {
     }
 
     fn parse_escaped_char(&mut self, string: &mut String) -> Result<(), LexicalError> {
-        let Some(first_byte) = self.next_byte() else {
+        let Some(first_char) = self.next_char() else {
             return Err(LexicalError {
                 error: LexicalErrorType::StringError,
                 location: self.get_pos(),
             });
         };
 
-        let new_char = match first_byte {
-            b'\\' => '\\',
-            b'\'' => '\'',
-            b'\"' => '"',
-            b'a' => '\x07',
-            b'b' => '\x08',
-            b'f' => '\x0c',
-            b'n' => '\n',
-            b'r' => '\r',
-            b't' => '\t',
-            b'v' => '\x0b',
-            o @ b'0'..=b'7' => self.parse_octet(o),
-            b'x' => self.parse_unicode_literal(2)?,
-            b'u' if !self.kind.is_any_bytes() => self.parse_unicode_literal(4)?,
-            b'U' if !self.kind.is_any_bytes() => self.parse_unicode_literal(8)?,
-            b'N' if !self.kind.is_any_bytes() => self.parse_unicode_name()?,
+        let new_char = match first_char {
+            '\\' => '\\',
+            '\'' => '\'',
+            '\"' => '"',
+            'a' => '\x07',
+            'b' => '\x08',
+            'f' => '\x0c',
+            'n' => '\n',
+            'r' => '\r',
+            't' => '\t',
+            'v' => '\x0b',
+            o @ '0'..='7' => self.parse_octet(o as u8),
+            'x' => self.parse_unicode_literal(2)?,
+            'u' if !self.kind.is_any_bytes() => self.parse_unicode_literal(4)?,
+            'U' if !self.kind.is_any_bytes() => self.parse_unicode_literal(8)?,
+            'N' if !self.kind.is_any_bytes() => self.parse_unicode_name()?,
             // Special cases where the escape sequence is not a single character
-            b'\n' => return Ok(()),
-            b'\r' => {
+            '\n' => return Ok(()),
+            '\r' => {
                 if self.peek_byte() == Some(b'\n') {
                     self.next_byte();
                 }
@@ -205,7 +205,7 @@ impl<'a> StringParser<'a> {
             _ => {
                 string.push('\\');
 
-                if self.kind.is_any_bytes() && !first_byte.is_ascii() {
+                if self.kind.is_any_bytes() && !first_char.is_ascii() {
                     return Err(LexicalError {
                         error: LexicalErrorType::OtherError(
                             "bytes can only contain ASCII literal characters".to_owned(),
@@ -214,7 +214,7 @@ impl<'a> StringParser<'a> {
                     });
                 }
 
-                first_byte as char
+                first_char
             }
         };
 

From 05acb28874b148c6e43fbc8bf465279b9eacf947 Mon Sep 17 00:00:00 2001
From: Carter Snook <cartersnook04@gmail.com>
Date: Wed, 25 Oct 2023 19:51:58 -0500
Subject: [PATCH 3/6] fix panic when unicode name starts with...

multi-byte UTF-8 characters
---
 crates/ruff_python_parser/src/string.rs | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/crates/ruff_python_parser/src/string.rs b/crates/ruff_python_parser/src/string.rs
index 94de8b62fd7cb7..2bc1c2e6ddecf7 100644
--- a/crates/ruff_python_parser/src/string.rs
+++ b/crates/ruff_python_parser/src/string.rs
@@ -86,6 +86,10 @@ impl<'a> StringParser<'a> {
         TextRange::new(start_location, self.location)
     }
 
+    /// Returns the next byte in the string, if there is one.
+    ///
+    /// # Panics
+    ///  - When the next byte is a part of a multi-byte character.
     #[inline]
     fn next_byte(&mut self) -> Option<u8> {
         self.rest.as_bytes().first().map(|&byte| {
@@ -150,7 +154,7 @@ impl<'a> StringParser<'a> {
     fn parse_unicode_name(&mut self) -> Result<char, LexicalError> {
         let start_pos = self.get_pos();
 
-        let Some(b'{') = self.next_byte() else {
+        let Some('{') = self.next_char() else {
             return Err(LexicalError::new(LexicalErrorType::StringError, start_pos));
         };
 

From af7ef4eaf723a5454029bd1ccf8cbde837fab1ba Mon Sep 17 00:00:00 2001
From: Carter Snook <cartersnook04@gmail.com>
Date: Thu, 26 Oct 2023 21:36:13 -0500
Subject: [PATCH 4/6] push to string after error handling

---
 crates/ruff_python_parser/src/string.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/crates/ruff_python_parser/src/string.rs b/crates/ruff_python_parser/src/string.rs
index 2bc1c2e6ddecf7..8ecbfdb38e27fc 100644
--- a/crates/ruff_python_parser/src/string.rs
+++ b/crates/ruff_python_parser/src/string.rs
@@ -207,8 +207,6 @@ impl<'a> StringParser<'a> {
                 return Ok(());
             }
             _ => {
-                string.push('\\');
-
                 if self.kind.is_any_bytes() && !first_char.is_ascii() {
                     return Err(LexicalError {
                         error: LexicalErrorType::OtherError(
@@ -218,6 +216,8 @@ impl<'a> StringParser<'a> {
                     });
                 }
 
+                string.push('\\');
+
                 first_char
             }
         };

From c57787997ce2a27f0494694a6274d6f35e40f948 Mon Sep 17 00:00:00 2001
From: Carter Snook <cartersnook04@gmail.com>
Date: Thu, 26 Oct 2023 21:43:00 -0500
Subject: [PATCH 5/6] use find() instead of memchr

---
 Cargo.lock                              | 1 -
 crates/ruff_python_parser/Cargo.toml    | 1 -
 crates/ruff_python_parser/src/string.rs | 4 ++--
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index b5daeb5aba9b5c..e4e5a1a51c02ca 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2381,7 +2381,6 @@ dependencies = [
  "itertools 0.11.0",
  "lalrpop",
  "lalrpop-util",
- "memchr",
  "ruff_python_ast",
  "ruff_text_size",
  "rustc-hash",
diff --git a/crates/ruff_python_parser/Cargo.toml b/crates/ruff_python_parser/Cargo.toml
index 46a97eabd43c57..a5d4208623392b 100644
--- a/crates/ruff_python_parser/Cargo.toml
+++ b/crates/ruff_python_parser/Cargo.toml
@@ -22,7 +22,6 @@ bitflags = { workspace = true }
 is-macro = { workspace = true }
 itertools = { workspace = true }
 lalrpop-util = { version = "0.20.0", default-features = false }
-memchr = { workspace = true }
 unicode-ident = { workspace = true }
 unicode_names2 = { workspace = true }
 rustc-hash = { workspace = true }
diff --git a/crates/ruff_python_parser/src/string.rs b/crates/ruff_python_parser/src/string.rs
index 8ecbfdb38e27fc..6723e1ef94d960 100644
--- a/crates/ruff_python_parser/src/string.rs
+++ b/crates/ruff_python_parser/src/string.rs
@@ -159,7 +159,7 @@ impl<'a> StringParser<'a> {
         };
 
         let start_pos = self.get_pos();
-        let Some(close_idx) = memchr::memchr(b'}', self.rest.as_bytes()) else {
+        let Some(close_idx) = self.rest.find('}') else {
             return Err(LexicalError::new(
                 LexicalErrorType::StringError,
                 self.get_pos(),
@@ -307,7 +307,7 @@ impl<'a> StringParser<'a> {
             value.push_str(self.skip_bytes(self.rest.len()));
         } else {
             loop {
-                let Some(escape_idx) = memchr::memchr(b'\\', self.rest.as_bytes()) else {
+                let Some(escape_idx) = self.rest.find('\\') else {
                     value.push_str(self.skip_bytes(self.rest.len()));
                     break;
                 };

From 28b823f7c723bdfc95b3728b3747a25e37028d66 Mon Sep 17 00:00:00 2001
From: Carter Snook <cartersnook04@gmail.com>
Date: Fri, 27 Oct 2023 08:40:08 -0500
Subject: [PATCH 6/6] Update crates/ruff_python_parser/src/string.rs

Co-authored-by: Dhruv Manilawala <dhruvmanila@gmail.com>
---
 crates/ruff_python_parser/src/string.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/crates/ruff_python_parser/src/string.rs b/crates/ruff_python_parser/src/string.rs
index 6723e1ef94d960..406a04950e245b 100644
--- a/crates/ruff_python_parser/src/string.rs
+++ b/crates/ruff_python_parser/src/string.rs
@@ -89,7 +89,8 @@ impl<'a> StringParser<'a> {
     /// Returns the next byte in the string, if there is one.
     ///
     /// # Panics
-    ///  - When the next byte is a part of a multi-byte character.
+    ///
+    /// When the next byte is a part of a multi-byte character.
     #[inline]
     fn next_byte(&mut self) -> Option<u8> {
         self.rest.as_bytes().first().map(|&byte| {