From fffbb33cad812a9677483ee0c7ae2636e80ea862 Mon Sep 17 00:00:00 2001
From: Arthur Carcano
Date: Mon, 17 Feb 2025 19:06:14 +0100
Subject: [PATCH] First prototype of make_lowercase

---
 library/alloc/src/slice.rs    | 100 ++++++++++++++++++++++++++++++++++
 library/alloc/src/string.rs   |  12 ++++
 library/alloc/tests/string.rs |  60 ++++++++++++++++++++
 3 files changed, 172 insertions(+)

diff --git a/library/alloc/src/slice.rs b/library/alloc/src/slice.rs
index 2e1e3f0f989a0..0c23d7cc0b071 100644
--- a/library/alloc/src/slice.rs
+++ b/library/alloc/src/slice.rs
@@ -703,6 +703,106 @@ impl [u8] {
         assert_eq!(read_offset, self.len());
         if write_offset < read_offset { Ok(write_offset) } else { Err(queue) }
     }
+
+    #[rustc_allow_incoherent_impl]
+    #[unstable(issue = "none", feature = "std_internals")]
+    #[allow(dead_code)]
+    /// Safety:
+    /// - Must be UTF-8
+    pub unsafe fn make_utf8_lowercase(&mut self) -> Result<usize, VecDeque<u8>> {
+        let mut queue = VecDeque::new();
+
+        let mut read_offset = 0;
+        let mut write_offset = 0;
+
+        let mut buffer = [0; 4];
+        let mut final_sigma_automata = FinalSigmaAutomata::new();
+        while let Some((codepoint, width)) =
+            unsafe { core::str::next_code_point_with_width(&mut self[read_offset..].iter()) }
+        {
+            read_offset += width;
+            let uppercase_char = unsafe { char::from_u32_unchecked(codepoint) };
+            if uppercase_char == 'Σ' {
+                // Σ maps to σ, except at the end of a word where it maps to ς.
+                // See core::str::to_lowercase
+                let rest = unsafe { core::str::from_utf8_unchecked(&self[read_offset..]) };
+                let is_word_final =
+                    final_sigma_automata.is_accepting() && !case_ignorable_then_cased(rest.chars());
+                let sigma_lowercase = if is_word_final { 'ς' } else { 'σ' };
+                let l = sigma_lowercase.len_utf8();
+                sigma_lowercase.encode_utf8(&mut buffer);
+                queue.extend(&buffer[..l]);
+            } else {
+                for c in uppercase_char.to_lowercase() {
+                    let l = c.len_utf8();
+                    c.encode_utf8(&mut buffer);
+                    queue.extend(&buffer[..l]);
+                }
+            }
+            final_sigma_automata.step(uppercase_char);
+            while write_offset < read_offset {
+                match queue.pop_front() {
+                    Some(b) => {
+                        self[write_offset] = b;
+                        write_offset += 1;
+                    }
+                    None => break,
+                }
+            }
+        }
+        assert_eq!(read_offset, self.len());
+        return if write_offset < read_offset { Ok(write_offset) } else { Err(queue) };
+
+        // For now this is copy pasted from core::str, FIXME: DRY
+        fn case_ignorable_then_cased<I: Iterator<Item = char>>(iter: I) -> bool {
+            use core::unicode::{Case_Ignorable, Cased};
+            match iter.skip_while(|&c| Case_Ignorable(c)).next() {
+                Some(c) => Cased(c),
+                None => false,
+            }
+        }
+    }
+}
+
+#[derive(Clone)]
+enum FinalSigmaAutomata {
+    Init,
+    Accepted,
+}
+
+impl FinalSigmaAutomata {
+    fn new() -> Self {
+        Self::Init
+    }
+
+    fn is_accepting(&self) -> bool {
+        match self {
+            FinalSigmaAutomata::Accepted => true,
+            FinalSigmaAutomata::Init => false,
+        }
+    }
+
+    fn step(&mut self, c: char) {
+        use core::unicode::{Case_Ignorable, Cased};
+
+        use FinalSigmaAutomata::*;
+        *self = match self {
+            Init => {
+                if Cased(c) {
+                    Accepted
+                } else {
+                    Init
+                }
+            }
+            Accepted => {
+                if Cased(c) || Case_Ignorable(c) {
+                    Accepted
+                } else {
+                    Init
+                }
+            }
+        }
+    }
 }
 
 #[cfg(not(test))]
diff --git a/library/alloc/src/string.rs b/library/alloc/src/string.rs
index 99026353455b1..889bfba6e0474 100644
--- a/library/alloc/src/string.rs
+++ b/library/alloc/src/string.rs
@@ -1139,6 +1139,18 @@ impl String {
         *self = unsafe { Self::from_utf8_unchecked(v) }
     }
 
+    #[unstable(feature = "string_make_uplowercase", issue = "135885")]
+    #[allow(missing_docs)]
+    pub fn make_lowercase(&mut self) {
+        let mut v = core::mem::take(self).vec;
+        let res = unsafe { v.make_utf8_lowercase() };
+        match res {
+            Ok(n) => v.truncate(n),
+            Err(queue) => v.extend(queue),
+        }
+        *self = unsafe { Self::from_utf8_unchecked(v) }
+    }
+
     /// Copies elements from `src` range to the end of the string.
     ///
     /// # Panics
diff --git a/library/alloc/tests/string.rs b/library/alloc/tests/string.rs
index c5c188fbb9240..b8a89e1a504b7 100644
--- a/library/alloc/tests/string.rs
+++ b/library/alloc/tests/string.rs
@@ -923,3 +923,63 @@ fn make_uppercase() {
     test("ⱥⱥⱥⱥⱥⱥⱥⱥⱥⱥ");
     test("aéDžßfiᾀ");
 }
+
+#[test]
+fn make_lowercase() {
+    fn test(s: &str) {
+        let ground_truth = s.to_lowercase();
+        let mut tested = s.to_owned();
+        tested.make_lowercase();
+        assert!(
+            tested == ground_truth,
+            r#"When lowercased "{s}" gave "{tested}" while "{ground_truth}" was expected"#
+        );
+    }
+    test("");
+    test("AÉDžaé ");
+
+    // https://github.com/rust-lang/rust/issues/26035
+    test("ΑΣ");
+    test("Α'Σ");
+    test("Α''Σ");
+
+    test("ΑΣ Α");
+    test("Α'Σ Α");
+    test("Α''Σ Α");
+
+    test("ΑΣ' Α");
+    test("ΑΣ'' Α");
+
+    test("Α'Σ' Α");
+    test("Α''Σ'' Α");
+
+    test("Α Σ");
+    test("Α 'Σ");
+    test("Α ''Σ");
+
+    test("Σ");
+    test("'Σ");
+    test("''Σ");
+
+    test("ΑΣΑ");
+    test("ΑΣ'Α");
+    test("ΑΣ''Α");
+
+    // https://github.com/rust-lang/rust/issues/124714
+    // input lengths around the boundary of the chunk size used by the ascii prefix optimization
+    test("abcdefghijklmnoΣ");
+    test("abcdefghijklmnopΣ");
+    test("abcdefghijklmnopqΣ");
+
+    // a really long string that has it's lowercase form
+    // even longer. this tests that implementations don't assume
+    // an incorrect upper bound on allocations
+    let upper = str::repeat("İ", 512);
+    test(&upper);
+
+    // a really long ascii-only string.
+    // This test that the ascii hot-path
+    // functions correctly
+    let upper = str::repeat("A", 511);
+    test(&upper);
+}