Skip to content

Commit

Permalink
First prototype of make_lowercase
Browse files Browse the repository at this point in the history
  • Loading branch information
krtab committed Feb 17, 2025
1 parent c5b0e91 commit fffbb33
Show file tree
Hide file tree
Showing 3 changed files with 172 additions and 0 deletions.
100 changes: 100 additions & 0 deletions library/alloc/src/slice.rs
Original file line number Diff line number Diff line change
Expand Up @@ -703,6 +703,106 @@ impl [u8] {
assert_eq!(read_offset, self.len());
if write_offset < read_offset { Ok(write_offset) } else { Err(queue) }
}

#[rustc_allow_incoherent_impl]
#[unstable(issue = "none", feature = "std_internals")]
#[allow(dead_code)]
/// Lowercases the UTF-8 text stored in `self`, writing the result back in place.
///
/// Lowercased bytes are first pushed to a queue and then copied over the part of
/// `self` that has already been decoded, so unread input is never overwritten.
///
/// Returns `Ok(n)` when the lowercased text is strictly shorter than the input; it
/// then occupies the first `n` bytes of `self`. Otherwise returns `Err(rest)`:
/// every byte of `self` has been overwritten and `rest` (possibly empty) holds the
/// bytes that did not fit and must be appended by the caller.
///
/// # Safety
///
/// `self` must be valid UTF-8.
pub unsafe fn make_utf8_lowercase(&mut self) -> Result<usize, VecDeque<u8>> {
    // For now this is copy pasted from core::str, FIXME: DRY
    fn case_ignorable_then_cased<I: Iterator<Item = char>>(mut iter: I) -> bool {
        use core::unicode::{Case_Ignorable, Cased};
        // The first character that is not case-ignorable decides the answer.
        iter.find(|&c| !Case_Ignorable(c)).map_or(false, Cased)
    }

    // Lowercased bytes that have not been written back into `self` yet.
    let mut pending: VecDeque<u8> = VecDeque::new();
    // Scratch space for the UTF-8 encoding of a single `char`.
    let mut scratch = [0u8; 4];
    // Number of input bytes decoded so far / number of output bytes written so far.
    let mut consumed = 0;
    let mut written = 0;
    let mut sigma_context = FinalSigmaAutomata::new();

    // SAFETY: the caller guarantees `self` is valid UTF-8, and only bytes before
    // `consumed` are ever overwritten, so `self[consumed..]` is still the
    // untouched input.
    while let Some((codepoint, width)) =
        unsafe { core::str::next_code_point_with_width(&mut self[consumed..].iter()) }
    {
        consumed += width;
        // SAFETY: `codepoint` was decoded from the input, which the caller
        // guarantees is valid UTF-8.
        let ch = unsafe { char::from_u32_unchecked(codepoint) };

        if ch == 'Σ' {
            // Σ maps to σ, except at the end of a word where it maps to ς.
            // See core::str::to_lowercase
            // SAFETY: `consumed` sits on a character boundary of the unread,
            // unmodified part of the input.
            let tail = unsafe { core::str::from_utf8_unchecked(&self[consumed..]) };
            let word_final =
                sigma_context.is_accepting() && !case_ignorable_then_cased(tail.chars());
            let lowered = if word_final { 'ς' } else { 'σ' };
            pending.extend(lowered.encode_utf8(&mut scratch).as_bytes());
        } else {
            for lowered in ch.to_lowercase() {
                pending.extend(lowered.encode_utf8(&mut scratch).as_bytes());
            }
        }
        sigma_context.step(ch);

        // Flush as many pending bytes as fit into the already-decoded region.
        while written < consumed {
            let Some(byte) = pending.pop_front() else { break };
            self[written] = byte;
            written += 1;
        }
    }

    assert_eq!(consumed, self.len());
    if written < consumed { Ok(written) } else { Err(pending) }
}
}

/// State of a small automaton fed one `char` at a time, left to right, while
/// lowercasing. It tracks whether the text consumed so far provides the
/// left-hand context a `Σ` needs to be treated as word-final.
#[derive(Clone)]
enum FinalSigmaAutomata {
/// Starting state: no qualifying left-hand context has been recognised.
Init,
/// The consumed text ends with a cased character followed by zero or more
/// case-ignorable characters.
Accepted,
}

impl FinalSigmaAutomata {
fn new() -> Self {
Self::Init
}

fn is_accepting(&self) -> bool {
match self {
FinalSigmaAutomata::Accepted => true,
FinalSigmaAutomata::Init => false,
}
}

fn step(&mut self, c: char) {
use core::unicode::{Case_Ignorable, Cased};

use FinalSigmaAutomata::*;
*self = match self {
Init => {
if Cased(c) {
Accepted
} else {
Init
}
}
Accepted => {
if Cased(c) || Case_Ignorable(c) {
Accepted
} else {
Init
}
}
}
}
}

#[cfg(not(test))]
Expand Down
12 changes: 12 additions & 0 deletions library/alloc/src/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1139,6 +1139,18 @@ impl String {
*self = unsafe { Self::from_utf8_unchecked(v) }
}

#[unstable(feature = "string_make_uplowercase", issue = "135885")]
#[allow(missing_docs)]
pub fn make_lowercase(&mut self) {
    // Take the byte buffer out of `self`, leaving an empty string behind, so the
    // buffer can be edited as raw bytes.
    let mut bytes = core::mem::take(self).into_bytes();
    // SAFETY: `bytes` comes from a `String`, so it is valid UTF-8.
    let outcome = unsafe { bytes.make_utf8_lowercase() };
    match outcome {
        // The lowercase text is shorter: drop the stale tail.
        Ok(new_len) => bytes.truncate(new_len),
        // The lowercase text did not fit: append the bytes that overflowed.
        Err(overflow) => bytes.extend(overflow),
    }
    // SAFETY: `make_utf8_lowercase` produced the UTF-8 encoding of the lowercased
    // text, split between the buffer and the overflow that was just appended.
    *self = unsafe { Self::from_utf8_unchecked(bytes) }
}

/// Copies elements from `src` range to the end of the string.
///
/// # Panics
Expand Down
60 changes: 60 additions & 0 deletions library/alloc/tests/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -923,3 +923,63 @@ fn make_uppercase() {
test("ⱥⱥⱥⱥⱥⱥⱥⱥⱥⱥ");
test("aéDžßfiᾀ");
}

#[test]
fn make_lowercase() {
    /// Checks that the in-place `make_lowercase` agrees with `str::to_lowercase` on `s`.
    fn check(s: &str) {
        let ground_truth = s.to_lowercase();
        let mut tested = String::from(s);
        tested.make_lowercase();
        assert!(
            tested == ground_truth,
            r#"When lowercased "{s}" gave "{tested}" while "{ground_truth}" was expected"#
        );
    }

    let basic_cases = ["", "AÉDžaé "];
    for case in basic_cases {
        check(case);
    }

    // https://github.com/rust-lang/rust/issues/26035
    let sigma_cases = [
        "ΑΣ",
        "Α'Σ",
        "Α''Σ",
        "ΑΣ Α",
        "Α'Σ Α",
        "Α''Σ Α",
        "ΑΣ' Α",
        "ΑΣ'' Α",
        "Α'Σ' Α",
        "Α''Σ'' Α",
        "Α Σ",
        "Α 'Σ",
        "Α ''Σ",
        "Σ",
        "'Σ",
        "''Σ",
        "ΑΣΑ",
        "ΑΣ'Α",
        "ΑΣ''Α",
    ];
    for case in sigma_cases {
        check(case);
    }

    // https://github.com/rust-lang/rust/issues/124714
    // Input lengths around the boundary of the chunk size used by the ASCII prefix
    // optimization.
    let chunk_boundary_cases = ["abcdefghijklmnoΣ", "abcdefghijklmnopΣ", "abcdefghijklmnopqΣ"];
    for case in chunk_boundary_cases {
        check(case);
    }

    // A really long string whose lowercase form is even longer. This tests that
    // implementations don't assume an incorrect upper bound on allocations.
    let growing = "İ".repeat(512);
    check(&growing);

    // A really long ASCII-only string. This tests that the ASCII hot path
    // functions correctly.
    let ascii_only = "A".repeat(511);
    check(&ascii_only);
}

0 comments on commit fffbb33

Please sign in to comment.