Skip to content

Commit

Permalink
First prototype of make_uppercase
Browse files Browse the repository at this point in the history
  • Loading branch information
krtab committed Feb 17, 2025
1 parent 252b07b commit c5b0e91
Show file tree
Hide file tree
Showing 6 changed files with 97 additions and 5 deletions.
40 changes: 40 additions & 0 deletions library/alloc/src/slice.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ use crate::alloc::Global;
#[cfg(not(no_global_oom_handling))]
use crate::borrow::ToOwned;
use crate::boxed::Box;
use crate::collections::VecDeque;
use crate::vec::Vec;

// HACK(japaric): With cfg(test) `impl [T]` is not available, these three
Expand Down Expand Up @@ -665,6 +666,45 @@ impl<T> [T] {
}
}

#[cfg(not(test))]
impl [u8] {
    /// Uppercases the UTF-8 text stored in this slice, writing the result back
    /// into the slice for as long as it fits.
    ///
    /// Returns `Ok(n)` when the uppercased text is strictly shorter than the
    /// slice: only the first `n` bytes are meaningful afterwards, the tail past
    /// `n` still holds stale bytes. Otherwise returns `Err(rest)`: the whole
    /// slice has been overwritten and `rest` holds the bytes of the uppercased
    /// text that did not fit (it is empty when the length is unchanged).
    ///
    /// # Safety
    ///
    /// The slice must contain valid UTF-8.
    ///
    /// # Panics
    ///
    /// Panics if decoding did not consume exactly `self.len()` bytes.
    #[rustc_allow_incoherent_impl]
    #[unstable(issue = "none", feature = "std_internals")]
    #[allow(dead_code)]
    pub unsafe fn make_utf8_uppercase(&mut self) -> Result<usize, VecDeque<u8>> {
        // Uppercased bytes that have been produced but not yet written back.
        let mut pending: VecDeque<u8> = VecDeque::new();
        // Bytes of the original text decoded so far.
        let mut consumed: usize = 0;
        // Bytes of the uppercased text stored back into `self` so far.
        let mut written: usize = 0;
        let mut scratch = [0u8; 4];

        loop {
            // SAFETY: the caller guarantees valid UTF-8. `consumed` only advances
            // by whole code-point widths and writes only touch indices below
            // `consumed`, so the remaining bytes are untouched, valid UTF-8.
            let step =
                unsafe { core::str::next_code_point_with_width(&mut self[consumed..].iter()) };
            let (code_point, width) = match step {
                Some(decoded) => decoded,
                None => break,
            };
            consumed += width;

            // SAFETY: the value was decoded from valid UTF-8 (caller contract).
            let original = unsafe { char::from_u32_unchecked(code_point) };
            for upper in original.to_uppercase() {
                pending.extend(upper.encode_utf8(&mut scratch).as_bytes());
            }

            // Write back as much as is available without overtaking the reader.
            let flushable = usize::min(consumed - written, pending.len());
            for byte in pending.drain(..flushable) {
                self[written] = byte;
                written += 1;
            }
        }

        assert_eq!(consumed, self.len());
        if written < consumed { Ok(written) } else { Err(pending) }
    }
}

#[cfg(not(test))]
impl [u8] {
/// Returns a vector containing a copy of this slice where each byte
Expand Down
12 changes: 12 additions & 0 deletions library/alloc/src/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1127,6 +1127,18 @@ impl String {
self.vec.extend_from_slice(string.as_bytes())
}

/// Converts this string to uppercase in place.
///
/// The existing buffer is rewritten by `make_utf8_uppercase`; it is then
/// truncated when the result is shorter, or extended with the bytes that did
/// not fit when the result is longer.
#[unstable(feature = "string_make_uplowercase", issue = "135885")]
#[allow(missing_docs)]
pub fn make_uppercase(&mut self) {
    // Take the buffer out so `self` is a valid (empty) string while the bytes
    // are being rewritten.
    let mut bytes = core::mem::take(self).vec;
    // SAFETY: `bytes` was the content of a `String`, hence valid UTF-8.
    match unsafe { bytes.make_utf8_uppercase() } {
        Ok(new_len) => bytes.truncate(new_len),
        Err(overflow) => bytes.extend(overflow),
    }
    // SAFETY: `bytes` now holds the UTF-8 encoding of the uppercased text:
    // the rewritten prefix, plus any overflow bytes appended above.
    *self = unsafe { Self::from_utf8_unchecked(bytes) }
}

/// Copies elements from `src` range to the end of the string.
///
/// # Panics
Expand Down
1 change: 1 addition & 0 deletions library/alloc/tests/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
#![allow(internal_features)]
#![deny(fuzzy_provenance_casts)]
#![deny(unsafe_op_in_unsafe_fn)]
#![feature(string_make_uplowercase)]

extern crate test;

Expand Down
20 changes: 20 additions & 0 deletions library/alloc/tests/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -903,3 +903,23 @@ fn test_str_concat() {
let s: String = format!("{a}{b}");
assert_eq!(s.as_bytes()[9], 'd' as u8);
}

#[test]
fn make_uppercase() {
    // Checks the in-place conversion of one input against `str::to_uppercase`.
    fn check(s: &str) {
        let ground_truth = s.to_uppercase();
        let mut tested = String::from(s);
        tested.make_uppercase();
        assert!(
            tested == ground_truth,
            r#"When uppercased "{s}" gave "{tested}" while "{ground_truth}" was expected"#
        );
    }

    let cases = [
        "",
        "abcde",
        // 4 to 9 bytes
        "ǰΐ",
        // 10*3 to 10*2 bytes
        "ⱥⱥⱥⱥⱥⱥⱥⱥⱥⱥ",
        "aéDžßfiᾀ",
    ];
    for case in cases {
        check(case);
    }
}
2 changes: 1 addition & 1 deletion library/core/src/str/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ pub use lossy::{Utf8Chunk, Utf8Chunks};
#[stable(feature = "rust1", since = "1.0.0")]
pub use traits::FromStr;
#[unstable(feature = "str_internals", issue = "none")]
pub use validations::{next_code_point, utf8_char_width};
pub use validations::{next_code_point, next_code_point_with_width, utf8_char_width};

#[inline(never)]
#[cold]
Expand Down
27 changes: 23 additions & 4 deletions library/core/src/str/validations.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,18 +26,21 @@ pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool {
}

/// Reads the next code point out of a byte iterator (assuming a
/// UTF-8-like encoding).
/// UTF-8-like encoding) and returns it along with its width.
///
/// # Safety
///
/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
#[unstable(feature = "str_internals", issue = "none")]
#[inline]
pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
#[allow(dead_code)]
pub unsafe fn next_code_point_with_width<'a, I: Iterator<Item = &'a u8>>(
bytes: &mut I,
) -> Option<(u32, usize)> {
// Decode UTF-8
let x = *bytes.next()?;
if x < 128 {
return Some(x as u32);
return Some((x as u32, 1));
}

// Multibyte case follows
Expand All @@ -47,13 +50,15 @@ pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) ->
// SAFETY: `bytes` produces an UTF-8-like string,
// so the iterator must produce a value here.
let y = unsafe { *bytes.next().unwrap_unchecked() };
let mut width = 2;
let mut ch = utf8_acc_cont_byte(init, y);
if x >= 0xE0 {
// [[x y z] w] case
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
// SAFETY: `bytes` produces an UTF-8-like string,
// so the iterator must produce a value here.
let z = unsafe { *bytes.next().unwrap_unchecked() };
width = 3;
let y_z = utf8_acc_cont_byte((y & CONT_MASK) as u32, z);
ch = init << 12 | y_z;
if x >= 0xF0 {
Expand All @@ -62,11 +67,25 @@ pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) ->
// SAFETY: `bytes` produces an UTF-8-like string,
// so the iterator must produce a value here.
let w = unsafe { *bytes.next().unwrap_unchecked() };
width = 4;
ch = (init & 7) << 18 | utf8_acc_cont_byte(y_z, w);
}
}

Some(ch)
Some((ch, width))
}

/// Decodes the next code point from a byte iterator over UTF-8-like data,
/// discarding the width reported by `next_code_point_with_width`.
///
/// Returns `None` exactly when `next_code_point_with_width` does.
///
/// # Safety
///
/// `bytes` must produce a valid UTF-8-like (UTF-8 or WTF-8) string
#[unstable(feature = "str_internals", issue = "none")]
#[inline]
pub unsafe fn next_code_point<'a, I: Iterator<Item = &'a u8>>(bytes: &mut I) -> Option<u32> {
    // SAFETY: the caller upholds the same requirement that the helper places on `bytes`.
    let decoded = unsafe { next_code_point_with_width(bytes) };
    decoded.map(|(code_point, _width)| code_point)
}

/// Reads the last code point out of a byte iterator (assuming a
Expand Down

0 comments on commit c5b0e91

Please sign in to comment.