From cf6f64a963f3cf630ca4bc3688cf461a98d2c9ce Mon Sep 17 00:00:00 2001 From: Maybe Waffle Date: Fri, 5 Nov 2021 15:39:01 +0300 Subject: [PATCH 1/2] Make slice->str conversion and related functions const This commit makes the following functions from `core::str` `const fn`: - `from_utf8[_mut]` (`feature(const_str_from_utf8)`) - `from_utf8_unchecked_mut` (`feature(const_str_from_utf8_unchecked_mut)`) - `Utf8Error::{valid_up_to,error_len}` (`feature(const_str_from_utf8)`) --- library/alloc/tests/lib.rs | 1 + library/alloc/tests/str.rs | 64 +++++++++++++++++++++++++++-- library/core/src/lib.rs | 3 ++ library/core/src/str/converts.rs | 31 ++++++++++---- library/core/src/str/error.rs | 12 ++++-- library/core/src/str/validations.rs | 19 +++++---- 6 files changed, 106 insertions(+), 24 deletions(-) diff --git a/library/alloc/tests/lib.rs b/library/alloc/tests/lib.rs index 8c57c804ad2dc..68e48348b076e 100644 --- a/library/alloc/tests/lib.rs +++ b/library/alloc/tests/lib.rs @@ -25,6 +25,7 @@ #![feature(const_btree_new)] #![feature(const_default_impls)] #![feature(const_trait_impl)] +#![feature(const_str_from_utf8)] use std::collections::hash_map::DefaultHasher; use std::hash::{Hash, Hasher}; diff --git a/library/alloc/tests/str.rs b/library/alloc/tests/str.rs index dc7d0bff9a404..1b741f174fb12 100644 --- a/library/alloc/tests/str.rs +++ b/library/alloc/tests/str.rs @@ -1,3 +1,4 @@ +use std::assert_matches::assert_matches; use std::borrow::Cow; use std::cmp::Ordering::{Equal, Greater, Less}; use std::str::{from_utf8, from_utf8_unchecked}; @@ -883,6 +884,33 @@ fn test_is_utf8() { assert!(from_utf8(&[0xF4, 0x8F, 0xBF, 0xBF]).is_ok()); } +#[test] +fn test_const_is_utf8() { + const _: () = { + // deny overlong encodings + assert!(from_utf8(&[0xc0, 0x80]).is_err()); + assert!(from_utf8(&[0xc0, 0xae]).is_err()); + assert!(from_utf8(&[0xe0, 0x80, 0x80]).is_err()); + assert!(from_utf8(&[0xe0, 0x80, 0xaf]).is_err()); + assert!(from_utf8(&[0xe0, 0x81, 0x81]).is_err()); + 
assert!(from_utf8(&[0xf0, 0x82, 0x82, 0xac]).is_err()); + assert!(from_utf8(&[0xf4, 0x90, 0x80, 0x80]).is_err()); + + // deny surrogates + assert!(from_utf8(&[0xED, 0xA0, 0x80]).is_err()); + assert!(from_utf8(&[0xED, 0xBF, 0xBF]).is_err()); + + assert!(from_utf8(&[0xC2, 0x80]).is_ok()); + assert!(from_utf8(&[0xDF, 0xBF]).is_ok()); + assert!(from_utf8(&[0xE0, 0xA0, 0x80]).is_ok()); + assert!(from_utf8(&[0xED, 0x9F, 0xBF]).is_ok()); + assert!(from_utf8(&[0xEE, 0x80, 0x80]).is_ok()); + assert!(from_utf8(&[0xEF, 0xBF, 0xBF]).is_ok()); + assert!(from_utf8(&[0xF0, 0x90, 0x80, 0x80]).is_ok()); + assert!(from_utf8(&[0xF4, 0x8F, 0xBF, 0xBF]).is_ok()); + }; +} + #[test] fn from_utf8_mostly_ascii() { // deny invalid bytes embedded in long stretches of ascii @@ -895,13 +923,43 @@ fn from_utf8_mostly_ascii() { } } +#[test] +fn const_from_utf8_mostly_ascii() { + const _: () = { + // deny invalid bytes embedded in long stretches of ascii + let mut i = 32; + while i < 64 { + let mut data = [0; 128]; + data[i] = 0xC0; + assert!(from_utf8(&data).is_err()); + data[i] = 0xC2; + assert!(from_utf8(&data).is_err()); + + i = i + 1; + } + }; +} + #[test] fn from_utf8_error() { macro_rules! 
test { - ($input: expr, $expected_valid_up_to: expr, $expected_error_len: expr) => { + ($input: expr, $expected_valid_up_to:pat, $expected_error_len:pat) => { let error = from_utf8($input).unwrap_err(); - assert_eq!(error.valid_up_to(), $expected_valid_up_to); - assert_eq!(error.error_len(), $expected_error_len); + assert_matches!(error.valid_up_to(), $expected_valid_up_to); + assert_matches!(error.error_len(), $expected_error_len); + + const _: () = { + match from_utf8($input) { + Err(error) => { + let valid_up_to = error.valid_up_to(); + let error_len = error.error_len(); + + assert!(matches!(valid_up_to, $expected_valid_up_to)); + assert!(matches!(error_len, $expected_error_len)); + } + Ok(_) => unreachable!(), + } + }; }; } test!(b"A\xC3\xA9 \xFF ", 4, Some(1)); diff --git a/library/core/src/lib.rs b/library/core/src/lib.rs index e4a566f589582..3b0872378c6e9 100644 --- a/library/core/src/lib.rs +++ b/library/core/src/lib.rs @@ -97,6 +97,7 @@ #![allow(explicit_outlives_requirements)] // // Library features for const fns: +#![feature(const_align_offset)] #![feature(const_align_of_val)] #![feature(const_alloc_layout)] #![feature(const_arguments_as_str)] @@ -130,6 +131,7 @@ #![feature(const_size_of_val)] #![feature(const_slice_from_raw_parts)] #![feature(const_slice_ptr_len)] +#![feature(const_str_from_utf8_unchecked_mut)] #![feature(const_swap)] #![feature(const_trait_impl)] #![feature(const_type_id)] @@ -138,6 +140,7 @@ #![feature(duration_consts_2)] #![feature(ptr_metadata)] #![feature(slice_ptr_get)] +#![feature(str_internals)] #![feature(variant_count)] #![feature(const_array_from_ref)] #![feature(const_slice_from_ref)] diff --git a/library/core/src/str/converts.rs b/library/core/src/str/converts.rs index ed9f49f159611..8ac28669b3080 100644 --- a/library/core/src/str/converts.rs +++ b/library/core/src/str/converts.rs @@ -82,10 +82,16 @@ use super::Utf8Error; /// assert_eq!("💖", sparkle_heart); /// ``` #[stable(feature = "rust1", since = "1.0.0")] -pub fn 
from_utf8(v: &[u8]) -> Result<&str, Utf8Error> { - run_utf8_validation(v)?; - // SAFETY: Just ran validation. - Ok(unsafe { from_utf8_unchecked(v) }) +#[rustc_const_unstable(feature = "const_str_from_utf8", issue = "none")] +pub const fn from_utf8(v: &[u8]) -> Result<&str, Utf8Error> { + // This should use `?` again, once it's `const` + match run_utf8_validation(v) { + Ok(_) => { + // SAFETY: validation succeeded. + Ok(unsafe { from_utf8_unchecked(v) }) + } + Err(err) => Err(err), + } } /// Converts a mutable slice of bytes to a mutable string slice. @@ -119,10 +125,16 @@ pub fn from_utf8(v: &[u8]) -> Result<&str, Utf8Error> { /// See the docs for [`Utf8Error`] for more details on the kinds of /// errors that can be returned. #[stable(feature = "str_mut_extras", since = "1.20.0")] -pub fn from_utf8_mut(v: &mut [u8]) -> Result<&mut str, Utf8Error> { - run_utf8_validation(v)?; - // SAFETY: Just ran validation. - Ok(unsafe { from_utf8_unchecked_mut(v) }) +#[rustc_const_unstable(feature = "const_str_from_utf8", issue = "none")] +pub const fn from_utf8_mut(v: &mut [u8]) -> Result<&mut str, Utf8Error> { + // This should use `?` again, once it's `const` + match run_utf8_validation(v) { + Ok(_) => { + // SAFETY: validation succeeded. + Ok(unsafe { from_utf8_unchecked_mut(v) }) + } + Err(err) => Err(err), + } } /// Converts a slice of bytes to a string slice without checking @@ -184,7 +196,8 @@ pub const unsafe fn from_utf8_unchecked(v: &[u8]) -> &str { #[inline] #[must_use] #[stable(feature = "str_mut_extras", since = "1.20.0")] -pub unsafe fn from_utf8_unchecked_mut(v: &mut [u8]) -> &mut str { +#[rustc_const_unstable(feature = "const_str_from_utf8_unchecked_mut", issue = "none")] +pub const unsafe fn from_utf8_unchecked_mut(v: &mut [u8]) -> &mut str { // SAFETY: the caller must guarantee that the bytes `v` // are valid UTF-8, thus the cast to `*mut str` is safe. 
// Also, the pointer dereference is safe because that pointer diff --git a/library/core/src/str/error.rs b/library/core/src/str/error.rs index b6460d72fef32..3d0aeb52016e9 100644 --- a/library/core/src/str/error.rs +++ b/library/core/src/str/error.rs @@ -72,9 +72,10 @@ impl Utf8Error { /// assert_eq!(1, error.valid_up_to()); /// ``` #[stable(feature = "utf8_error", since = "1.5.0")] + #[rustc_const_unstable(feature = "const_str_from_utf8", issue = "none")] #[must_use] #[inline] - pub fn valid_up_to(&self) -> usize { + pub const fn valid_up_to(&self) -> usize { self.valid_up_to } @@ -94,10 +95,15 @@ impl Utf8Error { /// /// [U+FFFD]: ../../std/char/constant.REPLACEMENT_CHARACTER.html #[stable(feature = "utf8_error_error_len", since = "1.20.0")] + #[rustc_const_unstable(feature = "const_str_from_utf8", issue = "none")] #[must_use] #[inline] - pub fn error_len(&self) -> Option<usize> { - self.error_len.map(|len| len as usize) + pub const fn error_len(&self) -> Option<usize> { + // This should become `map` again, once it's `const` + match self.error_len { + Some(len) => Some(len as usize), + None => None, + } } } diff --git a/library/core/src/str/validations.rs b/library/core/src/str/validations.rs index 9a1cf905e3b02..e362d5c05c1b4 100644 --- a/library/core/src/str/validations.rs +++ b/library/core/src/str/validations.rs @@ -8,25 +8,25 @@ use super::Utf8Error; /// The first byte is special, only want bottom 5 bits for width 2, 4 bits /// for width 3, and 3 bits for width 4. #[inline] -fn utf8_first_byte(byte: u8, width: u32) -> u32 { +const fn utf8_first_byte(byte: u8, width: u32) -> u32 { (byte & (0x7F >> width)) as u32 } /// Returns the value of `ch` updated with continuation byte `byte`. #[inline] -fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 { +const fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 { (ch << 6) | (byte & CONT_MASK) as u32 } /// Checks whether the byte is a UTF-8 continuation byte (i.e., starts with the /// bits `10`). 
#[inline] -pub(super) fn utf8_is_cont_byte(byte: u8) -> bool { +pub(super) const fn utf8_is_cont_byte(byte: u8) -> bool { (byte as i8) < -64 } #[inline] -fn unwrap_or_0(opt: Option<&u8>) -> u8 { +const fn unwrap_or_0(opt: Option<&u8>) -> u8 { match opt { Some(&byte) => byte, None => 0, @@ -105,14 +105,15 @@ const NONASCII_MASK: usize = 0x80808080_80808080u64 as usize; /// Returns `true` if any byte in the word `x` is nonascii (>= 128). #[inline] -fn contains_nonascii(x: usize) -> bool { +const fn contains_nonascii(x: usize) -> bool { (x & NONASCII_MASK) != 0 } /// Walks through `v` checking that it's a valid UTF-8 sequence, /// returning `Ok(())` in that case, or, if it is invalid, `Err(err)`. #[inline(always)] -pub(super) fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> { +#[rustc_const_unstable(feature = "str_internals", issue = "none")] +pub(super) const fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> { let mut index = 0; let len = v.len(); @@ -142,7 +143,7 @@ pub(super) fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> { let first = v[index]; if first >= 128 { - let w = UTF8_CHAR_WIDTH[first as usize]; + let w = utf8_char_width(first); // 2-byte encoding is for codepoints \u{0080} to \u{07ff} // first C2 80 last DF BF // 3-byte encoding is for codepoints \u{0800} to \u{ffff} @@ -230,7 +231,7 @@ pub(super) fn run_utf8_validation(v: &[u8]) -> Result<(), Utf8Error> { } // https://tools.ietf.org/html/rfc3629 -static UTF8_CHAR_WIDTH: [u8; 256] = [ +const UTF8_CHAR_WIDTH: &[u8; 256] = &[ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x1F 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -253,7 +254,7 @@ static UTF8_CHAR_WIDTH: [u8; 256] = [ #[unstable(feature = "str_internals", issue = "none")] #[must_use] #[inline] -pub fn utf8_char_width(b: u8) -> usize { +pub const fn utf8_char_width(b: u8) -> usize { UTF8_CHAR_WIDTH[b as usize] as usize } 
From 573a00e3f9207a3be67921e1046cab95150b4ab4 Mon Sep 17 00:00:00 2001 From: Maybe Waffle Date: Thu, 18 Nov 2021 14:04:01 +0300 Subject: [PATCH 2/2] Fill in tracking issues for `const_str_from_utf8` and `const_str_from_utf8_unchecked_mut` features --- library/core/src/str/converts.rs | 6 +++--- library/core/src/str/error.rs | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/library/core/src/str/converts.rs b/library/core/src/str/converts.rs index 8ac28669b3080..ef26cbfb640bf 100644 --- a/library/core/src/str/converts.rs +++ b/library/core/src/str/converts.rs @@ -82,7 +82,7 @@ use super::Utf8Error; /// assert_eq!("💖", sparkle_heart); /// ``` #[stable(feature = "rust1", since = "1.0.0")] -#[rustc_const_unstable(feature = "const_str_from_utf8", issue = "none")] +#[rustc_const_unstable(feature = "const_str_from_utf8", issue = "91006")] pub const fn from_utf8(v: &[u8]) -> Result<&str, Utf8Error> { // This should use `?` again, once it's `const` match run_utf8_validation(v) { @@ -125,7 +125,7 @@ pub const fn from_utf8(v: &[u8]) -> Result<&str, Utf8Error> { /// See the docs for [`Utf8Error`] for more details on the kinds of /// errors that can be returned. 
#[stable(feature = "str_mut_extras", since = "1.20.0")] -#[rustc_const_unstable(feature = "const_str_from_utf8", issue = "none")] +#[rustc_const_unstable(feature = "const_str_from_utf8", issue = "91006")] pub const fn from_utf8_mut(v: &mut [u8]) -> Result<&mut str, Utf8Error> { // This should use `?` again, once it's `const` match run_utf8_validation(v) { @@ -196,7 +196,7 @@ pub const unsafe fn from_utf8_unchecked(v: &[u8]) -> &str { #[inline] #[must_use] #[stable(feature = "str_mut_extras", since = "1.20.0")] -#[rustc_const_unstable(feature = "const_str_from_utf8_unchecked_mut", issue = "none")] +#[rustc_const_unstable(feature = "const_str_from_utf8_unchecked_mut", issue = "91005")] pub const unsafe fn from_utf8_unchecked_mut(v: &mut [u8]) -> &mut str { // SAFETY: the caller must guarantee that the bytes `v` // are valid UTF-8, thus the cast to `*mut str` is safe. diff --git a/library/core/src/str/error.rs b/library/core/src/str/error.rs index 3d0aeb52016e9..a127dd57eee0e 100644 --- a/library/core/src/str/error.rs +++ b/library/core/src/str/error.rs @@ -72,7 +72,7 @@ impl Utf8Error { /// assert_eq!(1, error.valid_up_to()); /// ``` #[stable(feature = "utf8_error", since = "1.5.0")] - #[rustc_const_unstable(feature = "const_str_from_utf8", issue = "none")] + #[rustc_const_unstable(feature = "const_str_from_utf8", issue = "91006")] #[must_use] #[inline] pub const fn valid_up_to(&self) -> usize { @@ -95,7 +95,7 @@ impl Utf8Error { /// /// [U+FFFD]: ../../std/char/constant.REPLACEMENT_CHARACTER.html #[stable(feature = "utf8_error_error_len", since = "1.20.0")] - #[rustc_const_unstable(feature = "const_str_from_utf8", issue = "none")] + #[rustc_const_unstable(feature = "const_str_from_utf8", issue = "91006")] #[must_use] #[inline] pub const fn error_len(&self) -> Option<usize> {