diff --git a/Cargo.lock b/Cargo.lock index 13a536448..42bec08ea 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -513,11 +513,12 @@ dependencies = [ [[package]] name = "deno_media_type" -version = "0.2.3" +version = "0.2.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a417f8bd3f1074185c4c8ccb6ea6261ae173781596cc358e68ad07aaac11009d" +checksum = "577fe2bbe04f3e9b1b7c6fac6a75101a9fbd611c50a6b68789e69f4d63dcb2b4" dependencies = [ "data-url", + "encoding_rs", "serde", "url", ] diff --git a/Cargo.toml b/Cargo.toml index 5be9c8636..7c2168a22 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -45,7 +45,7 @@ async-trait = "0.1.68" capacity_builder = "0.5.0" data-url = "0.3.0" deno_ast = { version = "0.44.0", features = ["dep_analysis", "emit"] } -deno_media_type = "0.2.3" +deno_media_type = { version = "0.2.4", features = ["decoding", "data_url", "module_specifier"] } deno_unsync.workspace = true deno_path_util = "0.3.0" deno_semver = "0.7.1" diff --git a/src/graph.rs b/src/graph.rs index 386bfbc52..b7e50aad8 100644 --- a/src/graph.rs +++ b/src/graph.rs @@ -33,6 +33,7 @@ use crate::source::*; use deno_ast::dep::DynamicDependencyKind; use deno_ast::dep::ImportAttributes; use deno_ast::dep::StaticDependencyKind; +use deno_ast::encoding::detect_charset; use deno_ast::LineAndColumnIndex; use deno_ast::MediaType; use deno_ast::ParseDiagnostic; @@ -2377,10 +2378,12 @@ pub(crate) async fn parse_module_source_and_info( Some("json") )) { - return match crate::source::decode_source( - &opts.specifier, + let charset = maybe_charset.unwrap_or_else(|| { + detect_charset(&opts.specifier, opts.content.as_ref()) + }); + return match deno_media_type::encoding::decode_arc_source( + charset, opts.content, - maybe_charset, ) { Ok(text) => Ok(ModuleSourceAndInfo::Json { specifier: opts.specifier, @@ -5395,10 +5398,13 @@ impl<'a> NpmSpecifierResolver<'a> { fn new_source_with_text( specifier: &ModuleSpecifier, - text: Arc<[u8]>, + bytes: Arc<[u8]>, maybe_charset: Option<&str>, ) -> Result, Box> { - crate::source::decode_source(specifier, text, maybe_charset).map_err(|err| { + let charset = maybe_charset.unwrap_or_else(|| { + deno_media_type::encoding::detect_charset(specifier, bytes.as_ref()) + }); + deno_media_type::encoding::decode_arc_source(charset, bytes).map_err(|err| { Box::new(ModuleError::LoadingErr( specifier.clone(), None, diff --git a/src/lib.rs b/src/lib.rs index 21647ca36..1efe6e369 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -18,7 +18,6 @@ pub mod symbols; mod fast_check; pub mod packages; pub mod source; -mod text_encoding; use source::FileSystem; use source::JsrUrlProvider; diff --git a/src/source/mod.rs b/src/source/mod.rs index 05c1e8506..30ed7bccb 100644 --- a/src/source/mod.rs +++ b/src/source/mod.rs @@ -1,6 +1,5 @@ // Copyright 2018-2024 the Deno authors. MIT license. -use std::borrow::Cow; use std::collections::HashMap; use std::fmt; use std::path::Path; @@ -8,7 +7,7 @@ use std::path::PathBuf; use std::sync::Arc; use async_trait::async_trait; -use data_url::DataUrl; +use deno_ast::data_url::RawDataUrl; use deno_ast::MediaType; use deno_ast::ModuleSpecifier; use deno_error::JsErrorClass; @@ -30,7 +29,6 @@ use crate::graph::Range; use crate::module_specifier::resolve_import; use crate::packages::JsrPackageInfo; use crate::packages::JsrPackageVersionInfo; -use crate::text_encoding; use crate::ModuleInfo; use crate::NpmLoadError; use crate::SpecifierError; @@ -545,7 +543,8 @@ pub fn load_data_url( specifier: &ModuleSpecifier, ) -> Result, std::io::Error> { let data_url = RawDataUrl::parse(specifier)?; - let (bytes, headers) = data_url.into_bytes_and_headers(); + let (bytes, mime_type) = data_url.into_bytes_and_mime_type(); + let headers = HashMap::from([("content-type".to_string(), mime_type)]); Ok(Some(LoadResponse::Module { specifier: specifier.clone(), maybe_headers: Some(headers), @@ -553,67 +552,6 @@ pub fn load_data_url( })) } -#[derive(Debug, Clone)] -pub struct RawDataUrl { - pub mime_type: String, - pub bytes: Vec, -} - -impl RawDataUrl { - pub fn parse(specifier: &ModuleSpecifier) -> Result { - use std::io::Error; - use std::io::ErrorKind; - - fn unable_to_decode() -> Error { - Error::new(ErrorKind::InvalidData, "Unable to decode data url.") - } - - let url = - DataUrl::process(specifier.as_str()).map_err(|_| unable_to_decode())?; - let (bytes, _) = url.decode_to_vec().map_err(|_| unable_to_decode())?; - Ok(RawDataUrl { - mime_type: url.mime_type().to_string(), - bytes, - }) - } - - pub fn charset(&self) -> Option<&str> { - get_mime_type_charset(&self.mime_type) - } - - pub fn media_type(&self) -> MediaType { - let mut content_types = self.mime_type.split(';'); - let Some(content_type) = content_types.next() else { - return MediaType::Unknown; - }; - MediaType::from_content_type( - // this data url will be ignored when resolving the MediaType - // as in this rare case the MediaType is determined solely based - // on the provided content type - &ModuleSpecifier::parse("data:image/png;base64,").unwrap(), - content_type, - ) - } - - pub fn decode(self) -> Result { - let charset = get_mime_type_charset(&self.mime_type).unwrap_or("utf-8"); - decode_owned_source_with_charset(self.bytes, charset) - } - - pub fn into_bytes_and_headers(self) -> (Vec, HashMap) { - let headers = HashMap::from([("content-type".to_string(), self.mime_type)]); - (self.bytes, headers) - } -} - -fn get_mime_type_charset(mime_type: &str) -> Option<&str> { - mime_type - .split(';') - .skip(1) - .map(str::trim) - .find_map(|s| s.strip_prefix("charset=")) -} - /// An implementation of the loader attribute where the responses are provided /// ahead of time. This is useful for testing or #[derive(Default)] @@ -832,100 +770,6 @@ pub fn resolve_media_type_and_charset_from_content_type<'a>( } } -/// Decodes the source bytes into a string handling any encoding rules -/// where the bytes may be from a remote module, file module, or other. -pub fn decode_owned_source( - specifier: &ModuleSpecifier, - bytes: Vec, - maybe_charset: Option<&str>, -) -> Result { - let charset = maybe_charset.unwrap_or_else(|| { - if specifier.scheme() == "file" { - text_encoding::detect_charset(&bytes) - } else { - "utf-8" - } - }); - decode_owned_source_with_charset(bytes, charset) -} - -/// Decodes the source bytes into a string handling any encoding rules -/// where the source is a `file:` specifier. -pub fn decode_owned_file_source( - bytes: Vec, -) -> Result { - let charset = text_encoding::detect_charset(&bytes); - decode_owned_source_with_charset(bytes, charset) -} - -fn decode_owned_source_with_charset( - bytes: Vec, - charset: &str, -) -> Result { - match text_encoding::convert_to_utf8(&bytes, charset)? { - Cow::Borrowed(text) => { - if text.starts_with(text_encoding::BOM_CHAR) { - Ok(text[text_encoding::BOM_CHAR.len_utf8()..].to_string()) - } else { - Ok( - // SAFETY: we know it's a valid utf-8 string at this point - unsafe { String::from_utf8_unchecked(bytes) }, - ) - } - } - Cow::Owned(mut text) => { - text_encoding::strip_bom_mut(&mut text); - Ok(text) - } - } -} - -/// Decodes the source bytes into a string handling any encoding rules -/// for local vs remote files and dealing with the charset. -pub fn decode_source( - specifier: &ModuleSpecifier, - bytes: Arc<[u8]>, - maybe_charset: Option<&str>, -) -> Result, std::io::Error> { - let charset = maybe_charset.unwrap_or_else(|| { - if specifier.scheme() == "file" { - text_encoding::detect_charset(bytes.as_ref()) - } else { - "utf-8" - } - }); - decode_with_charset(bytes, charset) -} - -fn decode_with_charset( - bytes: Arc<[u8]>, - charset: &str, -) -> Result, std::io::Error> { - let text = match text_encoding::convert_to_utf8(bytes.as_ref(), charset)? { - Cow::Borrowed(text) => { - if text.starts_with(text_encoding::BOM_CHAR) { - text[text_encoding::BOM_CHAR.len_utf8()..].to_string() - } else { - return Ok( - // SAFETY: we know it's a valid utf-8 string at this point - unsafe { - let raw_ptr = Arc::into_raw(bytes); - Arc::from_raw(std::mem::transmute::<*const [u8], *const str>( - raw_ptr, - )) - }, - ); - } - } - Cow::Owned(mut text) => { - text_encoding::strip_bom_mut(&mut text); - text - } - }; - let text: Arc = Arc::from(text); - Ok(text) -} - #[cfg(test)] pub mod tests { use super::*; @@ -1008,98 +852,4 @@ pub mod tests { } ); } - - #[test] - fn test_parse_valid_data_url() { - let valid_data_url = "data:text/plain;base64,SGVsbG8sIFdvcmxkIQ=="; - let specifier = ModuleSpecifier::parse(valid_data_url).unwrap(); - let raw_data_url = RawDataUrl::parse(&specifier).unwrap(); - assert_eq!(raw_data_url.mime_type, "text/plain"); - assert_eq!(raw_data_url.bytes, b"Hello, World!"); - } - - #[test] - fn test_charset_with_valid_mime_type() { - let raw_data_url = RawDataUrl { - mime_type: "text/plain; charset=utf-8".to_string(), - bytes: vec![], - }; - assert_eq!(raw_data_url.charset(), Some("utf-8")); - } - - #[test] - fn test_charset_with_no_charset_in_mime_type() { - let raw_data_url = RawDataUrl { - mime_type: "text/plain".to_string(), - bytes: vec![], - }; - assert_eq!(raw_data_url.charset(), None); - } - - #[test] - fn test_media_type_with_known_type() { - let raw_data_url = RawDataUrl { - mime_type: "application/javascript;charset=utf-8".to_string(), - bytes: vec![], - }; - assert_eq!(raw_data_url.media_type(), MediaType::JavaScript); - } - - #[test] - fn test_media_type_with_unknown_type() { - let raw_data_url = RawDataUrl { - mime_type: "unknown/unknown".to_string(), - bytes: vec![], - }; - assert_eq!(raw_data_url.media_type(), MediaType::Unknown); - } - - #[test] - fn test_decode_with_valid_charset() { - let raw_data_url = RawDataUrl { - mime_type: "text/plain; charset=utf-8".to_string(), - bytes: "Hello, World!".as_bytes().to_vec(), - }; - assert_eq!(raw_data_url.decode().unwrap(), "Hello, World!"); - } - - #[test] - fn test_decode_with_invalid_charset() { - let raw_data_url = RawDataUrl { - mime_type: "text/plain; charset=invalid-charset".to_string(), - bytes: vec![], - }; - assert!(raw_data_url.decode().is_err()); - } - - #[test] - fn test_into_bytes_and_headers() { - let raw_data_url = RawDataUrl { - mime_type: "text/plain; charset=utf-8".to_string(), - bytes: "Hello, World!".as_bytes().to_vec(), - }; - let (bytes, headers) = raw_data_url.into_bytes_and_headers(); - assert_eq!(bytes, "Hello, World!".as_bytes()); - assert_eq!( - headers.get("content-type").unwrap(), - "text/plain; charset=utf-8" - ); - } - - #[test] - fn test_decode_owned_with_bom() { - let text = decode_owned_file_source( - format!("{}{}", text_encoding::BOM_CHAR, "Hello").into_bytes(), - ) - .unwrap(); - assert_eq!(text, "Hello"); - } - - #[test] - fn test_decode_with_charset_with_bom() { - let bytes = format!("{}{}", text_encoding::BOM_CHAR, "Hello").into_bytes(); - let charset = "utf-8"; - let text = decode_with_charset(Arc::from(bytes), charset).unwrap(); - assert_eq!(text.as_ref(), "Hello"); - } } diff --git a/src/text_encoding.rs b/src/text_encoding.rs deleted file mode 100644 index 148ff867b..000000000 --- a/src/text_encoding.rs +++ /dev/null @@ -1,112 +0,0 @@ -// Copyright 2018-2024 the Deno authors. MIT license. - -use std::borrow::Cow; - -pub const BOM_CHAR: char = '\u{FEFF}'; - -/// Attempts to detect the character encoding of the provided bytes. -/// -/// Supports UTF-8, UTF-16 Little Endian and UTF-16 Big Endian. -pub fn detect_charset(bytes: &'_ [u8]) -> &'static str { - const UTF16_LE_BOM: &[u8] = b"\xFF\xFE"; - const UTF16_BE_BOM: &[u8] = b"\xFE\xFF"; - - if bytes.starts_with(UTF16_LE_BOM) { - "utf-16le" - } else if bytes.starts_with(UTF16_BE_BOM) { - "utf-16be" - } else { - // Assume everything else is utf-8 - "utf-8" - } -} - -/// Attempts to convert the provided bytes to a UTF-8 string. -/// -/// Supports all encodings supported by the encoding_rs crate, which includes -/// all encodings specified in the WHATWG Encoding Standard, and only those -/// encodings (see: ). -pub fn convert_to_utf8<'a>( - bytes: &'a [u8], - charset: &'_ str, -) -> Result, std::io::Error> { - match encoding_rs::Encoding::for_label(charset.as_bytes()) { - Some(encoding) => Ok(encoding.decode_without_bom_handling(bytes).0), - None => Err(std::io::Error::new( - std::io::ErrorKind::InvalidInput, - format!("Unsupported charset: {charset}"), - )), - } -} - -/// Strips the byte order mark if it exists from the provided text. -pub fn strip_bom_mut(text: &mut String) { - if text.starts_with(BOM_CHAR) { - text.drain(..BOM_CHAR.len_utf8()); - } -} - -#[cfg(test)] -mod test { - use std::io::ErrorKind; - - use super::*; - - fn test_detection(test_data: &[u8], expected_charset: &str) { - let detected_charset = detect_charset(test_data); - assert_eq!( - expected_charset.to_lowercase(), - detected_charset.to_lowercase() - ); - } - - #[test] - fn test_detection_utf8_no_bom() { - let test_data = "Hello UTF-8 it is \u{23F0} for Deno!" - .to_owned() - .into_bytes(); - test_detection(&test_data, "utf-8"); - } - - #[test] - fn test_detection_utf16_little_endian() { - let test_data = b"\xFF\xFEHello UTF-16LE".to_owned().to_vec(); - test_detection(&test_data, "utf-16le"); - } - - #[test] - fn test_detection_utf16_big_endian() { - let test_data = b"\xFE\xFFHello UTF-16BE".to_owned().to_vec(); - test_detection(&test_data, "utf-16be"); - } - - #[test] - fn strip_bom_mut_with_bom() { - let mut text = format!("{BOM_CHAR}text"); - strip_bom_mut(&mut text); - assert_eq!(text, "text"); - } - - #[test] - fn strip_bom_mut_without_bom() { - let mut text = "text".to_string(); - strip_bom_mut(&mut text); - assert_eq!(text, "text"); - } - - #[test] - fn test_decoding_unsupported_charset() { - let test_data = Vec::new(); - let result = convert_to_utf8(&test_data, "utf-32le"); - assert!(result.is_err()); - let err = result.expect_err("Err expected"); - assert!(err.kind() == ErrorKind::InvalidInput); - } - - #[test] - fn test_decoding_invalid_utf8() { - let test_data = b"\xFE\xFE\xFF\xFF".to_vec(); - let result = convert_to_utf8(&test_data, "utf-8"); - assert!(result.is_ok()); - } -} diff --git a/tests/specs/ecosystem/mrii/rocket_io/0_1_3.test b/tests/specs/ecosystem/mrii/rocket_io/0_1_3.test index 6a0ce1042..c08f308f6 100644 --- a/tests/specs/ecosystem/mrii/rocket_io/0_1_3.test +++ b/tests/specs/ecosystem/mrii/rocket_io/0_1_3.test @@ -72,8 +72,8 @@ mrii/rocket-io/0.1.3 -- stdout -- -- stderr -- -error: Uncaught Error: [ERR_PACKAGE_PATH_NOT_EXPORTED] Package subpath './build/esm/socket' is not defined for types by "exports" in '/socket.io-client/4.7.5/package.json' imported from 'file:///src/types/socket-reserved-events.ts' - at Object.resolveModuleNameLiterals (ext:deno_tsc/99_main_compiler.js:789:28) +error: Error: [ERR_PACKAGE_PATH_NOT_EXPORTED] Package subpath './build/esm/socket' is not defined for types by "exports" in '/socket.io-client/4.7.5/package.json' imported from 'file:///src/types/socket-reserved-events.ts' + at Object.resolveModuleNameLiterals (ext:deno_tsc/99_main_compiler.js:794:28) at resolveModuleNamesWorker (ext:deno_tsc/00_typescript.js:125466:20) at resolveNamesReusingOldState (ext:deno_tsc/00_typescript.js:125608:14) at resolveModuleNamesReusingOldState (ext:deno_tsc/00_typescript.js:125564:12)