diff --git a/src/source/mod.rs b/src/source/mod.rs index 10fdc22a2..41cba601b 100644 --- a/src/source/mod.rs +++ b/src/source/mod.rs @@ -306,13 +306,8 @@ pub trait NpmResolver: fmt::Debug { pub fn load_data_url( specifier: &ModuleSpecifier, ) -> Result, anyhow::Error> { - let url = DataUrl::process(specifier.as_str()) - .map_err(|_| anyhow!("Unable to decode data url."))?; - let (bytes, _) = url - .decode_to_vec() - .map_err(|_| anyhow!("Unable to decode data url."))?; - let mut headers: HashMap = HashMap::with_capacity(1); - headers.insert("content-type".to_string(), url.mime_type().to_string()); + let data_url = RawDataUrl::parse(specifier)?; + let (bytes, headers) = data_url.into_bytes_and_headers(); Ok(Some(LoadResponse::Module { specifier: specifier.clone(), maybe_headers: Some(headers), @@ -320,6 +315,62 @@ pub fn load_data_url( })) } +#[derive(Debug, Clone)] +pub struct RawDataUrl { + pub mime_type: String, + pub bytes: Vec, +} + +impl RawDataUrl { + pub fn parse(specifier: &ModuleSpecifier) -> Result { + let url = DataUrl::process(specifier.as_str()) + .map_err(|_| anyhow!("Unable to decode data url."))?; + let (bytes, _) = url + .decode_to_vec() + .map_err(|_| anyhow!("Unable to decode data url."))?; + Ok(RawDataUrl { + mime_type: url.mime_type().to_string(), + bytes, + }) + } + + pub fn charset(&self) -> Option<&str> { + get_mime_type_charset(&self.mime_type) + } + + pub fn media_type(&self) -> MediaType { + let mut content_types = self.mime_type.split(';'); + let Some(content_type) = content_types.next() else { + return MediaType::Unknown; + }; + MediaType::from_content_type( + // this data url will be ignored when resolving the MediaType + // as in this rare case the MediaType is determined solely based + // on the provided content type + &ModuleSpecifier::parse("data:image/png;base64,").unwrap(), + content_type, + ) + } + + pub fn decode(self) -> Result { + let charset = get_mime_type_charset(&self.mime_type).unwrap_or("utf-8"); + decode_owned_source_with_charset(self.bytes, charset) + } + + pub fn into_bytes_and_headers(self) -> (Vec, HashMap) { + let headers = HashMap::from([("content-type".to_string(), self.mime_type)]); + (self.bytes, headers) + } +} + +fn get_mime_type_charset(mime_type: &str) -> Option<&str> { + mime_type + .split(';') + .skip(1) + .map(str::trim) + .find_map(|s| s.strip_prefix("charset=")) +} + /// An implementation of the loader attribute where the responses are provided /// ahead of time. This is useful for testing or #[derive(Default)] @@ -500,8 +551,10 @@ pub fn resolve_media_type_and_charset_from_content_type<'a>( ) -> (MediaType, Option<&'a str>) { if let Some(content_type) = maybe_content_type { let mut content_types = content_type.split(';'); - let content_type = content_types.next().unwrap(); - let media_type = MediaType::from_content_type(specifier, content_type); + let media_type = content_types + .next() + .map(|content_type| MediaType::from_content_type(specifier, content_type)) + .unwrap_or(MediaType::Unknown); let charset = content_types .map(str::trim) .find_map(|s| s.strip_prefix("charset=")); @@ -512,6 +565,54 @@ pub fn resolve_media_type_and_charset_from_content_type<'a>( } } +/// Decodes the source bytes into a string handling any encoding rules +/// where the bytes may be from a remote module, file module, or other. +pub fn decode_owned_source( + specifier: &ModuleSpecifier, + bytes: Vec, + maybe_charset: Option<&str>, +) -> Result { + let charset = maybe_charset.unwrap_or_else(|| { + if specifier.scheme() == "file" { + text_encoding::detect_charset(&bytes) + } else { + "utf-8" + } + }); + decode_owned_source_with_charset(bytes, charset) +} + +/// Decodes the source bytes into a string handling any encoding rules +/// where the source is a `file:` specifier. +pub fn decode_owned_file_source( + bytes: Vec, +) -> Result { + let charset = text_encoding::detect_charset(&bytes); + decode_owned_source_with_charset(bytes, charset) +} + +fn decode_owned_source_with_charset( + bytes: Vec, + charset: &str, +) -> Result { + match text_encoding::convert_to_utf8(&bytes, charset)? { + Cow::Borrowed(text) => { + if text.starts_with(text_encoding::BOM_CHAR) { + Ok(text[text_encoding::BOM_CHAR.len_utf8()..].to_string()) + } else { + Ok( + // SAFETY: we know it's a valid utf-8 string at this point + unsafe { String::from_utf8_unchecked(bytes) }, + ) + } + } + Cow::Owned(mut text) => { + text_encoding::strip_bom_mut(&mut text); + Ok(text) + } + } +} + /// Decodes the source bytes into a string handling any encoding rules /// for local vs remote files and dealing with the charset. pub fn decode_source( @@ -828,4 +929,81 @@ pub mod tests { ); } } + + #[test] + fn test_parse_valid_data_url() { + let valid_data_url = "data:text/plain;base64,SGVsbG8sIFdvcmxkIQ=="; + let specifier = ModuleSpecifier::parse(valid_data_url).unwrap(); + let raw_data_url = RawDataUrl::parse(&specifier).unwrap(); + assert_eq!(raw_data_url.mime_type, "text/plain"); + assert_eq!(raw_data_url.bytes, b"Hello, World!"); + } + + #[test] + fn test_charset_with_valid_mime_type() { + let raw_data_url = RawDataUrl { + mime_type: "text/plain; charset=utf-8".to_string(), + bytes: vec![], + }; + assert_eq!(raw_data_url.charset(), Some("utf-8")); + } + + #[test] + fn test_charset_with_no_charset_in_mime_type() { + let raw_data_url = RawDataUrl { + mime_type: "text/plain".to_string(), + bytes: vec![], + }; + assert_eq!(raw_data_url.charset(), None); + } + + #[test] + fn test_media_type_with_known_type() { + let raw_data_url = RawDataUrl { + mime_type: "application/javascript;charset=utf-8".to_string(), + bytes: vec![], + }; + assert_eq!(raw_data_url.media_type(), MediaType::JavaScript); + } + + #[test] + fn test_media_type_with_unknown_type() { + let raw_data_url = RawDataUrl { + mime_type: "unknown/unknown".to_string(), + bytes: vec![], + }; + assert_eq!(raw_data_url.media_type(), MediaType::Unknown); + } + + #[test] + fn test_decode_with_valid_charset() { + let raw_data_url = RawDataUrl { + mime_type: "text/plain; charset=utf-8".to_string(), + bytes: "Hello, World!".as_bytes().to_vec(), + }; + assert_eq!(raw_data_url.decode().unwrap(), "Hello, World!"); + } + + #[test] + fn test_decode_with_invalid_charset() { + let raw_data_url = RawDataUrl { + mime_type: "text/plain; charset=invalid-charset".to_string(), + bytes: vec![], + }; + assert!(raw_data_url.decode().is_err()); + } + + #[test] + fn test_into_bytes_and_headers() { + let raw_data_url = RawDataUrl { + mime_type: "text/plain; charset=utf-8".to_string(), + bytes: "Hello, World!".as_bytes().to_vec(), + }; + let (bytes, headers) = raw_data_url.into_bytes_and_headers(); + assert_eq!(bytes, "Hello, World!".as_bytes()); + assert_eq!( + headers.get("content-type").unwrap(), + "text/plain; charset=utf-8" + ); + } } diff --git a/src/text_encoding.rs b/src/text_encoding.rs index b1433c6a6..362ff7176 100644 --- a/src/text_encoding.rs +++ b/src/text_encoding.rs @@ -48,6 +48,8 @@ pub fn strip_bom_mut(text: &mut String) { #[cfg(test)] mod test { + use std::io::ErrorKind; + use super::*; fn test_detection(test_data: &[u8], expected_charset: &str) { @@ -91,4 +93,22 @@ mod test { strip_bom_mut(&mut text); assert_eq!(text, "text"); } + + #[test] + fn test_decoding_unsupported_charset() { + let test_data = Vec::new(); + let result = convert_to_utf8(&test_data, "utf-32le"); + assert!(result.is_err()); + let err = result.expect_err("Err expected"); + assert!(err.kind() == ErrorKind::InvalidInput); + } + + #[test] + fn test_decoding_invalid_utf8() { + let test_data = b"\xFE\xFE\xFF\xFF".to_vec(); + let result = convert_to_utf8(&test_data, "utf-8"); + assert!(result.is_err()); + let err = result.expect_err("Err expected"); + assert!(err.kind() == ErrorKind::InvalidData); + } }