Skip to content

Commit

Permalink
feat: move text encoding functionality into deno_graph::source from…
Browse files Browse the repository at this point in the history
… CLI (#370)
  • Loading branch information
dsherret authored Feb 1, 2024
1 parent 19e2b28 commit 3a27ae5
Show file tree
Hide file tree
Showing 2 changed files with 207 additions and 9 deletions.
196 changes: 187 additions & 9 deletions src/source/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -306,20 +306,71 @@ pub trait NpmResolver: fmt::Debug {
pub fn load_data_url(
specifier: &ModuleSpecifier,
) -> Result<Option<LoadResponse>, anyhow::Error> {
let url = DataUrl::process(specifier.as_str())
.map_err(|_| anyhow!("Unable to decode data url."))?;
let (bytes, _) = url
.decode_to_vec()
.map_err(|_| anyhow!("Unable to decode data url."))?;
let mut headers: HashMap<String, String> = HashMap::with_capacity(1);
headers.insert("content-type".to_string(), url.mime_type().to_string());
let data_url = RawDataUrl::parse(specifier)?;
let (bytes, headers) = data_url.into_bytes_and_headers();
Ok(Some(LoadResponse::Module {
specifier: specifier.clone(),
maybe_headers: Some(headers),
content: Arc::from(bytes),
}))
}

#[derive(Debug, Clone)]
pub struct RawDataUrl {
pub mime_type: String,
pub bytes: Vec<u8>,
}

impl RawDataUrl {
pub fn parse(specifier: &ModuleSpecifier) -> Result<Self, Error> {
let url = DataUrl::process(specifier.as_str())
.map_err(|_| anyhow!("Unable to decode data url."))?;
let (bytes, _) = url
.decode_to_vec()
.map_err(|_| anyhow!("Unable to decode data url."))?;
Ok(RawDataUrl {
mime_type: url.mime_type().to_string(),
bytes,
})
}

pub fn charset(&self) -> Option<&str> {
get_mime_type_charset(&self.mime_type)
}

pub fn media_type(&self) -> MediaType {
let mut content_types = self.mime_type.split(';');
let Some(content_type) = content_types.next() else {
return MediaType::Unknown;
};
MediaType::from_content_type(
// this data url will be ignored when resolving the MediaType
// as in this rare case the MediaType is determined solely based
// on the provided content type
&ModuleSpecifier::parse("data:image/png;base64,").unwrap(),
content_type,
)
}

pub fn decode(self) -> Result<String, std::io::Error> {
let charset = get_mime_type_charset(&self.mime_type).unwrap_or("utf-8");
decode_owned_source_with_charset(self.bytes, charset)
}

pub fn into_bytes_and_headers(self) -> (Vec<u8>, HashMap<String, String>) {
let headers = HashMap::from([("content-type".to_string(), self.mime_type)]);
(self.bytes, headers)
}
}

fn get_mime_type_charset(mime_type: &str) -> Option<&str> {
mime_type
.split(';')
.skip(1)
.map(str::trim)
.find_map(|s| s.strip_prefix("charset="))
}

/// An implementation of the loader attribute where the responses are provided
/// ahead of time. This is useful for testing or
#[derive(Default)]
Expand Down Expand Up @@ -500,8 +551,10 @@ pub fn resolve_media_type_and_charset_from_content_type<'a>(
) -> (MediaType, Option<&'a str>) {
if let Some(content_type) = maybe_content_type {
let mut content_types = content_type.split(';');
let content_type = content_types.next().unwrap();
let media_type = MediaType::from_content_type(specifier, content_type);
let media_type = content_types
.next()
.map(|content_type| MediaType::from_content_type(specifier, content_type))
.unwrap_or(MediaType::Unknown);
let charset = content_types
.map(str::trim)
.find_map(|s| s.strip_prefix("charset="));
Expand All @@ -512,6 +565,54 @@ pub fn resolve_media_type_and_charset_from_content_type<'a>(
}
}

/// Decodes the source bytes into a string handling any encoding rules
/// where the bytes may be from a remote module, file module, or other.
pub fn decode_owned_source(
specifier: &ModuleSpecifier,
bytes: Vec<u8>,
maybe_charset: Option<&str>,
) -> Result<String, std::io::Error> {
let charset = maybe_charset.unwrap_or_else(|| {
if specifier.scheme() == "file" {
text_encoding::detect_charset(&bytes)
} else {
"utf-8"
}
});
decode_owned_source_with_charset(bytes, charset)
}

/// Decodes the source bytes into a string handling any encoding rules
/// where the source is a `file:` specifier.
pub fn decode_owned_file_source(
bytes: Vec<u8>,
) -> Result<String, std::io::Error> {
let charset = text_encoding::detect_charset(&bytes);
decode_owned_source_with_charset(bytes, charset)
}

fn decode_owned_source_with_charset(
bytes: Vec<u8>,
charset: &str,
) -> Result<String, std::io::Error> {
match text_encoding::convert_to_utf8(&bytes, charset)? {
Cow::Borrowed(text) => {
if text.starts_with(text_encoding::BOM_CHAR) {
Ok(text[text_encoding::BOM_CHAR.len_utf8()..].to_string())
} else {
Ok(
// SAFETY: we know it's a valid utf-8 string at this point
unsafe { String::from_utf8_unchecked(bytes) },
)
}
}
Cow::Owned(mut text) => {
text_encoding::strip_bom_mut(&mut text);
Ok(text)
}
}
}

/// Decodes the source bytes into a string handling any encoding rules
/// for local vs remote files and dealing with the charset.
pub fn decode_source(
Expand Down Expand Up @@ -828,4 +929,81 @@ pub mod tests {
);
}
}

#[test]
fn test_parse_valid_data_url() {
let valid_data_url = "data:text/plain;base64,SGVsbG8sIFdvcmxkIQ==";
let specifier = ModuleSpecifier::parse(valid_data_url).unwrap();
let raw_data_url = RawDataUrl::parse(&specifier).unwrap();
assert_eq!(raw_data_url.mime_type, "text/plain");
assert_eq!(raw_data_url.bytes, b"Hello, World!");
}

#[test]
fn test_charset_with_valid_mime_type() {
let raw_data_url = RawDataUrl {
mime_type: "text/plain; charset=utf-8".to_string(),
bytes: vec![],
};
assert_eq!(raw_data_url.charset(), Some("utf-8"));
}

#[test]
fn test_charset_with_no_charset_in_mime_type() {
let raw_data_url = RawDataUrl {
mime_type: "text/plain".to_string(),
bytes: vec![],
};
assert_eq!(raw_data_url.charset(), None);
}

#[test]
fn test_media_type_with_known_type() {
let raw_data_url = RawDataUrl {
mime_type: "application/javascript;charset=utf-8".to_string(),
bytes: vec![],
};
assert_eq!(raw_data_url.media_type(), MediaType::JavaScript);
}

#[test]
fn test_media_type_with_unknown_type() {
let raw_data_url = RawDataUrl {
mime_type: "unknown/unknown".to_string(),
bytes: vec![],
};
assert_eq!(raw_data_url.media_type(), MediaType::Unknown);
}

#[test]
fn test_decode_with_valid_charset() {
let raw_data_url = RawDataUrl {
mime_type: "text/plain; charset=utf-8".to_string(),
bytes: "Hello, World!".as_bytes().to_vec(),
};
assert_eq!(raw_data_url.decode().unwrap(), "Hello, World!");
}

#[test]
fn test_decode_with_invalid_charset() {
let raw_data_url = RawDataUrl {
mime_type: "text/plain; charset=invalid-charset".to_string(),
bytes: vec![],
};
assert!(raw_data_url.decode().is_err());
}

#[test]
fn test_into_bytes_and_headers() {
let raw_data_url = RawDataUrl {
mime_type: "text/plain; charset=utf-8".to_string(),
bytes: "Hello, World!".as_bytes().to_vec(),
};
let (bytes, headers) = raw_data_url.into_bytes_and_headers();
assert_eq!(bytes, "Hello, World!".as_bytes());
assert_eq!(
headers.get("content-type").unwrap(),
"text/plain; charset=utf-8"
);
}
}
20 changes: 20 additions & 0 deletions src/text_encoding.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ pub fn strip_bom_mut(text: &mut String) {

#[cfg(test)]
mod test {
use std::io::ErrorKind;

use super::*;

fn test_detection(test_data: &[u8], expected_charset: &str) {
Expand Down Expand Up @@ -91,4 +93,22 @@ mod test {
strip_bom_mut(&mut text);
assert_eq!(text, "text");
}

#[test]
fn test_decoding_unsupported_charset() {
let test_data = Vec::new();
let result = convert_to_utf8(&test_data, "utf-32le");
assert!(result.is_err());
let err = result.expect_err("Err expected");
assert!(err.kind() == ErrorKind::InvalidInput);
}

#[test]
fn test_decoding_invalid_utf8() {
let test_data = b"\xFE\xFE\xFF\xFF".to_vec();
let result = convert_to_utf8(&test_data, "utf-8");
assert!(result.is_err());
let err = result.expect_err("Err expected");
assert!(err.kind() == ErrorKind::InvalidData);
}
}

0 comments on commit 3a27ae5

Please sign in to comment.