From 71b1627aacb86be387a90174f530ea930f5f5e72 Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Thu, 16 Sep 2021 03:03:46 +0200 Subject: [PATCH 1/2] CI: Ensure that tests from all workspace projects are run --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index de025eefc78..e8dbc319727 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -218,11 +218,11 @@ jobs: - name: Run tests (with coverage report) if: matrix.rust == 'stable' - run: cargo tarpaulin --avoid-cfg-tarpaulin + run: cargo tarpaulin --avoid-cfg-tarpaulin --workspace - name: Run tests if: matrix.rust != 'stable' - run: cargo test + run: cargo test --workspace - name: Prune unnecessary cache run: script/ci/prune-cache.sh From f9871c2e5be24fa8a280f548fade046011fc688b Mon Sep 17 00:00:00 2001 From: Tobias Bieniek Date: Thu, 16 Sep 2021 03:08:01 +0200 Subject: [PATCH 2/2] Extract `cio_markdown` subpackage This allows us to isolate our markdown rendering code in a dedicated package, which could potentially have a positive effect on the compile times of the full project. It also means we can iterate on this part of the code independently of the other parts, behind a well-specified API. 
--- Cargo.lock | 14 +- Cargo.toml | 4 +- src/admin/render_readmes.rs | 2 +- src/markdown/Cargo.toml | 18 ++ src/markdown/lib.rs | 559 ++++++++++++++++++++++++++++++++++++ src/render.rs | 558 +---------------------------------- 6 files changed, 591 insertions(+), 564 deletions(-) create mode 100644 src/markdown/Cargo.toml create mode 100644 src/markdown/lib.rs diff --git a/Cargo.lock b/Cargo.lock index 3cda25be285..1193e9a619c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -243,14 +243,13 @@ checksum = "c4872d67bab6358e59559027aa3b9157c53d9358c51423c17554809a8858e0f8" name = "cargo-registry" version = "0.2.2" dependencies = [ - "ammonia", "anyhow", "base64", "cargo-registry-s3", "chrono", + "cio_markdown", "claim", "clap", - "comrak", "conduit", "conduit-conditional-get", "conduit-cookie", @@ -274,7 +273,6 @@ dependencies = [ "git2", "handlebars", "hex", - "htmlescape", "http", "hyper", "hyper-tls", @@ -346,6 +344,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "cio_markdown" +version = "0.0.0" +dependencies = [ + "ammonia", + "comrak", + "htmlescape", + "url", +] + [[package]] name = "cipher" version = "0.2.5" diff --git a/Cargo.toml b/Cargo.toml index 04cfa632163..0cc0359ead4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,13 +30,12 @@ rustdoc-args = [ ] [dependencies] -ammonia = "3.0.0" anyhow = "1.0" base64 = "0.13" cargo-registry-s3 = { path = "src/s3", version = "0.2.0" } chrono = { version = "0.4.0", features = ["serde"] } +cio_markdown = { path = "src/markdown" } clap = "=3.0.0-beta.4" -comrak = { version = "0.10.1", default-features = false } conduit = "0.9.0-alpha.5" conduit-conditional-get = "0.9.0-alpha.3" @@ -61,7 +60,6 @@ futures-util = "0.3" git2 = "0.13.0" handlebars = "4.1.3" hex = "0.4" -htmlescape = "0.3.1" http = "0.2" hyper = { version = "0.14", features = ["client", "http1"] } indexmap = { version = "1.0.2", features = ["serde-1"] } diff --git a/src/admin/render_readmes.rs b/src/admin/render_readmes.rs index 957fb87add9..1d89dcbcba8 100644 
--- a/src/admin/render_readmes.rs +++ b/src/admin/render_readmes.rs @@ -1,13 +1,13 @@ use crate::{ config, db, models::Version, - render::readme_to_html, schema::{crates, readme_renderings, versions}, uploaders::Uploader, }; use std::{io::Read, path::Path, sync::Arc, thread}; use chrono::{TimeZone, Utc}; +use cio_markdown::readme_to_html; use clap::Clap; use diesel::{dsl::any, prelude::*}; use flate2::read::GzDecoder; diff --git a/src/markdown/Cargo.toml b/src/markdown/Cargo.toml new file mode 100644 index 00000000000..c5831846327 --- /dev/null +++ b/src/markdown/Cargo.toml @@ -0,0 +1,18 @@ +[package] + +name = "cio_markdown" +version = "0.0.0" +license = "MIT OR Apache-2.0" +repository = "https://github.com/rust-lang/crates.io" +description = "crates.io markdown renderer" +edition = "2018" +resolver = "2" + +[lib] +path = "lib.rs" + +[dependencies] +ammonia = "3.1.2" +comrak = { version = "0.10.1", default-features = false } +htmlescape = "0.3.1" +url = "2.2.2" diff --git a/src/markdown/lib.rs b/src/markdown/lib.rs new file mode 100644 index 00000000000..089e96347af --- /dev/null +++ b/src/markdown/lib.rs @@ -0,0 +1,559 @@ +//! Render README files to HTML. + +use ammonia::{Builder, UrlRelative, UrlRelativeEvaluate}; +use comrak::nodes::{AstNode, NodeValue}; +use htmlescape::encode_minimal; +use std::borrow::Cow; +use std::path::Path; +use url::Url; + +/// Context for markdown to HTML rendering. +struct MarkdownRenderer<'a> { + html_sanitizer: Builder<'a>, +} + +impl<'a> MarkdownRenderer<'a> { + /// Creates a new renderer instance. + /// + /// Per `readme_to_html`, `base_url` is the base URL prepended to any + /// relative links in the input document. See that function for more detail. 
+ fn new(base_url: Option<&'a str>, base_dir: &'a str) -> MarkdownRenderer<'a> { + let allowed_classes = hashmap(&[( + "code", + hashset(&[ + "language-bash", + "language-clike", + "language-glsl", + "language-go", + "language-ini", + "language-javascript", + "language-json", + "language-markup", + "language-protobuf", + "language-ruby", + "language-rust", + "language-scss", + "language-sql", + "language-toml", + "language-yaml", + ]), + )]); + let sanitize_url = UrlRelative::Custom(Box::new(SanitizeUrl::new(base_url, base_dir))); + + let mut html_sanitizer = Builder::default(); + html_sanitizer + .add_tags(&["input"]) + .link_rel(Some("nofollow noopener noreferrer")) + .add_generic_attributes(&["align"]) + .add_tag_attributes("a", &["id", "target"]) + .add_tag_attributes("input", &["checked", "disabled", "type"]) + .allowed_classes(allowed_classes) + .url_relative(sanitize_url) + .id_prefix(Some("user-content-")); + MarkdownRenderer { html_sanitizer } + } + + /// Renders the given markdown to HTML using the current settings. + fn to_html(&self, text: &str) -> String { + use comrak::{ + format_html, parse_document, Arena, ComrakExtensionOptions, ComrakOptions, + ComrakRenderOptions, + }; + + let options = ComrakOptions { + render: ComrakRenderOptions { + unsafe_: true, // The output will be sanitized with `ammonia` + ..ComrakRenderOptions::default() + }, + extension: ComrakExtensionOptions { + autolink: true, + strikethrough: true, + table: true, + tagfilter: true, + tasklist: true, + header_ids: Some("user-content-".to_string()), + ..ComrakExtensionOptions::default() + }, + ..ComrakOptions::default() + }; + + let arena = Arena::new(); + let root = parse_document(&arena, text, &options); + + // Tweak annotations of code blocks. + iter_nodes(root, &|node| { + if let NodeValue::CodeBlock(ref mut ncb) = node.data.borrow_mut().value { + // If annot includes invalid UTF-8 char, do nothing. 
+ if let Ok(mut orig_annot) = String::from_utf8(ncb.info.to_vec()) { + // Ignore characters after a comma for syntax highlighting to work correctly. + if let Some(offset) = orig_annot.find(',') { + let _ = orig_annot.drain(offset..orig_annot.len()); + ncb.info = orig_annot.as_bytes().to_vec(); + } + } + } + }); + + let mut html = Vec::new(); + format_html(root, &options, &mut html).unwrap(); + let rendered = String::from_utf8(html).unwrap(); + self.html_sanitizer.clean(&rendered).to_string() + } +} + +/// Iterate the nodes in the CommonMark AST, used in comrak. +fn iter_nodes<'a, F>(node: &'a AstNode<'a>, f: &F) +where + F: Fn(&'a AstNode<'a>), +{ + f(node); + for c in node.children() { + iter_nodes(c, f); + } +} + +/// Add trailing slash and remove `.git` suffix of base URL. +fn canon_base_url(mut base_url: String) -> String { + if !base_url.ends_with('/') { + base_url.push('/'); + } + if base_url.ends_with(".git/") { + let offset = base_url.len() - 5; + base_url.drain(offset..offset + 4); + } + base_url +} + +/// Sanitize relative URLs in README files. +struct SanitizeUrl { + base_url: Option, + base_dir: String, +} + +impl SanitizeUrl { + fn new(base_url: Option<&str>, base_dir: &str) -> Self { + let base_url = base_url + .and_then(|base_url| Url::parse(base_url).ok()) + .and_then(|url| match url.host_str() { + Some("github.com") | Some("gitlab.com") | Some("bitbucket.org") => { + Some(canon_base_url(url.into())) + } + _ => None, + }); + Self { + base_url, + base_dir: base_dir.to_owned(), + } + } +} + +/// Groups media-related URL info +struct MediaUrl { + is_media: bool, + add_sanitize_query: bool, +} + +/// Determine whether the given URL has a media file extension. +/// Also check if `sanitize=true` must be added to the query string, +/// which is required to load SVGs properly from GitHub. 
+fn is_media_url(url: &str) -> MediaUrl { + Path::new(url) + .extension() + .and_then(std::ffi::OsStr::to_str) + .map_or( + MediaUrl { + is_media: false, + add_sanitize_query: false, + }, + |e| match e { + "svg" => MediaUrl { + is_media: true, + add_sanitize_query: true, + }, + "png" | "jpg" | "jpeg" | "gif" | "mp4" | "webm" | "ogg" | "webp" => MediaUrl { + is_media: true, + add_sanitize_query: false, + }, + _ => MediaUrl { + is_media: false, + add_sanitize_query: false, + }, + }, + ) +} + +impl UrlRelativeEvaluate for SanitizeUrl { + fn evaluate<'a>(&self, url: &'a str) -> Option> { + if url.starts_with('#') { + // Always allow fragment URLs. + return Some(Cow::Borrowed(url)); + } + self.base_url.as_ref().map(|base_url| { + let mut new_url = base_url.clone(); + // Assumes GitHub’s URL scheme. GitHub renders text and markdown + // better in the "blob" view, but images need to be served raw. + let MediaUrl { + is_media, + add_sanitize_query, + } = is_media_url(url); + new_url += if is_media { "raw/HEAD" } else { "blob/HEAD" }; + if !self.base_dir.is_empty() { + new_url += "/"; + new_url += &self.base_dir; + } + if !url.starts_with('/') { + new_url.push('/'); + } + new_url += url; + if add_sanitize_query { + if let Ok(mut parsed_url) = Url::parse(&new_url) { + parsed_url.query_pairs_mut().append_pair("sanitize", "true"); + new_url = parsed_url.into(); + } + } + Cow::Owned(new_url) + }) + } +} + +/// Renders Markdown text to sanitized HTML with a given `base_url`. +/// See `readme_to_html` for the interpretation of `base_url`. +fn markdown_to_html(text: &str, base_url: Option<&str>, base_dir: &str) -> String { + let renderer = MarkdownRenderer::new(base_url, base_dir); + renderer.to_html(text) +} + +/// Any readme with a filename ending in one of these extensions will be rendered as Markdown. +/// Note we also render a readme as Markdown if _no_ extension is on the filename. 
+static MARKDOWN_EXTENSIONS: [&str; 7] = + ["md", "markdown", "mdown", "mdwn", "mkd", "mkdn", "mkdown"]; + +/// Renders a readme to sanitized HTML. An appropriate rendering method is chosen depending +/// on the extension of the supplied `filename`. +/// +/// The returned text will not contain any harmful HTML tag or attribute (such as iframe, +/// onclick, onmouseover, etc.). +/// +/// The `base_url` parameter will be used as the base for any relative links found in the +/// Markdown, as long as its host part is github.com, gitlab.com, or bitbucket.org. The +/// supplied URL will be used as a directory base whether or not the relative link is +/// prefixed with '/'. If `None` is passed, relative links will be omitted. +/// +/// # Examples +/// +/// ``` +/// use cio_markdown::readme_to_html; +/// +/// let text = "[Rust](https://rust-lang.org/) is an awesome *systems programming* language!"; +/// let rendered = readme_to_html(text, "README.md", None); +/// ``` +pub fn readme_to_html(text: &str, readme_path: &str, base_url: Option<&str>) -> String { + let readme_path = Path::new(readme_path); + let readme_dir = readme_path.parent().and_then(|p| p.to_str()).unwrap_or(""); + + if readme_path.extension().is_none() { + return markdown_to_html(text, base_url, readme_dir); + } + + if let Some(ext) = readme_path.extension().and_then(|ext| ext.to_str()) { + if MARKDOWN_EXTENSIONS.contains(&ext.to_lowercase().as_str()) { + return markdown_to_html(text, base_url, readme_dir); + } + } + + encode_minimal(text).replace("\n", "
\n") +} + +/// Helper function to build a new `HashSet` from the items slice. +fn hashset(items: &[T]) -> std::collections::HashSet +where + T: Clone + Eq + std::hash::Hash, +{ + items.iter().cloned().collect() +} + +/// Helper function to build a new `HashMap` from a slice of key-value pairs. +fn hashmap(items: &[(K, V)]) -> std::collections::HashMap +where + K: Clone + Eq + std::hash::Hash, + V: Clone, +{ + items.iter().cloned().collect() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn empty_text() { + let text = ""; + let result = markdown_to_html(text, None, ""); + assert_eq!(result, ""); + } + + #[test] + fn text_with_script_tag() { + let text = "foo_readme\n\n"; + let result = markdown_to_html(text, None, ""); + assert_eq!( + result, + "

foo_readme

\n<script>alert(\'Hello World\')</script>\n" + ); + } + + #[test] + fn text_with_iframe_tag() { + let text = "foo_readme\n\n"; + let result = markdown_to_html(text, None, ""); + assert_eq!( + result, + "

foo_readme

\n<iframe>alert(\'Hello World\')</iframe>\n" + ); + } + + #[test] + fn text_with_unknown_tag() { + let text = "foo_readme\n\nalert('Hello World')"; + let result = markdown_to_html(text, None, ""); + assert_eq!(result, "

foo_readme

\n

alert(\'Hello World\')

\n"); + } + + #[test] + fn text_with_inline_javascript() { + let text = r#"foo_readme\n\nCrate page"#; + let result = markdown_to_html(text, None, ""); + assert_eq!( + result, + "

foo_readme\\n\\nCrate page

\n" + ); + } + + // See https://github.com/kivikakk/comrak/issues/37. This panic happened + // in comrak 0.1.8 but was fixed in 0.1.9. + #[test] + fn text_with_fancy_single_quotes() { + let text = "wb’"; + let result = markdown_to_html(text, None, ""); + assert_eq!(result, "

wb’

\n"); + } + + #[test] + fn code_block_with_syntax_highlighting() { + let code_block = r#"```rust \ + println!("Hello World"); \ + ```"#; + let result = markdown_to_html(code_block, None, ""); + assert!(result.contains("")); + } + + #[test] + fn code_block_with_syntax_highlighting_even_if_annot_has_no_run() { + let code_block = r#"```rust , no_run \ + println!("Hello World"); \ + ```"#; + let result = markdown_to_html(code_block, None, ""); + assert!(result.contains("")); + } + + #[test] + fn text_with_forbidden_class_attribute() { + let text = "

Hello World!

"; + let result = markdown_to_html(text, None, ""); + assert_eq!(result, "

Hello World!

\n"); + } + + #[test] + fn relative_links() { + let absolute = "[hi](/hi)"; + let relative = "[there](there)"; + let image = "![alt](img.png)"; + let html_image = "\"alt\""; + let svg = "![alt](sanitize.svg)"; + + for host in &["github.com", "gitlab.com", "bitbucket.org"] { + for (&extra_slash, &dot_git) in [true, false].iter().zip(&[true, false]) { + let url = format!( + "https://{}/rust-lang/test{}{}", + host, + if dot_git { ".git" } else { "" }, + if extra_slash { "/" } else { "" }, + ); + + let result = markdown_to_html(absolute, Some(&url), ""); + assert_eq!( + result, + format!( + "

hi

\n", + host + ) + ); + + let result = markdown_to_html(relative, Some(&url), ""); + assert_eq!( + result, + format!( + "

there

\n", + host + ) + ); + + let result = markdown_to_html(image, Some(&url), ""); + assert_eq!( + result, + format!( + "

\"alt\"

\n", + host + ) + ); + + let result = markdown_to_html(html_image, Some(&url), ""); + assert_eq!( + result, + format!( + "\"alt\"\n", + host + ) + ); + + let result = markdown_to_html(svg, Some(&url), ""); + assert_eq!( + result, + format!( + "

\"alt\"

\n", + host + ) + ); + + let result = markdown_to_html(svg, Some(&url), "subdir"); + assert_eq!( + result, + format!( + "

\"alt\"

\n", + host + ) + ); + + let result = markdown_to_html(svg, Some(&url), "subdir1/subdir2"); + assert_eq!( + result, + format!( + "

\"alt\"

\n", + host + ) + ); + } + } + + let result = markdown_to_html(absolute, Some("https://google.com/"), ""); + assert_eq!( + result, + "

hi

\n" + ); + } + + #[test] + fn absolute_links_dont_get_resolved() { + let readme_text = + "[![Crates.io](https://img.shields.io/crates/v/clap.svg)](https://crates.io/crates/clap)"; + let repository = "https://github.com/kbknapp/clap-rs/"; + let result = markdown_to_html(readme_text, Some(repository), ""); + + assert_eq!( + result, + "

\"Crates.io\"

\n" + ); + } + + #[test] + fn readme_to_html_renders_markdown() { + for f in &[ + "README", + "readme.md", + "README.MARKDOWN", + "whatever.mkd", + "s/readme.md", + "s1/s2/readme.md", + ] { + assert_eq!( + readme_to_html("*lobster*", f, None), + "

lobster

\n" + ); + } + + assert_eq!( + readme_to_html("*[lobster](docs/lobster)*", "readme.md", Some("https://github.com/rust-lang/test")), + "

lobster

\n" + ); + assert_eq!( + readme_to_html("*[lobster](docs/lobster)*", "s/readme.md", Some("https://github.com/rust-lang/test")), + "

lobster

\n" + ); + assert_eq!( + readme_to_html("*[lobster](docs/lobster)*", "s1/s2/readme.md", Some("https://github.com/rust-lang/test")), + "

lobster

\n" + ); + } + + #[test] + fn readme_to_html_renders_other_things() { + for f in &["readme.exe", "readem.org", "blah.adoc"] { + assert_eq!( + readme_to_html("\n\nis my friend\n", f, None), + "<script>lobster</script>
\n
\nis my friend
\n" + ); + } + } + + #[test] + fn header_has_tags() { + let text = "# My crate\n\nHello, world!\n"; + let result = markdown_to_html(text, None, ""); + assert_eq!( + result, + "

My crate

\n

Hello, world!

\n" + ); + } + + #[test] + fn manual_anchor_is_sanitized() { + let text = + "

My crate

\n

Hello, world!

\n"; + let result = markdown_to_html(text, None, ""); + assert_eq!( + result, + "

My crate

\n

Hello, world!

\n" + ); + } + + #[test] + fn tables_with_rowspan_and_colspan() { + let text = "
Target
\n"; + let result = markdown_to_html(text, None, ""); + assert_eq!( + result, + "
Target
\n" + ); + } + + #[test] + fn text_alignment() { + let text = "

foo-bar

\n
Hello World!
\n"; + let result = markdown_to_html(text, None, ""); + assert_eq!( + result, + "

foo-bar

\n
Hello World!
\n" + ); + } + + #[test] + fn image_alignment() { + let text = + "

\"\"

\n"; + let result = markdown_to_html(text, None, ""); + assert_eq!( + result, + "

\"\"

\n" + ); + } +} diff --git a/src/render.rs b/src/render.rs index f5383605cbe..3ba19b52321 100644 --- a/src/render.rs +++ b/src/render.rs @@ -1,273 +1,11 @@ //! Render README files to HTML. -use ammonia::{Builder, UrlRelative, UrlRelativeEvaluate}; -use comrak::nodes::{AstNode, NodeValue}; -use htmlescape::encode_minimal; -use std::borrow::Cow; -use std::path::Path; +use cio_markdown::readme_to_html; use swirl::PerformError; -use url::Url; use crate::background_jobs::Environment; use crate::models::Version; -/// Context for markdown to HTML rendering. -struct MarkdownRenderer<'a> { - html_sanitizer: Builder<'a>, -} - -impl<'a> MarkdownRenderer<'a> { - /// Creates a new renderer instance. - /// - /// Per `readme_to_html`, `base_url` is the base URL prepended to any - /// relative links in the input document. See that function for more detail. - fn new(base_url: Option<&'a str>, base_dir: &'a str) -> MarkdownRenderer<'a> { - let allowed_classes = hashmap(&[( - "code", - hashset(&[ - "language-bash", - "language-clike", - "language-glsl", - "language-go", - "language-ini", - "language-javascript", - "language-json", - "language-markup", - "language-protobuf", - "language-ruby", - "language-rust", - "language-scss", - "language-sql", - "language-toml", - "language-yaml", - ]), - )]); - let sanitize_url = UrlRelative::Custom(Box::new(SanitizeUrl::new(base_url, base_dir))); - - let mut html_sanitizer = Builder::default(); - html_sanitizer - .add_tags(&["input"]) - .link_rel(Some("nofollow noopener noreferrer")) - .add_generic_attributes(&["align"]) - .add_tag_attributes("a", &["id", "target"]) - .add_tag_attributes("input", &["checked", "disabled", "type"]) - .allowed_classes(allowed_classes) - .url_relative(sanitize_url) - .id_prefix(Some("user-content-")); - MarkdownRenderer { html_sanitizer } - } - - /// Renders the given markdown to HTML using the current settings. 
- fn to_html(&self, text: &str) -> String { - use comrak::{ - format_html, parse_document, Arena, ComrakExtensionOptions, ComrakOptions, - ComrakRenderOptions, - }; - - let options = ComrakOptions { - render: ComrakRenderOptions { - unsafe_: true, // The output will be sanitized with `ammonia` - ..ComrakRenderOptions::default() - }, - extension: ComrakExtensionOptions { - autolink: true, - strikethrough: true, - table: true, - tagfilter: true, - tasklist: true, - header_ids: Some("user-content-".to_string()), - ..ComrakExtensionOptions::default() - }, - ..ComrakOptions::default() - }; - - let arena = Arena::new(); - let root = parse_document(&arena, text, &options); - - // Tweak annotations of code blocks. - iter_nodes(root, &|node| { - if let NodeValue::CodeBlock(ref mut ncb) = node.data.borrow_mut().value { - // If annot includes invalid UTF-8 char, do nothing. - if let Ok(mut orig_annot) = String::from_utf8(ncb.info.to_vec()) { - // Ignore characters after a comma for syntax highlighting to work correctly. - if let Some(offset) = orig_annot.find(',') { - let _ = orig_annot.drain(offset..orig_annot.len()); - ncb.info = orig_annot.as_bytes().to_vec(); - } - } - } - }); - - let mut html = Vec::new(); - format_html(root, &options, &mut html).unwrap(); - let rendered = String::from_utf8(html).unwrap(); - self.html_sanitizer.clean(&rendered).to_string() - } -} - -/// Iterate the nodes in the CommonMark AST, used in comrak. -fn iter_nodes<'a, F>(node: &'a AstNode<'a>, f: &F) -where - F: Fn(&'a AstNode<'a>), -{ - f(node); - for c in node.children() { - iter_nodes(c, f); - } -} - -/// Add trailing slash and remove `.git` suffix of base URL. -fn canon_base_url(mut base_url: String) -> String { - if !base_url.ends_with('/') { - base_url.push('/'); - } - if base_url.ends_with(".git/") { - let offset = base_url.len() - 5; - base_url.drain(offset..offset + 4); - } - base_url -} - -/// Sanitize relative URLs in README files. 
-struct SanitizeUrl { - base_url: Option, - base_dir: String, -} - -impl SanitizeUrl { - fn new(base_url: Option<&str>, base_dir: &str) -> Self { - let base_url = base_url - .and_then(|base_url| Url::parse(base_url).ok()) - .and_then(|url| match url.host_str() { - Some("github.com") | Some("gitlab.com") | Some("bitbucket.org") => { - Some(canon_base_url(url.into())) - } - _ => None, - }); - Self { - base_url, - base_dir: base_dir.to_owned(), - } - } -} - -/// Groups media-related URL info -struct MediaUrl { - is_media: bool, - add_sanitize_query: bool, -} - -/// Determine whether the given URL has a media file extension. -/// Also check if `sanitize=true` must be added to the query string, -/// which is required to load SVGs properly from GitHub. -fn is_media_url(url: &str) -> MediaUrl { - Path::new(url) - .extension() - .and_then(std::ffi::OsStr::to_str) - .map_or( - MediaUrl { - is_media: false, - add_sanitize_query: false, - }, - |e| match e { - "svg" => MediaUrl { - is_media: true, - add_sanitize_query: true, - }, - "png" | "jpg" | "jpeg" | "gif" | "mp4" | "webm" | "ogg" | "webp" => MediaUrl { - is_media: true, - add_sanitize_query: false, - }, - _ => MediaUrl { - is_media: false, - add_sanitize_query: false, - }, - }, - ) -} - -impl UrlRelativeEvaluate for SanitizeUrl { - fn evaluate<'a>(&self, url: &'a str) -> Option> { - if url.starts_with('#') { - // Always allow fragment URLs. - return Some(Cow::Borrowed(url)); - } - self.base_url.as_ref().map(|base_url| { - let mut new_url = base_url.clone(); - // Assumes GitHub’s URL scheme. GitHub renders text and markdown - // better in the "blob" view, but images need to be served raw. 
- let MediaUrl { - is_media, - add_sanitize_query, - } = is_media_url(url); - new_url += if is_media { "raw/HEAD" } else { "blob/HEAD" }; - if !self.base_dir.is_empty() { - new_url += "/"; - new_url += &self.base_dir; - } - if !url.starts_with('/') { - new_url.push('/'); - } - new_url += url; - if add_sanitize_query { - if let Ok(mut parsed_url) = Url::parse(&new_url) { - parsed_url.query_pairs_mut().append_pair("sanitize", "true"); - new_url = parsed_url.into(); - } - } - Cow::Owned(new_url) - }) - } -} - -/// Renders Markdown text to sanitized HTML with a given `base_url`. -/// See `readme_to_html` for the interpretation of `base_url`. -fn markdown_to_html(text: &str, base_url: Option<&str>, base_dir: &str) -> String { - let renderer = MarkdownRenderer::new(base_url, base_dir); - renderer.to_html(text) -} - -/// Any readme with a filename ending in one of these extensions will be rendered as Markdown. -/// Note we also render a readme as Markdown if _no_ extension is on the filename. -static MARKDOWN_EXTENSIONS: [&str; 7] = - ["md", "markdown", "mdown", "mdwn", "mkd", "mkdn", "mkdown"]; - -/// Renders a readme to sanitized HTML. An appropriate rendering method is chosen depending -/// on the extension of the supplied `filename`. -/// -/// The returned text will not contain any harmful HTML tag or attribute (such as iframe, -/// onclick, onmouseover, etc.). -/// -/// The `base_url` parameter will be used as the base for any relative links found in the -/// Markdown, as long as its host part is github.com, gitlab.com, or bitbucket.org. The -/// supplied URL will be used as a directory base whether or not the relative link is -/// prefixed with '/'. If `None` is passed, relative links will be omitted. 
-/// -/// # Examples -/// -/// ``` -/// use render::render_to_html; -/// -/// let text = "[Rust](https://rust-lang.org/) is an awesome *systems programming* language!"; -/// let rendered = readme_to_html(text, "README.md", None)?; -/// ``` -pub fn readme_to_html(text: &str, readme_path: &str, base_url: Option<&str>) -> String { - let readme_path = Path::new(readme_path); - let readme_dir = readme_path.parent().and_then(|p| p.to_str()).unwrap_or(""); - - if readme_path.extension().is_none() { - return markdown_to_html(text, base_url, readme_dir); - } - - if let Some(ext) = readme_path.extension().and_then(|ext| ext.to_str()) { - if MARKDOWN_EXTENSIONS.contains(&ext.to_lowercase().as_str()) { - return markdown_to_html(text, base_url, readme_dir); - } - } - - encode_minimal(text).replace("\n", "
\n") -} - #[swirl::background_job] pub fn render_and_upload_readme( conn: &PgConnection, @@ -294,297 +32,3 @@ pub fn render_and_upload_readme( Ok(()) }) } - -/// Helper function to build a new `HashSet` from the items slice. -fn hashset(items: &[T]) -> std::collections::HashSet -where - T: Clone + Eq + std::hash::Hash, -{ - items.iter().cloned().collect() -} - -/// Helper function to build a new `HashMap` from a slice of key-value pairs. -fn hashmap(items: &[(K, V)]) -> std::collections::HashMap -where - K: Clone + Eq + std::hash::Hash, - V: Clone, -{ - items.iter().cloned().collect() -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn empty_text() { - let text = ""; - let result = markdown_to_html(text, None, ""); - assert_eq!(result, ""); - } - - #[test] - fn text_with_script_tag() { - let text = "foo_readme\n\n"; - let result = markdown_to_html(text, None, ""); - assert_eq!( - result, - "

foo_readme

\n<script>alert(\'Hello World\')</script>\n" - ); - } - - #[test] - fn text_with_iframe_tag() { - let text = "foo_readme\n\n"; - let result = markdown_to_html(text, None, ""); - assert_eq!( - result, - "

foo_readme

\n<iframe>alert(\'Hello World\')</iframe>\n" - ); - } - - #[test] - fn text_with_unknown_tag() { - let text = "foo_readme\n\nalert('Hello World')"; - let result = markdown_to_html(text, None, ""); - assert_eq!(result, "

foo_readme

\n

alert(\'Hello World\')

\n"); - } - - #[test] - fn text_with_inline_javascript() { - let text = r#"foo_readme\n\nCrate page"#; - let result = markdown_to_html(text, None, ""); - assert_eq!( - result, - "

foo_readme\\n\\nCrate page

\n" - ); - } - - // See https://github.com/kivikakk/comrak/issues/37. This panic happened - // in comrak 0.1.8 but was fixed in 0.1.9. - #[test] - fn text_with_fancy_single_quotes() { - let text = "wb’"; - let result = markdown_to_html(text, None, ""); - assert_eq!(result, "

wb’

\n"); - } - - #[test] - fn code_block_with_syntax_highlighting() { - let code_block = r#"```rust \ - println!("Hello World"); \ - ```"#; - let result = markdown_to_html(code_block, None, ""); - assert!(result.contains("")); - } - - #[test] - fn code_block_with_syntax_highlighting_even_if_annot_has_no_run() { - let code_block = r#"```rust , no_run \ - println!("Hello World"); \ - ```"#; - let result = markdown_to_html(code_block, None, ""); - assert!(result.contains("")); - } - - #[test] - fn text_with_forbidden_class_attribute() { - let text = "

Hello World!

"; - let result = markdown_to_html(text, None, ""); - assert_eq!(result, "

Hello World!

\n"); - } - - #[test] - fn relative_links() { - let absolute = "[hi](/hi)"; - let relative = "[there](there)"; - let image = "![alt](img.png)"; - let html_image = "\"alt\""; - let svg = "![alt](sanitize.svg)"; - - for host in &["github.com", "gitlab.com", "bitbucket.org"] { - for (&extra_slash, &dot_git) in [true, false].iter().zip(&[true, false]) { - let url = format!( - "https://{}/rust-lang/test{}{}", - host, - if dot_git { ".git" } else { "" }, - if extra_slash { "/" } else { "" }, - ); - - let result = markdown_to_html(absolute, Some(&url), ""); - assert_eq!( - result, - format!( - "

hi

\n", - host - ) - ); - - let result = markdown_to_html(relative, Some(&url), ""); - assert_eq!( - result, - format!( - "

there

\n", - host - ) - ); - - let result = markdown_to_html(image, Some(&url), ""); - assert_eq!( - result, - format!( - "

\"alt\"

\n", - host - ) - ); - - let result = markdown_to_html(html_image, Some(&url), ""); - assert_eq!( - result, - format!( - "\"alt\"\n", - host - ) - ); - - let result = markdown_to_html(svg, Some(&url), ""); - assert_eq!( - result, - format!( - "

\"alt\"

\n", - host - ) - ); - - let result = markdown_to_html(svg, Some(&url), "subdir"); - assert_eq!( - result, - format!( - "

\"alt\"

\n", - host - ) - ); - - let result = markdown_to_html(svg, Some(&url), "subdir1/subdir2"); - assert_eq!( - result, - format!( - "

\"alt\"

\n", - host - ) - ); - } - } - - let result = markdown_to_html(absolute, Some("https://google.com/"), ""); - assert_eq!( - result, - "

hi

\n" - ); - } - - #[test] - fn absolute_links_dont_get_resolved() { - let readme_text = - "[![Crates.io](https://img.shields.io/crates/v/clap.svg)](https://crates.io/crates/clap)"; - let repository = "https://github.com/kbknapp/clap-rs/"; - let result = markdown_to_html(readme_text, Some(repository), ""); - - assert_eq!( - result, - "

\"Crates.io\"

\n" - ); - } - - #[test] - fn readme_to_html_renders_markdown() { - for f in &[ - "README", - "readme.md", - "README.MARKDOWN", - "whatever.mkd", - "s/readme.md", - "s1/s2/readme.md", - ] { - assert_eq!( - readme_to_html("*lobster*", f, None), - "

lobster

\n" - ); - } - - assert_eq!( - readme_to_html("*[lobster](docs/lobster)*", "readme.md", Some("https://github.com/rust-lang/test")), - "

lobster

\n" - ); - assert_eq!( - readme_to_html("*[lobster](docs/lobster)*", "s/readme.md", Some("https://github.com/rust-lang/test")), - "

lobster

\n" - ); - assert_eq!( - readme_to_html("*[lobster](docs/lobster)*", "s1/s2/readme.md", Some("https://github.com/rust-lang/test")), - "

lobster

\n" - ); - } - - #[test] - fn readme_to_html_renders_other_things() { - for f in &["readme.exe", "readem.org", "blah.adoc"] { - assert_eq!( - readme_to_html("\n\nis my friend\n", f, None), - "<script>lobster</script>
\n
\nis my friend
\n" - ); - } - } - - #[test] - fn header_has_tags() { - let text = "# My crate\n\nHello, world!\n"; - let result = markdown_to_html(text, None, ""); - assert_eq!( - result, - "

My crate

\n

Hello, world!

\n" - ); - } - - #[test] - fn manual_anchor_is_sanitized() { - let text = - "

My crate

\n

Hello, world!

\n"; - let result = markdown_to_html(text, None, ""); - assert_eq!( - result, - "

My crate

\n

Hello, world!

\n" - ); - } - - #[test] - fn tables_with_rowspan_and_colspan() { - let text = "
Target
\n"; - let result = markdown_to_html(text, None, ""); - assert_eq!( - result, - "
Target
\n" - ); - } - - #[test] - fn text_alignment() { - let text = "

foo-bar

\n
Hello World!
\n"; - let result = markdown_to_html(text, None, ""); - assert_eq!( - result, - "

foo-bar

\n
Hello World!
\n" - ); - } - - #[test] - fn image_alignment() { - let text = - "

\"\"

\n"; - let result = markdown_to_html(text, None, ""); - assert_eq!( - result, - "

\"\"

\n" - ); - } -}