From a4733333c4d0699d498b356eb4f0494c1c83d36e Mon Sep 17 00:00:00 2001 From: Phillip Lord Date: Mon, 3 Jan 2022 12:10:42 +0000 Subject: [PATCH 1/2] Add heuristic checking for HTML anchors Previously only anchors specified or generated in markdown could be linked to, without complaint from the link checker. We now use a simple heuristic check for `name` or `id` attributes. Duplicate code has been refactored and all XML anchor checks updated to use regex rather than substring match. --- Cargo.lock | 2 + components/config/src/config/link_checker.rs | 7 +--- components/library/src/content/mod.rs | 7 ++++ components/library/src/content/page.rs | 6 +++ components/link_checker/Cargo.toml | 1 + components/link_checker/src/lib.rs | 22 +++------- components/site/src/link_checking.rs | 3 +- components/utils/Cargo.toml | 1 + components/utils/src/lib.rs | 1 + components/utils/src/links.rs | 42 ++++++++++++++++++++ 10 files changed, 68 insertions(+), 24 deletions(-) create mode 100644 components/utils/src/links.rs diff --git a/Cargo.lock b/Cargo.lock index 8ac6dfc2d4..8410a40aca 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1367,6 +1367,7 @@ dependencies = [ "lazy_static", "mockito", "reqwest", + "utils", ] [[package]] @@ -3264,6 +3265,7 @@ dependencies = [ "filetime", "minify-html", "percent-encoding", + "regex", "serde", "slug", "tempfile", diff --git a/components/config/src/config/link_checker.rs b/components/config/src/config/link_checker.rs index 9d597d498b..2e81083bbd 100644 --- a/components/config/src/config/link_checker.rs +++ b/components/config/src/config/link_checker.rs @@ -1,6 +1,6 @@ use serde_derive::{Deserialize, Serialize}; -#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)] #[serde(default)] pub struct LinkChecker { /// Skip link checking for these URL prefixes @@ -9,8 +9,3 @@ pub struct LinkChecker { pub skip_anchor_prefixes: Vec, } -impl Default for LinkChecker { - fn default() -> LinkChecker { - LinkChecker { skip_prefixes: Vec::new(), skip_anchor_prefixes: Vec::new() } - } -} diff --git a/components/library/src/content/mod.rs b/components/library/src/content/mod.rs index 8a16597424..6c844f5679 100644 --- a/components/library/src/content/mod.rs +++ b/components/library/src/content/mod.rs @@ -14,6 +14,7 @@ pub use self::ser::{SerializingPage, SerializingSection}; use config::Config; use rendering::Heading; +use utils::links::anchor_id_checks; pub fn has_anchor(headings: &[Heading], anchor: &str) -> bool { for heading in headings { @@ -28,6 +29,12 @@ pub fn has_anchor(headings: &[Heading], anchor: &str) -> bool { false } + +pub fn has_anchor_id(content: &str, anchor: &str) -> bool { + let checks = anchor_id_checks(anchor); + checks.is_match(content) +} + /// Looks into the current folder for the path and see if there's anything that is not a .md /// file. Those will be copied next to the rendered .html file /// If `recursive` is set to `true`, it will add all subdirectories assets as well. This should diff --git a/components/library/src/content/page.rs b/components/library/src/content/page.rs index 64d9a82c52..5a631edd13 100644 --- a/components/library/src/content/page.rs +++ b/components/library/src/content/page.rs @@ -21,6 +21,8 @@ use crate::content::ser::SerializingPage; use crate::content::{find_related_assets, has_anchor}; use utils::fs::read_file; +use super::has_anchor_id; + lazy_static! { // Based on https://regex101.com/r/H2n38Z/1/tests // A regex parsing RFC3339 date followed by {_,-}, some characters and ended by .md @@ -300,6 +302,10 @@ impl Page { has_anchor(&self.toc, anchor) } + pub fn has_anchor_id(&self, id: &str) -> bool { + has_anchor_id(&self.content, id) + } + pub fn to_serialized<'a>(&'a self, library: &'a Library) -> SerializingPage<'a> { SerializingPage::from_page(self, library) } diff --git a/components/link_checker/Cargo.toml b/components/link_checker/Cargo.toml index 15de74d71c..4b4bca42d4 100644 --- a/components/link_checker/Cargo.toml +++ b/components/link_checker/Cargo.toml @@ -9,6 +9,7 @@ lazy_static = "1" config = { path = "../config" } errors = { path = "../errors" } +utils = { path = "../utils" } [dependencies.reqwest] version = "0.11" diff --git a/components/link_checker/src/lib.rs b/components/link_checker/src/lib.rs index 1d56db2c61..de62acc2ec 100644 --- a/components/link_checker/src/lib.rs +++ b/components/link_checker/src/lib.rs @@ -2,6 +2,7 @@ use lazy_static::lazy_static; use reqwest::header::{HeaderMap, ACCEPT}; use reqwest::{blocking::Client, StatusCode}; +use utils::links::anchor_id_checks; use config::LinkChecker; use std::collections::HashMap; @@ -104,22 +105,9 @@ fn has_anchor(url: &str) -> bool { fn check_page_for_anchor(url: &str, body: String) -> errors::Result<()> { let index = url.find('#').unwrap(); let anchor = url.get(index + 1..).unwrap(); - let checks = [ - format!(" id={}", anchor), - format!(" ID={}", anchor), - format!(" id='{}'", anchor), - format!(" ID='{}'", anchor), - format!(r#" id="{}""#, anchor), - format!(r#" ID="{}""#, anchor), - format!(" name={}", anchor), - format!(" NAME={}", anchor), - format!(" name='{}'", anchor), - format!(" NAME='{}'", anchor), - format!(r#" name="{}""#, anchor), - format!(r#" NAME="{}""#, anchor), - ]; - - if checks.iter().any(|check| body[..].contains(&check[..])) { + let checks = anchor_id_checks(anchor); + + if checks.is_match(&body){ Ok(()) } else { Err(errors::Error::from(format!("Anchor `#{}` not found on page", anchor))) @@ -338,7 +326,7 @@ mod tests { #[test] fn skip_anchor_prefixes() { let ignore_url = format!("{}{}", mockito::server_url(), "/ignore/"); - let config = LinkChecker { skip_prefixes: vec![], skip_anchor_prefixes: vec![ignore_url] }; + let config = LinkChecker { skip_anchor_prefixes: vec![ignore_url], ..Default::default() }; let _m1 = mock("GET", "/ignore/i30hobj1cy") .with_header("Content-Type", "text/html") diff --git a/components/site/src/link_checking.rs b/components/site/src/link_checking.rs index ae2ecb6d31..b42b2d1a62 100644 --- a/components/site/src/link_checking.rs +++ b/components/site/src/link_checking.rs @@ -63,7 +63,8 @@ pub fn check_internal_links_with_anchors(site: &Site) -> Result<()> { let page = library .get_page(&full_path) .expect("Couldn't find section in check_internal_links_with_anchors"); - !page.has_anchor(anchor) + + !(page.has_anchor(anchor)||page.has_anchor_id(anchor)) } }); diff --git a/components/utils/Cargo.toml b/components/utils/Cargo.toml index 17ee05ae31..4773a441be 100644 --- a/components/utils/Cargo.toml +++ b/components/utils/Cargo.toml @@ -9,6 +9,7 @@ include = ["src/**/*"] tera = "1" unicode-segmentation = "1.2" walkdir = "2" +regex="1" toml = "0.5" serde = { version = "1.0", features = ["derive"] } slug = "0.1" diff --git a/components/utils/src/lib.rs b/components/utils/src/lib.rs index eac2325dc3..2a8162d0e1 100644 --- a/components/utils/src/lib.rs +++ b/components/utils/src/lib.rs @@ -1,5 +1,6 @@ pub mod de; pub mod fs; +pub mod links; pub mod minify; pub mod net; pub mod site; diff --git a/components/utils/src/links.rs b/components/utils/src/links.rs new file mode 100644 index 0000000000..72cfcdb65a --- /dev/null +++ b/components/utils/src/links.rs @@ -0,0 +1,42 @@ +use regex::Regex; + + +pub fn anchor_id_checks(anchor:&str) -> Regex { + Regex::new( + &format!(r#" (?i)(id|ID|name|NAME) *= *("|')*{}("|'| |>)+"#, anchor) + ).unwrap() +} + + +#[cfg(test)] +mod tests{ + use super::anchor_id_checks; + + fn check(anchor:&str, content:&str) -> bool { + anchor_id_checks(anchor).is_match(content) + } + + #[test] + fn matchers () { + let m = |content| {check("fred", content)}; + + // Canonical match/non match + assert!(m(r#""#)); + assert!(m(r#""#)); + assert!(!m(r#""#)); + + // Whitespace variants + assert!(m(r#""#)); + assert!(m(r#""#)); + assert!(m(r#""#)); + assert!(m(r#""#)); + + // Quote variants + assert!(m(r#""#)); + assert!(m(r#""#)); + + // Case variants + assert!(m(r#""#)); + assert!(m(r#""#)); + } +} From e538e4a3d06c22dbe761340d24254b3093b4e1a2 Mon Sep 17 00:00:00 2001 From: Phillip Lord Date: Sun, 9 Jan 2022 14:25:40 +0000 Subject: [PATCH 2/2] Fix regexp and refactor --- components/library/src/content/mod.rs | 7 ------- components/library/src/content/page.rs | 3 +-- components/link_checker/src/lib.rs | 6 +++--- components/utils/src/links.rs | 9 +++++++-- 4 files changed, 11 insertions(+), 14 deletions(-) diff --git a/components/library/src/content/mod.rs b/components/library/src/content/mod.rs index 6c844f5679..8a16597424 100644 --- a/components/library/src/content/mod.rs +++ b/components/library/src/content/mod.rs @@ -14,7 +14,6 @@ pub use self::ser::{SerializingPage, SerializingSection}; use config::Config; use rendering::Heading; -use utils::links::anchor_id_checks; pub fn has_anchor(headings: &[Heading], anchor: &str) -> bool { for heading in headings { @@ -29,12 +28,6 @@ pub fn has_anchor(headings: &[Heading], anchor: &str) -> bool { false } - -pub fn has_anchor_id(content: &str, anchor: &str) -> bool { - let checks = anchor_id_checks(anchor); - checks.is_match(content) -} - /// Looks into the current folder for the path and see if there's anything that is not a .md /// file. Those will be copied next to the rendered .html file /// If `recursive` is set to `true`, it will add all subdirectories assets as well. This should diff --git a/components/library/src/content/page.rs b/components/library/src/content/page.rs index 5a631edd13..26c5340846 100644 --- a/components/library/src/content/page.rs +++ b/components/library/src/content/page.rs @@ -20,8 +20,7 @@ use crate::content::file_info::FileInfo; use crate::content::ser::SerializingPage; use crate::content::{find_related_assets, has_anchor}; use utils::fs::read_file; - -use super::has_anchor_id; +use utils::links::has_anchor_id; lazy_static! { // Based on https://regex101.com/r/H2n38Z/1/tests diff --git a/components/link_checker/src/lib.rs b/components/link_checker/src/lib.rs index de62acc2ec..48502d7b3a 100644 --- a/components/link_checker/src/lib.rs +++ b/components/link_checker/src/lib.rs @@ -2,12 +2,12 @@ use lazy_static::lazy_static; use reqwest::header::{HeaderMap, ACCEPT}; use reqwest::{blocking::Client, StatusCode}; -use utils::links::anchor_id_checks; use config::LinkChecker; use std::collections::HashMap; use std::result; use std::sync::{Arc, RwLock}; +use utils::links::has_anchor_id; pub type Result = result::Result; @@ -105,9 +105,9 @@ fn has_anchor(url: &str) -> bool { fn check_page_for_anchor(url: &str, body: String) -> errors::Result<()> { let index = url.find('#').unwrap(); let anchor = url.get(index + 1..).unwrap(); - let checks = anchor_id_checks(anchor); - if checks.is_match(&body){ + + if has_anchor_id(&body, &anchor){ Ok(()) } else { Err(errors::Error::from(format!("Anchor `#{}` not found on page", anchor))) diff --git a/components/utils/src/links.rs b/components/utils/src/links.rs index 72cfcdb65a..3e18b2ada0 100644 --- a/components/utils/src/links.rs +++ b/components/utils/src/links.rs @@ -1,9 +1,14 @@ use regex::Regex; -pub fn anchor_id_checks(anchor:&str) -> Regex { +pub fn has_anchor_id(content: &str, anchor: &str) -> bool { + let checks = anchor_id_checks(anchor); + checks.is_match(content) +} + +fn anchor_id_checks(anchor:&str) -> Regex { Regex::new( - &format!(r#" (?i)(id|ID|name|NAME) *= *("|')*{}("|'| |>)+"#, anchor) + &format!(r#" (?i)(id|name) *= *("|')*{}("|'| |>)+"#, anchor) ).unwrap() }