Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add heuristic checking for HTML anchors #1716

Merged
merged 2 commits into from
Jan 10, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 1 addition & 6 deletions components/config/src/config/link_checker.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use serde_derive::{Deserialize, Serialize};

#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(default)]
pub struct LinkChecker {
/// Skip link checking for these URL prefixes
Expand All @@ -9,8 +9,3 @@ pub struct LinkChecker {
pub skip_anchor_prefixes: Vec<String>,
}

impl Default for LinkChecker {
fn default() -> LinkChecker {
LinkChecker { skip_prefixes: Vec::new(), skip_anchor_prefixes: Vec::new() }
}
}
7 changes: 7 additions & 0 deletions components/library/src/content/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ pub use self::ser::{SerializingPage, SerializingSection};

use config::Config;
use rendering::Heading;
use utils::links::anchor_id_checks;

pub fn has_anchor(headings: &[Heading], anchor: &str) -> bool {
for heading in headings {
Expand All @@ -28,6 +29,12 @@ pub fn has_anchor(headings: &[Heading], anchor: &str) -> bool {
false
}


pub fn has_anchor_id(content: &str, anchor: &str) -> bool {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would probably move this function to utils directly since we will only ever do is_match

let checks = anchor_id_checks(anchor);
checks.is_match(content)
}

/// Looks into the current folder for the path and see if there's anything that is not a .md
/// file. Those will be copied next to the rendered .html file
/// If `recursive` is set to `true`, it will add all subdirectories assets as well. This should
Expand Down
6 changes: 6 additions & 0 deletions components/library/src/content/page.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ use crate::content::ser::SerializingPage;
use crate::content::{find_related_assets, has_anchor};
use utils::fs::read_file;

use super::has_anchor_id;

lazy_static! {
// Based on https://regex101.com/r/H2n38Z/1/tests
// A regex parsing RFC3339 date followed by {_,-}, some characters and ended by .md
Expand Down Expand Up @@ -300,6 +302,10 @@ impl Page {
has_anchor(&self.toc, anchor)
}

pub fn has_anchor_id(&self, id: &str) -> bool {
has_anchor_id(&self.content, id)
}

pub fn to_serialized<'a>(&'a self, library: &'a Library) -> SerializingPage<'a> {
SerializingPage::from_page(self, library)
}
Expand Down
1 change: 1 addition & 0 deletions components/link_checker/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ lazy_static = "1"

config = { path = "../config" }
errors = { path = "../errors" }
utils = { path = "../utils" }

[dependencies.reqwest]
version = "0.11"
Expand Down
22 changes: 5 additions & 17 deletions components/link_checker/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ use lazy_static::lazy_static;
use reqwest::header::{HeaderMap, ACCEPT};
use reqwest::{blocking::Client, StatusCode};

use utils::links::anchor_id_checks;
use config::LinkChecker;

use std::collections::HashMap;
Expand Down Expand Up @@ -104,22 +105,9 @@ fn has_anchor(url: &str) -> bool {
fn check_page_for_anchor(url: &str, body: String) -> errors::Result<()> {
let index = url.find('#').unwrap();
let anchor = url.get(index + 1..).unwrap();
let checks = [
format!(" id={}", anchor),
format!(" ID={}", anchor),
format!(" id='{}'", anchor),
format!(" ID='{}'", anchor),
format!(r#" id="{}""#, anchor),
format!(r#" ID="{}""#, anchor),
format!(" name={}", anchor),
format!(" NAME={}", anchor),
format!(" name='{}'", anchor),
format!(" NAME='{}'", anchor),
format!(r#" name="{}""#, anchor),
format!(r#" NAME="{}""#, anchor),
];

if checks.iter().any(|check| body[..].contains(&check[..])) {
let checks = anchor_id_checks(anchor);

if checks.is_match(&body){
Ok(())
} else {
Err(errors::Error::from(format!("Anchor `#{}` not found on page", anchor)))
Expand Down Expand Up @@ -338,7 +326,7 @@ mod tests {
#[test]
fn skip_anchor_prefixes() {
let ignore_url = format!("{}{}", mockito::server_url(), "/ignore/");
let config = LinkChecker { skip_prefixes: vec![], skip_anchor_prefixes: vec![ignore_url] };
let config = LinkChecker { skip_anchor_prefixes: vec![ignore_url], ..Default::default() };

let _m1 = mock("GET", "/ignore/i30hobj1cy")
.with_header("Content-Type", "text/html")
Expand Down
3 changes: 2 additions & 1 deletion components/site/src/link_checking.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@ pub fn check_internal_links_with_anchors(site: &Site) -> Result<()> {
let page = library
.get_page(&full_path)
.expect("Couldn't find section in check_internal_links_with_anchors");
!page.has_anchor(anchor)

!(page.has_anchor(anchor)||page.has_anchor_id(anchor))
}
});

Expand Down
1 change: 1 addition & 0 deletions components/utils/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ include = ["src/**/*"]
tera = "1"
unicode-segmentation = "1.2"
walkdir = "2"
regex="1"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: space

toml = "0.5"
serde = { version = "1.0", features = ["derive"] }
slug = "0.1"
Expand Down
1 change: 1 addition & 0 deletions components/utils/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
pub mod de;
pub mod fs;
pub mod links;
pub mod minify;
pub mod net;
pub mod site;
Expand Down
42 changes: 42 additions & 0 deletions components/utils/src/links.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
use regex::Regex;


pub fn anchor_id_checks(anchor:&str) -> Regex {
Regex::new(
&format!(r#" (?i)(id|ID|name|NAME) *= *("|')*{}("|'| |>)+"#, anchor)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we're setting the ?i flag, we can get rid of the multiple cases in the first part of the regex

).unwrap()
}


#[cfg(test)]
mod tests{
use super::anchor_id_checks;

fn check(anchor:&str, content:&str) -> bool {
anchor_id_checks(anchor).is_match(content)
}

#[test]
fn matchers () {
let m = |content| {check("fred", content)};

// Canonical match/non match
assert!(m(r#"<a name="fred">"#));
assert!(m(r#"<a id="fred">"#));
assert!(!m(r#"<a name="george">"#));

// Whitespace variants
assert!(m(r#"<a id ="fred">"#));
assert!(m(r#"<a id = "fred">"#));
assert!(m(r#"<a id="fred" >"#));
assert!(m(r#"<a id="fred" >"#));

// Quote variants
assert!(m(r#"<a id='fred'>"#));
assert!(m(r#"<a id=fred>"#));

// Case variants
assert!(m(r#"<a ID="fred">"#));
assert!(m(r#"<a iD="fred">"#));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe add some other tags, so readers can easily understand, that the code also works with any element with id?

}
}