Merge pull request #2308 from ehuss/pulldown_cmark-0.10
Update pulldown_cmark to 0.10
ehuss authored Feb 5, 2024
2 parents d48810f + 42e635b commit 600824b
Showing 8 changed files with 366 additions and 54 deletions.
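
Most of the diff below is mechanical adaptation to pulldown-cmark 0.10's reworked event model: container tags such as `Heading`, `Link`, and `Image` became struct variants with named fields, `Event::End` now carries a lightweight `TagEnd` instead of a full `Tag`, and `Parser` dropped its second lifetime parameter. A minimal before/after sketch of the pattern, assuming only the 0.10 API exercised in this diff (`h1_titles` and its input are illustrative, not part of the commit):

```rust
use pulldown_cmark::{Event, HeadingLevel, Parser, Tag, TagEnd};

/// Collect the text of every H1 in a Markdown string, written against the
/// 0.10 event model used throughout this PR.
fn h1_titles(markdown_text: &str) -> Vec<String> {
    let mut titles = Vec::new();
    let mut current: Option<String> = None;
    for event in Parser::new(markdown_text) {
        match event {
            // 0.9: Event::Start(Tag::Heading(HeadingLevel::H1, id, classes))
            // 0.10: a struct variant with named fields.
            Event::Start(Tag::Heading { level: HeadingLevel::H1, .. }) => {
                current = Some(String::new());
            }
            // 0.9: Event::End(Tag::Heading(..))
            // 0.10: end events carry a TagEnd, not a Tag.
            Event::End(TagEnd::Heading(HeadingLevel::H1)) => {
                if let Some(title) = current.take() {
                    titles.push(title);
                }
            }
            Event::Text(text) => {
                if let Some(buf) = current.as_mut() {
                    buf.push_str(&text);
                }
            }
            _ => {}
        }
    }
    titles
}
```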
13 changes: 10 additions & 3 deletions Cargo.lock

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion Cargo.toml
@@ -27,7 +27,7 @@ handlebars = "5.0"
log = "0.4.17"
memchr = "2.5.0"
opener = "0.6.1"
pulldown-cmark = { version = "0.9.3", default-features = false }
pulldown-cmark = { version = "0.10.0", default-features = false, features = ["html"] }
regex = "1.8.1"
serde = { version = "1.0.163", features = ["derive"] }
serde_json = "1.0.96"
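
With `default-features = false`, 0.10 only builds its HTML renderer when the new `"html"` feature is requested, hence the added feature flag. A small sketch of the rendering path that the feature gates, mirroring the options `new_cmark_parser` enables further down in this diff (the `render` wrapper itself is illustrative):

```rust
use pulldown_cmark::{html, Options, Parser};

// Render Markdown to HTML. `html::push_html` is only available when the
// crate's "html" feature is enabled, as done in the Cargo.toml change above.
fn render(text: &str) -> String {
    let mut opts = Options::empty();
    opts.insert(Options::ENABLE_TABLES);
    opts.insert(Options::ENABLE_FOOTNOTES);
    let parser = Parser::new_ext(text, opts);
    let mut out = String::new();
    html::push_html(&mut out, parser);
    out
}
```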
55 changes: 36 additions & 19 deletions src/book/summary.rs
@@ -1,7 +1,7 @@
use crate::errors::*;
use log::{debug, trace, warn};
use memchr::{self, Memchr};
use pulldown_cmark::{self, Event, HeadingLevel, Tag};
use memchr::Memchr;
use pulldown_cmark::{DefaultBrokenLinkCallback, Event, HeadingLevel, Tag, TagEnd};
use serde::{Deserialize, Serialize};
use std::fmt::{self, Display, Formatter};
use std::iter::FromIterator;
@@ -163,7 +163,7 @@ impl From<Link> for SummaryItem {
/// > match the following regex: "[^<>\n[]]+".
struct SummaryParser<'a> {
src: &'a str,
stream: pulldown_cmark::OffsetIter<'a, 'a>,
stream: pulldown_cmark::OffsetIter<'a, DefaultBrokenLinkCallback>,
offset: usize,

/// We can't actually put an event back into the `OffsetIter` stream, so instead we store it
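
The field's type changes because `OffsetIter`, like `Parser`, dropped its second lifetime and instead takes a broken-link-callback type parameter; `DefaultBrokenLinkCallback` fills that slot when no callback is installed. A sketch of naming the type in a struct of your own (`MdStream` is hypothetical; the field type and construction match the parser in this file):

```rust
use pulldown_cmark::{DefaultBrokenLinkCallback, OffsetIter, Parser};

struct MdStream<'a> {
    // One lifetime plus the callback type parameter. DefaultBrokenLinkCallback
    // is what `Parser::new(..).into_offset_iter()` yields when no broken-link
    // callback has been registered.
    stream: OffsetIter<'a, DefaultBrokenLinkCallback>,
}

impl<'a> MdStream<'a> {
    fn new(text: &'a str) -> Self {
        MdStream {
            stream: Parser::new(text).into_offset_iter(),
        }
    }
}
```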
@@ -210,7 +210,7 @@ macro_rules! collect_events {
}

impl<'a> SummaryParser<'a> {
fn new(text: &str) -> SummaryParser<'_> {
fn new(text: &'a str) -> SummaryParser<'a> {
let pulldown_parser = pulldown_cmark::Parser::new(text).into_offset_iter();

SummaryParser {
@@ -265,7 +265,12 @@ impl<'a> SummaryParser<'a> {
loop {
match self.next_event() {
Some(ev @ Event::Start(Tag::List(..)))
| Some(ev @ Event::Start(Tag::Heading(HeadingLevel::H1, ..))) => {
| Some(
ev @ Event::Start(Tag::Heading {
level: HeadingLevel::H1,
..
}),
) => {
if is_prefix {
// we've finished prefix chapters and are at the start
// of the numbered section.
@@ -275,8 +280,8 @@ impl<'a> SummaryParser<'a> {
bail!(self.parse_error("Suffix chapters cannot be followed by a list"));
}
}
Some(Event::Start(Tag::Link(_type, href, _title))) => {
let link = self.parse_link(href.to_string());
Some(Event::Start(Tag::Link { dest_url, .. })) => {
let link = self.parse_link(dest_url.to_string());
items.push(SummaryItem::Link(link));
}
Some(Event::Rule) => items.push(SummaryItem::Separator),
@@ -304,10 +309,13 @@ impl<'a> SummaryParser<'a> {
break;
}

Some(Event::Start(Tag::Heading(HeadingLevel::H1, ..))) => {
Some(Event::Start(Tag::Heading {
level: HeadingLevel::H1,
..
})) => {
debug!("Found a h1 in the SUMMARY");

let tags = collect_events!(self.stream, end Tag::Heading(HeadingLevel::H1, ..));
let tags = collect_events!(self.stream, end TagEnd::Heading(HeadingLevel::H1));
Some(stringify_events(tags))
}

@@ -336,7 +344,7 @@ impl<'a> SummaryParser<'a> {
/// Finishes parsing a link once the `Event::Start(Tag::Link(..))` has been opened.
fn parse_link(&mut self, href: String) -> Link {
let href = href.replace("%20", " ");
let link_content = collect_events!(self.stream, end Tag::Link(..));
let link_content = collect_events!(self.stream, end TagEnd::Link);
let name = stringify_events(link_content);

let path = if href.is_empty() {
@@ -377,7 +385,12 @@ impl<'a> SummaryParser<'a> {
}
// The expectation is that pulldown cmark will terminate a paragraph before a new
// heading, so we can always count on this to return without skipping headings.
Some(ev @ Event::Start(Tag::Heading(HeadingLevel::H1, ..))) => {
Some(
ev @ Event::Start(Tag::Heading {
level: HeadingLevel::H1,
..
}),
) => {
// we're starting a new part
self.back(ev);
break;
@@ -398,7 +411,7 @@ impl<'a> SummaryParser<'a> {

// Skip over the contents of this tag
while let Some(event) = self.next_event() {
if event == Event::End(other_tag.clone()) {
if event == Event::End(other_tag.clone().into()) {
break;
}
}
@@ -469,7 +482,7 @@ impl<'a> SummaryParser<'a> {

last_item.nested_items = sub_items;
}
Some(Event::End(Tag::List(..))) => break,
Some(Event::End(TagEnd::List(..))) => break,
Some(_) => {}
None => break,
}
@@ -486,8 +499,8 @@ impl<'a> SummaryParser<'a> {
loop {
match self.next_event() {
Some(Event::Start(Tag::Paragraph)) => continue,
Some(Event::Start(Tag::Link(_type, href, _title))) => {
let mut link = self.parse_link(href.to_string());
Some(Event::Start(Tag::Link { dest_url, .. })) => {
let mut link = self.parse_link(dest_url.to_string());

let mut number = parent.clone();
number.0.push(num_existing_items as u32 + 1);
@@ -529,14 +542,18 @@ impl<'a> SummaryParser<'a> {
fn parse_title(&mut self) -> Option<String> {
loop {
match self.next_event() {
Some(Event::Start(Tag::Heading(HeadingLevel::H1, ..))) => {
Some(Event::Start(Tag::Heading {
level: HeadingLevel::H1,
..
})) => {
debug!("Found a h1 in the SUMMARY");

let tags = collect_events!(self.stream, end Tag::Heading(HeadingLevel::H1, ..));
let tags = collect_events!(self.stream, end TagEnd::Heading(HeadingLevel::H1));
return Some(stringify_events(tags));
}
// Skip a HTML element such as a comment line.
Some(Event::Html(_)) => {}
Some(Event::Html(_) | Event::InlineHtml(_))
| Some(Event::Start(Tag::HtmlBlock) | Event::End(TagEnd::HtmlBlock)) => {}
// Otherwise, no title.
Some(ev) => {
self.back(ev);
@@ -744,7 +761,7 @@ mod tests {
let _ = parser.stream.next(); // Discard opening paragraph

let href = match parser.stream.next() {
Some((Event::Start(Tag::Link(_type, href, _title)), _range)) => href.to_string(),
Some((Event::Start(Tag::Link { dest_url, .. }), _range)) => dest_url.to_string(),
other => panic!("Unreachable, {:?}", other),
};

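
Three patterns recur in the summary.rs changes above: start tags are matched as struct variants, `collect_events!` now terminates on a `TagEnd` rather than a `Tag`, and a stored start tag is converted with `.into()` when its matching end event is needed. A condensed sketch of the last two outside the parser, assuming only the conversions shown in this diff (`skip_to_end` and `link_dest` are illustrative helpers, not mdBook code):

```rust
use pulldown_cmark::{Event, Tag, TagEnd};

// Skip everything up to and including the end event that closes `start`,
// the way the parser above discards an unexpected container. In 0.10 the
// bridge from a start Tag to its end marker is `TagEnd: From<Tag>`.
fn skip_to_end<'a>(events: &mut impl Iterator<Item = Event<'a>>, start: Tag<'a>) {
    let end = Event::End(start.into());
    for event in events {
        if event == end {
            break;
        }
    }
}

// Link destinations moved from a positional tuple field to the named
// `dest_url` field of the struct variant.
fn link_dest(event: &Event<'_>) -> Option<String> {
    match event {
        Event::Start(Tag::Link { dest_url, .. }) => Some(dest_url.to_string()),
        _ => None,
    }
}
```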
54 changes: 41 additions & 13 deletions src/renderer/html_handlebars/search.rs
@@ -66,10 +66,23 @@ fn add_doc(
index: &mut Index,
doc_urls: &mut Vec<String>,
anchor_base: &str,
section_id: &Option<String>,
heading: &str,
id_counter: &mut HashMap<String, usize>,
section_id: &Option<CowStr<'_>>,
items: &[&str],
) {
let url = if let Some(ref id) = *section_id {
// Either use the explicit section id the user specified, or generate one
// from the heading content.
let section_id = section_id.as_ref().map(|id| id.to_string()).or_else(|| {
if heading.is_empty() {
// In the case where a chapter has no heading, don't set a section id.
None
} else {
Some(utils::unique_id_from_content(heading, id_counter))
}
});

let url = if let Some(id) = section_id {
Cow::Owned(format!("{}#{}", anchor_base, id))
} else {
Cow::Borrowed(anchor_base)
@@ -119,30 +132,29 @@ fn render_item(
let mut id_counter = HashMap::new();
while let Some(event) = p.next() {
match event {
Event::Start(Tag::Heading(i, ..)) if i as u32 <= max_section_depth => {
Event::Start(Tag::Heading { level, id, .. }) if level as u32 <= max_section_depth => {
if !heading.is_empty() {
// Section finished, the next heading is following now
// Write the data to the index, and clear it for the next section
add_doc(
index,
doc_urls,
&anchor_base,
&heading,
&mut id_counter,
&section_id,
&[&heading, &body, &breadcrumbs.join(" » ")],
);
section_id = None;
heading.clear();
body.clear();
breadcrumbs.pop();
}

section_id = id;
in_heading = true;
}
Event::End(Tag::Heading(i, id, _classes)) if i as u32 <= max_section_depth => {
Event::End(TagEnd::Heading(level)) if level as u32 <= max_section_depth => {
in_heading = false;
section_id = id
.map(|id| id.to_string())
.or_else(|| Some(utils::unique_id_from_content(&heading, &mut id_counter)));
breadcrumbs.push(heading.clone());
}
Event::Start(Tag::FootnoteDefinition(name)) => {
@@ -159,9 +171,19 @@ fn render_item(
html_block.push_str(html);
p.next();
}

body.push_str(&clean_html(&html_block));
}
Event::InlineHtml(html) => {
// This is not capable of cleaning inline tags like
// `foo <script>…</script>`. The `<script>` tags show up as
// individual InlineHtml events, and the content inside is
// just a regular Text event. There isn't a very good way to
// know how to collect all the content in-between. I'm not
// sure if this is easily fixable. It should be extremely
// rare, since script and style tags should almost always be
// blocks, and worse case you have some noise in the index.
body.push_str(&clean_html(&html));
}
Event::Start(_) | Event::End(_) | Event::Rule | Event::SoftBreak | Event::HardBreak => {
// Insert spaces where HTML output would usually separate text
// to ensure words don't get merged together
Expand All @@ -188,18 +210,24 @@ fn render_item(
}

if !body.is_empty() || !heading.is_empty() {
if heading.is_empty() {
let title = if heading.is_empty() {
if let Some(chapter) = breadcrumbs.first() {
heading = chapter.clone();
chapter
} else {
""
}
}
} else {
&heading
};
// Make sure the last section is added to the index
add_doc(
index,
doc_urls,
&anchor_base,
&heading,
&mut id_counter,
&section_id,
&[&heading, &body, &breadcrumbs.join(" » ")],
&[title, &body, &breadcrumbs.join(" » ")],
);
}

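
Beyond the renames, two behavioural points change in the indexer above: an explicit heading id now arrives on `Event::Start(Tag::Heading { id, .. })` rather than on the end event, and inline HTML is delivered as its own `Event::InlineHtml` variant that needs the same cleaning as block HTML. A reduced sketch of that flow, assuming only the events used in this hunk (`headings_with_ids` is illustrative and skips the depth filtering and body collection the real indexer does):

```rust
use pulldown_cmark::{CowStr, Event, Options, Parser, Tag, TagEnd};

// Walk a chapter and record (section_id, heading_text) pairs the way the
// indexer above does: take the id when the heading starts, close the
// section when it ends.
fn headings_with_ids(text: &str) -> Vec<(Option<String>, String)> {
    let mut out = Vec::new();
    let mut section_id: Option<CowStr<'_>> = None;
    let mut heading = String::new();
    let mut in_heading = false;

    for event in Parser::new_ext(text, Options::ENABLE_HEADING_ATTRIBUTES) {
        match event {
            Event::Start(Tag::Heading { id, .. }) => {
                // 0.10 exposes an explicit `{#custom-id}` on the start tag.
                section_id = id;
                in_heading = true;
            }
            Event::End(TagEnd::Heading(_)) => {
                in_heading = false;
                out.push((
                    section_id.take().map(|id| id.to_string()),
                    std::mem::take(&mut heading),
                ));
            }
            Event::Text(t) | Event::Code(t) if in_heading => heading.push_str(&t),
            Event::InlineHtml(_) => {
                // The indexer above cleans inline HTML like block HTML before
                // adding it to the body text; omitted here.
            }
            _ => {}
        }
    }
    out
}
```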
35 changes: 26 additions & 9 deletions src/utils/mod.rs
@@ -6,7 +6,7 @@ pub(crate) mod toml_ext;
use crate::errors::Error;
use log::error;
use once_cell::sync::Lazy;
use pulldown_cmark::{html, CodeBlockKind, CowStr, Event, Options, Parser, Tag};
use pulldown_cmark::{html, CodeBlockKind, CowStr, Event, Options, Parser, Tag, TagEnd};
use regex::Regex;

use std::borrow::Cow;
@@ -161,13 +161,30 @@ fn adjust_links<'a>(event: Event<'a>, path: Option<&Path>) -> Event<'a> {
}

match event {
Event::Start(Tag::Link(link_type, dest, title)) => {
Event::Start(Tag::Link(link_type, fix(dest, path), title))
}
Event::Start(Tag::Image(link_type, dest, title)) => {
Event::Start(Tag::Image(link_type, fix(dest, path), title))
}
Event::Start(Tag::Link {
link_type,
dest_url,
title,
id,
}) => Event::Start(Tag::Link {
link_type,
dest_url: fix(dest_url, path),
title,
id,
}),
Event::Start(Tag::Image {
link_type,
dest_url,
title,
id,
}) => Event::Start(Tag::Image {
link_type,
dest_url: fix(dest_url, path),
title,
id,
}),
Event::Html(html) => Event::Html(fix_html(html, path)),
Event::InlineHtml(html) => Event::InlineHtml(fix_html(html, path)),
_ => event,
}
}
@@ -177,7 +194,7 @@ pub fn render_markdown(text: &str, curly_quotes: bool) -> String {
render_markdown_with_path(text, curly_quotes, None)
}

pub fn new_cmark_parser(text: &str, curly_quotes: bool) -> Parser<'_, '_> {
pub fn new_cmark_parser(text: &str, curly_quotes: bool) -> Parser<'_> {
let mut opts = Options::empty();
opts.insert(Options::ENABLE_TABLES);
opts.insert(Options::ENABLE_FOOTNOTES);
@@ -212,7 +229,7 @@ fn wrap_tables(event: Event<'_>) -> (Option<Event<'_>>, Option<Event<'_>>) {
Some(Event::Html(r#"<div class="table-wrapper">"#.into())),
Some(event),
),
Event::End(Tag::Table(_)) => (Some(event), Some(Event::Html(r#"</div>"#.into()))),
Event::End(TagEnd::Table) => (Some(event), Some(Event::Html(r#"</div>"#.into()))),
_ => (Some(event), None),
}
}
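
Because `Tag::Link` and `Tag::Image` are struct variants in 0.10, rewriting a destination means rebuilding the variant with the other named fields carried across, which is all `adjust_links` above is doing; the same shape-only change gives `wrap_tables` its `TagEnd::Table` arm. A stripped-down sketch of the rebuild (the single-argument `fix` closure stands in for mdBook's real `fix(dest_url, path)` helper and is assumed, not shown here):

```rust
use pulldown_cmark::{CowStr, Event, Tag};

// Rebuild link/image start tags with a rewritten destination, passing the
// remaining named fields through unchanged; every other event is untouched.
fn adjust_dest<'a>(event: Event<'a>, fix: impl Fn(CowStr<'a>) -> CowStr<'a>) -> Event<'a> {
    match event {
        Event::Start(Tag::Link { link_type, dest_url, title, id }) => Event::Start(Tag::Link {
            link_type,
            dest_url: fix(dest_url),
            title,
            id,
        }),
        Event::Start(Tag::Image { link_type, dest_url, title, id }) => Event::Start(Tag::Image {
            link_type,
            dest_url: fix(dest_url),
            title,
            id,
        }),
        other => other,
    }
}
```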
4 changes: 4 additions & 0 deletions tests/dummy_book/src/conclusion.md
@@ -18,3 +18,7 @@ css looks, like this {
}
*/
</style>

Sneaky inline event <script>alert("inline");</script>.

But regular <b>inline</b> is indexed.