From 861d899d601362f6be3b168e4a2df921837685cd Mon Sep 17 00:00:00 2001 From: j-mendez Date: Mon, 30 Dec 2024 04:18:30 -0500 Subject: [PATCH] chore(crate): remove htm5ever crate --- Cargo.lock | 37 +- spider/Cargo.toml | 7 +- spider/src/lib.rs | 2 - spider/src/packages/mod.rs | 1 - .../packages/scraper/element_ref/element.rs | 201 ---------- .../src/packages/scraper/element_ref/mod.rs | 201 ---------- .../scraper/element_ref/serializable.rs | 15 - spider/src/packages/scraper/error.rs | 70 ---- spider/src/packages/scraper/html/mod.rs | 233 ----------- .../src/packages/scraper/html/serializable.rs | 27 -- spider/src/packages/scraper/html/tree_sink.rs | 229 ----------- spider/src/packages/scraper/mod.rs | 139 ------- spider/src/packages/scraper/node.rs | 361 ------------------ .../src/packages/scraper/node/serializable.rs | 53 --- spider/src/packages/scraper/selector.rs | 214 ----------- spider/src/packages/scraper/test.rs | 22 -- spider_chrome/Cargo.toml | 2 +- spider_cli/Cargo.toml | 2 +- spider_transformations/Cargo.toml | 3 +- .../src/transformation/content.rs | 2 +- spider_utils/Cargo.toml | 3 +- spider_utils/src/lib.rs | 3 +- spider_worker/Cargo.toml | 2 +- 23 files changed, 36 insertions(+), 1793 deletions(-) delete mode 100644 spider/src/packages/scraper/element_ref/element.rs delete mode 100644 spider/src/packages/scraper/element_ref/mod.rs delete mode 100644 spider/src/packages/scraper/element_ref/serializable.rs delete mode 100644 spider/src/packages/scraper/error.rs delete mode 100644 spider/src/packages/scraper/html/mod.rs delete mode 100644 spider/src/packages/scraper/html/serializable.rs delete mode 100644 spider/src/packages/scraper/html/tree_sink.rs delete mode 100644 spider/src/packages/scraper/mod.rs delete mode 100644 spider/src/packages/scraper/node.rs delete mode 100644 spider/src/packages/scraper/node/serializable.rs delete mode 100644 spider/src/packages/scraper/selector.rs delete mode 100644 spider/src/packages/scraper/test.rs diff --git a/Cargo.lock b/Cargo.lock index 4e686ef59..5ffeb8b25 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5364,7 +5364,7 @@ dependencies = [ [[package]] name = "spider" -version = "2.22.9" +version = "2.22.10" dependencies = [ "ahash", "aho-corasick", @@ -5378,9 +5378,6 @@ dependencies = [ "chrono", "const_format", "cron", - "cssparser 0.31.2", - "ego-tree", - "fast_html5ever", "fastrand 2.3.0", "flexbuffers", "hashbrown 0.15.2", @@ -5403,7 +5400,6 @@ dependencies = [ "regex", "reqwest", "reqwest-middleware", - "selectors 0.25.0", "serde", "serde_json", "serde_regex", @@ -5416,7 +5412,6 @@ dependencies = [ "string_concat", "strum", "sysinfo", - "tendril", "tiktoken-rs", "tikv-jemallocator", "tokio", @@ -5428,7 +5423,7 @@ dependencies = [ [[package]] name = "spider_chrome" -version = "2.22.9" +version = "2.22.10" dependencies = [ "adblock", "aho-corasick", @@ -5518,7 +5513,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "2.22.9" +version = "2.22.10" dependencies = [ "clap", "env_logger", @@ -5541,9 +5536,27 @@ dependencies = [ "spider_utils", ] +[[package]] +name = "spider_scraper" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb1b2e09bb57a18a9a1e5762dff4b0248c0fff76a24c9002d03c08de4e3aa257" +dependencies = [ + "ahash", + "auto_encoder", + "cssparser 0.31.2", + "ego-tree", + "fast_html5ever", + "hashbrown 0.15.2", + "lazy_static", + "selectors 0.25.0", + "smallvec", + "tendril", +] + [[package]] name = "spider_transformations" -version = "2.22.9" +version = "2.22.10" dependencies = [ "aho-corasick", "fast_html2md", @@ -5558,6 +5571,7 @@ dependencies = [ "regex", "serde", "spider", + "spider_scraper", "tendril", "thiserror 1.0.69", "unicode-width 0.2.0", @@ -5565,11 +5579,12 @@ dependencies = [ [[package]] name = "spider_utils" -version = "2.22.9" +version = "2.22.10" dependencies = [ "indexmap 1.9.3", "serde", "spider", + "spider_scraper", "spider_transformations", "sxd-document", "sxd-xpath", @@ -5577,7 +5592,7 @@ dependencies = [ [[package]] name = "spider_worker" -version = "2.22.9" +version = "2.22.10" dependencies = [ "env_logger", "lazy_static", diff --git a/spider/Cargo.toml b/spider/Cargo.toml index e2c3d555d..920277c0c 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "2.22.9" +version = "2.22.10" authors = [ "j-mendez " ] @@ -26,13 +26,8 @@ regex = { version = "1" } ua_generator = { version = "^0.5", optional = true } string_concat = "0.0.1" lazy_static = "1" -fast_html5ever = "0.26.6" -selectors = "0.25.0" -tendril = "0.4.3" ahash = { version = "0.8", default-features = false, features = ["std"] } -cssparser = "0.31.2" smallvec = "1" -ego-tree = "0.10" num_cpus = "1" bytes = { version = "1", features = ["serde"] } serde = { version = "1", optional = true, features = ["derive"] } diff --git a/spider/src/lib.rs b/spider/src/lib.rs index 4608edae9..e737a8be6 100644 --- a/spider/src/lib.rs +++ b/spider/src/lib.rs @@ -158,8 +158,6 @@ pub extern crate string_concat; pub extern crate strum; #[macro_use] pub extern crate lazy_static; -#[macro_use] -pub extern crate fast_html5ever; /// Configuration structure for `Website`. pub mod configuration; diff --git a/spider/src/packages/mod.rs b/spider/src/packages/mod.rs index caa6bb3a6..597fbdf72 100644 --- a/spider/src/packages/mod.rs +++ b/spider/src/packages/mod.rs @@ -1,3 +1,2 @@ /// robot parser pub mod robotparser; -pub mod scraper; diff --git a/spider/src/packages/scraper/element_ref/element.rs b/spider/src/packages/scraper/element_ref/element.rs deleted file mode 100644 index 8897feaef..000000000 --- a/spider/src/packages/scraper/element_ref/element.rs +++ /dev/null @@ -1,201 +0,0 @@ -use fast_html5ever::Namespace; -use selectors::attr::{AttrSelectorOperation, CaseSensitivity, NamespaceConstraint}; -use selectors::matching; -use selectors::{Element, OpaqueElement}; - -use super::super::selector::{CssLocalName, CssString, NonTSPseudoClass, PseudoElement, Simple}; -use super::ElementRef; - -/// Note: will never match against non-tree-structure pseudo-classes. -impl<'a> Element for ElementRef<'a> { - type Impl = Simple; - - fn opaque(&self) -> OpaqueElement { - OpaqueElement::new(self.node.value()) - } - - fn parent_element(&self) -> Option { - self.parent().and_then(ElementRef::wrap) - } - - fn parent_node_is_shadow_root(&self) -> bool { - false - } - - fn containing_shadow_host(&self) -> Option { - None - } - - fn first_element_child(&self) -> Option { - self.prev_siblings().nth(0).and_then(ElementRef::wrap) - } - - fn apply_selector_flags(&self, _: selectors::matching::ElementSelectorFlags) { - // Apply selector flags when enabled - } - - fn is_pseudo_element(&self) -> bool { - false - } - - fn is_part(&self, _name: &CssLocalName) -> bool { - false - } - - fn is_same_type(&self, other: &Self) -> bool { - self.value().name == other.value().name - } - - fn imported_part(&self, _: &CssLocalName) -> Option { - None - } - - fn prev_sibling_element(&self) -> Option { - self.prev_siblings() - .find(|sibling| sibling.value().is_element()) - .map(ElementRef::new) - } - - fn next_sibling_element(&self) -> Option { - self.next_siblings() - .find(|sibling| sibling.value().is_element()) - .map(ElementRef::new) - } - - fn is_html_element_in_html_document(&self) -> bool { - // FIXME: Is there more to this? - self.value().name.ns == ns!(html) - } - - fn has_local_name(&self, name: &CssLocalName) -> bool { - self.value().name.local == name.0 - } - - fn has_namespace(&self, namespace: &Namespace) -> bool { - &self.value().name.ns == namespace - } - - fn attr_matches( - &self, - ns: &NamespaceConstraint<&Namespace>, - local_name: &CssLocalName, - operation: &AttrSelectorOperation<&CssString>, - ) -> bool { - self.value().attrs.iter().any(|(key, value)| { - !matches!(*ns, NamespaceConstraint::Specific(url) if *url != key.ns) - && local_name.0 == key.local - && operation.eval_str(value) - }) - } - - fn match_non_ts_pseudo_class( - &self, - _pc: &NonTSPseudoClass, - _context: &mut matching::MatchingContext, - ) -> bool { - false - } - - fn match_pseudo_element( - &self, - _pe: &PseudoElement, - _context: &mut matching::MatchingContext, - ) -> bool { - false - } - - fn is_link(&self) -> bool { - self.value().name() == "link" - } - - fn is_html_slot_element(&self) -> bool { - true - } - - fn has_id(&self, id: &CssLocalName, case_sensitivity: CaseSensitivity) -> bool { - match self.value().id { - Some(ref val) => case_sensitivity.eq(id.0.as_bytes(), val.as_bytes()), - None => false, - } - } - - fn has_class(&self, name: &CssLocalName, case_sensitivity: CaseSensitivity) -> bool { - self.value().has_class(&name.0, case_sensitivity) - } - - fn is_empty(&self) -> bool { - !self - .children() - .any(|child| child.value().is_element() || child.value().is_text()) - } - - fn is_root(&self) -> bool { - self.parent() - .map_or(false, |parent| parent.value().is_document()) - } -} - -#[cfg(test)] -mod tests { - use crate::packages::scraper::html::Html; - use crate::packages::scraper::selector::{CssLocalName, Selector}; - use selectors::attr::CaseSensitivity; - use selectors::Element; - - #[test] - fn test_has_id() { - let html = ""; - let fragment = Html::parse_fragment(html); - let sel = Selector::parse("p").unwrap(); - - let element = fragment.select(&sel).next().unwrap(); - assert!(element.has_id( - &CssLocalName::from("link_id_456"), - CaseSensitivity::CaseSensitive - )); - - let html = "

hey there

"; - let fragment = Html::parse_fragment(html); - let element = fragment.select(&sel).next().unwrap(); - assert!(!element.has_id( - &CssLocalName::from("any_link_id"), - CaseSensitivity::CaseSensitive - )); - } - - #[test] - fn test_is_link() { - let html = ""; - let fragment = Html::parse_fragment(html); - let sel = Selector::parse("link").unwrap(); - let element = fragment.select(&sel).next().unwrap(); - assert!(element.is_link()); - - let html = "

hey there

"; - let fragment = Html::parse_fragment(html); - let sel = Selector::parse("p").unwrap(); - let element = fragment.select(&sel).next().unwrap(); - assert!(!element.is_link()); - } - - #[test] - fn test_has_class() { - let html = "

hey there

"; - let fragment = Html::parse_fragment(html); - let sel = Selector::parse("p").unwrap(); - let element = fragment.select(&sel).next().unwrap(); - assert!(element.has_class( - &CssLocalName::from("my_class"), - CaseSensitivity::CaseSensitive - )); - - let html = "

hey there

"; - let fragment = Html::parse_fragment(html); - let sel = Selector::parse("p").unwrap(); - let element = fragment.select(&sel).next().unwrap(); - assert!(!element.has_class( - &CssLocalName::from("my_class"), - CaseSensitivity::CaseSensitive - )); - } -} diff --git a/spider/src/packages/scraper/element_ref/mod.rs b/spider/src/packages/scraper/element_ref/mod.rs deleted file mode 100644 index f5073ccd8..000000000 --- a/spider/src/packages/scraper/element_ref/mod.rs +++ /dev/null @@ -1,201 +0,0 @@ -//! Element references. - -use std::ops::Deref; - -use ego_tree::iter::{Edge, Traverse}; -use ego_tree::NodeRef; -use fast_html5ever::serialize::{serialize, SerializeOpts, TraversalScope}; - -use crate::packages::scraper::node::Element; -use crate::packages::scraper::node::Node; -use crate::packages::scraper::selector::Selector; - -/// Wrapper around a reference to an element node. -/// -/// This wrapper implements the `Element` trait from the `selectors` crate, which allows it to be -/// matched against CSS selectors. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub struct ElementRef<'a> { - node: NodeRef<'a, Node>, - /// The language of the element. Not used atm. - pub lang: &'a str, -} - -impl<'a> ElementRef<'a> { - fn new(node: NodeRef<'a, Node>) -> Self { - ElementRef { node, lang: "" } - } - - /// Wraps a `NodeRef` only if it references a `Node::Element`. - pub fn wrap(node: NodeRef<'a, Node>) -> Option { - if node.value().is_element() { - Some(ElementRef::new(node)) - } else { - None - } - } - - /// Returns the `Element` referenced by `self`. - pub fn value(&self) -> &'a Element { - self.node.value().as_element().unwrap() - } - - /// Returns an iterator over descendent elements matching a selector. - pub fn select<'b>(&self, selector: &'b Selector) -> Select<'a, 'b> { - let mut inner = self.traverse(); - inner.next(); // Skip Edge::Open(self). - - Select { - scope: *self, - inner, - selector, - } - } - - fn serialize(&self, traversal_scope: TraversalScope) -> String { - let opts = SerializeOpts { - scripting_enabled: false, // It's not clear what this does. - traversal_scope, - create_missing_parent: false, - }; - let mut buf = Vec::new(); - let _ = serialize(&mut buf, self, opts); - // we need to get the initial encoding of the html lang if used. - auto_encoder::auto_encode_bytes(&buf) - } - - /// Returns the HTML of this element. - pub fn html(&self) -> String { - self.serialize(TraversalScope::IncludeNode) - } - - /// Returns the inner HTML of this element. - pub fn inner_html(&self) -> String { - self.serialize(TraversalScope::ChildrenOnly(None)) - } - - /// Returns the value of an attribute. - pub fn attr(&self, attr: &str) -> Option<&str> { - self.value().attr(attr) - } - - /// Returns an iterator over descendent text nodes. - pub fn text(&self) -> Text<'a> { - Text { - inner: self.traverse(), - } - } -} - -impl<'a> Deref for ElementRef<'a> { - type Target = NodeRef<'a, Node>; - fn deref(&self) -> &NodeRef<'a, Node> { - &self.node - } -} - -/// Iterator over descendent elements matching a selector. -#[derive(Debug, Clone)] -pub struct Select<'a, 'b> { - scope: ElementRef<'a>, - inner: Traverse<'a, Node>, - selector: &'b Selector, -} - -impl<'a, 'b> Iterator for Select<'a, 'b> { - type Item = ElementRef<'a>; - - fn next(&mut self) -> Option> { - for edge in &mut self.inner { - if let Edge::Open(node) = edge { - if let Some(element) = ElementRef::wrap(node) { - if self.selector.matches_with_scope(&element, Some(self.scope)) { - return Some(element); - } - } - } - } - None - } -} - -/// Iterator over descendent text nodes. -#[derive(Debug, Clone)] -pub struct Text<'a> { - inner: Traverse<'a, Node>, -} - -impl<'a> Iterator for Text<'a> { - type Item = &'a str; - - fn next(&mut self) -> Option<&'a str> { - for edge in &mut self.inner { - if let Edge::Open(ref node) = edge { - // check if the element is not a script or link. - let processable = match node.parent() { - Some(e) => { - match e.value().as_element() { - Some(n) => { - let name = n.name(); - // prevent all script and style elements - !(name == "script" || name == "style") - } - _ => true, - } - } - _ => true, - }; - - if !processable { - continue; - } - - if let Node::Text(text) = node.value() { - return Some(&**text); - } - } - } - - None - } -} - -mod element; -mod serializable; - -#[cfg(test)] -mod tests { - use crate::packages::scraper::html::Html; - use crate::packages::scraper::selector::Selector; - - #[test] - fn test_scope() { - let html = r" -
- 1 - - 2 - 3 - -
- "; - let fragment = Html::parse_fragment(html); - let sel1 = Selector::parse("div > span").unwrap(); - let sel2 = Selector::parse(":scope > b").unwrap(); - - let element1 = fragment.select(&sel1).next().unwrap(); - let element2 = element1.select(&sel2).next().unwrap(); - assert_eq!(element2.inner_html(), "3"); - } - - #[test] - fn test_text() { - let fragment = Html::parse_fragment("

Hello, world!

"); - let selector = Selector::parse("h1").unwrap(); - - let h1 = fragment.select(&selector).next().unwrap(); - let text = h1.text().collect::>(); - - assert_eq!(vec!["Hello, ", "world!"], text); - } -} diff --git a/spider/src/packages/scraper/element_ref/serializable.rs b/spider/src/packages/scraper/element_ref/serializable.rs deleted file mode 100644 index fc5d37df2..000000000 --- a/spider/src/packages/scraper/element_ref/serializable.rs +++ /dev/null @@ -1,15 +0,0 @@ -use std::io::Error; - -use fast_html5ever::serialize::{Serialize, Serializer, TraversalScope}; - -use super::ElementRef; - -impl<'a> Serialize for ElementRef<'a> { - fn serialize( - &self, - serializer: &mut S, - traversal_scope: TraversalScope, - ) -> Result<(), Error> { - super::super::node::serializable::serialize(**self, serializer, traversal_scope) - } -} diff --git a/spider/src/packages/scraper/error.rs b/spider/src/packages/scraper/error.rs deleted file mode 100644 index 2bdbd9f01..000000000 --- a/spider/src/packages/scraper/error.rs +++ /dev/null @@ -1,70 +0,0 @@ -//! Custom error types for diagnostics -//! Includes re-exported error types from dependencies - -use cssparser::{BasicParseErrorKind, ParseErrorKind, Token}; -use selectors::parser::SelectorParseErrorKind; - -/// Error type that is returned when calling `Selector::parse` -#[derive(Debug, Clone)] -pub enum SelectorErrorKind<'a> { - /// A `Token` was not expected - UnexpectedToken(Token<'a>), - - /// End-Of-Line was unexpected - EndOfLine, - - /// `@` rule is invalid - InvalidAtRule(String), - - /// The body of an `@` rule is invalid - InvalidAtRuleBody, - - /// The qualified rule is invalid - QualRuleInvalid, - - /// Expected a `::` for a pseudoelement - ExpectedColonOnPseudoElement(Token<'a>), - - /// Expected an identity for a pseudoelement - ExpectedIdentityOnPseudoElement(Token<'a>), - - /// A `SelectorParseErrorKind` error that isn't really supposed to happen did - UnexpectedSelectorParseError(SelectorParseErrorKind<'a>), -} - -impl<'a> From>> for SelectorErrorKind<'a> { - fn from(original: cssparser::ParseError<'a, SelectorParseErrorKind<'a>>) -> Self { - // NOTE: This could be improved, but I dont - // exactly know how - match original.kind { - ParseErrorKind::Basic(err) => SelectorErrorKind::from(err), - ParseErrorKind::Custom(err) => SelectorErrorKind::from(err), - } - } -} - -impl<'a> From> for SelectorErrorKind<'a> { - fn from(err: BasicParseErrorKind<'a>) -> Self { - match err { - BasicParseErrorKind::UnexpectedToken(token) => Self::UnexpectedToken(token), - BasicParseErrorKind::EndOfInput => Self::EndOfLine, - BasicParseErrorKind::AtRuleInvalid(rule) => Self::InvalidAtRule(rule.to_string()), - BasicParseErrorKind::AtRuleBodyInvalid => Self::InvalidAtRuleBody, - BasicParseErrorKind::QualifiedRuleInvalid => Self::QualRuleInvalid, - } - } -} - -impl<'a> From> for SelectorErrorKind<'a> { - fn from(err: SelectorParseErrorKind<'a>) -> Self { - match err { - SelectorParseErrorKind::PseudoElementExpectedColon(token) => { - Self::ExpectedColonOnPseudoElement(token) - } - SelectorParseErrorKind::PseudoElementExpectedIdent(token) => { - Self::ExpectedIdentityOnPseudoElement(token) - } - other => Self::UnexpectedSelectorParseError(other), - } - } -} diff --git a/spider/src/packages/scraper/html/mod.rs b/spider/src/packages/scraper/html/mod.rs deleted file mode 100644 index 67d60e072..000000000 --- a/spider/src/packages/scraper/html/mod.rs +++ /dev/null @@ -1,233 +0,0 @@ -//! HTML documents and fragments. - -use ego_tree::iter::Nodes; -use ego_tree::{NodeId, Tree}; -use fast_html5ever::serialize::SerializeOpts; -use fast_html5ever::tree_builder::QuirksMode; -use fast_html5ever::QualName; -use fast_html5ever::{driver, serialize}; -use tendril::TendrilSink; - -use crate::packages::scraper::element_ref::ElementRef; -use crate::packages::scraper::node::Node; -use crate::packages::scraper::selector::Selector; - -lazy_static! { - static ref HTML_SELECTOR: Selector = Selector::parse("html").unwrap(); -} - -/// An HTML tree. -/// -/// Parsing does not fail hard. Instead, the `quirks_mode` is set and errors are added to the -/// `errors` field. The `tree` will still be populated as best as possible. -/// -/// Implements the `TreeSink` trait from the `fast_html5ever` crate, which allows HTML to be parsed. -#[derive(Debug, Clone)] -pub struct Html { - /// The quirks mode. - pub quirks_mode: QuirksMode, - /// The node tree. - pub tree: Tree, - /// The html language of the document. - pub lang: String, -} - -impl Html { - /// Creates an empty HTML document. - pub fn new_document() -> Self { - Html { - quirks_mode: QuirksMode::NoQuirks, - tree: Tree::new(Node::Document), - lang: Default::default(), - } - } - - /// Creates an empty HTML fragment. - pub fn new_fragment() -> Self { - Html { - quirks_mode: QuirksMode::NoQuirks, - tree: Tree::new(Node::Fragment), - lang: Default::default(), - } - } - - /// Parses a string of HTML as a document. - /// - /// This is a convenience method for the following: - /// - /// ``` - /// # extern crate fast_html5ever; - /// # extern crate tendril; - /// # fn main() { - /// # let document = ""; - /// use fast_html5ever::driver::{self, ParseOpts}; - /// use spider::packages::scraper::Html; - /// use tendril::TendrilSink; - /// - /// let parser = driver::parse_document(Html::new_document(), ParseOpts::default()); - /// let html = parser.one(document); - /// # } - /// ``` - pub fn parse_document(document: &str) -> Self { - let parser = driver::parse_document(Self::new_document(), Default::default()); - parser.one(document) - } - - /// Parses a string of HTML as a fragment. - pub fn parse_fragment(fragment: &str) -> Self { - let parser = driver::parse_fragment( - Self::new_fragment(), - Default::default(), - QualName::new(None, ns!(html), local_name!("body")), - Vec::new(), - ); - parser.one(fragment) - } - - /// Returns an iterator over elements matching a selector. - pub fn select<'a, 'b>(&'a self, selector: &'b Selector) -> Select<'a, 'b> { - Select { - inner: self.tree.nodes(), - selector, - } - } - - /// Returns the root `` element. - pub fn root_element(&self) -> ElementRef { - let root_node = self - .tree - .root() - .children() - .find(|child| child.value().is_element()) - .expect("html node missing"); - ElementRef::wrap(root_node).unwrap() - } - - /// Set the html language of the document by getting the lang attr - pub fn set_language(&mut self, lang: String) { - self.lang = lang; - } - - /// Get the language for the page. - pub fn get_lang(&self) -> &str { - if self.lang.is_empty() { - if let Some(element) = self.select(&HTML_SELECTOR).next() { - if let Some(lang) = element.value().attr("lang") { - return lang; - } - } - &self.lang - } else { - &self.lang - } - } - - /// Serialize entire document into HTML. - pub fn html(&self) -> String { - let opts = SerializeOpts { - scripting_enabled: false, // It's not clear what this does. - traversal_scope: fast_html5ever::serialize::TraversalScope::IncludeNode, - create_missing_parent: false, - }; - let mut buf = Vec::new(); - let _ = serialize(&mut buf, self, opts); - auto_encoder::auto_encode_bytes(&buf) - } - - /// Find and remove a node - pub fn remove_node(&mut self, node_id: NodeId) { - if let Some(mut node) = self.tree.get_mut(node_id) { - node.detach(); - } - } -} - -/// Iterator over elements matching a selector. -#[derive(Debug)] -pub struct Select<'a, 'b> { - inner: Nodes<'a, Node>, - selector: &'b Selector, -} - -impl<'a, 'b> Iterator for Select<'a, 'b> { - type Item = ElementRef<'a>; - - fn next(&mut self) -> Option> { - for node in self.inner.by_ref() { - if let Some(element) = ElementRef::wrap(node) { - if element.parent().is_some() && self.selector.matches(&element) { - return Some(element); - } - } - } - None - } -} - -impl<'a, 'b> DoubleEndedIterator for Select<'a, 'b> { - fn next_back(&mut self) -> Option { - for node in self.inner.by_ref().rev() { - if let Some(element) = ElementRef::wrap(node) { - if element.parent().is_some() && self.selector.matches(&element) { - return Some(element); - } - } - } - None - } -} - -mod serializable; -mod tree_sink; - -#[cfg(test)] -mod tests { - use super::Html; - use super::Selector; - - #[test] - fn root_element_fragment() { - let html = Html::parse_fragment(r#"1"#); - let root_ref = html.root_element(); - let href = root_ref - .select(&Selector::parse("a").unwrap()) - .next() - .unwrap(); - assert_eq!(href.inner_html(), "1"); - assert_eq!(href.value().attr("href").unwrap(), "http://github.com"); - } - - #[test] - fn root_element_document_doctype() { - let html = Html::parse_document("\nabc"); - let root_ref = html.root_element(); - let title = root_ref - .select(&Selector::parse("title").unwrap()) - .next() - .unwrap(); - assert_eq!(title.inner_html(), "abc"); - } - - #[test] - fn root_element_document_comment() { - let html = Html::parse_document("abc"); - let root_ref = html.root_element(); - let title = root_ref - .select(&Selector::parse("title").unwrap()) - .next() - .unwrap(); - assert_eq!(title.inner_html(), "abc"); - } - - #[test] - fn select_is_reversible() { - let html = Html::parse_document("

element1

element2

element3

"); - let selector = Selector::parse("p").unwrap(); - let result: Vec<_> = html - .select(&selector) - .rev() - .map(|e| e.inner_html()) - .collect(); - assert_eq!(result, vec!["element3", "element2", "element1"]); - } -} diff --git a/spider/src/packages/scraper/html/serializable.rs b/spider/src/packages/scraper/html/serializable.rs deleted file mode 100644 index 2cd60f7b8..000000000 --- a/spider/src/packages/scraper/html/serializable.rs +++ /dev/null @@ -1,27 +0,0 @@ -use std::io::Error; - -use fast_html5ever::serialize::{Serialize, Serializer, TraversalScope}; - -use super::Html; - -impl Serialize for Html { - fn serialize( - &self, - serializer: &mut S, - traversal_scope: TraversalScope, - ) -> Result<(), Error> { - super::super::node::serializable::serialize(self.tree.root(), serializer, traversal_scope) - } -} - -#[cfg(test)] -mod tests { - use super::Html; - - #[test] - fn test_serialize() { - let src = r#"

Hello world!

"#; - let html = Html::parse_document(src); - assert_eq!(html.html(), src); - } -} diff --git a/spider/src/packages/scraper/html/tree_sink.rs b/spider/src/packages/scraper/html/tree_sink.rs deleted file mode 100644 index b3d7abfdc..000000000 --- a/spider/src/packages/scraper/html/tree_sink.rs +++ /dev/null @@ -1,229 +0,0 @@ -use super::Html; -use crate::packages::scraper::node::{Doctype, Element, Node, ProcessingInstruction, Text}; -use ego_tree::NodeId; -use fast_html5ever::tendril::StrTendril; -use fast_html5ever::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink}; -use fast_html5ever::Attribute; -use fast_html5ever::{ExpandedName, QualName}; -use std::borrow::Cow; - -/// Note: does not support the `