From 47e090f03d74c2c6c6cae6c32695456b8c1e5cb4 Mon Sep 17 00:00:00 2001 From: Kornel Date: Wed, 4 Dec 2024 15:12:54 +0000 Subject: [PATCH 1/2] Use Box in AST --- src/selectors_vm/ast.rs | 16 ++++++++-------- src/selectors_vm/parser.rs | 11 ++++------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/src/selectors_vm/ast.rs b/src/selectors_vm/ast.rs index 45ba94da..a18f17c4 100644 --- a/src/selectors_vm/ast.rs +++ b/src/selectors_vm/ast.rs @@ -49,15 +49,15 @@ impl NthChild { pub(crate) enum OnTagNameExpr { ExplicitAny, Unmatchable, - LocalName(String), + LocalName(Box), NthChild(NthChild), NthOfType(NthChild), } #[derive(Eq, PartialEq)] pub(crate) struct AttributeComparisonExpr { - pub name: String, - pub value: String, + pub name: Box, + pub value: Box, pub case_sensitivity: ParsedCaseSensitivity, pub operator: AttrSelectorOperator, } @@ -66,8 +66,8 @@ impl AttributeComparisonExpr { #[inline] #[must_use] pub const fn new( - name: String, - value: String, + name: Box, + value: Box, case_sensitivity: ParsedCaseSensitivity, operator: AttrSelectorOperator, ) -> Self { @@ -105,9 +105,9 @@ impl Debug for AttributeComparisonExpr { /// An attribute check when attributes are received and parsed. #[derive(PartialEq, Eq, Debug)] pub(crate) enum OnAttributesExpr { - Id(String), - Class(String), - AttributeExists(String), + Id(Box), + Class(Box), + AttributeExists(Box), AttributeComparisonExpr(AttributeComparisonExpr), } diff --git a/src/selectors_vm/parser.rs b/src/selectors_vm/parser.rs index f0723f19..3b2421df 100644 --- a/src/selectors_vm/parser.rs +++ b/src/selectors_vm/parser.rs @@ -12,20 +12,17 @@ use std::str::FromStr; pub(crate) struct SelectorImplDescriptor; #[derive(Clone, Default, Eq, PartialEq)] -pub struct CssString(pub String); +pub struct CssString(pub Box); impl<'a> From<&'a str> for CssString { fn from(value: &'a str) -> Self { - Self(value.to_string()) + Self(value.into()) } } impl ToCss for CssString { - fn to_css(&self, dest: &mut W) -> fmt::Result - where - W: fmt::Write, - { - write!(dest, "{}", self.0) + fn to_css(&self, dest: &mut W) -> fmt::Result { + dest.write_str(&self.0) } } From 5e1d842efafc05527354986351dfed462e6db43c Mon Sep 17 00:00:00 2001 From: Kornel Date: Wed, 4 Dec 2024 15:33:18 +0000 Subject: [PATCH 2/2] Upgrade/remove dependencies --- Cargo.toml | 14 +++--- benches/bench.rs | 48 +++++++++---------- c-api/Cargo.toml | 2 +- src/base/bytes.rs | 6 +++ src/rewritable_units/tokens/attributes.rs | 22 +++------ src/selectors_vm/attribute_matcher.rs | 20 ++++---- tests/harness/mod.rs | 7 --- .../feedback_tests/expected_tokens.rs | 30 +++++++----- 8 files changed, 67 insertions(+), 82 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index e1fb64d5..78d23242 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,23 +34,21 @@ name = "bench" [dependencies] bitflags = "2.0.0" cfg-if = "1.0.0" -cssparser = "0.28.1" +cssparser = "0.29" encoding_rs = "0.8.13" -lazycell = "1.3.0" -lazy_static = "1.3.0" memchr = "2.1.2" -selectors = "0.23.0" -thiserror = "1.0.2" hashbrown = "0.15.0" mime = "0.3.16" +selectors = "0.24" +thiserror = "2.0" [dev-dependencies] criterion = "0.5.1" # Needed for criterion <= v0.5.1. See https://github.com/bheisler/criterion.rs/pull/703. clap = { version = "4.5.21", features = ["help"] } glob = "0.3.0" -html5ever = "0.26.0" -markup5ever_rcdom = "0.2.0" +html5ever = "0.29" +markup5ever_rcdom = "0.5.0-unofficial" hashbrown = { version = "0.15.0", features = ["serde"] } serde = "1.0.126" serde_derive = "1.0.19" @@ -58,7 +56,7 @@ serde_json = "1.0.65" static_assertions = "1.1.0" rand = "0.8.5" rustc-test = "0.3.1" -itertools = "0.10.1" +itertools = "0.13" [lints.rust] keyword_idents = { level = "deny", priority = 1 } diff --git a/benches/bench.rs b/benches/bench.rs index 6e2479b4..de288049 100644 --- a/benches/bench.rs +++ b/benches/bench.rs @@ -1,9 +1,9 @@ use criterion::{criterion_group, criterion_main}; use glob::glob; -use lazy_static::lazy_static; use std::fmt::{self, Debug}; use std::fs::File; use std::io::Read; +use std::sync::LazyLock; const CHUNK_SIZE: usize = 1024; @@ -19,30 +19,28 @@ impl Debug for Input { } } -lazy_static! { - static ref INPUTS: Vec = { - glob("benches/data/*.html") - .unwrap() - .map(|path| { - let mut data = String::new(); - let path = path.unwrap(); - - File::open(&path) - .unwrap() - .read_to_string(&mut data) - .unwrap(); - - let data = data.into_bytes(); - - Input { - name: path.file_name().unwrap().to_string_lossy().to_string(), - length: data.len(), - chunks: data.chunks(CHUNK_SIZE).map(|c| c.to_owned()).collect(), - } - }) - .collect() - }; -} +static INPUTS: LazyLock> = LazyLock::new(|| { + glob("benches/data/*.html") + .unwrap() + .map(|path| { + let mut data = String::new(); + let path = path.unwrap(); + + File::open(&path) + .unwrap() + .read_to_string(&mut data) + .unwrap(); + + let data = data.into_bytes(); + + Input { + name: path.file_name().unwrap().to_string_lossy().to_string(), + length: data.len(), + chunks: data.chunks(CHUNK_SIZE).map(|c| c.to_owned()).collect(), + } + }) + .collect() +}); macro_rules! create_runner { ($settings:expr) => { diff --git a/c-api/Cargo.toml b/c-api/Cargo.toml index 304bba3c..83ad6f75 100644 --- a/c-api/Cargo.toml +++ b/c-api/Cargo.toml @@ -10,7 +10,7 @@ publish = false encoding_rs = "0.8.13" lol_html = { path = "../" } libc = "0" -thiserror = "1" +thiserror = "2" [profile.release] panic = "abort" diff --git a/src/base/bytes.rs b/src/base/bytes.rs index b2591145..b46549a2 100644 --- a/src/base/bytes.rs +++ b/src/base/bytes.rs @@ -15,6 +15,12 @@ pub struct HasReplacementsError; #[allow(unnameable_types)] // accidentally exposed via `tag.set_name()` pub struct Bytes<'b>(Cow<'b, [u8]>); +impl Bytes<'static> { + pub const fn from_static(string: &'static str) -> Self { + Self(Cow::Borrowed(string.as_bytes())) + } +} + impl<'b> Bytes<'b> { #[inline] pub fn from_str(string: &'b str, encoding: &'static Encoding) -> Self { diff --git a/src/rewritable_units/tokens/attributes.rs b/src/rewritable_units/tokens/attributes.rs index afa09add..34b6dc96 100644 --- a/src/rewritable_units/tokens/attributes.rs +++ b/src/rewritable_units/tokens/attributes.rs @@ -4,7 +4,7 @@ use crate::html::escape_double_quotes_only; use crate::parser::AttributeBuffer; use crate::rewritable_units::Serialize; use encoding_rs::Encoding; -use lazycell::LazyCell; +use std::cell::OnceCell; use std::fmt::{self, Debug}; use std::ops::Deref; use thiserror::Error; @@ -154,7 +154,7 @@ impl Debug for Attribute<'_> { pub(crate) struct Attributes<'i> { input: &'i Bytes<'i>, attribute_buffer: &'i AttributeBuffer, - items: LazyCell>>, + items: OnceCell>>, encoding: &'static Encoding, } @@ -169,7 +169,7 @@ impl<'i> Attributes<'i> { Attributes { input, attribute_buffer, - items: LazyCell::default(), + items: OnceCell::default(), encoding, } } @@ -226,19 +226,9 @@ impl<'i> Attributes<'i> { #[inline] fn as_mut_vec(&mut self) -> &mut Vec> { - // NOTE: we can't use borrow_mut_with here as we'll need - // because `self` is a mutable reference and we'll have - // two mutable references by passing it to the initializer - // closure. - if !self.items.filled() { - self.items - .fill(self.init_items()) - .expect("Cell should be empty at this point"); - } + let _ = self.items.get_or_init(|| self.init_items()); - self.items - .borrow_mut() - .expect("Items should be initialized") + self.items.get_mut().expect("Items should be initialized") } #[cfg(test)] @@ -252,7 +242,7 @@ impl<'i> Deref for Attributes<'i> { #[inline] fn deref(&self) -> &[Attribute<'i>] { - self.items.borrow_with(|| self.init_items()) + self.items.get_or_init(|| self.init_items()) } } diff --git a/src/selectors_vm/attribute_matcher.rs b/src/selectors_vm/attribute_matcher.rs index 21b11944..b759c3e1 100644 --- a/src/selectors_vm/attribute_matcher.rs +++ b/src/selectors_vm/attribute_matcher.rs @@ -2,23 +2,19 @@ use super::compiler::AttrExprOperands; use crate::base::Bytes; use crate::html::Namespace; use crate::parser::{AttributeBuffer, AttributeOutline}; -use encoding_rs::UTF_8; -use lazy_static::lazy_static; -use lazycell::LazyCell; use memchr::{memchr, memchr2}; use selectors::attr::CaseSensitivity; +use std::cell::OnceCell; -lazy_static! { - static ref ID_ATTR: Bytes<'static> = Bytes::from_str("id", UTF_8); - static ref CLASS_ATTR: Bytes<'static> = Bytes::from_str("class", UTF_8); -} +static ID_ATTR: Bytes<'static> = Bytes::from_static("id"); +static CLASS_ATTR: Bytes<'static> = Bytes::from_static("class"); #[inline] const fn is_attr_whitespace(b: u8) -> bool { b == b' ' || b == b'\n' || b == b'\r' || b == b'\t' || b == b'\x0c' } -type MemoizedAttrValue<'i> = LazyCell>>; +type MemoizedAttrValue<'i> = OnceCell>>; pub(crate) struct AttributeMatcher<'i> { input: &'i Bytes<'i>, @@ -35,8 +31,8 @@ impl<'i> AttributeMatcher<'i> { AttributeMatcher { input, attributes, - id: LazyCell::default(), - class: LazyCell::default(), + id: OnceCell::new(), + class: OnceCell::new(), is_html_element: ns == Namespace::Html, } } @@ -78,7 +74,7 @@ impl<'i> AttributeMatcher<'i> { #[inline] #[must_use] pub fn has_id(&self, id: &Bytes<'_>) -> bool { - match self.id.borrow_with(|| self.get_value(&ID_ATTR)) { + match self.id.get_or_init(|| self.get_value(&ID_ATTR)) { Some(actual_id) => actual_id == id, None => false, } @@ -87,7 +83,7 @@ impl<'i> AttributeMatcher<'i> { #[inline] #[must_use] pub fn has_class(&self, class_name: &Bytes<'_>) -> bool { - match self.class.borrow_with(|| self.get_value(&CLASS_ATTR)) { + match self.class.get_or_init(|| self.get_value(&CLASS_ATTR)) { Some(class) => class .split(|&b| is_attr_whitespace(b)) .any(|actual_class_name| actual_class_name == &**class_name), diff --git a/tests/harness/mod.rs b/tests/harness/mod.rs index 0599d47d..01d9bb91 100644 --- a/tests/harness/mod.rs +++ b/tests/harness/mod.rs @@ -1,10 +1,3 @@ -use lazy_static::lazy_static; -use std::sync::Mutex; - -lazy_static! { - pub static ref TEST_CRITICAL_SECTION_MUTEX: Mutex<()> = Mutex::new(()); -} - macro_rules! ignore { (@info $($args:expr),+) => { if std::env::var("IGNORES_VERBOSE").is_ok() { diff --git a/tests/harness/suites/html5lib_tests/feedback_tests/expected_tokens.rs b/tests/harness/suites/html5lib_tests/feedback_tests/expected_tokens.rs index 616da6be..e71bef46 100644 --- a/tests/harness/suites/html5lib_tests/feedback_tests/expected_tokens.rs +++ b/tests/harness/suites/html5lib_tests/feedback_tests/expected_tokens.rs @@ -7,6 +7,7 @@ use html5ever::tokenizer::{ }; use html5ever::tree_builder::{TreeBuilder, TreeBuilderOpts}; use markup5ever_rcdom::RcDom; +use std::cell::RefCell; use std::iter::FromIterator; use std::string::ToString; @@ -14,15 +15,16 @@ use std::string::ToString; // recording them into the provided array pub struct TokenSinkProxy<'a, Sink> { pub inner: Sink, - pub tokens: &'a mut Vec, + pub tokens: RefCell<&'a mut Vec>, } impl TokenSinkProxy<'_, Sink> { - fn push_text_token(&mut self, s: &str) { - if let Some(&mut TestToken::Text(ref mut last)) = self.tokens.last_mut() { + fn push_text_token(&self, s: &str) { + let tokens = &mut *self.tokens.borrow_mut(); + if let Some(&mut TestToken::Text(ref mut last)) = tokens.last_mut() { *last += s; } else { - self.tokens.push(TestToken::Text(s.to_string())); + tokens.push(TestToken::Text(s.to_string())); } } } @@ -30,10 +32,10 @@ impl TokenSinkProxy<'_, Sink> { impl TokenSink for TokenSinkProxy<'_, Sink> { type Handle = Sink::Handle; - fn process_token(&mut self, token: Token, line_number: u64) -> TokenSinkResult { + fn process_token(&self, token: Token, line_number: u64) -> TokenSinkResult { match token { Token::DoctypeToken(ref doctype) => { - self.tokens.push(TestToken::Doctype { + self.tokens.borrow_mut().push(TestToken::Doctype { name: doctype.name.as_ref().map(ToString::to_string), public_id: doctype.public_id.as_ref().map(ToString::to_string), system_id: doctype.system_id.as_ref().map(ToString::to_string), @@ -43,7 +45,7 @@ impl TokenSink for TokenSinkProxy<'_, Sink> { Token::TagToken(ref tag) => { let name = tag.name.to_string(); - self.tokens.push(match tag.kind { + self.tokens.borrow_mut().push(match tag.kind { TagKind::StartTag => TestToken::StartTag { name, attributes: HashMap::from_iter( @@ -58,7 +60,9 @@ impl TokenSink for TokenSinkProxy<'_, Sink> { }); } Token::CommentToken(ref s) => { - self.tokens.push(TestToken::Comment(s.to_string())); + self.tokens + .borrow_mut() + .push(TestToken::Comment(s.to_string())); } Token::CharacterTokens(ref s) => { if !s.is_empty() { @@ -73,7 +77,7 @@ impl TokenSink for TokenSinkProxy<'_, Sink> { self.inner.process_token(token, line_number) } - fn end(&mut self) { + fn end(&self) { self.inner.end(); } @@ -85,20 +89,20 @@ impl TokenSink for TokenSinkProxy<'_, Sink> { pub fn get(input: &str) -> Vec { let mut tokens = Vec::default(); - let mut b = BufferQueue::new(); + let b = BufferQueue::default(); b.push_back(StrTendril::from(input)); { - let mut t = Tokenizer::new( + let t = Tokenizer::new( TokenSinkProxy { inner: TreeBuilder::new(RcDom::default(), TreeBuilderOpts::default()), - tokens: &mut tokens, + tokens: RefCell::new(&mut tokens), }, TokenizerOpts::default(), ); - while let TokenizerResult::Script(_) = t.feed(&mut b) { + while let TokenizerResult::Script(_) = t.feed(&b) { // ignore script markers }