From d5b69ceae5b76d76e2e6ae8161f54e5a72e6169e Mon Sep 17 00:00:00 2001 From: j-mendez Date: Mon, 30 Dec 2024 05:13:33 -0500 Subject: [PATCH] chore(page): add goaway retry --- Cargo.lock | 13 ++++----- spider/Cargo.toml | 3 ++- spider/src/page.rs | 45 ++++++++++++++++++++++++++----- spider_chrome/Cargo.toml | 2 +- spider_cli/Cargo.toml | 2 +- spider_transformations/Cargo.toml | 2 +- spider_utils/Cargo.toml | 2 +- spider_worker/Cargo.toml | 2 +- 8 files changed, 53 insertions(+), 18 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d71d5f799..a73e68376 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5367,7 +5367,7 @@ dependencies = [ [[package]] name = "spider" -version = "2.22.11" +version = "2.22.13" dependencies = [ "ahash", "aho-corasick", @@ -5383,6 +5383,7 @@ dependencies = [ "cron", "fastrand 2.3.0", "flexbuffers", + "h2 0.4.7", "hashbrown 0.15.2", "http 1.2.0", "http-cache", @@ -5426,7 +5427,7 @@ dependencies = [ [[package]] name = "spider_chrome" -version = "2.22.11" +version = "2.22.13" dependencies = [ "adblock", "aho-corasick", @@ -5516,7 +5517,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "2.22.11" +version = "2.22.13" dependencies = [ "clap", "env_logger", @@ -5559,7 +5560,7 @@ dependencies = [ [[package]] name = "spider_transformations" -version = "2.22.11" +version = "2.22.13" dependencies = [ "aho-corasick", "fast_html2md", @@ -5582,7 +5583,7 @@ dependencies = [ [[package]] name = "spider_utils" -version = "2.22.11" +version = "2.22.13" dependencies = [ "indexmap 1.9.3", "serde", @@ -5595,7 +5596,7 @@ dependencies = [ [[package]] name = "spider_worker" -version = "2.22.11" +version = "2.22.13" dependencies = [ "env_logger", "lazy_static", diff --git a/spider/Cargo.toml b/spider/Cargo.toml index 020bffb3c..1a90e3ed3 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "2.22.11" +version = "2.22.13" authors = [ "j-mendez " ] @@ -66,6 +66,7 @@ aho-corasick = { version = "1", 
optional = true } tracing = { version = "0.1", default-features = false, features = ["std"], optional = true } sysinfo = { version = "0.33", default-features = false, features = ["system"], optional = true } sqlx = { version = "0.8", features = [ "runtime-tokio", "sqlite" ], optional = true } +h2 = "0.4" [dependencies.spider_chrome] version = "2" diff --git a/spider/src/page.rs b/spider/src/page.rs index 89ada34c6..159e18716 100644 --- a/spider/src/page.rs +++ b/spider/src/page.rs @@ -24,6 +24,9 @@ use crate::utils::FetchPageResult; use tokio_stream::StreamExt; use url::Url; +/// Allocate up to 16kb upfront for small pages. +const MAX_PRE_ALLOCATED_HTML_PAGE_SIZE: u64 = 16 * 1024; + lazy_static! { /// Wildcard match all domains. static ref CASELESS_WILD_CARD: CaseInsensitiveString = CaseInsensitiveString::new("*"); @@ -401,6 +404,33 @@ pub fn validate_empty(content: &Option>, is_success: bool) -> bool { } } +/// Extract a specific type of error from a chain of errors. +fn extract_specific_error<'a, T: std::error::Error + 'static>( + error: &'a (dyn std::error::Error + 'static), +) -> Option<&'a T> { + let mut current_error = Some(error); + while let Some(err) = current_error { + if let Some(desired_error) = err.downcast_ref::<T>() { + return Some(desired_error); + } + current_error = err.source(); + } + None +} + +/// Determine if the response is goaway and should retry. +fn should_attempt_retry(error: &(dyn std::error::Error + 'static)) -> bool { + if let Some(e) = extract_specific_error::<h2::Error>(error) { + if e.is_go_away() && e.is_remote() && e.reason() == Some(h2::Reason::NO_ERROR) { + return true; + } + if e.is_remote() && e.reason() == Some(h2::Reason::REFUSED_STREAM) { + return true; + } + } + false +} + /// Instantiate a new page without scraping it (used for testing purposes). 
#[cfg(not(feature = "decentralized"))] pub fn build(url: &str, res: PageResponse) -> Page { @@ -441,6 +471,9 @@ pub fn build(url: &str, res: PageResponse) -> Page { if er.is_status() || er.is_connect() || er.is_timeout() { should_retry = !er.to_string().contains("ENOTFOUND"); } + if !should_retry && should_attempt_retry(&er) { + should_retry = true; + } Some(er.to_string()) } }, @@ -502,9 +535,6 @@ pub struct PageLinkBuildSettings { pub subdomains: bool, } -/// Default byte capacity for response stream collecting. -const DEFAULT_BYTE_CAPACITY: u64 = 8 * 1024; - impl PageLinkBuildSettings { /// New build link settings. pub fn new(ssg_build: bool, full_resources: bool) -> Self { @@ -618,9 +648,12 @@ impl Page { _ => (AsciiCompatibleEncoding::utf_8(), true), }; - let mut collected_bytes = bytes::BytesMut::with_capacity( - res.content_length().unwrap_or(DEFAULT_BYTE_CAPACITY) as usize, - ); + let mut collected_bytes = match res.content_length() { + Some(cap) if cap <= MAX_PRE_ALLOCATED_HTML_PAGE_SIZE => { + bytes::BytesMut::with_capacity(cap as usize) + } + _ => bytes::BytesMut::new(), + }; let target_url = res.url().as_str(); diff --git a/spider_chrome/Cargo.toml b/spider_chrome/Cargo.toml index 1e56d22c4..5984243f4 100644 --- a/spider_chrome/Cargo.toml +++ b/spider_chrome/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_chrome" -version = "2.22.11" +version = "2.22.13" rust-version = "1.70" authors = [ "j-mendez " diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index be404ace2..e182b9b26 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "2.22.11" +version = "2.22.13" authors = [ "j-mendez " ] diff --git a/spider_transformations/Cargo.toml b/spider_transformations/Cargo.toml index 54519917f..1b9acda18 100644 --- a/spider_transformations/Cargo.toml +++ b/spider_transformations/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_transformations" -version = "2.22.11" +version = "2.22.13" 
authors = [ "j-mendez " ] diff --git a/spider_utils/Cargo.toml b/spider_utils/Cargo.toml index cfe2a26e8..cbf697405 100644 --- a/spider_utils/Cargo.toml +++ b/spider_utils/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_utils" -version = "2.22.11" +version = "2.22.13" authors = [ "j-mendez " ] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index 481315e8b..f9c850266 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "2.22.11" +version = "2.22.13" authors = [ "j-mendez " ]