Skip to content

Commit

Permalink
chore(page): add goaway retry
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Dec 30, 2024
1 parent 1b9c3c1 commit d5b69ce
Show file tree
Hide file tree
Showing 8 changed files with 53 additions and 18 deletions.
13 changes: 7 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.22.11"
version = "2.22.13"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down Expand Up @@ -66,6 +66,7 @@ aho-corasick = { version = "1", optional = true }
tracing = { version = "0.1", default-features = false, features = ["std"], optional = true }
sysinfo = { version = "0.33", default-features = false, features = ["system"], optional = true }
sqlx = { version = "0.8", features = [ "runtime-tokio", "sqlite" ], optional = true }
h2 = "0.4"

[dependencies.spider_chrome]
version = "2"
Expand Down
45 changes: 39 additions & 6 deletions spider/src/page.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ use crate::utils::FetchPageResult;
use tokio_stream::StreamExt;
use url::Url;

/// Largest advertised content length, in bytes (16 KiB), for which the response
/// buffer is pre-allocated to its full size; a larger or unknown content length
/// starts from an empty buffer instead.
const MAX_PRE_ALLOCATED_HTML_PAGE_SIZE: u64 = 16 * 1024;

lazy_static! {
/// Wildcard match all domains.
static ref CASELESS_WILD_CARD: CaseInsensitiveString = CaseInsensitiveString::new("*");
Expand Down Expand Up @@ -401,6 +404,33 @@ pub fn validate_empty(content: &Option<Box<Bytes>>, is_success: bool) -> bool {
}
}

/// Search an error and its chain of sources for the first link of type `T`.
///
/// The search starts with `error` itself and then follows each successive
/// `source()`. Returns `None` when no link in the chain downcasts to `T`.
fn extract_specific_error<'a, T: std::error::Error + 'static>(
    error: &'a (dyn std::error::Error + 'static),
) -> Option<&'a T> {
    // Lazily yields `error`, then `error.source()`, and so on until a link
    // reports no further source.
    std::iter::successors(Some(error), |&link| link.source())
        .find_map(|link| link.downcast_ref::<T>())
}

/// Determine if the response is goaway and should retry.
fn should_attempt_retry(error: &(dyn std::error::Error + 'static)) -> bool {
if let Some(e) = extract_specific_error::<h2::Error>(error) {
if e.is_go_away() && e.is_remote() && e.reason() == Some(h2::Reason::NO_ERROR) {
return true;
}
if e.is_remote() && e.reason() == Some(h2::Reason::REFUSED_STREAM) {
return true;
}
}
false
}

/// Instantiate a new page without scraping it (used for testing purposes).
#[cfg(not(feature = "decentralized"))]
pub fn build(url: &str, res: PageResponse) -> Page {
Expand Down Expand Up @@ -441,6 +471,9 @@ pub fn build(url: &str, res: PageResponse) -> Page {
if er.is_status() || er.is_connect() || er.is_timeout() {
should_retry = !er.to_string().contains("ENOTFOUND");
}
if !should_retry && should_attempt_retry(&er) {
should_retry = true;
}
Some(er.to_string())
}
},
Expand Down Expand Up @@ -502,9 +535,6 @@ pub struct PageLinkBuildSettings {
pub subdomains: bool,
}

/// Default byte capacity for response stream collecting.
const DEFAULT_BYTE_CAPACITY: u64 = 8 * 1024;

impl PageLinkBuildSettings {
/// New build link settings.
pub fn new(ssg_build: bool, full_resources: bool) -> Self {
Expand Down Expand Up @@ -618,9 +648,12 @@ impl Page {
_ => (AsciiCompatibleEncoding::utf_8(), true),
};

let mut collected_bytes = bytes::BytesMut::with_capacity(
res.content_length().unwrap_or(DEFAULT_BYTE_CAPACITY) as usize,
);
let mut collected_bytes = match res.content_length() {
Some(cap) if cap <= MAX_PRE_ALLOCATED_HTML_PAGE_SIZE => {
bytes::BytesMut::with_capacity(cap as usize)
}
_ => bytes::BytesMut::new(),
};

let target_url = res.url().as_str();

Expand Down
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.22.11"
version = "2.22.13"
rust-version = "1.70"
authors = [
"j-mendez <jeff@spider.cloud>"
Expand Down
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.22.11"
version = "2.22.13"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_transformations/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_transformations"
version = "2.22.11"
version = "2.22.13"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_utils/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_utils"
version = "2.22.11"
version = "2.22.13"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_worker/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_worker"
version = "2.22.11"
version = "2.22.13"
authors = [
"j-mendez <jeff@spider.cloud>"
]
Expand Down

0 comments on commit d5b69ce

Please sign in to comment.