diff --git a/Cargo.lock b/Cargo.lock index d1e833285..a5fa19784 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -585,9 +585,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.2" +version = "1.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f34d93e62b03caf570cccc334cbc6c2fceca82f39211051345108adcba3eebdc" +checksum = "27f657647bcff5394bf56c7317665bbf790a137a50eaaa5c6bfbb9e27a518f2d" dependencies = [ "jobserver", "libc", @@ -832,18 +832,18 @@ dependencies = [ [[package]] name = "const_format" -version = "0.2.33" +version = "0.2.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50c655d81ff1114fb0dcdea9225ea9f0cc712a6f8d189378e82bdf62a473a64b" +checksum = "126f97965c8ad46d6d9163268ff28432e8f6a1196a55578867832e3049df63dd" dependencies = [ "const_format_proc_macros", ] [[package]] name = "const_format_proc_macros" -version = "0.2.33" +version = "0.2.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eff1a44b93f47b1bac19a27932f5c591e43d1ba357ee4f61526c8a25603f0eb1" +checksum = "1d57c2eccfb16dbac1f4e61e206105db5820c9d26c3c472bc17c774259ef7744" dependencies = [ "proc-macro2", "quote", @@ -2406,9 +2406,9 @@ dependencies = [ [[package]] name = "js-sys" -version = "0.3.74" +version = "0.3.76" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a865e038f7f6ed956f788f0d7d60c541fff74c7bd74272c5d4cf15c63743e705" +checksum = "6717b6b5b077764fb5966237269cb3c64edddde4b14ce42647430a78ced9e7b7" dependencies = [ "once_cell", "wasm-bindgen", @@ -3336,9 +3336,9 @@ dependencies = [ [[package]] name = "prost" -version = "0.13.3" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7b0487d90e047de87f984913713b85c601c05609aad5b0df4b4573fbf69aa13f" +checksum = "2c0fef6c4230e4ccf618a35c59d7ede15dea37de8427500f50aff708806e42ec" dependencies = [ "bytes", "prost-derive", @@ -3346,9 +3346,9 @@ dependencies = [ [[package]] name = "prost-derive" -version = "0.13.3" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e9552f850d5f0964a4e4d0bf306459ac29323ddfbae05e35a7c0d35cb0803cc5" +checksum = "157c5a9d7ea5c2ed2d9fb8f495b64759f7816c7eaea54ba3978f0d63000162e3" dependencies = [ "anyhow", "itertools 0.13.0", @@ -3359,18 +3359,18 @@ dependencies = [ [[package]] name = "prost-types" -version = "0.13.3" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4759aa0d3a6232fb8dbdb97b61de2c20047c68aca932c7ed76da9d788508d670" +checksum = "cc2f1e56baa61e93533aebc21af4d2134b70f66275e0fcdf3cbe43d77ff7e8fc" dependencies = [ "prost", ] [[package]] name = "psl" -version = "2.1.65" +version = "2.1.66" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "116ebba3917694f4abc8de221600dd24e2686afaf04bb1ab5b4c966f35cecf88" +checksum = "59bbcf24d16c12896265cdc98f2b98afb1e62223bc0c396565eaf1c2b6278170" dependencies = [ "psl-types", ] @@ -3436,7 +3436,7 @@ dependencies = [ "rustc-hash 2.1.0", "rustls 0.23.19", "socket2", - "thiserror 2.0.4", + "thiserror 2.0.5", "tokio", "tracing", ] @@ -3455,7 +3455,7 @@ dependencies = [ "rustls 0.23.19", "rustls-pki-types", "slab", - "thiserror 2.0.4", + "thiserror 2.0.5", "tinyvec", "tracing", "web-time", @@ -4278,7 +4278,7 @@ dependencies = [ [[package]] name = "spider" -version = "2.20.4" +version = "2.20.5" dependencies = [ "ahash", "aho-corasick", @@ -4340,7 +4340,7 @@ dependencies = [ [[package]] name = "spider_chrome" -version = "2.20.4" +version = "2.20.5" dependencies = [ "adblock", "async-tungstenite", @@ -4377,7 +4377,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "2.20.4" +version = "2.20.5" dependencies = [ "clap", "env_logger", @@ -4402,7 +4402,7 @@ dependencies = [ [[package]] name = "spider_transformations" -version = "2.20.4" +version = "2.20.5" dependencies = [ "aho-corasick", "fast_html2md", @@ -4424,7 +4424,7 @@ dependencies = [ [[package]] name = "spider_utils" -version = "2.20.4" +version = "2.20.5" dependencies = [ "indexmap 1.9.3", "serde", @@ -4436,7 +4436,7 @@ dependencies = [ [[package]] name = "spider_worker" -version = "2.20.4" +version = "2.20.5" dependencies = [ "env_logger", "lazy_static", @@ -4716,11 +4716,11 @@ dependencies = [ [[package]] name = "thiserror" -version = "2.0.4" +version = "2.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f49a1853cf82743e3b7950f77e0f4d622ca36cf4317cba00c767838bac8d490" +checksum = "643caef17e3128658ff44d85923ef2d28af81bb71e0d67bbfe1d76f19a73e053" dependencies = [ - "thiserror-impl 2.0.4", + "thiserror-impl 2.0.5", ] [[package]] @@ -4736,9 +4736,9 @@ dependencies = [ [[package]] name = "thiserror-impl" -version = "2.0.4" +version = "2.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8381894bb3efe0c4acac3ded651301ceee58a15d47c2e34885ed1908ad667061" +checksum = "995d0bbc9995d1f19d28b7215a9352b0fc3cd3a2d2ec95c2cadc485cdedbcdde" dependencies = [ "proc-macro2", "quote", @@ -4931,9 +4931,9 @@ dependencies = [ [[package]] name = "tokio-stream" -version = "0.1.16" +version = "0.1.17" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f4e6ce100d0eb49a2734f8c0812bcd324cf357d21810932c5df6b96ef2b86f1" +checksum = "eca58d7bba4a75707817a2c44174253f9236b2d5fbd055602e9d5c07c139a047" dependencies = [ "futures-core", "pin-project-lite", @@ -5427,9 +5427,9 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.97" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d15e63b4482863c109d70a7b8706c1e364eb6ea449b201a76c5b89cedcec2d5c" +checksum = "a474f6281d1d70c17ae7aa6a613c87fce69a127e2624002df63dcb39d6cf6396" dependencies = [ "cfg-if", "once_cell", @@ -5438,13 +5438,12 @@ dependencies = [ [[package]] name = "wasm-bindgen-backend" -version = "0.2.97" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8d36ef12e3aaca16ddd3f67922bc63e48e953f126de60bd33ccc0101ef9998cd" +checksum = "5f89bb38646b4f81674e8f5c3fb81b562be1fd936d84320f3264486418519c79" dependencies = [ "bumpalo", "log", - "once_cell", "proc-macro2", "quote", "syn 2.0.90", @@ -5453,9 +5452,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-futures" -version = "0.4.47" +version = "0.4.49" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9dfaf8f50e5f293737ee323940c7d8b08a66a95a419223d9f41610ca08b0833d" +checksum = "38176d9b44ea84e9184eff0bc34cc167ed044f816accfe5922e54d84cf48eca2" dependencies = [ "cfg-if", "js-sys", @@ -5466,9 +5465,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.97" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "705440e08b42d3e4b36de7d66c944be628d579796b8090bfa3471478a2260051" +checksum = "2cc6181fd9a7492eef6fef1f33961e3695e4579b9872a6f7c83aee556666d4fe" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -5476,9 +5475,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.97" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "98c9ae5a76e46f4deecd0f0255cc223cfa18dc9b261213b8aa0c7b36f61b3f1d" +checksum = "30d7a95b763d3c45903ed6c81f156801839e5ee968bb07e534c44df0fcd330c2" dependencies = [ "proc-macro2", "quote", @@ -5489,9 +5488,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.97" +version = "0.2.99" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ee99da9c5ba11bd675621338ef6fa52296b76b83305e9b6e5c77d4c286d6d49" +checksum = "943aab3fdaaa029a6e0271b35ea10b72b943135afe9bffca82384098ad0e06a6" [[package]] name = "wasm-streams" @@ -5508,9 +5507,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.74" +version = "0.3.76" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a98bc3c33f0fe7e59ad7cd041b89034fa82a7c2d4365ca538dda6cdaf513863c" +checksum = "04dd7223427d52553d3702c004d3b2fe07c148165faa56313cb00211e31c12bc" dependencies = [ "js-sys", "wasm-bindgen", diff --git a/spider/Cargo.toml b/spider/Cargo.toml index 825010a60..9951f8875 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "2.20.4" +version = "2.20.5" authors = [ "j-mendez " ] diff --git a/spider_chrome/Cargo.toml b/spider_chrome/Cargo.toml index 91dfd6d1d..eac8ad944 100644 --- a/spider_chrome/Cargo.toml +++ b/spider_chrome/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_chrome" -version = "2.20.4" +version = "2.20.5" rust-version = "1.70" authors = [ "j-mendez " diff --git a/spider_chrome/src/handler/blockers/glassdoor_blockers.rs b/spider_chrome/src/handler/blockers/glassdoor_blockers.rs new file mode 100644 index 000000000..eca04ee2a --- /dev/null +++ b/spider_chrome/src/handler/blockers/glassdoor_blockers.rs @@ -0,0 +1,54 @@ +use crate::handler::blockers::Trie; + +lazy_static::lazy_static! { + /// Ignore list of urls. + static ref URL_IGNORE_TRIE: Trie = { + let mut trie = Trie::new(); + let patterns = [ + "https://www.glassdoor.com/garnish/static/js/gd-sw-register.", + "https://cdnjs.cloudflare.com/ajax/libs/prop-types/15.7.2/prop-types.min.js", + "https://www.glassdoor.com/autocomplete/location?", + ]; + for pattern in &patterns { + trie.insert(pattern); + } + trie + }; + + /// Ignore list of urls styles. + static ref URL_IGNORE_TRIE_STYLES: Trie = { + let mut trie = Trie::new(); + let patterns = [ + "https://www.glassdoor.com/sam-global-nav/static/", + "https://www.glassdoor.com/garnish/static/js/gd-", + "https://unpkg.com/@dotlottie/player-component@", + "https://www.glassdoor.com/job-search-next/assets/_next/static/", + "https://www.glassdoor.com/ei-overview-next/assets/_next/static/", + "https://www.glassdoor.com/occ-salaries-web/assets/_next/static/" + ]; + for pattern in &patterns { + trie.insert(pattern); + } + trie + }; +} + +// Block glassdoor events that are not required +pub fn block_glassdoor_styles( + event: &chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused, +) -> bool { + URL_IGNORE_TRIE_STYLES.contains_prefix(&event.request.url) +} + +// Block glassdoor events that are not required +pub fn block_glassdoor( + event: &chromiumoxide_cdp::cdp::browser_protocol::fetch::EventRequestPaused, + ignore_visuals: bool, +) -> bool { + let blocked = URL_IGNORE_TRIE.contains_prefix(&event.request.url); + if !blocked && ignore_visuals { + block_glassdoor_styles(event) + } else { + blocked + } +} diff --git a/spider_chrome/src/handler/blockers/mod.rs b/spider_chrome/src/handler/blockers/mod.rs index 76730121f..28f04e15c 100644 --- a/spider_chrome/src/handler/blockers/mod.rs +++ b/spider_chrome/src/handler/blockers/mod.rs @@ -2,6 +2,8 @@ pub mod adblock_patterns; /// amazon blockers pub mod amazon_blockers; +/// glassdoor blockers +pub mod glassdoor_blockers; /// linkedin blockers pub mod linkedin_blockers; /// netflix blockers diff --git a/spider_chrome/src/handler/blockers/upwork_blockers.rs b/spider_chrome/src/handler/blockers/upwork_blockers.rs index 01f4c2f80..dfef50ff7 100644 --- a/spider_chrome/src/handler/blockers/upwork_blockers.rs +++ b/spider_chrome/src/handler/blockers/upwork_blockers.rs @@ -26,7 +26,7 @@ lazy_static::lazy_static! { trie }; - /// Ignore list of urls. + /// Ignore list of urls styles. static ref URL_IGNORE_TRIE_STYLES: Trie = { let mut trie = Trie::new(); let patterns = [ diff --git a/spider_chrome/src/handler/network.rs b/spider_chrome/src/handler/network.rs index 5d84cbbb6..ee294a69a 100644 --- a/spider_chrome/src/handler/network.rs +++ b/spider_chrome/src/handler/network.rs @@ -130,6 +130,8 @@ lazy_static! { "https://js.hsforms.net/forms/embed/v2.js", "https://static.parastorage.com/services/wix-thunderbolt/dist/", "https://static.parastorage.com/services/tag-manager-client/", + "https://www.datadoghq-browser-agent.com/datadog-rum-slim-v4.js", + "https://cdn.rudderlabs.com", ".sharethis.com", ".newrelic.com", ".googlesyndication.com", @@ -143,6 +145,7 @@ lazy_static! { "tinypass.min.js", ".airship.com", ".adlightning.com", + ".lab.amplitude.", // explicit ignore tracking.js and ad files "privacy-notice.js", "tracking.js", @@ -207,6 +210,8 @@ lazy_static! { "https://idx.liadm.com", "https://geo.privacymanager.io/", "https://nimbleplot.com", + "https://api.lab.amplitude.com/", + "https://flag.lab.amplitude.com/sdk/v2/flags", ".wixapps.net/api/v1/bulklog", // video embeddings "https://video.squarespace-cdn.com/content/", @@ -250,7 +255,6 @@ lazy_static! { // extra CDN scripts "https://cdn.readme.io/public/", - // insight tracker "https://insight.adsrvr.org/track/", "cxense.com/", @@ -261,6 +265,8 @@ lazy_static! { // ignore font extras "https://kit.fontawesome.com/", "https://use.typekit.net", + ".amplitude.com", + ".rudderstack.com", // ignore tailwind cdn "https://cdn.tailwindcss.com", // ignore extra ads @@ -368,6 +374,8 @@ pub enum NetworkInterceptManager { Netflix, /// upwork.com, Upwork, + /// glassdoor.com + Glassdoor, #[default] /// Unknown Unknown, @@ -375,7 +383,7 @@ pub enum NetworkInterceptManager { lazy_static! { /// Top tier list of the most common websites visited. - pub static ref TOP_TIER_LIST: [(&'static str, NetworkInterceptManager); 12] = [ + pub static ref TOP_TIER_LIST: [(&'static str, NetworkInterceptManager); 14] = [ ("https://www.tiktok.com", NetworkInterceptManager::TikTok), ("https://tiktok.com", NetworkInterceptManager::TikTok), ("https://www.amazon.com", NetworkInterceptManager::Amazon), @@ -391,6 +399,8 @@ lazy_static! { ("https://linkedin.com", NetworkInterceptManager::LinkedIn), ("https://www.upwork.com", NetworkInterceptManager::Upwork), ("https://upwork.com", NetworkInterceptManager::Upwork), + ("https://www.glassdoor.com", NetworkInterceptManager::Glassdoor), + ("https://glassdoor.com", NetworkInterceptManager::Glassdoor), ]; } @@ -736,6 +746,12 @@ impl NetworkManager { NetworkInterceptManager::LinkedIn => { super::blockers::linkedin_blockers::block_linkedin(event) } + NetworkInterceptManager::Glassdoor => { + super::blockers::glassdoor_blockers::block_glassdoor( + event, + self.ignore_visuals, + ) + } NetworkInterceptManager::Upwork => { super::blockers::upwork_blockers::block_upwork( event, @@ -850,6 +866,12 @@ impl NetworkManager { NetworkInterceptManager::LinkedIn => { super::blockers::linkedin_blockers::block_linkedin(event) } + NetworkInterceptManager::Glassdoor => { + super::blockers::glassdoor_blockers::block_glassdoor( + event, + self.ignore_visuals, + ) + } NetworkInterceptManager::Upwork => { super::blockers::upwork_blockers::block_upwork( event, diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index 133f555aa..11651eaa9 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "2.20.4" +version = "2.20.5" authors = [ "j-mendez " ] diff --git a/spider_transformations/Cargo.toml b/spider_transformations/Cargo.toml index 37ff1dcfb..6f7c8f495 100644 --- a/spider_transformations/Cargo.toml +++ b/spider_transformations/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_transformations" -version = "2.20.4" +version = "2.20.5" authors = [ "j-mendez " ] diff --git a/spider_utils/Cargo.toml b/spider_utils/Cargo.toml index 1513b8469..86fab0b52 100644 --- a/spider_utils/Cargo.toml +++ b/spider_utils/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_utils" -version = "2.20.4" +version = "2.20.5" authors = [ "j-mendez " ] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index 5d0c29147..7d1194ab3 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "2.20.4" +version = "2.20.5" authors = [ "j-mendez " ]