Don't log chrome ext. pages on site visit timeouts
It doesn't seem interesting to know that the browser failed
to even navigate away from the extension page (that we
already had open, I assume).
ghostwords committed Jan 7, 2025
1 parent 529f4a1 commit 798e5e6
Showing 1 changed file with 16 additions and 15 deletions.

crawler.py
@@ -1001,18 +1001,12 @@ def get_current_url(self):
         except Exception:
             return None
 
-    def crawl(self):
+    def crawl(self, domains):
         """
         Visit the top `num_sites` websites in the Tranco list, in order, in
         a virtual browser with Privacy Badger installed. Afterwards, save the
         action_map and snitch_map that the Badger learned.
         """
-
-        if self.num_sites == 0:
-            domains = []
-        else:
-            domains = self.get_sitelist()
-
         num_visited = 0
         old_snitches = self.dump_data()['snitch_map']
 
@@ -1037,23 +1031,25 @@ def crawl(self):
                 self.logger.info("Visiting %d: %s", i + 1, domain)
                 self.visit_domain(domain)
 
-                curl_or_domain = self.get_current_url() or domain
-                if curl_or_domain.startswith(CHROME_URL_PREFIX):
+                curl = self.get_current_url()
+                if curl and curl.startswith(CHROME_URL_PREFIX):
                     self.logger.error("Error loading %s: "
-                                      "driver.current_url is still %s",
-                                      domain, curl_or_domain)
+                                      "driver.current_url is still a %s page",
+                                      domain, CHROME_URL_PREFIX)
                     continue
 
-                self.logger.info("Visited %s", curl_or_domain)
+                self.logger.info("Visited %s", curl or domain)
                 num_visited += 1
 
             except (MaxRetryError, ProtocolError, ReadTimeoutError) as ex:
                 self.logger.error("Error loading %s:\n%s", domain, str(ex))
                 self.restart_browser()
 
             except TimeoutException:
-                self.logger.warning("Timed out loading %s",
-                                    self.get_current_url() or domain)
+                curl = self.get_current_url()
+                if curl and curl.startswith(CHROME_URL_PREFIX):
+                    curl = None
+                self.logger.warning("Timed out loading %s", curl or domain)
 
             except WebDriverException as ex:
                 if should_restart(ex):
@@ -1199,4 +1195,9 @@ def save(self, data, name='results.json'):
             data = json.load(f)
         crawler.load_user_data(data)
 
-    crawler.crawl()
+    if crawler.num_sites == 0:
+        domains = []
+    else:
+        domains = crawler.get_sitelist()
+
+    crawler.crawl(domains)
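Taken together, the timeout branch now works like this: read driver.current_url, and if it is missing or still points at an internal extension page, log the Tranco domain instead. Below is a minimal standalone sketch of that rule, assuming CHROME_URL_PREFIX is something like "chrome-extension://" (the real value lives in crawler.py); url_for_logging and log_timeout are illustrative names, not functions from the crawler.

```python
import logging

logging.basicConfig(format="%(levelname)s %(message)s", level=logging.INFO)
logger = logging.getLogger("crawl-sketch")

# Assumption: the actual constant is defined in crawler.py; this value
# is a plausible stand-in for illustration only.
CHROME_URL_PREFIX = "chrome-extension://"

def url_for_logging(current_url):
    """Return a URL worth logging, or None if the browser never left an
    internal extension page (or the current URL could not be read)."""
    if current_url and not current_url.startswith(CHROME_URL_PREFIX):
        return current_url
    return None

def log_timeout(current_url, domain):
    # Fall back to the domain being visited whenever the current URL
    # carries no information (missing, or still an extension page).
    logger.warning("Timed out loading %s", url_for_logging(current_url) or domain)

log_timeout("chrome-extension://abc123/options.html", "example.com")
# WARNING Timed out loading example.com   (extension URL suppressed)
log_timeout("https://example.com/slow-page", "example.com")
# WARNING Timed out loading https://example.com/slow-page
```

The remaining hunks are a small refactor in the same spirit: crawl() now takes the list of domains as a parameter, and the caller decides whether that list is empty or comes from get_sitelist().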
