From 3c29af5d7807e2e1da50b66571705bbdcc4e99a2 Mon Sep 17 00:00:00 2001 From: Alexei Date: Fri, 27 Dec 2024 14:09:44 -0500 Subject: [PATCH] Use latest Ghostery blocklist See ghostery/ghostery-extension/pull/1051 --- lib/lists/blocklist.py | 23 ++++++++++++------ lib/lists/ghostery.py | 55 +++++++++++++++--------------------------- 2 files changed, 35 insertions(+), 43 deletions(-) diff --git a/lib/lists/blocklist.py b/lib/lists/blocklist.py index bee9c73..0360050 100644 --- a/lib/lists/blocklist.py +++ b/lib/lists/blocklist.py @@ -24,14 +24,23 @@ def _download(self, url, filename): with open(filename, 'w', encoding='utf-8') as file: file.write(data.decode('utf-8')) + def exists_and_unexpired(self, filename, expire_cache_hrs): + if not os.path.isfile(filename): + return False + + time_diff = time.time() - os.path.getmtime(filename) + if time_diff / 3600 > expire_cache_hrs: + return False + + return True + def fetch(self, url, filename, expire_cache_hrs=24): os.makedirs(self.cache_dir, exist_ok=True) - if not os.path.isfile(filename): - self._download(url, filename) - # redownload if cached file is older than specified span of hours - elif (time.time() - os.path.getmtime(filename)) / 3600 > expire_cache_hrs: - # first remove (back up) the file so that if downloading fails, - # we know something went wrong - os.replace(filename, filename + ".bak") + if not self.exists_and_unexpired(filename, expire_cache_hrs): + if os.path.isfile(filename): + # first remove (back up) the file so that if downloading fails, + # we know something went wrong + os.replace(filename, filename + ".bak") + self._download(url, filename) diff --git a/lib/lists/ghostery.py b/lib/lists/ghostery.py index 72b7644..c824c16 100644 --- a/lib/lists/ghostery.py +++ b/lib/lists/ghostery.py @@ -2,25 +2,12 @@ import json import os - -from collections.abc import MutableMapping +import urllib from lib.basedomain import extract from lib.lists.blocklist import Blocklist -# https://stackoverflow.com/a/6027615 -def flatten(dictionary, parent_key='', separator='_'): - items = [] - for key, value in dictionary.items(): - new_key = parent_key + separator + key if parent_key else key - if isinstance(value, MutableMapping): - items.extend(flatten(value, new_key, separator).items()) - else: - items.append((new_key, value)) - return dict(items) - - class Ghostery(Blocklist): bases = set() @@ -30,10 +17,18 @@ class Ghostery(Blocklist): blocked_categories = ("advertising", "site_analytics", "pornvertising") def __init__(self): - url = "https://cdn.ghostery.com/update/v4.1/bugs.json" - filename = os.path.join(self.cache_dir, "ghostery-bugs.json") + filename = os.path.join(self.cache_dir, "ghostery-trackerdb.json") + expire_hrs = 168 # weekly expiration + + if not self.exists_and_unexpired(filename, expire_hrs): + url = "https://github.com/ghostery/trackerdb/releases/latest" + with urllib.request.urlopen( + urllib.request.Request(url, method='HEAD')) as conn: + version = conn.geturl().rpartition('/')[-1] - self.fetch(url, filename, expire_cache_hrs=168) # weekly expiration + url = ("https://github.com/ghostery/trackerdb/releases" + f"/download/{version}/trackerdb.json") + self.fetch(url, filename, expire_cache_hrs=expire_hrs) try: with open(filename, encoding='utf-8') as file: @@ -42,26 +37,14 @@ def __init__(self): print(f"WARNING Failed to open {filename}") return - # TODO review if we can ingest some domains from other pattern types, not just "host" - - # since '_' is a valid domain names character, '_' is a bad separator - # for working with domains names; let's use ':' instead - host_patterns_flat = flatten(data["patterns"]["host"], separator=':') - - for domain_key, bug_id in host_patterns_flat.items(): - # trim the last segment ("_$") and then reverse - domain = ".".join(domain_key.split(":")[:-1][::-1]) - - base = extract(domain).registered_domain - if not base: - base = domain - self.bases.add(base) + for name in data['patterns']: + for domain in data['patterns'][name]['domains']: + base = extract(domain).registered_domain or domain + self.bases.add(base) - aid = data["bugs"][str(bug_id)]["aid"] - category = data["apps"][str(aid)]["cat"] - if category not in self.blocked_categories: - self.bases_unblocked.add(base) + if data['patterns'][name]['category'] not in self.blocked_categories: + self.bases_unblocked.add(base) - self.domains.add(domain) + self.domains.add(domain) self.ready = True