From a04945e0ad80ae31042b06ed5fffa44a6420d2e9 Mon Sep 17 00:00:00 2001 From: davidemarcoli Date: Tue, 9 Jul 2024 20:35:23 +0200 Subject: [PATCH 1/3] Add first version of comet scraper Fix annatar not using custom url for validation --- backend/program/scrapers/__init__.py | 4 +- backend/program/scrapers/annatar.py | 2 +- backend/program/scrapers/comet.py | 134 +++++++++++++++++++++++++++ backend/program/settings/models.py | 16 ++++ 4 files changed, 154 insertions(+), 2 deletions(-) create mode 100644 backend/program/scrapers/comet.py diff --git a/backend/program/scrapers/__init__.py b/backend/program/scrapers/__init__.py index 81263fd9..1091514a 100644 --- a/backend/program/scrapers/__init__.py +++ b/backend/program/scrapers/__init__.py @@ -16,6 +16,7 @@ from program.scrapers.torbox import TorBoxScraper from program.scrapers.torrentio import Torrentio from program.scrapers.zilean import Zilean +from program.scrapers.comet import Comet from program.settings.manager import settings_manager from RTN import Torrent from utils.logger import logger @@ -35,7 +36,8 @@ def __init__(self): TorBoxScraper: TorBoxScraper(), Mediafusion: Mediafusion(), Prowlarr: Prowlarr(), - Zilean: Zilean() + Zilean: Zilean(), + Comet: Comet() } self.initialized = self.validate() if not self.initialized: diff --git a/backend/program/scrapers/annatar.py b/backend/program/scrapers/annatar.py index 3ba1ab99..7939bfbe 100644 --- a/backend/program/scrapers/annatar.py +++ b/backend/program/scrapers/annatar.py @@ -42,7 +42,7 @@ def validate(self) -> bool: logger.error("Annatar ratelimit must be a valid boolean.") return False try: - response = get("https://annatar.elfhosted.com/manifest.json", timeout=15) + response = get(f"{self.settings.url}/manifest.json", timeout=15) if not response.is_ok: return False return True diff --git a/backend/program/scrapers/comet.py b/backend/program/scrapers/comet.py new file mode 100644 index 00000000..148451d0 --- /dev/null +++ b/backend/program/scrapers/comet.py @@ 
-0,0 +1,134 @@ +""" Comet scraper module """ +from typing import Dict, Union +import base64 +import json + +from program.media.item import Episode, MediaItem, Movie, Season, Show +from program.settings.manager import settings_manager +from program.settings.models import CometConfig +from requests import ConnectTimeout, ReadTimeout +from requests.exceptions import RequestException +from utils.logger import logger +from utils.request import RateLimiter, RateLimitExceeded, get, ping + + +class Comet: + """Scraper for `Comet`""" + + def __init__(self): + self.key = "comet" + self.settings = settings_manager.settings.scraping.comet + self.timeout = self.settings.timeout + self.encoded_string = base64.b64encode(json.dumps({ + "indexers": self.settings.indexers, + "maxResults":0, + "filterTitles":False, + "resolutions":["All"], + "languages":["All"], + "debridService":"realdebrid", + "debridApiKey": settings_manager.settings.downloaders.real_debrid.api_key, + "debridStreamProxyPassword":"" + }).encode('utf-8')).decode('utf-8') + self.initialized = self.validate() + if not self.initialized: + return + self.second_limiter = RateLimiter(max_calls=1, period=5) if self.settings.ratelimit else None + logger.success("Comet initialized!") + + def validate(self) -> bool: + """Validate the Comet settings.""" + if not self.settings.enabled: + logger.warning("Comet is set to disabled.") + return False + if not self.settings.url: + logger.error("Comet URL is not configured and will not be used.") + return False + if not isinstance(self.timeout, int) or self.timeout <= 0: + logger.error("Comet timeout is not set or invalid.") + return False + if not isinstance(self.settings.ratelimit, bool): + logger.error("Comet ratelimit must be a valid boolean.") + return False + try: + url = f"{self.settings.url}/manifest.json" + response = ping(url=url, timeout=self.timeout) + if response.ok: + return True + except Exception as e: + logger.error(f"Comet failed to initialize: {e}", ) + return False 
+        return False
+
+    def run(self, item: MediaItem) -> Dict[str, str]:
+        """Scrape the comet site for the given media items
+        and update the object with scraped streams"""
+        if not item:
+            return {}
+
+        try:
+            # Returns a dict of {infoHash: raw_title}
+            return self.scrape(item)
+        except RateLimitExceeded:
+            if self.second_limiter:
+                self.second_limiter.limit_hit()
+            else:
+                logger.warning(f"Comet ratelimit exceeded for item: {item.log_string}")
+        except ConnectTimeout:
+            logger.warning(f"Comet connection timeout for item: {item.log_string}")
+        except ReadTimeout:
+            logger.warning(f"Comet read timeout for item: {item.log_string}")
+        except RequestException as e:
+            logger.error(f"Comet request exception: {str(e)}")
+        except Exception as e:
+            logger.error(f"Comet exception thrown: {str(e)}")
+        return {}
+
+    def scrape(self, item: MediaItem) -> Dict[str, str]:
+        """Scrape the given media item"""
+        data, stream_count = self.api_scrape(item)
+        if data:
+            logger.log("SCRAPER", f"Found {len(data)} streams out of {stream_count} for {item.log_string}")
+        else:
+            logger.log("NOT_FOUND", f"No streams found for {item.log_string}")
+        return data
+
+    def api_scrape(self, item: MediaItem) -> tuple[Dict[str, str], int]:
+        """Wrapper for `Comet` scrape method"""
+        if isinstance(item, Show):
+            scrape_type = "series"
+            imdb_id = item.imdb_id
+        elif isinstance(item, Season):
+            scrape_type = "series"
+            imdb_id = item.parent.imdb_id
+        elif isinstance(item, Episode):
+            scrape_type = "series"
+            imdb_id = item.parent.parent.imdb_id
+        elif isinstance(item, Movie):
+            scrape_type = "movie"
+            imdb_id = item.imdb_id
+
+        url = f"{self.settings.url}/{self.encoded_string}/stream/{scrape_type}/{imdb_id}.json"
+
+        if self.second_limiter:
+            with self.second_limiter:
+                response = get(url, timeout=self.timeout)
+        else:
+            response = get(url, timeout=self.timeout)
+
+        if not response.is_ok or not response.data.streams:
+            return {}, 0
+
+        torrents: Dict[str, str] = {}
+        for stream in response.data.streams:
+
+            #
Split the URL by '/playback/' and then split the remaining part by '/' + logger.info(url) + logger.info(url.split('/playback/')) + hash = url.split('/playback/')[1].split('/')[0] + + if not hash: + continue + + torrents[hash] = stream.title + + return torrents, len(response.data.media) diff --git a/backend/program/settings/models.py b/backend/program/settings/models.py index 9ac80406..5d9c0c6d 100644 --- a/backend/program/settings/models.py +++ b/backend/program/settings/models.py @@ -168,6 +168,21 @@ class KnightcrawlerConfig(Observable): timeout: int = 30 ratelimit: bool = True + +class CometConfig(Observable): + enabled: bool = False + url: str = "http://localhost:8000" + indexers: List[str] = [ + "bitsearch", + "eztv", + "thepiratebay", + "therarbg", + "yts" + ] + timeout: int = 30 + ratelimit: bool = True + + class ZileanConfig(Observable): enabled: bool = False url: str = "http://localhost:8181" @@ -238,6 +253,7 @@ class ScraperModel(Observable): torbox_scraper: TorBoxScraperConfig = TorBoxScraperConfig() mediafusion: MediafusionConfig = MediafusionConfig() zilean: ZileanConfig = ZileanConfig() + comet: CometConfig = CometConfig() # Version Ranking Model (set application defaults here!) 
From 3b43113dd8828a8a0691c26d407e67dc638eab80 Mon Sep 17 00:00:00 2001 From: davidemarcoli Date: Thu, 11 Jul 2024 20:53:12 +0200 Subject: [PATCH 2/3] Fix imdb id in comet scraper --- backend/program/scrapers/comet.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/backend/program/scrapers/comet.py b/backend/program/scrapers/comet.py index 148451d0..f7640b91 100644 --- a/backend/program/scrapers/comet.py +++ b/backend/program/scrapers/comet.py @@ -2,6 +2,7 @@ from typing import Dict, Union import base64 import json +from urllib.parse import quote from program.media.item import Episode, MediaItem, Movie, Season, Show from program.settings.manager import settings_manager @@ -96,18 +97,18 @@ def api_scrape(self, item: MediaItem) -> tuple[Dict[str, str], int]: """Wrapper for `Comet` scrape method""" if isinstance(item, Show): scrape_type = "series" - imdb_id = item.imdb_id + imdb_id = item.imdb_id + ":1" elif isinstance(item, Season): scrape_type = "series" - imdb_id = item.parent.imdb_id + imdb_id = f"{item.parent.imdb_id}:{item.number}" elif isinstance(item, Episode): scrape_type = "series" - imdb_id = item.parent.parent.imdb_id + imdb_id = f"${item.parent.parent.imdb_id}:{item.parent.number}:{item.number}" elif isinstance(item, Movie): scrape_type = "movie" imdb_id = item.imdb_id - url = f"{self.settings.url}/{self.encoded_string}/stream/{scrape_type}/{imdb_id}.json" + url = f"{self.settings.url}/{self.encoded_string}/stream/{scrape_type}/{quote(imdb_id)}.json" if self.second_limiter: with self.second_limiter: From e76692cea1ee68bf54416bbe84d941cf71f8d3f7 Mon Sep 17 00:00:00 2001 From: davidemarcoli Date: Fri, 12 Jul 2024 13:16:09 +0200 Subject: [PATCH 3/3] Finish comet scraper --- backend/program/scrapers/comet.py | 57 ++++++++++++++++++++----------- 1 file changed, 38 insertions(+), 19 deletions(-) diff --git a/backend/program/scrapers/comet.py b/backend/program/scrapers/comet.py index f7640b91..26b1a2f0 100644 --- 
a/backend/program/scrapers/comet.py +++ b/backend/program/scrapers/comet.py @@ -93,22 +93,33 @@ def scrape(self, item: MediaItem) -> Dict[str, str]: logger.log("NOT_FOUND", f"No streams found for {item.log_string}") return data + + def _determine_scrape(self, item: Union[Show, Season, Episode, Movie]) -> tuple[str, str, str]: + """Determine the scrape type and identifier for the given media item""" + try: + if isinstance(item, Show): + identifier, scrape_type, imdb_id = f":{item.seasons[0].number}:1", "series", item.imdb_id + elif isinstance(item, Season): + identifier, scrape_type, imdb_id = f":{item.number}:1", "series", item.parent.imdb_id + elif isinstance(item, Episode): + identifier, scrape_type, imdb_id = f":{item.parent.number}:{item.number}", "series", item.parent.parent.imdb_id + elif isinstance(item, Movie): + identifier, scrape_type, imdb_id = None, "movie", item.imdb_id + else: + logger.error(f"Invalid media item type") + return None, None, None + return identifier, scrape_type, imdb_id + except Exception as e: + logger.warning(f"Failed to determine scrape type or identifier for {item.log_string}: {e}") + return None, None, None + def api_scrape(self, item: MediaItem) -> tuple[Dict[str, str], int]: """Wrapper for `Comet` scrape method""" - if isinstance(item, Show): - scrape_type = "series" - imdb_id = item.imdb_id + ":1" - elif isinstance(item, Season): - scrape_type = "series" - imdb_id = f"{item.parent.imdb_id}:{item.number}" - elif isinstance(item, Episode): - scrape_type = "series" - imdb_id = f"${item.parent.parent.imdb_id}:{item.parent.number}:{item.number}" - elif isinstance(item, Movie): - scrape_type = "movie" - imdb_id = item.imdb_id - - url = f"{self.settings.url}/{self.encoded_string}/stream/{scrape_type}/{quote(imdb_id)}.json" + identifier, scrape_type, imdb_id = self._determine_scrape(item) + if not imdb_id: + return {}, 0 + + url = f"{self.settings.url}/{self.encoded_string}/stream/{scrape_type}/{imdb_id}{identifier or ''}.json" if 
self.second_limiter:
             with self.second_limiter:
@@ -121,15 +132,25 @@ def api_scrape(self, item: MediaItem) -> tuple[Dict[str, str], int]:
 
         torrents: Dict[str, str] = {}
         for stream in response.data.streams:
 
-            # Split the URL by '/playback/' and then split the remaining part by '/'
-            logger.info(url)
-            logger.info(url.split('/playback/'))
-            hash = url.split('/playback/')[1].split('/')[0]
+
+            url_parts = stream.url.split('/playback/')
+
+            if len(url_parts) != 2:
+                logger.warning(f'Comet Playback url can\'t be parsed: {stream.url}')
+                continue
+
+            end_parts = url_parts[1].split('/')
+
+            if len(end_parts) != 2:
+                logger.warning(f'End part of Comet Playback url can\'t be parsed ({end_parts}): {stream.url}')
+                continue
+
+            hash = end_parts[0]
 
             if not hash:
                 continue
 
             torrents[hash] = stream.title
 
-        return torrents, len(response.data.media)
+        return torrents, len(response.data.streams)