Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Comet Scraper #511

Merged
merged 3 commits into from
Jul 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion backend/program/scrapers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from program.scrapers.torbox import TorBoxScraper
from program.scrapers.torrentio import Torrentio
from program.scrapers.zilean import Zilean
from program.scrapers.comet import Comet
from program.settings.manager import settings_manager
from RTN import Torrent
from utils.logger import logger
Expand All @@ -35,7 +36,8 @@ def __init__(self):
TorBoxScraper: TorBoxScraper(),
Mediafusion: Mediafusion(),
Prowlarr: Prowlarr(),
Zilean: Zilean()
Zilean: Zilean(),
Comet: Comet()
}
self.initialized = self.validate()
if not self.initialized:
Expand Down
2 changes: 1 addition & 1 deletion backend/program/scrapers/annatar.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def validate(self) -> bool:
logger.error("Annatar ratelimit must be a valid boolean.")
return False
try:
response = get("https://annatar.elfhosted.com/manifest.json", timeout=15)
response = get(f"{self.settings.url}/manifest.json", timeout=15)
if not response.is_ok:
return False
return True
Expand Down
154 changes: 154 additions & 0 deletions backend/program/scrapers/comet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
""" Comet scraper module """
from typing import Dict, Union
import base64
import json
from urllib.parse import quote

from program.media.item import Episode, MediaItem, Movie, Season, Show
from program.settings.manager import settings_manager
from program.settings.models import CometConfig
from requests import ConnectTimeout, ReadTimeout
from requests.exceptions import RequestException
from utils.logger import logger
from utils.request import RateLimiter, RateLimitExceeded, get, ping


class Comet:
    """Scraper for the `Comet` Stremio addon.

    Encodes the per-user Comet configuration (indexers, debrid credentials)
    as a base64 JSON blob embedded in the request path, queries Comet's
    Stremio-style stream endpoints, and maps returned streams to a dict of
    ``{infohash: raw_title}``.
    """

    def __init__(self):
        self.key = "comet"
        self.settings = settings_manager.settings.scraping.comet
        self.timeout = self.settings.timeout
        # Comet expects its configuration as a base64-encoded JSON path segment.
        self.encoded_string = base64.b64encode(json.dumps({
            "indexers": self.settings.indexers,
            "maxResults": 0,
            "filterTitles": False,
            "resolutions": ["All"],
            "languages": ["All"],
            "debridService": "realdebrid",
            "debridApiKey": settings_manager.settings.downloaders.real_debrid.api_key,
            "debridStreamProxyPassword": ""
        }).encode("utf-8")).decode("utf-8")
        self.initialized = self.validate()
        if not self.initialized:
            return
        self.second_limiter = RateLimiter(max_calls=1, period=5) if self.settings.ratelimit else None
        logger.success("Comet initialized!")

    def validate(self) -> bool:
        """Validate the Comet settings.

        Returns False (and logs why) when disabled, misconfigured, or the
        Comet instance's manifest endpoint is unreachable.
        """
        if not self.settings.enabled:
            logger.warning("Comet is set to disabled.")
            return False
        if not self.settings.url:
            logger.error("Comet URL is not configured and will not be used.")
            return False
        if not isinstance(self.timeout, int) or self.timeout <= 0:
            logger.error("Comet timeout is not set or invalid.")
            return False
        if not isinstance(self.settings.ratelimit, bool):
            logger.error("Comet ratelimit must be a valid boolean.")
            return False
        try:
            url = f"{self.settings.url}/manifest.json"
            response = ping(url=url, timeout=self.timeout)
            if response.ok:
                return True
        except Exception as e:
            logger.error(f"Comet failed to initialize: {e}")
            return False
        return True

    def run(self, item: MediaItem) -> Dict[str, str]:
        """Scrape the comet site for the given media item
        and return the scraped streams.

        Returns a dict of ``{infohash: raw_title}``; empty on any failure
        (rate limit, timeout, request error).
        """
        if not item:
            return {}

        try:
            # Returns a dict of {infoHash: raw_title}
            return self.scrape(item)
        except RateLimitExceeded:
            # Fixed: previously referenced self.hour_limiter, which is never
            # assigned on this class — hitting the rate limit raised
            # AttributeError instead of being handled.
            if self.second_limiter:
                self.second_limiter.limit_hit()
            else:
                logger.warning(f"Comet ratelimit exceeded for item: {item.log_string}")
        except ConnectTimeout:
            logger.warning(f"Comet connection timeout for item: {item.log_string}")
        except ReadTimeout:
            logger.warning(f"Comet read timeout for item: {item.log_string}")
        except RequestException as e:
            logger.error(f"Comet request exception: {str(e)}")
        except Exception as e:
            logger.error(f"Comet exception thrown: {str(e)}")
        return {}

    def scrape(self, item: MediaItem) -> Dict[str, str]:
        """Scrape the given media item and log the outcome."""
        data, stream_count = self.api_scrape(item)
        if data:
            logger.log("SCRAPER", f"Found {len(data)} streams out of {stream_count} for {item.log_string}")
        else:
            logger.log("NOT_FOUND", f"No streams found for {item.log_string}")
        return data

    def _determine_scrape(self, item: Union[Show, Season, Episode, Movie]) -> tuple[str, str, str]:
        """Determine the scrape type and identifier for the given media item.

        Returns ``(identifier, scrape_type, imdb_id)``; all three are None
        when the item type is unrecognized or an attribute lookup fails.
        """
        try:
            if isinstance(item, Show):
                # Shows are probed via their first season's first episode.
                identifier, scrape_type, imdb_id = f":{item.seasons[0].number}:1", "series", item.imdb_id
            elif isinstance(item, Season):
                identifier, scrape_type, imdb_id = f":{item.number}:1", "series", item.parent.imdb_id
            elif isinstance(item, Episode):
                identifier, scrape_type, imdb_id = f":{item.parent.number}:{item.number}", "series", item.parent.parent.imdb_id
            elif isinstance(item, Movie):
                identifier, scrape_type, imdb_id = None, "movie", item.imdb_id
            else:
                logger.error("Invalid media item type")
                return None, None, None
            return identifier, scrape_type, imdb_id
        except Exception as e:
            logger.warning(f"Failed to determine scrape type or identifier for {item.log_string}: {e}")
            return None, None, None

    def api_scrape(self, item: MediaItem) -> tuple[Dict[str, str], int]:
        """Wrapper for `Comet` scrape method.

        Returns ``(torrents, total_stream_count)`` where ``torrents`` maps
        infohash -> stream title for every parseable stream.
        """
        identifier, scrape_type, imdb_id = self._determine_scrape(item)
        if not imdb_id:
            return {}, 0

        url = f"{self.settings.url}/{self.encoded_string}/stream/{scrape_type}/{imdb_id}{identifier or ''}.json"

        if self.second_limiter:
            with self.second_limiter:
                response = get(url, timeout=self.timeout)
        else:
            response = get(url, timeout=self.timeout)

        if not response.is_ok or not response.data.streams:
            return {}, 0

        torrents: Dict[str, str] = {}
        for stream in response.data.streams:
            # Playback URLs look like ".../playback/<infohash>/<file>";
            # split on '/playback/' then take the infohash segment.
            url_parts = stream.url.split('/playback/')
            if len(url_parts) != 2:
                logger.warning(f'Comet Playback url can\'t be parsed: {stream.url}')
                # Fixed: previously fell through and raised IndexError on
                # url_parts[1] when the split produced fewer than 2 parts.
                continue

            end_parts = url_parts[1].split('/')
            if len(end_parts) != 2:
                logger.warning(f'End part of Comet Playback url can\'t be parsed ({end_parts}): {stream.url}')
                # Skip malformed entries instead of extracting from them.
                continue

            infohash = end_parts[0]  # renamed from `hash` (shadowed builtin)
            if not infohash:
                continue

            torrents[infohash] = stream.title

        return torrents, len(response.data.streams)
16 changes: 16 additions & 0 deletions backend/program/settings/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,21 @@ class KnightcrawlerConfig(Observable):
timeout: int = 30
ratelimit: bool = True


class CometConfig(Observable):
    """Settings for the Comet scraper (see `backend/program/scrapers/comet.py`)."""
    # Whether the Comet scraper is enabled at all.
    enabled: bool = False
    # Base URL of the Comet instance; its /manifest.json is pinged on startup.
    url: str = "http://localhost:8000"
    # Indexers Comet should query; embedded in the base64 config blob the
    # scraper sends as part of the request path.
    indexers: List[str] = [
        "bitsearch",
        "eztv",
        "thepiratebay",
        "therarbg",
        "yts"
    ]
    # Request timeout in seconds for Comet HTTP calls.
    timeout: int = 30
    # When True the scraper wraps requests in a RateLimiter (1 call / 5 s).
    ratelimit: bool = True


class ZileanConfig(Observable):
enabled: bool = False
url: str = "http://localhost:8181"
Expand Down Expand Up @@ -238,6 +253,7 @@ class ScraperModel(Observable):
torbox_scraper: TorBoxScraperConfig = TorBoxScraperConfig()
mediafusion: MediafusionConfig = MediafusionConfig()
zilean: ZileanConfig = ZileanConfig()
comet: CometConfig = CometConfig()


# Version Ranking Model (set application defaults here!)
Expand Down
Loading