diff --git a/docs/configuration.rst b/docs/configuration.rst index 680919bdb1..7985f54de4 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -922,11 +922,13 @@ Description extractor.*.archive ------------------- Type - |Path|_ + * ``string`` + * |Path|_ Default ``null`` Example - ``"$HOME/.archives/{category}.sqlite3"`` + * ``"$HOME/.archives/{category}.sqlite3"`` + * ``"postgresql://user:pass@host/database"`` Description File to store IDs of downloaded files in. Downloads of files already recorded in this archive file will be @@ -937,6 +939,11 @@ Description memory requirements are significantly lower when the amount of stored IDs gets reasonably large. + If this value is a + `PostgreSQL Connection URI <https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING-URIS>`__, + the archive will use this PostgreSQL database as backend (requires + `Psycopg <https://www.psycopg.org/>`__). + Note: Archive files that do not already exist get generated automatically. Note: Archive paths support regular `format string`_ replacements, @@ -5564,6 +5571,21 @@ Description regardless of this option. +downloader.http.sleep-429 +------------------------- +Type + |Duration|_ +Default + `extractor.*.sleep-429`_ +Description + Number of seconds to sleep when receiving a `429 Too Many Requests` + response before `retrying <downloader.*.retries_>`__ the request. + + Note: Requires + `retry-codes <downloader.http.retry-codes_>`__ + to include ``429``. 
+ + downloader.http.validate ------------------------ Type diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index 0e12760c7f..bead106016 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -918,7 +918,8 @@ "consume-content" : false, "enabled" : true, "headers" : null, - "retry-codes" : [404, 429, 430], + "retry-codes" : [], + "sleep-429" : 60.0, "validate" : true }, diff --git a/gallery_dl/archive.py b/gallery_dl/archive.py index 5f05bbfd8c..bd35895d6a 100644 --- a/gallery_dl/archive.py +++ b/gallery_dl/archive.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2024 Mike Fährmann +# Copyright 2024-2025 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -9,26 +9,55 @@ """Download Archives""" import os -import sqlite3 -from . import formatter +import logging +from . import util, formatter + +log = logging.getLogger("archive") + + +def connect(path, prefix, format, mode=None, pragma=None, kwdict=None): + keygen = formatter.parse(prefix + format).format_map + + if path.startswith(("postgres://", "postgresql://")): + if mode == "memory": + cls = DownloadArchivePostgresqlMemory + else: + cls = DownloadArchivePostgresql + else: + path = util.expand_path(path) + if kwdict is not None and "{" in path: + path = formatter.parse(path).format_map(kwdict) + if mode == "memory": + cls = DownloadArchiveMemory + else: + cls = DownloadArchive + + return cls(path, keygen, pragma) class DownloadArchive(): + _sqlite3 = None + + def __init__(self, path, keygen, pragma=None, cache_key=None): + + if self._sqlite3 is None: + import sqlite3 + DownloadArchive._sqlite3 = sqlite3 - def __init__(self, path, format_string, pragma=None, - cache_key="_archive_key"): try: - con = sqlite3.connect(path, timeout=60, check_same_thread=False) + con = self._sqlite3.connect( + path, timeout=60, check_same_thread=False) except sqlite3.OperationalError: 
os.makedirs(os.path.dirname(path)) - con = sqlite3.connect(path, timeout=60, check_same_thread=False) + con = self._sqlite3.connect( + path, timeout=60, check_same_thread=False) con.isolation_level = None - self.keygen = formatter.parse(format_string).format_map + self.keygen = keygen self.connection = con self.close = con.close self.cursor = cursor = con.cursor() - self._cache_key = cache_key + self._cache_key = cache_key or "_archive_key" if pragma: for stmt in pragma: @@ -37,7 +66,7 @@ def __init__(self, path, format_string, pragma=None, try: cursor.execute("CREATE TABLE IF NOT EXISTS archive " "(entry TEXT PRIMARY KEY) WITHOUT ROWID") - except sqlite3.OperationalError: + except self._sqlite3.OperationalError: # fallback for missing WITHOUT ROWID support (#553) cursor.execute("CREATE TABLE IF NOT EXISTS archive " "(entry TEXT PRIMARY KEY)") @@ -61,9 +90,9 @@ def finalize(self): class DownloadArchiveMemory(DownloadArchive): - def __init__(self, path, format_string, pragma=None, - cache_key="_archive_key"): - DownloadArchive.__init__(self, path, format_string, pragma, cache_key) + def __init__(self, path, keygen, pragma=None, cache_key=None): + DownloadArchive.__init__( + self, path, keygen, pragma, cache_key) self.keys = set() def add(self, kwdict): @@ -87,7 +116,7 @@ def finalize(self): with self.connection: try: cursor.execute("BEGIN") - except sqlite3.OperationalError: + except self._sqlite3.OperationalError: pass stmt = "INSERT OR IGNORE INTO archive (entry) VALUES (?)" @@ -96,3 +125,107 @@ def finalize(self): cursor.execute(stmt, (key,)) else: cursor.executemany(stmt, ((key,) for key in self.keys)) + + +class DownloadArchivePostgresql(): + _psycopg = None + + def __init__(self, uri, keygen, pragma=None, cache_key=None): + if self._psycopg is None: + import psycopg + DownloadArchivePostgresql._psycopg = psycopg + + self.connection = con = self._psycopg.connect(uri) + self.cursor = cursor = con.cursor() + self.close = con.close + self.keygen = keygen + 
self._cache_key = cache_key or "_archive_key" + + try: + cursor.execute("CREATE TABLE IF NOT EXISTS archive " + "(entry TEXT PRIMARY KEY)") + con.commit() + except Exception as exc: + log.error("%s: %s when creating 'archive' table: %s", + con, exc.__class__.__name__, exc) + con.rollback() + raise + + def add(self, kwdict): + key = kwdict.get(self._cache_key) or self.keygen(kwdict) + try: + self.cursor.execute( + "INSERT INTO archive (entry) " + "VALUES (%s) " + "ON CONFLICT DO NOTHING", + (key,)) + self.connection.commit() + except Exception as exc: + log.error("%s: %s when writing entry: %s", + self.connection, exc.__class__.__name__, exc) + self.connection.rollback() + + def check(self, kwdict): + key = kwdict[self._cache_key] = self.keygen(kwdict) + try: + self.cursor.execute( + "SELECT true " + "FROM archive " + "WHERE entry=%s " + "LIMIT 1", + (key,)) + return self.cursor.fetchone() + except Exception as exc: + log.error("%s: %s when checking entry: %s", + self.connection, exc.__class__.__name__, exc) + self.connection.rollback() + return False + + def finalize(self): + pass + + +class DownloadArchivePostgresqlMemory(DownloadArchivePostgresql): + + def __init__(self, path, keygen, pragma=None, cache_key=None): + DownloadArchivePostgresql.__init__( + self, path, keygen, pragma, cache_key) + self.keys = set() + + def add(self, kwdict): + self.keys.add( + kwdict.get(self._cache_key) or + self.keygen(kwdict)) + + def check(self, kwdict): + key = kwdict[self._cache_key] = self.keygen(kwdict) + if key in self.keys: + return True + try: + self.cursor.execute( + "SELECT true " + "FROM archive " + "WHERE entry=%s " + "LIMIT 1", + (key,)) + return self.cursor.fetchone() + except Exception as exc: + log.error("%s: %s when checking entry: %s", + self.connection, exc.__class__.__name__, exc) + self.connection.rollback() + return False + + def finalize(self): + if not self.keys: + return + try: + self.cursor.executemany( + "INSERT INTO archive (entry) " + "VALUES (%s) " + 
"ON CONFLICT DO NOTHING", + ((key,) for key in self.keys)) + self.connection.commit() + except Exception as exc: + log.error("%s: %s when writing entries: %s", + self.connection, exc.__class__.__name__, exc) + self.connection.rollback() diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index c8aeef88b1..449ffe8ab7 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -12,7 +12,7 @@ import mimetypes from requests.exceptions import RequestException, ConnectionError, Timeout from .common import DownloaderBase -from .. import text, util +from .. import text, util, output from ssl import SSLError @@ -38,6 +38,7 @@ def __init__(self, job): self.verify = self.config("verify", extractor._verify) self.mtime = self.config("mtime", True) self.rate = self.config("rate") + interval_429 = self.config("sleep-429") if not self.config("consume-content", False): # this resets the underlying TCP connection, and therefore @@ -79,12 +80,16 @@ def __init__(self, job): self.receive = self._receive_rate if self.progress < 0.0: self.progress = 0.0 + if interval_429 is None: + self.interval_429 = extractor._interval_429 + else: + self.interval_429 = util.build_duration_func(interval_429) def download(self, url, pathfmt): try: return self._download_impl(url, pathfmt) except Exception: - print() + output.stderr_write("\n") raise finally: # remove file from incomplete downloads @@ -93,7 +98,7 @@ def download(self, url, pathfmt): def _download_impl(self, url, pathfmt): response = None - tries = 0 + tries = code = 0 msg = "" metadata = self.metadata @@ -111,10 +116,17 @@ def _download_impl(self, url, pathfmt): if response: self.release_conn(response) response = None + self.log.warning("%s (%s/%s)", msg, tries, self.retries+1) if tries > self.retries: return False - time.sleep(tries) + + if code == 429 and self.interval_429: + s = self.interval_429() + time.sleep(s if s > tries else tries) + else: + time.sleep(tries) + code = 0 tries += 1 
file_header = None @@ -257,7 +269,7 @@ def _download_impl(self, url, pathfmt): else response.iter_content(16), b"") except (RequestException, SSLError) as exc: msg = str(exc) - print() + output.stderr_write("\n") continue if self._adjust_extension(pathfmt, file_header) and \ pathfmt.exists(): @@ -291,14 +303,14 @@ def _download_impl(self, url, pathfmt): self.receive(fp, content, size, offset) except (RequestException, SSLError) as exc: msg = str(exc) - print() + output.stderr_write("\n") continue # check file size if size and fp.tell() < size: msg = "file size mismatch ({} < {})".format( fp.tell(), size) - print() + output.stderr_write("\n") continue break @@ -317,7 +329,7 @@ def release_conn(self, response): for _ in response.iter_content(self.chunk_size): pass except (RequestException, SSLError) as exc: - print() + output.stderr_write("\n") self.log.debug( "Unable to consume response body (%s: %s); " "closing the connection anyway", exc.__class__.__name__, exc) diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py index ca8426b0c6..20f8ea4be3 100644 --- a/gallery_dl/extractor/imgur.py +++ b/gallery_dl/extractor/imgur.py @@ -183,7 +183,7 @@ class ImgurMeExtractor(ImgurExtractor): def items(self): if not self.cookies_check(("accesstoken",)): - raise exception.AuthorizationError("'accesstoken' cookie required") + self.log.error("'accesstoken' cookie required") if self.groups[0]: posts = self.api.accounts_me_hiddenalbums() diff --git a/gallery_dl/extractor/imhentai.py b/gallery_dl/extractor/imhentai.py index bba43da0d3..47027a8fa4 100644 --- a/gallery_dl/extractor/imhentai.py +++ b/gallery_dl/extractor/imhentai.py @@ -79,9 +79,7 @@ def _split(self, html): results.append(tag) return results - def images(self, _): - url = "{}/view/{}/1/".format(self.root, self.gallery_id) - page = self.request(url).text + def images(self, page): data = util.json_loads(text.extr(page, "$.parseJSON('", "'")) base = text.extr(page, 'data-src="', 
'"').rpartition("/")[0] + "/" exts = {"j": "jpg", "p": "png", "g": "gif", "w": "webp", "a": "avif"} diff --git a/gallery_dl/job.py b/gallery_dl/job.py index 29149270e8..57b7c3ca50 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -551,8 +551,6 @@ def initialize(self, kwdict=None): archive_path = cfg("archive") if archive_path: - archive_path = util.expand_path(archive_path) - archive_prefix = cfg("archive-prefix") if archive_prefix is None: archive_prefix = extr.category @@ -562,16 +560,11 @@ def initialize(self, kwdict=None): archive_format = extr.archive_fmt try: - if "{" in archive_path: - archive_path = formatter.parse( - archive_path).format_map(kwdict) - if cfg("archive-mode") == "memory": - archive_cls = archive.DownloadArchiveMemory - else: - archive_cls = archive.DownloadArchive - self.archive = archive_cls( + self.archive = archive.connect( archive_path, - archive_prefix + archive_format, + archive_prefix, + archive_format, + cfg("archive-mode"), cfg("archive-pragma"), ) except Exception as exc: diff --git a/gallery_dl/postprocessor/compare.py b/gallery_dl/postprocessor/compare.py index 3bb63c80e1..c6bc54d14f 100644 --- a/gallery_dl/postprocessor/compare.py +++ b/gallery_dl/postprocessor/compare.py @@ -9,7 +9,7 @@ """Compare versions of the same file and replace/enumerate them on mismatch""" from .common import PostProcessor -from .. import text, util, exception +from .. import text, util, output, exception import os @@ -83,7 +83,7 @@ def _equal(self, pathfmt): self._equal_cnt += 1 if self._equal_cnt >= self._equal_max: util.remove_file(pathfmt.temppath) - print() + output.stderr_write("\n") raise self._equal_exc() pathfmt.delete = True diff --git a/gallery_dl/postprocessor/ugoira.py b/gallery_dl/postprocessor/ugoira.py index fec4ab0254..3a32b39875 100644 --- a/gallery_dl/postprocessor/ugoira.py +++ b/gallery_dl/postprocessor/ugoira.py @@ -9,7 +9,7 @@ """Convert Pixiv Ugoira to WebM""" from .common import PostProcessor -from .. 
import util +from .. import util, output import subprocess import tempfile import zipfile @@ -226,13 +226,13 @@ def convert_to_animation(self, pathfmt, tempdir): if self._finalize: self._finalize(pathfmt, tempdir) except OSError as exc: - print() + output.stderr_write("\n") self.log.error("Unable to invoke FFmpeg (%s: %s)", exc.__class__.__name__, exc) self.log.debug("", exc_info=exc) pathfmt.realpath = pathfmt.temppath except Exception as exc: - print() + output.stderr_write("\n") self.log.error("%s: %s", exc.__class__.__name__, exc) self.log.debug("", exc_info=exc) pathfmt.realpath = pathfmt.temppath @@ -296,7 +296,7 @@ def _exec(self, args): out = None if self.output else subprocess.DEVNULL retcode = util.Popen(args, stdout=out, stderr=out).wait() if retcode: - print() + output.stderr_write("\n") self.log.error("Non-zero exit status when running %s (%s)", args, retcode) raise ValueError() diff --git a/gallery_dl/update.py b/gallery_dl/update.py index bce960769d..6650ec4e29 100644 --- a/gallery_dl/update.py +++ b/gallery_dl/update.py @@ -12,7 +12,7 @@ from .extractor.common import Extractor, Message from .job import DownloadJob -from . import util, version, exception +from . import util, version, output, exception REPOS = { "stable" : "mikf/gallery-dl", @@ -143,13 +143,13 @@ def _check_update(self, kwdict): def _warning(self, msg, *args): if self._newline: self._newline = False - print() + output.stderr_write("\n") self.extractor.log.warning(msg, *args) def _error(self, msg, *args): if self._newline: self._newline = False - print() + output.stderr_write("\n") self.status |= 1 self.extractor.log.error(msg, *args)