Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[pull] master from mikf:master #128

Merged
merged 5 commits into from
Feb 16, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 24 additions & 2 deletions docs/configuration.rst
Original file line number Diff line number Diff line change
Expand Up @@ -922,11 +922,13 @@ Description
extractor.*.archive
-------------------
Type
|Path|_
* ``string``
* |Path|_
Default
``null``
Example
``"$HOME/.archives/{category}.sqlite3"``
* ``"$HOME/.archives/{category}.sqlite3"``
* ``"postgresql://user:pass@host/database"``
Description
File to store IDs of downloaded files in. Downloads of files
already recorded in this archive file will be
Expand All @@ -937,6 +939,11 @@ Description
memory requirements are significantly lower when the
amount of stored IDs gets reasonably large.

If this value is a
`PostgreSQL Connection URI <https://www.postgresql.org/docs/current/libpq-connect.html#LIBPQ-CONNSTRING-URIS>`__,
the archive will use this PostgreSQL database as backend (requires
`Psycopg <https://www.psycopg.org/>`__).

Note: Archive files that do not already exist get generated automatically.

Note: Archive paths support regular `format string`_ replacements,
Expand Down Expand Up @@ -5564,6 +5571,21 @@ Description
regardless of this option.


downloader.http.sleep-429
-------------------------
Type
|Duration|_
Default
`extractor.*.sleep-429`_
Description
Number of seconds to sleep when receiving a `429 Too Many Requests`
response before `retrying <downloader.*.retries_>`__ the request.

Note: Requires
`retry-codes <downloader.http.retry-codes_>`__
to include ``429``.


downloader.http.validate
------------------------
Type
Expand Down
3 changes: 2 additions & 1 deletion docs/gallery-dl.conf
Original file line number Diff line number Diff line change
Expand Up @@ -918,7 +918,8 @@
"consume-content" : false,
"enabled" : true,
"headers" : null,
"retry-codes" : [404, 429, 430],
"retry-codes" : [],
"sleep-429" : 60.0,
"validate" : true
},

Expand Down
161 changes: 147 additions & 14 deletions gallery_dl/archive.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-

# Copyright 2024 Mike Fährmann
# Copyright 2024-2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
Expand All @@ -9,26 +9,55 @@
"""Download Archives"""

import os
import sqlite3
from . import formatter
import logging
from . import util, formatter

log = logging.getLogger("archive")


def connect(path, prefix, format, mode=None, pragma=None, kwdict=None):
    """Open a download archive for *path* and return a backend instance.

    A PostgreSQL connection URI selects the Psycopg-backed archive;
    any other value is treated as a SQLite file path.  ``mode="memory"``
    picks the variant that collects keys in memory and writes them out
    on finalize().
    """
    keygen = formatter.parse(prefix + format).format_map
    memory = (mode == "memory")

    if path.startswith(("postgres://", "postgresql://")):
        cls = (DownloadArchivePostgresqlMemory if memory else
               DownloadArchivePostgresql)
    else:
        path = util.expand_path(path)
        # apply format-string replacements to the archive file path
        if kwdict is not None and "{" in path:
            path = formatter.parse(path).format_map(kwdict)
        cls = DownloadArchiveMemory if memory else DownloadArchive

    return cls(path, keygen, pragma)


class DownloadArchive():
_sqlite3 = None

def __init__(self, path, keygen, pragma=None, cache_key=None):

if self._sqlite3 is None:
import sqlite3
DownloadArchive._sqlite3 = sqlite3

def __init__(self, path, format_string, pragma=None,
cache_key="_archive_key"):
try:
con = sqlite3.connect(path, timeout=60, check_same_thread=False)
con = self._sqlite3.connect(
path, timeout=60, check_same_thread=False)
except sqlite3.OperationalError:
os.makedirs(os.path.dirname(path))
con = sqlite3.connect(path, timeout=60, check_same_thread=False)
con = self._sqlite3.connect(
path, timeout=60, check_same_thread=False)
con.isolation_level = None

self.keygen = formatter.parse(format_string).format_map
self.keygen = keygen
self.connection = con
self.close = con.close
self.cursor = cursor = con.cursor()
self._cache_key = cache_key
self._cache_key = cache_key or "_archive_key"

if pragma:
for stmt in pragma:
Expand All @@ -37,7 +66,7 @@ def __init__(self, path, format_string, pragma=None,
try:
cursor.execute("CREATE TABLE IF NOT EXISTS archive "
"(entry TEXT PRIMARY KEY) WITHOUT ROWID")
except sqlite3.OperationalError:
except self._sqlite3.OperationalError:
# fallback for missing WITHOUT ROWID support (#553)
cursor.execute("CREATE TABLE IF NOT EXISTS archive "
"(entry TEXT PRIMARY KEY)")
Expand All @@ -61,9 +90,9 @@ def finalize(self):

class DownloadArchiveMemory(DownloadArchive):

def __init__(self, path, format_string, pragma=None,
cache_key="_archive_key"):
DownloadArchive.__init__(self, path, format_string, pragma, cache_key)
def __init__(self, path, keygen, pragma=None, cache_key=None):
DownloadArchive.__init__(
self, path, keygen, pragma, cache_key)
self.keys = set()

def add(self, kwdict):
Expand All @@ -87,7 +116,7 @@ def finalize(self):
with self.connection:
try:
cursor.execute("BEGIN")
except sqlite3.OperationalError:
except self._sqlite3.OperationalError:
pass

stmt = "INSERT OR IGNORE INTO archive (entry) VALUES (?)"
Expand All @@ -96,3 +125,107 @@ def finalize(self):
cursor.execute(stmt, (key,))
else:
cursor.executemany(stmt, ((key,) for key in self.keys))


class DownloadArchivePostgresql():
    """Download archive stored in a PostgreSQL 'archive' table.

    Requires Psycopg; the module is imported lazily on first use and
    cached on the class.  Each add() commits immediately; failures are
    logged and rolled back instead of raised (except during setup).
    """
    _psycopg = None

    def __init__(self, uri, keygen, pragma=None, cache_key=None):
        # lazy import, shared by all instances
        # NOTE: 'pragma' is accepted only for interface parity with the
        # SQLite backend and is ignored here
        if DownloadArchivePostgresql._psycopg is None:
            import psycopg
            DownloadArchivePostgresql._psycopg = psycopg

        con = self._psycopg.connect(uri)
        self.connection = con
        self.close = con.close
        self.cursor = con.cursor()
        self.keygen = keygen
        self._cache_key = cache_key or "_archive_key"

        try:
            self.cursor.execute("CREATE TABLE IF NOT EXISTS archive "
                                "(entry TEXT PRIMARY KEY)")
            con.commit()
        except Exception as exc:
            log.error("%s: %s when creating 'archive' table: %s",
                      con, exc.__class__.__name__, exc)
            con.rollback()
            raise

    def add(self, kwdict):
        """Record *kwdict*'s archive key; log and roll back on failure"""
        # reuse the key cached by a previous check() when available
        entry = kwdict.get(self._cache_key) or self.keygen(kwdict)
        con = self.connection
        try:
            self.cursor.execute(
                "INSERT INTO archive (entry) "
                "VALUES (%s) "
                "ON CONFLICT DO NOTHING", (entry,))
            con.commit()
        except Exception as exc:
            log.error("%s: %s when writing entry: %s",
                      con, exc.__class__.__name__, exc)
            con.rollback()

    def check(self, kwdict):
        """Return a truthy value if *kwdict*'s key is already archived.

        Also caches the generated key in *kwdict* for a later add().
        """
        entry = kwdict[self._cache_key] = self.keygen(kwdict)
        con = self.connection
        try:
            self.cursor.execute(
                "SELECT true "
                "FROM archive "
                "WHERE entry=%s "
                "LIMIT 1", (entry,))
            return self.cursor.fetchone()
        except Exception as exc:
            log.error("%s: %s when checking entry: %s",
                      con, exc.__class__.__name__, exc)
            con.rollback()
            return False

    def finalize(self):
        """No-op: every add() commits immediately"""


class DownloadArchivePostgresqlMemory(DownloadArchivePostgresql):
    """PostgreSQL archive that buffers new keys in memory.

    add() only collects keys; finalize() flushes them to the database
    in a single executemany().  check() consults the in-memory set
    first and falls back to a database lookup.
    """

    def __init__(self, path, keygen, pragma=None, cache_key=None):
        super().__init__(path, keygen, pragma, cache_key)
        self.keys = set()

    def add(self, kwdict):
        # defer the actual INSERT until finalize()
        key = kwdict.get(self._cache_key)
        self.keys.add(key or self.keygen(kwdict))

    def check(self, kwdict):
        """Return a truthy value if *kwdict*'s key was seen before"""
        key = kwdict[self._cache_key] = self.keygen(kwdict)
        if key in self.keys:
            return True
        con = self.connection
        try:
            self.cursor.execute(
                "SELECT true "
                "FROM archive "
                "WHERE entry=%s "
                "LIMIT 1", (key,))
            return self.cursor.fetchone()
        except Exception as exc:
            log.error("%s: %s when checking entry: %s",
                      con, exc.__class__.__name__, exc)
            con.rollback()
            return False

    def finalize(self):
        """Write all buffered keys to the database at once"""
        if not self.keys:
            return
        con = self.connection
        try:
            self.cursor.executemany(
                "INSERT INTO archive (entry) "
                "VALUES (%s) "
                "ON CONFLICT DO NOTHING",
                ((key,) for key in self.keys))
            con.commit()
        except Exception as exc:
            log.error("%s: %s when writing entries: %s",
                      con, exc.__class__.__name__, exc)
            con.rollback()
28 changes: 20 additions & 8 deletions gallery_dl/downloader/http.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import mimetypes
from requests.exceptions import RequestException, ConnectionError, Timeout
from .common import DownloaderBase
from .. import text, util
from .. import text, util, output
from ssl import SSLError


Expand All @@ -38,6 +38,7 @@ def __init__(self, job):
self.verify = self.config("verify", extractor._verify)
self.mtime = self.config("mtime", True)
self.rate = self.config("rate")
interval_429 = self.config("sleep-429")

if not self.config("consume-content", False):
# this resets the underlying TCP connection, and therefore
Expand Down Expand Up @@ -79,12 +80,16 @@ def __init__(self, job):
self.receive = self._receive_rate
if self.progress < 0.0:
self.progress = 0.0
if interval_429 is None:
self.interval_429 = extractor._interval_429
else:
self.interval_429 = util.build_duration_func(interval_429)

def download(self, url, pathfmt):
try:
return self._download_impl(url, pathfmt)
except Exception:
print()
output.stderr_write("\n")
raise
finally:
# remove file from incomplete downloads
Expand All @@ -93,7 +98,7 @@ def download(self, url, pathfmt):

def _download_impl(self, url, pathfmt):
response = None
tries = 0
tries = code = 0
msg = ""

metadata = self.metadata
Expand All @@ -111,10 +116,17 @@ def _download_impl(self, url, pathfmt):
if response:
self.release_conn(response)
response = None

self.log.warning("%s (%s/%s)", msg, tries, self.retries+1)
if tries > self.retries:
return False
time.sleep(tries)

if code == 429 and self.interval_429:
s = self.interval_429()
time.sleep(s if s > tries else tries)
else:
time.sleep(tries)
code = 0

tries += 1
file_header = None
Expand Down Expand Up @@ -257,7 +269,7 @@ def _download_impl(self, url, pathfmt):
else response.iter_content(16), b"")
except (RequestException, SSLError) as exc:
msg = str(exc)
print()
output.stderr_write("\n")
continue
if self._adjust_extension(pathfmt, file_header) and \
pathfmt.exists():
Expand Down Expand Up @@ -291,14 +303,14 @@ def _download_impl(self, url, pathfmt):
self.receive(fp, content, size, offset)
except (RequestException, SSLError) as exc:
msg = str(exc)
print()
output.stderr_write("\n")
continue

# check file size
if size and fp.tell() < size:
msg = "file size mismatch ({} < {})".format(
fp.tell(), size)
print()
output.stderr_write("\n")
continue

break
Expand All @@ -317,7 +329,7 @@ def release_conn(self, response):
for _ in response.iter_content(self.chunk_size):
pass
except (RequestException, SSLError) as exc:
print()
output.stderr_write("\n")
self.log.debug(
"Unable to consume response body (%s: %s); "
"closing the connection anyway", exc.__class__.__name__, exc)
Expand Down
2 changes: 1 addition & 1 deletion gallery_dl/extractor/imgur.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ class ImgurMeExtractor(ImgurExtractor):

def items(self):
if not self.cookies_check(("accesstoken",)):
raise exception.AuthorizationError("'accesstoken' cookie required")
self.log.error("'accesstoken' cookie required")

if self.groups[0]:
posts = self.api.accounts_me_hiddenalbums()
Expand Down
4 changes: 1 addition & 3 deletions gallery_dl/extractor/imhentai.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,7 @@ def _split(self, html):
results.append(tag)
return results

def images(self, _):
url = "{}/view/{}/1/".format(self.root, self.gallery_id)
page = self.request(url).text
def images(self, page):
data = util.json_loads(text.extr(page, "$.parseJSON('", "'"))
base = text.extr(page, 'data-src="', '"').rpartition("/")[0] + "/"
exts = {"j": "jpg", "p": "png", "g": "gif", "w": "webp", "a": "avif"}
Expand Down
Loading
Loading