[pull] master from mikf:master #125

Merged 3 commits on Feb 12, 2025
8 changes: 7 additions & 1 deletion docs/supportedsites.md
@@ -439,6 +439,12 @@ Consider all listed sites to potentially be NSFW.
<td>Albums, Favorites, Favorites Folders, Galleries, individual Images, Search Results, Subreddits, Tag Searches, User Profiles</td>
<td></td>
</tr>
<tr>
<td>IMHentai</td>
<td>https://imhentai.xxx/</td>
<td>Galleries, Search Results, Tag Searches</td>
<td></td>
</tr>
<tr>
<td>Imxto</td>
<td>https://imx.to/</td>
@@ -1012,7 +1018,7 @@ Consider all listed sites to potentially be NSFW.
<tr>
<td>VSCO</td>
<td>https://vsco.co/</td>
<td>Avatars, Collections, Galleries, individual Images, Spaces, User Profiles</td>
<td>Avatars, Collections, Galleries, individual Images, Spaces, User Profiles, Videos</td>
<td></td>
</tr>
<tr>
1 change: 1 addition & 0 deletions gallery_dl/extractor/__init__.py
@@ -80,6 +80,7 @@
"imgbox",
"imgth",
"imgur",
"imhentai",
"inkbunny",
"instagram",
"issuu",
2 changes: 2 additions & 0 deletions gallery_dl/extractor/bunkr.py
@@ -161,6 +161,7 @@ def _extract_file(self, webpage_url):
text.extr(page, '<img src="', '"'))
file_name = (text.extr(page, 'property="og:title" content="', '"') or
text.extr(page, "<title>", " | Bunkr<"))
fallback = text.extr(page, 'property="og:url" content="', '"')

if not file_url:
webpage_url = text.unescape(text.rextract(
@@ -172,6 +173,7 @@
"file" : text.unescape(file_url),
"name" : text.unescape(file_name),
"id_url" : webpage_url.rpartition("/")[2],
"_fallback" : (fallback,) if fallback else (),
"_http_headers" : {"Referer": response.url},
"_http_validate": self._validate,
}
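For context: the change above stores the page's og:url as a "_fallback" entry so that, when the primary CDN URL is rejected (e.g. with a 403, as in the test case added below in test/results/bunkr.py), the download can be retried from an alternate location. A minimal standalone sketch of that retry idea follows; fetch_with_fallback() is a hypothetical helper written for illustration, not gallery-dl's actual downloader code.

import requests

def fetch_with_fallback(kwdict):
    """Try the primary "file" URL first, then every URL in "_fallback"."""
    urls = (kwdict["file"],) + tuple(kwdict.get("_fallback", ()))
    headers = kwdict.get("_http_headers", {})
    for url in urls:
        response = requests.get(url, headers=headers, stream=True)
        if response.status_code == 200:
            return response        # first URL that answers 200 wins
        response.close()           # e.g. 403 from the primary CDN host
    raise RuntimeError("all URLs failed: " + ", ".join(urls))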
121 changes: 121 additions & 0 deletions gallery_dl/extractor/imhentai.py
@@ -0,0 +1,121 @@
# -*- coding: utf-8 -*-

# Copyright 2025 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Extractors for https://imhentai.xxx/"""

from .common import GalleryExtractor, Extractor, Message
from .. import text, util

BASE_PATTERN = r"(?:https?://)?(?:www\.)?imhentai\.xxx"


class ImhentaiExtractor(Extractor):
category = "imhentai"
root = "https://imhentai.xxx"

def _pagination(self, url):
base = self.root + "/gallery/"
data = {"_extractor": ImhentaiGalleryExtractor}

while True:
page = self.request(url).text
extr = text.extract_from(page)

while True:
gallery_id = extr('<a href="/gallery/', '"')
if not gallery_id:
break
yield Message.Queue, base + gallery_id, data
extr('<a href="/gallery/', '"') # skip duplicate GIDs

href = text.rextract(page, "class='page-link' href='", "'")[0]
if not href or href == "#":
return
url = text.ensure_http_scheme(href)


class ImhentaiGalleryExtractor(ImhentaiExtractor, GalleryExtractor):
"""Extractor for imhentai galleries"""
pattern = BASE_PATTERN + r"/(?:gallery|view)/(\d+)"
example = "https://imhentai.xxx/gallery/12345/"

def __init__(self, match):
self.gallery_id = match.group(1)
url = "{}/gallery/{}/".format(self.root, self.gallery_id)
GalleryExtractor.__init__(self, match, url)

def metadata(self, page):
extr = text.extract_from(page)

data = {
"gallery_id": text.parse_int(self.gallery_id),
"title" : text.unescape(extr("<h1>", "<")),
"title_alt" : text.unescape(extr('class="subtitle">', "<")),
"parody" : self._split(extr(">Parodies:</span>", "</li>")),
"character" : self._split(extr(">Characters:</span>", "</li>")),
"tags" : self._split(extr(">Tags:</span>", "</li>")),
"artist" : self._split(extr(">Artists:</span>", "</li>")),
"group" : self._split(extr(">Groups:</span>", "</li>")),
"language" : self._split(extr(">Languages:</span>", "</li>")),
"type" : text.remove_html(extr(">Category:</span>", "<span")),
}

if data["language"]:
data["lang"] = util.language_to_code(data["language"][0])

return data

def _split(self, html):
results = []
for tag in text.extract_iter(html, ">", "</a>"):
tag = tag.partition(" <span class='badge'>")[0]
if "<" in tag:
tag = text.remove_html(tag)
results.append(tag)
return results

def images(self, _):
url = "{}/view/{}/1/".format(self.root, self.gallery_id)
page = self.request(url).text
data = util.json_loads(text.extr(page, "$.parseJSON('", "'"))
base = text.extr(page, 'data-src="', '"').rpartition("/")[0] + "/"
exts = {"j": "jpg", "p": "png", "g": "gif", "w": "webp", "a": "avif"}

results = []
for i in map(str, range(1, len(data)+1)):
ext, width, height = data[i].split(",")
url = base + i + "." + exts[ext]
results.append((url, {
"width" : text.parse_int(width),
"height": text.parse_int(height),
}))
return results


class ImhentaiTagExtractor(ImhentaiExtractor):
"""Extractor for imhentai tag searches"""
subcategory = "tag"
pattern = (BASE_PATTERN + r"(/(?:"
r"artist|category|character|group|language|parody|tag"
r")/([^/?#]+))")
example = "https://imhentai.xxx/tag/TAG/"

def items(self):
url = self.root + self.groups[0] + "/"
return self._pagination(url)


class ImhentaiSearchExtractor(ImhentaiExtractor):
"""Extractor for imhentai search results"""
subcategory = "search"
pattern = BASE_PATTERN + r"/search/?\?([^#]+)"
example = "https://imhentai.xxx/search/?key=QUERY"

def items(self):
url = self.root + "/search/?" + self.groups[0]
return self._pagination(url)
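For context: images() in the new extractor reads two values embedded in the gallery's /view/<id>/1/ page: a JSON object passed to $.parseJSON() that maps page numbers to "type,width,height" strings, and a data-src attribute whose directory part is the image CDN base. The following standalone sketch reproduces that parsing step with made-up example data (the JSON string and base URL here are illustrative, not taken from a real page).

import json

parsed = json.loads('{"1": "j,1280,1807", "2": "p,1280,1803"}')
base = "https://m1.imhentai.xxx/001/abcdef/"   # directory part of data-src
exts = {"j": "jpg", "p": "png", "g": "gif", "w": "webp", "a": "avif"}

for i in map(str, range(1, len(parsed) + 1)):
    ext, width, height = parsed[i].split(",")
    print(base + i + "." + exts[ext], width + "x" + height)
# https://m1.imhentai.xxx/001/abcdef/1.jpg 1280x1807
# https://m1.imhentai.xxx/001/abcdef/2.png 1280x1803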
38 changes: 32 additions & 6 deletions gallery_dl/extractor/vsco.py
@@ -38,7 +38,9 @@ def items(self):
if img["is_video"]:
if not videos:
continue
url = "https://" + img["video_url"]
url = img["video_url"]
if not url.startswith("ytdl:"):
url = "https://" + url
else:
base = img["responsive_url"].partition("/")[2]
cdn, _, path = base.partition("/")
@@ -63,6 +65,9 @@ def items(self):
"height": img["height"],
"description": img.get("description") or "",
})
if data["extension"] == "m3u8":
data["_ytdl_manifest"] = "hls"
data["extension"] = "mp4"
yield Message.Url, url, data

def images(self):
@@ -294,12 +299,33 @@ class VscoImageExtractor(VscoExtractor):
pattern = USER_PATTERN + r"/media/([0-9a-fA-F]+)"
example = "https://vsco.co/USER/media/0123456789abcdef"

def __init__(self, match):
VscoExtractor.__init__(self, match)
self.media_id = match.group(2)

def images(self):
url = "{}/{}/media/{}".format(self.root, self.user, self.media_id)
url = "{}/{}/media/{}".format(self.root, self.user, self.groups[1])
data = self._extract_preload_state(url)
media = data["medias"]["byId"].popitem()[1]["media"]
return (self._transform_media(media),)


class VscoVideoExtractor(VscoExtractor):
"""Extractor for vsco.co videos links"""
subcategory = "video"
pattern = USER_PATTERN + r"/video/([^/?#]+)"
example = "https://vsco.co/USER/video/012345678-9abc-def0"

def images(self):
url = "{}/{}/video/{}".format(self.root, self.user, self.groups[1])
data = self._extract_preload_state(url)
media = data["medias"]["byId"].popitem()[1]["media"]

return ({
"_id" : media["id"],
"is_video" : True,
"grid_name" : "",
"upload_date" : media["createdDate"],
"responsive_url": media["posterUrl"],
"video_url" : "ytdl:" + media.get("playbackUrl"),
"image_meta" : None,
"width" : media["width"],
"height" : media["height"],
"description" : media["description"],
},)
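For context: the new video handling distinguishes direct CDN files from HLS playback URLs. VscoVideoExtractor prefixes playbackUrl with "ytdl:", and items() then leaves that URL untouched, remaps the "m3u8" extension to "mp4", and sets "_ytdl_manifest" so the manifest is resolved by gallery-dl's ytdl support. The snippet below is an illustrative recap of that normalization, mirroring the logic added in items() rather than reproducing gallery-dl's downloader internals; the example URL is made up.

def normalize_video(img, data):
    """Mirror of the video branch added to VscoExtractor.items()."""
    url = img["video_url"]
    if not url.startswith("ytdl:"):       # direct mp4 on VSCO's CDN
        url = "https://" + url
    if data["extension"] == "m3u8":       # HLS manifest from playbackUrl
        data["_ytdl_manifest"] = "hls"    # hand it to the ytdl integration
        data["extension"] = "mp4"         # write the muxed result as .mp4
    return url, data

url, data = normalize_video(
    {"video_url": "ytdl:https://example.invalid/playback.m3u8"},
    {"extension": "m3u8"})
print(url, data)   # ytdl:... {'extension': 'mp4', '_ytdl_manifest': 'hls'}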
1 change: 1 addition & 0 deletions scripts/supportedsites.py
@@ -79,6 +79,7 @@
"imgkiwi" : "IMG.Kiwi",
"imgth" : "imgth",
"imgur" : "imgur",
"imhentai" : "IMHentai",
"joyreactor" : "JoyReactor",
"itchio" : "itch.io",
"jpgfish" : "JPG Fish",
29 changes: 25 additions & 4 deletions test/results/bunkr.py
@@ -12,15 +12,15 @@
"#url" : "https://bunkr.sk/a/Lktg9Keq",
"#category": ("lolisafe", "bunkr", "album"),
"#class" : bunkr.BunkrAlbumExtractor,
"#urls" : "https://brg-bk.cdn.gigachad-cdn.ru/test-%E3%83%86%E3%82%B9%E3%83%88-%22%26%3E-QjgneIQv.png",
"#urls" : "https://brg-bk.cdn.gigachad-cdn.ru/test-%E3%83%86%E3%82%B9%E3%83%88-%22%26%3E-QjgneIQv.png?n=test-%E3%83%86%E3%82%B9%E3%83%88-%22%26%3E.png",
"#sha1_content": "0c8768055e4e20e7c7259608b67799171b691140",

"album_id" : "Lktg9Keq",
"album_name" : "test テスト \"&>",
"album_size" : "182 bytes",
"count" : 1,
"extension" : "png",
"file" : "https://brg-bk.cdn.gigachad-cdn.ru/test-%E3%83%86%E3%82%B9%E3%83%88-%22%26%3E-QjgneIQv.png",
"file" : str,
"filename" : "test-テスト-\"&>-QjgneIQv",
"id" : "QjgneIQv",
"id_url" : "1044478",
@@ -217,12 +217,12 @@
"#url" : "https://bunkrrr.org/d/dJuETSzKLrUps",
"#category": ("lolisafe", "bunkr", "media"),
"#class" : bunkr.BunkrMediaExtractor,
"#urls" : "https://brg-bk.cdn.gigachad-cdn.ru/file-r5fmwjdd.zip",
"#urls" : "https://brg-bk.cdn.gigachad-cdn.ru/file-r5fmwjdd.zip?n=file.zip",
"#sha1_content": "102ddd7894fe39b3843098fc51f972a0af938f45",

"count" : 1,
"extension": "zip",
"file" : "https://brg-bk.cdn.gigachad-cdn.ru/file-r5fmwjdd.zip",
"file" : "https://brg-bk.cdn.gigachad-cdn.ru/file-r5fmwjdd.zip?n=file.zip",
"filename" : "file-r5fmwjdd",
"id" : "r5fmwjdd",
"id_url" : "38792076",
@@ -251,4 +251,25 @@
"extension": "mp4",
},

{
"#url" : "https://bunkr.site/f/JEn5iQgYVYJfi",
"#comment" : "403 error for main 'brg-bk.cdn.gigachad-cdn.ru' URL (#6732 #6972)",
"#category": ("lolisafe", "bunkr", "media"),
"#class" : bunkr.BunkrMediaExtractor,
"#urls" : "https://brg-bk.cdn.gigachad-cdn.ru/IMG_47272f2c698d257fd22f4300ae98ec35929b-iEYVkLPQ.jpg?n=IMG_47272f2c698d257fd22f4300ae98ec35929b.jpg",
"#sha1_content": "f1c839743563828b250e48d485933a735a508527",

"_fallback": (
"https://i-burger.bunkr.ru/IMG_47272f2c698d257fd22f4300ae98ec35929b-iEYVkLPQ.jpg",
),
"_http_headers": {
"Referer": "https://get.bunkrr.su/file/29682239",
},
"extension": "jpg",
"filename" : "IMG_47272f2c698d257fd22f4300ae98ec35929b-iEYVkLPQ",
"id" : "iEYVkLPQ",
"id_url" : "29682239",
"name" : "IMG_47272f2c698d257fd22f4300ae98ec35929b",
},

)