diff --git a/catalog/common/sites.py b/catalog/common/sites.py index d4fdc964..c3acee24 100644 --- a/catalog/common/sites.py +++ b/catalog/common/sites.py @@ -104,7 +104,7 @@ def query_str(content, query: str) -> str: return content.xpath(query)[0].strip() @staticmethod - def query_list(content, query: str) -> list[str]: + def query_list(content, query: str) -> list: return list(content.xpath(query)) @classmethod diff --git a/catalog/search/models.py b/catalog/search/models.py index 90e9e251..f6279539 100644 --- a/catalog/search/models.py +++ b/catalog/search/models.py @@ -79,6 +79,9 @@ def __init__( self.display_description = brief self.cover_image_url = cover_url + def __repr__(self): + return f"[{self.category}] {self.display_title} {self.url}" + @property def verbose_category_name(self): return self.category.label if self.category else "" diff --git a/catalog/sites/douban.py b/catalog/sites/douban.py index a9c6e2c1..f228aaa2 100644 --- a/catalog/sites/douban.py +++ b/catalog/sites/douban.py @@ -1,6 +1,8 @@ +import json import re from catalog.common import * +from catalog.search.models import ExternalSearchResultItem RE_NUMBERS = re.compile(r"\d+\d*") RE_WHITESPACES = re.compile(r"\s+") @@ -30,3 +32,35 @@ def validate_response(self, response) -> int: return RESPONSE_OK else: return RESPONSE_INVALID_CONTENT + + +class DoubanSearcher: + @classmethod + def search(cls, cat: ItemCategory, c: str, q: str, p: int = 1): + url = f"https://search.douban.com/{c}/subject_search?search_text={q}&start={15*(p-1)}" + content = DoubanDownloader(url).download().html() + j = json.loads( + content.xpath( + "//script[text()[contains(.,'window.__DATA__')]]/text()" + )[ # type:ignore + 0 + ] + .split("window.__DATA__ = ")[1] # type:ignore + .split("};")[0] # type:ignore + + "}" + ) + results = [ + ExternalSearchResultItem( + cat, + SiteName.Douban, + item["url"], + item["title"], + item["abstract"], + item["abstract_2"], + item["cover_url"], + ) + for item in j["items"] + for item in j["items"] + if item.get("tpl_name") == "search_subject" + ] + return results diff --git a/catalog/sites/douban_book.py b/catalog/sites/douban_book.py index 857fb043..2a6413a7 100644 --- a/catalog/sites/douban_book.py +++ b/catalog/sites/douban_book.py @@ -3,7 +3,7 @@ from catalog.common import * from common.models.lang import detect_language -from .douban import * +from .douban import RE_NUMBERS, RE_WHITESPACES, DoubanDownloader, DoubanSearcher @SiteManager.register @@ -23,46 +23,51 @@ class DoubanBook(AbstractSite): def id_to_url(cls, id_value): return "https://book.douban.com/subject/" + id_value + "/" + @classmethod + def search(cls, q: str, p: int = 1): + return DoubanSearcher.search(ItemCategory.Book, "book", q, p) + def scrape(self): content = DoubanDownloader(self.url).download().html() - isbn_elem = content.xpath( - "//div[@id='info']//span[text()='ISBN:']/following::text()" + isbn_elem = self.query_list( + content, "//div[@id='info']//span[text()='ISBN:']/following::text()" ) isbn = isbn_elem[0].strip() if isbn_elem else None - title_elem = content.xpath("/html/body//h1/span/text()") + title_elem = self.query_list(content, "/html/body//h1/span/text()") title = ( title_elem[0].strip() if title_elem else f"Unknown Title {self.id_value}" ) - subtitle_elem = content.xpath( - "//div[@id='info']//span[text()='副标题:']/following::text()" + subtitle_elem = self.query_list( + content, "//div[@id='info']//span[text()='副标题:']/following::text()" ) subtitle = subtitle_elem[0].strip()[:500] if subtitle_elem else None - orig_title_elem = content.xpath( - "//div[@id='info']//span[text()='原作名:']/following::text()" + orig_title_elem = self.query_list( + content, "//div[@id='info']//span[text()='原作名:']/following::text()" ) orig_title = orig_title_elem[0].strip()[:500] if orig_title_elem else None - language_elem = content.xpath( - "//div[@id='info']//span[text()='语言:']/following::text()" + language_elem = self.query_list( + content, "//div[@id='info']//span[text()='语言:']/following::text()" ) language = [language_elem[0].strip()] if language_elem else [] - pub_house_elem = content.xpath( - "//div[@id='info']//span[text()='出版社:']/following::text()" + pub_house_elem = self.query_list( + content, "//div[@id='info']//span[text()='出版社:']/following::text()" ) pub_house = pub_house_elem[0].strip() if pub_house_elem else None if not pub_house: - pub_house_elem = content.xpath( - "//div[@id='info']//span[text()='出版社:']/following-sibling::a/text()" + pub_house_elem = self.query_list( + content, + "//div[@id='info']//span[text()='出版社:']/following-sibling::a/text()", ) pub_house = pub_house_elem[0].strip() if pub_house_elem else None - pub_date_elem = content.xpath( - "//div[@id='info']//span[text()='出版年:']/following::text()" + pub_date_elem = self.query_list( + content, "//div[@id='info']//span[text()='出版年:']/following::text()" ) pub_date = pub_date_elem[0].strip() if pub_date_elem else "" year_month_day = RE_NUMBERS.findall(pub_date) @@ -88,18 +93,18 @@ def scrape(self): else pub_month ) - binding_elem = content.xpath( - "//div[@id='info']//span[text()='装帧:']/following::text()" + binding_elem = self.query_list( + content, "//div[@id='info']//span[text()='装帧:']/following::text()" ) binding = binding_elem[0].strip() if binding_elem else None - price_elem = content.xpath( - "//div[@id='info']//span[text()='定价:']/following::text()" + price_elem = self.query_list( + content, "//div[@id='info']//span[text()='定价:']/following::text()" ) price = price_elem[0].strip() if price_elem else None - pages_elem = content.xpath( - "//div[@id='info']//span[text()='页数:']/following::text()" + pages_elem = self.query_list( + content, "//div[@id='info']//span[text()='页数:']/following::text()" ) pages = pages_elem[0].strip() if pages_elem else None if pages is not None: @@ -109,15 +114,16 @@ def scrape(self): if pages and (pages > 999999 or pages < 1): pages = None - brief_elem = content.xpath( - "//h2/span[text()='内容简介']/../following-sibling::div[1]//div[@class='intro'][not(ancestor::span[@class='short'])]/p/text()" + brief_elem = self.query_list( + content, + "//h2/span[text()='内容简介']/../following-sibling::div[1]//div[@class='intro'][not(ancestor::span[@class='short'])]/p/text()", ) brief = "\n".join(p.strip() for p in brief_elem) if brief_elem else None contents = None try: - contents_elem = content.xpath( - "//h2/span[text()='目录']/../following-sibling::div[1]" + contents_elem = self.query_list( + content, "//h2/span[text()='目录']/../following-sibling::div[1]" )[0] # if next the id of next sibling contains `dir`, that would be the full contents if "dir" in contents_elem.getnext().xpath("@id")[0]: @@ -129,24 +135,28 @@ def scrape(self): ) else: contents = ( - "\n".join(p.strip() for p in contents_elem.xpath("text()")) + "\n".join( + p.strip() for p in self.query_list(contents_elem, "text()") + ) if contents_elem is not None else None ) except Exception: pass - img_url_elem = content.xpath("//*[@id='mainpic']/a/img/@src") + img_url_elem = self.query_list(content, "//*[@id='mainpic']/a/img/@src") img_url = img_url_elem[0].strip() if img_url_elem else None # there are two html formats for authors and translators - authors_elem = content.xpath( + authors_elem = self.query_list( + content, """//div[@id='info']//span[text()='作者:']/following-sibling::br[1]/ - preceding-sibling::a[preceding-sibling::span[text()='作者:']]/text()""" + preceding-sibling::a[preceding-sibling::span[text()='作者:']]/text()""", ) if not authors_elem: - authors_elem = content.xpath( - """//div[@id='info']//span[text()=' 作者']/following-sibling::a/text()""" + authors_elem = self.query_list( + content, + """//div[@id='info']//span[text()=' 作者']/following-sibling::a/text()""", ) if authors_elem: authors = [] @@ -155,13 +165,15 @@ def scrape(self): else: authors = None - translators_elem = content.xpath( + translators_elem = self.query_list( + content, """//div[@id='info']//span[text()='译者:']/following-sibling::br[1]/ - preceding-sibling::a[preceding-sibling::span[text()='译者:']]/text()""" + preceding-sibling::a[preceding-sibling::span[text()='译者:']]/text()""", ) if not translators_elem: - translators_elem = content.xpath( - """//div[@id='info']//span[text()=' 译者']/following-sibling::a/text()""" + translators_elem = self.query_list( + content, + """//div[@id='info']//span[text()=' 译者']/following-sibling::a/text()""", ) if translators_elem: translators = [] @@ -170,18 +182,20 @@ def scrape(self): else: translators = None - cncode_elem = content.xpath( - "//div[@id='info']//span[text()='统一书号:']/following::text()" + cncode_elem = self.query_list( + content, "//div[@id='info']//span[text()='统一书号:']/following::text()" ) cubn = cncode_elem[0].strip() if cncode_elem else None - series_elem = content.xpath( - "//div[@id='info']//span[text()='丛书:']/following-sibling::a[1]/text()" + series_elem = self.query_list( + content, + "//div[@id='info']//span[text()='丛书:']/following-sibling::a[1]/text()", ) series = series_elem[0].strip() if series_elem else None - imprint_elem = content.xpath( - "//div[@id='info']//span[text()='出品方:']/following-sibling::a[1]/text()" + imprint_elem = self.query_list( + content, + "//div[@id='info']//span[text()='出品方:']/following-sibling::a[1]/text()", ) imprint = imprint_elem[0].strip() if imprint_elem else None @@ -212,8 +226,9 @@ def scrape(self): "cover_image_url": img_url, } - works_element = content.xpath( - '//h2/span[text()="这本书的其他版本"]/following-sibling::span[@class="pl"]/a/@href' + works_element = self.query_list( + content, + '//h2/span[text()="这本书的其他版本"]/following-sibling::span[@class="pl"]/a/@href', ) if works_element: r = re.match(r"\w+://book.douban.com/works/(\d+)", works_element[0]) @@ -234,7 +249,7 @@ def scrape(self): ] pd = ResourceContent(metadata=data) - t, n = detect_isbn_asin(isbn) + t, n = detect_isbn_asin(isbn or "") if t: pd.lookup_ids[t] = n pd.lookup_ids[IdType.CUBN] = cubn @@ -255,11 +270,11 @@ def id_to_url(cls, id_value): def scrape(self): content = DoubanDownloader(self.url).download().html() - title_elem = content.xpath("//h1/text()") + title_elem = self.query_list(content, "//h1/text()") title = title_elem[0].split("全部版本(")[0].strip() if title_elem else None if not title: raise ParseError(self, "title") - book_urls = content.xpath('//a[@class="pl2"]/@href') + book_urls = self.query_list(content, '//a[@class="pl2"]/@href') related_resources = [] for url in book_urls: site = SiteManager.get_site_by_url(url) diff --git a/catalog/sites/douban_drama.py b/catalog/sites/douban_drama.py index c277d0e0..63d10141 100644 --- a/catalog/sites/douban_drama.py +++ b/catalog/sites/douban_drama.py @@ -7,7 +7,7 @@ from catalog.models import * from common.models.lang import detect_language -from .douban import DoubanDownloader +from .douban import DoubanDownloader, DoubanSearcher def _cache_key(url): @@ -45,6 +45,8 @@ def id_to_url(cls, id_value): return f"https://www.douban.com/location/drama/{ids[0]}/#{ids[1]}" def scrape(self): + if not self.id_value or not self.url: + raise ParseError(self, "id_value or url") show_url = self.url.split("#")[0] show_id = self.id_value.split("-")[0] version_id = self.id_value.split("-")[1] @@ -59,20 +61,20 @@ def scrape(self): p = "//div[@id='" + version_id + "']" q = p + "//dt[text()='{}:']/following-sibling::dd[1]/a/span/text()" q2 = p + "//dt[text()='{}:']/following-sibling::dd[1]/text()" - title = " ".join(h.xpath(p + "//h3/text()")).strip() + title = " ".join(self.query_list(h, p + "//h3/text()")).strip() if not title: raise ParseError(self, "title") data = { "title": title, "localized_title": [{"lang": "zh-cn", "text": title}], - "director": [x.strip() for x in h.xpath(q.format("导演"))], - "playwright": [x.strip() for x in h.xpath(q.format("编剧"))], - # "actor": [x.strip() for x in h.xpath(q.format("主演"))], - "composer": [x.strip() for x in h.xpath(q.format("作曲"))], - "language": [x.strip() for x in h.xpath(q2.format("语言"))], - "opening_date": " ".join(h.xpath(q2.format("演出日期"))).strip(), - "troupe": [x.strip() for x in h.xpath(q.format("演出团体"))], - "location": [x.strip() for x in h.xpath(q.format("演出剧院"))], + "director": [x.strip() for x in self.query_list(h, q.format("导演"))], + "playwright": [x.strip() for x in self.query_list(h, q.format("编剧"))], + # "actor": [x.strip() for x in self.query_list(h, q.format("主演"))], + "composer": [x.strip() for x in self.query_list(h, q.format("作曲"))], + "language": [x.strip() for x in self.query_list(h, q2.format("语言"))], + "opening_date": " ".join(self.query_list(h, q2.format("演出日期"))).strip(), + "troupe": [x.strip() for x in self.query_list(h, q.format("演出团体"))], + "location": [x.strip() for x in self.query_list(h, q.format("演出剧院"))], } if data["opening_date"]: d = data["opening_date"].split("-") @@ -80,7 +82,9 @@ def scrape(self): if dl > 3: data["opening_date"] = "-".join(d[:3]) data["closing_date"] = "-".join(d[0 : 6 - dl] + d[3:dl]) - actor_elem = h.xpath(p + "//dt[text()='主演:']/following-sibling::dd[1]/a") + actor_elem = self.query_list( + h, p + "//dt[text()='主演:']/following-sibling::dd[1]/a" + ) data["actor"] = [] for e in actor_elem: n = "".join(e.xpath("span/text()")).strip() @@ -88,7 +92,7 @@ def scrape(self): t = re.sub(r"^[\s\(饰]*(.+)\)[\s\/]*$", r"\1", t).strip() t = t if t != "/" else "" data["actor"].append({"name": n, "role": t}) - img_url_elem = h.xpath("//img[@itemprop='image']/@src") + img_url_elem = self.query_list(h, "//img[@itemprop='image']/@src") data["cover_image_url"] = img_url_elem[0].strip() if img_url_elem else None pd = ResourceContent(metadata=data) pd.metadata["required_resources"] = [ @@ -128,78 +132,87 @@ def scrape(self): h = html.fromstring(r) data = {} - title_elem = h.xpath("/html/body//h1/span/text()") + title_elem = self.query_list(h, "/html/body//h1/span/text()") if title_elem: data["title"] = title_elem[0].strip() data["orig_title"] = title_elem[1] if len(title_elem) > 1 else None else: raise ParseError(self, "title") - other_title_elem = h.xpath( - "//dl//dt[text()='又名:']/following::dd[@itemprop='name']/text()" + other_title_elem = self.query_list( + h, "//dl//dt[text()='又名:']/following::dd[@itemprop='name']/text()" ) data["other_title"] = other_title_elem - plot_elem = h.xpath("//div[@class='pure-text']/div[@class='full']/text()") + plot_elem = self.query_list( + h, "//div[@class='pure-text']/div[@class='full']/text()" + ) if len(plot_elem) == 0: - plot_elem = h.xpath( - "//div[@class='pure-text']/div[@class='abstract']/text()" + plot_elem = self.query_list( + h, "//div[@class='pure-text']/div[@class='abstract']/text()" ) if len(plot_elem) == 0: - plot_elem = h.xpath("//div[@class='pure-text']/text()") + plot_elem = self.query_list(h, "//div[@class='pure-text']/text()") data["brief"] = "\n".join(plot_elem) data["genre"] = [ s.strip() - for s in h.xpath( - "//div[@class='meta']//dl//dt[text()='类型:']/following-sibling::dd[@itemprop='genre']/text()" + for s in self.query_list( + h, + "//div[@class='meta']//dl//dt[text()='类型:']/following-sibling::dd[@itemprop='genre']/text()", ) ] # data["version"] = [ # s.strip() - # for s in h.xpath( + # for s in self.query_list(h, # "//dl//dt[text()='版本:']/following-sibling::dd[@class='titles']/a//text()" # ) # ] data["director"] = [ s.strip() - for s in h.xpath( - "//div[@class='meta']/dl//dt[text()='导演:']/following-sibling::dd/a[@itemprop='director']//text()" + for s in self.query_list( + h, + "//div[@class='meta']/dl//dt[text()='导演:']/following-sibling::dd/a[@itemprop='director']//text()", ) ] data["composer"] = [ s.strip() - for s in h.xpath( - "//div[@class='meta']/dl//dt[text()='作曲:']/following-sibling::dd/a[@itemprop='musicBy']//text()" + for s in self.query_list( + h, + "//div[@class='meta']/dl//dt[text()='作曲:']/following-sibling::dd/a[@itemprop='musicBy']//text()", ) ] data["choreographer"] = [ s.strip() - for s in h.xpath( - "//div[@class='meta']/dl//dt[text()='编舞:']/following-sibling::dd/a[@itemprop='choreographer']//text()" + for s in self.query_list( + h, + "//div[@class='meta']/dl//dt[text()='编舞:']/following-sibling::dd/a[@itemprop='choreographer']//text()", ) ] data["troupe"] = [ s.strip() - for s in h.xpath( - "//div[@class='meta']/dl//dt[text()='演出团体:']/following-sibling::dd/a[@itemprop='performer']//text()" + for s in self.query_list( + h, + "//div[@class='meta']/dl//dt[text()='演出团体:']/following-sibling::dd/a[@itemprop='performer']//text()", ) ] data["playwright"] = [ s.strip() - for s in h.xpath( - "//div[@class='meta']/dl//dt[text()='编剧:']/following-sibling::dd/a[@itemprop='author']//text()" + for s in self.query_list( + h, + "//div[@class='meta']/dl//dt[text()='编剧:']/following-sibling::dd/a[@itemprop='author']//text()", ) ] data["actor"] = [ {"name": s.strip(), "role": ""} - for s in h.xpath( - "//div[@class='meta']/dl//dt[text()='主演:']/following-sibling::dd/a[@itemprop='actor']//text()" + for s in self.query_list( + h, + "//div[@class='meta']/dl//dt[text()='主演:']/following-sibling::dd/a[@itemprop='actor']//text()", ) ] - date_elem = h.xpath( - "//div[@class='meta']//dl//dt[text()='演出日期:']/following::dd/text()" + date_elem = self.query_list( + h, "//div[@class='meta']//dl//dt[text()='演出日期:']/following::dd/text()" ) data["opening_date"] = date_elem[0] if date_elem else None if data["opening_date"]: @@ -211,12 +224,15 @@ def scrape(self): data["location"] = [ s.strip() - for s in h.xpath( - "//div[@class='meta']/dl//dt[text()='演出剧院:']/following-sibling::dd/a[@itemprop='location']//text()" + for s in self.query_list( + h, + "//div[@class='meta']/dl//dt[text()='演出剧院:']/following-sibling::dd/a[@itemprop='location']//text()", ) ] - versions = h.xpath("//div[@id='versions']/div[@class='fluid-mods']/div/@id") + versions = self.query_list( + h, "//div[@id='versions']/div[@class='fluid-mods']/div/@id" + ) data["related_resources"] = list( map( lambda v: { @@ -229,7 +245,7 @@ def scrape(self): versions, ) ) - img_url_elem = h.xpath("//img[@itemprop='image']/@src") + img_url_elem = self.query_list(h, "//img[@itemprop='image']/@src") data["cover_image_url"] = img_url_elem[0].strip() if img_url_elem else None data["localized_title"] = ( [{"lang": "zh-cn", "text": data["title"]}] diff --git a/catalog/sites/douban_game.py b/catalog/sites/douban_game.py index a867e31d..0fce5479 100644 --- a/catalog/sites/douban_game.py +++ b/catalog/sites/douban_game.py @@ -7,9 +7,7 @@ from common.models.lang import detect_language from common.models.misc import uniq -from .douban import DoubanDownloader - -_logger = logging.getLogger(__name__) +from .douban import DoubanDownloader, DoubanSearcher @SiteManager.register @@ -26,18 +24,18 @@ class DoubanGame(AbstractSite): DEFAULT_MODEL = Game @classmethod - def id_to_url(self, id_value): + def id_to_url(cls, id_value): return "https://www.douban.com/game/" + id_value + "/" def scrape(self): content = DoubanDownloader(self.url).download().html() - elem = content.xpath("//div[@id='content']/h1/text()") + elem = self.query_list(content, "//div[@id='content']/h1/text()") title = elem[0].strip() if len(elem) else None if not title: raise ParseError(self, "title") - elem = content.xpath("//div[@id='comments']//h2/text()") + elem = self.query_list(content, "//div[@id='comments']//h2/text()") title2 = elem[0].strip() if len(elem) else "" if title2: sp = title2.strip().rsplit("的短评", 1) @@ -48,46 +46,52 @@ def scrape(self): else: orig_title = "" - other_title_elem = content.xpath( - "//dl[@class='thing-attr']//dt[text()='别名:']/following-sibling::dd[1]/text()" + other_title_elem = self.query_list( + content, + "//dl[@class='thing-attr']//dt[text()='别名:']/following-sibling::dd[1]/text()", ) other_title = ( other_title_elem[0].strip().split(" / ") if other_title_elem else [] ) - developer_elem = content.xpath( - "//dl[@class='thing-attr']//dt[text()='开发商:']/following-sibling::dd[1]/text()" + developer_elem = self.query_list( + content, + "//dl[@class='thing-attr']//dt[text()='开发商:']/following-sibling::dd[1]/text()", ) developer = developer_elem[0].strip().split(" / ") if developer_elem else None - publisher_elem = content.xpath( - "//dl[@class='thing-attr']//dt[text()='发行商:']/following-sibling::dd[1]/text()" + publisher_elem = self.query_list( + content, + "//dl[@class='thing-attr']//dt[text()='发行商:']/following-sibling::dd[1]/text()", ) publisher = publisher_elem[0].strip().split(" / ") if publisher_elem else None - platform_elem = content.xpath( - "//dl[@class='thing-attr']//dt[text()='平台:']/following-sibling::dd[1]/a/text()" + platform_elem = self.query_list( + content, + "//dl[@class='thing-attr']//dt[text()='平台:']/following-sibling::dd[1]/a/text()", ) platform = platform_elem if platform_elem else None - genre_elem = content.xpath( - "//dl[@class='thing-attr']//dt[text()='类型:']/following-sibling::dd[1]/a/text()" + genre_elem = self.query_list( + content, + "//dl[@class='thing-attr']//dt[text()='类型:']/following-sibling::dd[1]/a/text()", ) genre = None if genre_elem: genre = [g for g in genre_elem if g != "游戏"] - date_elem = content.xpath( - "//dl[@class='thing-attr']//dt[text()='发行日期:']/following-sibling::dd[1]/text()" + date_elem = self.query_list( + content, + "//dl[@class='thing-attr']//dt[text()='发行日期:']/following-sibling::dd[1]/text()", ) release_date = dateparser.parse(date_elem[0].strip()) if date_elem else None release_date = release_date.strftime("%Y-%m-%d") if release_date else None - brief_elem = content.xpath("//div[@class='mod item-desc']/p/text()") + brief_elem = self.query_list(content, "//div[@class='mod item-desc']/p/text()") brief = "\n".join(brief_elem) if brief_elem else "" - img_url_elem = content.xpath( - "//div[@class='item-subject-info']/div[@class='pic']//img/@src" + img_url_elem = self.query_list( + content, "//div[@class='item-subject-info']/div[@class='pic']//img/@src" ) img_url = img_url_elem[0].strip() if img_url_elem else None diff --git a/catalog/sites/douban_movie.py b/catalog/sites/douban_movie.py index 8a1b25f1..36ccb87a 100644 --- a/catalog/sites/douban_movie.py +++ b/catalog/sites/douban_movie.py @@ -1,16 +1,17 @@ import json import logging +from loguru import logger + from catalog.common import * from catalog.movie.models import * from catalog.tv.models import * from common.models.lang import detect_language +from common.models.misc import int_ -from .douban import * +from .douban import DoubanDownloader, DoubanSearcher from .tmdb import TMDB_TV, TMDB_TVSeason, query_tmdb_tv_episode, search_tmdb_by_imdb_id -_logger = logging.getLogger(__name__) - @SiteManager.register class DoubanMovie(AbstractSite): @@ -29,11 +30,15 @@ class DoubanMovie(AbstractSite): def id_to_url(cls, id_value): return "https://movie.douban.com/subject/" + id_value + "/" + @classmethod + def search(cls, q: str, p: int = 1): + return DoubanSearcher.search(ItemCategory.Movie, "movie", q, p) + def scrape(self): content = DoubanDownloader(self.url).download().html() try: schema_data = "".join( - content.xpath('//script[@type="application/ld+json"]/text()') + self.query_list(content, '//script[@type="application/ld+json"]/text()') ).replace( "\n", "" ) # strip \n bc multi-line string is not properly coded in json by douban @@ -42,13 +47,13 @@ def scrape(self): d = {} try: - raw_title = content.xpath("//span[@property='v:itemreviewed']/text()")[ - 0 - ].strip() + raw_title = self.query_list( + content, "//span[@property='v:itemreviewed']/text()" + )[0].strip() except IndexError: raise ParseError(self, "title") - orig_title = content.xpath("//img[@rel='v:image']/@alt")[0].strip() + orig_title = self.query_list(content, "//img[@rel='v:image']/@alt")[0].strip() title = raw_title.split(orig_title)[0].strip() # if has no chinese title if title == "": @@ -58,40 +63,46 @@ def scrape(self): orig_title = None # there are two html formats for authors and translators - other_title_elem = content.xpath( - "//div[@id='info']//span[text()='又名:']/following-sibling::text()[1]" + other_title_elem = self.query_list( + content, + "//div[@id='info']//span[text()='又名:']/following-sibling::text()[1]", ) other_title = ( other_title_elem[0].strip().split(" / ") if other_title_elem else None ) - imdb_elem = content.xpath( - "//div[@id='info']//span[text()='IMDb链接:']/following-sibling::a[1]/text()" + imdb_elem = self.query_list( + content, + "//div[@id='info']//span[text()='IMDb链接:']/following-sibling::a[1]/text()", ) if not imdb_elem: - imdb_elem = content.xpath( - "//div[@id='info']//span[text()='IMDb:']/following-sibling::text()[1]" + imdb_elem = self.query_list( + content, + "//div[@id='info']//span[text()='IMDb:']/following-sibling::text()[1]", ) imdb_code = imdb_elem[0].strip() if imdb_elem else None - director_elem = content.xpath( - "//div[@id='info']//span[text()='导演']/following-sibling::span[1]/a/text()" + director_elem = self.query_list( + content, + "//div[@id='info']//span[text()='导演']/following-sibling::span[1]/a/text()", ) director = director_elem if director_elem else None - playwright_elem = content.xpath( - "//div[@id='info']//span[text()='编剧']/following-sibling::span[1]/a/text()" + playwright_elem = self.query_list( + content, + "//div[@id='info']//span[text()='编剧']/following-sibling::span[1]/a/text()", ) playwright = ( list(map(lambda a: a[:200], playwright_elem)) if playwright_elem else None ) - actor_elem = content.xpath( - "//div[@id='info']//span[text()='主演']/following-sibling::span[1]/a/text()" + actor_elem = self.query_list( + content, + "//div[@id='info']//span[text()='主演']/following-sibling::span[1]/a/text()", ) actor = list(map(lambda a: a[:200], actor_elem)) if actor_elem else None - genre_elem = content.xpath("//span[@property='v:genre']/text()") + genre_elem = self.query_list(content, "//span[@property='v:genre']/text()") genre = [] if genre_elem: for g in genre_elem: @@ -102,7 +113,9 @@ def scrape(self): g = "惊悚" genre.append(g) - showtime_elem = content.xpath("//span[@property='v:initialReleaseDate']/text()") + showtime_elem = self.query_list( + content, "//span[@property='v:initialReleaseDate']/text()" + ) if showtime_elem: showtime = [] for st in showtime_elem: @@ -122,39 +135,39 @@ def scrape(self): else: showtime = None - site_elem = content.xpath( - "//div[@id='info']//span[text()='官方网站:']/following-sibling::a[1]/@href" + site_elem = self.query_list( + content, + "//div[@id='info']//span[text()='官方网站:']/following-sibling::a[1]/@href", ) site = site_elem[0].strip()[:200] if site_elem else None if site and not re.match(r"http.+", site): site = None - area_elem = content.xpath( - "//div[@id='info']//span[text()='制片国家/地区:']/following-sibling::text()[1]" + area_elem = self.query_list( + content, + "//div[@id='info']//span[text()='制片国家/地区:']/following-sibling::text()[1]", ) if area_elem: area = [a.strip()[:100] for a in area_elem[0].split("/")] else: area = None - language_elem = content.xpath( - "//div[@id='info']//span[text()='语言:']/following-sibling::text()[1]" + language_elem = self.query_list( + content, + "//div[@id='info']//span[text()='语言:']/following-sibling::text()[1]", ) if language_elem: language = [a.strip() for a in language_elem[0].split(" / ")] else: language = None - year_elem = content.xpath("//span[@class='year']/text()") - year = ( - int(re.search(r"\d+", year_elem[0])[0]) - if year_elem and re.search(r"\d+", year_elem[0]) - else None - ) + year_s = self.query_str(content, "//span[@class='year']/text()") + year_r = re.search(r"\d+", year_s) if year_s else None + year = int_(year_r[0]) if year_r else None - duration_elem = content.xpath("//span[@property='v:runtime']/text()") - other_duration_elem = content.xpath( - "//span[@property='v:runtime']/following-sibling::text()[1]" + duration_elem = self.query_list(content, "//span[@property='v:runtime']/text()") + other_duration_elem = self.query_list( + content, "//span[@property='v:runtime']/following-sibling::text()[1]" ) if duration_elem: duration = duration_elem[0].strip() @@ -164,19 +177,21 @@ def scrape(self): else: duration = None - season_elem = content.xpath( - "//*[@id='season']/option[@selected='selected']/text()" + season_elem = self.query_list( + content, "//*[@id='season']/option[@selected='selected']/text()" ) if not season_elem: - season_elem = content.xpath( - "//div[@id='info']//span[text()='季数:']/following-sibling::text()[1]" + season_elem = self.query_list( + content, + "//div[@id='info']//span[text()='季数:']/following-sibling::text()[1]", ) season = int(season_elem[0].strip()) if season_elem else None else: season = int(season_elem[0].strip()) - episodes_elem = content.xpath( - "//div[@id='info']//span[text()='集数:']/following-sibling::text()[1]" + episodes_elem = self.query_list( + content, + "//div[@id='info']//span[text()='集数:']/following-sibling::text()[1]", ) episodes = ( int(episodes_elem[0].strip()) @@ -184,8 +199,9 @@ def scrape(self): else None ) - single_episode_length_elem = content.xpath( - "//div[@id='info']//span[text()='单集片长:']/following-sibling::text()[1]" + single_episode_length_elem = self.query_list( + content, + "//div[@id='info']//span[text()='单集片长:']/following-sibling::text()[1]", ) single_episode_length = ( single_episode_length_elem[0].strip()[:100] @@ -195,16 +211,16 @@ def scrape(self): is_series = d.get("@type") == "TVSeries" or episodes is not None - brief_elem = content.xpath("//span[@class='all hidden']") + brief_elem = self.query_list(content, "//span[@class='all hidden']") if not brief_elem: - brief_elem = content.xpath("//span[@property='v:summary']") + brief_elem = self.query_list(content, "//span[@property='v:summary']") brief = ( "\n".join([e.strip() for e in brief_elem[0].xpath("./text()")]) if brief_elem else None ) - img_url_elem = content.xpath("//img[@rel='v:image']/@src") + img_url_elem = self.query_list(content, "//img[@rel='v:image']/@src") img_url = img_url_elem[0].strip() if img_url_elem else None titles = set( @@ -261,26 +277,26 @@ def scrape(self): pd.metadata.get("season_number") and pd.metadata.get("season_number") != 1 ): - _logger.warn(f"{imdb_code} matched imdb tv show, force season 1") + logger.warning(f"{imdb_code} matched imdb tv show, force season 1") pd.metadata["season_number"] = 1 elif pd.metadata["preferred_model"] == "TVSeason" and has_episode: if res_data["tv_episode_results"][0]["episode_number"] != 1: - _logger.warning( + logger.warning( f"Douban Movie {self.url} IMDB {imdb_code} mapping to non-first episode in a season" ) elif res_data["tv_episode_results"][0]["season_number"] == 1: - _logger.warning( + logger.warning( f"Douban Movie {self.url} IMDB {imdb_code} mapping to first season episode in a season" ) elif has_movie: if pd.metadata["preferred_model"] != "Movie": - _logger.warn(f"{imdb_code} matched imdb movie, force Movie") + logger.warning(f"{imdb_code} matched imdb movie, force Movie") pd.metadata["preferred_model"] = "Movie" elif has_tv or has_episode: - _logger.warn(f"{imdb_code} matched imdb tv/episode, force TVSeason") + logger.warning(f"{imdb_code} matched imdb tv/episode, force TVSeason") pd.metadata["preferred_model"] = "TVSeason" else: - _logger.warn(f"{imdb_code} unknown to TMDB") + logger.warning(f"{imdb_code} unknown to TMDB") pd.lookup_ids[IdType.IMDB] = imdb_code diff --git a/catalog/sites/douban_music.py b/catalog/sites/douban_music.py index d0871fe6..af57ddf7 100644 --- a/catalog/sites/douban_music.py +++ b/catalog/sites/douban_music.py @@ -7,9 +7,7 @@ from catalog.music.utils import upc_to_gtin_13 from common.models.lang import detect_language -from .douban import DoubanDownloader - -_logger = logging.getLogger(__name__) +from .douban import DoubanDownloader, DoubanSearcher @SiteManager.register @@ -29,58 +27,63 @@ class DoubanMusic(AbstractSite): def id_to_url(cls, id_value): return "https://music.douban.com/subject/" + id_value + "/" + @classmethod + def search(cls, q: str, p: int = 1): + return DoubanSearcher.search(ItemCategory.Music, "music", q, p) + def scrape(self): content = DoubanDownloader(self.url).download().html() - elem = content.xpath("//h1/span/text()") + elem = self.query_list(content, "//h1/span/text()") title = elem[0].strip() if len(elem) else None if not title: raise ParseError(self, "title") - artists_elem = content.xpath( - "//div[@id='info']/span/span[@class='pl']/a/text()" + artists_elem = self.query_list( + content, "//div[@id='info']/span/span[@class='pl']/a/text()" ) artist = ( None if not artists_elem else list(map(lambda a: a[:200], artists_elem)) ) - genre_elem = content.xpath( - "//div[@id='info']//span[text()='流派:']/following::text()[1]" + genre_elem = self.query_list( + content, "//div[@id='info']//span[text()='流派:']/following::text()[1]" ) genre = genre_elem[0].strip().split(" / ") if genre_elem else [] - date_elem = content.xpath( - "//div[@id='info']//span[text()='发行时间:']/following::text()[1]" + date_elem = self.query_list( + content, "//div[@id='info']//span[text()='发行时间:']/following::text()[1]" ) release_date = dateparser.parse(date_elem[0].strip()) if date_elem else None release_date = release_date.strftime("%Y-%m-%d") if release_date else None - company_elem = content.xpath( - "//div[@id='info']//span[text()='出版者:']/following::text()[1]" + company_elem = self.query_list( + content, "//div[@id='info']//span[text()='出版者:']/following::text()[1]" ) company = company_elem[0].strip() if company_elem else None - track_list_elem = content.xpath( - "//div[@class='track-list']/div[@class='indent']/div/text()" + track_list_elem = self.query_list( + content, "//div[@class='track-list']/div[@class='indent']/div/text()" ) if track_list_elem: track_list = "\n".join([track.strip() for track in track_list_elem]) else: track_list = None - brief_elem = content.xpath("//span[@class='all hidden']") + brief_elem = self.query_list(content, "//span[@class='all hidden']") if not brief_elem: - brief_elem = content.xpath("//span[@property='v:summary']") + brief_elem = self.query_list(content, "//span[@property='v:summary']") brief = ( "\n".join([e.strip() for e in brief_elem[0].xpath("./text()")]) if brief_elem else None ) - img_url_elem = content.xpath("//div[@id='mainpic']//img/@src") + img_url_elem = self.query_list(content, "//div[@id='mainpic']//img/@src") img_url = img_url_elem[0].strip() if img_url_elem else None - other_elem = content.xpath( - "//div[@id='info']//span[text()='又名:']/following-sibling::text()[1]" + other_elem = self.query_list( + content, + "//div[@id='info']//span[text()='又名:']/following-sibling::text()[1]", ) other_title = other_elem[0].strip().split(" / ") if other_elem else [] lang = detect_language(f"{title} {brief}") @@ -103,28 +106,33 @@ def scrape(self): } gtin = None isrc = None - other_elem = content.xpath( - "//div[@id='info']//span[text()='专辑类型:']/following-sibling::text()[1]" + other_elem = self.query_list( + content, + "//div[@id='info']//span[text()='专辑类型:']/following-sibling::text()[1]", ) if other_elem: data["album_type"] = other_elem[0].strip() - other_elem = content.xpath( - "//div[@id='info']//span[text()='介质:']/following-sibling::text()[1]" + other_elem = self.query_list( + content, + "//div[@id='info']//span[text()='介质:']/following-sibling::text()[1]", ) if other_elem: data["media"] = other_elem[0].strip() - other_elem = content.xpath( - "//div[@id='info']//span[text()='ISRC:']/following-sibling::text()[1]" + other_elem = self.query_list( + content, + "//div[@id='info']//span[text()='ISRC:']/following-sibling::text()[1]", ) if other_elem: isrc = other_elem[0].strip() - other_elem = content.xpath( - "//div[@id='info']//span[text()='条形码:']/following-sibling::text()[1]" + other_elem = self.query_list( + content, + "//div[@id='info']//span[text()='条形码:']/following-sibling::text()[1]", ) if other_elem: gtin = upc_to_gtin_13(other_elem[0].strip()) - other_elem = content.xpath( - "//div[@id='info']//span[text()='碟片数:']/following-sibling::text()[1]" + other_elem = self.query_list( + content, + "//div[@id='info']//span[text()='碟片数:']/following-sibling::text()[1]", ) if other_elem: data["disc_count"] = other_elem[0].strip() diff --git a/pyproject.toml b/pyproject.toml index 80fc2288..5115b80d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -80,7 +80,6 @@ exclude = [ "journal/tests.py", "neodb", "**/migrations", - "**/sites/douban_*", "neodb-takahe", ] reportIncompatibleVariableOverride = false