From c6136eceb80f9abbabe773b07bd1389ba85f7ef4 Mon Sep 17 00:00:00 2001
From: Mpaa
Date: Sun, 22 Oct 2023 23:05:09 +0200
Subject: [PATCH] proofreadpage.py: fetch URL of page scan via API

Fetch the URL of the Page image using the new API for MW >= 1.40:
- query+prop=imageforpage

HTML page scraping is no longer needed, except for MW versions < 1.40.

This should also fix bug T181913; tests are re-added.

Change-Id: I374e878d0b321024903be8d5194b2878355667b6
Bug: T352524
Bug: T181913
Bug: T114318
---
 pywikibot/data/api/_generators.py |  5 +++++
 pywikibot/page/_basepage.py       |  2 +-
 pywikibot/proofreadpage.py        | 36 +++++++++++++++++++++++++++----
 pywikibot/site/_extensions.py     | 23 ++++++++++++++++++++
 tests/proofreadpage_tests.py      |  6 ++----
 5 files changed, 63 insertions(+), 9 deletions(-)

diff --git a/pywikibot/data/api/_generators.py b/pywikibot/data/api/_generators.py
index 902c56190a..07f0338a66 100644
--- a/pywikibot/data/api/_generators.py
+++ b/pywikibot/data/api/_generators.py
@@ -1038,3 +1038,8 @@ def update_page(page, pagedict: dict, props=None):
         page._lintinfo.pop('pageid')
         page._lintinfo.pop('title')
         page._lintinfo.pop('ns')
+
+    if 'imageforpage' in props and 'imagesforpage' in pagedict:
+        # proofreadpage will always work on dicts;
+        # this also serves as a workaround for T352482
+        page._imageforpage = pagedict['imagesforpage'] or {}
diff --git a/pywikibot/page/_basepage.py b/pywikibot/page/_basepage.py
index 796eac5b4f..caef2f31d8 100644
--- a/pywikibot/page/_basepage.py
+++ b/pywikibot/page/_basepage.py
@@ -73,7 +73,7 @@ class BasePage(ComparableMixin):
         '_contentmodel', '_langlinks', '_isredir', '_coords',
         '_preloadedtext', '_timestamp', '_applicable_protections',
         '_flowinfo', '_quality', '_pageprops', '_revid', '_quality_text',
-        '_pageimage', '_item', '_lintinfo',
+        '_pageimage', '_item', '_lintinfo', '_imageforpage',
     )
 
     def __init__(self, source, title: str = '', ns=0) -> None:
diff --git a/pywikibot/proofreadpage.py b/pywikibot/proofreadpage.py
index a4ae60fb82..7157316d7e 100644
--- a/pywikibot/proofreadpage.py
+++ b/pywikibot/proofreadpage.py
@@ -54,7 +54,7 @@
 from pywikibot.data.api import ListGenerator, Request
 from pywikibot.exceptions import Error, InvalidTitleError, OtherPageSaveError
 from pywikibot.page import PageSourceType
-from pywikibot.tools import cached
+from pywikibot.tools import MediaWikiVersion, cached
 
 
 try:
@@ -825,9 +825,7 @@ def pre_summary(self) -> str:
         """
         return f'/* {self.status} */ '
 
-    @property
-    @cached
-    def url_image(self) -> str:
+    def __url_image_lt_140(self) -> str:
         """Get the file url of the scan of ProofreadPage.
 
         :return: file url of the scan ProofreadPage or None.
@@ -864,6 +862,36 @@ def url_image(self) -> str:
 
         return url_image
 
+    def __url_image(self) -> str:
+        """Get the file url of the scan of ProofreadPage.
+
+        :return: file url of the scan of ProofreadPage.
+        :raises ValueError: in case of no image found for scan
+        """
+        self.site.loadpageurls(self)
+        url = self._imageforpage.get('fullsize')
+        if url is not None:
+            return f'{self.site.family.protocol(self.site.code)}:{url}'
+        else:
+            raise ValueError(f'imagesforpage is empty for {self}.')
+
+    @property
+    @cached
+    def url_image(self) -> str:
+        """Get the file url of the scan of ProofreadPage.
+
+        :return: file url of the scan of ProofreadPage.
+
+        For MW version < 1.40:
+        :raises Exception: in case of http errors
+        :raises ImportError: if bs4 is not installed, _bs4_soup() will raise
+        :raises ValueError: in case of no prp_page_image src found for scan
+        """
+        if self.site.version() < MediaWikiVersion('1.40'):
+            return self.__url_image_lt_140()
+        else:
+            return self.__url_image()
+
     def _ocr_callback(self, cmd_uri: str,
                       parser_func: Optional[Callable[[str], str]] = None,
                       ocr_tool: Optional[str] = None
diff --git a/pywikibot/site/_extensions.py b/pywikibot/site/_extensions.py
index f6c5859ad8..4e6977e468 100644
--- a/pywikibot/site/_extensions.py
+++ b/pywikibot/site/_extensions.py
@@ -141,6 +141,29 @@ def proofread_levels(self):
             self._cache_proofreadinfo()
         return self._proofread_levels
 
+    @need_extension('ProofreadPage')
+    def loadpageurls(
+        self,
+        page: 'pywikibot.page.BasePage'
+    ) -> None:
+        """Load URLs from the API and store them in page attributes.
+
+        Load URLs of images for a given page in the "Page:" namespace.
+        No effect for pages in other namespaces.
+
+        .. seealso:: :api:`imageforpage`
+        """
+        title = page.title(with_section=False)
+        # 'responsiveimages' is not requested: the server would render
+        # the other image sizes as well; avoid that load unless needed.
+        prppifpprop = 'filename|size|fullsize'
+
+        query = self._generator(api.PropertyGenerator,
+                                type_arg='imageforpage',
+                                titles=title.encode(self.encoding()),
+                                prppifpprop=prppifpprop)
+        self._update_page(page, query)
+
 
 class GeoDataMixin:
 
diff --git a/tests/proofreadpage_tests.py b/tests/proofreadpage_tests.py
index 40a1728fe1..13534e1574 100755
--- a/tests/proofreadpage_tests.py
+++ b/tests/proofreadpage_tests.py
@@ -26,7 +26,6 @@
     BasePageLoadRevisionsCachingTestBase,
     BasePageMethodsTestBase,
 )
-from tests.utils import skipping
 
 
 class TestPagesTagParser(TestCase):
@@ -250,7 +249,7 @@ class TestProofreadPageValidSite(TestCase):
         'footer': '\n{{smallrefs}}',
         'url_image': ('https://upload.wikimedia.org/wikipedia/commons/'
                       'thumb/a/ac/Popular_Science_Monthly_Volume_1.djvu/'
-                      'page12-1024px-Popular_Science_Monthly_Volume_1.djvu'
+                      'page12-2267px-Popular_Science_Monthly_Volume_1.djvu'
                       '.jpg'),
     }
 
@@ -412,8 +411,7 @@ def test_url_image(self):
             page.url_image
 
         page = ProofreadPage(self.site, self.valid_redlink['title'])
-        with skipping(ValueError, msg='T181913, T114318'):
-            self.assertEqual(page.url_image, self.valid_redlink['url_image'])
+        self.assertEqual(page.url_image, self.valid_redlink['url_image'])
 
 
 class TestPageQuality(TestCase):
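
Usage sketch (reviewer note, not part of the patch): a minimal example of
the code path this change introduces, assuming a ProofreadPage-enabled
wiki; the page title below is illustrative, taken from the test data.

    import pywikibot
    from pywikibot.proofreadpage import ProofreadPage

    site = pywikibot.Site('en', 'wikisource')
    page = ProofreadPage(site,
                         'Page:Popular Science Monthly Volume 1.djvu/12')

    # On MW >= 1.40, url_image calls site.loadpageurls(page), which issues
    # action=query&prop=imageforpage&prppifpprop=filename|size|fullsize and
    # stores the 'imagesforpage' result dict in page._imageforpage; the
    # 'fullsize' entry, prefixed with the site protocol, is returned.
    # On older wikis the bs4-based HTML-scraping fallback is used instead.
    print(page.url_image)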