proofreadpage.py: fetch URL of page scan via API
Fetch URL of Page image using new API for MW >= 1.40:
    - query+prop=imageforpage

No more HTML page scraping is needed, except for MW version < 1.40.

This should also fix bug T181913; the tests are re-added.

Change-Id: I374e878d0b321024903be8d5194b2878355667b6
Bug: T352524
Bug: T181913
Bug: T114318
Mpaa committed Dec 3, 2023
1 parent f2aa850 commit c6136ec
Showing 5 changed files with 63 additions and 9 deletions.
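Before the per-file diffs, a sketch of the raw query this commit switches to. This is an illustration, not pywikibot code: the endpoint, the example page title, and the use of the requests library are assumptions; only prop=imageforpage, the prppifpprop values, and the imagesforpage/fullsize response keys come from the diffs below.

import requests

API = 'https://en.wikisource.org/w/api.php'  # assumed endpoint
params = {
    'action': 'query',
    'format': 'json',
    'prop': 'imageforpage',                   # new MW >= 1.40 prop
    'prppifpprop': 'filename|size|fullsize',  # same props as the diff below
    'titles': 'Page:Popular Science Monthly Volume 1.djvu/12',
}
data = requests.get(API, params=params).json()
for pagedata in data['query']['pages'].values():
    info = pagedata.get('imagesforpage') or {}  # may be empty, see T352482
    if info.get('fullsize'):
        # 'fullsize' is protocol-relative; prepend the scheme, mirroring
        # what the new ProofreadPage.url_image does.
        print('https:' + info['fullsize'])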
5 changes: 5 additions & 0 deletions pywikibot/data/api/_generators.py
@@ -1038,3 +1038,8 @@ def update_page(page, pagedict: dict, props=None):
         page._lintinfo.pop('pageid')
         page._lintinfo.pop('title')
         page._lintinfo.pop('ns')
+
+    if 'imageforpage' in props and 'imagesforpage' in pagedict:
+        # proofreadpage will always work on dicts;
+        # this also serves as a workaround for T352482
+        page._imageforpage = pagedict['imagesforpage'] or {}
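Why the `or {}` above matters: per the T352482 workaround comment, the API can hand back an empty non-dict value when a page has no scan, while the ProofreadPage code later in this commit calls .get() on the stored attribute. A minimal sketch of the normalization (the empty-list value is an assumption based on that comment):

imagesforpage = []                            # assumed empty API value (T352482)
imageforpage = imagesforpage or {}            # what update_page() stores
assert imageforpage.get('fullsize') is None   # safe lookup, no AttributeError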
2 changes: 1 addition & 1 deletion pywikibot/page/_basepage.py
@@ -73,7 +73,7 @@ class BasePage(ComparableMixin):
         '_contentmodel', '_langlinks', '_isredir', '_coords',
         '_preloadedtext', '_timestamp', '_applicable_protections',
         '_flowinfo', '_quality', '_pageprops', '_revid', '_quality_text',
-        '_pageimage', '_item', '_lintinfo',
+        '_pageimage', '_item', '_lintinfo', '_imageforpage',
     )

     def __init__(self, source, title: str = '', ns=0) -> None:
36 changes: 32 additions & 4 deletions pywikibot/proofreadpage.py
@@ -54,7 +54,7 @@
 from pywikibot.data.api import ListGenerator, Request
 from pywikibot.exceptions import Error, InvalidTitleError, OtherPageSaveError
 from pywikibot.page import PageSourceType
-from pywikibot.tools import cached
+from pywikibot.tools import MediaWikiVersion, cached


 try:
@@ -825,9 +825,7 @@ def pre_summary(self) -> str:
         """
         return f'/* {self.status} */ '

-    @property
-    @cached
-    def url_image(self) -> str:
+    def __url_image_lt_140(self) -> str:
         """Get the file url of the scan of ProofreadPage.

         :return: file url of the scan of ProofreadPage or None.
@@ -864,6 +862,36 @@ def url_image(self) -> str:

         return url_image

+    def __url_image(self) -> str:
+        """Get the file url of the scan of ProofreadPage.
+
+        :return: file url of the scan of ProofreadPage or None.
+        :raises ValueError: in case of no image found for scan
+        """
+        self.site.loadpageurls(self)
+        url = self._imageforpage.get('fullsize')
+        if url is not None:
+            return f'{self.site.family.protocol(self.site.code)}:{url}'
+        else:
+            raise ValueError(f'imagesforpage is empty for {self}.')
+
+    @property
+    @cached
+    def url_image(self) -> str:
+        """Get the file url of the scan of ProofreadPage.
+
+        :return: file url of the scan of ProofreadPage or None.
+
+        For MW version < 1.40:
+        :raises Exception: in case of http errors
+        :raises ImportError: if bs4 is not installed, _bs4_soup() will raise
+        :raises ValueError: in case of no prp_page_image src found for scan
+        """
+        if self.site.version() < MediaWikiVersion('1.40'):
+            return self.__url_image_lt_140()
+        else:
+            return self.__url_image()
+
     def _ocr_callback(self, cmd_uri: str,
                       parser_func: Optional[Callable[[str], str]] = None,
                       ocr_tool: Optional[str] = None
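The net effect of the diff above: url_image stays a cached public property and now dispatches on the MediaWiki version. A usage sketch, with the site and page title borrowed from the test fixtures below:

import pywikibot
from pywikibot.proofreadpage import ProofreadPage

site = pywikibot.Site('en', 'wikisource')
page = ProofreadPage(site, 'Page:Popular Science Monthly Volume 1.djvu/12')
# MW >= 1.40: one imageforpage API query; older wikis: HTML scraping,
# which needs bs4. The value is cached after the first access.
print(page.url_image)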
23 changes: 23 additions & 0 deletions pywikibot/site/_extensions.py
@@ -141,6 +141,29 @@ def proofread_levels(self):
         self._cache_proofreadinfo()
         return self._proofread_levels

+    @need_extension('ProofreadPage')
+    def loadpageurls(
+        self,
+        page: 'pywikibot.page.BasePage'
+    ) -> None:
+        """Load URLs from api and store in page attributes.
+
+        Load URLs to images for a given page in the "Page:" namespace.
+        No effect for pages in other namespaces.
+
+        .. seealso:: :api:`imageforpage`
+        """
+        title = page.title(with_section=False)
+        # responsiveimages: server would try to render the other images
+        # as well; let's not load the server unless needed
+        prppifpprop = 'filename|size|fullsize'
+
+        query = self._generator(api.PropertyGenerator,
+                                type_arg='imageforpage',
+                                titles=title.encode(self.encoding()),
+                                prppifpprop=prppifpprop)
+        self._update_page(page, query)
+

 class GeoDataMixin:
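End to end, the pieces introduced in this commit connect as follows. A sketch with illustrative site/page literals; the method and attribute names are the committed ones, and _imageforpage is a private attribute touched here only to show the wiring:

import pywikibot
from pywikibot.proofreadpage import ProofreadPage

site = pywikibot.Site('en', 'wikisource')
page = ProofreadPage(site, 'Page:Popular Science Monthly Volume 1.djvu/12')
# One PropertyGenerator query with type_arg='imageforpage'; the result is
# stored on the page by api.update_page() (first diff above).
site.loadpageurls(page)
info = page._imageforpage          # dict, possibly empty
print(info.get('filename'), info.get('fullsize'))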
6 changes: 2 additions & 4 deletions tests/proofreadpage_tests.py
@@ -26,7 +26,6 @@
     BasePageLoadRevisionsCachingTestBase,
     BasePageMethodsTestBase,
 )
-from tests.utils import skipping


 class TestPagesTagParser(TestCase):
@@ -250,7 +249,7 @@ class TestProofreadPageValidSite(TestCase):
         'footer': '\n{{smallrefs}}',
         'url_image': ('https://upload.wikimedia.org/wikipedia/commons/'
                       'thumb/a/ac/Popular_Science_Monthly_Volume_1.djvu/'
-                      'page12-1024px-Popular_Science_Monthly_Volume_1.djvu'
+                      'page12-2267px-Popular_Science_Monthly_Volume_1.djvu'
                       '.jpg'),
     }

@@ -412,8 +411,7 @@ def test_url_image(self):
             page.url_image

         page = ProofreadPage(self.site, self.valid_redlink['title'])
-        with skipping(ValueError, msg='T181913, T114318'):
-            self.assertEqual(page.url_image, self.valid_redlink['url_image'])
+        self.assertEqual(page.url_image, self.valid_redlink['url_image'])


 class TestPageQuality(TestCase):
