proofreadpage.py: fetch URL of page scan via API
Fetch URL of Page image using new API for MW >= 1.40:
    - query+prop=imageforpage

No more HTML page scraping is needed, except for MW version < 1.40.

This should also fix bug T181913; the tests are re-added.

Change-Id: I374e878d0b321024903be8d5194b2878355667b6
Bug: T352524
Bug: T181913
Bug: T114318
Mpaa committed Dec 3, 2023
1 parent f2aa850 commit c6136ec
Showing 5 changed files with 63 additions and 9 deletions.
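Before the per-file diffs, a sketch of the raw query this commit switches to. This is an illustration, not pywikibot code: the endpoint, the example page title, and the use of the requests library are assumptions; only prop=imageforpage, the prppifpprop values, and the imagesforpage/fullsize response keys come from the diffs below.

import requests

API = 'https://en.wikisource.org/w/api.php'  # assumed endpoint
params = {
    'action': 'query',
    'format': 'json',
    'prop': 'imageforpage',                   # new MW >= 1.40 prop
    'prppifpprop': 'filename|size|fullsize',  # same props as the diff below
    'titles': 'Page:Popular Science Monthly Volume 1.djvu/12',
}
data = requests.get(API, params=params).json()
for pagedata in data['query']['pages'].values():
    info = pagedata.get('imagesforpage') or {}  # may be empty, see T352482
    if info.get('fullsize'):
        # 'fullsize' is protocol-relative; prepend the scheme, mirroring
        # what the new ProofreadPage.url_image does.
        print('https:' + info['fullsize'])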
5 changes: 5 additions & 0 deletions pywikibot/data/api/_generators.py
@@ -1038,3 +1038,8 @@ def update_page(page, pagedict: dict, props=None):
         page._lintinfo.pop('pageid')
         page._lintinfo.pop('title')
         page._lintinfo.pop('ns')
+
+    if 'imageforpage' in props and 'imagesforpage' in pagedict:
+        # proofreadpage will always work on dicts;
+        # this also serves as a workaround for T352482
+        page._imageforpage = pagedict['imagesforpage'] or {}
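Why the `or {}` above matters: per the T352482 workaround comment, the API can hand back an empty non-dict value when a page has no scan, while the ProofreadPage code later in this commit calls .get() on the stored attribute. A minimal sketch of the normalization (the empty-list value is an assumption based on that comment):

imagesforpage = []                            # assumed empty API value (T352482)
imageforpage = imagesforpage or {}            # what update_page() stores
assert imageforpage.get('fullsize') is None   # safe lookup, no AttributeError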
2 changes: 1 addition & 1 deletion pywikibot/page/_basepage.py
@@ -73,7 +73,7 @@ class BasePage(ComparableMixin):
         '_contentmodel', '_langlinks', '_isredir', '_coords',
         '_preloadedtext', '_timestamp', '_applicable_protections',
         '_flowinfo', '_quality', '_pageprops', '_revid', '_quality_text',
-        '_pageimage', '_item', '_lintinfo',
+        '_pageimage', '_item', '_lintinfo', '_imageforpage',
     )

     def __init__(self, source, title: str = '', ns=0) -> None:
36 changes: 32 additions & 4 deletions pywikibot/proofreadpage.py
@@ -54,7 +54,7 @@
 from pywikibot.data.api import ListGenerator, Request
 from pywikibot.exceptions import Error, InvalidTitleError, OtherPageSaveError
 from pywikibot.page import PageSourceType
-from pywikibot.tools import cached
+from pywikibot.tools import MediaWikiVersion, cached


 try:
@@ -825,9 +825,7 @@ def pre_summary(self) -> str:
         """
         return f'/* {self.status} */ '

-    @property
-    @cached
-    def url_image(self) -> str:
+    def __url_image_lt_140(self) -> str:
         """Get the file url of the scan of ProofreadPage.

         :return: file url of the scan of ProofreadPage or None.
@@ -864,6 +862,36 @@ def url_image(self) -> str:

         return url_image

+    def __url_image(self) -> str:
+        """Get the file url of the scan of ProofreadPage.
+
+        :return: file url of the scan of ProofreadPage or None.
+        :raises ValueError: in case of no image found for scan
+        """
+        self.site.loadpageurls(self)
+        url = self._imageforpage.get('fullsize')
+        if url is not None:
+            return f'{self.site.family.protocol(self.site.code)}:{url}'
+        else:
+            raise ValueError(f'imagesforpage is empty for {self}.')
+
+    @property
+    @cached
+    def url_image(self) -> str:
+        """Get the file url of the scan of ProofreadPage.
+
+        :return: file url of the scan of ProofreadPage or None.
+
+        For MW version < 1.40:
+        :raises Exception: in case of http errors
+        :raises ImportError: if bs4 is not installed, _bs4_soup() will raise
+        :raises ValueError: in case of no prp_page_image src found for scan
+        """
+        if self.site.version() < MediaWikiVersion('1.40'):
+            return self.__url_image_lt_140()
+        else:
+            return self.__url_image()
+
     def _ocr_callback(self, cmd_uri: str,
                       parser_func: Optional[Callable[[str], str]] = None,
                       ocr_tool: Optional[str] = None
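The net effect of the diff above: url_image stays a cached public property and now dispatches on the MediaWiki version. A usage sketch, with the site and page title borrowed from the test fixtures below:

import pywikibot
from pywikibot.proofreadpage import ProofreadPage

site = pywikibot.Site('en', 'wikisource')
page = ProofreadPage(site, 'Page:Popular Science Monthly Volume 1.djvu/12')
# MW >= 1.40: one imageforpage API query; older wikis: HTML scraping,
# which needs bs4. The value is cached after the first access.
print(page.url_image)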
23 changes: 23 additions & 0 deletions pywikibot/site/_extensions.py
@@ -141,6 +141,29 @@ def proofread_levels(self):
         self._cache_proofreadinfo()
         return self._proofread_levels

+    @need_extension('ProofreadPage')
+    def loadpageurls(
+        self,
+        page: 'pywikibot.page.BasePage'
+    ) -> None:
+        """Load URLs from api and store in page attributes.
+
+        Load URLs to images for a given page in the "Page:" namespace.
+        No effect for pages in other namespaces.
+
+        .. seealso:: :api:`imageforpage`
+        """
+        title = page.title(with_section=False)
+        # responsiveimages: server would try to render the other images
+        # as well; let's not load the server unless needed
+        prppifpprop = 'filename|size|fullsize'
+
+        query = self._generator(api.PropertyGenerator,
+                                type_arg='imageforpage',
+                                titles=title.encode(self.encoding()),
+                                prppifpprop=prppifpprop)
+        self._update_page(page, query)
+

 class GeoDataMixin:
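End to end, the pieces introduced in this commit connect as follows. A sketch with illustrative site/page literals; the method and attribute names are the committed ones, and _imageforpage is a private attribute touched here only to show the wiring:

import pywikibot
from pywikibot.proofreadpage import ProofreadPage

site = pywikibot.Site('en', 'wikisource')
page = ProofreadPage(site, 'Page:Popular Science Monthly Volume 1.djvu/12')
# One PropertyGenerator query with type_arg='imageforpage'; the result is
# stored on the page by api.update_page() (first diff above).
site.loadpageurls(page)
info = page._imageforpage          # dict, possibly empty
print(info.get('filename'), info.get('fullsize'))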
6 changes: 2 additions & 4 deletions tests/proofreadpage_tests.py
@@ -26,7 +26,6 @@
     BasePageLoadRevisionsCachingTestBase,
     BasePageMethodsTestBase,
 )
-from tests.utils import skipping


 class TestPagesTagParser(TestCase):
@@ -250,7 +249,7 @@ class TestProofreadPageValidSite(TestCase):
         'footer': '\n{{smallrefs}}',
         'url_image': ('https://upload.wikimedia.org/wikipedia/commons/'
                       'thumb/a/ac/Popular_Science_Monthly_Volume_1.djvu/'
-                      'page12-1024px-Popular_Science_Monthly_Volume_1.djvu'
+                      'page12-2267px-Popular_Science_Monthly_Volume_1.djvu'
                       '.jpg'),
     }

@@ -412,8 +411,7 @@ def test_url_image(self):
             page.url_image

         page = ProofreadPage(self.site, self.valid_redlink['title'])
-        with skipping(ValueError, msg='T181913, T114318'):
-            self.assertEqual(page.url_image, self.valid_redlink['url_image'])
+        self.assertEqual(page.url_image, self.valid_redlink['url_image'])


 class TestPageQuality(TestCase):
