-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrawpage.py
57 lines (48 loc) · 1.7 KB
/
rawpage.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import requests
from typing import Optional
from urlerror import UrlError
class RawPage:
    """
    Class which lazily downloads a webpage and exposes its raw content.

    Nothing is fetched when the instance is created. The first access to
    ``data`` or ``headline`` performs one HTTP GET and caches both values,
    so later accesses do not hit the network again.

    Attributes:
        _url (str): The URL of the webpage.
        _timeout (float): Seconds to wait for the server before giving up.
        _headline (Optional[str]): Text after the last "/" of the URL, or
            None until the page has been fetched successfully.
        _data (Optional[str]): The decoded response body, or None until the
            page has been fetched successfully.

    Properties:
        data (str): The decoded body of the webpage.
        headline (str): The text after the last "/" of the URL.
    """

    def __init__(self, url: str, timeout: float = 10.0):
        """
        Initializes a new instance of the RawPage class.

        Args:
            url (str): The URL of the webpage.
            timeout (float): Seconds to wait for the server while connecting
                and between received bytes. Without a timeout a stalled
                server would block the caller forever.
        """
        self._url = url
        self._timeout = timeout
        self._headline: Optional[str] = None
        self._data: Optional[str] = None

    def _collect_data(self) -> None:
        """
        Downloads the webpage and caches its body and headline.

        Raises:
            UrlError: If the server answers with any status other than 200.
            requests.exceptions.RequestException: If the request itself fails
                (connection problems, timeout, ...); propagated unchanged.
        """
        response = requests.get(self._url, timeout=self._timeout)
        if response.status_code != requests.codes.ok:
            raise UrlError(self._url)
        # bytes.decode() defaults to UTF-8.
        self._data = response.content.decode()
        # The headline is whatever follows the final "/" of the URL; it is
        # empty when the URL ends with a slash.
        self._headline = self._url.split("/")[-1]

    @property
    def data(self) -> str:
        """
        If there is no data already extracted, extracts the data from the
        webpage and in both cases returns the data.

        An empty response body is a valid result and is returned as "".
        """
        if self._data is None:
            self._collect_data()
        # Internal invariant (and type narrowing): a successful fetch always
        # sets the value. Compare against None so an empty string passes.
        assert self._data is not None
        return self._data

    @property
    def headline(self) -> str:
        """
        If there is no headline already extracted, extracts the data from the
        webpage and in both cases returns the headline.

        The headline may be "" when the URL ends with a slash.
        """
        if self._headline is None:
            self._collect_data()
        # Internal invariant (and type narrowing): a successful fetch always
        # sets the value. Compare against None so an empty string passes.
        assert self._headline is not None
        return self._headline