Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Multiple Scraper Improvements #13

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 74 additions & 45 deletions risscraper/scraperallris.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"""

import collections
import datetime
import HTMLParser
import logging
Expand Down Expand Up @@ -144,12 +145,8 @@ def guess_system(self):
logging.info("Nothing to guess until now.")

def find_person(self):
find_person_url = (self.config['scraper']['base_url'] +
'kp041.asp?template=xyz&selfaction=ws&showAll=true&'
'PALFDNRM=1&kpdatfil=&filtdatum=filter&kpname=&'
'kpsonst=&kpampa=99999999&kpfr=99999999&'
'kpamfr=99999999&kpau=99999999&kpamau=99999999&'
'searchForm=true&search=Suchen')
# example: https://ksd.rostock.de/bi/kp041.asp?selfaction=ws
find_person_url = self.config['scraper']['base_url'] + 'kp041.asp?selfaction=ws'
logging.info("Getting person overview from %s", find_person_url)

"""parse an XML file and return the tree"""
Expand Down Expand Up @@ -288,6 +285,27 @@ def get_person_organization(self, person_id=None, organization_url=None):
% (self.config['scraper']['base_url'], person_id))

logging.info("Getting person organization from %s", url)
# maps name of type to form name and membership type
membership = collections.namedtuple('Membership', ('mtype', 'field'))
membership_map = {
u'Rat der Stadt' : membership('parliament', 'PALFDNR'),
u'Parlament' : membership('parliament', 'PALFDNR'),
u'Bürgerschaft' : membership('parliament', 'PALFDNR'),
u'Fraktion' : membership('organisation', 'FRLFDNR'),
u'Fraktionen': membership('parliament', 'FRLFDNR'),
u'Ausschüsse' : membership('organization', 'AULFDNR'),
u'Stadtbezirk': membership('parliament', 'PALFDNR'),
u'BVV': membership('parliament', 'PALFDNR'),
u'Bezirksparlament': membership('parliament', 'PALFDNR'),
u'Bezirksverordnetenversammlung': membership('parliament',
'PALFDNR'),
u'Ortsbeiräte': membership('organization', 'AULFDNR'),
u'Aufsichtsräte': membership('organization', 'AULFDNR'),
u'sonstige Gremien': membership('organization', 'AULFDNR'),
# At least in Rostock there can be an empty organization type.
# see: https://ksd.rostock.de/bi/kp020.asp?KPLFDNR=300&history=true
u'': membership('organization', 'AULFDNR'),
}
# Stupid re-try concept because AllRis sometimes misses start < at
# tags at first request.
try_counter = 0
Expand All @@ -296,64 +314,72 @@ def get_person_organization(self, person_id=None, organization_url=None):
response = self.get_url(url)
if not url:
return
tree = html.fromstring(response.text)
text = response.text.encode('ascii', 'xmlcharrefreplace')
tree = html.fromstring(text)

memberships = []
person = Person(originalId=person_id)
# maps name of type to form name and membership type
type_map = {
u'Rat der Stadt' : {'mtype' : 'parliament',
'field' : 'PALFDNR'},
u'Parlament' : {'mtype' : 'parliament',
'field' : 'PALFDNR'},
u'Fraktion' : {'mtype' : 'organisation',
'field' : 'FRLFDNR'},
'Fraktionen': {'mtype' : 'parliament', 'field' : 'FRLFDNR'},
u'Ausschüsse' : {'mtype' : 'organization',
'field' : 'AULFDNR'},
'Stadtbezirk': {'mtype' : 'parliament',
'field' : 'PALFDNR'},
'BVV': {'mtype' : 'parliament', 'field' : 'PALFDNR'},
'Bezirksparlament': {'mtype' : 'parliament',
'field' : 'PALFDNR'},
'Bezirksverordnetenversammlung': {'mtype' : 'parliament',
'field' : 'PALFDNR'}
}

# Different versions contain different "main" divs:
# Rostock (ALLRIS net Version 3.8.8): "rismain"
# others: "rismain_raw"
for key in ("rismain_raw", "rismain"):
# There are three tables on this page:
# Anschrift, Sonstiges, Mitarbeit
# We are interested in "Mitarbeit".
table = tree.xpath('//*[@id="%s"]/table[2]' % key)
if table:
break
# obtain the table with the membership list via a simple state machine
mtype = "parliament"
field = 'PALFDNR'
# for checking if it changes
old_group_id = None
# for checking if it changes
old_group_name = None
# might break otherwise
group_id = None
table = tree.xpath('//*[@id="rismain_raw"]/table[2]')
if len(table):
if table:
table = table[0]
mtype = None
field = None
# for checking if it changes
old_group_id = None
# for checking if it changes
old_group_name = None
# might break otherwise
group_id = None
for line in table.findall("tr"):
if line[0].tag == "th":
what = line[0].text.strip()
# This is a subtitle for the following memberships.
# Carefully look inside - maybe it is empty.
what = (line[0].text or "").strip()
field = None
field_list = None
if what in type_map:
mtype = type_map[what]['mtype']
field = type_map[what]['field']
if what in membership_map:
mtype = membership_map[what].mtype
field = membership_map[what].field
elif 'Wahlperiode' in what:
mtype = 'parliament'
# 'FRLFDNR'
field_list = ['KPLFDNR', 'AULFDNR']
elif "Auskünfte gemäß BVV" in what:
elif u"Auskünfte gemäß BVV" in what:
break
else:
logging.error("Unknown organization type %s "
"at person detail page %s",
what, person_id)
continue
else:
"""
This is a membership description consisting of
organization icon, organization name, role and
timespan.
Example:
<tr><td><form action="au020.asp" method="post" style="margin:0">
<input name="AULFDNR" value="157" type="hidden">
<input name="altoption" value="Ausschuss" type="hidden">
<input class="il1_au" value="AU" title="Ausschuss" type="submit"></form></td>
<td class="text1"><a href="au020.asp?AULFDNR=157&amp;altoption=Ausschuss">
Ausschuss für Stadt- und Regionalentwicklung, Umwelt und Ordnung</a></td>
<td class="text4">Mitglied&nbsp;</td>
<td class="text4">14.07.1999 - 13.12.1999</td><td>&nbsp;</td></tr>
"""
if "Keine Information" in line.text_content():
# skip because no content is available
# Typically "Fraktion" is undefined.
continue

# Empty line = strange stuff comes after this
Expand Down Expand Up @@ -407,25 +433,28 @@ def get_person_organization(self, person_id=None, organization_url=None):
membership.originalId = (unicode(person_id) + '-'
+ unicode(group_id))

# TODO: create a list of functions so we can
# TODO: create a list of roles so we can
# index them somehow
function = line[2].text_content()
role = line[2].text_content()
raw_date = line[3].text_content()
# parse the date information
if "seit" in raw_date:
# Example: "seit 02.07.2014"
dparts = raw_date.split()
membership.endDate = dparts[-1]
elif "Keine" in raw_date or not raw_date.strip():
# no date information available
start_date = end_date = None
else:
# Example: "14.07.1999 - 13.12.1999"
dparts = raw_date.split()
membership.startDate = dparts[0]
membership.endDate = dparts[-1]
if organization.originalId is not None:
memberships.append(membership)
else:
logging.warn("Bad organization at %s", url)
logging.warn("Bad organization (%s): %s",
url, line.text_content())

person.membership = memberships
oid = self.db.save_person(person)
Expand Down Expand Up @@ -603,7 +632,7 @@ def get_paper(self, paper_url=None, paper_id=None):
logging.warn("Paper %s in %s seems to private",
paper_id, paper_url)
return
text = response.text
text = response.text.encode('ascii', 'xmlcharrefreplace')
doc = html.fromstring(text)
data = {}

Expand Down