-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #258 from OpenUpSA/members-interest-2024
updated member's interests import procedure
- Loading branch information
Showing
2 changed files
with
312 additions
and
56 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
272 changes: 272 additions & 0 deletions
272
pombola/south_africa/data/members-interests/scraper/docx_to_html_to_json.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,272 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "LllNYwnLqjn6" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"import mammoth\n", | ||
"\n", | ||
"import re\n", | ||
"from pprint import pprint\n", | ||
"import json\n", | ||
"from bs4 import BeautifulSoup" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "_ESbMiFoQJMo" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"def docx_to_html(file_path):\n", | ||
" with open(file_path, \"rb\") as docx_file:\n", | ||
" result = mammoth.convert_to_html(docx_file)\n", | ||
" html = result.value \n", | ||
" messages = result.messages # Any warnings or errors during conversion\n", | ||
" return html\n", | ||
"\n", | ||
"docx_file_path = \"register.docx\"\n", | ||
"html_output = docx_to_html(docx_file_path)\n", | ||
"\n", | ||
"# find and delete this pattern in html_output </table><table><tr>(.*?)</tr> - this is for tables that span pages\n", | ||
"matches = re.findall(r'</table><table><tr>(.*?)</tr>', html_output)\n", | ||
"cleaned_html = re.sub(r'</table><table><tr>(.*?)</tr>', '', html_output)\n", | ||
"\n", | ||
"# replace </p><p> with a single space - This is for paragraphs in <td>\n", | ||
"cleaned_html = re.sub(r'</p><p>', ' ', cleaned_html)\n", | ||
"\n", | ||
"# Save the HTML to a file\n", | ||
"with open(\"output.html\", \"w\") as html_file:\n", | ||
" html_file.write(cleaned_html)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "tdsl7kk2mjYV" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"\n", | ||
"def split_document_by_pattern(html_content):\n", | ||
"\n", | ||
" # Split at table after TRUSTS\n", | ||
"\n", | ||
" pattern = r\"(<ul>\\s*<li>\\s*<ul>\\s*<li>\\s*<ol>\\s*<li>\\s*<strong>TRUSTS</strong>\\s*</li>\\s*</ol>\\s*</li>\\s*</ul>\\s*</li>\\s*</ul>.*?</table>)\"\n", | ||
"\n", | ||
" # Split the document using the pattern\n", | ||
" sections = re.split(pattern, html_content, flags=re.DOTALL)\n", | ||
"\n", | ||
" # Combine the sections after splitting (capturing groups leave pattern matches in the split result)\n", | ||
" combined_sections = []\n", | ||
" for i in range(0, len(sections) - 1, 2):\n", | ||
" combined_sections.append(sections[i] + sections[i + 1]) # Add the content before and including the match\n", | ||
"\n", | ||
" # Add the final leftover content if any\n", | ||
" if len(sections) % 2 != 0:\n", | ||
" combined_sections.append(sections[-1])\n", | ||
"\n", | ||
" return combined_sections\n", | ||
"\n", | ||
"with open(\"output.html\", \"r\", encoding=\"utf-8\") as file:\n", | ||
" html_data = file.read()\n", | ||
"\n", | ||
"sections = split_document_by_pattern(html_data)\n", | ||
"\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "lbZtE00n2P4g" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# Section Patterns\n", | ||
"\n", | ||
"sections_split = {\n", | ||
" \"RAW-SHARES AND OTHER FINANCIAL INTERESTS\": r\"(<ul>\\s*<li>\\s*<ul>\\s*<li>\\s*<ol>\\s*<li>\\s*<strong>SHARES AND OTHER FINANCIAL INTERESTS</strong>\\s*</li>\\s*</ol>\\s*</li>\\s*</ul>\\s*</li>\\s*</ul>.*?</table>)\",\n", | ||
" \"RAW-REMUNERATED EMPLOYMENT OR WORK OUTSIDE OF PARLIAMENT\": r\"(<ul>\\s*<li>\\s*<ul>\\s*<li>\\s*<ol>\\s*<li>\\s*<strong>REMUNERATED EMPLOYMENT OR WORK OUTSIDE OF PARLIAMENT</strong>\\s*</li>\\s*</ol>\\s*</li>\\s*</ul>\\s*</li>\\s*</ul>.*?</table>)\",\n", | ||
" \"RAW-DIRECTORSHIPS AND PARTNERSHIPS\": r\"(<ul>\\s*<li>\\s*<ul>\\s*<li>\\s*<ol>\\s*<li>\\s*<strong>DIRECTORSHIPS AND PARTNERSHIPS</strong>\\s*</li>\\s*</ol>\\s*</li>\\s*</ul>\\s*</li>\\s*</ul>.*?</table>)\",\n", | ||
" \"RAW-CONSULTANCIES AND RETAINERSHIPS\": r\"(<ul>\\s*<li>\\s*<ul>\\s*<li>\\s*<ol>\\s*<li>\\s*<strong>CONSULTANCIES AND RETAINERSHIPS</strong>\\s*</li>\\s*</ol>\\s*</li>\\s*</ul>\\s*</li>\\s*</ul>.*?</table>)\",\n", | ||
" \"RAW-SPONSORSHIPS\": r\"(<ul>\\s*<li>\\s*<ul>\\s*<li>\\s*<ol>\\s*<li>\\s*<strong>SPONSORSHIPS</strong>\\s*</li>\\s*</ol>\\s*</li>\\s*</ul>\\s*</li>\\s*</ul>.*?</table>)\",\n", | ||
" \"RAW-GIFTS AND HOSPITALITY\": r\"(<ul>\\s*<li>\\s*<ul>\\s*<li>\\s*<ol>\\s*<li>\\s*<strong>GIFTS AND HOSPITALITY</strong>\\s*</li>\\s*</ol>\\s*</li>\\s*</ul>\\s*</li>\\s*</ul>.*?</table>)\",\n", | ||
" \"RAW-BENEFITS AND INTERESTS FREE LOANS\": r\"(<ul>\\s*<li>\\s*<ul>\\s*<li>\\s*<ol>\\s*<li>\\s*<strong>BENEFITS AND INTERESTS FREE LOANS</strong>\\s*</li>\\s*</ol>\\s*</li>\\s*</ul>\\s*</li>\\s*</ul>.*?</table>)\",\n", | ||
" \"RAW-TRAVEL\": r\"(<ul>\\s*<li>\\s*<ul>\\s*<li>\\s*<ol>\\s*<li>\\s*<strong>TRAVEL</strong>\\s*</li>\\s*</ol>\\s*</li>\\s*</ul>\\s*</li>\\s*</ul>.*?</table>)\",\n", | ||
" \"RAW-OWNERSHIP IN LAND AND PROPERTY\": r\"(<ul>\\s*<li>\\s*<ul>\\s*<li>\\s*<ol>\\s*<li>\\s*<strong>OWNERSHIP IN LAND AND PROPERTY</strong>\\s*</li>\\s*</ol>\\s*</li>\\s*</ul>\\s*</li>\\s*</ul>.*?</table>)\",\n", | ||
" \"RAW-PENSIONS\": r\"(<ul>\\s*<li>\\s*<ul>\\s*<li>\\s*<ol>\\s*<li>\\s*<strong>PENSIONS</strong>\\s*</li>\\s*</ol>\\s*</li>\\s*</ul>\\s*</li>\\s*</ul>.*?</table>)\",\n", | ||
" \"RAW-RENTED PROPERTY\": r\"(<ul>\\s*<li>\\s*<ul>\\s*<li>\\s*<ol>\\s*<li>\\s*<strong>RENTED PROPERTY</strong>\\s*</li>\\s*</ol>\\s*</li>\\s*</ul>\\s*</li>\\s*</ul>.*?</table>)\",\n", | ||
" \"RAW-INCOME GENERATING ASSETS\": r\"(<ul>\\s*<li>\\s*<ul>\\s*<li>\\s*<ol>\\s*<li>\\s*<strong>INCOME GENERATING ASSETS</strong>\\s*</li>\\s*</ol>\\s*</li>\\s*</ul>\\s*</li>\\s*</ul>.*?</table>)\",\n", | ||
" \"RAW-TRUSTS\": r\"(<ul>\\s*<li>\\s*<ul>\\s*<li>\\s*<ol>\\s*<li>\\s*<strong>TRUSTS</strong>\\s*</li>\\s*</ol>\\s*</li>\\s*</ul>\\s*</li>\\s*</ul>.*?</table>)\"\n", | ||
"}\n", | ||
"\n", | ||
"def parse_table_to_json(html_table, key_name):\n", | ||
" if not isinstance(html_table, str):\n", | ||
" return\n", | ||
"\n", | ||
" soup = BeautifulSoup(html_table, \"html.parser\")\n", | ||
" rows = soup.find_all(\"tr\")\n", | ||
"\n", | ||
" # Extract headers from the first row\n", | ||
" headers = [header.get_text(strip=True) for header in rows[0].find_all(\"p\")]\n", | ||
"\n", | ||
" # Extract data from the remaining rows\n", | ||
" data = []\n", | ||
" for row in rows[1:]:\n", | ||
" values = [value.get_text(strip=True) for value in row.find_all(\"p\")]\n", | ||
" entry = {headers[i]: values[i] if i < len(values) else \"\" for i in range(len(headers))}\n", | ||
" data.append(entry)\n", | ||
"\n", | ||
" # Construct the final JSON object\n", | ||
" result = data\n", | ||
" return result\n", | ||
"\n", | ||
"def process_person(person_html, section_name):\n", | ||
"\n", | ||
"\n", | ||
" content = {}\n", | ||
"\n", | ||
"\n", | ||
"\n", | ||
"\n", | ||
" # Extract each section\n", | ||
" for key, pattern in sections_split.items():\n", | ||
" matches = re.findall(pattern, person_html)\n", | ||
" content[key] = matches[0] if matches else None\n", | ||
"\n", | ||
"\n", | ||
" for html in content:\n", | ||
" table_pattern = r\"<table.*?>(.*?)</table>\"\n", | ||
"\n", | ||
" if isinstance(content[html], str):\n", | ||
" table_contents = re.findall(table_pattern, content[html])[0]\n", | ||
" content[html] = \"<table>\" + table_contents + \"</table>\"\n", | ||
"\n", | ||
"\n", | ||
" key_name = section_name.replace(\"RAW-\", \"\")\n", | ||
" result = parse_table_to_json(content['RAW-' + key_name], key_name)\n", | ||
"\n", | ||
" return(result)\n", | ||
"\n", | ||
"\n", | ||
"\n", | ||
"\n", | ||
"\n", | ||
"people = []\n", | ||
"\n", | ||
"for person in sections:\n", | ||
" if isinstance(person, str):\n", | ||
"\n", | ||
" person_name = \"\"\n", | ||
" person_title = \"\"\n", | ||
" person_party = \"\"\n", | ||
"\n", | ||
" if re.findall(r\"<ul><li><ol><li>(.*?)</li></ol></li></ul>\", person):\n", | ||
" person_name = re.findall(r\"<ul><li><ol><li>(.*?)</li></ol></li></ul>\", person)[0]\n", | ||
"\n", | ||
" if person_name == \"<strong>SHARES AND OTHER FINANCIAL INTERESTS</strong>\":\n", | ||
" person_name = re.findall(r\"<ol><li>(.*?)<ol><li>(.*?)</li></ol></li></ol>\",person)[0][1]\n", | ||
"\n", | ||
" person_party = re.findall(r\"<p>(.*?)</p>\",person)[0] if re.findall(r\"<p>(.*?)</p>\",person) else None\n", | ||
"\n", | ||
" if person_name:\n", | ||
" parts = person_name.split(\", \")\n", | ||
" surname = parts[0].strip() # Always the first part\n", | ||
" if len(parts) > 1:\n", | ||
" person_title = parts[1].split()[0].strip() # Only the first word is the title\n", | ||
" given_names = \" \".join(parts[1].split()[1:]).strip() # Remaining words are given names\n", | ||
" person_name = f\"{given_names} {surname}\".strip()\n", | ||
"\n", | ||
" people.append({\n", | ||
" \"mp\": person_name,\n", | ||
" \"title\": person_title,\n", | ||
" \"party\": person_party,\n", | ||
" \"SHARES AND OTHER FINANCIAL INTERESTS\": process_person(person, \"RAW-SHARES AND OTHER FINANCIAL INTERESTS\"),\n", | ||
" \"REMUNERATED EMPLOYMENT OR WORK OUTSIDE OF PARLIAMENT\": process_person(person, \"RAW-REMUNERATED EMPLOYMENT OR WORK OUTSIDE OF PARLIAMENT\"),\n", | ||
" \"DIRECTORSHIPS AND PARTNERSHIPS\": process_person(person, \"RAW-DIRECTORSHIPS AND PARTNERSHIPS\"),\n", | ||
" \"CONSULTANCIES AND RETAINERSHIPS\": process_person(person, \"RAW-CONSULTANCIES AND RETAINERSHIPS\"),\n", | ||
" \"SPONSORSHIPS\": process_person(person, \"RAW-SPONSORSHIPS\"),\n", | ||
" \"GIFTS AND HOSPITALITY\": process_person(person, \"RAW-GIFTS AND HOSPITALITY\"),\n", | ||
" \"BENEFITS AND INTERESTS FREE LOANS\": process_person(person, \"RAW-BENEFITS AND INTERESTS FREE LOANS\"),\n", | ||
" \"TRAVEL\": process_person(person, \"RAW-TRAVEL\"),\n", | ||
" \"OWNERSHIP IN LAND AND PROPERTY\": process_person(person, \"RAW-OWNERSHIP IN LAND AND PROPERTY\"),\n", | ||
" \"PENSIONS\": process_person(person, \"RAW-PENSIONS\"),\n", | ||
" \"RENTED PROPERTY\": process_person(person, \"RAW-RENTED PROPERTY\"),\n", | ||
" \"INCOME GENERATING ASSETS\": process_person(person, \"RAW-INCOME GENERATING ASSETS\"),\n", | ||
" \"TRUSTS\": process_person(person, \"RAW-TRUSTS\")\n", | ||
" })\n", | ||
"\n", | ||
" else:\n", | ||
" print(f\"Skipping non-string person entry: {type(person)}\")\n", | ||
"\n", | ||
"\n", | ||
"# clean people by dumping any entry where mp = None\n", | ||
"people = [person for person in people if person['mp'] is not None]\n", | ||
"\n", | ||
"\n", | ||
"with open(\"/content/drive/MyDrive/PROJECTS/PMG/MI/output.json\", \"w\") as outfile:\n", | ||
" json.dump(people, outfile)\n", | ||
"\n", | ||
"\n", | ||
"\n", | ||
"\n", | ||
"\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"metadata": { | ||
"id": "el1f_xdV1R4q" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# Just to debug final JSON\n", | ||
"\n", | ||
"import json\n", | ||
"\n", | ||
"def load_json_as_array(file_path):\n", | ||
" try:\n", | ||
" with open(file_path, 'r') as file:\n", | ||
" data = json.load(file)\n", | ||
" if isinstance(data, list):\n", | ||
" return data\n", | ||
" else:\n", | ||
" print(f\"Warning: JSON file does not contain an array of objects. Returning the loaded data as is.\")\n", | ||
" return data\n", | ||
" except FileNotFoundError:\n", | ||
" print(f\"Error: File not found at {file_path}\")\n", | ||
" return None\n", | ||
" except json.JSONDecodeError:\n", | ||
" print(f\"Error: Invalid JSON format in {file_path}\")\n", | ||
" return None\n", | ||
"\n", | ||
"# Example usage\n", | ||
"file_path = \"output.json\" \n", | ||
"data = load_json_as_array(file_path)\n" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"colab": { | ||
"provenance": [] | ||
}, | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"name": "python" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 0 | ||
} |