diff --git a/tests/parsers/xml/test_akomantoso.py b/tests/parsers/xml/test_akomantoso.py index d3871bc..4a3796b 100644 --- a/tests/parsers/xml/test_akomantoso.py +++ b/tests/parsers/xml/test_akomantoso.py @@ -94,13 +94,13 @@ def test_get_chapters(self): self.parser.get_chapters() expected_chapters = [ - {'eId': 'chp_I', 'chapter_num': 'CHAPTER I', 'chapter_heading': 'SUBJECT MATTER, SCOPE AND DEFINITIONS'}, - {'eId': 'chp_II', 'chapter_num': 'CHAPTER II', 'chapter_heading': 'COMPARABILITY OF FEES CONNECTED WITH PAYMENT ACCOUNTS'}, - {'eId': 'chp_III', 'chapter_num': 'CHAPTER III', 'chapter_heading': 'SWITCHING'}, - {'eId': 'chp_IV', 'chapter_num': 'CHAPTER IV', 'chapter_heading': 'ACCESS TO PAYMENT ACCOUNTS'}, - {'eId': 'chp_V', 'chapter_num': 'CHAPTER V', 'chapter_heading': 'COMPETENT AUTHORITIES AND ALTERNATIVE DISPUTE RESOLUTION'}, - {'eId': 'chp_VI', 'chapter_num': 'CHAPTER VI', 'chapter_heading': 'SANCTIONS'}, - {'eId': 'chp_VII', 'chapter_num': 'CHAPTER VII', 'chapter_heading': 'FINAL PROVISIONS'} + {'eId': 'chp_I', 'num': 'CHAPTER I', 'heading': 'SUBJECT MATTER, SCOPE AND DEFINITIONS'}, + {'eId': 'chp_II', 'num': 'CHAPTER II', 'heading': 'COMPARABILITY OF FEES CONNECTED WITH PAYMENT ACCOUNTS'}, + {'eId': 'chp_III', 'num': 'CHAPTER III', 'heading': 'SWITCHING'}, + {'eId': 'chp_IV', 'num': 'CHAPTER IV', 'heading': 'ACCESS TO PAYMENT ACCOUNTS'}, + {'eId': 'chp_V', 'num': 'CHAPTER V', 'heading': 'COMPETENT AUTHORITIES AND ALTERNATIVE DISPUTE RESOLUTION'}, + {'eId': 'chp_VI', 'num': 'CHAPTER VI', 'heading': 'SANCTIONS'}, + {'eId': 'chp_VII', 'num': 'CHAPTER VII', 'heading': 'FINAL PROVISIONS'} ] self.assertEqual(self.parser.chapters, expected_chapters, "Chapters data does not match expected content") diff --git a/tests/parsers/xml/test_formex.py b/tests/parsers/xml/test_formex.py index 598607a..6ef5c45 100644 --- a/tests/parsers/xml/test_formex.py +++ b/tests/parsers/xml/test_formex.py @@ -87,12 +87,12 @@ def test_get_chapters(self): self.parser.get_chapters() 
expected_chapters = [ - {'eId': 0, 'chapter_num': 'Chapter 1', 'chapter_heading': 'General provisions'}, - {'eId': 1, 'chapter_num': 'Chapter 2', 'chapter_heading': 'European Interoperability enablers' }, - {'eId': 2, 'chapter_heading': 'Interoperable Europe support measures', 'chapter_num': 'Chapter 3'}, - {'eId': 3, 'chapter_heading': 'Governance of cross-border interoperability', 'chapter_num': 'Chapter 4'}, - {'eId': 4, 'chapter_heading': 'Interoperable Europe planning and monitoring', 'chapter_num': 'Chapter 5'}, - {'eId': 5, 'chapter_heading': 'Final provisions', 'chapter_num': 'Chapter 6'}, + {'eId': 0, 'num': 'Chapter 1', 'heading': 'General provisions'}, + {'eId': 1, 'num': 'Chapter 2', 'heading': 'European Interoperability enablers'}, + {'eId': 2, 'num': 'Chapter 3', 'heading': 'Interoperable Europe support measures'}, + {'eId': 3, 'num': 'Chapter 4', 'heading': 'Governance of cross-border interoperability'}, + {'eId': 4, 'num': 'Chapter 5', 'heading': 'Interoperable Europe planning and monitoring'}, + {'eId': 5, 'num': 'Chapter 6', 'heading': 'Final provisions'}, ] self.assertEqual(self.parser.chapters[0], expected_chapters[0], "Chapters data does not match expected content") @@ -105,16 +105,16 @@ def test_get_articles(self): expected = [ { "eId": "001", - "article_num": "Article 1", - "article_title": None, + "num": "Article 1", + "heading": None, "children": [ {"eId": 0, "text": "Annex I to Regulation (EC) No 1484/95 is replaced by the Annex to this Regulation."} ] }, { "eId": "002", - "article_num": "Article 2", - "article_title": None, + "num": "Article 2", + "heading": None, "children": [ {"eId": 0, "text": "This Regulation shall enter into force on the day of its publication in the Official Journal of the European Union."} ] diff --git a/tulit/parsers/html/cellar.py b/tulit/parsers/html/cellar.py index dac9388..cfa1edf 100644 --- a/tulit/parsers/html/cellar.py +++ b/tulit/parsers/html/cellar.py @@ -1,6 +1,7 @@ from tulit.parsers.html.xhtml import 
HTMLParser import json import re +import argparse class CellarHTMLParser(HTMLParser): def __init__(self): @@ -165,8 +166,8 @@ def get_chapters(self): chapter_title = chapter.find('div', class_="eli-title").get_text(strip=True) self.chapters.append({ 'eId': eId, - 'chapter_num': chapter_num, - 'chapter_heading': chapter_title + 'num': chapter_num, + 'heading': chapter_title }) def get_articles(self): @@ -236,8 +237,8 @@ def get_articles(self): # Store the article with its eId and subdivisions self.articles.append({ 'eId': eId, - 'article_num': article_num, - 'article_title': article_title, + 'num': article_num, + 'heading': article_title, 'children': children }) @@ -254,17 +255,17 @@ def parse(self, file): def main(): - parser = CellarHTMLParser() - file_to_parse = 'tests/data/html/c008bcb6-e7ec-11ee-9ea8-01aa75ed71a1.0006.03/DOC_1.html' + parser = argparse.ArgumentParser(description='Parse a Cellar XHTML document and output the results to a JSON file.') + parser.add_argument('--input', type=str, default='tests/data/html/c008bcb6-e7ec-11ee-9ea8-01aa75ed71a1.0006.03/DOC_1.html', help='Path to the Cellar XHTML file to parse.') + parser.add_argument('--output', type=str, default='tests/data/json/iopa_html.json', help='Path to the output JSON file.') + args = parser.parse_args() - output_file = 'tests/data/json/iopa_html.json' + html_parser = CellarHTMLParser() + html_parser.parse(args.input) - - parser.parse(file_to_parse) - - with open(output_file, 'w', encoding='utf-8') as f: + with open(args.output, 'w', encoding='utf-8') as f: # Get the parser's attributes as a dictionary - parser_dict = parser.__dict__ + parser_dict = html_parser.__dict__ # Filter out non-serializable attributes serializable_dict = {k: v for k, v in parser_dict.items() if isinstance(v, (str, int, float, bool, list, dict, type(None)))} diff --git a/tulit/parsers/xml/akomantoso.py b/tulit/parsers/xml/akomantoso.py index 459f069..80e6f47 100644 --- a/tulit/parsers/xml/akomantoso.py +++ 
b/tulit/parsers/xml/akomantoso.py @@ -1,5 +1,6 @@ from tulit.parsers.xml.xml import XMLParser import json +import argparse class AkomaNtosoParser(XMLParser): """ @@ -195,8 +196,8 @@ def get_articles(self) -> None: # Append the article data to the articles list self.articles.append({ 'eId': eId, - 'article_num': article_num_text, - 'article_title': article_title_text, + 'num': article_num_text, + 'heading': article_title_text, 'children': children }) @@ -292,21 +293,17 @@ def parse(self, file: str) -> None: return super().parse(file, schema = 'akomantoso30.xsd', format = 'Akoma Ntoso') def main(): - parser = AkomaNtosoParser() - - file_to_parse = 'tests/data/akn/eu/32014L0092.akn' - output_file = 'tests/data/json/akn.json' - - parser.parse(file_to_parse) - - - with open(output_file, 'w', encoding='utf-8') as f: + parser = argparse.ArgumentParser(description='Parse an Akoma Ntoso XML document and output the results to a JSON file.') + parser.add_argument('--input', type=str, default='tests/data/akn/eu/32014L0092.akn', help='Path to the Akoma Ntoso XML file to parse.') + parser.add_argument('--output', type=str, default='tests/data/json/akn.json', help='Path to the output JSON file.') + args = parser.parse_args() + akoma_parser = AkomaNtosoParser() + akoma_parser.parse(args.input) + with open(args.output, 'w', encoding='utf-8') as f: # Get the parser's attributes as a dictionary - parser_dict = parser.__dict__ - + parser_dict = akoma_parser.__dict__ # Filter out non-serializable attributes serializable_dict = {k: v for k, v in parser_dict.items() if isinstance(v, (str, int, float, bool, list, dict, type(None)))} - # Write to a JSON file json.dump(serializable_dict, f, ensure_ascii=False, indent=4) diff --git a/tulit/parsers/xml/formex.py b/tulit/parsers/xml/formex.py index ae77b2e..2ca2b68 100644 --- a/tulit/parsers/xml/formex.py +++ b/tulit/parsers/xml/formex.py @@ -2,6 +2,7 @@ import json from tulit.parsers.xml.xml import XMLParser +import argparse class 
Formex4Parser(XMLParser): """ @@ -196,8 +197,8 @@ def get_articles(self): self.articles.append({ "eId": article.get("IDENTIFIER"), - "article_num": article.findtext('.//TI.ART'), - "article_title": article.findtext('.//STI.ART'), + "num": article.findtext('.//TI.ART'), + "heading": article.findtext('.//STI.ART'), "children": children }) @@ -284,24 +285,24 @@ def parse(self, file): super().parse(file, schema='./formex4.xsd', format='Formex 4') def main(): - parser = Formex4Parser() - file_to_parse = 'tests/data/formex/c008bcb6-e7ec-11ee-9ea8-01aa75ed71a1.0006.02/DOC_1/L_202400903EN.000101.fmx.xml' - - output_file = 'tests/data/json/iopa.json' - + parser = argparse.ArgumentParser(description='Parse a FORMEX XML document and output the results to a JSON file.') + parser.add_argument('--input', type=str, default='tests/data/formex/c008bcb6-e7ec-11ee-9ea8-01aa75ed71a1.0006.02/DOC_1/L_202400903EN.000101.fmx.xml', help='Path to the FORMEX XML file to parse.') + parser.add_argument('--output', type=str, default='tests/data/json/iopa.json', help='Path to the output JSON file.') - parser.parse(file_to_parse) - - with open(output_file, 'w', encoding='utf-8') as f: + args = parser.parse_args() + + formex_parser = Formex4Parser() + formex_parser.parse(args.input) + + with open(args.output, 'w', encoding='utf-8') as f: # Get the parser's attributes as a dictionary - parser_dict = parser.__dict__ - + parser_dict = formex_parser.__dict__ + # Filter out non-serializable attributes serializable_dict = {k: v for k, v in parser_dict.items() if isinstance(v, (str, int, float, bool, list, dict, type(None)))} - + # Write to a JSON file json.dump(serializable_dict, f, ensure_ascii=False, indent=4) if __name__ == "__main__": main() - diff --git a/tulit/parsers/xml/xml.py b/tulit/parsers/xml/xml.py index 25b2e15..88d2129 100644 --- a/tulit/parsers/xml/xml.py +++ b/tulit/parsers/xml/xml.py @@ -397,8 +397,8 @@ def get_chapters(self, chapter_xpath: str, num_xpath: str, heading_xpath: str, e 
self.chapters.append({ 'eId': eId, - 'chapter_num': chapter_num, - 'chapter_heading': chapter_heading + 'num': chapter_num, + 'heading': chapter_heading }) def get_articles(self) -> None: