Skip to content

Commit

Permalink
Adopted uniform naming for the 'num' and 'heading' keys
Browse files Browse the repository at this point in the history
  • Loading branch information
AlessioNar committed Dec 29, 2024
1 parent aaa9c0f commit 8498ae7
Show file tree
Hide file tree
Showing 6 changed files with 58 additions and 59 deletions.
14 changes: 7 additions & 7 deletions tests/parsers/xml/test_akomantoso.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,13 +94,13 @@ def test_get_chapters(self):
self.parser.get_chapters()

expected_chapters = [
{'eId': 'chp_I', 'chapter_num': 'CHAPTER I', 'chapter_heading': 'SUBJECT MATTER, SCOPE AND DEFINITIONS'},
{'eId': 'chp_II', 'chapter_num': 'CHAPTER II', 'chapter_heading': 'COMPARABILITY OF FEES CONNECTED WITH PAYMENT ACCOUNTS'},
{'eId': 'chp_III', 'chapter_num': 'CHAPTER III', 'chapter_heading': 'SWITCHING'},
{'eId': 'chp_IV', 'chapter_num': 'CHAPTER IV', 'chapter_heading': 'ACCESS TO PAYMENT ACCOUNTS'},
{'eId': 'chp_V', 'chapter_num': 'CHAPTER V', 'chapter_heading': 'COMPETENT AUTHORITIES AND ALTERNATIVE DISPUTE RESOLUTION'},
{'eId': 'chp_VI', 'chapter_num': 'CHAPTER VI', 'chapter_heading': 'SANCTIONS'},
{'eId': 'chp_VII', 'chapter_num': 'CHAPTER VII', 'chapter_heading': 'FINAL PROVISIONS'}
{'eId': 'chp_I', 'num': 'CHAPTER I', 'heading': 'SUBJECT MATTER, SCOPE AND DEFINITIONS'},
{'eId': 'chp_II', 'num': 'CHAPTER II', 'heading': 'COMPARABILITY OF FEES CONNECTED WITH PAYMENT ACCOUNTS'},
{'eId': 'chp_III', 'num': 'CHAPTER III', 'heading': 'SWITCHING'},
{'eId': 'chp_IV', 'num': 'CHAPTER IV', 'heading': 'ACCESS TO PAYMENT ACCOUNTS'},
{'eId': 'chp_V', 'num': 'CHAPTER V', 'heading': 'COMPETENT AUTHORITIES AND ALTERNATIVE DISPUTE RESOLUTION'},
{'eId': 'chp_VI', 'num': 'CHAPTER VI', 'heading': 'SANCTIONS'},
{'eId': 'chp_VII', 'num': 'CHAPTER VII', 'heading': 'FINAL PROVISIONS'}
]
self.assertEqual(self.parser.chapters, expected_chapters, "Chapters data does not match expected content")

Expand Down
20 changes: 10 additions & 10 deletions tests/parsers/xml/test_formex.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,12 +87,12 @@ def test_get_chapters(self):
self.parser.get_chapters()

expected_chapters = [
{'eId': 0, 'chapter_num': 'Chapter 1', 'chapter_heading': 'General provisions'},
{'eId': 1, 'chapter_num': 'Chapter 2', 'chapter_heading': 'European Interoperability enablers' },
{'eId': 2, 'chapter_heading': 'Interoperable Europe support measures', 'chapter_num': 'Chapter 3'},
{'eId': 3, 'chapter_heading': 'Governance of cross-border interoperability', 'chapter_num': 'Chapter 4'},
{'eId': 4, 'chapter_heading': 'Interoperable Europe planning and monitoring', 'chapter_num': 'Chapter 5'},
{'eId': 5, 'chapter_heading': 'Final provisions', 'chapter_num': 'Chapter 6'},
{'eId': 0, 'num': 'Chapter 1', 'heading': 'General provisions'},
{'eId': 1, 'num': 'Chapter 2', 'heading': 'European Interoperability enablers'},
{'eId': 2, 'num': 'Chapter 3', 'heading': 'Interoperable Europe support measures'},
{'eId': 3, 'num': 'Chapter 4', 'heading': 'Governance of cross-border interoperability'},
{'eId': 4, 'num': 'Chapter 5', 'heading': 'Interoperable Europe planning and monitoring'},
{'eId': 5, 'num': 'Chapter 6', 'heading': 'Final provisions'},
]

self.assertEqual(self.parser.chapters[0], expected_chapters[0], "Chapters data does not match expected content")
Expand All @@ -105,16 +105,16 @@ def test_get_articles(self):
expected = [
{
"eId": "001",
"article_num": "Article 1",
"article_title": None,
"num": "Article 1",
"heading": None,
"children": [
{"eId": 0, "text": "Annex I to Regulation (EC) No 1484/95 is replaced by the Annex to this Regulation."}
]
},
{
"eId": "002",
"article_num": "Article 2",
"article_title": None,
"num": "Article 2",
"heading": None,
"children": [
{"eId": 0, "text": "This Regulation shall enter into force on the day of its publication in the Official Journal of the European Union."}
]
Expand Down
25 changes: 13 additions & 12 deletions tulit/parsers/html/cellar.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from tulit.parsers.html.xhtml import HTMLParser
import json
import re
import argparse

class CellarHTMLParser(HTMLParser):
def __init__(self):
Expand Down Expand Up @@ -165,8 +166,8 @@ def get_chapters(self):
chapter_title = chapter.find('div', class_="eli-title").get_text(strip=True)
self.chapters.append({
'eId': eId,
'chapter_num': chapter_num,
'chapter_heading': chapter_title
'num': chapter_num,
'heading': chapter_title
})

def get_articles(self):
Expand Down Expand Up @@ -236,8 +237,8 @@ def get_articles(self):
# Store the article with its eId and subdivisions
self.articles.append({
'eId': eId,
'article_num': article_num,
'article_title': article_title,
'num': article_num,
'heading': article_title,
'children': children
})

Expand All @@ -254,17 +255,17 @@ def parse(self, file):


def main():
parser = CellarHTMLParser()
file_to_parse = 'tests/data/html/c008bcb6-e7ec-11ee-9ea8-01aa75ed71a1.0006.03/DOC_1.html'
parser = argparse.ArgumentParser(description='Parse an Cellar XHTML document and output the results to a JSON file.')
parser.add_argument('--input', type=str, default='tests/data/html/c008bcb6-e7ec-11ee-9ea8-01aa75ed71a1.0006.03/DOC_1.html', help='Path to the Cellar XHTML file to parse.')
parser.add_argument('--output', type=str, default='tests/data/json/iopa_html.json', help='Path to the output JSON file.')
args = parser.parse_args()

output_file = 'tests/data/json/iopa_html.json'
html_parser = CellarHTMLParser()
html_parser.parse(args.input)


parser.parse(file_to_parse)

with open(output_file, 'w', encoding='utf-8') as f:
with open(args.output, 'w', encoding='utf-8') as f:
# Get the parser's attributes as a dictionary
parser_dict = parser.__dict__
parser_dict = html_parser.__dict__

# Filter out non-serializable attributes
serializable_dict = {k: v for k, v in parser_dict.items() if isinstance(v, (str, int, float, bool, list, dict, type(None)))}
Expand Down
25 changes: 11 additions & 14 deletions tulit/parsers/xml/akomantoso.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from tulit.parsers.xml.xml import XMLParser
import json
import argparse

class AkomaNtosoParser(XMLParser):
"""
Expand Down Expand Up @@ -195,8 +196,8 @@ def get_articles(self) -> None:
# Append the article data to the articles list
self.articles.append({
'eId': eId,
'article_num': article_num_text,
'article_title': article_title_text,
'num': article_num_text,
'heading': article_title_text,
'children': children
})

Expand Down Expand Up @@ -292,21 +293,17 @@ def parse(self, file: str) -> None:
return super().parse(file, schema = 'akomantoso30.xsd', format = 'Akoma Ntoso')

def main():
parser = AkomaNtosoParser()

file_to_parse = 'tests/data/akn/eu/32014L0092.akn'
output_file = 'tests/data/json/akn.json'

parser.parse(file_to_parse)


with open(output_file, 'w', encoding='utf-8') as f:
parser = argparse.ArgumentParser(description='Parse an Akoma Ntoso XML document and output the results to a JSON file.')
parser.add_argument('--input', type=str, default='tests/data/akn/eu/32014L0092.akn', help='Path to the Akoma Ntoso XML file to parse.')
parser.add_argument('--output', type=str, default='tests/data/json/akn.json', help='Path to the output JSON file.')
args = parser.parse_args()
akoma_parser = AkomaNtosoParser()
akoma_parser.parse(args.input)
with open(args.output, 'w', encoding='utf-8') as f:
# Get the parser's attributes as a dictionary
parser_dict = parser.__dict__

parser_dict = akoma_parser.__dict__
# Filter out non-serializable attributes
serializable_dict = {k: v for k, v in parser_dict.items() if isinstance(v, (str, int, float, bool, list, dict, type(None)))}

# Write to a JSON file
json.dump(serializable_dict, f, ensure_ascii=False, indent=4)

Expand Down
29 changes: 15 additions & 14 deletions tulit/parsers/xml/formex.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import json

from tulit.parsers.xml.xml import XMLParser
import argparse

class Formex4Parser(XMLParser):
"""
Expand Down Expand Up @@ -196,8 +197,8 @@ def get_articles(self):

self.articles.append({
"eId": article.get("IDENTIFIER"),
"article_num": article.findtext('.//TI.ART'),
"article_title": article.findtext('.//STI.ART'),
"num": article.findtext('.//TI.ART'),
"heading": article.findtext('.//STI.ART'),
"children": children
})

Expand Down Expand Up @@ -284,24 +285,24 @@ def parse(self, file):
super().parse(file, schema='./formex4.xsd', format='Formex 4')

def main():
parser = Formex4Parser()
file_to_parse = 'tests/data/formex/c008bcb6-e7ec-11ee-9ea8-01aa75ed71a1.0006.02/DOC_1/L_202400903EN.000101.fmx.xml'

output_file = 'tests/data/json/iopa.json'

parser = argparse.ArgumentParser(description='Parse a FORMEX XML document and output the results to a JSON file.')
parser.add_argument('--input', type=str, default='tests/data/formex/c008bcb6-e7ec-11ee-9ea8-01aa75ed71a1.0006.02/DOC_1/L_202400903EN.000101.fmx.xml', help='Path to the FORMEX XML file to parse.')
parser.add_argument('--output', type=str, default='tests/data/json/iopa.json', help='Path to the output JSON file.')

parser.parse(file_to_parse)

with open(output_file, 'w', encoding='utf-8') as f:
args = parser.parse_args()

formex_parser = Formex4Parser()
formex_parser.parse(args.input)

with open(args.output, 'w', encoding='utf-8') as f:
# Get the parser's attributes as a dictionary
parser_dict = parser.__dict__
parser_dict = formex_parser.__dict__

# Filter out non-serializable attributes
serializable_dict = {k: v for k, v in parser_dict.items() if isinstance(v, (str, int, float, bool, list, dict, type(None)))}

# Write to a JSON file
json.dump(serializable_dict, f, ensure_ascii=False, indent=4)

if __name__ == "__main__":
main()

4 changes: 2 additions & 2 deletions tulit/parsers/xml/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -397,8 +397,8 @@ def get_chapters(self, chapter_xpath: str, num_xpath: str, heading_xpath: str, e

self.chapters.append({
'eId': eId,
'chapter_num': chapter_num,
'chapter_heading': chapter_heading
'num': chapter_num,
'heading': chapter_heading
})

def get_articles(self) -> None:
Expand Down

0 comments on commit 8498ae7

Please sign in to comment.