Skip to content

Commit

Permalink
Adopted uniform naming for the 'num' and 'heading' keys
Browse files Browse the repository at this point in the history
  • Loading branch information
AlessioNar committed Dec 29, 2024
1 parent aaa9c0f commit 8498ae7
Show file tree
Hide file tree
Showing 6 changed files with 58 additions and 59 deletions.
14 changes: 7 additions & 7 deletions tests/parsers/xml/test_akomantoso.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,13 +94,13 @@ def test_get_chapters(self):
self.parser.get_chapters()

expected_chapters = [
{'eId': 'chp_I', 'chapter_num': 'CHAPTER I', 'chapter_heading': 'SUBJECT MATTER, SCOPE AND DEFINITIONS'},
{'eId': 'chp_II', 'chapter_num': 'CHAPTER II', 'chapter_heading': 'COMPARABILITY OF FEES CONNECTED WITH PAYMENT ACCOUNTS'},
{'eId': 'chp_III', 'chapter_num': 'CHAPTER III', 'chapter_heading': 'SWITCHING'},
{'eId': 'chp_IV', 'chapter_num': 'CHAPTER IV', 'chapter_heading': 'ACCESS TO PAYMENT ACCOUNTS'},
{'eId': 'chp_V', 'chapter_num': 'CHAPTER V', 'chapter_heading': 'COMPETENT AUTHORITIES AND ALTERNATIVE DISPUTE RESOLUTION'},
{'eId': 'chp_VI', 'chapter_num': 'CHAPTER VI', 'chapter_heading': 'SANCTIONS'},
{'eId': 'chp_VII', 'chapter_num': 'CHAPTER VII', 'chapter_heading': 'FINAL PROVISIONS'}
{'eId': 'chp_I', 'num': 'CHAPTER I', 'heading': 'SUBJECT MATTER, SCOPE AND DEFINITIONS'},
{'eId': 'chp_II', 'num': 'CHAPTER II', 'heading': 'COMPARABILITY OF FEES CONNECTED WITH PAYMENT ACCOUNTS'},
{'eId': 'chp_III', 'num': 'CHAPTER III', 'heading': 'SWITCHING'},
{'eId': 'chp_IV', 'num': 'CHAPTER IV', 'heading': 'ACCESS TO PAYMENT ACCOUNTS'},
{'eId': 'chp_V', 'num': 'CHAPTER V', 'heading': 'COMPETENT AUTHORITIES AND ALTERNATIVE DISPUTE RESOLUTION'},
{'eId': 'chp_VI', 'num': 'CHAPTER VI', 'heading': 'SANCTIONS'},
{'eId': 'chp_VII', 'num': 'CHAPTER VII', 'heading': 'FINAL PROVISIONS'}
]
self.assertEqual(self.parser.chapters, expected_chapters, "Chapters data does not match expected content")

Expand Down
20 changes: 10 additions & 10 deletions tests/parsers/xml/test_formex.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,12 +87,12 @@ def test_get_chapters(self):
self.parser.get_chapters()

expected_chapters = [
{'eId': 0, 'chapter_num': 'Chapter 1', 'chapter_heading': 'General provisions'},
{'eId': 1, 'chapter_num': 'Chapter 2', 'chapter_heading': 'European Interoperability enablers' },
{'eId': 2, 'chapter_heading': 'Interoperable Europe support measures', 'chapter_num': 'Chapter 3'},
{'eId': 3, 'chapter_heading': 'Governance of cross-border interoperability', 'chapter_num': 'Chapter 4'},
{'eId': 4, 'chapter_heading': 'Interoperable Europe planning and monitoring', 'chapter_num': 'Chapter 5'},
{'eId': 5, 'chapter_heading': 'Final provisions', 'chapter_num': 'Chapter 6'},
{'eId': 0, 'num': 'Chapter 1', 'heading': 'General provisions'},
{'eId': 1, 'num': 'Chapter 2', 'heading': 'European Interoperability enablers'},
{'eId': 2, 'num': 'Chapter 3', 'heading': 'Interoperable Europe support measures'},
{'eId': 3, 'num': 'Chapter 4', 'heading': 'Governance of cross-border interoperability'},
{'eId': 4, 'num': 'Chapter 5', 'heading': 'Interoperable Europe planning and monitoring'},
{'eId': 5, 'num': 'Chapter 6', 'heading': 'Final provisions'},
]

self.assertEqual(self.parser.chapters[0], expected_chapters[0], "Chapters data does not match expected content")
Expand All @@ -105,16 +105,16 @@ def test_get_articles(self):
expected = [
{
"eId": "001",
"article_num": "Article 1",
"article_title": None,
"num": "Article 1",
"heading": None,
"children": [
{"eId": 0, "text": "Annex I to Regulation (EC) No 1484/95 is replaced by the Annex to this Regulation."}
]
},
{
"eId": "002",
"article_num": "Article 2",
"article_title": None,
"num": "Article 2",
"heading": None,
"children": [
{"eId": 0, "text": "This Regulation shall enter into force on the day of its publication in the Official Journal of the European Union."}
]
Expand Down
25 changes: 13 additions & 12 deletions tulit/parsers/html/cellar.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from tulit.parsers.html.xhtml import HTMLParser
import json
import re
import argparse

class CellarHTMLParser(HTMLParser):
def __init__(self):
Expand Down Expand Up @@ -165,8 +166,8 @@ def get_chapters(self):
chapter_title = chapter.find('div', class_="eli-title").get_text(strip=True)
self.chapters.append({
'eId': eId,
'chapter_num': chapter_num,
'chapter_heading': chapter_title
'num': chapter_num,
'heading': chapter_title
})

def get_articles(self):
Expand Down Expand Up @@ -236,8 +237,8 @@ def get_articles(self):
# Store the article with its eId and subdivisions
self.articles.append({
'eId': eId,
'article_num': article_num,
'article_title': article_title,
'num': article_num,
'heading': article_title,
'children': children
})

Expand All @@ -254,17 +255,17 @@ def parse(self, file):


def main():
parser = CellarHTMLParser()
file_to_parse = 'tests/data/html/c008bcb6-e7ec-11ee-9ea8-01aa75ed71a1.0006.03/DOC_1.html'
parser = argparse.ArgumentParser(description='Parse an Cellar XHTML document and output the results to a JSON file.')
parser.add_argument('--input', type=str, default='tests/data/html/c008bcb6-e7ec-11ee-9ea8-01aa75ed71a1.0006.03/DOC_1.html', help='Path to the Cellar XHTML file to parse.')
parser.add_argument('--output', type=str, default='tests/data/json/iopa_html.json', help='Path to the output JSON file.')
args = parser.parse_args()

output_file = 'tests/data/json/iopa_html.json'
html_parser = CellarHTMLParser()
html_parser.parse(args.input)


parser.parse(file_to_parse)

with open(output_file, 'w', encoding='utf-8') as f:
with open(args.output, 'w', encoding='utf-8') as f:
# Get the parser's attributes as a dictionary
parser_dict = parser.__dict__
parser_dict = html_parser.__dict__

# Filter out non-serializable attributes
serializable_dict = {k: v for k, v in parser_dict.items() if isinstance(v, (str, int, float, bool, list, dict, type(None)))}
Expand Down
25 changes: 11 additions & 14 deletions tulit/parsers/xml/akomantoso.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from tulit.parsers.xml.xml import XMLParser
import json
import argparse

class AkomaNtosoParser(XMLParser):
"""
Expand Down Expand Up @@ -195,8 +196,8 @@ def get_articles(self) -> None:
# Append the article data to the articles list
self.articles.append({
'eId': eId,
'article_num': article_num_text,
'article_title': article_title_text,
'num': article_num_text,
'heading': article_title_text,
'children': children
})

Expand Down Expand Up @@ -292,21 +293,17 @@ def parse(self, file: str) -> None:
return super().parse(file, schema = 'akomantoso30.xsd', format = 'Akoma Ntoso')

def main():
parser = AkomaNtosoParser()

file_to_parse = 'tests/data/akn/eu/32014L0092.akn'
output_file = 'tests/data/json/akn.json'

parser.parse(file_to_parse)


with open(output_file, 'w', encoding='utf-8') as f:
parser = argparse.ArgumentParser(description='Parse an Akoma Ntoso XML document and output the results to a JSON file.')
parser.add_argument('--input', type=str, default='tests/data/akn/eu/32014L0092.akn', help='Path to the Akoma Ntoso XML file to parse.')
parser.add_argument('--output', type=str, default='tests/data/json/akn.json', help='Path to the output JSON file.')
args = parser.parse_args()
akoma_parser = AkomaNtosoParser()
akoma_parser.parse(args.input)
with open(args.output, 'w', encoding='utf-8') as f:
# Get the parser's attributes as a dictionary
parser_dict = parser.__dict__

parser_dict = akoma_parser.__dict__
# Filter out non-serializable attributes
serializable_dict = {k: v for k, v in parser_dict.items() if isinstance(v, (str, int, float, bool, list, dict, type(None)))}

# Write to a JSON file
json.dump(serializable_dict, f, ensure_ascii=False, indent=4)

Expand Down
29 changes: 15 additions & 14 deletions tulit/parsers/xml/formex.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import json

from tulit.parsers.xml.xml import XMLParser
import argparse

class Formex4Parser(XMLParser):
"""
Expand Down Expand Up @@ -196,8 +197,8 @@ def get_articles(self):

self.articles.append({
"eId": article.get("IDENTIFIER"),
"article_num": article.findtext('.//TI.ART'),
"article_title": article.findtext('.//STI.ART'),
"num": article.findtext('.//TI.ART'),
"heading": article.findtext('.//STI.ART'),
"children": children
})

Expand Down Expand Up @@ -284,24 +285,24 @@ def parse(self, file):
super().parse(file, schema='./formex4.xsd', format='Formex 4')

def main():
parser = Formex4Parser()
file_to_parse = 'tests/data/formex/c008bcb6-e7ec-11ee-9ea8-01aa75ed71a1.0006.02/DOC_1/L_202400903EN.000101.fmx.xml'

output_file = 'tests/data/json/iopa.json'

parser = argparse.ArgumentParser(description='Parse a FORMEX XML document and output the results to a JSON file.')
parser.add_argument('--input', type=str, default='tests/data/formex/c008bcb6-e7ec-11ee-9ea8-01aa75ed71a1.0006.02/DOC_1/L_202400903EN.000101.fmx.xml', help='Path to the FORMEX XML file to parse.')
parser.add_argument('--output', type=str, default='tests/data/json/iopa.json', help='Path to the output JSON file.')

parser.parse(file_to_parse)

with open(output_file, 'w', encoding='utf-8') as f:
args = parser.parse_args()

formex_parser = Formex4Parser()
formex_parser.parse(args.input)

with open(args.output, 'w', encoding='utf-8') as f:
# Get the parser's attributes as a dictionary
parser_dict = parser.__dict__
parser_dict = formex_parser.__dict__

# Filter out non-serializable attributes
serializable_dict = {k: v for k, v in parser_dict.items() if isinstance(v, (str, int, float, bool, list, dict, type(None)))}

# Write to a JSON file
json.dump(serializable_dict, f, ensure_ascii=False, indent=4)

if __name__ == "__main__":
main()

4 changes: 2 additions & 2 deletions tulit/parsers/xml/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -397,8 +397,8 @@ def get_chapters(self, chapter_xpath: str, num_xpath: str, heading_xpath: str, e

self.chapters.append({
'eId': eId,
'chapter_num': chapter_num,
'chapter_heading': chapter_heading
'num': chapter_num,
'heading': chapter_heading
})

def get_articles(self) -> None:
Expand Down

0 comments on commit 8498ae7

Please sign in to comment.