diff --git a/tests/parsers/test_akomantoso.py b/tests/parsers/test_akomantoso.py index b213353..7565e27 100644 --- a/tests/parsers/test_akomantoso.py +++ b/tests/parsers/test_akomantoso.py @@ -60,6 +60,8 @@ def test_get_preamble(self): def test_get_formula(self): """Test extraction of formula text within the preamble.""" + self.parser.get_preamble(preamble_xpath='.//akn:preamble', notes_xpath='.//akn:authorialNote') + formula_data = self.parser.get_formula() self.assertIn("THE EUROPEAN PARLIAMENT AND THE COUNCIL OF THE EUROPEAN UNION", formula_data) @@ -107,7 +109,7 @@ def test_get_body(self): def test_get_chapters(self): """Test retrieval and content of chapter headings.""" self.parser.get_body(body_xpath='.//akn:body') - self.parser.get_chapters(chapter_xpath='.//akn:chapter', num_xpath='.//akn:num', heading_xpath='.//akn:heading') + self.parser.get_chapters() expected_chapters = [ {'eId': 'chp_I', 'chapter_num': 'CHAPTER I', 'chapter_heading': 'SUBJECT MATTER, SCOPE AND DEFINITIONS'}, diff --git a/tulit/parsers/akomantoso.py b/tulit/parsers/akomantoso.py index 2772f16..de3c0ac 100644 --- a/tulit/parsers/akomantoso.py +++ b/tulit/parsers/akomantoso.py @@ -231,7 +231,7 @@ def get_formula(self): Concatenated text from all paragraphs within the formula element. Returns None if no formula is found. """ - formula = self.root.find('.//akn:preamble/akn:formula', namespaces=self.namespaces) + formula = self.preamble.find('.//akn:formula', namespaces=self.namespaces) if formula is None: return None @@ -305,18 +305,9 @@ def get_act(self) -> None: # Fallback: try without namespace self.act = self.root.find('.//act') - def get_chapters(self, chapter_xpath, num_xpath, heading_xpath) -> None: + def get_chapters(self) -> None: """ Extracts chapter information from the document. - - Parameters - ---------- - chapter_xpath : str - XPath expression to locate the chapter elements. - num_xpath : str - XPath expression to locate the chapter number within each chapter element. - heading_xpath : str - XPath expression to locate the chapter heading within each chapter element. Returns ------- @@ -325,19 +316,16 @@ def get_chapters(self, chapter_xpath, num_xpath, heading_xpath) -> None: - 'eId': Chapter identifier - 'chapter_num': Chapter number - 'chapter_heading': Chapter heading text - """ - # Find all elements in the body - for chapter in self.body.findall(chapter_xpath, namespaces=self.namespaces): - eId = chapter.get('eId') - chapter_num = chapter.find(num_xpath, namespaces=self.namespaces) - chapter_heading = chapter.find(heading_xpath, namespaces=self.namespaces) - - # Add chapter data to chapters list - self.chapters.append({ - 'eId': eId, - 'chapter_num': chapter_num.text if chapter_num is not None else None, - 'chapter_heading': ''.join(chapter_heading.itertext()).strip() if chapter_heading is not None else None - }) + """ + def extract_eId(chapter, index): + return chapter.get('eId') + + return super().get_chapters( + chapter_xpath='.//akn:chapter', + num_xpath='.//akn:num', + heading_xpath='.//akn:heading', + extract_eId=extract_eId + ) def get_articles(self) -> None: diff --git a/tulit/parsers/formex.py b/tulit/parsers/formex.py index 2319265..16d65b1 100644 --- a/tulit/parsers/formex.py +++ b/tulit/parsers/formex.py @@ -153,7 +153,6 @@ def get_chapters(self) -> None: if len(chapter.findall('.//HT')) > 1: chapter_heading = chapter.findall('.//HT')[1] self.chapters.append({ - "eId": index, "chapter_num" : "".join(chapter_num.itertext()).strip(), "chapter_heading": "".join(chapter_heading.itertext()).strip() diff --git a/tulit/parsers/parser.py b/tulit/parsers/parser.py index caa1c77..5a32bbf 100644 --- a/tulit/parsers/parser.py +++ b/tulit/parsers/parser.py @@ -351,6 +351,42 @@ def get_body(self, body_xpath) -> None: # Fallback: try without namespace self.body = self.root.find(body_xpath) + def get_chapters(self, chapter_xpath: str, num_xpath: str, heading_xpath: str, extract_eId=None) -> None: + """ + Extracts chapter information from the document. + + Parameters + ---------- + chapter_xpath : str + XPath expression to locate the chapter elements. + num_xpath : str + XPath expression to locate the chapter number within each chapter element. + heading_xpath : str + XPath expression to locate the chapter heading within each chapter element. + extract_eId : function, optional + Function to handle the extraction or generation of eId. + + Returns + ------- + list + List of dictionaries containing chapter data with keys: + - 'eId': Chapter identifier + - 'chapter_num': Chapter number + - 'chapter_heading': Chapter heading text + """ + self.chapters = [] + chapters = self.body.findall(chapter_xpath, namespaces=self.namespaces) + for index, chapter in enumerate(chapters): + eId = extract_eId(chapter, index) if extract_eId else index + chapter_num = chapter.find(num_xpath, namespaces=self.namespaces) + chapter_heading = chapter.find(heading_xpath, namespaces=self.namespaces) + + self.chapters.append({ + 'eId': eId, + 'chapter_num': chapter_num.text if chapter_num is not None else None, + 'chapter_heading': ''.join(chapter_heading.itertext()).strip() if chapter_heading is not None else None + }) + @abstractmethod def parse(self): """