diff --git a/Readme.md b/Readme.md
index 464cc97..14583e9 100644
--- a/Readme.md
+++ b/Readme.md
@@ -1,4 +1,4 @@
-# cic-beautify-state-codes
+# cic-beautify-state-codes-framework
 
 Welcome to the Code Improvement Commission
 
@@ -8,15 +8,14 @@ This repository contains software that transforms official codes from ugly .rtf
 
 Currently this code supports the following states:
 
-1. ###Georgia (GA):
+1. ###Alaska (AK):
 
-   **Code repo:** https://github.com/UniCourt/cic-code-ga
+   **Code repo:** https://github.com/UniCourt/cic-code-ak
 
-   **Code pages:** https://unicourt.github.io/cic-code-ga
+   **Code pages:** https://unicourt.github.io/cic-code-ak
 
-   **Original RTF:** https://archive.org/download/gov.ga.ocga.2018
+   **Original RTF:** https://archive.org/download/gov.ak.code
-
 
 2. ###Arkansas (AR):
 
@@ -27,24 +26,34 @@ Currently this code supports the following states:
 
    **Original RTF:** https://archive.org/download/gov.ar.code
 
-3. ###Mississippi (MS):
+3. ###Colorado (CO):
 
-   **Code repo:** https://github.com/UniCourt/cic-code-ms
+   **Code repo:** https://github.com/UniCourt/cic-code-co
 
-   **Code pages:** https://unicourt.github.io/cic-code-ms
+   **Code pages:** https://unicourt.github.io/cic-code-co
 
-   **Original RTF:** https://archive.org/download/gov.ms.code.ann.2018
+   **Original RTF:** https://archive.org/download/gov.co.crs.bulk
+
+
+4. ###Georgia (GA):
+
+   **Code repo:** https://github.com/UniCourt/cic-code-ga
+
+   **Code pages:** https://unicourt.github.io/cic-code-ga
+
+   **Original RTF:** https://archive.org/download/gov.ga.ocga.2018
 
-4. ###Tennessee (TN):
+5. ###Idaho (ID):
 
-   **Code repo:** https://github.com/UniCourt/cic-code-tn
+   **Code repo:** https://github.com/UniCourt/cic-code-id
 
-   **Code pages:** https://unicourt.github.io/cic-code-tn
+   **Code pages:** https://unicourt.github.io/cic-code-id
 
-   **Original RTF:** https://archive.org/details/gov.tn.tca
+   **Original files can be found here:** https://archive.org/details/govlaw?and%5B%5D=subject%3A%22idaho.gov%22+AND+subject%3A%222020+Code%22&sin=&sort=titleSorter
 
-5. ###Kentucky (KY):
+
+6. ###Kentucky (KY):
 
    **Code repo:** https://github.com/UniCourt/cic-code-ky
 
@@ -52,42 +61,62 @@ Currently this code supports the following states:
 
    **Original RTF:** https://archive.org/details/gov.ky.code
 
-6. ###Colorado (CO):
+
+7. ###Mississippi (MS):
 
-   **Code repo:** https://github.com/UniCourt/cic-code-co
+   **Code repo:** https://github.com/UniCourt/cic-code-ms
 
-   **Code pages:** https://unicourt.github.io/cic-code-co
+   **Code pages:** https://unicourt.github.io/cic-code-ms
 
-   **Original RTF:** https://archive.org/download/gov.co.crs.bulk
+   **Original RTF:** https://archive.org/download/gov.ms.code.ann.2018
 
-7. ###Idaho (ID):
+8. ###North Carolina (NC):
 
-   **Code repo:** https://github.com/UniCourt/cic-code-id
+   **Code repo:** https://github.com/UniCourt/cic-code-nc
 
-   **Code pages:** https://unicourt.github.io/cic-code-id
+   **Code pages:** https://unicourt.github.io/cic-code-nc
 
-   **Original files can be found here:** https://archive.org/details/govlaw?and%5B%5D=subject%3A%22idaho.gov%22+AND+subject%3A%222020+Code%22&sin=&sort=titleSorter
+   **Original RTF:** https://archive.org/download/gov.nc.code
 
-8. ###Virginia (VA):
+9. ###North Dakota (ND):
 
-   **Code repo:** https://github.com/UniCourt/cic-code-va
-
-   **Code pages:** https://unicourt.github.io/cic-code-va
-
-   **Original RTF:** https://archive.org/download/gov.va.code/
+   **Code repo:** https://github.com/UniCourt/cic-code-nd
+
+   **Code pages:** https://unicourt.github.io/cic-code-nd
+
+   **Original RTF:** https://archive.org/details/gov.nd.code
 
-9. ###Vermont (VT):
+10. ###Tennessee (TN):
 
-   **Code repo:** https://github.com/UniCourt/cic-code-vt
+   **Code repo:** https://github.com/UniCourt/cic-code-tn
+
+   **Code pages:** https://unicourt.github.io/cic-code-tn
+
+   **Original RTF:** https://archive.org/details/gov.tn.tca
+
+
+11. ###Vermont (VT):
 
-   **Code pages:** https://unicourt.github.io/cic-code-vt
+   **Code repo:** https://github.com/UniCourt/cic-code-vt
+
+   **Code pages:** https://unicourt.github.io/cic-code-vt
+
+   **Original RTF:** https://archive.org/download/gov.vt.code
+
+
+12. ###Virginia (VA):
 
-   **Original RTF:** https://archive.org/download/gov.vt.code
+   **Code repo:** https://github.com/UniCourt/cic-code-va
+
+   **Code pages:** https://unicourt.github.io/cic-code-va
+
+   **Original RTF:** https://archive.org/download/gov.va.code/
 
-10. ###Wyoming (WY):
+
+13. ###Wyoming (WY):
 
    **Code repo:** https://github.com/UniCourt/cic-code-wy
 
@@ -98,8 +127,10 @@ Currently this code supports the following states:
 
 In subsequent months, we intend to add the following features:
 
-1. Extend the code to handle the official codes Colorado and Idaho.
-2. Add a "redline" capability to show diffs.
+1. Extend the code to handle the official codes of Rhode Island and other states.
+2. Add a "redline" capability to show diffs.
+3. Add citations to external links.
+
 
 **REQUIREMENTS AND INSTALLATION**
 
@@ -127,10 +158,9 @@ In subsequent months, we intend to add the following features:
 
    │   │   file012.py
    |   └───transforms
-   │       └───ga
-   │           └───ocga
-   │               └───raw
-   │                       title_01.html
+   │       └───co
+   │           └───occo
+   │               └───title_01.html
 
 5. Python 3.8 should be installed in the development environment to run this project
 
@@ -139,12 +169,25 @@ In subsequent months, we intend to add the following features:
 
 **Usage:**
 
     python html_parser/html_parse_runner.py
-    [--state_key (GA)]
-
-    [--release_label (Release-75)]
-
-    [--release_date (DD-MM-YYYY)]
-
-    [--input_file_name (gov.ga.ocga.title.01.html) This is an optional argument,
-
-    if this argument is not passed all the files for provided release label will be parsed]
+    [--state_key (CO)]
+
+    [--path This argument can take three different forms:
+         to run a single file : (/co/occo/r80/gov.co.code.title.01.html)
+         to run all files from a particular release : (/co/occo/r80/)
+         to run all releases of a particular state : (/co/occo/) ]
+
+    [--run_after_release (83) This is an optional argument; it helps to run all releases after the mentioned release]
+
+
+**Additional required files:**
+
+    Release_dates.txt :
+        This is a file where all states' release dates are stored in the format <state_key>_r<release_number> <date>
+        eg: [CO_r71 2020.08.01]
+
+**Implementation of Child class:**
+
+    Child class name format : <state_key>_html_parser  eg: co_html_parser
+    Mandatory functions in child :
+        pre_process :
+        convert_paragraph_to_alphabetical_ol_tags
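The Usage and child-class sections above describe the runner's arguments and the parser contract; the examples below illustrate both. First, sample invocations under the new `--path` argument (the state key, release number, and file name here are illustrative, not fixed values):

    python html_parser/html_parse_runner.py --state_key CO --path /co/occo/r80/gov.co.code.title.01.html
    python html_parser/html_parse_runner.py --state_key CO --path /co/occo/ --run_after_release 83

Second, a minimal child-parser sketch consistent with the naming and mandatory-function rules above (the class name, constructor, and docstrings are illustrative; the repository's real parsers also carry state-specific regex tables and a BeautifulSoup tree):

    # co_html_parser.py - a minimal sketch, not the repository's actual API
    class CoParseHtml:
        def __init__(self, input_file_name):
            self.html_file_name = input_file_name

        def pre_process(self):
            """State-specific setup: tag-class regexes, junk-tag cleanup, etc."""

        def convert_paragraph_to_alphabetical_ol_tags(self):
            """Wrap (a)/(1)/(A)/(i)-style paragraphs into nested <ol> tags."""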
diff --git a/html_parser/ak_html_parser.py b/html_parser/ak_html_parser.py
index 95e25c8..b8a02d6 100644
--- a/html_parser/ak_html_parser.py
+++ b/html_parser/ak_html_parser.py
@@ -819,9 +819,9 @@ def clean_html_and_add_cite(self):
             title_id = id_reg.group("title").strip().zfill(2)
 
             if os.path.isfile(
-                    f"../../code-ak/transforms/ak/ocak/r{self.release_number}/gov.ak.code.title.{title_id}.html"):
+                    f"/home/mis/PycharmProjects/cic-code-ak-1/transforms/ak/ocak/r{self.release_number}/gov.ak.code.title.{title_id}.html"):
                 with open(
-                        f"../../code-ak/transforms/ak/ocak/r{self.release_number}/gov.ak.code.title.{title_id}.html",
+                        f"/home/mis/PycharmProjects/cic-code-ak-1/transforms/ak/ocak/r{self.release_number}/gov.ak.code.title.{title_id}.html",
                         'r') as firstfile:
 
                     for line in firstfile:
@@ -843,7 +843,7 @@ def clean_html_and_add_cite(self):
                         tag.clear()
                         text = re.sub(fr'\s{re.escape(match)}',
-                                      f' {match}',
+                                      f' {match}',
                                       inside_text, re.I)
                         tag.append(text)
@@ -863,7 +863,7 @@ def clean_html_and_add_cite(self):
                         tag.clear()
                         text = re.sub(fr'\s{re.escape(match)}',
-                                      f' {match}',
+                                      f' {match}',
                                       inside_text, re.I)
                         tag.append(text)
@@ -937,8 +937,13 @@ def write_soup_to_file(self):
             soup_str = re.sub(rf'{tag}', rf'{cleansed_tag}', soup_str, re.I)
 
         print("validating")
-        with open(f"../../code-ak/transforms/ak/ocak/r{self.release_number}/{self.html_file_name}", "w") as file:
+        with open(f"../../cic-code-ak-1/transforms/ak/ocak/r{self.release_number}/{self.html_file_name}", "w") as file:
+            # soup_str = re.sub(r'&(?!amp;)', '&amp;', soup_str)
+            # file.write(soup_str)
+            soup_str = re.sub(r'&(?!amp;)', '&amp;', soup_str)
+            soup_str = re.sub('', '', soup_str)
+            soup_str = re.sub(r'\s*', '', soup_str)
             file.write(soup_str)
 
     def create_Notes_to_Decisions_ul_con(self):
diff --git a/html_parser/ar_html_parser.py b/html_parser/ar_html_parser.py
index 819c1d9..7b19ddd 100644
--- a/html_parser/ar_html_parser.py
+++ b/html_parser/ar_html_parser.py
@@ -144,13 +144,7 @@ def replace_tags(self):
         for key, value in tag_dict.items():
             ul = self.soup.new_tag("ul", Class="leaders")
             while True:
-
-
                 p_tag = self.soup.find('p', {"class": key})
-
-
                 if not p_tag or p_tag.has_attr('Class') and p_tag['Class'] == 'transformation':
                     break
                 p_tag.name = value
@@ -1406,9 +1400,6 @@ def create_case_notes_nav_tag(self):
                     header['id'] = header_id.strip('#')
             nav_tag.append(new_ul)
             case_notes_nav.replace_with(nav_tag)
-
-
-
         print('created analysis tag')
diff --git a/html_parser/co_html_parser.py b/html_parser/co_html_parser.py
index 4da8e33..9028b67 100644
--- a/html_parser/co_html_parser.py
+++ b/html_parser/co_html_parser.py
@@ -401,7 +401,7 @@ def replace_tags(self):
 
 
 
-            elif re.search(r'^[A-HJ-UW-Z]\.\s[A-Z][a-z]+', header_tag.text.strip()):
+            elif re.search(r'^[A-HJ-UW-Z]\.\s"?[A-Z][a-z]+', header_tag.text.strip()):
                 header_tag.name = "h5"
                 prev_id = header_tag.find_previous(lambda tag: tag.name in ['h5'] and re.search(r'^[IVX]+\.', tag.text.strip())).get(
@@ -494,6 +494,12 @@ def replace_tags(self):
                 header_tag.append(new_ul_tag)
                 header_tag.unwrap()
 
+        stylesheet_link_tag = self.soup.new_tag('link')
+        stylesheet_link_tag.attrs = {'rel': 'stylesheet', 'type': 'text/css',
+                                     'href': 'https://unicourt.github.io/cic-code-ga/transforms/ga/stylesheet/ga_code_stylesheet.css'}
+        self.soup.style.replace_with(stylesheet_link_tag)
+        self.meta_tags.append(stylesheet_link_tag)
+
     def recreate_tag(self):
         ol_list = []
         num_ol_tag = self.soup.new_tag("ol")
@@ -764,17 +770,17 @@ def recreate_tag(self):
 
         print("tags are recreated")
 
-    def convert_roman_to_digit(self, roman):
+    def convert_roman_to_digit(self, roman_digit):
         value = {'M': 1000, 'D': 500, 'C': 100, 'L': 50, 'X': 10, 'V': 5, 'I': 1}
         prev = 0
         ans = 0
-        length = len(roman)
+        length = len(roman_digit)
         for num in range(length - 1, -1, -1):
-            if value[roman[num]] >= prev:
-                ans += value[roman[num]]
+            if value[roman_digit[num]] >= prev:
+                ans += value[roman_digit[num]]
             else:
-                ans -= value[roman[num]]
-            prev = value[roman[num]]
+                ans -= value[roman_digit[num]]
+            prev = value[roman_digit[num]]
         return ans
 
     # def convert_paragraph_to_alphabetical_ol_tags2(self):
@@ -1291,7 +1297,6 @@ def convert_paragraph_to_alphabetical_ol_tags2(self):
         inr_cap_alpha_cur_tag = None
         alpha_cur_tag = None
 
-
         for p_tag in self.soup.find_all():
             if p_tag.b:
                 p_tag.b.unwrap()
@@ -1301,6 +1306,7 @@ def convert_paragraph_to_alphabetical_ol_tags2(self):
                 p_tag.name = "li"
                 cap_alpha = 'A'
                 num_cur_tag = p_tag
+                prev_alpha = p_tag
                 if re.search(r'^\(1\)', current_tag_text):
                     if new_alpha:
                         num_ol = self.soup.new_tag("ol")
@@ -1522,8 +1528,7 @@ def convert_paragraph_to_alphabetical_ol_tags2(self):
                         new_num = li_tag
                         cur_tag = re.search(r'^\((?P<cid>\w+)\)\s*\((?P<pid>\d+)\)', current_tag_text)
                         prev_rom_id = f'{prev_num_id}{cur_tag.group("cid")}{cur_tag.group("pid")}'
-
-                        li_tag["id"] = f'{new_alpha.get("id")}{cur_tag.group("pid")}'
+                        li_tag["id"] = f'{alpha_cur_tag.get("id")}{cur_tag.group("pid")}'
                         num_ol.append(li_tag)
                         p_tag.contents = []
                         p_tag.append(num_ol)
@@ -1740,7 +1745,7 @@ def convert_paragraph_to_alphabetical_ol_tags2(self):
 
                 # num_count += 1
 
-            elif re.search(rf'^{inr_cap_alpha}\.', current_tag_text):
+            elif re.search(rf'^{inr_cap_alpha}\.', current_tag_text) and
p_tag.name=="p": p_tag.name = "li" inr_cap_alpha_cur_tag = p_tag num_count = 1 @@ -1829,7 +1834,7 @@ def convert_paragraph_to_alphabetical_ol_tags2(self): roman_count = 1 p_tag.string = re.sub(r'^\([a-z]{2,3}\)', '', current_tag_text) - if re.search(r'^Source|^Cross references:|^OFFICIAL COMMENT|^ARTICLE [IVX]+', + if re.search(r'^Source|^Cross references:|^OFFICIAL COMMENT|^(ARTICLE|Article) [IVX]+', current_tag_text, re.I) or p_tag.name in ['h3', 'h4']: ol_head = 1 @@ -1843,7 +1848,7 @@ def convert_paragraph_to_alphabetical_ol_tags2(self): main_sec_alpha = 'a' sec_alpha = 'a' - if re.search(r'^ARTICLE [IVX]+', current_tag_text,re.I): + if re.search(r'^(ARTICLE|Article) [IVX]+', current_tag_text,re.I): ol_count += 1 print('ol tags added') @@ -2112,7 +2117,6 @@ def create_chapter_section_nav(self): sub_tag = "-" self.set_chapter_section_nav(li_tag, chap_num, sub_tag, prev_id, None) - # create div tags def create_and_wrap_with_div_tag(self): """ - for each h2 in html @@ -2224,7 +2228,7 @@ def create_case_note_nav(self): nav_link = self.soup.new_tag('a') nav_link.append(case_tag.text) case_id = re.search(r'^(?P[A-Z])\.', case_tag.text.strip()).group("cid") - print(case_tag) + alpha_id = f"{rom_id}-{case_id}" nav_link["href"] = f"#{rom_id}-{case_id}" nav_list.append(nav_link) @@ -2243,7 +2247,6 @@ def create_case_note_nav(self): def create_case_note_ul(self): for case_tag in self.soup.find_all(class_=self.class_regex['ol']): - if case_tag.a: case_tag.name = "li" if re.search(r'^[IVX]+\.\s[A-Z]+', case_tag.a.text.strip()): @@ -2272,484 +2275,6 @@ def create_case_note_ul(self): else: digit_ul.append(case_tag) - def create_citation(self, t_id, c_id, s_id, p_id): - - title01 = { - 'GENERAL, PRIMARY, RECALL, AND CONGRESSIONAL VACANCY ELECTIONS': ['1', '1.5', '2', '3', '4', '5', '5.5', - '6', '7', '7.5', '8', '8.3', '8.5', '9', - '10', '10.5', '11', '12', '12', '13', - '13.5', '14', '15', '16', '17'], - 'OTHER ELECTION OFFENSES': ['30'], 'INITIATIVE AND REFERENDUM': ['40'], 'ODD-YEAR ELECTIONS': ['41'], - 'ELECTION CAMPAIGN REGULATIONS': ['45']} - - title02 = {'CONGRESSIONAL DISTRICTS': ['1'], 'GENERAL ASSEMBLY': ['2'], 'LEGISLATIVE SERVICES': ['3'], - 'STATUTES - CONSTRUCTION AND REVISION': ['4', '5'], 'MISCELLANEOUS': ['6', '7']} - - title03 = {'JURISDICTION': ['1', '2', '3']} - - title05 = {'CONSUMER CREDIT CODE': ['1', '2', '3', '3.1', '3.5', '3.7', '4', '5', '6', '7', '9'], - 'REFUND ANTICIPATION LOANS': ['9.5'], 'RENTAL PURCHASE': ['10'], 'INTEREST RATES': ['12', '13'], - 'DEBT MANAGEMENT': ['16', '17', '18', '19', '20']} - - title06 = {'FAIR TRADE AND RESTRAINT OF TRADE': ['1', '2', '2.5', '3', '4', '5', '6', '6.5'], - 'ENERGY AND WATER CONSERVATION': ['7', '7.5'], - 'AGRICULTURAL ASSISTANCE': ['8', '9'], 'ASSIGNMENTS IN GENERAL': ['10'], - 'PATENTS - PROHIBITED COMMUNICATION': ['12'], - 'ENFORCEMENT OF NONDRAMATIC MUSIC COPYRIGHTS': ['13'], 'ART TRANSACTIONS': ['15'], - 'CHARITABLE SOLICITATIONS': ['16'], - 'RECORDS RETENTION': ['17'], 'HEALTH CARE COVERAGE COOPERATIVES': ['18'], - 'TRANSACTIONS INVOLVING LICENSED HOSPITALS': ['19'], - 'HOSPITAL DISCLOSURES TO CONSUMERS': ['20'], - 'PROTECTION AGAINST EXPLOITATION OF AT-RISK ADULTS': ['21'], - 'RESIDENTIAL ROOFING SERVICES': ['22'], 'DIRECT PRIMARY HEALTH CARE': ['23'], 'CEMETERIES': ['24'], - 'PUBLIC ESTABLISHMENTS': ['25'], 'INTERNET SERVICE PROVIDERS': ['26']} - title07 = {'Colorado Corporation Code': ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10'], - 'Nonprofit Corporation': ['20', '21', '22', '23', '24', '25', '26', '27', '28', 
'29', '30'], - 'Special Purpose Corporations': ['40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '49.5'], - 'Religious and Benevolent Organizations': ['50', '51', '52'], - 'ASSOCIATIONS': ['55', '56', '57', '58'], - 'PARTNERSHIPS': ['60', '61', '62', '63', '64'], - 'TRADEMARKS AND BUSINESS NAMES': ['70', '71', '72', '73'], - 'TRADE SECRETS': ['74'], 'LIMITED LIABILITY COMPANIES': ['80'], - 'CORPORATIONS AND ASSOCIATIONS': ['90'], - 'Colorado Business Corporations': ['101', '102', '103', '104', '105', '106', '107', '108', '109', - '110', '111', '112', '113', '114', '115', '116', '117'], - 'Nonprofit Corporations': ['121', '122', '123', '124', '125', '126', '127', '128', '129', '130', - '131', '132', '134', '135', '136', '137']} - - title08 = {'Division of Labor - Industrial Claim Appeals Office': ['1'], - 'Labor Relations': ['2', '2.5', '3', '3.5'], - 'Wages': ['4', '5', '6', '7', '8', '9', '10'], - 'Labor Conditions': ['11', '12', '13', '13.3', '13.5', '14', '14.3'], - 'Workers\' Compensation Cost Containment': ['14.5'], 'Apprenticeship and Training': ['15', '15.5'], - 'Public Works': ['16', '17', '17.5', '18', '19', '19.5', '19.7'], 'Fuel Products': ['20', '20.5'], - 'Workers\' Compensation': ['40', '41', '42', '43', '44', '45', '46', '47'], - 'Workmen\'s Compensation': ['48', '49', '50', '51', '52', '53', '54'], - 'Workers\'Compensation - Continued': ['55'], - 'Occupational Diseases': ['60'], 'MedicalInsuranceProvisions': ['65', '66', '67'], - 'LABOR III - EMPLOYMENT SECURITY': ['70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', - '81', '82'], - 'EMPLOYMENT AND TRAINING': ['83', '84'], 'INDEPENDENT LIVING SERVICES': ['85', '86']} - - title09 = {'BUILDINGS AND EQUIPMENT': ['1', '1.3', '1.5', '2', '2.5', '3', '4', '5', '5.5'], - 'EXPLOSIVES': ['6', '7'], 'SPECIAL SAFETY PROVISIONS': ['10']} - - title10 = {'GENERAL PROVISIONS': ['1'], 'LICENSES': ['2'], 'REGULATION OF INSURANCE COMPANIES': ['3'], - 'CERTIFIED CAPITAL COMPANIES': ['3.5'], 'PROPERTY AND CASUALTY INSURANCE': ['4'], - 'NONADMITTED INSURANCE': ['5'], - 'CAPTIVE INSURANCE COMPANIES': ['6'], 'LIFE INSURANCE': ['7'], 'COVERCOLORADO': ['8'], - 'FRANCHISE INSURANCE': ['9'], - 'CREDIT INSURANCE': ['10'], 'TITLE INSURANCE': ['11'], 'MUTUAL INSURANCE': ['12'], - 'INTERINSURANCE': ['13'], 'FRATERNAL BENEFIT SOCIETIES': ['14'], 'PRENEED FUNERAL CONTRACTS': ['15'], - 'HEALTH CARE COVERAGE': ['16', '16.5'], 'HEALTH MAINTENANCE ORGANIZATIONS': ['17'], - 'MEDICARESUPPLEMENT INSURANCE': ['18'], 'LONG-TERM CARE': ['19'], - 'LIFE AND HEALTH INSURANCE PROTECTION': ['20'], - 'HEALTH CARE': ['21', '22', '22.3', '22.5'], - 'CASH-BONDING AGENTS': ['23']} - - title11 = { - 'Banking Code': ['1', '2', '3', '4', '5', '6', '6.3', '6.4', '6.5', '7', '8', '9', '10', '10.5', '11'], - 'General Financial Provisions': ['20', '21'], 'Industrial Banks': ['22'], - 'Trust Companies and Trust Funds': ['23', '24'], - 'BRANCH INSTITUTIONS': ['25'], 'CREDIT UNIONS': ['30'], 'MARIJUANA FINANCIAL SERVICES COOPERATIVES': ['33'], - 'MISCELLANEOUS': ['35', '36', '37', '37.5', '38'], - 'SAVINGS AND LOAN ASSOCIATIONS': ['40', '41', '42', '43', '44', '45', '46', '47', '47.5', '48', '49'], - 'Fiduciaries and Trusts': ['50'], 'Securities': ['51', '51.5', '52', '53', '54'], - 'PUBLIC SECURITIES': ['55', '56', '57', '58', '59', '59.3', '59.5'], - 'RECOVERY AND REINVESTMENT FINANCE ACT': ['59.7'], - 'U.S. 
AGENCY OBLIGATIONS': ['60'], 'HOSPITAL AND HEALTH CARE TRUSTS': ['70'], - 'COMPLIANCE REVIEW DOCUMENTS': ['71'], - 'Colorado Banking Code': ['101', '102', '103', '104', '105', '106', '107', '108', '109', '110']} - - title12 = {'GENERAL': ['1'], 'DIVISION OF REAL ESTATE': ['10'], 'DIVISION OF CONSERVATION': ['15'], - 'DIVISION OF PROFESSIONS AND OCCUPATIONS': ['20', '30'], - 'BUSINESS PROFESSIONS AND OCCUPATIONS': ['100', '105', '110', '115', '120', '125', '130', '135', - '140', '145', '150', '155', '160'], - 'HEALTH CARE PROFESSIONS AND OCCUPATIONS': ['200', '205', '210', '215', '220', '225', '230', '235', - '240', '245', '250', '255', '260', '265', - '270', '275', '280', '285', '290', '295', '300', '305', - '310', '315'], } - - title13 = {'COURTS OF RECORD': ['1', '1.5', '2', '3', '4', '5', '5.5', '6', '7', '8', '9'], - 'MUNICIPAL COURTS': ['10'], 'CIVIL PROTECTION ORDERS': ['14', '14.5'], 'CHANGE OF NAME': ['15'], - 'COSTS': ['16', '17', '17.5'], 'REGULATION OF ACTIONS AND PROCEEDINGS': ['20'], - 'DAMAGES AND LIMITATIONS ON ACTIONS': ['21'], 'CONTRACTS AND AGREEMENTS': ['22', '23'], - 'EVIDENCE': ['25', '26', '27'], 'FEES AND SALARIES': ['30', '31', '32', '33'], - 'FORCIBLE ENTRY AND DETAINER': ['40', '40.1'], 'HABEAS CORPUS': ['45'], - 'JOINT RIGHTS AND OBLIGATIONS': ['50', '50.5'], - 'JUDGMENTS AND EXECUTIONS': ['51', '51.5', '52', '53', '54', '54.5', '55', '56', '57', '58', '59', - '60', '61', '62', '62.1', '63', '64', '65'], - 'JURIES AND JURORS': ['70', '71', '72', '73', '74'], - 'LIMITATION OF ACTIONS': ['80', '81', '82'], 'PRIORITY OF ACTIONS': ['85'], - 'WITNESSES': ['90', '90.5'], 'ADVOCATES': ['91', '92', '93', '94']} - - title14 = {'ADOPTION - ADULTS': ['1'], 'MARRIAGE AND RIGHTS OF MARRIED PERSONS': ['2'], - 'DOMESTIC ABUSE': ['4'], 'DESERTION AND NONSUPPORT': ['5', '6', '7'], - 'DISSOLUTION OF MARRIAGE - PARENTAL RESPONSIBILITIES': ['10', '10.5', '11', '12', '13', '13.5', - '13.7'], - 'CHILD SUPPORT': ['14'], 'CIVIL UNION': ['15']} - - title15 = {'FIDUCIARY': ['1', '1.1', '1.5'], 'POWERS OF APPOINTMENT': ['2', '2.5'], - 'COLORADO UNIFORM TRUST CODE': ['5'], - 'COLORADO PROBATE CODE': ['10', '11', '12', '13', '14', '14.5', '15', '16', '17'], - 'DECLARATIONS - FUTURE HEALTH CARE TREATMENT': ['18', '18.5', '18.6', '18.7'], - 'HUMAN BODIES AFTER DEATH': ['19'], 'COMMUNITY PROPERTY RIGHTS': ['20'], - 'DESIGNATED BENEFICIARY AGREEMENTS': ['22'], 'ABANDONED ESTATE PLANNING DOCUMENTS': ['23']} - - title16 = { - 'CODE OF CRIMINAL PROCEDURE': ['1', '2', '2.5', '2.7', '3', '4', '5', '6', '7', '8', '8.5', '9', '10', '11', - '11.3', '11.5', '11.7', '11.8', '11.9', '12', '13'], - 'UNIFORM MANDATORY DISPOSITION OF DETAINERS ACT': ['14'], - 'WIRETAPPING AND EAVESDROPPING': ['15'], 'CRIMINAL ACTIVITY INFORMATION': ['15.5', '15.7', '15.8'], - 'SENTENCING AND IMPRISONMENT': ['16', '17'], 'COSTS - CRIMINAL ACTIONS': ['18', '18.5'], - 'FUGITIVES AND EXTRADITION': ['19', '20'], 'OFFENDERS - REGISTRATION': ['20.5', '21', '22', '23']} - - title17 = {'Organization': ['1'], 'Parole and Probation': ['2'], 'Care and Custody - Reimbursement': ['10'], - 'Facilities': ['18', '19', '20', '21', '22', '22.5', '23', '24', '25', '26', '26.5'], - 'Programs': ['27', '27.1', '27.5', '27.7', '27.8', '27.9', '28', '29', '30', '30.5', '31', '32', - '33', '34'], - 'DIAGNOSTIC PROGRAMS': ['40', '41'], 'MISCELLANEOUS PROVISIONS': ['42']} - - title22 = { - 'GENERAL AND ADMINISTRATIVE': ['1', '2', '3', '4', '5', '5.5', '6', '7', '8', '9', '9.5', '9.7', '10', - '10.3', '11', '12', '13', '14', '15', '16'], - 'COMPENSATORY 
EDUCATION': ['20', '20.5', '21', '22', '23', '24', '25', '26', '27', '27.5', '28', '29'], - 'SCHOOL DISTRICTS': ['30', '30.5', '30.7', '31', '32', '32.5', '33', '34', '35', '35.3', '35.5', '35.6', - '36', '37', '38'], - 'FINANCIAL POLICIES AND PROCEDURES': ['40', '41', '41.5', '42', '43', '43.5', '43.7', '44', '45'], - 'FINANCING OF SCHOOLS': ['50', '51'], - 'SECOND CHANCE PROGRAM': ['52'], - 'FINANCING OF SCHOOLS - Continued': ['53', '54', '55', '56', '57', '58'], - 'TEACHERS': ['60', '60.3', '60.5', '61', '61.5', '62', '62.5', '63', - '64', '65', '66', '67', '68', '6805', '69'], - 'JUNIOR COLLEGES': ['70', '71', '72', '73'], - 'MISCELLANEOUS': ['80', '81', '81.5', '82', '82.3', '82.5', '82.6', - '82.7', '82.8', '82.9', '83', '84', '86', '87', '88', '88.1', '89', '90', '91', '92', - '93', - '94', '95', '95.5', '96', - '97', '98', '99', '100', '101', '102']} - - title23 = { - 'General and Administrative': ['1', '1.5', '2', '3', '3.1', '3.3', '3.5', '3.6', '3.7', '3.8', '3.9', '4', - '4.5', '5', '6', '7', '7.4', '7.5', - '8', '9', '10', '11', '11.5', '12', '13', '15', '16', '17', '18', '19', - '19.3', '19.5', '19.7', '19.9'], - 'State Universities and Colleges': ['20', '20.3', '20.5', '21', '22', '23', '30', '31', '31.3', '31.5', - '32', '33', '34', '35', '40', '41', '50', '51', '52', '53', '54', '55', - '56'], - 'COMMUNITY COLLEGES AND OCCUPATIONAL EDUCATION': ['60', '61', '61.5', '62', '63', '64'], - 'EDUCATIONAL CENTERS AND LOCAL DISTRICT COLLEGES': ['70', '71', '72', '73'], - 'EDUCATIONAL PROGRAMS': ['74', '75', '76', '77', '78']} - - title24 = { - 'ADMINISTRATION': ['1', '1.5', '1.7', '1.9', '2', '3', '3.5', '3.7', '4', '4.1', '4.2', '5', '6', '7', - '7.5', '8', - '9', '9.5', '10', '11', '12', '13', '14', '15', '15.5', '16', '17', '18', '18.5', '19', - '19.5', '19.7', '19.8', '19.9'], - 'STATE OFFICERS': ['20', '21', '22'], - 'PRINCIPAL DEPARTMENTS': ['30', '31', '32', '33', '33.5', '34', '35', '36'], - 'GOVERNOR\'S OFFICE': ['37', '37.3', '37.5', '37.7', '38', '38.3', '38.5', '38.7', '38.9'], - 'OTHER AGENCIES': ['40', '40.5', '41', '42', '43', '44', '44.3', '44.5', '44.7', '45', '45.5', '46', '46.1', - '46.3', '46.5', '46.6', '47', '47.5', '48', '48.5', '48.6', '48.8', '49', '49.5', '49.7', - '49.9'], - 'STATE PERSONNEL SYSTEM AND STATE EMPLOYEES': ['50', '50.3', '50.5'], - 'PUBLIC EMPLOYEES\' RETIREMENT SYSTEMS': ['51', '51.1', '52', '52.5', '53', '54', '54.3', '54.5', '54.6', - '54.7', '54.8'], - 'FEDERAL PROGRAMS - HOUSING - RELOCATION': ['55', '56'], - 'INTERSTATE COMPACTS AND AGREEMENTS': ['60', '61', '62'], - 'PLANNING - STATE': ['65', '65.1', '65.5', '66', '67', '68'], - 'PUBLICATION OF LEGAL NOTICES AND PUBLIC PRINTING': ['70'], - 'ELECTRONIC TRANSACTIONS': ['71', '71.1', '71.3', '71.5'], - 'PUBLIC (OPEN) RECORDS': ['72', '72.1', '72.3', '72.4'], - 'GOVERNMENTAL ACCESS TO NEWS INFORMATION': ['72.5'], 'SECURITY BREACHES AND PERSONAL INFORMATIO': ['73'], - 'STATE FUNDS': ['75'], 'FEDERAL FUNDS': ['76'], - 'RESTRICTIONS ON PUBLIC BENEFITS': ['76.5'], - 'PRIORITIZING STATE ENFORCEMENT OF CIVIL IMMIGRATION LAW': ['76.6'], - 'STATE FISCAL POLICIES RELATING TO SECTION 20 OF ARTICLE X OF THE STATE CONSTITUTION': ['77'], - 'FEDERAL MANDATES': ['78'], 'INTERNET REGULATION': ['79'], 'STATE DELINQUENCY CHARGES': ['79.5'], - 'STATE HISTORY, ARCHIVES, AND EMBLEMS': ['80', '80.1'], 'ALLOCATION FOR ART': ['80.5'], - 'STATE PROPERTY': ['82', '82.5'], 'STATE ASSISTANCE - DENVER CONVENTION CENTER': ['83'], - 'INFORMATION TECHNOLOGY ACCESS FOR BLIND': ['85'], 'LIBRARIES': ['90'], 'CONSTRUCTION': 
['91', '92', '93'], - 'PROCUREMENT CODE': ['101', '102', '103', '103.5', '104', '105', '106', '107', '108', '109', '110', '111', - '112'], - 'GOVERNMENT COMPETITION WITH PRIVATE ENTERPRISE': ['113', '114'], - 'FINANCING OF CRITICAL STATE NEEDS': ['115']} - - title25 = {'ADMINISTRATION': ['1', '1.5'], 'VITAL STATISTICS': ['2'], 'HOSPITALS': ['3', '3.5'], - 'DISEASE CONTROL': ['4'], 'PRODUCTS CONTROL AND SAFETY': ['5', '5.5'], - 'FAMILY PLANNING': ['6'], 'ENVIRONMENTAL CONTROL': [], - '': ['6.5', '6.6', '6.7', '7', '8', '8.5', '9', - '10', '11', '12', '13', '14', '15', '16', '16.5', '17', '18', '18.5', '18.7'], - 'ENVIRONMENT - SMALL COMMUNITIES': ['19'], 'SAFETY - DISABLED PERSONS': ['20'], - 'PREVENTION, INTERVENTION, AND TREATMENT SERVICES': ['20.5'], - 'HEALTH CARE': ['21', '21.5', '22', '23', '25', '26', '27', '27.5', '27.6', - '28', '29', '30', '31', '32', '33', '34', '34.1', '35', '36', '37', '38', '39', '40', - '41', '42', '43', - '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55']} - - title255 = {'ADMINISTRATION': ['1', '2'], 'PRESCRIPTION DRUGS': ['2.5'], 'INDIGENT CARE': ['3'], - 'COLORADO MEDICAL ASSISTANCE ACT': ['4', '5', '6'], 'CHILDREN\'S BASIC HEALTH PLAN': ['8'], - 'COMMUNITY LIVING': [ - '10'], 'HEALTH CARE COST SAVINGS ACT': [ - '11']} - - title27 = {'DEPARTMENT OF HUMAN SERVICES': ['1', '2'], - 'General Provisions': ['9', '10', '10.3', '10.5', '11', '12'], - 'Institutions': ['13', '14', '15', '16'], - 'CORRECTIONS': ['20', '21', '22', '23', '24', '25', '26', '27', '28'], - 'OTHER INSTITUTIONS': ['35'], 'COLORADO DIAGNOSTIC PROGRAM': ['40'], - 'BEHAVIORAL HEALTH': ['60', '61', '62', '63'], - 'MENTAL HEALTH AND MENTAL HEALTH DISORDERS': ['65', '66', '66.5', '67', '68', '69', '70'], - 'ALCOHOL AND SUBSTANCE USE -ALCOHOL AND SUBSTANCE USE DISORDERS': ['80', '81', '82'], - 'INSTITUTIONS': ['90', '91', '92', '93', '94']} - - title28 = {'EMERGENCY PREPAREDNESS': ['1', '2'], 'MILITARY': ['3', '3.1', '4', '4.5', '4.7'], - 'VETERANS': ['5'], 'DIVISION OF AVIATION': ['6']} - - title29 = {'GENERAL PROVISIONS': ['1', '2', '3', '3.5'], 'HOUSING': ['4'], - 'MISCELLANEOUS': ['5', '5.5', '6', '6.5', '7', '7.5', '8', '9', '10', '10.5', '11', '11.3', '11.5', - '11.6', '11.7', '11.8', '11.9'], - 'ENERGY CONSERVATION': ['12', '12.5'], 'PROPERTY INSURANCE': ['13'], - 'BOND ANTICIPATION NOTES': ['14'], - 'TAX ANTICIPATION NOTES': ['15'], 'LAND USE CONTROL AND CONSERVATION': ['20', '21'], - 'HAZARDOUS SUBSTANCE INCIDENTS': ['22'], 'WILDLAND FIRE PLANNING': ['22.5'], - 'SPECIAL STATUTORY AUTHORITIES': ['23', '24', '24.5'], 'MARKETING DISTRICTS': ['25'], - 'AFFORDABLE HOUSING DWELLING UNIT ADVISORY BOARDS': ['26'], - 'COMPETITION IN UTILITY AND ENTERTAINMENT SERVICES': ['27'], - 'MEDICAL PROVIDER FEES': ['28'], - 'IMMIGRATION STATUS - COOPERATION WITH FEDERAL OFFICIALS': ['29', '30', '31']} - - title30 = {'COMPENSATION - FEES': ['1', '2'], - 'COUNTY ELECTED OFFICIALS\' SALARY COMMISSION': ['3'], - 'LOCATION AND BOUNDARIES': ['5', '6', '7', '8'], - 'COUNTY OFFICERS': ['10'], 'General': ['11', '12', '15', '17', '20', '24'], - 'County Finance': ['25', '26'], - 'COUNTY PLANNING AND BUILDING CODES': ['28'], 'APPORTIONMENT OF FEDERAL MONEYS': ['29'], - 'FLOOD CONTROL': ['30'], - 'HOME RULE': ['35']} - - title31 = {'CORPORATE CLASS - ORGANIZATION AND TERRITORY': ['1', '2', '3', '4'], - 'MUNICIPAL ELECTIONS': ['10', '11'], 'ANNEXATION - CONSOLIDATION - DISCONNECTION': ['12'], - 'POWERS AND FUNCTIONS OF CITIES AND TOWNS': ['15', '16', '20', '21', '23', '25', '30', '30.5', '31', - 
'32', '35']} - - title32 = {'SPECIAL DISTRICT ACT': ['1'], 'MULTIPURPOSE DISTRICTS': ['2', '3'], - 'WATER AND SANITATION DISTRICTS': ['4'], 'SINGLE PURPOSE SERVICE DISTRICTS': ['5'], - 'REGIONAL SERVICE AUTHORITIES': ['7'], - 'SPECIAL STATUTORY DISTRICTS': ['8', '9', '9.5', '9.7', '10', '11', '11.5', '12', '13', '14', '15', - '16', '17', '18', '19', '20', '21']} - - title33 = {'WILDLIFE': ['1', '2', '3', '4', '5', '5.5', '6', '7', '8'], 'ADMINISTRATION': ['9'], - 'PARKS': ['10', '10.5', '11', '12', '13', '14', '14.5', '15'], - 'WILDLIFE - Continued': ['20', '21', '22', '23'], - 'OUTDOOR RECREATION': ['30', '31', '32'], 'COLORADO NATURAL AREAS': ['33'], - 'RECREATIONAL AREAS AND SKI SAFETY': ['40', '41', '42', '43', '44'], - 'GREAT OUTDOORS PROGRAM': ['60']} - - title34 = {'GEOLOGICAL SURVEY': ['1'], 'JOINT REVIEW PROCESS': ['10'], - 'Health and Safety': ['20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31'], - 'Mined Land Reclamation': ['32', '32.5', '33', '34'], - 'Metal Mines': ['40', '41', '42', '43', '44', '45', '46', - '47', '48', '49', '50', '51', '52', '53', '54'], - 'Conservation and Regulation': ['60', '61', '62', '63', '64'], 'Geothermal Resources': ['70']} - - title35 = {'ADMINISTRATION': ['1', '1.5', '2', '3', '3.5'], - 'PEST AND WEED CONTROL': ['4', '4.5', '5', '5.5', '6', '7', '8', '9', '10', '11'], - 'ORGANICALLY GROWN PRODUCTS': ['11.5'], 'FERTILIZERS': ['12', '13'], 'WEIGHTS AND MEASURES': ['14'], - 'CENTRAL FILING SYSTEM': ['15'], - 'POULTRY AND RABBITS': ['20', '21', '22'], - 'AGRICULTURAL PRODUCTS - STANDARDS AND REGULATIONS': ['23', '23.5', '24', '24.5', '25', '26', '27', - '27.3', '27.5'], - 'MARKETING AND SALES': ['28', '29', '29.5', '30', '31', '32', '33', '33.5', '34', '35', '36', '37', - '38', '39'], - 'PROTECTION OF LIVESTOCK': ['40'], - 'LIVESTOCK': ['41', '41.5', '42', '42.5', '43', '44', '45', '46', '47', '48', '49', '50', '50.5', - '51', '52', '53', '53.5', '54', '55', '56', '57', '57.5', '57.8', '57.9'], - 'MEAT PROCESSING': ['58', '59'], - 'AGRICULTURAL PRODUCTS - STANDARDS AND REGULATIONS- Continued': ['60', '61'], - 'FAIRS': ['65'], - 'Conservation Districts': ['70'], 'Soil Erosion': ['71', '72'], - 'DEVELOPMENT AUTHORITY': ['75'], - 'PRODUCE SAFETY': ['77'], - 'PET ANIMAL CARE': ['80', '81']} - - title36 = {'General and Administrative': ['1', '2'], - 'State Lands': ['3', '4', '5', '6'], 'Forestry': ['7', '8'], 'Natural Areas': ['10'], - 'WEATHER MODIFICATION': ['20']} - - title37 = {'CONSERVANCY LAW OF COLORADO - FLOOD CONTROL': ['1', '2', '3', '3.5', '4', '5', '6', '7', '8'], - 'DRAINAGE AND DRAINAGE DISTRICTS': ['20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', - '31', '32', '33'], - 'General and Administrative': ['40'], - 'Conservation and Irrigation Districts': ['41', '42', '43', '44', '45', '45.1', '46', '47', '48', - '50'], - 'General and Administrative': ['60'], - 'Interstate Compacts': ['61', '62', '63', '64', '65', '66', '67', '68', '69'], - 'Interbasin Compacts': ['75'], 'General and Administrative': ['80'], - 'Water Rights - Generally': ['80.5', '81', '82', '83', '84', '85', '85.5'], - 'Reservoirs and Waterways': ['86', '87', '88', '89'], 'Underground Water': ['90', '90.5', '91'], - 'Water Right Determination and Administration': ['92'], - 'River Basin Authorities': ['93'], 'WATER RESOURCES AND POWER DEVELOPMENT': ['95'], - 'WATER CONSERVATION': ['96', '96.5', '97', '98']} - - title38 = {'EMINENT DOMAIN': ['1', '2', '3', '4', '5', '5.5', '6', '7'], - 'FRAUDS - STATUTE OF FRAUDS': ['8', '10'], 'JOINT RIGHTS AND 
OBLIGATIONS': ['11'], - 'TENANTS AND LANDLORDS': ['12'], 'UNCLAIMED PROPERTY': ['13'], - 'LOANED PROPERTY': ['14'], - 'LIENS': ['20', '21', '21.5', '22', '23', '24', '24.5', '24.7', '25', '25.5', '26', '27'], - 'PARTITION': ['28'], 'MANUFACTURED HOMES': ['29'], - 'Interests in Land': ['30', '30.5', '30.7', '31', '32', '32.5', '33', '33.3', '33.5', '34'], - 'Conveyancing and Evidence of Title': ['35', '35.5', '35.7', '36'], - 'Mortgages and Trust Deeds': ['37', '38', '39', '40'], 'Limitations - Homestead Exemptions': ['41'], - 'Mineral Interests': ['42', '43'], 'Boundaries': ['44'], - 'Safety of Real Property': ['45'], 'SURVEY PLATS AND MONUMENT RECORDS': ['50', '51', '52', '53']} - - title39 = {'General and Administrative': ['1', '1.5', '2'], 'Exemptions': ['3'], - 'Deferrals': ['3.5', '3.7', '3.9'], - 'Valuation and Taxation': ['4', '4.1', '5', '6', '7'], 'Equalization': ['8', '9'], - 'Collection and Redemption': ['10', '11', '12'], - 'Conveyancing and Evidence of Title': ['13', '14'], 'General and Administrative': ['20', '21'], - 'Income Tax': ['22'], 'Estate and Inheritance and Succession Tax': ['23', '23.5', '24'], - 'Gift Tax': ['25'], 'Sales and Use Tax': ['26', '26.1'], - 'Gasoline and Special Fuel Tax': ['27'], 'Tobacco Tax': ['28'], - 'Controlled Substances Tax': ['28.7', '28.8'], 'Severance Tax': ['29'], - 'Enterprise Zones': ['30', '30.5'], 'Assistance for the Elderly or Disabled': ['31'], - 'Rural Technology Enterprise Zone Act': ['32'], 'Alternative Fuels Rebate': ['33'], - 'Taxation Commission': ['34'], 'Aviation Development Zone Act': ['35']} - - title40 = {'General and Administrative': ['1', '1.1', '2', '2.1', '2.2', '2.3', '3', '3.2', '3.4', '3.5', '4', - '5', '6', '6.5', '7', '7.5', '8', '8.5', '8.7', '9', '9.5', '9.7'], - 'Motor Carriers and Intrastate Telecommunications Services': ['10', '10.1', '11', '11.5', '12', '13', - '14', '15', '16', '16.5', '17'], - 'RAILROADS': ['18', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', - '33'], - 'GEOTHERMAL HEAT': ['40'], - 'ENERGY IMPACTS': ['41']} - - title41 = {'AIRCRAFT': ['1', '2'], 'Generally': ['3', '4'], 'Airport Revenue Bonds': ['5'], 'AEROSPACE': ['6']} - - title42 = {'GENERAL AND ADMINISTRATIVE': ['1'], 'DRIVERS\' LICENSES': ['2'], - 'TAXATION': ['3'], 'REGULATION OF VEHICLES AND TRAFFIC': ['4'], 'AUTOMOBILE THEFT LAW': ['5'], - 'CERTIFICATES OF TITLE': ['6'], 'MOTOR VEHICLE FINANCIAL RESPONSIBILITY LAW': ['7'], - 'PORT OF ENTRY WEIGH STATIONS': ['8'], - 'MOTOR VEHICLE REPAIRS': ['9', '9.5', '10', '11'], 'COLLECTOR\'S ITEMS': ['12'], - 'DISPOSITION OF PERSONAL PROPERTY': ['13'], 'IDLING STANDARD': ['14'], 'HIGHWAY SAFETY': ['20']} - - title43 = {'GENERAL AND ADMINISTRATIVE': ['1'], - 'HIGHWAYS AND HIGHWAY SYSTEMS': ['2'], 'SPECIAL HIGHWAY CONSTRUCTION': ['3'], 'FINANCING': ['4'], - 'HIGHWAY SAFETY': ['5', '6'], 'AVIATION SAFETY AND ACCESSIBILITY': ['10']} - - title44 = {'GENERAL PROVISIONS': ['1'], 'ALCOHOL AND TOBACCO REGULATION': ['3', '4', '5', '6', '7'], - 'MARIJUANA REGULATION': ['10', '11', '12'], 'AUTOMOBILES': ['20'], - 'GAMING AND RACING': ['30', '31', '32', '33'], 'LOTTERY': ['40']} - - title_part_01 = ['01', '02', '04', '05', '07', '09', '10', '11', '12', '13', '13.5'] - title_part_02 = ['02', '03', '04', '07'] - title_part_04 = ['01', '02', '2.5', '03', '04', '4.5', '07', '08', '09'] - title_part_05 = ['01', '02', '03', '3.5', '04', '05', '06', '19'] - title_part_08 = ['2', '5', '20', '20.5'] - title_part_10 = ['01', '02', '03', '04', '07', '08', '11', '12', '14', '16', '17'] - 
title_part_12 = ['10', '20', '30', '135', '215', '220', '230', '245', '280', '285', '290'] - title_part_13 = ['01', '05', '06', '17', '20', '21', '22', '56', '64', '90', '93'] - title_part_14 = ['2', '5'] - title_part_15 = ['01', '02', '2.5', '05', '10', '11', '12', '13', '14', '14.5', '15', '16', '18.7', '19'] - title_part_16 = ['02', '2.5', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12', '13'] - title_part_17 = ['01', '02', '22.5'] - title_part_18 = ['1', '1.3', '2', '3', '4', '5', '6', '7', '8', '9', '11', '12', '18'] - title_part_19 = ['1', '2', '3', '5', '7'] - title_part_20 = ['1'] - title_part_22 = ['02', '07', '11', '13', '20', '30', '30.5', '33', '63', '81', '95.5'] - title_part_23 = ['60', '71', '78'] - title_part_24 = ['4', '6'] - title_part_25 = ['01', '1.5', '03', '3.5', '04', '05', '5.5', '06', '6.5', '07', '08', '11', '14', '15', '16', - '17', '20.5'] - title_part_255 = ['01', '2.5', '03', '04', '05', '06', '10'] - title_part_26 = ['01', '02', '3.1', '06', '6.2', '6.5', '11', '12'] - title_part_27 = ['60', '80', '82'] - title_part_28 = ['03'] - title_part_29 = ['01', '04', '05', '11', '20', '27'] - title_part_31 = ['01', '02', '03', '04', '10', '12', '15', '16', '20', '21', '23', '25', '30', '30.5', '31', - '32', '35'] - title_part_32 = ['01', '04', '11', '11.5'] - title_part_33 = ['03', '06'] - title_part_34 = ['01'] - title_part_35 = ['07', '21', '31', '75'] - title_part_36 = ['07'] - title_part_38 = ['01', '06', '12', '20', '29', '31', '33.3', '35', '36', '41'] - title_part_39 = ['03', '05'] - title_part_41 = ['04'] - title_part_42 = ['01', '02', '03', '04', '05', '06', '07', '12', '20'] - title_part_43 = ['01', '02', '03', '04', '05 '] - title_part_44 = ['03', '10', '20', '30', '32'] - - if t_id == '25.5': - if t_id not in ['04', '18', '19', '20', '21', '26', '25.5']: - title_no = f'title{t_id}' - for key, value in title255.items(): - if c_id in value: - title = key - header = re.sub(r'[\s]+', '', title).lower() - if t_id in ['01', '02', '04', '05', '10', '12', '13', '15', '16', '17', '18', '19', '20', '22', - '25', '26', '29', - '31', '32', '34', '38', '42', '43', '08', '14', '27', '28', '35', '36', '39', '41', - '44']: - title_part_id = f'title_part_{t_id}' - if c_id.zfill(2) in eval(title_part_id): - tag_id = f'gov.co.crs.title.{t_id}.html#t{t_id}-{header}-ar{c_id.zfill(2)}-s{s_id}' - else: - tag_id = f'gov.co.crs.title.{t_id}.html#t{t_id}-{header}-ar{c_id.zfill(2)}-s{s_id}' - else: - tag_id = f'gov.co.crs.title.{t_id}.html#t{t_id}-{header}-ar{c_id.zfill(2)}-s{s_id}' - break - else: - tag_id = f'gov.co.crs.title.{t_id}.html#t{t_id}-ar{c_id}-s{s_id}' - - else: - if t_id in ['01', '02', '04', '05', '10', '12', '13', '15', '16', '17', '18', '19', '20', '22', '25', - '26', '29', - '31', '32', '34', '38', '42', '43', '08', '14', '27', '28', '35', '36', '39', '41', '44']: - title_part_id = f'title_part_{t_id}' - if c_id.zfill(2) in title_part_255: - tag_id = f'gov.co.crs.title.{t_id}.html#t{t_id}-ar{c_id.zfill(2)}-s{s_id}' - else: - tag_id = f'gov.co.crs.title.{t_id}.html#t{t_id}-ar{c_id.zfill(2)}-s{s_id}' - else: - tag_id = f'gov.co.crs.title.{t_id}.html#t{t_id}-ar{c_id.zfill(2)}-s{s_id}' - - else: - if t_id not in ['04', '18', '19', '20', '21', '26', '25.5', '26.5']: - title_no = f'title{t_id}' - for key, value in eval(title_no).items(): - if c_id in value: - title = key - header = re.sub(r'[\s]+', '', title).lower() - if t_id in ['01', '02', '04', '05', '10', '12', '13', '15', '16', '17', '18', '19', '20', '22', - '25', '26', '29', - '31', '32', '34', '38', 
'42', '43', '08', '14', '27', '28', '35', '36', '39', '41', - '44']: - title_part_id = f'title_part_{t_id}' - if c_id.zfill(2) in eval(title_part_id): - tag_id = f'gov.co.crs.title.{t_id}.html#t{t_id}-{header}-ar{c_id.zfill(2)}-s{s_id}' - else: - tag_id = f'gov.co.crs.title.{t_id}.html#t{t_id}-{header}-ar{c_id.zfill(2)}-s{s_id}' - else: - tag_id = f'gov.co.crs.title.{t_id}.html#t{t_id}-{header}-ar{c_id.zfill(2)}-s{s_id}' - break - else: - tag_id = f'gov.co.crs.title.{t_id}.html#t{t_id}-ar{c_id}-s{s_id}' - else: - if t_id in ['01', '02', '04', '05', '10', '12', '13', '15', '16', '17', '18', '19', '20', '22', '25', - '26', '29', - '31', '32', '34', '38', '42', '43', '08', '14', '27', '28', '35', '36', '39', '41', '44']: - title_part_id = f'title_part_{t_id}' - if c_id.zfill(2) in eval(title_part_id): - tag_id = f'gov.co.crs.title.{t_id}.html#t{t_id}-ar{c_id.zfill(2)}-s{s_id}' - else: - tag_id = f'gov.co.crs.title.{t_id}.html#t{t_id}-ar{c_id.zfill(2)}-s{s_id}' - else: - tag_id = f'gov.co.crs.title.{t_id}.html#t{t_id}-ar{c_id.zfill(2)}-s{s_id}' - return tag_id - def add_citation(self): class_dict = {'co_code': 'Colo\.\s*\d+', 'cocode': 'Colo.+P\.\d\w\s\d+', @@ -2772,7 +2297,7 @@ def add_citation(self): r"\s*\d+(\.\d+)*-\d+(\.\d+)*-\d+(\.\d+)*(\s*\(\d+\))|" r"\s*\d+(\.\d+)*-\d+(\.\d+)*-\d+(\.\d+)*)", tag.get_text())): - inside_text = re.sub(r'|
|^|$', '',
+                    inside_text = re.sub(r'||
|^|$', '', text.strip(), re.DOTALL) if tag.get("class") == [self.class_regex["ul"]]: @@ -3081,6 +2606,7 @@ def write_soup_to_file(self): soup_str = str(self.soup.prettify(formatter=None)) for tag in self.meta_tags: + cleansed_tag = re.sub(r'/>', ' />', str(tag)) soup_str = re.sub(rf'{tag}', rf'{cleansed_tag}', soup_str, re.I) @@ -3091,16 +2617,16 @@ def write_soup_to_file(self): file.write(soup_str.replace('&', '&')) # add css file - def css_file(self): - head = self.soup.find("head") - style = self.soup.head.find("style") - style.decompose() - css_link = self.soup.new_tag("link") - css_link.attrs[ - "href"] = "https://unicourt.github.io/cic-code-ga/transforms/ga/stylesheet/ga_code_stylesheet.css" - css_link.attrs["rel"] = "stylesheet" - css_link.attrs["type"] = "text/css" - head.append(css_link) + # def css_file(self): + # head = self.soup.find("head") + # style = self.soup.head.find("style") + # style.decompose() + # css_link = self.soup.new_tag("link") + # css_link.attrs[ + # "href"] = "https://unicourt.github.io/cic-code-ga/transforms/ga/stylesheet/ga_code_stylesheet.css" + # css_link.attrs["rel"] = "stylesheet" + # css_link.attrs["type"] = "text/css" + # head.append(css_link) def start_parse(self): @@ -3115,7 +2641,7 @@ def start_parse(self): start_time = datetime.now() print(start_time) self.create_page_soup() - self.css_file() + # self.css_file() if re.search('constitution', self.html_file_name): self.class_regex = { diff --git a/html_parser/ga_html_parser.py b/html_parser/ga_html_parser.py index 2f242d1..61ca06e 100644 --- a/html_parser/ga_html_parser.py +++ b/html_parser/ga_html_parser.py @@ -18,7 +18,8 @@ def __init__(self, input_file_name): self.title = None self.previous = None self.junk_tag_class = ['Apple-converted-space', 'Apple-tab-span'] - self.tag_type_dict = {'head1': r'TITLE \d','ul': r'^Chap\.|^Art\.|^Sec\.|^CHAPTER \d|^Article 1', 'head2': r'^CHAPTER \d|^ARTICLE \d|^Article 1', + self.tag_type_dict = {'head1': r'TITLE \d', 'ul': r'^Chap\.|^Art\.|^Sec\.|^CHAPTER \d|^Article 1', + 'head2': r'^CHAPTER \d|^ARTICLE \d|^Article 1', 'head4': '^JUDICIAL DECISIONS|OPINIONS OF THE ATTORNEY GENERAL', 'ol_p': r'^\([a-z]\)', 'junk1': '^Annotations$', 'normalp': '^Editor\'s note', 'article': r'^Article \d$|^Part \d$'} @@ -49,7 +50,6 @@ def create_page_soup(self): self.soup.html.attrs['lang'] = 'en' print('created soup') - def get_class_name(self): """ - Find the textutil generated class names for each type of tag (h1, h2, ....) 
@@ -62,7 +62,7 @@ def get_class_name(self): class_tag = self.soup.find( lambda tag: tag.name == 'p' and re.search(self.tag_type_dict.get(key), tag.get_text().strip()) and - tag.attrs["class"][0] not in self.tag_type_dict.values() ) + tag.attrs["class"][0] not in self.tag_type_dict.values()) if class_tag: self.tag_type_dict[key] = class_tag['class'][0] @@ -70,8 +70,6 @@ def get_class_name(self): if re.search('junk', key): self.junk_tag_class.append(class_tag['class'][0]) - - if not re.search('constitution', self.html_file_name): h3_class = self.soup.find(lambda tag: tag.name == 'p' and re.search( rf'^\d+-\d+-\d+', tag.get_text().strip(), re.I) and tag.get('class')[0] != self.tag_type_dict['ul'])[ @@ -121,9 +119,6 @@ def remove_junk(self): self.soup.head.append(new_meta) print('junk removed') - - - def replace_tags(self): """ - create dictionary with class names as keys with associated tag name as its value @@ -141,7 +136,6 @@ def replace_tags(self): self.tag_type_dict.get('head3.1', ''): 'h3' } - for key, value in tag_dict.items(): ul = self.soup.new_tag("ul", Class="leaders") while True: @@ -156,41 +150,47 @@ def replace_tags(self): elif p_tag.findPrevious().name != 'li': p_tag.wrap(ul) - if p_tag.findNext().has_attr('class') and p_tag.findNext()['class'][0] != self.tag_type_dict['ul']: - new_nav = self.soup.new_tag('nav') - if re.search(r'sec\.|chap\.|Art\.', ul.contents[0].get_text(), re.I): - ul.contents[0].name = 'p' - ul.contents[0]['class'] = 'navheader' - new_nav.append(ul.contents[0]) - ul.wrap(new_nav) - ul = self.soup.new_tag("ul", Class="leaders") + new_nav = self.soup.new_tag('nav') + if re.search(r'sec\.|chap\.|Art\.', ul.contents[0].get_text(), re.I): + ul.contents[0].name = 'p' + ul.contents[0]['class'] = 'navheader' + new_nav.append(ul.contents[0]) + ul.wrap(new_nav) + ul = self.soup.new_tag("ul", Class="leaders") if value in ['h2', 'h3']: - - - if chap_section_regex := re.search( r'^(?P\d+)-(?P<chapter>\d+([a-z])?)-(?P<section>\d+(\.\d+)?)' r'|^(chapter|article|part)\s(?P<chap>\d+([a-z])?)', p_tag.get_text().strip(), re.I): + if chapter := chap_section_regex.group('chap'): + if re.search('^article', p_tag.get_text().strip(), re.I) and \ (chap_id := p_tag.findPrevious(lambda tag: tag.name == 'h2' and not re.search('^part', tag.get_text(), re.I))): + if re.search(r'chapter \d', chap_id.get_text(), re.I): p_tag['id'] = f'{chap_id["id"]}a{chapter.zfill(2)}' else: - cleansed_chap = re.sub(r'\d+$', '', chap_id["id"]) + cleansed_chap = re.sub(r'\d+([A-Z])*$', '', chap_id["id"]) p_tag['id'] = f'{cleansed_chap}{chapter.zfill(2)}' p_tag['class'] = 'articleh2' + + elif re.search('^article', p_tag.get_text().strip(), re.I): + p_tag['id'] = f't{self.title}c{chapter.zfill(2)}' + p_tag['class'] = 'articleh2' + elif re.search('^part', p_tag.get_text().strip(), re.I) and \ (chap_id := p_tag.findPrevious('h2')): if re.search(r'(chapter|article) \d', chap_id.get_text(), re.I): - p_tag['id'] = f'{p_tag.find_previous("h2",class_="articleh2").get("id")}p{chapter.zfill(2)}' + # print(p_tag) + p_tag[ + 'id'] = f'{p_tag.find_previous("h2", class_="articleh2").get("id")}p{chapter.zfill(2)}' else: cleansed_chap = re.sub(r'\d+$', '', chap_id["id"]) @@ -203,6 +203,9 @@ def replace_tags(self): else: p_tag['id'] = f't{self.title.zfill(2)}c{chapter.zfill(2)}' + + + else: chapter = chap_section_regex.group("chapter") section = f'{self.title}-{chapter}-{chap_section_regex.group("section")}' @@ -230,52 +233,62 @@ def replace_tags(self): section_id = 
f'{self.title.zfill(2)}-{chap_id.zfill(2)}-{section_match.group("sec")}' p_tag['id'] = f'{chap_tag["id"]}s{section_id}' - elif re.search(r'^Subpart \d+[A-Z]*',p_tag.get_text().strip()): - sec_id = re.search(r'^Subpart (?P<sno>\d+[A-Z]*)',p_tag.get_text().strip()).group("sno") - p_tag['id'] = f'{p_tag.find_previous("h2",class_="parth2").get("id")}s{sec_id}' - + elif re.search(r'^Subpart \d+[A-Z]*', p_tag.get_text().strip()): + sec_id = re.search(r'^Subpart (?P<sno>\d+[A-Z]*)', p_tag.get_text().strip()).group("sno") + p_tag['id'] = f'{p_tag.find_previous("h2", class_="parth2").get("id")}s{sec_id}' + + elif re.search(r'^APPENDIXRULES', p_tag.get_text().strip()): + p_tag.name = 'h2' + apdx_text = re.sub(r'\W+', '', p_tag.get_text().strip()).lower() + p_tag['id'] = f't{self.title}apr{apdx_text}' + p_tag['class'] = "apdxrules" + + elif re.search(r'^Rule \d+(-\d+-\.\d+)*(\s\(\d+\))*\.', p_tag.get_text().strip()): + p_tag.name = 'h2' + rule_id = re.search(r'^Rule (?P<r_id>\d+(-\d+-\.\d+)*(\s\(\d+\))*)\.', + p_tag.get_text().strip()).group("r_id") + p_tag['id'] = f'{p_tag.find_previous("h2", class_="apdxrules").get("id")}r{rule_id.zfill(2)}' else: p_tag.name = 'h5' - # elif value == 'h4': - # chap_tag = p_tag.find_previous('h2') - # if self.headers_class_dict.get(p_tag.get_text()): - # p_tag['class'] = self.headers_class_dict.get(p_tag.get_text()) - # p_tag['id'] = re.sub(r'\s+|\'', '', f't{self.title.zfill(2)}-{p_tag.get_text()}') - # part_tag = p_tag.find_previous( - # lambda tag: re.search(r'h\d', tag.name) and tag.name != 'h5' and tag.has_attr('class') - # and tag['class'] not in self.headers_class_dict.values()) - # if re.search(r'^\d', p_tag.get_text()): - # chap_id = p_tag.find_previous_sibling(lambda tag: re.search('^[a-zA-Z]', tag.get_text()) - # and tag.name != 'h5' and re.search(r'h\d', - # tag.name)) - # elif part_tag and part_tag.has_attr('class') and part_tag['class'] == 'part_header': - # chap_id = part_tag - # elif not p_tag.has_attr('class') or p_tag['class'] not in self.headers_class_dict.values(): - # chap_id = p_tag.find_previous(lambda tag: tag.name in ['h2', 'h3'] or tag.has_attr('class') and - # tag['class'] in self.headers_class_dict.values()) - # else: - # chap_id = p_tag.find_previous(lambda tag: tag.name in ['h2', 'h3']) - # if chap_id and chap_id.has_attr('id'): - # id_text = re.sub(r'\s|"|\'', '', p_tag.get_text()) - # p_tag['id'] = f'{chap_id["id"]}-{id_text}' - # if self.tag_type_dict.get('part') and key == self.tag_type_dict['part']: - # part_num = re.search(r'^part\s(?P<num>\w+(\.\w+)?)', p_tag.get_text().strip(), re.I).group( - # 'num') - # p_tag['class'] = 'part_header' - # p_tag['id'] = f'{chap_tag["id"]}p{part_num.zfill(2)}' - # if p_tag.get('class') in self.headers_class_dict.values(): - # previous_id_num = 0 - # if previous_h4 := p_tag.findPrevious( - # lambda tag: tag.name == 'h4' and re.search(f"{p_tag['id']}\d+$", tag['id'], re.I)): - # previous_id_num = int(re.search(r'\d+$', previous_h4['id'], re.I).group()) - # p_tag['id'] = f'{p_tag["id"]}{str(previous_id_num + 1).zfill(2)}' - - + elif value == 'h4': + chap_tag = p_tag.find_previous('h2') + if self.headers_class_dict.get(p_tag.get_text()): + p_tag['class'] = self.headers_class_dict.get(p_tag.get_text()) + p_tag['id'] = re.sub(r'\s+|\'', '', f't{self.title.zfill(2)}-{p_tag.get_text()}') + part_tag = p_tag.find_previous( + lambda tag: re.search(r'h\d', tag.name) and tag.name != 'h5' and tag.has_attr('class') + and tag['class'] not in self.headers_class_dict.values()) + if re.search(r'^\d', 
p_tag.get_text()): + chap_id = p_tag.find_previous_sibling(lambda tag: re.search('^[a-zA-Z]', tag.get_text()) + and tag.name != 'h5' and re.search(r'h\d', + tag.name)) + elif part_tag and part_tag.has_attr('class') and part_tag['class'] == 'part_header': + chap_id = part_tag + elif not p_tag.has_attr('class') or p_tag['class'] not in self.headers_class_dict.values(): + chap_id = p_tag.find_previous(lambda tag: tag.name in ['h2', 'h3'] or tag.has_attr('class') and + tag['class'] in self.headers_class_dict.values()) + else: + chap_id = p_tag.find_previous(lambda tag: tag.name in ['h2', 'h3']) + if chap_id and chap_id.has_attr('id'): + id_text = re.sub(r'\s|"|\'', '', p_tag.get_text()) + p_tag['id'] = f'{chap_id["id"]}-{id_text}' + if self.tag_type_dict.get('part') and key == self.tag_type_dict['part']: + part_num = re.search(r'^part\s(?P<num>\w+(\.\w+)?)', p_tag.get_text().strip(), re.I).group( + 'num') + p_tag['class'] = 'part_header' + p_tag['id'] = f'{chap_tag["id"]}p{part_num.zfill(2)}' + if p_tag.get('class') in self.headers_class_dict.values(): + previous_id_num = 0 + if previous_h4 := p_tag.findPrevious( + lambda tag: tag.name == 'h4' and re.search(f"{p_tag['id']}\d+$", tag['id'], re.I)): + previous_id_num = int(re.search(r'\d+$', previous_h4['id'], re.I).group()) + p_tag['id'] = f'{p_tag["id"]}{str(previous_id_num + 1).zfill(2)}' elif value == 'h5': + if re.search(r'\w+', p_tag.get_text()): # break_span = self.soup.new_tag('span', Class='headbreak') @@ -296,25 +309,29 @@ def replace_tags(self): p_tag.insert_before(watermark_p) title_tag = p_tag else: + p_tag.name = 'h5' cur_id_list = [] + h4_count = 1 + head4_data_list = ['Law reviews. —', 'Cross references. —', 'Editor’s notes. —', + 'JUDICIAL DECISIONS', 'OPINIONS OF THE ATTORNEY GENERAL', 'RESEARCH REFERENCES'] for tag in self.soup.find_all("h4"): if re.search(r'\. 
—$', tag.get_text()) or tag.get_text().isupper(): - h4_text = re.sub(r'\s+','',tag.get_text()).lower() - h4_id = f'{tag.find_previous({"h3","h2","h1"}).get("id")}-{h4_text}' + h4_text = re.sub(r'[\s’,“]+', '', tag.get_text()).lower() + h4_id = f'{tag.find_previous({"h3", "h2", "h1"}).get("id")}-{h4_text}' if h4_id in cur_id_list: - tag["id"] = f'{h4_id}.1' + tag["id"] = f'{h4_id}.{h4_count}' + h4_count += 1 else: tag["id"] = f'{h4_id}' + h4_count = 1 cur_id_list.append(tag["id"]) else: tag.name = "p" - - - + del tag['id'] stylesheet_link_tag = self.soup.new_tag('link') stylesheet_link_tag.attrs = {'rel': 'stylesheet', 'type': 'text/css', @@ -325,8 +342,6 @@ def replace_tags(self): chap_nav.insert(0, watermark_p) chap_nav.insert(1, title_tag) - - print('tags replaced') def convert_paragraph_to_alphabetical_ol_tags(self): @@ -343,6 +358,7 @@ def convert_paragraph_to_alphabetical_ol_tags(self): cap_alpha_ol = self.soup.new_tag("ol", type="A") inner_ol = self.soup.new_tag("ol", type="i") roman_ol = self.soup.new_tag("ol", type="I") + cap_roman_ol = self.soup.new_tag("ol", type="I") num_ol = self.soup.new_tag("ol") previous_alpha_li = None previous_num_li = None @@ -355,226 +371,289 @@ def convert_paragraph_to_alphabetical_ol_tags(self): sec_sub_li = None sub_alpha_ol = None prev_chap_id = None - for p_tag in self.soup.findAll('p', {'class': self.tag_type_dict['ol_p']}): - if not re.search('\w+', p_tag.get_text()): - continue - chap_id = p_tag.findPrevious(lambda tag: tag.name in ['h2', 'h3']) - - sec_id = chap_id["id"] - if sec_id != prev_chap_id: - ol_count = 0 - prev_chap_id = sec_id - set_string = True - data_str = p_tag.get_text() - p_tag.string = data_str - if re.search(rf'^\({main_sec_alpha}\)', data_str): - cap_alpha = 'A' - sec_sub_ol = None - p_tag.name = 'li' - previous_alpha_li = p_tag - if main_sec_alpha == 'a': - ol_count += 1 - p_tag.wrap(alpha_ol) - else: - alpha_ol.append(p_tag) - num_ol = self.soup.new_tag("ol") - previous_num_li = None - previous_inner_li = None - ol_head = 1 - alpha_li_id = f'{sec_id}ol{ol_count}{main_sec_alpha}' - p_tag['id'] = alpha_li_id - main_sec_alpha = chr(ord(main_sec_alpha) + 1) - if re.search(r'^\(\w\)\s?\(\d\)', data_str): - li_num = re.search(r'^\(\w\)\s?\((?P<num>\d)\)', data_str).group('num') - p_tag.string = re.sub(r'^\(\w+\)', '', p_tag.text.strip()) - new_li = self.soup.new_tag('p') - new_li.string = re.sub(r'^\(\w\)\s?\(\d\)', '', data_str) - p_tag.string.replace_with(new_li) - new_li.wrap(num_ol) - new_li.name = 'li' - previous_num_li = new_li - cap_alpha_ol = self.soup.new_tag("ol", type="A") - set_string = False - ol_head += 1 - num_li_id = f'{alpha_li_id}{li_num}' - new_li['id'] = num_li_id - if re.search(r'^\(\w\)\s?\(\d\)\s?\(\w\)', data_str): - li_alpha = re.search(r'^\(\w\)\s?\(\d\)\s?\((?P<alpha>\w)\)', data_str).group('alpha') - new_li = self.soup.new_tag('p') - new_li.string = re.sub(r'^\(\w+\)\s?\(\d\)\s?\(\w\)', '', data_str) - previous_num_li.string.replace_with(new_li) - new_li.wrap(cap_alpha_ol) - new_li.name = 'li' - previous_inner_li = new_li - inner_ol = self.soup.new_tag("ol", type="i") - new_li['id'] = f'{num_li_id}{li_alpha}' - if cap_alpha == 'Z': - cap_alpha = 'A' - else: - cap_alpha = chr(ord(cap_alpha) + 1) + cur_head_list = [] - elif re.search(r'^\([IVX]+\)', p_tag.text.strip()) and cap_alpha not in ['I','V','X']: - p_tag.name = "li" - - if re.search(r'^\(I\)', p_tag.text.strip()): - cap_roman_ol = self.soup.new_tag("ol", type="I") - p_tag.wrap(cap_roman_ol) - prev_rom_id = p_tag.find_previous("li").get("id") - 
p_tag.find_previous("li").append(cap_roman_ol) - else: - print(p_tag) - cap_roman_ol.append(p_tag) + # for p_tag in self.soup.findAll('p', {'class': self.tag_type_dict['ol_p']}): + for p_tag in self.soup.findAll(): - rom_head = re.search(r'^\((?P<rom>[IVX]+)\)', p_tag.text.strip()) - p_tag["id"] = f'{prev_rom_id}{rom_head.group("rom")}' - p_tag.string = re.sub(r'^\([IVX]+\)', '', p_tag.text.strip()) - - elif re.search(r'^\(\w+(\.\d)?\)', p_tag.text.strip()): - if re.search(r'^\(\d+\.\d\)', p_tag.text.strip()): - if previous_num_li: - previous_num_li.append(p_tag) + if p_tag.get("class") == [self.tag_type_dict['ol_p']]: + if not re.search('\w+', p_tag.get_text()): continue - - if re.search(rf'^\({ol_head}\)', p_tag.text.strip()): - cap_alpha = "A" - incr_ol_count = False - if previous_alpha_li: - previous_alpha_li.append(p_tag) - previous_num_li = p_tag - p_tag.name = "li" - if ol_head == 1: - incr_ol_count = True - p_tag.wrap(num_ol) + chap_id = p_tag.findPrevious(lambda tag: tag.name in ['h2', 'h3', 'h1']) + + sec_id = chap_id["id"] + if sec_id != prev_chap_id: + ol_count = 0 + prev_chap_id = sec_id + set_string = True + data_str = p_tag.get_text() + p_tag.string = data_str + if re.search(rf'^\({main_sec_alpha}\)', data_str): + cap_alpha = 'A' + sec_sub_ol = None + p_tag.name = 'li' + previous_alpha_li = p_tag + if main_sec_alpha == 'a': + ol_count += 1 + p_tag.wrap(alpha_ol) else: - num_ol.append(p_tag) - cap_alpha_ol = self.soup.new_tag("ol", type="A") + alpha_ol.append(p_tag) + num_ol = self.soup.new_tag("ol") + previous_num_li = None previous_inner_li = None - if alpha_li_id: - num_li_id = f'{alpha_li_id}{ol_head}' - else: - if incr_ol_count: - ol_count += 1 - num_li_id = f'{sec_id}ol{ol_count}{ol_head}' - p_tag['id'] = num_li_id - ol_head += 1 - if re.search(r'^\(\d+\)\s?\(\w+\)', p_tag.text.strip()): - li_alpha = re.search(r'^\(\d+\)\s?\((?P<alpha>\w+)\)', p_tag.text.strip()).group('alpha') + ol_head = 1 + alpha_li_id = f'{sec_id}ol{ol_count}{main_sec_alpha}' + p_tag['id'] = alpha_li_id + main_sec_alpha = chr(ord(main_sec_alpha) + 1) + if re.search(r'^\(\w\)\s?\(\d\)', data_str): + li_num = re.search(r'^\(\w\)\s?\((?P<num>\d)\)', data_str).group('num') + p_tag.string = re.sub(r'^\(\w+\)', '', p_tag.text.strip()) new_li = self.soup.new_tag('p') - new_li.string = re.sub(r'^\(\d+\)\s?\(\w+\)', '', p_tag.text.strip()) + new_li.string = re.sub(r'^\(\w\)\s?\(\d\)', '', data_str) p_tag.string.replace_with(new_li) - new_li.wrap(cap_alpha_ol) + new_li.wrap(num_ol) new_li.name = 'li' - previous_inner_li = new_li + previous_num_li = new_li + cap_alpha_ol = self.soup.new_tag("ol", type="A") set_string = False - inner_ol = self.soup.new_tag("ol", type="i") - cap_alpha_li_id = f'{num_li_id}{li_alpha}' - new_li['id'] = f'{num_li_id}{li_alpha}' - if cap_alpha == 'Z': - cap_alpha = 'A' - else: - cap_alpha = chr(ord(cap_alpha) + 1) - if re.search(r'^\(\d+\)\s?\([A-Z]\)\s?\(\w+\)', data_str): - li_roman = re.search(r'^\(\d+\)\s?\([A-Z]\)\s?\((?P<roman>\w+)\)', data_str).group('roman') + ol_head += 1 + num_li_id = f'{alpha_li_id}{li_num}' + new_li['id'] = num_li_id + if re.search(r'^\(\w\)\s?\(\d\)\s?\(\w\)', data_str): + li_alpha = re.search(r'^\(\w\)\s?\(\d\)\s?\((?P<alpha>\w)\)', data_str).group('alpha') new_li = self.soup.new_tag('p') - new_li.string = re.sub(r'^\(\d+\)\s?\([A-Z]\)\s?\(\w+\)', '', data_str) - p_tag.string.replace_with(new_li) - new_li.wrap(inner_ol) + new_li.string = re.sub(r'^\(\w+\)\s?\(\d\)\s?\(\w\)', '', data_str) + previous_num_li.string.replace_with(new_li) + 
new_li.wrap(cap_alpha_ol) new_li.name = 'li' - set_string = False - small_roman_id = f'{cap_alpha_li_id}{li_roman}' - new_li['id'] = small_roman_id - previous_roman_li = new_li - elif re.search(r'^\(\d+\)', p_tag.text.strip()) and sec_sub_ol: - digit = re.search(r'^\((?P<sec_digit>\d+)\)', data_str).group('sec_digit') - sec_sub_li = self.soup.new_tag('li') - sec_sub_li.string = re.sub(r'^\(\w+\)', '', p_tag.text.strip()) - sec_sub_li['id'] = f"{sub_ol_id}{digit}" - sec_sub_ol.append(sec_sub_li) - sub_alpha_ol = self.soup.new_tag('ol', type='A') - sec_sub_li.append(sub_alpha_ol) - p_tag.decompose() - continue - elif previous_num_li: - if cap_alpha_match := re.search(fr'^\({cap_alpha}+\)|(^\([A-Z]+(\.\d+)?\))', p_tag.text.strip()): - li_alpha = re.search(r'^\((?P<alpha>\w+(\.\d+)?)\)', data_str).group('alpha') - previous_num_li.append(p_tag) - p_tag.name = 'li' - previous_roman_li = None - if sec_sub_ol: - p_tag['id'] = f'{sec_sub_li["id"]}{li_alpha}' - if re.search(r'\d+', cap_alpha_match.group(0)): - p_tag.name = 'p' - previous_inner_li.apend(p_tag) + previous_inner_li = new_li + inner_ol = self.soup.new_tag("ol", type="i") + new_li['id'] = f'{num_li_id}{li_alpha}' + if cap_alpha == 'Z': + cap_alpha = 'A' else: - sub_alpha_ol.append(p_tag) + cap_alpha = chr(ord(cap_alpha) + 1) + + # elif re.search(r'^\([IVX]+\)', p_tag.text.strip()) and cap_alpha not in ['I', 'V', 'X']: + elif re.search(r'^\([IVX]+\)', p_tag.text.strip()): + p_tag.name = "li" + + if re.search(r'^\(I\)', p_tag.text.strip()): + cap_roman_ol = self.soup.new_tag("ol", type="I") + p_tag.wrap(cap_roman_ol) + prev_rom_id = p_tag.find_previous("li").get("id") + p_tag.find_previous("li").append(cap_roman_ol) + else: + cap_roman_ol.append(p_tag) + + rom_head = re.search(r'^\((?P<rom>[IVX]+)\)', p_tag.text.strip()) + + p_tag["id"] = f'{prev_rom_id}{rom_head.group("rom")}' + p_tag.string = re.sub(r'^\([IVX]+\)', '', p_tag.text.strip()) + + elif re.search(r'^\(\w+(\.\d)?\)', p_tag.text.strip()): + if re.search(r'^\(\d+\.\d\)', p_tag.text.strip()): + if previous_num_li: + previous_num_li.append(p_tag) + continue + + if re.search(rf'^\({ol_head}\)', p_tag.text.strip()): + cap_alpha = "A" + incr_ol_count = False + if previous_alpha_li: + previous_alpha_li.append(p_tag) + previous_num_li = p_tag + p_tag.name = "li" + if ol_head == 1: + incr_ol_count = True + p_tag.wrap(num_ol) else: - if re.search(r'\d+', cap_alpha_match.group(0)): - p_tag.name = 'p' - previous_inner_li.insert(len(previous_inner_li.contents), p_tag) - else: - p_tag.wrap(cap_alpha_ol) - previous_inner_li = p_tag - inner_ol = self.soup.new_tag("ol", type="i") - cap_alpha_li_id = f'{num_li_id}{li_alpha}' - p_tag['id'] = cap_alpha_li_id - if re.search(r'^\([A-Z]\)\s?\(\w+\)', p_tag.text.strip()): - li_roman = re.search(r'^\([A-Z]\)\s?\((?P<roman>\w+)\)', data_str).group('roman') + num_ol.append(p_tag) + cap_alpha_ol = self.soup.new_tag("ol", type="A") + previous_inner_li = None + if alpha_li_id: + num_li_id = f'{alpha_li_id}{ol_head}' + else: + if incr_ol_count: + ol_count += 1 + num_li_id = f'{sec_id}ol{ol_count}{ol_head}' + p_tag['id'] = num_li_id + ol_head += 1 + if re.search(r'^\(\d+\)\s?\(\w+\)', p_tag.text.strip()): + li_alpha = re.search(r'^\(\d+\)\s?\((?P<alpha>\w+)\)', p_tag.text.strip()).group('alpha') new_li = self.soup.new_tag('p') - new_li.string = re.sub(r'^\([A-Z]\)\s?\(\w+\)', '', p_tag.text.strip()) + new_li.string = re.sub(r'^\(\d+\)\s?\(\w+\)', '', p_tag.text.strip()) p_tag.string.replace_with(new_li) - new_li.wrap(inner_ol) + new_li.wrap(cap_alpha_ol) 
new_li.name = 'li' + previous_inner_li = new_li set_string = False - small_roman_id = f'{cap_alpha_li_id}{li_roman}' - p_tag['id'] = small_roman_id - previous_roman_li = new_li - if cap_alpha == 'Z': - cap_alpha = 'A' - elif not re.search(r'\d+', cap_alpha_match.group(0)): - cap_alpha = chr(ord(cap_alpha) + 1) - elif previous_inner_li: - if re.search(r'^\([a-z]+\)', p_tag.text.strip()): - li_roman = re.search(r'^\((?P<roman>\w+)\)', data_str).group('roman') - previous_inner_li.append(p_tag) + inner_ol = self.soup.new_tag("ol", type="i") + cap_alpha_li_id = f'{num_li_id}{li_alpha}' + new_li['id'] = f'{num_li_id}{li_alpha}' + if cap_alpha == 'Z': + cap_alpha = 'A' + else: + cap_alpha = chr(ord(cap_alpha) + 1) + if re.search(r'^\(\d+\)\s?\([A-Z]\)\s?\(\w+\)', data_str): + li_roman = re.search(r'^\(\d+\)\s?\([A-Z]\)\s?\((?P<roman>\w+)\)', data_str).group( + 'roman') + new_li = self.soup.new_tag('p') + new_li.string = re.sub(r'^\(\d+\)\s?\([A-Z]\)\s?\(\w+\)', '', data_str) + p_tag.string.replace_with(new_li) + new_li.wrap(inner_ol) + new_li.name = 'li' + set_string = False + small_roman_id = f'{cap_alpha_li_id}{li_roman}' + new_li['id'] = small_roman_id + previous_roman_li = new_li + elif re.search(r'^\(\d+\)', p_tag.text.strip()) and sec_sub_ol: + digit = re.search(r'^\((?P<sec_digit>\d+)\)', data_str).group('sec_digit') + sec_sub_li = self.soup.new_tag('li') + sec_sub_li.string = re.sub(r'^\(\w+\)', '', p_tag.text.strip()) + sec_sub_li['id'] = f"{sub_ol_id}{digit}" + sec_sub_ol.append(sec_sub_li) + sub_alpha_ol = self.soup.new_tag('ol', type='A') + sec_sub_li.append(sub_alpha_ol) + p_tag.decompose() + continue + elif previous_num_li: + if cap_alpha_match := re.search(fr'^\({cap_alpha}+\)|(^\([A-Z]+(\.\d+)?\))', + p_tag.text.strip()): + li_alpha = re.search(r'^\((?P<alpha>\w+(\.\d+)?)\)', data_str).group('alpha') + previous_num_li.append(p_tag) p_tag.name = 'li' - p_tag.wrap(inner_ol) - roman_ol = self.soup.new_tag("ol", type="I") - small_roman_id = f'{cap_alpha_li_id}{li_roman}' - p_tag['id'] = small_roman_id - previous_roman_li = p_tag - elif sub_sec_match := re.search(r'^\(\w\.\d\)\s?\((?P<sec_digit>\d+)\)', p_tag.text.strip()): - digit_match = re.search(r'^\(\w\.(?P<digit>\d+)\)\s?\((?P<sec_digit>\d+)\)', - p_tag.text.strip()) - sub_ol = self.soup.new_tag('ol', Class="sub_ol") - sub_ol_id = f"{cap_alpha_li_id}.{digit_match.group('digit')}" - sub_li = self.soup.new_tag('li') - sub_li.string = sub_sec_match.group() - sub_li['id'] = sub_ol_id - sub_ol.append(sub_li) - sec_sub_ol = self.soup.new_tag('ol') - sub_li.append(sec_sub_ol) - sec_sub_li = self.soup.new_tag('li') - sec_sub_li.string = re.sub(r'^\(\w\.\d+\)\s?\(\d+\)', '', data_str) - sec_sub_li['id'] = f"{sub_ol_id}{digit_match.group('sec_digit')}" - sec_sub_ol.append(sec_sub_li) - sub_alpha_ol = self.soup.new_tag('ol', type='A') - sec_sub_li.append(sub_alpha_ol) - previous_alpha_li.insert(len(previous_alpha_li.contents), sub_ol) - p_tag.decompose() - elif previous_roman_li: - if re.search(r'^\([A-Z]+\)', p_tag.text.strip()): + previous_roman_li = None + if sec_sub_ol: + p_tag['id'] = f'{sec_sub_li["id"]}{li_alpha}' + if re.search(r'\d+', cap_alpha_match.group(0)): + p_tag.name = 'p' + previous_inner_li.apend(p_tag) + else: + sub_alpha_ol.append(p_tag) + else: + if re.search(r'\d+', cap_alpha_match.group(0)): + p_tag.name = 'p' + previous_inner_li.insert(len(previous_inner_li.contents), p_tag) + else: + p_tag.wrap(cap_alpha_ol) + previous_inner_li = p_tag + inner_ol = self.soup.new_tag("ol", type="i") + cap_alpha_li_id = 
f'{num_li_id}{li_alpha}' + p_tag['id'] = cap_alpha_li_id + if re.search(r'^\([A-Z]\)\s?\(\w+\)', p_tag.text.strip()): + li_roman = re.search(r'^\([A-Z]\)\s?\((?P<roman>\w+)\)', data_str).group('roman') + new_li = self.soup.new_tag('p') + new_li.string = re.sub(r'^\([A-Z]\)\s?\(\w+\)', '', p_tag.text.strip()) + p_tag.string.replace_with(new_li) + new_li.wrap(inner_ol) + new_li.name = 'li' + set_string = False + small_roman_id = f'{cap_alpha_li_id}{li_roman}' + p_tag['id'] = small_roman_id + previous_roman_li = new_li + if cap_alpha == 'Z': + cap_alpha = 'A' + elif not re.search(r'\d+', cap_alpha_match.group(0)): + cap_alpha = chr(ord(cap_alpha) + 1) + elif previous_inner_li: + if re.search(r'^\([a-z]+\)', p_tag.text.strip()): li_roman = re.search(r'^\((?P<roman>\w+)\)', data_str).group('roman') - previous_roman_li.append(p_tag) + previous_inner_li.append(p_tag) p_tag.name = 'li' - p_tag.wrap(roman_ol) - p_tag['id'] = f'{small_roman_id}{li_roman}' - else: - previous_inner_li.insert(len(previous_num_li.contents), p_tag) + p_tag.wrap(inner_ol) + roman_ol = self.soup.new_tag("ol", type="I") + small_roman_id = f'{cap_alpha_li_id}{li_roman}' + p_tag['id'] = small_roman_id + previous_roman_li = p_tag + elif sub_sec_match := re.search(r'^\(\w\.\d\)\s?\((?P<sec_digit>\d+)\)', + p_tag.text.strip()): + digit_match = re.search(r'^\(\w\.(?P<digit>\d+)\)\s?\((?P<sec_digit>\d+)\)', + p_tag.text.strip()) + sub_ol = self.soup.new_tag('ol', Class="sub_ol") + sub_ol_id = f"{cap_alpha_li_id}.{digit_match.group('digit')}" + sub_li = self.soup.new_tag('li') + sub_li.string = sub_sec_match.group() + sub_li['id'] = sub_ol_id + sub_ol.append(sub_li) + sec_sub_ol = self.soup.new_tag('ol') + sub_li.append(sec_sub_ol) + sec_sub_li = self.soup.new_tag('li') + sec_sub_li.string = re.sub(r'^\(\w\.\d+\)\s?\(\d+\)', '', data_str) + sec_sub_li['id'] = f"{sub_ol_id}{digit_match.group('sec_digit')}" + sec_sub_ol.append(sec_sub_li) + sub_alpha_ol = self.soup.new_tag('ol', type='A') + sec_sub_li.append(sub_alpha_ol) + previous_alpha_li.insert(len(previous_alpha_li.contents), sub_ol) + p_tag.decompose() + elif previous_roman_li: + if re.search(r'^\([A-Z]+\)', p_tag.text.strip()): + li_roman = re.search(r'^\((?P<roman>\w+)\)', data_str).group('roman') + previous_roman_li.append(p_tag) + p_tag.name = 'li' + p_tag.wrap(roman_ol) + p_tag['id'] = f'{small_roman_id}{li_roman}' + else: + previous_inner_li.insert(len(previous_num_li.contents), p_tag) - elif re.search(r'^\([^\)]{6,}\)', p_tag.get_text(), re.I) or \ - re.search(r'^\d+-\d+-\d+', p_tag.find_previous_sibling().get_text()) or \ - re.search(r'Section \d+\.\s', p_tag.get_text()): + elif re.search(r'^\d+-\d+-\d+', p_tag.find_previous_sibling().get_text().strip()) or \ + re.search(r'^\([^\)]{6,}\)', p_tag.get_text(), re.I) or \ + re.search(r'Section \d+\.\s', p_tag.get_text()) or p_tag.find_previous_sibling().name == "h3": + + ol_head = 1 + main_sec_alpha = 'a' + cap_alpha = "A" + previous_alpha_li = None + previous_num_li = None + previous_inner_li = None + alpha_li_id = None + previous_roman_li = None + sec_sub_ol = None + alpha_ol = self.soup.new_tag("ol", type="a") + num_ol = self.soup.new_tag("ol") + + else: + if re.search(r'^History\. 
—', p_tag.text.strip()): + ol_head = 1 + main_sec_alpha = 'a' + cap_alpha = "A" + previous_alpha_li = None + previous_num_li = None + previous_inner_li = None + alpha_li_id = None + previous_roman_li = None + sec_sub_ol = None + alpha_ol = self.soup.new_tag("ol", type="a") + num_ol = self.soup.new_tag("ol") + # else: + # if previous_inner_li: + # previous_inner_li.append(p_tag) + # elif previous_num_li: + # previous_num_li.append(p_tag) + # elif previous_alpha_li: + # previous_alpha_li.append(p_tag) + # print(p_tag) + # print(p_tag.find_previous_sibling()) + # print(p_tag.find_previous()) + # if re.search(r'^History\. —', p_tag.text.strip()) or \ + # p_tag.find_previous_sibling().name == "h3" or \ + # p_tag.find_previous_sibling().name == "h5": + # ol_head = 1 + # main_sec_alpha = 'a' + # cap_alpha = "A" + # previous_alpha_li = None + # previous_inner_li = None + # alpha_li_id = None + # previous_roman_li = None + # sec_sub_ol = None + # alpha_ol = self.soup.new_tag("ol", type="a") + # num_ol = self.soup.new_tag("ol") + + if set_string: + p_tag.string = re.sub(r'^\(\w+\)', '', p_tag.text.strip()) + + elif p_tag.name in ['h4', 'h3', 'h2']: ol_head = 1 main_sec_alpha = 'a' cap_alpha = "A" @@ -587,15 +666,6 @@ def convert_paragraph_to_alphabetical_ol_tags(self): alpha_ol = self.soup.new_tag("ol", type="a") num_ol = self.soup.new_tag("ol") - else: - if previous_inner_li: - previous_inner_li.append(p_tag) - elif previous_num_li: - previous_num_li.append(p_tag) - elif previous_alpha_li: - previous_alpha_li.append(p_tag) - if set_string: - p_tag.string = re.sub(r'^\(\w+\)', '', p_tag.text.strip()) print('ol tags added') def convert_to_numeric_ol_tags(self): @@ -635,40 +705,104 @@ def create_analysis_nav_tag(self): - create new nav tag and ul tag - append each created li to new ul tag """ - for analysis_p_tag in self.soup.findAll('p', {'class': self.tag_type_dict['normalp']}): - if re.search(r'^Editor\'s notes.+ANALYSIS', analysis_p_tag.get_text(), re.DOTALL): - parent_id = analysis_p_tag.find_previous(lambda tag: tag.name in ['h2', 'h2', 'h3'])['id'] - editors_tag = self.soup.new_tag('p') - editors_header = self.soup.new_tag('h5', Class='ednotes lalign') - editors_header.string = "Editor's notes." 
- editors_header['id'] = f'{parent_id}-ednotes01' - editors_text = re.search(r'^Editor\'s notes\.(?P<text>.+)ANALYSIS', - analysis_p_tag.get_text(), re.DOTALL).group('text') - editors_tag.string = editors_text - analysis_p_tag.insert_before(editors_header) - editors_header.insert_after(editors_tag) - if re.search('<b>ANALYSIS', str(analysis_p_tag)): - p_tag = self.soup.new_tag('p') - p_tag.string = 'ANALYSIS' - p_tag['class'] = 'analysis_nav_header' - nav_tag = self.soup.new_tag('nav') - nav_tag.append(p_tag) - new_ul = self.soup.new_tag("ul", Class="leaders") - ol = self.soup.new_tag("ol") - previous_li = None - for headers_text in analysis_p_tag.get_text().splitlines(): - if not re.search('ANALYSIS|Editor\'s notes', headers_text.strip()) and headers_text.strip(): - new_li = self.soup.new_tag('li') - new_li.string = headers_text - if previous_li and re.search(r'^\d', headers_text.strip()): - previous_li.append(new_li) - ol.insert(len(ol), new_li) + # for analysis_p_tag in self.soup.findAll('p', {'class': self.tag_type_dict['ol_p']}): + # if re.search(r'^Editor\'s notes.+ANALYSIS', analysis_p_tag.get_text(), re.DOTALL): + # parent_id = analysis_p_tag.find_previous(lambda tag: tag.name in ['h2', 'h2', 'h3'])['id'] + # editors_tag = self.soup.new_tag('p') + # editors_header = self.soup.new_tag('h5', Class='ednotes lalign') + # editors_header.string = "Editor's notes." + # editors_header['id'] = f'{parent_id}-ednotes01' + # editors_text = re.search(r'^Editor\'s notes\.(?P<text>.+)ANALYSIS', + # analysis_p_tag.get_text(), re.DOTALL).group('text') + # editors_tag.string = editors_text + # analysis_p_tag.insert_before(editors_header) + # editors_header.insert_after(editors_tag) + # if re.search('<b>ANALYSIS', str(analysis_p_tag)): + # p_tag = self.soup.new_tag('p') + # p_tag.string = 'ANALYSIS' + # p_tag['class'] = 'analysis_nav_header' + # nav_tag = self.soup.new_tag('nav') + # nav_tag.append(p_tag) + # new_ul = self.soup.new_tag("ul", Class="leaders") + # ol = self.soup.new_tag("ol") + # previous_li = None + # for headers_text in analysis_p_tag.get_text().splitlines(): + # if not re.search('ANALYSIS|Editor\'s notes', headers_text.strip()) and headers_text.strip(): + # new_li = self.soup.new_tag('li') + # new_li.string = headers_text + # if previous_li and re.search(r'^\d', headers_text.strip()): + # previous_li.append(new_li) + # ol.insert(len(ol), new_li) + # else: + # new_ul.insert(len(new_ul), new_li) + # previous_li = new_li + # ol = self.soup.new_tag("ol") + # nav_tag.append(new_ul) + # analysis_p_tag.replace_with(nav_tag) + # + # + + a_tag_list = [] + for analysis_p_tag in self.soup.findAll('p', {'class': self.tag_type_dict['ol_p']}): + if re.search(r'^Analysis', analysis_p_tag.text.strip()): + for a_tag in analysis_p_tag.find_next_siblings(): + if a_tag.get("class") == [self.tag_type_dict['ol_p']]: + a_tag.name = "li" + a_tag_text = re.sub(r'[\W_]+', '', a_tag.text.strip()).strip().lower() + a_tag_list.append(a_tag_text) + if re.search(r'^\d+\.', a_tag.text.strip()): + if re.search(r'^1\.', a_tag.text.strip()): + innr_ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + a_tag.wrap(innr_ul_tag) + a_tag.find_previous("li").append(innr_ul_tag) + else: + innr_ul_tag.append(a_tag) + analysishead_num_tag_id = f"{analysishead_tag_id}-{a_tag_text}" + a_tag_id = f"{analysishead_tag_id}-{a_tag_text}" + + elif re.search(r'^[a-z]\.', a_tag.text.strip()): + if re.search(r'^a\.', a_tag.text.strip()): + innr_alpha_ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + 
a_tag.wrap(innr_alpha_ul_tag) + a_tag.find_previous("li").append(innr_alpha_ul_tag) + else: + innr_alpha_ul_tag.append(a_tag) + a_tag_id = f"{analysishead_num_tag_id}-{a_tag_text}" + else: - new_ul.insert(len(new_ul), new_li) - previous_li = new_li - ol = self.soup.new_tag("ol") - nav_tag.append(new_ul) - analysis_p_tag.replace_with(nav_tag) + + if a_tag.find_previous().name == "a": + ul_tag.append(a_tag) + + else: + ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + a_tag.wrap(ul_tag) + analysishead_tag_id = f"#{a_tag.find_previous('h4').get('id')}-{a_tag_text}" + a_tag_id = f"#{a_tag.find_previous('h3').get('id')}-judicialdecisions-{a_tag_text}" + + anchor = self.soup.new_tag('a', href=a_tag_id) + anchor.string = a_tag.text + a_tag.string = '' + a_tag.append(anchor) + + elif a_tag.get("class") == [self.tag_type_dict['head4']]: + break + + for analysis_head_tag in self.soup.findAll('p', {'class': self.tag_type_dict['head4']}): + a_head_text = re.sub(r'[\W_]+', '', analysis_head_tag.text.strip()).lower() + + if a_head_text in a_tag_list: + analysis_head_tag.name = "h5" + if re.search(r'^\d+\.', analysis_head_tag.text.strip()): + analysis_num_tag_id = f'{analysis_head_tag_id}-{a_head_text}' + analysis_head_tag['id'] = f'{analysis_head_tag_id}-{a_head_text}' + elif re.search(r'^[a-z]\.', analysis_head_tag.text.strip()): + analysis_head_tag['id'] = f'{analysis_num_tag_id}-{a_head_text}' + else: + analysis_head_tag_id = f'{analysis_head_tag.find_previous("h3").get("id")}-judicialdecisions-{a_head_text}' + analysis_head_tag[ + 'id'] = f'{analysis_head_tag.find_previous("h3").get("id")}-judicialdecisions-{a_head_text}' + print('created analysis tag') def remove_or_replace_class_names(self): @@ -697,12 +831,14 @@ def remove_or_replace_class_names(self): if tag.attrs.get('http-equiv') == 'Content-Style-Type': tag.decompose() continue + print(tag) self.meta_tags.append(tag) elif tag.name == 'br': if not tag.parent or tag in tag.parent.contents: tag.decompose() continue elif re.search(r'^§+$', tag.get_text()): + tag.unwrap() del tag["class"] continue @@ -850,6 +986,9 @@ def add_anchor_tags(self): - add a property called 'aria-describedby' with value same as previously built reference link """ self.soup = BeautifulSoup(self.soup.prettify(formatter=None), features='lxml') + id_num = 0 + li_num = 0 + arnum = 1 for ul in self.soup.findAll('nav'): id_num = 0 li_num = 0 @@ -923,24 +1062,24 @@ def add_anchor_tags(self): li.append(anchor) for li in self.soup.find_all("li"): - if re.search(r'^\d+\w?\.?\s', li.text.strip()): - first_nav = self.soup.find("ul") - if not li.has_attr('id'): - chap_no = re.search('^\d+\w?', li.get_text().strip()).group() - header_id = f'#t{self.title.zfill(2)}c{chap_no.zfill(2)}' - anchor = self.soup.new_tag('a', href=header_id) - cleansed_header_id = header_id.strip("#") - anchor.attrs['aria-describedby'] = cleansed_header_id - li['id'] = f'{cleansed_header_id}-cnav{str(li_num).zfill(2)}' - anchor.string = li.text - if li.string: - li.string.replace_with(anchor) - else: - li.contents = [] - li.append(anchor) - - - elif re.search(r'^CHAPTER \d+[A-Z]*', li.text.strip()): + # if re.search(r'^\d+\w?\.?\s', li.text.strip()): + # first_nav = self.soup.find("ul") + # print(li) + # if not li.has_attr('id'): + # chap_no = re.search('^\d+\w?', li.get_text().strip()).group() + # header_id = f'#t{self.title.zfill(2)}c{chap_no.zfill(2)}' + # anchor = self.soup.new_tag('a', href=header_id) + # cleansed_header_id = header_id.strip("#") + # anchor.attrs['aria-describedby'] = 
cleansed_header_id + # li['id'] = f'{cleansed_header_id}-cnav{str(li_num).zfill(2)}' + # anchor.string = li.text + # if li.string: + # li.string.replace_with(anchor) + # else: + # li.contents = [] + # li.append(anchor) + + if re.search(r'^CHAPTER \d+[A-Z]*', li.text.strip()): chap_no = re.search('^CHAPTER (?P<cno>\d+[A-Z]*)', li.get_text().strip()).group('cno') header_id = f'#t{self.title.zfill(2)}c{chap_no.zfill(2)}' anchor = self.soup.new_tag('a', href=header_id) @@ -954,10 +1093,23 @@ def add_anchor_tags(self): li.contents = [] li.append(anchor) + elif re.search(r'^Article \d+[A-Z]*', li.text.strip()): + chap_no = re.search('^Article (?P<cno>\d+[A-Z]*)', li.get_text().strip()).group('cno') + header_id = f'#{li.find_previous({"h2", "h1"}).get("id")}a{chap_no.zfill(2)}' + anchor = self.soup.new_tag('a', href=header_id) + cleansed_header_id = header_id.strip("#") + anchor.attrs['aria-describedby'] = cleansed_header_id + li['id'] = f'{cleansed_header_id}-anav{str(li_num).zfill(2)}' + anchor.string = li.text + if li.string: + li.string.replace_with(anchor) + else: + li.contents = [] + li.append(anchor) elif re.search(r'^Subpart \d+[A-Z]*', li.text.strip()): chap_no = re.search('^Subpart (?P<cno>\d+[A-Z]*)', li.get_text().strip()).group('cno') - header_id = f'#{li.find_previous("h2",class_="parth2").get("id")}s{chap_no}' + header_id = f'#{li.find_previous("h2", class_="parth2").get("id")}s{chap_no}' anchor = self.soup.new_tag('a', href=header_id) cleansed_header_id = header_id.strip("#") anchor.attrs['aria-describedby'] = cleansed_header_id @@ -981,11 +1133,37 @@ def add_anchor_tags(self): else: li.contents = [] li.append(anchor) + elif re.search(r'^APPENDIXRULES', li.get_text().strip()): + chap_no = re.sub(r'\W+', '', li.get_text().strip()).lower() + header_id = f'#t{self.title}apr{chap_no}' + anchor = self.soup.new_tag('a', href=header_id) + cleansed_header_id = header_id.strip("#") + anchor.attrs['aria-describedby'] = cleansed_header_id + li['id'] = f'{cleansed_header_id}-cnav{str(li_num).zfill(2)}' + anchor.string = li.text + if li.string: + li.string.replace_with(anchor) + else: + li.contents = [] + li.append(anchor) + elif re.search(r'^Rule \d+(-\d+-\.\d+)*(\s\(\d+\))*\.', li.get_text().strip()): + chap_no = re.search(r'^Rule (?P<r_id>\d+(-\d+-\.\d+)*(\s\(\d+\))*)\.', li.get_text().strip()).group( + "r_id") + header_id = f'#{li.find_previous("h2", class_="apdxrules").get("id")}r{chap_no.zfill(2)}' + anchor = self.soup.new_tag('a', href=header_id) + cleansed_header_id = header_id.strip("#") + anchor.attrs['aria-describedby'] = cleansed_header_id + li['id'] = f'{cleansed_header_id}-arnav{arnum:02}' - print('added anchor tags') - - + anchor.string = li.text + if li.string: + li.string.replace_with(anchor) + else: + li.contents = [] + li.append(anchor) + arnum += 1 + print('added anchor tags') def clean_html_and_add_cite(self): """ @@ -1039,8 +1217,9 @@ def clean_html_and_add_cite(self): if ol_reg := re.search(r'(\(\w+\))+', match.strip()): ol_num = re.sub(r'\(|\)', '', ol_reg.group()) a_id = f'{a_id}ol1{ol_num}' + text = re.sub(fr'\s{re.escape(match)}', - f'<cite class="ocga"><a href="{a_id}" target="{target}">{match}</a></cite>', inside_text, + f' <cite class="ocga"><a href="{a_id}" target="{target}">{match}</a></cite>', inside_text, re.I) tag.append(text) @@ -1056,6 +1235,7 @@ def clean_html_and_add_cite(self): main_tag = self.soup.new_tag('main') chap_nav = self.soup.find('nav') tag_to_wrap = chap_nav.find_next_sibling() + while True: next_tag = tag_to_wrap.find_next_sibling() 
main_tag.append(tag_to_wrap) @@ -1064,6 +1244,7 @@ def clean_html_and_add_cite(self): break tag_to_wrap = next_tag + def write_soup_to_file(self): """ - add the space before self closing meta tags @@ -1072,7 +1253,6 @@ def write_soup_to_file(self): """ soup_str = str(self.soup.prettify(formatter=None)) - for tag in self.meta_tags: cleansed_tag = re.sub(r'/>', ' />', str(tag)) soup_str = re.sub(rf'{tag}', rf'{cleansed_tag}', soup_str, re.I) @@ -1080,7 +1260,132 @@ def write_soup_to_file(self): # html5validate.validate(soup_str) with open(f"../../cic-code-ga/transforms/ga/ocga/r{self.release_number}/{self.html_file_name}", "w") as file: # file.write(soup_str) - file.write(soup_str.replace('<br/>', '<br />')) + soup_str = re.sub(r'&(?!amp;)', '&', soup_str) + soup_str = re.sub(r'<br/>', '<br />', soup_str) + file.write(soup_str) + + # def replace_tag_names_constitution(self): + # """ + # - create dictionary with class names as keys with associated tag name as its value + # - find all the tags in html with specified class names from dict + # and replace tag with associated tag name (p1 -> h1) + # - based on tag name find or build id for that tag + # - create watermark tag and append it with h1 to first nav tag + # """ + # tag_dict = {"h2": self.tag_type_dict['head2'], + # "h3": self.tag_type_dict['head3'], + # "h4": self.tag_type_dict['head4'], "li": self.tag_type_dict['ul']} + # + # for key, value in tag_dict.items(): + # amendment_num = 0 + # ul = self.soup.new_tag("ul", Class="leaders") + # while True: + # p_tag = self.soup.find('p', {"class": value}) + # if not p_tag: + # break + # p_tag.name = key + # + # if value == self.tag_type_dict['ul']: + # if p_tag.findPrevious().name != 'li': + # p_tag.wrap(ul) + # elif p_tag.findPrevious().name == 'li': + # ul.append(p_tag) + # if p_tag.findNext().has_attr('class') and \ + # p_tag.findNext()['class'][0] != self.tag_type_dict['ul']: + # new_nav = self.soup.new_tag('nav') + # ul.wrap(new_nav) + # ul = self.soup.new_tag("ul", Class="leaders") + # if key == 'h2': + # if chap_section_regex := re.search(r'^(ARTICLE|section)\s(?P<chap>\w+)\.', + # p_tag.get_text().strip(), re.I): + # if re.search('^section', p_tag.get_text().strip(), re.I): + # p_tag.name = 'h3' + # parent = p_tag.find_previous_sibling(lambda tag: tag.name == 'h2') + # p_tag['id'] = f"{parent['id']}s{chap_section_regex.group('chap')}" + # else: + # p_tag['id'] = f"{self.title}-a{chap_section_regex.group('chap')}" + # elif re.search('amendments', p_tag.get_text().strip(), re.I): + # amendment_num += 1 + # p_tag['id'] = f"{self.title}-amendment{str(amendment_num).zfill(2)}" + # elif key == 'h3': + # if chap_section_regex := re.search(r'(Paragraph|Section)\s(?P<sec>\w+(-\w+)?)\.', + # p_tag.get_text().strip(), re.I): + # if re.search('paragraph', p_tag.get_text().strip(), re.I): + # p_tag['class'] = 'paragraph_head' + # + # parent = p_tag.find_previous_sibling(lambda tag: tag.name in 'h3' + # and not re.search('paragraph', + # tag.get_text().strip() + # , re.I)) + # print(p_tag) + # print(parent['id']) + # print(chap_section_regex.group('sec')) + # + # p_tag['id'] = f"{parent['id']}p{chap_section_regex.group('sec')}" + # else: + # parent = p_tag.find_previous_sibling(lambda tag: tag.name == 'h2') + # p_tag['id'] = f"{parent['id']}s{chap_section_regex.group('sec')}" + # elif amendment_num_reg := re.search(r'Amendment\s(?P<amend>\w+)]', + # p_tag.get_text().strip(), re.I): + # p_tag['class'] = 'amendment_head' + # parent = p_tag.find_previous_sibling(lambda tag: tag.name in 'h2' + # and 
re.search('^Amendments', + # tag.get_text().strip(), re.I)) + # p_tag['id'] = f"{parent['id']}am{amendment_num_reg.group('amend')}" + # elif key == 'h4': + # if self.headers_class_dict.get(p_tag.get_text()): + # p_tag['class'] = self.headers_class_dict.get(p_tag.get_text()) + # p_tag['id'] = re.sub(r'\s+|\'', '', f'{self.title.zfill(2)}-{p_tag.get_text()}') + # if re.search(r'^\d', p_tag.get_text()): + # chap_id = p_tag.find_previous_sibling(lambda tag: re.search('^[a-zA-Z]', tag.get_text()) + # and tag.name != 'h5' and re.search(r'h\d', + # tag.name)) + # elif not p_tag.has_attr('class') or p_tag['class'] not in self.headers_class_dict.values(): + # chap_id = p_tag.find_previous(lambda tag: tag.name in ['h2', 'h3'] or tag.has_attr('class') and + # tag['class'] in self.headers_class_dict.values()) + # else: + # chap_id = p_tag.findPrevious(lambda tag: tag.name != 'h4' and re.search(r'h\d', tag.name)) + # if chap_id and chap_id.has_attr('id'): + # id_text = re.sub(r'\s|"|\'', '', p_tag.get_text()) + # p_tag['id'] = f'{chap_id["id"]}-{id_text}' + # if p_tag.find_previous(lambda tag: p_tag['id'] == tag.get('id', '')): + # p_tag['id'] = f"{p_tag['id']}.1" + # if not re.search(r'\w+', p_tag.get_text()): + # p_tag.decompose() + # + # head4_data_list = ['Law reviews. —', 'Cross references. —', 'Editor’s notes. —', + # 'JUDICIAL DECISIONS', 'OPINIONS OF THE ATTORNEY GENERAL', 'RESEARCH REFERENCES'] + # cur_id_list = [] + # for tag in self.soup.find_all("h4"): + # if re.search(r'\. —$', tag.get_text()) or tag.get_text().isupper(): + # h4_text = re.sub(r'[\s’,“]+', '', tag.get_text()).lower() + # h4_id = f'{tag.find_previous({"h3", "h2", "h1"}).get("id")}-{h4_text}' + # if h4_id in cur_id_list: + # tag["id"] = f'{h4_id}.1' + # else: + # tag["id"] = f'{h4_id}' + # + # cur_id_list.append(tag["id"]) + # + # else: + # tag.name = "p" + # + # stylesheet_link_tag = self.soup.new_tag('link') + # stylesheet_link_tag.attrs = {'rel': 'stylesheet', 'type': 'text/css', + # 'href': 'https://unicourt.github.io/cic-code-ga/transforms/ga/stylesheet/ga_code_stylesheet.css'} + # self.soup.style.replace_with(stylesheet_link_tag) + # h1_tag = self.soup.find(lambda tag: re.search('^CONSTITUTION OF THE', tag.get_text())) + # h1_tag.name = 'h1' + # watermark_p = self.soup.new_tag('p', Class='transformation') + # watermark_p.string = self.watermark_text.format(self.release_number, self.release_date, + # datetime.now().date()) + # h1_tag.insert_before(watermark_p) + # title_tag = h1_tag + # chap_nav = self.soup.find('nav') + # chap_nav.insert(0, watermark_p) + # chap_nav.insert(1, title_tag) + # for tag in self.soup.findAll('span'): + # tag.unwrap() def replace_tag_names_constitution(self): """ @@ -1090,82 +1395,200 @@ def replace_tag_names_constitution(self): - based on tag name find or build id for that tag - create watermark tag and append it with h1 to first nav tag """ - tag_dict = {"h2": self.tag_type_dict['head2'], - "h3": self.tag_type_dict['head3'], - "h4": self.tag_type_dict['head4'], "li": self.tag_type_dict['ul']} + head4_data_list = ['Law reviews. —', 'Cross references. —', 'Editor’s notes. 
—', + 'JUDICIAL DECISIONS', 'OPINIONS OF THE ATTORNEY GENERAL', 'RESEARCH REFERENCES'] + cur_id_list = [] + h4count = 1 + + for p_tag in self.soup.body.findAll(): + if p_tag.get("class") == [self.tag_type_dict['head1']]: + p_tag.name = "h1" + p_tag.wrap(self.soup.new_tag("nav")) + p_tag['id'] = self.title + if p_tag.get("class") == [self.tag_type_dict['head3']]: + if re.search(r'Article (?P<aid>[IVX]+[A-Z]*)', p_tag.get_text().strip(), re.I): + article_id = re.search(r'Article (?P<aid>[IVX]+[A-Z]*)', p_tag.get_text().strip(), re.I).group( + "aid") + p_tag.name = "h2" + p_tag['id'] = f't{self.title}a{article_id}' + p_tag["class"] = "articleh2" + h4count = 1 + elif re.search(r'Section (?P<aid>[IVX]+[A-Z]*)', p_tag.get_text().strip(), re.I): + section_id = re.search(r'Section (?P<aid>[IVX]+[A-Z]*)', p_tag.get_text().strip(), re.I).group( + "aid") + p_tag.name = "h2" + p_tag['id'] = f'{p_tag.find_previous("h2", class_="articleh2").get("id")}s{section_id}' + p_tag["class"] = "sectionh2" + h4count = 1 + elif re.search(r'^AMENDMENTS TO THE CONSTITUTION', p_tag.get_text().strip(), re.I): + section_id = re.sub(r'[\s]+', '', p_tag.text.strip()).lower() + p_tag.name = "h2" + p_tag['id'] = f'{p_tag.find_previous("h2").get("id")}amd{section_id}' + h4count = 1 + + elif p_tag.get("class") == [self.tag_type_dict['head']]: + if re.search(r'Paragraph (?P<aid>[IVX]+[-A-Z]*)', p_tag.get_text().strip(), re.I): + para_id = re.search(r'Paragraph (?P<aid>[IVX]+[-A-Z]*)', p_tag.get_text().strip(), re.I).group( + "aid") + p_tag.name = "h3" + p_tag['id'] = f'{p_tag.find_previous("h2", class_="sectionh2").get("id")}p{para_id}' + p_tag["class"] = "parah2" + h4count = 1 + elif re.search(r'^Sec\. (\d+)*\.', p_tag.get_text().strip(), re.I): + para_id = re.search(r'^Sec\. (?P<aid>\d+)*\.', p_tag.get_text().strip(), re.I).group("aid") + p_tag.name = "h3" + p_tag['id'] = f'{p_tag.find_previous("h2").get("id")}s{para_id}' + h4count = 1 + elif re.search(r'^Amendment [IVX]+', p_tag.get_text().strip(), re.I): + para_id = re.search(r'Amendment (?P<aid>[IVX]+)', p_tag.get_text().strip(), re.I).group("aid") + p_tag.name = "h3" + p_tag['id'] = f'{p_tag.find_previous("h2").get("id")}amd{para_id}' + h4count = 1 + + elif p_tag.get("class") == [self.tag_type_dict['head4']]: + if p_tag.get_text() in head4_data_list: + p_tag.name = "h4" + h4_text = re.sub(r'\W+_', '', p_tag.get_text()).lower() + h4_id = f'{p_tag.find_previous({"h3", "h2", "h1"}).get("id")}-{h4_text}' + if h4_id in cur_id_list: + p_tag["id"] = f'{h4_id}.{h4count}' + h4count += 1 + else: + p_tag["id"] = f'{h4_id}' + h4count = 1 - for key, value in tag_dict.items(): - amendment_num = 0 - ul = self.soup.new_tag("ul", Class="leaders") - while True: - p_tag = self.soup.find('p', {"class": value}) - if not p_tag: - break - p_tag.name = key + cur_id_list.append(p_tag["id"]) + elif p_tag.get("class") == [self.tag_type_dict['head2']]: + p_tag.name = "li" - if value == self.tag_type_dict['ul']: - if p_tag.findPrevious().name != 'li': - p_tag.wrap(ul) - elif p_tag.findPrevious().name == 'li': - ul.append(p_tag) - if p_tag.findNext().has_attr('class') and \ - p_tag.findNext()['class'][0] != self.tag_type_dict['ul']: - new_nav = self.soup.new_tag('nav') - ul.wrap(new_nav) - ul = self.soup.new_tag("ul", Class="leaders") - if key == 'h2': - if chap_section_regex := re.search(r'^(ARTICLE|section)\s(?P<chap>\w+)\.', - p_tag.get_text().strip(), re.I): - if re.search('^section', p_tag.get_text().strip(), re.I): - p_tag.name = 'h3' - parent = p_tag.find_previous_sibling(lambda tag: tag.name 
== 'h2') - p_tag['id'] = f"{parent['id']}s{chap_section_regex.group('chap')}" - else: - p_tag['id'] = f"{self.title}-a{chap_section_regex.group('chap')}" - elif re.search('amendments', p_tag.get_text().strip(), re.I): - amendment_num += 1 - p_tag['id'] = f"{self.title}-amendment{str(amendment_num).zfill(2)}" - elif key == 'h3': - if chap_section_regex := re.search(r'(Paragraph|Section)\s(?P<sec>\w+(-\w+)?)\.', - p_tag.get_text().strip(), re.I): - if re.search('paragraph', p_tag.get_text().strip(), re.I): - p_tag['class'] = 'paragraph_head' - parent = p_tag.find_previous_sibling(lambda tag: tag.name in 'h3' - and not re.search('paragraph', - tag.get_text().strip() - , re.I)) - p_tag['id'] = f"{parent['id']}p{chap_section_regex.group('sec')}" - else: - parent = p_tag.find_previous_sibling(lambda tag: tag.name == 'h2') - p_tag['id'] = f"{parent['id']}s{chap_section_regex.group('sec')}" - elif amendment_num_reg := re.search(r'Amendment\s(?P<amend>\w+)]', - p_tag.get_text().strip(), re.I): - p_tag['class'] = 'amendment_head' - parent = p_tag.find_previous_sibling(lambda tag: tag.name in 'h2' - and re.search('^Amendments', - tag.get_text().strip(), re.I)) - p_tag['id'] = f"{parent['id']}am{amendment_num_reg.group('amend')}" - elif key == 'h4': - if self.headers_class_dict.get(p_tag.get_text()): - p_tag['class'] = self.headers_class_dict.get(p_tag.get_text()) - p_tag['id'] = re.sub(r'\s+|\'', '', f'{self.title.zfill(2)}-{p_tag.get_text()}') - if re.search(r'^\d', p_tag.get_text()): - chap_id = p_tag.find_previous_sibling(lambda tag: re.search('^[a-zA-Z]', tag.get_text()) - and tag.name != 'h5' and re.search(r'h\d', - tag.name)) - elif not p_tag.has_attr('class') or p_tag['class'] not in self.headers_class_dict.values(): - chap_id = p_tag.find_previous(lambda tag: tag.name in ['h2', 'h3'] or tag.has_attr('class') and - tag['class'] in self.headers_class_dict.values()) + if p_tag.find_previous().name == "li": + ul_tag.append(p_tag) + else: + ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + p_tag.wrap(ul_tag) + if re.search(r'Article [IVX]+', p_tag.get_text().strip(), re.I): + ul_tag.find_previous("nav").append(ul_tag) else: - chap_id = p_tag.findPrevious(lambda tag: tag.name != 'h4' and re.search(r'h\d', tag.name)) - if chap_id and chap_id.has_attr('id'): - id_text = re.sub(r'\s|"|\'', '', p_tag.get_text()) - p_tag['id'] = f'{chap_id["id"]}-{id_text}' - if p_tag.find_previous(lambda tag: p_tag['id'] == tag.get('id', '')): - p_tag['id'] = f"{p_tag['id']}.1" - if not re.search(r'\w+', p_tag.get_text()): - p_tag.decompose() - + nav_tag = self.soup.new_tag("nav") + ul_tag.wrap(nav_tag) + + # tag_dict = {"h2": self.tag_type_dict['head2'], + # "h3": self.tag_type_dict['head3'], + # "h4": self.tag_type_dict['head4'], "li": self.tag_type_dict['ul']} + # + # for key, value in tag_dict.items(): + # amendment_num = 0 + # ul = self.soup.new_tag("ul", Class="leaders") + # while True: + # p_tag = self.soup.find('p', {"class": value}) + # if not p_tag: + # break + # p_tag.name = key + # + # if value == self.tag_type_dict['ul']: + # if p_tag.findPrevious().name != 'li': + # p_tag.wrap(ul) + # elif p_tag.findPrevious().name == 'li': + # ul.append(p_tag) + # if p_tag.findNext().has_attr('class') and \ + # p_tag.findNext()['class'][0] != self.tag_type_dict['ul']: + # new_nav = self.soup.new_tag('nav') + # ul.wrap(new_nav) + # ul = self.soup.new_tag("ul", Class="leaders") + + # if key == 'h2': + # if chap_section_regex := re.search(r'^(Article|Section)\s(?P<chap>\w+)\.', + # p_tag.get_text().strip(), re.I): 
+ # if re.search('^Section', p_tag.get_text().strip(), re.I): + # p_tag.name = 'h3' + # parent = p_tag.find_previous_sibling(lambda tag: tag.name == 'h2') + # p_tag['id'] = f"{parent['id']}s{chap_section_regex.group('chap')}" + # else: + # p_tag['id'] = f"{self.title}-a{chap_section_regex.group('chap')}" + # elif re.search('amendments', p_tag.get_text().strip(), re.I): + # amendment_num += 1 + # p_tag['id'] = f"{self.title}-amendment{str(amendment_num).zfill(2)}" + + # elif key == 'h3': + # if re.search(r'Article (?P<aid>[IVX]+)',p_tag.get_text().strip(), re.I): + # article_id = re.search(r'Article (?P<aid>[IVX]+)',p_tag.get_text().strip(), re.I).group("aid") + # p_tag.name = "h2" + # p_tag['id'] = f't{self.title}a{article_id}' + # p_tag["class"] = "articleh2" + # elif re.search(r'Section (?P<aid>[IVX]+)',p_tag.get_text().strip(), re.I): + # section_id = re.search(r'Section (?P<aid>[IVX]+)',p_tag.get_text().strip(), re.I).group("aid") + # p_tag.name = "h2" + # p_tag['id'] = f'{p_tag.find_previous("h3",class_="articleh2")}a{section_id}' + # p_tag["class"] = "sectionh2" + # elif re.search(r'Paragraph (?P<aid>[IVX]+)',p_tag.get_text().strip(), re.I): + # para_id = re.search(r'Paragraph (?P<aid>[IVX]+)',p_tag.get_text().strip(), re.I).group("aid") + # p_tag.name = "h2" + # p_tag['id'] = f'{p_tag.find_previous("h3",class_="sectionh2")}a{para_id}' + # p_tag["class"] = "parah2" + + # if chap_section_regex := re.search(r'(Paragraph|Section)\s(?P<sec>\w+(-\w+)?)\.', + # p_tag.get_text().strip(), re.I): + # if re.search('paragraph', p_tag.get_text().strip(), re.I): + # p_tag['class'] = 'paragraph_head' + # + # parent = p_tag.find_previous_sibling(lambda tag: tag.name in 'h3' + # and not re.search('paragraph', + # tag.get_text().strip() + # , re.I)) + # print(p_tag) + # print(parent['id']) + # print(chap_section_regex.group('sec')) + # + # p_tag['id'] = f"{parent['id']}p{chap_section_regex.group('sec')}" + # else: + # parent = p_tag.find_previous_sibling(lambda tag: tag.name == 'h2') + # p_tag['id'] = f"{parent['id']}s{chap_section_regex.group('sec')}" + # elif amendment_num_reg := re.search(r'Amendment\s(?P<amend>\w+)]', + # p_tag.get_text().strip(), re.I): + # p_tag['class'] = 'amendment_head' + # parent = p_tag.find_previous_sibling(lambda tag: tag.name in 'h2' + # and re.search('^Amendments', + # tag.get_text().strip(), re.I)) + # p_tag['id'] = f"{parent['id']}am{amendment_num_reg.group('amend')}" + + # elif key == 'h4': + # if self.headers_class_dict.get(p_tag.get_text()): + # p_tag['class'] = self.headers_class_dict.get(p_tag.get_text()) + # p_tag['id'] = re.sub(r'\s+|\'', '', f'{self.title.zfill(2)}-{p_tag.get_text()}') + # if re.search(r'^\d', p_tag.get_text()): + # chap_id = p_tag.find_previous_sibling(lambda tag: re.search('^[a-zA-Z]', tag.get_text()) + # and tag.name != 'h5' and re.search(r'h\d', + # tag.name)) + # elif not p_tag.has_attr('class') or p_tag['class'] not in self.headers_class_dict.values(): + # chap_id = p_tag.find_previous(lambda tag: tag.name in ['h2', 'h3'] or tag.has_attr('class') and + # tag['class'] in self.headers_class_dict.values()) + # else: + # chap_id = p_tag.findPrevious(lambda tag: tag.name != 'h4' and re.search(r'h\d', tag.name)) + # if chap_id and chap_id.has_attr('id'): + # id_text = re.sub(r'\s|"|\'', '', p_tag.get_text()) + # p_tag['id'] = f'{chap_id["id"]}-{id_text}' + # if p_tag.find_previous(lambda tag: p_tag['id'] == tag.get('id', '')): + # p_tag['id'] = f"{p_tag['id']}.1" + # if not re.search(r'\w+', p_tag.get_text()): + # p_tag.decompose() + # + # 
head4_data_list = ['Law reviews. —', 'Cross references. —', 'Editor’s notes. —', + # 'JUDICIAL DECISIONS', 'OPINIONS OF THE ATTORNEY GENERAL', 'RESEARCH REFERENCES'] + # cur_id_list = [] + # for tag in self.soup.find_all("h4"): + # if re.search(r'\. —$', tag.get_text()) or tag.get_text().isupper(): + # h4_text = re.sub(r'[\s’,“]+', '', tag.get_text()).lower() + # h4_id = f'{tag.find_previous({"h3", "h2", "h1"}).get("id")}-{h4_text}' + # if h4_id in cur_id_list: + # tag["id"] = f'{h4_id}.1' + # else: + # tag["id"] = f'{h4_id}' + # + # cur_id_list.append(tag["id"]) + # + # else: + # tag.name = "p" + # stylesheet_link_tag = self.soup.new_tag('link') stylesheet_link_tag.attrs = {'rel': 'stylesheet', 'type': 'text/css', 'href': 'https://unicourt.github.io/cic-code-ga/transforms/ga/stylesheet/ga_code_stylesheet.css'} @@ -1184,76 +1607,113 @@ def replace_tag_names_constitution(self): tag.unwrap() def add_anchor_constitution(self): - for nav in self.soup.findAll('nav'): - new_p = self.soup.new_tag('p') - if not re.search('^analysis', nav.get_text(), re.I): - new_p.string = nav.find('li').get_text() - if nav.h1: - nav.h1.insert_after(new_p) - else: - nav.insert(0, new_p) - nav.find('li').decompose() - if re.search('article', new_p.get_text(), re.I): - amendment_num = 0 - for li in nav.ul.findAll('li'): - if roman_match := re.search(r'^(\S+)\.', li.get_text()): - article_num = roman_match.group(1) - header_id = f'{self.title}-a{article_num}' - anchor = self.soup.new_tag('a', href=f'#{header_id}') - anchor.string = li.string - anchor.attrs['aria-describedby'] = header_id - li.string.replace_with(anchor) - elif re.search('AMENDMENT|APPENDIX', li.get_text(), re.I): - amendment_num += 1 - header_id = f'{self.title}-amendment{str(amendment_num).zfill(2)}' - anchor = self.soup.new_tag('a', href=f'#{header_id}') - anchor.string = li.string - anchor.attrs['aria-describedby'] = header_id - li.string.replace_with(anchor) - elif re.search(r'section|sec\.', new_p.get_text(), re.I): - for li in nav.ul.findAll('li'): - if roman_match := re.search(r'^(\S+)\.', li.get_text()): - section_num = roman_match.group(1) - parent = nav.find_previous_sibling(lambda tag: tag.name == 'h2') - header_id = f'{parent["id"]}s{section_num}' - anchor = self.soup.new_tag('a', href=f'#{header_id}') - anchor.string = li.string - anchor.attrs['aria-describedby'] = header_id - li.string.replace_with(anchor) - elif re.search('paragraph', new_p.get_text(), re.I): - for li in nav.ul.findAll('li'): - if roman_match := re.search(r'^(\S+)\.', li.get_text()): - paragraph_num = roman_match.group(1) - parent = nav.find_previous_sibling(lambda tag: tag.name in ['h2', 'h3'] and - re.search('^article|^section', tag.get_text(), - re.I)) - header_id = f'{parent["id"]}p{paragraph_num}' - anchor = self.soup.new_tag('a', href=f'#{header_id}') - anchor.string = li.string - anchor.attrs['aria-describedby'] = header_id - li.string.replace_with(anchor) - elif re.search(r'amend\.', new_p.get_text(), re.I): - for li in nav.ul.findAll('li'): - if roman_match := re.search(r'^(\S+)\.', li.get_text()): - paragraph_num = roman_match.group(1) - parent = nav.find_previous_sibling(lambda tag: tag.name == 'h2' and - re.search('^amendments', tag.get_text(), re.I)) - header_id = f'{parent["id"]}am{paragraph_num}' - anchor = self.soup.new_tag('a', href=f'#{header_id}') - anchor.string = li.string - anchor.attrs['aria-describedby'] = header_id - li.string.replace_with(anchor) - elif re.search('^analysis', nav.get_text(), re.I): - for li in nav.ul.findAll('li'): - parent = 
nav.find_previous_sibling(lambda tag: tag.name in ['h3', 'h2'] or - tag.has_attr('class') and - tag['class'] in self.headers_class_dict.values()) - id_text = re.sub(r"\s+|\'", '', li.get_text()) - header_id = f'{parent["id"]}-{id_text}' - anchor = self.soup.new_tag('a', href=f'#{header_id}') - anchor.string = li.string - anchor.attrs['aria-describedby'] = header_id - li.string.replace_with(anchor) + # for nav in self.soup.findAll('nav'): + # new_p = self.soup.new_tag('p') + # if not re.search('^analysis', nav.get_text(), re.I): + # new_p.string = nav.find('li').get_text() + # if nav.h1: + # nav.h1.insert_after(new_p) + # else: + # nav.insert(0, new_p) + # nav.find('li').decompose() + # if re.search('article', new_p.get_text(), re.I): + # amendment_num = 0 + # for li in nav.ul.findAll('li'): + # if roman_match := re.search(r'^(\S+)\.', li.get_text()): + # article_num = roman_match.group(1) + # header_id = f'{self.title}-a{article_num}' + # anchor = self.soup.new_tag('a', href=f'#{header_id}') + # anchor.string = li.string + # anchor.attrs['aria-describedby'] = header_id + # li.string.replace_with(anchor) + # elif re.search('AMENDMENT|APPENDIX', li.get_text(), re.I): + # amendment_num += 1 + # header_id = f'{self.title}-amendment{str(amendment_num).zfill(2)}' + # anchor = self.soup.new_tag('a', href=f'#{header_id}') + # anchor.string = li.string + # anchor.attrs['aria-describedby'] = header_id + # li.string.replace_with(anchor) + # elif re.search(r'section|sec\.', new_p.get_text(), re.I): + # for li in nav.ul.findAll('li'): + # if roman_match := re.search(r'^(\S+)\.', li.get_text()): + # section_num = roman_match.group(1) + # parent = nav.find_previous_sibling(lambda tag: tag.name == 'h2') + # header_id = f'{parent["id"]}s{section_num}' + # anchor = self.soup.new_tag('a', href=f'#{header_id}') + # anchor.string = li.string + # anchor.attrs['aria-describedby'] = header_id + # li.string.replace_with(anchor) + # elif re.search('paragraph', new_p.get_text(), re.I): + # for li in nav.ul.findAll('li'): + # if roman_match := re.search(r'^(\S+)\.', li.get_text()): + # paragraph_num = roman_match.group(1) + # parent = nav.find_previous_sibling(lambda tag: tag.name in ['h2', 'h3'] and + # re.search('^article|^section', tag.get_text(), + # re.I)) + # header_id = f'{parent["id"]}p{paragraph_num}' + # anchor = self.soup.new_tag('a', href=f'#{header_id}') + # anchor.string = li.string + # anchor.attrs['aria-describedby'] = header_id + # li.string.replace_with(anchor) + # elif re.search(r'amend\.', new_p.get_text(), re.I): + # for li in nav.ul.findAll('li'): + # if roman_match := re.search(r'^(\S+)\.', li.get_text()): + # paragraph_num = roman_match.group(1) + # parent = nav.find_previous_sibling(lambda tag: tag.name == 'h2' and + # re.search('^amendments', tag.get_text(), re.I)) + # header_id = f'{parent["id"]}am{paragraph_num}' + # anchor = self.soup.new_tag('a', href=f'#{header_id}') + # anchor.string = li.string + # anchor.attrs['aria-describedby'] = header_id + # li.string.replace_with(anchor) + # elif re.search('^analysis', nav.get_text(), re.I): + # for li in nav.ul.findAll('li'): + # parent = nav.find_previous_sibling(lambda tag: tag.name in ['h3', 'h2'] or + # tag.has_attr('class') and + # tag['class'] in self.headers_class_dict.values()) + # id_text = re.sub(r"\s+|\'", '', li.get_text()) + # header_id = f'{parent["id"]}-{id_text}' + # anchor = self.soup.new_tag('a', href=f'#{header_id}') + # anchor.string = li.string + # anchor.attrs['aria-describedby'] = header_id + # 
li.string.replace_with(anchor) + # elif re.search('Article [IVX]+', new_p.get_text(), re.I): + # article_id = re.search('Article (?P<aid>[IVX]+)', new_p.get_text(), re.I).group("aid") + # header_id = f'{parent["id"]}-{id_text}' + # anchor = self.soup.new_tag('a', href=f'#{header_id}') + # anchor.string = li.string + # anchor.attrs['aria-describedby'] = header_id + # li.string.replace_with(anchor) + + for li_tag in self.soup.findAll("li"): + if re.search(r'^Article [IVX]+', li_tag.text.strip()): + a_id = re.search(r'^Article (?P<aid>[IVX]+[A-Z]*)', li_tag.text.strip()).group('aid') + li_tag_id = f'#t{self.title}a{a_id}' + anchor = self.soup.new_tag('a', href=li_tag_id) + anchor.string = li_tag.text + li_tag.string = '' + li_tag.append(anchor) + elif re.search(r'^Section [IVX]+', li_tag.text.strip()): + s_id = re.search(r'^Section (?P<aid>[IVX]+[A-Z]*)', li_tag.text.strip()).group('aid') + li_tag_id = f'#{li_tag.find_previous("h2", class_="articleh2").get("id")}s{s_id}' + anchor = self.soup.new_tag('a', href=li_tag_id) + anchor.string = li_tag.text + li_tag.string = '' + li_tag.append(anchor) + elif re.search(r'^Paragraph [IVX]+', li_tag.text.strip()): + p_id = re.search(r'^Paragraph (?P<aid>[IVX]+[-A-Z]*)', li_tag.text.strip()).group('aid') + li_tag_id = f'#{li_tag.find_previous("h2", class_="sectionh2").get("id")}p{p_id}' + anchor = self.soup.new_tag('a', href=li_tag_id) + anchor.string = li_tag.text + li_tag.string = '' + li_tag.append(anchor) + elif re.search(r'^Amendment [IVX]+', li_tag.text.strip()): + p_id = re.search(r'^Amendment (?P<aid>[IVX]+[-A-Z]*)', li_tag.text.strip()).group('aid') + li_tag_id = f'#{li_tag.find_previous("h2").get("id")}amd{p_id}' + anchor = self.soup.new_tag('a', href=li_tag_id) + anchor.string = li_tag.text + li_tag.string = '' + li_tag.append(anchor) def start_parse(self): """ @@ -1267,9 +1727,10 @@ def start_parse(self): print(start_time) self.create_page_soup() if re.search('constitution', self.html_file_name): - self.tag_type_dict = {'head1': r'^CONSTITUTION OF THE ', 'head2': r'^ARTICLE I', 'ul': r'^PREAMBLE', + self.tag_type_dict = {'head1': r'^CONSTITUTION OF THE ', 'head2': r'^Article I', 'ul': r'^PREAMBLE', 'head4': '^JUDICIAL DECISIONS', 'ol_p': r'^\(\d\)', 'junk1': '^Annotations$', - 'head3': r'^SECTION 1\.|^Paragraph I\.', 'normalp': '^Editor\'s note'} + 'head3': r'^Section I|^Paragraph I\.|^Article I', 'normalp': '^Editor\'s note', + 'head': 'Paragraph I\. |^Sec\. 
1\.'} self.get_class_name() self.remove_junk() self.replace_tag_names_constitution() @@ -1278,18 +1739,17 @@ def start_parse(self): self.add_anchor_constitution() self.wrap_div_tags() else: - self.get_class_name() self.remove_junk() self.replace_tags() self.convert_paragraph_to_alphabetical_ol_tags() - # self.convert_to_numeric_ol_tags() - self.create_analysis_nav_tag() self.remove_or_replace_class_names() self.wrap_div_tags() self.add_anchor_tags() - # except Exception: - # pass + # + # # except Exception: + # # pass + # self.clean_html_and_add_cite() - self.write_soup_to_file() + # self.write_soup_to_file() print(datetime.now() - start_time) diff --git a/html_parser/html_parse_runner.py b/html_parser/html_parse_runner.py index 159a99d..ee46fd9 100644 --- a/html_parser/html_parse_runner.py +++ b/html_parser/html_parse_runner.py @@ -5,6 +5,7 @@ import importlib import argparse import os +from datetime import datetime class HtmlParseRunner: @@ -22,6 +23,9 @@ def start_parser(state_key): - set environment variables using parsed command line args - Call start parse method with state_key as arg """ + start_time = datetime.now() + print(start_time) + parser = argparse.ArgumentParser() parser.add_argument("--state_key", help="State of which parser should be run", required=True, type=str) parser.add_argument("--input_file_name", help="file which needs to be parsed", type=str) @@ -32,3 +36,5 @@ def start_parser(state_key): os.environ.setdefault('release_number', args.release_number) os.environ.setdefault('release_date', args.release_date) HtmlParseRunner.start_parser(args.state_key) + + print("finished at",datetime.now() - start_time) \ No newline at end of file diff --git a/html_parser/id_html_parser.py b/html_parser/id_html_parser.py index baaccde..2ae57dd 100644 --- a/html_parser/id_html_parser.py +++ b/html_parser/id_html_parser.py @@ -10,7 +10,7 @@ from parser_base import ParserBase -class idParseHtml(ParserBase): +class IDParseHtml(ParserBase): def __init__(self, input_file_name): super().__init__() self.html_file_name = input_file_name @@ -145,7 +145,6 @@ def replace_tags(self): sec_count = 1 self.snav_count = 1 - elif chap_head := re.search(r'(Chapter(s?)|CHAPTER(s?))\s(?P<c_title>\d+[a-zA-Z]?)',header_tag.get_text()): header_tag.name = "h2" header_tag["id"] = f't{self.title}c{chap_head.group("c_title").zfill(2)}' @@ -181,7 +180,6 @@ def replace_tags(self): header_tag["class"] = "articleh3" case_note_count = 1 - elif header_tag.get_text().isupper(): header_tag.name = "h4" header_tag_text = re.sub(r'[\s]*', '', header_tag.get_text()) @@ -222,7 +220,6 @@ def replace_tags(self): sub_sec_count = 1 self.case_note_head.append(header_tag.get_text().lower()) - if header_tag.get("class") == [self.tag_type_dict['ul']]: if re.search(r'^\d+-\d+[a-zA-Z]?[a-zA-Z]?(-\d+)?\.?\s[“[a-zA-Z]+|^\d+-\d+[a-zA-Z]?[a-zA-Z]?(-\d+)?\s?[—,]\s?\d+-\d+[a-zA-Z]?(-\d+)?\.?\s[“[a-zA-Z]',header_tag.get_text()) or (re.search(r'^\d+\.|Chapter \d+[a-zA-Z]?[.—,-]',header_tag.get_text()) and not header_tag.find_previous("h3")) : header_tag.name = "li" @@ -1040,7 +1037,7 @@ def write_soup_to_file(self): soup_str = str(self.soup.prettify(formatter=None)) - with open(f"/home/mis/cic-code-id/transforms/id/ocid/r{self.release_number}/{self.html_file_name}", "w") as file: + with open(f"../../cic-code-id-1/transforms/id/ocid/r{self.release_number}/{self.html_file_name}", "w") as file: file.write(soup_str) diff --git a/html_parser/ky_html_parser.py b/html_parser/ky_html_parser.py index 960658a..ccdb235 100644 --- 
a/html_parser/ky_html_parser.py +++ b/html_parser/ky_html_parser.py @@ -1,4 +1,3 @@ - """ - this file accepts the text util generated html and parse it - here the html is converted in such a way that it matches the html5 standards @@ -6,27 +5,27 @@ - this method based on the file type(constitution files or title files) decides which methods to run """ - from bs4 import BeautifulSoup, Doctype import re from datetime import datetime from parser_base import ParserBase - class KYParseHtml(ParserBase): def __init__(self, input_file_name): super().__init__() self.class_regex = {'ul': '^CHAPTER', 'head2': '^CHAPTER', 'title': '^(TITLE)|^(CONSTITUTION OF KENTUCKY)', 'sec_head': r'^([^\s]+[^\D]+)', - 'junk': '^(Text)', 'ol': r'^(\(1\))', 'head4': '^(NOTES TO DECISIONS)','nd_nav':'^1\.'} + 'junk': '^(Text)', 'ol': r'^(\(1\))', 'head4': '^(NOTES TO DECISIONS)', 'nd_nav': '^1\.'} + self.title_id = None self.soup = None self.junk_tag_class = ['Apple-converted-space', 'Apple-tab-span'] self.html_file_name = input_file_name self.nd_list = [] + self.meta_tags = [] - self.watermark_text = """Release {0} of the Official Code of Kentucky Annotated released {1}. + self.watermark_text = """Release {0} of the Official Code of Kentucky Annotated released {1} Transformed and posted by Public.Resource.Org using cic-beautify-state-codes.py version 1.4 on {2}. This document is not subject to copyright and is in the public domain. """ @@ -50,8 +49,6 @@ def create_page_soup(self): self.soup.html.attrs['lang'] = 'en' print('created soup') - - def generate_class_name(self): """ @@ -61,15 +58,13 @@ def generate_class_name(self): for key, value in self.class_regex.items(): tag_class = self.soup.find( lambda tag: tag.name == 'p' and re.search(self.class_regex.get(key), tag.get_text().strip()) and - tag.attrs["class"][0] not in self.class_regex.values() ) + tag.attrs["class"][0] not in self.class_regex.values()) if tag_class: self.class_regex[key] = tag_class.get('class')[0] print(self.class_regex) print('updated class dict') - - def remove_junk(self): """ - Delete the junk tags (empty tags,span tags and unwanted meta tags) @@ -83,13 +78,13 @@ def remove_junk(self): elif junk_tag.get("class") == ['Apple-tab-span']: junk_tag.decompose() # elif junk_tag.name == "br": - # if junk_tag.parent.name == "p": - # junk_tag.parent.name = "span" - # junk_tag.parent["class"] = "gnrlbreak" - # junk_tag.decompose() - # else: - # junk_tag.name = "span" - # junk_tag["class"] = "headbreak" + # if junk_tag.parent.name == "p": + # junk_tag.parent.name = "span" + # junk_tag.parent["class"] = "gnrlbreak" + # junk_tag.decompose() + # else: + # junk_tag.name = "span" + # junk_tag["class"] = "headbreak" [text_junk.decompose() for text_junk in self.soup.find_all("p", class_=self.class_regex["junk"])] @@ -111,7 +106,6 @@ def remove_junk(self): print('junk removed') - def create_ul_tag(self): """ - wrap the list items with unordered tag @@ -146,7 +140,6 @@ def create_ul_tag(self): print("ul tag is created") - def create_main_tag(self): """ - wrap all contents inside main tag(Except chapter index) @@ -249,7 +242,6 @@ def create_and_wrap_with_div_tag(self): print('wrapped div tags') - def convert_roman_to_digit(self, roman): value = {'M': 1000, 'D': 500, 'C': 100, 'L': 50, 'X': 10, 'V': 5, 'I': 1} prev = 0 @@ -264,11 +256,10 @@ def convert_roman_to_digit(self, roman): return ans - def add_watermark_and_remove_class_name(self): for tag in self.soup.find_all(): - if tag.name in ['li','h2', 'h4', 'h3','h5']: + if tag.name in ['li', 'h2', 'h4', 'h3', 'h5']: 
del tag["class"] if tag.name == 'p': if len(tag.get_text(strip=True)) == 0: @@ -276,7 +267,7 @@ def add_watermark_and_remove_class_name(self): else: del tag["class"] - watermark_tag = self.soup.new_tag('p', Class='transformation') + watermark_tag = self.soup.new_tag('p', **{"class": "transformation"}) watermark_tag.string = self.watermark_text.format(self.release_number, self.release_date, datetime.now().date()) @@ -284,24 +275,32 @@ def add_watermark_and_remove_class_name(self): if title_tag: title_tag.insert(0, watermark_tag) - for meta in self.soup.findAll('meta'): - if meta.get('http-equiv') == "Content-Style-Type": - meta.decompose() - - # for all_tag in self.soup.findAll(): - # if all_tag.get("class"): - # all_tag_class = str(all_tag.get("class")) - # # print(all_tag_class) - # if re.match(r'^\[\'p\d\'\]',all_tag_class.strip()): - # del all_tag["class"] - + # for meta in self.soup.findAll('meta'): + # if meta.get('http-equiv') == "Content-Style-Type": + # meta.decompose() + # for meta in self.soup.findAll('meta'): + # if meta.get('http-equiv') == "Content-Style-Type": + # meta.decompose() + + for tag in self.soup.findAll(): + if len(tag.contents) == 0: + if tag.name == 'meta': + if tag.attrs.get('http-equiv') == 'Content-Style-Type': + tag.decompose() + continue + self.meta_tags.append(tag) + elif tag.name == 'br': + if not tag.parent or tag in tag.parent.contents: + tag.decompose() + continue + if len(tag.get_text(strip=True)) == 0: + tag.extract() for all_li in self.soup.find_all("li"): if re.search(r'^<li\s*class="p\d"', all_li.text.strip()): all_li.unwrap() - def add_citation(self): title_dict = {"I": ['1', '2', '3'], "II": ['5', '6', '6A', '7', '7A', '7B', '8'], "III": ['11', '11A', '12', '13', '13A', '13B', '14', '14A', '15', '15A', '16', '17', '18', '18A', @@ -391,43 +390,30 @@ def add_citation(self): cite_li_tags = [] titleid = "" - - cite_p_tags = [] for tag in self.soup.findAll(lambda tag: re.search(r"KRS\s?\d+[a-zA-Z]*\.\d+(\(\d+\))*(-\d+)*|" - r"(KRS Chapter \d+[a-zA-Z]*)|" - r"(KRS Title \D+, Chapter \D+?,)|" - r"KRS\s*\d+[a-zA-Z]*\.\d+\(\d+\)|" - r"(KRS\s*\d+[a-zA-Z]*\.\d+\(\d+\)|" - r"(U.S.C.\s*secs*\.\s*\d+)|" - r"(Ky.\s?(App\.)?\s?LEXIS\s?\d+)|" - r"(Ky.\s*(L. Rptr.\s*)*\d+)|" - r"(OAG \d+-\d+))",tag.get_text()) and tag.name == 'p'and tag not in cite_p_tags): + r"(KRS Chapter \d+[a-zA-Z]*)|" + r"(KRS Title \D+, Chapter \D+?,)|" + r"KRS\s*\d+[a-zA-Z]*\.\d+\(\d+\)|" + r"(KRS\s*\d+[a-zA-Z]*\.\d+\(\d+\)|" + r"(U.S.C.\s*secs*\.\s*\d+)|" + r"(Ky.\s?(App\.)?\s?LEXIS\s?\d+)|" + r"(Ky.\s*(L. Rptr.\s*)*\d+)|" + r"(OAG \d+-\d+))", + tag.get_text()) and tag.name == 'p' and tag not in cite_p_tags): cite_p_tags.append(tag) text = str(tag) - # for match in set( - # x[0] for x in re.findall(r'((Ky.\s*(L. Rptr.\s*)*\d+)|' - # r'(Ky.\s?(App\.)?\s?LEXIS\s?\d+)|' - # r'(U.S.C.\s*secs*\.\s*\d+(\([a-zA-Z]\))*(\(\d+\))*)|' - # r'(KRS\s?\d+[a-zA-Z]*\.\d+(\(\d+\))*(\(\D\))*)(-\d+)*|' - # r'(Chapter \d+[a-zA-Z]*)|' - # r'(Title\s+?\D+,\s+?Chapter\s+?\D+?,)|' - # r'(\d+?\w?\.\d+\s+?\(\d\)+?)|' - # r'(\d+\.\d{3}[^\d])|' - # r'(\d+\.\d{3}\(\d+\))|' - # r'(KRS\s*\d+[a-zA-Z]*\.\d+\(\d+\))|' - # r'(OAG \d+-\d+))', tag.get_text())): for match in set( - x[0] for x in re.findall(r'((Ky.\s*(L. 
Rptr.\s*)*\d+)|' - r'(Ky.\s?(App\.)?\s?LEXIS\s?\d+)|' - r'(U.S.C.\s*secs*\.\s*\d+(\([a-zA-Z]\))*(\(\d+\))*)|' - r'(KRS*\s?\d+[a-zA-Z]*\.\d+(\(\d+\))*(\(\D\))*)(-\d+)*|' - r'(Chapter \d+[a-zA-Z]*)|' - r'(Title\s+?\D+,\s+?Chapter\s+?\D+?,)|' - r'(KRS*\s*\d+[a-zA-Z]*\.\d+\(\d+\))|' - r'(OAG \d+-\d+))', tag.get_text())): + x[0] for x in re.findall(r'((Ky.\s*(L. Rptr.\s*)*\d+)|' + r'(Ky.\s?(App\.)?\s?LEXIS\s?\d+)|' + r'(U.S.C.\s*secs*\.\s*\d+(\([a-zA-Z]\))*(\(\d+\))*)|' + r'(KRS*\s?\d+[a-zA-Z]*\.\d+(\(\d+\))*(\(\D\))*)(-\d+)*|' + r'(Chapter \d+[a-zA-Z]*)|' + r'(Title\s+?\D+,\s+?Chapter\s+?\D+?,)|' + r'(KRS*\s*\d+[a-zA-Z]*\.\d+\(\d+\))|' + r'(OAG \d+-\d+))', tag.get_text())): inside_text = re.sub(r'<p\sclass="\w\d+">|</p>|<b>|</b>|<p>', '', text, re.DOTALL) tag.clear() @@ -583,7 +569,6 @@ def add_citation(self): print("citation created") - def set_appropriate_tag_name_and_id(self, tag_name, header_tag, chap_nums, prev_id, sub_tag, class_name): if re.search('constitution', self.html_file_name): header_tag.name = tag_name @@ -661,11 +646,9 @@ def replace_tags(self): elif header_tag.text.strip().isupper(): header_tag.name = "h2" - chap_num = re.sub(r'[\s]+','', header_tag.text.strip()).lower() + chap_num = re.sub(r'[\s]+', '', header_tag.text.strip()).lower() header_tag["id"] = f"{self.title_id}c{chap_num}" - - elif header_tag.get("class") == [self.class_regex["sec_head"]]: if re.search(r'^Section', header_tag.text.strip()): header_tag.name = "h3" @@ -680,7 +663,6 @@ def replace_tags(self): 2) header_tag["id"] = f'{prev_id}s{cur_id}' - elif header_tag.get("class") == [self.class_regex["ul"]]: if re.search(r'^§(§)*|^(ARTICLE)|^(Section)|^(AMENDMENT)', header_tag.text.strip()): header_tag.name = "li" @@ -741,10 +723,10 @@ def replace_tags(self): header_tag["id"] = innr_subsec_header_tag_id else: - if re.search(r'^NOTES TO DECISIONS|^Analysis|^Cited:',header_tag.get_text().strip()): + if re.search(r'^NOTES TO DECISIONS|^Analysis|^Cited:', header_tag.get_text().strip()): header_tag.name = "h4" - prev_head_id = header_tag.find_previous(['h3','h2','h1']).get("id") + prev_head_id = header_tag.find_previous(['h3', 'h2', 'h1']).get("id") current_id = re.sub(r'[\s\W]', '', header_tag.text.strip()) curr_tag_id = f'{prev_head_id}-{current_id}' @@ -809,7 +791,7 @@ def replace_tags(self): class_name) if re.search("^Subchapter", header_tag.text.strip()): - chap_nums = re.sub(r'[\s]+','',header_tag.get_text().strip()).lower() + chap_nums = re.sub(r'[\s]+', '', header_tag.get_text().strip()).lower() prev_id = header_tag.find_previous("h2", class_="chapterh2").get("id") header_tag["id"] = f"{prev_id}{chap_nums}" @@ -829,15 +811,14 @@ def replace_tags(self): else: header_tag.name = "h2" - prev_id = header_tag.find_previous('h2',class_='chapterh2').get("id") - header_id = re.sub(r'[\s\.\[\]]', '', header_tag.get_text()).lower() + prev_id = header_tag.find_previous('h2', class_='chapterh2').get("id") + header_id = re.sub(r'[\s.\[\]’&,]', '', header_tag.get_text()).lower() header_tag["id"] = f"{prev_id}{header_id}" inc_count = 1 elif header_tag.get("class") == [self.class_regex["sec_head"]]: header_tag.name = "h3" if re.match(r'^\d+\.\d+\D?-\d+', header_tag.text.strip()): - header_pattern = re.search(r'^(?P<sec>(?P<chap>\d+)\.\d+\D?-\d+)', header_tag.text.strip()) chap_num = header_pattern.group("chap").zfill(2) sec_num = header_pattern.group("sec").zfill(2) @@ -849,7 +830,7 @@ def replace_tags(self): else: header_tag["id"] = f"t{self.title_id}c{chap_num}s{sec_num}" - + inc_count = 1 
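# Illustrative sketch of the inc_count bookkeeping used throughout
# replace_tags() above: the first heading that produces a given id keeps it
# bare, and each later duplicate gets a ".1", ".2", ... suffix.
# make_unique_id is a hypothetical stand-alone helper, not a KYParseHtml method.
def make_unique_id(base_id, seen_ids, counters):
    if base_id in seen_ids:
        counters[base_id] = counters.get(base_id, 0) + 1
        unique_id = f"{base_id}.{counters[base_id]}"
    else:
        unique_id = base_id
    seen_ids.add(base_id)
    return unique_id

# seen, counts = set(), {}
# [make_unique_id("t01c02s02.010", seen, counts) for _ in range(3)]
# -> ['t01c02s02.010', 't01c02s02.010.1', 't01c02s02.010.2']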
head_tag_id_list.append(header_tag_id) elif re.match(r'^\d+\.\d+\.?(-\d+\.)?', header_tag.text.strip()): @@ -860,7 +841,7 @@ def replace_tags(self): if header_tag_id in head_tag_id_list: header_tag["id"] = f"t{self.title_id}c{chap_num}s{sec_num}.{inc_count}" - inc_count +=1 + inc_count += 1 else: header_tag["id"] = f"t{self.title_id}c{chap_num}s{sec_num}" @@ -877,7 +858,7 @@ def replace_tags(self): if header_tag_id in head_tag_id_list: header_tag["id"] = f"t{self.title_id}c{chap_num}s{sec_num}.{inc_count}" - inc_count +=1 + inc_count += 1 else: header_tag["id"] = f"t{self.title_id}c{chap_num}s{sec_num}" @@ -888,7 +869,7 @@ def replace_tags(self): chap_num = re.search(r'^([^.]+)', header_tag.text.strip()).group().zfill(2) sub_num = re.search(r'(\d+[a-zA-Z]*\.(?P<sub>\d+)-\d+\.)', header_tag.text.strip()).group( "sub").zfill(2) - sec_num = re.sub(r'[\s\.\[\]]', '', header_tag.text.strip()) + sec_num = re.sub(r'[\s\.\[\],’&]', '', header_tag.text.strip()) header_tag["id"] = f"t{self.title_id}c{chap_num}sub{sub_num}s{sec_num}" else: header_pattern = re.search(r'^(?P<sec>(?P<chap>\d+\D)\.\d+)', header_tag.text.strip()) @@ -899,10 +880,10 @@ def replace_tags(self): if header_tag_id in head_tag_id_list: header_tag["id"] = f"t{self.title_id}c{chap_num}s{sec_num}.{inc_count}" - inc_count +=1 + inc_count += 1 else: header_tag["id"] = f"t{self.title_id}c{chap_num}s{sec_num}" - + inc_count = 1 head_tag_id_list.append(header_tag_id) @@ -920,7 +901,7 @@ def replace_tags(self): inc_count += 1 else: header_tag["id"] = f"t{self.title_id}c{chap_num}s{sec_num}" - + inc_count = 1 head_tag_id_list.append(header_tag_id) @@ -937,7 +918,7 @@ def replace_tags(self): inc_count += 1 else: header_tag["id"] = f"t{self.title_id}c{chap_num}s{sec_num}" - + inc_count = 1 head_tag_id_list.append(header_tag_id) elif re.match(r'^\d+\.___\.', header_tag.text.strip()): @@ -951,7 +932,7 @@ def replace_tags(self): inc_count += 1 else: header_tag["id"] = f"t{self.title_id}c{chap_num}s{chap_num}" - + inc_count = 1 head_tag_id_list.append(header_tag_id) header_tag["class"] = "chapterh2" @@ -977,7 +958,8 @@ def replace_tags(self): anav = 0 anav = anav + 1 - header_tag['id'] = f"{header_tag.find_previous('h2',class_='chapterh2').get('id')}a{art_nums.zfill(2)}-anav{anav:02}" + header_tag[ + 'id'] = f"{header_tag.find_previous('h2', class_='chapterh2').get('id')}a{art_nums.zfill(2)}-anav{anav:02}" elif re.search("^(Part)", header_tag.text): chap_nums = header_tag.find_previous("h2").get("id") @@ -1026,14 +1008,15 @@ def replace_tags(self): sec_id = re.search("(snav)(?P<id>\d+)", previous_tag_id.strip()).group("id").zfill(2) sec_id = int(sec_id) + 1 - section_id = re.sub(r'\s+', '', header_tag.get_text()).lower() + section_id = re.sub(r'[\s’&.,]+', '', header_tag.get_text()).lower() header_tag["id"] = f"{prev_chapter_id}s{section_id}-snav{sec_id:02}" else: chap_nums = re.search(r'^(CHAPTER|Chapter)\s(?P<chapter_id>\d+)', - header_tag.find_previous("h2",class_='chapterh2').text.strip()).group( + header_tag.find_previous("h2", + class_='chapterh2').text.strip()).group( 'chapter_id').zfill(2) - section_id = re.sub(r'\s+', '', header_tag.get_text()).lower() + section_id = re.sub(r'[\s’&.]+', '', header_tag.get_text()).lower() if re.match(r'^CHAPTER', header_tag.find_previous().text.strip()): snav = 0 snav = snav + 1 @@ -1042,7 +1025,8 @@ def replace_tags(self): elif header_tag.get('class') == [self.class_regex["ol"]]: if section := re.search(r'^SECTION (?P<sec>\d+)[.:]', header_tag.text.strip()): header_tag.name = "h3" - header_tag["id"] = 
f"{header_tag.find_previous('h3',class_='chapterh2').get('id')}s{section.group('sec').zfill(2)}" + header_tag[ + "id"] = f"{header_tag.find_previous('h3', class_='chapterh2').get('id')}s{section.group('sec').zfill(2)}" if section := re.search(r'^(Article|ARTICLE) (?P<sec>[IVX]+)(\.)?', header_tag.text.strip()): header_tag.name = "h3" header_tag["id"] = f"{header_tag.find_previous('h3').get('id')}a{section.group('sec').zfill(2)}" @@ -1054,7 +1038,7 @@ def replace_tags(self): elif header_tag.get('class') == [self.class_regex["head4"]]: if re.match(r'^\d+\.(\d\.)*', header_tag.text.strip()) \ - and not re.match(r'^(\d+\D*\.\d\d+)', header_tag.text.strip()) : + and not re.match(r'^(\d+\D*\.\d\d+)', header_tag.text.strip()): header_tag.name = "h5" elif header_tag.span: header_tag.name = "h4" @@ -1062,7 +1046,7 @@ def replace_tags(self): if header_tag.name == "h4": if header_tag.find_previous("h3"): prev_tag = header_tag.find_previous("h3").get("id") - tag_text = re.sub(r'\s+', '', header_tag.get_text()).lower() + tag_text = re.sub(r'[\s’&.]+', '', header_tag.get_text()).lower() header_tag["id"] = f"{prev_tag}{tag_text}" if header_tag.find_previous("h4"): @@ -1105,8 +1089,13 @@ def replace_tags(self): innr_subsec_header_tag_id = f"{prev_child_id1}-{innr_subsec_header_id}" header_tag["id"] = innr_subsec_header_tag_id - print("tags are replaced") + stylesheet_link_tag = self.soup.new_tag('link') + stylesheet_link_tag.attrs = {'rel': 'stylesheet', 'type': 'text/css', + 'href': 'https://unicourt.github.io/cic-code-ga/transforms/ga/stylesheet/ga_code_stylesheet.css'} + self.soup.style.replace_with(stylesheet_link_tag) + self.meta_tags.append(stylesheet_link_tag) + print("tags are replaced") def set_chapter_section_nav(self, list_item, chap_num, sub_tag, prev_id, sec_num): nav_list = [] @@ -1173,7 +1162,7 @@ def create_chapter_section_nav(self): self.set_chapter_section_nav(list_item, chap_num, sub_tag, prev_id, None) elif list_item.text.strip().isupper(): - chap_num = re.sub(r'\s+','',list_item.text.strip()).lower() + chap_num = re.sub(r'\s+', '', list_item.text.strip()).lower() prev_id = list_item.find_previous("h1").get("id") sub_tag = "c" self.set_chapter_section_nav(list_item, chap_num, sub_tag, prev_id, None) @@ -1191,7 +1180,6 @@ def create_chapter_section_nav(self): else: if re.match(r'^(\d+\.\d+\.)', list_item.text.strip()): - sec_pattern = re.search(r'^(?P<sec>(?P<chap>\d+)\.\d+)', list_item.text.strip()) chap_num = sec_pattern.group("chap").zfill(2) sec_num = sec_pattern.group("sec").zfill(2) @@ -1227,7 +1215,7 @@ def create_chapter_section_nav(self): chap_num = re.search(r'((?P<chap>\d+[a-zA-Z]*)\.(?P<sub>\d+)-\d+\.)', list_item.text.strip()).group("chap").zfill(2) - sec_num = re.sub(r'[\s\.\[\]]', '', list_item.text.strip()) + sec_num = re.sub(r'[\s\.\[\]’&,]', '', list_item.text.strip()) nav_link = self.soup.new_tag('a') nav_link.string = list_item.text nav_link["href"] = f"#t{self.title_id}c{chap_num}sub{sub_num}s{sec_num}" @@ -1283,8 +1271,8 @@ def create_chapter_section_nav(self): new_link["href"] = f"#{chap_num}sub{subpart_nums.zfill(2)}" list_item.contents = [new_link] - elif re.match(r'^\d+-\d+\.\d+\.',list_item.text.strip()): - chap_num = re.search(r'^(?P<sec>(?P<chap>\d+)-\d+\.\d+)\.',list_item.text.strip()) + elif re.match(r'^\d+-\d+\.\d+\.', list_item.text.strip()): + chap_num = re.search(r'^(?P<sec>(?P<chap>\d+)-\d+\.\d+)\.', list_item.text.strip()) new_link = self.soup.new_tag('a') new_link.string = list_item.text new_link["href"] = 
f"#t{self.title_id}c{chap_num.group('chap')}s{chap_num.group('sec')}" @@ -1293,23 +1281,26 @@ def create_chapter_section_nav(self): elif re.match(r'^(\d+\.\d+\D?-\d+\.)|^(\d+\.\d+\D?,?)', list_item.text.strip()): chap_num = re.search(r'^([^.]+)', list_item.text.strip()).group().zfill(2) - sec_num = re.search(r'^(\d+\.\d+\D?-\d+)|^(\d+\.\d+\D?),?', list_item.text.strip()).group().zfill(2) + sec_num = re.search(r'^(\d+\.\d+\D?-\d+)|^(\d+\.\d+\D?),?', + list_item.text.strip()).group().zfill(2) self.set_chapter_section_nav(list_item, chap_num, None, None, sec_num) else: - chapter_header = list_item.find_previous("h2") - chap_nums = re.search(r'(\s+[^\s]+)', chapter_header.text.strip()).group() - chap_num = re.sub(r'\s+', '', chap_nums).zfill(2) - sec_id = re.sub(r'[\s+.]', '', list_item.get_text()).lower() + # print(list_item) + # chapter_header = list_item.find_previous("h2") + # chap_nums = re.search(r'(\s+[^\s]+)', chapter_header.text.strip()).group() + # chap_num = re.sub(r'\s+', '', chap_nums).zfill(2) + + chap_num = list_item.find_previous('h2', class_='chapterh2').get("id") + sec_id = re.sub(r'[\s+.’&,]', '', list_item.get_text()).lower() new_link = self.soup.new_tag('a') new_link.string = list_item.text - new_link["href"] = f"#t{self.title_id}c{chap_num}{sec_id}" + new_link["href"] = f"#{chap_num}{sec_id}" list_item.contents = [new_link] - - + # print(list_item) + # print(chap_num) def create_ul_tag_to_notes_to_decision(self): new_ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) - # new_nav_tag = self.soup.new_tag("nav") innr_ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) innr_ul_tag1 = self.soup.new_tag("ul", **{"class": "leaders"}) innr_ul_tag2 = self.soup.new_tag("ul", **{"class": "leaders"}) @@ -1321,7 +1312,6 @@ def create_ul_tag_to_notes_to_decision(self): else: tag_class = self.class_regex["ol"] - for note_tag in self.soup.find_all(class_=tag_class): nd_tag_text = re.sub(r'[\W]', '', note_tag.get_text()).lower() if re.match(note_nav_pattern, note_tag.text.strip()) and nd_tag_text in self.nd_list: @@ -1350,8 +1340,9 @@ def create_ul_tag_to_notes_to_decision(self): new_ul_tag.append(note_tag) # - - elif re.match(r'^(\d+\.\s*—\s*“?[a-zA-Z]+)|^(\d+\.\d+)|^(\d+\.(\d+\.)\s*“*[a-zA-Z]+)|^(\d+\.\s*—\s*“?[0-9]+)', - note_tag.text.strip()) and note_tag.name == "li": + elif re.match( + r'^(\d+\.\s*—\s*“?[a-zA-Z]+)|^(\d+\.\d+)|^(\d+\.(\d+\.)\s*“*[a-zA-Z]+)|^(\d+\.\s*—\s*“?[0-9]+)', + note_tag.text.strip()) and note_tag.name == "li": if re.match(r'^(\d+\.\s*“?[a-zA-Z]+)|^(\d+\.\d+)|^(\d+\.(\d+\.)\s*“*[a-zA-Z]+)', note_tag.find_previous().text.strip()) and note_tag.name == "li": if re.match(r'^(\d+\.(\d+\.)\s*“*[a-zA-Z]+)', note_tag.find_previous().text.strip()): @@ -1420,8 +1411,6 @@ def set_ref_link_to_notetodecision_nav(self, nd_tag, prev_head_tag, sub_sec_id, nd_tag.insert(0, nav_link) return f"{prev_head_tag}-{sub_sec_id}" - - def create_ref_link_to_notetodecision_nav(self): nav_link = self.soup.new_tag('a') innr_nav_link1 = self.soup.new_tag('a') @@ -1435,8 +1424,6 @@ def create_ref_link_to_notetodecision_nav(self): else: nd_class_name = self.class_regex['ol'] - - for nd_tag in self.soup.find_all(class_=nd_class_name): nd_tag_text = re.sub(r'[\W]', '', nd_tag.get_text()).lower() @@ -1538,14 +1525,13 @@ def create_ref_link_to_notetodecision_nav(self): p_text = re.sub(r'[\s.—]', '', nd_tag.text.strip()) nav_link_list.append(p_text) - def convert_paragraph_to_alphabetical_ol_tags(self): main_sec_alpha = 'a' cap_alpha = 'A' ol_head = 1 num_count = 1 roman_count = 1 - 
alpha_ol = self.soup.new_tag("ol", Class="alpha") + alpha_ol = self.soup.new_tag("ol", type="a") cap_alpha_ol = self.soup.new_tag("ol", type="A") inner_ol = self.soup.new_tag("ol", type="i") cap_roman_ol = self.soup.new_tag("ol", type="I") @@ -1555,7 +1541,7 @@ def convert_paragraph_to_alphabetical_ol_tags(self): ol_head1 = 1 sec_alpha = 'a' sec_alpha1 = 'a' - alpha_cur_tag2 =None + alpha_cur_tag2 = None for p_tag in self.soup.find_all(): if p_tag.b: @@ -1566,6 +1552,7 @@ def convert_paragraph_to_alphabetical_ol_tags(self): p_tag.span.unwrap() current_tag_text = p_tag.text.strip() + if p_tag.name == "h3": num_cur_tag = None @@ -1604,7 +1591,7 @@ def convert_paragraph_to_alphabetical_ol_tags(self): ol_head1 += 1 if re.search(r'^\(\d+\)(\s)*\([a-z]\)', current_tag_text): - alpha_ol = self.soup.new_tag("ol", Class="alpha") + alpha_ol = self.soup.new_tag("ol", type="a") li_tag = self.soup.new_tag("li") li_tag.string = re.sub(r'^\(\d+\)(\s)*\(\w\)', '', current_tag_text) li_tag.append(current_tag_text) @@ -1625,15 +1612,12 @@ def convert_paragraph_to_alphabetical_ol_tags(self): inner_li_tag.append(current_tag_text) cur_tag = re.search(r'^\((?P<cid>\d+)\)(\s)?\((?P<pid>\w)\)\s(?P<nid>\d+)\.', current_tag_text) prev_id = f'{num_cur_tag.get("id")}{cur_tag.group("pid")}' - inner_li_tag["id"] = f'{num_cur_tag.get("id")}{cur_tag.group("pid")}{cur_tag.group("nid")}' num_ol1.append(inner_li_tag) alpha_cur_tag.string = "" alpha_cur_tag.append(num_ol1) - num_count = 2 - elif re.search(rf'^\(\s*{main_sec_alpha}\s*\)', current_tag_text): p_tag.name = "li" alpha_cur_tag = p_tag @@ -1641,8 +1625,8 @@ def convert_paragraph_to_alphabetical_ol_tags(self): num_count = 1 ol_head1 = 1 - if re.search(r'^\(a\)', current_tag_text) : - alpha_ol = self.soup.new_tag("ol", Class="alpha") + if re.search(r'^\(a\)', current_tag_text): + alpha_ol = self.soup.new_tag("ol", type="a") p_tag.wrap(alpha_ol) if num_cur_tag: prevnum_id = num_cur_tag.get("id") @@ -1658,8 +1642,6 @@ def convert_paragraph_to_alphabetical_ol_tags(self): p_tag.string = re.sub(rf'^\(\s*{main_sec_alpha}\s*\)', '', current_tag_text) main_sec_alpha = chr(ord(main_sec_alpha) + 1) - - if re.search(r'^\(\w\)\s?1\.', current_tag_text): num_ol1 = self.soup.new_tag("ol") inner_li_tag = self.soup.new_tag("li") @@ -1676,16 +1658,14 @@ def convert_paragraph_to_alphabetical_ol_tags(self): num_count = 2 sec_alpha = 'a' - elif re.search(r'^\(\s*\d\d\s*\)', current_tag_text): p_tag.name = "li" - p_tag_text = re.search(r'^\(\s*(?P<id>\d\d)\s*\)',current_tag_text).group("id") + p_tag_text = re.search(r'^\(\s*(?P<id>\d\d)\s*\)', current_tag_text).group("id") alpha_ol.append(p_tag) p_tag["id"] = f'{prevnum_id}{p_tag_text}' p_tag.string = re.sub(r'^\(\s*\d\d\s*\)', '', current_tag_text) - - elif re.search(rf'^{num_count}\.', current_tag_text) and p_tag.name == "p" : + elif re.search(rf'^{num_count}\.', current_tag_text) and p_tag.name == "p": p_tag.name = "li" num_tag = p_tag sec_alpha = 'a' @@ -1693,32 +1673,26 @@ def convert_paragraph_to_alphabetical_ol_tags(self): if re.search(r'^1\.', current_tag_text): num_ol1 = self.soup.new_tag("ol") p_tag.wrap(num_ol1) - if alpha_cur_tag: prev_id = alpha_cur_tag.get("id") alpha_cur_tag.append(num_ol1) - elif cap_alpha_cur_tag: prev_id = cap_alpha_cur_tag.get("id") cap_alpha_cur_tag.append(num_ol1) elif num_cur_tag: prev_id = num_cur_tag.get("id") num_cur_tag.append(num_ol1) - else: - prev_id = f'{p_tag.find_previous({"h4", "h3"}).get("id")}ol{ol_count}' else: num_ol1.append(p_tag) - p_tag["id"] = f'{prev_id}{num_count}' p_tag.string 
= re.sub(rf'^{num_count}\.', '', current_tag_text) num_count += 1 if re.search(r'^\d+\.\s?a\.', current_tag_text): - - innr_alpha_ol = self.soup.new_tag("ol", Class="alpha") + innr_alpha_ol = self.soup.new_tag("ol", type="a") inner_li_tag = self.soup.new_tag("li") inner_li_tag.string = re.sub(r'^\d+\.\s?a\.', '', current_tag_text) inner_li_tag.append(current_tag_text) @@ -1732,24 +1706,18 @@ def convert_paragraph_to_alphabetical_ol_tags(self): p_tag.insert(0, innr_alpha_ol) sec_alpha = 'b' - - elif re.search(rf'^{sec_alpha}\.', current_tag_text): p_tag.name = "li" alpha_cur_tag1 = p_tag roman_count = 1 ol_head1 = 1 - - - if re.search(r'^a\.', current_tag_text) : - - innr_alpha_ol = self.soup.new_tag("ol", Class="alpha") + if re.search(r'^a\.', current_tag_text): + innr_alpha_ol = self.soup.new_tag("ol", type="a") previd = p_tag.find_previous("li") p_tag.wrap(innr_alpha_ol) prevnum_id1 = previd.get("id") previd.append(innr_alpha_ol) p_tag["id"] = f'{prevnum_id1}{sec_alpha}' - else: innr_alpha_ol.append(p_tag) p_tag["id"] = f'{prevnum_id1}{sec_alpha}' @@ -1757,7 +1725,6 @@ def convert_paragraph_to_alphabetical_ol_tags(self): p_tag.string = re.sub(rf'^{sec_alpha}\.', '', current_tag_text) sec_alpha = chr(ord(sec_alpha) + 1) - if re.search(r'^\w+\.\s?i\.', current_tag_text): roman_ol = self.soup.new_tag("ol", type="i") inner_li_tag = self.soup.new_tag("li") @@ -1772,14 +1739,11 @@ def convert_paragraph_to_alphabetical_ol_tags(self): p_tag.string = "" p_tag.insert(0, roman_ol) - elif re.search(rf'^{cap_alpha}\.', current_tag_text): - p_tag.name = "li" cap_alpha_cur_tag = p_tag cap_alpha1 = cap_alpha num_count = 1 - if re.search(r'^A\.', current_tag_text): cap_alpha_ol = self.soup.new_tag("ol", type="A") p_tag.wrap(cap_alpha_ol) @@ -1797,9 +1761,7 @@ def convert_paragraph_to_alphabetical_ol_tags(self): cap_alpha = chr(ord(cap_alpha) + 1) - - - elif re.search(r'^[IVX]+\.',current_tag_text): + elif re.search(r'^[IVX]+\.', current_tag_text): p_tag.name = "li" cap_roman_cur_tag = p_tag ol_head = 1 @@ -1811,10 +1773,9 @@ def convert_paragraph_to_alphabetical_ol_tags(self): else: cap_roman_ol.append(p_tag) - - rom_head = re.search(r'^(?P<rom>[IVX]+)\.',current_tag_text) + rom_head = re.search(r'^(?P<rom>[IVX]+)\.', current_tag_text) p_tag["id"] = f'{prev_id1}ol{ol_count}{rom_head.group("rom")}' - p_tag.string = re.sub(r'^[IVX]+\.','',current_tag_text) + p_tag.string = re.sub(r'^[IVX]+\.', '', current_tag_text) elif re.search(r'^[ivx]+\.', current_tag_text): p_tag.name = "li" @@ -1822,7 +1783,6 @@ def convert_paragraph_to_alphabetical_ol_tags(self): if re.search(r'^i\.', current_tag_text): roman_ol = self.soup.new_tag("ol", type="i") - p_tag.wrap(roman_ol) alpha_cur_tag1.append(roman_ol) prev_id1 = alpha_cur_tag1.get("id") @@ -1835,8 +1795,6 @@ def convert_paragraph_to_alphabetical_ol_tags(self): p_tag["id"] = f'{prev_id1}{rom_head.group("rom")}' p_tag.string = re.sub(r'^[ivx]+\.', '', current_tag_text) - - if re.search(r'^History|^Cross references:|^OFFICIAL COMMENT', current_tag_text) or p_tag.name in ['h3']: ol_head = 1 ol_head1 = 1 @@ -1853,8 +1811,6 @@ def convert_paragraph_to_alphabetical_ol_tags(self): alpha_cur_tag1 = None sec_alpha1 = 'a' - - def write_soup_to_file(self): """ @@ -1863,10 +1819,14 @@ def write_soup_to_file(self): - write html str to an output file """ soup_str = str(self.soup.prettify(formatter=None)) + + for tag in self.meta_tags: + cleansed_tag = re.sub(r'/>', ' />', str(tag)) + soup_str = re.sub(rf'{tag}', rf'{cleansed_tag}', soup_str, re.I) + with 
open(f"../../cic-code-ky/transforms/ky/ocky/r{self.release_number}/{self.html_file_name}", "w") as file: file.write(soup_str) - def css_file(self): head = self.soup.find("head") style = self.soup.head.find("style") @@ -1891,7 +1851,7 @@ def start_parse(self): start_time = datetime.now() print(start_time) self.create_page_soup() - self.css_file() + # self.css_file() if re.search('constitution', self.html_file_name): self.class_regex = {'ul': '^(§ )|^(ARTICLE)', 'head2': '^(§ )|^(ARTICLE)', 'title': '^(CONSTITUTION OF KENTUCKY)|^(THE CONSTITUTION OF THE UNITED STATES OF AMERICA)', @@ -1918,7 +1878,6 @@ def start_parse(self): self.create_main_tag() self.create_ul_tag() self.create_chapter_section_nav() - self.create_ref_link_to_notetodecision_nav() self.create_ul_tag_to_notes_to_decision() self.create_and_wrap_with_div_tag() @@ -1928,6 +1887,3 @@ def start_parse(self): self.write_soup_to_file() print(datetime.now() - start_time) - - - diff --git a/html_parser/nc_html_parser.py b/html_parser/nc_html_parser.py index 21c99ec..477de9e 100644 --- a/html_parser/nc_html_parser.py +++ b/html_parser/nc_html_parser.py @@ -22,11 +22,12 @@ def __init__(self, input_file_name): self.title = None self.previous = None self.junk_tag_class = ['Apple-converted-space', 'Apple-tab-span'] - self.class_regex = {'head1': r'Chapter \d+', 'ul': r'^Article|^1\.|^(?P<sec_id>\d+([A-Z])*-\d+(\.\d+)*)', - 'head2': r'^ARTICLE \d+\.', + self.class_regex = {'head1': r'Chapter \d+', + 'ul': r'^Article|^1\.|^(?P<sec_id>\d+([A-Z])*-\d+(\.\d+)*)|^§|^Article 1.', + 'head2': r'^ARTICLE \d+\.|^Article 1.|^ARTICLE I\.', 'head4': '^CASE NOTES|^OFFICIAL COMMENT', - 'head3': '^§* \d+([A-Z])*-\d+(-\d+)*(\.|,| through)', 'ol_p': r'^\(\d\)|^I\.', - 'junk1': '^Annotations$', 'nav': '^Subchapter I\.|^——————————'} + 'head3': r'^§* \d+([A-Z])*-\d+(-\d+)*(\.|,| through)|^§', 'ol_p': r'^\(\d\)|^I\.', + 'junk1': '^Annotations$', 'nav': r'^Subchapter I\.|^——————————'} self.watermark_text = """Release {0} of the Official Code of North Carolina Annotated released {1}. Transformed and posted by Public.Resource.Org using cic-beautify-state-codes version v1.3 on {2}. @@ -91,7 +92,7 @@ def remove_junk(self): junk_tag.decompose() [text_junk.decompose() for text_junk in self.soup.find_all("p", class_=self.class_regex["junk1"]) if - re.search('^Annotations|^Text|^Statute text', text_junk.text.strip())] + re.search('^Annotations|^Text|^Statute text|^History', text_junk.text.strip())] [text_junk.decompose() for text_junk in self.soup.find_all("p", class_=self.class_regex["nav"]) if re.search('^——————————', text_junk.text.strip())] @@ -126,14 +127,29 @@ def recreate_tag(self): new_tag["class"] = "casenote" p_tag.unwrap() + if re.search(r'^Analysis', p_tag.text.strip()): + for tag in p_tag.find_next_siblings(): + if tag.get('class') == [self.class_regex["head4"]]: + break + else: + tag["class"] = "casenote" + + # for tag in next_sibling_tags: + # if re.search(r'^([IVX]|[A-Z]|[1-9])\. ',tag.text.strip()): + # print(tag) + def replace_tags(self): watermark_p = None title_tag = None + analysis_list = [] cur_head_list = [] cur_id_list = [] - cap_alpha = 'A' + # cap_alpha = 'A' + cap_alpha = None cap_roman = "I" + cap_num = None alpha = None + ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) head4_list = ['Revision of title. —', 'Cross references. —', 'Law reviews. —', 'Editor\'s notes. —', 'History.', 'Effective dates. 
—'] @@ -150,7 +166,7 @@ def replace_tags(self): if header_tag.get("class") == [self.class_regex["head1"]]: if re.search(r'^Constitution of North Carolina|^Constitution of the United States', - header_tag.text.strip()): + header_tag.text.strip(), re.I): header_tag.name = "h1" header_tag.wrap(self.soup.new_tag("nav")) header_tag['id'] = self.title_id @@ -161,34 +177,31 @@ def replace_tags(self): self.soup.find("nav").insert(0, watermark_p) elif header_tag.get("class") == [self.class_regex["head2"]]: - if re.search(r'^ARTICLE [IVX]+', header_tag.text.strip()): + if re.search(r'^(ARTICLE|Article) [IVX]+', header_tag.text.strip(), re.I): header_tag.name = "h2" - article_id = re.search(r'^ARTICLE (?P<ar_id>[IVX]+)', header_tag.text.strip()).group('ar_id') + article_id = re.search(r'^(ARTICLE|Article) (?P<ar_id>[IVX]+)', header_tag.text.strip(), + re.I).group('ar_id') header_tag[ 'id'] = f"{header_tag.find_previous('h1').get('id')}a{article_id.zfill(2)}" - - - elif re.search(r'^§ \d+\.', header_tag.text.strip()): header_tag.name = "h3" sec_id = re.search(r'^§ (?P<s_id>\d+)\.', header_tag.text.strip()).group('s_id') header_tag[ 'id'] = f"{header_tag.find_previous('h2').get('id')}s{sec_id.zfill(2)}" - if re.search(r'^AMENDMENTS|^Preamble', header_tag.text.strip()): + if re.search(r'^(\d+[A-Z])*? AMENDMENTS|^Preamble', header_tag.text.strip(), re.I): header_tag.name = "h2" article_id = re.sub(r'[\s\W]+', '', header_tag.text.strip()).lower() header_tag[ 'id'] = f"{header_tag.find_previous('h1').get('id')}a{article_id.zfill(2)}" header_tag['class'] = "amend" - - elif header_tag.get("class") == [self.class_regex["head3"]]: if re.search(r'^§ \d+\.', header_tag.text.strip()): header_tag.name = "h3" sec_id = re.search(r'^§ (?P<s_id>\d+)\.', header_tag.text.strip()).group('s_id') + header_tag[ 'id'] = f"{header_tag.find_previous('h2').get('id')}s{sec_id.zfill(2)}" elif re.search(r'^(Section|Sec\.) 
\d+', header_tag.text.strip()): @@ -201,14 +214,13 @@ def replace_tags(self): header_tag[ 'id'] = f"{header_tag.find_previous('h3', class_='amend').get('id')}s{sec_id.zfill(2)}" - elif re.search(r'^Amendment \d+', header_tag.text.strip()): + elif re.search(r'^Amendment (\d+|[IVX]+)', header_tag.text.strip()): header_tag.name = "h3" - sec_id = re.search(r'^Amendment (?P<s_id>\d+)', header_tag.text.strip()).group('s_id') + sec_id = re.search(r'^Amendment (?P<s_id>(\d+|[IVX]+))', header_tag.text.strip()).group('s_id') header_tag[ 'id'] = f"{header_tag.find_previous('h2', class_='amend').get('id')}s{sec_id.zfill(2)}" - elif header_tag.get("class") == [self.class_regex["head4"]]: if re.search(r'^CASE NOTES|^OFFICIAL COMMENT|^COMMENT', header_tag.text.strip()): header_tag.name = "h4" @@ -248,13 +260,9 @@ def replace_tags(self): h5_num_id = f"{h5_alpha_id}-{h5_num_text}" header_tag['id'] = h5_num_id - - - - elif header_tag.get("class") == [self.class_regex["ul"]] and not re.search('^(Article|Sec\.)', + elif header_tag.get("class") == [self.class_regex["ul"]] and not re.search('^(Article|Sec\.)$', header_tag.text.strip()): header_tag.name = "li" - if header_tag.find_previous().name == "li": ul_tag.append(header_tag) @@ -268,19 +276,20 @@ def replace_tags(self): nav_tag = self.soup.new_tag("nav") ul_tag.wrap(nav_tag) - # titlefiles else: title_pattern = re.compile(r'^(Chapter)\s(?P<title_id>\d+([A-Z])*)') - subchapter_pattern = re.compile(r'^Subchapter (?P<s_id>[IVX]+([A-Z])*)\.') + subchapter_pattern = re.compile(r'^(Subchapter|SUBCHAPTER) (?P<s_id>[IVX]+-*?([A-Z])*)\.') section_pattern = re.compile( - r'^§+\s*(?P<sec_id>\d+([A-Z])*-\d+([A-Z])*(\.\d+[A-Z]*)*(-\d+)*)[:., through]') + r'^§+\s*(?P<sec_id>\d+([A-Z])*-\d+([A-Z])*(\.\d+[A-Z]*)*(-\d+)*|\d+([A-Z])*)[:., through]') chap_ul_pattern = re.compile(r'^(?P<s_id>[0-9]*[A-Z]*(\.\d+)*)([:.,]| through| to)') sec_ul_pattern = re.compile(r'^(?P<sec_id>\d+([A-Z])*-\d+([A-Z])*(\.\d+)*(-\d+)*)[., through]') - SUBCHAPTER_pattern = re.compile(r'^SUBCHAPTER (?P<s_id>[IVX]+([A-Z])*)\.') - article_pattern = re.compile(r'^ARTICLE (?P<a_id>\d+([A-Z])*)(\.| to)') + rule_pattern = re.compile(r'^Rule(s)*?\s(?P<r_id>(\d+[A-Z]*?-\d+[A-Z]*?-\d+[A-Z]*?)|\d+)[:., through]') + article_pattern = re.compile(r'^(ARTICLE|Article) (?P<a_id>\d+([A-Z])*(\.\d+)*|[IVX]+)(\.| to)') section_rule_pattern = re.compile(r'^Rule(s)*\s(?P<r_id>\d+(\.\d+)*)[:., through]') sub_article_pattern = re.compile(r'^(ARTICLE|Article) (?P<s_id>([IVX]+([A-Z])*)*(\d+)*)') + part_pattern = re.compile(r'^Part (?P<pid>\d+[A-Z]*?)\.') + subpart_pattern = re.compile(r'Subpart (?P<aid>[0-9A-Z]+)\.') if header_tag.get("class") == [self.class_regex["head1"]]: if title_pattern.search(header_tag.text.strip()): @@ -296,14 +305,41 @@ def replace_tags(self): elif header_tag.get("class") == [self.class_regex["head2"]]: - - if SUBCHAPTER_pattern.search(header_tag.text.strip()): + if subchapter_pattern.search(header_tag.text.strip()): header_tag.name = "h2" - subchapter_id = SUBCHAPTER_pattern.search(header_tag.text.strip()).group('s_id') + subchapter_id = subchapter_pattern.search(header_tag.text.strip()).group('s_id') header_tag[ 'id'] = f"{header_tag.find_previous('h1').get('id')}s{subchapter_id.zfill(2)}" header_tag["class"] = "subchap" + elif part_pattern.search(header_tag.text.strip()): + header_tag.name = "h2" + subchapter_id = part_pattern.search(header_tag.text.strip()).group('pid') + curr_head_id = f"{header_tag.find_previous('h2',class_='article').get('id')}p{subchapter_id.zfill(2)}" + if curr_head_id in 
cur_head_list: + header_tag['id'] = f"{curr_head_id}.1" + else: + header_tag['id'] = f"{curr_head_id}" + + header_tag["class"] = "part" + cur_head_list.append(curr_head_id) + + + elif subpart_pattern.search(header_tag.text.strip()): + header_tag.name = "h2" + subchapter_id = subpart_pattern.search(header_tag.text.strip()).group('aid') + curr_head_id = f"{header_tag.find_previous('h2',class_='part').get('id')}sp{subchapter_id.zfill(2)}" + + if curr_head_id in cur_head_list: + header_tag[ + 'id'] = f"{header_tag.find_previous('h2',class_='part').get('id')}sp{subchapter_id.zfill(2)}.1" + else: + header_tag[ + 'id'] = f"{header_tag.find_previous('h2', class_='part').get('id')}sp{subchapter_id.zfill(2)}" + + header_tag["class"] = "subpart" + cur_head_list.append(curr_head_id) + elif article_pattern.search(header_tag.text.strip()): header_tag.name = "h2" chapter_id = article_pattern.search(header_tag.text.strip()).group('a_id') @@ -318,6 +354,26 @@ def replace_tags(self): header_tag["class"] = "article" self.count = 1 + elif section_pattern.search(header_tag.text.strip()): + header_tag.name = "h3" + section_id = section_pattern.search(header_tag.text.strip()).group('sec_id') + + curr_head_id = f"{header_tag.find_previous({'h2', 'h1'}).get('id')}s{section_id.zfill(2)}" + + if curr_head_id in cur_head_list: + + header_tag[ + 'id'] = f"{header_tag.find_previous({'h2', 'h1'}).get('id')}s{section_id.zfill(2)}.{self.count}." + self.count += 1 + else: + header_tag[ + 'id'] = f"{header_tag.find_previous({'h2', 'h1'}).get('id')}s{section_id.zfill(2)}" + + cur_head_list.append(curr_head_id) + header_tag["class"] = "section" + self.head4count = 1 + + elif header_tag.get("class") == [self.class_regex["head3"]]: if section_pattern.search(header_tag.text.strip()): @@ -329,7 +385,7 @@ def replace_tags(self): if curr_head_id in cur_head_list: header_tag[ - 'id'] = f"{header_tag.find_previous({'h2', 'h1'}).get('id')}s{section_id.zfill(2)}.{self.count}" + 'id'] = f"{header_tag.find_previous({'h2', 'h1'}).get('id')}s{section_id.zfill(2)}.{self.count}." 
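# Illustrative sketch of the find_previous() id chaining used in the part and
# subpart branches above: each new heading id is built from the nearest
# enclosing heading's id, so a part under an article inherits its prefix.
# The markup below is a made-up minimal example, not from the NC source files.
from bs4 import BeautifulSoup

soup = BeautifulSoup(
    '<h1 id="t14"></h1>'
    '<h2 class="article" id="t14a02">Article 2.</h2>'
    '<h2 class="part">Part 1A. Example part.</h2>',
    'html.parser')

part_tag = soup.find('h2', class_='part')
prev_id = part_tag.find_previous('h2', class_='article').get('id')
part_tag['id'] = f"{prev_id}p{'1A'.zfill(2)}"   # -> 't14a02p1A'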
self.count += 1 else: header_tag[ @@ -339,11 +395,18 @@ def replace_tags(self): header_tag["class"] = "section" self.head4count = 1 - elif section_rule_pattern.search(header_tag.text.strip()): + elif section_rule_pattern.search(header_tag.text.strip()) or \ + rule_pattern.search(header_tag.text.strip()): header_tag.name = "h3" - rule_sec = section_rule_pattern.search(header_tag.text.strip()).group("r_id") + if section_rule_pattern.search(header_tag.text.strip()): + rule_sec = section_rule_pattern.search(header_tag.text.strip()).group("r_id") + + else: + rule_sec = rule_pattern.search(header_tag.text.strip()).group("r_id") + header_tag[ 'id'] = f"{header_tag.find_previous('h2', class_='article').get('id')}r{rule_sec.zfill(2)}" + header_tag["class"] = "rulesec" @@ -370,28 +433,33 @@ def replace_tags(self): elif re.search(rf'^{cap_roman}\.', header_tag.text.strip()): - header_tag.name = "h5" - h5_rom_text = re.search(r'^(?P<h5_id>[IVX]+)\.', header_tag.text.strip()).group("h5_id") - h5_rom_id = f"{header_tag.find_previous('h4').get('id')}-{h5_rom_text}" + h5_rom_id = f"{header_tag.find_previous({'h4', 'h3'}).get('id')}-{h5_rom_text}" header_tag['id'] = h5_rom_id cap_alpha = 'A' cap_roman = roman.toRoman(roman.fromRoman(cap_roman.upper()) + 1) - elif re.search(fr'^{cap_alpha}\.', header_tag.text.strip()): + + elif cap_alpha and re.search(fr'^{cap_alpha}\.', header_tag.text.strip()): header_tag.name = "h5" h5_alpha_text = re.search(r'^(?P<h5_id>[A-Z]+)\.', header_tag.text.strip()).group("h5_id") h5_alpha_id = f"{h5_rom_id}-{h5_alpha_text}" header_tag['id'] = h5_alpha_id cap_alpha = chr(ord(cap_alpha) + 1) + cap_num = 1 - elif re.search(r'^\d+\.', header_tag.text.strip()): + + elif cap_num and re.search(fr'^{cap_num}\.', header_tag.text.strip()): header_tag.name = "h5" h5_num_text = re.search(r'^(?P<h5_id>\d+)\.', header_tag.text.strip()).group("h5_id") + h5_num_id = f"{h5_alpha_id}-{h5_num_text}" header_tag['id'] = h5_num_id + cap_num += 1 + + elif header_tag.get("class") == [self.class_regex["nav"]]: if subchapter_pattern.search(header_tag.text.strip()): @@ -414,14 +482,22 @@ def replace_tags(self): chapter_id = sub_article_pattern.search(header_tag.text.strip()).group('s_id') header_tag[ - 'id'] = f"{header_tag.find_previous('h3').get('id')}a{chapter_id.zfill(2)}" - + 'id'] = f"{header_tag.find_previous({'h3','h2','h1'}).get('id')}a{chapter_id.zfill(2)}" header_tag["class"] = "subar" + + + elif header_tag.get("class") == [self.class_regex["ul"]]: if chap_ul_pattern.search(header_tag.text.strip()) or \ - sec_ul_pattern.search(header_tag.text.strip()): + sec_ul_pattern.search(header_tag.text.strip()) or \ + section_pattern.search(header_tag.text.strip()) or \ + article_pattern.search(header_tag.text.strip()) or \ + subchapter_pattern.search(header_tag.text.strip()) or \ + rule_pattern.search(header_tag.text.strip()) or \ + part_pattern.search(header_tag.text.strip()) or \ + subpart_pattern.search(header_tag.text.strip()): header_tag.name = "li" if header_tag.find_previous().name == "li": @@ -454,6 +530,7 @@ def replace_tags(self): stylesheet_link_tag.attrs = {'rel': 'stylesheet', 'type': 'text/css', 'href': 'https://unicourt.github.io/cic-code-ga/transforms/ga/stylesheet/ga_code_stylesheet.css'} self.soup.style.replace_with(stylesheet_link_tag) + self.meta_tags.append(stylesheet_link_tag) print('tags replaced') @@ -489,15 +566,18 @@ def create_chapter_section_nav(self): count = 0 - section_pattern = re.compile(r'^(?P<sec_id>\d+([A-Z])*-\d+([A-Z])*(\.\d+[A-Z]*)*(-\d+)*)[., through]') - 
ul_pattern = re.compile(r'^(?P<s_id>[0-9]*[A-Z]*(\.\d+)*)[:., through]') - subchapter_pattern = re.compile(r'^Subchapter (?P<s_id>[IVX]+([A-Z])*)\.') + section_pattern = re.compile(r'^§*?\s*?(?P<sec_id>\d+([A-Z])*-\d+([A-Z])*(\.\d+[A-Z]*)*(-\d+)*|\d+([A-Z])*)[., through]') + ul_pattern = re.compile(r'^Article*?\s*?(?P<s_id>[0-9A-Z]+(\.\d+)*)[:., through]') + subchapter_pattern = re.compile(r'^Subchapter (?P<s_id>[IVX]+-*?([A-Z])*)\.') + rule_pattern = re.compile(r'^Rule(s)*?\s(?P<r_id>(\d+[A-Z]*?-\d+[A-Z]*?-\d+[A-Z]*?)|\d+)[:., through]') + part_pattern = re.compile(r'^Part (?P<pid>\d+[A-Z]*?)\.') + subpart_pattern = re.compile(r'Subpart (?P<aid>[0-9A-Z]+)\.') for list_item in self.soup.find_all(): if list_item.name == "li": if re.search('constitution', self.html_file_name): - if re.search(r'^[IVX]+\.', list_item.text.strip()): - chap_num = re.search(r'^(?P<chap>[IVX]+)\. ', list_item.text.strip()).group( + if re.search(r'^Article [IVX]+ ', list_item.text.strip()): + chap_num = re.search(r'^Article (?P<chap>[IVX]+) ', list_item.text.strip()).group( "chap").zfill(2) sub_tag = "a" prev_id = None @@ -505,7 +585,7 @@ def create_chapter_section_nav(self): cnav = f'anav{self.c_nav_count:02}' self.set_chapter_section_nav(list_item, chap_num.zfill(2), sub_tag, prev_id, None, cnav) - elif re.search(r'^Preamble|^AMENDMENTS', list_item.text.strip()): + elif re.search(r'^Preamble|^(\d+[A-Z])*? AMENDMENTS', list_item.text.strip(), re.I): article_id = re.sub(r'[\s\W]+', '', list_item.text.strip()).lower() sub_tag = "a" prev_id = None @@ -513,9 +593,18 @@ def create_chapter_section_nav(self): cnav = f'anav{self.c_nav_count:02}' self.set_chapter_section_nav(list_item, article_id, sub_tag, prev_id, None, cnav) + elif re.search(r'^Amendment [IVX]+\.', list_item.text.strip()): + article_id = re.search(r'^Amendment (?P<aid>[IVX]+)\.', list_item.text.strip()).group( + "aid").zfill(2) + sub_tag = "s" + prev_id = list_item.find_previous('h2', class_='amend').get("id") + self.c_nav_count += 1 + cnav = f'anav{self.c_nav_count:02}' + self.set_chapter_section_nav(list_item, article_id, sub_tag, prev_id, None, cnav) + - elif re.search(r'^\d+\.', list_item.text.strip()): - chap_num = re.search(r'^(?P<chap>\d+)\. ', list_item.text.strip()).group( + elif re.search(r'^§ \d+\.', list_item.text.strip()): + chap_num = re.search(r'^§ (?P<chap>\d+)\. 
', list_item.text.strip()).group( "chap").zfill(2) sub_tag = "s" prev_id = list_item.find_previous('h2').get("id") @@ -531,21 +620,56 @@ def create_chapter_section_nav(self): self.s_nav_count += 1 cnav = f'snav{self.s_nav_count:02}' self.set_chapter_section_nav(list_item, chap_id.zfill(2), sub_tag, prev_id, None, cnav) + + elif rule_pattern.search(list_item.text.strip()): + chap_id = rule_pattern.search(list_item.text.strip()).group('r_id') + sub_tag = "r" + prev_id = list_item.find_previous('h2').get("id") + self.s_nav_count += 1 + cnav = f'snav{self.s_nav_count:02}' + self.set_chapter_section_nav(list_item, chap_id.zfill(2), sub_tag, prev_id, None, cnav) + + elif ul_pattern.search(list_item.text.strip()): chap_id = ul_pattern.search(list_item.text.strip()).group('s_id') sub_tag = "a" if list_item.find_previous('p', class_="nav"): prev_id = f"t{self.title_id}s{subchapter_pattern.search(list_item.find_previous('p', class_='nav').text.strip()).group('s_id').zfill(2)}" - elif list_item.find_previous('h2', class_="article"): - prev_id = list_item.find_previous('h2', class_="article").get("id") - sub_tag = "r" + elif list_item.find_previous('h2', class_="subchap"): + prev_id = list_item.find_previous('h2', class_="subchap").get("id") + sub_tag = "a" else: prev_id = list_item.find_previous('h1').get("id") self.s_nav_count += 1 cnav = f'snav{self.s_nav_count:02}' self.set_chapter_section_nav(list_item, chap_id.zfill(2), sub_tag, prev_id, None, cnav) + elif subchapter_pattern.search(list_item.text.strip()): + chap_id = subchapter_pattern.search(list_item.text.strip()).group('s_id') + sub_tag = "s" + prev_id = list_item.find_previous('h1').get("id") + self.s_nav_count += 1 + cnav = f'snav{self.s_nav_count:02}' + self.set_chapter_section_nav(list_item, chap_id.zfill(2), sub_tag, prev_id, None, cnav) + + elif part_pattern.search(list_item.text.strip()): + chap_id = part_pattern.search(list_item.text.strip()).group('pid') + sub_tag = "p" + prev_id = list_item.find_previous('h2',class_="article").get("id") + self.s_nav_count += 1 + cnav = f'snav{self.s_nav_count:02}' + self.set_chapter_section_nav(list_item, chap_id.zfill(2), sub_tag, prev_id, None, cnav) + + elif subpart_pattern.search(list_item.text.strip()): + + chap_id = subpart_pattern.search(list_item.text.strip()).group('aid') + sub_tag = "sp" + prev_id = list_item.find_previous('h2',class_="part").get("id") + self.s_nav_count += 1 + cnav = f'snav{self.s_nav_count:02}' + self.set_chapter_section_nav(list_item, chap_id.zfill(2), sub_tag, prev_id, None, cnav) + elif list_item.name in ['h1', 'h2']: @@ -670,12 +794,14 @@ def convert_paragraph_to_alphabetical_ol_tags(self): cap_roman = "I" small_roman = "i" roman_cur_tag = None + p_tag_terminator = None for p_tag in self.soup.body.find_all(['h3', 'h4', 'h5', 'p']): current_tag_text = p_tag.text.strip() if re.search(rf'^\({main_sec_alpha}\)', current_tag_text) and p_tag.name == "p": + p_tag_terminator = 1 p_tag.name = "li" sec_alpha_cur_tag = p_tag num_count = 1 @@ -712,21 +838,42 @@ def convert_paragraph_to_alphabetical_ol_tags(self): elif re.search(rf'^\([a-z]\d+\)', current_tag_text) and p_tag.name == "p": p_tag.name = "li" - sec_alpha_cur_tag = p_tag + num_count = 1 - sec_alpha_ol.append(p_tag) + if sec_alpha_cur_tag: + sec_alpha_cur_tag.append(p_tag) + else: + sec_alpha_ol = self.soup.new_tag("ol", type="a") + p_tag.wrap(sec_alpha_ol) + sec_alpha_cur_tag = p_tag li_id = re.search(rf'^\((?P<id>[a-z]\d+)\)', current_tag_text).group("id") p_tag["id"] = f'{sec_alpha_id}-{li_id}' p_tag.string = 
re.sub(rf'^\({li_id}\)', '', current_tag_text) + if re.search(rf'^\([a-z]\d+\)\s*\(1\)', current_tag_text): + num_ol1 = self.soup.new_tag("ol") + li_tag = self.soup.new_tag("li") + li_tag.string = re.sub(r'^\([a-z]\d+\)\s*\(1\)', '', current_tag_text) + li_tag.append(current_tag_text) + num_cur_tag1 = li_tag + cur_tag1 = re.search(r'^\((?P<cid>[a-z]\d+\)\s*\((?P<pid>1)\))', current_tag_text) + + num_id1 = f'{sec_alpha_cur_tag.get("id")}{cur_tag1.group("cid")}' + li_tag["id"] = f'{sec_alpha_cur_tag.get("id")}{cur_tag1.group("pid")}' + num_ol1.append(li_tag) + p_tag.string = "" + p_tag.append(num_ol1) + num_count = 2 + elif re.search(rf'^\({num_count}\)', current_tag_text) and p_tag.name == "p": p_tag.name = "li" num_cur_tag1 = p_tag main_sec_alpha1 = 'a' small_roman = "i" + p_tag_terminator = 1 if re.search(r'^\(1\)', current_tag_text): num_ol1 = self.soup.new_tag("ol") @@ -808,6 +955,7 @@ def convert_paragraph_to_alphabetical_ol_tags(self): sec_alpha_cur_tag1 = p_tag small_roman = "i" cap_alpha1_cur_tag = None + ol_head = 1 if re.search(r'^a\.', current_tag_text): sec_alpha_ol1 = self.soup.new_tag("ol", type="a") @@ -826,8 +974,6 @@ def convert_paragraph_to_alphabetical_ol_tags(self): p_tag.string = re.sub(rf'^{main_sec_alpha1}\.', '', current_tag_text) main_sec_alpha1 = chr(ord(main_sec_alpha1) + 1) - ol_head = 1 - if re.search(rf'^[a-z]\.\s*1\.', current_tag_text): head_ol = self.soup.new_tag("ol") li_tag = self.soup.new_tag("li") @@ -844,11 +990,13 @@ def convert_paragraph_to_alphabetical_ol_tags(self): ol_head = 2 - elif re.search(rf'^{ol_head}\.', current_tag_text) and p_tag.get("class") != "casenote" and not p_tag.b: + elif re.search(rf'^{ol_head}\.', current_tag_text) and p_tag.get("class") != "casenote": + p_tag.name = "li" ol_head_tag = p_tag cap_roman = "I" small_roman = "i" + p_tag_terminator = 1 if re.search(r'^1\.', current_tag_text): head_ol = self.soup.new_tag("ol") @@ -874,10 +1022,11 @@ def convert_paragraph_to_alphabetical_ol_tags(self): elif re.search(rf'^{cap_roman}\.', current_tag_text) \ - and p_tag.get("class") != "casenote" and not p_tag.b and ol_head_tag: + and p_tag.get("class") != "casenote" and ol_head_tag: p_tag.name = "li" roman_cur_tag = p_tag + cap_alpha1 = 'A' if re.search(r'^I\.', current_tag_text): roman_ol = self.soup.new_tag("ol", type="I") @@ -899,13 +1048,10 @@ def convert_paragraph_to_alphabetical_ol_tags(self): p_tag.name = "li" roman_cur_tag = p_tag - - if re.search(r'^\(i\)', current_tag_text): smallroman_ol = self.soup.new_tag("ol", type="i") p_tag.wrap(smallroman_ol) - prev_id1 = p_tag.find_previous("li").get('id') p_tag.find_previous("li").append(smallroman_ol) @@ -920,26 +1066,36 @@ def convert_paragraph_to_alphabetical_ol_tags(self): elif re.search(rf'^{cap_alpha1}\.', current_tag_text) \ - and p_tag.get("class") != "casenote" and not p_tag.b: + and p_tag.get("class") != "casenote" and p_tag.name == "p": p_tag.name = "li" cap_alpha1_cur_tag = p_tag - ol_head = 1 + p_tag_terminator = 1 + # ol_head = 1 if re.search(r'^A\.', current_tag_text): cap_alpha1_ol = self.soup.new_tag("ol", type="A") p_tag.wrap(cap_alpha1_ol) - if sec_alpha_cur_tag: - sec_alpha_cur_tag.append(cap_alpha1_ol) - cap_alpha1_id = sec_alpha_cur_tag.get("id") - elif roman_cur_tag: - roman_cur_tag.append(cap_alpha1_ol) - cap_alpha1_id = roman_cur_tag.get("id") - else: + if re.search(r'^ARTICLE [IVX]+',p_tag.find_previous("h4").text.strip()): cap_alpha1_id = f"{p_tag.find_previous({'h5', 'h4', 'h3', 'h2'}).get('id')}ol{ol_count}" ol_count += 1 + ol_head = 1 + else: + if 
sec_alpha_cur_tag: + sec_alpha_cur_tag.append(cap_alpha1_ol) + cap_alpha1_id = sec_alpha_cur_tag.get("id") + elif roman_cur_tag: + roman_cur_tag.append(cap_alpha1_ol) + cap_alpha1_id = roman_cur_tag.get("id") + else: + cap_alpha1_id = f"{p_tag.find_previous({'h5', 'h4', 'h3', 'h2'}).get('id')}ol{ol_count}" + ol_count += 1 + + else: cap_alpha1_ol.append(p_tag) + if re.search(r'^ARTICLE [IVX]+', p_tag.find_previous("h4").text.strip()): + ol_head = 1 p_tag["id"] = f'{cap_alpha1_id}{cap_alpha1}' p_tag.string = re.sub(rf'^{cap_alpha1}\.', '', current_tag_text) @@ -968,7 +1124,25 @@ def convert_paragraph_to_alphabetical_ol_tags(self): p_tag.string = re.sub(rf'^\([a-z]\)', '', current_tag_text) - if re.search(r'^CASE NOTES|^"Sec\. \d+\.', current_tag_text) or p_tag.name in ['h3', 'h4', 'h5']: + # elif re.search(rf'^\(\d+\)', current_tag_text) and p_tag.name == "p": + # p_tag.name = "li" + # num_cur_tag1 = p_tag + # main_sec_alpha1 = 'a' + # small_roman = "i" + # num_ol1.append(p_tag) + # tag_num = re.search(r'^\((?P<nid>\d+)\)',current_tag_text).group("nid") + # p_tag["id"] = f'{num_id1}{tag_num}' + # p_tag.string = re.sub(rf'^\(\d+\)', '', current_tag_text) + + # elif not re.search(r'^CASE NOTES|^SECTION|^"Sec\. \d+\.|^Official Commentary|^History\.|^Editor’s Note\.', current_tag_text) and p_tag.name == "p": + # if p_tag_terminator: + # p_tag.find_previous("li").append(p_tag) + + + if re.search(r'^CASE NOTES|^SECTION|^"Sec\. \d+\.|^Official Commentary|^History\.|^Editor’s Note\.|^ARTICLE [IVX]+\.', current_tag_text) or p_tag.name in ['h3', 'h4', 'h5']: + + p_tag_terminator = None + ol_head = 1 ol_count = 1 num_count = 1 @@ -979,9 +1153,12 @@ def convert_paragraph_to_alphabetical_ol_tags(self): cap_alpha1 = "A" cap_alpha1_cur_tag = None sec_alpha_cur_tag1 = None + ol_head_tag = None cap_roman = "I" small_roman = "i" + if re.search(r'^Official Commentary|^History\.',current_tag_text): + ol_count += 1 print('ol tags added') @@ -1098,7 +1275,8 @@ def clean_html_and_add_cite(self): id_reg = re.search(r'Chapter (?P<title>\d+[A-Z]*)', match.strip()) else: id_reg = re.search( - r'G\.S\.\s*(?P<cite>(?P<title>\d+[A-Z]*)-(?P<sec>\d+(\.\d+)*)(-\d+)*)(?P<ol>\([a-z]*\)(\([0-9]+\))*(\([a-z]\))*)*', + r'G\.S\.\s*(?P<cite>(?P<title>\d+[A-Z]*)-(?P<sec>\d+(\.\d+)*)(-\d+)*)(?P<ol>\([a-z]*\)(\([' + r'0-9]+\))*(\([a-z]\))*)*', match.strip()) title = id_reg.group("title").strip() @@ -1150,12 +1328,13 @@ def clean_html_and_add_cite(self): target = "_blank" if id_reg.group("ol"): ol_id = re.sub(r'[() ]+', '', id_reg.group("ol")) - a_id = f'gov.nc.stat.title.{title_id1}.html#{head_id.group("h_id")}ol1{ol_id}' + a_id = f'gov.nc.stat.title.{title_id1}.html#{head_id.group("h_id")}ol1{ol_id} ' else: a_id = f'gov.nc.stat.title.{title_id1}.html#{head_id.group("h_id")}' text = re.sub(fr'\s{re.escape(match)}', - f' <cite class="octn"><a href="{a_id}" target="{target}">{match}</a></cite>', + f' <cite class="octn"><a href="{a_id}" target="{target}">{match}</a' + f'></cite>', inside_text, re.I) tag.append(text) @@ -1223,12 +1402,11 @@ def write_soup_to_file(self): soup_str = re.sub(rf'{tag}', rf'{cleansed_tag}', soup_str, re.I) print("validating") - with open(f"../../code-nc/transforms/nc/ocnc/r{self.release_number}/{self.html_file_name}", "w") as file: + with open(f"../../cic-code-nc-1/transforms/nc/ocnc/r{self.release_number}/{self.html_file_name}", "w") as file: file.write(soup_str) def create_case_note_nav(self): cap_alpha = None - for case_tag in self.soup.find_all("p", class_='casenote'): if re.search(r'^[IVX]+\. 
', case_tag.text.strip()): if case_tag.find_next("p", class_='casenote') and cap_alpha == "I": @@ -1322,12 +1500,14 @@ def create_case_note_ul(self): def create_case_note_nav1(self): cap_alpha = None + num = None cap_roman = "I" for case_tag in self.soup.find_all({"p", "h4"}): - - if case_tag.get("class") == "casenote" and case_tag.name == "p": - if re.search(rf'^{cap_roman}\. ', case_tag.text.strip()): + # if case_tag.get("class") == "casenote" and case_tag.name == "p": + if case_tag.name == "p": + if re.search(rf'^{cap_roman}\.', case_tag.text.strip()) \ + and case_tag.get("class") == "casenote": nav_list = [] nav_link = self.soup.new_tag('a') nav_link.append(case_tag.text) @@ -1339,6 +1519,26 @@ def create_case_note_nav1(self): case_tag["class"] = "casenote" cap_alpha = 'A' cap_roman = roman.toRoman(roman.fromRoman(cap_roman.upper()) + 1) + elif re.search(r'^[IVX]+\.', case_tag.text.strip()): + if case_tag.get("class") == [self.class_regex['head4']]: + + case_tag.name = "h5" + h5_rom_text = re.search(r'^(?P<h5_id>[IVX]+)\.', case_tag.text.strip()).group("h5_id") + h5_rom_id = f"{case_tag.find_previous('h4').get('id')}-{h5_rom_text}" + case_tag['id'] = h5_rom_id + else: + nav_list = [] + nav_link = self.soup.new_tag('a') + nav_link.append(case_tag.text) + case_id = re.search(r'^(?P<cid>[IVX]+)\.', case_tag.text.strip()).group("cid") + + rom_id = f"{case_tag.find_previous('h4').get('id')}-{case_id}" + nav_link["href"] = f"#{case_tag.find_previous({'h4', 'h3'}).get('id')}-{case_id}" + nav_list.append(nav_link) + case_tag.contents = nav_list + case_tag["class"] = "casenote" + cap_alpha = 'A' + cap_roman = roman.toRoman(roman.fromRoman(cap_roman.upper()) + 1) elif cap_alpha: if re.search(fr'^{cap_alpha}\.', case_tag.text.strip()): @@ -1353,17 +1553,20 @@ def create_case_note_nav1(self): case_tag.contents = nav_list case_tag["class"] = "casenote" cap_alpha = chr(ord(cap_alpha) + 1) + num = 1 - elif re.search(r'^[0-9]+\.', case_tag.text.strip()): + elif num and re.search(fr'^{num}\.', case_tag.text.strip()): nav_list = [] nav_link = self.soup.new_tag('a') nav_link.append(case_tag.text) case_id = re.search(r'^(?P<cid>[0-9]+)\.', case_tag.text.strip()).group("cid") + digit_id = f"{alpha_id}-{case_id}" nav_link["href"] = f"#{alpha_id}-{case_id}" nav_list.append(nav_link) case_tag.contents = nav_list case_tag["class"] = "casenote" + num += 1 elif case_tag.name == "h4": cap_roman = "I" @@ -1404,7 +1607,16 @@ def create_case_note_ul1(self): case_tag.wrap(digit_ul) alpha_tag.append(digit_ul) else: + digit_ul.append(case_tag) + else: + if re.search(r'^II\.', case_tag.a.text.strip()): + rom_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + case_tag.wrap(rom_ul) + else: + rom_ul.append(case_tag) + + elif case_tag.name == "h4": cap_roman = "I" @@ -1421,8 +1633,8 @@ def start_parse(self): print(start_time) self.create_page_soup() if re.search('constitution', self.html_file_name): - self.class_regex = {'head1': r'^Constitution of North Carolina|Constitution of the United States', - 'ul': r'^(Article|Preamble)', 'head2': '^ARTICLE I', + self.class_regex = {'head1': r'^Constitution of North Carolina|CONSTITUTION OF THE UNITED STATES', + 'ul': r'^(Article|Preamble)', 'head2': '^(ARTICLE|Article) I', 'head4': '^CASE NOTES', 'ol_p': r'^\(\d\)', 'junk1': '^Annotations$', 'head': '^Section added\.', 'head3': r'^§ \d|^sec\.|^Section \d', 'nav': '^Subchapter I\.|^——————————'} @@ -1449,7 +1661,6 @@ def start_parse(self): self.wrap_div_tags() self.clean_html_and_add_cite() - self.write_soup_to_file() + # 
         self.write_soup_to_file()
         print(f'finished {self.html_file_name}')
         print(datetime.now() - start_time)
-
diff --git a/html_parser/nd_html_parser.py b/html_parser/nd_html_parser.py
index 23e2dea..d5fd60e 100644
--- a/html_parser/nd_html_parser.py
+++ b/html_parser/nd_html_parser.py
@@ -226,7 +226,8 @@ def replace_tags(self):
                     and not re.search(r'^Analysis', header_tag.text.strip()):
                 header_tag.name = "li"
                 header_tag['class'] = "note"
-                note_to_decision_list.append(header_tag.text.strip())
+                header_tag_text = re.sub(r'\W+', '', header_tag.text.strip())
+                note_to_decision_list.append(header_tag_text)
 
             elif re.search(r'^Source:', header_tag.text.strip()) and header_tag.b:
                 new_tag = self.soup.new_tag("h4")
@@ -249,6 +250,7 @@ def replace_tags(self):
                 header_tag.b.clear()
 
             elif header_tag.get("class") == [self.class_regex["NTD"]]:
+                note_tag_text = re.sub(r'\W+', '', header_tag.text.strip())
                 if header_tag.text.strip() in self.head4_list:
                     header_tag.name = "h4"
                     NTD_rom_head_tag = None
@@ -265,7 +267,7 @@ def replace_tags(self):
                         cur_id_list.append(header_tag['id'])
                     header4_tag = header_tag
 
-                elif header_tag.text.strip() in note_to_decision_list:
+                elif note_tag_text in note_to_decision_list:
                     if re.search(r'^[IVX]+\.', header_tag.text.strip()):
                         header_tag.name = "h5"
                         NTD_rom_head_tag = header_tag
@@ -1062,36 +1064,60 @@ def clean_html_and_add_cite(self):
                             match.strip())
                     else:
                         id_reg = re.search(
-                            r'(?P<cite>(?P<title>\d+(\.\d+)*)-\d+(\.\d+)*-\d+(\.\d+)*)',
+                            r'(?P<cite>(?P<title>\d+(\.\d+)*)-\d+(\.\d+)*-\d+(\.\d+)*)(?P<ol>(\([a-z]\))(\(\d+\))*)*',
                             match.strip())
                     title = id_reg.group("title").strip()
                     title_id = f'{title.zfill(2)}'
                     if os.path.isfile(
-                            f"../../code-nd/transforms/nd/ocnd/r{self.release_number}/gov.nd.code.title.{title_id}.html"):
+                            f"../../cic-code-nd-1/transforms/nd/ocnd/r{self.release_number}/gov.nd.code.title.{title_id}.html"):
                         with open(
-                                f"../../code-nd/transforms/nd/ocnd/r{self.release_number}/gov.nd.code.title.{title_id}.html",
+                                f"../../cic-code-nd-1/transforms/nd/ocnd/r{self.release_number}/gov.nd.code.title.{title_id}.html",
                                 'r') as firstfile:
                             for line in firstfile:
-                                if re.search(rf'id=".+(s|c){id_reg.group("cite")}">$', line.strip()):
-                                    tag.clear()
-                                    head_id = re.search(rf'id="(?P<h_id>.+(s|c){id_reg.group("cite")})">$', line.strip())
+                                if id_reg.group("ol"):
+                                    ol_id = re.sub(r'[() ]+', '', id_reg.group("ol"))
+                                    cite_id = f'{id_reg.group("cite")}ol1{ol_id}'
 
-                                    if title_id == self.title_id:
-                                        target = "_self"
-                                        a_id = f'#{head_id.group("h_id")}'
-                                    else:
-                                        target = "_blank"
-                                        a_id = f'gov.nd.stat.title.{title_id}.html#{head_id.group("h_id")}'
+                                    if re.search(rf'id=".+{cite_id}">$', line.strip()):
+                                        li_id = re.search(rf'id="(?P<l_id>.+{cite_id})">$',
+                                                          line.strip()).group("l_id")
+
+                                        if title_id == self.title.zfill(2):
+                                            target = "_self"
+                                            a_id = f'#{li_id}'
+                                        else:
+                                            target = "_blank"
+                                            a_id = f'gov.nd.stat.title.{title_id}.html#{li_id}'
+
+                                        tag.clear()
+                                        text = re.sub(fr'\s{re.escape(match)}',
+                                                      f' <cite class="ocnd"><a href="{a_id}" target="{target}">{match}</a></cite>',
+                                                      inside_text,
+                                                      flags=re.I)
+                                        tag.append(text)
 
-                                    text = re.sub(fr'\s{re.escape(match)}',
-                                                  f' <cite class="ocnd"><a href="{a_id}" target="{target}">{match}</a></cite>',
-                                                  inside_text,
-                                                  re.I)
-                                    tag.append(text)
+
+                                else:
+                                    if re.search(rf'id=".+(s|c){id_reg.group("cite")}">$', line.strip()):
+                                        tag.clear()
+                                        head_id = re.search(rf'id="(?P<h_id>.+(s|c){id_reg.group("cite")})">$', line.strip())
+
+                                        if title_id == self.title_id:
+                                            target = "_self"
+                                            a_id = f'#{head_id.group("h_id")}'
+                                        else:
+                                            target = "_blank"
+                                            a_id = f'gov.nd.stat.title.{title_id}.html#{head_id.group("h_id")}'
+
+                                        text = re.sub(fr'\s{re.escape(match)}',
+                                                      f' <cite class="ocnd"><a href="{a_id}" target="{target}">{match}</a></cite>',
+                                                      inside_text,
+                                                      flags=re.I)
+                                        tag.append(text)
 
                 for match in set(
                         x for x in re.findall(r'N\.D\. LEXIS \d+',
@@ -1133,8 +1159,6 @@ def clean_html_and_add_cite(self):
         if len(tag.get_text(strip=True)) == 0:
             tag.extract()
 
-
-
     def write_soup_to_file(self):
         """
        - add the space before self closing meta tags
@@ -1149,8 +1173,12 @@ def write_soup_to_file(self):
            soup_str = re.sub(rf'{tag}', rf'{cleansed_tag}', soup_str, re.I)
 
        print("validating")
-        with open(f"../../code-nd/transforms/nd/ocnd/r{self.release_number}/{self.html_file_name}", "w") as file:
-            file.write(soup_str.replace('& ', '&amp; '))
+        with open(f"../../cic-code-nd-1/transforms/nd/ocnd/r{self.release_number}/{self.html_file_name}", "w") as file:
+            soup_str = re.sub(r'&(?!amp;)', '&amp;', soup_str)
+            soup_str = re.sub('<br/>', '<br />', soup_str)
+            soup_str = re.sub(r'<span class.*?>\s*</span>', '', soup_str)
+            file.write(soup_str)
+            # file.write(soup_str.replace('& ', '&amp; '))
 
     def create_Notes_to_Decisions(self):
         note_to_dec_ids: list = []
@@ -1448,4 +1476,5 @@ def start_parse(self):
         self.clean_html_and_add_cite()
         self.write_soup_to_file()
         print(f'finished {self.html_file_name}')
+        print(datetime.now())
         print(datetime.now() - start_time)
diff --git a/html_parser/parser_base.py b/html_parser/parser_base.py
index 26cdebd..48af91d 100644
--- a/html_parser/parser_base.py
+++ b/html_parser/parser_base.py
@@ -38,7 +38,8 @@ def start(self, state_key):
         """
         self.cpu_count = multiprocessing.cpu_count()
         print(self.cpu_count)
-        input_files_list = listdir(f'../transforms/{state_key.lower()}/oc{state_key.lower()}/r{self.release_number}/raw/')
+        input_files_list = listdir(
+            f'../transforms/{state_key.lower()}/oc{state_key.lower()}/r{self.release_number}/raw/')
         self.run_with_multiprocessing_pool(input_files_list, state_key)
 
     def run_with_multiprocessing_pool(self, files_list, state_key):
@@ -47,10 +48,9 @@ def run_with_multiprocessing_pool(self, files_list, state_key):
        - call wrapper function with one file name at a time
        """
        with multiprocessing.Pool(self.cpu_count) as pool:
-                pool.map_async(self.wrapper_function, files_list)
-                pool.close()
-                pool.join()
-
+            pool.map_async(self.wrapper_function, files_list)
+            pool.close()
+            pool.join()
 
     def wrapper_function(self, files_list):
         """
diff --git a/html_parser/tn_html_parser.py b/html_parser/tn_html_parser.py
index ea69109..0391031 100644
--- a/html_parser/tn_html_parser.py
+++ b/html_parser/tn_html_parser.py
@@ -175,7 +175,8 @@ def replace_tags(self):
                     p_tag['id'] = f'{cleansed_chap}{chapter.zfill(2)}'
                     p_tag['class'] = 'parth2'
                 elif re.search('^subchapter', p_tag.get_text().strip(), re.I) and \
-                        (chap_id := p_tag.findPrevious(lambda tag: tag.name == 'h2' and re.search('^Chapter', tag.get_text()))):
+                        (chap_id := p_tag.findPrevious(
+                            lambda tag: tag.name == 'h2' and re.search('^Chapter', tag.get_text()))):
                     p_tag.name = 'h3'
                     p_tag['id'] = f'{chap_id["id"]}sc{chapter.zfill(2)}'
                     p_tag['class'] = 'subchapterh3'
@@ -196,7 +197,8 @@ def replace_tags(self):
                 elif section_match := re.search(r'^(?P<sec>\w+)\.', p_tag.get_text()):
                     p_tag.name = 'h3'
                     chap_tag = p_tag.find_previous(lambda tag: tag.name == 'h2'
                                                    and re.search(r'(Part|chapter) \w+', tag.get_text(),
                                                                  re.I))
                     if re.search(r'chapter \w+', chap_tag.get_text(), re.I):
                         chap_id = re.search(r'chapter
(?P<chap_id>\w+)', chap_tag.get_text(), re.I).group( @@ -223,9 +225,11 @@ def replace_tags(self): lambda tag: re.search(r'h\d+\.', tag.name) and tag.name != 'h5' and tag.has_attr('class') and tag['class'] not in self.headers_class_dict.values()) if re.search(r'^\d+\.', p_tag.get_text()): - chap_id = p_tag.find_previous_sibling(lambda tag: re.search('^NOTES TO DECISIONS|Decisions Under Prior Law|^Decisions', tag.get_text()) - and tag.name != 'h5' and re.search(r'h\d', - tag.name)) + chap_id = p_tag.find_previous_sibling( + lambda tag: re.search('^NOTES TO DECISIONS|Decisions Under Prior Law|^Decisions', + tag.get_text()) + and tag.name != 'h5' and re.search(r'h\d', + tag.name)) elif part_tag and part_tag.has_attr('class') and part_tag['class'] == 'part_header': @@ -322,7 +326,7 @@ def convert_paragraph_to_alphabetical_ol_tags(self): if not re.search(r'\w+', p_tag.get_text()): continue - if chap_id := p_tag.findPrevious(lambda tag: tag.name in ['h2', 'h3','h4']): + if chap_id := p_tag.findPrevious(lambda tag: tag.name in ['h2', 'h3', 'h4']): sec_id = chap_id["id"] if sec_id != prev_chap_id: ol_count = 0 @@ -333,8 +337,8 @@ def convert_paragraph_to_alphabetical_ol_tags(self): if re.search('Except as otherwise provided in this subdivision', data_str, re.I): print() - - if re.search(rf'^\({main_sec_alpha}\)', data_str) and not (previous_roman_li and previous_roman_inner_alpha_li): + if re.search(rf'^\({main_sec_alpha}\)', data_str) and not ( + previous_roman_li and previous_roman_inner_alpha_li): cap_alpha = 'A' small_roman_inner_alpha = 'a' small_roman_inner_alpha_num = 1 @@ -400,7 +404,8 @@ def convert_paragraph_to_alphabetical_ol_tags(self): previous_roman_inner_alpha_li = None previous_roman_inner_alpha_num_li = None small_roman_inner_alpha = 'a' - li_roman = re.search(r'^\(\w\)\s*\(\d+\)\s*\([A-Z]+\)\s*\((?P<roman>\w+)\)', data_str).group( + li_roman = re.search(r'^\(\w\)\s*\(\d+\)\s*\([A-Z]+\)\s*\((?P<roman>\w+)\)', + data_str).group( 'roman') new_li = self.soup.new_tag('p') new_li.string = re.sub(r'^\(\w\)\s*\(\d+\)\s*\([A-Z]+\)\s*\(\w+\)', '', data_str) @@ -418,7 +423,8 @@ def convert_paragraph_to_alphabetical_ol_tags(self): previous_num_li.append(p_tag) continue - if re.search(rf'^\({ol_head}\)', p_tag.text.strip()) and not (previous_roman_inner_alpha_li and previous_roman_inner_alpha_num_li): + if re.search(rf'^\({ol_head}\)', p_tag.text.strip()) and not ( + previous_roman_inner_alpha_li and previous_roman_inner_alpha_num_li): cap_alpha = "A" small_roman = 'i' small_roman_inner_alpha = 'a' @@ -472,7 +478,8 @@ def convert_paragraph_to_alphabetical_ol_tags(self): previous_roman_inner_alpha_li = None previous_roman_inner_alpha_num_li = None small_roman_inner_alpha = 'a' - li_roman = re.search(r'^\(\d+\)\s*\([A-Z]+\)\s*\((?P<roman>\w+)\)', data_str).group('roman') + li_roman = re.search(r'^\(\d+\)\s*\([A-Z]+\)\s*\((?P<roman>\w+)\)', data_str).group( + 'roman') new_li = self.soup.new_tag('p') new_li.string = re.sub(r'^\(\d+\)\s*\([A-Z]+\)\s*\(\w+\)', '', data_str) p_tag.string.replace_with(new_li) @@ -496,7 +503,8 @@ def convert_paragraph_to_alphabetical_ol_tags(self): p_tag.decompose() continue elif previous_num_li: - if cap_alpha_match := re.search(fr'^\({cap_alpha}+\)|(^\([A-Z]+(\.\d+)?\))', p_tag.text.strip()): + if cap_alpha_match := re.search(fr'^\({cap_alpha}+\)|(^\([A-Z]+(\.\d+)?\))', + p_tag.text.strip()): small_roman = 'i' small_roman_inner_alpha = 'a' small_roman_inner_alpha_num = 1 @@ -569,11 +577,12 @@ def convert_paragraph_to_alphabetical_ol_tags(self): p_tag.name = 'li' 
p_tag.wrap(inner_ol) roman_ol = self.soup.new_tag("ol", type="I") - small_roman_id = f'{cap_alpha_li_id}{li_roman}' #title 40 + small_roman_id = f'{cap_alpha_li_id}{li_roman}' # title 40 p_tag['id'] = small_roman_id previous_roman_li = p_tag small_letter_inner_ol = self.soup.new_tag("ol", type="a") - if re.search(f'^\([a-z]+\)\s*\({small_roman_inner_alpha}\)', p_tag.get_text().strip()): + if re.search(f'^\([a-z]+\)\s*\({small_roman_inner_alpha}\)', + p_tag.get_text().strip()): small_roman_inner_alpha_num = 1 previous_roman_inner_alpha_num_li = None new_li = self.soup.new_tag('p') @@ -618,8 +627,9 @@ def convert_paragraph_to_alphabetical_ol_tags(self): else: previous_inner_li.insert(len(previous_num_li.contents), p_tag) - elif re.search(r'^Acts\s|^Code\s|^T\.C\.A|^Article\s', p_tag.get_text().strip(), re.I) or (p_tag.find_previous_sibling() - and re.search(r'^\d+-\d+-\d+', p_tag.find_previous_sibling().get_text())): + elif re.search(r'^Acts\s|^Code\s|^T\.C\.A|^Article\s', p_tag.get_text().strip(), re.I) or ( + p_tag.find_previous_sibling() + and re.search(r'^\d+-\d+-\d+', p_tag.find_previous_sibling().get_text())): set_string = False ol_head = 1 main_sec_alpha = 'a' @@ -730,15 +740,16 @@ def remove_or_replace_class_names(self): tag.unwrap() print('removed class names') - def create_notes_decision_to_nav(self): - - for notes_head in self.soup.find_all(lambda tag: tag.name == 'h4' and re.search('DecisionsUnderPriorLaw|Decisions', tag.get('id', ''))): - if notes_head.find_next_sibling('p') and re.search('^\d+\.', notes_head.find_next_sibling('p').get_text().strip()): + for notes_head in self.soup.find_all( + lambda tag: tag.name == 'h4' and re.search('DecisionsUnderPriorLaw|Decisions', tag.get('id', ''))): + if notes_head.find_next_sibling('p') and re.search('^\d+\.', + notes_head.find_next_sibling('p').get_text().strip()): nav_tag = self.soup.new_tag('nav') new_ul = self.soup.new_tag("ul", Class="leaders") - for headers_text in [s for s in notes_head.find_next_sibling('p').get_text().splitlines() if re.search('^\d', s)]: + for headers_text in [s for s in notes_head.find_next_sibling('p').get_text().splitlines() if + re.search('^\d', s)]: new_li = self.soup.new_tag('li') header_id = re.sub(r'[\s\'—“”]+', '', f'#{notes_head.get("id")}-{headers_text.strip()}') new_ul.append(new_li) @@ -748,12 +759,14 @@ def create_notes_decision_to_nav(self): nav_tag.append(new_ul) notes_head.find_next_sibling('p').replace_with(nav_tag) - - for notes_head in self.soup.find_all(lambda tag: tag.name == 'h4' and re.search('NOTESTODECISIONS', tag.get('id', ''))): - if notes_head.find_next_sibling('p') and re.search('^\d+\.', notes_head.find_next_sibling('p').get_text().strip()): + for notes_head in self.soup.find_all( + lambda tag: tag.name == 'h4' and re.search('NOTESTODECISIONS', tag.get('id', ''))): + if notes_head.find_next_sibling('p') and re.search('^\d+\.', + notes_head.find_next_sibling('p').get_text().strip()): nav_tag = self.soup.new_tag('nav') new_ul = self.soup.new_tag("ul", Class="leaders") - for headers_text in [s for s in notes_head.find_next_sibling('p').get_text().splitlines() if re.search('^\d', s)]: + for headers_text in [s for s in notes_head.find_next_sibling('p').get_text().splitlines() if + re.search('^\d', s)]: new_li = self.soup.new_tag('li') header_id = re.sub(r'[\s\'—“”]+', '', f'#{notes_head.get("id")}-{headers_text.strip()}') new_ul.append(new_li) @@ -763,7 +776,6 @@ def create_notes_decision_to_nav(self): nav_tag.append(new_ul) notes_head.find_next_sibling('p').replace_with(nav_tag) - 
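The add_anchor_tags hunks that follow mostly re-indent one repeated pattern, spelled out in that method's docstring: build a reference link from the id of the header a nav entry points to, wrap the entry's text in an anchor with that href, and mirror the id in an aria-describedby attribute plus a numbered li id. A minimal sketch of that pattern with BeautifulSoup (lxml assumed installed); the header id t01c02 and the -cnav01 counter are illustrative values, not taken from this patch:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup('<nav><ul><li>Chapter 2. Elections.</li></ul></nav>', 'lxml')
    li = soup.find('li')
    header_id = 't01c02'                             # illustrative id of the heading this entry refers to
    anchor = soup.new_tag('a', href=f'#{header_id}')
    anchor.attrs['aria-describedby'] = header_id     # same value as the reference link, for accessibility
    anchor.string = li.text
    li['id'] = f'{header_id}-cnav01'                 # zero-padded nav counter, as in the parsers
    li.string.replace_with(anchor)                   # the li text now lives inside the anchor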
def add_anchor_tags(self): """ - for each nav tag in html @@ -772,7 +784,6 @@ def add_anchor_tags(self): - add a property called 'aria-describedby' with value same as previously built reference link """ - self.soup = BeautifulSoup(self.soup.prettify(formatter=None), features='lxml') [tag.unwrap() for tag in self.soup.find_all('key')] for ul in self.soup.findAll('nav'): @@ -784,10 +795,10 @@ def add_anchor_tags(self): li_num += 1 if st_num_reg := re.search(r'^Subtitle\s(?P<st_num>\d+)', li.get_text().strip()): header_id = f'#t{self.title.zfill(2)}st{st_num_reg.group("st_num").zfill(2)}' - header_id = re.sub(r'\s+','',header_id) + header_id = re.sub(r'\s+', '', header_id) anchor = self.soup.new_tag('a', href=header_id) cleansed_header_id = header_id.strip("#") - cleansed_header_id = re.sub(r'\s+','',cleansed_header_id) + cleansed_header_id = re.sub(r'\s+', '', cleansed_header_id) anchor.attrs['aria-describedby'] = cleansed_header_id li['id'] = f'{cleansed_header_id}-cnav{str(li_num).zfill(2)}' anchor.string = li.text @@ -804,7 +815,7 @@ def add_anchor_tags(self): header_id = re.sub(r'\s+', '', header_id) anchor = self.soup.new_tag('a', href=header_id) cleansed_header_id = header_id.strip("#") - cleansed_header_id = re.sub(r'\s+','',cleansed_header_id) + cleansed_header_id = re.sub(r'\s+', '', cleansed_header_id) anchor.attrs['aria-describedby'] = cleansed_header_id li['id'] = f'{cleansed_header_id}-cnav{str(li_num).zfill(2)}' anchor.string = li.text @@ -817,12 +828,13 @@ def add_anchor_tags(self): for li in ul.findAll('li'): li_num += 1 if chap_no := re.search(r'^Subchapter\s(?P<sub_chap_num>\d+)', li.get_text().strip()): - previous_head = ul.find_previous(lambda tag: tag.name == 'h2' and re.search('^Chapter', tag.get_text().strip())) + previous_head = ul.find_previous( + lambda tag: tag.name == 'h2' and re.search('^Chapter', tag.get_text().strip())) header_id = f'#{previous_head["id"]}sc{chap_no.group("sub_chap_num").zfill(2)}' header_id = re.sub(r'\s+', '', header_id) anchor = self.soup.new_tag('a', href=header_id) cleansed_header_id = header_id.strip("#") - cleansed_header_id = re.sub(r'\s+','',cleansed_header_id) + cleansed_header_id = re.sub(r'\s+', '', cleansed_header_id) anchor.attrs['aria-describedby'] = cleansed_header_id li['id'] = f'{cleansed_header_id}-cnav{str(li_num).zfill(2)}' anchor.string = li.text @@ -854,7 +866,7 @@ def add_anchor_tags(self): anchor = self.soup.new_tag('a', href=header_id) cleansed_header_id = header_id.strip("#") - cleansed_header_id = re.sub(r'\s+','',cleansed_header_id) + cleansed_header_id = re.sub(r'\s+', '', cleansed_header_id) anchor.attrs['aria-describedby'] = cleansed_header_id li['id'] = f'{cleansed_header_id}-snav{str(li_num).zfill(2)}' anchor.string = li.text @@ -869,12 +881,13 @@ def add_anchor_tags(self): for li in ul.findAll('li'): li_num += 1 if chap_no := re.search(r'^Part\s(?P<sub_chap_num>\d+)', li.get_text().strip()): - previous_head = ul.find_previous(lambda tag: tag.name == 'h2' and re.search('^Chapter', tag.get_text().strip())) + previous_head = ul.find_previous( + lambda tag: tag.name == 'h2' and re.search('^Chapter', tag.get_text().strip())) header_id = f'#{previous_head["id"]}p{chap_no.group("sub_chap_num").zfill(2)}' header_id = re.sub(r'\s+', '', header_id) anchor = self.soup.new_tag('a', href=header_id) cleansed_header_id = header_id.strip("#") - cleansed_header_id = re.sub(r'\s+','',cleansed_header_id) + cleansed_header_id = re.sub(r'\s+', '', cleansed_header_id) anchor.attrs['aria-describedby'] = cleansed_header_id li['id'] = 
f'{cleansed_header_id}-cnav{str(li_num).zfill(2)}' anchor.string = li.text @@ -887,64 +900,63 @@ def add_anchor_tags(self): for ul_tag in self.soup.findAll('nav'): li_num = 0 for li_tag in ul_tag.findAll('li'): - if not li_tag.a: - if re.search('^Part \d', li_tag.get_text().strip()): - li_num += 1 - if chap_no := re.search(r'^Part\s(?P<sub_chap_num>\d+)', li_tag.get_text().strip()): - previous_head = ul_tag.find_previous( - lambda tag: tag.name == 'h2' and re.search('^Chapter', tag.get_text().strip())) - header_id = f'#{previous_head["id"]}p{chap_no.group("sub_chap_num").zfill(2)}' - header_id = re.sub(r'\s+', '', header_id) - anchor = self.soup.new_tag('a', href=header_id) - cleansed_header_id = header_id.strip("#") - cleansed_header_id = re.sub(r'\s+', '', cleansed_header_id) - anchor.attrs['aria-describedby'] = cleansed_header_id - li_tag['id'] = f'{cleansed_header_id}-cnav{str(li_num).zfill(2)}' - anchor.string = li_tag.text - if li_tag.string: - li_tag.string.replace_with(anchor) - else: - li_tag.contents = [] - li_tag.append(anchor) - - elif re.search('^\d+-\d+\.', li_tag.get_text().strip()): - li_num += 1 - if chap_no := re.search(r'^(?P<sub_chap_num>\d+-\d+)\.', li_tag.get_text().strip()): - previous_head = ul_tag.find_previous( - lambda tag: tag.name == 'h3' and re.search('^\d+\.', tag.get_text().strip())) - header_id = f'#{previous_head["id"]}s{chap_no.group("sub_chap_num").zfill(2)}' - header_id = re.sub(r'\s+|\'|[\s\']', '', header_id) - anchor = self.soup.new_tag('a', href=header_id) - cleansed_header_id = header_id.strip("#") - cleansed_header_id = re.sub(r'\s+|\'|[\s\']', '', cleansed_header_id) - anchor.attrs['aria-describedby'] = cleansed_header_id - li_tag['id'] = f'{cleansed_header_id}-cnav{str(li_num).zfill(2)}' - anchor.string = li_tag.text - if li_tag.string: - li_tag.string.replace_with(anchor) - else: - li_tag.contents = [] - li_tag.append(anchor) - - elif re.search('^\d', li_tag.get_text().strip()): - li_num += 1 - if chap_no := re.search(r'^(?P<sub_chap_num>\d+)', li_tag.get_text().strip()): - previous_head = ul_tag.find_previous( - lambda tag: tag.name == 'h2' and re.search('^Part', tag.get_text().strip())) - header_id = f'#{previous_head["id"]}s{chap_no.group("sub_chap_num").zfill(2)}' - header_id = re.sub(r'\s+|\'|[\s\']', '', header_id) - anchor = self.soup.new_tag('a', href=header_id) - cleansed_header_id = header_id.strip("#") - cleansed_header_id = re.sub(r'\s+|\'|[\s\']', '', cleansed_header_id) - anchor.attrs['aria-describedby'] = cleansed_header_id - li_tag['id'] = f'{cleansed_header_id}-cnav{str(li_num).zfill(2)}' - anchor.string = li_tag.text - if li_tag.string: - li_tag.string.replace_with(anchor) - else: - li_tag.contents = [] - li_tag.append(anchor) + if not li_tag.a: + if re.search('^Part \d', li_tag.get_text().strip()): + li_num += 1 + if chap_no := re.search(r'^Part\s(?P<sub_chap_num>\d+)', li_tag.get_text().strip()): + previous_head = ul_tag.find_previous( + lambda tag: tag.name == 'h2' and re.search('^Chapter', tag.get_text().strip())) + header_id = f'#{previous_head["id"]}p{chap_no.group("sub_chap_num").zfill(2)}' + header_id = re.sub(r'\s+', '', header_id) + anchor = self.soup.new_tag('a', href=header_id) + cleansed_header_id = header_id.strip("#") + cleansed_header_id = re.sub(r'\s+', '', cleansed_header_id) + anchor.attrs['aria-describedby'] = cleansed_header_id + li_tag['id'] = f'{cleansed_header_id}-cnav{str(li_num).zfill(2)}' + anchor.string = li_tag.text + if li_tag.string: + li_tag.string.replace_with(anchor) + else: + li_tag.contents 
= [] + li_tag.append(anchor) + + elif re.search('^\d+-\d+\.', li_tag.get_text().strip()): + li_num += 1 + if chap_no := re.search(r'^(?P<sub_chap_num>\d+-\d+)\.', li_tag.get_text().strip()): + previous_head = ul_tag.find_previous( + lambda tag: tag.name == 'h3' and re.search('^\d+\.', tag.get_text().strip())) + header_id = f'#{previous_head["id"]}s{chap_no.group("sub_chap_num").zfill(2)}' + header_id = re.sub(r'\s+|\'|[\s\']', '', header_id) + anchor = self.soup.new_tag('a', href=header_id) + cleansed_header_id = header_id.strip("#") + cleansed_header_id = re.sub(r'\s+|\'|[\s\']', '', cleansed_header_id) + anchor.attrs['aria-describedby'] = cleansed_header_id + li_tag['id'] = f'{cleansed_header_id}-cnav{str(li_num).zfill(2)}' + anchor.string = li_tag.text + if li_tag.string: + li_tag.string.replace_with(anchor) + else: + li_tag.contents = [] + li_tag.append(anchor) + elif re.search('^\d', li_tag.get_text().strip()): + li_num += 1 + if chap_no := re.search(r'^(?P<sub_chap_num>\d+)', li_tag.get_text().strip()): + previous_head = ul_tag.find_previous( + lambda tag: tag.name == 'h2' and re.search('^Part', tag.get_text().strip())) + header_id = f'#{previous_head["id"]}s{chap_no.group("sub_chap_num").zfill(2)}' + header_id = re.sub(r'\s+|\'|[\s\']', '', header_id) + anchor = self.soup.new_tag('a', href=header_id) + cleansed_header_id = header_id.strip("#") + cleansed_header_id = re.sub(r'\s+|\'|[\s\']', '', cleansed_header_id) + anchor.attrs['aria-describedby'] = cleansed_header_id + li_tag['id'] = f'{cleansed_header_id}-cnav{str(li_num).zfill(2)}' + anchor.string = li_tag.text + if li_tag.string: + li_tag.string.replace_with(anchor) + else: + li_tag.contents = [] + li_tag.append(anchor) print('added anchor tags') @@ -1002,7 +1014,7 @@ def wrap_div_tags(self): new_sub_sec_div.append(inner_tag) next_tag = inner_next_tag if not inner_next_tag or inner_next_tag.name in ['h3', - 'h2'] or inner_next_tag.name == 'h4' \ + 'h2'] or inner_next_tag.name == 'h4' \ and inner_next_tag.get('class'): break inner_tag = inner_next_tag @@ -1232,7 +1244,7 @@ def add_anchor_constitution(self): if roman_match := re.search(r'^Article (\w+)', li.get_text(), re.I): article_num = roman_match.group(1) header_id = f'{self.title}-a{article_num.zfill(2)}' - header_id = re.sub(r'\s+','',header_id) + header_id = re.sub(r'\s+', '', header_id) anchor = self.soup.new_tag('a', href=f'#{header_id}') anchor.string = li.get_text() anchor.attrs['aria-describedby'] = header_id @@ -1267,8 +1279,10 @@ def add_anchor_constitution(self): li.append(anchor) elif re.search(r'^§ \d+\.|^sec\.', nav.get_text(), re.I): for li in nav.ul.findAll('li'): - if roman_match := re.search(r'^§ (?P<sec_num1>\w+)\.|^Sec\.\s*(?P<sec_num>\w+)', li.get_text(), re.I): - section_num = roman_match.group('sec_num') if roman_match.group('sec_num') else roman_match.group('sec_num1') + if roman_match := re.search(r'^§ (?P<sec_num1>\w+)\.|^Sec\.\s*(?P<sec_num>\w+)', li.get_text(), + re.I): + section_num = roman_match.group('sec_num') if roman_match.group( + 'sec_num') else roman_match.group('sec_num1') parent = nav.find_previous_sibling(lambda tag: tag.name == 'h2' and tag.get('id')) header_id = f'{parent["id"]}s{section_num.zfill(2)}' anchor = self.soup.new_tag('a', href=f'#{header_id}') @@ -1295,17 +1309,16 @@ def add_anchor_constitution(self): li.contents = [] li.append(anchor) - def note_to_decision_nav(self): - innr_ul = self.soup.new_tag("ul",Class="leaders") + innr_ul = self.soup.new_tag("ul", Class="leaders") for nd_tag in self.soup.findAll("li"): - if 
re.search(r'^\d+\.\s*—',nd_tag.text.strip()): + if re.search(r'^\d+\.\s*—', nd_tag.text.strip()): - if re.search(r'^\d+\.\s*\w+',nd_tag.find_previous("li").text.strip()): + if re.search(r'^\d+\.\s*\w+', nd_tag.find_previous("li").text.strip()): prev_tag = nd_tag.find_previous_sibling("li") - innr_ul = self.soup.new_tag("ul",Class="leaders") + innr_ul = self.soup.new_tag("ul", Class="leaders") nd_tag.wrap(innr_ul) prev_tag.append(innr_ul) else: @@ -1325,7 +1338,6 @@ def note_to_decision_nav(self): nd_tag.wrap(innr_ul_tag1) nd_tag.find_previous("li").append(innr_ul_tag1) - if re.match(r'^(\d+\.\s*—\s*—\s*—\s*[a-zA-Z]+)', nd_tag.text.strip()) and nd_tag.name == "li": if re.match(r'^(\d+\.\s*—\s*—\s*—\s*[a-zA-Z]+)', @@ -1337,7 +1349,6 @@ def note_to_decision_nav(self): nd_tag.wrap(innr_ul_tag2) nd_tag.find_previous("li").append(innr_ul_tag2) - def start_parse(self): """ - set the values to instance variables @@ -1376,4 +1387,3 @@ def start_parse(self): self.write_soup_to_file() print(f'finished {self.html_file_name}') print(datetime.now() - start_time) - diff --git a/html_parser/va_html_parser.py b/html_parser/va_html_parser.py index 2018417..91687d4 100644 --- a/html_parser/va_html_parser.py +++ b/html_parser/va_html_parser.py @@ -5,6 +5,7 @@ - this method based on the file type(constitution files or title files) decides which methods to run """ import collections +import os from bs4 import BeautifulSoup, Doctype, element import re @@ -15,9 +16,13 @@ class VAParseHtml(ParserBase): def __init__(self, input_file_name): super().__init__() - self.class_regex = {'ul': '^\d+\-\d+\.\s*|^\d+\.\d+\.|^\d+\.\d+[A-Z]*-\d+\.', 'head2': '^Chapter \d+\.', 'head1': '^Title|^The Constitution of the United States of America', - 'head3': r'^§\s\d+(\.\d+)*[A-Z]*\-\d+\.\s*','junk': '^Statute text','article':'——————————', 'head4': '^CASE NOTES','ol': r'^A\.\s', \ - 'head':'^§§\s*\d+-\d+\s*through\s*\d+-\d+\.|^§§+\s(?P<sec_id>\d+.\d+(-\d+)*)\.*\s*|^Part \d+\.'} + self.class_regex = {'ul': r'^(Chapter|PART) \d+\.|^Chap.|^\d.|^§', + 'head1': '^Title|^The Constitution of the United States of America', + 'head2': r'^Chapter \d+\.|^PART 1\.', + 'head3': r'^§\s\d+(\.\d+)*[A-Z]*\-\d+\.\s*|^§', 'junk': '^Text|^Statute text', + 'article': '——————————', + 'head4': '^CASE NOTES', 'ol': r'^A\.\s', + 'head': r'^§§\s*\d+-\d+\s*through\s*\d+-\d+\.|^§§+\s(?P<sec_id>\d+.\d+(-\d+)*)\.*\s*|^Part \d+\.'} self.title_id = None self.soup = None @@ -37,7 +42,6 @@ def __init__(self, input_file_name): self.start_parse() - def create_page_soup(self): """ @@ -71,7 +75,6 @@ def generate_class_name(self): print(self.class_regex) print('updated class dict') - def remove_junk(self): """ - Delete the junk tags (empty tags and unwanted meta tags) @@ -85,15 +88,15 @@ def remove_junk(self): junk_tag.decompose() [text_junk.decompose() for text_junk in self.soup.find_all("p", class_=self.class_regex["junk"])] - [text_junk.decompose() for text_junk in self.soup.find_all("p", class_=self.class_regex["article"]) if re.search('^——————————',text_junk.text.strip())] + [text_junk.decompose() for text_junk in self.soup.find_all("p", class_=self.class_regex["article"]) if + re.search('^——————————', text_junk.text.strip())] for text_junk in self.soup.find_all("p"): if len(text_junk.get_text(strip=True)) == 0 and not text_junk.get("class") == [self.class_regex["ul"]]: text_junk.decompose() - if title := re.search(r'title\s(?P<title>\d+)', - self.soup.find('p', class_=self.class_regex['head1']).get_text(), re.I): + self.soup.find('p', 
class_=self.class_regex['head1']).get_text(), re.I): self.title = title.group('title') else: @@ -109,12 +112,12 @@ def remove_junk(self): self.soup.head.append(new_meta) print('junk removed') - def recreate_tag(self): for p_tag in self.soup.find_all(): if re.search('constitution', self.html_file_name): if p_tag.get("class") == [self.class_regex["casenav"]]: - if p_tag.br and re.search(r'^[IA1]\.', p_tag.text.strip()) and re.search(r'^CASE NOTES', p_tag.find_previous().text.strip()): + if p_tag.br and re.search(r'^[IA1]\.', p_tag.text.strip()) and re.search(r'^CASE NOTES', + p_tag.find_previous().text.strip()): p_tag_text = p_tag.text.strip() p_tag.clear() rept_tag = re.split('\n', p_tag_text) @@ -129,7 +132,7 @@ def recreate_tag(self): if p_tag.get("class") == [self.class_regex["article"]]: if re.search(r'^Article\s*\d+\.|^Subtitle\s*[IVX]+\.|^Part\s*[A-Z]+', p_tag.text.strip()): - p_tag["class"] = "navhead" + p_tag["class"] = "navhead" if p_tag.get("class") == [self.class_regex["ul"]] or p_tag.get("class") == [self.class_regex["ol"]]: if re.search(r'^(\d+(\.\d+)*[A-Z]*-\d{1,4}(\.\d+)*\..+\.\s*){1}', p_tag.text.strip()): @@ -148,7 +151,8 @@ def recreate_tag(self): p_tag.unwrap() if p_tag.get("class") == [self.class_regex["ol"]]: - if p_tag.br and re.search(r'^[IA1]\.', p_tag.text.strip()) and re.search(r'^CASE NOTES', p_tag.find_previous().text.strip()): + if p_tag.br and re.search(r'^[IA1]\.', p_tag.text.strip()) and re.search(r'^CASE NOTES', + p_tag.find_previous().text.strip()): p_tag_text = p_tag.text.strip() p_tag.clear() rept_tag = re.split('\n', p_tag_text) @@ -159,103 +163,133 @@ def recreate_tag(self): new_tag["class"] = "casenote" p_tag.unwrap() + # elif re.search(r'^[IVX]+\.|^[A-Z]\.|^[0-9]\. ', p_tag.text.strip()): + # p_tag["class"] = "casenote" def replace_tags(self): cur_id_list = [] cur_head_list = [] + head4_list = ['Cross references.', 'History.', 'Editor’s note.', 'CIRCUIT COURT OPINIONS'] + ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + for header_tag in self.soup.body.find_all(): if re.search('constitution', self.html_file_name): if re.search('constitution\.va', self.html_file_name): - self.title_id = 'constitution-va' + self.title_id = 'constitution-va' elif re.search('constitution\.us', self.html_file_name): - self.title_id = 'constitution-us' - + self.title_id = 'constitution-us' if header_tag.get("class") == [self.class_regex["head1"]]: - if re.search(r'^The Constitution of the United States|^Constitution of Virginia',header_tag.text.strip()): + if re.search(r'^THE CONSTITUTION OF THE UNITED STATES OF AMERICA|^Constitution of Virginia', + header_tag.text.strip()): header_tag.name = "h1" header_tag.wrap(self.soup.new_tag("nav")) - header_tag['id'] = self.title_id - elif re.search(r'^ARTICLE [IVX]+\.*',header_tag.text.strip()): - header_tag.name = "h2" - article_id = re.search(r'^ARTICLE (?P<ar_id>[IVX]+)\.*', header_tag.text.strip()).group('ar_id') - header_tag[ - 'id'] = f"{header_tag.find_previous('h1').get('id')}a{article_id.zfill(2)}" - elif re.search(r'^SCHEDULE',header_tag.text.strip()): - header_tag.name = "h2" - article_id = re.sub(r'[\W]','',header_tag.text.strip()).lower() - header_tag[ - 'id'] = f"{header_tag.find_previous('h1').get('id')}a{article_id}" + header_tag['id'] = self.title_id - if header_tag.get("class") == [self.class_regex["amdhead"]]: - if re.search(r'^AMENDMENTS TO THE CONSTITUTION', header_tag.text.strip()): - header_tag.name = "h2" - amd_id = re.sub(r'[\W]','',header_tag.text.strip()).lower() + + elif header_tag.get("class") 
== [self.class_regex["head4"]]: + if re.search(r'^CASE NOTES', header_tag.text.strip()): + header_tag.name = "h4" + amd_id = re.sub(r'[\W]+', '', header_tag.text.strip()).lower() header_tag[ - 'id'] = f"{header_tag.find_previous('h1').get('id')}am{amd_id}" + 'id'] = f"{header_tag.find_previous('h3').get('id')}-{amd_id}" - if header_tag.get("class") == [self.class_regex["head3"]]: + elif header_tag.get("class") == [self.class_regex["head3"]]: if re.search(r'^(Section|§) \d+(-[A-Z])*\.', header_tag.text.strip()): header_tag.name = "h3" - sec_id = re.search(r'^(Section|§) (?P<sec_id>\d+(-[A-Z])*)\.', header_tag.text.strip()).group('sec_id') + sec_id = re.search(r'^(Section|§) (?P<sec_id>\d+(-[A-Z])*)\.', header_tag.text.strip()).group( + 'sec_id') header_tag[ 'id'] = f"{header_tag.find_previous('h2').get('id')}s{sec_id.zfill(2)}" - if re.search(r'^\[Amendment [IVX]+\]',header_tag.text.strip()): + elif re.search(r'^\[Amendment [IVX]+\]', header_tag.text.strip()): header_tag.name = "h3" - amd_id = re.search(r'^\[Amendment (?P<ar_id>[IVX]+)', header_tag.text.strip()).group('ar_id') + sec_id = re.search(r'^\[Amendment (?P<sec_id>[IVX]+)\]', + header_tag.text.strip()).group( + 'sec_id') header_tag[ - 'id'] = f"{header_tag.find_previous('h2').get('id')}s{amd_id.zfill(2)}" + 'id'] = f"{header_tag.find_previous('h2').get('id')}s{sec_id.zfill(2)}" - if header_tag.get("class") == [self.class_regex["article"]]: - if re.search(r'^Section \d+\.', header_tag.text.strip()): - header_tag.name = "h4" - sec_id = re.search(r'^Section (?P<sec_id>\d+)\.', header_tag.text.strip()).group('sec_id') - header_tag[ - 'id'] = f"{header_tag.find_previous('h3').get('id')}-sub{sec_id.zfill(2)}" - if header_tag.get("class") == [self.class_regex["ol"]]: - if re.search(r'^CASE NOTES', header_tag.text.strip()): - header_tag.name = "h4" - sec_id = re.sub(r'[\W]','',header_tag.text.strip()).lower() + # if re.search(r'^\[Amendment [IVX]+\]', header_tag.text.strip()): + # header_tag.name = "h3" + # amd_id = re.search(r'^\[Amendment (?P<ar_id>[IVX]+)', header_tag.text.strip()).group('ar_id') + # header_tag[ + # 'id'] = f"{header_tag.find_previous('h2').get('id')}s{amd_id.zfill(2)}" + # + # if header_tag.get("class") == [self.class_regex["article"]]: + # if re.search(r'^Section \d+\.', header_tag.text.strip()): + # header_tag.name = "h4" + # sec_id = re.search(r'^Section (?P<sec_id>\d+)\.', header_tag.text.strip()).group('sec_id') + # header_tag[ + # 'id'] = f"{header_tag.find_previous('h3').get('id')}-sub{sec_id.zfill(2)}" + # + elif header_tag.get("class") == [self.class_regex["head2"]]: + if re.search(r'^ARTICLE [IVX]+\.*', header_tag.text.strip(), re.I): + header_tag.name = "h2" + article_id = re.search(r'^ARTICLE (?P<ar_id>[IVX]+)\.*', header_tag.text.strip(), re.I).group( + 'ar_id') header_tag[ - 'id'] = f"{header_tag.find_previous('h3').get('id')}-{sec_id}" + 'id'] = f"{header_tag.find_previous('h1').get('id')}a{article_id.zfill(2)}" + elif re.search(r'^AMENDMENTS TO THE CONSTITUTION|^Schedule', header_tag.text.strip(), re.I): + header_tag.name = "h2" + amd_id = re.sub(r'[\W]+', '', header_tag.text.strip()).lower() + header_tag[ + 'id'] = f"{header_tag.find_previous('h1').get('id')}a{amd_id}" - if re.search(r'^[IVX]+\.', header_tag.text.strip()): - header_tag.name = "h5" - tag_text = re.search('^(?P<c_id>[IVX]+)\.', header_tag.text.strip()).group('c_id').lower() - header_tag['id'] = f"{header_tag.find_previous('h4').get('id')}-{tag_text}" - header_tag['class'] = 'casehead' - elif re.search(r'^[A-Z]\.', 
header_tag.text.strip()):
-                        header_tag.name = "h5"
-                        tag_text = re.search('^(?P<c_id>[A-Z])\.', header_tag.text.strip()).group('c_id').lower()
-                        header_tag[
-                            'id'] = f"{header_tag.find_previous('h5', class_='casehead').get('id')}-{tag_text}"
-                        header_tag['class'] = 'casesub'
-
-                    elif re.search(r'^[0-9]+\.', header_tag.text.strip()):
-                        header_tag.name = "h5"
-                        tag_text = re.search('^(?P<c_id>[0-9]+)\.', header_tag.text.strip()).group('c_id').lower()
-                        header_tag[
-                            'id'] = f"{header_tag.find_previous('h5', class_='casesub').get('id')}-{tag_text}"
-                        header_tag['class'] = 'casedigit'
-
-                    elif re.search(r'^[ivx]+\.', header_tag.text.strip()):
-                        header_tag.name = "h5"
-                        tag_text = re.search('^(?P<c_id>[ivx]+)\.', header_tag.text.strip()).group('c_id').lower()
-                        header_tag[
-                            'id'] = f"{header_tag.find_previous('h5', class_='casealpha').get('id')}-{tag_text}"
-
-                    elif re.search(r'^[a-z]\.', header_tag.text.strip()):
-                        header_tag.name = "h5"
-                        tag_text = re.search('^(?P<c_id>[a-z])\.', header_tag.text.strip()).group('c_id').lower()
-                        header_tag[
-                            'id'] = f"{header_tag.find_previous('h5', class_='casedigit').get('id')}-{tag_text}"
-                        header_tag['class'] = 'casealpha'
+                elif re.search(r'^\[Amendment[IVX]+\]', header_tag.text.strip()):
+                    header_tag.name = "h3"
+                    sec_id = re.search(r'^\[Amendment\s*(?P<sec_id>[IVX]+)\]',
+                                       header_tag.text.strip()).group(
+                        'sec_id')
+                    header_tag[
+                        'id'] = f"{header_tag.find_previous('h2').get('id')}s{sec_id.zfill(2)}"
 
-                if header_tag.get("class") == [self.class_regex["ul"]] and not re.search('^PREAMBLE|^Sec\.|^Article|^Amend\.',header_tag.text.strip()):
+                # if re.search(r'^CASE NOTES', header_tag.text.strip()):
+                #     header_tag.name = "h4"
+                #     sec_id = re.sub(r'[\W]+', '', header_tag.text.strip()).lower()
+                #     header_tag[
+                #         'id'] = f"{header_tag.find_previous('h3').get('id')}-{sec_id}"
+                #
+                # if re.search(r'^[IVX]+\.', header_tag.text.strip()):
+                #     header_tag.name = "h5"
+                #     tag_text = re.search('^(?P<c_id>[IVX]+)\.', header_tag.text.strip()).group('c_id').lower()
+                #     header_tag['id'] = f"{header_tag.find_previous('h4').get('id')}-{tag_text}"
+                #     header_tag['class'] = 'casehead'
+                #
+                # elif re.search(r'^[A-Z]\.', header_tag.text.strip()):
+                #     header_tag.name = "h5"
+                #     tag_text = re.search('^(?P<c_id>[A-Z])\.', header_tag.text.strip()).group('c_id').lower()
+                #     header_tag[
+                #         'id'] = f"{header_tag.find_previous('h5', class_='casehead').get('id')}-{tag_text}"
+                #     header_tag['class'] = 'casesub'
+                #
+                # elif re.search(r'^[0-9]+\.', header_tag.text.strip()):
+                #     header_tag.name = "h5"
+                #     tag_text = re.search('^(?P<c_id>[0-9]+)\.', header_tag.text.strip()).group('c_id').lower()
+                #     header_tag[
+                #         'id'] = f"{header_tag.find_previous('h5', class_='casesub').get('id')}-{tag_text}"
+                #     header_tag['class'] = 'casedigit'
+                #
+                # elif re.search(r'^[ivx]+\.', header_tag.text.strip()):
+                #     header_tag.name = "h5"
+                #     tag_text = re.search('^(?P<c_id>[ivx]+)\.', header_tag.text.strip()).group('c_id').lower()
+                #     header_tag[
+                #         'id'] = f"{header_tag.find_previous('h5', class_='casealpha').get('id')}-{tag_text}"
+                #
+                # elif re.search(r'^[a-z]\.', header_tag.text.strip()):
+                #     header_tag.name = "h5"
+                #     tag_text = re.search('^(?P<c_id>[a-z])\.', header_tag.text.strip()).group('c_id').lower()
+                #     header_tag[
+                #         'id'] = f"{header_tag.find_previous('h5', class_='casedigit').get('id')}-{tag_text}"
+                #     header_tag['class'] = 'casealpha'
+                #
+
+                if header_tag.get("class") == [self.class_regex["ul"]] and re.search(
+                        '^Section \d+\.|^Article|^Amendments|^\[Amendment [IVX]+\]
|^Schedule|^§', + header_tag.text.strip(), re.I): header_tag.name = "li" else: @@ -264,14 +298,17 @@ def replace_tags(self): header_tag.name = "h1" header_tag.attrs = {} header_tag.wrap(self.soup.new_tag("nav")) - self.title_id = re.search(r'^(Title)\s(?P<title_id>\d+(\.\d+)*[A-Z]*)', header_tag.text.strip()).group('title_id').zfill(2) - header_tag['id'] =f"t{self.title_id}" + self.title_id = re.search(r'^(Title)\s(?P<title_id>\d+(\.\d+)*[A-Z]*)', + header_tag.text.strip()).group('title_id').zfill(2) + header_tag['id'] = f"t{self.title_id}" - elif re.search(r'^Article\s*(?P<ar_id>\d+(\.\d+)*)\.', header_tag.text.strip()): + elif re.search(r'^Article\s*(?P<ar_id>\d+(\.\d+)*)\.', header_tag.text.strip()): header_tag.name = "h2" - article_id = re.search(r'^Article\s*(?P<ar_id>\d+(\.\d+)*)\.', header_tag.text.strip()).group('ar_id') - if header_tag.find_previous('h2',class_='chapter'): - header_tag['id'] = f"{header_tag.find_previous('h2',class_='chapter').get('id')}a{article_id.zfill(2)}" + article_id = re.search(r'^Article\s*(?P<ar_id>\d+(\.\d+)*)\.', header_tag.text.strip()).group( + 'ar_id') + if header_tag.find_previous('h2', class_='chapter'): + header_tag[ + 'id'] = f"{header_tag.find_previous('h2', class_='chapter').get('id')}a{article_id.zfill(2)}" else: header_tag[ @@ -280,7 +317,8 @@ def replace_tags(self): self.snav_count = 1 elif re.search(r'^SUBTITLE\s*(?P<sub_id>[IVX]+)\.', header_tag.text.strip()): header_tag.name = "h2" - article_id = re.search(r'^SUBTITLE\s*(?P<sub_id>[IVX]+)\.', header_tag.text.strip()).group('sub_id') + article_id = re.search(r'^SUBTITLE\s*(?P<sub_id>[IVX]+)\.', header_tag.text.strip()).group( + 'sub_id') header_tag[ 'id'] = f"t{self.title_id.zfill(2)}s{article_id.zfill(2)}" header_tag["class"] = "subtitle" @@ -288,109 +326,176 @@ def replace_tags(self): elif re.search(r'^PART\s*(?P<part_id>[A-Z]+)\.', header_tag.text.strip()): header_tag.name = "h2" - article_id = re.search(r'^PART\s*(?P<part_id>[A-Z]+)\.', header_tag.text.strip()).group('part_id') + article_id = re.search(r'^PART\s*(?P<part_id>[A-Z]+(\d+)*)\.', header_tag.text.strip()).group( + 'part_id') header_tag[ 'id'] = f"{header_tag.find_previous('h2', class_='subtitle').get('id')}p{article_id.zfill(2)}" header_tag["class"] = "part" self.snav_count = 1 - elif header_tag.get("class") == [self.class_regex["head2"]] and re.search(r'^(Chapter)\s(?P<chap_id>\d+(\.\d+)*(:\d+)*)\.', header_tag.text.strip()) : - header_tag.name = "h2" - chapter_id = re.search(r'^(Chapter)\s(?P<chap_id>\d+(\.\d+)*(:\d+)*)\.', header_tag.text.strip()).group('chap_id') + elif header_tag.get("class") == [self.class_regex["head2"]]: + if re.search(r'^(Chapter|PART)\s(?P<chap_id>\d+(\.\d+)*(:\d+)*)\.', header_tag.text.strip()): + header_tag.name = "h2" + chapter_id = re.search(r'^(Chapter|PART)\s(?P<chap_id>\d+(\.\d+)*(:\d+)*)\.', + header_tag.text.strip()).group('chap_id') + + if header_tag.find_previous('h2', class_=['part', 'subtitle']): + header_tag[ + 'id'] = f"{header_tag.find_previous('h2', class_=['part', 'subtitle']).get('id')}c{chapter_id.zfill(2)}" + else: + header_tag['id'] = f"t{self.title_id.zfill(2)}c{chapter_id.zfill(2)}" + header_tag["class"] = "chapter" + elif re.search( + r'^Article\s(?P<chap_id>\d+(\.\d+)*(:\d+)*)\.', header_tag.text.strip()): + header_tag.name = "h2" + chapter_id = re.search(r'^Article\s(?P<chap_id>\d+(\.\d+)*(:\d+)*)\.', + header_tag.text.strip()).group('chap_id') + + if header_tag.find_previous('h2', class_=['part', 'subtitle', 'chapter']): + header_tag[ + 'id'] = 
f"{header_tag.find_previous('h2', class_=['part', 'subtitle', 'chapter']).get('id')}a{chapter_id.zfill(2)}" + else: + header_tag['id'] = f"t{self.title_id.zfill(2)}a{chapter_id.zfill(2)}" + header_tag["class"] = "article" + elif re.search( + r'^Subtitle\s(?P<chap_id>[IVX]+)\.', header_tag.text.strip()): + header_tag.name = "h2" + chapter_id = re.search(r'^Subtitle\s(?P<chap_id>[IVX]+)\.', + header_tag.text.strip()).group('chap_id') + + header_tag['id'] = f"t{self.title_id.zfill(2)}sub{chapter_id.zfill(2)}" + header_tag["class"] = "Subtitle" + elif re.search(r'^Part\s(?P<chap_id>([A-Z])*(\d+)*)\.', header_tag.text.strip()): + header_tag.name = "h2" + chapter_id = re.search(r'^Part\s(?P<chap_id>([A-Z])*(\d+)*)\.', + header_tag.text.strip()).group('chap_id') + + header_tag[ + 'id'] = f"{header_tag.find_previous('h2', class_=['Subtitle', 'article']).get('id')}p{chapter_id.zfill(2)}" + + header_tag["class"] = "Part" + elif re.search(r'^APPENDIX', header_tag.text.strip()): + header_tag.name = "h2" + chapter_id = re.sub(r'[\W]+', '', header_tag.text.strip()).lower() - if header_tag.find_previous('h2', class_ =['part','subtitle']): - header_tag['id'] = f"{header_tag.find_previous('h2', class_=['part','subtitle']).get('id')}c{chapter_id.zfill(2)}" + header_tag[ + 'id'] = f"{header_tag.find_previous('h1').get('id')}c{chapter_id.zfill(2)}" else: - header_tag['id'] = f"t{self.title_id.zfill(2)}c{chapter_id.zfill(2)}" - header_tag["class"] = "chapter" + header_tag.name = "h2" + chapter_id = re.sub(r'[\W]+', '', header_tag.text.strip()).lower() + + header_tag[ + 'id'] = f"{header_tag.find_previous('h1').get('id')}c{chapter_id.zfill(2)}" + self.navhead = None self.snav_count = 1 elif header_tag.get("class") == [self.class_regex["head3"]]: - header_tag.name = "h3" + if not re.search(r'^§', header_tag.find_next_sibling().text.strip()): + header_tag.name = "h3" - section_id = re.search(r'^§+\s(?P<sec_id>\d+(\.\d+)*[A-Z]*-\d+(\.\d+)*(:\d+)*)\.*\s*', header_tag.text.strip()).group( - 'sec_id') - curr_head_id = f"{header_tag.find_previous(['h2','h1']).get('id')}s{section_id.zfill(2)}" + if re.search(r'^§+\s(?P<sec_id>\d+(\.\d+)*[A-Z]*-\d+(\.\d+)*(:\d+)*(\.\d+)*(\.\d+)*)\.*\s*', + header_tag.text.strip()): + section_id = re.search( + r'^§+\s(?P<sec_id>\d+(\.\d+)*[A-Z]*-\d+(\.\d+)*(:\d+)*(\.\d+)*(\.\d+)*)\.*\s*', + header_tag.text.strip()).group('sec_id') + else: + section_id = re.sub(r'[\W]+', '', header_tag.text.strip()).lower() + curr_head_id = f"{header_tag.find_previous(['h2', 'h1']).get('id')}s{section_id.zfill(2)}" if curr_head_id in cur_head_list: - header_tag['id'] = f"{header_tag.find_previous(['h2','h1']).get('id')}s{section_id.zfill(2)}.1." + header_tag[ + 'id'] = f"{header_tag.find_previous(['h2', 'h1']).get('id')}s{section_id.zfill(2)}.1." 
else: header_tag['id'] = f"{header_tag.find_previous(['h2', 'h1']).get('id')}s{section_id.zfill(2)}" cur_head_list.append(curr_head_id) elif header_tag.get("class") == [self.class_regex["head4"]]: - if re.search(r'^[IVX]+\.',header_tag.text.strip()): + if re.search(r'^CASE NOTES|OFFICIAL COMMENT', header_tag.text.strip()): + header_tag.name = "h4" + sec_id = re.sub(r'[\W]+', '', header_tag.text.strip()).lower() + header_tag[ + 'id'] = f"{header_tag.find_previous('h3').get('id')}-{sec_id}" + + if re.search(r'^[IVX]+\.', header_tag.text.strip()): header_tag.name = "h5" - tag_text = re.search('^(?P<c_id>[IVX]+)\.',header_tag.text.strip()).group('c_id').lower() + tag_text = re.search('^(?P<c_id>[IVX]+)\.', header_tag.text.strip()).group('c_id').lower() - header_tag['id'] = f"{header_tag.find_previous('h4').get('id')}-{tag_text}" + header_tag['id'] = f"{header_tag.find_previous('h3').get('id')}-casenotes-{tag_text}" header_tag['class'] = 'casehead' - elif re.search(r'^[A-Z]\.',header_tag.text.strip()): + elif re.search(r'^[A-Z]\.', header_tag.text.strip()): header_tag.name = "h5" - tag_text = re.search('^(?P<c_id>[A-Z])\.',header_tag.text.strip()).group('c_id').lower() - header_tag['id'] = f"{header_tag.find_previous('h5',class_='casehead').get('id')}-{tag_text}" + tag_text = re.search('^(?P<c_id>[A-Z])\.', header_tag.text.strip()).group('c_id').lower() + header_tag['id'] = f"{header_tag.find_previous('h5', class_='casehead').get('id')}-{tag_text}" header_tag['class'] = 'casesub' elif re.search(r'^[0-9]+\.', header_tag.text.strip()): - header_tag.name = "h5" - tag_text = re.search('^(?P<c_id>[0-9]+)\.',header_tag.text.strip()).group('c_id').lower() - header_tag['id'] = f"{header_tag.find_previous('h5',class_='casesub').get('id')}-{tag_text}" - header_tag['class'] = 'casedigit' + if header_tag.find_previous('h5', class_='casesub'): + header_tag.name = "h5" + tag_text = re.search('^(?P<c_id>[0-9]+)\.', header_tag.text.strip()).group('c_id').lower() + + header_tag[ + 'id'] = f"{header_tag.find_previous('h5', class_='casesub').get('id')}-{tag_text}" + header_tag['class'] = 'casedigit' elif re.search(r'^[ivx]+\.', header_tag.text.strip()): header_tag.name = "h5" - tag_text = re.search('^(?P<c_id>[ivx]+)\.',header_tag.text.strip()).group('c_id').lower() - header_tag['id'] = f"{header_tag.find_previous('h5',class_='casealpha').get('id')}-{tag_text}" + tag_text = re.search('^(?P<c_id>[ivx]+)\.', header_tag.text.strip()).group('c_id').lower() + header_tag['id'] = f"{header_tag.find_previous('h5', class_='casealpha').get('id')}-{tag_text}" elif re.search(r'^[a-z]\.', header_tag.text.strip()): header_tag.name = "h5" - tag_text = re.search('^(?P<c_id>[a-z])\.',header_tag.text.strip()).group('c_id').lower() - header_tag['id'] = f"{header_tag.find_previous('h5',class_='casedigit').get('id')}-{tag_text}" + tag_text = re.search('^(?P<c_id>[a-z])\.', header_tag.text.strip()).group('c_id').lower() + header_tag['id'] = f"{header_tag.find_previous('h5', class_='casedigit').get('id')}-{tag_text}" header_tag['class'] = 'casealpha' else: - header_tag.name = "h4" - subsection_id = header_tag.text.strip().lower() - subsection_id = re.sub('[\s]','',subsection_id) - curr_tag_id = f"{header_tag.find_previous(['h3','h2','h1']).get('id')}-{subsection_id}" - - if curr_tag_id in cur_id_list: - if header_tag.find_previous('h3'): - header_tag['id'] = f"{header_tag.find_previous('h3').get('id')}-{subsection_id}.1" - elif header_tag.find_previous('h2'): - header_tag['id'] = 
f"{header_tag.find_previous('h2').get('id')}-{subsection_id}.1" - elif header_tag.find_previous('h1'): - header_tag['id'] = f"{header_tag.find_previous('h1').get('id')}-{subsection_id}.1" - else: - if header_tag.find_previous('h3'): - header_tag['id'] = f"{header_tag.find_previous('h3').get('id')}-{subsection_id}" - elif header_tag.find_previous('h2'): - header_tag['id'] = f"{header_tag.find_previous('h2').get('id')}-{subsection_id}" - elif header_tag.find_previous('h1'): - header_tag['id'] = f"{header_tag.find_previous('h1').get('id')}-{subsection_id}" - cur_id_list.append(header_tag['id']) + if header_tag.text.strip() in head4_list: + header_tag.name = "h4" + # subsection_id = header_tag.text.strip().lower() + subsection_id = sec_id = re.sub(r'[\W]+', '', header_tag.text.strip()).lower() + curr_tag_id = f"{header_tag.find_previous(['h3', 'h2', 'h1']).get('id')}-{subsection_id}" + + if curr_tag_id in cur_id_list: + if header_tag.find_previous('h3'): + header_tag['id'] = f"{header_tag.find_previous('h3').get('id')}-{subsection_id}.1" + elif header_tag.find_previous('h2'): + header_tag['id'] = f"{header_tag.find_previous('h2').get('id')}-{subsection_id}.1" + elif header_tag.find_previous('h1'): + header_tag['id'] = f"{header_tag.find_previous('h1').get('id')}-{subsection_id}.1" + else: + if header_tag.find_previous('h3'): + header_tag['id'] = f"{header_tag.find_previous('h3').get('id')}-{subsection_id}" + elif header_tag.find_previous('h2'): + header_tag['id'] = f"{header_tag.find_previous('h2').get('id')}-{subsection_id}" + elif header_tag.find_previous('h1'): + header_tag['id'] = f"{header_tag.find_previous('h1').get('id')}-{subsection_id}" + cur_id_list.append(header_tag['id']) elif header_tag.get("class") == "navhead": header_tag.name = "h2" if re.search(r'^Article\s*(?P<ar_id>\d+)\.', header_tag.text.strip()): - article_id = re.search(r'^Article\s*(?P<ar_id>\d+(\.\d+)*)\.', header_tag.text.strip()).group('ar_id') + article_id = re.search(r'^Article\s*(?P<ar_id>\d+(\.\d+)*)\.', header_tag.text.strip()).group( + 'ar_id') if header_tag.find_previous('h2', class_='chapter'): - header_tag['id'] = f"{header_tag.find_previous('h2', class_='chapter').get('id')}a{article_id.zfill(2)}" + header_tag[ + 'id'] = f"{header_tag.find_previous('h2', class_='chapter').get('id')}a{article_id.zfill(2)}" elif header_tag.find_previous('h2', class_='subtitle'): header_tag[ - 'id'] = f"{header_tag.find_previous('h2', class_='subtitle').get('id')}a{article_id.zfill(2)}" + 'id'] = f"{header_tag.find_previous('h2', class_='subtitle').get('id')}a{article_id.zfill(2)}" else: + header_tag[ - 'id'] = f"{header_tag.find_previous('h2').get('id')}a{article_id.zfill(2)}" + 'id'] = f"{header_tag.find_previous({'h2', 'h1'}).get('id')}a{article_id.zfill(2)}" elif re.search(r'^Subtitle\s*(?P<sub_id>[IVX]+)\.', header_tag.text.strip()): - article_id = re.search(r'^Subtitle\s*(?P<sub_id>[IVX]+)\.', header_tag.text.strip()).group('sub_id') + article_id = re.search(r'^Subtitle\s*(?P<sub_id>[IVX]+)\.', header_tag.text.strip()).group( + 'sub_id') header_tag[ 'id'] = f"t{self.title_id.zfill(2)}s{article_id.zfill(2)}" @@ -400,14 +505,16 @@ def replace_tags(self): article_id = re.search(r'^Part\s*(?P<p_id>[A-Z]+)', header_tag.text.strip()).group('p_id') prev_tag = header_tag.find_previous_sibling( - lambda tag: tag.name == 'h2' and re.search(r'^Subtitle\s*(?P<sub_id>[IVX]+)\.', tag.text.strip()) and + lambda tag: tag.name == 'h2' and re.search(r'^Subtitle\s*(?P<sub_id>[IVX]+)\.', + tag.text.strip()) and tag.get("class") == 
"navhead") header_tag[ 'id'] = f"{prev_tag.get('id')}p{article_id.zfill(2)}" self.snav_count = 1 elif header_tag.get("class") == [self.class_regex["ul"]]: - if not re.search('^Chap\.|^Sec\.|^Part',header_tag.text.strip()) and not len(header_tag.get_text(strip=True)) == 0 : + if not re.search('^Chap\.|^Sec\.|^Part\.', header_tag.text.strip()) and not len( + header_tag.get_text(strip=True)) == 0: header_tag.name = "li" if re.search(r'^(?P<sec_id>\d+-\d+)\.\s*', header_tag.text.strip()): @@ -420,15 +527,16 @@ def replace_tags(self): "id"] = f"{header_tag.find_previous('h2').get('id')}s{chap_id}-snav{self.snav_count:02}" self.snav_count += 1 - elif header_tag.get("class") == [self.class_regex["head"]]: if re.search(r'^§§\s*(?P<sec_id>\d+(\.\d+)*-\d+)\s*through\s*\d+-\d+\.', header_tag.text.strip()): header_tag.name = "h3" - section_id = re.search(r'^§§\s*(?P<sec_id>\d+-\d+)\s*through\s*\d+-\d+\.', header_tag.text.strip()).group('sec_id') + section_id = re.search(r'^§§\s*(?P<sec_id>\d+-\d+)\s*through\s*\d+-\d+\.', + header_tag.text.strip()).group('sec_id') header_tag['id'] = f"t{self.title_id.zfill(2)}c{section_id.zfill(2)}" elif re.search(r'^§§+\s(?P<sec_id>\d+.\d+(-\d+)*)\.*\s*', header_tag.text.strip()): header_tag.name = "h3" - section_id = re.search(r'^§§+\s(?P<sec_id>\d+.\d+(-\d+)*)\.*\s*', header_tag.text.strip()).group('sec_id') + section_id = re.search(r'^§§+\s(?P<sec_id>\d+.\d+(-\d+)*)\.*\s*', + header_tag.text.strip()).group('sec_id') header_tag['id'] = f"t{self.title_id.zfill(2)}s{section_id.zfill(2)}" elif re.search(r'^Part \d+\.', header_tag.text.strip()): @@ -455,69 +563,76 @@ def replace_tags(self): header_tag["class"] = "subtitle" self.snav_count = 1 + elif header_tag.get("class") == [self.class_regex["ol"]]: if re.search(r'^§\s\d+\.', header_tag.text.strip()): header_tag.name = "h5" subsection_id = re.search(r'^§\s(?P<ar_id>\d+)\.', header_tag.text.strip()).group("ar_id") - header_tag["id"] = f"{header_tag.find_previous(['h4','h3', 'h2', 'h1']).get('id')}-sub{subsection_id}" - - print('tags replaced') - + header_tag[ + "id"] = f"{header_tag.find_previous(['h4', 'h3', 'h2', 'h1']).get('id')}-sub{subsection_id}" + elif re.search(r'^[IVX]+\. |^[A-Z1-9a-z]\. 
', header_tag.text.strip()) and not header_tag.b: + header_tag["class"] = "casenote" - def create_main_tag(self): - """ - - wrap all contents inside main tag(Except chapter index) - """ - if re.search('constitution', self.html_file_name): - section_nav_tag = self.soup.new_tag("main") - first_chapter_header = self.soup.find("h2") - for main_tag in self.soup.find_all(): - if main_tag.name == "i": - main_tag.unwrap() - if main_tag.find_next("h2") == first_chapter_header: - continue - elif main_tag == first_chapter_header: - main_tag.wrap(section_nav_tag) - else: - section_nav_tag.append(main_tag) - if main_tag.name == "span" or main_tag.name == "b" : - main_tag.find_previous().append(main_tag) - else: - section_nav_tag = self.soup.new_tag("main") - first_chapter_header = self.soup.find(['h2']) - for main_tag in self.soup.findAll(): - if main_tag.find_next("h2") == first_chapter_header: - continue - elif main_tag == first_chapter_header: - main_tag.wrap(section_nav_tag) - else: - if main_tag.name == "span" and not main_tag.get("class") == "gnrlbreak": - continue - elif main_tag.name == "b" or main_tag.name == "i" or main_tag.name == "br": - continue - else: - section_nav_tag.append(main_tag) - - - main_tag = self.soup.find("main") - if not main_tag: - section_nav_tag = self.soup.new_tag("main") - first_chapter_header = self.soup.find(['h3']) - for main_tag in self.soup.findAll(): - if main_tag.find_next("h3") == first_chapter_header: - continue - elif main_tag == first_chapter_header: - main_tag.wrap(section_nav_tag) - else: - if main_tag.name == "span" and not main_tag.get("class") == "gnrlbreak": - continue - elif main_tag.name == "b" or main_tag.name == "i" or main_tag.name == "br": - continue - else: - section_nav_tag.append(main_tag) + stylesheet_link_tag = self.soup.new_tag('link') + stylesheet_link_tag.attrs = {'rel': 'stylesheet', 'type': 'text/css', + 'href': 'https://unicourt.github.io/cic-code-ga/transforms/ga/stylesheet/ga_code_stylesheet.css'} + self.soup.style.replace_with(stylesheet_link_tag) + self.meta_tags.append(stylesheet_link_tag) - print("main tag is created") + print('tags replaced') + # def create_main_tag(self): + # """ + # - wrap all contents inside main tag(Except chapter index) + # """ + # if re.search('constitution', self.html_file_name): + # section_nav_tag = self.soup.new_tag("main") + # first_chapter_header = self.soup.find("h2") + # for main_tag in self.soup.find_all(): + # if main_tag.name == "i": + # main_tag.unwrap() + # if main_tag.find_next("h2") == first_chapter_header: + # continue + # elif main_tag == first_chapter_header: + # main_tag.wrap(section_nav_tag) + # else: + # section_nav_tag.append(main_tag) + # if main_tag.name == "span" or main_tag.name == "b": + # main_tag.find_previous().append(main_tag) + # else: + # section_nav_tag = self.soup.new_tag("main") + # first_chapter_header = self.soup.find(['h2']) + # for main_tag in self.soup.findAll(): + # if main_tag.find_next("h2") == first_chapter_header: + # continue + # elif main_tag == first_chapter_header: + # main_tag.wrap(section_nav_tag) + # else: + # if main_tag.name == "span" and not main_tag.get("class") == "gnrlbreak": + # continue + # elif main_tag.name == "b" or main_tag.name == "i" or main_tag.name == "br": + # continue + # else: + # section_nav_tag.append(main_tag) + # + # main_tag = self.soup.find("main") + # if not main_tag: + # section_nav_tag = self.soup.new_tag("main") + # first_chapter_header = self.soup.find(['h3']) + # for main_tag in self.soup.findAll(): + # if 
main_tag.find_next("h3") == first_chapter_header: + # continue + # elif main_tag == first_chapter_header: + # main_tag.wrap(section_nav_tag) + # else: + # if main_tag.name == "span" and not main_tag.get("class") == "gnrlbreak": + # continue + # elif main_tag.name == "b" or main_tag.name == "i" or main_tag.name == "br": + # continue + # else: + # section_nav_tag.append(main_tag) + # + # print("main tag is created") def create_ul_tag(self): """ @@ -533,32 +648,30 @@ def create_ul_tag(self): ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) list_item.wrap(ul_tag) - if re.search(r'^Article|^PREAMBLE',ul_tag.find_previous("p").text.strip()): - ul_tag.find_previous("nav").append(ul_tag.find_previous("p")) - ul_tag.find_previous("nav").append(ul_tag) + if ul_tag.find_previous("ul") or ul_tag.find_previous("p"): + nav_tag = self.soup.new_tag("nav") + ul_tag.wrap(nav_tag) else: - ul_tag.find_previous("p").wrap(self.soup.new_tag("nav")) ul_tag.find_previous("nav").append(ul_tag) + else: ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) for list_item in self.soup.find_all("li"): - if list_item.find_previous().name == "li": + if list_item.find_previous().name in ["ul", "li"]: ul_tag.append(list_item) else: ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) list_item.wrap(ul_tag) - if ul_tag.find_previous("p").text.strip() == 'Chap.' or ul_tag.find_previous("p").text.strip() == 'Part': - ul_tag.find_previous("nav").append(ul_tag.find_previous("p")) - ul_tag.find_previous("nav").append(ul_tag) + if ul_tag.find_previous("ul") or ul_tag.find_previous("p"): + nav_tag = self.soup.new_tag("nav") + ul_tag.wrap(nav_tag) else: - ul_tag.find_previous("p").wrap(self.soup.new_tag("nav")) ul_tag.find_previous("nav").append(ul_tag) print("ul tag is created") - def set_chapter_section_nav(self, list_item, chap_num, sub_tag, prev_id, sec_num): nav_list = [] nav_link = self.soup.new_tag('a') @@ -580,15 +693,19 @@ def set_chapter_section_nav(self, list_item, chap_num, sub_tag, prev_id, sec_num nav_list.append(nav_link) list_item.contents = nav_list - def create_chapter_section_nav(self): count = 0 + anav_count = 1 + snav_count = 1 + pnav_count = 1 + spnav_count = 1 + for list_item in self.soup.find_all("li"): if re.search('constitution', self.html_file_name): - if re.search(r'^[IXV]+\.|^AMENDMENTS|^Schedule', list_item.text.strip()): - if re.match(r'^[IXV]+\.', list_item.text.strip()): - chap_num = re.search(r'^(?P<chap>[IXV]+)\. ', list_item.text.strip()).group( + if re.search(r'^Article [IXV]+\.|^AMENDMENTS|^Schedule|^\[?Amendments?', list_item.text.strip()): + if re.match(r'^Article [IXV]+\.', list_item.text.strip()): + chap_num = re.search(r'^Article (?P<chap>[IXV]+)\. 
', list_item.text.strip()).group( "chap").zfill(2) sub_tag = "a" @@ -600,54 +717,122 @@ def create_chapter_section_nav(self): prev_id = None - elif re.search(r'^AMENDMENTS', list_item.text.strip()): - chap_num = re.sub(r'[\W]', '', list_item.text.strip()).lower() + elif re.search(r'^AMENDMENTS|^Amendments', list_item.text.strip()): + chap_num = re.sub(r'[\W]+', '', list_item.text.strip()).lower() sub_tag = "am" prev_id = None + elif re.search(r'^\[Amendment [IVX]+\]', list_item.text.strip()): + chap_num = re.search(r'^\[Amendment (?P<am_id>[IVX]+)\]', list_item.text.strip()).group( + "am_id").zfill(2) + sub_tag = "s" + prev_id = list_item.find_previous("h2").get("id") elif re.search(r'^Schedule', list_item.text.strip()): - chap_num = re.sub(r'[\W]', '', list_item.text.strip()).lower() + chap_num = re.sub(r'[\W]+', '', list_item.text.strip()).lower() sub_tag = "a" prev_id = None self.set_chapter_section_nav(list_item, chap_num, sub_tag, prev_id, None) - elif re.search(r'^\d+(-[A-Z])*\.',list_item.text.strip()): - chap_num = re.search(r'^(?P<sec>\d+(-[A-Z])*)\. ', list_item.text.strip()).group( + elif re.search(r'^§? \d+(-[A-Z])*\.', list_item.text.strip()): + chap_num = re.search(r'^§? (?P<sec>\d+(-[A-Z])*)\. ', list_item.text.strip()).group( "sec").zfill(2) sub_tag = "s" prev_id = list_item.find_previous("h2").get("id") self.set_chapter_section_nav(list_item, chap_num, sub_tag, prev_id, None) - + elif re.search(r'^Section \d+\.', list_item.text.strip()): + chap_num = re.search(r'^Section (?P<sec>\d+)\.', list_item.text.strip()).group( + "sec").zfill(2) + sub_tag = "s" + prev_id = list_item.find_previous("h2").get("id") + self.set_chapter_section_nav(list_item, chap_num, sub_tag, prev_id, None) else: - if re.search(r'^(?P<sec_id>\d+(\.\d+)*[A-Z]*-\d+(:\d+)*)\.*\s*', list_item.text.strip()): - chap_id = re.search(r'^(?P<sec_id>\d+(\.\d+)*[A-Z]*-\d+(\.\d+)*(:\d+)*)\.*\s*', list_item.text.strip()).group('sec_id') + if re.search(r'^§{1,2} (?P<sec_id>\d+(\.\d+)*[A-Z]*-\d+(:\d+)*(\.\d+)*(\.\d+)*)\.*\s*', + list_item.text.strip()): + chap_id = re.search( + r'^§{1,2} (?P<sec_id>\d+(\.\d+)*[A-Z]*-\d+(\.\d+)*(:\d+)*(\.\d+)*(\.\d+)*)\.*\s*', + list_item.text.strip()).group('sec_id') + sub_tag = "s" + prev_id = list_item.find_previous(['h2', 'h1']).get("id") + self.set_chapter_section_nav(list_item, chap_id.zfill(2), sub_tag, prev_id, None) + + elif re.search(r'^\d+(\.\d+)*\. 
', list_item.text.strip()): + chap_id = re.search(r'^(?P<sec_id>\d+(\.\d+)*)\.', + list_item.text.strip()).group('sec_id') + sub_tag = "c" + prev_id = list_item.find_previous(['h2', 'h1']).get("id") + self.set_chapter_section_nav(list_item, chap_id.zfill(2), sub_tag, prev_id, None) + + + elif re.search(r'^\d+(\.\d+)*-\d+(\.\d+)*\.', list_item.text.strip()): + chap_id = re.search(r'^(?P<sec_id>\d+(\.\d+)*-\d+(\.\d+)*)\.', + list_item.text.strip()).group('sec_id') sub_tag = "s" - prev_id = list_item.find_previous(['h2','h1']).get("id") + prev_id = list_item.find_previous('h2').get("id") self.set_chapter_section_nav(list_item, chap_id.zfill(2), sub_tag, prev_id, None) - elif re.search(r'^(?P<chap_id>\d+(\.\d+)*)\.', list_item.text.strip()): - chapter_id = re.search(r'^(?P<chap_id>\d+(\.\d+)*(:\d+)*)\.', list_item.text.strip()).group('chap_id') + + elif re.search(r'^(Chapter|PART) (?P<chap_id>\d+(\.\d+)*)\.', list_item.text.strip()): + chapter_id = re.search(r'^(Chapter|PART) (?P<chap_id>\d+(\.\d+)*(:\d+)*)\.', + list_item.text.strip()).group( + 'chap_id') sub_tag = "c" - if list_item.find_previous("h2",class_="navhead"): + if list_item.find_previous("h2", class_="navhead"): prev_id = list_item.find_previous("h2").get("id") else: prev_id = None - self.set_chapter_section_nav(list_item, chapter_id.zfill(2), sub_tag, prev_id, None) list_item["id"] = f"t{self.title}c{chapter_id.zfill(2)}-cnav{self.cnav_count:02}" self.cnav_count += 1 + elif re.search(r'^Article (?P<chap_id>\d+(\.\d+)*)\.', list_item.text.strip()): + chapter_id = re.search(r'^Article (?P<chap_id>\d+(\.\d+)*(:\d+)*)\.', list_item.text.strip()).group( + 'chap_id') + sub_tag = "a" + prev_id = list_item.find_previous({"h2", "h1"}).get("id") + self.set_chapter_section_nav(list_item, chapter_id.zfill(2), sub_tag, prev_id, None) + list_item["id"] = f"{prev_id}a{chapter_id.zfill(2)}-anav{anav_count:02}" + anav_count += 1 + + elif re.search(r'^Subtitle (?P<chap_id>[IVX]+)\.', list_item.text.strip()): + chapter_id = re.search(r'^Subtitle (?P<chap_id>[IVX]+)\.', list_item.text.strip()).group( + 'chap_id') + sub_tag = "sub" + prev_id = list_item.find_previous("h1").get("id") + self.set_chapter_section_nav(list_item, chapter_id.zfill(2), sub_tag, prev_id, None) + list_item["id"] = f"{prev_id}sub{chapter_id.zfill(2)}-snav{snav_count:02}" + snav_count += 1 + elif re.search(r'^Part (?P<chap_id>([A-Z])*(\d+)*)\.', list_item.text.strip()): + chapter_id = re.search(r'^Part (?P<chap_id>([A-Z])*(\d+)*)\.', list_item.text.strip()).group( + 'chap_id') + sub_tag = "p" + prev_id = list_item.find_previous("h2").get("id") + self.set_chapter_section_nav(list_item, chapter_id.zfill(2), sub_tag, prev_id, None) + list_item["id"] = f"{prev_id}a{chapter_id.zfill(2)}-pnav{pnav_count:02}" + pnav_count += 1 + elif re.search(r'^United States Census', list_item.text.strip()): + chapter_id = re.sub(r'[\W]+', '', list_item.text.strip()).lower() + sub_tag = "s" + prev_id = list_item.find_previous("h2").get("id") + self.set_chapter_section_nav(list_item, chapter_id.zfill(2), sub_tag, prev_id, None) + list_item["id"] = f"{prev_id}a{chapter_id.zfill(2)}-spnav{spnav_count:02}" + spnav_count += 1 + else: + chapter_id = re.sub(r'[\W]+', '', list_item.text.strip()).lower() + sub_tag = "c" + prev_id = list_item.find_previous("h1").get("id") + self.set_chapter_section_nav(list_item, chapter_id.zfill(2), sub_tag, prev_id, None) + list_item["id"] = f"{prev_id}a{chapter_id.zfill(2)}-cnav{self.cnav_count:02}" + self.cnav_count += 1 def create_case_note_nav(self): - - if 
self.soup.find("p",class_='casenote'): - for case_tag in self.soup.find_all("p",class_='casenote'): + if self.soup.find("p", class_='casenote'): + for case_tag in self.soup.find_all("p", class_='casenote'): if re.search(r'^[IVX]+\.', case_tag.text.strip()): nav_list = [] nav_link = self.soup.new_tag('a') @@ -674,6 +859,7 @@ def create_case_note_nav(self): nav_link = self.soup.new_tag('a') nav_link.append(case_tag.text) case_id = re.search(r'^(?P<cid>[0-9]+)\.', case_tag.text.strip()).group("cid").lower() + digit_id = f"{alpha_id}-{case_id}" nav_link["href"] = f"#{alpha_id}-{case_id}" nav_list.append(nav_link) @@ -699,53 +885,51 @@ def create_case_note_nav(self): case_tag.contents = nav_list def create_case_note_ul(self): - for case_tag in self.soup.find_all(class_='casenote'): - case_tag.name = "li" - if re.search(r'^[IVX]+\.', case_tag.a.text.strip()): - rom_tag = case_tag - if re.search(r'^I\.', case_tag.a.text.strip()): - rom_ul = self.soup.new_tag("ul", **{"class": "leaders"}) - case_tag.wrap(rom_ul) - else: - rom_ul.append(case_tag) - - elif re.search(r'^[A-Z]\.', case_tag.a.text.strip()): - alpha_tag = case_tag - if re.search(r'^A\.', case_tag.a.text.strip()): - alpha_ul = self.soup.new_tag("ul", **{"class": "leaders"}) - case_tag.wrap(alpha_ul) - rom_tag.append(alpha_ul) - else: - alpha_ul.append(case_tag) - - elif re.search(r'^[0-9]+\.', case_tag.a.text.strip()): - digit_tag = case_tag - if re.search(r'^1\.', case_tag.a.text.strip()): - digit_ul = self.soup.new_tag("ul", **{"class": "leaders"}) - case_tag.wrap(digit_ul) - alpha_tag.append(digit_ul) - else: - digit_ul.append(case_tag) + for case_tag in self.soup.find_all(class_='casenote'): + case_tag.name = "li" + if re.search(r'^[IVX]+\.', case_tag.a.text.strip()): + rom_tag = case_tag + if re.search(r'^I\.', case_tag.a.text.strip()): + rom_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + case_tag.wrap(rom_ul) + else: + rom_ul.append(case_tag) + + elif re.search(r'^[A-Z]\.', case_tag.a.text.strip()): + alpha_tag = case_tag + if re.search(r'^A\.', case_tag.a.text.strip()): + alpha_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + case_tag.wrap(alpha_ul) + rom_tag.append(alpha_ul) + else: + alpha_ul.append(case_tag) + + elif re.search(r'^[0-9]+\.', case_tag.a.text.strip()): + digit_tag = case_tag + if re.search(r'^1\.', case_tag.a.text.strip()): + digit_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + case_tag.wrap(digit_ul) + alpha_tag.append(digit_ul) + else: + digit_ul.append(case_tag) - elif re.search(r'^[ivx]+\.', case_tag.a.text.strip()): - if re.search(r'^i\.', case_tag.a.text.strip()): - srom_ul = self.soup.new_tag("ul", **{"class": "leaders"}) - case_tag.wrap(srom_ul) - salpha_tag.append(srom_ul) - else: - srom_ul.append(case_tag) - - - elif re.search(r'^[a-z]\.', case_tag.a.text.strip()): - salpha_tag = case_tag - if re.search(r'^a\.', case_tag.a.text.strip()): - salpha_ul = self.soup.new_tag("ul", **{"class": "leaders"}) - case_tag.wrap(salpha_ul) - digit_tag.append(salpha_ul) - else: - salpha_ul.append(case_tag) + elif re.search(r'^[ivx]+\.', case_tag.a.text.strip()): + if re.search(r'^i\.', case_tag.a.text.strip()): + srom_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + case_tag.wrap(srom_ul) + salpha_tag.append(srom_ul) + else: + srom_ul.append(case_tag) + elif re.search(r'^[a-z]\.', case_tag.a.text.strip()): + salpha_tag = case_tag + if re.search(r'^a\.', case_tag.a.text.strip()): + salpha_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + case_tag.wrap(salpha_ul) + 
digit_tag.append(salpha_ul) + else: + salpha_ul.append(case_tag) def create_and_wrap_with_div_tag(self): self.soup = BeautifulSoup(self.soup.prettify(formatter=None), features='lxml') @@ -770,6 +954,7 @@ def create_and_wrap_with_div_tag(self): tag_to_wrap.wrap(new_sub_sec_div) while True: + inner_next_tag = inner_tag.find_next_sibling() if inner_tag.name == 'h5': new_h5_div = self.soup.new_tag('div') @@ -818,14 +1003,17 @@ def create_and_wrap_with_div_tag(self): if not sec_header: print() - div_tag = self.soup.find("div") for header in self.soup.findAll('h3'): new_chap_div = self.soup.new_tag('div') sec_header = header.find_next_sibling() header.wrap(new_chap_div) while True: - next_sec_tag = sec_header.find_next_sibling() + # if sec_header: + # next_sec_tag = sec_header.find_next_sibling() + # else: + # next_sec_tag = sec_header.find_next() + if sec_header.name == 'h4': new_sec_div = self.soup.new_tag('div') tag_to_wrap = sec_header.find_next_sibling() @@ -885,6 +1073,97 @@ def create_and_wrap_with_div_tag(self): print('wrapped div tags') + def wrap_div_tags(self): + """ + - for each h2 in html + - create new div and append h2 to that div + - find next tag, if next tag is h3 + - create new div and append h3 to it + - append that new div to h2 div + - find next tag of h3, if next tag is h4 + - create new div and append h4 to that div + - append that new div to h3 div + - find next tag, if next tag is h5 + - create new div and append h5 to that div + - append that new div to h4 div + - if not h5 append that tag to h2 div and so on + - if not h4 append that tag to h2 div and so on + - if not h3 append that tag to h2 div and so on + """ + self.soup = BeautifulSoup(self.soup.prettify(formatter=None), features='lxml') + for header in self.soup.findAll('h2'): + new_chap_div = self.soup.new_tag('div') + sec_header = header.find_next_sibling() + if not sec_header: + print() + header.wrap(new_chap_div) + if sec_header: + while True: + next_sec_tag = sec_header.find_next_sibling() + if sec_header.name == 'h3': + new_sec_div = self.soup.new_tag('div') + tag_to_wrap = sec_header.find_next_sibling() + sec_header.wrap(new_sec_div) + while True: + if tag_to_wrap: + next_tag = tag_to_wrap.find_next_sibling() + else: + break + if tag_to_wrap.name == 'h4': + new_sub_sec_div = self.soup.new_tag('div') + inner_tag = tag_to_wrap.find_next_sibling() + tag_to_wrap.wrap(new_sub_sec_div) + + while True: + inner_next_tag = inner_tag.find_next_sibling() + if inner_tag.name == 'h5': + new_h5_div = self.soup.new_tag('div') + inner_h5_tag = inner_tag.find_next_sibling() + inner_tag.wrap(new_h5_div) + while True: + next_h5_child_tag = inner_h5_tag.find_next_sibling() + new_h5_div.append(inner_h5_tag) + inner_next_tag = next_h5_child_tag + if not next_h5_child_tag or next_h5_child_tag.name in ['h3', 'h2', 'h4', + 'h5']: + break + inner_h5_tag = next_h5_child_tag + inner_tag = new_h5_div + new_sub_sec_div.append(inner_tag) + next_tag = inner_next_tag + if not inner_next_tag or inner_next_tag.name in ['h3', + 'h2'] or inner_next_tag.name == 'h4' \ + and inner_next_tag.get('class'): + break + inner_tag = inner_next_tag + tag_to_wrap = new_sub_sec_div + elif tag_to_wrap.name == 'h5': + new_sub_sec_div = self.soup.new_tag('div') + inner_tag = tag_to_wrap.find_next_sibling() + tag_to_wrap.wrap(new_sub_sec_div) + while True: + inner_next_tag = inner_tag.find_next_sibling() + new_sub_sec_div.append(inner_tag) + next_tag = inner_next_tag + if not inner_next_tag or inner_next_tag.name in ['h3', 'h2', 'h4', 'h5']: + break + 
inner_tag = inner_next_tag + tag_to_wrap = new_sub_sec_div + if not re.search(r'h\d', tag_to_wrap.name): + new_sec_div.append(tag_to_wrap) + next_sec_tag = next_tag + if not next_tag or next_tag.name in ['h3', 'h2']: + break + tag_to_wrap = next_tag + sec_header = new_sec_div + new_chap_div.append(sec_header) + if not next_sec_tag or next_sec_tag.name == 'h2': + break + sec_header = next_sec_tag + if not sec_header: + print() + + print('wrapped div tags') def convert_paragraph_to_alphabetical_ol_tags1(self): """ @@ -914,22 +1193,24 @@ def convert_paragraph_to_alphabetical_ol_tags1(self): if p_tag.i: p_tag.i.unwrap() - if re.search(rf'^{cap_alpha}\.', current_tag_text) and p_tag.name == "p": p_tag.name = "li" ol_head = 1 cap_alpha_cur_tag = p_tag if re.search(r'^A\.', current_tag_text): - cap_alpha_ol = self.soup.new_tag("ol",type="A") + cap_alpha_ol = self.soup.new_tag("ol", type="A") p_tag.wrap(cap_alpha_ol) - cap_alpha_id = f"{p_tag.find_previous({'h5','h4','h3'}).get('id')}ol{ol_count}" + cap_alpha_id = f"{p_tag.find_previous({'h5', 'h4', 'h3'}).get('id')}ol{ol_count}" else: cap_alpha_ol.append(p_tag) p_tag["id"] = f'{cap_alpha_id}{cap_alpha}' p_tag.string = re.sub(rf'^{cap_alpha}\.', '', current_tag_text) - cap_alpha = chr(ord(cap_alpha) + 1) + if cap_alpha == "Z": + cap_alpha = "A" + else: + cap_alpha = chr(ord(cap_alpha) + 1) if re.search(rf'^[A-Z]+\.\s*\d+\.', current_tag_text): num_ol = self.soup.new_tag("ol") @@ -945,7 +1226,7 @@ def convert_paragraph_to_alphabetical_ol_tags1(self): p_tag.append(num_ol) ol_head = 2 - if re.search(r'[A-Z]+\.\s*\d+\.\s*[a-z]+\.',current_tag_text): + if re.search(r'[A-Z]+\.\s*\d+\.\s*[a-z]+\.', current_tag_text): sec_alpha_ol1 = self.soup.new_tag("ol", type="a") inner_li_tag = self.soup.new_tag("li") inner_li_tag.string = re.sub(r'[A-Z]+\.\s*\d+\.\s*[a-z]+\.', '', current_tag_text) @@ -962,18 +1243,23 @@ def convert_paragraph_to_alphabetical_ol_tags1(self): elif re.search(rf'^{ol_head}\.', current_tag_text) and p_tag.name == "p": p_tag.name = "li" num_cur_tag = p_tag - main_sec_alpha1 ='a' + main_sec_alpha1 = 'a' + main_sec_alpha = "a" if re.search(r'^1\.', current_tag_text): num_ol = self.soup.new_tag("ol") p_tag.wrap(num_ol) - num_id = f"{p_tag.find_previous({'h4','h3','h2','h1'}).get('id')}ol{ol_count}" + num_id = f"{p_tag.find_previous({'h4', 'h3', 'h2', 'h1'}).get('id')}ol{ol_count}" if cap_alpha_cur_tag: cap_alpha_cur_tag.append(num_ol) num_id = cap_alpha_cur_tag.get('id') - if n_tag: + elif n_tag: n_tag.append(num_ol) num_id = n_tag.get('id') + elif sec_alpha_cur_tag: + sec_alpha_cur_tag.append(num_ol) + num_id = sec_alpha_cur_tag.get('id') + else: num_ol.append(p_tag) p_tag["id"] = f'{num_id}{ol_head}' @@ -998,7 +1284,7 @@ def convert_paragraph_to_alphabetical_ol_tags1(self): sec_alpha_cur_tag = p_tag if re.search(r'^\(a\)', current_tag_text): - sec_alpha_ol = self.soup.new_tag("ol",type="a") + sec_alpha_ol = self.soup.new_tag("ol", type="a") p_tag.wrap(sec_alpha_ol) if num_cur_tag: sec_alpha_id = num_cur_tag.get('id') @@ -1007,7 +1293,7 @@ def convert_paragraph_to_alphabetical_ol_tags1(self): sec_alpha_id = num_cur_tag1.get('id') num_cur_tag1.append(sec_alpha_ol) else: - sec_alpha_id = f"{p_tag.find_previous({'h5','h4','h3','h2'}).get('id')}ol{ol_count}" + sec_alpha_id = f"{p_tag.find_previous({'h5', 'h4', 'h3', 'h2'}).get('id')}ol{ol_count}" else: sec_alpha_ol.append(p_tag) @@ -1021,14 +1307,14 @@ def convert_paragraph_to_alphabetical_ol_tags1(self): num_count = 1 if re.search(r'^a\.', current_tag_text): - sec_alpha_ol1 = 
self.soup.new_tag("ol",type="a") + sec_alpha_ol1 = self.soup.new_tag("ol", type="a") p_tag.wrap(sec_alpha_ol1) if num_cur_tag: sec_alpha_id1 = num_cur_tag.get('id') num_cur_tag.append(sec_alpha_ol1) else: - sec_alpha_id1 = f"{p_tag.find_previous({'h5','h4','h3','h2'}).get('id')}ol{ol_count}" + sec_alpha_id1 = f"{p_tag.find_previous({'h5', 'h4', 'h3', 'h2'}).get('id')}ol{ol_count}" else: sec_alpha_ol1.append(p_tag) @@ -1037,11 +1323,23 @@ def convert_paragraph_to_alphabetical_ol_tags1(self): p_tag.string = re.sub(rf'^{main_sec_alpha1}\.', '', current_tag_text) main_sec_alpha1 = chr(ord(main_sec_alpha1) + 1) + if re.search(r'^[a-z]+\.\s1\.', current_tag_text): + num_ol = self.soup.new_tag("ol") + li_tag = self.soup.new_tag("li") + li_tag.string = re.sub(r'[a-z]+\.\s1\.', '', current_tag_text) + li_tag.append(current_tag_text) + cur_tag = re.search(r'(?P<pid>[a-z]+)\.\s*1\.', current_tag_text) + num_cur_tag = f'{sec_alpha_cur_tag.get("id")}' + li_tag["id"] = f'{sec_alpha_cur_tag.get("id")}1' + sec_alpha_ol1.append(li_tag) + p_tag.contents = [] + p_tag.append(sec_alpha_ol1) + ol_head = 2 elif re.search(rf'^\({num_count}\)', current_tag_text) and p_tag.name == "p": p_tag.name = "li" num_cur_tag1 = p_tag - main_sec_alpha = 'a' + # main_sec_alpha = 'a' cap_alpha1 = 'A' if re.search(r'^\(1\)', current_tag_text): @@ -1052,13 +1350,12 @@ def convert_paragraph_to_alphabetical_ol_tags1(self): num_id1 = sec_alpha_cur_tag.get('id') sec_alpha_cur_tag.append(num_ol1) else: - num_id1 = f"{p_tag.find_previous({'h5','h4','h3','h2'}).get('id')}ol{ol_count}" + num_id1 = f"{p_tag.find_previous({'h5', 'h4', 'h3', 'h2'}).get('id')}ol{ol_count}" main_sec_alpha = 'a' else: num_ol1.append(p_tag) - p_tag["id"] = f'{num_id1}{num_count}' p_tag.string = re.sub(rf'^\({num_count}\)', '', current_tag_text) num_count += 1 @@ -1076,7 +1373,7 @@ def convert_paragraph_to_alphabetical_ol_tags1(self): num_cur_tag1.append(cap_alpha_ol1) cap_alpha_id1 = num_cur_tag1.get("id") else: - cap_alpha_id1 = f"{p_tag.find_previous({'h5','h4','h3','h2'}).get('id')}ol{ol_count}" + cap_alpha_id1 = f"{p_tag.find_previous({'h5', 'h4', 'h3', 'h2'}).get('id')}ol{ol_count}" else: cap_alpha_ol1.append(p_tag) @@ -1105,7 +1402,7 @@ def convert_paragraph_to_alphabetical_ol_tags1(self): elif re.search(r'^\([ivx]+\)', current_tag_text) and p_tag.name == "p": p_tag.name = "li" roman_cur_tag = p_tag - ol_head = 1 + # ol_head = 1 if re.search(r'^\(i\)', current_tag_text): roman_ol = self.soup.new_tag("ol", type="i") @@ -1119,23 +1416,24 @@ def convert_paragraph_to_alphabetical_ol_tags1(self): elif cap_alpha_cur_tag: cap_alpha_cur_tag.append(roman_ol) prev_id1 = cap_alpha_cur_tag.get("id") + elif num_cur_tag1: num_cur_tag1.append(roman_ol) prev_id1 = num_cur_tag1.get("id") else: - prev_id1 = f"{p_tag.find_previous({'h5','h4','h3','h2'}).get('id')}ol{ol_count}" + prev_id1 = f"{p_tag.find_previous({'h5', 'h4', 'h3', 'h2'}).get('id')}ol{ol_count}" else: - + print(p_tag) roman_ol.append(p_tag) rom_head = re.search(r'^\((?P<rom>[ivx]+)\)', current_tag_text) p_tag["id"] = f'{prev_id1}{rom_head.group("rom")}' p_tag.string = re.sub(r'^\([ivx]+\)', '', current_tag_text) - - if re.search(r'^CASE NOTES', current_tag_text) or p_tag.name in ['h3','h4','h5']: + if re.search(r'^CASE NOTES|^(ARTICLE|Article) [IVX]+\.', current_tag_text) or p_tag.name in ['h3', 'h4', + 'h5']: ol_head = 1 - cap_alpha ='A' + cap_alpha = 'A' cap_alpha_cur_tag = None num_count = 1 num_cur_tag = None @@ -1144,14 +1442,16 @@ def convert_paragraph_to_alphabetical_ol_tags1(self): num_cur_tag1 = 
None sec_alpha_cur_tag = None cap_alpha1 = "A" - n_tag =None + n_tag = None + if re.search(r'^(ARTICLE|Article) [IVX]+\.', current_tag_text): + ol_count += 1 print('ol tags added') - def add_citation(self): - title_01 = {'c01': ['1-1', '1-2', '1-2.1', '1-3', '1-4', '1-5', '1-6', '1-7', '1-8', '1-9'],'c02': ['1-10', '1-11', '1-12', '1-13', '1-14', '1-15', '1-16', '1-17'], + title_01 = {'c01': ['1-1', '1-2', '1-2.1', '1-3', '1-4', '1-5', '1-6', '1-7', '1-8', '1-9'], + 'c02': ['1-10', '1-11', '1-12', '1-13', '1-14', '1-15', '1-16', '1-17'], 'c03': ['1-18', '1-19', '1-20', '1-21'], 'c2.1a01': ['1-200', '1-201'], 'c2.1a02': ['1-202', '1-203', '1-204', '1-205', '1-206', '1-207', '1-208', '1-209', '1-210', '1-211', '1-212', '1-213', '1-214', '1-215', '1-216', '1-217', '1-218', '1-219', @@ -1161,188 +1461,300 @@ def add_citation(self): '1-247', '1-248', '1-249', '1-250', '1-251', '1-252', '1-253', '1-254', '1-255', '1-256', '1-257', '1-208.1', '1-211.1', '1-219.1', '1-222.1', '1-240.1', '1-201', '1-202', '1-203', '1-204', '1-205'], - 'c3.1': ['1-300', '1-301', '1-302', '1-303', '1-304', '1-305', '1-306', '1-307', '1-308', '1-309','1-310', '1-311', '1-312', '1-313'], + 'c3.1': ['1-300', '1-301', '1-302', '1-303', '1-304', '1-305', '1-306', '1-307', '1-308', '1-309', + '1-310', '1-311', '1-312', '1-313'], 'c04': ['1-400', '1-401', '1-402', '1-403', '1-404', '1-405', '1-406', '1-407', '1-408'], - 'c05a01': ['1-500', '1-501', '1-502', '1-503', '1-504', '1-505', '1-506', '1-507', '1-508','1-509'], + 'c05a01': ['1-500', '1-501', '1-502', '1-503', '1-504', '1-505', '1-506', '1-507', '1-508', + '1-509'], 'c05a02': ['1-510', '1-511', '1-512'], 'c06': ['1-600', '1-601', '1-602', '1-603', '1-604', '1-605', '1-606', '1-607', '1-608', '1-609', '1-610']} - title_11 = {'c01':['11-1','11-2','11-3','11-4','11-5','11-8','11-9','11-2.01','11-2.1','11-2.2','11-2.3','11-2.4','11-4.1','11-4.1:1','11-4.2','11-4.3','11-4.4','11-4.5','11-4.6','11-7.1','11-9.1','11-9.8'], - 'c02':['11-10','11-11','11-12','11-13'],'c03':['11-14','11-15','11-16','11-16.1','11-16.2'],'c04':['11-17','11-17.1','11-18','11-4'], - 'c4.1':['11-23.6'],'c05':['11-24'],'c06':['11-30'],'c6.1':['11-34.1','11-34.2','11-34.3','11-34.4'],'c07':['11-35'] - } + title_11 = { + 'c01': ['11-1', '11-2', '11-3', '11-4', '11-5', '11-8', '11-9', '11-2.01', '11-2.1', '11-2.2', '11-2.3', + '11-2.4', '11-4.1', '11-4.1:1', '11-4.2', '11-4.3', '11-4.4', '11-4.5', '11-4.6', '11-7.1', + '11-9.1', '11-9.8'], + 'c02': ['11-10', '11-11', '11-12', '11-13'], 'c03': ['11-14', '11-15', '11-16', '11-16.1', '11-16.2'], + 'c04': ['11-17', '11-17.1', '11-18', '11-4'], + 'c4.1': ['11-23.6'], 'c05': ['11-24'], 'c06': ['11-30'], + 'c6.1': ['11-34.1', '11-34.2', '11-34.3', '11-34.4'], 'c07': ['11-35'] + } - title_4_1 = {'s0Ic01': ['4.1-100', '4.1-100', '4.1-101', '4.1-101.01', '4.1-101.02', '4.1-101.03', '4.1-101.04','4.1-101.05', '4.1-101.06', '4.1-101.07', '4.1-101.08', '4.1-101.09', '4.1-101.010', - '4.1-101.011', '4.1-101.1', '4.1-102', '4.1-103', '4.1-103', '4.1-103.01', '4.1-103.02','4.1-103.03', '4.1-103.03', '4.1-103.1', '4.1-104', '4.1-105', '4.1-106', '4.1-107', '4.1-108', - '4.1-109','4.1-110', '4.1-111', '4.1-111', '4.1-112', '4.1-112.1', '4.1-112.2', '4.1-113', '4.1-113.1','4.1-114', '4.1-114', '4.1-115', '4.1-116', '4.1-117', '4.1-118', '4.1-119', '4.1-119', - '4.1-119', '4.1-119.1', '4.1-120', '4.1-121', '4.1-122', '4.1-123', '4.1-124', '4.1-124','4.1-125', '4.1-126', '4.1-127', '4.1-128', '4.1-129', '4.1-130', '4.1-131', '4.1-132','4.1-132', '4.1-133'], - 's0Ic02a01': 
['4.1-200', '4.1-201', '4.1-201', '4.1-201.1', '4.1-201.1', '4.1-202', '4.1-203', '4.1-203', '4.1-203.1', '4.1-204', '4.1-204', '4.1-204', '4.1-205', '4.1-205'], - 's0Ic02a02': ['4.1-206', '4.1-206.1', '4.1-206.1', '4.1-206.2', '4.1-206.3', '4.1-206.3', '4.1-207','4.1-207.1', '4.1-208', '4.1-209', '4.1-209', '4.1-209.1', '4.1-209.1', '4.1-210', - '4.1-211', '4.1-211', '4.1-212','4.1-212', '4.1-212.1', '4.1-212.1', '4.1-212.1', '4.1-213', '4.1-214', '4.1-215','4.1-215', '4.1-216', '4.1-216', '4.1-216.1', '4.1-217', '4.1-218', '4.1-219', '4.1-220', - '4.1-221', '4.1-221', '4.1-221.1','4.1-221.1', '4.1-222', '4.1-223', '4.1-223', '4.1-224', '4.1-225', '4.1-225.1','4.1-225.1', '4.1-226', '4.1-227', '4.1-227', '4.1-228', '4.1-229'], - 's0Ic02a03': ['4.1-230', '4.1-230', '4.1-231', '4.1-231.1', '4.1-232', '4.1-232', '4.1-233', '4.1-233.1','4.1-234', '4.1-235', '4.1-236', '4.1-237', '4.1-238', '4.1-238', '4.1-239', '4.1-240'], - 's0Ic03a01': ['4.1-300', '4.1-301', '4.1-302', '4.1-302.1', '4.1-302.2', '4.1-303', '4.1-304', '4.1-305','4.1-306', '4.1-307', '4.1-308', '4.1-309', '4.1-309.1', '4.1-310', '4.1-310', '4.1-310.1', - '4.1-310.1', '4.1-311', '4.1-312', '4.1-313', '4.1-314', '4.1-315', '4.1-316', '4.1-317','4.1-318', '4.1-319', '4.1-320', '4.1-321', '4.1-322', '4.1-323'], - 's0Ic03a02': ['4.1-324', '4.1-325', '4.1-325', '4.1-325.01', '4.1-325.1', '4.1-325.1', '4.1-325.2','4.1-325.2', '4.1-326', '4.1-327', '4.1-327', '4.1-328', '4.1-329', '4.1-330', '4.1-331','4.1-332'], - 's0Ic03a03': ['4.1-333', '4.1-334', '4.1-335', '4.1-336', '4.1-337', '4.1-338', '4.1-339', '4.1-345','4.1-346', '4.1-347', '4.1-348', '4.1-349', '4.1-350', '4.1-351', '4.1-352', '4.1-353','4.1-354'], - 's0Ic04': ['4.1-400', '4.1-401', '4.1-402', '4.1-403', '4.1-404', '4.1-405', '4.1-406', '4.1-407','4.1-408', '4.1-409', '4.1-410', '4.1-411', '4.1-412', '4.1-413', '4.1-414', '4.1-415','4.1-416', '4.1-417', '4.1-418'], - 's0Ic05': ['4.1-500', '4.1-501', '4.1-502', '4.1-503', '4.1-504', '4.1-505', '4.1-506', '4.1-507','4.1-508', '4.1-509', '4.1-509.1', '4.1-510', '4.1-511', '4.1-512', '4.1-513', '4.1-514','4.1-515', '4.1-516', '4.1-517'], - 'sIIc06': ['4.1-600', '4.1-601', '4.1-602', '4.1-603', '4.1-604', '4.1-605', '4.1-606', '4.1-607','4.1-608', '4.1-609', '4.1-610', '4.1-611', '4.1-612', '4.1-613', - '4.1-614', '4.1-615', '4.1-616', '4.1-617', '4.1-618', '4.1-619', '4.1-620', '4.1-621','4.1-622', '4.1-623', '4.1-624', '4.1-625', '4.1-626', '4.1-627', '4.1-628'], - 'sIIc11': ['4.1-1100', '4.1-1101', '4.1-1101.1', '4.1-1105.1', '4.1-1107', '4.1-1108', '4.1-1109','4.1-1110', '4.1-1112', '4.1-1120', '4.1-1121'], - 'sIIc13': ['4.1-1302'],'sIIc15': ['4.1-1500', '4.1-1501', '4.1-1502', '4.1-1503']} + title_4_1 = {'s0Ic01': ['4.1-100', '4.1-100', '4.1-101', '4.1-101.01', '4.1-101.02', '4.1-101.03', '4.1-101.04', + '4.1-101.05', '4.1-101.06', '4.1-101.07', '4.1-101.08', '4.1-101.09', '4.1-101.010', + '4.1-101.011', '4.1-101.1', '4.1-102', '4.1-103', '4.1-103', '4.1-103.01', '4.1-103.02', + '4.1-103.03', '4.1-103.03', '4.1-103.1', '4.1-104', '4.1-105', '4.1-106', '4.1-107', + '4.1-108', + '4.1-109', '4.1-110', '4.1-111', '4.1-111', '4.1-112', '4.1-112.1', '4.1-112.2', + '4.1-113', '4.1-113.1', '4.1-114', '4.1-114', '4.1-115', '4.1-116', '4.1-117', + '4.1-118', '4.1-119', '4.1-119', + '4.1-119', '4.1-119.1', '4.1-120', '4.1-121', '4.1-122', '4.1-123', '4.1-124', + '4.1-124', '4.1-125', '4.1-126', '4.1-127', '4.1-128', '4.1-129', '4.1-130', '4.1-131', + '4.1-132', '4.1-132', '4.1-133'], + 's0Ic02a01': ['4.1-200', '4.1-201', '4.1-201', 
'4.1-201.1', '4.1-201.1', '4.1-202', '4.1-203', + '4.1-203', '4.1-203.1', '4.1-204', '4.1-204', '4.1-204', '4.1-205', '4.1-205'], + 's0Ic02a02': ['4.1-206', '4.1-206.1', '4.1-206.1', '4.1-206.2', '4.1-206.3', '4.1-206.3', + '4.1-207', '4.1-207.1', '4.1-208', '4.1-209', '4.1-209', '4.1-209.1', '4.1-209.1', + '4.1-210', + '4.1-211', '4.1-211', '4.1-212', '4.1-212', '4.1-212.1', '4.1-212.1', '4.1-212.1', + '4.1-213', '4.1-214', '4.1-215', '4.1-215', '4.1-216', '4.1-216', '4.1-216.1', + '4.1-217', '4.1-218', '4.1-219', '4.1-220', + '4.1-221', '4.1-221', '4.1-221.1', '4.1-221.1', '4.1-222', '4.1-223', '4.1-223', + '4.1-224', '4.1-225', '4.1-225.1', '4.1-225.1', '4.1-226', '4.1-227', '4.1-227', + '4.1-228', '4.1-229'], + 's0Ic02a03': ['4.1-230', '4.1-230', '4.1-231', '4.1-231.1', '4.1-232', '4.1-232', '4.1-233', + '4.1-233.1', '4.1-234', '4.1-235', '4.1-236', '4.1-237', '4.1-238', '4.1-238', + '4.1-239', '4.1-240'], + 's0Ic03a01': ['4.1-300', '4.1-301', '4.1-302', '4.1-302.1', '4.1-302.2', '4.1-303', '4.1-304', + '4.1-305', '4.1-306', '4.1-307', '4.1-308', '4.1-309', '4.1-309.1', '4.1-310', + '4.1-310', '4.1-310.1', + '4.1-310.1', '4.1-311', '4.1-312', '4.1-313', '4.1-314', '4.1-315', '4.1-316', + '4.1-317', '4.1-318', '4.1-319', '4.1-320', '4.1-321', '4.1-322', '4.1-323'], + 's0Ic03a02': ['4.1-324', '4.1-325', '4.1-325', '4.1-325.01', '4.1-325.1', '4.1-325.1', '4.1-325.2', + '4.1-325.2', '4.1-326', '4.1-327', '4.1-327', '4.1-328', '4.1-329', '4.1-330', + '4.1-331', '4.1-332'], + 's0Ic03a03': ['4.1-333', '4.1-334', '4.1-335', '4.1-336', '4.1-337', '4.1-338', '4.1-339', + '4.1-345', '4.1-346', '4.1-347', '4.1-348', '4.1-349', '4.1-350', '4.1-351', + '4.1-352', '4.1-353', '4.1-354'], + 's0Ic04': ['4.1-400', '4.1-401', '4.1-402', '4.1-403', '4.1-404', '4.1-405', '4.1-406', '4.1-407', + '4.1-408', '4.1-409', '4.1-410', '4.1-411', '4.1-412', '4.1-413', '4.1-414', '4.1-415', + '4.1-416', '4.1-417', '4.1-418'], + 's0Ic05': ['4.1-500', '4.1-501', '4.1-502', '4.1-503', '4.1-504', '4.1-505', '4.1-506', '4.1-507', + '4.1-508', '4.1-509', '4.1-509.1', '4.1-510', '4.1-511', '4.1-512', '4.1-513', + '4.1-514', '4.1-515', '4.1-516', '4.1-517'], + 'sIIc06': ['4.1-600', '4.1-601', '4.1-602', '4.1-603', '4.1-604', '4.1-605', '4.1-606', '4.1-607', + '4.1-608', '4.1-609', '4.1-610', '4.1-611', '4.1-612', '4.1-613', + '4.1-614', '4.1-615', '4.1-616', '4.1-617', '4.1-618', '4.1-619', '4.1-620', '4.1-621', + '4.1-622', '4.1-623', '4.1-624', '4.1-625', '4.1-626', '4.1-627', '4.1-628'], + 'sIIc11': ['4.1-1100', '4.1-1101', '4.1-1101.1', '4.1-1105.1', '4.1-1107', '4.1-1108', '4.1-1109', + '4.1-1110', '4.1-1112', '4.1-1120', '4.1-1121'], + 'sIIc13': ['4.1-1302'], 'sIIc15': ['4.1-1500', '4.1-1501', '4.1-1502', '4.1-1503']} title_6_2 = {'s0Ic01a01': ['6.2-100'], - 's0Ic01a02': ['6.2-101', '6.2-101.1', '6.2-102', '6.2-103', '6.2-104', '6.2-105', '6.2-106', '6.2-107'], - 's0Ic02a01': ['6.2-200', '6.2-201'], 's0Ic02a02': ['6.2-202', '6.2-203', '6.2-204'], - 's0Ic03a01': ['6.2-300'], 's0Ic03a02': ['6.2-301', '6.2-302', '6.2-303'], - 's0Ic03a03': ['6.2-304', '6.2-305', '6.2-306', '6.2-307', '6.2-308'], - 's0Ic03a04': ['6.2-309', '6.2-310', '6.2-311', '6.2-312', '6.2-313', '6.2-314', '6.2-315', '6.2-316', - '6.2-317', '6.2-318', '6.2-319', '6.2-320', '6.2-321', '6.2-322', '6.2-323', - '6.2-324', '6.2-325', '6.2-326', '6.2-327', '6.2-328', '6.2-329', ], - - 's0Ic04a01': ['6.2-400', '6.2-401', '6.2-402', '6.2-403', '6.2-404', '6.2-405'], - 's0Ic04a02': ['6.2-406', '6.2-407', '6.2-408', '6.2-409', '6.2-410', '6.2-411', '6.2-412', 
'6.2-413', - '6.2-414', '6.2-415', '6.2-416', '6.2-417', '6.2-418', '6.2-419', '6.2-420', '6.2-421', - '6.2-422', '6.2-423'], - 's0Ic04a03': ['6.2-424', '6.2-425', '6.2-426', '6.2-427', '6.2-428', '6.2-429', '6.2-430', '6.2-431', - '6.2-432'], - 's0Ic04a04': ['6.2-433', '6.2-434', '6.2-435'], - 's0Ic04a05': ['6.2-436', '6.2-437'], - 's0Ic05': ['6.2-500', '6.2-501', '6.2-502', '6.2-503', '6.2-504', '6.2-505', '6.2-506', '6.2-507', - '6.2-508', '6.2-509', '6.2-510', '6.2-511', '6.2-512', '6.2-513'], - 'sIIc06a01': ['6.2-600', '6.2-601', '6.2-602', '6.2-603', '6.2-603.1'], - 'sIIc06a02': ['6.2-604', '6.2-605', '6.2-606', '6.2-607', '6.2-608', '6.2-609', '6.2-610', '6.2-611', - '6.2-612', '6.2-613', '6.2-614', '6.2-615', '6.2-616', '6.2-617', '6.2-618', '6.2-619', - '6.2-620'], - 'sIIc07': ['6.2-700', '6.2-701', '6.2-702', '6.2-703', '6.2-704', '6.2-705', '6.2-706', '6.2-707', - '6.2-708', '6.2-709', '6.2-710', '6.2-711', '6.2-712', '6.2-713', '6.2-714', '6.2-715'], - - 'sIIc08a01': ['6.2-800', '6.2-801', '6.2-802', '6.2-803', '6.2-804', '6.2-805', '6.2-806', '6.2-807'], - 'sIIc08a02': ['6.2-808', '6.2-809', '6.2-810', '6.2-811', '6.2-812', '6.2-813', '6.2-814', '6.2-815', - '6.2-816', '6.2-817', '6.2-818'], - 'sIIc08a03': ['6.2-819', '6.2-820', '6.2-821'], - 'sIIc08a04': ['6.2-822', '6.2-823', '6.2-824', '6.2-825', '6.2-826', '6.2-827', '6.2-828', '6.2-829', - '6.2-830'], - 'sIIc08a05': ['6.2-831', '6.2-832', '6.2-833', '6.2-834', '6.2-835'], - 'sIIc08a06': ['6.2-836', '6.2-837', '6.2-838', '6.2-839', '6.2-840', '6.2-841', '6.2-842', '6.2-843', - '6.2-844', '6.2-845', '6.2-846', '6.2-847', '6.2-848'], - 'sIIc08a07': ['6.2-849', '6.2-850', '6.2-851', '6.2-852', '6.2-853', '6.2-854', '6.2-855', '6.2-856', - '6.2-857', '6.2-858', '6.2-859', ], - 'sIIc08a08': ['6.2-860', '6.2-861', '6.2-862', '6.2-863', '6.2-864', '6.2-865', '6.2-866', '6.2-867', - '6.2-868', '6.2-869'], - 'sIIc08a09': ['6.2-870', '6.2-871', '6.2-872', '6.2-873', '6.2-874', '6.2-875', '6.2-876', '6.2-877', - '6.2-878', '6.2-879', '6.2-880', '6.2-881', '6.2-882', '6.2-883', - '6.2-884', '6.2-885', '6.2-886', '6.2-887', '6.2-888'], - 'sIIc08a10': ['6.2-889', '6.2-890', '6.2-891', '6.2-892'], - 'sIIc08a11':['6.2 - 893','6.2 - 894','6.2 - 895','6.2 - 896','6.2 - 897'], - 'sIIc08a12':['6.2 - 898','6.2 - 899','6.2 - 900','6.2 - 901','6.2 - 902','6.2 - 903','6.2 - 904','6.2 - 905','6.2 - 906','6.2 - 907','6.2 - 908','6.2 - 909','6.2 - 910','6.2 - 911'], - 'sIIc08a13':['6.2 - 912','6.2 - 913','6.2 - 914','6.2 - 915','6.2 - 916','6.2 - 917','6.2 - 918','6.2 - 919','6.2 - 920','6.2 - 921','6.2 - 922','6.2 - 923','6.2 - 924'], - 'sIIc08a14':['6.2 - 925','6.2 - 926','6.2 - 927','6.2 - 928','6.2 - 929','6.2 - 930','6.2 - 931','6.2 - 932','6.2 - 933','6.2 - 934','6.2 - 935','6.2 - 936','6.2 - 937'], - 'sIIc08a15':['6.2 - 938','6.2 - 939','6.2 - 940','6.2 - 941','6.2 - 942','6.2 - 943','6.2 - 944','6.2 - 945','6.2 - 946'], - 'sIIc08a16':['6.2 - 947','6.2 - 948','6.2 - 949','6.2 - 950'],'sIIc08a17':['6.2 - 951','6.2 - 952','6.2 - 953'], - 'sIIc10a01':['6.2-1000', '6.2-1001', '6.2-1002', '6.2-1003','6.2-1004', '6.2-1005', '6.2-1006', '6.2-1007','6.2-1008', '6.2-1009', '6.2-1010', '6.2-1011', - '6.2-1012'],'sIIc10a02': ['6.2-1013', '6.2-1014','6.2-1015', '6.2-1016','6.2-1017', '6.2-1018','6.2-1019', '6.2-1020','6.2-1021', '6.2-1022','6.2-1023', '6.2-1024', - '6.2-1025', '6.2-1026','6.2-1027','6.2-1028', '6.2-1029','6.2-1030', '6.2-1031','6.2-1032', '6.2-1033','6.2-1034', '6.2-1035','6.2-1036', '6.2-1037', - '6.2-1038', '6.2-1039','6.2-1040', 
'6.2-1041','6.2-1042', '6.2-1043','6.2-1044', '6.2-1045', '6.2-1046'], - 'sIIc10a03': [ '6.2-1047', '6.2-1048', '6.2-1049', '6.2-1050', '6.2-1051', '6.2-1052', '6.2-1053', '6.2-1054', '6.2-1055', - '6.2-1056', '6.2-1057', '6.2-1058', '6.2-1059', '6.2-1060', '6.2-1061', '6.2-1062', '6.2-1063', '6.2-1064'], - 'sIIc10a04': ['6.2-1065', '6.2-1066', '6.2-1067', '6.2-1068', '6.2-1069', '6.2-1070', '6.2-1071', '6.2-1072', - '6.2-1073'], - 'sIIc10a05': ['6.2-1074', '6.2-1075', '6.2-1076', '6.2-1077', '6.2-1078', '6.2-1079', '6.2-1080'], - 'sIIc10a06': ['6.2-1081', '6.2-1082', '6.2-1083', '6.2-1084', '6.2-1085', '6.2-1086', '6.2-1087', '6.2-1088', - '6.2-1089', '6.2-1090', '6.2-1091', '6.2-1092', '6.2-1093', - '6.2-1094', '6.2-1095', '6.2-1096', '6.2-1097', '6.2-1098', '6.2-1099'], - - 'sIIc11a01': ['6.2-1100', '6.2-1101', '6.2-1102', '6.2-1103', '6.2-1104', '6.2-1105', '6.2-1106', '6.2-1107', - '6.2-1108', '6.2-1109', '6.2-1110', '6.2-1111', '6.2-1112', '6.2-1113'], - 'sIIc11a02': ['6.2-1114', '6.2-1115', '6.2-1116', '6.2-1117', '6.2-1118', '6.2-1119', '6.2-1120', '6.2-1121', - '6.2-1122', '6.2-1123', '6.2-1124', '6.2-1125', '6.2-1126', '6.2-1127', '6.2-1128', '6.2-1129', - '6.2-1130', '6.2-1131', '6.2-1132', - ], - 'sIIc11a03': ['6.2-1133', '6.2-1134', '6.2-1135', '6.2-1136', '6.2-1137', '6.2-1138'], - 'sIIc11a04': ['6.2-1139', '6.2-1140', '6.2-1141', '6.2-1142', '6.2-1143', '6.2-1144', '6.2-1145', '6.2-1146', - '6.2-1147'], - 'sIIc11a05': ['6.2-1148', '6.2-1149', '6.2-1150', '6.2-1151', '6.2-1152', '6.2-1153', '6.2-1154', '6.2-1155', - '6.2-1156', '6.2-1157', '6.2-1158', '6.2-1159', '6.2-1160', '6.2-1161', '6.2-1162', '6.2-1163', - '6.2-1164', '6.2-1165'], - 'sIIc11a06': ['6.2-1166', '6.2-1167', '6.2-1168', '6.2-1169', '6.2-1170', '6.2-1171', '6.2-1172', '6.2-1173', - '6.2-1174', '6.2-1175', '6.2-1176', '6.2-1177', '6.2-1178'], - 'sIIc11a07': ['6.2-1179', '6.2-1180', '6.2-1181', '6.2-1182', '6.2-1183', '6.2-1184', '6.2-1185'], - 'sIIc11a08': ['6.2-1186', '6.2-1187', '6.2-1188', '6.2-1189', '6.2-1190'], - 'sIIc11a09': ['6.2-1191', '6.2-1192', '6.2-1193', '6.2-1194', '6.2-1195', '6.2-1196', '6.2-1197', '6.2-1198', - '6.2-1199', '6.2-1200', '6.2-1201', '6.2-1202', '6.2-1203', '6.2-1204', '6.2-1205'], - 'sIIc13a01': ['6.2-1300', '6.2-1301', '6.2-1302', '6.2-1303', '6.2-1304', '6.2-1305', '6.2-1306', '6.2-1307'], - 'sIIc13a02': ['6.2-1308', '6.2-1309', '6.2-1310', '6.2-1311', '6.2-1312', '6.2-1313', '6.2-1314', '6.2-1315', - '6.2-1316', '6.2-1317', '6.2-1318', '6.2-1319'], - 'sIIc13a03': ['6.2-1320', '6.2-1321', '6.2-1322', '6.2-1323', '6.2-1324', '6.2-1325', '6.2-1326'], - 'sIIc13a04': ['6.2-1327', '6.2-1328', '6.2-1329', '6.2-1330'], - 'sIIc13a05': ['6.2-1331', '6.2-1332', '6.2-1333', '6.2-1334', '6.2-1335', '6.2-1336', '6.2-1337', '6.2-1338', - '6.2-1339', '6.2-1340', '6.2-1341', '6.2-1342', '6.2-1343'], - 'sIIc13a06': ['6.2-1344', '6.2-1345', '6.2-1346', '6.2-1347', '6.2-1347.1'], - 'sIIc13a07': ['6.2-1348', '6.2-1349', '6.2-1350', '6.2-1351', '6.2-1352', '6.2-1353', '6.2-1354', '6.2-1355', - '6.2-1356', '6.2-1357'], - 'sIIc13a08': ['6.2-1358', '6.2-1359', '6.2-1360', '6.2-1361', '6.2-1362', '6.2-1363', '6.2-1364', '6.2-1365', - '6.2-1366', '6.2-1367', '6.2-1368', '6.2-1369'], - 'sIIc13a09': ['6.2-1370', '6.2-1371', '6.2-1372', '6.2-1373', '6.2-1374', '6.2-1375', '6.2-1376'], - 'sIIc13a010': ['6.2-1377', '6.2-1378'], - 'sIIc13a011': ['6.2-1379', '6.2-1380'], - 'sIIIc14': ['6.2-1400', '6.2-1401', '6.2-1402', '6.2-1403', '6.2-1404', '6.2-1405', '6.2-1406', '6.2-1407', - '6.2-1408', '6.2-1409', 
'6.2-1410', '6.2-1411', '6.2-1412', - '6.2-1413', '6.2-1414', '6.2-1415', '6.2-1416', '6.2-1417', '6.2-1418', '6.2-1419', '6.2-1420', - '6.2-1421'], - 'sIIIc15': ['6.2-1500', '6.2-1501', '6.2-1502', '6.2-1503', '6.2-1504', '6.2-1505', '6.2-1506', '6.2-1507', - '6.2-1508', '6.2-1508.1', '6.2-1509', '6.2-1510', '6.2-1511', '6.2-1512', '6.2-1513', '6.2-1514', - '6.2-1515', '6.2-1516', '6.2-1517', '6.2-1518', '6.2-1519', '6.2-1520', '6.2-1521', '6.2-1522', - '6.2-1523', '6.2-1523.1', '6.2-1523.2', '6.2-1523.3', '6.2-1524', '6.2-1525', '6.2-1526', - '6.2-1527', '6.2-1528', '6.2-1529', '6.2-1530', - '6.2-1531', '6.2-1532', '6.2-1533', '6.2-1534', '6.2-1535', '6.2-1536', '6.2-1537', '6.2-1538', - '6.2-1539', '6.2-1540', '6.2-1541', '6.2-1542', '6.2-1543'], - 'sIIIc16': ['6.2-1600', '6.2-1601', '6.2-1602', '6.2-1603', '6.2-1604', '6.2-1605', '6.2-1606', '6.2-1607', - '6.2-1608', '6.2-1609', '6.2-1610', '6.2-1611', '6.2-1612', '6.2-1613', '6.2-1614', '6.2-1615', - '6.2-1616', '6.2-1617', '6.2-1618', '6.2-1619', '6.2-1620', '6.2-1621', '6.2-1622', '6.2-1623', - '6.2-1624', '6.2-1625', '6.2-1626', '6.2-1627', '6.2-1628', '6.2-1629'], - 'sIIIc17': ['6.2-1700', '6.2-1701', '6.2-1702', '6.2-1703', '6.2-1704', '6.2-1705', '6.2-1706', '6.2-1707', - '6.2-1708', '6.2-1709', '6.2-1710', '6.2-1711', '6.2-1712', '6.2-1713', '6.2-1714', - '6.2-1715', '6.2-1716', '6.2-1717', '6.2-1718', '6.2-1719', '6.2-1720', '6.2-1721', '6.2-1701.1', - '6.2-1701.2', '6.2-1701.3', '6.2-1712.1'], - 'sIIIc18': ['6.2-1800', '6.2-1801', '6.2-1802', '6.2-1803', '6.2-1804', '6.2-1805', '6.2-1806', '6.2-1807', - '6.2-1808', '6.2-1809', '6.2-1810', '6.2-1811', '6.2-1812', '6.2-1813', '6.2-1814', '6.2-1815', - '6.2-1816', - '6.2-1817', '6.2-1818', '6.2-1819', '6.2-1820', '6.2-1821', '6.2-1822', '6.2-1823', '6.2-1824', - '6.2-1825', '6.2-1826', '6.2-1827', '6.2-1828', '6.2-1829', '6.2-1816.1', '6.2-1817.1', - '6.2-1818.1', '6.2-1818.2', - '6.2-1818.3', '6.2-1818.4'], - 'sIIIc19': ['6.2-1900', '6.2-1901', '6.2-1902', '6.2-1903', '6.2-1904', '6.2-1905', '6.2-1906', '6.2-1907', - '6.2-1908', '6.2-1909', '6.2-1910', '6.2-1911', '6.2-1912', - '6.2-1913', '6.2-1914', '6.2-1915', '6.2-1916', '6.2-1917', '6.2-1918', '6.2-1919', '6.2-1920', - '6.2-1921', '6.2-1904.1', '6.2-1906.1'], - 'sIIIc20': ['6.2-2000', '6.2-2001', '6.2-2002', '6.2-2003', '6.2-2004', '6.2-2005', '6.2-2006', '6.2-2007', - '6.2-2008', '6.2-2009', '6.2-2010', '6.2-2011', '6.2-2012', '6.2-2013', '6.2-2014', '6.2-2015', - '6.2-2016', - '6.2-2017', '6.2-2018', '6.2-2019', '6.2-2020', '6.2-2021', '6.2-2022', '6.2-2023', '6.2-2024', - '6.2-2025'], - 'sIIIc20.1': ['6.2-2026', '6.2-2027', '6.2-2028', '6.2-2029', '6.2-2030', '6.2-2031', '6.2-2032', '6.2-2033', - '6.2-2034', '6.2-2035', '6.2-2036', '6.2-2037', '6.2-2038', '6.2-2039', '6.2-2040', '6.2-2041', - '6.2-2042', - '6.2-2043', '6.2-2044', '6.2-2045', '6.2-2046', '6.2-2047', '6.2-2048', '6.2-2049', '6.2-2050'], - - 'sIIIc21': ['6.2-2100', '6.2-2101', '6.2-2102', '6.2-2103', '6.2-2104', '6.2-2105', '6.2-2106', '6.2-2107', - '6.2-2108', '6.2-2109', '6.2-2110', '6.2-2111', '6.2-207.1'], - 'sIIIc22': ['6.2-2200', '6.2-2201', '6.2-2202', '6.2-2203', '6.2-2204', '6.2-2205', '6.2-2206', '6.2-2207', - '6.2-2208', '6.2-2209', '6.2-2210', '6.2-2211', '6.2-2212', '6.2-2213', '6.2-2214', '6.2-2215', - '6.2-2216', '6.2-2217', '6.2-2218', '6.2-2219', '6.2-2220', '6.2-2221', '6.2-2222', '6.2-2223', - '6.2-2224', '6.2-2225', '6.2-2226', '6.2-2227', '6.2-2215.1', '6.2-2216.1', '6.2-2216.2', - '6.2-2216.3', '6.2-2216.4', '6.2-2218.1' - ], - 
'sIIIc23': ['6.2-2300', '6.2-2301', '6.2-2302', '6.2-2303', '6.2-2304', '6.2-2305', '6.2-2306', '6.2-2307', - '6.2-2308', '6.2-2309', '6.2-2310', '6.2-2311', '6.2-2312', '6.2-2313', '6.2-2314'], - 'sIIIc24': ['6.2-2400', '6.2-2401', '6.2-2402'], - 'sIIIc25': ['6.2-2500', '6.2-2501', '6.2-2502', '6.2-2503', '6.2-2504', '6.2-2505'], - 'sIIIc26': ['6.2-2600', '6.2-2601', '6.2-2602', '6.2-2603', '6.2-2604', '6.2-2605', '6.2-2606', '6.2-2607', - '6.2-2608', '6.2-2609', '6.2-2610', '6.2-2611', '6.2-2612', '6.2-2613', '6.2-2614', '6.2-2615', - '6.2-2616','6.2-2617', '6.2-2618', '6.2-2619', '6.2-2620', '6.2-2621', '6.2-2622'],} + 's0Ic01a02': ['6.2-101', '6.2-101.1', '6.2-102', '6.2-103', '6.2-104', '6.2-105', '6.2-106', + '6.2-107'], + 's0Ic02a01': ['6.2-200', '6.2-201'], 's0Ic02a02': ['6.2-202', '6.2-203', '6.2-204'], + 's0Ic03a01': ['6.2-300'], 's0Ic03a02': ['6.2-301', '6.2-302', '6.2-303'], + 's0Ic03a03': ['6.2-304', '6.2-305', '6.2-306', '6.2-307', '6.2-308'], + 's0Ic03a04': ['6.2-309', '6.2-310', '6.2-311', '6.2-312', '6.2-313', '6.2-314', '6.2-315', + '6.2-316', + '6.2-317', '6.2-318', '6.2-319', '6.2-320', '6.2-321', '6.2-322', '6.2-323', + '6.2-324', '6.2-325', '6.2-326', '6.2-327', '6.2-328', '6.2-329', ], + + 's0Ic04a01': ['6.2-400', '6.2-401', '6.2-402', '6.2-403', '6.2-404', '6.2-405'], + 's0Ic04a02': ['6.2-406', '6.2-407', '6.2-408', '6.2-409', '6.2-410', '6.2-411', '6.2-412', + '6.2-413', + '6.2-414', '6.2-415', '6.2-416', '6.2-417', '6.2-418', '6.2-419', '6.2-420', + '6.2-421', + '6.2-422', '6.2-423'], + 's0Ic04a03': ['6.2-424', '6.2-425', '6.2-426', '6.2-427', '6.2-428', '6.2-429', '6.2-430', + '6.2-431', + '6.2-432'], + 's0Ic04a04': ['6.2-433', '6.2-434', '6.2-435'], + 's0Ic04a05': ['6.2-436', '6.2-437'], + 's0Ic05': ['6.2-500', '6.2-501', '6.2-502', '6.2-503', '6.2-504', '6.2-505', '6.2-506', '6.2-507', + '6.2-508', '6.2-509', '6.2-510', '6.2-511', '6.2-512', '6.2-513'], + 'sIIc06a01': ['6.2-600', '6.2-601', '6.2-602', '6.2-603', '6.2-603.1'], + 'sIIc06a02': ['6.2-604', '6.2-605', '6.2-606', '6.2-607', '6.2-608', '6.2-609', '6.2-610', + '6.2-611', + '6.2-612', '6.2-613', '6.2-614', '6.2-615', '6.2-616', '6.2-617', '6.2-618', + '6.2-619', + '6.2-620'], + 'sIIc07': ['6.2-700', '6.2-701', '6.2-702', '6.2-703', '6.2-704', '6.2-705', '6.2-706', '6.2-707', + '6.2-708', '6.2-709', '6.2-710', '6.2-711', '6.2-712', '6.2-713', '6.2-714', '6.2-715'], + + 'sIIc08a01': ['6.2-800', '6.2-801', '6.2-802', '6.2-803', '6.2-804', '6.2-805', '6.2-806', + '6.2-807'], + 'sIIc08a02': ['6.2-808', '6.2-809', '6.2-810', '6.2-811', '6.2-812', '6.2-813', '6.2-814', + '6.2-815', + '6.2-816', '6.2-817', '6.2-818'], + 'sIIc08a03': ['6.2-819', '6.2-820', '6.2-821'], + 'sIIc08a04': ['6.2-822', '6.2-823', '6.2-824', '6.2-825', '6.2-826', '6.2-827', '6.2-828', + '6.2-829', + '6.2-830'], + 'sIIc08a05': ['6.2-831', '6.2-832', '6.2-833', '6.2-834', '6.2-835'], + 'sIIc08a06': ['6.2-836', '6.2-837', '6.2-838', '6.2-839', '6.2-840', '6.2-841', '6.2-842', + '6.2-843', + '6.2-844', '6.2-845', '6.2-846', '6.2-847', '6.2-848'], + 'sIIc08a07': ['6.2-849', '6.2-850', '6.2-851', '6.2-852', '6.2-853', '6.2-854', '6.2-855', + '6.2-856', + '6.2-857', '6.2-858', '6.2-859', ], + 'sIIc08a08': ['6.2-860', '6.2-861', '6.2-862', '6.2-863', '6.2-864', '6.2-865', '6.2-866', + '6.2-867', + '6.2-868', '6.2-869'], + 'sIIc08a09': ['6.2-870', '6.2-871', '6.2-872', '6.2-873', '6.2-874', '6.2-875', '6.2-876', + '6.2-877', + '6.2-878', '6.2-879', '6.2-880', '6.2-881', '6.2-882', '6.2-883', + '6.2-884', '6.2-885', '6.2-886', '6.2-887', 
'6.2-888'], + 'sIIc08a10': ['6.2-889', '6.2-890', '6.2-891', '6.2-892'], + 'sIIc08a11': ['6.2 - 893', '6.2 - 894', '6.2 - 895', '6.2 - 896', '6.2 - 897'], + 'sIIc08a12': ['6.2 - 898', '6.2 - 899', '6.2 - 900', '6.2 - 901', '6.2 - 902', '6.2 - 903', + '6.2 - 904', '6.2 - 905', '6.2 - 906', '6.2 - 907', '6.2 - 908', '6.2 - 909', + '6.2 - 910', '6.2 - 911'], + 'sIIc08a13': ['6.2 - 912', '6.2 - 913', '6.2 - 914', '6.2 - 915', '6.2 - 916', '6.2 - 917', + '6.2 - 918', '6.2 - 919', '6.2 - 920', '6.2 - 921', '6.2 - 922', '6.2 - 923', + '6.2 - 924'], + 'sIIc08a14': ['6.2 - 925', '6.2 - 926', '6.2 - 927', '6.2 - 928', '6.2 - 929', '6.2 - 930', + '6.2 - 931', '6.2 - 932', '6.2 - 933', '6.2 - 934', '6.2 - 935', '6.2 - 936', + '6.2 - 937'], + 'sIIc08a15': ['6.2 - 938', '6.2 - 939', '6.2 - 940', '6.2 - 941', '6.2 - 942', '6.2 - 943', + '6.2 - 944', '6.2 - 945', '6.2 - 946'], + 'sIIc08a16': ['6.2 - 947', '6.2 - 948', '6.2 - 949', '6.2 - 950'], + 'sIIc08a17': ['6.2 - 951', '6.2 - 952', '6.2 - 953'], + 'sIIc10a01': ['6.2-1000', '6.2-1001', '6.2-1002', '6.2-1003', '6.2-1004', '6.2-1005', '6.2-1006', + '6.2-1007', '6.2-1008', '6.2-1009', '6.2-1010', '6.2-1011', + '6.2-1012'], + 'sIIc10a02': ['6.2-1013', '6.2-1014', '6.2-1015', '6.2-1016', '6.2-1017', '6.2-1018', '6.2-1019', + '6.2-1020', '6.2-1021', '6.2-1022', '6.2-1023', '6.2-1024', + '6.2-1025', '6.2-1026', '6.2-1027', '6.2-1028', '6.2-1029', '6.2-1030', '6.2-1031', + '6.2-1032', '6.2-1033', '6.2-1034', '6.2-1035', '6.2-1036', '6.2-1037', + '6.2-1038', '6.2-1039', '6.2-1040', '6.2-1041', '6.2-1042', '6.2-1043', '6.2-1044', + '6.2-1045', '6.2-1046'], + 'sIIc10a03': ['6.2-1047', '6.2-1048', '6.2-1049', '6.2-1050', '6.2-1051', '6.2-1052', '6.2-1053', + '6.2-1054', '6.2-1055', + '6.2-1056', '6.2-1057', '6.2-1058', '6.2-1059', '6.2-1060', '6.2-1061', '6.2-1062', + '6.2-1063', '6.2-1064'], + 'sIIc10a04': ['6.2-1065', '6.2-1066', '6.2-1067', '6.2-1068', '6.2-1069', '6.2-1070', '6.2-1071', + '6.2-1072', + '6.2-1073'], + 'sIIc10a05': ['6.2-1074', '6.2-1075', '6.2-1076', '6.2-1077', '6.2-1078', '6.2-1079', '6.2-1080'], + 'sIIc10a06': ['6.2-1081', '6.2-1082', '6.2-1083', '6.2-1084', '6.2-1085', '6.2-1086', '6.2-1087', + '6.2-1088', + '6.2-1089', '6.2-1090', '6.2-1091', '6.2-1092', '6.2-1093', + '6.2-1094', '6.2-1095', '6.2-1096', '6.2-1097', '6.2-1098', '6.2-1099'], + + 'sIIc11a01': ['6.2-1100', '6.2-1101', '6.2-1102', '6.2-1103', '6.2-1104', '6.2-1105', '6.2-1106', + '6.2-1107', + '6.2-1108', '6.2-1109', '6.2-1110', '6.2-1111', '6.2-1112', '6.2-1113'], + 'sIIc11a02': ['6.2-1114', '6.2-1115', '6.2-1116', '6.2-1117', '6.2-1118', '6.2-1119', '6.2-1120', + '6.2-1121', + '6.2-1122', '6.2-1123', '6.2-1124', '6.2-1125', '6.2-1126', '6.2-1127', '6.2-1128', + '6.2-1129', + '6.2-1130', '6.2-1131', '6.2-1132', + ], + 'sIIc11a03': ['6.2-1133', '6.2-1134', '6.2-1135', '6.2-1136', '6.2-1137', '6.2-1138'], + 'sIIc11a04': ['6.2-1139', '6.2-1140', '6.2-1141', '6.2-1142', '6.2-1143', '6.2-1144', '6.2-1145', + '6.2-1146', + '6.2-1147'], + 'sIIc11a05': ['6.2-1148', '6.2-1149', '6.2-1150', '6.2-1151', '6.2-1152', '6.2-1153', '6.2-1154', + '6.2-1155', + '6.2-1156', '6.2-1157', '6.2-1158', '6.2-1159', '6.2-1160', '6.2-1161', '6.2-1162', + '6.2-1163', + '6.2-1164', '6.2-1165'], + 'sIIc11a06': ['6.2-1166', '6.2-1167', '6.2-1168', '6.2-1169', '6.2-1170', '6.2-1171', '6.2-1172', + '6.2-1173', + '6.2-1174', '6.2-1175', '6.2-1176', '6.2-1177', '6.2-1178'], + 'sIIc11a07': ['6.2-1179', '6.2-1180', '6.2-1181', '6.2-1182', '6.2-1183', '6.2-1184', '6.2-1185'], + 'sIIc11a08': ['6.2-1186', 
'6.2-1187', '6.2-1188', '6.2-1189', '6.2-1190'], + 'sIIc11a09': ['6.2-1191', '6.2-1192', '6.2-1193', '6.2-1194', '6.2-1195', '6.2-1196', '6.2-1197', + '6.2-1198', + '6.2-1199', '6.2-1200', '6.2-1201', '6.2-1202', '6.2-1203', '6.2-1204', '6.2-1205'], + 'sIIc13a01': ['6.2-1300', '6.2-1301', '6.2-1302', '6.2-1303', '6.2-1304', '6.2-1305', '6.2-1306', + '6.2-1307'], + 'sIIc13a02': ['6.2-1308', '6.2-1309', '6.2-1310', '6.2-1311', '6.2-1312', '6.2-1313', '6.2-1314', + '6.2-1315', + '6.2-1316', '6.2-1317', '6.2-1318', '6.2-1319'], + 'sIIc13a03': ['6.2-1320', '6.2-1321', '6.2-1322', '6.2-1323', '6.2-1324', '6.2-1325', '6.2-1326'], + 'sIIc13a04': ['6.2-1327', '6.2-1328', '6.2-1329', '6.2-1330'], + 'sIIc13a05': ['6.2-1331', '6.2-1332', '6.2-1333', '6.2-1334', '6.2-1335', '6.2-1336', '6.2-1337', + '6.2-1338', + '6.2-1339', '6.2-1340', '6.2-1341', '6.2-1342', '6.2-1343'], + 'sIIc13a06': ['6.2-1344', '6.2-1345', '6.2-1346', '6.2-1347', '6.2-1347.1'], + 'sIIc13a07': ['6.2-1348', '6.2-1349', '6.2-1350', '6.2-1351', '6.2-1352', '6.2-1353', '6.2-1354', + '6.2-1355', + '6.2-1356', '6.2-1357'], + 'sIIc13a08': ['6.2-1358', '6.2-1359', '6.2-1360', '6.2-1361', '6.2-1362', '6.2-1363', '6.2-1364', + '6.2-1365', + '6.2-1366', '6.2-1367', '6.2-1368', '6.2-1369'], + 'sIIc13a09': ['6.2-1370', '6.2-1371', '6.2-1372', '6.2-1373', '6.2-1374', '6.2-1375', '6.2-1376'], + 'sIIc13a010': ['6.2-1377', '6.2-1378'], + 'sIIc13a011': ['6.2-1379', '6.2-1380'], + 'sIIIc14': ['6.2-1400', '6.2-1401', '6.2-1402', '6.2-1403', '6.2-1404', '6.2-1405', '6.2-1406', + '6.2-1407', + '6.2-1408', '6.2-1409', '6.2-1410', '6.2-1411', '6.2-1412', + '6.2-1413', '6.2-1414', '6.2-1415', '6.2-1416', '6.2-1417', '6.2-1418', '6.2-1419', + '6.2-1420', + '6.2-1421'], + 'sIIIc15': ['6.2-1500', '6.2-1501', '6.2-1502', '6.2-1503', '6.2-1504', '6.2-1505', '6.2-1506', + '6.2-1507', + '6.2-1508', '6.2-1508.1', '6.2-1509', '6.2-1510', '6.2-1511', '6.2-1512', '6.2-1513', + '6.2-1514', + '6.2-1515', '6.2-1516', '6.2-1517', '6.2-1518', '6.2-1519', '6.2-1520', '6.2-1521', + '6.2-1522', + '6.2-1523', '6.2-1523.1', '6.2-1523.2', '6.2-1523.3', '6.2-1524', '6.2-1525', + '6.2-1526', + '6.2-1527', '6.2-1528', '6.2-1529', '6.2-1530', + '6.2-1531', '6.2-1532', '6.2-1533', '6.2-1534', '6.2-1535', '6.2-1536', '6.2-1537', + '6.2-1538', + '6.2-1539', '6.2-1540', '6.2-1541', '6.2-1542', '6.2-1543'], + 'sIIIc16': ['6.2-1600', '6.2-1601', '6.2-1602', '6.2-1603', '6.2-1604', '6.2-1605', '6.2-1606', + '6.2-1607', + '6.2-1608', '6.2-1609', '6.2-1610', '6.2-1611', '6.2-1612', '6.2-1613', '6.2-1614', + '6.2-1615', + '6.2-1616', '6.2-1617', '6.2-1618', '6.2-1619', '6.2-1620', '6.2-1621', '6.2-1622', + '6.2-1623', + '6.2-1624', '6.2-1625', '6.2-1626', '6.2-1627', '6.2-1628', '6.2-1629'], + 'sIIIc17': ['6.2-1700', '6.2-1701', '6.2-1702', '6.2-1703', '6.2-1704', '6.2-1705', '6.2-1706', + '6.2-1707', + '6.2-1708', '6.2-1709', '6.2-1710', '6.2-1711', '6.2-1712', '6.2-1713', '6.2-1714', + '6.2-1715', '6.2-1716', '6.2-1717', '6.2-1718', '6.2-1719', '6.2-1720', '6.2-1721', + '6.2-1701.1', + '6.2-1701.2', '6.2-1701.3', '6.2-1712.1'], + 'sIIIc18': ['6.2-1800', '6.2-1801', '6.2-1802', '6.2-1803', '6.2-1804', '6.2-1805', '6.2-1806', + '6.2-1807', + '6.2-1808', '6.2-1809', '6.2-1810', '6.2-1811', '6.2-1812', '6.2-1813', '6.2-1814', + '6.2-1815', + '6.2-1816', + '6.2-1817', '6.2-1818', '6.2-1819', '6.2-1820', '6.2-1821', '6.2-1822', '6.2-1823', + '6.2-1824', + '6.2-1825', '6.2-1826', '6.2-1827', '6.2-1828', '6.2-1829', '6.2-1816.1', '6.2-1817.1', + '6.2-1818.1', '6.2-1818.2', + '6.2-1818.3', 
'6.2-1818.4'], + 'sIIIc19': ['6.2-1900', '6.2-1901', '6.2-1902', '6.2-1903', '6.2-1904', '6.2-1905', '6.2-1906', + '6.2-1907', + '6.2-1908', '6.2-1909', '6.2-1910', '6.2-1911', '6.2-1912', + '6.2-1913', '6.2-1914', '6.2-1915', '6.2-1916', '6.2-1917', '6.2-1918', '6.2-1919', + '6.2-1920', + '6.2-1921', '6.2-1904.1', '6.2-1906.1'], + 'sIIIc20': ['6.2-2000', '6.2-2001', '6.2-2002', '6.2-2003', '6.2-2004', '6.2-2005', '6.2-2006', + '6.2-2007', + '6.2-2008', '6.2-2009', '6.2-2010', '6.2-2011', '6.2-2012', '6.2-2013', '6.2-2014', + '6.2-2015', + '6.2-2016', + '6.2-2017', '6.2-2018', '6.2-2019', '6.2-2020', '6.2-2021', '6.2-2022', '6.2-2023', + '6.2-2024', + '6.2-2025'], + 'sIIIc20.1': ['6.2-2026', '6.2-2027', '6.2-2028', '6.2-2029', '6.2-2030', '6.2-2031', '6.2-2032', + '6.2-2033', + '6.2-2034', '6.2-2035', '6.2-2036', '6.2-2037', '6.2-2038', '6.2-2039', '6.2-2040', + '6.2-2041', + '6.2-2042', + '6.2-2043', '6.2-2044', '6.2-2045', '6.2-2046', '6.2-2047', '6.2-2048', '6.2-2049', + '6.2-2050'], + + 'sIIIc21': ['6.2-2100', '6.2-2101', '6.2-2102', '6.2-2103', '6.2-2104', '6.2-2105', '6.2-2106', + '6.2-2107', + '6.2-2108', '6.2-2109', '6.2-2110', '6.2-2111', '6.2-207.1'], + 'sIIIc22': ['6.2-2200', '6.2-2201', '6.2-2202', '6.2-2203', '6.2-2204', '6.2-2205', '6.2-2206', + '6.2-2207', + '6.2-2208', '6.2-2209', '6.2-2210', '6.2-2211', '6.2-2212', '6.2-2213', '6.2-2214', + '6.2-2215', + '6.2-2216', '6.2-2217', '6.2-2218', '6.2-2219', '6.2-2220', '6.2-2221', '6.2-2222', + '6.2-2223', + '6.2-2224', '6.2-2225', '6.2-2226', '6.2-2227', '6.2-2215.1', '6.2-2216.1', + '6.2-2216.2', + '6.2-2216.3', '6.2-2216.4', '6.2-2218.1' + ], + 'sIIIc23': ['6.2-2300', '6.2-2301', '6.2-2302', '6.2-2303', '6.2-2304', '6.2-2305', '6.2-2306', + '6.2-2307', + '6.2-2308', '6.2-2309', '6.2-2310', '6.2-2311', '6.2-2312', '6.2-2313', '6.2-2314'], + 'sIIIc24': ['6.2-2400', '6.2-2401', '6.2-2402'], + 'sIIIc25': ['6.2-2500', '6.2-2501', '6.2-2502', '6.2-2503', '6.2-2504', '6.2-2505'], + 'sIIIc26': ['6.2-2600', '6.2-2601', '6.2-2602', '6.2-2603', '6.2-2604', '6.2-2605', '6.2-2606', + '6.2-2607', + '6.2-2608', '6.2-2609', '6.2-2610', '6.2-2611', '6.2-2612', '6.2-2613', '6.2-2614', + '6.2-2615', + '6.2-2616', '6.2-2617', '6.2-2618', '6.2-2619', '6.2-2620', '6.2-2621', '6.2-2622'], } title_8_02 = {'c01': ['8.2-101', '8.2-102', '8.2-103', '8.2-104', '8.2-105', '8.2-106', '8.2-107'], 'c02': ['8.2-201', '8.2-202', '8.2-203', '8.2-204', '8.2-205', '8.2-206', '8.2-207', '8.2-208', '8.2-209', '8.2-210'], @@ -1365,14 +1777,17 @@ def add_citation(self): 'c01': ['8.2A-101', '8.2A-102', '8.2A-103', '8.2A-104', '8.2A-105', '8.2A-106', '8.2A-107', '8.2A-108', '8.2A-109'], 'c02': ['8.2A-201', '8.2A-202', '8.2A-203', '8.2A-204', '8.2A-205', '8.2A-206', '8.2A-207', '8.2A-208', - '8.2A-209', '8.2A-210', '8.2A - 211','8.2A - 212','8.2A - 213','8.2A - 214','8.2A - 215','8.2A - 216','8.2A - 217','8.2A - 218','8.2A - 219','8.2A - 220','8.2A - 221'],'c03':['8.2A-301', '8.2A-302', - '8.2A-303', '8.2A-304','8.2A-305', '8.2A-306','8.2A-307', '8.2A-308', '8.2A-309', '8.2A-310', '8.2A-311'], - 'c04': ['8.2A-401','8.2A-402', '8.2A-403','8.2A-404','8.2A-405','8.2A-406','8.2A-407'], - 'c05a01': ['8.2A-501', '8.2A-502', '8.2A-503', '8.2A-504', '8.2A-505', '8.2A-506', '8.2A-507'], - 'c05a02': ['8.2A-508', '8.2A-509', '8.2A-510', '8.2A-511', '8.2A-512', '8.2A-513', '8.2A-514', '8.2A-515', - '8.2A-516', '8.2A-517', '8.2A-518', '8.2A-519', '8.2A-520', '8.2A-521', '8.2A-522'], - 'c05a03': ['8.2A-523', '8.2A-524', '8.2A-525', '8.2A-526', '8.2A-527', '8.2A-528', '8.2A-529', 
'8.2A-530', - '8.2A-531', '8.2A-532']} + '8.2A-209', '8.2A-210', '8.2A - 211', '8.2A - 212', '8.2A - 213', '8.2A - 214', '8.2A - 215', + '8.2A - 216', '8.2A - 217', '8.2A - 218', '8.2A - 219', '8.2A - 220', '8.2A - 221'], + 'c03': ['8.2A-301', '8.2A-302', + '8.2A-303', '8.2A-304', '8.2A-305', '8.2A-306', '8.2A-307', '8.2A-308', '8.2A-309', '8.2A-310', + '8.2A-311'], + 'c04': ['8.2A-401', '8.2A-402', '8.2A-403', '8.2A-404', '8.2A-405', '8.2A-406', '8.2A-407'], + 'c05a01': ['8.2A-501', '8.2A-502', '8.2A-503', '8.2A-504', '8.2A-505', '8.2A-506', '8.2A-507'], + 'c05a02': ['8.2A-508', '8.2A-509', '8.2A-510', '8.2A-511', '8.2A-512', '8.2A-513', '8.2A-514', '8.2A-515', + '8.2A-516', '8.2A-517', '8.2A-518', '8.2A-519', '8.2A-520', '8.2A-521', '8.2A-522'], + 'c05a03': ['8.2A-523', '8.2A-524', '8.2A-525', '8.2A-526', '8.2A-527', '8.2A-528', '8.2A-529', '8.2A-530', + '8.2A-531', '8.2A-532']} title_8_3A = { 'c01': ['8.3A-101', '8.3A-102', '8.3A-103', '8.3A-104', '8.3A-105', '8.3A-106', '8.3A-107', '8.3A-108', @@ -1387,14 +1802,22 @@ def add_citation(self): 'c05': ['8.3A-501', '8.3A-502', '8.3A-503', '8.3A-504', '8.3A-505'], 'c06': ['8.3A-601', '8.3A-602', '8.3A-603', '8.3A-604', '8.3A-605']} - title_8_4 = {'c01': ['8.4-101', '8.4-102', '8.4-103', '8.4-104', '8.4-105', '8.4-106', '8.4-107', '8.4-108', '8.4-109', - '8.4-110', '8.4-111', '8.4-105.1'],'c02': ['8.4-201', '8.4-202', '8.4-203', '8.4-204', '8.4-205', '8.4-206', '8.4-207', '8.4-208', '8.4-209', - '8.4-210', '8.4-211', '8.4-212', '8.4-213', '8.4-214', '8.4-205.1', '8.4-207.1','8.4-207.2', '8.4-207.3', '8.4-211.1'], - 'c03': ['8.4-301', '8.4-302', '8.4-303'], 'c04': ['8.4-401','8.4-402','8.4-403','8.4-404','8.4-405','8.4-406','8.4-407'],'c05': ['8.4-501', '8.4-502', '8.4-503', '8.4-504']} - - title_8_4 = { 'c01': ['8.4A-101', '8.4A-102', '8.4A-103', '8.4A-104', '8.4A-105', '8.4A-106', '8.4A-107', '8.4A-108'], - 'c02': ['8.4A-201', '8.4A-202', '8.4A-203', '8.4A-204', '8.4A-205', '8.4A-206', '8.4A-207', '8.4A-208','8.4A-209', '8.4A-210', '8.4A-211', '8.4A-212'], - 'c03': ['8.4A-301', '8.4A-302', '8.4A-303', '8.4A-304', '8.4A-305'],'c04': ['8.4A-401', '8.4A-402', '8.4A-403', '8.4A-404', '8.4A-405', '8.4A-406'], + title_8_4 = { + 'c01': ['8.4-101', '8.4-102', '8.4-103', '8.4-104', '8.4-105', '8.4-106', '8.4-107', '8.4-108', '8.4-109', + '8.4-110', '8.4-111', '8.4-105.1'], + 'c02': ['8.4-201', '8.4-202', '8.4-203', '8.4-204', '8.4-205', '8.4-206', '8.4-207', '8.4-208', '8.4-209', + '8.4-210', '8.4-211', '8.4-212', '8.4-213', '8.4-214', '8.4-205.1', '8.4-207.1', '8.4-207.2', + '8.4-207.3', '8.4-211.1'], + 'c03': ['8.4-301', '8.4-302', '8.4-303'], + 'c04': ['8.4-401', '8.4-402', '8.4-403', '8.4-404', '8.4-405', '8.4-406', '8.4-407'], + 'c05': ['8.4-501', '8.4-502', '8.4-503', '8.4-504']} + + title_8_4 = { + 'c01': ['8.4A-101', '8.4A-102', '8.4A-103', '8.4A-104', '8.4A-105', '8.4A-106', '8.4A-107', '8.4A-108'], + 'c02': ['8.4A-201', '8.4A-202', '8.4A-203', '8.4A-204', '8.4A-205', '8.4A-206', '8.4A-207', '8.4A-208', + '8.4A-209', '8.4A-210', '8.4A-211', '8.4A-212'], + 'c03': ['8.4A-301', '8.4A-302', '8.4A-303', '8.4A-304', '8.4A-305'], + 'c04': ['8.4A-401', '8.4A-402', '8.4A-403', '8.4A-404', '8.4A-405', '8.4A-406'], 'c05': ['8.4A-501', '8.4A-502', '8.4A-503', '8.4A-504', '8.4A-505', '8.4A-506', '8.4A-507']} title_3_2 = { @@ -1405,15 +1828,21 @@ def add_citation(self): 's0Ic03': ['3.2-300', '3.2-301', '3.2-302'], 's0Ic3.1': ['3.2-303', '3.2-304', '3.2-305', '3.2-306', '3.2-307', '3.2-308', '3.2-309', '3.2-310', '3.2-311'], - 's0Ic04': ['3.2-400', 
'3.2-401', '3.2-402', '3.2 - 403','3.2 - 404','3.2 - 405','3.2 - 406','3.2 - 407','3.2 - 408','3.2 - 409','3.2 - 410'], - 's0Ic05':['3.2-500', '3.2-501', '3.2-502', '3.2 - 503','3.2 - 504','3.2 - 505','3.2 - 506'], - 's0Ic06': ['3.2-600','3.2-601','3.2-602','3.2 - 603','3.2 - 604'],'s0Ic07a01':['3.2-700', '3.2-701', '3.2-702', '3.2 - 703','3.2 - 704','3.2 - 705','3.2 - 706','3.2 - 707','3.2 - 708', - '3.2 - 709','3.2 - 710','3.2 - 711','3.2 - 712','3.2 - 713'],'s0Ic07a02': ['3.2-714', '3.2-715', '3.2-716','3.2-717', '3.2-718', '3.2-719','3.2-720', '3.2-721', '3.2-722', - '3.2-723', '3.2-724', '3.2-725','3.2-726', '3.2-727', '3.2-728','3.2-729', '3.2-730', '3.2-731'], - 's0Ic08': ['3.2-800', '3.2-801', '3.2-802', '3.2 - 803','3.2 - 804','3.2 - 805','3.2 - 806','3.2 - 807', - '3.2 - 808','3.2 - 809'],'s0Ic09':['3.2-900', '3.2-901'],'s0Ic10': ['3.2-1000', '3.2-1001', '3.2-1002', '3.2-1003', '3.2-1004', - '3.2-1005', '3.2-1006', '3.2-1007', '3.2-1008', '3.2-1009','3.2-1010', '3.2-1011']} - + 's0Ic04': ['3.2-400', '3.2-401', '3.2-402', '3.2 - 403', '3.2 - 404', '3.2 - 405', '3.2 - 406', '3.2 - 407', + '3.2 - 408', '3.2 - 409', '3.2 - 410'], + 's0Ic05': ['3.2-500', '3.2-501', '3.2-502', '3.2 - 503', '3.2 - 504', '3.2 - 505', '3.2 - 506'], + 's0Ic06': ['3.2-600', '3.2-601', '3.2-602', '3.2 - 603', '3.2 - 604'], + 's0Ic07a01': ['3.2-700', '3.2-701', '3.2-702', '3.2 - 703', '3.2 - 704', '3.2 - 705', '3.2 - 706', + '3.2 - 707', '3.2 - 708', + '3.2 - 709', '3.2 - 710', '3.2 - 711', '3.2 - 712', '3.2 - 713'], + 's0Ic07a02': ['3.2-714', '3.2-715', '3.2-716', '3.2-717', '3.2-718', '3.2-719', '3.2-720', '3.2-721', + '3.2-722', + '3.2-723', '3.2-724', '3.2-725', '3.2-726', '3.2-727', '3.2-728', '3.2-729', '3.2-730', + '3.2-731'], + 's0Ic08': ['3.2-800', '3.2-801', '3.2-802', '3.2 - 803', '3.2 - 804', '3.2 - 805', '3.2 - 806', '3.2 - 807', + '3.2 - 808', '3.2 - 809'], 's0Ic09': ['3.2-900', '3.2-901'], + 's0Ic10': ['3.2-1000', '3.2-1001', '3.2-1002', '3.2-1003', '3.2-1004', + '3.2-1005', '3.2-1006', '3.2-1007', '3.2-1008', '3.2-1009', '3.2-1010', '3.2-1011']} title_2_2 = { 's0Ip0Ac01a01': ['2.2-100', '2.2-101', '2.2-102', '2.2-103', '2.2-104', '2.2-105', '2.2-106', '2.2-107', @@ -1456,344 +1885,412 @@ def add_citation(self): 's0Ip0Ac04a03': ['2.2-418', '2.2-419', '2.2-420', '2.2-421', '2.2-422', '2.2-423', '2.2-424', '2.2-425', '2.2-426', '2.2-427', '2.2-428', '2.2-429', '2.2-430', '2.2-431', '2.2-432', '2.2-433', '2.2-434', '2.2-435'], - 's0Ip0Ac4.1': ['2.2-435.1', '2.2-435.2', '2.2-435.3', '2.2-435.4', '2.2-435.5'],'s0Ip0Ac4.2':['2.2-435.6','2.2-435.7','2.2-435.8','2.2-435.9','2.2-435.10'], - 's0Ip0Ac4.2:1': ['2.2-435.11'],'s0Ip0Ac4.2:2': ['2.2-435.12'],'s0Ip0Ac4.3': ['2.2-436', '2.2-437'], - 's0Ip0Ac4.4': ['2.2-438', '2.2-439', '2.2-440', '2.2-441', '2.2-442', '2.2-443', '2.2-444', '2.2-445','2.2-446', '2.2-447', '2.2-448', '2.2-449'], - 's0Ip0Bc05a01': ['2.2-500', '2.2-501', '2.2-502', '2.2-503', '2.2-504', '2.2-505', '2.2-506', '2.2-507','2.2-507.1', '2.2-507.2', '2.2-507.3', '2.2-508', '2.2-509', '2.2-509.1', '2.2-510', - '2.2-510.1', '2.2-510.2', '2.2-511', '2.2-511.1', '2.2-512', '2.2-513', '2.2-514', '2.2-515', '2.2-515.1', - '2.2-515.2', '2.2-516'],'s0Ip0Bc05a02': ['2.2-517'],'s0Ip0Bc05a03': ['2.2-518', '2.2-519'], - 's0Ip0Bc05a04': ['2.2-520', '2.2-521', '2.2 - 522','2.2 - 523','2.2 - 524'],'s0Ip0Cc06a01':['2.2-600', '2.2-601','2.2-601.1', '2.2-602', - '2.2-603', '2.2-604','2.2-604.1','2.2-604.2','2.2-604.2', '2.2-605','2.2-606', '2.2-607','2.2-608', '2.2-608.1','2.2-609', - '2.2-610', 
'2.2-611','2.2-612', '2.2-613','2.2-614', '2.2-614.1','2.2-614.2','2.2-614.2:1','2.2-614.3','2.2-614.4','2.2-614.5'], - 's0Ip0Cc06a02': ['2.2-615', '2.2-616', '2.2-617', '2.2-618', '2.2-619', '2.2-620', '2.2-621'], - 's0Ip0Cc07': ['2.2-700', '2.2-701', '2.2-702', '2.2-703', '2.2-704', '2.2-705', '2.2-706', '2.2-707', '2.2-708','2.2-709', '2.2-710', '2.2-711', '2.2-712', '2.2-713', '2.2-714', '2.2-715', '2.2-716', '2.2-717', - '2.2-718', '2.2-719', '2.2-720' ], - 's0Ip0Cc08a01': ['2.2-800', '2.2-801', '2.2-802', '2.2-803', '2.2-803.1', '2.2-804', '2.2-805', '2.2-806','2.2-807', '2.2-808', '2.2-809', '2.2-810', '2.2-811', '2.2-812', '2.2-813', '2.2-813.1', - '2.2-813.2'],'s0Ip0Cc08a02': ['2.2-814', '2.2-815', '2.2-816'],'s0Ip0Cc09': ['2.2-900', '2.2-904.2'], - 's0Ip0Cc9.1': ['2.2-905', '2.2-906'],'s0Ip0Cc10': [' 2.2-1000', '2.2-1001'], - 's0Ip0Cc11a01': ['2.2-1100', '2.2-1101', '2.2-1102'], - 's0Ip0Cc11a02': ['2.2-1103', '2.2-1104', '2.2-1105', '2.2-1106', '2.2-1107', '2.2-1108'], - 's0Ip0Cc11a03': ['2.2-1109', '2.2-1110', '2.2-1111', '2.2-1112', '2.2-1113', '2.2-1114', '2.2-1115', '2.2-1116', - '2.2-1117', '2.2-1118', '2.2-1119', '2.2-1120', '2.2-1121', '2.2-1122', - '2.2-1123', '2.2-1124', '2.2-1125', '2.2-1126', '2.2-1127', '2.2-1128'], - 's0Ip0Cc11a04': ['2.2-1129', '2.2-1130', '2.2-1131', '2.2-1132', '2.2-1133', '2.2-1134', '2.2-1135', '2.2-1136', - '2.2-1137', '2.2-1138', '2.2-1139', '2.2-1140', '2.2-1141', '2.2-1142', - '2.2-1143', '2.2-1144', '2.2-1145', '2.2-1146', '2.2-1147', '2.2-1148', '2.2-1149', '2.2-1150', - '2.2-1151', '2.2-1152', '2.2-1153', '2.2-1154', '2.2-1155', '2.2-1156', '2.2-1157', '2.2-1158', - '2.2-1159', '2.2-1160', '2.2-1161'], - 's0Ip0Cc11a05': ['2.2-1162', '2.2-1163', '2.2-1164', '2.2-1165', '2.2-1166', '2.2-1167'], - 's0Ip0Cc11a06': ['2.2-1168', '2.2-1169', '2.2-1170', '2.2-1171', '2.2-1172'], - 's0Ip0Cc11a07': ['2.2-1173', '2.2-1174', '2.2-1175', '2.2-1176', '2.2-1177', '2.2-1178', '2.2-1179', '2.2-1180', - '2.2-1181'], - 's0Ip0Cc11a08': ['2.2-1182', '2.2-1183'], - 's0Ip0Cc12': [ - '2.2-1200', '2.2-1201', '2.2-1202', '2.2-1203', '2.2-1204', '2.2-1205', '2.2-1206', '2.2-1207', '2.2-1208', - '2.2-1209', '2.2-1210', '2.2-1211', '2.2-1212', '2.2-1213', '2.2-1201.1'], - 's0Ip0Cc13': ['2.2-1300', '2.2-1301', '2.2-1302', '2.2-1303', '2.2-1304'], - 's0Ip0Cc14': ['2.2-1400', '2.2-1401', '2.2-1402', '2.2-1403', '2.2-1404'], - 's0Ip0Cc15': ['2.2-1500', '2.2-1501', '2.2-1502', '2.2-1503', '2.2-1504', '2.2-1505', '2.2-1506', '2.2-1507', - '2.2-1508', '2.2-1509', '2.2-1510', '2.2-1511', '2.2-1512', '2.2-1513', '2.2-1514', '2.2-1501.1', - '2.2-1502.1', - '2.2-1503.1', '2.2-1503.2', '2.2-1503.3', '2.2-1509.1', '2.2-1509.2', '2.2-1509.3', '2.2-1509.4'], - 's0Ip0Cc15.1': ['2.2-1515', '2.2-1516', '2.2-1517', '2.2-1518', '2.2-1519', '2.2-1520'], - 's0Ip0Cc16': ['2.2-1600', '2.2-1601', '2.2-1602', '2.2-1603', '2.2-1604', '2.2-1605', '2.2-1606'], - 's0Ip0Cc16.1a01': ['2.2-1603', '2.2-1604', '2.2-1605', '2.2-1606', '2.2-1607', '2.2-1608', '2.2-1609', - '2.2-1610'], - 's0Ip0Cc16.1a02': ['2.2-1611', '2.2-1612', '2.2-1613', '2.2-1614', '2.2-1615', '2.2-1616'], - 's0Ip0Cc16.1a03': ['2.2-1617'], - 's0Ip0Cc17': ['2.2-1700', '2.2-1701', '2.2-1702', '2.2-1703', '2.2-1704', '2.2-1705', '2.2-1706', '2.2-1707', - '2.2-1708', '2.2-1709', '2.2-1710'], - 's0Ip0Cc18a01': ['2.2-1800', '2.2-1801', '2.2-1802', '2.2-1803', '2.2-1804', '2.2-1805', '2.2-1806', '2.2-1807', - '2.2-1808', '2.2-1809', '2.2-1810', '2.2-1811', '2.2-1812'], - 's0Ip0Cc18a02': ['2.2-1813', '2.2-1814', '2.2-1815', '2.2-1816', 
'2.2-1817', '2.2-1818'], - 's0Ip0Cc18a03': ['2.2-1819', '2.2-1820', '2.2-1821', '2.2-1822', '2.2-1823', '2.2-1824', '2.2-1825', '2.2-1826', - '2.2-1827'], - 's0Ip0Cc18a04': ['2.2-1828', '2.2-1829', '2.2-1830', '2.2-1831'], - 's0Ip0Cc18a4.1': ['2.2-1831.1', '2.2-1831.2', '2.2-1831.3', '2.2-1831.4', '2.2-1831.5'], - 's0Ip0Cc18a5': ['2.2-1832', '2.2-1833', '2.2-1834', '2.2-1835', '2.2-1836', '2.2-1837', '2.2-1838', '2.2-1839', - '2.2-1840', '2.2-1841', '2.2-1842', '2.2-1843'], - 's0Ip0Cc19': ['2.2-1900', '2.2-1901', '2.2-1902', '2.2-1903', '2.2-1904', '2.2-1905'], - 's0Ip0Cc20': ['2.2-2000', '2.2-2000.1', '2.2-2001', '2.2-2001.1', '2.2-2001.2', '2.2-2001.3', '2.2-2001.4', - '2.2-2001.5', '2.2-2001.6', '2.2-2002', '2.2-2002.1', '2.2-2002.2', '2.2-2003', '2.2-2004', - '2.2-2004.1'], - 's0Ip0Cc20.1a01': ['2.2-2005', '2.2-2006', '2.2-2007', '2.2-2008', '2.2-2009', '2.2-2010', '2.2-2011', - '2.2-2012', '2.2-2013', '2.2-2014', '2.2-2015'], - 's0Ip0Cc20.1a02': ['2.2-2016', '2.2-2017', '2.2-2018', '2.2-2019', '2.2-2020', '2.2-2021'], - 's0Ip0Cc20.1a03': ['2.2-2022', '2.2-2023', '2.2-2024'], - 's0Ip0Cc20.1a04': ['2.2-2025', '2.2-2026', '2.2-2027', '2.2-2028', '2.2-2029', '2.2-2030'], - 's0Ip0Cc20.1a05': ['2.2-2031'], - 's0Ip0Cc20.1a06': ['2.2-2032'], - 's0Ip0Cc20.1a07': ['2.2-2033', '2.2-2034'], - 's0Ip0Dc21': ['2.2-2100', '2.2-2101', '2.2-2102', '2.2-2103', '2.2-2104', '2.2-2105', '2.2-2106'], - 's0Ip0Dc22a01': ['2.2-2200'], - 's0Ip0Dc22a02': ['2.2-2201', '2.2-2202', '2.2-2203', '2.2-2204', '2.2-2205', '2.2-2206', '2.2-2207', '2.2-2208', - '2.2-2209', '2.2-2210', '2.2-2211', '2.2-2212', '2.2-2213', '2.2-2214', - '2.2-2215', '2.2-2216', '2.2-2217'], - 's0Ip0Dc22a03': ['2.2-2218', '2.2-2219', '2.2-2220', '2.2-2221', '2.2-2222', '2.2-2223', '2.2-2224', '2.2-2225', - '2.2-2226', '2.2-2227', '2.2-2228', '2.2-2229', '2.2-2230', '2.2-2231', '2.2-2232', - '2.2-2233'], - 's0Ip0Dc22a04': ['2.2-2234', '2.2-2235', '2.2-2236', '2.2-2237', '2.2-2238', '2.2-2239', '2.2-2240', '2.2-2241', - '2.2-2242', '2.2-2243', '2.2-2244', '2.2-2245', '2.2-2246'], - 's0Ip0Dc22a05': ['2.2-2247', '2.2-2248', '2.2-2249', '2.2-2250', '2.2-2251', '2.2-2252', '2.2-2253', '2.2-2254', - '2.2-2255', '2.2-2256', '2.2-2257', '2.2-2258', '2.2-2259'], - 's0Ip0Dc22a06':['2.2-2260', '2.2-2261', '2.2-2262', '2.2-2263', '2.2-2264', '2.2-2265', '2.2-2266', '2.2-2267', '2.2-2268', '2.2-2269', '2.2-2270', '2.2-2271', '2.2-2272', '2.2-2273', '2.2-2274', '2.2-2275', '2.2-2276', '2.2-2277','2.2-2278'], - 's0Ip0Dc22a07':['2.2-2279', '2.2-2280', '2.2-2281', '2.2-2282', '2.2-2283', '2.2-2284', '2.2-2285', '2.2-2286', '2.2-2287', '2.2-2288', '2.2-2289', '2.2-2290', '2.2-2291', '2.2-2292', '2.2-2293', '2.2-2294', '2.2-2295', '2.2-2296', '2.2-2297', '2.2-2298', '2.2-2299', '2.2-2300', '2.2-2301', '2.2-2302', '2.2-2303', '2.2-2304', '2.2-2305', '2.2-2306', '2.2-2307', - '2.2-2308', '2.2-2309', '2.2-2310', '2.2-2311', '2.2-2312', '2.2-2313', '2.2-2314'], - 's0Ip0Dc22a08': ['2.2-2315', '2.2-2316', '2.2-2317', '2.2-2318', '2.2-2319', '2.2-2320', '2.2-2321', '2.2-2322', - '2.2-2323', '2.2-2324', '2.2-2325', '2.2-2326', '2.2-2327'], - 's0Ip0Dc22a09': ['2.2-2328', '2.2-2329', '2.2-2330', '2.2-2331', '2.2-2332', '2.2-2333', '2.2-2334', - '2.2-2335'], - 's0Ip0Dc22a10': ['2.2-2336', '2.2-2337', '2.2-2338', '2.2-2339', '2.2-2340', '2.2-2341', '2.2-2342', '2.2-2343', - '2.2-2344', '2.2-2345', '2.2-2346', '2.2-2347', '2.2-2348', '2.2-2349', '2.2-2350'], - 's0Ip0Dc22a11': ['2.2-2351', '2.2-2352', '2.2-2353', '2.2-2354', '2.2-2355', '2.2-2356', '2.2-2357', '2.2-2358', - 
'2.2-2359', '2.2-2360', '2.2-2361', '2.2-2362', '2.2-2363', '2.2-2364'], - 's0Ip0Dc22a12': ['2.2-2365', '2.2-2366', '2.2-2367', '2.2-2368', '2.2-2369', '2.2-2370', '2.2-2371', '2.2-2372', - '2.2-2373', '2.2-2374', '2.2-2375', '2.2-2376', '2.2-2377', '2.2-2378', '2.2-2379', - '2.2-2380'], - 's0Ip0Dc24a01': ['2.2-2400', '2.2-2401', '2.2-2402'], - 's0Ip0Dc24a02': ['2.2-2403'], - 's0Ip0Dc24a03': ['2.2-2404', '2.2-2405', '2.2-2406'], - 's0Ip0Dc24a04': ['2.2-2407', '2.2-2408'], - 's0Ip0Dc24a05': ['2.2-2409', '2.2-2410'], - 's0Ip0Dc24a06': ['2.2-2411', '2.2-2412'], - 's0Ip0Dc24a07': ['2.2-2413', '2.2-2414'], - 's0Ip0Dc24a08': ['2.2-2415', '2.2-2416', '2.2-2417', '2.2-2418', '2.2-2419', '2.2-2420'], - 's0Ip0Dc24a09': ['2.2-2421', '2.2-2422'], - 's0Ip0Dc24a10': ['2.2-2423'], - 's0Ip0Dc24a11': ['2.2-2424', '2.2-2425'], - 's0Ip0Dc24a12': ['2.2-2426', '2.2-2427', '2.2-2428', '2.2-2429', '2.2-2430', '2.2-2431', '2.2-2432', - '2.2-2433'], - 's0Ip0Dc24a13': ['2.2-2434'], - 's0Ip0Dc24a14': ['2.2-2435', '2.2-2436', '2.2-2437'], - 's0Ip0Dc24a15': ['2.2-2438', '2.2-2439'], - 's0Ip0Dc24a16': ['2.2-2441', '2.2-2442', '2.2-2443', '2.2-2444', '2.2-2445', '2.2-2446', '2.2-2447'], - 's0Ip0Dc24a17': ['2.2-2448', '2.2-2449', '2.2-2450', '2.2-2451'], - 's0Ip0Dc24a18': ['2.2-2452', '2.2-2453', '2.2-2454'], - 's0Ip0Dc24a19': ['2.2-2455', '2.2-2456'], - 's0Ip0Dc24a20': ['2.2-2457', '2.2-2458'], - 's0Ip0Dc24a21': ['2.2-2459', '2.2-2460', '2.2-2461'], - 's0Ip0Dc24a22': ['2.2-2462', '2.2-2463', '2.2-2464'], - 's0Ip0Dc24a23': ['2.2-2465', '2.2-2466', '2.2-2467', '2.2-2468', '2.2-2469'], - 's0Ip0Dc24a24': ['2.2-2470', '2.2-2471', '2.2-2472', '2.2-2473', '2.2-2474', '2.2-2475', '2.2-2476', - '2.2-2477'], - 's0Ip0Dc24a25': ['2.2-2478', '2.2-2479', '2.2-2480', '2.2-2481', '2.2-2482', '2.2-2483'], - 's0Ip0Dc24a26': ['2.2-2484', '2.2-2485', '2.2-2486', '2.2-2487', '2.2-2488', '2.2-2489', '2.2-2480', '2.2-2481', - '2.2-2482', '2.2-2483', '2.2-2484', '2.2-2485', '2.2-2486', - '2.2-2487', '2.2-2488', '2.2-2489', '2.2-2490'], - 's0Ip0Dc24a27': ['2.2-2491', '2.2-2492', '2.2-2493', '2.2-2494', '2.2-2495'], - 's0Ip0Dc24a28': ['2.2-2496', '2.2-2497', '2.2-2498', '2.2-2499'], - 's0Ip0Dc24a29': ['2.2-2491.1', '2.2-2491.2', '2.2-2491.3', '2.2-2491.4'], - 's0Ip0Dc24a30': ['2.2-2491.5', '2.2-2491.6', '2.2-2491.7', '2.2-2491.8'], - 's0Ip0Dc25a01': ['2.2-500', '2.2-501', '2.2-502'], - 's0Ip0Dc25a02': ['2.2-503', '2.2-504', '2.2-505'], - 's0Ip0Dc25a03': ['2.2-506', '2.2-507'], - 's0Ip0Dc25a04': ['2.2-508', '2.2-509', '2.2-510'], - 's0Ip0Dc25a05': ['2.2-511', '2.2-512'], - 's0Ip0Dc25a06': ['2.2-513', '2.2-514', '2.2-515', '2.2-516', '2.2-517'], - 's0Ip0Dc25a07': ['2.2-518', '2.2-519', '2.2-520', '2.2-521', '2.2-522', '2.2-523'], - 's0Ip0Dc25a07.1': ['2.2-524', '2.2-525', '2.2-526', '2.2-527', '2.2-528', '2.2-529'], - 's0Ip0Dc25a08': ['2.2-530', '2.2-531'], - 's0Ip0Dc25a09': ['2.2-532', '2.2-533', '2.2-534', '2.2-535', '2.2-536'], - 's0Ip0Dc25a010': ['2.2-537', '2.2-538', '2.2-539', '2.2-540', '2.2-541', '2.2-542', '2.2-543'], - 's0Ip0Dc25a011': ['2.2-544', '2.2-545', '2.2-546', '2.2-547', '2.2-548', '2.2-549', '2.2-550'], - 's0Ip0Dc25a012': ['2.2-551', '2.2-552', '2.2-553', '2.2-554', '2.2-555', '2.2-556', '2.2-557'], - 's0Ip0Dc25a013': ['2.2-558', '2.2-559', '2.2-560', '2.2-561', '2.2-562', '2.2-563', '2.2-564'], - 's0Ip0Dc26a01': ['2.2-2600', '2.2-2601', '2.2-2602'], - 's0Ip0Dc26a02': ['2.2-2603', '2.2-2604'], - 's0Ip0Dc26a03': ['2.2-2605', '2.2-2606', '2.2-2607', '2.2-2608'], - 's0Ip0Dc26a04': ['2.2-2609', '2.2-2610'], - 's0Ip0Dc26a05': 
['2.2-2611', '2.2-2612', '2.2-2613'], - 's0Ip0Dc26a06': ['2.2-2614', '2.2-2615', '2.2-2616'], - 's0Ip0Dc26a07': ['2.2-2617', '2.2-2618', '2.2-2619'], - 's0Ip0Dc26a08': ['2.2-2620', '2.2-2621', '2.2-2622', '2.2-2623', '2.2-2624', '2.2-2625'], - 's0Ip0Dc26a09': ['2.2-2626', '2.2-2627'], - 's0Ip0Dc26a10': ['2.2-2628', '2.2-2629'], - 's0Ip0Dc26a11': ['2.2-2630', '2.2-2631'], - 's0Ip0Dc26a12': ['2.2-2632', '2.2-2633', '2.2-2634', '2.2-2635', '2.2-2636', '2.2-2637', '2.2-2638', - '2.2-2639'], - 's0Ip0Dc26a13': ['2.2-2640', '2.2-2641'], - 's0Ip0Dc26a14': ['2.2-2642', '2.2-2643'], - 's0Ip0Dc26a15': ['2.2-2644', '2.2-2645', '2.2-2646', '2.2-2647'], - 's0Ip0Dc26a16': ['2.2-2648', '2.2-2649'], - 's0Ip0Dc26a17': ['2.2-2650'], - 's0Ip0Dc26a18': ['2.2-2651'], - 's0Ip0Dc26a19': ['2.2-2652', '2.2-2653', '2.2-2654'], - 's0Ip0Dc26a20': ['2.2-2655', '2.2-2656'], - 's0Ip0Dc26a21': ['2.2-2657', '2.2-2658', '2.2-2659', '2.2-2660', '2.2-2661', '2.2-2662', '2.2-2663'], - 's0Ip0Dc26a22': ['2.2-2664'], - 's0Ip0Dc26a23': ['2.2-2665', '2.2-2666'], - 's0Ip0Dc26a23.1': ['2.2-2666.1', '2.2-2666.2', '2.2-2666.3'], - 's0Ip0Dc26a24': ['2.2-2667', '2.2-2668'], - 's0Ip0Dc26a25': ['2.2-2669', '2.2-2670', '2.2-2671', '2.2-2672', '2.2-2673', '2.2-2674'], - 's0Ip0Dc26a26': ['2.2-2674', '2.2-2678'], - 's0Ip0Dc26a27': ['2.2-2679', '2.2-2680'], - 's0Ip0Dc26a28': ['2.2-2681', '2.2-2682'], - 's0Ip0Dc26a29': ['2.2-2683', '2.2-2684', '2.2-2685', '2.2-2686', '2.2-2687', '2.2-2688', '2.2-2689'], - 's0Ip0Dc26a30': ['2.2-2690', '2.2-2691', '2.2-2692', '2.2-2693', '2.2-2694', '2.2-2695'], - 's0Ip0Dc26a31': ['2.2-2696', '2.2-2697'], - 's0Ip0Dc26a32': ['2.2-2698', '2.2-2699'], - 's0Ip0Dc26a33': ['2.2-2699.1', '2.2-2699.2'], - 's0Ip0Dc26a34': ['2.2-2699.3', '2.2-2699.4'], - 's0Ip0Dc26a35': ['2.2-2699.5', '2.2-2699.6', '2.2-2699.7'], - 's0Ip0Dc26a36': ['2.2-2699.8', '2.2-2699.9', '2.2-2699.10', '2.2-2699.11', '2.2-2699.12'], - 's0Ip0Dc26a37': ['2.2-2699.13', '2.2-2699.14'], - 's0Ip0Dc27a01': ['2.2-2700', '2.2-2701', '2.2-2702', '2.2-2703', '2.2-2704'], - 's0Ip0Dc27a02': ['2.2-2705', '2.2-2706', '2.2-2707', '2.2-2708'], - 's0Ip0Dc27a03': ['2.2-2709', '2.2-2710'], - 's0Ip0Dc27a04': ['2.2-2711'], - 's0Ip0Dc27a05': ['2.2-2712', '2.2-2713', '2.2-2714'], - 's0Ip0Dc27a06': ['2.2-2715', '2.2-2716', '2.2-2717', '2.2-2718', '2.2-2719'], - 's0Ip0Dc27a07': ['2.2-2720', '2.2-2721', '2.2-2722', '2.2-2723', '2.2-2724'], - 's0Ip0Dc27a08': ['2.2-2725', '2.2-2726', '2.2-2727', '2.2-2728', '2.2-2729', '2.2-2730', '2.2-2731'], - 's0Ip0Dc27a09': ['2.2-2732', '2.2-2733'], - 's0Ip0Dc27a10': ['2.2-2734', '2.2-2735', '2.2-2736', '2.2-2737'], - 's0Ip0Dc27a11': ['2.2-2738', '2.2-2739', '2.2-2740', '2.2-2741', '2.2-2742', '2.2-2743'], - 's0Ip0Dc27.1': ['2.2-2744', '2.2-2745', '2.2-2746', '2.2-2747', '2.2-2748', '2.2-2749', '2.2-2750', '2.2-2751', - '2.2-2752', '2.2-2753', '2.2-2754', '2.2-2755', '2.2-2756', '2.2-2757'], - 's0Ip0Ec28': ['2.2-2800', '2.2-2801', '2.2-2802', '2.2-2803', '2.2-2804', '2.2-2805', '2.2-2806', '2.2-2807', - '2.2-2808', '2.2-2809', '2.2-2810', '2.2-2811', '2.2-2812', '2.2-2813', '2.2-2814', '2.2-2815', - '2.2-2816', '2.2-2817', '2.2-2818', '2.2-2819', '2.2-2820', '2.2-2821', '2.2-2822', '2.2-2823', - '2.2-2824', '2.2-2825', '2.2-2826', '2.2-2827', '2.2-2828', '2.2-2829', '2.2-2830', '2.2-2831', - '2.2-2832'], - 's0Ip0Ec29': ['2.2-2900', '2.2-2901', '2.2-2902', '2.2-2903', '2.2-2904', '2.2-2905'], - 's0Ip0Ec30': ['2.2-3000', '2.2-3001', '2.2-3002', '2.2-3003', '2.2-3004', '2.2-3005', '2.2-3006', '2.2-3007', - '2.2-3008'], - 's0Ip0Ec30.1': 
['2.2-3009', '2.2-3010', '2.2-3011', '2.2-3012', '2.2-3013', '2.2-3014'], - 's0Ip0Ec31a01': ['2.2-3100', '2.2-3101'], - 's0Ip0Ec31a02': ['2.2-3102', '2.2-3103', '2.2-3104'], - 's0Ip0Ec31a03': ['2.2-3105', '2.2-3106', '2.2-3107', '2.2-3108', '2.2-3109', '2.2-3110'], - 's0Ip0Ec31a04': ['2.2-3111', '2.2-3112'], - 's0Ip0Ec31a05': ['2.2-3113', '2.2-3114', '2.2-3115', '2.2-3116', '2.2-3117', '2.2-3118'], - 's0Ip0Ec31a06': ['2.2-3119'], - 's0Ip0Ec31a07': ['2.2-3120', '2.2-3121', '2.2-3122', '2.2-3123', '2.2-3124', '2.2-3125', '2.2-3126', - '2.2-3127'], - 's0Ip0Ec31a08': ['2.2-3128', '2.2-3129', '2.2-3130', '2.2-3131'], - 's0Ip0Ec31a09': ['2.2-3132'], - 's0Ip0Ec32': ['2.2-3200', '2.2-3201', '2.2-3202', '2.2-3203', '2.2-3204', '2.2-3205', '2.2-3206'], - 'sIIp0Ac33': ['2.2-3300', '2.2-3301', '2.2-3302', '2.2-3303', '2.2-3304', '2.2-3305', '2.2-3306', '2.2-3307', - '2.2-3308', '2.2-3309', '2.2-3310', '2.2-3311', '2.2-3312', '2.2-3313', - '2.2-3314', '2.2-3315', '2.2-3316', '2.2-3317', '2.2-3318', '2.2-3319', '2.2-3320', '2.2-3321', - '2.2-3322'], - 'sIIp0Ac34': ['2.2-3400', '2.2-3401', '2.2-3402'], - 'sIIp0Ac35': ['2.2-3500', '2.2-3501', '2.2-3502', '2.2-3503', '2.2-3504'], - 'sIIp0Ac36': ['2.2-3600', '2.2-3601', '2.2-3602', '2.2-3603', '2.2-3604', '2.2-3605'], - 'sIIp0Bc37': ['2.2-3700', '2.2-3701', '2.2-3702', '2.2-3703', '2.2-3704', '2.2-3705', '2.2-3706', '2.2-3707', - '2.2-3708', '2.2-3709', '2.2-3710', '2.2-3711', '2.2-3712', '2.2-3713', '2.2-3714', '2.2-3715'], - 'sIIp0Bc38': ['2.2-3800', '2.2-3801', '2.2-3802', '2.2-3803', '2.2-3804', '2.2-3805', '2.2-3806', '2.2-3807', - '2.2-3808', '2.2-3809'], - 'sIIp0Bc38.1': ['2.2-3815','2.2-3816'],'sIIp0Bc38.2':['2.2-3817', '2.2-3818', '2.2-3819'], - 'sIIp0Bc39': ['2.2-3900', '2.2-3901', '2.2-3902', '2.2-3903', - '2.2-3904', '2.2-3905', '2.2-3906', '2.2-3907', - '2.2-3908', '2.2-3909'], - 'sIIp0Bc40a01': ['2.2-4000', '2.2-4001', '2.2-4002', '2.2-4003', '2.2-4004'], - 'sIIp0Bc40a02': ['2.2-4005', '2.2-4006', '2.2-4007', '2.2-4008', '2.2-4009', '2.2-4010', '2.2-4011', '2.2-4012', - '2.2-4013', '2.2-4014', '2.2-4015', '2.2-4016'], - 'sIIp0Bc40a03': ['2.2-4017', '2.2-4018', '2.2-4019', '2.2-4020', '2.2-4021', '2.2-4022', '2.2-4023'], - 'sIIp0Bc40a04': ['2.2-4024', '2.2-4024.1', '2.2-4024.2'], - 'sIIp0Bc40a05': ['2.2-4025', '2.2-4026', '2.2-4027', '2.2-4028', '2.2-4029', '2.2-4030'], - 'sIIp0Bc40a06': ['2.2-4031', '2.2-4032', '2.2-4033'], - 'sIIp0Bc41': ['2.2 - 4100','2.2 - 4101','2.2 - 4102','2.2 - 4103','2.2 - 4104'],'sIIp0Bc41.1':['2.2-4115','2.2-4116','2.2-4117','2.2-4118','2.2-4119'], - 'sIIp0Bc42': ['2.2-4200', '2.2-4201'], - 'sIIp0Bc43a01': ['2.2-4300', '2.2-4301', '2.2-4302', '2.2-4302.1', '2.2-4302.2'], - 'sIIp0Bc43a02': ['2.2-4303', '2.2-4304', '2.2-4305', '2.2-4306', '2.2-4307', '2.2-4308', '2.2-4309', '2.2-4310', - '2.2-4311', '2.2-4312', '2.2-4313', '2.2-4314', '2.2-4315', '2.2-4316', '2.2-4317', - '2.2-4318', '2.2-4319', '2.2-4320', '2.2-4321', '2.2-4322', '2.2-4323', '2.2-4324', '2.2-4325', - '2.2-4326', '2.2-4327', '2.2-4328', '2.2-4329', '2.2-4330', '2.2-4331', '2.2-4332', '2.2-4333', - '2.2-4334', - '2.2-4335', '2.2-4336', '2.2-4337', '2.2-4338', '2.2-4339', '2.2-4340', '2.2-4341', - '2.2-4342'], - 'sIIp0Bc43a03': ['2.2-4343', '2.2-4344', '2.2-4345', '2.2-4346'], - 'sIIp0Bc43a04': ['2.2-4347', '2.2-4348', '2.2-4349', '2.2-4350', '2.2-4351', '2.2-4352', '2.2-4353', '2.2-4354', - '2.2-4355', '2.2-4356'], - 'sIIp0Bc43a05': ['2.2-4357', '2.2-4358', '2.2-4359', '2.2-4360', '2.2-4361', '2.2-4362', '2.2-4363', '2.2-4364', - '2.2-4365', 
'2.2-4366'], - 'sIIp0Bc43a06': ['2.2-4367', '2.2-4368', '2.2-4369', '2.2-4370', '2.2-4371', '2.2-4372', '2.2-4373', '2.2-4374', - '2.2-4375', '2.2-4376', '2.2-4377'], - 'sIIp0Bc43.1a01': ['2.2-4378', '2.2-4379'], - 'sIIp0Bc43.1a02': ['2.2-4380'], - 'sIIp0Bc43.1a03': ['2.2-4381'], - 'sIIp0Bc43.1a04': ['2.2-4382'], - 'sIIp0Bc43.1a05': ['2.2-4383'], - 'sIIp0Bc44': ['2.2-4400', '2.2-4401', '2.2-4402', '2.2-4403', '2.2-4404', '2.2-4405', '2.2-4406', '2.2-4407', - '2.2-4408', '2.2-4409', '2.2-4410', '2.2-4411'], - 'sIIp0Bc45': ['2.2-4500', '2.2-4501', '2.2-4502', '2.2-4503', '2.2-4504', '2.2-4505', '2.2-4506', '2.2-4507', - '2.2-4508', '2.2-4509', '2.2-4510', '2.2-4511', '2.2-4512', - '2.2-4513', '2.2-4514', '2.2-4515', '2.2-4516', '2.2-4517', '2.2-4518', '2.2-4519'], - 'sIIp0Bc46': ['2.2-4600', '2.2-4601', '2.2-4602', '2.2-4603', '2.2-4604', '2.2-4605', '2.2-4606'], - 'sIIp0Bc47': ['2.2-4700', '2.2-4701', '2.2-4702', '2.2-4703', '2.2-4704', '2.2-4705'], - 'sIIp0Bc48': ['2.2-4800', '2.2-4801', '2.2-4802', '2.2-4803', '2.2-4804', '2.2-4805', '2.2-4806', '2.2-4807', - '2.2-4808', '2.2-4809'], - 'sIIp0Bc49': ['2.2-4900', '2.2-4901', '2.2-4902', '2.2-4903', '2.2-4904', '2.2-4905', '2.2-4906'], - 'sIIp0Bc50': ['2.2-5000', '2.2-5001', '2.2-5002', '2.2-5003'], - 'sIIp0Bc50.1': ['2.2-5004', '2.2-5005'], - 'sIIp0Bc51': ['2.2-5100', '2.2-5101', '2.2-5102', '2.2-5103', '2.2-5104', '2.2-5102.1'], - 'sIIp0Bc51.1': ['2.2-5105', '2.2-5106', '2.2-5107', '2.2-5108'], - 'sIIp0Bc52': ['2.2-5200', '2.2-5201', '2.2-5202', '2.2-5203', '2.2-5204', '2.2-5205', '2.2-5206', '2.2-5207', - '2.2-5208', '2.2-5209', '2.2-5210', '2.2-5211', '2.2-5212', '2.2-5213', '2.2-5214'], - 'sIIp0Bc53': ['2.2-5300', '2.2-5301', '2.2-5302', '2.2-5303', '2.2-5304', '2.2-5305', '2.2-5306', '2.2-5307', - '2.2-5308'], - 'sIIp0Bc54': ['2.2-5400', '2.2-5401', '2.2-5402', '2.2-5403', '2.2-5404', '2.2-5405', '2.2-5406', '2.2-5407', - '2.2-5408'], - 'sIIp0Bc55': ['2.2-5500', '2.2-5501', '2.2-5502', '2.2-5503', '2.2-5504', '2.2-5505', '2.2-5506', '2.2-5507', - '2.2-5508', '2.2-5509'], - 'sIIp0Bc55.1': ['2.2-5510', '2.2-5511'], - 'sIIp0Bc55.2': ['2.2-5512', '2.2-5513'], - 'sIIp0Bc55.3': ['2.2-5514'], - 'sIIp0Bc55.4': ['2.2-5515'], - 'sIIp0Cc56': ['2.2-5600', '2.2-5601', '2.2-5602', '2.2-5603'], - 'sIIp0Cc57': ['2.2-5700', '2.2-5701', '2.2-5702'], - 'sIIp0Cc58': ['2.2-5800', '2.2-5801', '2.2-5802', '2.2-5803'], - 'sIIp0Cc59': ['2.2-5900', '2.2-5901'], - 'sIIp0Cc60': ['2.2-6000'], + 's0Ip0Ac4.1': ['2.2-435.1', '2.2-435.2', '2.2-435.3', '2.2-435.4', '2.2-435.5'], + 's0Ip0Ac4.2': ['2.2-435.6', '2.2-435.7', '2.2-435.8', '2.2-435.9', '2.2-435.10'], + 's0Ip0Ac4.2:1': ['2.2-435.11'], 's0Ip0Ac4.2:2': ['2.2-435.12'], 's0Ip0Ac4.3': ['2.2-436', '2.2-437'], + 's0Ip0Ac4.4': ['2.2-438', '2.2-439', '2.2-440', '2.2-441', '2.2-442', '2.2-443', '2.2-444', '2.2-445', + '2.2-446', '2.2-447', '2.2-448', '2.2-449'], + 's0Ip0Bc05a01': ['2.2-500', '2.2-501', '2.2-502', '2.2-503', '2.2-504', '2.2-505', '2.2-506', '2.2-507', + '2.2-507.1', '2.2-507.2', '2.2-507.3', '2.2-508', '2.2-509', '2.2-509.1', '2.2-510', + '2.2-510.1', '2.2-510.2', '2.2-511', '2.2-511.1', '2.2-512', '2.2-513', '2.2-514', + '2.2-515', '2.2-515.1', + '2.2-515.2', '2.2-516'], 's0Ip0Bc05a02': ['2.2-517'], + 's0Ip0Bc05a03': ['2.2-518', '2.2-519'], + 's0Ip0Bc05a04': ['2.2-520', '2.2-521', '2.2 - 522', '2.2 - 523', '2.2 - 524'], + 's0Ip0Cc06a01': ['2.2-600', '2.2-601', '2.2-601.1', '2.2-602', + '2.2-603', '2.2-604', '2.2-604.1', '2.2-604.2', '2.2-604.2', '2.2-605', '2.2-606', + '2.2-607', '2.2-608', 
'2.2-608.1', '2.2-609', + '2.2-610', '2.2-611', '2.2-612', '2.2-613', '2.2-614', '2.2-614.1', '2.2-614.2', + '2.2-614.2:1', '2.2-614.3', '2.2-614.4', '2.2-614.5'], + 's0Ip0Cc06a02': ['2.2-615', '2.2-616', '2.2-617', '2.2-618', '2.2-619', '2.2-620', '2.2-621'], + 's0Ip0Cc07': ['2.2-700', '2.2-701', '2.2-702', '2.2-703', '2.2-704', '2.2-705', '2.2-706', '2.2-707', + '2.2-708', '2.2-709', '2.2-710', '2.2-711', '2.2-712', '2.2-713', '2.2-714', '2.2-715', + '2.2-716', '2.2-717', + '2.2-718', '2.2-719', '2.2-720'], + 's0Ip0Cc08a01': ['2.2-800', '2.2-801', '2.2-802', '2.2-803', '2.2-803.1', '2.2-804', '2.2-805', '2.2-806', + '2.2-807', '2.2-808', '2.2-809', '2.2-810', '2.2-811', '2.2-812', '2.2-813', '2.2-813.1', + '2.2-813.2'], 's0Ip0Cc08a02': ['2.2-814', '2.2-815', '2.2-816'], + 's0Ip0Cc09': ['2.2-900', '2.2-904.2'], + 's0Ip0Cc9.1': ['2.2-905', '2.2-906'], 's0Ip0Cc10': [' 2.2-1000', '2.2-1001'], + 's0Ip0Cc11a01': ['2.2-1100', '2.2-1101', '2.2-1102'], + 's0Ip0Cc11a02': ['2.2-1103', '2.2-1104', '2.2-1105', '2.2-1106', '2.2-1107', '2.2-1108'], + 's0Ip0Cc11a03': ['2.2-1109', '2.2-1110', '2.2-1111', '2.2-1112', '2.2-1113', '2.2-1114', '2.2-1115', + '2.2-1116', + '2.2-1117', '2.2-1118', '2.2-1119', '2.2-1120', '2.2-1121', '2.2-1122', + '2.2-1123', '2.2-1124', '2.2-1125', '2.2-1126', '2.2-1127', '2.2-1128'], + 's0Ip0Cc11a04': ['2.2-1129', '2.2-1130', '2.2-1131', '2.2-1132', '2.2-1133', '2.2-1134', '2.2-1135', + '2.2-1136', + '2.2-1137', '2.2-1138', '2.2-1139', '2.2-1140', '2.2-1141', '2.2-1142', + '2.2-1143', '2.2-1144', '2.2-1145', '2.2-1146', '2.2-1147', '2.2-1148', '2.2-1149', + '2.2-1150', + '2.2-1151', '2.2-1152', '2.2-1153', '2.2-1154', '2.2-1155', '2.2-1156', '2.2-1157', + '2.2-1158', + '2.2-1159', '2.2-1160', '2.2-1161'], + 's0Ip0Cc11a05': ['2.2-1162', '2.2-1163', '2.2-1164', '2.2-1165', '2.2-1166', '2.2-1167'], + 's0Ip0Cc11a06': ['2.2-1168', '2.2-1169', '2.2-1170', '2.2-1171', '2.2-1172'], + 's0Ip0Cc11a07': ['2.2-1173', '2.2-1174', '2.2-1175', '2.2-1176', '2.2-1177', '2.2-1178', '2.2-1179', + '2.2-1180', + '2.2-1181'], + 's0Ip0Cc11a08': ['2.2-1182', '2.2-1183'], + 's0Ip0Cc12': [ + '2.2-1200', '2.2-1201', '2.2-1202', '2.2-1203', '2.2-1204', '2.2-1205', '2.2-1206', '2.2-1207', + '2.2-1208', + '2.2-1209', '2.2-1210', '2.2-1211', '2.2-1212', '2.2-1213', '2.2-1201.1'], + 's0Ip0Cc13': ['2.2-1300', '2.2-1301', '2.2-1302', '2.2-1303', '2.2-1304'], + 's0Ip0Cc14': ['2.2-1400', '2.2-1401', '2.2-1402', '2.2-1403', '2.2-1404'], + 's0Ip0Cc15': ['2.2-1500', '2.2-1501', '2.2-1502', '2.2-1503', '2.2-1504', '2.2-1505', '2.2-1506', + '2.2-1507', + '2.2-1508', '2.2-1509', '2.2-1510', '2.2-1511', '2.2-1512', '2.2-1513', '2.2-1514', + '2.2-1501.1', + '2.2-1502.1', + '2.2-1503.1', '2.2-1503.2', '2.2-1503.3', '2.2-1509.1', '2.2-1509.2', '2.2-1509.3', + '2.2-1509.4'], + 's0Ip0Cc15.1': ['2.2-1515', '2.2-1516', '2.2-1517', '2.2-1518', '2.2-1519', '2.2-1520'], + 's0Ip0Cc16': ['2.2-1600', '2.2-1601', '2.2-1602', '2.2-1603', '2.2-1604', '2.2-1605', '2.2-1606'], + 's0Ip0Cc16.1a01': ['2.2-1603', '2.2-1604', '2.2-1605', '2.2-1606', '2.2-1607', '2.2-1608', '2.2-1609', + '2.2-1610'], + 's0Ip0Cc16.1a02': ['2.2-1611', '2.2-1612', '2.2-1613', '2.2-1614', '2.2-1615', '2.2-1616'], + 's0Ip0Cc16.1a03': ['2.2-1617'], + 's0Ip0Cc17': ['2.2-1700', '2.2-1701', '2.2-1702', '2.2-1703', '2.2-1704', '2.2-1705', '2.2-1706', + '2.2-1707', + '2.2-1708', '2.2-1709', '2.2-1710'], + 's0Ip0Cc18a01': ['2.2-1800', '2.2-1801', '2.2-1802', '2.2-1803', '2.2-1804', '2.2-1805', '2.2-1806', + '2.2-1807', + '2.2-1808', '2.2-1809', '2.2-1810', '2.2-1811', 
'2.2-1812'], + 's0Ip0Cc18a02': ['2.2-1813', '2.2-1814', '2.2-1815', '2.2-1816', '2.2-1817', '2.2-1818'], + 's0Ip0Cc18a03': ['2.2-1819', '2.2-1820', '2.2-1821', '2.2-1822', '2.2-1823', '2.2-1824', '2.2-1825', + '2.2-1826', + '2.2-1827'], + 's0Ip0Cc18a04': ['2.2-1828', '2.2-1829', '2.2-1830', '2.2-1831'], + 's0Ip0Cc18a4.1': ['2.2-1831.1', '2.2-1831.2', '2.2-1831.3', '2.2-1831.4', '2.2-1831.5'], + 's0Ip0Cc18a5': ['2.2-1832', '2.2-1833', '2.2-1834', '2.2-1835', '2.2-1836', '2.2-1837', '2.2-1838', + '2.2-1839', + '2.2-1840', '2.2-1841', '2.2-1842', '2.2-1843'], + 's0Ip0Cc19': ['2.2-1900', '2.2-1901', '2.2-1902', '2.2-1903', '2.2-1904', '2.2-1905'], + 's0Ip0Cc20': ['2.2-2000', '2.2-2000.1', '2.2-2001', '2.2-2001.1', '2.2-2001.2', '2.2-2001.3', '2.2-2001.4', + '2.2-2001.5', '2.2-2001.6', '2.2-2002', '2.2-2002.1', '2.2-2002.2', '2.2-2003', '2.2-2004', + '2.2-2004.1'], + 's0Ip0Cc20.1a01': ['2.2-2005', '2.2-2006', '2.2-2007', '2.2-2008', '2.2-2009', '2.2-2010', '2.2-2011', + '2.2-2012', '2.2-2013', '2.2-2014', '2.2-2015'], + 's0Ip0Cc20.1a02': ['2.2-2016', '2.2-2017', '2.2-2018', '2.2-2019', '2.2-2020', '2.2-2021'], + 's0Ip0Cc20.1a03': ['2.2-2022', '2.2-2023', '2.2-2024'], + 's0Ip0Cc20.1a04': ['2.2-2025', '2.2-2026', '2.2-2027', '2.2-2028', '2.2-2029', '2.2-2030'], + 's0Ip0Cc20.1a05': ['2.2-2031'], + 's0Ip0Cc20.1a06': ['2.2-2032'], + 's0Ip0Cc20.1a07': ['2.2-2033', '2.2-2034'], + 's0Ip0Dc21': ['2.2-2100', '2.2-2101', '2.2-2102', '2.2-2103', '2.2-2104', '2.2-2105', '2.2-2106'], + 's0Ip0Dc22a01': ['2.2-2200'], + 's0Ip0Dc22a02': ['2.2-2201', '2.2-2202', '2.2-2203', '2.2-2204', '2.2-2205', '2.2-2206', '2.2-2207', + '2.2-2208', + '2.2-2209', '2.2-2210', '2.2-2211', '2.2-2212', '2.2-2213', '2.2-2214', + '2.2-2215', '2.2-2216', '2.2-2217'], + 's0Ip0Dc22a03': ['2.2-2218', '2.2-2219', '2.2-2220', '2.2-2221', '2.2-2222', '2.2-2223', '2.2-2224', + '2.2-2225', + '2.2-2226', '2.2-2227', '2.2-2228', '2.2-2229', '2.2-2230', '2.2-2231', '2.2-2232', + '2.2-2233'], + 's0Ip0Dc22a04': ['2.2-2234', '2.2-2235', '2.2-2236', '2.2-2237', '2.2-2238', '2.2-2239', '2.2-2240', + '2.2-2241', + '2.2-2242', '2.2-2243', '2.2-2244', '2.2-2245', '2.2-2246'], + 's0Ip0Dc22a05': ['2.2-2247', '2.2-2248', '2.2-2249', '2.2-2250', '2.2-2251', '2.2-2252', '2.2-2253', + '2.2-2254', + '2.2-2255', '2.2-2256', '2.2-2257', '2.2-2258', '2.2-2259'], + 's0Ip0Dc22a06': ['2.2-2260', '2.2-2261', '2.2-2262', '2.2-2263', '2.2-2264', '2.2-2265', '2.2-2266', + '2.2-2267', '2.2-2268', '2.2-2269', '2.2-2270', '2.2-2271', '2.2-2272', '2.2-2273', + '2.2-2274', '2.2-2275', '2.2-2276', '2.2-2277', '2.2-2278'], + 's0Ip0Dc22a07': ['2.2-2279', '2.2-2280', '2.2-2281', '2.2-2282', '2.2-2283', '2.2-2284', '2.2-2285', + '2.2-2286', '2.2-2287', '2.2-2288', '2.2-2289', '2.2-2290', '2.2-2291', '2.2-2292', + '2.2-2293', '2.2-2294', '2.2-2295', '2.2-2296', '2.2-2297', '2.2-2298', '2.2-2299', + '2.2-2300', '2.2-2301', '2.2-2302', '2.2-2303', '2.2-2304', '2.2-2305', '2.2-2306', + '2.2-2307', + '2.2-2308', '2.2-2309', '2.2-2310', '2.2-2311', '2.2-2312', '2.2-2313', '2.2-2314'], + 's0Ip0Dc22a08': ['2.2-2315', '2.2-2316', '2.2-2317', '2.2-2318', '2.2-2319', '2.2-2320', '2.2-2321', + '2.2-2322', + '2.2-2323', '2.2-2324', '2.2-2325', '2.2-2326', '2.2-2327'], + 's0Ip0Dc22a09': ['2.2-2328', '2.2-2329', '2.2-2330', '2.2-2331', '2.2-2332', '2.2-2333', '2.2-2334', + '2.2-2335'], + 's0Ip0Dc22a10': ['2.2-2336', '2.2-2337', '2.2-2338', '2.2-2339', '2.2-2340', '2.2-2341', '2.2-2342', + '2.2-2343', + '2.2-2344', '2.2-2345', '2.2-2346', '2.2-2347', '2.2-2348', '2.2-2349', '2.2-2350'], + 
's0Ip0Dc22a11': ['2.2-2351', '2.2-2352', '2.2-2353', '2.2-2354', '2.2-2355', '2.2-2356', '2.2-2357', + '2.2-2358', + '2.2-2359', '2.2-2360', '2.2-2361', '2.2-2362', '2.2-2363', '2.2-2364'], + 's0Ip0Dc22a12': ['2.2-2365', '2.2-2366', '2.2-2367', '2.2-2368', '2.2-2369', '2.2-2370', '2.2-2371', + '2.2-2372', + '2.2-2373', '2.2-2374', '2.2-2375', '2.2-2376', '2.2-2377', '2.2-2378', '2.2-2379', + '2.2-2380'], + 's0Ip0Dc24a01': ['2.2-2400', '2.2-2401', '2.2-2402'], + 's0Ip0Dc24a02': ['2.2-2403'], + 's0Ip0Dc24a03': ['2.2-2404', '2.2-2405', '2.2-2406'], + 's0Ip0Dc24a04': ['2.2-2407', '2.2-2408'], + 's0Ip0Dc24a05': ['2.2-2409', '2.2-2410'], + 's0Ip0Dc24a06': ['2.2-2411', '2.2-2412'], + 's0Ip0Dc24a07': ['2.2-2413', '2.2-2414'], + 's0Ip0Dc24a08': ['2.2-2415', '2.2-2416', '2.2-2417', '2.2-2418', '2.2-2419', '2.2-2420'], + 's0Ip0Dc24a09': ['2.2-2421', '2.2-2422'], + 's0Ip0Dc24a10': ['2.2-2423'], + 's0Ip0Dc24a11': ['2.2-2424', '2.2-2425'], + 's0Ip0Dc24a12': ['2.2-2426', '2.2-2427', '2.2-2428', '2.2-2429', '2.2-2430', '2.2-2431', '2.2-2432', + '2.2-2433'], + 's0Ip0Dc24a13': ['2.2-2434'], + 's0Ip0Dc24a14': ['2.2-2435', '2.2-2436', '2.2-2437'], + 's0Ip0Dc24a15': ['2.2-2438', '2.2-2439'], + 's0Ip0Dc24a16': ['2.2-2441', '2.2-2442', '2.2-2443', '2.2-2444', '2.2-2445', '2.2-2446', '2.2-2447'], + 's0Ip0Dc24a17': ['2.2-2448', '2.2-2449', '2.2-2450', '2.2-2451'], + 's0Ip0Dc24a18': ['2.2-2452', '2.2-2453', '2.2-2454'], + 's0Ip0Dc24a19': ['2.2-2455', '2.2-2456'], + 's0Ip0Dc24a20': ['2.2-2457', '2.2-2458'], + 's0Ip0Dc24a21': ['2.2-2459', '2.2-2460', '2.2-2461'], + 's0Ip0Dc24a22': ['2.2-2462', '2.2-2463', '2.2-2464'], + 's0Ip0Dc24a23': ['2.2-2465', '2.2-2466', '2.2-2467', '2.2-2468', '2.2-2469'], + 's0Ip0Dc24a24': ['2.2-2470', '2.2-2471', '2.2-2472', '2.2-2473', '2.2-2474', '2.2-2475', '2.2-2476', + '2.2-2477'], + 's0Ip0Dc24a25': ['2.2-2478', '2.2-2479', '2.2-2480', '2.2-2481', '2.2-2482', '2.2-2483'], + 's0Ip0Dc24a26': ['2.2-2484', '2.2-2485', '2.2-2486', '2.2-2487', '2.2-2488', '2.2-2489', '2.2-2480', + '2.2-2481', + '2.2-2482', '2.2-2483', '2.2-2484', '2.2-2485', '2.2-2486', + '2.2-2487', '2.2-2488', '2.2-2489', '2.2-2490'], + 's0Ip0Dc24a27': ['2.2-2491', '2.2-2492', '2.2-2493', '2.2-2494', '2.2-2495'], + 's0Ip0Dc24a28': ['2.2-2496', '2.2-2497', '2.2-2498', '2.2-2499'], + 's0Ip0Dc24a29': ['2.2-2491.1', '2.2-2491.2', '2.2-2491.3', '2.2-2491.4'], + 's0Ip0Dc24a30': ['2.2-2491.5', '2.2-2491.6', '2.2-2491.7', '2.2-2491.8'], + 's0Ip0Dc25a01': ['2.2-500', '2.2-501', '2.2-502'], + 's0Ip0Dc25a02': ['2.2-503', '2.2-504', '2.2-505'], + 's0Ip0Dc25a03': ['2.2-506', '2.2-507'], + 's0Ip0Dc25a04': ['2.2-508', '2.2-509', '2.2-510'], + 's0Ip0Dc25a05': ['2.2-511', '2.2-512'], + 's0Ip0Dc25a06': ['2.2-513', '2.2-514', '2.2-515', '2.2-516', '2.2-517'], + 's0Ip0Dc25a07': ['2.2-518', '2.2-519', '2.2-520', '2.2-521', '2.2-522', '2.2-523'], + 's0Ip0Dc25a07.1': ['2.2-524', '2.2-525', '2.2-526', '2.2-527', '2.2-528', '2.2-529'], + 's0Ip0Dc25a08': ['2.2-530', '2.2-531'], + 's0Ip0Dc25a09': ['2.2-532', '2.2-533', '2.2-534', '2.2-535', '2.2-536'], + 's0Ip0Dc25a010': ['2.2-537', '2.2-538', '2.2-539', '2.2-540', '2.2-541', '2.2-542', '2.2-543'], + 's0Ip0Dc25a011': ['2.2-544', '2.2-545', '2.2-546', '2.2-547', '2.2-548', '2.2-549', '2.2-550'], + 's0Ip0Dc25a012': ['2.2-551', '2.2-552', '2.2-553', '2.2-554', '2.2-555', '2.2-556', '2.2-557'], + 's0Ip0Dc25a013': ['2.2-558', '2.2-559', '2.2-560', '2.2-561', '2.2-562', '2.2-563', '2.2-564'], + 's0Ip0Dc26a01': ['2.2-2600', '2.2-2601', '2.2-2602'], + 's0Ip0Dc26a02': ['2.2-2603', '2.2-2604'], + 
's0Ip0Dc26a03': ['2.2-2605', '2.2-2606', '2.2-2607', '2.2-2608'], + 's0Ip0Dc26a04': ['2.2-2609', '2.2-2610'], + 's0Ip0Dc26a05': ['2.2-2611', '2.2-2612', '2.2-2613'], + 's0Ip0Dc26a06': ['2.2-2614', '2.2-2615', '2.2-2616'], + 's0Ip0Dc26a07': ['2.2-2617', '2.2-2618', '2.2-2619'], + 's0Ip0Dc26a08': ['2.2-2620', '2.2-2621', '2.2-2622', '2.2-2623', '2.2-2624', '2.2-2625'], + 's0Ip0Dc26a09': ['2.2-2626', '2.2-2627'], + 's0Ip0Dc26a10': ['2.2-2628', '2.2-2629'], + 's0Ip0Dc26a11': ['2.2-2630', '2.2-2631'], + 's0Ip0Dc26a12': ['2.2-2632', '2.2-2633', '2.2-2634', '2.2-2635', '2.2-2636', '2.2-2637', '2.2-2638', + '2.2-2639'], + 's0Ip0Dc26a13': ['2.2-2640', '2.2-2641'], + 's0Ip0Dc26a14': ['2.2-2642', '2.2-2643'], + 's0Ip0Dc26a15': ['2.2-2644', '2.2-2645', '2.2-2646', '2.2-2647'], + 's0Ip0Dc26a16': ['2.2-2648', '2.2-2649'], + 's0Ip0Dc26a17': ['2.2-2650'], + 's0Ip0Dc26a18': ['2.2-2651'], + 's0Ip0Dc26a19': ['2.2-2652', '2.2-2653', '2.2-2654'], + 's0Ip0Dc26a20': ['2.2-2655', '2.2-2656'], + 's0Ip0Dc26a21': ['2.2-2657', '2.2-2658', '2.2-2659', '2.2-2660', '2.2-2661', '2.2-2662', '2.2-2663'], + 's0Ip0Dc26a22': ['2.2-2664'], + 's0Ip0Dc26a23': ['2.2-2665', '2.2-2666'], + 's0Ip0Dc26a23.1': ['2.2-2666.1', '2.2-2666.2', '2.2-2666.3'], + 's0Ip0Dc26a24': ['2.2-2667', '2.2-2668'], + 's0Ip0Dc26a25': ['2.2-2669', '2.2-2670', '2.2-2671', '2.2-2672', '2.2-2673', '2.2-2674'], + 's0Ip0Dc26a26': ['2.2-2674', '2.2-2678'], + 's0Ip0Dc26a27': ['2.2-2679', '2.2-2680'], + 's0Ip0Dc26a28': ['2.2-2681', '2.2-2682'], + 's0Ip0Dc26a29': ['2.2-2683', '2.2-2684', '2.2-2685', '2.2-2686', '2.2-2687', '2.2-2688', '2.2-2689'], + 's0Ip0Dc26a30': ['2.2-2690', '2.2-2691', '2.2-2692', '2.2-2693', '2.2-2694', '2.2-2695'], + 's0Ip0Dc26a31': ['2.2-2696', '2.2-2697'], + 's0Ip0Dc26a32': ['2.2-2698', '2.2-2699'], + 's0Ip0Dc26a33': ['2.2-2699.1', '2.2-2699.2'], + 's0Ip0Dc26a34': ['2.2-2699.3', '2.2-2699.4'], + 's0Ip0Dc26a35': ['2.2-2699.5', '2.2-2699.6', '2.2-2699.7'], + 's0Ip0Dc26a36': ['2.2-2699.8', '2.2-2699.9', '2.2-2699.10', '2.2-2699.11', '2.2-2699.12'], + 's0Ip0Dc26a37': ['2.2-2699.13', '2.2-2699.14'], + 's0Ip0Dc27a01': ['2.2-2700', '2.2-2701', '2.2-2702', '2.2-2703', '2.2-2704'], + 's0Ip0Dc27a02': ['2.2-2705', '2.2-2706', '2.2-2707', '2.2-2708'], + 's0Ip0Dc27a03': ['2.2-2709', '2.2-2710'], + 's0Ip0Dc27a04': ['2.2-2711'], + 's0Ip0Dc27a05': ['2.2-2712', '2.2-2713', '2.2-2714'], + 's0Ip0Dc27a06': ['2.2-2715', '2.2-2716', '2.2-2717', '2.2-2718', '2.2-2719'], + 's0Ip0Dc27a07': ['2.2-2720', '2.2-2721', '2.2-2722', '2.2-2723', '2.2-2724'], + 's0Ip0Dc27a08': ['2.2-2725', '2.2-2726', '2.2-2727', '2.2-2728', '2.2-2729', '2.2-2730', '2.2-2731'], + 's0Ip0Dc27a09': ['2.2-2732', '2.2-2733'], + 's0Ip0Dc27a10': ['2.2-2734', '2.2-2735', '2.2-2736', '2.2-2737'], + 's0Ip0Dc27a11': ['2.2-2738', '2.2-2739', '2.2-2740', '2.2-2741', '2.2-2742', '2.2-2743'], + 's0Ip0Dc27.1': ['2.2-2744', '2.2-2745', '2.2-2746', '2.2-2747', '2.2-2748', '2.2-2749', '2.2-2750', + '2.2-2751', + '2.2-2752', '2.2-2753', '2.2-2754', '2.2-2755', '2.2-2756', '2.2-2757'], + 's0Ip0Ec28': ['2.2-2800', '2.2-2801', '2.2-2802', '2.2-2803', '2.2-2804', '2.2-2805', '2.2-2806', + '2.2-2807', + '2.2-2808', '2.2-2809', '2.2-2810', '2.2-2811', '2.2-2812', '2.2-2813', '2.2-2814', + '2.2-2815', + '2.2-2816', '2.2-2817', '2.2-2818', '2.2-2819', '2.2-2820', '2.2-2821', '2.2-2822', + '2.2-2823', + '2.2-2824', '2.2-2825', '2.2-2826', '2.2-2827', '2.2-2828', '2.2-2829', '2.2-2830', + '2.2-2831', + '2.2-2832'], + 's0Ip0Ec29': ['2.2-2900', '2.2-2901', '2.2-2902', '2.2-2903', '2.2-2904', '2.2-2905'], + 's0Ip0Ec30': 
['2.2-3000', '2.2-3001', '2.2-3002', '2.2-3003', '2.2-3004', '2.2-3005', '2.2-3006', + '2.2-3007', + '2.2-3008'], + 's0Ip0Ec30.1': ['2.2-3009', '2.2-3010', '2.2-3011', '2.2-3012', '2.2-3013', '2.2-3014'], + 's0Ip0Ec31a01': ['2.2-3100', '2.2-3101'], + 's0Ip0Ec31a02': ['2.2-3102', '2.2-3103', '2.2-3104'], + 's0Ip0Ec31a03': ['2.2-3105', '2.2-3106', '2.2-3107', '2.2-3108', '2.2-3109', '2.2-3110'], + 's0Ip0Ec31a04': ['2.2-3111', '2.2-3112'], + 's0Ip0Ec31a05': ['2.2-3113', '2.2-3114', '2.2-3115', '2.2-3116', '2.2-3117', '2.2-3118'], + 's0Ip0Ec31a06': ['2.2-3119'], + 's0Ip0Ec31a07': ['2.2-3120', '2.2-3121', '2.2-3122', '2.2-3123', '2.2-3124', '2.2-3125', '2.2-3126', + '2.2-3127'], + 's0Ip0Ec31a08': ['2.2-3128', '2.2-3129', '2.2-3130', '2.2-3131'], + 's0Ip0Ec31a09': ['2.2-3132'], + 's0Ip0Ec32': ['2.2-3200', '2.2-3201', '2.2-3202', '2.2-3203', '2.2-3204', '2.2-3205', '2.2-3206'], + 'sIIp0Ac33': ['2.2-3300', '2.2-3301', '2.2-3302', '2.2-3303', '2.2-3304', '2.2-3305', '2.2-3306', + '2.2-3307', + '2.2-3308', '2.2-3309', '2.2-3310', '2.2-3311', '2.2-3312', '2.2-3313', + '2.2-3314', '2.2-3315', '2.2-3316', '2.2-3317', '2.2-3318', '2.2-3319', '2.2-3320', + '2.2-3321', + '2.2-3322'], + 'sIIp0Ac34': ['2.2-3400', '2.2-3401', '2.2-3402'], + 'sIIp0Ac35': ['2.2-3500', '2.2-3501', '2.2-3502', '2.2-3503', '2.2-3504'], + 'sIIp0Ac36': ['2.2-3600', '2.2-3601', '2.2-3602', '2.2-3603', '2.2-3604', '2.2-3605'], + 'sIIp0Bc37': ['2.2-3700', '2.2-3701', '2.2-3702', '2.2-3703', '2.2-3704', '2.2-3705', '2.2-3706', + '2.2-3707', + '2.2-3708', '2.2-3709', '2.2-3710', '2.2-3711', '2.2-3712', '2.2-3713', '2.2-3714', + '2.2-3715'], + 'sIIp0Bc38': ['2.2-3800', '2.2-3801', '2.2-3802', '2.2-3803', '2.2-3804', '2.2-3805', '2.2-3806', + '2.2-3807', + '2.2-3808', '2.2-3809'], + 'sIIp0Bc38.1': ['2.2-3815', '2.2-3816'], 'sIIp0Bc38.2': ['2.2-3817', '2.2-3818', '2.2-3819'], + 'sIIp0Bc39': ['2.2-3900', '2.2-3901', '2.2-3902', '2.2-3903', + '2.2-3904', '2.2-3905', '2.2-3906', '2.2-3907', + '2.2-3908', '2.2-3909'], + 'sIIp0Bc40a01': ['2.2-4000', '2.2-4001', '2.2-4002', '2.2-4003', '2.2-4004'], + 'sIIp0Bc40a02': ['2.2-4005', '2.2-4006', '2.2-4007', '2.2-4008', '2.2-4009', '2.2-4010', '2.2-4011', + '2.2-4012', + '2.2-4013', '2.2-4014', '2.2-4015', '2.2-4016'], + 'sIIp0Bc40a03': ['2.2-4017', '2.2-4018', '2.2-4019', '2.2-4020', '2.2-4021', '2.2-4022', '2.2-4023'], + 'sIIp0Bc40a04': ['2.2-4024', '2.2-4024.1', '2.2-4024.2'], + 'sIIp0Bc40a05': ['2.2-4025', '2.2-4026', '2.2-4027', '2.2-4028', '2.2-4029', '2.2-4030'], + 'sIIp0Bc40a06': ['2.2-4031', '2.2-4032', '2.2-4033'], + 'sIIp0Bc41': ['2.2 - 4100', '2.2 - 4101', '2.2 - 4102', '2.2 - 4103', '2.2 - 4104'], + 'sIIp0Bc41.1': ['2.2-4115', '2.2-4116', '2.2-4117', '2.2-4118', '2.2-4119'], + 'sIIp0Bc42': ['2.2-4200', '2.2-4201'], + 'sIIp0Bc43a01': ['2.2-4300', '2.2-4301', '2.2-4302', '2.2-4302.1', '2.2-4302.2'], + 'sIIp0Bc43a02': ['2.2-4303', '2.2-4304', '2.2-4305', '2.2-4306', '2.2-4307', '2.2-4308', '2.2-4309', + '2.2-4310', + '2.2-4311', '2.2-4312', '2.2-4313', '2.2-4314', '2.2-4315', '2.2-4316', '2.2-4317', + '2.2-4318', '2.2-4319', '2.2-4320', '2.2-4321', '2.2-4322', '2.2-4323', '2.2-4324', + '2.2-4325', + '2.2-4326', '2.2-4327', '2.2-4328', '2.2-4329', '2.2-4330', '2.2-4331', '2.2-4332', + '2.2-4333', + '2.2-4334', + '2.2-4335', '2.2-4336', '2.2-4337', '2.2-4338', '2.2-4339', '2.2-4340', '2.2-4341', + '2.2-4342'], + 'sIIp0Bc43a03': ['2.2-4343', '2.2-4344', '2.2-4345', '2.2-4346'], + 'sIIp0Bc43a04': ['2.2-4347', '2.2-4348', '2.2-4349', '2.2-4350', '2.2-4351', '2.2-4352', '2.2-4353', + '2.2-4354', + 
'2.2-4355', '2.2-4356'], + 'sIIp0Bc43a05': ['2.2-4357', '2.2-4358', '2.2-4359', '2.2-4360', '2.2-4361', '2.2-4362', '2.2-4363', + '2.2-4364', + '2.2-4365', '2.2-4366'], + 'sIIp0Bc43a06': ['2.2-4367', '2.2-4368', '2.2-4369', '2.2-4370', '2.2-4371', '2.2-4372', '2.2-4373', + '2.2-4374', + '2.2-4375', '2.2-4376', '2.2-4377'], + 'sIIp0Bc43.1a01': ['2.2-4378', '2.2-4379'], + 'sIIp0Bc43.1a02': ['2.2-4380'], + 'sIIp0Bc43.1a03': ['2.2-4381'], + 'sIIp0Bc43.1a04': ['2.2-4382'], + 'sIIp0Bc43.1a05': ['2.2-4383'], + 'sIIp0Bc44': ['2.2-4400', '2.2-4401', '2.2-4402', '2.2-4403', '2.2-4404', '2.2-4405', '2.2-4406', + '2.2-4407', + '2.2-4408', '2.2-4409', '2.2-4410', '2.2-4411'], + 'sIIp0Bc45': ['2.2-4500', '2.2-4501', '2.2-4502', '2.2-4503', '2.2-4504', '2.2-4505', '2.2-4506', + '2.2-4507', + '2.2-4508', '2.2-4509', '2.2-4510', '2.2-4511', '2.2-4512', + '2.2-4513', '2.2-4514', '2.2-4515', '2.2-4516', '2.2-4517', '2.2-4518', '2.2-4519'], + 'sIIp0Bc46': ['2.2-4600', '2.2-4601', '2.2-4602', '2.2-4603', '2.2-4604', '2.2-4605', '2.2-4606'], + 'sIIp0Bc47': ['2.2-4700', '2.2-4701', '2.2-4702', '2.2-4703', '2.2-4704', '2.2-4705'], + 'sIIp0Bc48': ['2.2-4800', '2.2-4801', '2.2-4802', '2.2-4803', '2.2-4804', '2.2-4805', '2.2-4806', + '2.2-4807', + '2.2-4808', '2.2-4809'], + 'sIIp0Bc49': ['2.2-4900', '2.2-4901', '2.2-4902', '2.2-4903', '2.2-4904', '2.2-4905', '2.2-4906'], + 'sIIp0Bc50': ['2.2-5000', '2.2-5001', '2.2-5002', '2.2-5003'], + 'sIIp0Bc50.1': ['2.2-5004', '2.2-5005'], + 'sIIp0Bc51': ['2.2-5100', '2.2-5101', '2.2-5102', '2.2-5103', '2.2-5104', '2.2-5102.1'], + 'sIIp0Bc51.1': ['2.2-5105', '2.2-5106', '2.2-5107', '2.2-5108'], + 'sIIp0Bc52': ['2.2-5200', '2.2-5201', '2.2-5202', '2.2-5203', '2.2-5204', '2.2-5205', '2.2-5206', + '2.2-5207', + '2.2-5208', '2.2-5209', '2.2-5210', '2.2-5211', '2.2-5212', '2.2-5213', '2.2-5214'], + 'sIIp0Bc53': ['2.2-5300', '2.2-5301', '2.2-5302', '2.2-5303', '2.2-5304', '2.2-5305', '2.2-5306', + '2.2-5307', + '2.2-5308'], + 'sIIp0Bc54': ['2.2-5400', '2.2-5401', '2.2-5402', '2.2-5403', '2.2-5404', '2.2-5405', '2.2-5406', + '2.2-5407', + '2.2-5408'], + 'sIIp0Bc55': ['2.2-5500', '2.2-5501', '2.2-5502', '2.2-5503', '2.2-5504', '2.2-5505', '2.2-5506', + '2.2-5507', + '2.2-5508', '2.2-5509'], + 'sIIp0Bc55.1': ['2.2-5510', '2.2-5511'], + 'sIIp0Bc55.2': ['2.2-5512', '2.2-5513'], + 'sIIp0Bc55.3': ['2.2-5514'], + 'sIIp0Bc55.4': ['2.2-5515'], + 'sIIp0Cc56': ['2.2-5600', '2.2-5601', '2.2-5602', '2.2-5603'], + 'sIIp0Cc57': ['2.2-5700', '2.2-5701', '2.2-5702'], + 'sIIp0Cc58': ['2.2-5800', '2.2-5801', '2.2-5802', '2.2-5803'], + 'sIIp0Cc59': ['2.2-5900', '2.2-5901'], + 'sIIp0Cc60': ['2.2-6000'], } title_5_1 = { - 'c01a01': ['5.1-1', '5.1-1.1', '5.1-1.2', '5.1-1.3', '5.1-1.4', '5.1-1.5', '5.1-1.6', '5.1-1.7', - '5.1-2', '5.1-2.1', '5.1-2.2', '5.1-2.3', '5.1-2.4', '5.1-2.5', '5.1-2.6', '5.1-2.7', - '5.1-2.8', '5.1-2.9', '5.1-2.10', '5.1-2.11', '5.1-2.12', '5.1-2.13', '5.1-2.14', - '5.1-2.15', '5.1-2.16', '5.1-2.17', '5.1-2.18', '5.1-2.19', '5.1-2.20', '5.1-2.21', - '5.1-2.22', '5.1-2.23', - '5.1-2.24', '5.1-3', '5.1-4', '5.1-5', '5.1-6', '5.1-7', '5.1-7.1', '5.1-7.2', - '5.1-7.3', '5.1-8', '5.1-9', '5.1-9.1', '5.1-9.2', '5.1-9.3', '5.1-9.4', '5.1-9.5', - '5.1-9.6', '5.1-9.7', '5.1-9.8', '5.1-9.9', - '5.1-10', '5.1-11', '5.1-12'], - 'c01a02': ['5.1-13', '5.1-14', '5.1-15', '5.1-16', '5.1-17', '5.1-18', '5.1-19', '5.1-20', - '5.1-21', '5.1-22', '5.1-23', '5.1-24', '5.1-25'], - 'c01a03': ['5.1-25.1', '5.1-25.2', '5.1-25.3', '5.1-25.4'], - 'c02': ['5.1-26', '5.1-27', '5.1-28', '5.1-29', '5.1-30'], - 
'c2.1': ['5.1-30.1', '5.1-30.2', '5.1-30.3', '5.1-30.4', '5.1-30.5', '5.1-30.6', '5.1-30.7', - '5.1-30.8', '5.1-30.9', '5.1-30.10'], - 'c03a01': ['5.1-31', '5.1-32', '5.1-33', '5.1-34', '5.1-35', '5.1-36', '5.1-37', '5.1-38', - '5.1-39', '5.1-40', '5.1-41'], - 'c03a02': ['5.1-42', '5.1-43', '5.1-44', '5.1-45', '5.1-46'], - 'c03a03': ['5.1-47', '5.1-48'], - 'c04': ['5.1-49', '5.1-50'], - 'c05': ['5.1-51', '5.1-52', '5.1-53', '5.1-54', '5.1-55'], - 'c06': ['5.1-56', '5.1-57', '5.1-58', '5.1-59', '5.1-60', '5.1-61', '5.1-62', '5.1-63', - '5.1-64', '5.1-65', '5.1-66', '5.1-67', '5.1-68', '5.1-69', '5.1-70', '5.1-71', - '5.1-72', - '5.1-73', '5.1-74', '5.1-75', '5.1-76'], - 'c07': ['5.1-77', '5.1-78', '5.1-79', '5.1-80', '5.1-81', '5.1-82'], - 'c08': ['5.1-83', '5.1-84', '5.1-85', '5.1-86', '5.1-87', '5.1-88'], - 'c8.1': ['5.1-88.1', '5.1-88.2', '5.1-88.3', '5.1-88.4', '5.1-88.5', '5.1-88.6'], - 'c8.2': ['5.1-88.7', '5.1-88.8', '5.1-88.9', '5.1-88.10'], - 'c09a01': ['5.1-89', '5.1-90', '5.1-91', '5.1-92', '5.1-93'], - 'c09a02': ['5.1-94', '5.1-95', '5.1-96', '5.1-97', '5.1-98', '5.1-99', '5.1-100', '5.1-101', - '5.1-102'], - 'c09a03': ['5.1-103', '5.1-104', '5.1-105', '5.1-106'], - 'c09a04': ['5.1-107'], - 'c09a05': ['5.1-108', '5.1-109', '5.1-110', '5.1-111', '5.1-112'], - 'c09a06': ['5.1-113', '5.1-114', '5.1-115'], - 'c09a07': ['5.1-116', '5.1-117', '5.1-118', '5.1-119', '5.1-120'], - 'c09a08':['5.1-121', '5.1-122', '5.1-123', '5.1-124', '5.1-125', '5.1-126', '5.1-127', '5.1-128', '5.1-129', '5.1-130', '5.1-131', '5.1-132', '5.1-133', '5.1-134', '5.1-135', '5.1-136', '5.1-137', '5.1-138', '5.1-139'], - 'c09a09':['5.1-140', '5.1-141', '5.1-142', '5.1-143', '5.1-144', '5.1-145', '5.1-146', '5.1-147', - '5.1-148', '5.1-149', '5.1-150', '5.1-151'], - 'c10': ['5.1-152', '5.1-153', '5.1-154', '5.1-155', '5.1-156', '5.1-157', '5.1-158', - '5.1-159', '5.1-160', '5.1-161', '5.1-162', '5.1-163', '5.1-164', '5.1-165', - '5.1-166', '5.1-167', '5.1-168', '5.1-169', '5.1-170', - '5.1-171', '5.1-172', '5.1-173', '5.1-174', '5.1-175', '5.1-176', '5.1-177', - '5.1-178']} + 'c01a01': ['5.1-1', '5.1-1.1', '5.1-1.2', '5.1-1.3', '5.1-1.4', '5.1-1.5', '5.1-1.6', '5.1-1.7', + '5.1-2', '5.1-2.1', '5.1-2.2', '5.1-2.3', '5.1-2.4', '5.1-2.5', '5.1-2.6', '5.1-2.7', + '5.1-2.8', '5.1-2.9', '5.1-2.10', '5.1-2.11', '5.1-2.12', '5.1-2.13', '5.1-2.14', + '5.1-2.15', '5.1-2.16', '5.1-2.17', '5.1-2.18', '5.1-2.19', '5.1-2.20', '5.1-2.21', + '5.1-2.22', '5.1-2.23', + '5.1-2.24', '5.1-3', '5.1-4', '5.1-5', '5.1-6', '5.1-7', '5.1-7.1', '5.1-7.2', + '5.1-7.3', '5.1-8', '5.1-9', '5.1-9.1', '5.1-9.2', '5.1-9.3', '5.1-9.4', '5.1-9.5', + '5.1-9.6', '5.1-9.7', '5.1-9.8', '5.1-9.9', + '5.1-10', '5.1-11', '5.1-12'], + 'c01a02': ['5.1-13', '5.1-14', '5.1-15', '5.1-16', '5.1-17', '5.1-18', '5.1-19', '5.1-20', + '5.1-21', '5.1-22', '5.1-23', '5.1-24', '5.1-25'], + 'c01a03': ['5.1-25.1', '5.1-25.2', '5.1-25.3', '5.1-25.4'], + 'c02': ['5.1-26', '5.1-27', '5.1-28', '5.1-29', '5.1-30'], + 'c2.1': ['5.1-30.1', '5.1-30.2', '5.1-30.3', '5.1-30.4', '5.1-30.5', '5.1-30.6', '5.1-30.7', + '5.1-30.8', '5.1-30.9', '5.1-30.10'], + 'c03a01': ['5.1-31', '5.1-32', '5.1-33', '5.1-34', '5.1-35', '5.1-36', '5.1-37', '5.1-38', + '5.1-39', '5.1-40', '5.1-41'], + 'c03a02': ['5.1-42', '5.1-43', '5.1-44', '5.1-45', '5.1-46'], + 'c03a03': ['5.1-47', '5.1-48'], + 'c04': ['5.1-49', '5.1-50'], + 'c05': ['5.1-51', '5.1-52', '5.1-53', '5.1-54', '5.1-55'], + 'c06': ['5.1-56', '5.1-57', '5.1-58', '5.1-59', '5.1-60', '5.1-61', '5.1-62', '5.1-63', + '5.1-64', '5.1-65', 
'5.1-66', '5.1-67', '5.1-68', '5.1-69', '5.1-70', '5.1-71', + '5.1-72', + '5.1-73', '5.1-74', '5.1-75', '5.1-76'], + 'c07': ['5.1-77', '5.1-78', '5.1-79', '5.1-80', '5.1-81', '5.1-82'], + 'c08': ['5.1-83', '5.1-84', '5.1-85', '5.1-86', '5.1-87', '5.1-88'], + 'c8.1': ['5.1-88.1', '5.1-88.2', '5.1-88.3', '5.1-88.4', '5.1-88.5', '5.1-88.6'], + 'c8.2': ['5.1-88.7', '5.1-88.8', '5.1-88.9', '5.1-88.10'], + 'c09a01': ['5.1-89', '5.1-90', '5.1-91', '5.1-92', '5.1-93'], + 'c09a02': ['5.1-94', '5.1-95', '5.1-96', '5.1-97', '5.1-98', '5.1-99', '5.1-100', '5.1-101', + '5.1-102'], + 'c09a03': ['5.1-103', '5.1-104', '5.1-105', '5.1-106'], + 'c09a04': ['5.1-107'], + 'c09a05': ['5.1-108', '5.1-109', '5.1-110', '5.1-111', '5.1-112'], + 'c09a06': ['5.1-113', '5.1-114', '5.1-115'], + 'c09a07': ['5.1-116', '5.1-117', '5.1-118', '5.1-119', '5.1-120'], + 'c09a08': ['5.1-121', '5.1-122', '5.1-123', '5.1-124', '5.1-125', '5.1-126', '5.1-127', '5.1-128', + '5.1-129', '5.1-130', '5.1-131', '5.1-132', '5.1-133', '5.1-134', '5.1-135', '5.1-136', + '5.1-137', '5.1-138', '5.1-139'], + 'c09a09': ['5.1-140', '5.1-141', '5.1-142', '5.1-143', '5.1-144', '5.1-145', '5.1-146', '5.1-147', + '5.1-148', '5.1-149', '5.1-150', '5.1-151'], + 'c10': ['5.1-152', '5.1-153', '5.1-154', '5.1-155', '5.1-156', '5.1-157', '5.1-158', + '5.1-159', '5.1-160', '5.1-161', '5.1-162', '5.1-163', '5.1-164', '5.1-165', + '5.1-166', '5.1-167', '5.1-168', '5.1-169', '5.1-170', + '5.1-171', '5.1-172', '5.1-173', '5.1-174', '5.1-175', '5.1-176', '5.1-177', + '5.1-178']} title_10_1 = {'sIIc11a06': ['10.1-1149', '10.1-1150'], 'sIIc15': ['10.1-1500', '10.1-1501', '10.1-1502', '10.1-1503', '10.1-1504'], @@ -1806,8 +2303,8 @@ def add_citation(self): 'c24': ['22.1-360', '22.1-361']} title_28_2 = {'sIIc10a01': ['28.2-1000', '28.2-1000.1', '28.2-1000.2'], - 'sIIc10a02': ['28.2-1001', '28.2-1002', '28.2-1003', '28.2-1004', '28.2-1005', - '28.2-1006', '28.2-1007']} + 'sIIc10a02': ['28.2-1001', '28.2-1002', '28.2-1003', '28.2-1004', '28.2-1005', + '28.2-1006', '28.2-1007']} title_29_1 = {'c05a2.1': ['29.1-530.5']} @@ -1840,14 +2337,14 @@ def add_citation(self): title_59_1 = {'c29a05': ['59.1-394.1', '59.1-394.2', '59.1-394.3', '59.1-394.4']} title_62_1 = {'c05': ['62.1-64', '62.1-65', '62.1-66', '62.1-67', '62.1-68', '62.1-69'], - 'c5.2': ['62.1-69.5'], 'c5.4': ['62.1-69.34', '62.1-69.35'], - 'c5.5': ['62.1-69.36', '62.1-69.37', '62.1-69.38', '62.1-69.39', '62.1-69.40', - '62.1-69.41', '62.1-69.42', '62.1-69.43', '62.1-69.44'], - 'c5.6': ['62.1-69.45', '62.1-69.46', '62.1-69.47', '62.1-69.48', '62.1-69.49', - '62.1-69.50', '62.1-69.51', '62.1-69.52'], - 'c06': ['62.1-70', '62.1-71', '62.1-72', '62.1-73', '62.1-74', '62.1-75', '62.1-76', - '62.1-77', '62.1-77.1', '62.1-78', '62.1-79'], - 'c6.1': ['62.1-79.1', '62.1-79.2']} + 'c5.2': ['62.1-69.5'], 'c5.4': ['62.1-69.34', '62.1-69.35'], + 'c5.5': ['62.1-69.36', '62.1-69.37', '62.1-69.38', '62.1-69.39', '62.1-69.40', + '62.1-69.41', '62.1-69.42', '62.1-69.43', '62.1-69.44'], + 'c5.6': ['62.1-69.45', '62.1-69.46', '62.1-69.47', '62.1-69.48', '62.1-69.49', + '62.1-69.50', '62.1-69.51', '62.1-69.52'], + 'c06': ['62.1-70', '62.1-71', '62.1-72', '62.1-73', '62.1-74', '62.1-75', '62.1-76', + '62.1-77', '62.1-77.1', '62.1-78', '62.1-79'], + 'c6.1': ['62.1-79.1', '62.1-79.2']} title_63_1 = {'': ['63.1-1']} @@ -1868,57 +2365,47 @@ def add_citation(self): title_56 = {'': ['56-529', '56-530']} - - - target = "_blank" for tag in self.soup.find_all("p"): if tag.span: tag.span.unwrap() - if 
re.search(r"§{0,2}\s\d+(\.\d+)*-\d+(\.\d+)*\.*\s*(:\d+)*|\d+\sVa.\s\d+|S\.E\. \d+|Va\. App\. LEXIS \d+|Titles (\d+(\.\d+)*)", tag.text.strip()): + if re.search( + r"(\.\d+)*-\d+(\.\d+)*\.*\s*(:\d+)*|\d+\sVa.\s\d+|S\.E\. \d+|Va\. App\. LEXIS \d+|Titles (\d+(\.\d+)*)", + tag.text.strip()): text = str(tag) - # for match in set(x[0] for x in re.findall(r'(§\s*\d+(\.\d+)*-\d+(\.\d+)*(:\d+)*|' - # r'§§\s*\d+(\.\d+)*-\d+(\.\d+)*(:\d+)*|' - # r'\s*\d+(\.\d+)*-\d+(\.\d+)*(:\d+)*|\d+\sVa.\s\d+|S\.E\. \d+|' - # r'Va\. App\. LEXIS \d+|Titles (\d+(\.\d+)*))', - # tag.get_text())): - - - - - for match in set(x[0] for x in re.findall(r'(§{0,2}\s\d+(\.\d+)*-\d+(\.\d+)*\.*\s*(:\d+)*|\d+\sVa\.\s\d+|S\.E\. \d+|Va\. App\. LEXIS \d+|Titles (\d+(\.\d+)*))', - tag.get_text())): - + for match in set(x[0] for x in re.findall( + r'((\.\d+)*-\d+(\.\d+)*\.*\s*(:\d+)*|\d+\sVa\.\s\d+|S\.E\. \d+|Va\. App\. LEXIS \d+|Titles (\d+(\.\d+)*))', + tag.get_text())): inside_text = re.sub(r'<p\sclass="\w\d+">|</p>|^<p\sclass="\w\d+"\sid=".+">|</p>$', '', text, re.DOTALL) - if re.search(r"§*\s*(?P<sec_id>\d+(\.\d+)*-\d+(\.\d+)*(:\d+)*)", match.strip()): - cite_id = re.search(r"§*\s*(?P<sec_id>(?P<title_id>\d+(\.\d+)*)-(?P<chap_id>\d+)(\.\d+)*)\.*\s*", match.strip()) + if re.search(r"(?P<sec_id>\d+(\.\d+)*-\d+(\.\d+)*(:\d+)*)", match.strip()): + cite_id = re.search( + r"(?P<sec_id>(?P<title_id>\d+(\.\d+)*)-(?P<chap_id>\d+)(\.\d+)*)\.*\s*", match.strip()) title_id = f'title_{cite_id.group("title_id").zfill(2)}' if cite_id.group("title_id").zfill(2) == self.title_id: target = "_self" else: target = "_blank" + if not re.search(r"^§*\s*\d+\.\d+", cite_id.group("title_id").zfill(2)): + if cite_id.group("title_id").zfill(2) in ['01', '11']: + for key, value in eval(title_id).items(): + if cite_id.group("sec_id") in value: + tag.clear() + chap_id = key - if not re.search(r"^§*\s*\d+\.\d+",cite_id.group("title_id").zfill(2)): - if cite_id.group("title_id").zfill(2) in ['01','11'] : - for key,value in eval(title_id).items(): - if cite_id.group("sec_id") in value: - tag.clear() - chap_id = key - - tag_id = f'gov.va.code.title.{cite_id.group("title_id").zfill(2)}.html#t{cite_id.group("title_id").zfill(2)}{chap_id}s{cite_id.group("sec_id")}' - class_name = "ocva" - format_text = f'<cite class="{class_name}"><a href="{tag_id}" target="{target}">{match}</a></cite>' - text = re.sub(fr'{re.escape(match)}', format_text, inside_text, re.I) - tag.append(text) + tag_id = f'gov.va.code.title.{cite_id.group("title_id").zfill(2)}.html#t{cite_id.group("title_id").zfill(2)}{chap_id}s{cite_id.group("sec_id")}' + class_name = "ocva" + format_text = f'<cite class="{class_name}"><a href="{tag_id}" target="{target}">{match}</a></cite>' + text = re.sub(fr'{re.escape(match)}', format_text, inside_text, re.I) + tag.append(text) - elif cite_id.group("title_id").zfill(2) in ['30','56','44']: + elif cite_id.group("title_id").zfill(2) in ['30', '56', '44']: for key, value in eval(title_id).items(): if cite_id.group("sec_id") in value: tag.clear() @@ -1929,7 +2416,8 @@ def add_citation(self): text = re.sub(fr'{re.escape(match)}', format_text, inside_text, re.I) tag.append(text) - elif cite_id.group("title_id").zfill(2) in ['02','03','04','05','06','07','08','09','10','12','13','14']: + elif cite_id.group("title_id").zfill(2) in ['02', '03', '04', '05', '06', '07', '08', '09', + '10', '12', '13', '14']: tag.clear() tag_id = f'gov.va.code.title.{cite_id.group("title_id").zfill(2)}.html#t{cite_id.group("title_id").zfill(2)}c{cite_id.group("sec_id")}' @@ -1939,7 +2427,8 
@@ def add_citation(self): tag.append(text) else: - if cite_id.group("title_id").zfill(2) in ['2.1','3.1','7.1','8.01','8.03','8.05','8.05A','8.06A']: + if cite_id.group("title_id").zfill(2) in ['2.1', '3.1', '7.1', '8.01', '8.03', '8.05', + '8.05A', '8.06A']: tag.clear() # tag_id = f'gov.va.code.title.{cite_id.group("title_id").zfill(2)}.html#t{cite_id.group("title_id").zfill(2)}s{cite_id.group("sec_id")}' @@ -1951,7 +2440,7 @@ def add_citation(self): text = re.sub(fr'{re.escape(match)}', format_text, inside_text, re.I) tag.append(text) - elif cite_id.group("title_id").zfill(2) in ['15.1','14.1','13.1','12.1']: + elif cite_id.group("title_id").zfill(2) in ['15.1', '14.1', '13.1', '12.1']: tag.clear() tag_id = f'gov.va.code.title.{cite_id.group("title_id").zfill(2)}.html#t{cite_id.group("title_id").zfill(2)}s{cite_id.group("sec_id")}' class_name = "ocva" @@ -1959,7 +2448,8 @@ def add_citation(self): text = re.sub(fr'{re.escape(match)}', format_text, inside_text, re.I) tag.append(text) - elif cite_id.group("title_id").zfill(2) in ['2.2','3.2','4.1','5.1','6.2','8.001','8.02','8.02A','8.03A','8.04','8.04A']: + elif cite_id.group("title_id").zfill(2) in ['2.2', '3.2', '4.1', '5.1', '6.2', '8.001', + '8.02', '8.02A', '8.03A', '8.04', '8.04A']: title = re.sub('\.', r'_', cite_id.group("title_id")) title_dict_id = f'title_{title}' @@ -1975,7 +2465,9 @@ def add_citation(self): text = re.sub(fr'{re.escape(match)}', format_text, inside_text, re.I) tag.append(text) - elif cite_id.group("title_id").zfill(2) in ['10.1','16.1','22.1','28.2','29.1','33.1','38.2','42.1','45.1','46.2','53.1','54.1','59.1','62.1','63.1','63.2']: + elif cite_id.group("title_id").zfill(2) in ['10.1', '16.1', '22.1', '28.2', '29.1', '33.1', + '38.2', '42.1', '45.1', '46.2', '53.1', '54.1', + '59.1', '62.1', '63.1', '63.2']: title = re.sub('\.', r'_', cite_id.group("title_id")) title_dict_id = f'title_{title}' @@ -1989,9 +2481,9 @@ def add_citation(self): text = re.sub(fr'{re.escape(match)}', format_text, inside_text, re.I) tag.append(text) - elif re.search(r'Titles (\d+(\.\d+)*)',match.strip()): + elif re.search(r'Titles (\d+(\.\d+)*)', match.strip()): tag.clear() - t_id = re.search(r'Titles (?P<t_id>\d+(\.\d+)*)',match.strip()).group('t_id').zfill(2) + t_id = re.search(r'Titles (?P<t_id>\d+(\.\d+)*)', match.strip()).group('t_id').zfill(2) tag_id = f'gov.va.code.title.0{t_id}.html' class_name = "ocva" @@ -2008,12 +2500,87 @@ def add_citation(self): text = re.sub(fr'{re.escape(match)}', format_text, inside_text, re.I) tag.append(text) + main_tag = self.soup.new_tag('main') + chap_nav = self.soup.find('nav') + tag_to_wrap = chap_nav.find_next_sibling() + while True: + next_tag = tag_to_wrap.find_next_sibling() + main_tag.append(tag_to_wrap) + if not next_tag: + chap_nav.insert_after(main_tag) + break + tag_to_wrap = next_tag print("citation added") + def clean_html_and_add_cite(self): + cite_p_tags = [] + for tag in self.soup.findAll( + lambda tag: re.search( + r"\d+(\.\d+)*-\d+(\.\d+)*\.*\s*(:\d+)*|\d+\sVa.\s\d+|S\.E\. \d+|Va\. App\. 
LEXIS \d+|Titles (\d+(\.\d+)*)",
+                    tag.get_text()) and tag.name == 'p'
+                    and tag not in cite_p_tags):
+            cite_p_tags.append(tag)
+            text = str(tag)
+
+            for match in set(
+                    x[0] for x in re.findall(r'(\d+(\.\d+)*-\d+(\.\d+)*\.*\s*(:\d+)*)',
+                                             tag.get_text())):
+                inside_text = re.sub(r'<p\sclass="\w\d+">|</p>|<b>|</b>|<p>|<p.+>', '', text, flags=re.DOTALL)
+                id_reg = re.search(r'(?P<cite>(?P<title>\d+(\.\d+)*)-\d+(\.\d+)*(\.\s:\d+)*)',
+                                   match.strip())
+
+                title_id = id_reg.group("title").strip().zfill(2)
+
+                if os.path.isfile(
+                        f"../../cic-code-va/transforms/va/ocva/r{self.release_number}/gov.va.code.title.{title_id}.html"):
+                    with open(
+                            f"../../cic-code-va/transforms/va/ocva/r{self.release_number}/gov.va.code.title.{title_id}.html",
+                            'r') as firstfile:
+
+                        for line in firstfile:
+                            if re.search(rf'id=".+(s|c){id_reg.group("cite")}">$', line.strip()):
+                                tag.clear()
+                                head_id = re.search(rf'id="(?P<h_id>.+(s|c){id_reg.group("cite")})">$',
+                                                    line.strip())
+
+                                if title_id == self.title_id:
+                                    target = "_self"
+                                    a_id = f'#{head_id.group("h_id")}'
+                                else:
+                                    target = "_blank"
+                                    a_id = f'gov.va.code.title.{title_id}.html#{head_id.group("h_id")}'
+
+                                tag.clear()
+                                text = re.sub(fr'\s{re.escape(match)}',
+                                              f' <cite class="ocva"><a href="{a_id}" target="{target}">{match}</a></cite>',
+                                              inside_text,
+                                              flags=re.I)
+                                tag.append(text)
+
+            for match in set(
+                    x[0] for x in re.findall(r'(\d+\sVa\.\s\d+|S\.E\. \d+|Va\. App\. LEXIS \d+|Titles (\d+(\.\d+)*))',
+                                             tag.get_text())):
+                inside_text = re.sub(r'<p\sclass="\w\d+">|</p>|<b>|</b>|<p>', '', text, flags=re.DOTALL)
+                tag.clear()
+                class_name = "va_code"
+                text = re.sub(re.escape(match), f'<cite class="{class_name}">{match}</cite>', inside_text, flags=re.I)
+                tag.append(text)
+
+        main_tag = self.soup.new_tag('main')
+        chap_nav = self.soup.find('nav')
+        tag_to_wrap = chap_nav.find_next_sibling()
+        while True:
+            next_tag = tag_to_wrap.find_next_sibling()
+            main_tag.append(tag_to_wrap)
+            if not next_tag:
+                chap_nav.insert_after(main_tag)
+                break
+            tag_to_wrap = next_tag
+        print("citation added")

     def add_watermark_and_remove_class_name(self):
-        watermark_tag = self.soup.new_tag('p', Class='transformation')
+        watermark_tag = self.soup.new_tag('p', **{"class": "transformation"})
         watermark_tag.string = self.watermark_text.format(self.release_number, self.release_date,
                                                           datetime.now().date())
         title_tag = self.soup.find("nav")
@@ -2024,15 +2591,14 @@ def add_watermark_and_remove_class_name(self):
             if meta.get('http-equiv') == "Content-Style-Type":
                 meta.decompose()

-        for all_tag in self.soup.findAll("h2",class_="navhead"):
+        for all_tag in self.soup.findAll("h2", class_="navhead"):
             all_tag.name = "p"
             del all_tag["class"]
             del all_tag["id"]

-        for tag in self.soup.find_all():
-            if tag.name in ['li', 'h4', 'h3', 'p','h2']:
-                del tag["class"]
-
+        clss = re.compile(r'p\d+')
+        for all_tag in self.soup.findAll(class_=clss):
+            del all_tag["class"]

         for tag in self.soup.findAll():
             if len(tag.contents) == 0:
@@ -2050,10 +2616,8 @@ def add_watermark_and_remove_class_name(self):
             if len(tag.get_text(strip=True)) == 0:
                 tag.extract()

-        print("watermark added")
-
     def css_file(self):
         head = self.soup.find("head")
         style = self.soup.head.find("style")
@@ -2080,8 +2644,9 @@ def write_soup_to_file(self):
             soup_str = re.sub(rf'{tag}', rf'{cleansed_tag}', soup_str, re.I)

         with open(f"../../cic-code-va/transforms/va/ocva/r{self.release_number}/{self.html_file_name}", "w") as file:
-            file.write(soup_str.replace('<br/>','<br />'))
-
+            soup_str = re.sub(r'&(?!amp;)', '&amp;', soup_str)
+            soup_str = re.sub(r'<br/>', '<br />', 
soup_str)
+            file.write(soup_str)

     def start_parse(self):
@@ -2096,17 +2661,18 @@ def start_parse(self):
         start_time = datetime.now()
         print(start_time)
         self.create_page_soup()
-        self.css_file()
+        # self.css_file()
         if re.search('constitution', self.html_file_name):
-            self.class_regex = {'ul':'^I\.','head2': '^Chapter \d+\.', 'head1': '^The Constitution of the United States|Constitution of Virginia',
-                                'head3': r'^§ 1\.|^Section \d+\.','junk': '^Statute text','article':'——————————', 'ol': r'^A\.\s', 'head4': '^CASE NOTES', \
-                                'amdhead':'^AMENDMENTS TO THE CONSTITUTION','casenav':'^I\.'}
+            self.class_regex = {'ul': '^Article I\.', 'head2': '^Article I\.',
+                                'head1': '^THE CONSTITUTION OF THE UNITED STATES OF AMERICA|Constitution of Virginia',
+                                'head3': r'^§ 1\.|^Section \d+\.', 'junk': '^Text', 'article': '——————————',
+                                'ol': r'^A\.\s', 'head4': '^CASE NOTES', \
+                                'amdhead': '^AMENDMENTS TO THE CONSTITUTION', 'casenav': '^I\.'}

             self.generate_class_name()
             self.remove_junk()
             self.recreate_tag()
             self.replace_tags()
-            self.create_main_tag()
             self.create_ul_tag()
             self.create_chapter_section_nav()
             self.create_case_note_nav()
@@ -2121,16 +2687,18 @@ def start_parse(self):
             self.remove_junk()
             self.recreate_tag()
             self.replace_tags()
-            self.create_main_tag()
+            # self.create_main_tag()
             self.create_ul_tag()
             self.create_chapter_section_nav()
             self.create_case_note_nav()
             self.create_case_note_ul()
-            self.create_and_wrap_with_div_tag()
+            self.wrap_div_tags()
+            # # self.create_and_wrap_with_div_tag()
             self.convert_paragraph_to_alphabetical_ol_tags1()
-            self.add_citation()
+            # self.add_citation()
+            # self.clean_html_and_add_cite()
             self.add_watermark_and_remove_class_name()
-
+            self.clean_html_and_add_cite()
         self.write_soup_to_file()
         print(datetime.now() - start_time)
diff --git a/html_parser/vt_html_parser.py b/html_parser/vt_html_parser.py
index d08fa2c..50c94a6 100644
--- a/html_parser/vt_html_parser.py
+++ b/html_parser/vt_html_parser.py
@@ -218,7 +218,7 @@ def replace_tags(self):

         elif header_tag.get("class") == [self.class_regex["head3"]]:

-            if re.search(r'^§\s\d+\.\[.+\]', header_tag.text.strip()):
+            if re.search(r'^§\s\d+\.\[.+]', header_tag.text.strip()):
                 header_tag.name = "h3"
                 sec_id = re.sub(r'[\W\s\d]','',header_tag.text.strip()).lower()
                 header_tag[
@@ -742,8 +742,7 @@ def convert_paragraph_to_alphabetical_ol_tags1(self):
             if p_tag.i:
                 p_tag.i.unwrap()

-            if re.search(r'^4\.1 Term of permit\.', current_tag_text):
-                print()
+            if re.search(r'^\([ivx]+\)', current_tag_text) and main_sec_alpha not in ['i', 'v', 'x']:
@@ -1064,7 +1063,6 @@ def add_citation(self):
                      r"|\d+,\sNo\.\s\d+",tag.text.strip()):
             text = str(tag)

-
             for match in set(x[0] for x in re.findall(r'(\d+\sV\.S\.A\.\s§+\s\d+(-\d+)*([a-z]+)*(\([a-z]\))*(\(\d+\))*(\([A-Z]\))*'
                                                       r'|\d+\sU\.S\.C\.\s§\s\d+\(*[a-z]\)*'
                                                       r'|\d+,\sNo\.\s\d+)'
diff --git a/html_parser/wy_html_parser.py b/html_parser/wy_html_parser.py
index d1440ef..6f60260 100644
--- a/html_parser/wy_html_parser.py
+++ b/html_parser/wy_html_parser.py
@@ -1085,9 +1085,13 @@ def write_soup_to_file(self):

         print("validating")

-        with open(f"../../cic-code-wy/transforms/wy/ocwy/r{self.release_number}/{self.html_file_name}", "w") as file:
-            file.write(soup_str.replace('& ', '&amp; '))
+        with open(f"../../cic-code-wy-1/transforms/wy/ocwy/r{self.release_number}/{self.html_file_name}", "w") as file:
+            # file.write(soup_str.replace('& ', '&amp; '))
+            soup_str = re.sub(r'&(?!amp;)', '&amp;', soup_str)
+            soup_str = re.sub('<br/>', '<br />', soup_str)
+            soup_str = re.sub(r'<span class.*?>\s*</span>', '', soup_str)
+
+            file.write(soup_str)

     def create_case_note_nav(self):

@@ -1177,7 +1181,7 @@ def start_parse(self):
         self.create_page_soup()
         if re.search('constitution', self.html_file_name):
             self.class_regex = {'head1': r'^Constitution of the State of Wyoming|THE CONSTITUTION OF THE UNITED STATES OF AMERICA', 'ul': r'^(PREAMBLE|Preamble)','head2':'Article \d\.',
-                                'head4': '^History\.', 'ol_p': r'^\(\d\)', 'junk1': '^Annotations$','head':'^Section added\.',
+                                'head4': r'^History\.', 'ol_p': r'^\(\d\)', 'junk1': '^Annotations$','head':'^Section added\.',
                                 'head3': r'^§ \d|^sec\.|^Section \d',}

         self.generate_class_name()
diff --git a/html_parser_framework/ak_html_parser.py b/html_parser_framework/ak_html_parser.py
new file mode 100644
index 0000000..b168795
--- /dev/null
+++ b/html_parser_framework/ak_html_parser.py
@@ -0,0 +1,507 @@
+"""
+    - this file accepts the text-util-generated html and parses it
+    - the html is converted so that it matches the html5 standards
+    - the run method calls the run_title or run_constitution method of the ParseHtml class
+    - based on the file type (constitution files or title files), that method decides which methods to run
+"""
+
+import re
+from base_html_parser import ParseHtml
+from regex_pattern import RegexPatterns, CustomisedRegexAK
+import roman
+from loguru import logger
+
+
+class AKParseHtml(ParseHtml, RegexPatterns):
+
+    def __init__(self, state_key, path, release_number, input_file_name):
+        super().__init__(state_key, path, release_number, input_file_name)
+
+    def pre_process(self):
+
+        """dictionary of regex patterns used to identify the tag classes"""
+        if re.search('constitution', self.input_file_name):
+            self.tag_type_dict = {
+                'head1': r'^The Constitution of the State|^CONSTITUTION OF THE UNITED STATES OF AMERICA',
+                'ul': r'^Preamble', 'head2': '^Article I',
+                'head4': '^Notes to Decisions', 'junk1': '^Text$',
+                'head3': r'^Section \d\.|^§ \d\.', 'note_tag': '^Analysis'}
+            self.h2_order: list = ['article']
+        else:
+            self.tag_type_dict: dict = {'head1': r'Title \d+\.', 'ul': r'^Chapter \d+\.',
+                                        'head2': r'^Chapter \d+\.',
+                                        'head4': r'^History\.',
+                                        'head3': r'^Sec\. \d+\.\d+\.\d+\.',
+                                        'junk1': '^History$', 'NTD': '^Notes to Decisions'}
+            self.h2_order: list = ['chapter', 'article']
+        self.h4_head: list = ['History.', 'Compiler’s Notes.', 'NOTES TO DECISIONS', 'Notes to Decisions']
+        self.junk_tag_class = ['Apple-converted-space', 'Apple-tab-span']

+        self.watermark_text = """Release {0} of the Official Code of Alaska Annotated released {1}.
+        Transformed and posted by Public.Resource.Org using cic-beautify-state-codes version v1.3 on {2}.
+        This document is not subject to copyright and is in the public domain.
+ """ + self.regex_pattern_obj = CustomisedRegexAK() + + def replace_tags_titles(self): + """ + - regex_pattern_obj for customised regex class is created + - h2_order list which has order of h2 tags created + - calling method of base class + - replacing all other tags which are not handled in the base class + + """ + h5_s_alpha_id = None + h5_num_id = None + cap_roman = None + cap_alpha = None + h5_rom_id = None + cap_num = None + h5_alpha_id = None + dup_h5_id_list = [] + + super(AKParseHtml, self).replace_tags_titles() + + for p_tag in self.soup.find_all(): + if p_tag.name == "p": + if p_tag.get("class") == [self.tag_type_dict["head2"]]: + p_tag.name = 'h2' + cur_tag_text = re.sub(r'\W+', '', p_tag.get_text().strip()).lower() + p_tag['id'] = f'{p_tag.find_previous("h2", class_="oneh2").get("id")}-{cur_tag_text}' + + elif p_tag.get("class") == [self.tag_type_dict["ul"]]: + p_tag.name = "li" + p_tag.wrap(self.ul_tag) + + elif p_tag.get("class") == [self.tag_type_dict["head4"]]: + if ar_tag := re.search(r'^ARTICLE (?P<id>[IVX]+)', p_tag.text.strip(),re.I): + p_tag.name = "h4" + p_tag["id"] = f'{p_tag.find_previous("h3").get("id")}-{ar_tag.group("id")}' + if re.search(r'^Analysis', p_tag.text.strip()): + for tag in p_tag.find_next_siblings(): + if tag.get('class') == [self.tag_type_dict["NTD"]]: + break + else: + tag["class"] = "casenote" + tag.name = "li" + + elif p_tag.get("class") == [self.tag_type_dict["NTD"]]: + if re.search(r'^Notes to Decisions$|^Analysis$', p_tag.text.strip(),re.I): + p_tag.name = "h4" + p_tag["id"] = f'{p_tag.find_previous({"h3","h2","h1"}).get("id")}notestodecisions' + cap_roman = "I" + + elif re.search(rf'^{cap_roman}\.', p_tag.text.strip()): + p_tag.name = "h5" + h5_rom_text = re.search(r'^(?P<h5_id>[IVX]+)\.', p_tag.text.strip()).group("h5_id") + h5_rom_id = f"{p_tag.find_previous('h3').get('id')}-notetodecisison-{h5_rom_text}" + p_tag['id'] = h5_rom_id + cap_alpha = 'A' + cap_roman = roman.toRoman(roman.fromRoman(cap_roman.upper()) + 1) + + elif cap_alpha and re.search(fr'^{cap_alpha}\.',p_tag.text.strip()): + p_tag.name = "h5" + h5_alpha_text = re.search(r'^(?P<h5_id>[A-Z]+)\.', p_tag.text.strip()).group("h5_id") + h5_alpha_id = f"{h5_rom_id}-{h5_alpha_text}" + p_tag['id'] = h5_alpha_id + cap_alpha = chr(ord(cap_alpha) + 1) + cap_num = 1 + + elif cap_num and re.search(fr'^{cap_num}\.', p_tag.text.strip()): + p_tag.name = "h5" + h5_num_text = re.search(r'^(?P<h5_id>\d+)\.', p_tag.text.strip()).group("h5_id") + h5_num_id = f"{h5_alpha_id}-{h5_num_text}" + p_tag['id'] = h5_num_id + cap_num += 1 + + elif re.search(r'^[ivx]+\.', p_tag.text.strip()): + p_tag.name = "h5" + h5_s_rom_text = re.search(r'^(?P<h5_id>[ivx]+)\.', p_tag.text.strip()).group("h5_id") + h5_s_rom_id = f"{h5_s_alpha_id}-{h5_s_rom_text}" + p_tag['id'] = h5_s_rom_id + + elif re.search(r'^[a-z]+\.', p_tag.text.strip()): + p_tag.name = "h5" + h5_s_alpha_text = re.search(r'^(?P<h5_id>\w+)\.', p_tag.text.strip()).group("h5_id") + h5_s_alpha_id = f"{h5_num_id}-{h5_s_alpha_text}" + if h5_s_alpha_id in dup_h5_id_list: + h5_s_alpha_id = f"{h5_num_id}-{h5_s_alpha_text}.{count}" + count += 1 + else: + count = 1 + + p_tag['id'] = h5_s_alpha_id + dup_h5_id_list.append(h5_s_alpha_id) + + elif p_tag.name == "h2" and p_tag.get("class") == "oneh2": + self.ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + + def convert_paragraph_to_alphabetical_ol_tags(self): + """ + For each tag which has to be converted to ordered list(<ol>) + - create new <ol> tags with appropriate type (1, A, i, a ..) 
+ - get previous headers' id to set unique id for each list item (<li>) + - append each li to respective ol accordingly + """ + + inner_sec_alpha = 'A' + inner_num_count = 1 + num_count = 1 + ol_count = 1 + main_sec_alpha = 'a' + small_roman = "i" + + sec_alpha_cur_tag = None + inr_sec_alpha_cur_tag = None + inr_num_cur_tag = None + num_cur_tag = None + sec_alpha_id = None + inr_num_id = None + inr_sec_alpha_id = None + small_roman_id = None + num_id = None + + sec_alpha_ol = self.soup.new_tag("ol", type="a") + inr_num_ol = self.soup.new_tag("ol") + inr_sec_alpha_ol = self.soup.new_tag("ol", type="A") + roman_ol = self.soup.new_tag("ol", type="i") + num_ol = self.soup.new_tag("ol") + + for p_tag in self.soup.body.find_all(['h3', 'h4', 'h5', 'p']): + current_tag_text = p_tag.text.strip() + + if re.search(rf'^\({main_sec_alpha}\)', current_tag_text) and p_tag.name == "p": + p_tag.name = "li" + sec_alpha_cur_tag = p_tag + inner_num_count = 1 + inr_sec_alpha_cur_tag = None + + if re.search(r'^\(a\)', current_tag_text): + sec_alpha_ol = self.soup.new_tag("ol", type="a") + p_tag.wrap(sec_alpha_ol) + if num_cur_tag: + num_cur_tag.append(sec_alpha_ol) + sec_alpha_id = num_cur_tag.get("id") + else: + sec_alpha_id = f"{p_tag.find_previous({'h5', 'h4', 'h3', 'h2'}).get('id')}ol{ol_count}" + else: + sec_alpha_ol.append(p_tag) + + p_tag["id"] = f'{sec_alpha_id}{main_sec_alpha}' + p_tag.string = re.sub(rf'^^\({main_sec_alpha}\)', '', current_tag_text) + main_sec_alpha = chr(ord(main_sec_alpha) + 1) + + if re.search(rf'^\([a-z]\)\s\(1\)', current_tag_text): + p_tag.name = "li" + inr_num_ol = self.soup.new_tag("ol") + li_tag = self.soup.new_tag("li") + li_tag.string = re.sub(r'^\([a-z]\)\s\(1\)', '', current_tag_text) + inr_num_cur_tag = li_tag + inr_num_id = f'{sec_alpha_cur_tag.get("id")}' + li_tag["id"] = f'{sec_alpha_cur_tag.get("id")}1' + inr_num_ol.append(li_tag) + p_tag.string = "" + p_tag.append(inr_num_ol) + inner_num_count = 2 + + elif re.search(rf'^\({inner_num_count}\)', current_tag_text) and p_tag.name == "p": + p_tag.name = "li" + inr_num_cur_tag = p_tag + inner_sec_alpha = 'A' + + if re.search(r'^\(1\)', current_tag_text): + inr_num_ol = self.soup.new_tag("ol") + p_tag.wrap(inr_num_ol) + + if sec_alpha_cur_tag: + sec_alpha_cur_tag.append(inr_num_ol) + inr_num_id = sec_alpha_cur_tag.get('id') + else: + inr_num_id = f"{p_tag.find_previous({'h5', 'h4', 'h3', 'h2'}).get('id')}ol{ol_count}" + + else: + inr_num_ol.append(p_tag) + + p_tag["id"] = f'{inr_num_id}{inner_num_count}' + p_tag.string = re.sub(rf'^\({inner_num_count}\)', '', current_tag_text) + inner_num_count = inner_num_count + 1 + + if re.search(rf'^\(\d+\)\s*\(A\)', current_tag_text): + p_tag.name = "li" + inr_sec_alpha_ol = self.soup.new_tag("ol", type="a") + li_tag = self.soup.new_tag("li") + li_tag.string = re.sub(r'^\(\d+\)\s*\(A\)', '', current_tag_text) + inr_sec_alpha_cur_tag = li_tag + inr_sec_alpha_id = f'{inr_num_cur_tag.get("id")}' + li_tag["id"] = f'{inr_num_cur_tag.get("id")}A' + inr_sec_alpha_ol.append(li_tag) + p_tag.string = "" + p_tag.append(inr_sec_alpha_ol) + inner_sec_alpha = 'b' + + elif re.search(rf'^{num_count}\.', current_tag_text) and p_tag.name == "p": + p_tag.name = "li" + num_cur_tag = p_tag + main_sec_alpha = 'a' + + if re.search(r'^1\.', current_tag_text): + num_ol = self.soup.new_tag("ol") + p_tag.wrap(num_ol) + num_id = f"{p_tag.find_previous({'h5', 'h4', 'h3', 'h2'}).get('id')}ol{ol_count}" + else: + num_ol.append(p_tag) + + p_tag["id"] = f'{num_id}{num_count}' + p_tag.string = 
re.sub(rf'^{num_count}\.', '', current_tag_text) + num_count = num_count + 1 + + if re.search(rf'^\d+\.\s*\(a\)', current_tag_text): + p_tag.name = "li" + sec_alpha_ol = self.soup.new_tag("ol", type="a") + li_tag = self.soup.new_tag("li") + li_tag.string = re.sub(r'^\d+\.\s*\(a\)', '', current_tag_text) + sec_alpha_cur_tag = li_tag + sec_alpha_id = f'{num_cur_tag.get("id")}' + li_tag["id"] = f'{num_cur_tag.get("id")}a' + sec_alpha_ol.append(li_tag) + p_tag.string = "" + p_tag.append(sec_alpha_ol) + main_sec_alpha = 'b' + + elif re.search(rf'^\({inner_sec_alpha}\)', current_tag_text) and p_tag.name == "p": + p_tag.name = "li" + inr_sec_alpha_cur_tag = p_tag + small_roman = "i" + + if re.search(r'^\(A\)', current_tag_text): + inr_sec_alpha_ol = self.soup.new_tag("ol", type="A") + p_tag.wrap(inr_sec_alpha_ol) + if inr_num_cur_tag: + inr_num_cur_tag.append(inr_sec_alpha_ol) + inr_sec_alpha_id = inr_num_cur_tag.get('id') + else: + inr_sec_alpha_id = f"{p_tag.find_previous({'h5', 'h4', 'h3', 'h2'}).get('id')}ol{ol_count}" + else: + inr_sec_alpha_ol.append(p_tag) + + p_tag["id"] = f'{inr_sec_alpha_id}{inner_sec_alpha}' + p_tag.string = re.sub(rf'^\({inner_sec_alpha}\)', '', current_tag_text) + + if inner_sec_alpha == 'Z': + inner_sec_alpha = 'A' + else: + inner_sec_alpha = chr(ord(inner_sec_alpha) + 1) + + if re.search(r'^\([A-Z]\)\s*\(i\)', current_tag_text): + p_tag.name = "li" + roman_ol = self.soup.new_tag("ol", type="i") + li_tag = self.soup.new_tag("li") + li_tag.string = re.sub(r'^\([A-Z]\)\s*\(i\)', '', current_tag_text) + ol_head_cur_tag = li_tag + ol_head_id = f'{sec_alpha_cur_tag.get("id")}' + li_tag["id"] = f'{sec_alpha_cur_tag.get("id")}i' + roman_ol.append(li_tag) + p_tag.string = "" + p_tag.append(roman_ol) + small_roman = "ii" + + elif re.search(rf'^\({inner_sec_alpha}{inner_sec_alpha}\)', current_tag_text) and p_tag.name == "p": + p_tag.name = "li" + inr_sec_alpha_ol.append(p_tag) + + p_tag["id"] = f'{inr_sec_alpha_id}{inner_sec_alpha}{inner_sec_alpha}' + p_tag.string = re.sub(rf'^\({inner_sec_alpha}{inner_sec_alpha}\)', '', current_tag_text) + + if inner_sec_alpha == 'Z': + inner_sec_alpha = 'A' + else: + inner_sec_alpha = chr(ord(inner_sec_alpha) + 1) + + elif re.search(rf'^\({small_roman}\)', current_tag_text): + p_tag.name = "li" + rom_cur_tag = p_tag + + if re.search(r'^\(i\)', current_tag_text): + roman_ol = self.soup.new_tag("ol", type="i") + p_tag.wrap(roman_ol) + if inr_sec_alpha_cur_tag: + inr_sec_alpha_cur_tag.append(roman_ol) + small_roman_id = inr_sec_alpha_cur_tag.get('id') + elif sec_alpha_cur_tag: + sec_alpha_cur_tag.append(roman_ol) + small_roman_id = sec_alpha_cur_tag.get('id') + else: + roman_ol.append(p_tag) + + p_tag["id"] = f'{small_roman_id}{small_roman}' + p_tag.string = re.sub(rf'^\({small_roman}\)', '', current_tag_text) + small_roman = roman.toRoman(roman.fromRoman(small_roman.upper()) + 1).lower() + + if p_tag.name in ['h3', 'h4', 'h5']: + inner_sec_alpha = 'A' + inner_num_count = 1 + num_count = 1 + ol_count = 1 + main_sec_alpha = 'a' + sec_alpha_cur_tag = None + inr_sec_alpha_cur_tag = None + inr_num_cur_tag = None + num_cur_tag = None + + logger.info("ol tags added") + + def create_analysis_nav_tag(self): + """ + - calling appropriate analysis nav method of base + according to the header of analysis nav tag + """ + + rom_tag = None + rom_tag_id = None + alpha_tag = None + alpha_tag_id = None + a_tag_id = None + rom = "I" + alpha = None + + rom_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + digit_ul = self.soup.new_tag("ul", **{"class": 
"leaders"}) + + if re.search('constitution', self.input_file_name): + + for tag in self.soup.find_all("p", class_=[self.tag_type_dict['note_tag']]): + if re.search(r'^([IVX]+\.|[A-Z]\.|\d+\.)', tag.text.strip()) \ + and re.search(r'^Notes to Decisions$', tag.find_previous('h4').text.strip()): + tag.name = "li" + tag['class'] = "note" + + for case_tag in self.soup.find_all(): + if case_tag.name == "li" and case_tag.get("class") == "note": + if re.search(fr'^{rom}\.', case_tag.text.strip()): + rom_tag = case_tag + if re.search(r'^I\.', case_tag.text.strip()): + rom_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + case_tag.wrap(rom_ul) + else: + rom_ul.append(case_tag) + + rom_num = re.sub(r'[\W\s]+', '', case_tag.text.strip()).lower() + a_tag_id = f'#{case_tag.find_previous("h3").get("id")}-notestodecisions-{rom_num}' + rom_tag_id = f'#{case_tag.find_previous("h3").get("id")}-notestodecisions-{rom_num}' + rom = roman.toRoman(roman.fromRoman(rom.upper()) + 1) + alpha = "A" + + elif re.search(fr'^{alpha}\.', case_tag.text.strip()): + alpha_tag = case_tag + if re.search(r'^A\.', case_tag.text.strip()): + alpha_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + case_tag.wrap(alpha_ul) + rom_tag.append(alpha_ul) + else: + alpha_ul.append(case_tag) + + alpha_id = re.sub(r'[\W\s]+', '', case_tag.text.strip().strip()).lower() + alpha_tag_id = f'{rom_tag_id}-{alpha_id}' + a_tag_id = f'{rom_tag_id}-{alpha_id}' + alpha = chr(ord(alpha) + 1) + + elif re.search(r'^\d+\.', case_tag.text.strip().strip()): + digit_tag = case_tag + if re.search(r'^1\.', case_tag.text.strip().strip()): + digit_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + case_tag.wrap(digit_ul) + alpha_tag.append(digit_ul) + else: + digit_ul.append(case_tag) + + digit = re.search(r'^(?P<nid>[0-9]+)\.', case_tag.text.strip().strip()).group("nid") + a_tag_id = f'{alpha_tag_id}-{digit}' + + anchor = self.soup.new_tag('a', href=a_tag_id) + anchor.string = case_tag.text + case_tag.string = '' + case_tag.append(anchor) + + elif case_tag.name == "h5": + rom = "I" + logger.info("note to decision nav created") + + else: + super(AKParseHtml, self).create_case_note_analysis_nav_tag() + logger.info("case note nav created") + + def replace_tags_constitution(self): + self.regex_pattern_obj = CustomisedRegexAK() + super(AKParseHtml, self).replace_tags_constitution() + + note_to_decision_id_list: list = [] + rom = "I" + alpha = None + num = None + NTD_rom_head_id = None + NTD_alpha_head_id = None + + for p_tag in self.soup.find_all("p", class_=self.tag_type_dict["head2"]): + if re.search(r'^Ordinance No\. 
\d',p_tag.text.strip()): + p_tag.name = "h2" + p_tag_text = re.sub(r'\W+', '', p_tag.text.strip()).lower() + p_tag["id"] = f"{p_tag.find_previous('h1').get('id')}-{p_tag_text}" + p_tag["class"] = "oneh2" + + self.replace_h3_tags_con() + + for header_tag in self.soup.find_all(): + if header_tag.get("class") == [self.tag_type_dict["head4"]]: + if re.search(fr'^{rom}\.', header_tag.text.strip()): + header_tag.name = "h5" + NTD_text = re.sub(r'\W+', '', header_tag.text.strip()).lower() + NTD_rom_head_id = f"{header_tag.find_previous('h4').get('id')}-{NTD_text}" + + if NTD_rom_head_id in note_to_decision_id_list: + header_tag['id'] = f"{NTD_rom_head_id}.1" + else: + header_tag['id'] = f"{NTD_rom_head_id}" + note_to_decision_id_list.append(NTD_rom_head_id) + rom = roman.toRoman(roman.fromRoman(rom.upper()) + 1) + alpha = "A" + + elif alpha and re.search(fr'^{alpha}\.', header_tag.text.strip()): + header_tag.name = "h5" + NTD_alpha_text = re.sub(r'\W+', '', header_tag.text.strip()).lower() + NTD_alpha_head_id = f"{NTD_rom_head_id}-{NTD_alpha_text}" + + if NTD_alpha_head_id in note_to_decision_id_list: + header_tag['id'] = f"{NTD_alpha_head_id}.1" + else: + header_tag['id'] = f"{NTD_alpha_head_id}" + note_to_decision_id_list.append(NTD_alpha_head_id) + alpha = chr(ord(alpha) + 1) + num = 1 + elif num and re.search(fr'^\d+\.', header_tag.text.strip()): + header_tag.name = "h5" + NTD_num = re.search(fr'^(?P<nid>\d+)\.', header_tag.text.strip()).group("nid") + NTD_num_head_id = f"{NTD_alpha_head_id}-{NTD_num}" + + if NTD_num_head_id in note_to_decision_id_list: + header_tag['id'] = f"{NTD_num_head_id}.1" + else: + header_tag['id'] = f"{NTD_num_head_id}" + note_to_decision_id_list.append(NTD_num_head_id) + + elif header_tag.name == "h3": + rom = "I" + num = None + + def add_anchor_tags_con(self): + super(AKParseHtml, self).add_anchor_tags_con() + self.c_nav_count = 0 + for li in self.soup.find_all("li"): + if not li.get("id"): + if re.search(r'^Ordinance No\. 
\d', li.text.strip()): + li_tag_text = re.sub(r'\W+', '', li.text.strip()).lower() + self.c_nav_count = int( + re.search(r'cnav(?P<ncount>\d+)', li.find_previous("li").get("id").strip()).group("ncount")) + 1 + self.set_chapter_section_id(li, li_tag_text, + sub_tag="-", + prev_id=li.find_previous("h1").get("id"), + cnav=f'cnav{self.c_nav_count:02}') diff --git a/html_parser_framework/base_html_parser.py b/html_parser_framework/base_html_parser.py new file mode 100644 index 0000000..75b9080 --- /dev/null +++ b/html_parser_framework/base_html_parser.py @@ -0,0 +1,1936 @@ +import copy +import importlib +import os +import re +from datetime import datetime + +import roman +from bs4 import BeautifulSoup, Doctype +from regex_pattern import RegexPatterns +from loguru import logger + + +class ParseHtml: + + def __init__(self, state_key, path, release_number, input_file_name): + + """Meta Data""" + self.h3_pattern_text_con = None + self.h2_rename_pattern = None + self.chp_nav_count = 0 + self.h2_pattern_text_con = None + self.h2_text_con = None + self.h2_text = None + self.h3_pattern_text = None + self.h2_pattern_text = None + self.id_list = [] + self.file_name = None + self.tag = None + self.state_key = state_key + self.path = path + self.release_number = release_number + self.input_file_name = input_file_name + + self.parser_obj = None + self.junk_tag_class = None + self.h2_order = None + self.title = None + self.ul_tag = None + + self.cite_pattern = None + self.release_date = None + self.watermark_text = None + self.path_in = None + self.path_out = None + self.h4_head = None + self.tag_type_dict = None + self.soup = None + + self.s_nav_count = 0 + self.p_nav_count = 0 + self.a_nav_count = 0 + self.c_nav_count = 0 + + self.h4_cur_id_list: list = [] + self.meta_tags: list = [] + self.list_ids: list = [] + self.dup_id_list: list = [] + self.h2_rep_id: list = [] + self.h2_id_count = 1 + self.id_count = 1 + self.list_id_count = 1 + self.h3_count = 1 + + self.meta_data = {"file_name": self.input_file_name, "state_key": self.state_key, + "release_number": self.release_number} + + self.junk_tag_class = ['Apple-converted-space', 'Apple-tab-span'] + self.regex_pattern_obj = RegexPatterns() + + def pre_process(self): + pass + + def set_release_date(self): + date_dictionary = {} + with open('release_dates.txt') as file: + for line in file: + (key, value) = line.split() + date_dictionary[key] = value + + release_date_pattern = fr'{self.state_key}_r{self.release_number}' + + if release_date_pattern in date_dictionary: + self.release_date = date_dictionary[release_date_pattern] + else: + logger.error("release date is missing in release_date file") + + self.parser_obj = getattr(importlib.import_module('regex_pattern'), f'CustomisedRegex{self.state_key}')() + + def set_page_soup(self): + """ + - Read the input html to parse and convert it to Beautifulsoup object + - Input Html will be html 4 so replace html tag which is self.soup.contents[0] with <html> + which is syntax of html tag in html 5 + - add attribute 'lang' to html tag with value 'en' + """ + + with open(self.path) as open_file: + html_data = open_file.read() + self.soup = BeautifulSoup(html_data, features="lxml") + self.soup.contents[0].replace_with(Doctype("html")) + self.soup.html.attrs['lang'] = 'en' + logger.info(f"soup is created for {self.meta_data}") + + def generate_class_name_dict(self): + """ + - Find the textutil generated class names for each type of tag (h1, h2, ....) 
+ using re pattern specified in self.tag_type_dict + """ + + for key, value in self.tag_type_dict.items(): + tag_class = self.soup.find( + lambda tag: tag.name == 'p' and re.search(self.tag_type_dict.get(key), tag.get_text().strip(), re.I) + and tag.attrs["class"][0] not in self.tag_type_dict.values()) + if tag_class: + self.tag_type_dict[key] = tag_class.get('class')[0] + + logger.info(f"updated class dict is {self.tag_type_dict}") + + def replace_h1_tags_titles(self, header_tag): + """ + - This method is called by replace_tags_titles with tag as args + - The class,name and id is set for the tag + + """ + + if self.regex_pattern_obj.h1_pattern.search(header_tag.text.strip()): + header_tag.name = "h1" + title_no = self.regex_pattern_obj.h1_pattern.search(header_tag.text.strip()).group('id') + self.title = title_no + header_tag["class"] = "title" + header_tag["id"] = f't{title_no.zfill(2)}' + header_tag.wrap(self.soup.new_tag("nav")) + + def set_id_for_h2_tags(self, header_tag, text, prev, cur): + + """ + - This method is called by replace_h2_tags_titles method with + tag,text, prev (previous tag class),cur(current tag class) args + - With the args passed the name,id and class is updated + + """ + + pattern = f'h2_{text}_pattern' + instance = getattr(self.parser_obj, pattern) + + if instance.search(header_tag.text.strip()) and instance.search(header_tag.text.strip()).group('id'): + header_tag.name = "h2" + chap_no = instance.search(header_tag.text.strip()).group('id') + if header_tag.findPrevious("h2", class_=prev): + header_tag_id = f'{header_tag.findPrevious("h2", class_=prev).get("id")}{text[0]}{chap_no.zfill(2)}' + if header_tag_id in self.dup_id_list: + header_tag["id"] = f'{header_tag_id}.{self.id_count:02}' + self.id_count += 1 + else: + header_tag["id"] = f'{header_tag_id}' + self.id_count = 1 + header_tag["class"] = cur + else: + header_tag_id = f'{header_tag.findPrevious("h1").get("id")}{text[0]}{chap_no.zfill(2)}' + + if header_tag_id in self.dup_id_list: + header_tag["id"] = f'{header_tag_id}.{self.id_count:02}' + self.id_count += 1 + else: + header_tag["id"] = f'{header_tag_id}' + self.id_count = 1 + + header_tag["class"] = "oneh2" + self.dup_id_list.append(header_tag_id) + + def set_id_for_h2_tags_con(self, header_tag, text, prev, cur): + + """ + - This method is called by replace_h2_tags_titles method with + tag,text, prev (previous tag class),cur(current tag class) args + - With the args passed the name,id and class is updated + + """ + + pattern = f'h2_{text}_pattern_con' + instance = getattr(self.parser_obj, pattern) + + if instance.search(header_tag.text.strip()) and instance.search(header_tag.text.strip()).group('id'): + header_tag.name = "h2" + chap_no = instance.search(header_tag.text.strip()).group('id') + if header_tag.findPrevious("h2", class_=prev): + header_tag_id = f'{header_tag.findPrevious("h2", class_=prev).get("id")}{text[:2]}{chap_no.zfill(2)}' + if header_tag_id in self.dup_id_list: + header_tag["id"] = f'{header_tag_id}.{self.id_count:02}' + self.id_count += 1 + else: + header_tag["id"] = f'{header_tag_id}' + self.id_count = 1 + header_tag["class"] = cur + else: + header_tag_id = f'{header_tag.findPrevious("h1").get("id")}{text[:2]}{chap_no.zfill(2)}' + + if header_tag_id in self.dup_id_list: + header_tag["id"] = f'{header_tag_id}.{self.id_count:02}' + self.id_count += 1 + else: + header_tag["id"] = f'{header_tag_id}' + self.id_count = 1 + + header_tag["class"] = "oneh2" + self.dup_id_list.append(header_tag_id) + + def replace_h2_tags_titles(self, 
header_tag): + + """ + - This method is called by replace_tags_titles method with + tag as args + - With the args passed the name,id and class is updated using sub method + set_id_for_h2_tags. + + """ + + text = re.search(r'^\S+', header_tag.text.strip()).group().lower() + + if text == self.h2_order[0]: + pattern = f'h2_{text}_pattern' + instance = getattr(self.parser_obj, pattern) + + if instance.search(header_tag.text.strip()): + header_tag.name = "h2" + chap_no = instance.search(header_tag.text.strip()).group('id') + header_tag_id = f'{self.soup.find("h1").get("id")}{text[0]}{chap_no.zfill(2)}' + if header_tag_id in self.h2_rep_id: + header_tag["id"] = f'{header_tag_id}.{self.h2_id_count:02}' + self.h2_id_count += 1 + else: + header_tag["id"] = f'{header_tag_id}' + self.h2_id_count = 1 + + header_tag["class"] = "oneh2" + self.h2_rep_id.append(header_tag_id) + + elif text == self.h2_order[1]: + self.set_id_for_h2_tags(header_tag, text, prev={"gen", "oneh2"}, cur="twoh2") + + elif text == self.h2_order[2]: + self.set_id_for_h2_tags(header_tag, text, prev={"twoh2", "oneh2", "gen"}, cur="threeh2") + + elif text == self.h2_order[3]: + self.set_id_for_h2_tags(header_tag, text, prev={"oneh2", "twoh2", "threeh2", "gen"}, cur="fourh2") + + elif self.h2_pattern_text: + for list_pattern in self.h2_pattern_text: + h2_pattern = re.compile(list_pattern) + if h2_tag := h2_pattern.search(header_tag.text.strip()): + header_tag.name = "h2" + if header_tag.find_previous("h2"): + prev_header_tag = header_tag.find_previous( + lambda tag: tag.name == 'h2' and not h2_pattern.search(tag.text.strip()) and + not re.search(rf'{h2_tag.group("tag").lower()}', tag.get("id").strip())) + + if re.search(r'(1|A|I)\.', header_tag.text.strip()): + prev_header_tag = header_tag.find_previous( + lambda tag: tag.name == 'h2' and not h2_pattern.search(tag.text.strip()) and + not re.search(rf'{h2_tag.group("tag")}', tag.get("id").strip())) + self.prev_class_name = prev_header_tag.get("class") + else: + prev_header_tag = header_tag.find_previous("h2", class_=self.prev_class_name) + + else: + prev_header_tag = self.soup.find("h1") + self.prev_class_name = prev_header_tag.get("class") + + header_tag[ + "id"] = f'{prev_header_tag.get("id")}{h2_tag.group("tag").lower()}{h2_tag.group("id").zfill(2)}' + header_tag["class"] = "gen" + + if self.h2_rename_pattern: + for list_pattern in self.h2_rename_pattern: + h2_pattern = re.compile(list_pattern) + if h2_tag := h2_pattern.search(header_tag.text.strip()): + prev_header_tag = header_tag.find_previous( + lambda tag: tag.name == 'h2' and not h2_pattern.search(tag.text.strip()) and + not re.search(rf'{h2_tag.group("tag").lower()}', tag.get("id").strip())) + header_tag[ + "id"] = f'{prev_header_tag.get("id")}{h2_tag.group("tag").lower()}{h2_tag.group("id").zfill(2)}' + header_tag.name = "h2" + + def replace_h3_titles(self, header_tag, h3_id_list): + if sec_id := getattr(self.parser_obj, "section_pattern").search(header_tag.text.strip()): + sec_id = re.sub(r'\s+|\.$', '', sec_id.group("id")) + if self.format_id(sec_id, header_tag): + sec_id = self.format_id(sec_id, header_tag) + header_tag.name = "h3" + if header_tag.find_previous({"h2", "h3"}, class_={"oneh2", "twoh2", "threeh2", "fourh2", "gen"}): + header_tag_id = f'{header_tag.find_previous({"h2", "h3"}, class_={"oneh2", "twoh2", "threeh2", "fourh2", "gen"}).get("id")}s{sec_id.zfill(2)}' + if header_tag_id in h3_id_list: + header_tag_id = f'{header_tag_id}.{self.h3_count:02}' + header_tag["id"] = f'{header_tag_id}' + self.h3_count += 1 + 
else: + header_tag["id"] = f'{header_tag_id}' + self.h3_count = 1 + else: + header_tag_id = f'{header_tag.find_previous("h1").get("id")}c{sec_id}' + if header_tag_id in h3_id_list: + header_tag_id = f'{header_tag_id}.{self.h3_count:02}' + header_tag["id"] = f'{header_tag_id}' + self.h3_count += 1 + else: + header_tag["id"] = f'{header_tag_id}' + self.h3_count = 1 + + h3_id_list.append(header_tag_id) + + elif getattr(self.parser_obj, "section_pattern_1") and getattr(self.parser_obj, "section_pattern_1").search( + header_tag.text.strip()): + if sec_id := getattr(self.parser_obj, "section_pattern_1").search(header_tag.text.strip()): + sec_id = re.sub(r'\s+|\.$', '', sec_id.group("id")) + header_tag.name = "h3" + if header_tag.find_previous({"h2", "h3"}, class_={"oneh2", "twoh2", "threeh2", "fourh2", "gen"}): + header_tag_id = f'{header_tag.find_previous({"h2", "h3"}, class_={"oneh2", "twoh2", "threeh2", "fourh2", "gen"}).get("id")}s{sec_id.zfill(2)}' + if header_tag_id in h3_id_list: + header_tag["id"] = f'{header_tag_id}.{self.h3_count:02}' + self.h3_count += 1 + else: + header_tag["id"] = f'{header_tag_id}' + self.h3_count = 1 + else: + header_tag_id = f'{header_tag.find_previous("h1").get("id")}s{sec_id}' + if header_tag_id in h3_id_list: + header_tag["id"] = f'{header_tag_id}.{self.h3_count:02}' + self.h3_count += 1 + else: + header_tag["id"] = f'{header_tag_id}' + self.h3_count = 1 + + h3_id_list.append(header_tag_id) + + else: + if self.h3_pattern_text: + for list_pattern in self.h3_pattern_text: + h3_pattern = re.compile(list_pattern) + if h3_tag := h3_pattern.search(header_tag.text.strip()): + header_tag.name = "h3" + sec_id = h3_tag.group("id") + if header_tag.find_previous("h2", class_={"oneh2", "twoh2", "threeh2", "fourh2"}): + header_tag_id = f'{header_tag.find_previous("h2", class_={"oneh2", "twoh2", "threeh2", "fourh2"}).get("id")}s{sec_id.zfill(2)}' + if header_tag_id in h3_id_list: + header_tag["id"] = f'{header_tag_id}.{self.h3_count:02}' + self.h3_count += 1 + else: + header_tag["id"] = f'{header_tag_id}' + self.h3_count = 1 + else: + header_tag_id = f'{header_tag.find_previous("h1").get("id")}s{sec_id.zfill(2)}' + if header_tag_id in h3_id_list: + header_tag["id"] = f'{header_tag_id}.{self.h3_count:02}' + self.h3_count += 1 + else: + header_tag["id"] = f'{header_tag_id}' + self.h3_count = 1 + + h3_id_list.append(header_tag_id) + + def replace_h4_tag_titles(self, header_tag, h4_count, id): + + """ + - if the text of the tag matches to the text in the list h4.head, + then the tag name, id is updated + """ + + header_tag.name = "h4" + if id: + header4_tag_text = id + else: + header4_tag_text = re.sub(r'[\W.]+', '', header_tag.text.strip()).lower() + h4_tag_id = f'{header_tag.find_previous({"h3", "h2", "h1"}).get("id")}-{header4_tag_text}' + + if h4_tag_id in self.h4_cur_id_list: + header_tag['id'] = f'{h4_tag_id}.{h4_count}' + h4_count += 1 + else: + header_tag['id'] = f'{h4_tag_id}' + + self.h4_cur_id_list.append(h4_tag_id) + + def replace_tags_titles(self): + + """ + based on the class of the tag ,the tag name is updated + """ + self.h4_count = 1 + h3_id_list: list = [] + self.ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + self.get_h2_order() + for header_tag in self.soup.find_all("p"): + if header_tag.get("class") == [self.tag_type_dict["head1"]]: + self.replace_h1_tags_titles(header_tag) + self.ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + self.replace_h2_tags_titles(header_tag) + + elif header_tag.get("class") == [self.tag_type_dict["head2"]]: + 
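+                # a head2-classed tag is not always an h2: it may be a chapter/article
+                # heading, a plain heading listed in self.h2_text, or a section heading,
+                # so the h2, h2_text and h3 handlers below are each tried in turn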
self.replace_h2_tags_titles(header_tag) + self.ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + header_tag_text = re.sub(r'\W+', '', header_tag.text.strip()) + if self.h2_text and header_tag_text in self.h2_text: + self.h2_set_id(header_tag) + else: + self.replace_h3_titles(header_tag, h3_id_list) + + elif header_tag.get("class") == [self.tag_type_dict["head3"]]: + self.replace_h3_titles(header_tag, h3_id_list) + self.ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + + elif header_tag.get("class") == [self.tag_type_dict["head4"]]: + if header_tag.text.strip() in self.h4_head: + self.replace_h4_tag_titles(header_tag, self.h4_count, None) + self.ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + + elif header_tag.get("class") == [self.tag_type_dict["ul"]]: + if not re.search(r'^(Section\.?|Chapter|Sec\.|Chap.)$', header_tag.text.strip()): + header_tag.name = "li" + header_tag.wrap(self.ul_tag) + + elif header_tag.name == "p": + self.ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + + logger.info("Tags are replaced in the base class") + + def set_chapter_section_id(self, list_item, chap_num, sub_tag, prev_id, cnav): + """ + - This method is called by add_anchor_tags , + this will set tag id and reference link to the list_item + """ + li_list = [] + li_link = self.soup.new_tag('a') + li_link.append(list_item.text) + li_link_id = f"{prev_id}{sub_tag}{chap_num.zfill(2)}" + li_list.append(li_link) + list_item.contents = li_list + if li_link_id in self.list_ids: + li_link_id = f"{li_link_id}.{self.list_id_count:02}" + list_item['id'] = f'{li_link_id}-{cnav}' + list_item.a['href'] = f'#{li_link_id}' + self.list_id_count += 1 + else: + li_link_id = f"{prev_id}{sub_tag}{chap_num.zfill(2)}" + list_item['id'] = f'{li_link_id}-{cnav}' + list_item.a['href'] = f'#{li_link_id}' + self.list_id_count = 1 + + self.list_ids.append(li_link_id) + + def add_anchor_tags(self): + + """ + - adding id and reference link to the li tag by calling + set_chapter_section_id method + + """ + pnav_count = 1 + + for li_tag in self.soup.findAll(): + if li_tag.name == "li" and len(li_tag.text.strip()) > 0 and \ + li_tag.get('class') and li_tag.get('class')[0] == self.tag_type_dict['ul']: + text = re.search(r'^\S+', li_tag.text.strip()).group().lower() + pattern = f'h2_{text}_pattern' + li_tag_text = re.sub(r'\W+', '', li_tag.text.strip()) + if text == self.h2_order[0]: + instance = getattr(self.parser_obj, pattern) + if instance.search(li_tag.text.strip()): + chap_num = instance.search(li_tag.text.strip()).group('id') + self.c_nav_count += 1 + self.set_chapter_section_id(li_tag, chap_num, + sub_tag=f"{text[0]}", + prev_id=li_tag.find_previous("h1").get("id"), + cnav=f'{text[0]}nav{self.c_nav_count:02}') + elif text == self.h2_order[1]: + instance = getattr(self.parser_obj, pattern) + if instance.search(li_tag.text.strip()) and instance.search(li_tag.text.strip()).group('id'): + chap_num = instance.search(li_tag.text.strip()).group('id') + self.a_nav_count += 1 + self.set_chapter_section_id(li_tag, chap_num, + sub_tag=f"{text[0]}", + prev_id=li_tag.find_previous({"h2", "h1"}).get("id"), + cnav=f'{text[0]}nav{self.a_nav_count:02}') + elif text == self.h2_order[2]: + instance = getattr(self.parser_obj, pattern) + if instance.search(li_tag.text.strip()): + chap_num = instance.search(li_tag.text.strip()).group('id') + self.p_nav_count += 1 + if li_tag.find_previous("h2", class_={"twoh2", "gen"}) or li_tag.find_previous("h3", + class_="twoh2"): + self.set_chapter_section_id(li_tag, chap_num, + 
sub_tag=f"{text[0]}", + prev_id=li_tag.find_previous({"h2", "h1"}).get("id"), + cnav=f'{text[0]}nav{self.p_nav_count:02}') + elif text == self.h2_order[3]: + instance = getattr(self.parser_obj, pattern) + if instance.search(li_tag.text.strip()): + chap_num = instance.search(li_tag.text.strip()).group('id') + self.s_nav_count += 1 + if li_tag.find_previous("h2", class_={"threeh2", "gen"}): + self.set_chapter_section_id(li_tag, chap_num, + sub_tag=f"{text[0]}", + prev_id=li_tag.find_previous({"h2", "h1"}).get("id"), + cnav=f'{text[0]}nav{self.s_nav_count:02}') + + elif getattr(self.parser_obj, "section_pattern").search(li_tag.text.strip()): + sec_num = getattr(self.parser_obj, "section_pattern").search(li_tag.text.strip()).group("id") + sec_num = re.sub(r'\s+|\.$', '', sec_num) + if self.format_id(sec_num, li_tag): + sec_num = self.format_id(sec_num, li_tag) + if li_tag.find_previous(class_={"gen", "oneh2", "twoh2", "threeh2", "fourh2"}): + prev_id = li_tag.find_previous( + class_={"navhead1", "navhead", "gen", "oneh2", "twoh2", "threeh2", "fourh2"}).get( + "id") + prev_p_tag = li_tag.find_previous("p") + if prev_p_tag and re.search(r'^(Section\.?|Chapter|Sec\.)$', prev_p_tag.text.strip()): + sub_tag = prev_p_tag.text.strip()[0].lower() + else: + sub_tag = 's' + else: + prev_id = li_tag.find_previous(class_={"navhead1", "navhead", "title"}).get("id") + sub_tag = 'c' + + self.s_nav_count += 1 + cnav = f'snav{self.s_nav_count:02}' + self.set_chapter_section_id(li_tag, sec_num, sub_tag, prev_id, cnav) + + elif self.h2_text and li_tag_text in self.h2_text: + chap_num = re.sub(r'\W+', '', li_tag.text.strip()).lower() + if li_tag.find_previous("li") and li_tag.find_previous("li").get("id"): + self.chp_nav_count = int( + re.search(r'-(?P<ntag>\w+)nav(?P<ncount>\d+)', + li_tag.find_previous("li").get("id").strip()).group( + "ncount")) + 1 + ntag = f"{re.search(r'-(?P<ntag>[a-z]+)nav(?P<ncount>[0-9]+)', li_tag.find_previous('li').get('id').strip()).group('ntag')}" + else: + self.chp_nav_count += 1 + ntag = "c" + self.set_chapter_section_id(li_tag, chap_num, + sub_tag="-", + prev_id=li_tag.find_previous( + class_={"title", "oneh2", "twoh2", "threeh2", "fourh2"}).get("id"), + cnav=f'{ntag}nav{self.chp_nav_count:02}') + + elif self.h2_pattern_text: + for list_pattern in self.h2_pattern_text: + h2_pattern = re.compile(list_pattern) + if h2_tag := h2_pattern.search(li_tag.text.strip()): + self.p_nav_count += 1 + self.set_chapter_section_id(li_tag, h2_tag.group("id"), + sub_tag=h2_tag.group("tag").lower(), + prev_id=li_tag.find_previous({"h2", "h1"}).get("id"), + cnav=f'{h2_tag.group("tag")}nav{self.p_nav_count:02}') + elif self.h3_pattern_text: + for list_pattern in self.h3_pattern_text: + h3_pattern = re.compile(list_pattern) + if h3_tag := h3_pattern.search(li_tag.text.strip()): + self.a_nav_count += 1 + self.set_chapter_section_id(li_tag, h3_tag.group("id"), + sub_tag='s', + prev_id=li_tag.find_previous( + class_={"navhead1", "navhead", "oneh2", "twoh2", + "threeh2"}).get( + "id"), + cnav=f'cnav{self.a_nav_count:02}') + + if self.h2_rename_pattern and li_tag.name == "li" and li_tag.a: + for list_tag in self.h2_rename_pattern: + tag_pattern = re.compile(list_tag) + if tag := tag_pattern.search(li_tag.a.text.strip()): + li_tag[ + "id"] = f'{li_tag.find_previous("h2").get("id")}{tag.group("tag").lower()}{tag.group("id").zfill(2)}-pnav{pnav_count}' + li_tag.a[ + "href"] = f'#{li_tag.find_previous("h2").get("id")}{tag.group("tag").lower()}{tag.group("id").zfill(2)}' + pnav_count += 1 + + elif li_tag.name in 
['h2', 'h3', 'h4']: + self.a_nav_count = 0 + self.c_nav_count = 0 + self.p_nav_count = 0 + self.s_nav_count = 0 + + logger.info("anchor tags are added in base class") + + def convert_paragraph_to_alphabetical_ol_tags(self): + """ this method is defined in the child class""" + pass + + def create_analysis_nav_tag(self): + """this method is defined in the child class""" + pass + + def create_judicial_decision_analysis_nav_tag(self): + + """ + - Analysis classes are defined based on the header of the analysis tag. + - This method creates JUDICIAL DECISIONS analysis nav tag + + """ + + a_tag_list = [] + analysis_tag = None + analysis_tag_id = None + analysis_num_tag_id = None + analysis_num_tag = None + a_tag_id = None + inner_ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + inner_alpha_ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + text_ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + + for analysis_p_tag in self.soup.findAll('p', {'class': self.tag_type_dict['ol_p']}): + if re.search(r'^Analysis', analysis_p_tag.text.strip()): + for a_tag in analysis_p_tag.find_next_siblings(): + if a_tag.get("class") == [self.tag_type_dict['ol_p']]: + a_tag.name = "li" + a_tag_text = re.sub(r'[\W_]+', '', a_tag.text.strip()).strip().lower() + a_tag_list.append(a_tag_text) + if re.search(r'^\d+\.', a_tag.text.strip()): + if re.search(r'^1\.', a_tag.text.strip()): + inner_ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + a_tag.wrap(inner_ul_tag) + if analysis_tag: + analysis_tag.append(inner_ul_tag) + else: + inner_ul_tag.append(a_tag) + analysis_num_tag_id = f"{analysis_tag_id}-{a_tag_text}" + a_tag_id = f"{analysis_tag_id}-{a_tag_text}" + + elif re.search(r'^[a-z]\.', a_tag.text.strip()): + if re.search(r'^a\.', a_tag.text.strip()): + inner_alpha_ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + a_tag.wrap(inner_alpha_ul_tag) + a_tag.find_previous("li").append(inner_alpha_ul_tag) + else: + inner_alpha_ul_tag.append(a_tag) + a_tag_id = f"{analysis_num_tag_id}-{a_tag_text}" + + else: + if a_tag.find_previous().name == "a": + ul_tag.append(a_tag) + else: + ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + a_tag.wrap(ul_tag) + analysis_tag = a_tag + analysis_tag_id = f"#{a_tag.find_previous('h4').get('id')}-{a_tag_text}" + a_tag_id = f"#{a_tag.find_previous('h3').get('id')}-judicialdecisions-{a_tag_text}" + + anchor = self.soup.new_tag('a', href=a_tag_id) + anchor.string = a_tag.text + a_tag.string = '' + a_tag.append(anchor) + + elif a_tag.get("class") == [self.tag_type_dict['head4']]: + break + else: + if analysis_p_tag.find_previous("h4"): + if re.search(r'^JUDICIAL DECISIONS', analysis_p_tag.find_previous("h4").text.strip()): + if analysis_num_tag_id and re.search(r'^\d+\.\s—\w+', analysis_p_tag.text.strip()): + analysis_p_tag.name = "li" + a_tag_text = re.sub(r'[\W\s]+', '', analysis_p_tag.text.strip()) + + if analysis_p_tag.find_previous("li") and \ + re.search(r'^\d+\.', analysis_p_tag.find_previous("li").text.strip()): + text_ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + analysis_p_tag.wrap(text_ul_tag) + analysis_num_tag.append(text_ul_tag) + + else: + text_ul_tag.append(analysis_p_tag) + + a_tag_id = f'{analysis_num_tag_id}{a_tag_text}' + + elif re.search(r'^\d+\.', analysis_p_tag.text.strip()): + analysis_p_tag.name = "li" + analysis_num_tag = analysis_p_tag + if re.search(r'^1\.', analysis_p_tag.text.strip()): + inner_ul_tag = self.soup.new_tag("ul", **{"class": 
"leaders"}) + analysis_p_tag.wrap(inner_ul_tag) + else: + inner_ul_tag.append(analysis_p_tag) + a_tag_text = re.search(r'^(?P<id>\d+)\.', analysis_p_tag.text.strip()).group("id") + analysis_num_tag_id = f"#{analysis_p_tag.find_previous('h3').get('id')}-judicialdecision-{a_tag_text}" + a_tag_id = analysis_num_tag_id + + anchor = self.soup.new_tag('a', href=a_tag_id) + anchor.string = analysis_p_tag.text + analysis_p_tag.string = '' + analysis_p_tag.append(anchor) + + def create_case_note_analysis_nav_tag(self): + """ + - Analysis classes are defined based on the header of the analysis tag. + - This method creates Case Notes analysis nav tag + + """ + + digit_tag, s_alpha_ul, digit_id, s_alpha_tag, s_rom_ul, s_alpha_id = None, None, None, None, None, None + rom_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + alpha_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + digit_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + + cap_alpha = 'A' + s_roman = "i" + s_alpha = 'a' + case_count = 1 + case_tag_id = None + a_tag_id = None + alpha_id = None + rom_tag = None + rom_id = None + alpha_tag = None + note_head_id = None + case_tag_id_list = [] + + case_head_id = None + case_head_tag = None + inner_ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + + for case_tag in self.soup.find_all(class_='casenote'): + if re.search(r'^ANNOTATIONS$', case_tag.find_previous().text.strip()): + rom_tag = None + + case_tag.name = "li" + if re.search(r'^[IVX]+\.', case_tag.text.strip()): + rom_tag = case_tag + cap_alpha = 'A' + + if re.search(r'^I\.', case_tag.text.strip()): + if case_tag.find_next(class_='casenote') and re.search(r'^J\.', case_tag.find_next( + class_='casenote').text.strip()): + alpha_ul.append(case_tag) + cap_alpha = 'J' + a_tag_id = f'{rom_id}-I' + + else: + rom_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + case_tag.wrap(rom_ul) + rom_id = f'#{case_tag.find_previous("h3").get("id")}-notetodecisison-I' + a_tag_id = rom_id + else: + rom_ul.append(case_tag) + + rom_num = re.search(r'^(?P<rid>[IVX]+)\.', case_tag.text.strip()).group("rid") + rom_id = f'#{case_tag.find_previous("h3").get("id")}-notetodecisison-{rom_num}' + a_tag_id = f'#{case_tag.find_previous("h3").get("id")}-notetodecisison-{rom_num}' + + elif re.search(fr'^{cap_alpha}\.', case_tag.text.strip()): + alpha_tag = case_tag + if re.search(r'^A\.', case_tag.text.strip()): + alpha_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + case_tag.wrap(alpha_ul) + rom_tag.append(alpha_ul) + else: + alpha_ul.append(case_tag) + + alpha_id = f"{rom_id}-{cap_alpha}" + cap_alpha = chr(ord(cap_alpha) + 1) + a_tag_id = alpha_id + + elif re.search(r'^\d+\.', case_tag.text.strip()): + digit_tag = case_tag + s_alpha = 'a' + if re.search(r'^1\.', case_tag.text.strip()): + digit_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + case_tag.wrap(digit_ul) + alpha_tag.append(digit_ul) + if alpha_tag: + alpha_tag.append(digit_ul) + else: + digit_ul.append(case_tag) + + digit_num = re.search(r'^(?P<nid>\d+)\.', case_tag.text.strip()).group("nid") + digit_id = f"{alpha_id}-{digit_num}" + a_tag_id = f"{alpha_id}-{digit_num}" + + elif re.search(r'^—\w+', case_tag.text.strip()): + inner_tag = case_tag + inner_tag_text = re.sub(r'[\W\s]+', '', case_tag.text.strip()).lower() + inner_tag_id = f'{case_head_id}-{inner_tag_text}' + + if not re.search(r'^—\w+', case_tag.find_previous("li").text.strip()): + inner_ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + 
case_tag.wrap(inner_ul_tag) + if case_head_tag: + case_head_tag.append(inner_ul_tag) + else: + inner_tag_id = f'#{case_tag.find_previous("h4").get("id")}-{inner_tag_text}' + else: + inner_ul_tag.append(case_tag) + + if inner_tag_id in case_tag_id_list: + case_tag_id = f'{inner_tag_id}.{case_count}' + case_count += 1 + else: + case_tag_id = f'{inner_tag_id}' + case_count = 1 + + case_tag_id_list.append(case_tag_id) + a_tag_id = case_tag_id + + elif re.search(rf'^{s_alpha}\.', case_tag.text.strip()): + s_alpha_tag = case_tag + s_roman = "i" + if re.search(r'^a\.', case_tag.text.strip()): + s_alpha_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + case_tag.wrap(s_alpha_ul) + digit_tag.append(s_alpha_ul) + else: + s_alpha_ul.append(case_tag) + + s_alpha_id = f"{digit_id}-{s_alpha}" + a_tag_id = f"{digit_id}-{s_alpha}" + s_alpha = chr(ord(s_alpha) + 1) + + elif re.search(rf'^{s_roman}\.', case_tag.text.strip()): + s_rom_tag = case_tag + if re.search(r'^i\.', case_tag.text.strip()): + s_rom_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + case_tag.wrap(s_rom_ul) + s_alpha_tag.append(s_rom_ul) + else: + s_rom_ul.append(case_tag) + + a_tag_id = f"{s_alpha_id}-{s_roman}" + s_roman = roman.toRoman(roman.fromRoman(s_roman.upper()) + 1).lower() + else: + if case_tag.find_previous("h4") and \ + re.search(r'^ANNOTATIONS$', case_tag.find_previous("h4").text.strip(), re.I): + case_head_tag = case_tag + case_tag_text = re.sub(r'[\W\s]+', '', case_tag.text.strip()).lower() + + if re.search(r'^ANNOTATIONS|^[IVX]+\.', case_tag.find_previous().text.strip()): + ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + case_tag.wrap(ul_tag) + if rom_tag: + rom_tag.append(ul_tag) + note_head_id = f'{rom_id}' + else: + note_head_id = f'#{case_tag.find_previous({"h3", "h2", "h1"}).get("id")}-notetodecision' + else: + ul_tag.append(case_tag) + + case_head_id = f'{note_head_id}-{case_tag_text}' + if case_head_id in case_tag_id_list: + case_tag_id = f'{case_head_id}.{case_count}' + case_count += 1 + else: + case_tag_id = f'{case_head_id}' + case_count = 1 + + a_tag_id = f'{note_head_id}-{case_tag_text}' + case_tag_id_list.append(case_head_id) + + anchor = self.soup.new_tag('a', href=a_tag_id) + anchor.string = case_tag.text + case_tag.string = '' + case_tag.append(anchor) + + for p_tag in self.soup.findAll('h4', string=re.compile(r'^Case Notes$')): + case_note_tag = p_tag.find_next_sibling() + if not case_note_tag.get("class") == [self.tag_type_dict['ol_p']]: + case_tag_list = case_note_tag.text.splitlines() + case_note_tag.clear() + for tag in case_tag_list: + if len(tag) > 0: + new_ul_tag = self.soup.new_tag("li") + new_ul_tag.string = tag + new_ul_tag["class"] = "casenotes" + case_note_tag.append(new_ul_tag) + case_note_tag.unwrap() + + case_count = 1 + case_tag_id_list = [] + + for case_tag in self.soup.find_all("li", class_='casenotes'): + if re.search(r'^—\w+', case_tag.text.strip()): + inner_tag = case_tag + inner_tag_text = re.sub(r'[\W\s]+', '', case_tag.text.strip()).lower() + case_tag_id = f'{case_head_id}-{inner_tag_text}' + + if not re.search(r'^—\w+', case_tag.find_previous("li").text.strip()): + inner_ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + case_tag.wrap(inner_ul_tag) + case_head_tag.append(inner_ul_tag) + else: + inner_ul_tag.append(case_tag) + + elif re.search(r'^— —\w+', case_tag.text.strip()): + pass + elif re.search(r'^— — —\w+', case_tag.text.strip()): + pass + elif re.search(r'^— — — —\w+', case_tag.text.strip()): + pass + else: + case_head_tag = case_tag + 
case_tag_text = re.sub(r'[\W\s]+', '', case_tag.text.strip()).lower() + case_head_id = f'#{case_tag.find_previous({"h3", "h2", "h1"}).get("id")}-casenote-{case_tag_text}' + + if case_head_id in case_tag_id_list: + case_tag_id = f'{case_head_id}.{case_count}' + case_count += 1 + else: + case_tag_id = f'{case_head_id}' + case_count = 1 + + if case_tag.find_previous().name != "a": + ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + case_tag.wrap(ul_tag) + else: + ul_tag.append(case_tag) + + case_tag_id_list.append(case_head_id) + + anchor = self.soup.new_tag('a', href=case_tag_id) + anchor.string = case_tag.text + case_tag.string = '' + case_tag.append(anchor) + + def create_annotation_analysis_nav_tag(self): + """ + - Analysis classes are defined based on the header of the analysis tag. + - This method creates ANNOTATION analysis nav tag + + """ + + rom_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + alpha_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + digit_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + + rom_tag = None + alpha_tag = None + a_tag_id = None + rom_tag_id = None + alpha_tag_id = None + + for case_tag in self.soup.find_all("p", class_=self.tag_type_dict['Analysis']): + if re.search(r'^I\.', case_tag.text.strip()): + case_tag_list = case_tag.text.splitlines() + case_tag.clear() + for tag in case_tag_list: + new_ul_tag = self.soup.new_tag("li") + new_ul_tag.string = tag + new_ul_tag["class"] = "annotation" + case_tag.append(new_ul_tag) + case_tag.unwrap() + + for case_tag in self.soup.find_all("li", class_='annotation'): + if re.search(rf'^[IVX]+\.', case_tag.text.strip()): + rom_tag = case_tag + if re.search(r'^I\.', case_tag.text.strip()): + if not re.search(r'^H\.', case_tag.find_previous("li").text.strip()): + rom_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + case_tag.wrap(rom_ul) + rom_tag_id = f'#{case_tag.find_previous("h3").get("id")}-annotation-I' + a_tag_id = f'#{case_tag.find_previous("h3").get("id")}-annotation-I' + else: + alpha_tag = case_tag + alpha_ul.append(case_tag) + alpha_tag_id = f'{rom_tag_id}-I' + a_tag_id = f'{rom_tag_id}-I' + else: + rom_ul.append(case_tag) + rom_num = re.search(r'^(?P<rid>[IVX]+)\.', case_tag.text.strip()).group("rid") + rom_tag_id = f'#{case_tag.find_previous("h3").get("id")}-annotation-{rom_num}' + a_tag_id = f'#{case_tag.find_previous("h3").get("id")}-annotation-{rom_num}' + + elif re.search(r'^[A-Z]\.', case_tag.text.strip()): + alpha_tag = case_tag + if re.search(r'^A\.', case_tag.text.strip()): + alpha_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + case_tag.wrap(alpha_ul) + rom_tag.append(alpha_ul) + else: + alpha_ul.append(case_tag) + + alpha = re.search(r'^(?P<aid>[A-Z])\.', case_tag.text.strip().strip()) + alpha_tag_id = f'{rom_tag_id}-{alpha.group("aid")}' + a_tag_id = f'{rom_tag_id}-{alpha.group("aid")}' + + elif re.search(r'^\d+\.', case_tag.text.strip().strip()): + if re.search(r'^1\.', case_tag.text.strip().strip()): + digit_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + case_tag.wrap(digit_ul) + alpha_tag.append(digit_ul) + else: + digit_ul.append(case_tag) + digit = re.search(r'^(?P<nid>\d+)\.', case_tag.text.strip().strip()).group("nid") + a_tag_id = f'{alpha_tag_id}-{digit}' + + anchor = self.soup.new_tag('a', href=a_tag_id) + anchor.string = case_tag.text + case_tag.string = '' + case_tag.append(anchor) + + def create_Notes_to_decision_analysis_nav_tag_con(self): + case_tag_id = None + case_head_id = None + case_head_tag = None + inner_ul_tag = 
self.soup.new_tag("ul", **{"class": "leaders"}) + ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + rom_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + rom_tag = None + + for note_tag in self.soup.find_all(): + if note_tag.name == "h4": + case_tag_id = None + case_head_id = None + case_head_tag = None + inner_ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + rom_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + rom_tag = None + + elif note_tag.name == "li" and note_tag.get("class") == "note": + if re.search(r'^[IVX]+\.', note_tag.text.strip()): + rom_tag = note_tag + rom_num = re.search(r'^(?P<id>[IVX]+)\.', note_tag.text.strip()).group("id") + if re.search(r'^I\.', note_tag.text.strip()): + rom_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + note_tag.wrap(rom_ul) + else: + rom_ul.append(note_tag) + rom_tag_id = f'#{note_tag.find_previous({"h3", "h2", "h1"}).get("id")}-notetodecision-{rom_num}' + case_tag_id = f'#{note_tag.find_previous({"h3", "h2", "h1"}).get("id")}-notetodecision-{rom_num}' + + elif re.search(r'^—\w+', note_tag.text.strip()): + inner_tag = note_tag + inner_tag_text = re.sub(r'[\W\s]+', '', note_tag.text.strip()).lower() + inner_tag_id = f'{case_head_id}-{inner_tag_text}' + case_tag_id = f'{case_head_id}-{inner_tag_text}' + if not re.search(r'^—\w+', note_tag.find_previous("li").text.strip()): + inner_ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + note_tag.wrap(inner_ul_tag) + if case_head_tag: + case_head_tag.append(inner_ul_tag) + else: + inner_ul_tag.append(note_tag) + + elif re.search(r'^— —\w+', note_tag.text.strip()): + inner_li_tag = note_tag + inner_li_tag_text = re.sub(r'[\W\s]+', '', note_tag.text.strip()).lower() + case_tag_id = f'{inner_tag_id}-{inner_li_tag_text}' + if not re.search(r'^— —\w+', note_tag.find_previous("li").text.strip()): + inner_ul_tag1 = self.soup.new_tag("ul", **{"class": "leaders"}) + note_tag.wrap(inner_ul_tag1) + inner_tag.append(inner_ul_tag1) + else: + inner_ul_tag1.append(note_tag) + else: + case_head_tag = note_tag + case_tag_text = re.sub(r'[\W\s]+', '', note_tag.text.strip()).lower() + + if re.search(r'^Notes to Decisions|^[IVX]+\.', note_tag.find_previous().text.strip()): + ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + note_tag.wrap(ul_tag) + if rom_tag: + rom_tag.append(ul_tag) + note_head_id = f'{rom_tag_id}' + else: + note_head_id = f'#{note_tag.find_previous({"h3", "h2", "h1"}).get("id")}-notetodecision' + else: + ul_tag.append(note_tag) + + case_head_id = f'{note_head_id}-{case_tag_text}' + case_tag_id = f'{note_head_id}-{case_tag_text}' + + anchor = self.soup.new_tag('a', href=case_tag_id) + anchor.string = note_tag.text + note_tag.string = '' + note_tag.append(anchor) + + def create_Notes_to_decision_analysis_nav_tag(self): + """ + - Analysis classes are defined based on the header of the analysis tag. 
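+        - entries whose text begins with one or more "—" dashes are nested one <ul> level deeper per dash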
+ - This method creates NOTES TO DECISION analysis nav tag + + """ + case_tag_id = None + case_head_id = None + case_head_tag = None + note_head_tag = None + note_id = None + note_inner_tag = None + note_inr_id = None + inner_tag_id = None + doubledash_inner_tag_id = None + inner_tag = None + doubledash_inner_tag = None + subsection_tag = None + doubledash_inner_ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + inner_ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + note_inner_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + note_inner1_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + tripledash_inner_ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + new_ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + rom_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + note_tag_id: list = [] + count = 1 + inr_count = 1 + + for note_tag in self.soup.find_all("li", class_="note"): + if re.search(r'^—?\d+\.?\s*—\s*(\w+|“)', note_tag.text.strip()): + inner_tag = note_tag + inner_tag_text = re.sub(r'[\W\s]+', '', note_tag.text.strip()).lower() + inner_tag_id = f'{case_head_id}-{inner_tag_text}' + + if not re.search(r'^—?\d+\.?\s*—\s*(\w+|“)', note_tag.find_previous("li").text.strip()): + inner_ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + note_tag.wrap(inner_ul_tag) + case_head_tag.append(inner_ul_tag) + else: + inner_ul_tag.append(note_tag) + + if inner_tag_id in note_tag_id: + inner_tag_id = f'{inner_tag_id}.{inr_count:02}' + inr_count += 1 + else: + inner_tag_id = f'{inner_tag_id}' + inr_count = 1 + + case_tag_id = f'{inner_tag_id}' + note_tag_id.append(case_tag_id) + + elif re.search(r'^—?\d+\.?\s*—\s*—\s*(\w+|“)', note_tag.text.strip()): + doubledash_inner_tag = note_tag + doubledash_inner_tag_text = re.sub(r'[\W\s]+', '', note_tag.text.strip()).lower() + doubledash_inner_tag_id = f'{inner_tag_id}-{doubledash_inner_tag_text}' + + if doubledash_inner_tag_id in note_tag_id: + doubledash_inner_tag_id = f'{doubledash_inner_tag_id}.{inr_count:02}' + inr_count += 1 + else: + doubledash_inner_tag_id = f'{doubledash_inner_tag_id}' + inr_count = 1 + + case_tag_id = f'{doubledash_inner_tag_id}' + note_tag_id.append(case_tag_id) + + if not re.search(r'^—?\d+\.?\s*—\s*—\s*(\w+|“)', note_tag.find_previous("li").text.strip()): + doubledash_inner_ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + note_tag.wrap(doubledash_inner_ul_tag) + inner_tag.append(doubledash_inner_ul_tag) + else: + doubledash_inner_ul_tag.append(note_tag) + + elif re.search(r'^—?\d+\.?\s*—\s*—\s*—\s*(\w+|“)', note_tag.text.strip()): + tripledash_inner_tag_text = re.sub(r'[\W\s]+', '', note_tag.text.strip()).lower() + tripledash_inner_tag_id = f'{doubledash_inner_tag_id}-{tripledash_inner_tag_text}' + + if tripledash_inner_tag_id in note_tag_id: + tripledash_inner_tag_id = f'{tripledash_inner_tag_id}.{inr_count:02}' + inr_count += 1 + else: + tripledash_inner_tag_id = f'{tripledash_inner_tag_id}' + inr_count = 1 + case_tag_id = f'{tripledash_inner_tag_id}' + note_tag_id.append(case_tag_id) + + if not re.search(r'^—?\d+\.?\s*—\s*—\s*—\s*(\w+|“)', note_tag.find_previous("li").text.strip()): + tripledash_inner_ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + note_tag.wrap(tripledash_inner_ul_tag) + doubledash_inner_tag.append(tripledash_inner_ul_tag) + else: + tripledash_inner_ul_tag.append(note_tag) + + elif re.search(r'^\d+\.?\s*—\s*—\s*—\s*—\s*(\w+|“)', note_tag.text.strip()): + pass + + elif 
re.search(r'^[IVX]+\.', note_tag.text.strip()): + rom_tag = note_tag + rom_num = re.search(r'^(?P<id>[IVX]+)\.', note_tag.text.strip()).group("id") + if re.search(r'^I\.', note_tag.text.strip()): + rom_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + note_tag.wrap(rom_ul) + else: + rom_ul.append(note_tag) + case_tag_id = f'#{note_tag.find_previous({"h4", "h3", "h2", "h1"}).get("id")}-notetodecision-{rom_num}' + else: + case_head_tag = note_tag + case_tag_text = re.sub(r'[\W\s]+', '', note_tag.text.strip()).lower() + if re.search(r'^\d+\.?', note_tag.text.strip()): + if re.search(r'^0\.5\.?', note_tag.text.strip()): + ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + note_tag.wrap(ul_tag) + + elif re.search(r'^1\.', note_tag.text.strip()): + if note_tag.find_previous("li") and \ + re.search(r'^0\.5\.?', note_tag.find_previous("li").text.strip()): + ul_tag.append(note_tag) + + else: + ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + note_tag.wrap(ul_tag) + if subsection_tag: + subsection_tag.append(ul_tag) + else: + if note_tag.find_previous().name != 'a': + ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + note_tag.wrap(ul_tag) + else: + ul_tag.append(note_tag) + + case_head_id = f'#{note_tag.find_previous({"h4", "h3", "h2", "h1"}).get("id")}-{case_tag_text}' + case_tag_id = f'#{note_tag.find_previous({"h4", "h3", "h2", "h1"}).get("id")}-{case_tag_text}' + + elif re.search(r'^(FIRST|SECOND|THIRD) SUBSECTION', note_tag.text.strip()): + subsection_tag = note_tag + if re.search(r'^FIRST SUBSECTION', note_tag.text.strip()): + new_ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + note_tag.wrap(new_ul_tag) + else: + new_ul_tag.append(note_tag) + + else: + if re.search(r'^—\w+', note_tag.text.strip()): + if not re.search(r'^—\w+', note_tag.find_previous("li").text.strip()): + note_inner_tag = note_tag + note_inner_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + note_tag.wrap(note_inner_ul) + if note_head_tag: + note_head_tag.append(note_inner_ul) + else: + note_id = f'{note_tag.find_previous("h3").get("id")}-casenote' + else: + note_inner_ul.append(note_tag) + + note_inr_id = f'{note_id}-{case_tag_text}' + + if note_inr_id in note_tag_id: + case_tag_id = f'{note_inr_id}.{inr_count:02}' + inr_count += 1 + else: + case_tag_id = f'{note_inr_id}' + inr_count = 1 + + note_tag_id.append(note_inr_id) + + elif re.search(r'^— —\w+', note_tag.text.strip()): + if not re.search(r'^— —\w+', note_tag.find_previous("li").text.strip()): + note_inner1_tag = note_tag + note_inner1_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + note_tag.wrap(note_inner1_ul) + note_inner_tag.append(note_inner1_ul) + else: + + note_inner1_ul.append(note_tag) + + note_inr1_id = f'{note_inr_id}-{case_tag_text}' + + if note_inr1_id in note_tag_id: + case_tag_id = f'{note_inr1_id}.{inr_count:02}' + inr_count += 1 + else: + case_tag_id = f'{note_inr1_id}' + inr_count = 1 + + note_tag_id.append(note_inr1_id) + + else: + note_head_tag = note_tag + note_head_id = f'#{note_tag.find_previous({"h3", "h2", "h1"}).get("id")}-notetodecision-{case_tag_text}' + + if note_head_id in note_tag_id: + case_tag_id = f'{note_head_id}.{count:02}' + count += 1 + else: + case_tag_id = f'{note_head_id}' + count = 1 + + if note_tag.find_previous().name != "a": + ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + note_tag.wrap(ul_tag) + else: + ul_tag.append(note_tag) + + note_id = case_tag_id + + note_tag_id.append(note_head_id) + + anchor = self.soup.new_tag('a', href=case_tag_id) + anchor.string 
= note_tag.text + note_tag.string = '' + note_tag.append(anchor) + + def wrap_div_tags(self): + """ + - for each h2 in html + - create new div and append h2 to that div + - find next tag, if next tag is h3 + - create new div and append h3 to it + - append that new div to h2 div + - find next tag of h3, if next tag is h4 + - create new div and append h4 to that div + - append that new div to h3 div + - find next tag, if next tag is h5 + - create new div and append h5 to that div + - append that new div to h4 div + - if not h5 append that tag to h2 div and so on + - if not h4 append that tag to h2 div and so on + - if not h3 append that tag to h2 div and so on + """ + self.soup = BeautifulSoup(self.soup.prettify(formatter=None), features='lxml') + for header in self.soup.findAll('h2'): + new_chap_div = self.soup.new_tag('div') + sec_header = header.find_next_sibling() + if not sec_header: + print() + header.wrap(new_chap_div) + if sec_header: + while True: + next_sec_tag = sec_header.find_next_sibling() + if sec_header.name == 'h3': + new_sec_div = self.soup.new_tag('div') + tag_to_wrap = sec_header.find_next_sibling() + sec_header.wrap(new_sec_div) + while True: + if tag_to_wrap: + next_tag = tag_to_wrap.find_next_sibling() + else: + break + if tag_to_wrap.name == 'h4': + new_sub_sec_div = self.soup.new_tag('div') + inner_tag = tag_to_wrap.find_next_sibling() + tag_to_wrap.wrap(new_sub_sec_div) + + while True: + inner_next_tag = inner_tag.find_next_sibling() + if inner_tag.name == 'h5': + new_h5_div = self.soup.new_tag('div') + inner_h5_tag = inner_tag.find_next_sibling() + inner_tag.wrap(new_h5_div) + while True: + next_h5_child_tag = inner_h5_tag.find_next_sibling() + new_h5_div.append(inner_h5_tag) + inner_next_tag = next_h5_child_tag + if not next_h5_child_tag or next_h5_child_tag.name in ['h3', 'h2', 'h4', + 'h5']: + break + inner_h5_tag = next_h5_child_tag + inner_tag = new_h5_div + new_sub_sec_div.append(inner_tag) + next_tag = inner_next_tag + if not inner_next_tag or inner_next_tag.name in ['h3', + 'h2'] or inner_next_tag.name == 'h4' \ + and inner_next_tag.get('class'): + break + inner_tag = inner_next_tag + tag_to_wrap = new_sub_sec_div + elif tag_to_wrap.name == 'h5': + new_sub_sec_div = self.soup.new_tag('div') + inner_tag = tag_to_wrap.find_next_sibling() + tag_to_wrap.wrap(new_sub_sec_div) + while True: + inner_next_tag = inner_tag.find_next_sibling() + new_sub_sec_div.append(inner_tag) + next_tag = inner_next_tag + if not inner_next_tag or inner_next_tag.name in ['h3', 'h2', 'h4', 'h5']: + break + inner_tag = inner_next_tag + tag_to_wrap = new_sub_sec_div + if not re.search(r'h\d', str(tag_to_wrap.name)): + new_sec_div.append(tag_to_wrap) + next_sec_tag = next_tag + if not next_tag or next_tag.name in ['h3', 'h2']: + break + tag_to_wrap = next_tag + sec_header = new_sec_div + new_chap_div.append(sec_header) + if not next_sec_tag or next_sec_tag.name == 'h2': + break + sec_header = next_sec_tag + if not sec_header: + print() + + logger.info("wrapped inside div tags") + + def wrap_inside_main_tag(self): + + """wrap inside main tag""" + + main_tag = self.soup.new_tag('main') + chap_nav = self.soup.find('nav') + ul = self.soup.find("ul") + if ul: + if ul.find_previous("p", string=re.compile(r'^[A-Za-z]')): + ul.find_previous("p", string=re.compile(r'^[A-Za-z]')).wrap(chap_nav) + self.soup.find("ul").wrap(chap_nav) + tag_to_wrap = chap_nav.find_next_sibling() + while True: + next_tag = tag_to_wrap.find_next_sibling() + main_tag.append(tag_to_wrap) + if not next_tag: + 
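+                    # no more siblings left to move: attach the assembled <main>
+                    # right after the nav block and stop walking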
chap_nav.insert_after(main_tag)
+                break
+            tag_to_wrap = next_tag
+
+        logger.info("wrapped inside main tag")
+
+    def post_process(self):
+        """
+            adding css file
+            wrapping watermark tag and head tag inside nav tag
+            clean HTML
+        """
+
+        "adding css file"
+        stylesheet_link_tag = self.soup.new_tag('link')
+        stylesheet_link_tag.attrs = {'rel': 'stylesheet', 'type': 'text/css',
+                                     'href': 'https://unicourt.github.io/cic-code-ga/transforms/ga/stylesheet/ga_code_stylesheet.css'}
+        self.soup.style.replace_with(stylesheet_link_tag)
+        self.meta_tags.append(copy.copy(stylesheet_link_tag))
+
+        "adding watermark"
+        watermark_p = self.soup.new_tag('p', **{"class": "transformation"})
+        watermark_p.string = self.watermark_text.format(self.release_number, self.release_date,
+                                                        datetime.now().date())
+        self.soup.find("nav").insert(0, watermark_p)
+
+        for meta in self.soup.findAll('meta'):
+            if meta.get('name') and meta.get('name') in ['Author', 'Description']:
+                meta.decompose()
+
+        "adding watermark tag inside meta data"
+        for key, value in {'viewport': "width=device-width, initial-scale=1",
+                           'description': self.watermark_text.format(self.release_number, self.release_date,
+                                                                     datetime.now().date())}.items():
+            new_meta = self.soup.new_tag('meta')
+            new_meta.attrs['name'] = key
+            new_meta.attrs['content'] = value
+            self.soup.head.append(new_meta)
+
+        "clean HTML"
+        [text_junk.decompose() for text_junk in
+         self.soup.find_all("p", class_=self.tag_type_dict['junk1'])]
+
+        [tag.decompose() for tag in self.soup.find_all("p", string=re.compile(r'——————————'))]
+
+        for junk_tag in self.soup.find_all(class_=self.junk_tag_class):
+            junk_tag.unwrap()
+
+        for tag in self.soup.findAll():
+            if len(tag.contents) == 0:
+                if tag.name == 'meta':
+                    if tag.attrs.get('http-equiv') == 'Content-Style-Type':
+                        tag.decompose()
+                        continue
+                    self.meta_tags.append(copy.copy(tag))
+                elif tag.name == 'br':
+                    if not tag.parent or tag in tag.parent.contents:
+                        tag.decompose()
+                    continue
+
+            if tag.name == "ul" and tag.li and re.search(r'p\d+',
+                                                         str(tag.li.get("class"))) and tag.parent.name != "nav":
+                tag.wrap(self.soup.new_tag("nav"))
+
+        clss = re.compile(r'p\d+')
+        for all_tag in self.soup.findAll(class_=clss):
+            del all_tag["class"]
+
+        for tag in self.soup.find_all(class_="navhead"):
+            del tag["id"]
+
+        for tag in self.soup.find_all(class_="navhead1"):
+            del tag["id"]
+
+        logger.info("clean HTML is processed")
+        return self.soup
+
+    def write_soup_to_file(self):
+        """
+            - add the space before self-closing meta tags
+            - replace <br/> with <br /> and & with &amp;
+            - convert html to str
+            - write html str to an output file
+        """
+        soup_str = str(self.soup.prettify(formatter=None))
+
+        for tag in self.meta_tags:
+            cleansed_tag = re.sub(r'/>', ' />', str(tag))
+            # literal replacement: the serialized tag is not a safe regex pattern
+            soup_str = soup_str.replace(str(tag), cleansed_tag)
+
+        with open(
+                f"/home/mis/PycharmProjects/cic_code_framework/transforms_output/{self.state_key.lower()}/oc{self.state_key.lower()}/r{self.release_number}/{self.input_file_name}",
+                "w") as file:
+            soup_str = getattr(self.parser_obj, "amp_pattern").sub('&amp;', soup_str)
+            soup_str = getattr(self.parser_obj, "br_pattern").sub('<br />', soup_str)
+            soup_str = re.sub(r'<span class.*?>\s*</span>|<p>\s*</p>', '', soup_str)
+            # restore empty attribute values that were left entity-escaped
+            soup_str = soup_str.replace('=&quot;&quot;>', '="">')
+
+            file.write(soup_str)
+
+        logger.info(f"parsing {self.input_file_name} is completed")
+
+    def replace_h3_tags_con(self):
+        count = 1
+        for header_tag in self.soup.find_all(class_=self.tag_type_dict["head3"]):
+            if
self.regex_pattern_obj.section_pattern_con.search(header_tag.text.strip()): + header_tag.name = "h3" + chap_no = self.regex_pattern_obj.section_pattern_con.search(header_tag.text.strip()).group('id') + + if header_tag.find_previous("h4") and \ + self.regex_pattern_obj.h2_article_pattern_con.search( + header_tag.find_previous("h4").text.strip()): + header_tag.name = "h5" + header_tag_id = f'{header_tag.find_previous("h4").get("id")}-s{chap_no.zfill(2)}' + elif header_tag.find_previous("h3") and \ + self.regex_pattern_obj.amend_pattern_con.search( + header_tag.find_previous("h3").text.strip()): + header_tag.name = "h4" + header_tag_id = f'{header_tag.find_previous("h3").get("id")}-s{chap_no.zfill(2)}' + else: + header_tag_id = f'{header_tag.find_previous("h2", class_={"oneh2", "gen"}).get("id")}-s{chap_no.zfill(2)}' + + if header_tag.find_previous({"h3", "h4"}, id=header_tag_id): + header_tag["id"] = f'{header_tag_id}.{count:02}' + count += 1 + else: + header_tag["id"] = f'{header_tag_id}' + count = 1 + + self.ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + + def replace_tags_constitution(self): + h4_count = 1 + h3_id_list = [] + h2_id_list = [] + count = 1 + self.get_h2_order() + for header_tag in self.soup.find_all("p"): + if header_tag.get("class") == [self.tag_type_dict["head1"]]: + if self.regex_pattern_obj.h1_pattern_con.search(header_tag.text.strip()): + header_tag.name = "h1" + title_no = re.sub(r'[\W\s]+', '', header_tag.text.strip()).lower() + self.title = title_no + header_tag["class"] = "title" + header_tag["id"] = f't{title_no}' + header_tag.wrap(self.soup.new_tag("nav")) + self.ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + + elif header_tag.get("class") == [self.tag_type_dict["head2"]]: + self.ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + header_tag_text = re.sub(r'\W+', '', header_tag.text.strip()) + text = re.search(r'^(\S+)', header_tag.text.strip()).group().lower() + if text == self.h2_order[0]: + pattern = f'h2_{text}_pattern_con' + instance = getattr(self.parser_obj, pattern) + if instance.search(header_tag.text.strip()): + header_tag.name = "h2" + chap_no = instance.search(header_tag.text.strip()).group('id') + header_tag_id = f'{header_tag.find_previous("h1").get("id")}{text[:2]}{chap_no.zfill(2)}' + if header_tag_id in self.h2_rep_id: + header_tag["id"] = f'{header_tag_id}.{self.h2_id_count:02}' + self.h2_id_count += 1 + else: + header_tag["id"] = f'{header_tag_id}' + self.h2_id_count = 1 + + header_tag["class"] = "oneh2" + self.h2_rep_id.append(header_tag_id) + + elif text == self.h2_order[1]: + self.set_id_for_h2_tags_con(header_tag, text, prev="oneh2", cur="twoh2") + + elif text == self.h2_order[2]: + self.set_id_for_h2_tags_con(header_tag, text, prev={"twoh2", "oneh2"}, cur="threeh2") + + elif text == self.h2_order[3]: + self.set_id_for_h2_tags_con(header_tag, text, prev={"oneh2", "twoh2", "threeh2"}, cur="fourh2") + + if re.search(r'^Amendment (\d+|[IVX]+)', header_tag.text.strip(), re.I): + header_tag.name = "h3" + tag_num = re.search(r'^(?P<amd_txt>Amendment (?P<id>\d+|[IVX]+))', header_tag.text.strip(), re.I) + + if re.search(f'{tag_num.group("amd_txt")}', self.soup.find("ul").text.strip()): + header_tag.name = "h2" + header_tag["id"] = f"{header_tag.find_previous('h1').get('id')}am{tag_num.group('id').zfill(2)}" + else: + header_tag["id"] = f"{header_tag.find_previous('h2').get('id')}-{tag_num.group('id').zfill(2)}" + + header_tag["class"] = "gen" + self.ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + + elif 
re.search(r'^PREAMBLE|^AMENDMENTS|^Schedule', header_tag.text.strip(), re.I) or \ + self.h2_pattern_text and header_tag.text.strip() in self.h2_pattern_text: + header_tag.name = "h2" + tag_text = re.sub(r'[\W\s]+', '', header_tag.text.strip()).lower() + header_tag["id"] = f"{header_tag.find_previous('h1').get('id')}-{tag_text}" + header_tag["class"] = "gen" + self.ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + + elif self.regex_pattern_obj.section_pattern_con.search(header_tag.text.strip()): + if re.search(r'§ 4\. Power inherent in the', header_tag.text.strip()): + print() + + header_tag.name = "h3" + chap_no = self.regex_pattern_obj.section_pattern_con.search(header_tag.text.strip()).group('id') + header_tag_id = f'{header_tag.find_previous("h2", class_={"oneh2", "gen"}).get("id")}-s{chap_no.zfill(2)}' + + if header_tag.find_previous("h3", id=header_tag_id): + header_tag_id = f'{header_tag_id}.{count:02}' + header_tag["id"] = f'{header_tag_id}.{count:02}' + count += 1 + else: + header_tag["id"] = f'{header_tag_id}' + count = 1 + self.ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + + elif self.h2_text_con and header_tag_text in self.h2_text_con: + header_tag.name = "h2" + header_tag["id"] = f'{self.soup.find("h1").get("id")}-{header_tag_text.lower()}' + header_tag["class"] = "oneh2" + + elif self.h2_pattern_text_con: + for list_pattern in self.h2_pattern_text_con: + h2_pattern = re.compile(list_pattern) + if h2_tag := h2_pattern.search(header_tag.text.strip()): + header_tag.name = "h2" + header_tag["class"] = "amd" + header_tag[ + "id"] = f'{header_tag.find_previous("h2", class_="gen").get("id")}-amd{h2_tag.group("id").zfill(2)}' + + elif self.h2_text_con and header_tag_text in self.h2_text_con: + header_tag.name = "h2" + p_tag_text = re.sub(r'\W+', '', header_tag.text.strip()).lower() + header_tag_id = f'{self.soup.find("h1").get("id")}-{p_tag_text}' + + if header_tag_id in self.h2_rep_id: + header_tag["id"] = f'{header_tag_id}.{self.h2_id_count:02}' + self.h2_id_count += 1 + else: + header_tag["id"] = f'{header_tag_id}' + self.h2_id_count = 1 + header_tag["class"] = "oneh2" + self.h2_rep_id.append(header_tag['id']) + + elif header_tag.get("class") == [self.tag_type_dict["head3"]]: + if re.search(r'^PREAMBLE|^AMENDMENTS|^Schedule', header_tag.text.strip(), re.I): + header_tag.name = "h2" + tag_text = re.sub(r'[\W\s]+', '', header_tag.text.strip()).lower() + header_tag["id"] = f"{header_tag.find_previous('h1').get('id')}-{tag_text}" + header_tag["class"] = "gen" + self.ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + + elif self.regex_pattern_obj.section_pattern_con.search(header_tag.text.strip()): + header_tag.name = "h3" + chap_no = self.regex_pattern_obj.section_pattern_con.search(header_tag.text.strip()).group('id') + header_tag[ + "id"] = f'{header_tag.find_previous({"h2", "h3"}, class_={"oneh2", "gen", "amd"}).get("id")}-s{chap_no.zfill(2)}' + self.ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + + if self.h3_pattern_text_con: + for list_pattern in self.h3_pattern_text_con: + h3_pattern = re.compile(list_pattern) + if h3_tag := h3_pattern.search(header_tag.text.strip()): + header_tag.name = "h3" + sec_id = h3_tag.group("id") + if header_tag.find_previous("h2", class_={"oneh2", "twoh2", "threeh2"}): + header_tag_id = f'{header_tag.find_previous("h2", class_={"oneh2", "twoh2", "threeh2"}).get("id")}s{sec_id.zfill(2)}' + if header_tag_id in h3_id_list: + header_tag["id"] = f'{header_tag_id}.{self.h3_count:02}' + self.h3_count += 1 + else: + 
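+                                # first occurrence of this header id: keep it
+                                # unsuffixed and reset the duplicate counter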
header_tag["id"] = f'{header_tag_id}' + self.h3_count = 1 + else: + header_tag_id = f'{header_tag.find_previous("h1").get("id")}s{sec_id.zfill(2)}' + if header_tag_id in h3_id_list: + header_tag["id"] = f'{header_tag_id}.{self.h3_count:02}' + self.h3_count += 1 + else: + header_tag["id"] = f'{header_tag_id}' + self.h3_count = 1 + + h3_id_list.append(header_tag_id) + + elif header_tag.get("class") == [self.tag_type_dict["head4"]]: + if header_tag.text.strip() in self.h4_head: + self.replace_h4_tag_titles(header_tag, h4_count, None) + h4_count += 1 + self.ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + + elif header_tag.get("class") == [self.tag_type_dict["ul"]]: + if not re.search(r'^(Section\.?|Chapter|Sec\.|Article|Amendment)$', header_tag.text.strip()): + header_tag.name = "li" + header_tag.wrap(self.ul_tag) + + def add_anchor_tags_con(self): + for li_tag in self.soup.findAll(): + if li_tag.name == "li": + li_tag_text = re.sub(r'\W+', '', li_tag.text.strip()) + text = re.search(r'^\S+', li_tag.text.strip()).group().lower() + pattern = f'h2_{text}_pattern_con' + if text == self.h2_order[0]: + instance = getattr(self.parser_obj, pattern) + if instance.search(li_tag.text.strip()): + chap_num = instance.search(li_tag.text.strip()).group('id') + self.c_nav_count += 1 + self.set_chapter_section_id(li_tag, chap_num, + sub_tag=f"{text[:2]}", + prev_id=li_tag.find_previous("h1").get("id"), + cnav=f'cnav{self.c_nav_count:02}') + elif text == self.h2_order[1]: + instance = getattr(self.parser_obj, pattern) + if instance.search(li_tag.text.strip()) and instance.search(li_tag.text.strip()).group('id'): + chap_num = instance.search(li_tag.text.strip()).group('id') + self.c_nav_count += 1 + self.set_chapter_section_id(li_tag, chap_num, + sub_tag=f"{text[:2]}", + prev_id=li_tag.find_previous({"h2", "h1"}).get("id"), + cnav=f'cnav{self.c_nav_count:02}') + elif text == self.h2_order[2]: + instance = getattr(self.parser_obj, pattern) + if instance.search(li_tag.text.strip()): + chap_num = instance.search(li_tag.text.strip()).group('id') + self.c_nav_count += 1 + if li_tag.find_previous("h2", class_="twoh2"): + self.set_chapter_section_id(li_tag, chap_num, + sub_tag=f"{text[0]}", + prev_id=li_tag.find_previous({"h2", "h1"}).get("id"), + cnav=f'cnav{self.c_nav_count:02}') + elif text == self.h2_order[3]: + instance = getattr(self.parser_obj, pattern) + if instance.search(li_tag.text.strip()): + chap_num = instance.search(li_tag.text.strip()).group('id') + self.c_nav_count += 1 + + if li_tag.find_previous("h2", class_="threeh2"): + self.set_chapter_section_id(li_tag, chap_num, + sub_tag=f"{text[0]}", + prev_id=li_tag.find_previous({"h2", "h1"}).get("id"), + cnav=f'cnav{self.c_nav_count:02}') + elif self.regex_pattern_obj.h2_article_pattern_con.search(li_tag.text.strip()): + chap_num = self.regex_pattern_obj.h2_article_pattern_con.search(li_tag.text.strip()).group("id") + self.c_nav_count += 1 + self.set_chapter_section_id(li_tag, chap_num, + sub_tag="-ar", + prev_id=li_tag.find_previous("h1").get("id"), + cnav=f'cnav{self.c_nav_count:02}') + + elif self.regex_pattern_obj.section_pattern_con.search(li_tag.text.strip()): + chap_num = self.regex_pattern_obj.section_pattern_con.search(li_tag.text.strip()).group("id") + self.s_nav_count += 1 + if self.regex_pattern_obj.section_pattern_con1.search(li_tag.text.strip()): + if li_tag.find_previous("h3", class_={"oneh2", "gen", "amend"}): + self.set_chapter_section_id(li_tag, chap_num, + sub_tag="-s", + prev_id=li_tag.find_previous("h3", class_={"oneh2", 
"gen", + "amend"}).get("id"), + cnav=f'snav{self.s_nav_count:02}') + else: + + self.set_chapter_section_id(li_tag, chap_num, + sub_tag="-s", + prev_id=li_tag.find_previous({"h2", "h1"}).get("id"), + cnav=f'snav{self.s_nav_count:02}') + + elif li_tag.find_previous("h4") and \ + self.regex_pattern_obj.h2_article_pattern_con.search( + li_tag.find_previous("h4").text.strip()): + self.set_chapter_section_id(li_tag, chap_num, + sub_tag="-s", + prev_id=li_tag.find_previous("h4").get("id"), + cnav=f'snav{self.s_nav_count:02}') + + elif li_tag.find_previous("h3") and \ + self.regex_pattern_obj.amend_pattern_con.search( + li_tag.find_previous("h3").text.strip()): + self.set_chapter_section_id(li_tag, chap_num, + sub_tag="-s", + prev_id=li_tag.find_previous("h3").get("id"), + cnav=f'snav{self.s_nav_count:02}') + + else: + self.set_chapter_section_id(li_tag, chap_num, + sub_tag="-s", + prev_id=li_tag.find_previous({"h2", "h1"}).get("id"), + cnav=f'snav{self.s_nav_count:02}') + + elif re.search(r'^PREAMBLE|^AMENDMENTS|^Schedule', li_tag.text.strip(), re.I) or \ + self.h2_pattern_text and li_tag.text.strip() in self.h2_pattern_text: + chap_num = re.sub(r'[\W\s]+', '', li_tag.text.strip()).lower() + self.c_nav_count += 1 + self.set_chapter_section_id(li_tag, chap_num, + sub_tag="-", + prev_id=li_tag.find_previous('h1').get("id"), + cnav=f'cnav{self.c_nav_count:02}') + elif self.regex_pattern_obj.amend_pattern_con.search(li_tag.text.strip()): + chap_num = self.regex_pattern_obj.amend_pattern_con.search(li_tag.text.strip()).group("id") + self.a_nav_count += 1 + self.set_chapter_section_id(li_tag, chap_num, + sub_tag="-", + prev_id=li_tag.find_previous({"h2", "h1"}).get("id"), + cnav=f'amnav{self.a_nav_count:02}') + + elif self.h2_text_con and li_tag_text in self.h2_text_con: + if li_tag.find_previous("li") and li_tag.find_previous("li").get("id"): + self.chp_nav_count = int( + re.search(r'(c|s|am|a)nav(?P<ncount>\d+)', + li_tag.find_previous("li").get("id").strip()).group( + "ncount")) + 1 + else: + self.chp_nav_count += 1 + self.set_chapter_section_id(li_tag, li_tag_text.lower(), + sub_tag="-", + prev_id=li_tag.find_previous("h1").get("id"), + cnav=f'cnav{self.chp_nav_count:02}') + elif self.h3_pattern_text_con: + for list_pattern in self.h3_pattern_text_con: + h3_pattern = re.compile(list_pattern) + if h3_tag := h3_pattern.search(li_tag.text.strip()): + self.s_nav_count += 1 + self.set_chapter_section_id(li_tag, h3_tag.group("id"), + sub_tag='s', + prev_id=li_tag.find_previous( + class_={"navhead1", "navhead", "oneh2", "twoh2", + "threeh2"}).get( + "id"), + cnav=f'snav{self.s_nav_count:02}') + elif li_tag.name in ['h2', 'h3', 'h4']: + self.a_nav_count = 0 + self.c_nav_count = 0 + self.p_nav_count = 0 + self.s_nav_count = 0 + + def create_analysis_nav_tag_con(self): + pass + + def run_constitution(self): + """calling methods to parse the passed constitution htmls""" + + self.set_page_soup() + self.set_release_date() + self.pre_process() + self.generate_class_name_dict() + self.replace_tags_constitution() + self.wrap_inside_main_tag() + self.add_anchor_tags_con() + self.convert_paragraph_to_alphabetical_ol_tags() + self.create_analysis_nav_tag() + self.wrap_div_tags() + self.post_process() + self.validating_broken_links() + self.write_soup_to_file() + + def run_titles(self): + + """calling methods to parse the passed title htmls""" + + self.set_page_soup() + self.set_release_date() + self.pre_process() + self.generate_class_name_dict() + self.replace_tags_titles() + self.wrap_inside_main_tag() + 
self.add_anchor_tags()
+        self.convert_paragraph_to_alphabetical_ol_tags()
+        self.create_analysis_nav_tag()
+        self.wrap_div_tags()
+        self.creating_formatted_table()
+        self.post_process()
+        self.storing_header_ids()
+        self.validating_broken_links()
+        self.write_soup_to_file()
+
+    def run(self):
+        logger.info(self.meta_data)
+        start_time = datetime.now()
+        logger.info(start_time)
+
+        if re.search('constitution', self.input_file_name):
+            self.run_constitution()
+        else:
+            self.run_titles()
+
+        logger.info(datetime.now() - start_time)
+        return str(self.soup.prettify(formatter=None)), self.meta_tags
+
+    def storing_header_ids(self):
+        title_id = re.search(r'(?P<tid>(\d+[A-Z]?\.\d+[A-Z]?)|\d+(\w)*|\d+(\.\w+)*)', self.input_file_name).group("tid")
+        if not os.path.exists(f'{self.state_key}_cite_id'):
+            os.mkdir(f'{self.state_key}_cite_id')
+        if not os.path.exists(f'{self.state_key}_cite_id/{self.state_key}{self.release_number}'):
+            os.mkdir(f'{self.state_key}_cite_id/{self.state_key}{self.release_number}')
+
+        with open(
+                f'{self.state_key}_cite_id/{self.state_key}{self.release_number}/{self.state_key}{self.release_number}_{title_id}_ids.txt',
+                "w") as file:
+            list_of_ids = []
+            for tag in self.soup.find_all({'h3', "li"}):
+                if tag.name == "h3" and tag.get("id"):
+                    key = re.search(r'.+(s|c)(?P<sec_id>.+)$', tag.get("id").strip()).group("sec_id")
+                    value = tag.get("id")
+                    list_of_ids.append(f'{key} {value}\n')
+                elif tag.name == "li" and tag.get("id") and tag.parent.name == "ol":
+                    if re.search(r'.+s(?P<sec_id>.+)$', tag.get("id").strip()):
+                        key = re.search(r'.+s(?P<sec_id>.+)$', tag.get("id").strip()).group("sec_id")
+                        value = tag.get("id")
+                        list_of_ids.append(f'{key} {value}\n')
+            file.writelines(list_of_ids)
+
+    def validating_broken_links(self):
+        header_id_list: list = []
+        for head_tag in self.soup.find_all({'h2', 'h3', 'h5', 'table'}):
+            if head_tag.get("id"):
+                header_id_list.append(head_tag.get("id"))
+
+        for li_tag in self.soup.find_all("li"):
+            if li_tag.a and li_tag.a.get("href"):
+                href_to_id = re.sub(r'#', '', li_tag.a.get("href").strip())
+                if href_to_id not in header_id_list:
+                    li_tag.a.unwrap()
+                    logger.warning(
+                        f"*{li_tag.text.strip()}* is an invalid link in r{self.release_number} {self.input_file_name}")
+
+        logger.info("validated for broken links")
+
+    def creating_formatted_table(self):
+        pass
+
+    def format_id(self, sec_id, tag):
+        pass
+
+    def h2_set_id(self, header_tag):
+        pass
+
+    def get_h2_order(self):
+        if len(self.h2_order) < 4:
+            self.h2_order += [""] * (4 - len(self.h2_order))
+            logger.warning("appending empty elements to h2_order list")
+        elif len(self.h2_order) > 4:
+            logger.warning("ignoring elements of h2_order list after 4 elements")
diff --git a/html_parser_framework/co_html_parser.py b/html_parser_framework/co_html_parser.py
new file mode 100644
index 0000000..1f4a08f
--- /dev/null
+++ b/html_parser_framework/co_html_parser.py
@@ -0,0 +1,1400 @@
+import re
+from base_html_parser import ParseHtml
+from regex_pattern import CustomisedRegexCO
+import roman
+from loguru import logger
+
+
+class COParseHtml(ParseHtml):
+
+    def __init__(self, state_key, path, release_number, input_file_name):
+        super().__init__(state_key, path, release_number, input_file_name)
+        self.file_no = None
+
+    def pre_process(self):
+        if re.search('constitution', self.input_file_name):
+            self.tag_type_dict: dict = {'ul': '^Article I.|^Preamble', 'head2': '^Article I.',
+                                        'head1': '^Declaration of Independence|Constitution of the State of Colorado',
+                                        'head3': r'^§
1.|^Section 1\.', 'junk1': '^Statute text|^Text', 'ol_p': r'^§', + 'head4': '^ANNOTATIONS|^ANNOTATION', 'art_head': '^ARTICLE', + 'amd': '^AMENDMENTS', 'Analysis': r'^I\.', 'section': '^Section 1.', + } + else: + self.file_no = re.search(r'gov\.co\.crs\.title\.(?P<fno>\w+(\.\d)*)\.html', self.input_file_name).group( + "fno") + + if self.file_no in ['07', '38', '04']: + self.tag_type_dict: dict = {'ul': '^Art.', 'head2': r'^ARTICLE \d', + 'head1': '^(TITLE|Title)|^(CONSTITUTION OF KENTUCKY)', + 'head3': r'^\d+(\.\d+)*-\d+-\d+\.', + 'part_head': r'^PART 1', + 'junk1': '^Annotations', 'ol_p': r'^(\(1\))', + 'head4': '^ANNOTATION', 'nd_nav': r'^1\.', + 'Analysis': r'^Analysis', 'editor': '^Editor\'s note', + 'h4_article': r'^Article I', + 'table': r'^Eligible Employers Solvency|20 or younger|^Table \d'} + + else: + self.tag_type_dict: dict = {'ul': '^Art.', 'head2': '^ARTICLE|^Article|^Part', + 'head1': '^(TITLE|Title)|^(CONSTITUTION OF KENTUCKY)', + 'head3': r'^\d+(\.\d+)*-\d+-\d+\.', + 'junk1': '^Annotations', 'ol_p': r'^(\(1\))', + 'head4': '^ANNOTATION', 'nd_nav': r'^1\.', 'part_head': r'^PART\s\d+', + 'Analysis': r'^Analysis', 'editor': '^Editor\'s note', + 'h4_article': 'Article I', 'h4_article1': 'Article I', + 'table': r'^Eligible Employers Solvency|20 or younger|^Table \d'} + + self.h2_text = ['General, Primary, Recall, and Congressional Vacancy Elections', 'Other Election Offenses', + 'Initiative and Referendum', 'Odd-Year Elections', 'Election Campaign Regulations', + 'Congressional Districts', 'General Assembly', 'Legislative Services', 'Jurisdiction', + 'Statutes - Construction and Revision', 'Miscellaneous', 'Consumer Credit Code', + 'Refund Anticipation Loans', 'Rental Purchase', 'Interest Rates', 'Debt Management', + 'Fair Trade and Restraint of Trade', 'Energy and Water Conservation', 'Art Transactions', + 'Agricultural Assistance', 'Assignments in General', 'Patents - Prohibited Communication', + 'Enforcement of Nondramatic Music Copyrights', 'Charitable Solicitations', + 'Records Retention', 'Health Care Coverage Cooperatives', + 'Transactions Involving Licensed Hospitals', 'Hospital Disclosures to Consumers', + 'Protection Against Exploitation of At-Risk Adults', 'Residential Roofing Services', + 'Direct Primary Health Care', 'Cemeteries', 'Public Establishments', + 'Internet Service Providers', 'Corporations', 'Associations', 'Partnerships', + 'Trademarks and Business Names', 'Trade Secrets', 'Limited Liability Companies', + 'Corporations and Associations', 'Corporations - Continued', 'Colorado Corporation Code', + 'Nonprofit Corporations', 'Special Purpose Corporations', + 'Religious and Benevolent Organizations', 'LABOR I - Department of Labor and Employment', + 'LABOR II - Workers’ Compensation and Related Provisions', + 'LABOR III - Employment Security', + 'Employment and Training', 'Independent Living Services', 'Labor Conditions', 'Wages', + 'Division of Labor - Industrial Claim Appeals Office', 'Labor Relations', + 'Workers’ Compensation Cost Containment', 'Apprenticeship and Training', + 'Public Works', 'Fuel Products', 'Buildings and Equipment', 'Explosives', + 'Special Safety Provisions', 'General Provisions', 'Licenses', + 'Regulation of Insurance Companies', 'Property and Casualty Insurance', + 'Nonadmitted Insurance', 'Captive Insurance Companies', + 'Life Insurance', 'Covercolorado', + 'Franchise Insurance', 'Credit Insurance', 'Title Insurance', 'Mutual Insurance', + 'Interinsurance', 'Fraternal Benefit Societies', 'Preneed Funeral Contracts', + 'Health Care 
Coverage', 'Health Maintenance Organizations', 'Medicare Supplement Insurance', + 'Long - Term Care', 'Life and Health Insurance Protection', 'Health Care', + 'Cash - Bonding Agents', 'Banks and Industrial Banks', 'Branch Institutions', + 'Credit Unions', 'Marijuana Financial Services Cooperatives', 'Miscellaneous', + 'Savings and Loan Associations', 'Securities', 'Public Securities', + 'Recovery and Reinvestment Finance Act', 'U.S. Agency Obligations', + 'Hospital and Health Care Trusts', 'Compliance Review Documents', 'Banks', 'Banking Code', + 'General Financial Provisions', 'Industrial Banks', 'Trust Companies and Trust Funds', + 'General', 'Division of Real Estate', 'Division of Conservation', + 'Division of Professions and Occupations', 'Business Professions and Occupations', + 'Health Care Professions and Occupations', 'Courts of Record', + 'Municipal Courts', 'Civil Protection Orders', 'Change of Name', 'Costs', + 'Regulation of Actions and Proceedings', 'Damages and Limitations on Actions', + 'Contracts and Agreements', 'Evidence', 'Fees and Salaries', 'Forcible Entry and Detainer', + 'Habeas Corpus', 'Joint Rights and Obligations', 'Judgments and Executions', + 'Juries and Jurors', 'Limitation of Actions', 'Priority of Actions', 'Witnesses', + 'Advocates', 'Adoption - Adults', 'Marriage and Rights of Married Persons', + 'Domestic Abuse', 'Desertion and Nonsupport', + 'Dissolution of Marriage - Parental Responsibilities', + 'Child Support', 'Civil Union', 'Fiduciary', 'Powers of Appointment', + 'Colorado Uniform Trust Code', 'Colorado Probate Code', + 'Declarations - Future Health Care Treatment', 'Human Bodies After Death', + 'Community Property Rights', 'Designated Beneficiary Agreements', + 'Abandoned Estate Planning Documents', 'Code of Criminal Procedure', + 'Uniform Mandatory Disposition of Detainers Act', 'Wiretapping and Eavesdropping', + 'Criminal Activity Information', 'Sentencing and Imprisonment', 'Costs - Criminal Actions', + 'Fugitives and Extradition', 'Offenders - Registration', 'Department of Corrections', + 'Correctional Facilities and Programs', 'Diagnostic Programs', 'Miscellaneous Provisions', + 'General and Administrative', 'Compensatory Education', 'School Districts', + 'Financial Policies and Procedures', 'Financing of Schools', 'Second Chance Program', + 'Financing of Schools - Continued', 'Teachers', 'Junior Colleges', 'Miscellaneous', + 'State Universities and Colleges', 'Community Colleges and Occupational Education', + 'Educational Centers and Local District Colleges', 'Educational Programs', + 'Administration', 'Vital Statistics', 'Hospitals', 'Disease Control', + 'Products Control and Safety', 'Family Planning', 'Environmental Control', + 'Environment - Small Communities', 'Safety - Disabled Persons', + 'Prevention, Intervention, and Treatment Services', 'Health Care', 'Administration', + 'Prescription Drugs', 'Indigent Care', 'Colorado Medical Assistance Act', + 'Children’s Basic Health Plan', 'Administration', 'State Officers', 'Principal Departments', + 'Governor’s Office', 'Other Agencies', 'State Personnel System and State Employees', + 'Public Employees’ Retirement Systems', 'Federal Programs - Housing - Relocation', + 'Interstate Compacts and Agreements', 'Planning - State', + 'Publication of Legal Notices and Public Printing', 'Electronic Transactions', + 'Public (Open) Records', 'Governmental Access to News Information', + 'State Funds', 'Federal Funds', 'Restrictions on Public Benefits', + 'State Fiscal Policies Relating To Section 20 Of 
Article X of the State Constitution',
+                        'Federal Mandates', 'Internet Regulation', 'State Delinquency Charges',
+                        'State History, Archives, and Emblems', 'Allocation for Art', 'State Property',
+                        'State Assistance - Denver Convention Center', 'Information Technology Access for Blind',
+                        'Libraries', 'Construction', 'Procurement Code',
+                        'Government Competition with Private Enterprise',
+                        'Financing of Critical State Needs', 'Administration', 'Vital Statistics', 'Hospitals',
+                        'Disease Control', 'Products Control and Safety', 'Family Planning',
+                        'Environmental Control', 'Environment - Small Communities', 'Safety - Disabled Persons',
+                        'Prevention, Intervention, and Treatment Services', 'Health Care',
+                        'Department of Human Services', 'Mental Health', 'Corrections', 'Other Institutions',
+                        'Colorado Diagnostic Program', 'Behavioral Health',
+                        'Mental Health and Mental Health Disorders',
+                        'Alcohol and Substance Use - Alcohol and Substance Use Disorders', 'Institutions',
+                        'Emergency Preparedness', 'Military', 'Veterans', 'Division of Aviation',
+                        'General Provisions', 'Housing', 'Miscellaneous', 'Energy Conservation',
+                        'Property Insurance',
+                        'Bond Anticipation Notes', 'Tax Anticipation Notes', 'Land Use Control and Conservation',
+                        'Hazardous Substance Incidents', 'Wildland Fire Planning', 'Special Statutory Authorities',
+                        'Marketing Districts', 'Affordable Housing Dwelling Unit Advisory Boards',
+                        'Competition in Utility and Entertainment Services', 'Medical Provider Fees',
+                        'Immigration Status - Cooperation with Federal Officials', 'Compensation - Fees',
+                        'County Elected Officials’ Salary Commission', 'Location and Boundaries', 'County Officers',
+                        'County Powers and Functions', 'County Planning and Building Codes',
+                        'Apportionment of Federal Moneys', 'Flood Control', 'Home Rule',
+                        'Corporate Class - Organization and Territory', 'Municipal Elections',
+                        'Annexation - Consolidation - Disconnection', 'Powers and Functions of Cities and Towns',
+                        'Special District Act', 'Multipurpose Districts', 'Water and Sanitation Districts',
+                        'Single Purpose Service Districts', 'Regional Service Authorities',
+                        'Special Statutory Districts', 'Wildlife', 'Administration', 'Parks',
+                        'Wildlife - Continued',
+                        'Outdoor Recreation', 'Colorado Natural Areas', 'Recreational Areas and Ski Safety',
+                        'Great Outdoors Program', 'Geological Survey', 'Joint Review Process', 'Mines and Minerals',
+                        'Oil and Natural Gas', 'Administration', 'Pest and Weed Control',
+                        'Organically Grown Products',
+                        'Fertilizers', 'Weights and Measures', 'Central Filing System', 'Poultry and Rabbits',
+                        'Agricultural Products - Standards and Regulations', 'Marketing and Sales',
+                        'Protection of Livestock', 'Livestock', 'Meat Processing',
+                        'Agricultural Products - Standards and Regulations - Continued', 'Fairs',
+                        'Soil Conservation', 'Development Authority', 'Produce Safety', 'Pet Animal Care',
+                        'Public Lands and Rivers', 'Weather Modification', 'State Lands', 'Forestry',
+                        'Natural Areas',
+                        'Conservancy Law of Colorado - Flood Control', 'Drainage and Drainage Districts',
+                        'Water Conservation and Irrigation Districts', 'Water Conservation Board and Compacts',
+                        'Water Rights and Irrigation', 'Water Resources and Power Development',
+                        'Water Conservation', 'Eminent Domain', 'Frauds - Statute of Frauds',
+                        'Joint Rights and Obligations', 'Tenants and Landlords', 'Unclaimed Property',
+                        'Loaned Property', 'Liens', 'Partition', 'Manufactured Homes', 'Real Property',
+                        'Survey Plats and Monument Records', 'Property Tax', 'Specific
Taxes', + 'General and Administrative', 'Exemptions', 'Deferrals', 'Valuation and Taxation', + 'Equalization', 'Collection and Redemption', 'Conveyancing and Evidence of Title', + 'Public Utilities', 'Railroads', 'Geothermal Heat', 'Energy Impacts', 'Aircraft', + 'Airports', 'Aerospace', 'General and Administrative', 'Drivers’ Licenses', 'Taxation', + 'Regulation of Vehicles and Traffic', 'Automobile Theft Law', 'Certificates of Title', + 'Motor Vehicle Financial Responsibility Law', 'Port of Entry Weigh Stations', + 'Motor Vehicle Repairs', 'Collector’s Items', 'Disposition of Personal Property', + 'Idling Standard', 'Highway Safety', 'General and Administrative', + 'Highways and Highway Systems', 'Special Highway Construction', 'Financing', + 'Highway Safety', + 'Aviation Safety and Accessibility', 'General Provisions', 'Alcohol and Tobacco Regulation', + 'Marijuana Regulation', 'Automobiles', 'Gaming and Racing', 'Lottery', + 'Generally', 'Airport Revenue Bonds' + ] + + self.h4_head: list = ['Editor’s Notes', 'Cross references:', 'NOTES TO DECISIONS', 'JUDICIAL DECISIONS', + 'RESEARCH REFERENCES', 'ANNOTATION', 'OFFICIAL COMMENT', 'History.'] + + self.watermark_text = """Release {0} of the Official Code of Colorado Annotated released {1}. + Transformed and posted by Public.Resource.Org using cic-beautify-state-codes.py version 1.4 on {2}. + This document is not subject to copyright and is in the public domain. + """ + self.h2_order: list = ['article', 'part', 'subpart', ''] + + self.regex_pattern_obj = CustomisedRegexCO() + + def replace_tags_titles(self): + for p_tag in self.soup.find_all("p", class_=self.tag_type_dict["head1"]): + if self.regex_pattern_obj.h2_subpart_pattern.search(p_tag.text.strip()): + pos = p_tag.attrs['class'].index(self.tag_type_dict["head1"]) + p_tag.attrs['class'][pos] = self.tag_type_dict["head2"] + + super(COParseHtml, self).replace_tags_titles() + num_p_tag = None + h4_count = 1 + cap_roman = "I" + cap_rom_id = None + alpha_id = None + + for p_tag in self.soup.find_all("p"): + if p_tag.get("class") == [self.tag_type_dict["head2"]]: + p_tag.name = "h2" + if p_tag.get("class") == [self.tag_type_dict["head4"]]: + if num_p_tag and re.search(r'^\d+\.\s—\w+', p_tag.text.strip()): + p_tag.name = "h4" + p_tag_text = re.sub(r'[\W\s]+', '', p_tag.text.strip()) + p_tag["id"] = f'{num_p_tag}{p_tag_text}' + else: + if re.search(rf'^{cap_roman}\.', p_tag.text.strip()): + p_tag.name = "h5" + chap_num = re.search(r'^(?P<id>[IVX]+)\.', p_tag.text.strip()).group("id") + + cap_rom_id = f'{p_tag.find_previous("h3").get("id")}-annotation-{chap_num}' + p_tag["id"] = f'{p_tag.find_previous("h3").get("id")}-annotation-{chap_num}' + cap_roman = roman.toRoman(roman.fromRoman(cap_roman.upper()) + 1) + + elif re.search(r'^[A-Z]\.\s"?[A-Z][a-z]+', p_tag.text.strip()) and \ + p_tag.find_previous(re.compile('^h[1-4]$')).name == "h4": + p_tag.name = "h5" + prev_id = cap_rom_id + chap_num = re.search(r'^(?P<id>[A-Z])\.', p_tag.text.strip()).group("id") + alpha_id = f'{prev_id}-{chap_num}' + p_tag["id"] = f'{prev_id}-{chap_num}' + + elif alpha_id and re.search(r'^[1-9]\.', p_tag.text.strip()): + p_tag.name = "h5" + num_id = alpha_id + chap_num = re.search(r'^(?P<id>[0-9])\.', p_tag.text.strip()).group("id") + p_tag["id"] = f'{num_id}-{chap_num}' + elif re.search(r'^(?P<id>[IVX]+)\.', p_tag.text.strip()) and self.release_number == '76': + p_tag.name = "h5" + chap_num = re.search(r'^(?P<id>[IVX]+)\.', p_tag.text.strip()).group("id") + cap_rom_id = 
f'{p_tag.find_previous("h3").get("id")}-annotation-{chap_num}' + p_tag["id"] = f'{p_tag.find_previous("h3").get("id")}-annotation-{chap_num}' + + if self.regex_pattern_obj.h2_part_pattern.search(p_tag.text.strip()): + p_tag["class"] = "navhead1" + p_tag[ + "id"] = f'{p_tag.find_previous("h2").get("id")}p{self.regex_pattern_obj.h2_part_pattern.search(p_tag.text.strip()).group("id").zfill(2)}' + + elif self.regex_pattern_obj.h2_subpart_pattern.search(p_tag.text.strip()): + + p_tag["class"] = "navhead" + p_tag[ + "id"] = f'{p_tag.find_previous("p", class_="navhead1").get("id")}s{self.regex_pattern_obj.h2_subpart_pattern.search(p_tag.text.strip()).group("id").zfill(2)}' + + if re.search(r'^——————————$', p_tag.text.strip()): + p_tag.decompose() + + elif p_tag.get("class") == [self.tag_type_dict["part_head"]]: + if self.regex_pattern_obj.h2_part_pattern.search(p_tag.text.strip()): + p_tag["class"] = "navhead1" + p_tag[ + "id"] = f'{p_tag.find_previous("h2").get("id")}p{self.regex_pattern_obj.h2_part_pattern.search(p_tag.text.strip()).group("id").zfill(2)}' + + elif self.regex_pattern_obj.h2_subpart_pattern.search(p_tag.text.strip()): + + p_tag["class"] = "navhead" + p_tag[ + "id"] = f'{p_tag.find_previous("p", class_="navhead1").get("id")}s{self.regex_pattern_obj.h2_subpart_pattern.search(p_tag.text.strip()).group("id").zfill(2)}' + + if re.search(r'^——————————$', p_tag.text.strip()): + p_tag.decompose() + + elif p_tag.get("class") == [self.tag_type_dict["h4_article"]] or \ + "h4_article1" in self.tag_type_dict and p_tag.get("class") == [self.tag_type_dict["h4_article1"]]: + if re.search(r'^(ARTICLE|Article) ([IVX]+|\d+)', p_tag.text.strip()): + p_tag.name = "h4" + ar_id = re.search(r"^(ARTICLE|Article) (?P<aid>[IVX]+|\d+)", p_tag.text.strip()).group("aid") + p_tag["id"] = f'{p_tag.find_previous("h3").get("id")}-' \ + f'a{ar_id}' + + elif p_tag.get("class") == [self.tag_type_dict["ol_p"]]: + if p_tag.text.strip() in self.h4_head: + p_tag.name = "h4" + header4_tag_text = re.sub(r'[\W.]+', '', p_tag.text.strip()).lower() + h4_tag_id = f'{p_tag.find_previous({"h3", "h2", "h1"}).get("id")}-{header4_tag_text}' + + if h4_tag_id in self.h4_cur_id_list: + p_tag['id'] = f'{h4_tag_id}.{h4_count}' + h4_count += 1 + else: + p_tag['id'] = f'{h4_tag_id}' + + self.h4_cur_id_list.append(h4_tag_id) + + if self.release_number in ['75', '76'] and self.file_no in ['37', '24']: + if re.search(r'^(ARTICLE|Article) ([IVX]+|\d+)', p_tag.text.strip()): + p_tag.name = "h4" + ar_id = re.search(r"^(ARTICLE|Article) (?P<aid>[IVX]+|\d+)", p_tag.text.strip()).group("aid") + p_tag["id"] = f'{p_tag.find_previous("h3").get("id")}-' \ + f'a{ar_id}' + + if re.search(r'^Analysis$|^ARTICLE \d\.', p_tag.text.strip()): + cap_roman = "I" + for tag in p_tag.find_next_siblings(): + if tag.get('class') == [self.tag_type_dict["head4"]] or \ + tag.get('class') == [self.tag_type_dict["part_head"]]: + break + else: + tag["class"] = "annotation" + tag.name = "li" + + if re.search(r'^History\.', p_tag.text.strip()): + alpha_id = None + + def add_anchor_tags(self): + super(COParseHtml, self).add_anchor_tags() + + h2_article_pattern = re.compile(r'^(article|Art\.)\s(?P<id>\d+(\.\d+)*)', re.I) + + for li_tag in self.soup.findAll(): + if li_tag.name == "li" and not li_tag.get("id"): + if h2_article_pattern.search(li_tag.text.strip()): + chap_num = h2_article_pattern.search(li_tag.text.strip()).group("id") + sub_tag = "a" + prev_id = li_tag.find_previous("h1").get("id") + self.c_nav_count += 1 + cnav = f'cnav{self.c_nav_count:02}' + 
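`set_chapter_section_id()` is defined in the base class and is not shown in this diff; judging from the ids that `storing_header_ids()` records and `validating_broken_links()` checks, the nav items these calls produce appear to combine the previous header id, a sub-tag, a zero-padded number, and the positional cnav counter. A sketch under that assumption, with all concrete values hypothetical:

```python
from bs4 import BeautifulSoup

soup = BeautifulSoup("<li>Art. 2. Elections</li>", "html.parser")
li = soup.find("li")

prev_id, sub_tag, chap_num, cnav = "t01", "a", "2", "cnav01"   # hypothetical values
target = f"{prev_id}{sub_tag}{chap_num.zfill(2)}"              # -> "t01a02"

anchor = soup.new_tag("a", href=f"#{target}")   # link the TOC entry to its header
anchor.string = li.text
li.clear()
li.append(anchor)
li["id"] = f"{target}-{cnav}"                   # positional nav id on the <li>

print(soup)  # <li id="t01a02-cnav01"><a href="#t01a02">Art. 2. Elections</a></li>
```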
self.set_chapter_section_id(li_tag, chap_num, sub_tag, prev_id, cnav) + + elif li_tag.name in ['h2', 'h3', 'h4']: + self.a_nav_count = 0 + self.c_nav_count = 0 + self.p_nav_count = 0 + self.s_nav_count = 0 + + def convert_paragraph_to_alphabetical_ol_tags(self): + """ + For each tag which has to be converted to orderd list(<ol>) + - create new <ol> tags with appropriate type (1, A, i, a ..) + - get previous headers id to set unique id for each list item (<li>) + - append each li to respective ol accordingly + """ + if not re.search('constitution', self.input_file_name): + for tag in self.soup.find_all("p", class_=[self.tag_type_dict['editor']]): + if re.search(r'^Editor\'s note: \(\d+\)', tag.text.strip()): + new_h4_tag = self.soup.new_tag("h4") + new_h4_tag.string = tag.find_next("b").text + h4_text = re.sub(r'[\W\s]+', '', tag.find_next("b").text.strip()).lower() + new_h4_tag['id'] = f'{tag.find_previous({"h3", "h2", "h1"}).get("id")}-{h4_text}' + tag.insert_before(new_h4_tag) + tag.find_next("b").decompose() + + for p_tag in self.soup.find_all("p", class_=[self.tag_type_dict['ol_p']]): + current_p_tag = p_tag.text.strip() + if re.search(r'^\[.+\]\s*\(\d+(\.\d+)*\)', current_p_tag): + alpha_text = re.sub(r'^\[.+\]\s*', '', current_p_tag) + num_text = re.sub(r'\(1\).+', '', current_p_tag) + new_p_tag = self.soup.new_tag("p") + new_p_tag.string = alpha_text + new_p_tag["class"] = [self.tag_type_dict['ol_p']] + p_tag.insert_after(new_p_tag) + p_tag.string = num_text + + if re.search(r'^\(\d+(\.\d+)*\)', current_p_tag): + if p_tag.find_next().name == "b": + if re.search(r'^\[ Editor\'s note:', p_tag.find_next().text.strip()): + continue + else: + alpha_text = re.sub(r'^[^.]+\.', '', current_p_tag) + num_text = re.sub(r'\(a\).+', '', current_p_tag) + if re.search(r'^\s*\([a-z]\)', alpha_text): + new_p_tag = self.soup.new_tag("p") + new_p_tag.string = alpha_text + new_p_tag["class"] = [self.tag_type_dict['ol_p']] + p_tag.insert_after(new_p_tag) + p_tag.string = num_text + elif re.search(r'^.+\s(?P<alpha>\(a\)+)', current_p_tag): + alpha_text = re.search(r'^.+\s(?P<alpha>\(a\).+)', current_p_tag).group("alpha") + num_text = re.sub(r'\(a\).+', '', current_p_tag) + new_p_tag = self.soup.new_tag("p") + new_p_tag.string = alpha_text + new_p_tag["class"] = [self.tag_type_dict['ol_p']] + p_tag.insert_after(new_p_tag) + p_tag.string = num_text + + if re.search(r'^\(\d+\)\s*\([a-z]+\)\s*.+\s*\([a-z]\)', current_p_tag): + alpha = re.search( + r'^(?P<num_text>\(\d+\)\s*\((?P<alpha1>[a-z]+)\)\s*.+\s*)(?P<alpha_text>\((?P<alpha2>[a-z])\).+)', + current_p_tag) + if re.match(r'^\([a-z]\)', p_tag.find_next_sibling().text.strip()): + nxt_alpha = re.search(r'^\((?P<alpha3>[a-z])\)', + p_tag.find_next_sibling().text.strip()).group("alpha3") + if ord(alpha.group("alpha2")) == (ord(alpha.group("alpha1"))) + 1: + if ord(nxt_alpha) == (ord(alpha.group("alpha2"))) + 1: + alpha_text = alpha.group("alpha_text") + num_text = alpha.group("num_text") + new_p_tag = self.soup.new_tag("p") + new_p_tag.string = alpha_text + new_p_tag["class"] = [self.tag_type_dict['ol_p']] + p_tag.insert_after(new_p_tag) + p_tag.string = num_text + + if re.search(r'^\(\d+\)\s*(to|and)\s*\(\d+\)\s*', current_p_tag): + nxt_tag = p_tag.find_next_sibling( + lambda tag: tag.name in ['p'] and re.search(r'^[^\s]', tag.text.strip())) + alpha = re.search( + r'^(?P<text1>\((?P<num1>\d+)\))\s*(to|and)\s*(?P<text2>\((?P<num2>\d+)\)\s*(?P<rpt_text>.+))', + current_p_tag) + if re.search(r'^\(\d+\)', nxt_tag.text.strip()): + nxt_alpha = 
re.search(r'^\((?P<num3>\d+)\)', nxt_tag.text.strip()).group( + "num3") + if int(nxt_alpha) != int(alpha.group("num1")) + 1: + if int(alpha.group("num2")) == int(alpha.group("num1")) + 1: + if int(nxt_alpha) == int(alpha.group("num2")) + 1: + alpha_text = alpha.group("text2") + num_text = alpha.group("text1") + new_p_tag = self.soup.new_tag("p") + new_p_tag.string = alpha_text + new_p_tag["class"] = [self.tag_type_dict['ol_p']] + p_tag.insert_after(new_p_tag) + p_tag.string = num_text + else: + if int(nxt_alpha) == int(alpha.group("num2")) + 1: + alpha_text = alpha.group("text2") + num_text = alpha.group("text1") + alpha.group("rpt_text") + new_p_tag = self.soup.new_tag("p") + new_p_tag.string = alpha_text + new_p_tag["class"] = [self.tag_type_dict['ol_p']] + p_tag.insert_after(new_p_tag) + p_tag.string = num_text + range_from = int(alpha.group("num1")) + range_to = int(alpha.group("num2")) + count = range_from + 1 + for new_p_tag in range(range_from + 1, range_to): + new_p_tag = self.soup.new_tag("p") + new_p_tag.string = f'({count}){alpha.group("rpt_text")}' + new_p_tag["class"] = [self.tag_type_dict['ol_p']] + p_tag.insert_after(new_p_tag) + p_tag = new_p_tag + count += 1 + + if re.search(r'^\([a-zA-Z]\)\s*(to|and)\s*\([a-zA-Z]\)\s*(Repealed.|\()', current_p_tag): + alpha = re.search( + r'^(?P<text1>\((?P<num1>[a-zA-Z])\))\s*(to|and)\s*(?P<text2>\((?P<num2>[a-zA-Z])\)\s*(' + r'?P<rpt_text>Repealed.|\(.+))', + current_p_tag) + if re.match(r'^\([a-zA-Z]\)', p_tag.find_next_sibling().text.strip()): + nxt_alpha = re.search(r'^\((?P<num3>[a-zA-Z])\)', + p_tag.find_next_sibling().text.strip()).group( + "num3") + if ord(alpha.group("num2")) == ord(alpha.group("num1")) + 1: + if ord(nxt_alpha) == ord(alpha.group("num2")) + 1: + alpha_text = alpha.group("text2") + num_text = alpha.group("text1") + new_p_tag = self.soup.new_tag("p") + new_p_tag.string = alpha_text + new_p_tag["class"] = [self.tag_type_dict['ol_p']] + p_tag.insert_after(new_p_tag) + p_tag.string = num_text + + else: + if ord(nxt_alpha) == ord(alpha.group("num2")) + 1: + alpha_text = alpha.group("text2") + num_text = alpha.group("text1") + alpha.group("rpt_text") + new_p_tag = self.soup.new_tag("p") + new_p_tag.string = alpha_text + new_p_tag["class"] = [self.tag_type_dict['ol_p']] + p_tag.insert_after(new_p_tag) + p_tag.string = num_text + range_from = ord(alpha.group("num1")) + range_to = ord(alpha.group("num2")) + count = range_from + 1 + for new_p_tag in range(range_from + 1, range_to): + new_p_tag = self.soup.new_tag("p") + new_p_tag.string = f'({chr(count)}){alpha.group("rpt_text")}' + new_p_tag["class"] = [self.tag_type_dict['ol_p']] + p_tag.insert_after(new_p_tag) + p_tag = new_p_tag + count += 1 + + else: + alpha_text = alpha.group("text2") + num_text = alpha.group("text1") + new_p_tag = self.soup.new_tag("p") + new_p_tag.string = alpha_text + new_p_tag["class"] = [self.tag_type_dict['ol_p']] + p_tag.insert_after(new_p_tag) + p_tag.string = num_text + range_from = ord(alpha.group("num1")) + range_to = ord(alpha.group("num2")) + count = range_from + 1 + for new_p_tag in range(range_from + 1, range_to): + new_p_tag = self.soup.new_tag("p") + new_p_tag.string = f'({chr(count)})' + new_p_tag["class"] = [self.tag_type_dict['ol_p']] + p_tag.insert_after(new_p_tag) + p_tag = new_p_tag + count += 1 + + if re.search(r'^\([a-z]\).+\([a-z]\)\s*', current_p_tag): + alpha = re.search(r'^(?P<text1>\((?P<alpha1>[a-z])\).+)(?P<text2>\((?P<alpha2>[a-z])\)\s*.+)', + current_p_tag) + if re.match(r'^\([a-z]\)', 
p_tag.find_next_sibling().text.strip()): + nxt_alpha = re.search(r'^\((?P<alpha3>[a-z])\)', + p_tag.find_next_sibling().text.strip()).group( + "alpha3") + if ord(alpha.group("alpha2")) == ord(alpha.group("alpha1")) + 1: + if ord(nxt_alpha) == ord(alpha.group("alpha2")) + 1: + alpha_text = alpha.group("text2") + num_text = alpha.group("text1") + new_p_tag = self.soup.new_tag("p") + new_p_tag.string = alpha_text + new_p_tag["class"] = [self.tag_type_dict['ol_p']] + p_tag.insert_after(new_p_tag) + p_tag.string = num_text + + main_sec_alpha = 'a' + sec_alpha = 'a' + cap_alpha = 'A' + inr_cap_alpha = 'A' + cap_roman = 'I' + ol_head = 1 + roman_count = 1 + ol_count = 1 + inner_num_head = 1 + + alpha_ol = self.soup.new_tag("ol", type="a") + cap_alpha_ol = self.soup.new_tag("ol", type="A") + roman_ol = self.soup.new_tag("ol", type="I") + num_ol = self.soup.new_tag("ol") + + dup_id_list = [] + inner_roman_ol = None + num_tag = None + inr_cap_alpha_cur_tag = None + alpha_cur_tag = None + prev_alpha_id = None + prev_head_id = None + article_alpha_tag = None + inner_alpha_tag = None + num_cur_tag = None + cap_alpha_cur_tag = None + sec_alpha_cur_tag = None + previous_li_tag = None + prev_num_id = None + rom_id = None + prev_id = None + prev_alpha = None + sec_alpha_id = None + sec_alpha_ol = self.soup.new_tag("ol", type="a") + inr_cap_alpha_ol = self.soup.new_tag("ol", type="A") + num_ol1 = self.soup.new_tag("ol") + + for p_tag in self.soup.body.find_all(['h3', 'h4', 'h5', 'p']): + current_tag_text = p_tag.text.strip() + + if re.search(rf'^\({ol_head}\)|^\[.+]\s*\({ol_head}\)|^\(\d+\.\d+\)', current_tag_text): + previous_li_tag = p_tag + if re.search(rf'^\({ol_head}\)|^\[.+]\s*\({ol_head}\) ', current_tag_text): + p_tag.name = "li" + num_cur_tag = p_tag + + if re.search(r'^\(1\)|^\[.+]\s*\(1\)', current_tag_text): + num_ol = self.soup.new_tag("ol") + p_tag.wrap(num_ol) + + if article_alpha_tag: + alpha_cur_tag.append(num_ol) + prev_head_id = alpha_cur_tag.get("id") + elif cap_alpha_cur_tag: + cap_alpha_cur_tag.append(num_ol) + prev_head_id = cap_alpha_cur_tag.get("id") + elif inner_alpha_tag: + inner_alpha_tag.append(num_ol) + prev_head_id = inner_alpha_tag.get("id") + else: + prev_head_id = p_tag.find_previous(["h4", "h3", "h2", "h1"]).get("id") + main_sec_alpha = 'a' + else: + num_ol.append(p_tag) + + if inr_cap_alpha_cur_tag: + p_tag["id"] = f'{inr_cap_alpha_cur_tag.get("id")}{ol_head}' + elif article_alpha_tag: + p_tag["id"] = f'{alpha_cur_tag.get("id")}{ol_head}' + else: + prev_num_id = f'{prev_head_id}ol{ol_count}{ol_head}' + p_tag["id"] = f'{prev_head_id}ol{ol_count}{ol_head}' + main_sec_alpha = 'a' + + p_tag.string = re.sub(rf'^\({ol_head}\)', '', current_tag_text) + ol_head += 1 + + if re.search(r'^\(\d+\)\s\(a\)', current_tag_text): + alpha_ol = self.soup.new_tag("ol", type="a") + li_tag = self.soup.new_tag("li") + li_tag.string = re.sub(r'^\(\d+\)\s\(a\)', '', current_tag_text) + previous_li_tag = li_tag + alpha_cur_tag = li_tag + cur_tag = re.search(r'^\((?P<cid>\d+)\)\s\((?P<pid>a)\)', current_tag_text) + prev_alpha_id = f'{prev_head_id}ol{ol_count}{cur_tag.group("cid")}' + li_tag["id"] = f'{prev_head_id}ol{ol_count}{cur_tag.group("cid")}{cur_tag.group("pid")}' + alpha_ol.append(li_tag) + p_tag.contents = [] + p_tag.append(alpha_ol) + main_sec_alpha = "b" + + if re.search(r'^\(\d+\)\s*\(\w\)\s*\([I,V,X]+\)', current_tag_text): + roman_ol = self.soup.new_tag("ol", type="I") + inner_li_tag = self.soup.new_tag("li") + inner_li_tag.string = re.sub(r'^\(\d+\)\s*\(\w\)\s*\([I,V,X]+\)', '', 
current_tag_text) + inner_li_tag.append(current_tag_text) + li_tag["class"] = self.tag_type_dict['ol_p'] + rom_cur_tag = li_tag + cur_tag = re.search(r'^\((?P<id1>\d+)\)\s*\((?P<cid>\w)\)\s*\((?P<id2>[IVX]+)\)', + current_tag_text) + rom_id = f'{prev_head_id}ol{ol_count}{cur_tag.group("id1")}{cur_tag.group("cid")}' + inner_li_tag[ + "id"] = f'{prev_head_id}ol{ol_count}{cur_tag.group("id1")}{cur_tag.group("cid")}{cur_tag.group("id2")}' + roman_ol.append(inner_li_tag) + alpha_cur_tag.string = "" + alpha_cur_tag.insert(0, roman_ol) + cap_roman = "II" + + if re.search(r'^\(\d+\)\s*\(\w\)\s*\([I,V,X]+\)\s*\(\w\)', current_tag_text): + cap_alpha_ol = self.soup.new_tag("ol", type="A") + inner_li_tag = self.soup.new_tag("li") + inner_li_tag.string = re.sub(r'^\(\d+\)\s*\(\w\)\s*\([I,V,X]+\)\s*\(\w\)', '', + current_tag_text) + li_tag["class"] = self.tag_type_dict['ol_p'] + cur_tag = re.search( + r'^\((?P<id1>\d+)\)\s*\((?P<cid>\w)\)\s*\((?P<id2>[I,V,X]+)\)\s*\((?P<id3>\w)\)', + current_tag_text) + prev_id = f'{prev_head_id}ol{ol_count}{cur_tag.group("id1")}{cur_tag.group("cid")}{cur_tag.group("id2")}' + + inner_li_tag[ + "id"] = f'{prev_head_id}ol{ol_count}{cur_tag.group("id1")}{cur_tag.group("cid")}{cur_tag.group("id2")}{cur_tag.group("id3")}' + cap_alpha_ol.append(inner_li_tag) + rom_cur_tag.string = "" + rom_cur_tag.append(cap_alpha_ol) + cap_alpha = "B" + + if re.search(r'^\(\d+\)\s\(i\)', current_tag_text): + inner_roman_ol = self.soup.new_tag("ol", type="i") + li_tag = self.soup.new_tag("li") + li_tag.string = re.sub(r'^\(\d+\)\s\(i\)', '', current_tag_text) + prev_alpha = p_tag + cur_tag = re.search(r'^\((?P<cid>\d+)\)\s\((?P<pid>i)\)', current_tag_text) + prev_num_id = f'{prev_head_id}{cur_tag.group("cid")}' + li_tag["id"] = f'{prev_head_id}{cur_tag.group("cid")}{cur_tag.group("pid")}' + inner_roman_ol.append(li_tag) + p_tag.contents = [] + p_tag.append(inner_roman_ol) + + elif re.search(r'^\(\d+\.\d+\)', current_tag_text): + cur_tag = re.search(r'^\((?P<cid>\d+\.\d+)\)', current_tag_text).group("cid") + tag_id = f'{prev_num_id}-{cur_tag}' + prev_num_id = f'{prev_num_id}-{cur_tag}' + if tag_id in dup_id_list: + p_tag["id"] = f'{tag_id}-{cur_tag}.1' + else: + p_tag["id"] = f'{tag_id}-{cur_tag}' + + dup_id_list.append(tag_id) + if num_cur_tag: + num_cur_tag.append(p_tag) + else: + p_tag.find_previous("li").append(p_tag) + main_sec_alpha = "a" + num_cur_tag = p_tag + + if re.search(r'^\(\d+\.\d+\)\s\(\w\)|^\(\d+\.\d+\)\s*\[.+\]\s*\(\w\)', current_tag_text): + alpha_ol = self.soup.new_tag("ol", type="a") + li_tag = self.soup.new_tag("li") + li_tag.append(current_tag_text) + alpha_cur_tag = li_tag + cur_tag = re.search(r'^\((?P<cid>\d+\.\d+)\).+\((?P<pid>\w)\)', current_tag_text) + prev_alpha_id = f'{prev_head_id}ol{ol_count}{cur_tag.group("cid")}{cur_tag.group("pid")}' + if prev_alpha_id in dup_id_list: + li_tag["id"] = f'{prev_head_id}ol{ol_count}{cur_tag.group("cid")}{cur_tag.group("pid")}.1' + else: + li_tag["id"] = f'{prev_head_id}ol{ol_count}{cur_tag.group("cid")}{cur_tag.group("pid")}' + alpha_ol.append(li_tag) + p_tag.contents = [] + p_tag.append(alpha_ol) + main_sec_alpha = "b" + cap_roman = "I" + dup_id_list.append(prev_alpha_id) + if re.search(r'^\(\d+\.\d+\)\s*\(\w\)\s*\([I,V,X]+\)\s*', current_tag_text): + roman_ol = self.soup.new_tag("ol", type="I") + inner_li_tag = self.soup.new_tag("li") + inner_li_tag.append(current_tag_text) + li_tag["class"] = self.tag_type_dict['ol_p'] + rom_cur_tag = li_tag + cur_tag = 
re.search(r'^\((?P<id1>\d+\.\d+)\)\s*\((?P<cid>\w)\)\s*\((?P<id2>[I,V,X]+)\)', + current_tag_text) + rom_id = f'{prev_head_id}ol{ol_count}{cur_tag.group("id1")}{cur_tag.group("cid")}{cur_tag.group("id2")}' + inner_li_tag[ + "id"] = f'{prev_head_id}ol{ol_count}{cur_tag.group("id1")}{cur_tag.group("cid")}{cur_tag.group("id2")}' + roman_ol.append(inner_li_tag) + p_tag.insert(1, roman_ol) + roman_ol.find_previous().string.replace_with(roman_ol) + cap_roman = "II" + + elif re.search(rf'^\({main_sec_alpha}\)|^\([a-z]\.\d+\)', current_tag_text): + previous_li_tag = p_tag + if re.search(rf'^\({main_sec_alpha}\)', current_tag_text): + p_tag.name = "li" + alpha_cur_tag = p_tag + cap_roman = "I" + if re.search(r'^\(a\)', current_tag_text): + alpha_ol = self.soup.new_tag("ol", type="a") + p_tag.wrap(alpha_ol) + + if p_tag.find_previous("h4") and re.search(r'^(ARTICLE|Article) [IVX]+', + p_tag.find_previous("h4").text.strip()): + if num_tag: + num_tag.append(alpha_ol) + prev_alpha_id = f'{num_tag.get("id")}' + elif num_cur_tag: + num_cur_tag.append(alpha_ol) + prev_alpha_id = f'{prev_num_id}' + else: + prev_alpha_id = f'{p_tag.find_previous("h4").get("id")}ol{ol_count}' + article_alpha_tag = p_tag + elif num_cur_tag: + article_alpha_tag = None + num_cur_tag.append(alpha_ol) + prev_alpha_id = f'{prev_num_id}' + else: + article_alpha_tag = p_tag + prev_alpha_id = f'{p_tag.find_previous(["h4", "h3", "h2", "h1"]).get("id")}ol{ol_count}' + + else: + alpha_ol.append(p_tag) + + if p_tag.find_previous("h4") and re.search(r'^(ARTICLE|Article) [IVX]+', + p_tag.find_previous("h4").text.strip()): + ol_head = 1 + + p_tag["id"] = f'{prev_alpha_id}{main_sec_alpha}' + p_tag.string = re.sub(rf'^\({main_sec_alpha}\)', '', current_tag_text) + main_sec_alpha = chr(ord(main_sec_alpha) + 1) + + if re.search(r'^\(\w\)(\s*\[.+])*\s*\([I,V,X]+\)', current_tag_text): + roman_ol = self.soup.new_tag("ol", type="I") + li_tag = self.soup.new_tag("li") + li_tag.string = re.sub(r'^\(\w\)\s*\([I,V,X]+\)', '', current_tag_text) + + li_tag["class"] = self.tag_type_dict['ol_p'] + rom_cur_tag = li_tag + cur_tag = re.search(r'^\((?P<cid>\w+)\)(\s*\[.+\])*\s*\((?P<pid>[I,V,X]+)\)', current_tag_text) + rom_id = f'{prev_num_id}{cur_tag.group("cid")}' + li_tag["id"] = f'{prev_num_id}{cur_tag.group("cid")}{cur_tag.group("pid")}' + roman_ol.append(li_tag) + p_tag.contents = [] + p_tag.append(roman_ol) + cap_roman = "II" + + if re.search(r'^\(\w\)\s*\([IVX]+\)\s*\(\w\)', current_tag_text): + cap_alpha_ol = self.soup.new_tag("ol", type="A") + inner_li_tag = self.soup.new_tag("li") + inner_li_tag.string = re.sub(r'^\(\w\)\s*\([IVX]+\)\s*\(\w\)', '', current_tag_text) + inner_li_tag.append(current_tag_text) + li_tag["class"] = self.tag_type_dict['ol_p'] + cur_tag = re.search( + r'^\((?P<cid>\w)\)\s*\((?P<id2>[IVX]+)\)\s*\((?P<id3>\w)\)', + current_tag_text) + prev_id = rom_cur_tag.get("id") + inner_li_tag[ + "id"] = f'{rom_cur_tag.get("id")}{cur_tag.group("id3")}' + cap_alpha_ol.append(inner_li_tag) + p_tag.insert(1, cap_alpha_ol) + rom_cur_tag.string = "" + rom_cur_tag.string.replace_with(cap_alpha_ol) + cap_alpha = "B" + + if re.search(r'^\(\w\)\s*\(\d+\)', current_tag_text): + num_ol = self.soup.new_tag("ol") + li_tag = self.soup.new_tag("li") + li_tag.string = re.sub(r'^\(\w\)\s*\(\d+\)', '', current_tag_text) + li_tag["class"] = self.tag_type_dict['ol_p'] + cur_tag = re.search(r'^\((?P<cid>\w+)\)\s*\((?P<pid>\d+)\)', current_tag_text) + li_tag["id"] = f'{alpha_cur_tag.get("id")}{cur_tag.group("pid")}' + num_ol.append(li_tag) + p_tag.contents = 
[] + p_tag.append(num_ol) + ol_head = 2 + cap_alpha = "A" + + if re.search(r'^\(\w\)\s*\([ivx]+\)', current_tag_text): + inner_roman_ol = self.soup.new_tag("ol", type="i") + inner_li_tag = self.soup.new_tag("li") + inner_li_tag.string = re.sub(r'^\(\w\)\s*\([ivx]+\)', '', current_tag_text) + inner_li_tag["class"] = self.tag_type_dict['ol_p'] + prev_alpha = inner_li_tag + cur_tag = re.search(r'^\((?P<cid>\w)\)\s*\((?P<pid>[ivx]+)\)', current_tag_text) + inner_li_tag["id"] = f'{alpha_cur_tag.get("id")}{cur_tag.group("pid")}' + inner_roman_ol.append(inner_li_tag) + p_tag.contents = [] + p_tag.append(inner_roman_ol) + elif re.search(r'^\(\w+\.\d+\)', current_tag_text): + p_tag.name = "li" + roman_count = 1 + cur_tag = re.search(r'^\((?P<cid>\w+\.\d+)\)', current_tag_text).group("cid") + p_tag.string = re.sub(r'^\(\w+\.\d+\)', '', current_tag_text) + p_tag_id = f'{prev_alpha_id}-{cur_tag}' + if p_tag_id in dup_id_list: + p_tag["id"] = f'{prev_alpha_id}-{cur_tag}.1' + else: + p_tag["id"] = f'{prev_alpha_id}-{cur_tag}' + + dup_id_list.append(p_tag_id) + prev_alpha_id = f'{prev_alpha_id}' + + if not re.search(r'^\(\w+\.\d+\)', p_tag.find_next().text.strip()) and re.search(r'^\([A-Z]\)', + p_tag.find_next().text.strip()): + prev_alpha_id = f'{prev_alpha_id}-{cur_tag}' + + alpha_ol.append(p_tag) + alpha_cur_tag = p_tag + + if re.search(r'^\(\w\.\d+\)\s*\([IVX]+\)', current_tag_text): + roman_ol = self.soup.new_tag("ol", type="I") + li_tag = self.soup.new_tag("li") + li_tag.string = re.sub(r'^\(\w\.\d+\)\s*\([IVX]+\)', '', current_tag_text) + + li_tag["class"] = self.tag_type_dict['ol_p'] + cur_tag = re.search(r'^\((?P<cid>\w+\.\d+)\)\s*\((?P<pid>[IVX]+)\)', current_tag_text) + rom_id = f'{prev_head_id}ol{ol_count}{cur_tag.group("cid")}' + li_tag["id"] = f'{prev_head_id}ol{ol_count}{cur_tag.group("cid")}{cur_tag.group("pid")}' + roman_ol.append(li_tag) + p_tag.contents = [] + p_tag.append(roman_ol) + cap_roman = "II" + + if re.search(r'^\(\w\.\d+\)\s*\([IVX]+\)\s*\(\w\)', current_tag_text): + cap_alpha_ol = self.soup.new_tag("ol", type="A") + inner_li_tag = self.soup.new_tag("li") + inner_li_tag.append(current_tag_text) + inner_li_tag["class"] = self.tag_type_dict['ol_p'] + cur_tag = re.search( + r'^\((?P<cid>\w\.\d+)\)\s*\((?P<id2>[IVX]+)\)\s*\((?P<id3>\w)\)', + current_tag_text) + prev_id = f'{prev_head_id}ol{ol_count}{cur_tag.group("cid")}{cur_tag.group("id2")}' + + inner_li_tag[ + "id"] = f'{prev_head_id}ol{ol_count}{cur_tag.group("cid")}{cur_tag.group("id2")}{cur_tag.group("id3")}' + + cap_alpha_ol.append(inner_li_tag) + p_tag.insert(1, cap_alpha_ol) + cap_alpha_ol.find_previous().string.replace_with(cap_alpha_ol) + cap_alpha = "B" + + elif re.search(rf'^\({cap_roman}\)', current_tag_text): + previous_li_tag = p_tag + p_tag.name = "li" + rom_cur_tag = p_tag + cap_alpha = "A" + if re.search(r'^\(I\)', current_tag_text): + roman_ol = self.soup.new_tag("ol", type="I") + p_tag.wrap(roman_ol) + if alpha_cur_tag: + alpha_cur_tag.append(roman_ol) + rom_id = f'{alpha_cur_tag.get("id")}' + p_tag["id"] = f'{alpha_cur_tag.get("id")}I' + else: + rom_id = f'{p_tag.find_previous("li").get("id")}' + p_tag["id"] = f'{p_tag.find_previous("li").get("id")}I' + p_tag.find_previous("li").append(roman_ol) + else: + roman_ol.append(p_tag) + p_tag["id"] = f'{rom_id}{cap_roman}' + + p_tag.string = re.sub(rf'^\({cap_roman}\)', '', current_tag_text) + cap_roman = roman.toRoman(roman.fromRoman(cap_roman.upper()) + 1) + + if re.search(r'^\([IVX]+\)\s*\([A-Z]\)', current_tag_text): + cap_alpha_ol = self.soup.new_tag("ol", 
type="A") + li_tag = self.soup.new_tag("li") + li_tag.string = re.sub(r'^\([IVX]+\)\s*\(A\)', '', current_tag_text) + cap_alpha_cur_tag = li_tag + cur_tag = re.search(r'^\((?P<cid>[IVX]+)\)\s*\((?P<pid>[A-Z])\)', current_tag_text) + prev_id = f'{rom_cur_tag.get("id")}' + li_tag["id"] = f'{rom_cur_tag.get("id")}{cur_tag.group("pid")}' + + if not re.search(r'^\(I\)', current_tag_text): + prev_tag_id = p_tag.find_previous("li").get("id") + cur_tag_id = re.search(r'^[^IVX]+', prev_tag_id).group() + li_tag["id"] = f'{cur_tag_id}{cur_tag.group("cid")}{cur_tag.group("pid")}' + cap_alpha_ol.append(li_tag) + p_tag.string = "" + p_tag.append(cap_alpha_ol) + roman_count += 1 + cap_alpha = "B" + + elif re.search(rf'^\({cap_alpha}\)', current_tag_text): + previous_li_tag = p_tag + p_tag.name = "li" + cap_alpha_cur_tag = p_tag + + if re.search(r'^\(A\)', current_tag_text): + cap_alpha_ol = self.soup.new_tag("ol", type="A") + p_tag.wrap(cap_alpha_ol) + prev_id = p_tag.find_previous("li").get("id") + p_tag.find_previous("li").append(cap_alpha_ol) + + else: + cap_alpha_ol.append(p_tag) + + if cap_alpha in ['I', 'V', 'X', 'L']: + p_tag["id"] = f'{prev_id}{ord(cap_alpha)}' + else: + p_tag["id"] = f'{prev_id}{cap_alpha}' + + p_tag.string = re.sub(rf'^\({cap_alpha}\)', '', current_tag_text) + if cap_alpha == 'Z': + cap_alpha = 'A' + else: + cap_alpha = chr(ord(cap_alpha) + 1) + + elif re.search(r'^\([ivx]+\)', current_tag_text): + previous_li_tag = p_tag + p_tag.name = "li" + cap_alpha = "A" + if re.search(r'^\(i\)', current_tag_text): + inner_roman_ol = self.soup.new_tag("ol", type="i") + p_tag.wrap(inner_roman_ol) + p_tag.find_previous("li").append(inner_roman_ol) + prev_alpha = p_tag.find_previous("li") + p_tag["id"] = f'{prev_alpha.get("id")}i' + else: + cur_tag = re.search(r'^\((?P<cid>[ivx]+)\)', current_tag_text).group("cid") + if inner_roman_ol: + inner_roman_ol.append(p_tag) + p_tag["id"] = f'{prev_alpha.get("id")}{cur_tag}' + + else: + alpha_ol.append(p_tag) + alpha_cur_tag = p_tag + p_tag["id"] = f'{prev_num_id}{cur_tag}' + p_tag.string = re.sub(r'^\((?P<cid>[ivx]+)\)', '', current_tag_text) + + elif re.search(rf'^{sec_alpha}\.', current_tag_text): + previous_li_tag = p_tag + p_tag.name = "li" + sec_alpha_cur_tag = p_tag + + if re.search(r'^a\.', current_tag_text): + sec_alpha_ol = self.soup.new_tag("ol", type="a") + p_tag.wrap(sec_alpha_ol) + if num_tag: + num_tag.append(sec_alpha_ol) + sec_alpha_id = num_tag.get("id") + else: + sec_alpha_id = f'{p_tag.find_previous({"h4", "h3", "h2"}).get("id")}ol{ol_count}{sec_alpha}' + inner_num_head = 1 + else: + sec_alpha_ol.append(p_tag) + if not num_tag: + inner_num_head = 1 + + p_tag["id"] = f'{sec_alpha_id}{sec_alpha}' + p_tag.string = re.sub(rf'^{sec_alpha}\.', '', current_tag_text) + sec_alpha = chr(ord(sec_alpha) + 1) + + elif re.search(rf'^{inr_cap_alpha}\.', current_tag_text) and p_tag.name == "p": + inner_alpha_tag = p_tag + p_tag.name = "li" + inr_cap_alpha_cur_tag = p_tag + inner_num_head = 1 + ol_head = 1 + + if re.search(r'^A\.', current_tag_text): + inr_cap_alpha_ol = self.soup.new_tag("ol", type="A") + p_tag.wrap(inr_cap_alpha_ol) + prev_id = f'{p_tag.find_previous({"h4", "h3", "h2"}).get("id")}ol{ol_count}' + + else: + inr_cap_alpha_ol.append(p_tag) + + p_tag["id"] = f'{prev_id}{inr_cap_alpha}' + p_tag.string = re.sub(rf'^{inr_cap_alpha}\.', '', current_tag_text) + if inr_cap_alpha == 'Z': + inr_cap_alpha = 'A' + else: + inr_cap_alpha = chr(ord(inr_cap_alpha) + 1) + + if re.search(r'^[A-Z]\.\s\(1\)', current_tag_text): + num_ol = 
self.soup.new_tag("ol") + li_tag = self.soup.new_tag("li") + li_tag.string = re.sub(r'^[A-Z]\.\s\(1\)', '', current_tag_text) + li_tag["class"] = self.tag_type_dict['ol_p'] + inner_alpha_id = f'{inr_cap_alpha_cur_tag.get("id")}' + li_tag["id"] = f'{inr_cap_alpha_cur_tag.get("id")}1' + num_ol.append(li_tag) + p_tag.contents = [] + p_tag.append(num_ol) + ol_head = 2 + + elif re.search(rf'^{inner_num_head}\.', current_tag_text) and p_tag.name == "p": + inner_num_tag = p_tag + p_tag.name = "li" + num_tag = p_tag + if re.search(r'^1\.', current_tag_text): + num_ol1 = self.soup.new_tag("ol") + p_tag.wrap(num_ol1) + + if sec_alpha_cur_tag: + sec_alpha_cur_tag.append(num_ol1) + prev_head_id = sec_alpha_cur_tag.get('id') + sec_alpha = 'a' + elif inr_cap_alpha_cur_tag: + inr_cap_alpha_cur_tag.append(num_ol1) + prev_head_id = inr_cap_alpha_cur_tag.get('id') + elif alpha_cur_tag: + alpha_cur_tag.append(num_ol1) + prev_head_id = alpha_cur_tag.get('id') + else: + prev_head_id = p_tag.find_previous({"h5", "h4", "h3", "h2"}).get("id") + else: + num_ol1.append(p_tag) + + if sec_alpha_cur_tag: + sec_alpha = 'a' + if p_tag.find_previous("h4") and re.search(r'^(ARTICLE|Article) [IVX]+', + p_tag.find_previous("h4").text.strip()): + main_sec_alpha = 'a' + + p_tag["id"] = f'{prev_head_id}ol{ol_count}{inner_num_head}' + p_tag.string = re.sub(rf'^{inner_num_head}\.', '', current_tag_text) + inner_num_head += 1 + + if re.search(r'^\d+\.\s*?a\.', current_tag_text): + sec_alpha_ol = self.soup.new_tag("ol", type="a") + li_tag = self.soup.new_tag("li") + li_tag.string = re.sub(r'^\d+\.\s*?a\.', '', current_tag_text) + sec_alpha_cur_tag = li_tag + cur_tag = re.search(r'^(?P<cid>\d+)\.\s*?a\.', current_tag_text) + prev_id = f'{num_tag.get("id")}{cur_tag.group("cid")}' + li_tag["id"] = f'{num_tag.get("id")}{cur_tag.group("cid")}a' + sec_alpha_ol.append(li_tag) + p_tag.string = "" + p_tag.append(sec_alpha_ol) + sec_alpha = "b" + + elif re.search(r'^\([a-z]{2,3}\)', current_tag_text) and p_tag.name != "li": + previous_li_tag = p_tag + curr_id = re.search(r'^\((?P<cur_id>[a-z]+)\)', current_tag_text).group("cur_id") + p_tag.name = "li" + alpha_cur_tag = p_tag + alpha_ol.append(p_tag) + prev_alpha_id = f'{prev_num_id}{curr_id}' + p_tag["id"] = f'{prev_num_id}{curr_id}' + roman_count = 1 + p_tag.string = re.sub(r'^\([a-z]{2,3}\)', '', current_tag_text) + + elif p_tag.get("class") == [self.tag_type_dict['ol_p']] and not re.search(r'^History|^Source', + current_tag_text): + if previous_li_tag: + previous_li_tag.append(p_tag) + + elif "table" in self.tag_type_dict and p_tag.get("class") == [self.tag_type_dict["table"]] and p_tag.span: + if previous_li_tag: + previous_li_tag.append(p_tag) + p_tag["class"] = 'table' + + if re.search(r'^Source|^Cross references:|^OFFICIAL COMMENT|^(ARTICLE|Article) ([IVX]+|\d+)', + current_tag_text, re.I) or p_tag.name in ['h3', 'h4', 'h2']: + main_sec_alpha = 'a' + sec_alpha = 'a' + cap_alpha = 'A' + inr_cap_alpha = 'A' + cap_roman = 'I' + ol_head = 1 + roman_count = 1 + ol_count = 1 + inner_roman_ol = None + num_tag = None + inr_cap_alpha_cur_tag = None + alpha_cur_tag = None + prev_alpha_id = None + article_alpha_tag = None + inner_alpha_tag = None + num_cur_tag = None + cap_alpha_cur_tag = None + sec_alpha_cur_tag = None + previous_li_tag = None + prev_id = None + inner_num_head = 1 + + logger.info("ol tags added") + + def create_analysis_nav_tag(self): + if self.release_number == '76': + rom_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + alpha_ul = self.soup.new_tag("ul", **{"class": 
"leaders"}) + digit_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + rom_tag_id = None + rom_tag = None + alpha_tag = None + alpha_tag_id, a_tag_id = None, None + for case_tag in self.soup.find_all("li", class_='annotation'): + if re.search(rf'^[IVX]+\.', case_tag.text.strip()): + rom_tag = case_tag + if re.search(r'^I\.', case_tag.text.strip()): + if not re.search(r'^H\.', case_tag.find_previous("li").text.strip()): + rom_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + case_tag.wrap(rom_ul) + rom_tag_id = f'#{case_tag.find_previous("h3").get("id")}-annotation-I' + a_tag_id = f'#{case_tag.find_previous("h3").get("id")}-annotation-I' + else: + alpha_tag = case_tag + alpha_ul.append(case_tag) + alpha_tag_id = f'{rom_tag_id}-I' + a_tag_id = f'{rom_tag_id}-I' + elif re.search(r'^II\.', case_tag.text.strip()): + if case_tag.find_previous().name == 'a': + rom_ul.append(case_tag) + rom_num = re.search(r'^(?P<rid>[IVX]+)\.', case_tag.text.strip()).group("rid") + rom_tag_id = f'#{case_tag.find_previous("h3").get("id")}-annotation-{rom_num}' + a_tag_id = f'#{case_tag.find_previous("h3").get("id")}-annotation-{rom_num}' + else: + rom_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + case_tag.wrap(rom_ul) + rom_tag_id = f'#{case_tag.find_previous("h3").get("id")}-annotation-II' + a_tag_id = f'#{case_tag.find_previous("h3").get("id")}-annotation-II' + + else: + rom_ul.append(case_tag) + rom_num = re.search(r'^(?P<rid>[IVX]+)\.', case_tag.text.strip()).group("rid") + rom_tag_id = f'#{case_tag.find_previous("h3").get("id")}-annotation-{rom_num}' + a_tag_id = f'#{case_tag.find_previous("h3").get("id")}-annotation-{rom_num}' + + elif re.search(r'^[A-Z]\.', case_tag.text.strip()): + alpha_tag = case_tag + if re.search(r'^A\.', case_tag.text.strip()): + alpha_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + case_tag.wrap(alpha_ul) + rom_tag.append(alpha_ul) + else: + alpha_ul.append(case_tag) + + alpha = re.search(r'^(?P<aid>[A-Z])\.', case_tag.text.strip().strip()) + alpha_tag_id = f'{rom_tag_id}-{alpha.group("aid")}' + a_tag_id = f'{rom_tag_id}-{alpha.group("aid")}' + + elif re.search(r'^\d+\.', case_tag.text.strip().strip()): + if re.search(r'^1\.', case_tag.text.strip().strip()): + digit_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + case_tag.wrap(digit_ul) + alpha_tag.append(digit_ul) + else: + digit_ul.append(case_tag) + digit = re.search(r'^(?P<nid>\d+)\.', case_tag.text.strip().strip()).group("nid") + a_tag_id = f'{alpha_tag_id}-{digit}' + + anchor = self.soup.new_tag('a', href=a_tag_id) + anchor.string = case_tag.text + case_tag.string = '' + case_tag.append(anchor) + + else: + super(COParseHtml, self).create_annotation_analysis_nav_tag() + logger.info("Annotation analysis nav created") + + def wrap_inside_main_tag(self): + + """wrap inside main tag""" + + main_tag = self.soup.new_tag('main') + chap_nav = self.soup.find('nav') + + h2_tag = self.soup.find("h2") + tag_to_wrap = h2_tag.find_previous_sibling() + + for tag in tag_to_wrap.find_next_siblings(): + tag.wrap(main_tag) + + for nav_tag in chap_nav.find_next_siblings(): + if nav_tag.name != "main": + nav_tag.wrap(chap_nav) + + def replace_tags_constitution(self): + for p_tag in self.soup.find_all(class_=self.tag_type_dict['head3']): + current_p_tag = p_tag.text.strip() + next_sibling = p_tag.find_next_sibling() + if re.search('^§', current_p_tag): + if p_tag.b and re.search('^§', p_tag.b.text.strip()): + new_h3_tag = self.soup.new_tag("p") + new_h3_tag.attrs["class"] = self.tag_type_dict['head3'] + h3_text = 
p_tag.b.text + new_h3_tag.string = h3_text + p_tag.insert_before(new_h3_tag) + p_tag["class"] = self.tag_type_dict['head3'] + if not re.search(r'^Constitution of the State of Colorado', p_tag.find_next("b").text.strip()): + p_tag.find_next("b").decompose() + + else: + new_h3_tag = self.soup.new_tag("p") + new_h3_tag["class"] = self.tag_type_dict['head3'] + h3_text = "§ " + p_tag.find_next("b").text + new_h3_tag.string = h3_text + p_tag.insert_before(new_h3_tag) + if not re.search(r'^Constitution of the State of Colorado', p_tag.find_next("b").text.strip()): + p_tag.find_next("b").decompose() + if re.search(r'^§', p_tag.text.strip()): + p_tag.string = re.sub(r'^§', '', p_tag.text.strip()) + + super(COParseHtml, self).replace_tags_constitution() + cap_roman = "I" + rom_id = None + for header_tag in self.soup.find_all("p"): + if header_tag.get("class") == [self.tag_type_dict["head2"]] or \ + header_tag.get("class") == [self.tag_type_dict["amd"]]: + if re.search(r'^PREAMBLE|^AMENDMENTS|^Schedule', header_tag.text.strip(), re.I): + header_tag.name = "h2" + tag_text = re.sub(r'[\W\s]+', '', header_tag.text.strip()).lower() + header_tag["id"] = f"{header_tag.find_previous('h1').get('id')}-{tag_text}" + header_tag["class"] = "gen" + elif header_tag.get("class") == [self.tag_type_dict["art_head"]]: + if self.regex_pattern_obj.h2_article_pattern_con.search(header_tag.text.strip()): + header_tag.name = "h3" + chap_no = self.regex_pattern_obj.h2_article_pattern_con.search(header_tag.text.strip()).group('id') + header_tag["id"] = f'{header_tag.find_previous("h2").get("id")}-am{chap_no.zfill(2)}' + header_tag["class"] = "amend" + self.ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + elif header_tag.get("class") == [self.tag_type_dict["section"]]: + if re.search(r'^Section \d+(\.?\w)*\.', header_tag.text.strip()): + chap_num = re.search(r'^Section (?P<id>\d+(\.?\w)*)\.', header_tag.text.strip()).group("id") + header_tag.name = "h3" + header_tag[ + "id"] = f"{header_tag.find_previous('h2', class_={'oneh2', 'twoh2', 'threeh2', 'gen'}).get('id')}-sec{chap_num.zfill(2)}" + header_tag["class"] = "sec" + + elif header_tag.get("class") == [self.tag_type_dict["head4"]]: + if re.search(rf'^{cap_roman}\.', header_tag.text.strip()): + header_tag.name = "h5" + chap_num = re.search(r'^(?P<id>[IVX]+)\.', header_tag.text.strip()).group("id") + rom_id = f'{header_tag.find_previous("h3").get("id")}-annotation-{chap_num}' + header_tag["id"] = f'{header_tag.find_previous("h3").get("id")}-annotation-{chap_num}' + cap_roman = roman.toRoman(roman.fromRoman(cap_roman.upper()) + 1) + + elif re.search(r'^[A-Z]\.\s"?[A-Z][a-z]+', header_tag.text.strip()): + header_tag.name = "h5" + prev_id = rom_id + chap_num = re.search(r'^(?P<id>[A-Z])\.', header_tag.text.strip()).group("id") + header_tag["id"] = f'{prev_id}-{chap_num}' + + elif re.search(r'^[1-9]\.', header_tag.text.strip()): + header_tag.name = "h5" + if header_tag.find_previous( + lambda tag: tag.name in ['h5'] and re.search(r'^[A-Z]\.', + tag.text.strip())): + + prev_id = header_tag.find_previous( + lambda tag: tag.name in ['h5'] and re.search(r'^[A-Z]\.', + tag.text.strip())).get("id") + chap_num = re.search(r'^(?P<id>[0-9])\.', header_tag.text.strip()).group("id") + header_tag["id"] = f'{prev_id}-{chap_num}' + else: + header_tag["class"] = [self.tag_type_dict['ol_p']] + + if re.search(r'^Analysis$', header_tag.text.strip()): + cap_roman = "I" + + if self.regex_pattern_obj.section_pattern_con.search(header_tag.text.strip()): + header_tag.name = "h3" + chap_no = 
self.regex_pattern_obj.section_pattern_con.search(header_tag.text.strip()).group('id') + if header_tag.find_previous("h3", class_={"oneh2", "gen", "amend"}): + header_tag[ + "id"] = f'{header_tag.find_previous("h3", class_={"oneh2", "gen", "amend"}).get("id")}-s{chap_no.zfill(2)}' + else: + header_tag[ + "id"] = f'{header_tag.find_previous("h2", class_={"oneh2", "gen", "amd"}).get("id")}-s{chap_no.zfill(2)}' + + self.ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + + def add_anchor_tags_con(self): + super(COParseHtml, self).add_anchor_tags_con() + for li in self.soup.find_all("li"): + if not li.get("id"): + if re.search(r'^[IVX]+', li.text.strip()): + chap_num = re.search(r'^(?P<id>[IVX]+)', li.text.strip()).group("id") + self.c_nav_count += 1 + + if self.release_number in ['71', '74']: + if li.find_previous({"h1", "h2"}) and \ + re.search(r'^AMENDMENTS$', li.find_previous({"h1", "h2"}).text.strip()): + tag = "-am" + else: + tag = 'ar' + self.set_chapter_section_id(li, chap_num, + sub_tag=tag, + prev_id=li.find_previous({"h1", "h2"}).get("id"), + cnav=f'cnav{self.c_nav_count:02}') + + else: + self.set_chapter_section_id(li, chap_num, + sub_tag="ar", + prev_id=li.find_previous({"h1", "h2"}).get("id"), + cnav=f'cnav{self.c_nav_count:02}') + + elif re.search(r'^Section \d+(\.?\w)*\.', li.text.strip()): + chap_num = re.search(r'^Section (?P<id>\d+(\.?\w)*)\.', li.text.strip()).group("id") + self.c_nav_count += 1 + self.set_chapter_section_id(li, chap_num.zfill(2), + sub_tag="-sec", + prev_id=li.find_previous("h2").get("id"), + cnav=f'cnav{self.c_nav_count:02}') + elif re.search(r'^\d+(\.?\w)*\.', li.text.strip()): + chap_num = re.search(r'^(?P<id>\d+(\.?\w)*)\.', li.text.strip()).group("id") + self.c_nav_count += 1 + if self.release_number in ['74']: + tag = "-sec" + else: + tag = "-s" + self.set_chapter_section_id(li, chap_num.zfill(2), + sub_tag=tag, + prev_id=li.find_previous("h2").get("id"), + cnav=f'cnav{self.c_nav_count:02}') + + def creating_formatted_table(self): + tbl_head = [] + count = 1 + tbl_text = None + new_row_tag = self.soup.new_tag('div', style="flex-basis: 10%;") + + if self.file_no == '06': + tbl_text = ['Table 1', 'Table 2'] + + for tag in self.soup.find_all(class_="table"): + if self.file_no == '06' and re.search(r'^Table 2', tag.text.strip()): + colum_count = len(tag.find_all("b")) + newRow = self.soup.new_tag('li', style="border-radius: 3px;padding: 20px 25px;" + "display: flex;justify-content:space-evenly;margin-bottom: 05px;") + newTable = self.soup.new_tag('ul') + new_p_tag = self.soup.new_tag('p') + new_p_tag.string = re.search(r'^Table 2', tag.text.strip()).group() + tag.insert_before(new_p_tag) + tag.string = re.sub(r'^Table 2', '', tag.text.strip()) + + tbl_data = tag.text.split('\n') + + count = 1 + for data in tbl_data: + if len(data.strip()) > 0 and data.strip() not in tbl_text: + if data.strip() not in ['w', 'r', 'f']: + if 0 < count <= colum_count: + count += 1 + else: + newRow = self.soup.new_tag('li', style="border-radius: 3px;padding: 20px 25px;" + "display: flex;justify-content:space-evenly" + ";margin-bottom: 05px;") + count = 2 + + new_row_tag = self.soup.new_tag('div', style="flex-basis: 10%;") + new_row_tag.append(data) + newRow.append(new_row_tag) + newTable.append(newRow) + else: + new_sub_tag = self.soup.new_tag('sub') + new_sub_tag.string = data + new_row_tag.append(new_sub_tag) + + tag.replace_with(newTable) + + elif self.file_no not in ['06'] and tag.span and re.search(r's\d', str(tag.span.get("class"))): + colum_count = 
len(tag.find_all("b"))
+                newRow = self.soup.new_tag('li', style="border-radius: 3px;padding: 20px 25px;"
+                                                       "display: flex;justify-content:space-evenly;margin-bottom: 05px;")
+                newTable = self.soup.new_tag('ul')
+                tbl_head_row = self.soup.new_tag('li', style="border-radius: 1px;padding: 10px 15px;"
+                                                             "display: flex;justify-content:space-evenly;")
+
+                for head_tag in tag.find_all("b"):
+                    new_head_tag = self.soup.new_tag('div', style="flex-basis: 10%;color: #000000;")
+                    new_head_tag.append(head_tag.text.strip())
+                    tbl_head_row.append(new_head_tag)
+                    newTable.append(tbl_head_row)
+                    tbl_head.append(head_tag.text.strip())
+
+                tbl_data = tag.text.split('\n')
+                tbl_data = [i for i in tbl_data if i]
+                for data in tbl_data:
+                    if len(data.strip()) > 0:
+                        if 0 < count <= colum_count:
+                            count += 1
+                        else:
+                            newRow = self.soup.new_tag('li', style="border-radius: 3px;padding: 20px 25px;display:flex;"
+                                                                   "justify-content:space-evenly;margin-bottom: 05px;")
+                            count = 2
+
+                        new_row_tag = self.soup.new_tag('div', style="flex-basis: 10%;")
+                        new_row_tag.append(data)
+                        newRow.append(new_row_tag)
+                        newTable.append(newRow)
+
+                tag.replace_with(newTable)
diff --git a/html_parser_framework/html_parse_runner.py b/html_parser_framework/html_parse_runner.py
new file mode 100644
index 0000000..bca6e17
--- /dev/null
+++ b/html_parser_framework/html_parse_runner.py
@@ -0,0 +1,214 @@
+"""
+ - run this file with the command-line args state_key and path
+ - state_key and path are mandatory; run_after_release is optional
+ - path can point to a single file, to all files of a single release, or to all files of a state.
+"""
+
+import argparse
+import concurrent
+import glob
+import importlib
+import multiprocessing
+import re
+import traceback
+from datetime import datetime
+
+import roman
+from loguru import logger
+from concurrent.futures import ProcessPoolExecutor
+import os
+import logging
+from bs4 import BeautifulSoup
+
+
+def start_parsing(arguments):
+    """
+    - validate the command-line arguments
+    - based on the given path, decide which files are parsed:
+      a single file, all files of a particular release, or
+      all files of a particular state.
+    """
+
+    release_number = None
+    cpu_count = multiprocessing.cpu_count()
+    file_list = []
+    state_key = arguments.state_key
+    path = arguments.path
+    run_after_release = arguments.run_after_release
+
+    script = f'{state_key.lower()}_html_parser'
+    class_name = f'{state_key}ParseHtml'
+    parser_obj = getattr(importlib.import_module(script), class_name)
+
+    if os.path.exists(path):  # validate the given path
+        if os.path.isfile(path):  # single-file mode: the given path is one HTML file
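+            # Illustrative sketch of the single-file mode (the path below is an
+            # assumed example, derived from the r<NN> directory convention that the
+            # release-number regex on the next line expects and the oc<state> layout
+            # used in add_cite_to_file):
+            #   python html_parser_framework/html_parse_runner.py --state_key KY \
+            #       --path .../ky/ocky/r83/gov.ky.krs.title.01.html
+            # parses that one file as release 83, read from the parent directory name.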
+            input_file_name = os.path.basename(path)
+            release_number = re.search(r'/r(?P<rid>\d+)', os.path.dirname(path)).group("rid")
+            soup_obj = parser_obj(state_key, path, release_number, input_file_name).run()
+            id_dictionary = getting_header_id_dict(state_key, release_number)
+            add_cite_to_file(soup_obj[0], soup_obj[1], state_key, release_number, input_file_name, id_dictionary)
+
+        else:
+            subdirectories_files = [x for x in glob.glob(f'{path}/**', recursive=True) if os.path.isfile(x)]
+            if run_after_release:
+                run_after_release = int(run_after_release)
+                for file in subdirectories_files:
+                    release_number = int(re.search(r'(?P<rnum>\d+)$', os.path.dirname(file)).group("rnum"))
+                    if release_number >= run_after_release:
+                        file_list.append(file)
+            else:
+                file_list += subdirectories_files
+    else:
+        logging.error(f'Invalid path: {path}')
+
+    with ProcessPoolExecutor(cpu_count) as executor:
+        future_list = []
+        for file in file_list:
+            release_number = re.search(r'(?P<r_num>\d+)$', os.path.dirname(file)).group("r_num")
+            future_obj = [executor.submit(parser_obj(state_key, file, release_number, os.path.basename(file)).run)]
+            future_list.append({future_obj[0]: [os.path.basename(file), release_number]})
+
+        executor.shutdown(wait=True)
+        id_dictionary = getting_header_id_dict(state_key, release_number)
+
+        for item in future_list:
+            for future_obj in concurrent.futures.as_completed(item.keys()):
+                try:
+                    s, m = future_obj.result()
+                    add_cite_to_file(s, m, state_key, item[future_obj][1],
+                                     os.path.basename(item[future_obj][0]), id_dictionary)
+                except Exception as exc:
+                    exception_on = f'{exc}\n------------------------\n' \
+                                   f'{item[future_obj][0]}'
+                    logging.error(f'{exception_on}\n{traceback.format_exc()}')
+
+
+def getting_header_id_dict(state_key, release_number):
+    # build the cite-id lookup (header/cite key -> anchor id) from the files
+    # written per title while parsing; each line holds one "key value" pair
+    id_dictionary = {}
+    id_files = os.listdir(f'{state_key}_cite_id/{state_key}{release_number}')
+    for file in id_files:
+        with open(f'{state_key}_cite_id/{state_key}{release_number}/{file}') as f:
+            for line in f:
+                (key, value) = line.split()
+                id_dictionary[key] = value
+    return id_dictionary
+
+
+def add_cite_to_file(soup_obj, meta_tag, state_key, release_number, input_file_name, id_dictionary):
+    cite_parser_obj = getattr(importlib.import_module('regex_pattern'), f'CustomisedRegex{state_key}')()
+    soup = BeautifulSoup(soup_obj, "html.parser")
+
+    cite_p_tags = []
+    for tag in soup.findAll(
+            lambda tag: getattr(cite_parser_obj, "cite_tag_pattern").search(tag.get_text()) and tag.name in ['p', 'li']
+            and tag not in cite_p_tags and not tag.a and tag.parent.name != 'ul'):
+        cite_p_tags.append(tag)
+        text = str(tag)
+
+        for match in set(x[0] for x in getattr(cite_parser_obj, "cite_pattern").findall(tag.text.strip())):
+
+            inside_text = re.sub(r'<p\sclass="\w\d+">|</p>|<b>|</b>|<p>|<p.+>|^<li id="[a-z.A-Z\d-]+">|</li>$', '',
+                                 text, flags=re.DOTALL)
+            id_reg = getattr(cite_parser_obj, "cite_pattern").search(match.strip())
+
+            if re.search(r'^(?P<name>[a-zA-Z.]+\.)(?P<tid>(\d+(\.\w)*)|(\d+\w*)|(\d+[A-Z]?\.\d+[A-Z]?))\.html$',
+                         input_file_name.strip()):
+                file_name_pattern = re.search(
+                    r'^(?P<name>[a-zA-Z.]+\.)(?P<tid>(\d+(\.\w)*)|(\d+\w*)|(\d+[A-Z]?\.\d+[A-Z]?))\.html$',
+                    input_file_name.strip())
+                title_id = file_name_pattern.group("tid").zfill(2)
+                file_name = file_name_pattern.group("name")
+            else:
+                file_name = f"{re.search(r'^(?P<name>[a-zA-Z.]+)constitution', input_file_name.strip()).group('name')}title."
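+                # Illustrative note (file names assumed, matching the <name><tid>.html
+                # pattern above): for input_file_name = "gov.ky.krs.title.01.html" the
+                # first branch yields name = "gov.ky.krs.title." and tid = "01", so a
+                # citation resolved to title "03" is linked further below as
+                # "gov.ky.krs.title.03.html#<cite_id>"; constitution files carry no
+                # title number, hence title_id is set to None on the next line.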
+ title_id = None + + if state_key == "NC": + if re.search(r'^\d{3}[A-Z]*', id_reg.group("title").strip()): + cite_title_id = id_reg.group("title").strip() + elif re.search(r'^\d{2}[A-Z]*', id_reg.group("title").strip()): + cite_title_id = f'0{id_reg.group("title").strip()}' + else: + cite_title_id = f'00{id_reg.group("title").strip()}' + elif state_key == 'VA': + if re.search(r'^\d\.\d[A-Z]?', id_reg.group("title").strip()): + title_reg = re.search(r'^(?P<id1>\d)\.(?P<id2>\d[A-Z]?)', id_reg.group("title").strip()) + if id_reg.group("title").strip() in ['2.1', '2.2', '3.1', '3.2', '4.1', '5.1', '6.1', '7.1', '6.2']: + cite_title_id = f'0{title_reg.group("id1")}.{title_reg.group("id2")}' + else: + cite_title_id = f'0{title_reg.group("id1")}.0{title_reg.group("id2")}' + else: + cite_title_id = id_reg.group("title").strip().zfill(2) + else: + cite_title_id = id_reg.group("title").strip().zfill(2) + + if id_reg.group("ol"): + ol_id = re.sub(r'[() ]+', '', id_reg.group("ol")) + cite_pattern = f'{id_reg.group("cite")}ol1{ol_id}' + else: + cite_pattern = id_reg.group("cite") + + if cite_pattern in id_dictionary: + cite_id = id_dictionary[cite_pattern] + if state_key == "KY": + t_id = re.search(r'^t0?(?P<tid>[IVXL]+)', cite_id).group("tid") + cite_title_id = roman.fromRoman(t_id.upper()) + cite_title_id = f'{cite_title_id:02}' + if cite_title_id == title_id: + target = "_self" + a_id = f'#{cite_id}' + else: + target = "_blank" + a_id = f'{file_name}{cite_title_id}.html#{cite_id}' + tag.clear() + text = re.sub(fr'\s{re.escape(match)}', + f' <cite class="oc{state_key.lower()}"><a href="{a_id}" target="{target}">{match}</a></cite>', + inside_text, re.I) + + tag.append(BeautifulSoup(text, features="html.parser")) + + elif not os.path.exists( + f'{state_key}_cite_id/{state_key}{release_number}/{state_key}{release_number}_{cite_title_id}_ids.txt'): + logger.error(f"parsing {file_name}{cite_title_id}.html is incomplete....unable to add citation") + + for match in set( + x[0] for x in getattr(cite_parser_obj, "code_pattern").findall(tag.text.strip())): + inside_text = re.sub(r'^<p.*>|</p>$|^<li id="[a-z.A-Z\d-]+">|</li>$', '', text, re.DOTALL) + tag.clear() + text = re.sub(re.escape(match), f'<cite class="{state_key.lower()}_code">{match}</cite>', inside_text, re.I) + tag.append(BeautifulSoup(text, features="html.parser")) + + for li_tag in soup.findAll("li"): + if re.search(r'^<li.+><li.+>', str(li_tag).strip()): + li_tag_text = re.sub(r'^\[<li.+>|</li>]$', '', str(li_tag.contents)) + li_tag.clear() + li_tag.append(BeautifulSoup(li_tag_text, features="html.parser")) + + soup_str = str(soup.prettify()) + for tag in meta_tag: + cleansed_tag = re.sub(r'/>', ' />', str(tag)) + soup_str = re.sub(rf'{tag}', rf'{cleansed_tag}', soup_str, re.I) + + with open( + f"/home/mis/PycharmProjects/cic_code_framework/transforms_output/{state_key.lower()}/oc{state_key.lower()}" + f"/r{release_number}/{input_file_name}", "w") as file: + soup_str = re.sub(r'<span class.*?>\s*</span>|<p>\s*</p>|<b>\s*</b>', '', soup_str) + file.write(soup_str) + + print("cite added", input_file_name) + + +if __name__ == '__main__': + """ + - Parse the command line args + - set environment variables using parsed command line args + - Call start parsing method with args as arguments + """ + start_time = datetime.now() + logger.info(start_time) + parser = argparse.ArgumentParser() + parser.add_argument("--state_key", help="State of which parser should be run", required=True, type=str) + parser.add_argument("--path", help="file path which needs 
to be parsed", required=True, type=str) + parser.add_argument("--run_after_release", help="particular files which needs to be parsed", type=str) + args = parser.parse_args() + start_parsing(args) + logger.info(datetime.now() - start_time) diff --git a/html_parser_framework/ky_html_parser.py b/html_parser_framework/ky_html_parser.py new file mode 100644 index 0000000..6dabec1 --- /dev/null +++ b/html_parser_framework/ky_html_parser.py @@ -0,0 +1,711 @@ +import re +import roman +from base_html_parser import ParseHtml +from regex_pattern import CustomisedRegexKY +from loguru import logger + + +class KYParseHtml(ParseHtml): + + def __init__(self, state_key, path, release_number, input_file_name): + super().__init__(state_key, path, release_number, input_file_name) + self.am_nav_count = None + self.nd_list = [] + + def pre_process(self): + if re.search('constitution', self.input_file_name): + self.tag_type_dict: dict = {'ul': '^(§ )|^(ARTICLE)', 'head2': '^(§|ARTICLE|PREAMBLE)', + 'head1': '^(CONSTITUTION OF KENTUCKY)|^(THE CONSTITUTION OF THE UNITED STATES OF AMERICA)', + 'head3': r'^(Section|§)', + 'junk1': '^(Text)', 'ol_p': r'^(\(1\))', + 'head4': '^(NOTES TO DECISIONS)|^(Compiler’s Notes.)'} + self.h2_order: list = ['article'] + self.h2_text_con: list = [] + self.file_no = None + else: + self.tag_type_dict: dict = {'ul': '^CHAPTER', 'head2': '^CHAPTER', + 'head1': '^(TITLE)|^(CONSTITUTION OF KENTUCKY)', + 'head3': r'^([^\s]+[^\D]+)', + 'junk1': '^(Text)', 'ol_p': r'^(\(1\))', 'head4': '^(NOTES TO DECISIONS)', + 'nd_nav': r'^1\.'} + self.h2_text: list = [] + self.file_no = re.search(r'gov\.ky\.krs\.title\.(?P<fno>\w+(\.\d)*)\.html', self.input_file_name).group( + "fno") + + if self.file_no in ['33']: + self.h2_order: list = ['chapter', 'subchapter', 'article', 'part'] + elif self.file_no in ['12']: + self.h2_order: list = ['chapter'] + + else: + self.h2_order: list = ['chapter', 'article', 'part', 'subpart'] + + if self.file_no in ['29']: + self.h2_rename_pattern = [r'^(?P<tag>A)rticle (?P<id>3)\. Negotiable Instruments', + '^(?P<tag>A)rticle (?P<id>4)\. Bank Deposits and Collections', + '^(?P<tag>A)rticle (?P<id>4A)\. Funds Transfers', + '^(?P<tag>A)rticle (?P<id>5)\. Letters of Credit', + '^(?P<tag>A)rticle (?P<id>6)\. Bulk Transfers', + '^(?P<tag>A)rticle (?P<id>7)\. Warehouse Receipts, Bills of Lading, and Other Documents ' + 'of Title', + '^(?P<tag>A)rticle (?P<id>8)\. Investment Securities', + '^(?P<tag>A)rticle (?P<id>9)\. Secured Transactions — Sales of Accounts, Contract Rights ' + 'and Chattel Paper\.', + '^(?P<tag>A)rticle (?P<id>10)\. Other Provisions', + '^(?P<tag>A)rticle (?P<id>11)\. Transition'] + + self.h4_head: list = ['NOTES TO UNPUBLISHED DECISIONS', 'Official Comment', 'History.', + 'Compiler’s Notes.', 'NOTES TO DECISIONS', 'Notes to Unpublished Decisions'] + + self.watermark_text = """Release {0} of the Official Code of Kentucky Annotated released {1} + Transformed and posted by Public.Resource.Org using cic-beautify-state-codes.py version 1.4 on {2}. + This document is not subject to copyright and is in the public domain. 
+ """ + + self.regex_pattern_obj = CustomisedRegexKY() + + def replace_tags_titles(self): + repeated_header_list = [] + nd_tag_text = [] + + for li_tag in self.soup.findAll(class_=self.tag_type_dict["ul"]): + if self.file_no in ['18', '12']: + if not re.search(r'^(chapter|article|part|subpart)', li_tag.text.strip(), re.I): + li_tag_text = re.sub(r'\W+', '', li_tag.text.strip()) + self.h2_text.append(li_tag_text) + else: + if not re.search(r'^(chapter|subchapter|article|part|subpart)', li_tag.text.strip(), re.I): + li_tag_text = re.sub(r'\W+', '', li_tag.text.strip()) + self.h2_text.append(li_tag_text) + + super(KYParseHtml, self).replace_tags_titles() + + for p_tag in self.soup.find_all(): + if p_tag.name == "p": + if p_tag.get("class") == [self.tag_type_dict["ul"]]: + p_tag.name = "li" + p_tag.wrap(self.ul_tag) + + elif p_tag.get("class") == [self.tag_type_dict["head4"]]: + if re.match(r'^—?\d{1,3}\D\.?(\d\.)*', p_tag.text.strip()) \ + and not re.match(r'^(\d+\D\.\d\d+)|^\d+-', p_tag.text.strip()) and \ + p_tag.find_previous("h4") and \ + re.search(r'^NOTES TO DECISIONS$|^Notes to Unpublished Decisions$', + p_tag.find_previous("h4").text.strip(), re.I): + p_tag.name = "h5" + sub_sec_text = re.sub(r'\W+', '', p_tag.get_text()).lower() + nd_tag_text.append(sub_sec_text) + + if not re.match(r'^(\d+\.?\s*[—-])|^(—\d+\.?\s*[—-])', p_tag.text.strip()): + prev_head_tag = p_tag.find_previous("h4").get("id") + sub_sec_id = f"{prev_head_tag}-{sub_sec_text}" + if sub_sec_id in repeated_header_list: + sub_sec_id = f"{prev_head_tag}-{sub_sec_text}.01" + else: + sub_sec_id = f"{prev_head_tag}-{sub_sec_text}" + p_tag["id"] = sub_sec_id + repeated_header_list.append(sub_sec_id) + + elif re.match(r'^(—?\d+\.?\s*—\s*[“a-zA-Z\d.]+)', p_tag.text.strip()): + prev_sub_tag = sub_sec_id + if self.release_number == '83' and self.file_no == '08' and \ + re.search(r'^—2\.— Burden of Proof\.', p_tag.text.strip()): + inner_sec_id1 = f"{p_tag.find_previous('h4').get('id')}-{sub_sec_text}" + else: + inner_sec_id1 = f"{prev_sub_tag}-{sub_sec_text}" + if inner_sec_id1 in repeated_header_list: + inner_sec_id1 = f"{inner_sec_id1}.01" + else: + inner_sec_id1 = f"{inner_sec_id1}" + p_tag["id"] = inner_sec_id1 + repeated_header_list.append(inner_sec_id1) + + elif re.match(r'^(—?\d+\.?\s*—\s*—\s*[“a-zA-Z\d]+)', p_tag.text.strip()): + prev_child_tag = inner_sec_id1 + innr_sec_id2 = f"{prev_child_tag}-{sub_sec_text}" + + if innr_sec_id2 in repeated_header_list: + innr_sec_id2 = f"{innr_sec_id2}.01" + else: + innr_sec_id2 = f"{innr_sec_id2}" + + p_tag["id"] = innr_sec_id2 + repeated_header_list.append(innr_sec_id2) + + elif re.match(r'^(—?\d+\.?\s*—\s*—\s*—\s*[“a-zA-Z\d]+)', p_tag.text.strip()): + prev_child_id1 = innr_sec_id2 + innr_subsec_header_tag_id = f"{prev_child_id1}-{sub_sec_text}" + + if innr_subsec_header_tag_id in repeated_header_list: + innr_subsec_header_tag_id = f"{innr_subsec_header_tag_id}.01" + else: + innr_subsec_header_tag_id = f"{innr_subsec_header_tag_id}" + + p_tag["id"] = innr_subsec_header_tag_id + repeated_header_list.append(innr_subsec_header_tag_id) + + elif p_tag.get("class") == [self.tag_type_dict["ol_p"]] or \ + p_tag.get("class") == [self.tag_type_dict["nd_nav"]]: + if self.regex_pattern_obj.h2_article_pattern.search(p_tag.text.strip()): + self.replace_h4_tag_titles(p_tag, None, + self.regex_pattern_obj.h2_article_pattern.search( + p_tag.text.strip()).group( + "id")) + elif self.regex_pattern_obj.h5_section_pattern.search(p_tag.text.strip()): + if self.file_no not in ['08', '11']: + p_tag.name = 
"h5" + p_tag[ + "id"] = f'{p_tag.find_previous({"h4", "h3"}).get("id")}sec{self.regex_pattern_obj.h5_section_pattern.search(p_tag.text.strip()).group("id")}' + + elif p_tag.name == "h4" and re.search(r'^NOTES TO DECISIONS$|^Notes to Unpublished Decisions$', + p_tag.text.strip(), re.I): + for tag in p_tag.find_next_siblings(): + if tag.get("class") == [self.tag_type_dict["ol_p"]]: + if not re.search(r'^(Analysis|Cited|Compiler’s Notes\.)', tag.text.strip()): + tag.name = "li" + tag["class"] = "note" + else: + break + + def h2_set_id(self, header_tag): + h2_id_count = 1 + header_tag.name = "h2" + p_tag_text = re.sub(r'\W+', '', header_tag.text.strip()).lower() + if self.file_no == '29': + header_tag_id = f'{header_tag.find_previous(class_={"oneh2", "title", "threeh2"}).get("id")}-{p_tag_text}' + else: + header_tag_id = f'{header_tag.find_previous(class_={"oneh2", "title"}).get("id")}-{p_tag_text}' + + if header_tag_id in self.h2_rep_id: + header_tag["id"] = f'{header_tag_id}.{h2_id_count:02}' + h2_id_count += 1 + else: + header_tag["id"] = f'{header_tag_id}' + h2_id_count = 1 + header_tag["class"] = "gen" + self.h2_rep_id.append(header_tag['id']) + + def add_anchor_tags(self): + super(KYParseHtml, self).add_anchor_tags() + for li_tag in self.soup.findAll(): + if li_tag.name == "li" and not li_tag.get("id"): + if re.search(r'^APPENDIXRULES', li_tag.text.strip()): + chap_num = re.sub(r'\W+', '', li_tag.text.strip()).lower() + sub_tag = 'apr' + prev_id = li_tag.find_previous("h1").get("id") + self.c_nav_count += 1 + cnav = f'cnav{self.c_nav_count:02}' + self.set_chapter_section_id(li_tag, chap_num, sub_tag, prev_id, cnav) + + elif li_tag.name in ['h2', 'h3', 'h4']: + self.a_nav_count = 0 + self.c_nav_count = 0 + self.p_nav_count = 0 + self.s_nav_count = 0 + + def ol_count_increment(self, current_id, ol_count): + if current_id in self.ol_list: + ol_count += 1 + else: + ol_count = 1 + return ol_count + + def convert_paragraph_to_alphabetical_ol_tags(self): + """ + For each tag which has to be converted to orderd list(<ol>) + - create new <ol> tags with appropriate type (1, A, i, a ..) 
+ - get previous headers id to set unique id for each list item (<li>) + - append each li to respective ol accordingly + """ + main_sec_alpha = 'a' + cap_alpha = 'A' + ol_head = 1 + num_count = 1 + alpha_ol = self.soup.new_tag("ol", type="a") + cap_alpha_ol = self.soup.new_tag("ol", type="A") + cap_roman_ol = self.soup.new_tag("ol", type="I") + num_ol = self.soup.new_tag("ol") + num_ol1 = self.soup.new_tag("ol") + innr_alpha_ol = self.soup.new_tag("ol", type="a") + roman_ol = self.soup.new_tag("ol", type="i") + ol_count = 1 + self.ol_list = [] + ol_head1 = 1 + inner_num_count = 1 + sec_alpha = 'a' + small_roman = "i" + cap_rom = "I" + inner_sec_alpha = "a" + cap_roman_cur_tag = None + prev_head_id = None + prev_num_id = None + num_cur_tag = None + alpha_cur_tag = None + cap_alpha_cur_tag = None + prevnum_id = None + prev_id = None + prevnum_id1 = None + prev_id1 = None + alpha_cur_tag1 = None + previous_li_tag = None + num_tag = None + + for p_tag in self.soup.body.find_all(['h2', 'h3', 'h4', 'h5', 'p']): + if p_tag.i: + p_tag.i.unwrap() + if p_tag.span: + p_tag.span.unwrap() + + current_tag_text = p_tag.text.strip() + + if p_tag.name == "h3": + num_cur_tag = None + + if re.search(rf'^\({ol_head}\)', current_tag_text): + p_tag.name = "li" + num_cur_tag = p_tag + alpha_cur_tag = None + + main_sec_alpha = "a" + if re.search(r'^\(1\)', current_tag_text): + num_ol = self.soup.new_tag("ol") + p_tag.wrap(num_ol) + + if cap_roman_cur_tag: + cap_roman_cur_tag.append(num_ol) + prev_num_id = f'{cap_roman_cur_tag.get("id")}' + cap_alpha = 'A' + + elif cap_alpha_cur_tag: + cap_alpha_cur_tag.append(num_ol) + prev_num_id = f'{cap_alpha_cur_tag.get("id")}' + + elif alpha_cur_tag1: + alpha_cur_tag1.append(num_ol) + prev_num_id = f'{alpha_cur_tag1.get("id")}' + + else: + cap_alpha = 'A' + ol_count = self.ol_count_increment( + f'{p_tag.find_previous({"h5", "h4", "h3"}).get("id")}ol{ol_count}', + ol_count) + prev_num_id = f'{p_tag.find_previous({"h5", "h4", "h3"}).get("id")}ol{ol_count}' + self.ol_list.append(prev_num_id) + + else: + num_ol.append(p_tag) + if not cap_alpha_cur_tag: + cap_alpha = 'A' + + p_tag["id"] = f'{prev_num_id}{ol_head}' + p_tag.string = re.sub(rf'^\({ol_head}\)', '', current_tag_text) + ol_head += 1 + ol_head1 += 1 + + if re.search(r'^\(\d+\)(\s)*\([a-z]\)', current_tag_text): + alpha_ol = self.soup.new_tag("ol", type="a") + li_tag = self.soup.new_tag("li") + li_tag.string = re.sub(r'^\(\d+\)(\s)*\(\w\)', '', current_tag_text) + li_tag.append(current_tag_text) + alpha_cur_tag = li_tag + cur_tag = re.search(r'^\((?P<cid>\d+)\)(\s)*\((?P<pid>\w)\)', current_tag_text) + prevnum_id = f'{prev_num_id}ol{ol_count}{cur_tag.group("cid")}' + li_tag["id"] = f'{prev_num_id}ol{ol_count}{cur_tag.group("cid")}{cur_tag.group("pid")}' + alpha_ol.append(li_tag) + p_tag.contents = [] + p_tag.append(alpha_ol) + main_sec_alpha = "b" + num_count = 1 + + if re.search(r'^\(\d+\)(\s)?\([a-z]\)\s\d+\.', current_tag_text): + num_ol1 = self.soup.new_tag("ol") + inner_li_tag = self.soup.new_tag("li") + inner_li_tag.string = re.sub(r'^\(\d+\)(\s)?\([a-z]\)\s\d+\.', '', current_tag_text) + inner_li_tag.append(current_tag_text) + cur_tag = re.search(r'^\((?P<cid>\d+)\)(\s)?\((?P<pid>\w)\)\s(?P<nid>\d+)\.', current_tag_text) + prev_id = f'{num_cur_tag.get("id")}{cur_tag.group("pid")}' + inner_li_tag["id"] = f'{num_cur_tag.get("id")}{cur_tag.group("pid")}{cur_tag.group("nid")}' + num_ol1.append(inner_li_tag) + alpha_cur_tag.string = "" + alpha_cur_tag.append(num_ol1) + num_count = 2 + previous_li_tag = p_tag + + elif 
re.search(rf'^\(\s*{main_sec_alpha}\s*\)', current_tag_text): + p_tag.name = "li" + alpha_cur_tag = p_tag + num_count = 1 + ol_head1 = 1 + + if re.search(r'^\(a\)', current_tag_text): + alpha_ol = self.soup.new_tag("ol", type="a") + p_tag.wrap(alpha_ol) + if num_cur_tag: + prevnum_id = num_cur_tag.get("id") + num_cur_tag.append(alpha_ol) + elif num_tag: + prevnum_id = num_tag.get("id") + num_tag.append(alpha_ol) + else: + prevnum_id = f'{p_tag.find_previous({"h5", "h4", "h3"}).get("id")}ol{ol_count}' + else: + alpha_ol.append(p_tag) + + p_tag["id"] = f'{prevnum_id}{main_sec_alpha}' + p_tag.string = re.sub(rf'^\(\s*{main_sec_alpha}\s*\)', '', current_tag_text) + main_sec_alpha = chr(ord(main_sec_alpha) + 1) + + if re.search(r'^\(\w\)\s?1\.', current_tag_text): + num_ol1 = self.soup.new_tag("ol") + inner_li_tag = self.soup.new_tag("li") + inner_li_tag.string = re.sub(r'^\(\w\)\s?1\.', '', current_tag_text) + inner_li_tag.append(current_tag_text) + alpha_cur_tag = inner_li_tag + cur_tag = re.search(r'^\((?P<cid>\w)\)\s*(?P<pid>1)\.', current_tag_text) + prev_id = f'{prevnum_id}ol{ol_count}{cur_tag.group("cid")}' + inner_li_tag[ + "id"] = f'{prevnum_id}ol{ol_count}{cur_tag.group("cid")}{cur_tag.group("pid")}' + num_ol1.append(inner_li_tag) + p_tag.string = "" + p_tag.insert(0, num_ol1) + num_count = 2 + sec_alpha = 'a' + previous_li_tag = p_tag + + elif re.search(r'^\(\s*\d\d\s*\)', current_tag_text): + p_tag.name = "li" + p_tag_text = re.search(r'^\(\s*(?P<id>\d\d)\s*\)', current_tag_text).group("id") + alpha_ol.append(p_tag) + p_tag["id"] = f'{prevnum_id}{p_tag_text}' + p_tag.string = re.sub(r'^\(\s*\d\d\s*\)', '', current_tag_text) + previous_li_tag = p_tag + + elif re.search(rf'^{num_count}\.', current_tag_text) and p_tag.name == "p": + p_tag.name = "li" + sec_alpha = 'a' + num_tag = p_tag + inner_sec_alpha = "a" + + if re.search(r'^1\.', current_tag_text): + num_ol1 = self.soup.new_tag("ol") + p_tag.wrap(num_ol1) + if alpha_cur_tag: + prev_id = alpha_cur_tag.get("id") + alpha_cur_tag.append(num_ol1) + elif cap_alpha_cur_tag: + prev_id = cap_alpha_cur_tag.get("id") + cap_alpha_cur_tag.append(num_ol1) + elif num_cur_tag: + prev_id = num_cur_tag.get("id") + num_cur_tag.append(num_ol1) + else: + ol_count = self.ol_count_increment( + f'{p_tag.find_previous({"h5", "h4", "h3"}).get("id")}ol{ol_count}', + ol_count) + prev_id = f'{p_tag.find_previous({"h5", "h4", "h3"}).get("id")}ol{ol_count}' + self.ol_list.append(prev_id) + + else: + num_ol1.append(p_tag) + + p_tag["id"] = f'{prev_id}{num_count}' + p_tag.string = re.sub(rf'^{num_count}\.', '', current_tag_text) + num_count += 1 + + if re.search(r'^\d+\.\s?a\.', current_tag_text): + innr_alpha_ol = self.soup.new_tag("ol", type="a") + inner_li_tag = self.soup.new_tag("li") + inner_li_tag.string = re.sub(r'^\d+\.\s?a\.', '', current_tag_text) + inner_li_tag.append(current_tag_text) + alpha_cur_tag1 = inner_li_tag + cur_tag = re.search(r'^(?P<cid>\d+)\.\s?(?P<pid>a)\.', current_tag_text) + prevnum_id1 = f'{alpha_cur_tag.get("id")}{cur_tag.group("cid")}' + inner_li_tag[ + "id"] = f'{alpha_cur_tag.get("id")}{cur_tag.group("cid")}{cur_tag.group("pid")}' + innr_alpha_ol.append(inner_li_tag) + p_tag.string = "" + p_tag.insert(0, innr_alpha_ol) + sec_alpha = 'b' + previous_li_tag = p_tag + + elif re.search(rf'^{inner_num_count}\.', current_tag_text) and p_tag.name == "p": + p_tag.name = "li" + inner_num_tag = p_tag + + if re.search(r'^1\.', current_tag_text): + inner_num_ol = self.soup.new_tag("ol") + p_tag.wrap(inner_num_ol) + if alpha_cur_tag: + 
inner_prev_id = alpha_cur_tag.get("id") + alpha_cur_tag.append(inner_num_ol) + elif num_cur_tag: + inner_prev_id = num_cur_tag.get("id") + num_cur_tag.append(inner_num_ol) + else: + ol_count = self.ol_count_increment( + f'{p_tag.find_previous({"h5", "h4", "h3"}).get("id")}ol{ol_count}', + ol_count) + inner_prev_id = f'{p_tag.find_previous({"h5", "h4", "h3"}).get("id")}ol{ol_count}' + self.ol_list.append(inner_prev_id) + else: + inner_num_ol.append(p_tag) + + p_tag["id"] = f'{inner_prev_id}{inner_num_count}' + p_tag.string = re.sub(rf'^{inner_num_count}\.', '', current_tag_text) + inner_num_count += 1 + + elif re.search(rf'^{sec_alpha}\.', current_tag_text): + p_tag.name = "li" + alpha_cur_tag1 = p_tag + ol_head1 = 1 + small_roman = "i" + + if re.search(r'^a\.', current_tag_text): + innr_alpha_ol = self.soup.new_tag("ol", type="a") + previd = p_tag.find_previous("li") + p_tag.wrap(innr_alpha_ol) + prevnum_id1 = previd.get("id") + previd.append(innr_alpha_ol) + p_tag["id"] = f'{prevnum_id1}{sec_alpha}' + else: + innr_alpha_ol.append(p_tag) + p_tag["id"] = f'{prevnum_id1}{sec_alpha}' + + p_tag.string = re.sub(rf'^{sec_alpha}\.', '', current_tag_text) + sec_alpha = chr(ord(sec_alpha) + 1) + + if re.search(r'^\w+\.\s?i\.', current_tag_text): + roman_ol = self.soup.new_tag("ol", type="i") + inner_li_tag = self.soup.new_tag("li") + inner_li_tag.string = re.sub(r'^\w+\.\s?i\.', '', current_tag_text) + inner_li_tag.append(current_tag_text) + cur_tag = re.search(r'^(?P<cid>\w+)\.\s?(?P<pid>i)\.', current_tag_text) + prev_id1 = f'{alpha_cur_tag1.get("id")}' + inner_li_tag[ + "id"] = f'{alpha_cur_tag1.get("id")}{cur_tag.group("pid")}' + roman_ol.append(inner_li_tag) + p_tag.string = "" + p_tag.insert(0, roman_ol) + small_roman = "ii" + previous_li_tag = p_tag + + elif re.search(rf'^{cap_alpha}\.|^\({cap_alpha}\)', current_tag_text) and p_tag.name == "p": + p_tag.name = "li" + cap_alpha_cur_tag = p_tag + num_count = 1 + if re.search(r'^A\.|^\(A\)', current_tag_text): + cap_alpha_ol = self.soup.new_tag("ol", type="A") + p_tag.wrap(cap_alpha_ol) + ol_count = self.ol_count_increment( + f'{p_tag.find_previous({"h5", "h4", "h3"}).get("id")}ol{ol_count}', + ol_count) + + prev_id1 = f'{p_tag.find_previous({"h5", "h4", "h3"}).get("id")}ol{ol_count}' + self.ol_list.append(prev_id1) + else: + cap_alpha_ol.append(p_tag) + + p_tag["id"] = f'{prev_id1}{cap_alpha}' + p_tag.string = re.sub(rf'^{cap_alpha}\.|^\({cap_alpha}\)', '', current_tag_text) + + if cap_alpha == 'Z': + cap_alpha = 'A' + else: + cap_alpha = chr(ord(cap_alpha) + 1) + previous_li_tag = p_tag + + elif re.search(rf'^{inner_sec_alpha}\.', current_tag_text): + p_tag.name = "li" + alpha_cur_tag = p_tag + ol_head1 = 1 + if re.search(r'^a\.', current_tag_text): + alpha_ol = self.soup.new_tag("ol", type="a") + p_tag.wrap(alpha_ol) + if num_tag: + prevnum_id = num_tag.get("id") + num_tag.append(alpha_ol) + else: + num_count = 1 + prevnum_id = f'{p_tag.find_previous({"h4", "h3"}).get("id")}ol{ol_count}' + else: + alpha_ol.append(p_tag) + if not num_tag: + num_count = 1 + + p_tag["id"] = f'{prevnum_id}{inner_sec_alpha}' + p_tag.string = re.sub(rf'^\(\s*{inner_sec_alpha}\s*\)', '', current_tag_text) + inner_sec_alpha = chr(ord(inner_sec_alpha) + 1) + + elif re.search(rf'^{cap_rom}\.', current_tag_text): + p_tag.name = "li" + cap_roman_cur_tag = p_tag + ol_head = 1 + + if re.search(r'^I\.', current_tag_text): + cap_roman_ol = self.soup.new_tag("ol", type="I") + p_tag.wrap(cap_roman_ol) + prev_id1 = p_tag.find_previous({"h5", "h4", "h3"}).get("id") + else: + 
cap_roman_ol.append(p_tag) + + p_tag["id"] = f'{prev_id1}ol{ol_count}{cap_rom}' + p_tag.string = re.sub(rf'^{cap_rom}\.', '', current_tag_text) + cap_rom = roman.toRoman(roman.fromRoman(cap_rom.upper()) + 1) + previous_li_tag = p_tag + + elif re.search(rf'^{small_roman}\.', current_tag_text) and alpha_cur_tag1: + p_tag.name = "li" + if re.search(r'^i\.', current_tag_text): + roman_ol = self.soup.new_tag("ol", type="i") + p_tag.wrap(roman_ol) + + alpha_cur_tag1.append(roman_ol) + prev_id1 = alpha_cur_tag1.get("id") + else: + roman_ol.append(p_tag) + + p_tag["id"] = f'{prev_id1}{small_roman}' + p_tag.string = re.sub(rf'^{small_roman}\.', '', current_tag_text) + small_roman = roman.toRoman(roman.fromRoman(small_roman.upper()) + 1).lower() + previous_li_tag = p_tag + + elif p_tag.get("class") == [self.tag_type_dict['head4']] and p_tag.name == "p": + if p_tag.b: + previous_li_tag = None + if previous_li_tag and re.search(r'^Official Comment$', p_tag.find_previous("h4").text.strip(), re.I): + previous_li_tag.append(p_tag) + + elif p_tag.get("class") == [self.tag_type_dict['ol_p']] and p_tag.name == "p" and previous_li_tag: + if self.file_no == '42' and re.search(r'^Click to view', p_tag.text.strip(), re.I): + ol_head = 1 + + if re.search(r'^\([a-z][a-z]\)', p_tag.text.strip()): + p_tag.name = "li" + alpha_cur_tag = p_tag + num_count = 1 + ol_head1 = 1 + alpha_ol.append(p_tag) + pid = re.search(r'\((?P<id>[a-z][a-z])\)', p_tag.text.strip()).group("id") + p_tag["id"] = f'{prevnum_id}{pid}' + p_tag.string = re.sub(rf'^\([a-z][a-z]\)', '', current_tag_text) + elif not re.search(r'^(History|SECTION (\d+|[A-Z])|Click to view)', p_tag.find_next("p").text.strip(), + re.I) and \ + not re.search(r'^(History|SECTION (\d+|[A-Z])|Click to view)', p_tag.text.strip(), re.I): + previous_li_tag.append(p_tag) + + if re.search(r'^History|^Cross references:|^OFFICIAL COMMENT|^SECTION (\d+|[A-Z])[^-]', + current_tag_text, re.I) or p_tag.name in ['h3', 'h4', 'h5']: + ol_head = 1 + ol_head1 = 1 + num_count = 1 + num_cur_tag = None + main_sec_alpha = 'a' + sec_alpha = 'a' + alpha_cur_tag = None + cap_alpha = "A" + small_roman = "i" + cap_rom = "I" + inner_sec_alpha = "a" + cap_alpha_cur_tag = None + cap_roman_cur_tag = None + alpha_cur_tag1 = None + previous_li_tag = None + num_tag = None + + logger.info("ol tag created") + + def create_analysis_nav_tag(self): + super(KYParseHtml, self).create_Notes_to_decision_analysis_nav_tag() + logger.info("note to decision nav created") + + def add_anchor_tags_con(self): + super(KYParseHtml, self).add_anchor_tags_con() + self.am_nav_count = 0 + for li_tag in self.soup.findAll("li"): + if not li_tag.get("id"): + if amd_id := re.search(r'^AMENDMENT (?P<id>[IVX]+)', li_tag.text.strip()): + self.am_nav_count += 1 + self.set_chapter_section_id(li_tag, amd_id.group("id"), + sub_tag="-", + prev_id=li_tag.find_previous("h2").get("id"), + cnav=f'amnav{self.am_nav_count:02}') + + def replace_tags_constitution(self): + sub_sec_id = None + inner_sec_id1 = None + innr_sec_id2 = None + + for li_tag in self.soup.findAll(class_=self.tag_type_dict["ul"]): + li_tag_text = re.sub(r'\W+', '', li_tag.text.strip()) + self.h2_text_con.append(li_tag_text) + + super(KYParseHtml, self).replace_tags_constitution() + + repeated_header_list = [] + nd_tag_text = [] + + for p_tag in self.soup.findAll(): + if p_tag.get("class") == [self.tag_type_dict["head4"]]: + if re.match(r'^—?\d{1,3}\D\.?(\d\.)*', p_tag.text.strip()) \ + and not re.match(r'^(\d+\D\.\d\d+)|^\d+-', p_tag.text.strip()) and \ + 
p_tag.find_previous("h4") and \ + re.search(r'^NOTES TO DECISIONS$|^Notes to Unpublished Decisions$', + p_tag.find_previous("h4").text.strip(), re.I): + p_tag.name = "h5" + sub_sec_text = re.sub(r'\W+', '', p_tag.get_text()).lower() + nd_tag_text.append(sub_sec_text) + + if not re.match(r'^(\d+\.?\s*[—-])|^(—\d+\.?\s*[—-])', p_tag.text.strip()): + prev_head_tag = p_tag.find_previous("h4").get("id") + sub_sec_id = f"{prev_head_tag}-{sub_sec_text}" + if sub_sec_id in repeated_header_list: + sub_sec_id = f"{prev_head_tag}-{sub_sec_text}.01" + else: + sub_sec_id = f"{prev_head_tag}-{sub_sec_text}" + p_tag["id"] = sub_sec_id + repeated_header_list.append(sub_sec_id) + + elif re.match(r'^(—?\d+\.?\s*—\s*[“a-zA-Z\d.]+)', p_tag.text.strip()): + prev_sub_tag = sub_sec_id + inner_sec_id1 = f"{prev_sub_tag}-{sub_sec_text}" + if inner_sec_id1 in repeated_header_list: + inner_sec_id1 = f"{inner_sec_id1}.01" + else: + inner_sec_id1 = f"{inner_sec_id1}" + p_tag["id"] = inner_sec_id1 + repeated_header_list.append(inner_sec_id1) + + elif re.match(r'^(—?\d+\.?\s*—\s*—\s*[“a-zA-Z\d]+)', p_tag.text.strip()): + prev_child_tag = inner_sec_id1 + innr_sec_id2 = f"{prev_child_tag}-{sub_sec_text}" + + if innr_sec_id2 in repeated_header_list: + innr_sec_id2 = f"{innr_sec_id2}.01" + else: + innr_sec_id2 = f"{innr_sec_id2}" + + p_tag["id"] = innr_sec_id2 + repeated_header_list.append(innr_sec_id2) + + elif re.match(r'^(—?\d+\.?\s*—\s*—\s*—\s*[“a-zA-Z\d]+)', p_tag.text.strip()): + prev_child_id1 = innr_sec_id2 + innr_subsec_header_tag_id = f"{prev_child_id1}-{sub_sec_text}" + + if innr_subsec_header_tag_id in repeated_header_list: + innr_subsec_header_tag_id = f"{innr_subsec_header_tag_id}.01" + else: + innr_subsec_header_tag_id = f"{innr_subsec_header_tag_id}" + + p_tag["id"] = innr_subsec_header_tag_id + repeated_header_list.append(innr_subsec_header_tag_id) + + elif p_tag.name == "h4" and re.search(r'^NOTES TO DECISIONS$|^Notes to Unpublished Decisions$', + p_tag.text.strip(), re.I): + for tag in p_tag.find_next_siblings(): + if tag.get("class") == [self.tag_type_dict["ol_p"]]: + if not re.search(r'^(Analysis|Cited|Compiler’s Notes\.|Cross-References)', + tag.text.strip()): + tag.name = "li" + tag["class"] = "note" + else: + break diff --git a/html_parser_framework/regex_pattern.py b/html_parser_framework/regex_pattern.py new file mode 100644 index 0000000..a97e6c7 --- /dev/null +++ b/html_parser_framework/regex_pattern.py @@ -0,0 +1,298 @@ +import re + + +class RegexPatterns: + """ BASE PATTERNS""" + h1_pattern = re.compile(r'title (?P<id>\d+(\.\d+)*)', re.I) + h2_chapter_pattern = re.compile(r'^chapter\s(?P<id>\d+([a-zA-Z])*)', re.I) + h2_article_pattern = re.compile(r'^article\s(?P<id>\d+([a-zA-Z])*)', re.I) + h2_part_pattern = re.compile(r'^part\s(?P<id>(\d+([a-zA-Z])*)|([IVX]+)*)', re.I) + h2_subtitle_pattern = re.compile(r'^Subtitle\s*(?P<id>\d+)', re.I) + section_pattern_con1 = re.compile(r'^Section (?P<id>\d+)') + amend_pattern_con = re.compile(r'^AMENDMENT (?P<id>\d+)', re.I) + amp_pattern = re.compile(r'&(?!amp;)') + br_pattern = re.compile(r'<br/>') + cite_pattern = None + code_pattern = None + h1_pattern_con = None + h2_article_pattern_con = None + section_pattern_con = None + article_pattern_con = re.compile(r'^ARTICLE (?P<id>\d+(\w)?)') + section_pattern_1 = None + + +class CustomisedRegexGA(RegexPatterns): + """ Customised regex patterns for GA code""" + + section_pattern = re.compile(r'^(?P<id>\d+-\d+([a-z])?-\d+(\.\d+)?)', re.I) + ul_pattern = re.compile(r'^(?P<id>\d+([A-Z])?)', re.I) + rule_pattern 
= re.compile(r'^Rule (?P<id>\d+(-\d+-\.\d+)*(\s\(\d+\))*)\.', re.I) + + h1_pattern_con = re.compile(r'^Constitution of the United States|' + r'^CONSTITUTION OF THE STATE OF VERMONT', re.I) + h2_chapter_pattern_con = re.compile(r'^chapter\s*(?P<id>[IVX]+)', re.I) + h2_article_pattern_con = re.compile(r'^ARTICLE (?P<id>[IVX]+)\.*', re.I) + section_pattern_con = re.compile(r'^(Article|§)\s*(?P<id>\d+(-[A-Z])*)\.') + h2_amendment_pattern_con = re.compile(r'^AMENDMENT (?P<id>[IVX]+)\.*', re.I) + + cite_pattern = re.compile(r'\b((?P<cite>(?P<title>\d{1,2})-(?P<chap>\d(\w+)?)-(?P<sec>\d+(\.\d+)?))(\s?(\((' + r'?P<ol>\w+)\))+)?)') + + code_pattern = re.compile(r"|\d+ Ga.( App.)? \d+" + r"|\d+ S.E.(2d)? \d+" + r"|\d+ U.S.C. § \d+(\(\w\))?" + r"|\d+ S\. (Ct\.) \d+" + r"|\d+ L\. (Ed\.) \d+" + r"|\d+ L\.R\.(A\.)? \d+" + r"|\d+ Am\. St\.( R\.)? \d+" + r"|\d+ A\.L\.(R\.)? \d+") + + cite_tag_pattern = re.compile(r"§+\s(\W+)?\d+-\w+-\d+(\.\d+)?" + r"|\d+ Ga.( App.)? \d+" + r"|\d+ S.E.(2d)? \d+" + r"|\d+ U.S.C. § \d+(\(\w\))?" + r"|\d+ S\. (Ct\.) \d+" + r"|\d+ L\. (Ed\.) \d+" + r"|\d+ L\.R\.(A\.)? \d+" + r"|\d+ Am\. St\.( R\.)? \d+" + r"|\d+ A\.L\.(R\.)? \d+") + + +class CustomisedRegexVA(RegexPatterns): + """ Customised regex patterns for VA code""" + + h2_subtitle_pattern = re.compile(r'^subtitle\s(?P<id>[IVX]+([a-zA-Z])*)', re.I) + h2_part_pattern = re.compile(r'^part\s(?P<id>([A-Z]))', re.I) + h2_chapter_pattern = re.compile(r'^chapter\s(?P<id>\d+(\.\d+(:1\.)*?)*)', re.I) + h2_article_pattern = re.compile(r'^article\s(?P<id>\d+((\.\d+)*?[a-zA-Z])*)', re.I) + + section_pattern = re.compile( + r'^§+\s(?P<id>\d+(\.\d+)*[A-Z]*-\d+(\.\d+)*(:\d+)*(\.\d+)*(\.\d+)*)\.*\s*', re.I) + + cite_pattern = re.compile( + r'(?P<cite>(?P<title>\d+(\.\d+)*)-\d+(\.\d+)*(\.\s:\d+)*(?P<ol>(\([a-z]\))(\(\d+\))*)*)') + code_pattern = re.compile(r'(\d+\sVa.\s\d+|S\.E\. \d+|Va\. App\. LEXIS \d+|Titles (\d+(\.\d+)*))') + + +class CustomisedRegexAK(RegexPatterns): + """ Customised regex patterns for AK code""" + + section_pattern = re.compile(r'^Sec\.\s*?(?P<id>\d+\.\d+\.\d+)\.') + cite_pattern = re.compile(r'((?P<cite>(?P<title>\d+)\.\d+\.\d+)(?P<ol>(\([a-z]\))(\(\d+\))*(\(\w+\))*)*)') + code_pattern = re.compile(r'\d+ AAC \d+, art\. \d+\.|State v\. Yi, \d+ P\.\d+d \d+') + + h1_pattern_con = re.compile(r'The Constitution of the State') + h2_article_pattern_con = re.compile(r'^Article (?P<id>[IVX]+)', re.I) + section_pattern_con = re.compile(r'^Section (?P<id>\d+)\.') + + cite_tag_pattern = re.compile(r'AS\s\d+\.\d+\.\d+((\([a-z]\))(\(\d+\))*(\(\w+\))*)*|' + r'\d+ AAC \d+, art\. \d+\.|State v\. 
Yi, \d+ P\.\d+d \d+') + + +class CustomisedRegexCO(RegexPatterns): + """ Customised regex patterns for CO code""" + + h2_article_pattern = re.compile(r'^(article|Art\.)\s(?P<id>\d+(\.\d+)*)', re.I) + section_pattern = re.compile(r'^(?P<id>\d+(\.\d+)*-\d+(\.\d+)*-\d+(\.\d+)*)', re.I) + h2_subpart_pattern = re.compile(r'^(subpart|SUBPART|Subpart)\s(?P<id>(\d+([a-zA-Z])*)|([A-Z]))') + + cite_pattern = re.compile( + r'((?P<cite>(?P<title>\d+)(\.\d+)*-\d+(\.\d+)*-\d+(\.\d+)*)\s?(?P<ol>(\(\w\))(\(\w\))?(\(\w\))?)*)') + code_pattern = re.compile(r"Colo\.\s*\d+|Colo\.\s*Law\.\s*\d+|" + r"\d+\s*Denv\.\s*L\.\s*Rev\.\s*\d+|" + r"\d{4}\s*COA\s*\d+|" + r"L\.\s*\d+,\s*p\.\s*\d+|" + r"Colo\.+P\.\d\w\s\d+") + + h1_pattern_con = re.compile(r'^Declaration of Independence|' + r'^Constitution of the United States of America of 1787|' + r'^Constitution of the State of Colorado') + h2_article_pattern_con = re.compile(r'^ARTICLE (?P<id>[IVX]+)', re.I) + section_pattern_con = re.compile(r'^(§)\s*(?P<id>\d+(\d+)*(\.?\w)*)\.') + + cite_tag_pattern = re.compile(r"\d+(\.\d+)*-\d+(\.\d+)*-\d+(\.\d+)*" + r"Colo\.\s*\d+|Colo\.\s*Law\.\s*\d+|" + r"\d+\s*Denv\.\s*L\.\s*Rev\.\s*\d+|" + r"\d{4}\s*COA\s*\d+|" + r"L\.\s*\d+,\s*p\.\s*\d+|" + r"Colo\.+P\.\d\w\s\d+") + + +class CustomisedRegexVT(RegexPatterns): + """ Customised regex patterns for VT code""" + h1_pattern = re.compile(r'title (?P<id>\d+(\w+)*)', re.I) + h2_chapter_pattern = re.compile(r'^chapter\s*(?P<id>([IVX]+|\d+([A-Z])*))', re.I) + h2_article_pattern = re.compile(r'^article\s*(?P<id>([IVX]+|\d+([a-zA-Z])*))', re.I) + section_pattern = re.compile( + r'^§*\s*(?P<id>\d+([a-z]{0,2})*([A-Z])*(\.\d+)*(\.*?\s*?(-|—)\d+([a-z])*)*(\.\d+)*)\.*\s*') + rename_class_section_pattern = re.compile( + r'^§+\s*(?P<id>\d+([a-z]{0,2})*([A-Z])*)(\.\d+)*(\.*?\s*?-\d+([a-z])*)*(\.\d+)*\.*\s*') + section_pattern_1 = re.compile(r'^Executive Order No\. (?P<id>\d+-\d+)') + h2_subchapter_pattern = re.compile(r'^Subchapter (?P<id>\d+([A-Z]+)?)', re.I) + + h1_pattern_con = re.compile(r'^Constitution of the United States|' + r'^CONSTITUTION OF THE STATE OF VERMONT', re.I) + h2_chapter_pattern_con = re.compile(r'^chapter\s*(?P<id>[IVX]+)', re.I) + h2_article_pattern_con = re.compile(r'^ARTICLE (?P<id>[IVX]+)\.*', re.I) + section_pattern_con = re.compile(r'^(Article)\s*(?P<id>\d+(-[A-Z])*)\.') + h2_amendment_pattern_con = re.compile(r'^AMENDMENT (?P<id>[IVX]+)\.*', re.I) + + cite_pattern = re.compile(r'\b((?P<cite>(?P<title>\d{1,2})-(?P<chap>\d(\w+)?)-(?P<sec>\d+(\.\d+)?))(\s?(\((' + r'?P<ol>\w+)\))+)?)') + + code_pattern = re.compile(r"(\d+\sV\.S\.A\.\s§+\s\d+(-\d+)*([a-z]+)*(\([a-z]\))*(\(\d+\))*(\([A-Z]\))*" + r"|\d+\sU\.S\.C\.\s§\s\d+\(*[a-z]\)*" + r"|\d+,\sNo\.\s\d+)") + + cite_tag_pattern = re.compile(r"\d+\sV\.S\.A\.\s§+\s\d+(-\d+)*([a-z]+)*(\([a-z]\))*(\(\d+\))*(\([A-Z]\))*" + r"|\d+\sU\.S\.C\.\s§\s\d+\(*[a-z]\)*" + r"|\d+,\sNo\.\s\d+") + + +class CustomisedRegexAR(RegexPatterns): + """ Customised regex patterns for AR code""" + + section_pattern = re.compile(r'^(?P<id>(\d+-\d+([a-zA-Z])?-\d+(\.\d+)?)|\d\. 
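The per-state `cite_pattern` entries carry named groups so cite-linking code can reuse the pieces; the Colorado pattern, for instance, separates the bare cite, its title number, and any trailing subsection markers. A standalone check, with the pattern copied verbatim from `CustomisedRegexCO` and an invented sentence as input:

    import re

    co_cite = re.compile(
        r'((?P<cite>(?P<title>\d+)(\.\d+)*-\d+(\.\d+)*-\d+(\.\d+)*)\s?'
        r'(?P<ol>(\(\w\))(\(\w\))?(\(\w\))?)*)')
    m = co_cite.search('see section 24-4-105 (8)(a) for hearing procedure')
    print(m.group('cite'), m.group('title'), m.group('ol'))   # 24-4-105 24 (8)(a)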
Acts)') + h2_subtitle_pattern = re.compile(r'^Subtitle (?P<id>\d+)\.') + h2_chapters_pattern = re.compile(r'^Chapters (?P<id>\d+-\d+)') + h2_subchapter_pattern = re.compile(r'^Subchapter (?P<id>\d+)') + + h1_pattern_con = re.compile(r'^Constitution\s+Of\s+The', re.I) + h2_article_pattern_con = re.compile(r'^ARTICLE (?P<id>\d+)', re.I) + section_pattern_con = re.compile(r'^\[*§+\s*(?P<id>\d+)') + amend_pattern_con = re.compile(r'^AMENDMENT (?P<id>\d+)', re.I) + + cite_pattern = re.compile(r'\b((?P<cite>(?P<title>\d{1,2})-(?P<chap>\d(\w+)?)-(?P<sec>\d+(\.\d+)?))(\s?(\((' + r'?P<ol>\w+)\))+)?)') + code_pattern = re.compile(r"(\d+ Ga\.( App\.)? \d+" + r"|\d+ S\.E\.(2d)? \d+" + r"|\d+ U\.S\.C\. § \d+(\(\w\))?" + r"|\d+ S\. (Ct\.) \d+" + r"|\d+ L\. (Ed\.) \d+" + r"|\d+ L\.R\.(A\.)? \d+" + r"|\d+ Am\. St\.( R\.)? \d+" + r"|\d+ A\.L\.(R\.)? \d+)") + + cite_tag_pattern = re.compile(r"§+\s(\W+)?\d+-\w+-\d+(\.\d+)?" + r"|\d+ Ga\.( App\.)? \d+" + r"|\d+ S\.E\.(2d)? \d+" + r"|\d+ U\.S\.C\. § \d+(\(\w\))?" + r"|\d+ S\. (Ct\.) \d+" + r"|\d+ L\. (Ed\.) \d+" + r"|\d+ L\.R\.(A\.)? \d+" + r"|\d+ Am\. St\.( R\.)? \d+" + r"|\d+ A\.L\.(R\.)? \d+") + + +class CustomisedRegexND(RegexPatterns): + """ Customised regex patterns for ND code""" + + h2_part_pattern = re.compile(r'^Part\s(?P<id>([IVX]+)*(\d+([a-zA-Z])*)*)') + h2_chapter_pattern = re.compile(r'^CHAPTER\s(?P<id>\d+(\.\d+)*-\d+(\.\d+)*([A-Z])*)', re.I) + section_pattern = re.compile(r'^(?P<id>\d+(\.\d+)*-\d+(\.\d+)*-\d+(\.\d+)*)') + h2_article_pattern = re.compile(r'^article\s(?P<id>(\d+([a-zA-Z])*)|[IVX]+)', re.I) + + cite_pattern = re.compile( + r'((?P<cite>(?P<title>\d+(\.\d+)*)-\d+(\.\d+)*-\d+(\.\d+)*)(?P<ol>(\(\w\))(\(\w\))?(\(\w\))?)*)') + code_pattern = re.compile(r'N\.D\. LEXIS \d+') + + cite_tag_pattern = re.compile(r"\d+(\.\d+)*-\d+(\.\d+)*-\d+(\.\d+)*" + r"|Chapter\s(?P<chapid>\d+(\.\d+)*-\d+(\.\d+)*([A-Z])*)" + r"|N\.D\. LEXIS \d+") + + h1_pattern_con = re.compile(r'^CONSTITUTION OF NORTH DAKOTA|CONSTITUTION OF THE UNITED STATES OF AMERICA') + section_pattern_con = re.compile(r'^(Section(s)?|§) (?P<id>\d+(\.\d+)*)(\.| and| to)') + h2_article_pattern_con = re.compile(r'^ARTICLE (?P<id>[IVX]+|\d+)') + article_pattern_con = re.compile(r'^ARTICLE (?P<id>\d+(\w)?)') + + +class CustomisedRegexID(RegexPatterns): + """ Customised regex patterns for ID code""" + + h2_article_pattern = re.compile(r'^(article)\s(?P<id>\d+([a-zA-Z])*)', re.I) + section_pattern = re.compile(r'^§?(\s?)(?P<id>\d+-\d+[a-zA-Z]?(-\d+)?)\.?', re.I) + cite_pattern = re.compile(r'((?P<cite>(?P<title>\d+)\.\d+\.\d+)(?P<ol>(\([a-z]\))(\(\d+\))*)*)') + code_pattern = re.compile(r'N\.D\. LEXIS \d+') + + +class CustomisedRegexWY(RegexPatterns): + """ Customised regex patterns for WY code""" + + section_pattern = re.compile(r'^§*\s*(?P<id>\d+(\.\d+)*-\d+(\.[A-Z]+)*-\d+(\.\d+)*)', re.I) + h2_division_pattern = re.compile(r'^Division (?P<id>\d+)\.') + h2_article_pattern = re.compile(r'^article\s(?P<id>\d+(\.*[a-zA-Z])*)', re.I) + h2_subpart_pattern = re.compile(r'^subpart\s(?P<id>\d+(\.*[a-zA-Z])*)', re.I) + + cite_pattern = re.compile(r'((?P<cite>(?P<title>\d+)-\d+-\d+)(?P<ol>(\([a-z]\))(\([ivxl]+\))*(\(\w+\))*)*)') + code_pattern = re.compile(r'\d+ Wyo\. 
LEXIS \d+') + + h1_pattern_con = re.compile(r'^THE CONSTITUTION OF THE UNITED STATES OF AMERICA|' + r'^Constitution of the State of Wyoming') + h2_article_pattern_con = re.compile(r'^ARTICLE (?P<id>\d+)', re.I) + section_pattern_con = re.compile(r'^(Section|§) (?P<id>\d+)') + section_pattern_con1 = re.compile(r'^Section (?P<id>\d+)') + + cite_tag_pattern = re.compile(r"\d+-\d+-\d+((\([a-z]\))(\([ivxl]+\))*(\(\w+\))*)*|\d+ Wyo\. LEXIS \d+") + + +class CustomisedRegexTN(RegexPatterns): + """ Customised regex patterns for TN code""" + + section_pattern = re.compile(r'^(?P<id>\d+-\d+([a-z])?-\d+(\.\d+)?)', re.I) + cite_pattern = re.compile(r'\b(?P<cite>(?P<title>\d{1,2})-\d(\w+)?-\d+(\.\d+)?)(\s*(?P<ol>(\(\w+\))+))?') + code_pattern = re.compile(r'(\d+ (Ga\.) \d+)|(\d+ Ga\.( App\.) \d+)' + r'(\d+ S\.E\.(2d)? \d+)|(\d+ U\.S\.(C\. §)? \d+(\(\w\))?)' + r'(\d+ S\. (Ct\.) \d+)|(\d+ L\. (Ed\.) \d+)|' + r'(\d+ L\.R\.(A\.)? \d+)|(\d+ Am\. St\.( R\.)? \d+)' + r'(\d+ A\.L\.R\.(2d)? \d+)') + + +class CustomisedRegexKY(RegexPatterns): + h1_pattern = re.compile(r'title (?P<id>[IVXL]+)', re.I) + section_pattern = re.compile(r'^(?P<id>\d+([A-Z]*?)\.\d+(-\d+)*?)\.', re.I) + + cite_pattern = re.compile(r'(?P<cite>(?P<title>\d+[a-zA-Z]*)\.\d+(\(\d+\))*(-\d+)*)(\s*(?P<ol>(\(\w+\))+))?') + code_pattern = re.compile(r'((Ky\.\s*(L\. Rptr\.\s*)*\d+)|' + r'(Ky\.\s?(App\.)?\s?LEXIS\s?\d+)|' + r'(U\.S\.C\.\s*secs*\.\s*\d+(\([a-zA-Z]\))*(\(\d+\))*)|' + r'(OAG \d+-\d+))') + + cite_tag_pattern = re.compile(r"(KRS)*\s?\d+[a-zA-Z]*\.\d+(\(\d+\))*(-\d+)*|" + r"(KRS Chapter \d+[a-zA-Z]*)|" + r"(KRS Title \D+, Chapter \D+?,)|" + r"KRS\s*\d+[a-zA-Z]*\.\d+\(\d+\)|" + r"(KRS\s*\d+[a-zA-Z]*\.\d+\(\d+\)|" + r"(U.S.C.\s*secs*\.\s*\d+)|" + r"(Ky.\s?(App\.)?\s?LEXIS\s?\d+)|" + r"(Ky.\s*(L. Rptr.\s*)*\d+)|" + r"(OAG \d+-\d+))") + + +class CustomisedRegexNC(RegexPatterns): + """ Customised regex patterns for NC code""" + + h1_pattern = re.compile(r'^Chapter\s(?P<id>\d+([A-Z])*)') + + section_pattern = re.compile( + r'^§§*?\s*(?P<id>(\d+([A-Z])*-\d+([A-Z])*(\.\d+[A-Z]*)*(-\d+)*)|\d+([A-Z])*)[.:, ]') + + section_pattern_1 = re.compile( + r'^Rule(s)*\s*(?P<id>(\d+(\.\d+)*))[:., throug]') + + h2_subchapter_pattern = re.compile(r'^(Subchapter|SUBCHAPTER) (?P<id>[IVX]+-*?([A-Z])*)\.') + + h1_pattern_con = re.compile(r'^Constitution of the United States|' + r'^Constitution of North Carolina', re.I) + h2_chapter_pattern_con = re.compile(r'^chapter\s*(?P<id>[IVX]+)', re.I) + h2_article_pattern_con = re.compile(r'^Article (?P<id>[IVX]+)', re.I) + section_pattern_con = re.compile(r'^(Article|§)\s*(?P<id>\d+(-[A-Z])*)\.') + h2_amendment_pattern_con = re.compile(r'^AMENDMENT (?P<id>[IVX]+)\.*', re.I) + + cite_pattern = re.compile(r'(G\.S\.\s(?P<cite>(?P<title>\d+[A-Z]*)-\d+(\.\d+)*(-\d+)*)(\s?(\((?P<ol>\w+)\))+)?)') + + code_pattern = re.compile(r"(\d+ N\.C\. \d+|" + r"\d+ N\.C\. App\. LEXIS \d+|N\.C\. LEXIS \d+)") + + cite_tag_pattern = re.compile(r"G\.S\.\s\d+[A-Z]*-\d+(\.\d+)*(\([a-z0-9]+\))*|Chapter \d+[A-Z]*|" + r"\d+ N\.C\. \d+|" + r"\d+ N\.C\. 
App\.") diff --git a/html_parser_framework/release_dates.txt b/html_parser_framework/release_dates.txt new file mode 100644 index 0000000..9c7de15 --- /dev/null +++ b/html_parser_framework/release_dates.txt @@ -0,0 +1,30 @@ +AK_r80 2022.05 +AK_r81 2022.06 +AR_r76 2020.04.20 +AR_r77 2020.08.18 +AR_r78 2020.11.06 +CO_r71 2020.08.01 +CO_r72 2020.09.30 +CO_r73 2020.12.22 +CO_r74 2021.03.16 +CO_r75 2021.06.23 +CO_r76 2021.10 +CO_r77 2022.02 +CO_r78 2022.03 +CO_r79 2022.06 +ND_r78 2021.12 +ND_r79 2022.01 +ND_r80 2022.03 +ND_r81 2022.06 +WY_r78 2021.05 +WY_r79 2021.09 +WY_r80 2021.10 +WY_r81 2022.01 +WY_r82 2022.05 +KY_r82 2022.01 +VT_r81 2021.07 +VT_r82 2021.11 +VT_r83 2022.01 +VT_r84 2022.06 +VT_r85 2022.07 +NC_r81 2022.06 \ No newline at end of file diff --git a/html_parser_framework/vt_html_parser.py b/html_parser_framework/vt_html_parser.py new file mode 100644 index 0000000..5c8d933 --- /dev/null +++ b/html_parser_framework/vt_html_parser.py @@ -0,0 +1,1008 @@ +import re +from base_html_parser import ParseHtml +import roman +from regex_pattern import CustomisedRegexVT + + +class VTParseHtml(ParseHtml): + + def __init__(self, state_key, path, release_number, input_file_name): + super().__init__(state_key, path, release_number, input_file_name) + self.file_no = None + self.h2_pattern_text = None + + def pre_process(self): + if re.search('constitution', self.input_file_name): + self.tag_type_dict: dict = {'ul': r'^Chapter I|^PREAMBLE|I\.', + 'head1': '^Constitution of the United States|' + 'CONSTITUTION OF THE STATE OF VERMONT', + 'head2': '^CHAPTER I|^PREAMBLE', 'head3': r'^§ 1\.|^Section \d+\.', + 'junk1': '^Annotations', + 'article': '——————————', 'head4_1': '^1\.', 'head4': '^ANNOTATIONS', + 'ol_p': r'^Analysis', } + if self.release_number == '84' and re.search(r'us\.html$', self.input_file_name): + self.h2_order: list = ['chapter', '', '', '', ''] + elif self.release_number == '85' and re.search(r'vt\.html$', self.input_file_name): + self.h2_order: list = ['chapter', '', '', '', ''] + elif re.search(r'vt\.html$', self.input_file_name): + self.h2_order: list = ['chapter', '', '', '', ''] + else: + self.h2_order: list = ['article', 'amendment', '', '', ''] + + self.h2_pattern_text: list = ['PREAMBLE', 'DELEGATION AND DISTRIBUTION OF POWERS', + 'LEGISLATIVE DEPARTMENT', 'EXECUTIVE DEPARTMENT', + 'JUDICIARY DEPARTMENT', 'QUALIFICATIONS OF FREEMEN AND FREEWOMEN', + 'ELECTIONS; OFFICERS; TERMS OF OFFICE', 'OATH OF ALLEGIANCE; OATH OF OFFICE', + 'IMPEACHMENT', 'MILITIA', 'GENERAL PROVISIONS', + 'AMENDMENT OF THE CONSTITUTION', + 'TEMPORARY PROVISIONS', ''] + else: + if int(self.release_number) >= int('84'): + self.tag_type_dict: dict = {'ul': r'^(Chapter|Article)\s*\d+\.', 'head2': r'^(CHAPTER|Article) \d+\.', + 'head1': r'^TITLE', + 'head3': r'^§ \d+((-|—)\d+)*\.', 'junk1': '^Annotations', + 'article': '——————————', + 'ol_p': r'^\(A\)', 'head4': '^History'} + else: + self.tag_type_dict: dict = {'ul': r'^\d+\.', 'head2': r'^CHAPTER \d+\.', + 'head1': r'^TITLE \d', + 'head3': r'^§ \d+(-\d+)*\.', 'junk1': '^Annotations', + 'article': '——————————', + 'ol_p': r'^\(A\)', 'head4': '^History', 'analysishead': r'^\d+\.', + 'part': r'^PART \d'} + + self.file_no = re.search(r'gov\.vt\.vsa\.title\.(?P<fno>\w+)\.html', self.input_file_name).group("fno") + + if int(self.release_number) <= 83 and self.file_no in ['11C', '09A', '27A']: + self.tag_type_dict['head2'] = r'^ARTICLE \d' + + if self.file_no in ['18', '05', '03', '10', '09', '08', '06', '12', '13', '14', '16', '20', '16A', + '24', '24A', '33', '30', 
'29']: + self.h2_order: list = ["part", "chapter", 'subchapter', 'article', ''] + + elif self.file_no in ['27A'] and self.release_number == '84': + self.h2_order: list = ["part", "article", '', '', ''] + + elif self.file_no in ['09A', '11C', '27A']: + self.h2_order: list = ["article", "part", '', '', ''] + + elif self.file_no in ['32']: + if int(self.release_number) <= 83: + self.h2_order: list = ["subtitle", "part", 'chapter', 'subchapter', 'article', '', ''] + else: + self.h2_order: list = ["subtitle", 'chapter', 'subchapter', 'article', '', ''] + self.h2_rename_pattern = [r'^(?P<tag>Part)\s*(?P<id>\d+)', r'^(?P<tag>Chapter) (?P<id>\d{3})\.'] + + else: + self.h2_order: list = ["chapter", 'subchapter', 'article', 'part', ''] + + self.h2_text: list = ['Regulations Chapter 1. Game', 'Title Five Tables', + 'Table 2 Derivation of Sections', + 'Aeronautics and Surface Transportation Generally', 'Executive Orders'] + + self.h2_pattern_text = [r'^(?P<tag>Part)\s*(?P<id>\d+)'] + + if self.file_no == '14': + self.h2_rename_pattern = [r'^(?P<tag>Part) (?P<id>\d)\. Receipts', + r'^(?P<tag>S)ubchapter (?P<id>5). Allocation of Disbursements During ' + r'Administration of Trust', '^(?P<tag>C)hapter (?P<id>119). Uniform Management ' + r'of Institutional Funds Act', + r'^(?P<tag>C)hapter (?P<id>120). Uniform Prudent Management of Institutional ' + r'Funds Act', '^(?P<tag>C)hapter (?P<id>121). Durable Power of ' + 'Attorney for Health Care', + '^(?P<tag>C)hapter (?P<id>123). Powers of Attorney', + '^(?P<tag>C)hapter (?P<id>125). Vermont Revised Uniform Fiduciary Access to ' + 'Digital Assets Act' + ] + elif self.file_no == '24A': + self.h2_rename_pattern = [r'^§ (?P<id>1401)\.(?P<tag>Boundaries)'] + + if self.file_no == '03A': + self.tag_type_dict['head3'] = r'^Executive Order No\. \d-\d' + + self.h4_head: list = ['OFFICIAL COMMENT','History', 'Compiler’s Notes.', 'CROSS REFERENCES', 'ANNOTATIONS', 'Notes to Opinions'] + + self.watermark_text = """Release {0} of the Official Code of Vermont Annotated released {1}. + Transformed and posted by Public.Resource.Org using cic-beautify-state-codes.py version 1.4 on {2}. + This document is not subject to copyright and is in the public domain. 
+ """ + + self.regex_pattern_obj = CustomisedRegexVT() + + def replace_tags_titles(self): + + if int(self.release_number) <= 83: + if self.file_no in ['09A', '27A']: + title_tag = self.soup.find("p", class_=self.tag_type_dict["head2"]) + self.replace_h1_tags_titles(title_tag) + self.ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + if self.file_no in ['03']: + h2_title_tag_pattern = re.compile(r'^TITLE\s*3\s*Executive\s*Appendix\s*(?P<id>Executive Orders)') + for h2_title_tag in self.soup.find_all("p", class_=self.tag_type_dict['head2']): + if h2_title_tag_pattern.search((h2_title_tag.text.strip())): + h2_title_tag.name = "h2" + h2_title_tag["class"] = "oneh2" + h2_title_tag['id'] = f't{self.file_no}-executiveorders' + self.ul_tag = self.soup.new_tag("ul", **{"class": "leaders"}) + + for rename_class_tag in self.soup.find_all("p", class_=self.tag_type_dict["ol_p"]): + if self.regex_pattern_obj.rename_class_section_pattern.search(rename_class_tag.text.strip()): + pos = rename_class_tag.attrs['class'].index(self.tag_type_dict["ol_p"]) + rename_class_tag.attrs['class'][pos] = self.tag_type_dict["head3"] + for rename_class_tag in self.soup.find_all("p", class_=self.tag_type_dict["article"]): + if self.regex_pattern_obj.h2_chapter_pattern.search(rename_class_tag.text.strip()): + pos = rename_class_tag.attrs['class'].index(self.tag_type_dict["article"]) + rename_class_tag.attrs['class'][pos] = self.tag_type_dict["head2"] + elif self.regex_pattern_obj.h2_subchapter_pattern.search( + rename_class_tag.text.strip()) and self.file_no == '20' \ + and not rename_class_tag.text.strip().isupper(): + pos = rename_class_tag.attrs['class'].index(self.tag_type_dict["article"]) + rename_class_tag.attrs['class'][pos] = self.tag_type_dict["head1"] + + super(VTParseHtml, self).replace_tags_titles() + + cap_roman = "I" + cap_alpha = None + cap_num = None + h5_alpha_id = None + h5_rom_id = None + cap_roman_tag = None + annotation_id = None + analysis_id1 = None + annotation_text_list: list = [] + annotation_id_list: list = [] + h5_count = 1 + subtitle_nav_tag = None + h4_pattern = re.compile(r'Annotations From Former §*? 
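`watermark_text` is an ordinary `str.format` template: slot {0} takes the release number, {1} the release date from release_dates.txt, and {2} the date the transform ran. Filled in with made-up values (the actual call site is elsewhere in the framework, not in this diff):

    watermark_text = ("Release {0} of the Official Code of Vermont Annotated released {1}. "
                      "Transformed and posted by Public.Resource.Org using "
                      "cic-beautify-state-codes.py version 1.4 on {2}.")
    print(watermark_text.format('84', '2022.06', '2022-07-01'))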
\d+') + + for header_tag in self.soup.find_all(): + if header_tag.get("class") == [self.tag_type_dict["head4"]]: + if h4_pattern.search(header_tag.text.strip()): + self.replace_h4_tag_titles(header_tag, self.h4_count) + if re.search(r'^CASE NOTES$|^Analysis$|^ANNOTATIONS$', header_tag.text.strip()): + cap_roman = "I" + cap_roman_tag = None + elif re.search(rf'^{cap_roman}\.', header_tag.text.strip()): + header_tag.name = "h5" + cap_roman_tag = header_tag + h5_rom_text = re.search(r'^(?P<h5_id>[IVX]+)\.', header_tag.text.strip()).group("h5_id") + h5_rom_id = f'{header_tag.find_previous({"h3", "h2", "h1"}).get("id")}-notetodecisison-{h5_rom_text}' + + if h5_rom_id in annotation_id_list: + header_tag["id"] = f'{h5_rom_id}.{h5_count}' + h5_count += 1 + else: + header_tag["id"] = f'{h5_rom_id}' + h5_count = 1 + + annotation_id_list.append(h5_rom_id) + cap_alpha = 'A' + cap_roman = roman.toRoman(roman.fromRoman(cap_roman.upper()) + 1) + + elif cap_alpha and re.search(fr'^{cap_alpha}\.', header_tag.text.strip()): + header_tag.name = "h5" + h5_alpha_text = re.search(r'^(?P<h5_id>[A-Z]+)\.', header_tag.text.strip()).group("h5_id") + h5_alpha_id = f"{h5_rom_id}-{h5_alpha_text}" + header_tag['id'] = h5_alpha_id + cap_alpha = chr(ord(cap_alpha) + 1) + cap_num = 1 + + elif cap_num and re.search(fr'^{cap_num}\.', header_tag.text.strip()): + header_tag.name = "h5" + h5_num_text = re.search(r'^(?P<h5_id>\d+)\.', header_tag.text.strip()).group("h5_id") + h5_num_id = f"{h5_alpha_id}-{h5_num_text}" + header_tag['id'] = h5_num_id + cap_num += 1 + + elif re.search(r'^0\.5\.\s[a-zA-Z]+', header_tag.text.strip()): + header_tag.name = "h5" + header_tag['id'] = f'{header_tag.find_previous({"h3", "h2", "h1"}).get("id")}-annotation-0.5' + + elif annotation_id and re.search(r'^—[a-zA-Z]+', header_tag.text.strip()): + header_tag.name = "h5" + tag_text = re.sub(r'[\W\s.]+', '', header_tag.text.strip()).lower() + inner_head_id = f'{annotation_id}-{tag_text}' + if inner_head_id in annotation_id_list: + header_tag["id"] = f'{inner_head_id}.{h5_count}' + h5_count += 1 + else: + header_tag["id"] = f'{inner_head_id}' + h5_count = 1 + annotation_id_list.append(inner_head_id) + + else: + annotation_text = re.sub(r'[\W\s]+', '', header_tag.text.strip()).lower() + if annotation_text in annotation_text_list and re.search(r'^ANNOTATIONS$', header_tag.find_previous( + "h4").text.strip()): + header_tag.name = "h5" + if cap_roman_tag: + annotation_id = f'{cap_roman_tag.get("id")}-{annotation_text}' + else: + annotation_id = f'{header_tag.find_previous({"h3", "h2", "h1"}).get("id")}-notetodecision-{annotation_text}' + + if annotation_id in annotation_id_list: + header_tag["id"] = f'{annotation_id}.{h5_count}' + h5_count += 1 + else: + header_tag["id"] = f'{annotation_id}' + h5_count = 1 + annotation_id_list.append(annotation_id) + + if re.search(r'^Analysis$|^ANNOTATIONS$', header_tag.text.strip(), re.I): + for tag in header_tag.find_next_siblings(): + if int(self.release_number) >= 84: + if tag.get('class') == [self.tag_type_dict["head4"]]: + break + else: + tag["class"] = "casenote" + annotation_text_list.append(re.sub(r'[\W\s]+', '', tag.text.strip()).lower()) + + if int(self.release_number) <= 83 and header_tag.get("class") == [self.tag_type_dict["analysishead"]]: + if header_tag.find_previous("h4") and not h4_pattern.search( + header_tag.find_previous("h4").text.strip()): + previous_header_id = f'{header_tag.find_previous({"h3", "h2", "h1"}).get("id")}-annotation' + else: + previous_header_id = header_tag.find_previous({"h4", 
"h3", "h2", "h1"}).get("id") + + if re.search(r'^0\.5\.\s[a-zA-Z]+', header_tag.text.strip()): + header_tag.name = "h5" + header_tag['id'] = f'{previous_header_id}-0.5' + + elif re.search(r'^\d{1,2}\.', header_tag.text.strip()) and not \ + re.search(r'^OFFICIAL COMMENT',header_tag.find_previous("h4").text.strip()): + header_tag.name = "h5" + analysis_id = re.search(r'^(?P<a_id>\d{1,2})\.', header_tag.text.strip()).group("a_id") + analysis_id1 = f"{previous_header_id}-{analysis_id}" + + if analysis_id1 in annotation_id_list: + header_tag["id"] = f'{analysis_id1}.{h5_count}' + h5_count += 1 + else: + header_tag["id"] = f'{analysis_id1}' + h5_count = 1 + + elif re.search(r'^\*\d{1,2}\.', header_tag.text.strip()): + header_tag.name = "h5" + analysis_id = re.sub(r'[\W\d]', '', header_tag.text.strip()).lower() + header_id = f"{analysis_id1}-{analysis_id}" + + if header_id in annotation_id_list: + header_tag["id"] = f'{header_id}.{h5_count}' + h5_count += 1 + else: + header_tag["id"] = f'{header_id}' + h5_count = 1 + + annotation_id_list.append(header_tag.get("id")) + + if (self.file_no in ['27A', '09A'] and self.regex_pattern_obj.h2_part_pattern.search( + header_tag.text.strip()) and header_tag.name == "p") or \ + (self.regex_pattern_obj.h2_part_pattern.search( + header_tag.text.strip()) and header_tag.name == "p" and header_tag.text.strip().isupper()): + header_tag["class"] = "navhead" + if subtitle_nav_tag: + header_tag[ + "id"] = f'{subtitle_nav_tag.get("id")}p{self.regex_pattern_obj.h2_part_pattern.search(header_tag.text.strip()).group("id").zfill(2)}' + else: + header_tag[ + "id"] = f'{header_tag.find_previous({"h1", "h2"}).get("id")}p{self.regex_pattern_obj.h2_part_pattern.search(header_tag.text.strip()).group("id").zfill(2)}' + + if self.regex_pattern_obj.h2_subchapter_pattern.search(header_tag.text.strip()) and header_tag.name == "p": + header_tag["class"] = "navhead" + header_tag[ + "id"] = f'{header_tag.find_previous("h2").get("id")}s{self.regex_pattern_obj.h2_subchapter_pattern.search(header_tag.text.strip()).group("id").zfill(2)}' + + if self.regex_pattern_obj.h2_subtitle_pattern.search(header_tag.text.strip()) and header_tag.name == "p": + subtitle_nav_tag = header_tag + header_tag["class"] = "navhead" + header_tag[ + "id"] = f'{header_tag.find_previous("h1").get("id")}s{self.regex_pattern_obj.h2_subtitle_pattern.search(header_tag.text.strip()).group("id").zfill(2)}' + + def add_anchor_tags(self): + super(VTParseHtml, self).add_anchor_tags() + for li_tag in self.soup.find_all("li"): + if not li_tag.get("id") and re.search(r'^Part \d+\.', li_tag.text.strip()): + chap_num = re.search(r'^Part (?P<id>\d+)\.', li_tag.text.strip()).group("id") + self.c_nav_count += 1 + self.set_chapter_section_id(li_tag, chap_num, + sub_tag="p", + prev_id=li_tag.find_previous("h1").get("id"), + cnav=f'cnav{self.c_nav_count:02}') + + elif not li_tag.get("id") and re.search(r'^Executive Order No\. (?P<id>\d+-\d+)', li_tag.text.strip()): + chap_num = re.search(r'^Executive Order No\. 
(?P<id>\d+-\d+)', li_tag.text.strip()).group("id")
+                self.c_nav_count += 1
+                self.set_chapter_section_id(li_tag, chap_num,
+                                            sub_tag="s",
+                                            prev_id=li_tag.find_previous("h2").get("id"),
+                                            cnav=f'cnav{self.c_nav_count:02}')
+
+            if int(self.release_number) <= 83 and self.file_no in ['27A', '09A'] and \
+                    li_tag.a.text.strip() in ['1.', '2.', '3.', '4.', '2A.', '4A.', '5.', '6.', '7.', '8.', '9.']:
+                li_tag["id"] = str(li_tag.get("id")).replace('c', 'a')
+                li_tag.a["href"] = str(li_tag.a.get("href")).replace('c', 'a')
+
+    def convert_paragraph_to_alphabetical_ol_tags(self):
+        """
+            For each tag which has to be converted to an ordered list (<ol>)
+            - create new <ol> tags with appropriate type (1, A, i, a ..)
+            - get previous header's id to set unique id for each list item (<li>)
+            - append each li to respective ol accordingly
+        """
+        main_sec_alpha = 'a'
+        inner_alpha = 'a'
+        num_count = 1
+        inner_num_count = 1
+        num_ol = self.soup.new_tag("ol")
+        roman_ol = self.soup.new_tag("ol", type="i")
+        sec_alpha_ol = self.soup.new_tag("ol", type="a")
+        num_ol1 = self.soup.new_tag("ol")
+        inner_sec_alpha_ol = self.soup.new_tag("ol", type="a")
+        inner_num_ol = self.soup.new_tag("ol")
+        cap_alpha_ol = self.soup.new_tag("ol", type="A")
+        cap_roman_ol = self.soup.new_tag("ol", type="I")
+        cap_roman_ol1 = self.soup.new_tag("ol", type="I")
+        alpha_ol1 = self.soup.new_tag("ol", type="a")
+        cap_alpha_ol1 = self.soup.new_tag("ol", type="A")
+        ol_count = 1
+        sec_alpha_cur_tag = None
+        num_cur_tag1 = None
+        cap_alpha_cur_tag1 = None
+        cap_alpha = 'A'
+        cap_alpha1 = 'A'
+        cap_alpha2 = 'A'
+        small_roman = "i"
+        cap_rom = "I"
+        inner_cap_rom = "I"
+        sec_alpha_id = None
+        prev_id1 = None
+        num_id = None
+        cap_alpha_id = None
+        cap_alpha_id1 = None
+        num_tag = None
+        previous_li_tag = None
+        cap_roman_cur_tag = None
+        num_id1 = None
+        inner_num_cur_tag = None
+        cap_roman_cur_tag1 = None
+        cap_alpha_cur_tag = None
+        roman_cur_tag = None
+        prev_id_rom = None
+        inner_sec_alpha_id = None
+
+        for p_tag in self.soup.body.find_all(['h3', 'h4', 'h5', 'p']):
+            current_tag_text = p_tag.text.strip()
+            if p_tag.i:
+                p_tag.i.unwrap()
+            if re.search(rf'^\({small_roman}\)', current_tag_text) and cap_alpha_cur_tag1:
+                p_tag.name = "li"
+                roman_cur_tag = p_tag
+                cap_rom = "I"
+                if re.search(r'^\(i\)', current_tag_text):
+                    roman_ol = self.soup.new_tag("ol", type="i")
+                    p_tag.wrap(roman_ol)
+                    prev_class = p_tag.find_previous({'h4', 'h3'}).get("class")
+                    if prev_class == ['subsection']:
+                        if sec_alpha_cur_tag:
+                            sec_alpha_cur_tag.append(roman_ol)
+                            prev_id1 = sec_alpha_cur_tag.get('id')
+                            p_tag["id"] = f'{prev_id1}i'
+                            p_tag.string = re.sub(r'^\(i\)', '', current_tag_text)
+                            main_sec_alpha = 'j'
+                        elif num_tag:
+                            num_tag.append(roman_ol)
+                            prev_id1 = num_tag.get('id')
+                            p_tag["id"] = f'{prev_id1}i'
+                            p_tag.string = re.sub(r'^\(i\)', '', current_tag_text)
+                        else:
+                            prev_id1 = f"{p_tag.find_previous('h4', class_='subsection').get('id')}ol{ol_count}"
+                            prev_id1 = f'{prev_id1}'
+                            p_tag["id"] = f'{prev_id1}i'
+                            p_tag.string = re.sub(r'^\(i\)', '', current_tag_text)
+                    else:
+                        prev_li = p_tag.find_previous("li")
+                        prev_li.append(roman_ol)
+                        prev_id1 = prev_li.get("id")
+                        p_tag["id"] = f'{prev_li.get("id")}i'
+                        p_tag.string = re.sub(r'^\(i\)', '', current_tag_text)
+                else:
+                    roman_ol.append(p_tag)
+                    rom_head = re.search(r'^\((?P<rom>[ivx]+)\)', current_tag_text).group("rom")
+                    p_tag["id"] = f'{prev_id1}{rom_head}'
+                    p_tag.string = re.sub(r'^\([ivx]+\)', '', current_tag_text)
+                small_roman = roman.toRoman(roman.fromRoman(small_roman.upper()) + 
1).lower() + + if re.search(rf'^\([ivx]+\)\s*\(I\)', current_tag_text): + cap_roman_ol = self.soup.new_tag("ol", type="I") + li_tag = self.soup.new_tag("li") + li_tag.string = re.sub(r'^\([ivx]+\)\s*\(I\)', '', current_tag_text) + cap_roman_cur_tag = li_tag + cur_tag1 = re.search(r'^\((?P<cid>[ivx]+)\)\s*\((?P<pid>I)\)', current_tag_text) + prev_id1 = f'{cap_alpha_cur_tag1.get("id")}{cur_tag1.group("cid")}' + li_tag["id"] = f'{cap_alpha_cur_tag1.get("id")}{cur_tag1.group("cid")}{cur_tag1.group("pid")}' + cap_roman_ol.append(li_tag) + p_tag.string = "" + p_tag.append(cap_roman_ol) + previous_li_tag = p_tag + + elif re.search(r'^\d{0,2}\.\d+(\.\d+)*', current_tag_text) and p_tag.name == 'p': + p_tag.name = "li" + num_tag = p_tag + main_sec_alpha = 'a' + + prev_h3 = re.search(r'\d+([a-b])*$', p_tag.find_previous("h3").get("id").strip()).group() + if re.search(rf'^1\.0|{prev_h3}\.(0|1)', current_tag_text): + num_ol = self.soup.new_tag("ol") + p_tag.wrap(num_ol) + else: + num_ol.append(p_tag) + prev_num_id = f"{p_tag.find_previous({'h5', 'h4', 'h3', 'h2', 'h1'}).get('id')}ol{ol_count}" + num_id = re.search(r'^(?P<n_id>\d{0,2}\.\d+(\.\d+)*)', current_tag_text).group("n_id") + p_tag["id"] = f'{prev_num_id}{num_id}' + p_tag.string = re.sub(r'^\d{0,2}\.\d+\.*(\d+)*', '', p_tag.text.strip()) + previous_li_tag = p_tag + + elif re.search(rf'^\({main_sec_alpha}\)', current_tag_text): + p_tag.name = "li" + sec_alpha_cur_tag = p_tag + num_count = 1 + cap_alpha_cur_tag1 = None + + if re.search(r'^\(a\)', current_tag_text): + sec_alpha_ol = self.soup.new_tag("ol", type="a") + p_tag.wrap(sec_alpha_ol) + sec_alpha_id = f"{p_tag.find_previous({'h5', 'h4', 'h3', 'h2'}).get('id')}ol{ol_count}" + if num_tag: + sec_alpha_id = num_tag.get('id') + num_tag.append(sec_alpha_ol) + elif cap_roman_cur_tag1: + sec_alpha_id = cap_roman_cur_tag1.get('id') + cap_roman_cur_tag1.append(sec_alpha_ol) + else: + sec_alpha_ol.append(p_tag) + p_tag["id"] = f'{sec_alpha_id}{main_sec_alpha}' + p_tag.string = re.sub(rf'^\({main_sec_alpha}\)', '', current_tag_text) + main_sec_alpha = chr(ord(main_sec_alpha) + 1) + + if re.search(rf'^\([a-z]\)\s*\(\d+\)', current_tag_text): + num_ol1 = self.soup.new_tag("ol") + li_tag = self.soup.new_tag("li") + li_tag.string = re.sub(r'^\([a-z]\)\s*\(\d+\)', '', current_tag_text) + li_tag.append(current_tag_text) + num_cur_tag1 = li_tag + cur_tag = re.search(r'^\((?P<cid>[a-z])\)\s*\((?P<pid>\d+)\)', current_tag_text) + num_id1 = f'{sec_alpha_id}{cur_tag.group("cid")}' + sec_alpha_id = f'{sec_alpha_id}{cur_tag.group("cid")}' + li_tag["id"] = f'{num_id1}{cur_tag.group("pid")}' + num_ol1.append(li_tag) + p_tag.string = "" + p_tag.append(num_ol1) + num_count = 2 + cap_alpha1 = 'A' + + if re.search(r'^\([a-z]\)\s*\(\d+\)\s?\(A\)', current_tag_text): + cap_alpha_ol1 = self.soup.new_tag("ol", type="A") + inner_li_tag = self.soup.new_tag("li") + inner_li_tag.string = re.sub(r'^\([a-z]\)\s*\(\d+\)\s?\(A\)', '', current_tag_text) + inner_li_tag.append(current_tag_text) + cap_alpha_cur_tag1 = inner_li_tag + cur_tag = re.search(r'^\((?P<cid>[a-z])\)\s?\((?P<pid>\d+)\)\s\(?(?P<nid>A)\)',current_tag_text) + cap_alpha_id1 = f'{num_cur_tag1.get("id")}{cur_tag.group("pid")}' + inner_li_tag["id"] = f'{num_cur_tag1.get("id")}{cur_tag.group("pid")}{cur_tag.group("nid")}' + cap_alpha_ol1.append(inner_li_tag) + num_cur_tag1.string = "" + num_cur_tag1.append(cap_alpha_ol1) + cap_alpha1 = 'B' + previous_li_tag = p_tag + + elif re.search(rf'^{inner_alpha}\.', current_tag_text): + p_tag.name = "li" + inner_sec_alpha_tag = 
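The roman-numeral counters (`small_roman`, `cap_rom`, `inner_cap_rom`) are advanced with the `roman` package rather than by string tricks: round-trip through an integer with `fromRoman`/`toRoman`, then lowercase the result for the small series. The same step in isolation:

    import roman

    small = 'i'
    for _ in range(3):
        small = roman.toRoman(roman.fromRoman(small.upper()) + 1).lower()
        print(small)   # ii, then iii, then iv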
p_tag + if re.search(r'^a\.', current_tag_text): + inner_sec_alpha_ol = self.soup.new_tag("ol", type="a") + p_tag.wrap(inner_sec_alpha_ol) + inner_sec_alpha_id = f"{p_tag.find_previous({'h5', 'h4', 'h3', 'h2'}).get('id')}ol{ol_count}" + if inner_num_cur_tag: + inner_sec_alpha_id = inner_num_cur_tag.get('id') + inner_num_cur_tag.append(inner_sec_alpha_ol) + else: + inner_sec_alpha_ol.append(p_tag) + p_tag["id"] = f'{inner_sec_alpha_id}{inner_alpha}' + p_tag.string = re.sub(rf'^{inner_alpha}\.', '', current_tag_text) + inner_alpha = chr(ord(inner_alpha) + 1) + previous_li_tag = p_tag + + elif re.search(rf'^\({num_count}\)', current_tag_text) and p_tag.name == "p": + p_tag.name = "li" + num_cur_tag1 = p_tag + cap_alpha1 = 'A' + cap_alpha2 = 'A' + + if re.search(r'^\(1\)', current_tag_text): + num_ol1 = self.soup.new_tag("ol") + p_tag.wrap(num_ol1) + if sec_alpha_cur_tag: + num_id1 = sec_alpha_cur_tag.get('id') + sec_alpha_cur_tag.append(num_ol1) + elif inner_num_cur_tag: + num_id1 = inner_num_cur_tag.get('id') + inner_num_cur_tag.append(num_ol1) + elif cap_roman_cur_tag1: + num_id1 = cap_roman_cur_tag1.get("id") + cap_roman_cur_tag1.append(num_ol1) + else: + num_id1 = f"{p_tag.find_previous(['h5', 'h4', 'h3', 'h2']).get('id')}ol{ol_count}" + else: + num_ol1.append(p_tag) + p_tag["id"] = f'{num_id1}{num_count}' + p_tag.string = re.sub(rf'^\({num_count}\)', '', current_tag_text) + num_count += 1 + + if re.search(rf'^\(\d+\)\s*\([A-Z]\)', current_tag_text): + cap_alpha_ol1 = self.soup.new_tag("ol", type='A') + li_tag = self.soup.new_tag("li") + li_tag.string = re.sub(r'^\(\d+\)\s*\([A-Z]\)', '', current_tag_text) + li_tag.append(current_tag_text) + cap_alpha_cur_tag1 = li_tag + cur_tag = re.search(r'^\((?P<cid>\d+)\)\s*\((?P<pid>[A-Z])\)', current_tag_text) + if sec_alpha_cur_tag: + cap_alpha_id1 = f'{sec_alpha_cur_tag.get("id")}{cur_tag.group("cid")}' + li_tag["id"] = f'{sec_alpha_cur_tag.get("id")}{cur_tag.group("cid")}{cur_tag.group("pid")}' + else: + cap_alpha_id1 = f'{p_tag.find_previous({"h5", "h4", "h3", "h2"}).get("id")}ol{ol_count}{cur_tag.group("cid")}' + + li_tag[ + "id"] = f'{p_tag.find_previous({"h5", "h4", "h3", "h2"}).get("id")}ol{ol_count}{cur_tag.group("cid")}{cur_tag.group("pid")}' + cap_alpha_ol1.append(li_tag) + p_tag.string = "" + p_tag.append(cap_alpha_ol1) + cap_alpha1 = 'B' + + if re.search(r'^\(\d+\)\s*\([A-Z]\)\s*\(i\)', current_tag_text): + roman_ol = self.soup.new_tag("ol", type="i") + inner_li_tag = self.soup.new_tag("li") + inner_li_tag.string = re.sub(r'^\(\d+\)\s*\([A-Z]\)\s*\(i\)', '', current_tag_text) + roman_cur_tag = inner_li_tag + cur_tag = re.search(r'^\((?P<cid>\d+)\)\s?\((?P<pid>[A-Z])\)\s*\(?(?P<nid>i)\)',current_tag_text) + prev_id1 = f'{num_cur_tag1.get("id")}{cur_tag.group("pid")}' + inner_li_tag["id"] = f'{num_cur_tag1.get("id")}{cur_tag.group("pid")}{cur_tag.group("nid")}' + roman_ol.append(inner_li_tag) + cap_alpha_cur_tag1.string = "" + cap_alpha_cur_tag1.append(roman_ol) + small_roman = 'ii' + cap_rom = "I" + previous_li_tag = p_tag + + elif re.search(rf'^{inner_num_count}\.', current_tag_text) and p_tag.name == "p": + p_tag.name = "li" + inner_num_cur_tag = p_tag + num_count = 1 + inner_alpha = 'a' + + if re.search(r'^1\.', current_tag_text): + inner_num_ol = self.soup.new_tag("ol") + p_tag.wrap(inner_num_ol) + if cap_roman_cur_tag1: + cap_roman_cur_tag1.append(inner_num_ol) + num_id = cap_roman_cur_tag1.get('id') + elif cap_alpha_cur_tag: + cap_alpha_cur_tag.append(inner_num_ol) + num_id = cap_alpha_cur_tag.get('id') + else: + num_id = 
f"{p_tag.find_previous(['h5', 'h4', 'h3', 'h2']).get('id')}ol{ol_count}" + + else: + inner_num_ol.append(p_tag) + + p_tag["id"] = f'{num_id}{inner_num_count}' + p_tag.string = re.sub(rf'^{inner_num_count}\.', '', current_tag_text) + inner_num_count += 1 + previous_li_tag = p_tag + + elif re.search(rf'^\({cap_alpha2}{cap_alpha2}\)', current_tag_text): + p_tag.name = "li" + cap_alpha_ol1.append(p_tag) + p_tag_id = re.search(rf'^\((?P<p_id>{cap_alpha2}{cap_alpha2})\)', current_tag_text).group('p_id') + p_tag["id"] = f'{cap_alpha_id1}{p_tag_id}' + p_tag.string = re.sub(rf'^\({cap_alpha2}{cap_alpha2}\)', '', current_tag_text) + cap_alpha2 = chr(ord(cap_alpha2) + 1) + previous_li_tag = p_tag + + elif re.search(rf'^{cap_alpha}\.', current_tag_text) and p_tag.name == "p": + inner_num_count = 1 + p_tag.name = "li" + cap_alpha_cur_tag = p_tag + + if re.search(r'^A\.', current_tag_text): + cap_alpha_ol = self.soup.new_tag("ol", type="A") + p_tag.wrap(cap_alpha_ol) + if cap_roman_cur_tag1: + cap_roman_cur_tag1.append(cap_alpha_ol) + cap_alpha_id = cap_roman_cur_tag1.get("id") + else: + cap_alpha_id = f"{p_tag.find_previous(['h5', 'h4', 'h3', 'h2']).get('id')}ol{ol_count}" + else: + cap_alpha_ol.append(p_tag) + p_tag["id"] = f'{cap_alpha_id}-{cap_alpha}' + p_tag.string = re.sub(rf'^{cap_alpha}\.', '', current_tag_text) + if cap_alpha == 'Z': + cap_alpha = 'A' + else: + cap_alpha = chr(ord(cap_alpha) + 1) + previous_li_tag = p_tag + + elif re.search(rf'^\({cap_rom}\)', current_tag_text) and p_tag.name == "p" \ + and cap_alpha1 not in ['I', 'V', 'X'] and p_tag.get("class") != "casenote": + p_tag.name = "li" + cap_roman_cur_tag = p_tag + if re.search(r'^\(I\)', current_tag_text): + cap_roman_ol = self.soup.new_tag("ol", type="I") + p_tag.wrap(cap_roman_ol) + roman_cur_tag.append(cap_roman_ol) + prev_id1 = roman_cur_tag.get('id') + else: + cap_roman_ol.append(p_tag) + p_tag["id"] = f'{prev_id1}{roman.fromRoman(cap_rom.upper())}' + p_tag.string = re.sub(r'^\([IVX]+\)', '', current_tag_text) + cap_rom = roman.toRoman(roman.fromRoman(cap_rom.upper()) + 1) + + if re.search(rf'^\([IVX]+\)\s*\(aa\)', current_tag_text): + alpha_ol1 = self.soup.new_tag("ol", type="a") + li_tag = self.soup.new_tag("li") + li_tag.string = re.sub(r'^\([IVX]+\)\s*\(aa\)', '', current_tag_text) + li_tag.append(current_tag_text) + cap_roman_cur_tag = li_tag + cur_tag = re.search(r'^\((?P<cid>[IVX]+)\)\s*\((?P<pid>aa)\)', current_tag_text) + li_tag["id"] = f'{cap_roman_cur_tag.get("id")}{cur_tag.group("pid")}' + alpha_ol1.append(li_tag) + p_tag.string = "" + p_tag.append(alpha_ol1) + previous_li_tag = p_tag + + elif re.search(rf'^{inner_cap_rom}\.', current_tag_text) and p_tag.name == "p" \ + and cap_alpha1 not in ['I', 'V', 'X'] and p_tag.get("class") != "casenote": + p_tag.name = "li" + cap_roman_cur_tag1 = p_tag + main_sec_alpha = 'a' + cap_alpha = "A" + inner_num_count = 1 + if re.search(r'^I\.', current_tag_text): + cap_roman_ol1 = self.soup.new_tag("ol", type="I") + p_tag.wrap(cap_roman_ol1) + prev_id_rom = f"{p_tag.find_previous({'h5', 'h4', 'h3', 'h2'}).get('id')}ol{ol_count}" + else: + cap_roman_ol1.append(p_tag) + p_tag["id"] = f'{prev_id_rom}{inner_cap_rom}' + p_tag.string = re.sub(r'^[IVX]+\.', '', current_tag_text) + inner_cap_rom = roman.toRoman(roman.fromRoman(inner_cap_rom.upper()) + 1) + previous_li_tag = p_tag + + elif re.search(rf'^\({cap_alpha1}\)', current_tag_text) and p_tag.name == "p": + cap_alpha2 = 'A' + p_tag.name = "li" + cap_alpha_cur_tag1 = p_tag + small_roman = "i" + if re.search(r'^\(A\)', 
current_tag_text): + cap_alpha_ol1 = self.soup.new_tag("ol", type="A") + p_tag.wrap(cap_alpha_ol1) + if num_cur_tag1: + num_cur_tag1.append(cap_alpha_ol1) + cap_alpha_id1 = num_cur_tag1.get("id") + else: + cap_alpha_id1 = f"{p_tag.find_previous(['h5', 'h4', 'h3', 'h2']).get('id')}ol{ol_count}" + else: + cap_alpha_ol1.append(p_tag) + p_tag["id"] = f'{cap_alpha_id1}{cap_alpha1}' + p_tag.string = re.sub(rf'^\({cap_alpha1}\)', '', current_tag_text) + if cap_alpha1 == 'Z': + cap_alpha1 = 'A' + else: + cap_alpha1 = chr(ord(cap_alpha1) + 1) + + if re.search(rf'^\([A-Z]\)\s*\([ivx]+\)', current_tag_text): + roman_ol = self.soup.new_tag("ol", type="i") + li_tag = self.soup.new_tag("li") + li_tag.string = re.sub(r'^\([A-Z]\)\s*\([ivx]+\)', '', current_tag_text) + li_tag.append(current_tag_text) + roman_cur_tag = li_tag + cur_tag = re.search(r'^\((?P<cid>[A-Z])\)\s*\((?P<pid>[ivx]+)\)', current_tag_text) + prev_id1 = f'{cap_alpha_cur_tag1.get("id")}' + li_tag["id"] = f'{cap_alpha_cur_tag1.get("id")}{cur_tag.group("pid")}' + roman_ol.append(li_tag) + p_tag.string = "" + p_tag.append(roman_ol) + small_roman = "ii" + cap_rom = "I" + + if re.search(r'^\([A-Z]\)\s*\([ivx]+\)\s*\([IVX]+\)', current_tag_text): + cap_roman_ol = self.soup.new_tag("ol", type="I") + inner_li_tag = self.soup.new_tag("li") + inner_li_tag.string = re.sub(r'^\([A-Z]\)\s*\([ivx]+\)\s*\([IVX]+\)', '', current_tag_text) + inner_li_tag.append(current_tag_text) + cap_roman_cur_tag = inner_li_tag + cur_tag = re.search(r'^\((?P<cid>[A-Z])\)\s?\((?P<pid>[ivx]+)\)\s\(?(?P<nid>I)\)', + current_tag_text) + prev_id1 = f'{roman_cur_tag.get("id")}{cur_tag.group("pid")}' + inner_li_tag["id"] = f'{roman_cur_tag.get("id")}{cur_tag.group("pid")}{cur_tag.group("nid")}' + cap_roman_ol.append(inner_li_tag) + roman_cur_tag.string = "" + roman_cur_tag.append(cap_roman_ol) + previous_li_tag = p_tag + + elif re.search(r'^\([a-z][a-z]\)', current_tag_text) and cap_roman_cur_tag: + p_tag.name = "li" + if re.search(r'^\(aa\)', current_tag_text): + alpha_ol1 = self.soup.new_tag("ol", type="a") + p_tag.wrap(alpha_ol1) + cap_roman_cur_tag.append(alpha_ol1) + elif alpha_ol1: + alpha_ol1.append(p_tag) + p_tag_id = re.search(r'^\((?P<p_id>[a-z][a-z])\)', current_tag_text).group('p_id') + p_tag["id"] = f'{cap_roman_cur_tag.get("id")}{p_tag_id}' + p_tag.string = re.sub(r'^\([a-z][a-z]\)', '', current_tag_text) + previous_li_tag = p_tag + + elif p_tag.get("class") == [self.tag_type_dict["ol_p"]] \ + and not re.search(r'^HISTORY:|^History', current_tag_text) and previous_li_tag: + if previous_li_tag: + previous_li_tag.append(p_tag) + + if re.search(r'^CASE NOTES|^HISTORY:', current_tag_text) or p_tag.name in ['h3', 'h4', 'h5']: + num_count = 1 + ol_count = 1 + inner_num_count = 1 + main_sec_alpha = 'a' + inner_alpha = 'a' + num_cur_tag1 = None + sec_alpha_cur_tag = None + cap_alpha1 = "A" + cap_alpha2 = "A" + cap_alpha = 'A' + inner_cap_rom = "I" + sec_alpha_id = None + num_tag = None + small_roman = "i" + alpha_ol1 = None + cap_alpha_cur_tag1 = None + cap_roman_cur_tag = None + previous_li_tag = None + cap_roman_cur_tag1 = None + inner_num_cur_tag = None + cap_alpha_cur_tag = None + print('ol tags added') + + def create_analysis_nav_tag(self): + if self.release_number in ['83', '82', '81']: + digit_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + inner_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + digit_id = None + num_tag = None + digit_tag = None + case_tag_id = None + for analysis_p_tag in self.soup.findAll('p', {'class': 
self.tag_type_dict['ol_p']}): + if re.search(r'^Analysis', analysis_p_tag.text.strip()): + rept_tag = re.split('\n', analysis_p_tag.text.strip()) + analysis_p_tag.clear() + for tag_text in rept_tag: + new_tag = self.soup.new_tag("p") + new_tag.string = tag_text + analysis_p_tag.append(new_tag) + if not re.search(r'Analysis', tag_text): + new_tag["class"] = "analysisnote" + analysis_p_tag.unwrap() + + for analysis_tag in self.soup.find_all("p", class_="analysisnote"): + if re.search(r'^\d+\.*|^-', analysis_tag.text.strip()): + analysis_tag.name = "li" + if re.search(r'^0\.5\.', analysis_tag.text.strip()): + num_tag = analysis_tag + digit_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + analysis_tag.wrap(digit_ul) + case_tag_id = f'#{analysis_tag.find_previous({"h3", "h2"}).get("id")}-annotation-0.5' + + elif re.search(r'^\d+\.*', analysis_tag.text.strip()): + digit_tag = analysis_tag + if re.search(r'^1\.', analysis_tag.text.strip()): + if num_tag: + digit_ul.append(analysis_tag) + else: + digit_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + analysis_tag.wrap(digit_ul) + else: + digit_ul.append(analysis_tag) + num_tag = None + p_tag_num = re.search(r'^(?P<num>\d+)', analysis_tag.text.strip()).group("num") + digit_id = f'#{analysis_tag.find_previous({"h3", "h2"}).get("id")}-annotation-{p_tag_num}' + case_tag_id = f'#{analysis_tag.find_previous({"h3", "h2"}).get("id")}-annotation-{p_tag_num}' + + elif re.search(r'^-', analysis_tag.text.strip()): + case_id1 = re.sub(r'[\W\d]', '', analysis_tag.text.strip()).lower() + case_tag_id = f"{digit_id}-{case_id1}" + if re.search(r'^\d', analysis_tag.find_previous("li").text.strip()): + inner_ul = self.soup.new_tag("ul", **{"class": "leaders"}) + analysis_tag.wrap(inner_ul) + digit_tag.append(inner_ul) + else: + inner_ul.append(analysis_tag) + anchor = self.soup.new_tag('a', href=case_tag_id) + anchor.string = analysis_tag.text + analysis_tag.string = '' + analysis_tag.append(anchor) + else: + super(VTParseHtml, self).create_case_note_analysis_nav_tag() + print("Case Notes nav created") + + def replace_tags_constitution(self): + super(VTParseHtml, self).replace_tags_constitution() + cap_roman = "I" + cap_alpha = None + cap_num = None + h5_alpha_id = None + h5_rom_id = None + cap_roman_tag = None + annotation_id = None + annotation_text_list: list = [] + annotation_id_list: list = [] + h5_count = 1 + h3_count = 1 + + for header_tag in self.soup.find_all(): + if int(self.release_number) <= 83: + if header_tag.get('class') and header_tag.get("class")[0] == self.tag_type_dict["head4_1"]: + if re.search(fr'^\d+\.', header_tag.text.strip()): + header_tag.name = "h5" + h5_num_text = re.search(r'^(?P<h5_id>\d+)\.', header_tag.text.strip()).group("h5_id") + h5_num_id = f"{header_tag.find_previous({'h3', 'h2'}).get('id')}-annotation-{h5_num_text}" + if h5_num_id in annotation_id_list: + header_tag['id'] = f'{h5_num_id}.{h5_count}' + h5_count += 1 + else: + h5_count = 1 + header_tag['id'] = h5_num_id + + annotation_id_list.append(h5_num_id) + elif re.search(r'^\*\d{1,2}\.', header_tag.text.strip()): + header_tag.name = "h5" + analysis_id = re.sub(r'[\W\d]', '', header_tag.text.strip()).lower() + header_tag['id'] = f"{h5_num_id}-{analysis_id}" + elif header_tag.get('class') and \ + header_tag.get("class")[0] == self.tag_type_dict["head1"] and header_tag.name == "p": + if h3_tag := re.search(r'^AMENDMENT (?P<id>[IVX]+)\.', header_tag.text.strip()): + header_tag.name = "h2" + header_tag['id'] = 
+    def replace_tags_constitution(self):
+        super(VTParseHtml, self).replace_tags_constitution()
+        cap_roman = "I"
+        cap_alpha = None
+        cap_num = None
+        h5_alpha_id = None
+        h5_rom_id = None
+        cap_roman_tag = None
+        annotation_id = None
+        annotation_text_list: list = []
+        annotation_id_list: list = []
+        h5_count = 1
+        h3_count = 1
+
+        for header_tag in self.soup.find_all():
+            if int(self.release_number) <= 83:
+                if header_tag.get('class') and header_tag.get("class")[0] == self.tag_type_dict["head4_1"]:
+                    if re.search(fr'^\d+\.', header_tag.text.strip()):
+                        header_tag.name = "h5"
+                        h5_num_text = re.search(r'^(?P<h5_id>\d+)\.', header_tag.text.strip()).group("h5_id")
+                        h5_num_id = f"{header_tag.find_previous({'h3', 'h2'}).get('id')}-annotation-{h5_num_text}"
+                        if h5_num_id in annotation_id_list:
+                            header_tag['id'] = f'{h5_num_id}.{h5_count}'
+                            h5_count += 1
+                        else:
+                            h5_count = 1
+                            header_tag['id'] = h5_num_id
+                        annotation_id_list.append(h5_num_id)
+                    elif re.search(r'^\*\d{1,2}\.', header_tag.text.strip()):
+                        header_tag.name = "h5"
+                        analysis_id = re.sub(r'[\W\d]', '', header_tag.text.strip()).lower()
+                        header_tag['id'] = f"{h5_num_id}-{analysis_id}"
+                elif header_tag.get('class') and \
+                        header_tag.get("class")[0] == self.tag_type_dict["head1"] and header_tag.name == "p":
+                    # ids follow the pattern {h1-id}-amendmentam<NN> / {h1-id}-amendment
+                    if h3_tag := re.search(r'^AMENDMENT (?P<id>[IVX]+)\.', header_tag.text.strip()):
+                        header_tag.name = "h2"
+                        header_tag['id'] = f'{header_tag.find_previous("h1").get("id")}-amendmentam{h3_tag.group("id").zfill(2)}'
+                        header_tag['class'] = "oneh2"
+                    else:
+                        header_tag.name = "h2"
+                        header_tag['id'] = f'{header_tag.find_previous("h1").get("id")}-amendment'
+
+            if header_tag.get('class') and header_tag.get("class")[0] == self.tag_type_dict["head4"]:
+                if re.search(r'^CASE NOTES$|^Analysis$|^ANNOTATIONS$', header_tag.text.strip()):
+                    cap_roman = "I"
+                    cap_roman_tag = None
+                elif re.search(rf'^{cap_roman}\.', header_tag.text.strip()) and \
+                        re.search(r'^ANNOTATIONS$', header_tag.find_previous("h4").text.strip()):
+                    header_tag.name = "h5"
+                    cap_roman_tag = header_tag
+                    h5_rom_text = re.search(r'^(?P<h5_id>[IVX]+)\.', header_tag.text.strip()).group("h5_id")
+                    h5_rom_id = f'{header_tag.find_previous({"h3", "h2", "h1"}).get("id")}-notetodecision-{h5_rom_text}'
+                    header_tag['id'] = h5_rom_id
+                    cap_alpha = 'A'
+                    cap_roman = roman.toRoman(roman.fromRoman(cap_roman.upper()) + 1)
+                elif cap_alpha and re.search(fr'^{cap_alpha}\.', header_tag.text.strip()):
+                    header_tag.name = "h5"
+                    h5_alpha_text = re.search(r'^(?P<h5_id>[A-Z]+)\.', header_tag.text.strip()).group("h5_id")
+                    h5_alpha_id = f"{h5_rom_id}-{h5_alpha_text}"
+                    header_tag['id'] = h5_alpha_id
+                    cap_alpha = chr(ord(cap_alpha) + 1)
+                    cap_num = 1
+                elif cap_num and re.search(fr'^{cap_num}\.', header_tag.text.strip()):
+                    header_tag.name = "h5"
+                    h5_num_text = re.search(r'^(?P<h5_id>\d+)\.', header_tag.text.strip()).group("h5_id")
+                    h5_num_id = f"{h5_alpha_id}-{h5_num_text}"
+                    header_tag['id'] = h5_num_id
+                    cap_num += 1
+                elif annotation_id and re.search(r'^—[a-zA-Z]+', header_tag.text.strip()):
+                    header_tag.name = "h5"
+                    tag_text = re.sub(r'[\W\s.]+', '', header_tag.text.strip()).lower()
+                    inner_head_id = f'{annotation_id}-{tag_text}'
+                    if inner_head_id in annotation_id_list:
+                        header_tag["id"] = f'{inner_head_id}.{h5_count}'
+                        h5_count += 1
+                    else:
+                        header_tag["id"] = f'{inner_head_id}'
+                        h5_count = 1
+                    annotation_id_list.append(inner_head_id)
+                else:
+                    annotation_text = re.sub(r'[\W\s]+', '', header_tag.text.strip()).lower()
+                    if annotation_text in annotation_text_list and \
+                            re.search(r'^ANNOTATIONS$', header_tag.find_previous("h4").text.strip()):
+                        header_tag.name = "h5"
+                        if cap_roman_tag:
+                            annotation_id = f'{cap_roman_tag.get("id")}-{annotation_text}'
+                        else:
+                            annotation_id = f'{header_tag.find_previous({"h3", "h2", "h1"}).get("id")}-notetodecision-{annotation_text}'
+                        if annotation_id in annotation_id_list:
+                            header_tag["id"] = f'{annotation_id}.{h5_count}'
+                            h5_count += 1
+                        else:
+                            header_tag["id"] = f'{annotation_id}'
+                            h5_count = 1
+                        annotation_id_list.append(annotation_id)
+
+            if int(self.release_number) >= 84 and re.search(r'^Analysis|^ANNOTATIONS', header_tag.text.strip()):
+                for tag in header_tag.find_next_siblings():
+                    if tag.get('class') and tag.get('class')[0] == self.tag_type_dict["head4"]:
+                        break
+                    else:
+                        tag["class"] = "casenote"
+                        annotation_text_list.append(re.sub(r'[\W\s]+', '', tag.text.strip()).lower())
+
+            if header_tag.name == "h3" and re.search(r'^§ 1401\.Boundaries$', header_tag.text.strip()):
+                tag_id = re.search(r'^§ (?P<id>1401)\.Boundaries$', header_tag.text.strip()).group("id")
+                header_tag["id"] = f'{header_tag.find_previous("h2", class_="twoh2").get("id")}s{tag_id}'
+
+            if int(self.release_number) <= 83 and re.search(r'vt\.html$', self.input_file_name) \
+                    and re.search(r'^\[.+]$', header_tag.text.strip()):
+                header_tag.name = "h3"
+                header_tag['id'] = f"{header_tag.find_previous('h2').get('id')}-s{h3_count:02}"
+                h3_count += 1
+
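+    # Anchor the constitution's index <li> entries. Roman-numbered entries
+    # (I., II., ...) take a two-letter sub_tag from the preceding label
+    # paragraph ("Chapter" -> "ch", "Section" -> "se", "Amendment" -> "am",
+    # defaulting to "ch"); plain numbers are treated as sections ("-s"); a
+    # bare "Amendments" entry links to the amendment heading created above.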
+    def add_anchor_tags_con(self):
+        for li_tag in self.soup.findAll("li"):
+            if not li_tag.get("id"):
+                if tag := re.search(r'^(?P<id>[IVX]+)\.', li_tag.text.strip()):
+                    chap_num = tag.group("id")
+                    self.c_nav_count += 1
+                    if li_tag.find_previous("p") and \
+                            re.search(r'^(Section\.?|Chapter|Sec\.|Article|Amendment)$',
+                                      li_tag.find_previous("p").text.strip()):
+                        s_tag = li_tag.find_previous("p").text.strip()[:2].lower()
+                    else:
+                        s_tag = 'ch'
+                    self.set_chapter_section_id(li_tag, chap_num,
+                                                sub_tag=s_tag,
+                                                prev_id=li_tag.find_previous({"h2", "h1"}).get("id"),
+                                                cnav=f'cnav{self.c_nav_count:02}')
+                elif tag := re.search(r'^(?P<id>\d+)\.', li_tag.text.strip()):
+                    chap_num = tag.group("id")
+                    if re.search(r'^Section$', li_tag.find_previous().text.strip()):
+                        # a fresh "Section" label starts a new nav list
+                        self.s_nav_count = 0
+                    self.s_nav_count += 1
+                    self.set_chapter_section_id(li_tag, chap_num,
+                                                sub_tag="-s",
+                                                prev_id=li_tag.find_previous("h2").get("id"),
+                                                cnav=f'cnav{self.s_nav_count:02}')
+                elif re.search(r'^Amendments$', li_tag.text.strip()):
+                    self.c_nav_count += 1
+                    self.set_chapter_section_id(li_tag, "amendment",
+                                                sub_tag="-",
+                                                prev_id=li_tag.find_previous("h1").get("id"),
+                                                cnav=f'cnav{self.c_nav_count:02}')
+
+    def wrap_inside_main_tag(self):
+        """wrap every tag between the nav tag and the end of the file inside a <main> tag"""
+        main_tag = self.soup.new_tag('main')
+        chap_nav = self.soup.find('nav')
+        h2_tag = self.soup.find("h2")
+        tag_to_wrap = h2_tag.find_previous_sibling()
+        if tag_to_wrap:
+            for tag in tag_to_wrap.find_next_siblings():
+                tag.wrap(main_tag)
+        # pull any remaining leading siblings that precede <main> into the
+        # nav element itself
+        for nav_tag in chap_nav.find_next_siblings():
+            if nav_tag.name != "main":
+                nav_tag.wrap(chap_nav)
+
+    def format_id(self, section_id, tag):
+        if int(self.release_number) <= 83:
+            if sec_id := re.search(
+                    r'(?P<id>\d+\w?)(\.)?\s?-\d+\w?\.\s*\[?(Repealed|Reserved|Redesignated|Omitted)\.?]?',
+                    tag.text.strip()):
+                return sec_id.group("id")
+            elif re.search(r'\.', section_id):
+                return re.sub(r'\.', '', section_id)
+            else:
+                return section_id
+        else:
+            return None
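+
+    # A minimal sketch of the pre-84 normalization in format_id() (sample
+    # inputs, not from the official RTF):
+    #   tag text "101-105. Repealed." -> "101"  (start of a repealed range)
+    #   section_id "1.2"              -> "12"   (dots stripped)
+    #   section_id "1401"             -> "1401" (plain headings pass through)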