Skip to content

Commit

Permalink
Merge pull request #895 from freelawproject/fix-connecticut
Browse files Browse the repository at this point in the history
fix(conn): Update Conn
  • Loading branch information
flooie authored Jan 30, 2024
2 parents 8dac8f0 + 56572bd commit 958bbef
Show file tree
Hide file tree
Showing 4 changed files with 1,036 additions and 1,029 deletions.
69 changes: 51 additions & 18 deletions juriscraper/opinions/united_states/state/conn.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,27 +26,60 @@ def __init__(self, *args, **kwargs):
self.year = date.today().strftime("%y")
self.url = f"http://www.jud.ct.gov/external/supapp/archiveAROsup{self.year}.htm"
self.status = "Published"
self.cipher = "AES256-SHA256"
self.set_custom_adapter(self.cipher)

@staticmethod
def find_published_date(date_str: str) -> str:
"""Extract published date
:param date_str: The row text
:return: The date string
"""
m = re.search(
r"(\b\d{1,2}/\d{1,2}/\d{2,4}\b)|(\b(?:January|February|March|April|May|June|July|August|September|October|November|December) \d{1,2}, \d{4}\b)",
date_str,
)
return m.groups()[0] if m.groups()[0] else m.groups()[1]

def extract_dockets_and_name(self, row) -> [str, str]:
"""Extract the docket and case name from each row
:param row: Row to process
:return: Docket(s) and Case Name
"""
text = " ".join(row.xpath("ancestor::li[1]//text()"))
clean_text = re.sub(r"[\n\r\t\s]+", " ", text)
m = re.match(
r"(?P<dockets>[SAC0-9, ]+)(?P<op_type> [A-Z].*)? - (?P<case_name>.*)",
clean_text,
)
if not m:
# Handle bad inputs
m = re.match(
r"(?P<dockets>[SAC0-9, ]+)(?P<op_type> [A-Z].*)? (?P<case_name>.*)",
clean_text,
)
op_type = m.group("op_type")
name = m.group("case_name")
if op_type:
name = f"{name} ({op_type.strip()})"
return m.group("dockets"), name

def _process_html(self) -> None:
"""Process the html and extract out the opinions
:return: None
"""
for row in self.html.xpath("//ul/preceding-sibling::p"):
if "Published in the Law Journal" not in row.text_content():
continue
date = row.text_content().split(" ")[-1][:-1]
for document in row.xpath("following-sibling::ul[1]/li"):
output_string = re.sub(
r"\s{2,}", " ", document.text_content()
).strip()
name = output_string.split("-")[-1].strip()
docket = document.xpath(".//a")[0].text_content().split()[0]
self.cases.append(
{
"date": date,
"url": document.xpath(".//a")[0].get("href"),
"docket": docket,
"name": name,
}
)
for row in self.html.xpath(".//*[contains(@href, '.pdf')]"):
pub = row.xpath('preceding::*[contains(., "Published")][1]/text()')
date = self.find_published_date(pub[0])
dockets, name = self.extract_dockets_and_name(row)
self.cases.append(
{
"url": row.get("href"),
"name": name,
"docket": dockets,
"date": date,
}
)
30 changes: 2 additions & 28 deletions juriscraper/opinions/united_states/state/connappct.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,37 +8,11 @@
- 2023-11-20, William Palin: Updated
"""

from datetime import date
from juriscraper.opinions.united_states.state import conn

from juriscraper.OpinionSiteLinear import OpinionSiteLinear


class Site(OpinionSiteLinear):
class Site(conn.Site):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.court_id = self.__module__
self.year = date.today().strftime("%y")
self.url = f"http://www.jud.ct.gov/external/supapp/archiveAROap{self.year}.htm"
self.status = "Published"

def _process_html(self) -> None:
"""Process the html and extract out the opinions
:return: None
"""
for date_section in self.html.xpath("//ul"):
b = date_section[0].xpath(".//preceding::b[1]")[0]
date = b.text_content().strip().split("of")[-1].strip()[:-1]
for li in date_section.xpath(".//li"):
link = li.xpath(".//a")[0]
link_text = link.text_content()
docket = link.text_content()
name = li.text_content().replace(link_text, "").strip()
self.cases.append(
{
"date": date,
"url": link.get("href"),
"docket": docket,
"name": name,
}
)
Loading

0 comments on commit 958bbef

Please sign in to comment.