Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(pacer.email): plain text parsing of bankruptcy short description #865

Merged
91 changes: 55 additions & 36 deletions juriscraper/pacer/email.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

from lxml.html import HtmlElement

from juriscraper.AbstractSite import logger

from ..lib.string_utils import clean_string, convert_date_string, harmonize
from .docket_report import BaseDocketReport
from .reports import BaseReport
Expand Down Expand Up @@ -66,23 +68,20 @@ def data(self):
# Emails with attached images should be ignored.
if self.is_valid is False or self.tree is None or self.image_attached:
return {}

base = {
"court_id": self.court_id,
}
parsed = {
"appellate": self._is_appellate(),
"dockets": self._get_dockets(),
}
if self.content_type == "text/plain":
parsed = {
"appellate": self._is_appellate(),
"contains_attachments": self._contains_attachments_plain(),
"dockets": self._get_dockets(),
"email_recipients": self._get_email_recipients_plain(),
}
parsed["contains_attachments"] = self._contains_attachments_plain()
parsed["email_recipients"] = self._get_email_recipients_plain()
else:
parsed = {
"appellate": self._is_appellate(),
"contains_attachments": self._contains_attachments(),
"dockets": self._get_dockets(),
"email_recipients": self._get_email_recipients(),
}
parsed["contains_attachments"] = self._contains_attachments()
parsed["email_recipients"] = self._get_email_recipients()
return {**base, **parsed}

def _is_appellate(self) -> bool:
Expand Down Expand Up @@ -309,27 +308,29 @@ def _get_description(self, current_node: HtmlElement) -> str:
f"Can't get docket entry description, court: {self.court_id}"
)

def _get_description_plain(self):
def _get_description_plain(self) -> str:
"""Gets the docket text for plain email

:raises: Exception if description can't be parsed
:returns: Cleaned docket text
"""
email_body = self.tree.text_content()
regex = r"^.*?Docket Text:(.*?)electronically mailed to:"
find_description = re.findall(regex, email_body, re.DOTALL)
regex = r"^.*?Docket Text:(?P<descr>.*?)(The following document|electronically mailed to:)"
find_description = re.search(regex, email_body, re.DOTALL)

description = ""
if find_description:
splitlines = find_description[0].splitlines()
for index_line in range(len(splitlines)):
if "Notice has been" not in splitlines[index_line]:
# Build description line by line
description = f"{description} {splitlines[index_line]}"
else:
# Stop looking for description lines
for line in find_description.group("descr").splitlines():
if "Notice has been" in line:
break
description = clean_string(description)

# Build description line by line
description += f" {line}"
description = clean_string(description)

if description:
return description

raise Exception(
f"Can't get docket entry description for court: {self.court_id}"
)
Expand All @@ -352,20 +353,17 @@ def _contains_attachments_plain(self) -> bool:

:returns: True if it contains otherwise False.
"""

mail_body = self.tree.text_content()
regex = r"^.*?The following document\(s\) are associated with this transaction:(.*?)$"

find_attachments = re.findall(regex, mail_body, re.DOTALL)

associated_documents = 0
if find_attachments:
splitlines = find_attachments[0].splitlines()
for index_line in range(len(splitlines)):
if "Document description:" in splitlines[index_line]:
associated_documents = +1
if associated_documents <= 1:
return False
return True
for line in find_attachments[0].splitlines():
if "Document description:" in line:
associated_documents += 1

return associated_documents > 1

def _get_dockets(self) -> DocketType:
"""Get all the dockets mentioned in the notification.
Expand All @@ -380,15 +378,16 @@ def _get_dockets(self) -> DocketType:
dockets = []
if self.content_type == "text/plain":
docket_number = self._get_docket_number_plain()
# Cache the docket number for its later use.
self.docket_numbers.append(docket_number)

docket = {
"case_name": self._get_case_name_plain(),
"docket_number": docket_number,
"date_filed": None,
"docket_entries": self._get_docket_entries(),
}
dockets.append(docket)
# Cache the docket number for its later use.
self.docket_numbers.append(docket_number)
else:
dockets_table = self.tree.xpath(
"//table[contains(., 'Case Name:')]"
Expand Down Expand Up @@ -435,7 +434,7 @@ def _get_docket_entries(
description = self._get_description_plain()
if description is not None:
email_body = self.tree.text_content()
regex = r"view the document:[\r\n]+([^\r\n]+)"
regex = r"view the document:[\r\n\s]+([^\r\n]+)"
url = re.findall(regex, email_body)
if url:
document_url = url[0]
Expand Down Expand Up @@ -521,13 +520,22 @@ def _parse_bankruptcy_short_description(self, subject: str) -> str:
if len(self.docket_numbers) > 1:
# We don't have examples of bankruptcy multi-docket NEFs yet, so
# short_description parsing is not supported for them.
logger.error(
"Not parsing description for Bankruptcy Multi Docket NEF for court '%s'",
self.court_id,
extra={
"fingerprint": [
f"{self.court_id}-not-parsing-multi-docket-short-description"
]
},
)
return ""

short_description = ""
docket_number = self.docket_numbers[0]
case_name = self.case_names[0]

if self.court_id == "cacb" or self.court_id == "ctb":
if self.court_id in ["cacb", "ctb", "cob"]:
# In: 6:22-bk-13643-SY Request for courtesy Notice of Electronic Filing (NEF)
# Out: Request for courtesy Notice of Electronic Filing (NEF)
short_description = subject.split(docket_number)[-1]
Expand Down Expand Up @@ -566,6 +574,17 @@ def _parse_bankruptcy_short_description(self, subject: str) -> str:
# Out: Reply
short_description = subject.split(case_name)[-1]

else:
logger.error(
"Short description has no parsing for bankruptcy court '%s'",
self.court_id,
extra={
"fingerprint": [
f"{self.court_id}-not-parsing-short-description"
]
},
)

return short_description

def _parse_appellate_short_description(self, subject: str) -> str:
Expand Down
8 changes: 0 additions & 8 deletions tests/examples/pacer/nef/s3/almd_4_plain.txt
Original file line number Diff line number Diff line change
Expand Up @@ -166,11 +166,3 @@ Document description: Main Document
Original filename: suppressed
Electronic document Stamp:
[STAMP dcecfStamp_ID=973800458 [Date=3/15/2019] [FileNumber=5919796-0] [717bcae4fd13b40a3b62bca1ad6ea5dbd5646af9111fc1ed5a689cc000861edb09a3792ba796791f060e277a1399ce9af46cd936b516f2234c67bcae4154f601]]

Document description: Main Document
Original filename: suppressed
Electronic document Stamp:
[STAMP dcecfStamp_ID=973800458 [Date=3/15/2019] [FileNumber=5919796-0] [717bcae4fd13b40a3b62bca1ad6ea5dbd5646af9111fc1ed5a689cc000861edb09a3792ba796791f060e277a1399ce9af46cd936b516f2234c67bcae4154f601]]



26 changes: 26 additions & 0 deletions tests/examples/pacer/nef/s3/cob.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
"appellate": false,
"contains_attachments": true,
"court_id": "cob",
"dockets": [
{
"case_name": "Delta, LLC",
"date_filed": null,
"docket_entries": [
{
"date_filed": "2023-09-25",
"description": "Motion to Dismiss Case For Other Reasons the debtor entity does not have a need for bankruptcy protection Filed by FilerFirstname FilerLastName on behalf of Delta, LLC. (Attachments: (1) Proposed/Unsigned Order) (FilerLastName, FilerFirstname)",
"document_number": "27",
"document_url": "https://ecf.cob.uscourts.gov/doc1/038040602882?pdf_header=&magic_num=16522018&de_seq_num=95&caseid=516007",
"pacer_case_id": "516007",
"pacer_doc_id": "038040602882",
"pacer_magic_num": "16522018",
"pacer_seq_no": "95",
"short_description": "Motion to Dismiss Case"
}
],
"docket_number": "23-14130"
}
],
"email_recipients": []
}
71 changes: 71 additions & 0 deletions tests/examples/pacer/nef/s3/cob.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
Return-Path: <cmecf-reply@cob.uscourts.gov>
Received: from icmecf101.gtwy.uscourts.gov (icmecf101.gtwy.uscourts.gov [199.107.16.200])
by inbound-smtp.us-west-2.amazonaws.com with SMTP id uh1jdnaqtdqolk941bfo994tuc9a9p95j2r2a6o1
for recipient@recap.email;
Mon, 25 Sep 2023 20:33:06 +0000 (UTC)
X-SES-Spam-Verdict: PASS
X-SES-Virus-Verdict: PASS
Received-SPF: pass (spfCheck: domain of cob.uscourts.gov designates 199.107.16.200 as permitted sender) client-ip=199.107.16.200; envelope-from=cmecf-reply@cob.uscourts.gov; helo=icmecf101.gtwy.uscourts.gov;
Authentication-Results: amazonses.com;
spf=pass (spfCheck: domain of cob.uscourts.gov designates 199.107.16.200 as permitted sender) client-ip=199.107.16.200; envelope-from=cmecf-reply@cob.uscourts.gov; helo=icmecf101.gtwy.uscourts.gov;
dmarc=none header.from=cob.uscourts.gov;
X-SBRS: None
X-REMOTE-IP: 156.119.193.180
Received: from cobdb-rep.cob.gtwy.dcn ([156.119.193.180])
by icmecf101.gtwy.uscourts.gov with ESMTP; 25 Sep 2023 16:33:05 -0400
Received: from cobdb-rep.cob.gtwy.dcn (localhost.localdomain [127.0.0.1])
by cobdb-rep.cob.gtwy.dcn (8.14.7/8.14.7) with ESMTP id 38PKVmDr073308;
Mon, 25 Sep 2023 14:31:55 -0600
Received: (from ecf_web@localhost)
by cobdb-rep.cob.gtwy.dcn (8.14.7/8.14.4/Submit) id 38PKVh2Y073206;
Mon, 25 Sep 2023 14:31:43 -0600
Date: Mon, 25 Sep 2023 14:31:43 -0600
X-Authentication-Warning: cobdb-rep.cob.gtwy.dcn: ecf_web set sender to cmecf-reply@cob.uscourts.gov using -f
MIME-Version:1.0
From:cmecf-reply@cob.uscourts.gov
To:courtmail@cob.uscourts.gov
Message-Id:<48774699@cob.uscourts.gov>
Subject:23-14130-JGR Motion to Dismiss Case
Content-Type: text/plain

***NOTE TO PUBLIC ACCESS USERS*** Judicial Conference of the United States policy permits attorneys of record and parties in a case (including pro se litigants) to receive one free electronic copy of all documents filed electronically, if receipt is required by law or directed by the filer. PACER access fees apply to all other users. To avoid later charges, download a copy of each document during this first viewing. However, if the referenced document is a transcript, the free copy and 30-page limit do not apply.

U.S. Bankruptcy Court
District of Colorado

Notice of Electronic Filing
The following transaction was received from FirstName Last Name entered on 9/25/2023 at 2:31 PM MDT and filed on 9/25/2023

Case Name: Delta, LLC
Case Number: 23-14130-JGR https://ecf.cob.uscourts.gov/cgi-bin/DktRpt.pl?516007

Document Number: 27
Copy the URL address from the line below into the location bar of your Web browser to view the document: https://ecf.cob.uscourts.gov/doc1/038040602882?pdf_header=&magic_num=16522018&de_seq_num=95&caseid=516007

Docket Text:
Motion to Dismiss Case For Other Reasons the debtor entity does not have a need for bankruptcy protection Filed by FilerFirstname FilerLastName on behalf of Delta, LLC. (Attachments: # (1) Proposed/Unsigned Order) (FilerLastName, FilerFirstname)

The following document(s) are associated with this transaction:
Document description: Main Document
Original filename: Dismissal.pdf
Electronic document Stamp:
[STAMP bkecfStamp_ID=985638001 [Date=9/25/2023] [FileNumber=48774698-0] [ad540b5d97ef1738274ab45e91c33cb34d46c4ea12d40070860aa383a9c33d8bade5e4ceb25f28af8647ab34d0ce7085a93200a15270c5a5e51353a7a79edba9]]
Document description: Proposed/Unsigned Order
Original filename: C:\fakepath\Ex A - Proposed Order.pdf
Electronic document Stamp:
[STAMP bkecfStamp_ID=985638001 [Date=9/25/2023] [FileNumber=48774698-1] [06f97e6a23f90c93e04b24659af0d156b12d3694b68f527325fee1a54f101f3e5cf9587ff565009042203876d040b7e0bf418eeb94635534be99f134d37f999d]]


23-14130-JGR Notice will be electronically mailed to:
Recipient_FirstName, Recipient_LastName on behalf of Debtor Delta, LLC
recipient@example.com, recipient.example.com@recap.email

Trustee First name Trustee Last Name on behalf of U.S. Trustee US Trustee
trustee@usdoj.gov

US Trustee
USTPRegion19.DV.ECF@usdoj.gov



23-14130-JGR Notice will not be electronically mailed to:
Loading