diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml index 2dc89d6..1d45b2c 100644 --- a/.github/workflows/run-pytest.yml +++ b/.github/workflows/run-pytest.yml @@ -11,7 +11,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: ['3.8'] + python-version: ['3.10'] steps: - uses: actions/checkout@v3 @@ -44,3 +44,4 @@ jobs: uses: codecov/codecov-action@v5 with: files: ./coverage.xml + token: ${{ secrets.CODECOV_TOKEN }} diff --git a/casparser/parsers/mupdf.py b/casparser/parsers/mupdf.py index b3f77ce..83d61e8 100644 --- a/casparser/parsers/mupdf.py +++ b/casparser/parsers/mupdf.py @@ -52,9 +52,9 @@ def extract_blocks(page_dict): for block in grouped_blocks: lines = [] items = [] - if len(block.get("lines", [])) == 0: - continue - bbox = block["lines"][0]["bbox"] + bbox = [0, 0, 0, 0] + if len(block.get("lines", [])) > 0: + bbox = block["lines"][0]["bbox"] y0, y1 = bbox[1], bbox[3] for line in sorted(block["lines"], key=lambda x: x["bbox"][1]): if len(items) > 0 and not ( @@ -113,12 +113,10 @@ def parse_investor_info(page_dict, page_rect: fitz.Rect) -> InvestorInfo: name = None for block in blocks: for line in block["lines"]: - for span in line["spans"]: - if span["bbox"][0] > width / 3: - continue + for span in filter( + lambda x: x["bbox"][0] <= width / 3 and x["text"].strip() != "", line["spans"] + ): txt = span["text"].strip() - if txt == "": - continue if not email_found: if m := re.search(r"^\s*email\s+id\s*:\s*(.+?)(?:\s|$)", txt, re.I): email = m.group(1).strip() @@ -156,9 +154,9 @@ def group_similar_rows(elements_list: List[Iterator[Any]]): lines = [] for elements in elements_list: sorted_elements = list(sorted(elements, key=itemgetter(1, 0))) - if len(sorted_elements) == 0: - continue - y0, y1 = sorted_elements[0][1], sorted_elements[0][3] + y0, y1 = 0, 0 + if len(sorted_elements) > 0: + y0, y1 = sorted_elements[0][1], sorted_elements[0][3] items = [] for el in sorted_elements: x2, y2, x3, y3 = el[:4] diff --git a/casparser/parsers/pdfminer.py b/casparser/parsers/pdfminer.py index 806ffd9..7b9719c 100644 --- a/casparser/parsers/pdfminer.py +++ b/casparser/parsers/pdfminer.py @@ -22,7 +22,10 @@ def parse_investor_info(layout, width, height) -> InvestorInfo: [ x for x in layout - if isinstance(x, LTTextBoxHorizontal) and x.x1 < width / 1.5 and x.y1 > height / 2 + if isinstance(x, LTTextBoxHorizontal) + and x.x1 < width / 1.5 + and x.y1 > height / 2 + and x.get_text().strip() != "" ], key=lambda x: -x.y1, ) @@ -33,8 +36,6 @@ def parse_investor_info(layout, width, height) -> InvestorInfo: name = None for el in text_elements: txt = el.get_text().strip() - if txt == "": - continue if not email_found: if m := re.search(r"^\s*email\s+id\s*:\s*(.+?)(?:\s|$)", txt, re.I): email = m.group(1).strip() @@ -88,9 +89,9 @@ def group_similar_rows(elements_list: List[Iterator[LTTextBoxHorizontal]]): lines = [] for elements in elements_list: sorted_elements = list(sorted(elements, key=lambda x: (-x.y1, x.x0))) - if len(sorted_elements) == 0: - continue - y0, y1 = sorted_elements[0].y0, sorted_elements[0].y1 + y0, y1 = 0, 0 + if len(sorted_elements) > 0: + y0, y1 = sorted_elements[0].y0, sorted_elements[0].y1 items = [] for el in sorted_elements: if len(items) > 0 and not (