-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathprocess-schwab-2016.py
executable file
·114 lines (107 loc) · 3.98 KB
/
process-schwab-2016.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#!/usr/bin/python
# In Ubuntu or Fedora, pdftotext is part of the poppler-utils package.
# Take the PDF file containing Form 1099-B and run these commands:
# pdftotext -layout 1099-b.pdf 1099-b.txt
# python process-schwab-2016.py 1099-b.txt > 1099-b.csv
# python create-txf-2015.py 1099-b.csv > 1099-b.txf
import re
import sys
# Read the text file named by the first command-line argument into a list
# of lines.  Only newline characters are trimmed from the ends of each line;
# other whitespace is kept because the columns are located by character
# offset.
with open(sys.argv[1]) as f:
    content = [raw.strip('\n') for raw in f]

# Character offsets at which each column starts in the text file.
# columns1 is passed to splitLine() for the first row of a transaction and
# columns2 for the second row.
columns1 = (0, 70, 79, 112, 131, 151, 170)
columns2 = (0, 70, 95, 130)
def splitLine(line, columns):
    """Split one fixed-width text line into cleaned column entries.

    Args:
        line: One line of text laid out in columns.
        columns: Sequence of nominal start offsets, one per column, in
            ascending order.  The caller's sequence is not modified.

    Returns:
        A list of strings, one per column, each stripped of surrounding
        whitespace, of a leading '$' (and anything before it inside the
        column), and of thousands separators.  A value wrapped in
        parentheses is returned with a leading '-' instead.  When seven or
        more columns are given and the first column contains a space, the
        first column is split at its last space into two entries, so the
        result then has one more entry than ``columns``.
    """
    starts = list(columns)
    ncol = len(starts)
    wide = ncol >= 7
    if wide:
        # A '$' found at most four characters to the right of a nominal
        # start marks where that column really begins; move the boundary
        # there so the preceding column keeps any text that ran over.
        for i in range(1, ncol):
            dollar = line.find('$', starts[i], starts[i] + 5)
            if dollar >= 0:
                starts[i] = dollar
    entries = []
    for i in range(ncol):
        end = starts[i + 1] if i + 1 < ncol else len(line)
        entry = line[starts[i]:end]
        if i > 0:
            # Keep only what follows the currency sign, if there is one.
            dollar = entry.find('$')
            if dollar >= 0:
                entry = entry[dollar + 1:]
        elif wide:
            # The position of the first two columns in the first row of a
            # transaction overlaps slightly in the conversion to text.  The
            # second column only contains a single code without a space, so
            # everything after the last space is that code.
            entry = entry.rstrip()
            pos = entry.rfind(' ')
            if pos >= 0:
                entries.append(entry[:pos].strip())
                entry = entry[pos + 1:]
        entry = entry.strip().replace(',', '')
        if entry.startswith('(') and entry.endswith(')'):
            # Schwab indicates negative numbers with parentheses.
            entry = '-' + entry[1:-1]
        entries.append(entry)
    return entries
# Walk the text line by line and collect one record per transaction.
#
# box   - letter captured from the most recent "Box X checked" text;
#         starts out as 'A'.
# state - 0: outside a transaction table, lines are ignored
#         1: a table header or a "Security Subtotal" line was just seen;
#            the line that follows it is skipped
#         2: the next line is the first row of a transaction
#         3: the next line is the second row of a transaction
box = 'A'
boxPat = re.compile(r'Box ([A-F]) checked')
records = []
state = 0
for line in content:
    if line.startswith('YEAR-END SUMMARY INFORMATION IS NOT PROVIDED TO THE IRS'):
        # Nothing after this marker is parsed.
        break
    if line.startswith('FATCA Filing Requirement'):
        continue
    found = boxPat.search(line)
    if found:
        box = found.group(1)
    if line.startswith(('CUSIP Number / Symbol', 'Security Subtotal')):
        # Security Subtotal now has 2 lines, so it is handled like a header:
        # the line after it is skipped as well.
        state = 1
    elif line.startswith(('Total ', 'Please see the ')) or not state or not line:
        # Totals, footnotes, empty lines and anything outside a table end
        # the current run of transactions.
        state = 0
    elif state == 1:
        state = 2
    elif state == 2:
        firstRow = splitLine(line, columns1)
        state = 3
    else:
        secondRow = splitLine(line, columns2)
        ident = secondRow[0]
        slash = ident.find(' / ')
        if slash > 0:
            # Prefer the text after " / "; fall back to the text before it
            # when nothing follows the separator.
            secondRow[0] = ident[slash + 3:] or ident[:slash]
        records.append(firstRow + secondRow + [box])
        state = 2
# Emit the collected records as CSV lines on standard output, ordered by
# the identifier stored in r[8].
records.sort(key=lambda r: r[8])
# Leading quantity of the description, optionally followed by 'S'.
pat = re.compile(r'([0-9.]+)(S?) .*')
# The "20" of a four-digit year that follows a '/' and precedes a space.
pat2 = re.compile(r'/(20)\d\d ')
for r in records:
    match = pat.match(r[0])
    if match is None:
        # Fail with the offending text instead of an opaque AttributeError.
        raise ValueError('Cannot parse quantity from description: %r' % (r[0],))
    count = float(match.group(1))
    if match.group(2) == 'S':
        # A quantity suffixed with 'S' is written out as a negative count.
        count = -count
    match2 = pat2.search(r[8])
    if match2:
        # Shorten the first such four-digit year to two digits.
        r[8] = r[8][:match2.start(1)] + r[8][match2.end(1):]
    # From the instructions of Charles Schwab Form 1099-B:
    # Box 1f. Shows W for wash sale, C for collectibles, or D for market discount.
    # Box 1g. Shows the amount of nondeductible loss in a wash sale
    # transaction or the amount of accrued market discount. When the
    # sale of a debt instrument is a wash sale and has accrued market
    # discount, code "W" will be in box 1f and the amount of the wash
    # sale loss disallowed will be in box 1g. For details on wash
    # sales and market discount, see Schedule D (Form 1040)
    # instructions and Pub. 550.
    # Per the original author's note, r[4] contains the content of Box 1g.
    # NOTE(review): that note also said r[12] contains Box 1f, but the
    # parsing loop appends the "Box X checked" letter as the last element of
    # each record, which is index 12 - confirm which field carries Box 1f.
    #
    # A single parenthesised argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print('%s,%g,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s' % (
        r[8], count, r[2], r[9], r[3], r[4], r[5], r[6], r[7],
        r[11], r[12], r[1], r[0], r[10]))