-
Notifications
You must be signed in to change notification settings - Fork 29
/
extract_candidates.py
120 lines (99 loc) · 3.58 KB
/
extract_candidates.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import re
from dateparser.search import search_dates
def get_invoice_nums(all_words):
inv_nums = []
invoice_no_re = r'^[0-9a-zA-Z-:]+$'
for word in all_words:
if not re.search('\d', word['text']):
continue
if len(word['text']) < 3:
continue
result = re.findall(invoice_no_re,word['text'])
if result:
inv_nums.append({
'text': word['text'],
'x1': word['left'],
'y1': word['top'],
'x2': word['left'] + word['width'],
'y2': word['top'] + word['height']
})
return inv_nums
def get_dates(all_text, all_words):
dates, all_dates = [], []
indices = []
matches = search_dates(all_text)
for match in matches:
text = match[0]
token_length = len(text.split(' '))
idx = all_text.find(match[0])
text_len = len(text)
index = len(all_text[:idx].strip().split(' '))
replaced_text = ' '.join(['*'*len(i) for i in text.split(' ')])
indices.append(list(range(index, index + token_length)))
index += token_length
all_text = all_text[:idx + text_len].replace(text, replaced_text) + all_text[idx + text_len:]
for date_indices in indices:
date = ''
left, top, right, bottom = [], [], [], []
for i in date_indices:
date += ' ' + all_words[i]['text']
left.append(all_words[i]['left'])
top.append(all_words[i]['top'])
right.append(all_words[i]['left'] + all_words[i]['width'])
bottom.append(all_words[i]['top'] + all_words[i]['height'])
all_dates.append({
'text': date.strip(),
'x1': min(left),
'y1': min(top),
'x2': max(right),
'y2': max(bottom)
})
return all_dates
def get_amounts(all_words):
amounts = []
amount_re = r"\$?([0-9]*,)*[0-9]{3,}(\.[0-9]+)?"
for word in all_words:
if not re.search(amount_re, word['text']):
continue
try:
formatted_word = re.sub(r'[$,]','', word['text'])
float(formatted_word)
amounts.append({
'text': word['text'],
'x1': word['left'],
'y1': word['top'],
'x2': word['left'] + word['width'],
'y2': word['top'] + word['height']
})
except ValueError:
continue
return amounts
def get_candidates(data):
all_words = []
for idx, word in enumerate(data['text']):
if word.strip() != "":
all_words.append({
'text': data['text'][idx],
'left': data['left'][idx],
'top': data['top'][idx],
'width': data['width'][idx],
'height': data['height'][idx]})
text = ' '.join([word['text'].strip() for word in all_words])
try:
invoice_date_candidates = get_dates(text,all_words)
except Exception as e:
invoice_date_candidates = []
try:
total_amount_candidates = get_amounts(all_words)
except Exception as e:
total_amount_candidates = []
try:
invoice_no_candidates = get_invoice_nums(all_words)
except Exception as e:
invoice_no_candidates = []
candidate_data = {
'invoice_no': invoice_no_candidates,
'invoice_date': invoice_date_candidates,
'total': total_amount_candidates
}
return candidate_data