-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathheaders_para.py
146 lines (118 loc) · 5.78 KB
/
headers_para.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
from operator import itemgetter
'''
This code is adapted from an article I found on Medium. It extracts the text of a PDF and prefixes
each piece with an HTML-like tag: headers get tags such as <h2> and paragraphs get <p>.
From this output I could identify which entries are the headings, and I added them to a headers_list.
'''
def headers(doc):
    """Run the full pipeline on ``doc`` and return its tagged text.

    Chains the three helpers of this module: ``fonts`` collects the font
    usage and styles, ``font_tags`` maps sizes to tags, and ``headers_para``
    produces the list that is returned.

    :param doc: document passed unchanged to ``fonts`` and ``headers_para``
    :return: whatever ``headers_para`` returns for ``doc`` and the size tags
    """
    usage, style_map = fonts(doc)
    return headers_para(doc, font_tags(usage, style_map))
def fonts(doc, granularity=False):
    """Extracts fonts and their usage in PDF documents.

    Every span of every text block (``type == 0``) is counted under an
    identifier: the span size as a string by default, or
    ``"size_flags_font_color"`` when ``granularity`` is true.

    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: also use 'font', 'flags' and 'color' to discriminate text
    :type granularity: bool
    :rtype: [(identifier, count), (identifier, count)], dict
    :return: identifiers sorted by usage count (most used first), and a dict
        mapping each identifier to the style of the last span seen with it
    :raises ValueError: if the document contains no text spans at all
    """
    styles = {}
    font_counts = {}

    for page in doc:
        # PyMuPDF renamed getText -> get_text; prefer the current name and
        # fall back to the legacy camelCase alias so both API generations work.
        get_text = getattr(page, "get_text", None) or page.getText
        for block in get_text("dict")["blocks"]:
            if block['type'] != 0:  # only text blocks carry "lines"
                continue
            for line in block["lines"]:
                for span in line["spans"]:
                    if granularity:
                        identifier = "{0}_{1}_{2}_{3}".format(
                            span['size'], span['flags'], span['font'], span['color'])
                        styles[identifier] = {
                            'size': span['size'],
                            'flags': span['flags'],
                            'font': span['font'],
                            'color': span['color'],
                        }
                    else:
                        identifier = "{0}".format(span['size'])
                        styles[identifier] = {
                            'size': span['size'], 'font': span['font']}

                    # count the fonts usage
                    font_counts[identifier] = font_counts.get(identifier, 0) + 1

    # sorted() is stable, so identifiers with equal counts keep first-seen order
    font_counts = sorted(font_counts.items(), key=itemgetter(1), reverse=True)

    if not font_counts:
        raise ValueError("Zero discriminating fonts found!")

    return font_counts, styles
def font_tags(font_counts, styles):
    """Returns dictionary with font sizes as keys and tags as value.

    The size of the most used style (first entry of ``font_counts``) is
    treated as paragraph text and tagged ``<p>``. Larger sizes are tagged
    ``<h1>``, ``<h2>``, ... starting from the largest; smaller sizes are
    tagged ``<s1>``, ``<s2>``, ... starting from the largest of them.

    :param font_counts: (identifier, count) for all fonts occurring in document,
        most used first
    :type font_counts: list
    :param styles: all styles found in the document, keyed by identifier
    :type styles: dict
    :rtype: dict
    :return: all element tags based on font-sizes (keys are float sizes)
    """
    # style of the most used font by count is the paragraph style
    p_size = styles[font_counts[0][0]]['size']

    def size_of(identifier):
        # Prefer the recorded style size: granular identifiers have the form
        # "size_flags_font_color" and cannot be parsed with float(). Fall back
        # to parsing the identifier when no style entry exists for it.
        style = styles.get(identifier)
        if style is not None:
            return float(style['size'])
        return float(identifier)

    # Unique sizes, high to low, so the right integer is appended to each tag.
    # The set collapses several granular styles that share one size.
    font_sizes = sorted(
        {size_of(identifier) for identifier, _ in font_counts}, reverse=True)

    # aggregating the tags for each font size
    idx = 0
    size_tag = {}
    for size in font_sizes:
        idx += 1
        if size == p_size:
            idx = 0  # restart numbering for the sizes below paragraph size
            size_tag[size] = '<p>'
        elif size > p_size:
            size_tag[size] = '<h{0}>'.format(idx)
        elif size < p_size:
            size_tag[size] = '<s{0}>'.format(idx)

    return size_tag
def headers_para(doc, size_tag):
    """Scrapes headers & paragraphs from PDF and return texts with element tags.

    Spans whose text is only whitespace are ignored. Consecutive spans of the
    same size are joined with a space into one string that starts with the
    tag for that size; a pipe ("|") is appended after every line of a block.
    A string is pushed to the result whenever the span size changes and at
    the end of every text block, so the result can contain empty or
    pipe-only entries.

    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param size_tag: textual element tags for each size
    :type size_tag: dict
    :rtype: list
    :return: texts with prepended element tags
    """
    header_para = []  # list with headers and paragraphs
    first = True  # True until the first non-blank span has been seen
    previous_s = {}  # previous non-blank span

    for page in doc:
        # PyMuPDF renamed getText -> get_text; prefer the current name and
        # fall back to the legacy camelCase alias so both API generations work.
        get_text = getattr(page, "get_text", None) or page.getText
        for b in get_text("dict")["blocks"]:  # iterate through the text blocks
            if b['type'] != 0:  # skip blocks that do not contain text
                continue

            # REMEMBER: multiple fonts and sizes are possible IN one block
            block_string = ""  # text found in block
            for line in b["lines"]:  # iterate through the text lines
                for s in line["spans"]:  # iterate through the text spans
                    if not s['text'].strip():  # ignore whitespace-only spans
                        continue

                    if first:
                        first = False
                        block_string = size_tag[s['size']] + s['text']
                    elif s['size'] != previous_s['size']:
                        # size changed: flush what was collected, start anew
                        header_para.append(block_string)
                        block_string = size_tag[s['size']] + s['text']
                    elif not block_string.strip("|"):
                        # block_string is empty or only contains pipes, so no
                        # text was collected yet: start it with the size tag.
                        # One elif chain guarantees the text is added once;
                        # two independent ifs here used to add it twice.
                        block_string = size_tag[s['size']] + s['text']
                    else:  # in the same block, so concatenate strings
                        block_string += " " + s['text']

                    previous_s = s

                # end of a line, indicating with a pipe
                block_string += "|"

            header_para.append(block_string)

    return header_para