save_for_mallet.py
'''
This example processes the term counts for each page of a volume, outputting
each page as a document for Mallet import.

For each input volume the process writes two files, train-<id>.txt and
infer-<id>.txt, where <id> is the volume title stripped to alphanumeric
characters. train-<id>.txt can be used to train a unigram LDA model on that
volume's pages. infer-<id>.txt combines pages into a sliding frame (e.g.
pages 1-3, 2-4, 3-5, etc.). Since page breaks are often independent of the
content of the book, this will hopefully allow us to smooth over the
individual quirks of any particular page and get more of a sense of topics
as they move through the book.
'''
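# For illustration (hypothetical values): with --frame-size 3, reaching page 5
# writes one training line, "page5 page5 <page 5 tokens>", and one inference
# line, "pages3to5 pages3to5 <tokens of pages 3, 4, and 5 joined together>".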
from __future__ import unicode_literals
import os
import argparse
from six import iteritems
from htrc_features import FeatureReader


def main():
parser = argparse.ArgumentParser()
parser.add_argument('input',
help='Path to document to parse',
nargs='*')
parser.add_argument('-f', '--frame-size', default=10, type=int,
help='Number of pages to use in sliding frame')
    parser.add_argument('-o', '--outpath', default="tmp", type=str,
                        help='Output directory')
args = parser.parse_args()
    for volinput in args.input:
        # Reset the sliding frame for each volume so pages from one volume
        # don't leak into the next volume's frames
        frame = []
        # Read the single volume in this json.bz2 file
        freader = FeatureReader(volinput)
        vol = next(freader.volumes())
# Remove special characters from title. This will allow us to name a file after it
clean_id = ''.join([char for char in vol.title if char.isalnum()])
        # Open files for training (doc = 1 page) and inference (doc = sliding
        # frame of pages); each volume gets its own pair of files, so plain
        # write mode is enough
        tfile = open(os.path.join(args.outpath, 'train-{}.txt'.format(clean_id)), 'w')
        inferfile = open(os.path.join(args.outpath, 'infer-{}.txt'.format(clean_id)), 'w')
for page in vol.pages():
all_terms = explode_terms(page)
# Data cleaning
all_terms = [clean(term) for term in all_terms]
all_terms = [term for term in all_terms if term]
# Make into string
pagetxt = " ".join(all_terms)
frame += [pagetxt]
while len(frame) > args.frame_size:
frame = frame[1:]
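            # Mallet's one-instance-per-line import format is
            # "<name> <label> <text...>"; the page/frame id is used for both
            # the name and the label here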
tfile.write('page{0} page{0} {1}\n'.format(page.seq, pagetxt))
inferfile.write('pages{0}to{1} pages{0}to{1} {2}\n'.format(page.seq+1-len(frame),
page.seq,
" ".join(frame)))
tfile.close()
inferfile.close()
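    # Note: the hand-rolled sliding frame above could also be expressed with
    # collections.deque(maxlen=args.frame_size), which evicts the oldest page
    # automatically on append, e.g.:
    #   from collections import deque
    #   frame = deque(maxlen=args.frame_size)
    #   frame.append(pagetxt)  # oldest entry is dropped once the frame is full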


def clean(s):
    # Strip special chars
    s = ''.join([char for char in s if char.isalnum()])
    # Drop short words (two characters or fewer)
    if len(s) <= 2:
        return False
    return s


def explode_terms(page):
# Access case-insensitive counts for non-POS-tagged terms
counts_dict = page.body.tokenlist.token_counts(case=False, pos=False)
# Explode {'word': count} to large list
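    # e.g. {'cat': 2, 'dog': 1} becomes ['cat', 'cat', 'dog']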
counts_list = []
for term, count in iteritems(counts_dict):
counts_list += [term] * count
return counts_list


def old():
    '''Legacy bulk-counting example, kept for reference only; it assumes a
    get_term_volume_counts mapping function is defined elsewhere.'''
    import bz2
    import glob
    # Get a list of json.bz2 files to read
    paths = glob.glob('data/*.json.bz2')
    paths = paths[0:4]  # Truncate list for example
    # Open file for writing results
    f = bz2.BZ2File('term_volume_counts.bz2', "w")
    # Start a feature reader with the paths and pass the mapping function
    feature_reader = FeatureReader(paths)
    results = feature_reader.multiprocessing(get_term_volume_counts)
    # Save the results
    for vol, result in results:
        for t, c in iteritems(result):  # six works on both Python 2 and 3
            s = "{0}\t{1}\t{2}\t{3}\n".format(vol[0], vol[1], t, c)
            f.write(s.encode('UTF-8'))
    f.close()
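

# Usage sketch (assumes the Mallet CLI is installed and on your PATH; the
# train-<id>.txt / infer-<id>.txt names follow the clean_id scheme above):
#   python save_for_mallet.py data/vol.json.bz2 -f 5 -o tmp
#   mallet import-file --input tmp/train-<id>.txt --output train.mallet --keep-sequence
#   mallet train-topics --input train.mallet --num-topics 20 \
#       --inferencer-filename inferencer.mallet
#   mallet import-file --input tmp/infer-<id>.txt --output infer.mallet \
#       --use-pipe-from train.mallet --keep-sequence
#   mallet infer-topics --inferencer inferencer.mallet --input infer.mallet \
#       --output-doc-topics frame-topics.txt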


if __name__ == '__main__':
main()