lda2vec_run.py
# Author: Chris Moody <chrisemoody@gmail.com>
# License: MIT
# This simple example loads the newsgroups data from sklearn
# and trains an LDA-like model on it
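#
# Usage sketch (hedged): the environment variables below mirror the
# os.getenv() calls in this script; the exact invocation depends on your
# shell and on where the preprocessed data lives.
#
#   data_dir=../data/ CUDA_GPU=0 n_topics=20 n_units=300 python lda2vec_run.py
#
# The script expects vocab.pkl, corpus.pkl, flattened.npy, doc_ids.npy and
# vectors.npy inside data_dir, as produced by the corpus preprocessing step.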
import os
import os.path
import pickle
import time
import shelve
import chainer
from chainer import cuda
from chainer import serializers
import chainer.optimizers as O
import numpy as np
from lda2vec import utils
from lda2vec import prepare_topics, print_top_words_per_topic, topic_coherence
from lda2vec_model import LDA2Vec
gpu_id = int(os.getenv('CUDA_GPU', 0))
cuda.get_device(gpu_id).use()
print ("Using GPU " + str(gpu_id))
data_dir = os.getenv('data_dir', '../data/')
fn_vocab = '{data_dir:s}/vocab.pkl'.format(data_dir=data_dir)
fn_corpus = '{data_dir:s}/corpus.pkl'.format(data_dir=data_dir)
fn_flatnd = '{data_dir:s}/flattened.npy'.format(data_dir=data_dir)
fn_docids = '{data_dir:s}/doc_ids.npy'.format(data_dir=data_dir)
fn_vectors = '{data_dir:s}/vectors.npy'.format(data_dir=data_dir)
vocab = pickle.load(open(fn_vocab, 'rb'))
corpus = pickle.load(open(fn_corpus, 'rb'))
flattened = np.load(fn_flatnd)
doc_ids = np.load(fn_docids)
vectors = np.load(fn_vectors)
# Model Parameters
# Number of documents
n_docs = doc_ids.max() + 1
# Number of unique words in the vocabulary
n_vocab = flattened.max() + 1
# 'Strength' of the Dirichlet prior; 200.0 seems to work well
clambda = 200.0
# Number of topics to fit
n_topics = int(os.getenv('n_topics', 20))
batchsize = 4096
# Power for neg sampling
power = float(os.getenv('power', 0.75))
# Initialize with pretrained word vectors
pretrained = bool(int(os.getenv('pretrained', True)))
# Sampling temperature
temperature = float(os.getenv('temperature', 1.0))
# Number of dimensions in a single word vector
n_units = int(os.getenv('n_units', 300))
# Get the string representation for every compact key
words = corpus.word_list(vocab)[:n_vocab]
# How many tokens are in each document
doc_idx, lengths = np.unique(doc_ids, return_counts=True)
doc_lengths = np.zeros(doc_ids.max() + 1, dtype='int32')
doc_lengths[doc_idx] = lengths
# Count all token frequencies
tok_idx, freq = np.unique(flattened, return_counts=True)
term_frequency = np.zeros(n_vocab, dtype='int32')
term_frequency[tok_idx] = freq
for key in sorted(locals().keys()):
    val = locals()[key]
    if len(str(val)) < 100 and '<' not in str(val):
        print (key, val)
model = LDA2Vec(n_documents=n_docs, n_document_topics=n_topics,
                n_units=n_units, n_vocab=n_vocab, counts=term_frequency,
                n_samples=15, power=power, temperature=temperature)
if os.path.exists('lda2vec.hdf5'):
    print ("Reloading from saved")
    serializers.load_hdf5("lda2vec.hdf5", model)
if pretrained:
    model.sampler.W.data[:, :] = vectors[:n_vocab, :]
model.to_gpu()
optimizer = O.Adam()
optimizer.setup(model)
clip = chainer.optimizer.GradientClipping(5.0)
optimizer.add_hook(clip)
j = 0
epoch = 0
fraction = batchsize * 1.0 / flattened.shape[0]
progress = shelve.open('progress.shelve')
for epoch in range(200):
    data = prepare_topics(cuda.to_cpu(model.mixture.weights.W.data).copy(),
                          cuda.to_cpu(model.mixture.factors.W.data).copy(),
                          cuda.to_cpu(model.sampler.W.data).copy(),
                          words)
    top_words = print_top_words_per_topic(data)
    if j % 100 == 0 and j > 100:
        coherence = topic_coherence(top_words)
        # Use a separate loop variable so the global step counter j is not clobbered
        for t in range(n_topics):
            print (t, coherence[(t, 'cv')])
        kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
        progress[str(epoch)] = pickle.dumps(kw)
    data['doc_lengths'] = doc_lengths
    data['term_frequency'] = term_frequency
    np.savez('topics.pyldavis', **data)
    for d, f in utils.chunks(batchsize, doc_ids, flattened):
        t0 = time.time()
        optimizer.zero_grads()
        l = model.fit_partial(d.copy(), f.copy())
        prior = model.prior()
        loss = prior * fraction
        loss.backward()
        optimizer.update()
        msg = ("J:{j:05d} E:{epoch:05d} L:{loss:1.3e} "
               "P:{prior:1.3e} R:{rate:1.3e}")
        prior.to_cpu()
        loss.to_cpu()
        t1 = time.time()
        dt = t1 - t0
        rate = batchsize / dt
        logs = dict(loss=float(l), epoch=epoch, j=j,
                    prior=float(prior.data), rate=rate)
        print (msg.format(**logs))
        j += 1
    serializers.save_hdf5("lda2vec.hdf5", model)
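
# A minimal sketch of inspecting the saved 'topics.pyldavis.npz' archive with
# pyLDAvis after training. This is an assumption, not part of the original
# script: it presumes pyLDAvis is installed and that prepare_topics stores the
# keys pyLDAvis.prepare expects (topic_term_dists, doc_topic_dists, vocab),
# which this script supplements with doc_lengths and term_frequency above.
#
#   import numpy as np
#   import pyLDAvis
#   npz = np.load('topics.pyldavis.npz')
#   data = {key: npz[key] for key in npz.files}
#   data['vocab'] = data['vocab'].tolist()
#   vis = pyLDAvis.prepare(**data)
#   pyLDAvis.save_html(vis, 'lda2vec_topics.html')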