mmeigenwords.py
# coding: utf-8
import os
from collections import Counter
from tqdm import tqdm
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.preprocessing import normalize
from sklearn.utils.extmath import randomized_svd
from sklearn.utils.extmath import safe_sparse_dot as ssdot
from corpus_io import read_corpus, count_word_dask
from construct_matrix import construct_matrix, construct_matrix_dask
import math_utils as mu
import block_matrix as bm
import named_array as na
from algorithm import randomized_ghep
from sammlung import DefaultOrderedDict
from embed_base import WordEmbedBase
from context import Context
class MMEigenwords(WordEmbedBase):
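    """Multimodal Eigenwords embedding.

    Learns word (and optionally context) vectors together with a linear
    mapping for image features by solving a single generalized eigenvalue
    problem built from word-context co-occurrence counts and a word-image
    association matrix (Wvi)."""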
def __init__(self, vocab_size=10000, window_size=4, dim=300,
oversampling=20, n_iter=3, squash='sqrt',
word_dict=None):
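        """`squash` selects the count-squashing function ('sqrt' or 'log1p'),
        `oversampling` and `n_iter` control the randomized eigensolver, and
        `word_dict` optionally supplies a pre-built vocabulary index."""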
self.corpus_path = []
self.vocab_size = vocab_size
self.window_size = window_size
self.dim = dim
self.oversampling = oversampling
self.n_iter = n_iter
self.squash = squash
self.wv = None
self.context = None
self.ev = None
self.image_mapping = None
self.mean_image = None
self.mapped_image = None
self.train_image_id = np.array([], dtype=str)
if squash == 'log1p':
self._squash = mu.log1p
elif squash == 'sqrt':
self._squash = mu.sqrt
else:
            raise ValueError('invalid squash option')
if word_dict is not None:
self.word_dict = word_dict
else:
self.word_dict = None
def load_corpus(self, path, images, Wvi, context=False, use_dask=False,
n_worker=-1, n_chunk=200, verbose=False): #TODO: implement dask option
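        """Build the vocabulary from the corpus at `path`, align the
        word-image association matrix `Wvi` to that vocabulary (vocabulary
        words without image tags get all-zero rows), and then call `train`
        on the tokenized corpus, `images`, and the aligned matrix."""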
verboseprint = lambda x: print(x) if verbose else None
verboseprint('Fixing Wvi')
mandatory_words = set()
if isinstance(Wvi, na.NamedArrBase):
mandatory_words = set(Wvi.names[0])
elif isinstance(Wvi, (pd.DataFrame, pd.SparseDataFrame)):
mandatory_words = set(Wvi.index)
else:
raise ValueError("Wvi must be one of named_array.NamedArray, pandas.DataFrame, or pandas.SparseDataFrame.")
if self.word_dict is None:
if use_dask:
word_count = count_word_dask(
path, self.vocab_size, n_workers=n_worker,
n_partition=n_chunk, verbose=verbose)
else:
word_count = Counter()
word_count.update(read_corpus(path, max_len=50000000))
mandatory_words.intersection_update(word_count.keys())
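            # Vocabulary index: slot 0 is reserved for <OOV>, the next
            # (vocab_size - 1) slots hold the most frequent corpus words, and
            # Wvi words not already included are appended after them.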
vocab2idx = DefaultOrderedDict(int, append_missing=False)
vocab2idx['<OOV>'] = 0
vocab2idx.update(
(word, i + 1) for i, word in enumerate(x[0] for x in word_count.most_common(self.vocab_size - 1)))
vocab2idx.update(
(word, i + self.vocab_size) for i, word in enumerate(mandatory_words.difference(vocab2idx.keys())))
self.word_dict = vocab2idx
            # Align Wvi to the vocabulary order, adding all-zero rows for
            # vocabulary words that have no image tags.
missing_words = set(vocab2idx.keys()).difference(mandatory_words)
if isinstance(Wvi, na.NamedArrBase):
missing_Wvi = sparse.csr_matrix((len(missing_words), Wvi.shape[1]), dtype=Wvi.dtype)
new_Wvi = Wvi[list(mandatory_words), :] # error without ":"
missing_Wvi = na.NamedArray(missing_Wvi, axis_names=[missing_words, None])
new_Wvi = na.vstack([new_Wvi, missing_Wvi], format='csr')
new_Wvi = new_Wvi[list(vocab2idx.keys()), ]
self.train_image_id = np.array(Wvi.names[1], dtype=str) # FIXME: what if it's missing?
elif isinstance(Wvi, pd.DataFrame):
missing_Wvi = np.zeros((len(missing_words), Wvi.shape[1]), dtype=Wvi.dtype)
new_Wvi = Wvi.loc[list(mandatory_words)]
                # materialize the set so pandas gets an ordered, list-like index
                missing_Wvi = pd.DataFrame(missing_Wvi, index=list(missing_words))
new_Wvi = pd.concat([new_Wvi, missing_Wvi]).loc[vocab2idx.keys()]
new_Wvi = new_Wvi.values
self.train_image_id = np.array(Wvi.columns, dtype=str)
elif isinstance(Wvi, pd.SparseDataFrame):
missing_Wvi = sparse.csr_matrix((len(missing_words), Wvi.shape[1]), dtype=Wvi.dtype)
new_Wvi = Wvi.loc[list(mandatory_words)]
                missing_Wvi = pd.SparseDataFrame(missing_Wvi, index=list(missing_words))
new_Wvi = pd.concat([new_Wvi, missing_Wvi]).loc[vocab2idx.keys()]
new_Wvi = new_Wvi.to_coo().tocsr()
self.train_image_id = np.array(Wvi.columns, dtype=str)
self.vocab_size = len(vocab2idx)
# show info
verboseprint(' Vocab size: %d'%self.vocab_size)
self.corpus_path.append(path)
self.train(path if use_dask else read_corpus(path, max_len=50000000, convert=self._tokens2idx),
images, new_Wvi, context,
use_dask, n_worker, n_chunk, verbose)
def train(self, tokens, images, Wvi, context=False,
use_dask=False, n_worker=-1, n_chunk=200, verbose=False):
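        """Build the squashed co-occurrence and image cross-term matrices,
        assemble the block matrices H and G, and solve the generalized
        eigenproblem to obtain word vectors, the image mapping, and
        (optionally) context vectors."""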
verboseprint = lambda x: print(x) if verbose else None
verboseprint('Constructing matrices...')
        if verbose and not use_dask:
tokens = tqdm(tokens)
if use_dask:
tVC, tVV_diag, tCC_diag = construct_matrix_dask(
tokens, self.window_size, self.vocab_size,
self._tokens2idx, n_worker, n_chunk, verbose)
else:
tVC, tVV_diag, tCC_diag = construct_matrix(tokens, self.window_size, self.vocab_size)
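        # Center the image features around their mean; the same mean is
        # reused later in map_image for out-of-sample images.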
self.mean_image = np.mean(images, axis=0, keepdims=True)
Xvis = images - self.mean_image
verboseprint('Squashing...')
tVC, tVV_diag, tCC_diag = self._squash_arrays(tVC, tVV_diag, tCC_diag)
verboseprint('Preparing arrays...')
n_tags_per_vocab = mu.sum(Wvi, axis=1)
tVWviXvis = ssdot(ssdot(sparse.diags(tVV_diag), Wvi), Xvis)
Gvv_diag = tVV_diag + tVV_diag * n_tags_per_vocab
Gvis = Xvis.T @ ssdot(sparse.diags(ssdot(Wvi.T, tVV_diag)), Xvis)
verboseprint('Calculating word vectors...')
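        # H collects the cross terms (word-context counts tVC and the
        # word-image cross term tVWviXvis) in a symmetric block matrix, and G
        # is the matching block-diagonal scaling; randomized_ghep solves the
        # generalized eigenproblem H x = lambda G x for `dim` eigenpairs.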
H = bm.block_sym_mat([[None, tVC, tVWviXvis],
[None, None, None],
[None, None, None]])
G = bm.block_diag_mat(
[sparse.diags(Gvv_diag), sparse.diags(tCC_diag), Gvis])
eigenvalues, A = randomized_ghep(H, G,
n_components=self.dim,
n_oversamples=self.dim + self.oversampling,
n_iter=self.n_iter)
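        # The [::-1] reversals below reorder the returned eigenpairs so that
        # the components associated with the largest eigenvalues come first
        # (assuming the solver returns them in ascending order).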
self.ev = eigenvalues[::-1]
self._set_keyedvector('wv', self.word_dict.keys(), self.dim,
vec=A[:self.vocab_size, ::-1])
self.image_mapping = A[-Xvis.shape[1]:, ::-1]
if context:
self.context = Context(A[self.vocab_size:-Xvis.shape[1], ::-1],
len(self.word_dict), self.window_size)
def map_image(self, images):
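        """Project new image features into the embedding space with the
        learned `image_mapping` (after subtracting the training mean) and
        index the normalized vectors under `self.mapped_image`."""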
if isinstance(images, pd.DataFrame):
image_ids = images.index.tolist()
images = images.values
elif isinstance(images, na.NamedArrBase):
image_ids = images.names[0]
self._set_keyedvector('mapped_image', image_ids, self.dim,
vec=normalize((images - self.mean_image) @ self.image_mapping))
def most_similar(self, pos_word=[], neg_word=[], pos_img=[], neg_img=[],
target="word", topn=10):
positive = []
negative = []
positive.extend(self.wv.word_vec(x, use_norm=True) for x in pos_word)
positive.extend(self.mapped_image.word_vec(x, use_norm=True) for x in pos_img)
negative.extend(self.wv.word_vec(x, use_norm=True) for x in neg_word)
negative.extend(self.mapped_image.word_vec(x, use_norm=True) for x in neg_img)
if target == "word":
return self.wv.most_similar(positive=positive, negative=negative, topn=topn)
elif target == "image":
return self.mapped_image.most_similar(positive=positive, negative=negative, topn=topn)
else:
raise ValueError("invalid target. target must be one of word or image")
def _save_meta_hook(self, model_meta):
model_meta['init_param'].update({
'oversampling': self.oversampling,
'n_iter': self.n_iter,
'squash': self.squash
})
model_meta['non_init_param'].update({
'corpus_path': self.corpus_path
})
return model_meta
def save_model(self, dir_path, save_context=False, **kwargs):
super().save_model(dir_path, **kwargs)
self._save_np_params(dir_path, param_list=[
'ev', 'image_mapping', 'mean_image', 'train_image_id'])
if save_context and self.context is not None:
np.savez(os.path.join(dir_path, 'context_param.npz'),
context=self.context._window_vec)
    @classmethod
    def load_model(cls, dir_path, load_context=False):
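        """Load a saved model from `dir_path`, optionally restoring the
        context vectors, and rebuild `word_dict` from the vocabulary order
        stored in `wv`."""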
model = super().load_model(dir_path)
if load_context:
try:
with np.load(os.path.join(dir_path, 'context_param.npz')) as data:
model.context = Context(data['context'], len(model.word_dict),
model.window_size)
except IOError:
print('Failed to load context_param.npz')
model.word_dict = DefaultOrderedDict(int, append_missing=False)
model.word_dict.update((word, i) for i, word in enumerate(model.wv.index2word))
return model
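
# Example usage (illustrative sketch): the corpus path, image matrix, and Wvi
# association matrix below are hypothetical placeholders; Wvi must be indexed
# by words (rows) and image ids (columns), as load_corpus expects.
#
#   model = MMEigenwords(vocab_size=10000, window_size=4, dim=300)
#   model.load_corpus('corpus.txt', images, Wvi, context=False, verbose=True)
#   model.map_image(new_images)   # new_images: a DataFrame indexed by image id
#   model.most_similar(pos_word=['dog'], target='image', topn=5)
#   model.save_model('mmeigenwords_model')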