# nnet_vi_sbm.py
import logging
import numpy as np
import os
from pprint import pprint
import scipy.sparse as sp
from scipy.special import logsumexp  # logsumexp lives in scipy.special (it was removed from scipy.misc)
import tensorflow as tf
import tensorflow.contrib.distributions as ds  # note: tf.contrib requires TensorFlow 1.x
tf.logging.set_verbosity(tf.logging.WARN)
from utils import BatchGenerator, get_pairs, log_gaussian_density
class NNetSBM:
def __init__(self):
"""
Base class for a model for binary symmetric link matrices with blockmodel structure.
"""
def construct_graph(self):
N = self.N # number of nodes
T = self.T # truncation level of the Dirichlet process
n_features = self.n_features
hidden_layer_sizes = self.hidden_layer_sizes
# List all placeholders here for easy reference.
self.row = tf.placeholder(dtype=tf.int32, shape=[None], name="row") # i.e. user 1
self.col = tf.placeholder(dtype=tf.int32, shape=[None], name="col") # i.e. user 2
self.val = tf.placeholder(dtype=tf.int32, shape=[None], name="val") # i.e. link
self.batch_scale = tf.placeholder(dtype=tf.float32, shape=[], name='batch_scale')
self.n_samples = tf.placeholder(dtype=tf.int32, shape=[], name='n_samples')
# The variational parameters for the local assignment variables, Z_i, are analytically updated and passed in
# as placeholders.
self.qZ = tf.placeholder(dtype=tf.float32, shape=[N, T], name='qZ')
self.sum_qZ_above = tf.placeholder(dtype=tf.float32, shape=[N, T - 1], name='sum_qZ_above')
# Create the features and nnet inputs.
init_scale = - 4.6 # initial scale of inv_softplus(sigma), for noise std devs sigma; -4.6 maps to 0.01 under softplus
var_std = 0.01
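        # all scale parameters below are parameterized as sigma = softplus(rho) with rho unconstrained, so the
        # standard deviations remain positive throughout optimization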
with tf.name_scope("latent_features"):
# the node-specific features are vectors drawn from a cluster specific distribution
# make the prior distributions for each cluster
self.pU_dist = ds.Normal(loc=tf.Variable(tf.random_normal([n_features], stddev=var_std), name='pU_mean'),
scale=tf.nn.softplus(tf.Variable(tf.ones([n_features]) * init_scale, name='pU_std_unc')),
name='pU_dist')
self.qU_dist = ds.Normal(loc=tf.Variable(tf.random_normal([T, n_features], stddev=var_std), name='qU_mean'),
scale=tf.nn.softplus(tf.Variable(tf.ones([T, n_features]) * init_scale, name='qU_std_unc')),
name='qU_dist')
            self.pUp_dist = ds.Normal(loc=tf.Variable(tf.random_normal([self.n_pairwise_features], stddev=var_std), name='pUp_mean'),
scale=tf.nn.softplus(tf.Variable(tf.ones([self.n_pairwise_features]) * init_scale, name='pUp_std_unc')),
name='pUp_dist')
self.qUp_dist = ds.Normal(loc=tf.Variable(tf.random_normal([T, self.n_pairwise_features], stddev=var_std), name='qUp_mean'),
scale=tf.nn.softplus(tf.Variable(tf.ones([T, self.n_pairwise_features]) * init_scale, name='qUp_std_unc')),
name='qUp_dist')
qU_samps = self.qU_dist.sample(self.n_samples) # (n_samples, T, n_features)
qUp_samps = self.qUp_dist.sample(self.n_samples) # (n_samples, T, d_pairwise)
# We must integrate w.r.t. qZ, which requires an eventual sum over all possible combinations of q(Z_i), q(Z_j),
# for each (i, j) in the minibatch. But this requires us to compute the likelihood for each possible Z_i, Z_j.
all_T_pairs = np.concatenate([np.tril_indices(T, k=0), # includes diagonal
np.triu_indices(T, k=1)
], axis=1) # (2, n_T_pairs)
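        # e.g. for T = 3, np.tril_indices(3, k=0) gives rows [0, 1, 1, 2, 2, 2] / cols [0, 0, 1, 0, 1, 2] and
        # np.triu_indices(3, k=1) gives rows [0, 0, 1] / cols [1, 2, 2], so all_T_pairs enumerates all T**2
        # ordered cluster pairs (k, l) exactly once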
row_features = tf.gather(qU_samps, indices=all_T_pairs[0], axis=1) # (n_samples, n_T_pairs, n_features)
col_features = tf.gather(qU_samps, indices=all_T_pairs[1], axis=1)
pairwise_features = tf.multiply(tf.gather(qUp_samps, indices=all_T_pairs[0], axis=1),
tf.gather(qUp_samps, indices=all_T_pairs[1], axis=1))
inputs_ = tf.concat([row_features,
col_features,
pairwise_features
], axis=2) # (n_samples, n_T_pairs, n_inputs)
# Create the neural network.
# all weights share a prior under p
self.pW_dist = ds.Normal(loc=tf.Variable(tf.random_normal([1]), name='pW_mean'),
scale=tf.nn.softplus(tf.Variable(init_scale, name='pW_std_unc')),
name='pW_dist')
        # the biases are also treated as random variables with a shared prior, which appears necessary for good
        # performance (as opposed to treating them as deterministic parameters of the ELBO)
self.pB_dist = ds.Normal(loc=tf.Variable(tf.random_normal([1]), name='pB_mean'),
scale=tf.nn.softplus(tf.Variable(init_scale, name='pB_std_unc')),
name='pB_dist')
# we will collect up the weight and bias tensors for reference later
self.nnet_tensors = [] # will be a list of (W, b) tuples
activation_fn = tf.nn.relu
n_inputs = tf.cast(inputs_.shape[-1], tf.int32)
for layer_i, layer_size in enumerate(hidden_layer_sizes): # if an empty list then this loop is not entered
with tf.name_scope("NNet_layer_%d" % layer_i):
# the p and q distributions for this layer's weights and biases
qW_dist = ds.Normal(loc=tf.Variable(tf.random_normal([n_inputs, layer_size], stddev=var_std), name='qW_layer_mean'),
scale=tf.nn.softplus(tf.Variable(tf.ones([n_inputs, layer_size]) * init_scale, name='qW_layer_std_unc')),
name='qW_layer_dist')
qB_dist = ds.Normal(loc=tf.Variable(tf.random_normal([layer_size], stddev=var_std), name='qB_mean'),
scale=tf.nn.softplus(tf.Variable(tf.ones([layer_size]) * init_scale, name='qB_std_unc')),
name='qB_dist')
W_samps = qW_dist.sample(self.n_samples) # (n_samples, prev_layer_size, layer_size)
B_samps = qB_dist.sample(self.n_samples) # (n_samples, layer_size)
# inputs_ will be (n_samples, n_T_pairs, prev_layer_size)
inputs_ = activation_fn(tf.matmul(inputs_, W_samps) + B_samps[:, None, :]) # (n_samples, n_T_pairs, layer_size)
n_inputs = layer_size
# store the distribution objects, but we won't need the samples
self.nnet_tensors.append((qW_dist, qB_dist))
# the output layer mapping to a single probability of a link
qW_dist = ds.Normal(loc=tf.Variable(tf.random_normal([n_inputs, 1], stddev=var_std), name='qW_out_mean'),
scale=tf.nn.softplus(tf.Variable(tf.ones([n_inputs, 1]) * init_scale, name='qW_out_std_unc')),
name='qW_out_dist')
qB_dist = ds.Normal(loc=tf.Variable(tf.random_normal([1], stddev=var_std), name='qB_out_mean'),
scale=tf.nn.softplus(tf.Variable(init_scale, name='qB_out_std_unc')),
name='qB_out_dist')
self.nnet_tensors.append((qW_dist, qB_dist))
W_samps = qW_dist.sample(self.n_samples) # (n_samples, prev_layer_size, 1)
        B_samps = qB_dist.sample(self.n_samples)  # (n_samples, 1); the trailing 1 comes from the single output unit's bias
# inputs_ is (n_samples, n_T_pairs, final_layer_size)
logits = tf.matmul(inputs_, W_samps) + B_samps[:, None, :] # (n_samples, n_T_pairs, 1)
# Compute the likelihood.
# cross entropy is negative Bernoulli log-likelihood
n_T_pairs = all_T_pairs.shape[1]
batch_size = tf.shape(self.val)[0]
val_ = tf.tile(self.val[None, None, :], [self.n_samples, n_T_pairs, 1]) # must be the same type and shape as logits
logits_ = tf.tile(logits, [1, 1, batch_size]) # (n_samples, n_T_pairs, batch_size)
log_bernoulli_likel = - tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.cast(val_, tf.float32),
logits=logits_) # (n_samples, n_T_pairs, batch_size)
self.log_bernoulli_likel = tf.transpose(log_bernoulli_likel, [0, 2, 1]) # (n_samples, batch_size, n_T_pairs)
# should check if the following indexing is better done outside TF graph
qZ_row = tf.gather(self.qZ, indices=self.row, axis=0) # (batch_size, T)
qZ_col = tf.gather(self.qZ, indices=self.col, axis=0)
qZ_pairs_row = tf.gather(qZ_row, indices=all_T_pairs[0], axis=1) # (batch_size, n_T_pairs)
qZ_pairs_col = tf.gather(qZ_col, indices=all_T_pairs[1], axis=1) # (batch_size, n_T_pairs)
loglikel = tf.einsum('jk,ijk->ij', qZ_pairs_row * qZ_pairs_col, self.log_bernoulli_likel) # (n_samples, batch_size); presumably faster than tile->matmul
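        # i.e. loglikel[s, m] = sum over cluster pairs (k, l) of q(Z_i = k) q(Z_j = l) * log p(X_ij | Z_i = k, Z_j = l, U)
        # for the m-th pair (i, j) in the minibatch and the s-th Monte Carlo sample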
# Compute the KL terms.
kl_divergence = tf.constant(0.0) # will broadcast up
# KL terms of U (analytically evaluated)
kl_divergence += tf.reduce_sum(self.qU_dist.kl_divergence(self.pU_dist))
kl_divergence += tf.reduce_sum(self.qUp_dist.kl_divergence(self.pUp_dist))
# nnet weights and biases
for qW_dist, qB_dist in self.nnet_tensors: # will have at least one entry
kl_divergence += tf.reduce_sum(qW_dist.kl_divergence(self.pW_dist)) # scalar
kl_divergence += tf.reduce_sum(qB_dist.kl_divergence(self.pB_dist)) # scalar
# KL terms for the DP sticks V; V can be analytically updated but we'll prefer to do gradient updates
        self.dp_conc = tf.nn.softplus(tf.Variable(3.5, name='dp_conc_unc'))  # softplus(3.5) is approx. 3.53, so the concentration starts near 3.5
self.qV_shp1 = tf.nn.softplus(tf.Variable(tf.ones(T - 1) * 0.54, name='qV_shp1')) # 0.54 maps to 1.0 under softplus
self.qV_shp2 = tf.nn.softplus(tf.Variable(tf.ones(T - 1) * 0.54, name='qV_shp2'))
digamma_sum = tf.digamma(self.qV_shp1 + self.qV_shp2)
self.E_log_V = tf.digamma(self.qV_shp1) - digamma_sum # (T-1,)
self.E_log_1mV = tf.digamma(self.qV_shp2) - digamma_sum
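        # for V_k ~ Beta(a_k, b_k) under q: E[log V_k] = psi(a_k) - psi(a_k + b_k) and
        # E[log(1 - V_k)] = psi(b_k) - psi(a_k + b_k), which is exactly what the two lines above compute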
        # KL terms for E[log p(Z|V)] with V integrated out (verified correct)
        # note the KL divergence is E_q[log q - log p]
kl_divergence += - tf.reduce_sum(self.sum_qZ_above * self.E_log_1mV + self.qZ[:, :-1] * self.E_log_V) \
+ tf.reduce_sum(self.qZ * tf.log(self.qZ))
# elbo terms for E[log p(V|c)]
kl_divergence += - tf.log(self.dp_conc) + (self.dp_conc - 1.0) * tf.reduce_sum(self.E_log_1mV) \
+ tf.reduce_sum( tf.lgamma(self.qV_shp1 + self.qV_shp2) - tf.lgamma(self.qV_shp1) - tf.lgamma(self.qV_shp2)
+ (self.qV_shp1 - 1.0) * self.E_log_V + (self.qV_shp2 - 1.0) * self.E_log_1mV
) # a scalar
# Assemble the ELBO.
with tf.name_scope("ELBO"):
self.data_loglikel = tf.reduce_sum(loglikel) / tf.cast(self.n_samples, tf.float32) # will be recorded
self.elbo = self.batch_scale * self.data_loglikel - kl_divergence
def train(self, N, row, col, T,
n_features, n_pairwise_features,
hidden_layer_sizes,
n_iterations, batch_size, n_samples, holdout_ratio_valid, learning_rate,
root_savedir,
log_interval=10, no_train_metric=False, seed=None, debug=False):
"""
Training routine.
        Note about the data: the (row, col) tuples of the ON (i.e., one-valued) entries of the graph are to be passed,
        and they should correspond to the upper triangle of the graph. (Recall we do not allow self-links.) Regardless,
        the code symmetrizes all passed entries (whether or not they lie in the upper triangle) and keeps only the
        upper triangle of the resulting matrix.
:param N: Number of nodes in the graph.
:param row: row indices corresponding to the ON entries (in the upper triangle).
:param col: col indices corresponding to the ON entries (in the upper triangle).
:param T: Truncation level for the DP.
        :param n_features: Dimension of the cluster-specific latent feature vectors.
        :param n_pairwise_features: Dimension of the pairwise latent feature vectors.
        :param hidden_layer_sizes: List of hidden layer sizes for the neural network; an empty list means no hidden layers.
        :param n_iterations: Number of training iterations (minibatch gradient updates).
        :param batch_size: HALF the minibatch size. In particular, we will always add the symmetric entry in the graph
            (i.e., the corresponding entry in the lower triangle) in the minibatch.
        :param n_samples: Number of Monte Carlo samples drawn from the variational distributions per gradient step.
        :param holdout_ratio_valid: Fraction of node pairs held out for validation; use None to disable the validation metric.
        :param learning_rate: Learning rate for the Adam optimizer.
        :param root_savedir: Directory where the model checkpoint and TensorBoard logs are written.
        :param log_interval: Log metrics and write TensorBoard summaries every `log_interval` iterations.
        :param no_train_metric: If True, skip computing the training-set ELBO and log-likelihood.
        :param seed: Random seed for the batch generator.
        :param debug: If True, run the slow consistency check in `update_qZ`.
        :return:
"""
self.N = N
self.T = T
self.n_features = n_features
self.n_pairwise_features = n_pairwise_features
self.hidden_layer_sizes = hidden_layer_sizes
if not os.path.exists(root_savedir):
os.makedirs(root_savedir)
# Data handling.
X_sp = sp.csr_matrix((np.ones(len(row)), (row, col)), shape=[N, N])
X_sp = X_sp + X_sp.transpose()
X_sp = sp.triu(X_sp, k=1)
row, col = X_sp.nonzero()
pairs = get_pairs(N, row, col)
pairs = pairs.astype(int)
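        # pairs should be an (n_pairs, 3) integer array whose rows are (i, j, X_ij) over the upper-triangular node
        # pairs (see utils.get_pairs); column 2 is the link indicator used as `val` below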
batch_generator = BatchGenerator(pairs, batch_size,
holdout_ratio=holdout_ratio_valid,
seed=seed)
# Construct the TF graph.
self.construct_graph()
all_vars = tf.trainable_variables()
print("\nTrainable variables:")
pprint([var_.name for var_ in all_vars])
train_op = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(-self.elbo)
### Create q(Z) variational parameters ###
# before this was uniformly initialized
# self.qZ_ = np.ones([N, T]) / T
self.qZ_ = np.random.dirichlet(np.ones(T), size=N) # (N, T)
# the following quantity needs to be passed to the TF graph and must be updated after every update to qZ
sum_qZ_above = np.zeros([N, T - 1])
for k in range(T - 1):
sum_qZ_above[:, k] = np.sum(self.qZ_[:, k + 1:], axis=1)
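        # i.e. sum_qZ_above[i, k] = sum_{l > k} q(Z_i = l); a vectorized equivalent would be
        # sum_qZ_above = np.cumsum(self.qZ_[:, ::-1], axis=1)[:, ::-1][:, 1:]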
# Training.
if not no_train_metric:
train_elbo = tf.placeholder(dtype=tf.float32, shape=[], name='train_elbo')
train_elbo_summary = tf.summary.scalar('train_elbo', train_elbo)
train_ll = tf.placeholder(dtype=tf.float32, shape=[], name='train_ll')
train_ll_summary = tf.summary.scalar('train_ll', train_ll)
if holdout_ratio_valid is not None:
test_ll = tf.placeholder(dtype=tf.float32, shape=[], name='test_ll')
test_ll_summary = tf.summary.scalar('test_ll', test_ll)
# Grab all scalar variables, to track in Tensorboard.
trainable_vars = tf.trainable_variables()
scalar_summaries = [tf.summary.scalar(tensor_.name, tensor_) for tensor_ in trainable_vars if len(tensor_.shape) == 0]
tensor_summaries = [tf.summary.histogram(tensor_.name, tensor_) for tensor_ in trainable_vars if len(tensor_.shape) > 0]
root_logdir = os.path.join(root_savedir, "tf_logs")
writer = tf.summary.FileWriter(root_logdir)
saver = tf.train.Saver()
init = tf.global_variables_initializer()
with tf.Session() as sess:
init.run()
if not no_train_metric:
# add symmetric entries from the lower triangle
train_data = batch_generator.train
row = np.concatenate([train_data[:, 0], train_data[:, 1]])
col = np.concatenate([train_data[:, 1], train_data[:, 0]])
val = np.concatenate([train_data[:, 2], train_data[:, 2]])
train_dict = {self.row: row, self.col: col, self.val: val, self.batch_scale: 1.0}
if holdout_ratio_valid is not None:
test_data = batch_generator.test
row = np.concatenate([test_data[:, 0], test_data[:, 1]])
col = np.concatenate([test_data[:, 1], test_data[:, 0]])
val = np.concatenate([test_data[:, 2], test_data[:, 2]])
test_dict = {self.row: row, self.col: col, self.val: val, self.batch_scale: 1.0}
logging.info("Starting training...")
for iteration in range(n_iterations):
batch = batch_generator.next_batch()
batch_dict = {self.row: np.concatenate([batch[:, 0], batch[:, 1]]),
self.col: np.concatenate([batch[:, 1], batch[:, 0]]),
self.val: np.concatenate([batch[:, 2], batch[:, 2]]),
self.qZ: self.qZ_,
self.n_samples: n_samples,
self.batch_scale: len(pairs) / len(batch),
self.sum_qZ_above: sum_qZ_above,
}
# make a gradient update
sess.run(train_op, feed_dict=batch_dict)
# analytically
self.update_qZ(sess=sess, batch=batch,
n_samples=n_samples, debug=debug)
                # unlike an earlier version that updated sum_qZ_above at the start of the iteration, it is recomputed
                # here, after the qZ update, so that the intermediate metrics logged below (and the final saved model)
                # see a value consistent with qZ. This computation could alternatively live inside construct_graph,
                # at the cost of being recomputed a few extra times, which would make the code cleaner.
for k in range(T - 1):
sum_qZ_above[:, k] = np.sum(self.qZ_[:, k + 1:], axis=1)
if iteration % log_interval == 0:
# Add scalar variables to Tensorboard.
for summ_str in sess.run(scalar_summaries):
writer.add_summary(summ_str, iteration)
# Add tensor variables to Tensorboard.
for summ_str in sess.run(tensor_summaries):
writer.add_summary(summ_str, iteration)
if not no_train_metric:
train_dict.update({self.qZ: self.qZ_, self.sum_qZ_above: sum_qZ_above, self.n_samples: 100})
train_ll_, train_elbo_ = sess.run([self.data_loglikel, self.elbo], feed_dict=train_dict)
train_ll_summary_str, train_elbo_summary_str = sess.run([train_ll_summary, train_elbo_summary],
feed_dict={train_ll: train_ll_,
train_elbo: train_elbo_})
writer.add_summary(train_ll_summary_str, iteration)
writer.add_summary(train_elbo_summary_str, iteration)
if holdout_ratio_valid is not None:
test_dict.update({self.qZ: self.qZ_, self.sum_qZ_above: sum_qZ_above, self.n_samples: 100})
test_ll_ = sess.run(self.data_loglikel, feed_dict=test_dict)
test_ll_summary_str = sess.run(test_ll_summary, feed_dict={test_ll: test_ll_})
writer.add_summary(test_ll_summary_str, iteration)
# Log training overview.
log_str = "%-4d" % iteration
if not no_train_metric:
log_str += " ELBO: %.4e Train ll: %.4e" % (train_elbo_, train_ll_)
if holdout_ratio_valid is not None:
log_str += " Valid ll: %.4e" % test_ll_
logging.info(log_str)
# save the model
saver.save(sess, os.path.join(root_savedir, "model.ckpt"))
# close the file writer
writer.close()
def update_qZ(self, sess, batch, n_samples, debug=False):
"""
Analytically update the variational parameters of the distribution on the DP indicators Z.
:param sess:
:param batch:
:param n_samples:
:param debug:
:return:
"""
N, T = self.qZ_.shape
# grab the values needed to update qZ
E_log_V, E_log_1mV = sess.run([self.E_log_V, self.E_log_1mV]) # nothing needs to be passed to feed_dict
# force symmetry in the subgraph so that updating the rows means updating all nodes in the subgraph
row = np.concatenate([batch[:, 0], batch[:, 1]])
col = np.concatenate([batch[:, 1], batch[:, 0]])
val = np.concatenate([batch[:, 2], batch[:, 2]])
# The terms corresponding to E[log p(X|Z,U)] are a bit tricky to compute; we update only those entries present in
# the minibatch (i.e., the subgraph) and the likelihood is also approximated only on this minibatch.
log_bernoulli_likel = sess.run(self.log_bernoulli_likel,
feed_dict={self.row: row, self.col: col, self.val: val, self.n_samples: n_samples
}) # (n_samples, 2 * batch_size, n_T_pairs)
all_T_pairs = np.concatenate([np.tril_indices(T, k=0), # includes diagonal
np.triu_indices(T, k=1)
], axis=1) # (2, n_T_pairs)
        # this computation is convoluted; consider restructuring the TF graph so that (k, \ell) can be
        # indexed separately instead
qZ_col = self.qZ_[:, all_T_pairs[1]] # (N, n_T_pairs)
qZ_col = qZ_col[col, :] # (2 * batch_size, n_T_pairs)
ll_ = qZ_col * log_bernoulli_likel # (n_samples, 2 * batch_size, n_T_pairs)
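        # the next two bincount calls are grouped sums: the per-edge terms are first summed into their row node i
        # (axis 1), and the (k, \ell) cluster-pair terms are then summed into their row cluster k (axis 2)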
ll_ = np.apply_along_axis(lambda x: np.bincount(row, weights=x, minlength=N),
axis=1,
arr=ll_) # (n_samples, N, n_T_pairs)
ll_ = np.apply_along_axis(lambda x: np.bincount(all_T_pairs[0], weights=x, minlength=T),
axis=2,
arr=ll_) # (n_samples, N, T)
ll_ = np.mean(ll_, axis=0) # (N, T)
if debug:
            # slow, explicit computation used to verify that the vectorized version above gives the same result
ll_test = np.zeros([N, T])
for i in range(N):
row_is_i = row == i # (2 * batch_size,)
if np.any(row_is_i):
log_p = log_bernoulli_likel[:, row_is_i, :] # (n_samples, |E_i|, n_T_pairs); all terms correspond to when i is the row index
qZ_col = self.qZ_[col[row_is_i], :] # (|E_i|, T); the q(Z_j=\ell) terms
for k in range(T):
row_is_k = all_T_pairs[0] == k
corresponding_col = all_T_pairs[1][row_is_k]
qZi_col_k = qZ_col[:, corresponding_col] # (|E_i|, T) -- b/c k will be connected to T different \ell
ll_test[i, k] = np.sum(log_p[:, :, row_is_k] * qZi_col_k) / n_samples
assert np.all(np.isclose(ll_, ll_test))
# only replace for those nodes in the minibatch
mbatch_row = np.unique(row)
ll_ = ll_[mbatch_row, :]
        # the terms corresponding to E[log p(Z|V)] are a bit easier
E_log_dp = np.zeros(T)
for k in range(T - 1):
E_log_dp[k] = E_log_V[k] + np.sum(E_log_1mV[:k])
# final stick
E_log_dp[-1] = np.sum(E_log_1mV)
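        # stick-breaking: E[log pi_k] = E[log V_k] + sum_{j < k} E[log(1 - V_j)]; the last component gets the
        # remaining mass, E[log pi_T] = sum_j E[log(1 - V_j)]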
# should ll_ here be scaled by batch_scale?
# ll_ = batch_scale * ll_ + E_log_dp
ll_ = ll_ + E_log_dp # (N, T)
# now normalize to find the probability vectors
Z_probs = np.exp(ll_ - logsumexp(ll_, axis=1)[:, None])
# truncate anything too small
to_truncate = Z_probs < 1e-8
if np.any(to_truncate):
Z_probs[to_truncate] = 1e-8
Z_probs = Z_probs / np.sum(Z_probs, axis=1)[:, None]
self.qZ_[mbatch_row, :] = Z_probs
if __name__ == '__main__':
from scipy.sparse import find
logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO)
N = 50
X = np.random.rand(N, N) < 0.4
row, col, _ = find(X)
root_savedir = "/Users/onnokampman/git_repos/variational-nnet-sbm/test_savedir"
T = 7
n_features = 8
d_pairwise = 16
hidden_layer_sizes = [12, 8]
m = NNetSBM()
m.train(N, row, col, T=T,
n_features=n_features, n_pairwise_features=d_pairwise, hidden_layer_sizes=hidden_layer_sizes,
n_iterations=100, batch_size=50, n_samples=6, holdout_ratio_valid=0.1, learning_rate=0.01,
root_savedir=root_savedir, no_train_metric=False, seed=None, debug=False)
# os.system('~/anaconda3/bin/tensorboard --logdir=' + root_savedir)
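    # after training, a point estimate of each node's block assignment can be read off the variational
    # responsibilities, e.g.  Z_hat = m.qZ_.argmax(axis=1)  gives the most probable cluster per node (shape (N,)).
    # note: the summary FileWriter logs to os.path.join(root_savedir, "tf_logs"), so TensorBoard can also be
    # pointed directly at that subdirectory.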