#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Author: Jayant Jain <jayantjain1992@gmail.com>
# Copyright (C) 2017 Radim Rehurek <me@radimrehurek.com>
# Licensed under the GNU LGPL v2.1 - https://www.gnu.org/licenses/old-licenses/lgpl-2.1.en.html
"""Python implementation of Poincaré Embeddings.
These embeddings are better at capturing latent hierarchical information than traditional Euclidean embeddings.
The method is described in detail in `Maximilian Nickel, Douwe Kiela -
"Poincaré Embeddings for Learning Hierarchical Representations" <https://arxiv.org/abs/1705.08039>`_.
The main use-case is to automatically learn hierarchical representations of nodes from a tree-like structure,
such as a Directed Acyclic Graph (DAG), using a transitive closure of the relations. Representations of nodes in a
symmetric graph can also be learned.
This module allows training Poincaré Embeddings from a training file containing the relations of a graph in a
csv-like format, or from a Python iterable of relations.
Examples
--------
Initialize and train a model from a list
.. sourcecode:: pycon
>>> from gensim.models.poincare import PoincareModel
>>> relations = [('kangaroo', 'marsupial'), ('kangaroo', 'mammal'), ('gib', 'cat')]
>>> model = PoincareModel(relations, negative=2)
>>> model.train(epochs=50)
Initialize and train a model from a file containing one relation per line
.. sourcecode:: pycon
>>> from gensim.models.poincare import PoincareModel, PoincareRelations
>>> from gensim.test.utils import datapath
>>> file_path = datapath('poincare_hypernyms.tsv')
>>> model = PoincareModel(PoincareRelations(file_path), negative=2)
>>> model.train(epochs=50)
"""
import csv
import logging
from numbers import Integral
import sys
import time
from collections import defaultdict, Counter
import numpy as np
from numpy import random as np_random, float32 as REAL
from scipy.stats import spearmanr
try:
from autograd import grad # Only required for optionally verifying gradients while training
from autograd import numpy as grad_np
AUTOGRAD_PRESENT = True
except ImportError:
AUTOGRAD_PRESENT = False
from gensim import utils, matutils
from gensim.models.keyedvectors import KeyedVectors
logger = logging.getLogger(__name__)
class PoincareModel(utils.SaveLoad):
"""Train, use and evaluate Poincare Embeddings.
The model can be stored/loaded via its :meth:`~gensim.models.poincare.PoincareModel.save`
and :meth:`~gensim.models.poincare.PoincareModel.load` methods, or stored/loaded in the word2vec format
via `model.kv.save_word2vec_format` and :meth:`~gensim.models.poincare.PoincareKeyedVectors.load_word2vec_format`.
Notes
-----
    Training cannot be resumed from a model loaded via `load_word2vec_format`; if you wish to train further,
    use the :meth:`~gensim.models.poincare.PoincareModel.save` and :meth:`~gensim.models.poincare.PoincareModel.load`
    methods instead.
    An important attribute (that provides a lot of additional functionality when accessed directly) is the
    keyed vectors:
self.kv : :class:`~gensim.models.poincare.PoincareKeyedVectors`
This object essentially contains the mapping between nodes and embeddings, as well the vocabulary of the model
(set of unique nodes seen by the model). After training, it can be used to perform operations on the vectors
such as vector lookup, distance and similarity calculations etc.
See the documentation of its class for usage examples.
"""
def __init__(self, train_data, size=50, alpha=0.1, negative=10, workers=1, epsilon=1e-5, regularization_coeff=1.0,
burn_in=10, burn_in_alpha=0.01, init_range=(-0.001, 0.001), dtype=np.float64, seed=0):
"""Initialize and train a Poincare embedding model from an iterable of relations.
Parameters
----------
train_data : {iterable of (str, str), :class:`gensim.models.poincare.PoincareRelations`}
Iterable of relations, e.g. a list of tuples, or a :class:`gensim.models.poincare.PoincareRelations`
instance streaming from a file. Note that the relations are treated as ordered pairs,
i.e. a relation (a, b) does not imply the opposite relation (b, a). In case the relations are symmetric,
the data should contain both relations (a, b) and (b, a).
size : int, optional
Number of dimensions of the trained model.
alpha : float, optional
Learning rate for training.
negative : int, optional
Number of negative samples to use.
workers : int, optional
Number of threads to use for training the model.
epsilon : float, optional
Constant used for clipping embeddings below a norm of one.
regularization_coeff : float, optional
Coefficient used for l2-regularization while training (0 effectively disables regularization).
burn_in : int, optional
Number of epochs to use for burn-in initialization (0 means no burn-in).
burn_in_alpha : float, optional
Learning rate for burn-in initialization, ignored if `burn_in` is 0.
init_range : 2-tuple (float, float)
Range within which the vectors are randomly initialized.
dtype : numpy.dtype
The numpy dtype to use for the vectors in the model (numpy.float64, numpy.float32 etc).
Using lower precision floats may be useful in increasing training speed and reducing memory usage.
seed : int, optional
            Seed for the random number generator, to ensure reproducibility.
Examples
--------
Initialize a model from a list:
.. sourcecode:: pycon
>>> from gensim.models.poincare import PoincareModel
>>> relations = [('kangaroo', 'marsupial'), ('kangaroo', 'mammal'), ('gib', 'cat')]
>>> model = PoincareModel(relations, negative=2)
Initialize a model from a file containing one relation per line:
.. sourcecode:: pycon
>>> from gensim.models.poincare import PoincareModel, PoincareRelations
>>> from gensim.test.utils import datapath
>>> file_path = datapath('poincare_hypernyms.tsv')
>>> model = PoincareModel(PoincareRelations(file_path), negative=2)
See :class:`~gensim.models.poincare.PoincareRelations` for more options.
"""
self.train_data = train_data
self.kv = PoincareKeyedVectors(size, 0)
self.all_relations = []
self.node_relations = defaultdict(set)
self._negatives_buffer = NegativesBuffer([])
self._negatives_buffer_size = 2000
self.size = size
self.train_alpha = alpha # Learning rate for training
self.burn_in_alpha = burn_in_alpha # Learning rate for burn-in
self.alpha = alpha # Current learning rate
self.negative = negative
self.workers = workers
self.epsilon = epsilon
self.regularization_coeff = regularization_coeff
self.burn_in = burn_in
self._burn_in_done = False
self.dtype = dtype
self.seed = seed
self._np_random = np_random.RandomState(seed)
self.init_range = init_range
self._loss_grad = None
self.build_vocab(train_data)
def build_vocab(self, relations, update=False):
"""Build the model's vocabulary from known relations.
Parameters
----------
relations : {iterable of (str, str), :class:`gensim.models.poincare.PoincareRelations`}
Iterable of relations, e.g. a list of tuples, or a :class:`gensim.models.poincare.PoincareRelations`
instance streaming from a file. Note that the relations are treated as ordered pairs,
i.e. a relation (a, b) does not imply the opposite relation (b, a). In case the relations are symmetric,
the data should contain both relations (a, b) and (b, a).
update : bool, optional
            If True, only the new nodes' embeddings are initialized.
            Use this when the model already has an existing vocabulary and you want to update it.
            If False, all nodes' embeddings are initialized.
Use this when you're creating a new vocabulary from scratch.
Examples
--------
Train a model and update vocab for online training:
.. sourcecode:: pycon
>>> from gensim.models.poincare import PoincareModel
>>>
>>> # train a new model from initial data
>>> initial_relations = [('kangaroo', 'marsupial'), ('kangaroo', 'mammal')]
>>> model = PoincareModel(initial_relations, negative=1)
>>> model.train(epochs=50)
>>>
>>> # online training: update the vocabulary and continue training
>>> online_relations = [('striped_skunk', 'mammal')]
>>> model.build_vocab(online_relations, update=True)
>>> model.train(epochs=50)
"""
old_index_to_key_len = len(self.kv.index_to_key)
logger.info("loading relations from train data..")
for relation in relations:
if len(relation) != 2:
raise ValueError('Relation pair "%s" should have exactly two items' % repr(relation))
for item in relation:
if item in self.kv.key_to_index:
self.kv.set_vecattr(item, 'count', self.kv.get_vecattr(item, 'count') + 1)
else:
self.kv.key_to_index[item] = len(self.kv.index_to_key)
self.kv.index_to_key.append(item)
self.kv.set_vecattr(item, 'count', 1)
node_1, node_2 = relation
node_1_index, node_2_index = self.kv.key_to_index[node_1], self.kv.key_to_index[node_2]
self.node_relations[node_1_index].add(node_2_index)
relation = (node_1_index, node_2_index)
self.all_relations.append(relation)
logger.info("loaded %d relations from train data, %d nodes", len(self.all_relations), len(self.kv))
self.indices_set = set(range(len(self.kv.index_to_key))) # Set of all node indices
self.indices_array = np.fromiter(range(len(self.kv.index_to_key)), dtype=int) # Numpy array of all node indices
self._init_node_probabilities()
if not update:
self._init_embeddings()
else:
self._update_embeddings(old_index_to_key_len)
def _init_embeddings(self):
"""Randomly initialize vectors for the items in the vocab."""
shape = (len(self.kv.index_to_key), self.size)
self.kv.vectors = self._np_random.uniform(self.init_range[0], self.init_range[1], shape).astype(self.dtype)
def _update_embeddings(self, old_index_to_key_len):
"""Randomly initialize vectors for the items in the additional vocab."""
shape = (len(self.kv.index_to_key) - old_index_to_key_len, self.size)
v = self._np_random.uniform(self.init_range[0], self.init_range[1], shape).astype(self.dtype)
self.kv.vectors = np.concatenate([self.kv.vectors, v])
def _init_node_probabilities(self):
"""Initialize a-priori probabilities."""
counts = self.kv.expandos['count'].astype(np.float64)
self._node_counts_cumsum = np.cumsum(counts)
self._node_probabilities = counts / counts.sum()
def _get_candidate_negatives(self):
"""Get candidate negatives of size `self.negative` from the negative examples buffer.
Returns
-------
numpy.array
Array of shape (`self.negative`,) containing indices of negative nodes.
"""
if self._negatives_buffer.num_items() < self.negative:
# cumsum table of counts used instead of the standard approach of a probability cumsum table
# this is to avoid floating point errors that result when the number of nodes is very high
            # for reference: https://github.com/RaRe-Technologies/gensim/issues/1917
max_cumsum_value = self._node_counts_cumsum[-1]
uniform_numbers = self._np_random.randint(1, max_cumsum_value + 1, self._negatives_buffer_size)
cumsum_table_indices = np.searchsorted(self._node_counts_cumsum, uniform_numbers)
self._negatives_buffer = NegativesBuffer(cumsum_table_indices)
return self._negatives_buffer.get_items(self.negative)
def _sample_negatives(self, node_index):
"""Get a sample of negatives for the given node.
Parameters
----------
node_index : int
Index of the positive node for which negative samples are to be returned.
Returns
-------
numpy.array
Array of shape (self.negative,) containing indices of negative nodes for the given node index.
"""
node_relations = self.node_relations[node_index]
num_remaining_nodes = len(self.kv) - len(node_relations)
if num_remaining_nodes < self.negative:
raise ValueError(
                'Cannot sample %d negative nodes from a set of %d remaining candidate nodes for %s' %
(self.negative, num_remaining_nodes, self.kv.index_to_key[node_index])
)
positive_fraction = float(len(node_relations)) / len(self.kv)
if positive_fraction < 0.01:
# If number of positive relations is a small fraction of total nodes
# re-sample till no positively connected nodes are chosen
indices = self._get_candidate_negatives()
unique_indices = set(indices)
times_sampled = 1
while (len(indices) != len(unique_indices)) or (unique_indices & node_relations):
times_sampled += 1
indices = self._get_candidate_negatives()
unique_indices = set(indices)
if times_sampled > 1:
logger.debug('sampled %d times, positive fraction %.5f', times_sampled, positive_fraction)
else:
# If number of positive relations is a significant fraction of total nodes
# subtract positively connected nodes from set of choices and sample from the remaining
valid_negatives = np.array(list(self.indices_set - node_relations))
probs = self._node_probabilities[valid_negatives]
probs /= probs.sum()
indices = self._np_random.choice(valid_negatives, size=self.negative, p=probs, replace=False)
return list(indices)
@staticmethod
def _loss_fn(matrix, regularization_coeff=1.0):
"""Computes loss value.
Parameters
----------
matrix : numpy.array
Array containing vectors for u, v and negative samples, of shape (2 + negative_size, dim).
regularization_coeff : float, optional
Coefficient to use for l2-regularization
Returns
-------
float
Computed loss value.
Warnings
--------
Only used for autograd gradients, since autograd requires a specific function signature.
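        Notes
        -----
        Computes ``-log(exp(-d(u, v)) / sum(exp(-d(u, v'))))``, where ``d`` is the Poincare
        distance, ``v`` (the first row of `matrix` after `vector_u`) is the positive example
        and ``v'`` ranges over ``v`` and the negative samples, plus the l2-regularization
        term ``regularization_coeff * norm(v) ** 2``.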
"""
vector_u = matrix[0]
vectors_v = matrix[1:]
euclidean_dists = grad_np.linalg.norm(vector_u - vectors_v, axis=1)
norm = grad_np.linalg.norm(vector_u)
all_norms = grad_np.linalg.norm(vectors_v, axis=1)
poincare_dists = grad_np.arccosh(
1 + 2 * (
(euclidean_dists ** 2) / ((1 - norm ** 2) * (1 - all_norms ** 2))
)
)
exp_negative_distances = grad_np.exp(-poincare_dists)
regularization_term = regularization_coeff * grad_np.linalg.norm(vectors_v[0]) ** 2
return -grad_np.log(exp_negative_distances[0] / (exp_negative_distances.sum())) + regularization_term
@staticmethod
def _clip_vectors(vectors, epsilon):
"""Clip vectors to have a norm of less than one.
Parameters
----------
vectors : numpy.array
Can be 1-D, or 2-D (in which case the norm for each row is checked).
        epsilon : float
            Parameter for numerical stability. If the norm of a vector is greater than or equal to
            `1 - epsilon`, the vector is rescaled to lie inside the unit ball and each dimension is
            additionally shifted towards zero by `epsilon`.
Returns
-------
numpy.array
Array with norms clipped below 1.
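        Examples
        --------
        A minimal sanity check, using a hypothetical out-of-ball vector:

        .. sourcecode:: pycon

            >>> import numpy as np
            >>> from gensim.models.poincare import PoincareModel
            >>> vector = np.array([3.0, 4.0])  # norm 5.0, well outside the unit ball
            >>> clipped = PoincareModel._clip_vectors(vector, epsilon=1e-5)
            >>> bool(np.linalg.norm(clipped) < 1.0)
            True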
"""
one_d = len(vectors.shape) == 1
threshold = 1 - epsilon
if one_d:
norm = np.linalg.norm(vectors)
if norm < threshold:
return vectors
else:
return vectors / norm - (np.sign(vectors) * epsilon)
else:
norms = np.linalg.norm(vectors, axis=1)
if (norms < threshold).all():
return vectors
else:
vectors[norms >= threshold] *= (threshold / norms[norms >= threshold])[:, np.newaxis]
vectors[norms >= threshold] -= np.sign(vectors[norms >= threshold]) * epsilon
return vectors
def save(self, *args, **kwargs):
"""Save complete model to disk, inherited from :class:`~gensim.utils.SaveLoad`.
See also
--------
:meth:`~gensim.models.poincare.PoincareModel.load`
Parameters
----------
*args
Positional arguments passed to :meth:`~gensim.utils.SaveLoad.save`.
**kwargs
Keyword arguments passed to :meth:`~gensim.utils.SaveLoad.save`.
"""
self._loss_grad = None # Can't pickle autograd fn to disk
attrs_to_ignore = ['_node_probabilities', '_node_counts_cumsum']
kwargs['ignore'] = set(list(kwargs.get('ignore', [])) + attrs_to_ignore)
super(PoincareModel, self).save(*args, **kwargs)
@classmethod
def load(cls, *args, **kwargs):
"""Load model from disk, inherited from :class:`~gensim.utils.SaveLoad`.
See also
--------
:meth:`~gensim.models.poincare.PoincareModel.save`
Parameters
----------
*args
Positional arguments passed to :meth:`~gensim.utils.SaveLoad.load`.
**kwargs
Keyword arguments passed to :meth:`~gensim.utils.SaveLoad.load`.
Returns
-------
:class:`~gensim.models.poincare.PoincareModel`
The loaded model.
"""
model = super(PoincareModel, cls).load(*args, **kwargs)
model._init_node_probabilities()
return model
def _prepare_training_batch(self, relations, all_negatives, check_gradients=False):
"""Create a training batch and compute gradients and loss for the batch.
Parameters
----------
relations : list of tuples
List of tuples of positive examples of the form (node_1_index, node_2_index).
all_negatives : list of lists
List of lists of negative samples for each node_1 in the positive examples.
check_gradients : bool, optional
Whether to compare the computed gradients to autograd gradients for this batch.
Returns
-------
:class:`~gensim.models.poincare.PoincareBatch`
Node indices, computed gradients and loss for the batch.
"""
batch_size = len(relations)
indices_u, indices_v = [], []
for relation, negatives in zip(relations, all_negatives):
u, v = relation
indices_u.append(u)
indices_v.append(v)
indices_v.extend(negatives)
vectors_u = self.kv.vectors[indices_u]
vectors_v = self.kv.vectors[indices_v].reshape((batch_size, 1 + self.negative, self.size))
vectors_v = vectors_v.swapaxes(0, 1).swapaxes(1, 2)
batch = PoincareBatch(vectors_u, vectors_v, indices_u, indices_v, self.regularization_coeff)
batch.compute_all()
if check_gradients:
self._check_gradients(relations, all_negatives, batch)
return batch
def _check_gradients(self, relations, all_negatives, batch, tol=1e-8):
"""Compare computed gradients for batch to autograd gradients.
Parameters
----------
relations : list of tuples
List of tuples of positive examples of the form (node_1_index, node_2_index).
all_negatives : list of lists
List of lists of negative samples for each node_1 in the positive examples.
batch : :class:`~gensim.models.poincare.PoincareBatch`
Batch for which computed gradients are to be checked.
tol : float, optional
The maximum error between our computed gradients and the reference ones from autograd.
"""
if not AUTOGRAD_PRESENT:
logger.warning('autograd could not be imported, cannot do gradient checking')
logger.warning('please install autograd to enable gradient checking')
return
if self._loss_grad is None:
self._loss_grad = grad(PoincareModel._loss_fn)
max_diff = 0.0
for i, (relation, negatives) in enumerate(zip(relations, all_negatives)):
u, v = relation
auto_gradients = self._loss_grad(
np.vstack((self.kv.vectors[u], self.kv.vectors[[v] + negatives])), self.regularization_coeff)
computed_gradients = np.vstack((batch.gradients_u[:, i], batch.gradients_v[:, :, i]))
diff = np.abs(auto_gradients - computed_gradients).max()
if diff > max_diff:
max_diff = diff
logger.info('max difference between computed gradients and autograd gradients: %.10f', max_diff)
assert max_diff < tol, (
'Max difference between computed gradients and autograd gradients %.10f, '
'greater than tolerance %.10f' % (max_diff, tol))
def _sample_negatives_batch(self, nodes):
"""Get negative examples for each node.
Parameters
----------
nodes : iterable of int
Iterable of node indices for which negative samples are to be returned.
Returns
-------
list of lists
Each inner list is a list of negative samples for a single node in the input list.
"""
all_indices = [self._sample_negatives(node) for node in nodes]
return all_indices
def _train_on_batch(self, relations, check_gradients=False):
"""Perform training for a single training batch.
Parameters
----------
relations : list of tuples of (int, int)
List of tuples of positive examples of the form (node_1_index, node_2_index).
check_gradients : bool, optional
Whether to compare the computed gradients to autograd gradients for this batch.
Returns
-------
:class:`~gensim.models.poincare.PoincareBatch`
The batch that was just trained on, contains computed loss for the batch.
"""
all_negatives = self._sample_negatives_batch(relation[0] for relation in relations)
batch = self._prepare_training_batch(relations, all_negatives, check_gradients)
self._update_vectors_batch(batch)
return batch
@staticmethod
def _handle_duplicates(vector_updates, node_indices):
"""Handle occurrences of multiple updates to the same node in a batch of vector updates.
Parameters
----------
vector_updates : numpy.array
Array with each row containing updates to be performed on a certain node.
node_indices : list of int
Node indices on which the above updates are to be performed on.
Notes
-----
Mutates the `vector_updates` array.
Required because vectors[[2, 1, 2]] += np.array([-0.5, 1.0, 0.5]) performs only the last update
on the row at index 2.
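        Examples
        --------
        A minimal sketch of the pitfall and the fix (hypothetical update values):

        .. sourcecode:: pycon

            >>> import numpy as np
            >>> from gensim.models.poincare import PoincareModel
            >>> updates = np.array([[-0.5], [1.0], [0.5]])
            >>> PoincareModel._handle_duplicates(updates, [2, 1, 2])
            >>> updates.ravel().tolist()  # both updates to node 2 merged into its last row
            [0.0, 1.0, 0.0]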
"""
counts = Counter(node_indices)
node_dict = defaultdict(list)
for i, node_index in enumerate(node_indices):
node_dict[node_index].append(i)
for node_index, count in counts.items():
if count == 1:
continue
positions = node_dict[node_index]
# Move all updates to the same node to the last such update, zeroing all the others
vector_updates[positions[-1]] = vector_updates[positions].sum(axis=0)
vector_updates[positions[:-1]] = 0
def _update_vectors_batch(self, batch):
"""Update vectors for nodes in the given batch.
Parameters
----------
batch : :class:`~gensim.models.poincare.PoincareBatch`
Batch containing computed gradients and node indices of the batch for which updates are to be done.
"""
grad_u, grad_v = batch.gradients_u, batch.gradients_v
indices_u, indices_v = batch.indices_u, batch.indices_v
batch_size = len(indices_u)
u_updates = (self.alpha * (batch.alpha ** 2) / 4 * grad_u).T
self._handle_duplicates(u_updates, indices_u)
self.kv.vectors[indices_u] -= u_updates
self.kv.vectors[indices_u] = self._clip_vectors(self.kv.vectors[indices_u], self.epsilon)
v_updates = self.alpha * (batch.beta ** 2)[:, np.newaxis] / 4 * grad_v
v_updates = v_updates.swapaxes(1, 2).swapaxes(0, 1)
v_updates = v_updates.reshape(((1 + self.negative) * batch_size, self.size))
self._handle_duplicates(v_updates, indices_v)
self.kv.vectors[indices_v] -= v_updates
self.kv.vectors[indices_v] = self._clip_vectors(self.kv.vectors[indices_v], self.epsilon)
def train(self, epochs, batch_size=10, print_every=1000, check_gradients_every=None):
"""Train Poincare embeddings using loaded data and model parameters.
Parameters
----------
epochs : int
Number of iterations (epochs) over the corpus.
batch_size : int, optional
Number of examples to train on in a single batch.
print_every : int, optional
Prints progress and average loss after every `print_every` batches.
check_gradients_every : int or None, optional
Compares computed gradients and autograd gradients after every `check_gradients_every` batches.
Useful for debugging, doesn't compare by default.
Examples
--------
.. sourcecode:: pycon
>>> from gensim.models.poincare import PoincareModel
>>> relations = [('kangaroo', 'marsupial'), ('kangaroo', 'mammal'), ('gib', 'cat')]
>>> model = PoincareModel(relations, negative=2)
>>> model.train(epochs=50)
"""
if self.workers > 1:
raise NotImplementedError("Multi-threaded version not implemented yet")
# Some divide-by-zero results are handled explicitly
old_settings = np.seterr(divide='ignore', invalid='ignore')
logger.info(
"training model of size %d with %d workers on %d relations for %d epochs and %d burn-in epochs, "
"using lr=%.5f burn-in lr=%.5f negative=%d",
self.size, self.workers, len(self.all_relations), epochs, self.burn_in,
self.alpha, self.burn_in_alpha, self.negative
)
if self.burn_in > 0 and not self._burn_in_done:
logger.info("starting burn-in (%d epochs)----------------------------------------", self.burn_in)
self.alpha = self.burn_in_alpha
self._train_batchwise(
epochs=self.burn_in, batch_size=batch_size, print_every=print_every,
check_gradients_every=check_gradients_every)
self._burn_in_done = True
logger.info("burn-in finished")
self.alpha = self.train_alpha
logger.info("starting training (%d epochs)----------------------------------------", epochs)
self._train_batchwise(
epochs=epochs, batch_size=batch_size, print_every=print_every,
check_gradients_every=check_gradients_every)
logger.info("training finished")
np.seterr(**old_settings)
def _train_batchwise(self, epochs, batch_size=10, print_every=1000, check_gradients_every=None):
"""Train Poincare embeddings using specified parameters.
Parameters
----------
epochs : int
Number of iterations (epochs) over the corpus.
batch_size : int, optional
Number of examples to train on in a single batch.
print_every : int, optional
Prints progress and average loss after every `print_every` batches.
check_gradients_every : int or None, optional
Compares computed gradients and autograd gradients after every `check_gradients_every` batches.
Useful for debugging, doesn't compare by default.
"""
if self.workers > 1:
raise NotImplementedError("Multi-threaded version not implemented yet")
for epoch in range(1, epochs + 1):
indices = list(range(len(self.all_relations)))
self._np_random.shuffle(indices)
avg_loss = 0.0
last_time = time.time()
for batch_num, i in enumerate(range(0, len(indices), batch_size), start=1):
should_print = not (batch_num % print_every)
check_gradients = bool(check_gradients_every) and (batch_num % check_gradients_every) == 0
batch_indices = indices[i:i + batch_size]
relations = [self.all_relations[idx] for idx in batch_indices]
result = self._train_on_batch(relations, check_gradients=check_gradients)
avg_loss += result.loss
if should_print:
avg_loss /= print_every
time_taken = time.time() - last_time
speed = print_every * batch_size / time_taken
                    logger.info(
                        'training on epoch %d, examples #%d-#%d, loss: %.2f',
                        epoch, i, i + batch_size, avg_loss)
                    logger.info(
                        'time taken for %d examples: %.2f s, %.2f examples / s',
                        print_every * batch_size, time_taken, speed)
last_time = time.time()
avg_loss = 0.0
class PoincareBatch:
"""Compute Poincare distances, gradients and loss for a training batch.
Store intermediate state to avoid recomputing multiple times.
"""
def __init__(self, vectors_u, vectors_v, indices_u, indices_v, regularization_coeff=1.0):
"""
Initialize instance with sets of vectors for which distances are to be computed.
Parameters
----------
vectors_u : numpy.array
Vectors of all nodes `u` in the batch. Expected shape (batch_size, dim).
vectors_v : numpy.array
Vectors of all positively related nodes `v` and negatively sampled nodes `v'`,
for each node `u` in the batch. Expected shape (1 + neg_size, dim, batch_size).
indices_u : list of int
List of node indices for each of the vectors in `vectors_u`.
indices_v : list of lists of int
Nested list of lists, each of which is a list of node indices
for each of the vectors in `vectors_v` for a specific node `u`.
regularization_coeff : float, optional
Coefficient to use for l2-regularization
"""
self.vectors_u = vectors_u.T[np.newaxis, :, :] # (1, dim, batch_size)
self.vectors_v = vectors_v # (1 + neg_size, dim, batch_size)
self.indices_u = indices_u
self.indices_v = indices_v
self.regularization_coeff = regularization_coeff
self.poincare_dists = None
self.euclidean_dists = None
self.norms_u = None
self.norms_v = None
self.alpha = None
self.beta = None
self.gamma = None
self.gradients_u = None
self.distance_gradients_u = None
self.gradients_v = None
self.distance_gradients_v = None
self.loss = None
self._distances_computed = False
self._gradients_computed = False
self._distance_gradients_computed = False
self._loss_computed = False
def compute_all(self):
"""Convenience method to perform all computations."""
self.compute_distances()
self.compute_distance_gradients()
self.compute_gradients()
self.compute_loss()
def compute_distances(self):
"""Compute and store norms, euclidean distances and poincare distances between input vectors."""
if self._distances_computed:
return
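        # Poincare distance between u and v: d(u, v) = arccosh(gamma), where
        # gamma = 1 + 2 * ||u - v||^2 / ((1 - ||u||^2) * (1 - ||v||^2)).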
euclidean_dists = np.linalg.norm(self.vectors_u - self.vectors_v, axis=1) # (1 + neg_size, batch_size)
norms_u = np.linalg.norm(self.vectors_u, axis=1) # (1, batch_size)
norms_v = np.linalg.norm(self.vectors_v, axis=1) # (1 + neg_size, batch_size)
alpha = 1 - norms_u ** 2 # (1, batch_size)
beta = 1 - norms_v ** 2 # (1 + neg_size, batch_size)
gamma = 1 + 2 * (
(euclidean_dists ** 2) / (alpha * beta)
) # (1 + neg_size, batch_size)
poincare_dists = np.arccosh(gamma) # (1 + neg_size, batch_size)
exp_negative_distances = np.exp(-poincare_dists) # (1 + neg_size, batch_size)
Z = exp_negative_distances.sum(axis=0) # (batch_size)
self.euclidean_dists = euclidean_dists
self.poincare_dists = poincare_dists
self.exp_negative_distances = exp_negative_distances
self.Z = Z
self.norms_u = norms_u
self.norms_v = norms_v
self.alpha = alpha
self.beta = beta
self.gamma = gamma
self._distances_computed = True
def compute_gradients(self):
"""Compute and store gradients of loss function for all input vectors."""
if self._gradients_computed:
return
self.compute_distances()
self.compute_distance_gradients()
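        # The per-example loss is -log(exp(-d(u, v)) / Z), with Z = sum over all v' of
        # exp(-d(u, v')), so each v' contributes -(exp(-d(u, v')) / Z) * dd/dv', and the
        # positive example v additionally contributes dd/dv plus the gradient of the
        # l2-regularization term.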
# (1 + neg_size, dim, batch_size)
gradients_v = -self.exp_negative_distances[:, np.newaxis, :] * self.distance_gradients_v
gradients_v /= self.Z # (1 + neg_size, dim, batch_size)
gradients_v[0] += self.distance_gradients_v[0]
gradients_v[0] += self.regularization_coeff * 2 * self.vectors_v[0]
# (1 + neg_size, dim, batch_size)
gradients_u = -self.exp_negative_distances[:, np.newaxis, :] * self.distance_gradients_u
gradients_u /= self.Z # (1 + neg_size, dim, batch_size)
gradients_u = gradients_u.sum(axis=0) # (dim, batch_size)
gradients_u += self.distance_gradients_u[0]
assert not np.isnan(gradients_u).any()
assert not np.isnan(gradients_v).any()
self.gradients_u = gradients_u
self.gradients_v = gradients_v
self._gradients_computed = True
def compute_distance_gradients(self):
"""Compute and store partial derivatives of poincare distance d(u, v) w.r.t all u and all v."""
if self._distance_gradients_computed:
return
self.compute_distances()
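        # Gradient of the Poincare distance w.r.t. u (as derived in Nickel & Kiela):
        #   dd(u, v)/du = 4 / (alpha * beta * sqrt(gamma^2 - 1))
        #                 * (((||u - v||^2 + alpha) / alpha) * u - v)
        # with alpha = 1 - ||u||^2, beta = 1 - ||v||^2; symmetrically for dd/dv.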
euclidean_dists_squared = self.euclidean_dists ** 2 # (1 + neg_size, batch_size)
# (1 + neg_size, 1, batch_size)
c_ = (4 / (self.alpha * self.beta * np.sqrt(self.gamma ** 2 - 1)))[:, np.newaxis, :]
# (1 + neg_size, 1, batch_size)
u_coeffs = ((euclidean_dists_squared + self.alpha) / self.alpha)[:, np.newaxis, :]
distance_gradients_u = u_coeffs * self.vectors_u - self.vectors_v # (1 + neg_size, dim, batch_size)
distance_gradients_u *= c_ # (1 + neg_size, dim, batch_size)
nan_gradients = self.gamma == 1 # (1 + neg_size, batch_size)
if nan_gradients.any():
distance_gradients_u.swapaxes(1, 2)[nan_gradients] = 0
self.distance_gradients_u = distance_gradients_u
# (1 + neg_size, 1, batch_size)
v_coeffs = ((euclidean_dists_squared + self.beta) / self.beta)[:, np.newaxis, :]
distance_gradients_v = v_coeffs * self.vectors_v - self.vectors_u # (1 + neg_size, dim, batch_size)
distance_gradients_v *= c_ # (1 + neg_size, dim, batch_size)
if nan_gradients.any():
distance_gradients_v.swapaxes(1, 2)[nan_gradients] = 0
self.distance_gradients_v = distance_gradients_v
self._distance_gradients_computed = True
def compute_loss(self):
"""Compute and store loss value for the given batch of examples."""
if self._loss_computed:
return
self.compute_distances()
self.loss = -np.log(self.exp_negative_distances[0] / self.Z).sum() # scalar
self._loss_computed = True
class PoincareKeyedVectors(KeyedVectors):
"""Vectors and vocab for the :class:`~gensim.models.poincare.PoincareModel` training class.
Used to perform operations on the vectors such as vector lookup, distance calculations etc.
(May be used to save/load final vectors in the plain word2vec format, via the inherited
methods save_word2vec_format() and load_word2vec_format().)
Examples
--------
.. sourcecode:: pycon
        >>> from gensim.models.poincare import PoincareModel, PoincareRelations
        >>> from gensim.test.utils import datapath
>>>
>>> # Read the sample relations file and train the model
>>> relations = PoincareRelations(file_path=datapath('poincare_hypernyms_large.tsv'))
>>> model = PoincareModel(train_data=relations)
>>> model.train(epochs=50)
>>>
>>> # Query the trained model.
>>> wv = model.kv.get_vector('kangaroo.n.01')
"""
def __init__(self, vector_size, vector_count, dtype=REAL):
super(PoincareKeyedVectors, self).__init__(vector_size, vector_count, dtype=dtype)
self.max_distance = 0
def _load_specials(self, *args, **kwargs):
super(PoincareKeyedVectors, self)._load_specials(*args, **kwargs)
# fixup rename of syn0
if not hasattr(self, 'vectors'):
self.vectors = self.__dict__.pop('syn0')
@staticmethod
def vector_distance(vector_1, vector_2):
"""Compute poincare distance between two input vectors. Convenience method over `vector_distance_batch`.
Parameters
----------
vector_1 : numpy.array
Input vector.
vector_2 : numpy.array
Input vector.
Returns
-------
numpy.float
Poincare distance between `vector_1` and `vector_2`.
"""
return PoincareKeyedVectors.vector_distance_batch(vector_1, vector_2[np.newaxis, :])[0]
@staticmethod
def vector_distance_batch(vector_1, vectors_all):
"""Compute poincare distances between one vector and a set of other vectors.
Parameters
----------
vector_1 : numpy.array
vector from which Poincare distances are to be computed, expected shape (dim,).
vectors_all : numpy.array
for each row in vectors_all, distance from vector_1 is computed, expected shape (num_vectors, dim).
Returns
-------
numpy.array
Poincare distance between `vector_1` and each row in `vectors_all`, shape (num_vectors,).
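        Examples
        --------
        A minimal sketch, using hypothetical vectors inside the unit ball:

        .. sourcecode:: pycon

            >>> import numpy as np
            >>> from gensim.models.poincare import PoincareKeyedVectors
            >>> vector_1 = np.array([0.1, 0.2])
            >>> vectors_all = np.array([[0.3, -0.1], [0.0, 0.0]])
            >>> PoincareKeyedVectors.vector_distance_batch(vector_1, vectors_all).shape
            (2,)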
"""
euclidean_dists = np.linalg.norm(vector_1 - vectors_all, axis=1)
norm = np.linalg.norm(vector_1)
all_norms = np.linalg.norm(vectors_all, axis=1)
return np.arccosh(
1 + 2 * (
(euclidean_dists ** 2) / ((1 - norm ** 2) * (1 - all_norms ** 2))
)
)
def closest_child(self, node):
"""Get the node closest to `node` that is lower in the hierarchy than `node`.
Parameters
----------
node : {str, int}
Key for node for which closest child is to be found.
Returns
-------
{str, None}
Node closest to `node` that is lower in the hierarchy than `node`.
If there are no nodes lower in the hierarchy, None is returned.
"""
all_distances = self.distances(node)
all_norms = np.linalg.norm(self.vectors, axis=1)
node_norm = all_norms[self.get_index(node)]
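        # Nodes lower in the hierarchy lie closer to the boundary of the Poincare
        # ball, i.e. have larger norms, so mask out all nodes at or above `node`.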
mask = node_norm >= all_norms
if mask.all(): # No nodes lower in the hierarchy
return None
all_distances = np.ma.array(all_distances, mask=mask)
closest_child_index = np.ma.argmin(all_distances)
return self.index_to_key[closest_child_index]
def closest_parent(self, node):
"""Get the node closest to `node` that is higher in the hierarchy than `node`.
Parameters
----------
node : {str, int}
Key for node for which closest parent is to be found.
Returns
-------
{str, None}
Node closest to `node` that is higher in the hierarchy than `node`.
If there are no nodes higher in the hierarchy, None is returned.
"""
all_distances = self.distances(node)
all_norms = np.linalg.norm(self.vectors, axis=1)
node_norm = all_norms[self.get_index(node)]
mask = node_norm <= all_norms
if mask.all(): # No nodes higher in the hierarchy
return None
all_distances = np.ma.array(all_distances, mask=mask)
closest_child_index = np.ma.argmin(all_distances)
return self.index_to_key[closest_child_index]
def descendants(self, node, max_depth=5):
"""Get the list of recursively closest children from the given node, up to a max depth of `max_depth`.
Parameters
----------
node : {str, int}
Key for node for which descendants are to be found.
max_depth : int
            Maximum depth up to which to search for descendants; since each step returns the single
            closest child, this is also the maximum number of descendants returned.
Returns
-------
list of str
Descendant nodes from the node `node`.
"""