""" Word2Vec.
Implement Word2Vec algorithm to compute vector representations of words.
This example trains on a small chunk of Wikipedia articles (the text8 dataset).
References:
- Mikolov, Tomas et al. "Efficient Estimation of Word Representations
in Vector Space.", 2013.
Links:
- [Word2Vec] https://arxiv.org/pdf/1301.3781.pdf
Author: Aymeric Damien
Project: https://github.com/nebulaai/TensorFlow-Examples/
"""
from __future__ import division, print_function, absolute_import
import collections
import os
import random
import urllib.request
import zipfile
import numpy as np
import tensorflow as tf
# Training Parameters
learning_rate = 0.1
batch_size = 128
num_steps = 3000000
display_step = 10000
eval_step = 200000
# Evaluation Parameters
eval_words = ['five', 'of', 'going', 'hardware', 'american', 'britain']
# Word2Vec Parameters
embedding_size = 200 # Dimension of the embedding vector
max_vocabulary_size = 50000 # Total number of different words in the vocabulary
min_occurrence = 10 # Remove all words that do not appear at least n times
skip_window = 3 # How many words to consider left and right
num_skips = 2 # How many times to reuse an input to generate a label
num_sampled = 64 # Number of negative examples to sample
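# Skip-gram in a nutshell: each training example pairs a center word (input)
# with one of its surrounding context words (label). E.g. with skip_window=1,
# the sentence "the quick brown fox" yields pairs such as (quick -> the),
# (quick -> brown), (brown -> quick), (brown -> fox); num_skips controls how
# many such pairs are drawn per center word before the window slides forward.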
# Download a small chunk of Wikipedia articles collection
url = 'http://mattmahoney.net/dc/text8.zip'
data_path = 'text8.zip'
if not os.path.exists(data_path):
print("Downloading the dataset... (It may take some time)")
    filename, _ = urllib.request.urlretrieve(url, data_path)
print("Done!")
# Unzip the dataset file. Text has already been processed
with zipfile.ZipFile(data_path) as f:
    # ZipFile.read() returns bytes; decode so the words are str, not bytes
    text_words = f.read(f.namelist()[0]).decode('utf-8').lower().split()
# Build the dictionary and replace rare words with UNK token
count = [('UNK', -1)]
# Retrieve the most common words
count.extend(collections.Counter(text_words).most_common(max_vocabulary_size - 1))
# Remove samples with fewer than 'min_occurrence' occurrences
for i in range(len(count) - 1, -1, -1):
if count[i][1] < min_occurrence:
count.pop(i)
else:
# The collection is ordered, so stop when 'min_occurrence' is reached
break
# Compute the vocabulary size
vocabulary_size = len(count)
# Assign an id to each word
word2id = dict()
for i, (word, _) in enumerate(count):
word2id[word] = i
data = list()
unk_count = 0
for word in text_words:
# Retrieve a word id, or assign it index 0 ('UNK') if not in dictionary
index = word2id.get(word, 0)
if index == 0:
unk_count += 1
data.append(index)
count[0] = ('UNK', unk_count)
id2word = dict(zip(word2id.values(), word2id.keys()))
print("Words count:", len(text_words))
print("Unique words:", len(set(text_words)))
print("Vocabulary size:", vocabulary_size)
print("Most common words:", count[:10])
data_index = 0
# Generate training batch for the skip-gram model
def next_batch(batch_size, num_skips, skip_window):
global data_index
assert batch_size % num_skips == 0
assert num_skips <= 2 * skip_window
    batch = np.ndarray(shape=(batch_size,), dtype=np.int32)
labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
# get window size (words left and right + current one)
span = 2 * skip_window + 1
buffer = collections.deque(maxlen=span)
if data_index + span > len(data):
data_index = 0
buffer.extend(data[data_index:data_index + span])
data_index += span
for i in range(batch_size // num_skips):
context_words = [w for w in range(span) if w != skip_window]
words_to_use = random.sample(context_words, num_skips)
for j, context_word in enumerate(words_to_use):
batch[i * num_skips + j] = buffer[skip_window]
labels[i * num_skips + j, 0] = buffer[context_word]
if data_index == len(data):
buffer.extend(data[0:span])
data_index = span
else:
buffer.append(data[data_index])
data_index += 1
    # Backtrack a little bit to avoid skipping words at the end of a batch
data_index = (data_index + len(data) - span) % len(data)
return batch, labels
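# For illustration: if data starts with [5, 12, 7, 3, ...] and skip_window=1,
# num_skips=2, the first window is [5, 12, 7] with center word 12, so the
# batch begins with inputs [12, 12, ...] and labels drawn from {5, 7}; the
# window then slides one word to the right and the process repeats.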
# Input data
X = tf.placeholder(tf.int32, shape=[None])
# Input label
Y = tf.placeholder(tf.int32, shape=[None, 1])
# Ensure the following ops & var are assigned on CPU
# (some ops are not compatible with GPU)
with tf.device('/cpu:0'):
    # Create the embedding variable (each row represents a word embedding vector)
embedding = tf.Variable(tf.random_normal([vocabulary_size, embedding_size]))
# Lookup the corresponding embedding vectors for each sample in X
X_embed = tf.nn.embedding_lookup(embedding, X)
# Construct the variables for the NCE loss
nce_weights = tf.Variable(tf.random_normal([vocabulary_size, embedding_size]))
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
# Compute the average NCE loss for the batch
loss_op = tf.reduce_mean(
tf.nn.nce_loss(weights=nce_weights,
biases=nce_biases,
labels=Y,
inputs=X_embed,
num_sampled=num_sampled,
num_classes=vocabulary_size))
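# NCE (noise-contrastive estimation) avoids computing a full softmax over the
# vocabulary: for each (input, label) pair it trains a binary classifier to
# distinguish the true label from num_sampled randomly drawn "noise" words,
# costing O(num_sampled) per step instead of O(vocabulary_size).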
# Define the optimizer
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
train_op = optimizer.minimize(loss_op)
# Evaluation
# Compute the cosine similarity between the input embedding and all embedding vectors
X_embed_norm = X_embed / tf.sqrt(tf.reduce_sum(tf.square(X_embed), 1, keepdims=True))
embedding_norm = embedding / tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keepdims=True))
cosine_sim_op = tf.matmul(X_embed_norm, embedding_norm, transpose_b=True)
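# Cosine similarity between vectors a and b is (a . b) / (||a|| * ||b||).
# Because both operands above are L2-normalized row-wise, a single matrix
# product of the input embeddings against the (transposed) embedding table
# yields the similarity of each evaluation word to every vocabulary word.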
# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()
with tf.Session() as sess:
# Run the initializer
sess.run(init)
# Testing data
x_test = np.array([word2id[w] for w in eval_words])
average_loss = 0
    for step in range(1, num_steps + 1):
# Get a new batch of data
batch_x, batch_y = next_batch(batch_size, num_skips, skip_window)
# Run training op
_, loss = sess.run([train_op, loss_op], feed_dict={X: batch_x, Y: batch_y})
average_loss += loss
if step % display_step == 0 or step == 1:
if step > 1:
average_loss /= display_step
print("Step " + str(step) + ", Average Loss= " + \
"{:.4f}".format(average_loss))
average_loss = 0
# Evaluation
if step % eval_step == 0 or step == 1:
print("Evaluation...")
sim = sess.run(cosine_sim_op, feed_dict={X: x_test})
            for i in range(len(eval_words)):
top_k = 8 # number of nearest neighbors
nearest = (-sim[i, :]).argsort()[1:top_k + 1]
log_str = '"%s" nearest neighbors:' % eval_words[i]
                for k in range(top_k):
log_str = '%s %s,' % (log_str, id2word[nearest[k]])
print(log_str)
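    # (Optional) After training, the learned embeddings can be pulled out as a
    # NumPy array for downstream use, e.g.:
    #   final_embeddings = sess.run(embedding_norm)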