-
Notifications
You must be signed in to change notification settings - Fork 64
/
Copy pathNLP-wordvectors.tex
749 lines (455 loc) · 24.3 KB
/
NLP-wordvectors.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
%\documentclass[mathserif]{beamer}
\documentclass[handout]{beamer}
%\usetheme{Goettingen}
%\usetheme{Warsaw}
\usetheme{Singapore}
%\usetheme{Frankfurt}
%\usetheme{Copenhagen}
%\usetheme{Szeged}
%\usetheme{Montpellier}
%\usetheme{CambridgeUS}
%\usecolortheme{}
%\setbeamercovered{transparent}
\usepackage[english, activeacute]{babel}
\usepackage[utf8]{inputenc}
\usepackage{amsmath, amssymb}
\usepackage{dsfont}
\usepackage{graphics}
\usepackage{cases}
\usepackage{graphicx}
\usepackage{pgf}
\usepackage{epsfig}
\usepackage{amssymb}
\usepackage{multirow}
\usepackage{amstext}
\usepackage[ruled,vlined,lined]{algorithm2e}
\usepackage{amsmath}
\usepackage{epic}
\usepackage{epsfig}
\usepackage{fontenc}
\usepackage{framed,color}
\usepackage{palatino, url, multicol}
%\algsetup{indent=2em}
\newcommand{\factorial}{\ensuremath{\mbox{\sc Factorial}}}
\newcommand{\BIGOP}[1]{\mathop{\mathchoice%
{\raise-0.22em\hbox{\huge $#1$}}%
{\raise-0.05em\hbox{\Large $#1$}}{\hbox{\large $#1$}}{#1}}}
\newcommand{\bigtimes}{\BIGOP{\times}}
\vspace{-0.5cm}
\title{Natural Language Processing \\ Word Vectors}
\vspace{-0.5cm}
\author[Felipe Bravo Márquez]{\footnotesize
%\author{\footnotesize
\textcolor[rgb]{0.00,0.00,1.00}{Felipe Bravo-Marquez}}
\date{\today}
\begin{document}
\begin{frame}
\titlepage
\end{frame}
\begin{frame}{Word Vectors}
\begin{scriptsize}
\begin{itemize}
\item A major component in neural networks for language is the use of an embedding
layer.
\item A mapping of discrete symbols to continuous vectors.
\item When embedding words, they transform from being isolated distinct symbols into mathematical
objects that can be operated on.
\item Distance between vectors can be equated to distance between words.
\item This makes it easier to generalize the behavior from one word to another.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Distributional Vectors}
\begin{scriptsize}
\begin{itemize}
\item \textbf{Distributional Hypothesis} \cite{harris1954}: words occurring in the same \textbf{contexts} tend to have similar meanings.
\item Or equivalently: ``a word is characterized by the \textbf{company} it keeps''.
\item \textbf{Distributional representations}: words are represented by \textbf{high-dimensional vectors} based on the contexts in which they occur.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Word-context Matrices}
\begin{scriptsize}
\begin{itemize}
\item Distributional vectors are built from word-context matrices $M$.
\item Each cell $(i,j)$ is a co-occurrence based association value between a \textbf{target word} $w_i$ and a \textbf{context} $c_j$ calculated from a corpus of documents.
\item Contexts are commonly defined as windows of words surrounding $w_i$.
\item The window length $k$ is a parameter (usually between 1 and 8 words on both the left and the right sides of $w_i$).
\item If the vocabulary of the target words and context words is the same, $M$ has dimensionality $|\mathcal{V}| \times |\mathcal{V}|$.
\item Whereas shorter windows are likely to capture \textbf{syntactic information} (e.g., POS), longer windows are more likely to capture topical similarity \cite{goldberg2016primer, JurafskyBook}.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Distributional Vectors with context windows of size 1}
\begin{figure}[htb]
\centering
\includegraphics[scale=0.3]{pics/distributionalSocher.png}
\end{figure}
\footnotetext{Example taken from: \url{http://cs224d.stanford.edu/lectures/CS224d-Lecture2.pdf}}
%\footnote{Source: \url{http://cs224d.stanford.edu/lectures/CS224d-Lecture2.pdf}}
\end{frame}
\begin{frame}{Word-context Matrices}
\begin{scriptsize}
The associations between words and contexts can be calculated using different approaches:
\begin{enumerate}
\item Co-occurrence counts.
\item Positive point-wise mutual information (PPMI).
\item The significance values of a paired t-test.
\end{enumerate}
The most common of those according to \cite{JurafskyBook} is PPMI.
Distributional methods are also referred to as count-based methods.
\end{scriptsize}
\end{frame}
\begin{frame}{PPMI}
\begin{scriptsize}
\begin{itemize}
\item PMI calculates the log of the probability of word-context pairs occurring together over the probability of them being independent.
\begin{equation}
\operatorname{PMI}(w, c)= \log_2 \left( \frac{P(w,c)}{P(w)P(c)} \right) = \log_{2} \left ( \frac{\operatorname{count}(w,c)\times |D|}{\operatorname{count}(w)\times \operatorname{count}(c)} \right )
\end{equation}
\item Negative PMI values suggest that the pair co-occurs less often than chance.
\item These estimates are unreliable unless the counts are calculated from very large corpora \cite{JurafskyBook}.
\item PPMI corrects this problem by replacing negative values by zero:
\begin{equation}
\operatorname{PPMI}(w, c)= \operatorname{max}(0,\operatorname{PMI}(w, c))
\end{equation}
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Distributed Vectors or Word embeddings}
\begin{scriptsize}
\begin{itemize}
\item Count-based distributional vectors increase in size with the vocabulary, i.e., they can have a very high dimensionality.
\item Explicitly storing the co-occurrence matrix can be memory-intensive.
\item Some classification models don't scale well to high-dimensional data.
\item The neural network community prefers using \textbf{distributed representations}\footnote{Idea: The meaning of the word is ``distributed'' over a combination of dimensions.} or \textbf{word embeddings}.
\item Word \textbf{embeddings} are low-dimensional continuous dense word vectors trained from document corpora using \textbf{neural networks}.
\item The dimensions are not directly interpretable, i.e., they represent latent features of the word, ``hopefully capturing useful syntactic and semantic properties''~\cite{turian2010word}.
\item They have become a crucial component of neural network architectures for NLP.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Distributed Vectors or Word embeddings (2)}
\begin{scriptsize}
\begin{itemize}
\item There are two main approaches for obtaining word embeddings:
\begin{enumerate}
\begin{scriptsize}
\item Embedding layers: using an embedding layer in a task-specific neural network architecture trained from labeled examples (e.g., sentiment analysis).
\item Pre-trained word embeddings: creating an auxiliary predictive task from unlabeled corpora (e.g., predict the following word) in which word embeddings will naturally arise from the neural-network architecture.
\end{scriptsize}
\end{enumerate}
\item These approaches can also be combined: one can initialize an embedding layer of a task-specific neural network with pre-trained word embeddings obtained with the second approach.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Distributed Vectors or Word embeddings (3)}
\begin{scriptsize}
\begin{itemize}
\item The most popular models based on the second approach are skip-gram \cite{Mikolov2013}, continuous bag-of-words \cite{Mikolov2013}, and GloVe \cite{penningtonSM14}.
\item Word embeddings have been shown to be more powerful than distributional approaches in many NLP tasks~\cite{baroni2014don}.
\item In \cite{amir2015SemEval}, they were used as \textbf{features} in a regression model for determining the association between Twitter words and \textbf{positive sentiment}.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Word2Vec}
\begin{scriptsize}
\begin{itemize}
\item Word2Vec is a software package that implements two neural network architectures for training word embeddings: Continuous Bag of Words (CBOW) and Skip-gram.
\item It implements two optimization models: Negative Sampling and Hierarchical Softmax.
\item These models are shallow neural networks that are trained to predict the contexts of words.
\item A very comprehensive tutorial about the algorithms behind word2vec: \url{https://arxiv.org/pdf/1411.2738.pdf}.
\end{itemize}
\end{scriptsize}
\end{frame}
% Good Word2Vec tutorial
%http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
\begin{frame}{Skip-gram Model}
\begin{scriptsize}
\begin{itemize}
\item A neural network with one ``projection'' or ``hidden'' layer is trained for predicting the words surrounding a center word, within a window of size $k$ that is shifted along the input corpus.
\item The center and surrounding $k$ words correspond to the input and output layers of the network.
\item Words are initially represented by 1-hot vectors: vectors of the size of the vocabulary ($|V|$) with zero values in all entries except for the corresponding word index that receives a value of 1.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Skip-gram Model}
\begin{scriptsize}
\begin{itemize}
\item The output layer combines the $k$ 1-hot vectors of the surrounding words.
\item The hidden layer has a dimensionality $d$, which determines the size of the embeddings (normally $d \ll |V|$).
\end{itemize}
\end{scriptsize}
\begin{figure}[h]
\includegraphics[scale = 0.4]{pics/skip-gram.png}
\end{figure}
\end{frame}
\begin{frame}{Skip-gram Model}
\begin{figure}[h]
\includegraphics[scale = 0.4]{pics/skip_gram_net_arch.png}
\end{figure}
\footnotetext{Picture taken from: \url{http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/}}
\end{frame}
% we use hierarchical softmax where the vocabulary is represented as a Huffman binary tree. This follows previous observations that the frequency of words works well for obtaining classes in neural net language models [16]. Huffman trees assign short binary codes to frequent words, and this further reduces the number of output units that need to be evaluated
%If two different words have very similar “contexts” (that is, what words are likely to appear around them), then our model needs to output very similar results for these two words. And one way for the network to output similar context predictions for these two words is if the word vectors are similar. So, if two words have similar contexts, then our network is motivated to learn similar word vectors for these two words! Ta da!
%And what does it mean for two words to have similar contexts? I think you could expect that synonyms like “intelligent” and “smart” would have very similar contexts. Or that words that are related, like “engine” and “transmission”, would probably have similar contexts as well.
%This can also handle stemming for you – the network will likely learn similar word vectors for the words “ant” and “ants” because these should have similar contexts.
%https://arxiv.org/pdf/1402.3722.pdf
\begin{frame}{Parametrization of the Skip-gram model}
\begin{scriptsize}
\begin{itemize}
\item We are given an input corpus formed by a sequence of words $w_1, w_2, w_3, \dots, w_T$ and a window size $k$.
\item We denote target (or center) words by the letter $w$ and surrounding context words by the letter $c$.
\item The context window $c_{1:k}$ of word $w_t$ corresponds to words $w_{t-k/2},\dots, w_{t-1}, w_{t+1}, \dots, w_{t+k/2}$ (assuming that $k$ is an even number).
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Parametrization of the Skip-gram model}
\begin{scriptsize}
\begin{itemize}
\item The objective of the Skip-gram model is to maximize the average log probability of the context words given the target words:
\begin{displaymath}
\frac{1}{T} \sum_{t=1}^T \sum_{c \in c_{1:k}} \log P(c|w_t)
\end{displaymath}
\item The conditional probability of a context word $c$ given a center word $w$ is modeled with a softmax ($C$ is the set of all context words, which is usually the same as the vocabulary):
\begin{displaymath}
P(c|w) = \frac{e^{\vec{c}\cdot \vec{w}}}{ \sum_{c'\in C} e^{\vec{c}'\cdot \vec{w}}}
\end{displaymath}
\item Model's parameters $\theta$: $\vec{c}$ and $\vec{w}$ (vector representations of contexts and target words).
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Parametrization of the Skip-gram model}
\begin{scriptsize}
\begin{itemize}
\item Let $D$ be the set of correct word-context pairs (i.e., word pairs that are observed in the corpus).
\item The optimization goal is to maximize the conditional log-likelihood of the contexts $c$ (this is equivalent to minimizing the cross-entropy loss):
\begin{equation}
\begin{split}
\operatorname{arg} \max_{\vec{c}, \vec{w}} & \quad \sum_{(w,c) \in D}{\log P(c|w)} = \sum_{(w,c) \in D} ( \log e^{\vec{c}\cdot \vec{w}} - \log \sum_{c'\in C} e^{\vec{c}'\cdot \vec{w}} )
\end{split}
\end{equation}
\item Assumption: maximizing this function will result in good embeddings $\vec{w}$ i.e., similar words will have similar vectors.
\item The term $P(c|w)$ is computationally expensive because of the summation $\sum_{c'\in C} e^{\vec{c}'\cdot \vec{w}}$ over all the contexts $c'$.
\item Fix: replace the softmax with a hierarchical softmax (the vocabulary is represented with a Huffman binary tree).
\item Huffman trees assign short binary codes to frequent words, reducing the number of output units to be evaluated.
\end{itemize}
\end{scriptsize}
\end{frame}
%The distributional hypothesis states that words in similar contexts have similar meanings. The objective above clearly tries to increase the quantity for good word-context pairs, and decrease it for bad ones. Intuitively, this means that words that share many contexts will be similar to each other (notealso that contexts sharing many words will also be similar to each other). This is, however, very hand-wavy.
% https://arxiv.org/pdf/1402.3722.pdf Skip-gram and Negative Sampling are not the same
\begin{frame}{Skip-gram with Negative Sampling}
\begin{scriptsize}
\begin{itemize}
\item Negative-sampling (NS) is presented as a more efficient model for calculating skip-gram embeddings.
\item However, it optimizes a different objective function \cite{goldberg2014word2vec}.
\item NS maximizes the probability that a word-context pair $(w, c)$ came from the set of correct word-context pairs $D$ using a sigmoid function:
\begin{displaymath}
P(D = 1| w,c_i) = \frac{1}{1+e^{-\vec{w} \cdot \vec{c_{i}}}}
\end{displaymath}
\item Assumption: the context words $c_i$ are independent of each other:
\begin{displaymath}
P(D = 1| w,c_{1:k}) = \prod_{i=1}^{k}{P(D = 1| w,c_i)} = \prod_{i=1}^{k}{\frac{1}{1+e^{-\vec{w} \cdot \vec{c_{i}}}}}
\end{displaymath}
\item This leads to the following target function (log-likelihood):
\begin{equation}
\begin{split}
\operatorname{arg} \max_{\vec{c}, \vec{w}} & \quad \log P(D = 1| w,c_{1:k}) = \sum_{i=1}^{k}{\log \frac{1}{1+e^{-\vec{w} \cdot \vec{c_{i}}}}}
\end{split}
\end{equation}
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Skip-gram with Negative Sampling (2)}
\begin{scriptsize}
\begin{itemize}
\item This objective has a trivial solution if we set $\vec{w}$,$\vec{c}$ such that $P(D=1|w,c)=1$ for every pair $(w,c)$ from $D$.
\item This is achieved by setting $\vec{w}=\vec{c}$ and $\vec{w} \cdot \vec{c} = K$ for all $\vec{w},\vec{c}$, where $K$ is a large number.
\item We need a mechanism that prevents all the vectors from having the same value, by disallowing some $(w, c)$ combinations.
\item One way to do so is to present the model with some $(w, c)$ pairs for which $P(D= 1|w, c)$ must be low, i.e.,
pairs which are not in the data.
\item This is achieved by sampling negative examples, which form the set $\tilde{D}$.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Skip-gram with Negative Sampling (3)}
\begin{scriptsize}
\begin{itemize}
\item Sample $m$ words for each word-context pair $(w,c) \in D$.
\item Add each sampled word $w_i$ together with the original context $c$ as a negative example to $\tilde{D}$.
%\item $\tilde{D}$ being $m$ times larger than $D$.
%\item The number of negative samples $m$ is a parameter of the algorithm.
\item Final objective function:
\begin{equation}
\begin{split}
\operatorname{arg} \max_{\vec{c}, \vec{w}} & \quad \sum_{(w,c) \in D}{\log P(D = 1| w,c)} + \sum_{(w,c) \in \tilde{D}} \log P(D = 0| w,c)
\end{split}
\end{equation}
\item The negative words are sampled from a smoothed version of the corpus frequencies:
\begin{displaymath}
\frac{\#(w)^{0.75}}{\sum_{w'}\#(w')^{0.75}}
\end{displaymath}
\item This gives more relative weight to less frequent words.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Continuous Bag of Words: CBOW}
\begin{scriptsize}
\begin{itemize}
\item Similar to the skip-gram model but now the center word is predicted from the surrounding context.
\begin{figure}[h]
\includegraphics[scale = 0.55]{pics/CBOW.png}
\end{figure}
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{GloVe}
\begin{scriptsize}
\begin{itemize}
\item GloVe (from global vectors) is another popular method for training word embeddings \cite{penningtonSM14}.
\item It constructs an explicit word-context
matrix, and trains the word and context vectors $\vec{w}$ and $\vec{c}$ attempting to satisfy:
\begin{equation}
\vec{w} \cdot \vec{c} + b_{[w]}+b_{[c]} = \log \#(w,c) \quad \forall (w,c) \in D
\end{equation}
\item where $b_{[w]}$ and $b_{[c]}$ are word-specific and context-specific trained biases.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{GloVe (2)}
\begin{scriptsize}
\begin{itemize}
\item In terms of matrix factorization, if we fix $b_{[w]}=\log \#(w)$ and $b_{[c]}=\log \#(c)$ we'll get an objective that is very similar to factorizing the word-context PMI matrix, shifted by $\log (|D|)$.
\item In GloVe the bias parameters are learned and not fixed, giving it another degree of freedom.
\item The optimization objective is a weighted least-squares loss, assigning more weight to the correct reconstruction of frequent items.
\item When using the same word and context vocabularies, the model suggests representing each word as the sum of its corresponding word and context embedding vectors.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Word Analogies}
\begin{scriptsize}
\begin{itemize}
\item Word embeddings can capture certain semantic relationships, e.g. male-female, verb tense and country-capital relationships between words.
\item For example, the following relationship is found for word embeddings
trained using Word2Vec: $\vec{w}_{king} - \vec{w}_{man} + \vec{w}_{woman} \approx \vec{w}_{queen}$.
\end{itemize}
\begin{figure}[h]
\includegraphics[scale = 0.2]{pics/linear-relationships.png}
\end{figure}
\footnotetext{Source: \url{https://www.tensorflow.org/tutorials/word2vec}}
\end{scriptsize}
\end{frame}
\begin{frame}{Evaluation}
\begin{scriptsize}
\begin{itemize}
\item There are many datasets with human annotated associations of word pairs or gold analogies that can be used to evaluate word embeddings algorithms.
\item Those approaches are called \textit{Intrinsic Evaluation Approaches}.
\item Most of them are implemented in: \url{https://github.com/kudkudak/word-embeddings-benchmarks}.
\item Word embeddings can also be evaluated extrinsically by using them in an external NLP task (e.g., POS tagging, sentiment analysis).
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Correspondence between Distributed and Distributional Models}
\begin{scriptsize}
\begin{itemize}
\item Both the distributional ``count-based'' methods and the distributed ``neural'' ones are based on the distributional hypothesis.
\item They both attempt to capture the similarity between words based on the similarity between the contexts in which they occur.
\item Levy and Goldberg showed in \cite{levy2014neural} that Skip-gram negative sampling (SGNS) is implicitly factorizing a word-context matrix, whose cells are the pointwise mutual information (PMI) of the respective word and context pairs, shifted by a global constant.
%https://levyomer.files.wordpress.com/2014/09/neural-word-embeddings-as-implicit-matrix-factorization.pdf
\item This ties the neural methods to the traditional ``count-based'' ones, suggesting that in a deep sense
the two algorithmic families are equivalent.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{FastText}
\begin{scriptsize}
\begin{itemize}
\item FastText embeddings extend the skipgram model to take into account the internal structure of words while learning word representations \cite{bojanowski2016enriching}.
\item A vector representation is associated with each character $n$-gram.
\item Words are represented as the sum of these representations.
\item Taking the word \emph{where} and $n = 3$, it will be represented by the character $n$-grams: $<$wh, whe, her, ere, re$>$, and the special sequence $<$where$>$.
\item Note that the sequence \emph{$<$her$>$}, corresponding to the word ``her'', is different from the tri-gram ``her'' from the word ``where''.
\item FastText is useful for morphologically rich languages. For example, the words ``amazing'' and ``amazingly'' share information in FastText through their shared $n$-grams, whereas in Word2Vec these two words are completely unrelated.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{FastText (2)}
\begin{scriptsize}
\begin{itemize}
\item Let $\mathcal{G}_{w}$ be the set of $n$-grams appearing in $w$.
\item FastText associates a vector $\vec{g}$ with each $n$-gram in $\mathcal{G}_{w}$.
\item In FastText the probability that a word-context pair $(w, c)$ came from the input corpus $D$ is calculated as follows:
\begin{displaymath}
P(D | w, c) = \frac{1}{1+e^{-s(w,c)}}
\end{displaymath}
where,
\begin{displaymath}
s(w,c) = \sum_{g \in \mathcal{G}_{w}} \vec{g} \cdot \vec{c}.
\end{displaymath}
\item The negative sampling algorithm can be calculated in the same form as in the skip-gram model with this formulation.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Sentiment-Specific Phrase Embeddings}
%https://pdfs.semanticscholar.org/107f/b80ff801894b6191d0613af41aba91c134a4.pdf
\begin{scriptsize}
\begin{itemize}
\item Problem of word embeddings: antonyms can be used in similar contexts e.g., my car is nice vs my car is ugly.
\item In \cite{TangCol14} \textbf{sentiment-specific} word embeddings are proposed by combining the skip-gram model with emoticon-annotated tweets :) :( .
\item These embeddings are used for \textbf{training} a word-level polarity classifier.
\item The model integrates sentiment information into the continuous representation of phrases by developing a tailored neural architecture.
\item Input: $\{w_i,s_j,pol_j\}$, where $w_i$ is a phrase (or word), $s_j$ the sentence, and $pol_j$ the sentence's polarity.
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Sentiment-Specific Phrase Embeddings (2)}
%https://pdfs.semanticscholar.org/107f/b80ff801894b6191d0613af41aba91c134a4.pdf
\begin{scriptsize}
\begin{itemize}
\item The training objective uses the embedding of $w_i$ to predict its context words (in the same way as the skip-gram model), and uses the sentence representation $se_j$ to predict $pol_j$.
\item Sentences ($se_j$) are represented by averaging the word vectors of their words.
\item The objective of the sentiment part is to maximize the average of log sentiment probability:
\begin{displaymath}
f_{sentiment}= \frac{1}{S}\sum_{j=1}^{S}\log p(pol_j|se_j)
\end{displaymath}
\item The final training objective is to maximize the linear combination of the skip-gram and sentiment objectives:
\begin{displaymath}
f = \alpha f_{skipgram} + (1- \alpha)f_{sentiment}
\end{displaymath}
\end{itemize}
\end{scriptsize}
\end{frame}
\begin{frame}{Sentiment-Specific Phrase Embeddings}
%https://pdfs.semanticscholar.org/107f/b80ff801894b6191d0613af41aba91c134a4.pdf
\begin{figure}[h]
\includegraphics[scale = 0.4]{pics/SSPE.png}
\end{figure}
\begin{figure}[h]
\includegraphics[scale = 0.3]{pics/SSPERes.png}
\end{figure}
\end{frame}
\begin{frame}{Gensim}
\begin{scriptsize}
Gensim is an open source Python library for natural language processing that implements many algorithms for training word embeddings.
\begin{itemize}
\item \url{https://radimrehurek.com/gensim/}
\item \url{https://machinelearningmastery.com/develop-word-embeddings-python-gensim/}
\end{itemize}
\begin{figure}[h]
\includegraphics[scale = 0.3]{pics/gensim.png}
\end{figure}
\end{scriptsize}
\end{frame}
\begin{frame}
\frametitle{Questions?}
%\vspace{1.5cm}
\begin{center}\LARGE Thanks for your Attention!\\ \end{center}
\end{frame}
\begin{frame}[allowframebreaks]\scriptsize
\frametitle{References}
\bibliography{bio}
\bibliographystyle{apalike}
%\bibliographystyle{flexbib}
\end{frame}
%%%%%%%%%%%%%%%%%%%%%%%%%%%
\end{document}