This repository has been archived by the owner on Nov 3, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathTREC.kt
126 lines (92 loc) · 4.71 KB
/
TREC.kt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
package com.komputation.cuda.demos.trec
import com.komputation.cuda.network.cudaNetwork
import com.komputation.demos.trec.NLP
import com.komputation.demos.trec.TRECData
import com.komputation.initialization.uniformInitialization
import com.komputation.instructions.continuation.activation.Activation
import com.komputation.instructions.continuation.activation.relu
import com.komputation.instructions.continuation.convolution.convolution
import com.komputation.instructions.continuation.dense.dense
import com.komputation.instructions.continuation.dropout.dropout
import com.komputation.instructions.entry.lookup
import com.komputation.instructions.loss.crossEntropyLoss
import com.komputation.optimization.historical.nesterov
import java.io.File
import java.util.*
/**
 * Entry point: expects exactly two command-line arguments —
 * the path to the GloVe word-embedding file and the embedding dimensionality.
 *
 * @throws IllegalArgumentException if the argument count is wrong or the
 *         dimension argument is not an integer.
 */
fun main(args: Array<String>) {
    // require() is the idiomatic argument check; it throws IllegalArgumentException
    // instead of a bare Exception.
    require(args.size == 2) {
        "Please specify the path to the Glove word embeddings and the number of dimensions."
    }

    val embeddingFilePath = args.first()

    // toIntOrNull + requireNotNull gives a clear diagnostic instead of an
    // opaque NumberFormatException when the second argument is not numeric.
    val dimensions = requireNotNull(args.last().toIntOrNull()) {
        "The number of dimensions must be an integer, but was \"${args.last()}\"."
    }

    Trec().run(embeddingFilePath, dimensions)
}
class Trec {

    /**
     * Trains a convolutional sentence classifier on the TREC question-classification
     * data set (bundled as a "trec" resource) and prints the test accuracy after each
     * training epoch.
     *
     * Pipeline: read TREC examples → build a vocabulary from the training documents →
     * keep only tokens covered by the embedding file → drop documents shorter than the
     * convolution filter width → vectorize → train a lookup/convolution/dense network.
     *
     * @param embeddingFilePath path to a GloVe-style word-embedding file
     * @param embeddingDimension dimensionality of the vectors in that file
     */
    fun run(embeddingFilePath: String, embeddingDimension: Int) {
        // Fixed seed so weight initialization and dropout are reproducible.
        val random = Random(1)
        val initialization = uniformInitialization(random, -0.1f, 0.1f)
        val optimization = nesterov(0.01f, 0.9f)

        // Hyperparameters.
        val batchSize = 32
        val numberIterations = 7
        val numberFilters = 100
        // Each filter spans the full embedding dimension and filterWidth consecutive tokens.
        val filterHeight = embeddingDimension
        val filterWidth = 2
        val keepProbability = 0.8f

        // The TREC training/test splits are shipped as classpath resources.
        val trecDirectory = File(javaClass.classLoader.getResource("trec").toURI())
        val trainingFile = File(trecDirectory, "training.data")
        val testFile = File(trecDirectory, "test.data")

        val (trainingCategories, trainingDocuments) = TRECData.readExamples(trainingFile)
        val (testCategories, testDocuments) = TRECData.readExamples(testFile)

        // Restrict the vocabulary to training tokens that have a pre-trained embedding.
        val vocabulary = NLP.generateVocabulary(trainingDocuments)
        val embeddingFile = File(embeddingFilePath)
        val embeddingMap = NLP.embedVocabulary(vocabulary, embeddingFile)
        val embeddableVocabulary = embeddingMap.keys.sorted()

        // Remove out-of-vocabulary tokens from both splits.
        val trainingDocumentsWithFilteredTokens = NLP.filterTokens(trainingDocuments, embeddableVocabulary)
        // The lookup layer pads every document to the longest training document;
        // checkNotNull replaces a bare !! with a diagnostic for the empty-corpus case.
        val maximumDocumentLength = checkNotNull(
            trainingDocumentsWithFilteredTokens.maxBy { document -> document.size }
        ) { "The training set must contain at least one document." }.size
        val testDocumentsWithFilteredTokens = NLP.filterTokens(testDocuments, embeddableVocabulary)

        // Drop documents that are shorter than the filter width — the convolution
        // could not be applied to them.
        val embeddableTrainingIndices = NLP.filterDocuments(trainingDocumentsWithFilteredTokens, filterWidth)
        val embeddableTestIndices = NLP.filterDocuments(testDocumentsWithFilteredTokens, filterWidth)
        val embeddableTrainingDocuments = trainingDocumentsWithFilteredTokens.slice(embeddableTrainingIndices)
        val embeddableTestDocuments = testDocumentsWithFilteredTokens.slice(embeddableTestIndices)

        // Map each document to the indices of its tokens within the sorted vocabulary.
        val trainingRepresentations = NLP.vectorizeDocuments(embeddableTrainingDocuments, embeddableVocabulary)
        val testRepresentations = NLP.vectorizeDocuments(embeddableTestDocuments, embeddableVocabulary)

        // Keep the category labels aligned with the surviving documents.
        val embeddableTrainingCategories = trainingCategories.slice(embeddableTrainingIndices)
        val embeddableTestCategories = testCategories.slice(embeddableTestIndices)

        // Category indices are derived from the full training label set so the
        // one-hot targets of both splits share the same layout.
        val indexedCategories = NLP.indexCategories(trainingCategories.toSet())
        val numberCategories = indexedCategories.size
        val trainingTargets = NLP.createTargets(embeddableTrainingCategories, indexedCategories)
        val testTargets = NLP.createTargets(embeddableTestCategories, indexedCategories)

        // Embedding rows in the same (sorted) order as the vocabulary indices.
        // The !! is safe: embeddableVocabulary is exactly embeddingMap.keys.
        val embeddings = embeddableVocabulary
            .map { token -> embeddingMap[token]!! }
            .toTypedArray()

        // Lookup → convolution → ReLU → dropout → softmax classifier.
        val sentenceClassifier = cudaNetwork(
            batchSize,
            lookup(embeddings, maximumDocumentLength, embeddingDimension, optimization),
            convolution(numberFilters, filterWidth, filterHeight, initialization, optimization),
            relu(),
            dropout(random, keepProbability),
            dense(numberCategories, Activation.Softmax, initialization, optimization)
        )

        val test = sentenceClassifier
            .test(
                testRepresentations,
                testTargets,
                batchSize,
                numberCategories,
                1)

        // After every epoch, evaluate on the test split and print the accuracy.
        val training = sentenceClassifier
            .training(
                trainingRepresentations,
                trainingTargets,
                numberIterations,
                crossEntropyLoss()) { _: Int, _: Float ->
                println(test.run())
            }

        training
            .run()

        // Release GPU resources explicitly — they are not garbage-collected.
        test.free()
        training.free()
        sentenceClassifier.free()
    }
}