This repository has been archived by the owner on Oct 6, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path03_lda_sigopt.R
117 lines (81 loc) · 3.48 KB
/
03_lda_sigopt.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
################################################################################
# This script runs the LDA analysis using SigOpt optimized parameters
################################################################################
source("00_globals.R")
### Set global options etc. to work with SigOpt ----
# Read the SigOpt API key from disk and expose it via the environment
# variable the SigOpt client library expects.
api_key <- scan("sigopt_api_key", what = "character", sep = "\n", quiet = TRUE)
Sys.setenv(SIGOPT_API_TOKEN = api_key)
# Define the SigOpt experiment: hyperparameter search space, parallelism,
# observation budget, and the two metrics (classifier accuracy and topic
# coherence) that are jointly maximized.
search_space <- list(
  list(name = "k", type = "int", bounds = list(min = 100, max = 900)),
  list(name = "alpha", type = "double", bounds = list(min = 0.01, max = 1)),
  list(name = "beta_sum", type = "double", bounds = list(min = 50, max = 500))
)

experiment_metrics <- list(
  list(name = "acc", objective = "maximize", strategy = "optimize"),
  list(name = "coh", objective = "maximize", strategy = "optimize")
)

experiment <- create_experiment(list(
  name = "LDA optimization",
  parameters = search_space,
  parallel_bandwidth = 4,
  observation_budget = 100,
  metrics = experiment_metrics,
  project = "topicmodel_compare"
))
### read in data ----
# Loads the formatted 20 Newsgroups objects (dtm, doc_class, train1, train2
# per usage below) into the global environment.
load("data_derived/20_newsgroups_formatted.RData")
### declare model creation and evaluation functions for SigOpt ----
# sample a split to train the topic model and train the random forest model
# NOTE(review): sample() is not seeded here, so the tlda/trf split differs on
# every run — consider a set.seed() upstream if reproducibility matters.
tlda <- sample(train1, size = 1000, replace = FALSE)
trf <- setdiff(train1, tlda)
# Fit an LDA topic model with the suggested hyperparameters, train a
# classifier on the resulting document-topic distributions, and score the
# pipeline on the held-out train2 split.
#
# @param assignments SigOpt assignment list with elements `k` (number of
#   topics), `alpha` (document-topic prior), and `beta_sum` (total mass of
#   the topic-word prior, spread proportionally to corpus term frequency).
# @return A list of SigOpt metric records: classification accuracy ("acc")
#   and mean probabilistic coherence ("coh"), both evaluated on train2.
#   Relies on globals created above: dtm, doc_class, train2, tlda, trf.
create_model <- function(assignments) {
  # create an LDA model on the tlda subsample
  lda <- FitLdaModel(dtm[tlda, ], k = assignments$k,
                     iterations = 300,
                     burnin = 250,
                     alpha = assignments$alpha,
                     # scale the topic-word prior by each term's relative
                     # frequency in the tlda subsample
                     beta = (assignments$beta_sum) * (colSums(dtm[tlda,]) / sum(dtm[tlda, ])),
                     optimize_alpha = TRUE,
                     calc_likelihood = FALSE,
                     calc_r2 = FALSE,
                     calc_coherence = FALSE)
  # project the classifier-training docs into topic space
  lda2 <- predict(lda, dtm[trf, ], method = "dot")
  # guard against undefined projections (e.g. empty documents)
  lda2[is.na(lda2) | is.infinite(lda2)] <- 0
  # train a classifier using trf
  m_lda <- train_classifier(y = doc_class[trf],
                            x = lda2)
  # apply topic model to the train2 evaluation split
  lda3 <- predict(lda, dtm[train2, ], method = "dot")
  # FIX: apply the same NA/Inf cleanup as lda2 — previously skipped here,
  # which could feed NA topic distributions into the classifier
  lda3[is.na(lda3) | is.infinite(lda3)] <- 0
  # predict class probabilities on train2 data for optimization
  p_lda <- predict_classifier(object = m_lda,
                              new_data = lda3)
  # accuracy: most-probable predicted class vs. true class on train2
  predicted_class <- apply(p_lda, 1, function(x) names(x)[which.max(x)])
  acc <- sum(predicted_class == as.character(doc_class[train2])) / length(train2)
  # mean probabilistic coherence of the fitted topics, measured on train2
  coh <- mean(CalcProbCoherence(phi = lda$phi, dtm = dtm[train2, ]))
  # return metrics in the list-of-records format SigOpt expects
  list(list(name = "acc", value = acc),
       list(name = "coh", value = coh))
}
### run the optimization loop ----
# Pause before starting the loop — presumably to let the newly created
# SigOpt experiment become available server-side (TODO confirm).
Sys.sleep(60)
# One SigOpt observation per iteration, evaluated 4-wide to match the
# parallel_bandwidth declared on the experiment.
output <- parallel::mclapply(seq_len(experiment$observation_budget), function(i) {
  suggestion <- create_suggestion(experiment$id)
  metric_values <- create_model(suggestion$assignments)
  create_observation(experiment$id, list(
    suggestion = suggestion$id,
    values = metric_values
  ))
}, mc.cores = 4)

### get the final results ----
# Fetch the final experiment state, report the best hyperparameters found,
# and persist both for downstream scripts.
lda_experiment <- fetch_experiment(experiment$id)
lda_best_assignments <- lda_experiment$progress$best_observation$assignments
print(lda_best_assignments)
save(lda_experiment, lda_best_assignments,
     file = "data_derived/lda_sigopt.RData")