71
71
# ' control argument for \code{\link[daarem]{daarem}}. This setting
72
72
# ' determines to what extent the monotonicity condition can be
73
73
# ' violated.}
74
+ # '
75
+ # ' \item{\code{training_frac}}{Fraction of the columns of input data \code{Y}
76
+ # ' to fit initial model on. If set to \code{1} (default), the model is fit
77
+ # ' by optimizing the parameters on the entire dataset. If set between \code{0}
78
+ # ' and \code{1}, the model is optimized by first fitting a model on a randomly
79
+ # ' selected fraction of the columns of \code{Y}, and then projecting the
80
+ # ' remaining columns of \code{Y} onto the solution. Setting this to a smaller
81
+ # ' value will increase speed but decrease accuracy.
82
+ # ' }
83
+ # '
84
+ # ' \item{\code{num_projection_ccd_iter}}{Number of co-ordinate descent updates
85
+ # ' be made to elements of \code{V} if and when a subset of \code{Y} is
86
+ # ' projected onto \code{U}. Only used if \code{training_frac} is less than
87
+ # ' \code{1}.
88
+ # ' }
74
89
# '
75
90
# ' \item{\code{num_ccd_iter}}{Number of co-ordinate descent updates to
76
91
# ' be made to parameters at each iteration of the algorithm.}
@@ -196,7 +211,7 @@ fit_glmpca_pois <- function(
196
211
# Check and process input argument "control".
197
212
control <- modifyList(fit_glmpca_pois_control_default(),
198
213
control ,keep.null = TRUE )
199
-
214
+
200
215
# Set up the internal fit.
201
216
D <- sqrt(fit0 $ d )
202
217
if (K == 1 )
@@ -205,7 +220,7 @@ fit_glmpca_pois <- function(
205
220
D <- diag(D )
206
221
LL <- t(cbind(fit0 $ U %*% D ,fit0 $ X ,fit0 $ W ))
207
222
FF <- t(cbind(fit0 $ V %*% D ,fit0 $ B ,fit0 $ Z ))
208
-
223
+
209
224
# Determine which rows of LL and FF are "clamped".
210
225
fixed_l <- numeric (0 )
211
226
fixed_f <- numeric (0 )
@@ -217,9 +232,86 @@ fit_glmpca_pois <- function(
217
232
fixed_f <- c(fixed_f ,K + fit0 $ fixed_b_cols )
218
233
if (nz > 0 )
219
234
fixed_f <- c(fixed_f ,K + nx + seq(1 ,nz ))
220
-
221
- # Perform the updates.
222
- res <- fit_glmpca_pois_main_loop(LL ,FF ,Y ,fixed_l ,fixed_f ,verbose ,control )
235
+
236
+ if (control $ training_frac == 1 ) {
237
+
238
+ # Perform the updates.
239
+ res <- fit_glmpca_pois_main_loop(LL ,FF ,Y ,fixed_l ,fixed_f ,verbose ,control )
240
+
241
+ } else {
242
+
243
+ if (control $ training_frac < = 0 || control $ training_frac > 1 )
244
+ stop(" control argument \" training_frac\" should be between 0 and 1" )
245
+
246
+ train_idx <- sample(
247
+ 1 : ncol(Y ),
248
+ size = ceiling(ncol(Y ) * control $ training_frac )
249
+ )
250
+
251
+ browser()
252
+ Y_train <- Y [, train_idx ]
253
+
254
+ if (any(Matrix :: rowSums(Y_train ) == 0 ) || any(Matrix :: colSums(Y_train ) == 0 )) {
255
+
256
+ stop(
257
+ " After subsetting, the remaining values of \" Y\" " ,
258
+ " contain a row or a column where all counts are 0. This can cause " ,
259
+ " problems with optimization. Please either remove rows / columns " ,
260
+ " with few non-zero counts from \" Y\" , or set \" training_frac\" to " ,
261
+ " a larger value."
262
+ )
263
+
264
+ }
265
+
266
+ FF_train <- FF [, train_idx ]
267
+ FF_test <- FF [, - train_idx ]
268
+ Y_test <- Y [, - train_idx ]
269
+
270
+ test_idx <- 1 : ncol(Y )
271
+ test_idx <- test_idx [- train_idx ]
272
+
273
+ # Perform the updates.
274
+ res <- fit_glmpca_pois_main_loop(
275
+ LL ,
276
+ FF_train ,
277
+ Y_train ,
278
+ fixed_l ,
279
+ fixed_f ,
280
+ verbose ,
281
+ control
282
+ )
283
+
284
+ update_indices_f <- sort(setdiff(1 : K ,fixed_f ))
285
+
286
+ # now, I just need to project the results back
287
+ update_factors_faster_parallel(
288
+ L_T = t(res $ fit $ LL ),
289
+ FF = FF_test ,
290
+ M = as.matrix(res $ fit $ LL [update_indices_f ,,drop = FALSE ] %*% Y_test ),
291
+ update_indices = update_indices_f - 1 ,
292
+ num_iter = control $ num_projection_ccd_iter ,
293
+ line_search = control $ line_search ,
294
+ alpha = control $ ls_alpha ,
295
+ beta = control $ ls_beta
296
+ )
297
+
298
+ # now, I need to reconstruct FF, and hopefully compute the log-likelihood
299
+ FF [, train_idx ] <- res $ fit $ FF
300
+ FF [, test_idx ] <- FF_test
301
+ res $ fit $ FF <- FF
302
+
303
+ if (inherits(Y ," sparseMatrix" )) {
304
+ test_loglik_const <- sum(mapSparse(Y_test ,lfactorial ))
305
+ loglik_func <- lik_glmpca_pois_log_sp
306
+ } else {
307
+ test_loglik_const <- sum(lfactorial(Y_test ))
308
+ loglik_func <- lik_glmpca_pois_log
309
+ }
310
+
311
+ test_loglik <- loglik_func(Y_test ,res $ fit $ LL ,FF_test ,test_loglik_const )
312
+ res $ loglik <- res $ loglik + test_loglik
313
+
314
+ }
223
315
224
316
# Prepare the final output.
225
317
res $ progress $ iter <- max(fit0 $ progress $ iter ) + res $ progress $ iter
@@ -258,9 +350,12 @@ fit_glmpca_pois <- function(
258
350
dimnames(fit $ W ) <- dimnames(fit0 $ W )
259
351
}
260
352
class(fit ) <- c(" glmpca_pois_fit" ," list" )
353
+
261
354
return (fit )
355
+
262
356
}
263
357
358
+
264
359
# This implements the core part of fit_glmpca_pois.
265
360
#
266
361
# ' @importFrom Matrix t
@@ -358,6 +453,8 @@ fit_glmpca_pois_control_default <- function()
358
453
list (use_daarem = FALSE ,
359
454
maxiter = 100 ,
360
455
tol = 1e-4 ,
456
+ training_frac = 1 ,
457
+ num_projection_ccd_iter = 10 ,
361
458
mon.tol = 0.05 ,
362
459
convtype = " objfn" ,
363
460
line_search = TRUE ,
0 commit comments