using LinearAlgebra, Statistics  # for norm, I (UniformScaling) and mean in Julia 1.x

# This implements the L0 EM algorithm for regularized regression as described in:
#
# Zhenqiu Liu and Gang Li. "Efficient Regularized Regression for Variable
# Selection with L0 Penalty" Submitted to arXiv on July 29th 2014.
#
# theta = l0EM(X, y, lambda; kws...)
#
# Find coefficients 'theta' for regularized regression with L0 penalty, i.e.
# theta minimizes 0.5*norm(y - X*theta)^2 + lambda/2*sum(theta .!= 0.0).
#
# Outputs:
# 'theta' is an M*1 vector with the (sparse) coefficients.
#
# Inputs:
# 'X' is an N by M matrix.
#
# 'y' is an N by 1 array.
#
# 'lambda' is the regularization parameter. By default it is set to 2, which corresponds to AIC.
# Other common choices are log(N) (BIC) and 2*log(M) (RIC).
#
# Keyword args:
# 'epsilon' is the smallest coefficient magnitude that is kept; coefficients whose absolute
# value falls below it are set to zero after the iteration. Defaults to 1e-4.
#
# 'deltaThreshold' is the threshold on the change norm(theta_new - theta_old) below which the
# iteration is deemed to have converged. Defaults to 1e-5.
#
# 'nonnegative' is true iff only non-negative coefficients are allowed. Defaults to false.
#
# 'maxIterations' is the maximum number of iterations (a safeguard for lambda values for which
# the iteration does not converge). Defaults to 10000.
#
function l0EM(X, y, lambda = 2;
  epsilon = 1e-4, deltaThreshold = 1e-5,
  nonnegative = false, maxIterations = 10000)

  N, M = size(X)
  lambda_eye = lambda * I   # UniformScaling; same effect as, but faster than, lambda * Matrix(I, N, N)
  xt = X'

  # Initial (ridge-like) solution; the backslash solve is faster and more stable than using inv().
  theta = xt * ((X * xt + lambda_eye) \ y)
  if nonnegative
    set_negative_to_zero!(theta)
  end

  iterations = 0
  while iterations < maxIterations
    iterations += 1

    # E-step: the current coefficient estimate.
    eta = theta

    # M-step: theta = S * X' * inv(X*S*X' + lambda*I) * y with S = diagm(eta.^2), implemented
    # with broadcasting instead of forming the diagonal matrix, and with a backslash solve
    # instead of an explicit inverse.
    eta_squared = eta .^ 2
    xt_eta = eta_squared .* xt
    theta = xt_eta * ((X * xt_eta + lambda_eye) \ y)

    # Ensure non-negativity if required.
    if nonnegative
      set_negative_to_zero!(theta)
    end

    # We have converged if the change is small enough => break.
    if norm(theta - eta) < deltaThreshold
      break
    end
  end

  # Coefficients with magnitude below epsilon are set to exactly zero.
  theta[abs.(theta) .< epsilon] .= 0.0
  theta
end
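
# A minimal usage sketch: the data, sizes and the names Xdemo/ydemo are illustrative choices
# for demonstration only. It fits random data with the AIC default lambda and with the BIC-
# and RIC-style lambdas mentioned above.
let N = 40, M = 20
  Xdemo = randn(N, M)
  ydemo = Xdemo[:, 1:3] * [3.0; -2.0; 1.5] + 0.1 * randn(N)  # only the first 3 features are active
  for (name, lam) in [("AIC", 2.0), ("BIC", log(N)), ("RIC", 2 * log(M))]
    th = l0EM(Xdemo, ydemo, lam)
    println("lambda ($name) = $(round(lam, digits = 3)): ", sum(th .!= 0.0), " coefficients selected")
  end
end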
function set_negative_to_zero!(v)
  v[v .< 0.0] .= 0.0
  v
end
# Given mild constraints on the collinearity of the features there is a maximum useful lambda
# value that we can calculate directly.
function find_max_lambda(X, y)
  lambda_from_col_j(j) = (sum(X[:,j] .* y)^2) / (4 * sum(X[:,j].^2))
  N, M = size(X)
  maximum(map(lambda_from_col_j, 1:M))
end
# Number of selected (i.e. non-zero) coefficients.
num_selected(v) = sum(v .!= 0.0)
# Instead of checking a fixed number of lambda values in a range from minLambda
# to max_lambda we do a binary search to identify lambda values where there
# is a transition in the number of selected variables.
#
# cs, ns = adaptive_lambda_regularized_regression(X, y, regressor)
#
# Outputs:
# 'cs' is a dict mapping the tried lambda values to the regressed coefficients for
# each lambda.
#
# 'ns' is a dict mapping an integer number of selected variables to the lambda value
# for which that number of selected variables was first encountered.
#
# Inputs:
# 'X' is an N by M matrix corresponding to N cases each with M measured features.
#
# 'y' is an N by 1 array corresponding to the dependent variable for the N cases.
#
# 'regressor' is a Julia function to call to perform regularized regression.
# It needs to have an interface like regressor(X, y, lambda; kws...).
# Defaults to using the L0 EM algorithm for regularized regression.
#
# Optional keyword arguments:
# 'minLambda' is the minimum lambda value to use. Defaults to 1e-7; the regression might not
# converge at all if a much smaller value is used.
#
# 'numLambdas' is an approximate budget for the number of calls to the regressor.
# The actual number of calls can deviate somewhat, but not by much.
#
# 'stepDivisor' determines how small lambda increments are considered when searching in
# between the standard log-spaced values of LASSO and similar algorithms. Defaults to
# 2^12 which tends to give a sensible number of alternative models to choose from.
# Increase it to find a larger number of distinct model sizes (returned in 'ns').
#
# 'logSpace' is true iff the search should be done in log space, i.e. mid points in the
# binary search are selected on a logarithmic scale.
#
# 'includeAICetal' is true iff we ensure AIC, BIC and RIC are included among the lambdas
# to be tested. The stepDivisor should typically be set lower (e.g. 2^6) if this is set to true.
# Defaults to false.
#
# 'kws' are any other keyword arguments that will be passed on to the regressor.
function adaptive_lambda_regularized_regression(X, y, regressor = l0EM;
  minLambda = 1e-7, numLambdas = 100,
  stepDivisor = 2^12,
  logSpace = true,
  includeAICetal = false,
  kws...)

  N, M = size(X)   # needed below for the BIC (log(N)) and RIC (2*log(M)) lambda values
  max_lambda = find_max_lambda(X, y)
  log_increment = (log(max_lambda + 1) - log(minLambda + 1)) / (numLambdas - 1)

  num_vars_to_lambda = Dict{Int64, Float64}()
  coeffs = Dict{Float64, Matrix{Float64}}()
  num_calls_to_regularized_regressor = 0
  # Run the regressor for lambda l unless its result is already cached, and record the first
  # lambda at which each model size (number of selected variables) appears.
  update_if_not_there!(l) = begin
    if haskey(coeffs, l)
      cs = coeffs[l]
    else
      coeffs[l] = cs = regressor(X, y, l; kws...)
      num_calls_to_regularized_regressor += 1
    end
    nvars = num_selected(cs)
    if !haskey(num_vars_to_lambda, nvars)
      num_vars_to_lambda[nvars] = l
    end
    return (cs, nvars)
  end

  midpoint(minL, maxL) = begin
    if logSpace
      log_minL = log(minL)
      exp( log_minL + (log(maxL) - log_minL) / 2 )
    else
      minL + (maxL - minL) / 2
    end
  end
  # Binary search of the lambda values from minLambda to maxLambda, recursing only into
  # sub-intervals whose endpoints differ by more than one selected variable.
  binary_index_search(minLambda, maxLambda, minDelta) = begin
    #println("bis($minLambda, $maxLambda, $minDelta)")
    if num_calls_to_regularized_regressor <= numLambdas && (maxLambda - minLambda > minDelta)
      csmin, nvarmin = update_if_not_there!(minLambda)
      #println(" nvarmin = $nvarmin ($minLambda)")
      csmax, nvarmax = update_if_not_there!(maxLambda)
      #println(" nvarmax = $nvarmax ($maxLambda)")
      if (nvarmin - nvarmax > 1)
        mid = midpoint(minLambda, maxLambda)
        csmid, nvarmid = update_if_not_there!(mid)
        #println(" nvarmid = $nvarmid ($mid)")
        if nvarmin - nvarmid > 1
          binary_index_search(minLambda, mid, minDelta)
        end
        if nvarmid - nvarmax > 1
          binary_index_search(mid, maxLambda, minDelta)
        end
      end
    end
  end
  if includeAICetal
    # Always include the AIC, BIC and RIC lambda values and search around them first so they
    # are guaranteed to be among the tested lambdas.
    a, b = 2.0 < log(N) ? (2.0, log(N)) : (log(N), 2.0)
    binary_index_search(a, b, log_increment/stepDivisor)
    b, c = b < (2*log(M)) ? (b, 2*log(M)) : (2*log(M), b)
    binary_index_search(b, c, log_increment/stepDivisor)
    # And then from minLambda up to the smallest of them and from the largest up to the max lambda.
    binary_index_search(minLambda, a, log_increment/stepDivisor)
    binary_index_search(c, max_lambda, log_increment/stepDivisor)
  else
    binary_index_search(minLambda, max_lambda, log_increment/stepDivisor)
  end

  return (coeffs, num_vars_to_lambda)
end
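
# A minimal usage sketch of the adaptive search on made-up data (names, sizes and the sparse
# coefficient pattern are arbitrary): list each model size encountered together with the
# lambda at which that size was first seen.
let Xd = randn(60, 30),
    yd = Xd * reshape([collect(1.0:3); zeros(27)], 30, 1) + 0.1 * randn(60, 1)
  cs_demo, ns_demo = adaptive_lambda_regularized_regression(Xd, yd; stepDivisor = 2^4)
  for nvars in sort(collect(keys(ns_demo)))
    println("$nvars variables first selected at lambda = $(ns_demo[nvars])")
  end
end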
mse(theta, X, y) = mean( (X * theta - y).^2 )
# Basic test: recover sparse coefficient arrays of varying sparsity from noisy data.
function basic_l0EM_test(N, M, NumReps = 30)
  for rep in 1:NumReps
    k = rand(1:ceil(Int, 2*log(M)))
    X = randn(N, M)
    # k non-zero coefficients 1.0, 2.0, ..., k followed by M-k zeros, as an M by 1 array.
    actual_theta = reshape([collect(1.0:k); zeros(M-k)], M, 1)
    y = X * actual_theta + 0.10 * randn(N, 1)
    cs, ns = adaptive_lambda_regularized_regression(X, y, l0EM;
      stepDivisor = 2^4, includeAICetal = true)

    # Find the lambda with the lowest MSE on a separate test set.
    Xtest = randn(N, M)
    ytest = Xtest * actual_theta + 0.10 * randn(N, 1)
    best_mse = Inf
    local best_lambda, best_nvars
    for lambda in values(ns)
      newmse = mse(cs[lambda], Xtest, ytest)
      if newmse < best_mse
        best_nvars = num_selected(cs[lambda])
        best_lambda = lambda
        best_mse = newmse
      end
    end
    println("k = $k, Lambda = $best_lambda, num vars = $best_nvars, MSE = $best_mse")
  end
end
basic_l0EM_test(50, 100, 10)
# Using log space binary search seems to speed the search up about 10-30% depending on problem size.
# Potentially more for big data sets and with cross validation. The timing comparisons below
# assume X and y have been defined, e.g. as in basic_l0EM_test above.
#@time cs2, ns2 = adaptive_lambda_regularized_regression(X, y, l0EM; stepDivisor = 2^2, includeAICetal = true);
#@time cs4, ns4 = adaptive_lambda_regularized_regression(X, y, l0EM; stepDivisor = 2^4, includeAICetal = true);
#@time cs6, ns6 = adaptive_lambda_regularized_regression(X, y, l0EM; stepDivisor = 2^6, includeAICetal = true);
#@time cs4b, ns4b = adaptive_lambda_regularized_regression(X, y, l0EM; stepDivisor = 2^4, includeAICetal = false);
#@time csf, nsf = adaptive_lambda_regularized_regression(X, y, l0EM; logSpace = false);
#sort(collect(keys(ns)))
#length(cs)
# If not using the adaptive (binary) search, the standard method is to try 100 lambda values
# on a log scale:
#function log_split_lambdas(X, y, numLambdas = 10)
#  exp.(range(0.0, log(find_max_lambda(X, y) + 1), length = numLambdas)) .- 1.0 .+ 1e-6
#end
#ts100 = map((l) -> l0EM(X, y, l), log_split_lambdas(X, y, 100))
#num_vars = map(num_selected, ts100)