Skip to content

Commit

Permalink
Allow fitting GLMs on matrices and refitting GLMs on new responses
Browse files Browse the repository at this point in the history
This adds a method for fitting a GLM by explicitly specifying the
design matrix and response vectors. The resulting GlmMod object has
empty ModelFrame and formula fields, and I've changed the few
functions that reference these fields to first check if they are
defined.

Eventually it is probably a good idea to follow @lindahua's suggestion
from JuliaStats/Roadmap.jl#11 and split out functionality that depends
on DataFrames into a separate package, but most of these changes will
be necessary for that as well.

I have also added a method for fitting a GLM on a new response vector
using the same design matrix.

Closes #54
  • Loading branch information
simonster committed Feb 24, 2014
1 parent fefe01b commit 9b90d18
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 25 deletions.
45 changes: 38 additions & 7 deletions src/glmfit.jl
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,16 @@ function wrkwt(r::GlmResp)
end

type GlmMod <: LinPredModel
fr::ModelFrame
rr::GlmResp
pp::LinPred
ff::Formula
fit::Bool

# optional
fr::ModelFrame
ff::Formula

GlmMod(rr::GlmResp, pp::LinPred, fit::Bool, fr::ModelFrame, ff::Formula) = new(rr, pp, fit, fr, ff)
GlmMod(rr::GlmResp, pp::LinPred, fit::Bool) = new(rr, pp, fit)
end

function coeftable(mm::GlmMod)
Expand All @@ -79,7 +84,7 @@ function coeftable(mm::GlmMod)
zz = cc ./ se
CoefTable(hcat(cc,se,zz,2.0 * ccdf(Normal(), abs(zz))),
["Estimate","Std.Error","z value", "Pr(>|z|)"],
coefnames(mm.fr), 4)
isdefined(mm, :fr) ? coefnames(mm.fr) : ["x$i" for i = 1:size(mm.pp.X, 2)], 4)
end

function confint(obj::GlmMod, level::Real)
Expand All @@ -95,7 +100,7 @@ function fit(m::GlmMod; verbose::Bool=false, maxIter::Integer=30, minStepFac::Re
0 < minStepFac < 1 || error("minStepFac must be in (0, 1)")

cvg = false; p = m.pp; r = m.rr
scratch = similar(p.X.m)
scratch = similar(p.X)
devold = updatemu!(r, linpred(delbeta!(p, wrkresp(r), wrkwt(r), scratch)))
installbeta!(p)
for i=1:maxIter
Expand All @@ -116,7 +121,34 @@ function fit(m::GlmMod; verbose::Bool=false, maxIter::Integer=30, minStepFac::Re
m
end

function glm(f::Formula, df::AbstractDataFrame, d::UnivariateDistribution, l::Link; dofit::Bool=true, wts=Float64[], offset=Float64[])
function fit(m::GlmMod, y; wts=nothing, offset=nothing, args...)
r = m.rr
V = typeof(r.y)
r.y = isa(y, V) ? copy(y) : convert(V, y)
wts == nothing || (r.wts = isa(wts, V) ? copy(wts) : convert(V, wts))
offset == nothing || (r.offset = isa(offset, V) ? copy(offset) : convert(V, offset))
m.fit = false
fit(m; args...)
end

function glm{T<:FloatingPoint,V<:FPStoredVector}(X::Matrix{T}, y::V, d::UnivariateDistribution,
l::Link=canonicallink(d);
dofit::Bool=true,
wts::V=fill!(similar(y), one(eltype(y))),
offset::V=similar(y, 0))
size(X, 1) == size(y, 1) || DimensionMismatch("number of rows in X and y must match")
n = length(y)
length(wts) == n || error("length(wts) = $lw should be 0 or $n")
mu = mustart(d, y, wts)
rr = GlmResp{typeof(y)}(y, d, l, linkfun!(l, similar(mu), mu), mu, offset, wts)
res = GlmMod(rr, DensePredChol(X), false)
dofit ? fit(res) : res
end

glm(X::Matrix, y::AbstractVector, args...; kwargs...) = glm(float(X), float(y), args...; kwargs...)

function glm(f::Formula, df::AbstractDataFrame, d::UnivariateDistribution, l::Link=canonicallink(d);
dofit::Bool=true, wts=Float64[], offset=Float64[])
mf = ModelFrame(f, df)
mm = ModelMatrix(mf)
y = model_response(mf); T = eltype(y);
Expand All @@ -130,13 +162,12 @@ function glm(f::Formula, df::AbstractDataFrame, d::UnivariateDistribution, l::Li
mu = mustart(d, y, w)
off = T <: Float64 ? copy(offset) : convert(Vector{T}, offset)
rr = GlmResp{typeof(y)}(y, d, l, linkfun!(l, similar(mu), mu), mu, off, w)
res = GlmMod(mf, rr, DensePredChol(mm), f, false)
res = GlmMod(rr, DensePredChol(mm.m), false, mf, f)
dofit ? fit(res) : res
end

glm(e::Expr, df::AbstractDataFrame, d::UnivariateDistribution, l::Link) = glm(Formula(e),df,d,l)
glm(e::Expr, df::AbstractDataFrame, d::UnivariateDistribution) = glm(Formula(e),df,d,canonicallink(d))
glm(f::Formula, df::AbstractDataFrame, d::UnivariateDistribution) = glm(f, df, d, canonicallink(d))
glm(s::String, df::AbstractDataFrame, d::UnivariateDistribution) = glm(Formula(parse(s)[1]), df, d)

## scale(m) -> estimate, s, of the scale parameter
Expand Down
32 changes: 16 additions & 16 deletions src/linpred.jl
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@


## Return the linear predictor vector
linpred(p::LinPred, f::Real=1.) = p.X.m * (f == 0. ? p.beta0 : fma(p.beta0, p.delbeta, f))
linpred(p::LinPred, f::Real=1.) = p.X * (f == 0. ? p.beta0 : fma(p.beta0, p.delbeta, f))

## Install beta0 + f*delbeta as beta0 and zero out delbeta
function installbeta!(p::LinPred, f::Real=1.)
Expand All @@ -13,45 +13,45 @@ end
typealias BlasReal Union(Float32,Float64)

type DensePredQR{T<:BlasReal} <: DensePred
X::ModelMatrix{T} # model matrix
X::Matrix{T} # model matrix
beta0::Vector{T} # base coefficient vector
delbeta::Vector{T} # coefficient increment
qr::QRCompactWY{T}
function DensePredQR(X::ModelMatrix{T}, beta0::Vector{T})
function DensePredQR(X::Matrix{T}, beta0::Vector{T})
n,p = size(X); length(beta0) == p || error("dimension mismatch")
new(X, beta0, zeros(T,p), qrfact(X.m))
new(X, beta0, zeros(T,p), qrfact(X))
end
end
DensePredQR{T<:BlasReal}(X::ModelMatrix{T}) = DensePredQR{T}(X, zeros(T,size(X,2)))
DensePredQR{T<:BlasReal}(X::Matrix{T}) = DensePredQR{T}(X, zeros(T,size(X,2)))

cholfact{T<:FP}(p::DensePredQR{T}) = Cholesky{T}(p.qr[:R],'U')

delbeta!{T<:BlasReal}(p::DensePredQR{T}, r::Vector{T}) = (p.delbeta = p.qr\r; p)

type DensePredChol{T<:BlasReal} <: DensePred
X::ModelMatrix{T} # model matrix
beta0::Vector{T} # base vector for coefficients
delbeta::Vector{T} # coefficient increment
X::Matrix{T} # model matrix
beta0::Vector{T} # base vector for coefficients
delbeta::Vector{T} # coefficient increment
chol::Cholesky{T}
function DensePredChol(X::ModelMatrix{T}, beta0::Vector{T})
function DensePredChol(X::Matrix{T}, beta0::Vector{T})
n,p = size(X); length(beta0) == p || error("dimension mismatch")
new(X, beta0, zeros(T,p), cholfact(X.m'X.m))
new(X, beta0, zeros(T,p), cholfact(X'X))
end
end
DensePredChol{T<:BlasReal}(X::ModelMatrix{T}) = DensePredChol{T}(X, zeros(T,size(X,2)))
DensePredChol{T<:BlasReal}(X::Matrix{T}) = DensePredChol{T}(X, zeros(T,size(X,2)))

solve!{T<:BlasReal}(C::Cholesky{T}, B::StridedVecOrMat{T}) = potrs!(C.uplo, C.UL, B)

cholfact{T<:FP}(p::DensePredChol{T}) = (c = p.chol; Cholesky{T}(copy(c.UL),c.uplo))

function delbeta!{T<:BlasReal}(p::DensePredChol{T}, r::Vector{T})
solve!(p.chol, gemv!('T', 1.0, p.X.m, r, 0.0, p.delbeta))
solve!(p.chol, gemv!('T', 1.0, p.X, r, 0.0, p.delbeta))
p
end

function delbeta!{T<:BlasReal}(p::DensePredChol{T}, r::Vector{T}, wt::Vector{T}, scr::Matrix{T})
vbroadcast!(Multiply(), scr, p.X.m, wt, 1)
fac, info = potrf!('U', gemm!('T', 'N', 1.0, scr, p.X.m, 0.0, p.chol.UL))
vbroadcast!(Multiply(), scr, p.X, wt, 1)
fac, info = potrf!('U', gemm!('T', 'N', 1.0, scr, p.X, 0.0, p.chol.UL))
info == 0 || error("Singularity detected at column $info of weighted model matrix")
solve!(p.chol, gemv!('T', 1.0, scr, r, 0.0, p.delbeta))
p
Expand All @@ -73,8 +73,8 @@ cor(x::LinPredModel) = (invstd = map(RcpFun(),stderr(x)); scale!(invstd,scale!(v
stderr(x::LinPredModel) = sqrt(diag(vcov(x)))

function show(io::IO, obj::LinPredModel)
println(io, "$(obj.ff)\n\nCoefficients:")
print(io, coeftable(obj))
println(io, isdefined(obj, :ff) ? obj.ff : "Explicitly specified design matrix",
"\n\nCoefficients:", coeftable(obj))
end

## function show(io::IO, obj::GlmMod)
Expand Down
4 changes: 2 additions & 2 deletions src/lm.jl
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ cholfact(x::LmMod) = cholfact(x.pp)

function lm(f::Formula, df::AbstractDataFrame)
mf = ModelFrame(f, df); mm = ModelMatrix(mf)
rr = LmResp(model_response(mf)); pp = DensePredQR(mm)
rr = LmResp(model_response(mf)); pp = DensePredQR(mm.m)
installbeta!(delbeta!(pp, rr.y)); updatemu!(rr, linpred(pp,0.))
LmMod(mf, rr, pp, f)
end
Expand All @@ -46,7 +46,7 @@ lm(f::String, df::AbstractDataFrame) = lm(Formula(parse(f)[1]), df)

function lmc(f::Formula, df::AbstractDataFrame)
mf = ModelFrame(f, df); mm = ModelMatrix(mf)
rr = LmResp(model_response(mf)); pp = DensePredChol(mm)
rr = LmResp(model_response(mf)); pp = DensePredChol(mm.m)
installbeta!(delbeta!(pp, rr.y)); updatemu!(rr, linpred(pp,0.))
LmMod(mf, rr, pp, f)
end
Expand Down
12 changes: 12 additions & 0 deletions test/glmFit.jl
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,15 @@ gm4 = glm(admit ~ gre + gpa + rank, df, Binomial(), CauchitLink())
gm5 = glm(admit ~ gre + gpa + rank, df, Binomial(), CloglogLink())
@test_approx_eq deviance(gm5) 458.89439629612616

mf = ModelFrame(admit ~ gre + gpa + rank, df)
X = ModelMatrix(mf).m
gm6 = glm(X, model_response(mf), Binomial())
@test_approx_eq deviance(gm2) 458.5174924758994
@test_approx_eq coef(gm2) [-3.9899786606380734,0.0022644256521549043,0.8040374535155766,-0.6754428594116577,-1.3402038117481079,-1.5514636444657492]

y = rand(0:1, size(X, 1))
fit(gm6, y)

gm7 = glm(X, y, Binomial())
@test_approx_eq_eps deviance(gm6) deviance(gm7) 1e-6
@test_approx_eq_eps coef(gm6) coef(gm7) 1e-6

0 comments on commit 9b90d18

Please sign in to comment.