From 9b90d18376d3b962efca01563190d18f75141256 Mon Sep 17 00:00:00 2001
From: Simon Kornblith
Date: Sun, 23 Feb 2014 19:47:22 -0500
Subject: [PATCH] Allow fitting GLMs on matrices and refitting GLMs on new
 responses

This adds a method for fitting a GLM by explicitly specifying the design
matrix and response vector. The resulting GlmMod object leaves its
ModelFrame and Formula fields undefined, and I've changed the few
functions that reference these fields to first check whether they are
defined. Eventually it is probably a good idea to follow @lindahua's
suggestion from JuliaStats/Roadmap.jl#11 and split out the functionality
that depends on DataFrames into a separate package, but most of these
changes will be necessary for that as well.

I have also added a method for refitting a GLM on a new response vector
using the same design matrix.
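
As a rough usage sketch (not part of the patch itself; the data below
are made up, and only the methods added here plus the existing
coef/deviance accessors are assumed):

    using GLM, Distributions

    X = hcat(ones(100), randn(100, 2))  # design matrix with an explicit intercept column
    y = float(rand(0:1, 100))           # binary response

    # fit a logistic regression directly from the matrix and response
    m = glm(X, y, Binomial())
    coef(m)
    deviance(m)

    # refit the same design matrix on a new response vector
    y2 = float(rand(0:1, 100))
    fit(m, y2)

The refit path reuses the existing design matrix and predictor object;
only the response (and optionally the weights and offset) is replaced
before the model is fit again.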

Closes #54
---
 src/glmfit.jl  | 45 ++++++++++++++++++++++++++++++++++++++-------
 src/linpred.jl | 32 ++++++++++++++++----------------
 src/lm.jl      |  4 ++--
 test/glmFit.jl | 12 ++++++++++++
 4 files changed, 68 insertions(+), 25 deletions(-)

diff --git a/src/glmfit.jl b/src/glmfit.jl
index 236ef5ba..58cc4321 100644
--- a/src/glmfit.jl
+++ b/src/glmfit.jl
@@ -66,11 +66,16 @@ function wrkwt(r::GlmResp)
 end
 
 type GlmMod <: LinPredModel
-    fr::ModelFrame
     rr::GlmResp
     pp::LinPred
-    ff::Formula
     fit::Bool
+
+    # optional
+    fr::ModelFrame
+    ff::Formula
+
+    GlmMod(rr::GlmResp, pp::LinPred, fit::Bool, fr::ModelFrame, ff::Formula) = new(rr, pp, fit, fr, ff)
+    GlmMod(rr::GlmResp, pp::LinPred, fit::Bool) = new(rr, pp, fit)
 end
 
 function coeftable(mm::GlmMod)
@@ -79,7 +84,7 @@ function coeftable(mm::GlmMod)
     zz = cc ./ se
     CoefTable(hcat(cc,se,zz,2.0 * ccdf(Normal(), abs(zz))),
               ["Estimate","Std.Error","z value", "Pr(>|z|)"],
-              coefnames(mm.fr), 4)
+              isdefined(mm, :fr) ? coefnames(mm.fr) : ["x$i" for i = 1:size(mm.pp.X, 2)], 4)
 end
 
 function confint(obj::GlmMod, level::Real)
@@ -95,7 +100,7 @@ function fit(m::GlmMod; verbose::Bool=false, maxIter::Integer=30, minStepFac::Re
     0 < minStepFac < 1 || error("minStepFac must be in (0, 1)")
 
     cvg = false; p = m.pp; r = m.rr
-    scratch = similar(p.X.m)
+    scratch = similar(p.X)
     devold = updatemu!(r, linpred(delbeta!(p, wrkresp(r), wrkwt(r), scratch)))
     installbeta!(p)
     for i=1:maxIter
@@ -116,7 +121,34 @@ function fit(m::GlmMod; verbose::Bool=false, maxIter::Integer=30, minStepFac::Re
     m
 end
 
-function glm(f::Formula, df::AbstractDataFrame, d::UnivariateDistribution, l::Link; dofit::Bool=true, wts=Float64[], offset=Float64[])
+function fit(m::GlmMod, y; wts=nothing, offset=nothing, args...)
+    r = m.rr
+    V = typeof(r.y)
+    r.y = isa(y, V) ? copy(y) : convert(V, y)
+    wts == nothing || (r.wts = isa(wts, V) ? copy(wts) : convert(V, wts))
+    offset == nothing || (r.offset = isa(offset, V) ? copy(offset) : convert(V, offset))
+    m.fit = false
+    fit(m; args...)
+end
+
+function glm{T<:FloatingPoint,V<:FPStoredVector}(X::Matrix{T}, y::V, d::UnivariateDistribution,
+                                                 l::Link=canonicallink(d);
+                                                 dofit::Bool=true,
+                                                 wts::V=fill!(similar(y), one(eltype(y))),
+                                                 offset::V=similar(y, 0))
+    size(X, 1) == size(y, 1) || throw(DimensionMismatch("number of rows in X and y must match"))
+    n = length(y)
+    length(wts) == n || error("length(wts) = $(length(wts)) should be $n")
+    mu = mustart(d, y, wts)
+    rr = GlmResp{typeof(y)}(y, d, l, linkfun!(l, similar(mu), mu), mu, offset, wts)
+    res = GlmMod(rr, DensePredChol(X), false)
+    dofit ? fit(res) : res
+end
+
+glm(X::Matrix, y::AbstractVector, args...; kwargs...) = glm(float(X), float(y), args...; kwargs...)
+
+function glm(f::Formula, df::AbstractDataFrame, d::UnivariateDistribution, l::Link=canonicallink(d);
+             dofit::Bool=true, wts=Float64[], offset=Float64[])
     mf = ModelFrame(f, df)
     mm = ModelMatrix(mf)
     y = model_response(mf); T = eltype(y);
@@ -130,13 +162,12 @@ function glm(f::Formula, df::AbstractDataFrame, d::UnivariateDistribution, l::Li
     mu = mustart(d, y, w)
     off = T <: Float64 ? copy(offset) : convert(Vector{T}, offset)
     rr = GlmResp{typeof(y)}(y, d, l, linkfun!(l, similar(mu), mu), mu, off, w)
-    res = GlmMod(mf, rr, DensePredChol(mm), f, false)
+    res = GlmMod(rr, DensePredChol(mm.m), false, mf, f)
     dofit ? fit(res) : res
 end
 
 glm(e::Expr, df::AbstractDataFrame, d::UnivariateDistribution, l::Link) = glm(Formula(e),df,d,l)
 glm(e::Expr, df::AbstractDataFrame, d::UnivariateDistribution) = glm(Formula(e),df,d,canonicallink(d))
-glm(f::Formula, df::AbstractDataFrame, d::UnivariateDistribution) = glm(f, df, d, canonicallink(d))
 glm(s::String, df::AbstractDataFrame, d::UnivariateDistribution) = glm(Formula(parse(s)[1]), df, d)
 
 ## scale(m) -> estimate, s, of the scale parameter
diff --git a/src/linpred.jl b/src/linpred.jl
index bdb3f7fb..e15ae00b 100644
--- a/src/linpred.jl
+++ b/src/linpred.jl
@@ -1,7 +1,7 @@
 
 ## Return the linear predictor vector
 
-linpred(p::LinPred, f::Real=1.) = p.X.m * (f == 0. ? p.beta0 : fma(p.beta0, p.delbeta, f))
+linpred(p::LinPred, f::Real=1.) = p.X * (f == 0. ? p.beta0 : fma(p.beta0, p.delbeta, f))
 
 ## Install beta0 + f*delbeta as beta0 and zero out delbeta
 function installbeta!(p::LinPred, f::Real=1.)
@@ -13,45 +13,45 @@
 end
 
 typealias BlasReal Union(Float32,Float64)
 
 type DensePredQR{T<:BlasReal} <: DensePred
-    X::ModelMatrix{T}       # model matrix
+    X::Matrix{T}            # model matrix
     beta0::Vector{T}        # base coefficient vector
     delbeta::Vector{T}      # coefficient increment
     qr::QRCompactWY{T}
-    function DensePredQR(X::ModelMatrix{T}, beta0::Vector{T})
+    function DensePredQR(X::Matrix{T}, beta0::Vector{T})
         n,p = size(X); length(beta0) == p || error("dimension mismatch")
-        new(X, beta0, zeros(T,p), qrfact(X.m))
+        new(X, beta0, zeros(T,p), qrfact(X))
     end
 end
-DensePredQR{T<:BlasReal}(X::ModelMatrix{T}) = DensePredQR{T}(X, zeros(T,size(X,2)))
+DensePredQR{T<:BlasReal}(X::Matrix{T}) = DensePredQR{T}(X, zeros(T,size(X,2)))
 cholfact{T<:FP}(p::DensePredQR{T}) = Cholesky{T}(p.qr[:R],'U')
 delbeta!{T<:BlasReal}(p::DensePredQR{T}, r::Vector{T}) = (p.delbeta = p.qr\r; p)
 
 type DensePredChol{T<:BlasReal} <: DensePred
-    X::ModelMatrix{T}       # model matrix
-    beta0::Vector{T}        # base vector for coefficients
-    delbeta::Vector{T}      # coefficient increment
+    X::Matrix{T}            # model matrix
+    beta0::Vector{T}        # base vector for coefficients
+    delbeta::Vector{T}      # coefficient increment
     chol::Cholesky{T}
-    function DensePredChol(X::ModelMatrix{T}, beta0::Vector{T})
+    function DensePredChol(X::Matrix{T}, beta0::Vector{T})
         n,p = size(X); length(beta0) == p || error("dimension mismatch")
-        new(X, beta0, zeros(T,p), cholfact(X.m'X.m))
+        new(X, beta0, zeros(T,p), cholfact(X'X))
     end
 end
-DensePredChol{T<:BlasReal}(X::ModelMatrix{T}) = DensePredChol{T}(X, zeros(T,size(X,2)))
+DensePredChol{T<:BlasReal}(X::Matrix{T}) = DensePredChol{T}(X, zeros(T,size(X,2)))
 
 solve!{T<:BlasReal}(C::Cholesky{T}, B::StridedVecOrMat{T}) = potrs!(C.uplo, C.UL, B)
 
 cholfact{T<:FP}(p::DensePredChol{T}) = (c = p.chol; Cholesky{T}(copy(c.UL),c.uplo))
 
 function delbeta!{T<:BlasReal}(p::DensePredChol{T}, r::Vector{T})
-    solve!(p.chol, gemv!('T', 1.0, p.X.m, r, 0.0, p.delbeta))
+    solve!(p.chol, gemv!('T', 1.0, p.X, r, 0.0, p.delbeta))
     p
 end
 
 function delbeta!{T<:BlasReal}(p::DensePredChol{T}, r::Vector{T}, wt::Vector{T}, scr::Matrix{T})
-    vbroadcast!(Multiply(), scr, p.X.m, wt, 1)
-    fac, info = potrf!('U', gemm!('T', 'N', 1.0, scr, p.X.m, 0.0, p.chol.UL))
+    vbroadcast!(Multiply(), scr, p.X, wt, 1)
+    fac, info = potrf!('U', gemm!('T', 'N', 1.0, scr, p.X, 0.0, p.chol.UL))
     info == 0 || error("Singularity detected at column $info of weighted model matrix")
     solve!(p.chol, gemv!('T', 1.0, scr, r, 0.0, p.delbeta))
     p
@@ -73,8 +73,8 @@ cor(x::LinPredModel) = (invstd = map(RcpFun(),stderr(x)); scale!(invstd,scale!(v
 stderr(x::LinPredModel) = sqrt(diag(vcov(x)))
 
 function show(io::IO, obj::LinPredModel)
-    println(io, "$(obj.ff)\n\nCoefficients:")
-    print(io, coeftable(obj))
+    println(io, isdefined(obj, :ff) ? obj.ff : "Explicitly specified design matrix",
+            "\n\nCoefficients:", coeftable(obj))
 end
 
 ## function show(io::IO, obj::GlmMod)
diff --git a/src/lm.jl b/src/lm.jl
index 86082ec5..fb7e0158 100644
--- a/src/lm.jl
+++ b/src/lm.jl
@@ -37,7 +37,7 @@ cholfact(x::LmMod) = cholfact(x.pp)
 
 function lm(f::Formula, df::AbstractDataFrame)
     mf = ModelFrame(f, df); mm = ModelMatrix(mf)
-    rr = LmResp(model_response(mf)); pp = DensePredQR(mm)
+    rr = LmResp(model_response(mf)); pp = DensePredQR(mm.m)
     installbeta!(delbeta!(pp, rr.y)); updatemu!(rr, linpred(pp,0.))
     LmMod(mf, rr, pp, f)
 end
@@ -46,7 +46,7 @@ lm(f::String, df::AbstractDataFrame) = lm(Formula(parse(f)[1]), df)
 
 function lmc(f::Formula, df::AbstractDataFrame)
     mf = ModelFrame(f, df); mm = ModelMatrix(mf)
-    rr = LmResp(model_response(mf)); pp = DensePredChol(mm)
+    rr = LmResp(model_response(mf)); pp = DensePredChol(mm.m)
     installbeta!(delbeta!(pp, rr.y)); updatemu!(rr, linpred(pp,0.))
     LmMod(mf, rr, pp, f)
 end
diff --git a/test/glmFit.jl b/test/glmFit.jl
index e2967262..a262864f 100644
--- a/test/glmFit.jl
+++ b/test/glmFit.jl
@@ -29,3 +29,15 @@ gm4 = glm(admit ~ gre + gpa + rank, df, Binomial(), CauchitLink())
 
 gm5 = glm(admit ~ gre + gpa + rank, df, Binomial(), CloglogLink())
 @test_approx_eq deviance(gm5) 458.89439629612616
+mf = ModelFrame(admit ~ gre + gpa + rank, df)
+X = ModelMatrix(mf).m
+gm6 = glm(X, model_response(mf), Binomial())
+@test_approx_eq deviance(gm6) 458.5174924758994
+@test_approx_eq coef(gm6) [-3.9899786606380734,0.0022644256521549043,0.8040374535155766,-0.6754428594116577,-1.3402038117481079,-1.5514636444657492]
+
+y = rand(0:1, size(X, 1))
+fit(gm6, y)
+
+gm7 = glm(X, y, Binomial())
+@test_approx_eq_eps deviance(gm6) deviance(gm7) 1e-6
+@test_approx_eq_eps coef(gm6) coef(gm7) 1e-6