From 9b90d18376d3b962efca01563190d18f75141256 Mon Sep 17 00:00:00 2001
From: Simon Kornblith
Date: Sun, 23 Feb 2014 19:47:22 -0500
Subject: [PATCH] Allow fitting GLMs on matrices and refitting GLMs on new
 responses

This adds a method for fitting a GLM by explicitly specifying the design
matrix and response vector. The resulting GlmMod object leaves its
ModelFrame and Formula fields undefined, and I've changed the few
functions that reference these fields to first check whether they are
defined. Eventually it is probably a good idea to follow @lindahua's
suggestion from JuliaStats/Roadmap.jl#11 and split out the functionality
that depends on DataFrames into a separate package, but most of these
changes will be necessary for that as well.

I have also added a method for refitting a GLM on a new response vector
using the same design matrix.
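
As a rough usage sketch (not part of the patch itself; the data below
are made up, and only the methods added here plus the existing
coef/deviance accessors are assumed):

    using GLM, Distributions

    X = hcat(ones(100), randn(100, 2))  # design matrix with an explicit intercept column
    y = float(rand(0:1, 100))           # binary response

    # fit a logistic regression directly from the matrix and response
    m = glm(X, y, Binomial())
    coef(m)
    deviance(m)

    # refit the same design matrix on a new response vector
    y2 = float(rand(0:1, 100))
    fit(m, y2)

The refit path reuses the existing design matrix and predictor object;
only the response (and optionally the weights and offset) is replaced
before the model is fit again.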

Closes #54
---
 src/glmfit.jl  | 45 ++++++++++++++++++++++++++++++++++++++-------
 src/linpred.jl | 32 ++++++++++++++++----------------
 src/lm.jl      |  4 ++--
 test/glmFit.jl | 12 ++++++++++++
 4 files changed, 68 insertions(+), 25 deletions(-)

diff --git a/src/glmfit.jl b/src/glmfit.jl
index 236ef5ba..58cc4321 100644
--- a/src/glmfit.jl
+++ b/src/glmfit.jl
@@ -66,11 +66,16 @@ function wrkwt(r::GlmResp)
 end
 
 type GlmMod <: LinPredModel
-    fr::ModelFrame
     rr::GlmResp
     pp::LinPred
-    ff::Formula
     fit::Bool
+
+    # optional
+    fr::ModelFrame
+    ff::Formula
+
+    GlmMod(rr::GlmResp, pp::LinPred, fit::Bool, fr::ModelFrame, ff::Formula) = new(rr, pp, fit, fr, ff)
+    GlmMod(rr::GlmResp, pp::LinPred, fit::Bool) = new(rr, pp, fit)
 end
 
 function coeftable(mm::GlmMod)
@@ -79,7 +84,7 @@ function coeftable(mm::GlmMod)
     zz = cc ./ se
     CoefTable(hcat(cc,se,zz,2.0 * ccdf(Normal(), abs(zz))),
               ["Estimate","Std.Error","z value", "Pr(>|z|)"],
-              coefnames(mm.fr), 4)
+              isdefined(mm, :fr) ? coefnames(mm.fr) : ["x$i" for i = 1:size(mm.pp.X, 2)], 4)
 end
 
 function confint(obj::GlmMod, level::Real)
@@ -95,7 +100,7 @@ function fit(m::GlmMod; verbose::Bool=false, maxIter::Integer=30, minStepFac::Re
     0 < minStepFac < 1 || error("minStepFac must be in (0, 1)")
 
     cvg = false; p = m.pp; r = m.rr
-    scratch = similar(p.X.m)
+    scratch = similar(p.X)
     devold = updatemu!(r, linpred(delbeta!(p, wrkresp(r), wrkwt(r), scratch)))
     installbeta!(p)
     for i=1:maxIter
@@ -116,7 +121,34 @@ function fit(m::GlmMod; verbose::Bool=false, maxIter::Integer=30, minStepFac::Re
     m
 end
 
-function glm(f::Formula, df::AbstractDataFrame, d::UnivariateDistribution, l::Link; dofit::Bool=true, wts=Float64[], offset=Float64[])
+function fit(m::GlmMod, y; wts=nothing, offset=nothing, args...)
+    r = m.rr
+    V = typeof(r.y)
+    r.y = isa(y, V) ? copy(y) : convert(V, y)
+    wts == nothing || (r.wts = isa(wts, V) ? copy(wts) : convert(V, wts))
+    offset == nothing || (r.offset = isa(offset, V) ? copy(offset) : convert(V, offset))
+    m.fit = false
+    fit(m; args...)
+end
+
+function glm{T<:FloatingPoint,V<:FPStoredVector}(X::Matrix{T}, y::V, d::UnivariateDistribution,
+                                                 l::Link=canonicallink(d);
+                                                 dofit::Bool=true,
+                                                 wts::V=fill!(similar(y), one(eltype(y))),
+                                                 offset::V=similar(y, 0))
+    size(X, 1) == size(y, 1) || throw(DimensionMismatch("number of rows in X and y must match"))
+    n = length(y)
+    length(wts) == n || error("length(wts) = $(length(wts)) should be $n")
+    mu = mustart(d, y, wts)
+    rr = GlmResp{typeof(y)}(y, d, l, linkfun!(l, similar(mu), mu), mu, offset, wts)
+    res = GlmMod(rr, DensePredChol(X), false)
+    dofit ? fit(res) : res
+end
+
+glm(X::Matrix, y::AbstractVector, args...; kwargs...) = glm(float(X), float(y), args...; kwargs...)
+
+function glm(f::Formula, df::AbstractDataFrame, d::UnivariateDistribution, l::Link=canonicallink(d);
+             dofit::Bool=true, wts=Float64[], offset=Float64[])
     mf = ModelFrame(f, df)
     mm = ModelMatrix(mf)
     y = model_response(mf); T = eltype(y);
@@ -130,13 +162,12 @@ function glm(f::Formula, df::AbstractDataFrame, d::UnivariateDistribution, l::Li
     mu = mustart(d, y, w)
     off = T <: Float64 ? copy(offset) : convert(Vector{T}, offset)
     rr = GlmResp{typeof(y)}(y, d, l, linkfun!(l, similar(mu), mu), mu, off, w)
-    res = GlmMod(mf, rr, DensePredChol(mm), f, false)
+    res = GlmMod(rr, DensePredChol(mm.m), false, mf, f)
     dofit ? fit(res) : res
 end
 
 glm(e::Expr, df::AbstractDataFrame, d::UnivariateDistribution, l::Link) = glm(Formula(e),df,d,l)
 glm(e::Expr, df::AbstractDataFrame, d::UnivariateDistribution) = glm(Formula(e),df,d,canonicallink(d))
-glm(f::Formula, df::AbstractDataFrame, d::UnivariateDistribution) = glm(f, df, d, canonicallink(d))
 glm(s::String, df::AbstractDataFrame, d::UnivariateDistribution) = glm(Formula(parse(s)[1]), df, d)
 
 ## scale(m) -> estimate, s, of the scale parameter
diff --git a/src/linpred.jl b/src/linpred.jl
index bdb3f7fb..e15ae00b 100644
--- a/src/linpred.jl
+++ b/src/linpred.jl
@@ -1,7 +1,7 @@
 
 ## Return the linear predictor vector
 
-linpred(p::LinPred, f::Real=1.) = p.X.m * (f == 0. ? p.beta0 : fma(p.beta0, p.delbeta, f))
+linpred(p::LinPred, f::Real=1.) = p.X * (f == 0. ? p.beta0 : fma(p.beta0, p.delbeta, f))
 
 ## Install beta0 + f*delbeta as beta0 and zero out delbeta
 function installbeta!(p::LinPred, f::Real=1.)
@@ -13,45 +13,45 @@
 end
 
 typealias BlasReal Union(Float32,Float64)
 
 type DensePredQR{T<:BlasReal} <: DensePred
-    X::ModelMatrix{T}       # model matrix
+    X::Matrix{T}            # model matrix
     beta0::Vector{T}        # base coefficient vector
     delbeta::Vector{T}      # coefficient increment
     qr::QRCompactWY{T}
-    function DensePredQR(X::ModelMatrix{T}, beta0::Vector{T})
+    function DensePredQR(X::Matrix{T}, beta0::Vector{T})
         n,p = size(X); length(beta0) == p || error("dimension mismatch")
-        new(X, beta0, zeros(T,p), qrfact(X.m))
+        new(X, beta0, zeros(T,p), qrfact(X))
     end
 end
-DensePredQR{T<:BlasReal}(X::ModelMatrix{T}) = DensePredQR{T}(X, zeros(T,size(X,2)))
+DensePredQR{T<:BlasReal}(X::Matrix{T}) = DensePredQR{T}(X, zeros(T,size(X,2)))
 cholfact{T<:FP}(p::DensePredQR{T}) = Cholesky{T}(p.qr[:R],'U')
 delbeta!{T<:BlasReal}(p::DensePredQR{T}, r::Vector{T}) = (p.delbeta = p.qr\r; p)
 
 type DensePredChol{T<:BlasReal} <: DensePred
-    X::ModelMatrix{T}       # model matrix
-    beta0::Vector{T}        # base vector for coefficients
-    delbeta::Vector{T}      # coefficient increment
+    X::Matrix{T}            # model matrix
+    beta0::Vector{T}        # base vector for coefficients
+    delbeta::Vector{T}      # coefficient increment
     chol::Cholesky{T}
-    function DensePredChol(X::ModelMatrix{T}, beta0::Vector{T})
+    function DensePredChol(X::Matrix{T}, beta0::Vector{T})
         n,p = size(X); length(beta0) == p || error("dimension mismatch")
-        new(X, beta0, zeros(T,p), cholfact(X.m'X.m))
+        new(X, beta0, zeros(T,p), cholfact(X'X))
     end
 end
-DensePredChol{T<:BlasReal}(X::ModelMatrix{T}) = DensePredChol{T}(X, zeros(T,size(X,2)))
+DensePredChol{T<:BlasReal}(X::Matrix{T}) = DensePredChol{T}(X, zeros(T,size(X,2)))
 
 solve!{T<:BlasReal}(C::Cholesky{T}, B::StridedVecOrMat{T}) = potrs!(C.uplo, C.UL, B)
 
 cholfact{T<:FP}(p::DensePredChol{T}) = (c = p.chol; Cholesky{T}(copy(c.UL),c.uplo))
 
 function delbeta!{T<:BlasReal}(p::DensePredChol{T}, r::Vector{T})
-    solve!(p.chol, gemv!('T', 1.0, p.X.m, r, 0.0, p.delbeta))
+    solve!(p.chol, gemv!('T', 1.0, p.X, r, 0.0, p.delbeta))
     p
 end
 
 function delbeta!{T<:BlasReal}(p::DensePredChol{T}, r::Vector{T}, wt::Vector{T}, scr::Matrix{T})
-    vbroadcast!(Multiply(), scr, p.X.m, wt, 1)
-    fac, info = potrf!('U', gemm!('T', 'N', 1.0, scr, p.X.m, 0.0, p.chol.UL))
+    vbroadcast!(Multiply(), scr, p.X, wt, 1)
+    fac, info = potrf!('U', gemm!('T', 'N', 1.0, scr, p.X, 0.0, p.chol.UL))
     info == 0 || error("Singularity detected at column $info of weighted model matrix")
     solve!(p.chol, gemv!('T', 1.0, scr, r, 0.0, p.delbeta))
     p
@@ -73,8 +73,8 @@ cor(x::LinPredModel) = (invstd = map(RcpFun(),stderr(x)); scale!(invstd,scale!(v
 stderr(x::LinPredModel) = sqrt(diag(vcov(x)))
 
 function show(io::IO, obj::LinPredModel)
-    println(io, "$(obj.ff)\n\nCoefficients:")
-    print(io, coeftable(obj))
+    println(io, isdefined(obj, :ff) ? obj.ff : "Explicitly specified design matrix",
+            "\n\nCoefficients:", coeftable(obj))
 end
 
 ## function show(io::IO, obj::GlmMod)
diff --git a/src/lm.jl b/src/lm.jl
index 86082ec5..fb7e0158 100644
--- a/src/lm.jl
+++ b/src/lm.jl
@@ -37,7 +37,7 @@ cholfact(x::LmMod) = cholfact(x.pp)
 
 function lm(f::Formula, df::AbstractDataFrame)
     mf = ModelFrame(f, df); mm = ModelMatrix(mf)
-    rr = LmResp(model_response(mf)); pp = DensePredQR(mm)
+    rr = LmResp(model_response(mf)); pp = DensePredQR(mm.m)
     installbeta!(delbeta!(pp, rr.y)); updatemu!(rr, linpred(pp,0.))
     LmMod(mf, rr, pp, f)
 end
@@ -46,7 +46,7 @@ lm(f::String, df::AbstractDataFrame) = lm(Formula(parse(f)[1]), df)
 
 function lmc(f::Formula, df::AbstractDataFrame)
     mf = ModelFrame(f, df); mm = ModelMatrix(mf)
-    rr = LmResp(model_response(mf)); pp = DensePredChol(mm)
+    rr = LmResp(model_response(mf)); pp = DensePredChol(mm.m)
     installbeta!(delbeta!(pp, rr.y)); updatemu!(rr, linpred(pp,0.))
     LmMod(mf, rr, pp, f)
 end
diff --git a/test/glmFit.jl b/test/glmFit.jl
index e2967262..a262864f 100644
--- a/test/glmFit.jl
+++ b/test/glmFit.jl
@@ -29,3 +29,15 @@ gm4 = glm(admit ~ gre + gpa + rank, df, Binomial(), CauchitLink())
 
 gm5 = glm(admit ~ gre + gpa + rank, df, Binomial(), CloglogLink())
 @test_approx_eq deviance(gm5) 458.89439629612616
+mf = ModelFrame(admit ~ gre + gpa + rank, df)
+X = ModelMatrix(mf).m
+gm6 = glm(X, model_response(mf), Binomial())
+@test_approx_eq deviance(gm6) 458.5174924758994
+@test_approx_eq coef(gm6) [-3.9899786606380734,0.0022644256521549043,0.8040374535155766,-0.6754428594116577,-1.3402038117481079,-1.5514636444657492]
+
+y = rand(0:1, size(X, 1))
+fit(gm6, y)
+
+gm7 = glm(X, y, Binomial())
+@test_approx_eq_eps deviance(gm6) deviance(gm7) 1e-6
+@test_approx_eq_eps coef(gm6) coef(gm7) 1e-6