Uncouple GLM from DataFrames
simonster committed Mar 30, 2014
2 parents 939bd42 + 29eb273 commit 8b31cbd
Showing 8 changed files with 86 additions and 91 deletions.
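In short, the commit replaces the DataFrames-specific entry points (`lm`, `lmc`, and the formula methods of `glm`) with `StatsBase.fit` methods that take either a formula plus a DataFrame or a plain design matrix and response vector. The formula path is no longer defined inside GLM itself; the Travis change below adds and checks out DataFrames, which presumably supplies that glue on its master branch. A minimal sketch of the two call styles after this commit, using the first rows of the README's Formaldehyde data (the explicit intercept column in the matrix form is needed because no ModelMatrix machinery is involved):

using GLM, DataFrames

carb   = [0.1, 0.3, 0.5, 0.6]
optden = [0.086, 0.269, 0.446, 0.538]
df = DataFrame(Carb = carb, OptDen = optden)

lm1 = fit(LmMod, OptDen ~ Carb, df)             # formula interface, now routed through fit
lm2 = fit(LmMod, hcat(ones(4), carb), optden)   # matrix interface, needs no DataFrames at all
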
2 changes: 1 addition & 1 deletion .travis.yml
@@ -9,6 +9,6 @@ before_install:
- sudo apt-get update -qq -y
- sudo apt-get install libpcre3-dev julia -y
script:
- julia -e 'Pkg.init(); run(`ln -s $(pwd()) $(Pkg.dir())/GLM`); Pkg.resolve()'
- julia -e 'Pkg.init(); run(`ln -s $(pwd()) $(Pkg.dir())/GLM`); Pkg.resolve(); Pkg.add("DataFrames"); Pkg.checkout("DataFrames")'
- julia -e 'using GLM; @assert isdefined(:GLM); @assert typeof(GLM) === Module'
- julia ./test/glmFit.jl
6 changes: 3 additions & 3 deletions README.md
@@ -51,7 +51,7 @@ julia> form = dataset("datasets","Formaldehyde")
| 5 | 0.7 | 0.626 |
| 6 | 0.9 | 0.782 |

julia> lm1 = lm(OptDen ~ Carb, form)
julia> lm1 = fit(LmMod, OptDen ~ Carb, form)
Formula: OptDen ~ Carb

Coefficients:
@@ -103,7 +103,7 @@ julia> LifeCycleSavings = dataset("datasets", "LifeCycleSavings")
| 49 | Libya | 8.89 | 43.69 | 2.07 | 123.58 | 16.71 |
| 50 | Malaysia | 4.71 | 47.2 | 0.66 | 242.69 | 5.08 |

julia> fm2 = lm(SR ~ Pop15 + Pop75 + DPI + DDPI, LifeCycleSavings)
julia> fm2 = fit(LmMod, SR ~ Pop15 + Pop75 + DPI + DDPI, LifeCycleSavings)
Formula: SR ~ :(+(Pop15,Pop75,DPI,DDPI))

Coefficients:
@@ -203,7 +203,7 @@ julia> dobson = DataFrame(Counts = [18.,17,15,20,10,20,25,13,12],
| 8 | 13.0 | 2 | 3 |
| 9 | 12.0 | 3 | 3 |

julia> gm1 = glm(Counts ~ Outcome + Treatment, dobson, Poisson())
julia> gm1 = fit(GlmMod, Counts ~ Outcome + Treatment, dobson, Poisson())
Formula: Counts ~ :(+(Outcome,Treatment))

Coefficients:
1 change: 0 additions & 1 deletion REQUIRE
@@ -1,5 +1,4 @@
julia 0.3-
DataFrames 0.5-
Distributions 0.4-
NumericExtensions 0.6-
NumericFuns 0.2.1-
15 changes: 8 additions & 7 deletions src/GLM.jl
@@ -1,18 +1,17 @@
using DataFrames, Distributions, NumericExtensions
using Distributions, NumericExtensions

module GLM

using DataFrames, Distributions, NumericExtensions, NumericFuns
using Distributions, NumericExtensions, NumericFuns
using Base.LinAlg.LAPACK: potrf!, potrs!
using Base.LinAlg.BLAS: gemm!, gemv!
using Base.LinAlg: QRCompactWY
using StatsBase: CoefTable, StatisticalModel, RegressionModel

import Base: (\), cholfact, cor, scale, show, size
import Distributions: fit
import DataFrames: ModelFrame, ModelMatrix, model_response
import StatsBase: coef, coeftable, confint, loglikelihood, nobs, stderr, vcov,
residuals, predict
import StatsBase
import StatsBase: coef, coeftable, confint, deviance, loglikelihood, nobs, stderr,
vcov, residuals, predict, fit
import NumericExtensions: evaluate, result_type

export # types
@@ -41,6 +40,7 @@ module GLM
devresid, # vector of squared deviance residuals
df_residual, # degrees of freedom for residuals
drsum, # sum of squared deviance residuals
fit, # function to fit models, from StatsBase
formula, # extract the formula from a model
glm, # general interface
linkfun!, # mutating link function
@@ -72,11 +72,12 @@ module GLM

abstract LinPred # linear predictor in statistical models
abstract DensePred <: LinPred # linear predictor with dense X
abstract LinPredModel <: StatisticalModel # model based on a linear predictor
abstract LinPredModel <: RegressionModel # model based on a linear predictor

include("linpred.jl")
include("lm.jl")
include("glmtools.jl")
include("glmfit.jl")
include("deprecated.jl")

end # module
60 changes: 35 additions & 25 deletions src/glmfit.jl
@@ -79,10 +79,8 @@ function wrkwt!(r::GlmResp)
end

type GlmMod <: LinPredModel
fr::ModelFrame
rr::GlmResp
pp::LinPred
ff::Formula
fit::Bool
end

@@ -92,7 +90,7 @@ function coeftable(mm::GlmMod)
zz = cc ./ se
CoefTable(hcat(cc,se,zz,2.0 * ccdf(Normal(), abs(zz))),
["Estimate","Std.Error","z value", "Pr(>|z|)"],
coefnames(mm.fr), 4)
["x$i" for i = 1:size(mm.pp.X, 2)], 4)
end

function confint(obj::GlmMod, level::Real)
@@ -102,13 +100,14 @@ confint(obj::GlmMod) = confint(obj, 0.95)

deviance(m::GlmMod) = deviance(m.rr)

function fit(m::GlmMod; verbose::Bool=false, maxIter::Integer=30, minStepFac::Real=0.001, convTol::Real=1.e-6)
function StatsBase.fit(m::GlmMod; verbose::Bool=false, maxIter::Integer=30,
minStepFac::Real=0.001, convTol::Real=1.e-6)
m.fit && return m
maxIter >= 1 || error("maxIter must be positive")
0 < minStepFac < 1 || error("minStepFac must be in (0, 1)")

cvg = false; p = m.pp; r = m.rr
scratch = similar(p.X.m)
scratch = similar(p.X)
devold = updatemu!(r, linpred(delbeta!(p, wrkresp(r), wrkwt!(r), scratch)))
installbeta!(p)
for i=1:maxIter
@@ -129,32 +128,43 @@ function fit(m::GlmMod; verbose::Bool=false, maxIter::Integer=30, minStepFac::Re
m
end

function glm(f::Formula, df::AbstractDataFrame, d::UnivariateDistribution, l::Link; dofit::Bool=true, wts=Float64[], offset=Float64[])
mf = ModelFrame(f, df)
mm = ModelMatrix(mf)
y = model_response(mf); T = eltype(y);
if T <: Integer
y = float64(y)
T = Float64
end
n = length(y); lw = length(wts)
lw == 0 || lw == n || error("length(wts) = $lw should be 0 or $n")
w = lw == 0 ? ones(T,n) : (T <: Float64 ? copy(wts) : convert(typeof(y), wts))
mu = mustart(d, y, w)
eta = linkfun!(l, similar(mu), mu)
function StatsBase.fit(m::GlmMod, y; wts=nothing, offset=nothing, fitargs...)
r = m.rr
V = typeof(r.y)
r.y = isa(y, V) ? copy(y) : convert(V, y)
wts == nothing || (r.wts = isa(wts, V) ? copy(wts) : convert(V, wts))
offset == nothing || (r.offset = isa(offset, V) ? copy(offset) : convert(V, offset))
r.mu = mustart(r.d, r.y, r.wts)
linkfun!(r.l, r.eta, r.mu)
updatemu!(r, r.eta)
fill!(m.pp.beta0, zero(eltype(m.pp.beta0)))
m.fit = false
dofit ? fit(m; fitargs...) : m
end

function StatsBase.fit{T<:FloatingPoint,V<:FPStoredVector}(::Type{GlmMod},
X::Matrix{T}, y::V, d::UnivariateDistribution,
l::Link=canonicallink(d);
dofit::Bool=true,
wts::V=fill!(similar(y), one(eltype(y))),
offset::V=similar(y, 0), fitargs...)
size(X, 1) == size(y, 1) || DimensionMismatch("number of rows in X and y must match")
n = length(y)
length(wts) == n || error("length(wts) = $lw should be 0 or $n")
wts = T <: Float64 ? copy(wts) : convert(typeof(y), wts)
off = T <: Float64 ? copy(offset) : convert(Vector{T}, offset)
mu = mustart(d, y, wts)
eta = linkfun!(l, similar(mu), mu)
if !isempty(off)
subtract!(eta, off)
end
rr = GlmResp{typeof(y)}(y, d, l, eta, mu, off, w)
res = GlmMod(mf, rr, DensePredChol(mm), f, false)
dofit ? fit(res) : res
rr = GlmResp{typeof(y)}(y, d, l, eta, mu, offset, wts)
res = GlmMod(rr, DensePredChol(X), false)
dofit ? fit(res; fitargs...) : res
end

glm(e::Expr, df::AbstractDataFrame, d::UnivariateDistribution, l::Link) = glm(Formula(e),df,d,l)
glm(e::Expr, df::AbstractDataFrame, d::UnivariateDistribution) = glm(Formula(e),df,d,canonicallink(d))
glm(f::Formula, df::AbstractDataFrame, d::UnivariateDistribution) = glm(f, df, d, canonicallink(d))
glm(s::String, df::AbstractDataFrame, d::UnivariateDistribution) = glm(Formula(parse(s)[1]), df, d)
StatsBase.fit(::Type{GlmMod}, X::Matrix, y::AbstractVector, args...; kwargs...) =
fit(GlmMod, float(X), float(y), args...; kwargs...)

## scale(m) -> estimate, s, of the scale parameter
## scale(m,true) -> estimate, s^2, of the squared scale parameter
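A sketch of the matrix-based `fit(GlmMod, ...)` entry point defined above, hand-coding the dummy columns the old formula path would have built for the Dobson example in the README (illustrative only; assumes standard treatment coding with level 1 as the baseline):

using GLM

counts = [18.0, 17, 15, 20, 10, 20, 25, 13, 12]
X = hcat(ones(9),                                      # intercept
         [0.,1,0,0,1,0,0,1,0], [0.,0,1,0,0,1,0,0,1],   # Outcome levels 2 and 3
         [0.,0,0,1,1,1,0,0,0], [0.,0,0,0,0,0,1,1,1])   # Treatment levels 2 and 3
gm = fit(GlmMod, X, counts, Poisson())                 # canonical LogLink chosen by default
deviance(gm)                                           # should reproduce the ~5.129 from test/glmFit.jl
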
31 changes: 15 additions & 16 deletions src/linpred.jl
@@ -1,7 +1,7 @@


## Return the linear predictor vector
linpred(p::LinPred, f::Real=1.) = p.X.m * (f == 0. ? p.beta0 : fma(p.beta0, p.delbeta, f))
linpred(p::LinPred, f::Real=1.) = p.X * (f == 0. ? p.beta0 : fma(p.beta0, p.delbeta, f))

## Install beta0 + f*delbeta as beta0 and zero out delbeta
function installbeta!(p::LinPred, f::Real=1.)
@@ -13,45 +13,45 @@ end
typealias BlasReal Union(Float32,Float64)

type DensePredQR{T<:BlasReal} <: DensePred
X::ModelMatrix{T} # model matrix
X::Matrix{T} # model matrix
beta0::Vector{T} # base coefficient vector
delbeta::Vector{T} # coefficient increment
qr::QRCompactWY{T}
function DensePredQR(X::ModelMatrix{T}, beta0::Vector{T})
function DensePredQR(X::Matrix{T}, beta0::Vector{T})
n,p = size(X); length(beta0) == p || error("dimension mismatch")
new(X, beta0, zeros(T,p), qrfact(X.m))
new(X, beta0, zeros(T,p), qrfact(X))
end
end
DensePredQR{T<:BlasReal}(X::ModelMatrix{T}) = DensePredQR{T}(X, zeros(T,size(X,2)))
DensePredQR{T<:BlasReal}(X::Matrix{T}) = DensePredQR{T}(X, zeros(T,size(X,2)))

cholfact{T<:FP}(p::DensePredQR{T}) = Cholesky{T}(p.qr[:R],'U')

delbeta!{T<:BlasReal}(p::DensePredQR{T}, r::Vector{T}) = (p.delbeta = p.qr\r; p)

type DensePredChol{T<:BlasReal} <: DensePred
X::ModelMatrix{T} # model matrix
beta0::Vector{T} # base vector for coefficients
delbeta::Vector{T} # coefficient increment
X::Matrix{T} # model matrix
beta0::Vector{T} # base vector for coefficients
delbeta::Vector{T} # coefficient increment
chol::Cholesky{T}
function DensePredChol(X::ModelMatrix{T}, beta0::Vector{T})
function DensePredChol(X::Matrix{T}, beta0::Vector{T})
n,p = size(X); length(beta0) == p || error("dimension mismatch")
new(X, beta0, zeros(T,p), cholfact(X.m'X.m))
new(X, beta0, zeros(T,p), cholfact(X'X))
end
end
DensePredChol{T<:BlasReal}(X::ModelMatrix{T}) = DensePredChol{T}(X, zeros(T,size(X,2)))
DensePredChol{T<:BlasReal}(X::Matrix{T}) = DensePredChol{T}(X, zeros(T,size(X,2)))

solve!{T<:BlasReal}(C::Cholesky{T}, B::StridedVecOrMat{T}) = potrs!(C.uplo, C.UL, B)

cholfact{T<:FP}(p::DensePredChol{T}) = (c = p.chol; Cholesky{T}(copy(c.UL),c.uplo))

function delbeta!{T<:BlasReal}(p::DensePredChol{T}, r::Vector{T})
solve!(p.chol, gemv!('T', 1.0, p.X.m, r, 0.0, p.delbeta))
solve!(p.chol, gemv!('T', 1.0, p.X, r, 0.0, p.delbeta))
p
end

function delbeta!{T<:BlasReal}(p::DensePredChol{T}, r::Vector{T}, wt::Vector{T}, scr::Matrix{T})
vbroadcast!(Multiply(), scr, p.X.m, wt, 1)
fac, info = potrf!('U', gemm!('T', 'N', 1.0, scr, p.X.m, 0.0, p.chol.UL))
vbroadcast!(Multiply(), scr, p.X, wt, 1)
fac, info = potrf!('U', gemm!('T', 'N', 1.0, scr, p.X, 0.0, p.chol.UL))
info == 0 || error("Singularity detected at column $info of weighted model matrix")
solve!(p.chol, gemv!('T', 1.0, scr, r, 0.0, p.delbeta))
p
@@ -73,8 +73,7 @@ cor(x::LinPredModel) = (invstd = map(RcpFun(),stderr(x)); scale!(invstd,scale!(v
stderr(x::LinPredModel) = sqrt(diag(vcov(x)))

function show(io::IO, obj::LinPredModel)
println(io, "$(obj.ff)\n\nCoefficients:")
print(io, coeftable(obj))
println(io, "$(typeof(obj)):\n\nCoefficients:", coeftable(obj))
end

## function show(io::IO, obj::GlmMod)
27 changes: 7 additions & 20 deletions src/lm.jl
@@ -26,32 +26,19 @@ result_type{T<:FP}(::WtResid,wt::T,y::T,mu::T) = T
deviance(r::LmResp) = length(r.wts) == 0 ? sumsqdiff(r.y, r.mu) : wsumsqdiff(r.wts,r.y,r.mu)
residuals(r::LmResp)= length(r.wts) == 0 ? r.y - r.mu : map(WtResid(),r.wts,r.y,r.mu)

type LmMod <: LinPredModel
fr::ModelFrame
type LmMod{T<:LinPred} <: LinPredModel
rr::LmResp
pp::LinPred
ff::Formula
pp::T
end

cholfact(x::LmMod) = cholfact(x.pp)

function lm(f::Formula, df::AbstractDataFrame)
mf = ModelFrame(f, df); mm = ModelMatrix(mf)
rr = LmResp(float(model_response(mf))); pp = DensePredQR(mm)
function StatsBase.fit{LinPredT<:LinPred}(::Type{LmMod{LinPredT}}, X::Matrix, y::Vector)
rr = LmResp(float(y)); pp = LinPredT(X)
installbeta!(delbeta!(pp, rr.y)); updatemu!(rr, linpred(pp,0.))
LmMod(mf, rr, pp, f)
LmMod(rr, pp)
end
lm(f::Expr, df::AbstractDataFrame) = lm(Formula(f), df)
lm(f::String, df::AbstractDataFrame) = lm(Formula(parse(f)[1]), df)

function lmc(f::Formula, df::AbstractDataFrame)
mf = ModelFrame(f, df); mm = ModelMatrix(mf)
rr = LmResp(model_response(mf)); pp = DensePredChol(mm)
installbeta!(delbeta!(pp, rr.y)); updatemu!(rr, linpred(pp,0.))
LmMod(mf, rr, pp, f)
end
lmc(f::Expr, df::AbstractDataFrame) = lmc(Formula(f), df)
lmc(f::String, df::AbstractDataFrame) = lmc(Formula(parse(f)[1]), df)
StatsBase.fit(::Type{LmMod}, X::Matrix, y::Vector) = StatsBase.fit(LmMod{DensePredQR}, X, y)

## scale(m) -> estimate, s, of the scale parameter
## scale(m,true) -> estimate, s^2, of the squared scale parameter
@@ -66,7 +53,7 @@ function coeftable(mm::LmMod)
tt = cc ./ se
CoefTable(hcat(cc,se,tt,ccdf(FDist(1, df_residual(mm)), abs2(tt))),
["Estimate","Std.Error","t value", "Pr(>|t|)"],
coefnames(mm.fr), 4)
["x$i" for i = 1:size(mm.pp.X, 2)], 4)
end

predict(mm::LmMod, newx::Matrix) = newx * coef(mm)
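With `LmMod` now parameterized on its predictor type, the removed `lmc` convenience has a natural replacement: pass the predictor type as the parameter of `LmMod`. A sketch using the README's Formaldehyde numbers (the qualified `GLM.DensePredChol` is used in case the predictor types are not exported):

using GLM

carb   = [0.1, 0.3, 0.5, 0.6, 0.7, 0.9]
optden = [0.086, 0.269, 0.446, 0.538, 0.626, 0.782]
X = hcat(ones(6), carb)

lm_qr   = fit(LmMod, X, optden)                      # default: QR-based DensePredQR
lm_chol = fit(LmMod{GLM.DensePredChol}, X, optden)   # Cholesky-based, covering the old lmc()
coef(lm_qr)
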
35 changes: 17 additions & 18 deletions test/glmFit.jl
@@ -1,68 +1,67 @@
using Base.Test
using GLM
using Base.Test, GLM, DataFrames

## Formaldehyde data from the R Datasets package
form = DataFrame(Carb=[0.1,0.3,0.5,0.6,0.7,0.9],OptDen=[0.086,0.269,0.446,0.538,0.626,0.782])
lm1 = lm(OptDen ~ Carb, form)
lm1 = fit(LmMod, OptDen ~ Carb, form)
@test_approx_eq coef(lm1) linreg(array(form[:Carb]),array(form[:OptDen]))

dobson = DataFrame(Counts=[18.,17,15,20,10,20,25,13,12], Outcome=gl(3,1,9), Treatment=gl(3,3))
gm1 = glm(Counts ~ Outcome + Treatment, dobson, Poisson());
gm1 = fit(GlmMod, Counts ~ Outcome + Treatment, dobson, Poisson());
@test_approx_eq deviance(gm1) 5.12914107700115
@test_approx_eq coef(gm1)[1:3] [3.044522437723423,-0.45425527227759555,-0.29298712468147375]

## Example from http://www.ats.ucla.edu/stat/r/dae/logit.htm
df = readtable(Pkg.dir("GLM","data","admit.csv.gz"))
df[:rank] = pool(df[:rank])

gm2 = glm(admit ~ gre + gpa + rank, df, Binomial())
gm2 = fit(GlmMod, admit ~ gre + gpa + rank, df, Binomial())
@test_approx_eq deviance(gm2) 458.5174924758994
@test_approx_eq coef(gm2) [-3.9899786606380734,0.0022644256521549043,0.8040374535155766,-0.6754428594116577,-1.3402038117481079,-1.5514636444657492]

gm3 = glm(admit ~ gre + gpa + rank, df, Binomial(), ProbitLink())
gm3 = fit(GlmMod, admit ~ gre + gpa + rank, df, Binomial(), ProbitLink())
@test_approx_eq deviance(gm3) 458.4131713833386
@test_approx_eq coef(gm3) [-2.3867922998680786,0.0013755394922972369,0.47772908362647015,-0.4154125854823675,-0.8121458010130356,-0.9359047862425298]

gm4 = glm(admit ~ gre + gpa + rank, df, Binomial(), CauchitLink())
gm4 = fit(GlmMod, admit ~ gre + gpa + rank, df, Binomial(), CauchitLink())
@test_approx_eq deviance(gm4) 459.3401112751141

gm5 = glm(admit ~ gre + gpa + rank, df, Binomial(), CloglogLink())
gm5 = fit(GlmMod, admit ~ gre + gpa + rank, df, Binomial(), CloglogLink())
@test_approx_eq deviance(gm5) 458.89439629612616

## Example with offsets from Venables & Ripley (2002, p.189)
df = readtable(Pkg.dir("GLM","data","anorexia.csv.gz"))
df[:Treat] = pool(df[:Treat])

gm6 = glm(Postwt ~ Prewt + Treat, df, Normal(), IdentityLink(), offset=array(df[:Prewt]))
gm6 = fit(GlmMod, Postwt ~ Prewt + Treat, df, Normal(), IdentityLink(), offset=array(df[:Prewt]))
@test_approx_eq deviance(gm6) 3311.262619919613
@test_approx_eq coef(gm6) [49.7711090149846,-0.5655388496391,-4.0970655280729,4.5630626529188]
@test_approx_eq scale(gm6, true) 48.6950385282296
@test_approx_eq scale(gm6.model, true) 48.6950385282296
@test_approx_eq stderr(gm6) [13.3909581420259,0.1611823618518,1.8934926069669,2.1333359226431]

gm7 = fit(glm(Postwt ~ Prewt + Treat, df, Normal(), LogLink(), offset=array(df[:Prewt]), dofit=false),
gm7 = fit(GlmMod, Postwt ~ Prewt + Treat, df, Normal(), LogLink(), offset=array(df[:Prewt]),
convTol=1e-8)
@test_approx_eq deviance(gm7) 3265.207242977156
@test_approx_eq coef(gm7) [3.992326787835955,-0.994452693131178,-0.050698258703974,0.051494029957641]
@test_approx_eq scale(gm7, true) 48.01787789178518
@test_approx_eq scale(gm7.model, true) 48.01787789178518
@test_approx_eq stderr(gm7) [0.157167944259695,0.001886285986164,0.022584069426311,0.023882826190166]

## Gamma example from McCullagh & Nelder (1989, pp. 300-2)
clotting = DataFrame(u = log([5,10,15,20,30,40,60,80,100]),
lot1 = [118,58,42,35,27,25,21,19,18])
gm8 = glm(lot1 ~ u, clotting, Gamma())
gm8 = fit(GlmMod, lot1 ~ u, clotting, Gamma())
@test_approx_eq deviance(gm8) 0.01672971517848353
@test_approx_eq coef(gm8) [-0.01655438172784895,0.01534311491072141]
@test_approx_eq scale(gm8, true) 0.002446059333495581
@test_approx_eq scale(gm8.model, true) 0.002446059333495581
@test_approx_eq stderr(gm8) [0.0009275466067257,0.0004149596425600]

gm9 = fit(glm(lot1 ~ u, clotting, Gamma(), LogLink(), dofit=false), convTol=1e-8)
gm9 = fit(GlmMod, lot1 ~ u, clotting, Gamma(), LogLink(), convTol=1e-8)
@test_approx_eq deviance(gm9) 0.16260829451739
@test_approx_eq coef(gm9) [5.50322528458221,-0.60191617825971]
@test_approx_eq scale(gm9, true) 0.02435442293561081
@test_approx_eq scale(gm9.model, true) 0.02435442293561081
@test_approx_eq stderr(gm9) [0.19030107482720,0.05530784660144]

gm10 = fit(glm(lot1 ~ u, clotting, Gamma(), IdentityLink(), dofit=false), convTol=1e-8)
gm10 = fit(GlmMod, lot1 ~ u, clotting, Gamma(), IdentityLink(), convTol=1e-8)
@test_approx_eq deviance(gm10) 0.60845414895344
@test_approx_eq coef(gm10) [99.250446880986,-18.374324929002]
@test_approx_eq scale(gm10, true) 0.1041772704067886
@test_approx_eq scale(gm10.model, true) 0.1041772704067886
@test_approx_eq stderr(gm10) [17.864388462865,4.297968703823]
