Uncouple GLM from DataFrames
simonster committed Mar 30, 2014
2 parents 939bd42 + 29eb273 commit 8b31cbd
Showing 8 changed files with 86 additions and 91 deletions.
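In short, the commit replaces the DataFrames-specific entry points (`lm`, `lmc`, and the formula methods of `glm`) with `StatsBase.fit` methods that take either a formula plus a DataFrame or a plain design matrix and response vector. The formula path is no longer defined inside GLM itself; the Travis change below adds and checks out DataFrames, which presumably supplies that glue on its master branch. A minimal sketch of the two call styles after this commit, using the first rows of the README's Formaldehyde data (the explicit intercept column in the matrix form is needed because no ModelMatrix machinery is involved):

using GLM, DataFrames

carb   = [0.1, 0.3, 0.5, 0.6]
optden = [0.086, 0.269, 0.446, 0.538]
df = DataFrame(Carb = carb, OptDen = optden)

lm1 = fit(LmMod, OptDen ~ Carb, df)             # formula interface, now routed through fit
lm2 = fit(LmMod, hcat(ones(4), carb), optden)   # matrix interface, needs no DataFrames at all
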
2 changes: 1 addition & 1 deletion .travis.yml
@@ -9,6 +9,6 @@ before_install:
- sudo apt-get update -qq -y
- sudo apt-get install libpcre3-dev julia -y
script:
- julia -e 'Pkg.init(); run(`ln -s $(pwd()) $(Pkg.dir())/GLM`); Pkg.resolve()'
- julia -e 'Pkg.init(); run(`ln -s $(pwd()) $(Pkg.dir())/GLM`); Pkg.resolve(); Pkg.add("DataFrames"); Pkg.checkout("DataFrames")'
- julia -e 'using GLM; @assert isdefined(:GLM); @assert typeof(GLM) === Module'
- julia ./test/glmFit.jl
6 changes: 3 additions & 3 deletions README.md
@@ -51,7 +51,7 @@ julia> form = dataset("datasets","Formaldehyde")
| 5 | 0.7 | 0.626 |
| 6 | 0.9 | 0.782 |

julia> lm1 = lm(OptDen ~ Carb, form)
julia> lm1 = fit(LmMod, OptDen ~ Carb, form)
Formula: OptDen ~ Carb

Coefficients:
@@ -103,7 +103,7 @@ julia> LifeCycleSavings = dataset("datasets", "LifeCycleSavings")
| 49 | Libya | 8.89 | 43.69 | 2.07 | 123.58 | 16.71 |
| 50 | Malaysia | 4.71 | 47.2 | 0.66 | 242.69 | 5.08 |

julia> fm2 = lm(SR ~ Pop15 + Pop75 + DPI + DDPI, LifeCycleSavings)
julia> fm2 = fit(LmMod, SR ~ Pop15 + Pop75 + DPI + DDPI, LifeCycleSavings)
Formula: SR ~ :(+(Pop15,Pop75,DPI,DDPI))

Coefficients:
@@ -203,7 +203,7 @@ julia> dobson = DataFrame(Counts = [18.,17,15,20,10,20,25,13,12],
| 8 | 13.0 | 2 | 3 |
| 9 | 12.0 | 3 | 3 |

julia> gm1 = glm(Counts ~ Outcome + Treatment, dobson, Poisson())
julia> gm1 = fit(GlmMod, Counts ~ Outcome + Treatment, dobson, Poisson())
Formula: Counts ~ :(+(Outcome,Treatment))

Coefficients:
1 change: 0 additions & 1 deletion REQUIRE
@@ -1,5 +1,4 @@
julia 0.3-
DataFrames 0.5-
Distributions 0.4-
NumericExtensions 0.6-
NumericFuns 0.2.1-
15 changes: 8 additions & 7 deletions src/GLM.jl
@@ -1,18 +1,17 @@
using DataFrames, Distributions, NumericExtensions
using Distributions, NumericExtensions

module GLM

using DataFrames, Distributions, NumericExtensions, NumericFuns
using Distributions, NumericExtensions, NumericFuns
using Base.LinAlg.LAPACK: potrf!, potrs!
using Base.LinAlg.BLAS: gemm!, gemv!
using Base.LinAlg: QRCompactWY
using StatsBase: CoefTable, StatisticalModel, RegressionModel

import Base: (\), cholfact, cor, scale, show, size
import Distributions: fit
import DataFrames: ModelFrame, ModelMatrix, model_response
import StatsBase: coef, coeftable, confint, loglikelihood, nobs, stderr, vcov,
residuals, predict
import StatsBase
import StatsBase: coef, coeftable, confint, deviance, loglikelihood, nobs, stderr,
vcov, residuals, predict, fit
import NumericExtensions: evaluate, result_type

export # types
@@ -41,6 +40,7 @@ module GLM
devresid, # vector of squared deviance residuals
df_residual, # degrees of freedom for residuals
drsum, # sum of squared deviance residuals
fit, # function to fit models, from StatsBase
formula, # extract the formula from a model
glm, # general interface
linkfun!, # mutating link function
@@ -72,11 +72,12 @@ module GLM

abstract LinPred # linear predictor in statistical models
abstract DensePred <: LinPred # linear predictor with dense X
abstract LinPredModel <: StatisticalModel # model based on a linear predictor
abstract LinPredModel <: RegressionModel # model based on a linear predictor

include("linpred.jl")
include("lm.jl")
include("glmtools.jl")
include("glmfit.jl")
include("deprecated.jl")

end # module
60 changes: 35 additions & 25 deletions src/glmfit.jl
@@ -79,10 +79,8 @@ function wrkwt!(r::GlmResp)
end

type GlmMod <: LinPredModel
fr::ModelFrame
rr::GlmResp
pp::LinPred
ff::Formula
fit::Bool
end

@@ -92,7 +90,7 @@ function coeftable(mm::GlmMod)
zz = cc ./ se
CoefTable(hcat(cc,se,zz,2.0 * ccdf(Normal(), abs(zz))),
["Estimate","Std.Error","z value", "Pr(>|z|)"],
coefnames(mm.fr), 4)
["x$i" for i = 1:size(mm.pp.X, 2)], 4)
end

function confint(obj::GlmMod, level::Real)
@@ -102,13 +100,14 @@ confint(obj::GlmMod) = confint(obj, 0.95)

deviance(m::GlmMod) = deviance(m.rr)

function fit(m::GlmMod; verbose::Bool=false, maxIter::Integer=30, minStepFac::Real=0.001, convTol::Real=1.e-6)
function StatsBase.fit(m::GlmMod; verbose::Bool=false, maxIter::Integer=30,
minStepFac::Real=0.001, convTol::Real=1.e-6)
m.fit && return m
maxIter >= 1 || error("maxIter must be positive")
0 < minStepFac < 1 || error("minStepFac must be in (0, 1)")

cvg = false; p = m.pp; r = m.rr
scratch = similar(p.X.m)
scratch = similar(p.X)
devold = updatemu!(r, linpred(delbeta!(p, wrkresp(r), wrkwt!(r), scratch)))
installbeta!(p)
for i=1:maxIter
@@ -129,32 +128,43 @@ function fit(m::GlmMod; verbose::Bool=false, maxIter::Integer=30, minStepFac::Re
m
end

function glm(f::Formula, df::AbstractDataFrame, d::UnivariateDistribution, l::Link; dofit::Bool=true, wts=Float64[], offset=Float64[])
mf = ModelFrame(f, df)
mm = ModelMatrix(mf)
y = model_response(mf); T = eltype(y);
if T <: Integer
y = float64(y)
T = Float64
end
n = length(y); lw = length(wts)
lw == 0 || lw == n || error("length(wts) = $lw should be 0 or $n")
w = lw == 0 ? ones(T,n) : (T <: Float64 ? copy(wts) : convert(typeof(y), wts))
mu = mustart(d, y, w)
eta = linkfun!(l, similar(mu), mu)
function StatsBase.fit(m::GlmMod, y; wts=nothing, offset=nothing, fitargs...)
r = m.rr
V = typeof(r.y)
r.y = isa(y, V) ? copy(y) : convert(V, y)
wts == nothing || (r.wts = isa(wts, V) ? copy(wts) : convert(V, wts))
offset == nothing || (r.offset = isa(offset, V) ? copy(offset) : convert(V, offset))
r.mu = mustart(r.d, r.y, r.wts)
linkfun!(r.l, r.eta, r.mu)
updatemu!(r, r.eta)
fill!(m.pp.beta0, zero(eltype(m.pp.beta0)))
m.fit = false
dofit ? fit(m; fitargs...) : m
end

function StatsBase.fit{T<:FloatingPoint,V<:FPStoredVector}(::Type{GlmMod},
X::Matrix{T}, y::V, d::UnivariateDistribution,
l::Link=canonicallink(d);
dofit::Bool=true,
wts::V=fill!(similar(y), one(eltype(y))),
offset::V=similar(y, 0), fitargs...)
size(X, 1) == size(y, 1) || DimensionMismatch("number of rows in X and y must match")
n = length(y)
length(wts) == n || error("length(wts) = $lw should be 0 or $n")
wts = T <: Float64 ? copy(wts) : convert(typeof(y), wts)
off = T <: Float64 ? copy(offset) : convert(Vector{T}, offset)
mu = mustart(d, y, wts)
eta = linkfun!(l, similar(mu), mu)
if !isempty(off)
subtract!(eta, off)
end
rr = GlmResp{typeof(y)}(y, d, l, eta, mu, off, w)
res = GlmMod(mf, rr, DensePredChol(mm), f, false)
dofit ? fit(res) : res
rr = GlmResp{typeof(y)}(y, d, l, eta, mu, offset, wts)
res = GlmMod(rr, DensePredChol(X), false)
dofit ? fit(res; fitargs...) : res
end

glm(e::Expr, df::AbstractDataFrame, d::UnivariateDistribution, l::Link) = glm(Formula(e),df,d,l)
glm(e::Expr, df::AbstractDataFrame, d::UnivariateDistribution) = glm(Formula(e),df,d,canonicallink(d))
glm(f::Formula, df::AbstractDataFrame, d::UnivariateDistribution) = glm(f, df, d, canonicallink(d))
glm(s::String, df::AbstractDataFrame, d::UnivariateDistribution) = glm(Formula(parse(s)[1]), df, d)
StatsBase.fit(::Type{GlmMod}, X::Matrix, y::AbstractVector, args...; kwargs...) =
fit(GlmMod, float(X), float(y), args...; kwargs...)

## scale(m) -> estimate, s, of the scale parameter
## scale(m,true) -> estimate, s^2, of the squared scale parameter
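A sketch of the matrix-based `fit(GlmMod, ...)` entry point defined above, hand-coding the dummy columns the old formula path would have built for the Dobson example in the README (illustrative only; assumes standard treatment coding with level 1 as the baseline):

using GLM

counts = [18.0, 17, 15, 20, 10, 20, 25, 13, 12]
X = hcat(ones(9),                                      # intercept
         [0.,1,0,0,1,0,0,1,0], [0.,0,1,0,0,1,0,0,1],   # Outcome levels 2 and 3
         [0.,0,0,1,1,1,0,0,0], [0.,0,0,0,0,0,1,1,1])   # Treatment levels 2 and 3
gm = fit(GlmMod, X, counts, Poisson())                 # canonical LogLink chosen by default
deviance(gm)                                           # should reproduce the ~5.129 from test/glmFit.jl
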
31 changes: 15 additions & 16 deletions src/linpred.jl
@@ -1,7 +1,7 @@


## Return the linear predictor vector
linpred(p::LinPred, f::Real=1.) = p.X.m * (f == 0. ? p.beta0 : fma(p.beta0, p.delbeta, f))
linpred(p::LinPred, f::Real=1.) = p.X * (f == 0. ? p.beta0 : fma(p.beta0, p.delbeta, f))

## Install beta0 + f*delbeta as beta0 and zero out delbeta
function installbeta!(p::LinPred, f::Real=1.)
@@ -13,45 +13,45 @@ end
typealias BlasReal Union(Float32,Float64)

type DensePredQR{T<:BlasReal} <: DensePred
X::ModelMatrix{T} # model matrix
X::Matrix{T} # model matrix
beta0::Vector{T} # base coefficient vector
delbeta::Vector{T} # coefficient increment
qr::QRCompactWY{T}
function DensePredQR(X::ModelMatrix{T}, beta0::Vector{T})
function DensePredQR(X::Matrix{T}, beta0::Vector{T})
n,p = size(X); length(beta0) == p || error("dimension mismatch")
new(X, beta0, zeros(T,p), qrfact(X.m))
new(X, beta0, zeros(T,p), qrfact(X))
end
end
DensePredQR{T<:BlasReal}(X::ModelMatrix{T}) = DensePredQR{T}(X, zeros(T,size(X,2)))
DensePredQR{T<:BlasReal}(X::Matrix{T}) = DensePredQR{T}(X, zeros(T,size(X,2)))

cholfact{T<:FP}(p::DensePredQR{T}) = Cholesky{T}(p.qr[:R],'U')

delbeta!{T<:BlasReal}(p::DensePredQR{T}, r::Vector{T}) = (p.delbeta = p.qr\r; p)

type DensePredChol{T<:BlasReal} <: DensePred
X::ModelMatrix{T} # model matrix
beta0::Vector{T} # base vector for coefficients
delbeta::Vector{T} # coefficient increment
X::Matrix{T} # model matrix
beta0::Vector{T} # base vector for coefficients
delbeta::Vector{T} # coefficient increment
chol::Cholesky{T}
function DensePredChol(X::ModelMatrix{T}, beta0::Vector{T})
function DensePredChol(X::Matrix{T}, beta0::Vector{T})
n,p = size(X); length(beta0) == p || error("dimension mismatch")
new(X, beta0, zeros(T,p), cholfact(X.m'X.m))
new(X, beta0, zeros(T,p), cholfact(X'X))
end
end
DensePredChol{T<:BlasReal}(X::ModelMatrix{T}) = DensePredChol{T}(X, zeros(T,size(X,2)))
DensePredChol{T<:BlasReal}(X::Matrix{T}) = DensePredChol{T}(X, zeros(T,size(X,2)))

solve!{T<:BlasReal}(C::Cholesky{T}, B::StridedVecOrMat{T}) = potrs!(C.uplo, C.UL, B)

cholfact{T<:FP}(p::DensePredChol{T}) = (c = p.chol; Cholesky{T}(copy(c.UL),c.uplo))

function delbeta!{T<:BlasReal}(p::DensePredChol{T}, r::Vector{T})
solve!(p.chol, gemv!('T', 1.0, p.X.m, r, 0.0, p.delbeta))
solve!(p.chol, gemv!('T', 1.0, p.X, r, 0.0, p.delbeta))
p
end

function delbeta!{T<:BlasReal}(p::DensePredChol{T}, r::Vector{T}, wt::Vector{T}, scr::Matrix{T})
vbroadcast!(Multiply(), scr, p.X.m, wt, 1)
fac, info = potrf!('U', gemm!('T', 'N', 1.0, scr, p.X.m, 0.0, p.chol.UL))
vbroadcast!(Multiply(), scr, p.X, wt, 1)
fac, info = potrf!('U', gemm!('T', 'N', 1.0, scr, p.X, 0.0, p.chol.UL))
info == 0 || error("Singularity detected at column $info of weighted model matrix")
solve!(p.chol, gemv!('T', 1.0, scr, r, 0.0, p.delbeta))
p
@@ -73,8 +73,7 @@ cor(x::LinPredModel) = (invstd = map(RcpFun(),stderr(x)); scale!(invstd,scale!(v
stderr(x::LinPredModel) = sqrt(diag(vcov(x)))

function show(io::IO, obj::LinPredModel)
println(io, "$(obj.ff)\n\nCoefficients:")
print(io, coeftable(obj))
println(io, "$(typeof(obj)):\n\nCoefficients:", coeftable(obj))
end

## function show(io::IO, obj::GlmMod)
27 changes: 7 additions & 20 deletions src/lm.jl
@@ -26,32 +26,19 @@ result_type{T<:FP}(::WtResid,wt::T,y::T,mu::T) = T
deviance(r::LmResp) = length(r.wts) == 0 ? sumsqdiff(r.y, r.mu) : wsumsqdiff(r.wts,r.y,r.mu)
residuals(r::LmResp)= length(r.wts) == 0 ? r.y - r.mu : map(WtResid(),r.wts,r.y,r.mu)

type LmMod <: LinPredModel
fr::ModelFrame
type LmMod{T<:LinPred} <: LinPredModel
rr::LmResp
pp::LinPred
ff::Formula
pp::T
end

cholfact(x::LmMod) = cholfact(x.pp)

function lm(f::Formula, df::AbstractDataFrame)
mf = ModelFrame(f, df); mm = ModelMatrix(mf)
rr = LmResp(float(model_response(mf))); pp = DensePredQR(mm)
function StatsBase.fit{LinPredT<:LinPred}(::Type{LmMod{LinPredT}}, X::Matrix, y::Vector)
rr = LmResp(float(y)); pp = LinPredT(X)
installbeta!(delbeta!(pp, rr.y)); updatemu!(rr, linpred(pp,0.))
LmMod(mf, rr, pp, f)
LmMod(rr, pp)
end
lm(f::Expr, df::AbstractDataFrame) = lm(Formula(f), df)
lm(f::String, df::AbstractDataFrame) = lm(Formula(parse(f)[1]), df)

function lmc(f::Formula, df::AbstractDataFrame)
mf = ModelFrame(f, df); mm = ModelMatrix(mf)
rr = LmResp(model_response(mf)); pp = DensePredChol(mm)
installbeta!(delbeta!(pp, rr.y)); updatemu!(rr, linpred(pp,0.))
LmMod(mf, rr, pp, f)
end
lmc(f::Expr, df::AbstractDataFrame) = lmc(Formula(f), df)
lmc(f::String, df::AbstractDataFrame) = lmc(Formula(parse(f)[1]), df)
StatsBase.fit(::Type{LmMod}, X::Matrix, y::Vector) = StatsBase.fit(LmMod{DensePredQR}, X, y)

## scale(m) -> estimate, s, of the scale parameter
## scale(m,true) -> estimate, s^2, of the squared scale parameter
@@ -66,7 +53,7 @@ function coeftable(mm::LmMod)
tt = cc ./ se
CoefTable(hcat(cc,se,tt,ccdf(FDist(1, df_residual(mm)), abs2(tt))),
["Estimate","Std.Error","t value", "Pr(>|t|)"],
coefnames(mm.fr), 4)
["x$i" for i = 1:size(mm.pp.X, 2)], 4)
end

predict(mm::LmMod, newx::Matrix) = newx * coef(mm)
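With `LmMod` now parameterized on its predictor type, the removed `lmc` convenience has a natural replacement: pass the predictor type as the parameter of `LmMod`. A sketch using the README's Formaldehyde numbers (the qualified `GLM.DensePredChol` is used in case the predictor types are not exported):

using GLM

carb   = [0.1, 0.3, 0.5, 0.6, 0.7, 0.9]
optden = [0.086, 0.269, 0.446, 0.538, 0.626, 0.782]
X = hcat(ones(6), carb)

lm_qr   = fit(LmMod, X, optden)                      # default: QR-based DensePredQR
lm_chol = fit(LmMod{GLM.DensePredChol}, X, optden)   # Cholesky-based, covering the old lmc()
coef(lm_qr)
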
35 changes: 17 additions & 18 deletions test/glmFit.jl
@@ -1,68 +1,67 @@
using Base.Test
using GLM
using Base.Test, GLM, DataFrames

## Formaldehyde data from the R Datasets package
form = DataFrame(Carb=[0.1,0.3,0.5,0.6,0.7,0.9],OptDen=[0.086,0.269,0.446,0.538,0.626,0.782])
lm1 = lm(OptDen ~ Carb, form)
lm1 = fit(LmMod, OptDen ~ Carb, form)
@test_approx_eq coef(lm1) linreg(array(form[:Carb]),array(form[:OptDen]))

dobson = DataFrame(Counts=[18.,17,15,20,10,20,25,13,12], Outcome=gl(3,1,9), Treatment=gl(3,3))
gm1 = glm(Counts ~ Outcome + Treatment, dobson, Poisson());
gm1 = fit(GlmMod, Counts ~ Outcome + Treatment, dobson, Poisson());
@test_approx_eq deviance(gm1) 5.12914107700115
@test_approx_eq coef(gm1)[1:3] [3.044522437723423,-0.45425527227759555,-0.29298712468147375]

## Example from http://www.ats.ucla.edu/stat/r/dae/logit.htm
df = readtable(Pkg.dir("GLM","data","admit.csv.gz"))
df[:rank] = pool(df[:rank])

gm2 = glm(admit ~ gre + gpa + rank, df, Binomial())
gm2 = fit(GlmMod, admit ~ gre + gpa + rank, df, Binomial())
@test_approx_eq deviance(gm2) 458.5174924758994
@test_approx_eq coef(gm2) [-3.9899786606380734,0.0022644256521549043,0.8040374535155766,-0.6754428594116577,-1.3402038117481079,-1.5514636444657492]

gm3 = glm(admit ~ gre + gpa + rank, df, Binomial(), ProbitLink())
gm3 = fit(GlmMod, admit ~ gre + gpa + rank, df, Binomial(), ProbitLink())
@test_approx_eq deviance(gm3) 458.4131713833386
@test_approx_eq coef(gm3) [-2.3867922998680786,0.0013755394922972369,0.47772908362647015,-0.4154125854823675,-0.8121458010130356,-0.9359047862425298]

gm4 = glm(admit ~ gre + gpa + rank, df, Binomial(), CauchitLink())
gm4 = fit(GlmMod, admit ~ gre + gpa + rank, df, Binomial(), CauchitLink())
@test_approx_eq deviance(gm4) 459.3401112751141

gm5 = glm(admit ~ gre + gpa + rank, df, Binomial(), CloglogLink())
gm5 = fit(GlmMod, admit ~ gre + gpa + rank, df, Binomial(), CloglogLink())
@test_approx_eq deviance(gm5) 458.89439629612616

## Example with offsets from Venables & Ripley (2002, p.189)
df = readtable(Pkg.dir("GLM","data","anorexia.csv.gz"))
df[:Treat] = pool(df[:Treat])

gm6 = glm(Postwt ~ Prewt + Treat, df, Normal(), IdentityLink(), offset=array(df[:Prewt]))
gm6 = fit(GlmMod, Postwt ~ Prewt + Treat, df, Normal(), IdentityLink(), offset=array(df[:Prewt]))
@test_approx_eq deviance(gm6) 3311.262619919613
@test_approx_eq coef(gm6) [49.7711090149846,-0.5655388496391,-4.0970655280729,4.5630626529188]
@test_approx_eq scale(gm6, true) 48.6950385282296
@test_approx_eq scale(gm6.model, true) 48.6950385282296
@test_approx_eq stderr(gm6) [13.3909581420259,0.1611823618518,1.8934926069669,2.1333359226431]

gm7 = fit(glm(Postwt ~ Prewt + Treat, df, Normal(), LogLink(), offset=array(df[:Prewt]), dofit=false),
gm7 = fit(GlmMod, Postwt ~ Prewt + Treat, df, Normal(), LogLink(), offset=array(df[:Prewt]),
convTol=1e-8)
@test_approx_eq deviance(gm7) 3265.207242977156
@test_approx_eq coef(gm7) [3.992326787835955,-0.994452693131178,-0.050698258703974,0.051494029957641]
@test_approx_eq scale(gm7, true) 48.01787789178518
@test_approx_eq scale(gm7.model, true) 48.01787789178518
@test_approx_eq stderr(gm7) [0.157167944259695,0.001886285986164,0.022584069426311,0.023882826190166]

## Gamma example from McCullagh & Nelder (1989, pp. 300-2)
clotting = DataFrame(u = log([5,10,15,20,30,40,60,80,100]),
lot1 = [118,58,42,35,27,25,21,19,18])
gm8 = glm(lot1 ~ u, clotting, Gamma())
gm8 = fit(GlmMod, lot1 ~ u, clotting, Gamma())
@test_approx_eq deviance(gm8) 0.01672971517848353
@test_approx_eq coef(gm8) [-0.01655438172784895,0.01534311491072141]
@test_approx_eq scale(gm8, true) 0.002446059333495581
@test_approx_eq scale(gm8.model, true) 0.002446059333495581
@test_approx_eq stderr(gm8) [0.0009275466067257,0.0004149596425600]

gm9 = fit(glm(lot1 ~ u, clotting, Gamma(), LogLink(), dofit=false), convTol=1e-8)
gm9 = fit(GlmMod, lot1 ~ u, clotting, Gamma(), LogLink(), convTol=1e-8)
@test_approx_eq deviance(gm9) 0.16260829451739
@test_approx_eq coef(gm9) [5.50322528458221,-0.60191617825971]
@test_approx_eq scale(gm9, true) 0.02435442293561081
@test_approx_eq scale(gm9.model, true) 0.02435442293561081
@test_approx_eq stderr(gm9) [0.19030107482720,0.05530784660144]

gm10 = fit(glm(lot1 ~ u, clotting, Gamma(), IdentityLink(), dofit=false), convTol=1e-8)
gm10 = fit(GlmMod, lot1 ~ u, clotting, Gamma(), IdentityLink(), convTol=1e-8)
@test_approx_eq deviance(gm10) 0.60845414895344
@test_approx_eq coef(gm10) [99.250446880986,-18.374324929002]
@test_approx_eq scale(gm10, true) 0.1041772704067886
@test_approx_eq scale(gm10.model, true) 0.1041772704067886
@test_approx_eq stderr(gm10) [17.864388462865,4.297968703823]
