diff --git a/.travis.yml b/.travis.yml index b765ab4b..041a5a9f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -9,6 +9,6 @@ before_install: - sudo apt-get update -qq -y - sudo apt-get install libpcre3-dev julia -y script: - - julia -e 'Pkg.init(); run(`ln -s $(pwd()) $(Pkg.dir())/GLM`); Pkg.resolve()' + - julia -e 'Pkg.init(); run(`ln -s $(pwd()) $(Pkg.dir())/GLM`); Pkg.resolve(); Pkg.add("DataFrames")' - julia -e 'using GLM; @assert isdefined(:GLM); @assert typeof(GLM) === Module' - julia ./test/glmFit.jl diff --git a/README.md b/README.md index b86f5031..b874d376 100644 --- a/README.md +++ b/README.md @@ -51,7 +51,7 @@ julia> form = dataset("datasets","Formaldehyde") | 5 | 0.7 | 0.626 | | 6 | 0.9 | 0.782 | -julia> lm1 = lm(OptDen ~ Carb, form) +julia> lm1 = fit(LmMod, OptDen ~ Carb, form) Formula: OptDen ~ Carb Coefficients: @@ -103,7 +103,7 @@ julia> LifeCycleSavings = dataset("datasets", "LifeCycleSavings") | 49 | Libya | 8.89 | 43.69 | 2.07 | 123.58 | 16.71 | | 50 | Malaysia | 4.71 | 47.2 | 0.66 | 242.69 | 5.08 | -julia> fm2 = lm(SR ~ Pop15 + Pop75 + DPI + DDPI, LifeCycleSavings) +julia> fm2 = fit(LmMod, SR ~ Pop15 + Pop75 + DPI + DDPI, LifeCycleSavings) Formula: SR ~ :(+(Pop15,Pop75,DPI,DDPI)) Coefficients: @@ -203,7 +203,7 @@ julia> dobson = DataFrame(Counts = [18.,17,15,20,10,20,25,13,12], | 8 | 13.0 | 2 | 3 | | 9 | 12.0 | 3 | 3 | -julia> gm1 = glm(Counts ~ Outcome + Treatment, dobson, Poisson()) +julia> gm1 = fit(GlmMod, Counts ~ Outcome + Treatment, dobson, Poisson()) Formula: Counts ~ :(+(Outcome,Treatment)) Coefficients: diff --git a/REQUIRE b/REQUIRE index e2333206..06e93b09 100644 --- a/REQUIRE +++ b/REQUIRE @@ -1,5 +1,4 @@ julia 0.3- -DataFrames 0.5- Distributions 0.4- NumericExtensions 0.6- NumericFuns 0.2.1- diff --git a/src/GLM.jl b/src/GLM.jl index 29def5fd..df55a6dd 100644 --- a/src/GLM.jl +++ b/src/GLM.jl @@ -1,18 +1,17 @@ -using DataFrames, Distributions, NumericExtensions +using Distributions, NumericExtensions module GLM - using DataFrames, Distributions, NumericExtensions, NumericFuns + using Distributions, NumericExtensions, NumericFuns using Base.LinAlg.LAPACK: potrf!, potrs! using Base.LinAlg.BLAS: gemm!, gemv! using Base.LinAlg: QRCompactWY using StatsBase: CoefTable, StatisticalModel, RegressionModel import Base: (\), cholfact, cor, scale, show, size - import Distributions: fit - import DataFrames: ModelFrame, ModelMatrix, model_response - import StatsBase: coef, coeftable, confint, loglikelihood, nobs, stderr, vcov, - residuals, predict + import StatsBase + import StatsBase: coef, coeftable, confint, deviance, loglikelihood, nobs, stderr, + vcov, residuals, predict, fit import NumericExtensions: evaluate, result_type export # types @@ -41,6 +40,7 @@ module GLM devresid, # vector of squared deviance residuals df_residual, # degrees of freedom for residuals drsum, # sum of squared deviance residuals + fit, # function to fit models, from StatsBase formula, # extract the formula from a model glm, # general interface linkfun!, # mutating link function @@ -72,11 +72,12 @@ module GLM abstract LinPred # linear predictor in statistical models abstract DensePred <: LinPred # linear predictor with dense X - abstract LinPredModel <: StatisticalModel # model based on a linear predictor + abstract LinPredModel <: RegressionModel # model based on a linear predictor include("linpred.jl") include("lm.jl") include("glmtools.jl") include("glmfit.jl") + include("deprecated.jl") end # module diff --git a/src/glmfit.jl b/src/glmfit.jl index 44f7039b..b02b2315 100644 --- a/src/glmfit.jl +++ b/src/glmfit.jl @@ -79,10 +79,8 @@ function wrkwt!(r::GlmResp) end type GlmMod <: LinPredModel - fr::ModelFrame rr::GlmResp pp::LinPred - ff::Formula fit::Bool end @@ -92,7 +90,7 @@ function coeftable(mm::GlmMod) zz = cc ./ se CoefTable(hcat(cc,se,zz,2.0 * ccdf(Normal(), abs(zz))), ["Estimate","Std.Error","z value", "Pr(>|z|)"], - coefnames(mm.fr), 4) + ["x$i" for i = 1:size(mm.pp.X, 2)], 4) end function confint(obj::GlmMod, level::Real) @@ -102,13 +100,14 @@ confint(obj::GlmMod) = confint(obj, 0.95) deviance(m::GlmMod) = deviance(m.rr) -function fit(m::GlmMod; verbose::Bool=false, maxIter::Integer=30, minStepFac::Real=0.001, convTol::Real=1.e-6) +function StatsBase.fit(m::GlmMod; verbose::Bool=false, maxIter::Integer=30, + minStepFac::Real=0.001, convTol::Real=1.e-6) m.fit && return m maxIter >= 1 || error("maxIter must be positive") 0 < minStepFac < 1 || error("minStepFac must be in (0, 1)") cvg = false; p = m.pp; r = m.rr - scratch = similar(p.X.m) + scratch = similar(p.X) devold = updatemu!(r, linpred(delbeta!(p, wrkresp(r), wrkwt!(r), scratch))) installbeta!(p) for i=1:maxIter @@ -129,32 +128,43 @@ function fit(m::GlmMod; verbose::Bool=false, maxIter::Integer=30, minStepFac::Re m end -function glm(f::Formula, df::AbstractDataFrame, d::UnivariateDistribution, l::Link; dofit::Bool=true, wts=Float64[], offset=Float64[]) - mf = ModelFrame(f, df) - mm = ModelMatrix(mf) - y = model_response(mf); T = eltype(y); - if T <: Integer - y = float64(y) - T = Float64 - end - n = length(y); lw = length(wts) - lw == 0 || lw == n || error("length(wts) = $lw should be 0 or $n") - w = lw == 0 ? ones(T,n) : (T <: Float64 ? copy(wts) : convert(typeof(y), wts)) - mu = mustart(d, y, w) - eta = linkfun!(l, similar(mu), mu) +function StatsBase.fit(m::GlmMod, y; wts=nothing, offset=nothing, fitargs...) + r = m.rr + V = typeof(r.y) + r.y = isa(y, V) ? copy(y) : convert(V, y) + wts == nothing || (r.wts = isa(wts, V) ? copy(wts) : convert(V, wts)) + offset == nothing || (r.offset = isa(offset, V) ? copy(offset) : convert(V, offset)) + r.mu = mustart(r.d, r.y, r.wts) + linkfun!(r.l, r.eta, r.mu) + updatemu!(r, r.eta) + fill!(m.pp.beta0, zero(eltype(m.pp.beta0))) + m.fit = false + dofit ? fit(m; fitargs...) : m +end + +function StatsBase.fit{T<:FloatingPoint,V<:FPStoredVector}(::Type{GlmMod}, + X::Matrix{T}, y::V, d::UnivariateDistribution, + l::Link=canonicallink(d); + dofit::Bool=true, + wts::V=fill!(similar(y), one(eltype(y))), + offset::V=similar(y, 0), fitargs...) + size(X, 1) == size(y, 1) || DimensionMismatch("number of rows in X and y must match") + n = length(y) + length(wts) == n || error("length(wts) = $lw should be 0 or $n") + wts = T <: Float64 ? copy(wts) : convert(typeof(y), wts) off = T <: Float64 ? copy(offset) : convert(Vector{T}, offset) + mu = mustart(d, y, wts) + eta = linkfun!(l, similar(mu), mu) if !isempty(off) subtract!(eta, off) end - rr = GlmResp{typeof(y)}(y, d, l, eta, mu, off, w) - res = GlmMod(mf, rr, DensePredChol(mm), f, false) - dofit ? fit(res) : res + rr = GlmResp{typeof(y)}(y, d, l, eta, mu, offset, wts) + res = GlmMod(rr, DensePredChol(X), false) + dofit ? fit(res; fitargs...) : res end -glm(e::Expr, df::AbstractDataFrame, d::UnivariateDistribution, l::Link) = glm(Formula(e),df,d,l) -glm(e::Expr, df::AbstractDataFrame, d::UnivariateDistribution) = glm(Formula(e),df,d,canonicallink(d)) -glm(f::Formula, df::AbstractDataFrame, d::UnivariateDistribution) = glm(f, df, d, canonicallink(d)) -glm(s::String, df::AbstractDataFrame, d::UnivariateDistribution) = glm(Formula(parse(s)[1]), df, d) +StatsBase.fit(::Type{GlmMod}, X::Matrix, y::AbstractVector, args...; kwargs...) = + fit(GlmMod, float(X), float(y), args...; kwargs...) ## scale(m) -> estimate, s, of the scale parameter ## scale(m,true) -> estimate, s^2, of the squared scale parameter diff --git a/src/linpred.jl b/src/linpred.jl index bdb3f7fb..eb18beb3 100644 --- a/src/linpred.jl +++ b/src/linpred.jl @@ -1,7 +1,7 @@ ## Return the linear predictor vector -linpred(p::LinPred, f::Real=1.) = p.X.m * (f == 0. ? p.beta0 : fma(p.beta0, p.delbeta, f)) +linpred(p::LinPred, f::Real=1.) = p.X * (f == 0. ? p.beta0 : fma(p.beta0, p.delbeta, f)) ## Install beta0 + f*delbeta as beta0 and zero out delbeta function installbeta!(p::LinPred, f::Real=1.) @@ -13,45 +13,45 @@ end typealias BlasReal Union(Float32,Float64) type DensePredQR{T<:BlasReal} <: DensePred - X::ModelMatrix{T} # model matrix + X::Matrix{T} # model matrix beta0::Vector{T} # base coefficient vector delbeta::Vector{T} # coefficient increment qr::QRCompactWY{T} - function DensePredQR(X::ModelMatrix{T}, beta0::Vector{T}) + function DensePredQR(X::Matrix{T}, beta0::Vector{T}) n,p = size(X); length(beta0) == p || error("dimension mismatch") - new(X, beta0, zeros(T,p), qrfact(X.m)) + new(X, beta0, zeros(T,p), qrfact(X)) end end -DensePredQR{T<:BlasReal}(X::ModelMatrix{T}) = DensePredQR{T}(X, zeros(T,size(X,2))) +DensePredQR{T<:BlasReal}(X::Matrix{T}) = DensePredQR{T}(X, zeros(T,size(X,2))) cholfact{T<:FP}(p::DensePredQR{T}) = Cholesky{T}(p.qr[:R],'U') delbeta!{T<:BlasReal}(p::DensePredQR{T}, r::Vector{T}) = (p.delbeta = p.qr\r; p) type DensePredChol{T<:BlasReal} <: DensePred - X::ModelMatrix{T} # model matrix - beta0::Vector{T} # base vector for coefficients - delbeta::Vector{T} # coefficient increment + X::Matrix{T} # model matrix + beta0::Vector{T} # base vector for coefficients + delbeta::Vector{T} # coefficient increment chol::Cholesky{T} - function DensePredChol(X::ModelMatrix{T}, beta0::Vector{T}) + function DensePredChol(X::Matrix{T}, beta0::Vector{T}) n,p = size(X); length(beta0) == p || error("dimension mismatch") - new(X, beta0, zeros(T,p), cholfact(X.m'X.m)) + new(X, beta0, zeros(T,p), cholfact(X'X)) end end -DensePredChol{T<:BlasReal}(X::ModelMatrix{T}) = DensePredChol{T}(X, zeros(T,size(X,2))) +DensePredChol{T<:BlasReal}(X::Matrix{T}) = DensePredChol{T}(X, zeros(T,size(X,2))) solve!{T<:BlasReal}(C::Cholesky{T}, B::StridedVecOrMat{T}) = potrs!(C.uplo, C.UL, B) cholfact{T<:FP}(p::DensePredChol{T}) = (c = p.chol; Cholesky{T}(copy(c.UL),c.uplo)) function delbeta!{T<:BlasReal}(p::DensePredChol{T}, r::Vector{T}) - solve!(p.chol, gemv!('T', 1.0, p.X.m, r, 0.0, p.delbeta)) + solve!(p.chol, gemv!('T', 1.0, p.X, r, 0.0, p.delbeta)) p end function delbeta!{T<:BlasReal}(p::DensePredChol{T}, r::Vector{T}, wt::Vector{T}, scr::Matrix{T}) - vbroadcast!(Multiply(), scr, p.X.m, wt, 1) - fac, info = potrf!('U', gemm!('T', 'N', 1.0, scr, p.X.m, 0.0, p.chol.UL)) + vbroadcast!(Multiply(), scr, p.X, wt, 1) + fac, info = potrf!('U', gemm!('T', 'N', 1.0, scr, p.X, 0.0, p.chol.UL)) info == 0 || error("Singularity detected at column $info of weighted model matrix") solve!(p.chol, gemv!('T', 1.0, scr, r, 0.0, p.delbeta)) p @@ -73,8 +73,7 @@ cor(x::LinPredModel) = (invstd = map(RcpFun(),stderr(x)); scale!(invstd,scale!(v stderr(x::LinPredModel) = sqrt(diag(vcov(x))) function show(io::IO, obj::LinPredModel) - println(io, "$(obj.ff)\n\nCoefficients:") - print(io, coeftable(obj)) + println(io, "$(typeof(obj)):\n\nCoefficients:", coeftable(obj)) end ## function show(io::IO, obj::GlmMod) diff --git a/src/lm.jl b/src/lm.jl index 5ebd662d..4a8cb7a0 100644 --- a/src/lm.jl +++ b/src/lm.jl @@ -26,32 +26,19 @@ result_type{T<:FP}(::WtResid,wt::T,y::T,mu::T) = T deviance(r::LmResp) = length(r.wts) == 0 ? sumsqdiff(r.y, r.mu) : wsumsqdiff(r.wts,r.y,r.mu) residuals(r::LmResp)= length(r.wts) == 0 ? r.y - r.mu : map(WtResid(),r.wts,r.y,r.mu) -type LmMod <: LinPredModel - fr::ModelFrame +type LmMod{T<:LinPred} <: LinPredModel rr::LmResp - pp::LinPred - ff::Formula + pp::T end cholfact(x::LmMod) = cholfact(x.pp) -function lm(f::Formula, df::AbstractDataFrame) - mf = ModelFrame(f, df); mm = ModelMatrix(mf) - rr = LmResp(float(model_response(mf))); pp = DensePredQR(mm) +function StatsBase.fit{LinPredT<:LinPred}(::Type{LmMod{LinPredT}}, X::Matrix, y::Vector) + rr = LmResp(float(y)); pp = LinPredT(X) installbeta!(delbeta!(pp, rr.y)); updatemu!(rr, linpred(pp,0.)) - LmMod(mf, rr, pp, f) + LmMod(rr, pp) end -lm(f::Expr, df::AbstractDataFrame) = lm(Formula(f), df) -lm(f::String, df::AbstractDataFrame) = lm(Formula(parse(f)[1]), df) - -function lmc(f::Formula, df::AbstractDataFrame) - mf = ModelFrame(f, df); mm = ModelMatrix(mf) - rr = LmResp(model_response(mf)); pp = DensePredChol(mm) - installbeta!(delbeta!(pp, rr.y)); updatemu!(rr, linpred(pp,0.)) - LmMod(mf, rr, pp, f) -end -lmc(f::Expr, df::AbstractDataFrame) = lmc(Formula(f), df) -lmc(f::String, df::AbstractDataFrame) = lmc(Formula(parse(f)[1]), df) +StatsBase.fit(::Type{LmMod}, X::Matrix, y::Vector) = StatsBase.fit(LmMod{DensePredQR}, X, y) ## scale(m) -> estimate, s, of the scale parameter ## scale(m,true) -> estimate, s^2, of the squared scale parameter @@ -66,7 +53,7 @@ function coeftable(mm::LmMod) tt = cc ./ se CoefTable(hcat(cc,se,tt,ccdf(FDist(1, df_residual(mm)), abs2(tt))), ["Estimate","Std.Error","t value", "Pr(>|t|)"], - coefnames(mm.fr), 4) + ["x$i" for i = 1:size(mm.pp.X, 2)], 4) end predict(mm::LmMod, newx::Matrix) = newx * coef(mm) diff --git a/test/glmFit.jl b/test/glmFit.jl index e3b87cd6..ff01d2ee 100644 --- a/test/glmFit.jl +++ b/test/glmFit.jl @@ -1,13 +1,12 @@ -using Base.Test -using GLM +using Base.Test, GLM, DataFrames ## Formaldehyde data from the R Datasets package form = DataFrame(Carb=[0.1,0.3,0.5,0.6,0.7,0.9],OptDen=[0.086,0.269,0.446,0.538,0.626,0.782]) -lm1 = lm(OptDen ~ Carb, form) +lm1 = fit(LmMod, OptDen ~ Carb, form) @test_approx_eq coef(lm1) linreg(array(form[:Carb]),array(form[:OptDen])) dobson = DataFrame(Counts=[18.,17,15,20,10,20,25,13,12], Outcome=gl(3,1,9), Treatment=gl(3,3)) -gm1 = glm(Counts ~ Outcome + Treatment, dobson, Poisson()); +gm1 = fit(GlmMod, Counts ~ Outcome + Treatment, dobson, Poisson()); @test_approx_eq deviance(gm1) 5.12914107700115 @test_approx_eq coef(gm1)[1:3] [3.044522437723423,-0.45425527227759555,-0.29298712468147375] @@ -15,54 +14,54 @@ gm1 = glm(Counts ~ Outcome + Treatment, dobson, Poisson()); df = readtable(Pkg.dir("GLM","data","admit.csv.gz")) df[:rank] = pool(df[:rank]) -gm2 = glm(admit ~ gre + gpa + rank, df, Binomial()) +gm2 = fit(GlmMod, admit ~ gre + gpa + rank, df, Binomial()) @test_approx_eq deviance(gm2) 458.5174924758994 @test_approx_eq coef(gm2) [-3.9899786606380734,0.0022644256521549043,0.8040374535155766,-0.6754428594116577,-1.3402038117481079,-1.5514636444657492] -gm3 = glm(admit ~ gre + gpa + rank, df, Binomial(), ProbitLink()) +gm3 = fit(GlmMod, admit ~ gre + gpa + rank, df, Binomial(), ProbitLink()) @test_approx_eq deviance(gm3) 458.4131713833386 @test_approx_eq coef(gm3) [-2.3867922998680786,0.0013755394922972369,0.47772908362647015,-0.4154125854823675,-0.8121458010130356,-0.9359047862425298] -gm4 = glm(admit ~ gre + gpa + rank, df, Binomial(), CauchitLink()) +gm4 = fit(GlmMod, admit ~ gre + gpa + rank, df, Binomial(), CauchitLink()) @test_approx_eq deviance(gm4) 459.3401112751141 -gm5 = glm(admit ~ gre + gpa + rank, df, Binomial(), CloglogLink()) +gm5 = fit(GlmMod, admit ~ gre + gpa + rank, df, Binomial(), CloglogLink()) @test_approx_eq deviance(gm5) 458.89439629612616 ## Example with offsets from Venables & Ripley (2002, p.189) df = readtable(Pkg.dir("GLM","data","anorexia.csv.gz")) df[:Treat] = pool(df[:Treat]) -gm6 = glm(Postwt ~ Prewt + Treat, df, Normal(), IdentityLink(), offset=array(df[:Prewt])) +gm6 = fit(GlmMod, Postwt ~ Prewt + Treat, df, Normal(), IdentityLink(), offset=array(df[:Prewt])) @test_approx_eq deviance(gm6) 3311.262619919613 @test_approx_eq coef(gm6) [49.7711090149846,-0.5655388496391,-4.0970655280729,4.5630626529188] -@test_approx_eq scale(gm6, true) 48.6950385282296 +@test_approx_eq scale(gm6.model, true) 48.6950385282296 @test_approx_eq stderr(gm6) [13.3909581420259,0.1611823618518,1.8934926069669,2.1333359226431] -gm7 = fit(glm(Postwt ~ Prewt + Treat, df, Normal(), LogLink(), offset=array(df[:Prewt]), dofit=false), +gm7 = fit(GlmMod, Postwt ~ Prewt + Treat, df, Normal(), LogLink(), offset=array(df[:Prewt]), convTol=1e-8) @test_approx_eq deviance(gm7) 3265.207242977156 @test_approx_eq coef(gm7) [3.992326787835955,-0.994452693131178,-0.050698258703974,0.051494029957641] -@test_approx_eq scale(gm7, true) 48.01787789178518 +@test_approx_eq scale(gm7.model, true) 48.01787789178518 @test_approx_eq stderr(gm7) [0.157167944259695,0.001886285986164,0.022584069426311,0.023882826190166] ## Gamma example from McCullagh & Nelder (1989, pp. 300-2) clotting = DataFrame(u = log([5,10,15,20,30,40,60,80,100]), lot1 = [118,58,42,35,27,25,21,19,18]) -gm8 = glm(lot1 ~ u, clotting, Gamma()) +gm8 = fit(GlmMod, lot1 ~ u, clotting, Gamma()) @test_approx_eq deviance(gm8) 0.01672971517848353 @test_approx_eq coef(gm8) [-0.01655438172784895,0.01534311491072141] -@test_approx_eq scale(gm8, true) 0.002446059333495581 +@test_approx_eq scale(gm8.model, true) 0.002446059333495581 @test_approx_eq stderr(gm8) [0.0009275466067257,0.0004149596425600] -gm9 = fit(glm(lot1 ~ u, clotting, Gamma(), LogLink(), dofit=false), convTol=1e-8) +gm9 = fit(GlmMod, lot1 ~ u, clotting, Gamma(), LogLink(), convTol=1e-8) @test_approx_eq deviance(gm9) 0.16260829451739 @test_approx_eq coef(gm9) [5.50322528458221,-0.60191617825971] -@test_approx_eq scale(gm9, true) 0.02435442293561081 +@test_approx_eq scale(gm9.model, true) 0.02435442293561081 @test_approx_eq stderr(gm9) [0.19030107482720,0.05530784660144] -gm10 = fit(glm(lot1 ~ u, clotting, Gamma(), IdentityLink(), dofit=false), convTol=1e-8) +gm10 = fit(GlmMod, lot1 ~ u, clotting, Gamma(), IdentityLink(), convTol=1e-8) @test_approx_eq deviance(gm10) 0.60845414895344 @test_approx_eq coef(gm10) [99.250446880986,-18.374324929002] -@test_approx_eq scale(gm10, true) 0.1041772704067886 +@test_approx_eq scale(gm10.model, true) 0.1041772704067886 @test_approx_eq stderr(gm10) [17.864388462865,4.297968703823]