From 045a0cf1f0fab34af46cf0efe904fd06d78413e5 Mon Sep 17 00:00:00 2001
From: Alexey Stukalov
Date: Mon, 27 Nov 2017 17:42:47 +0100
Subject: [PATCH] minimize rows used to detect column types

use 20 rows by default and define Dataset_typedetect_rows dictionary that stores the exceptions
---
 src/dataset.jl | 39 ++++++++++++++++++++++++++++++++++++++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/src/dataset.jl b/src/dataset.jl
index 00667ac..ff20dda 100644
--- a/src/dataset.jl
+++ b/src/dataset.jl
@@ -1,3 +1,40 @@
+# special cases for datasets that need more rows for proper column type detection
+const Dataset_typedetect_rows = Dict{Tuple{String, String}, Int}(
+    ("COUNT", "loomis") => 50,
+    ("Ecdat", "MCAS") => 50,
+    ("Ecdat", "Mofa") => 30,
+    ("KMsurv", "bcdeter") => 100,
+    ("Zelig", "SupremeCourt") => 30,
+    ("adehabitatLT", "albatross") => 1000,
+    ("adehabitatLT", "bear") => 1200,
+    ("adehabitatLT", "buffalo") => 1500,
+    ("adehabitatLT", "capreotf") => 600,
+    ("adehabitatLT", "hseal") => 150,
+    ("adehabitatLT", "ibex") => 100,
+    ("adehabitatLT", "mouflon") => 200,
+    ("adehabitatLT", "porpoise") => 100,
+    ("adehabitatLT", "puechcirc") => 100,
+    ("adehabitatLT", "rupicabau") => 100,
+    ("adehabitatLT", "whale") => 200,
+    ("boot", "neuro") => 500,
+    ("boot", "urine") => 100,
+    ("datasets", "attenu") => 200,
+    ("gap", "PD") => 500,
+    ("gap", "aldh2") => 150,
+    ("gap", "mao") => 200,
+    ("plyr", "baseball") => 1000,
+    ("pscl", "UKHouseOfCommons") => 200,
+    ("pscl", "ca2006") => 50,
+    ("psych", "bfi") => 1000,
+    ("psych", "iqitems") => 150,
+    ("psych", "sat.act") => 150,
+    ("robustbase", "ambientNOxCH") => 100,
+    ("sandwich", "PublicSchools") => 100,
+    ("survival", "cancer") => 250,
+    ("survival", "lung") => 250,
+    ("vcd", "Bundesliga") => 15000,
+)
+
 function dataset(package_name::AbstractString, dataset_name::AbstractString)
     basename = joinpath(dirname(@__FILE__), "..", "data", package_name)
 
@@ -10,7 +47,7 @@ function dataset(package_name::AbstractString, dataset_name::AbstractString)
     if isfile(csvname)
         return open(GzipDecompressorStream, csvname, "r") do io
             CSV.read(io, delim=',', quotechar='\"', null="NA",
-                     rows_for_type_detect=15000)
+                     rows_for_type_detect=get(Dataset_typedetect_rows, (package_name, dataset_name), 20))
         end
     end
     error(@sprintf "Unable to locate dataset file %s or %s" rdaname csvname)