Skip to content

Commit

Permalink
minimize rows used to detect column types
Browse files Browse the repository at this point in the history
Use 20 rows by default and define the Dataset_typedetect_rows dictionary,
which stores the exceptions.
  • Loading branch information
alyst committed Nov 27, 2017
1 parent 01fb1ab commit 045a0cf
Showing 1 changed file with 38 additions and 1 deletion.
39 changes: 38 additions & 1 deletion src/dataset.jl
Original file line number Diff line number Diff line change
@@ -1,3 +1,40 @@
# Per-dataset overrides for column type detection.
#
# Keys are `(package_name, dataset_name)` tuples; each value is the number of
# rows that must be scanned for that dataset's column types to be detected
# properly. Datasets that are not listed here need no special treatment and
# are read with the caller's default row count.
const Dataset_typedetect_rows = Dict{Tuple{String, String}, Int}(
    # COUNT
    ("COUNT", "loomis") => 50,

    # Ecdat
    ("Ecdat", "MCAS") => 50,
    ("Ecdat", "Mofa") => 30,

    # KMsurv
    ("KMsurv", "bcdeter") => 100,

    # Zelig
    ("Zelig", "SupremeCourt") => 30,

    # adehabitatLT
    ("adehabitatLT", "albatross") => 1000,
    ("adehabitatLT", "bear") => 1200,
    ("adehabitatLT", "buffalo") => 1500,
    ("adehabitatLT", "capreotf") => 600,
    ("adehabitatLT", "hseal") => 150,
    ("adehabitatLT", "ibex") => 100,
    ("adehabitatLT", "mouflon") => 200,
    ("adehabitatLT", "porpoise") => 100,
    ("adehabitatLT", "puechcirc") => 100,
    ("adehabitatLT", "rupicabau") => 100,
    ("adehabitatLT", "whale") => 200,

    # boot
    ("boot", "neuro") => 500,
    ("boot", "urine") => 100,

    # datasets
    ("datasets", "attenu") => 200,

    # gap
    ("gap", "PD") => 500,
    ("gap", "aldh2") => 150,
    ("gap", "mao") => 200,

    # plyr
    ("plyr", "baseball") => 1000,

    # pscl
    ("pscl", "UKHouseOfCommons") => 200,
    ("pscl", "ca2006") => 50,

    # psych
    ("psych", "bfi") => 1000,
    ("psych", "iqitems") => 150,
    ("psych", "sat.act") => 150,

    # robustbase
    ("robustbase", "ambientNOxCH") => 100,

    # sandwich
    ("sandwich", "PublicSchools") => 100,

    # survival
    ("survival", "cancer") => 250,
    ("survival", "lung") => 250,

    # vcd
    ("vcd", "Bundesliga") => 15000,
)

function dataset(package_name::AbstractString, dataset_name::AbstractString)
basename = joinpath(dirname(@__FILE__), "..", "data", package_name)

Expand All @@ -10,7 +47,7 @@ function dataset(package_name::AbstractString, dataset_name::AbstractString)
if isfile(csvname)
return open(GzipDecompressorStream, csvname, "r") do io
CSV.read(io, delim=',', quotechar='\"', null="NA",
rows_for_type_detect=15000)
rows_for_type_detect=get(Dataset_typedetect_rows, (package_name, dataset_name), 20))
end
end
error(@sprintf "Unable to locate dataset file %s or %s" rdaname csvname)
Expand Down

0 comments on commit 045a0cf

Please sign in to comment.