minimize rows used to detect column types

Use 20 rows by default, and define the Dataset_typedetect_rows dictionary, which stores the exceptions (datasets that need more rows for correct type detection).
JuliaStats · Nov 27, 2017 · 045a0cf · 045a0cf
1 parent 01fb1ab
commit 045a0cf
Showing 1 changed file with 38 additions and 1 deletion.
diff --git a/src/dataset.jl b/src/dataset.jl
@@ -1,3 +1,40 @@
+# Per-dataset overrides, keyed by (package_name, dataset_name): rows to scan for proper column type detection (datasets not listed use the default of 20)
+const Dataset_typedetect_rows = Dict{Tuple{String, String}, Int}(
+    ("COUNT", "loomis") => 50,
+    ("Ecdat", "MCAS") => 50,
+    ("Ecdat", "Mofa") => 30,
+    ("KMsurv", "bcdeter") => 100,
+    ("Zelig", "SupremeCourt") => 30,
+    ("adehabitatLT", "albatross") => 1000,
+    ("adehabitatLT", "bear") => 1200,
+    ("adehabitatLT", "buffalo") => 1500,
+    ("adehabitatLT", "capreotf") => 600,
+    ("adehabitatLT", "hseal") => 150,
+    ("adehabitatLT", "ibex") => 100,
+    ("adehabitatLT", "mouflon") => 200,
+    ("adehabitatLT", "porpoise") => 100,
+    ("adehabitatLT", "puechcirc") => 100,
+    ("adehabitatLT", "rupicabau") => 100,
+    ("adehabitatLT", "whale") => 200,
+    ("boot", "neuro") => 500,
+    ("boot", "urine") => 100,
+    ("datasets", "attenu") => 200,
+    ("gap", "PD") => 500,
+    ("gap", "aldh2") => 150,
+    ("gap", "mao") => 200,
+    ("plyr", "baseball") => 1000,
+    ("pscl", "UKHouseOfCommons") => 200,
+    ("pscl", "ca2006") => 50,
+    ("psych", "bfi") => 1000,
+    ("psych", "iqitems") => 150,
+    ("psych", "sat.act") => 150,
+    ("robustbase", "ambientNOxCH") => 100,
+    ("sandwich", "PublicSchools") => 100,
+    ("survival", "cancer") => 250,
+    ("survival", "lung") => 250,
+    ("vcd", "Bundesliga") => 15000,
+)
+
 function dataset(package_name::AbstractString, dataset_name::AbstractString)
     basename = joinpath(dirname(@__FILE__), "..", "data", package_name)
 
@@ -10,7 +47,7 @@ function dataset(package_name::AbstractString, dataset_name::AbstractString)
     if isfile(csvname)
         return open(GzipDecompressorStream, csvname, "r") do io
             CSV.read(io, delim=',', quotechar='\"', null="NA",
-                     rows_for_type_detect=15000)
+                     rows_for_type_detect=get(Dataset_typedetect_rows, (package_name, dataset_name), 20))
         end
     end
     error(@sprintf "Unable to locate dataset file %s or %s" rdaname csvname)