From 8ad809cd81c0d0ad9d3f798daf92a04488b8c546 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Thu, 28 Apr 2022 10:53:13 +0200 Subject: [PATCH 1/3] fix handling of variable_eltype in stack --- src/abstractdataframe/reshape.jl | 4 ++-- test/reshape.jl | 14 ++++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index a864930808..6013259c87 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -164,8 +164,8 @@ function stack(df::AbstractDataFrame, # (note that copyto! inserts levels in their order of appearance) nms = names(df, ints_measure_vars) simnms = similar(nms, variable_eltype) - catnms = simnms isa Vector ? PooledArray(catnms) : simnms - copyto!(catnms, nms) + copyto!(simnms, nms) + catnms = simnms isa Vector ? PooledArray(simnms) : simnms end return DataFrame(AbstractVector[[repeat(df[!, c], outer=N) for c in ints_id_vars]..., # id_var columns repeat(catnms, inner=nrow(df)), # variable diff --git a/test/reshape.jl b/test/reshape.jl index f3d7c01158..fdd97f77a3 100644 --- a/test/reshape.jl +++ b/test/reshape.jl @@ -845,4 +845,18 @@ end end end +@testset "variable_eltype in stack tests" begin + df = DataFrame(A = 1:3, B = [2.0, -1.1, 2.8], C = ["p","q","r"]) + @test_throws MethodError stack(df, :C, variable_name=:D, variable_eltype=Int) + for T in (AbstractString, Any) + sdf = stack(df, [:A, :B], variable_name=:D, variable_eltype=T) + @test sdf == DataFrame(C=["p", "q", "r", "p", "q", "r"], + D=["A", "A", "A", "B", "B", "B"], + value=[1.0, 2.0, 3.0, 2.0, -1.1, 2.8]) + @test sdf.C isa Vector{String} + @test sdf.value isa Vector{Float64} + @test sdf.D isa PooledVector{T} + end +end + end # module From e475cd33b6e953c0aa095019d4595f64d8577ad9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Fri, 29 Apr 2022 19:59:44 +0200 Subject: [PATCH 2/3] Apply suggestions from code review Co-authored-by: Milan Bouchet-Valat --- test/reshape.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/reshape.jl b/test/reshape.jl index fdd97f77a3..1f867642d4 100644 --- a/test/reshape.jl +++ b/test/reshape.jl @@ -846,13 +846,13 @@ end end @testset "variable_eltype in stack tests" begin - df = DataFrame(A = 1:3, B = [2.0, -1.1, 2.8], C = ["p","q","r"]) + df = DataFrame(A=1:3, B=[2.0, -1.1, 2.8], C=["p", "q", "r"]) @test_throws MethodError stack(df, :C, variable_name=:D, variable_eltype=Int) for T in (AbstractString, Any) sdf = stack(df, [:A, :B], variable_name=:D, variable_eltype=T) @test sdf == DataFrame(C=["p", "q", "r", "p", "q", "r"], - D=["A", "A", "A", "B", "B", "B"], - value=[1.0, 2.0, 3.0, 2.0, -1.1, 2.8]) + D=["A", "A", "A", "B", "B", "B"], + value=[1.0, 2.0, 3.0, 2.0, -1.1, 2.8]) @test sdf.C isa Vector{String} @test sdf.value isa Vector{Float64} @test sdf.D isa PooledVector{T} From ae8c7634a989311f40e3d82a210494458dd7194f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bogumi=C5=82=20Kami=C5=84ski?= Date: Sat, 30 Apr 2022 09:42:39 +0200 Subject: [PATCH 3/3] minor performance improvement --- Project.toml | 2 +- src/abstractdataframe/reshape.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index ebc02f7ce2..44a8c58a5e 100644 --- a/Project.toml +++ b/Project.toml @@ -30,7 +30,7 @@ DataAPI = "1.10" InvertedIndices = "1" IteratorInterfaceExtensions = "0.1.1, 1" Missings = "0.4.2, 1" -PooledArrays = "1.3.0" +PooledArrays = "1.4.2" PrettyTables = "0.12, 1" Reexport = "0.1, 0.2, 1" ShiftedArrays = "1" diff --git a/src/abstractdataframe/reshape.jl b/src/abstractdataframe/reshape.jl index 6013259c87..f0cfa63aa3 100644 --- a/src/abstractdataframe/reshape.jl +++ b/src/abstractdataframe/reshape.jl @@ -164,8 +164,8 @@ function stack(df::AbstractDataFrame, # (note that copyto! inserts levels in their order of appearance) nms = names(df, ints_measure_vars) simnms = similar(nms, variable_eltype) - copyto!(simnms, nms) catnms = simnms isa Vector ? PooledArray(simnms) : simnms + copyto!(catnms, nms) end return DataFrame(AbstractVector[[repeat(df[!, c], outer=N) for c in ints_id_vars]..., # id_var columns repeat(catnms, inner=nrow(df)), # variable