Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: unstack receives kwarg fillvalue #2828

Merged
merged 11 commits into from
Sep 8, 2021
6 changes: 6 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,12 @@
(notably `PooledArray` and `CategoricalArray`) or when they contained only
integers in a small range.
([#2812](https://github.com/JuliaData/DataFrames.jl/pull/2812))
* the `unstack` function receives a new keyword argument `fill`
(with `missing` as the default) that is used to fill row/column combinations
that were not encountered in the data. This makes it possible to distinguish
missings in the value column from row/column combinations that are simply absent,
and to easily fill non-existing combinations with zeros when counting.
([#2828](https://github.com/JuliaData/DataFrames.jl/pull/2828))

* Allow adding new columns to a `SubDataFrame` created with `:` as column selector
([#2794](https://github.com/JuliaData/DataFrames.jl/pull/2794)).
Expand Down
49 changes: 37 additions & 12 deletions src/abstractdataframe/reshape.jl
Original file line number Diff line number Diff line change
Expand Up @@ -198,11 +198,11 @@ end

"""
unstack(df::AbstractDataFrame, rowkeys, colkey, value; renamecols::Function=identity,
allowmissing::Bool=false, allowduplicates::Bool=false)
allowmissing::Bool=false, allowduplicates::Bool=false, fill=missing)
unstack(df::AbstractDataFrame, colkey, value; renamecols::Function=identity,
allowmissing::Bool=false, allowduplicates::Bool=false)
allowmissing::Bool=false, allowduplicates::Bool=false, fill=missing)
unstack(df::AbstractDataFrame; renamecols::Function=identity,
allowmissing::Bool=false, allowduplicates::Bool=false)
allowmissing::Bool=false, allowduplicates::Bool=false, fill=missing)

Unstack data frame `df`, i.e. convert it from long to wide format.

Expand All @@ -229,6 +229,10 @@ Row and column keys will be ordered in the order of their first appearance.
- `allowduplicates`: if `false` (the default) then an error will be thrown
if combination of `rowkeys` and `colkey` contains duplicate entries; if `true`
then the last encountered `value` will be retained.
- `fill`: missing row/column combinations are filled with this value. The default
is `missing`. If the `value` column is a `CategoricalVector` and `fill`
bkamins marked this conversation as resolved.
Show resolved Hide resolved
is not `missing`, then `fill` must be passed as a `CategoricalValue` in order
for the unstacked value columns to also be `CategoricalVector`s.

# Examples

Expand Down Expand Up @@ -331,36 +335,55 @@ julia> unstack(long, :id, :variable, :value, renamecols=x->Symbol(:_, x))
4 │ 4 2.0 1.0 2.0
5 │ 5 2.0 1.0 3.0
6 │ 6 2.0 1.0 3.0

julia> df = DataFrame(id=["1", "1", "2"],
variable=["Var1", "Var2", "Var1"],
value=[1, 2, 3])
3×3 DataFrame
Row │ id variable value
│ String String Int64
─────┼─────────────────────────
1 │ 1 Var1 1
2 │ 1 Var2 2
3 │ 2 Var1 3

julia> unstack(df, :variable, :value, fill=0)
2×3 DataFrame
Row │ id Var1 Var2
│ String Int64 Int64
─────┼──────────────────────
1 │ 1 1 2
2 │ 2 3 0
```
Note that there are some differences between the widened results above.
"""
function unstack(df::AbstractDataFrame, rowkeys, colkey::ColumnIndex,
value::ColumnIndex; renamecols::Function=identity,
allowmissing::Bool=false, allowduplicates::Bool=false)
allowmissing::Bool=false, allowduplicates::Bool=false, fill=missing)
# NOTE(review): the two signature tails above differ only by `fill=missing`;
# this looks like the diff view showing the removed line next to the added one.

# Resolve `rowkeys` through the data frame index; `vcat` is applied so that the
# result is a vector (the assertion below checks that it is a vector of `Int`).
rowkey_ints = vcat(index(df)[rowkeys])
@assert rowkey_ints isa AbstractVector{Int}
# A selector that matches no columns leaves nothing to identify rows by.
length(rowkey_ints) == 0 && throw(ArgumentError("No key column found"))
# Group once by the row-key columns and once by the column-key column;
# both groupings are handed to `_unstack`.
g_rowkey = groupby(df, rowkey_ints)
g_colkey = groupby(df, colkey)
valuecol = df[!, value]
# Delegate to `_unstack`. NOTE(review): as with the signature, the two argument
# tails below differ only by the trailing `fill` (removed vs. added diff line).
return _unstack(df, rowkey_ints, index(df)[colkey], g_colkey,
valuecol, g_rowkey, renamecols, allowmissing, allowduplicates)
valuecol, g_rowkey, renamecols, allowmissing, allowduplicates, fill)
end

# Method without explicit row keys: every column other than `colkey` and `value`
# is used as a row key, and all keyword arguments are forwarded unchanged.
function unstack(df::AbstractDataFrame, colkey::ColumnIndex, value::ColumnIndex;
renamecols::Function=identity,
allowmissing::Bool=false, allowduplicates::Bool=false)
allowmissing::Bool=false, allowduplicates::Bool=false, fill=missing)
# NOTE(review): the two signature tails above differ only by `fill=missing`;
# this looks like the diff view showing the removed line next to the added one.

# Look both columns up in the data frame index so they can be excluded via `Not`.
colkey_int = index(df)[colkey]
value_int = index(df)[value]
# NOTE(review): the last two lines of the call below are again the removed and
# the added variant of the same line (without and with `fill=fill`).
return unstack(df, Not(colkey_int, value_int), colkey_int, value_int,
renamecols=renamecols, allowmissing=allowmissing,
allowduplicates=allowduplicates)
allowduplicates=allowduplicates, fill=fill)
end

# Convenience method: uses `:variable` as the column key and `:value` as the
# values column, forwarding all keyword arguments.
# NOTE(review): the signature tail and the call tail each appear twice below,
# without and with `fill`; this looks like removed/added diff lines shown together.
unstack(df::AbstractDataFrame; renamecols::Function=identity,
allowmissing::Bool=false, allowduplicates::Bool=false) =
allowmissing::Bool=false, allowduplicates::Bool=false, fill=missing) =
unstack(df, :variable, :value, renamecols=renamecols, allowmissing=allowmissing,
allowduplicates=allowduplicates)
allowduplicates=allowduplicates, fill=fill)

# idx, starts and ends are computed lazily,
# so we reference gdf.groups directly instead
Expand Down Expand Up @@ -388,7 +411,7 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int},
colkey::Int, g_colkey::GroupedDataFrame,
valuecol::AbstractVector, g_rowkey::GroupedDataFrame,
renamecols::Function,
allowmissing::Bool, allowduplicates::Bool)
allowmissing::Bool, allowduplicates::Bool, fill)
rowref = g_rowkey.groups
row_group_row_idxs = find_group_row(g_rowkey)
Nrow = length(g_rowkey)
Expand All @@ -398,13 +421,15 @@ function _unstack(df::AbstractDataFrame, rowkeys::AbstractVector{Int},
Ncol = length(g_colkey)
col_group_row_idxs = find_group_row(g_colkey)
colref_map = df[col_group_row_idxs, colkey]

if any(ismissing, colref_map) && !allowmissing
throw(ArgumentError("Missing value in variable :$(_names(df)[colkey]). " *
"Pass `allowmissing=true` to skip missings."))
end
unstacked_val = [fill!(similar(valuecol,
promote_type(eltype(valuecol), typeof(fill)),
Nrow),
fill) for _ in 1:Ncol]

unstacked_val = [similar_missing(valuecol, Nrow) for i in 1:Ncol]
mask_filled = falses(Nrow, Ncol)

@assert length(rowref) == length(colref) == length(valuecol)
Expand Down
68 changes: 68 additions & 0 deletions test/reshape.jl
Original file line number Diff line number Diff line change
Expand Up @@ -654,6 +654,74 @@ end
@test IndexStyle(DataFrames.StackedVector) == IndexLinear()
end

@testset "unstack with fill" begin
# Every factory/variable combination is present in this data frame, so the
# fill value never shows up in the result cells.
df = DataFrame(factory=["Fac1", "Fac1", "Fac2", "Fac2"],
variable=["Var1", "Var2", "Var1", "Var2"],
value=[1, 2, 3, 4])
dfu1 = DataFrame(factory=["Fac1", "Fac2"],
Var1=allowmissing([1, 3]),
Var2=allowmissing([2, 4]))
# Default `fill=missing`: the unstacked columns allow missing values.
dfu = unstack(df, :variable, :value)
@test dfu ≅ dfu1
@test eltype(dfu.Var1) === Union{Missing, Int}
@test eltype(dfu.Var2) === Union{Missing, Int}

# Even though no cell is filled, the column eltype is expected to depend on
# the type of `fill` (each sentinel is paired with its expected eltype).
for (sentinel, coleltype) in zip([1, 1., "1", nothing], [Int, Float64, Any, Union{Int, Nothing}])
dfu = unstack(df, :variable, :value, fill=sentinel)
@test dfu ≅ dfu1
@test eltype(dfu.Var1) === coleltype
@test eltype(dfu.Var2) === coleltype
end

# The (Fac2, Var2) combination is absent here, so that cell must receive
# the fill value.
df = DataFrame(factory=["Fac1", "Fac1", "Fac2"],
variable=["Var1", "Var2", "Var1"],
value=[1, 2, 3])
for (sentinel, coleltype) in zip([1, 1.0, "1", nothing], [Int, Float64, Any, Union{Int, Nothing}])
dfu = unstack(df, :variable, :value, fill=sentinel)
@test dfu.Var1 == [1, 3]
@test eltype(dfu.Var1) === coleltype
@test dfu.Var2 == [2, sentinel]
@test eltype(dfu.Var2) === coleltype
end

# Same shape of data, but the value column is an ordered categorical vector.
df = DataFrame(factory=["Fac1", "Fac1", "Fac2"],
variable=["Var1", "Var2", "Var1"],
value=categorical([1, 2, 3], ordered=true))
# categorical is dropped here
for (sentinel, coleltype) in zip([0, 0.0, "", nothing], [Int, Float64, Any, Union{Int, Nothing}])
dfu = unstack(df, :variable, :value, fill=sentinel)
@test dfu.Var1 == [1, 3]
@test typeof(dfu.Var1) === Vector{coleltype}
@test dfu.Var2 == [2, sentinel]
@test typeof(dfu.Var2) === Vector{coleltype}
end
# categorical is kept here
for (sentinel, coleltype) in zip([missing, CategoricalValue(1, df.value), ], [Union{Int, Missing}, Int])
dfu = unstack(df, :variable, :value, fill=sentinel)
@test dfu.Var1 == [1, 3]
@test typeof(dfu.Var1) <: CategoricalVector{coleltype}
@test dfu.Var2 ≅ [2, sentinel]
@test typeof(dfu.Var2) <: CategoricalVector{coleltype}
@test levels(dfu.Var1) == levels(dfu.Var2) == levels(df.value)
end
bkamins marked this conversation as resolved.
Show resolved Hide resolved

# `fill` is a CategoricalValue built from a separate categorical vector: the
# result is expected to stay categorical with the fill level added to its levels.
df = DataFrame(factory=["Fac1", "Fac1", "Fac2"],
variable=["Var1", "Var2", "Var1"],
value=categorical([1, 2, 3]))
dfu = unstack(df, :variable, :value, fill=CategoricalValue(0, categorical([0])))
@test dfu.Var1 == [1, 3]
@test typeof(dfu.Var1) <: CategoricalVector{Int}
@test dfu.Var2 ≅ [2, 0]
@test typeof(dfu.Var2) <: CategoricalVector{Int}
@test levels(dfu.Var1) == levels(dfu.Var2) == 0:3
# A String-valued categorical fill is expected to widen the element type to
# Union{Int,String}.
dfu = unstack(df, :variable, :value, fill=CategoricalValue("0", categorical(["0"])))
@test dfu.Var1 == [1, 3]
@test typeof(dfu.Var1) <: CategoricalVector{Union{Int,String}}
@test dfu.Var2 ≅ [2, "0"]
@test typeof(dfu.Var2) <: CategoricalVector{Union{Int,String}}
@test levels(dfu.Var1) == levels(dfu.Var2) == ["0"; 1:3]
end

@testset "empty unstack" begin
df = DataFrame(a = [], b = [], c = [])
dfu = unstack(df, :b, :c)
Expand Down