Skip to content

Commit

Permalink
Merge 0e0d38e into 28c9f1f
Browse files Browse the repository at this point in the history
  • Loading branch information
dmbates authored Jan 29, 2021
2 parents 28c9f1f + 0e0d38e commit e3ed925
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 5 deletions.
22 changes: 17 additions & 5 deletions src/PooledArrays.jl
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ export PooledArray, PooledVector, PooledMatrix
##############################################################################

# Default integer type for pool references when unsigned references are used.
const DEFAULT_POOLED_REF_TYPE = UInt32
# Default integer type for pool references when signed references are requested
# (`signed=true` without `compress=true`).
const DEFAULT_SIGNED_REF_TYPE = Int32

# This is used as a wrapper during PooledArray construction only, to distinguish
# arrays of pool indices from normal arrays
Expand Down Expand Up @@ -98,15 +99,23 @@ end
# `_widen(R)` returns the next-wider integer type with the same signedness as `R`,
# one step at a time (8 -> 16 -> 32 -> 64 bits). No method exists for 64-bit types.
for (narrow, wide) in (
    (UInt8, UInt16), (UInt16, UInt32), (UInt32, UInt64),
    (Int8, Int16), (Int16, Int32), (Int32, Int64),
)
    @eval _widen(::Type{$narrow}) = $wide
end
# Constructor from array, invpool, and ref type

"""
PooledArray(array, [reftype])
PooledArray(array, [reftype]; signed=false, compress=false)
Freshly allocate `PooledArray` using the given array as a source where each
element will be referenced as an integer of the given type.
If no `reftype` is specified one is chosen automatically based on the number of unique elements.
The Boolean keyword arguments, `signed` and `compress`, determine the choice of `reftype`.
By default, unsigned integers are used, as they have a greater `typemax` than signed integers
of the same size. However, the Arrow standard at https://arrow.apache.org/, as implemented in
the Arrow package, requires signed integer types, which are provided when `signed` is `true`.
The `compress` argument controls whether the default size of 32 bits is used (`UInt32` for
unsigned, `Int32` for signed) or if smaller integer types are chosen when they can be used.
If `array` is not a `PooledArray` then the order of elements in `refpool` in the resulting
`PooledArray` is the order of first appearance of elements in `array`.
"""
Expand All @@ -123,13 +132,16 @@ function PooledArray{T}(d::AbstractArray, r::Type{R}) where {T,R<:Integer}
PooledArray(RefArray(refs::Vector{R}), invpool::Dict{T,R}, pool)
end

function PooledArray{T}(d::AbstractArray) where T
refs, invpool, pool = _label(d, T)
# Build a `PooledArray{T}` from `d`, choosing the reference type from the keywords:
# `compress=true` starts from the 8-bit type, otherwise the 32-bit default is used;
# `signed=true` selects the signed variant in either case.
# NOTE(review): the tests expect a wider type when the pool does not fit the 8-bit
# type; presumably `_label` performs that widening — confirm against `_label`.
function PooledArray{T}(d::AbstractArray; signed::Bool=false, compress::Bool=false) where {T}
    R = if signed
        compress ? Int8 : DEFAULT_SIGNED_REF_TYPE
    else
        compress ? UInt8 : DEFAULT_POOLED_REF_TYPE
    end
    refs, invpool, pool = _label(d, T, R)
    return PooledArray(RefArray(refs), invpool, pool)
end

# Take the element type from `d` and forward to the typed constructor together with
# the requested reference type `r`.
function PooledArray(d::AbstractArray{T}, r::Type) where {T}
    return PooledArray{T}(d, r)
end
PooledArray(d::AbstractArray{T}) where {T} = PooledArray{T}(d)
# Take the element type from `d` and forward to the typed constructor, passing the
# `signed` and `compress` keywords through unchanged.
function PooledArray(d::AbstractArray{T}; signed::Bool=false, compress::Bool=false) where {T}
    return PooledArray{T}(d; signed=signed, compress=compress)
end

# Construct an empty PooledVector of a specific type.
# The source vector is built with `Vector{t}()`: the old `Array(t, 0)` form is not
# available on Julia >= 1.0 and raises a `MethodError` there.
PooledArray(t::Type) = PooledArray(Vector{t}())
Expand Down
5 changes: 5 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,11 @@ using DataAPI: refarray, refvalue, refpool
@test PooledMatrix == PooledArray{T, R, 2} where {T, R}

s = PooledArray(["a", "a", "b"])
# Reference type chosen by the `signed` / `compress` keywords:
# the default is UInt32, `signed=true` gives Int32, and `compress=true` gives the
# 8-bit type of the requested signedness when the pool is small enough.
@test eltype(PooledArray(s).refs) == UInt32
@test eltype(PooledArray(s, signed=true).refs) == Int32
@test eltype(PooledArray(s, compress=true).refs) == UInt8
@test eltype(PooledArray(s, signed=true, compress=true).refs) == Int8
# 300 random values (presumably all distinct) exceed typemax(Int8) == 127, so the
# compressed signed reference type is expected to be Int16.
@test eltype(PooledArray(rand(300), signed=true, compress=true).refs) == Int16
# References follow order of first appearance: "a" -> 1, "b" -> 2.
@test all(refarray(s) .== [1, 1, 2])
for i in 1:3
@test refvalue(s, refarray(s)[i]) == s[i]
Expand Down

0 comments on commit e3ed925

Please sign in to comment.