Skip to content

Commit

Permalink
When delim=groupmark=x, treat x as delim unless input is quoted (#…
Browse files Browse the repository at this point in the history
…182)

* When delim=groupmark=`x`, treat `x` as delim

* Move groupmark checker to helper function

* Adapt tests for new `groupmark` handling

* Simplify tests

* More test cases (based on old test cases)

* Bump version

---------

Co-authored-by: Drvi <tomas.drvostep@gmail.com>
  • Loading branch information
nickrobinson251 and Drvi authored Nov 9, 2023
1 parent d1c6fc5 commit 0eb5f46
Show file tree
Hide file tree
Showing 6 changed files with 231 additions and 54 deletions.
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "Parsers"
uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
authors = ["quinnj <quinn.jacobd@gmail.com>"]
version = "2.7.2"
version = "2.8.0"

[deps]
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
Expand Down
61 changes: 54 additions & 7 deletions src/Parsers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ end
* `ignoreemptylines=false`: after parsing a value, if a newline is detected, another immediately following newline will be checked for and consumed
* `stripwhitespace=nothing`: if true, leading and trailing whitespace is stripped from string fields, note that for *quoted* strings however, whitespace is preserved within quotes (but ignored before/after quote characters). To also strip *within* quotes, see `stripquoted`
* `stripquoted=false`: if true, whitespace is also stripped within quoted strings. If true, `stripwhitespace` is also set to true.
* `groupmark=nothing`: optionally specify a single-byte character denoting the number grouping mark, this allows parsing of numbers that have, e.g., thousand separators (`1,000.00`).
* `groupmark=nothing`: optionally specify a single-byte character denoting the number grouping mark; this allows parsing of numbers that contain, e.g., thousands separators (`1,000.00`). When `groupmark` is the same character as `delim`, a number that contains group marks must be quoted; in unquoted input that character is treated as the delimiter.
* `rounding=RoundNearest`: optionally specify a rounding mode to use when parsing. No rounding means the result will be marked with `INEXACT` code if the value is not exactly representable in the target type.
"""
struct Options
Expand Down Expand Up @@ -141,12 +141,49 @@ function Base.getproperty(x::Options, nm::Symbol)
end
end

const OPTIONS = Options(Flags(false, false, false, false, false, false, false, false, false), UInt8('.'),
Token(UInt8('"')), Token(UInt8('"')), UInt8('"'), Token[], Token(""), Token(""),
nothing, nothing, nothing, nothing, nothing)
const XOPTIONS = Options(Flags(false, false, false, false, true, true, true, false, false), UInt8('.'),
Token(UInt8('"')), Token(UInt8('"')), UInt8('"'), Token[], Token(UInt8(',')), Token(""),
nothing, nothing, nothing, nothing, nothing)
# Build the default `Options` for single-value (i.e. not delimited) parsing, as
# used by Parsers.parse and Parsers.tryparse via Parsers.xparse2.
#
# Every field of `Options` can be overridden by keyword; anything not given keeps
# the default listed in the signature (notably an empty `delim` token and no
# `groupmark`). The keywords are forwarded positionally to the `Options`
# constructor in the order shown below.
function _get_default_options(;
    flags::Flags=Flags(false, false, false, false, false, false, false, false, false),
    decimal::UInt8=UInt8('.'),
    oq::Token=Token(UInt8('"')),
    cq::Token=Token(UInt8('"')),
    e::UInt8=UInt8('"'),
    sentinel::Vector{Token}=Token[],
    delim::Token=Token(""),
    cmt::Token=Token(""),
    trues::Union{Nothing, Vector{String}}=nothing,
    falses::Union{Nothing, Vector{String}}=nothing,
    dateformat::Union{Nothing, Format}=nothing,
    groupmark::Union{Nothing, UInt8}=nothing,
    rounding::Union{Nothing, RoundingMode}=nothing,
)
    return Options(
        flags, decimal, oq, cq, e, sentinel, delim, cmt,
        trues, falses, dateformat, groupmark, rounding,
    )
end

# Build the default `Options` for delimited parsing, as used by Parsers.xparse.
#
# Every field of `Options` can be overridden by keyword; anything not given keeps
# the default listed in the signature (notably `delim` is a comma, the fifth
# through seventh flags are enabled, and there is no `groupmark`). The keywords
# are forwarded positionally to the `Options` constructor in the order shown below.
function _get_default_xoptions(;
    flags::Flags=Flags(false, false, false, false, true, true, true, false, false),
    decimal::UInt8=UInt8('.'),
    oq::Token=Token(UInt8('"')),
    cq::Token=Token(UInt8('"')),
    e::UInt8=UInt8('"'),
    sentinel::Vector{Token}=Token[],
    delim::Token=Token(UInt8(',')),
    cmt::Token=Token(""),
    trues::Union{Nothing, Vector{String}}=nothing,
    falses::Union{Nothing, Vector{String}}=nothing,
    dateformat::Union{Nothing, Format}=nothing,
    groupmark::Union{Nothing, UInt8}=nothing,
    rounding::Union{Nothing, RoundingMode}=nothing,
)
    return Options(
        flags, decimal, oq, cq, e, sentinel, delim, cmt,
        trues, falses, dateformat, groupmark, rounding,
    )
end

# Options used by default in Parsers.parse, Parsers.tryparse and Parsers.xparse2
# (single-value parsing); built once from `_get_default_options()` with no overrides.
const OPTIONS = _get_default_options()
# Options used by default in Parsers.xparse (delimited parsing); built once from
# `_get_default_xoptions()` with no overrides.
const XOPTIONS = _get_default_xoptions()

# Sort `x` in place so that elements with the largest `sizeof` come first,
# and return the (same) sorted vector.
function prepare!(x::Vector)
    return sort!(x; by=sizeof, rev=true)
end
# Report whether the character `c` is an ASCII character.
function asciival(c::Char)
    return isascii(c)
end
Expand Down Expand Up @@ -446,6 +483,16 @@ function checkdelim!(source::AbstractVector{UInt8}, pos, len, options::Options)
return pos
end

# Decide whether group marks should be recognised while parsing the current number.
#
# Returns `false` when no `groupmark` is configured. Otherwise returns `true` if
# the value being parsed is quoted (`code` has the `QUOTED` bit set) or if the
# group mark differs from the delimiter. The remaining case, an unquoted value
# whose group mark equals the delimiter, returns `false`, so that the shared
# character is treated as a delimiter.
@inline function _has_groupmark(opts::Options, code::ReturnCode)
    # No group mark configured: nothing to look for.
    opts.groupmark === nothing && return false
    # Inside quotes the delimiter cannot occur, so the mark is unambiguous.
    (code & QUOTED) != 0 && return true
    # Unquoted: only honour the group mark when it cannot be confused with `delim`.
    return opts.groupmark != opts.delim
end

include("ints.jl")
include("floats.jl")
include("strings.jl")
Expand Down
2 changes: 1 addition & 1 deletion src/floats.jl
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ rettype(::Type{T}) where {T} = T === Number ? Nothing : T
@inline function parsedigits(conf::AbstractConf{T}, source, pos, len, b, code, options, digits::IntType, neg::Bool, startpos, overflow_invalid::Bool=false, ndigits::Int=0, f::F=nothing) where {T, IntType, F}
x = zero(T)
anydigits = false
has_groupmark = options.groupmark !== nothing
has_groupmark = _has_groupmark(options, code)
groupmark0 = something(options.groupmark, 0xff) - UInt8('0')

# we already previously checked if `b` was decimal or a digit, so don't need to check explicitly again
Expand Down
2 changes: 1 addition & 1 deletion src/ints.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ overflowval(::Type{T}) where {T <: Integer} = div(typemax(T) - T(9), T(10))
@inline function typeparser(::AbstractConf{T}, source, pos, len, b, code, pl, opts) where {T <: Integer}
x = zero(T)
neg = false
has_groupmark = opts.groupmark !== nothing
has_groupmark = _has_groupmark(opts, code)
groupmark0 = something(opts.groupmark, 0xff) - UInt8('0')
# start actual int parsing
neg = b == UInt8('-')
Expand Down
96 changes: 81 additions & 15 deletions test/floats.jl
Original file line number Diff line number Diff line change
Expand Up @@ -369,48 +369,114 @@ end
@test Parsers.tryparse(Float64, "0e+") === nothing

@testset "groupmark" begin
@test Parsers.xparse(Float64, "100,000,000.99"; groupmark=',').val == 100_000_000.99
@test Parsers.xparse(Float64, "100,000,000"; groupmark=',').val == 100_000_000.0
@test Parsers.xparse(Float64, "1,0,0,0,0,0,0,0,0.99"; groupmark=',').val == 100_000_000.99
# `parse` is used for parsing inputs with a single value in them, so when
# delims == groupmarks we assume that what we see are groupmarks.
@testset "Parsers.parse" begin
    # Single-value default options with only the group mark overridden.
    groupmark(c::Char) = Parsers._get_default_options(groupmark=UInt8(c))
    @testset "$T" for T in (Float32, Float64)
        # `≈` rather than `==`: the Float32 results are compared against Float64 literals.
        # comma
        @test Parsers.parse(T, "1,0,0,0,0,0,0,0,099e-2", groupmark(',')) ≈ 100_000_000.99
        @test Parsers.parse(T, "100,000,00099e-2", groupmark(',')) ≈ 100_000_000.99
        @test Parsers.parse(T, "100,000,000.99", groupmark(',')) ≈ 100_000_000.99
        @test Parsers.parse(T, "100,000,000", groupmark(',')) ≈ 100_000_000
        # space
        @test Parsers.parse(T, "1 0 0 0 0 0 0 0 099e-2", groupmark(' ')) ≈ 100_000_000.99
        @test Parsers.parse(T, "100 000 00099e-2", groupmark(' ')) ≈ 100_000_000.99
        @test Parsers.parse(T, "100 000 000.99", groupmark(' ')) ≈ 100_000_000.99
        @test Parsers.parse(T, "100 000 000", groupmark(' ')) ≈ 100_000_000
    end
end
@test Parsers.xparse(Float64, "100_000_000.99"; groupmark='_').val == 100_000_000.99
@test Parsers.xparse(Float64, "100_000_000"; groupmark='_').val == 100_000_000.0
@test Parsers.xparse(Float64, "1_0_0_0_0_0_0_0_0.99"; groupmark='_').val == 100_000_000.99

@test Parsers.xparse(Float64, "1 0 0 0 0 0 0 0 0.99"; groupmark=' ').val == 100_000_000.99
@test Parsers.xparse(Float64, "100000000.99"; groupmark=',').val == 100_000_000.99
@test Parsers.xparse(Float64, "100000000.99,aaa"; groupmark=',') == Parsers.Result{Float64}(OK | DELIMITED, 13, 1.0000000099e8)
@test Parsers.xparse(Float64, "\"100,000,000.99\",100"; groupmark=',', openquotechar='"', closequotechar='"') == Parsers.Result{Float64}(Int16(13), 17, 1.0000000099e8)
@test Parsers.xparse(Float64, "100,000,000.99,100"; groupmark=',', openquotechar='"', closequotechar='"') == Parsers.Result{Float64}(Int16(9), 15, 1.0000000099e8)
@test Parsers.xparse(Float64, "100,000,000.99,100"; groupmark=',', openquotechar='"', closequotechar='"') == Parsers.Result{Float64}(Int16(9), 4, 100.0)
@test Parsers.xparse(Float64, "100_000_000.99,100"; groupmark='_', openquotechar='"', closequotechar='"') == Parsers.Result{Float64}(Int16(9), 15, 1.0000000099e8)
@test Parsers.xparse(Float64, "\"100,000,000\",100"; groupmark=',', openquotechar='"', closequotechar='"') == Parsers.Result{Float64}(Int16(13), 14, 1.0e8)
res = Parsers.xparse(Float64, "100,000,000,aaa"; groupmark=',')
@test res.code == EOF | INVALID | INVALID_DELIMITER
@test res.tlen == 15
@test res.code == OK | DELIMITED
@test res.tlen == 4
res = Parsers.xparse(Float64, "100_000_000,aaa"; groupmark='_')
@test res.code == OK | DELIMITED
@test res.tlen == 12

@test Parsers.xparse(Float32, "100,000,000.99"; groupmark=',').val ≈ 100_000_000.99
@test Parsers.xparse(Float32, "100,000,000"; groupmark=',').val ≈ 100_000_000.0
@test Parsers.xparse(Float32, "1,0,0,0,0,0,0,0,0.99"; groupmark=',').val ≈ 100_000_000.99
@test Parsers.xparse(Float32, "100_000_000.99"; groupmark='_').val ≈ 100_000_000.99
@test Parsers.xparse(Float32, "100_000_000"; groupmark='_').val ≈ 100_000_000.0
@test Parsers.xparse(Float32, "1_0_0_0_0_0_0_0_0.99"; groupmark='_').val ≈ 100_000_000.99
@test Parsers.xparse(Float32, "1 0 0 0 0 0 0 0 0.99"; groupmark=' ').val ≈ 100_000_000.99
@test Parsers.xparse(Float32, "100000000.99"; groupmark=',').val ≈ 100_000_000.99
res = Parsers.xparse(Float32, "100000000.99,aaa"; groupmark=',')
@test res.code == OK | DELIMITED
@test res.tlen == 13
@test res.val ≈ 100_000_000.99
res = Parsers.xparse(Float32, "100,000,000,aaa"; groupmark=',')
@test res.code == EOF | INVALID | INVALID_DELIMITER
@test res.tlen == 15
@test res.code == OK | DELIMITED
@test res.tlen == 4
res = Parsers.xparse(Float32, "100_000_000,aaa"; groupmark='_')
@test res.code == OK | DELIMITED
@test res.tlen == 12

@test Parsers.xparse(Float64, "100,000,00099e-2"; groupmark=',').val == 100.0
@test Parsers.xparse(Float64, "100_000_00099e-2"; groupmark='_').val == 100_000_000.99
@test Parsers.xparse(Float64, "1,0,0,0,0,0,0,0,099e-2"; groupmark=',').val == 1.0
@test Parsers.xparse(Float64, "1_0_0_0_0_0_0_0_099e-2"; groupmark='_').val == 100_000_000.99

@test Parsers.xparse(Float64, "100,000,00099e-2"; groupmark=',').val == 100_000_000.99
@test Parsers.xparse(Float64, "1,0,0,0,0,0,0,0,099e-2"; groupmark=',').val == 100_000_000.99
@test Parsers.xparse(Float64, "1 0 0 0 0 0 0 0 099e-2"; groupmark=' ').val == 100_000_000.99
@test Parsers.xparse(Float64, "10000000099e-2"; groupmark=',').val == 100_000_000.99
@test Parsers.xparse(Float64, "10000000099e-2,aaa"; groupmark=',') == Parsers.Result{Float64}(OK | DELIMITED, 15, 1.0000000099e8)
@test Parsers.xparse(Float64, "\"10000000099e-2\",100"; groupmark=',', openquotechar='"', closequotechar='"') == Parsers.Result{Float64}(Int16(13), 17, 1.0000000099e8)
@test Parsers.xparse(Float64, "10000000099e-2,100"; groupmark=',', openquotechar='"', closequotechar='"') == Parsers.Result{Float64}(Int16(9), 15, 1.0000000099e8)

@test Parsers.xparse(Float32, "100,000,00099e-2"; groupmark=',').val ≈ 100_000_000.99
@test Parsers.xparse(Float32, "1,0,0,0,0,0,0,0,099e-2"; groupmark=',').val ≈ 100_000_000.99
@test Parsers.xparse(Float32, "100,000,00099e-2"; groupmark=',').val ≈ 100.0
@test Parsers.xparse(Float32, "100_000_00099e-2"; groupmark='_').val ≈ 100_000_000.99
@test Parsers.xparse(Float32, "1,0,0,0,0,0,0,0,099e-2"; groupmark=',').val ≈ 1.0
@test Parsers.xparse(Float32, "1_0_0_0_0_0_0_0_099e-2"; groupmark='_').val ≈ 100_000_000.99

@test Parsers.xparse(Float32, "1 0 0 0 0 0 0 0 099e-2"; groupmark=' ').val ≈ 100_000_000.99
@test Parsers.xparse(Float32, "10000000099e-2"; groupmark=',').val ≈ 100_000_000.99
res = Parsers.xparse(Float32, "10000000099e-2,aaa"; groupmark=',')
@test res.code == OK | DELIMITED
@test res.tlen == 15
@test res.val ≈ 100_000_000.99

# Delimited parsing with a group mark that is either the same character as the
# default delimiter (',') or a different one (' '). In every input, fields that
# contain group marks are quoted, so each row must parse to the same four values.
@testset "$T groupmark=$(repr(g))" for g in (',', ' '), T in (Float32, Float64)
    # Delimited default options with only the group mark overridden.
    xgroupmark(c::Char) = Parsers._get_default_xoptions(groupmark=UInt8(c))
    # Every input below encodes these four fields, in this order.
    expected_vals = (1000.0, 0.0, 2000.0, 3000.0)
    inputs = [
        # integers
        "1000,0000,2000,3000",
        "\"1000\",\"0000\",\"2000\",\"3000\"",
        "\"1$(g)0$(g)0$(g)0\",0000,\"2$(g)0$(g)0$(g)0\",3000",
        "1000,\"0$(g)0$(g)0$(g)0\",2000,\"3$(g)0$(g)0$(g)0\"",
        # decimals
        "1000.00,0000.00,2000.00,3000.00",
        "\"1000.00\",\"0000.00\",\"2000.00\",\"3000.00\"",
        "\"1$(g)0$(g)0$(g)0.00\",0000.00,\"2$(g)0$(g)0$(g)0.00\",3000.00",
        "1000,\"0$(g)0$(g)0$(g)0.00\",2000.00,\"3$(g)0$(g)0$(g)0.00\"",
        # decimals with exponent
        "1000.00e0,0000.00e0,2000.00e0,3000.00e0",
        "\"1000.00e0\",\"0000.00e0\",\"2000.00e0\",\"3000.00e0\"",
        "\"1$(g)0$(g)0$(g)0.00e0\",0000.00e0,\"2$(g)0$(g)0$(g)0.00e0\",3000.00e0",
        "1000,\"0$(g)0$(g)0$(g)0.00e0\",2000.00e0,\"3$(g)0$(g)0$(g)0.00e0\"",
        # integers with exponent
        "1000e0,0000e0,2000e0,3000e0",
        "\"1000e0\",\"0000e0\",\"2000e0\",\"3000e0\"",
        "\"1$(g)0$(g)0$(g)0e0\",0000e0,\"2$(g)0$(g)0$(g)0e0\",3000e0",
        "1000,\"0$(g)0$(g)0$(g)0e0\",2000e0,\"3$(g)0$(g)0$(g)0e0\"",
    ]
    for input in inputs
        pos = 1
        # `pos` is advanced by `res.tlen`, so `len` must be the byte length of the
        # input, not its character count (`length`); the two agree only for ASCII.
        len = sizeof(input)
        local res
        for expected in expected_vals
            res = Parsers.xparse(T, input, pos, len, xgroupmark(g))
            @test res.val == expected
            @test Parsers.ok(res.code)
            # Advance past the field just parsed.
            pos += res.tlen
        end
        # The last field must have parsed cleanly and consumed the input to its end.
        @test Parsers.ok(res.code)
        @test Parsers.eof(res.code)
    end
end
end

@testset "BigFloats" begin
Expand Down
Loading

2 comments on commit 0eb5f46

@nickrobinson251
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JuliaRegistrator
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Registration pull request created: JuliaRegistries/General/95069

Tip: Release Notes

Did you know you can add release notes too? Just add markdown formatted text underneath the comment after the text
"Release notes:" and it will be added to the registry PR, and if TagBot is installed it will also be added to the
release that TagBot creates. i.e.

@JuliaRegistrator register

Release notes:

## Breaking changes

- blah

To add them here just re-invoke and the PR will be updated.

Tagging

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v2.8.0 -m "<description of version>" 0eb5f46d10047d18d5aa73a267faab5560baf688
git push origin v2.8.0

Please sign in to comment.