When delim=groupmark=x, treat x as delim unless input is quoted (#…

…182) * When delim=groupmark=`x`, treat `x` as delim * Move groupmark checker to helper function * Adapt tests for new `groupmark` handling * Simplify tests * More test cases (based on old test cases) * Bump version --------- Co-authored-by: Drvi <tomas.drvostep@gmail.com>
JuliaData · Nov 9, 2023 · 0eb5f46 · 0eb5f46 · nickrobinson251 · Nov 9, 2023
1 parent d1c6fc5
commit 0eb5f46
Show file tree

Hide file tree

Showing 6 changed files with 231 additions and 54 deletions.
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "Parsers"
 uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
 authors = ["quinnj <quinn.jacobd@gmail.com>"]
-version = "2.7.2"
+version = "2.8.0"
 
 [deps]
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"

diff --git a/src/Parsers.jl b/src/Parsers.jl
@@ -111,7 +111,7 @@ end
   * `ignoreemptylines=false`: after parsing a value, if a newline is detected, another immediately proceeding newline will be checked for and consumed
   * `stripwhitespace=nothing`: if true, leading and trailing whitespace is stripped from string fields, note that for *quoted* strings however, whitespace is preserved within quotes (but ignored before/after quote characters). To also strip *within* quotes, see `stripquoted`
   * `stripquoted=false`: if true, whitespace is also stripped within quoted strings. If true, `stripwhitespace` is also set to true.
-  * `groupmark=nothing`: optionally specify a single-byte character denoting the number grouping mark, this allows parsing of numbers that have, e.g., thousand separators (`1,000.00`).
+  * `groupmark=nothing`: optionally specify a single-byte character denoting the number grouping mark, this allows parsing of numbers that have, e.g., thousand separators (`1,000.00`). When the `groupmark` is ambiguous with the `delim`, the user must quote the number if it contains group marks.
   * `rounding=RoundNearest`: optionally specify a rounding mode to use when parsing. No rounding means the result will be marked with `INEXACT` code if the value is not exactly representable in the target type.
 """
 struct Options
@@ -141,12 +141,49 @@ function Base.getproperty(x::Options, nm::Symbol)
     end
 end
 
-const OPTIONS = Options(Flags(false, false, false, false, false, false, false, false, false), UInt8('.'),
-    Token(UInt8('"')), Token(UInt8('"')), UInt8('"'), Token[], Token(""), Token(""),
-    nothing, nothing, nothing, nothing, nothing)
-const XOPTIONS = Options(Flags(false, false, false, false, true, true, true, false, false), UInt8('.'),
-    Token(UInt8('"')), Token(UInt8('"')), UInt8('"'), Token[], Token(UInt8(',')), Token(""),
-    nothing, nothing, nothing, nothing, nothing)
+# Build the default options for single-value (i.e. not delimited) parsing, used by
+# Parsers.parse and Parsers.tryparse via Parsers.xparse2. Keywords override any field.
+function _get_default_options(;
+    flags::Flags=Flags(false, false, false, false, false, false, false, false, false),  # all nine flags off
+    decimal::UInt8=UInt8('.'),
+    oq::Token=Token(UInt8('"')),
+    cq::Token=Token(UInt8('"')),
+    e::UInt8=UInt8('"'),
+    sentinel::Vector{Token}=Token[],
+    delim::Token=Token(""),  # empty token: single-value parsing has no delimiter
+    cmt::Token=Token(""),
+    trues::Union{Nothing, Vector{String}}=nothing,
+    falses::Union{Nothing, Vector{String}}=nothing,
+    dateformat::Union{Nothing, Format}=nothing,
+    groupmark::Union{Nothing,UInt8}=nothing,  # no digit-grouping byte unless the caller supplies one
+    rounding::Union{Nothing,RoundingMode}=nothing,
+)
+    return Options(flags, decimal, oq, cq, e, sentinel, delim, cmt, trues, falses, dateformat, groupmark, rounding)  # forwarded positionally, in this order
+end
+
+# Build the default options for delimited parsing, used by Parsers.xparse. Keywords override any field.
+function _get_default_xoptions(;
+    flags::Flags=Flags(false, false, false, false, true, true, true, false, false),  # unlike _get_default_options, flags 5-7 are on
+    decimal::UInt8=UInt8('.'),
+    oq::Token=Token(UInt8('"')),
+    cq::Token=Token(UInt8('"')),
+    e::UInt8=UInt8('"'),
+    sentinel::Vector{Token}=Token[],
+    delim::Token=Token(UInt8(',')),  # comma-delimited by default
+    cmt::Token=Token(""),
+    trues::Union{Nothing, Vector{String}}=nothing,
+    falses::Union{Nothing, Vector{String}}=nothing,
+    dateformat::Union{Nothing, Format}=nothing,
+    groupmark::Union{Nothing,UInt8}=nothing,  # no digit-grouping byte unless the caller supplies one
+    rounding::Union{Nothing,RoundingMode}=nothing,
+)
+    return Options(flags, decimal, oq, cq, e, sentinel, delim, cmt, trues, falses, dateformat, groupmark, rounding)  # forwarded positionally, in this order
+end
+
+# Default options for single-value parsing: Parsers.parse, Parsers.tryparse, Parsers.xparse2
+const OPTIONS = _get_default_options()
+# Default options for delimited parsing: Parsers.xparse
+const XOPTIONS = _get_default_xoptions()
 
 prepare!(x::Vector) = sort!(x, by=x->sizeof(x), rev=true)
 asciival(c::Char) = isascii(c)
@@ -446,6 +483,16 @@ function checkdelim!(source::AbstractVector{UInt8}, pos, len, options::Options)
     return pos
 end
 
+@inline function _has_groupmark(opts::Options, code::ReturnCode)  # true when the number parsers should accept group marks in this value
+    if opts.groupmark !== nothing  # no group mark configured => always false
+        isquoted = (code & QUOTED) != 0  # QUOTED bit is set on the return code passed in by the caller
+        if isquoted || (opts.groupmark != opts.delim)  # when groupmark == delim, only a quoted value may contain group marks
+            return true
+        end
+    end
+    return false  # either no group mark, or an unquoted value whose group mark would be ambiguous with the delimiter
+end
+
 include("ints.jl")
 include("floats.jl")
 include("strings.jl")

diff --git a/src/floats.jl b/src/floats.jl
@@ -233,7 +233,7 @@ rettype(::Type{T}) where {T} = T === Number ? Nothing : T
 @inline function parsedigits(conf::AbstractConf{T}, source, pos, len, b, code, options, digits::IntType, neg::Bool, startpos, overflow_invalid::Bool=false, ndigits::Int=0, f::F=nothing) where {T, IntType, F}
     x = zero(T)
     anydigits = false
-    has_groupmark = options.groupmark !== nothing
+    has_groupmark = _has_groupmark(options, code)
     groupmark0 = something(options.groupmark, 0xff) - UInt8('0')
 
     # we already previously checked if `b` was decimal or a digit, so don't need to check explicitly again

diff --git a/src/ints.jl b/src/ints.jl
@@ -7,7 +7,7 @@ overflowval(::Type{T}) where {T <: Integer} = div(typemax(T) - T(9), T(10))
 @inline function typeparser(::AbstractConf{T}, source, pos, len, b, code, pl, opts) where {T <: Integer}
     x = zero(T)
     neg = false
-    has_groupmark = opts.groupmark !== nothing
+    has_groupmark = _has_groupmark(opts, code)
     groupmark0 = something(opts.groupmark, 0xff) - UInt8('0')
     # start actual int parsing
     neg = b == UInt8('-')

diff --git a/test/floats.jl b/test/floats.jl
@@ -369,48 +369,114 @@ end
 @test Parsers.tryparse(Float64, "0e+") === nothing
 
 @testset "groupmark" begin
-    @test Parsers.xparse(Float64, "100,000,000.99"; groupmark=',').val == 100_000_000.99
-    @test Parsers.xparse(Float64, "100,000,000"; groupmark=',').val == 100_000_000.0
-    @test Parsers.xparse(Float64, "1,0,0,0,0,0,0,0,0.99"; groupmark=',').val == 100_000_000.99
+    # `parse` is used for parsing inputs with a single value in them,
+    # so when delims==groupmarks, we assume what we see are groupmarks
+    @testset "Parsers.parse" begin
+        groupmark(c::Char) = Parsers._get_default_options(groupmark=UInt8(c))  # single-value defaults with only `groupmark` overridden
+        @testset "$T" for T in (Float32, Float64)
+            # comma as the group mark (with and without a decimal point / exponent)
+            @test Parsers.parse(T, "1,0,0,0,0,0,0,0,099e-2", groupmark(',')) ≈ 100_000_000.99
+            @test Parsers.parse(T, "100,000,00099e-2", groupmark(',')) ≈ 100_000_000.99
+            @test Parsers.parse(T, "100,000,000.99", groupmark(',')) ≈ 100_000_000.99
+            @test Parsers.parse(T, "100,000,000", groupmark(',')) ≈ 100_000_000
+            # space as the group mark (same four shapes as above)
+            @test Parsers.parse(T, "1 0 0 0 0 0 0 0 099e-2", groupmark(' ')) ≈ 100_000_000.99
+            @test Parsers.parse(T, "100 000 00099e-2", groupmark(' ')) ≈ 100_000_000.99
+            @test Parsers.parse(T, "100 000 000.99", groupmark(' ')) ≈ 100_000_000.99
+            @test Parsers.parse(T, "100 000 000", groupmark(' ')) ≈ 100_000_000
+        end
+    end
+    @test Parsers.xparse(Float64, "100_000_000.99"; groupmark='_').val == 100_000_000.99
+    @test Parsers.xparse(Float64, "100_000_000"; groupmark='_').val == 100_000_000.0
+    @test Parsers.xparse(Float64, "1_0_0_0_0_0_0_0_0.99"; groupmark='_').val == 100_000_000.99
+
     @test Parsers.xparse(Float64, "1 0 0 0 0 0 0 0 0.99"; groupmark=' ').val == 100_000_000.99
     @test Parsers.xparse(Float64, "100000000.99"; groupmark=',').val == 100_000_000.99
     @test Parsers.xparse(Float64, "100000000.99,aaa"; groupmark=',') == Parsers.Result{Float64}(OK | DELIMITED, 13, 1.0000000099e8)
     @test Parsers.xparse(Float64, "\"100,000,000.99\",100"; groupmark=',', openquotechar='"', closequotechar='"') == Parsers.Result{Float64}(Int16(13), 17, 1.0000000099e8)
-    @test Parsers.xparse(Float64, "100,000,000.99,100"; groupmark=',', openquotechar='"', closequotechar='"') == Parsers.Result{Float64}(Int16(9), 15, 1.0000000099e8)
+    @test Parsers.xparse(Float64, "100,000,000.99,100"; groupmark=',', openquotechar='"', closequotechar='"') == Parsers.Result{Float64}(Int16(9), 4, 100.0)
+    @test Parsers.xparse(Float64, "100_000_000.99,100"; groupmark='_', openquotechar='"', closequotechar='"') == Parsers.Result{Float64}(Int16(9), 15, 1.0000000099e8)
     @test Parsers.xparse(Float64, "\"100,000,000\",100"; groupmark=',', openquotechar='"', closequotechar='"') == Parsers.Result{Float64}(Int16(13), 14, 1.0e8)
     res = Parsers.xparse(Float64, "100,000,000,aaa"; groupmark=',')
-    @test res.code == EOF | INVALID | INVALID_DELIMITER
-    @test res.tlen == 15
+    @test res.code == OK | DELIMITED
+    @test res.tlen == 4
+    res = Parsers.xparse(Float64, "100_000_000,aaa"; groupmark='_')
+    @test res.code == OK | DELIMITED
+    @test res.tlen == 12
 
-    @test Parsers.xparse(Float32, "100,000,000.99"; groupmark=',').val ≈ 100_000_000.99
-    @test Parsers.xparse(Float32, "100,000,000"; groupmark=',').val ≈ 100_000_000.0
-    @test Parsers.xparse(Float32, "1,0,0,0,0,0,0,0,0.99"; groupmark=',').val ≈ 100_000_000.99
+    @test Parsers.xparse(Float32, "100_000_000.99"; groupmark='_').val ≈ 100_000_000.99
+    @test Parsers.xparse(Float32, "100_000_000"; groupmark='_').val ≈ 100_000_000.0
+    @test Parsers.xparse(Float32, "1_0_0_0_0_0_0_0_0.99"; groupmark='_').val ≈ 100_000_000.99
     @test Parsers.xparse(Float32, "1 0 0 0 0 0 0 0 0.99"; groupmark=' ').val ≈ 100_000_000.99
     @test Parsers.xparse(Float32, "100000000.99"; groupmark=',').val ≈ 100_000_000.99
     res = Parsers.xparse(Float32, "100000000.99,aaa"; groupmark=',')
     @test res.code == OK | DELIMITED
     @test res.tlen == 13
     @test res.val ≈ 100_000_000.99
     res = Parsers.xparse(Float32, "100,000,000,aaa"; groupmark=',')
-    @test res.code == EOF | INVALID | INVALID_DELIMITER
-    @test res.tlen == 15
+    @test res.code == OK | DELIMITED
+    @test res.tlen == 4
+    res = Parsers.xparse(Float32, "100_000_000,aaa"; groupmark='_')
+    @test res.code == OK | DELIMITED
+    @test res.tlen == 12
+
+    @test Parsers.xparse(Float64, "100,000,00099e-2"; groupmark=',').val == 100.0
+    @test Parsers.xparse(Float64, "100_000_00099e-2"; groupmark='_').val == 100_000_000.99
+    @test Parsers.xparse(Float64, "1,0,0,0,0,0,0,0,099e-2"; groupmark=',').val == 1.0
+    @test Parsers.xparse(Float64, "1_0_0_0_0_0_0_0_099e-2"; groupmark='_').val == 100_000_000.99
 
-    @test Parsers.xparse(Float64, "100,000,00099e-2"; groupmark=',').val == 100_000_000.99
-    @test Parsers.xparse(Float64, "1,0,0,0,0,0,0,0,099e-2"; groupmark=',').val == 100_000_000.99
     @test Parsers.xparse(Float64, "1 0 0 0 0 0 0 0 099e-2"; groupmark=' ').val == 100_000_000.99
     @test Parsers.xparse(Float64, "10000000099e-2"; groupmark=',').val == 100_000_000.99
     @test Parsers.xparse(Float64, "10000000099e-2,aaa"; groupmark=',') == Parsers.Result{Float64}(OK | DELIMITED, 15, 1.0000000099e8)
     @test Parsers.xparse(Float64, "\"10000000099e-2\",100"; groupmark=',', openquotechar='"', closequotechar='"') == Parsers.Result{Float64}(Int16(13), 17, 1.0000000099e8)
     @test Parsers.xparse(Float64, "10000000099e-2,100"; groupmark=',', openquotechar='"', closequotechar='"') == Parsers.Result{Float64}(Int16(9), 15, 1.0000000099e8)
 
-    @test Parsers.xparse(Float32, "100,000,00099e-2"; groupmark=',').val ≈ 100_000_000.99
-    @test Parsers.xparse(Float32, "1,0,0,0,0,0,0,0,099e-2"; groupmark=',').val ≈ 100_000_000.99
+    @test Parsers.xparse(Float32, "100,000,00099e-2"; groupmark=',').val ≈ 100.0
+    @test Parsers.xparse(Float32, "100_000_00099e-2"; groupmark='_').val ≈ 100_000_000.99
+    @test Parsers.xparse(Float32, "1,0,0,0,0,0,0,0,099e-2"; groupmark=',').val ≈ 1.0
+    @test Parsers.xparse(Float32, "1_0_0_0_0_0_0_0_099e-2"; groupmark='_').val ≈ 100_000_000.99
+
     @test Parsers.xparse(Float32, "1 0 0 0 0 0 0 0 099e-2"; groupmark=' ').val ≈ 100_000_000.99
     @test Parsers.xparse(Float32, "10000000099e-2"; groupmark=',').val ≈ 100_000_000.99
     res = Parsers.xparse(Float32, "10000000099e-2,aaa"; groupmark=',')
     @test res.code == OK | DELIMITED
     @test res.tlen == 15
     @test res.val ≈ 100_000_000.99
+
+    @testset "$T groupmark=$(repr(g))" for g in (',',' '), T in (Float32, Float64)
+        xgroupmark(c::Char) = Parsers._get_default_xoptions(groupmark=UInt8(c))  # delimited defaults (delim ',') with only `groupmark` overridden
+        # Four comma-delimited fields per input; the group mark `g` only ever appears inside a quoted field
+        for (input, expected_vals) in [
+            ("1000,0000,2000,3000" => (1000.0,0.0,2000.0,3000.0,)),
+            ("\"1000\",\"0000\",\"2000\",\"3000\"" => (1000.0,0.0,2000.0,3000.0,)),
+            ("\"1$(g)0$(g)0$(g)0\",0000,\"2$(g)0$(g)0$(g)0\",3000" => (1000.0,0.0,2000.0,3000.0,)),
+            ("1000,\"0$(g)0$(g)0$(g)0\",2000,\"3$(g)0$(g)0$(g)0\"" => (1000.0,0.0,2000.0,3000.0,)),
+            ("1000.00,0000.00,2000.00,3000.00" => (1000.0,0.0,2000.0,3000.0,)),
+            ("\"1000.00\",\"0000.00\",\"2000.00\",\"3000.00\"" => (1000.0,0.0,2000.0,3000.0,)),
+            ("\"1$(g)0$(g)0$(g)0.00\",0000.00,\"2$(g)0$(g)0$(g)0.00\",3000.00" => (1000.0,0.0,2000.0,3000.0,)),
+            ("1000,\"0$(g)0$(g)0$(g)0.00\",2000.00,\"3$(g)0$(g)0$(g)0.00\"" => (1000.0,0.0,2000.0,3000.0,)),
+            ("1000.00e0,0000.00e0,2000.00e0,3000.00e0" => (1000.0,0.0,2000.0,3000.0,)),
+            ("\"1000.00e0\",\"0000.00e0\",\"2000.00e0\",\"3000.00e0\"" => (1000.0,0.0,2000.0,3000.0,)),
+            ("\"1$(g)0$(g)0$(g)0.00e0\",0000.00e0,\"2$(g)0$(g)0$(g)0.00e0\",3000.00e0" => (1000.0,0.0,2000.0,3000.0,)),
+            ("1000,\"0$(g)0$(g)0$(g)0.00e0\",2000.00e0,\"3$(g)0$(g)0$(g)0.00e0\"" => (1000.0,0.0,2000.0,3000.0,)),
+            ("1000e0,0000e0,2000e0,3000e0" => (1000.0,0.0,2000.0,3000.0,)),
+            ("\"1000e0\",\"0000e0\",\"2000e0\",\"3000e0\"" => (1000.0,0.0,2000.0,3000.0,)),
+            ("\"1$(g)0$(g)0$(g)0e0\",0000e0,\"2$(g)0$(g)0$(g)0e0\",3000e0" => (1000.0,0.0,2000.0,3000.0,)),
+            ("1000,\"0$(g)0$(g)0$(g)0e0\",2000e0,\"3$(g)0$(g)0$(g)0e0\"" => (1000.0,0.0,2000.0,3000.0,)),
+        ]
+            pos = 1
+            len = length(input)  # inputs are ASCII, so char count == byte count; NOTE(review): xparse presumably wants a byte length (sizeof) - confirm
+            local res  # declared in the outer loop body so `res` is still visible after the inner loop
+            for expected in expected_vals
+                res = Parsers.xparse(T, input, pos, len, xgroupmark(g))
+                @test res.val == expected
+                @test Parsers.ok(res.code)
+                pos += res.tlen  # step to the start of the next field; tlen appears to include the consumed delimiter
+            end
+            @test Parsers.ok(res.code)  # `res` is the result for the fourth (last) field
+            @test Parsers.eof(res.code)  # the last field must end at the end of the input
+        end
+    end
 end
 
 @testset "BigFloats" begin