JuliaLang · kmsquire · Oct 25, 2012
diff --git a/base/pcre.jl b/base/pcre.jl
@@ -114,4 +114,49 @@ function exec(regex::Array{Uint8}, extra::Ptr{Void},
     cap ? ((n > -1 ? ovec[1:2(ncap+1)] : Array(Int32,0)), ncap) : n > -1
 end
 
+# Returns (names, named_pos) for the named capture groups of a compiled regex:
+# names maps each group name to its capture index, and named_pos holds, for
+# every capture group, its name (or nothing if the group is unnamed).
+#
+# According to the pcreapi man page, the name table for
+#
+#         (?<date> (?<year>(\d\d)?\d\d) -
+#         (?<month>\d\d) - (?<day>\d\d) )
+#
+# is stored as
+#
+#         00 01 d  a  t  e  00 ??
+#         00 05 d  a  y  00 ?? ??
+#         00 04 m  o  n  t  h  00
+#         00 02 y  e  a  r  00 ??
+#
+# where the first two bytes in each record hold the index (most significant
+# byte first), and the remaining bytes hold the \0-terminated name string.
+function get_name_table(re::Array{Uint8}, ex::Ptr{Void})
+    name_table_dict = Dict{String, Int}()
+    name_count = int(PCRE.info(re, ex, PCRE.INFO_NAMECOUNT, Int32))
+
+    # Size named_pos by the total capture count, not by the highest named index:
+    # callers index it with every capture number, so unnamed groups (and regexes
+    # without any named group) need a slot holding nothing as well.
+    named_pos = Any[]
+    grow(named_pos, int(PCRE.info(re, ex, PCRE.INFO_CAPTURECOUNT, Int32)))
+    named_pos[:] = nothing
+
+    if name_count > 0
+        name_entry_size = int(PCRE.info(re, ex, PCRE.INFO_NAMEENTRYSIZE, Int32))
+        name_table_ptr = PCRE.info(re, ex, PCRE.INFO_NAMETABLE, Ptr{Uint8})
+        name_table = pointer_to_array(name_table_ptr, (name_entry_size, name_count))
+        for n = 1:name_count
+            idx = (int(name_table[1,n]) << 8) + int(name_table[2,n])
+            last_p = memchr(name_table[3:end,n], 0)+2-1  # index just before the \0
+            name = bytestring(name_table[3:last_p,n])
+            name_table_dict[name] = idx
+            named_pos[idx] = name
+        end
+    end
+
+    (name_table_dict, named_pos)
+end
+
 end # module
diff --git a/base/regex.jl b/base/regex.jl
@@ -7,6 +7,8 @@ type Regex
     options::Int32
     regex::Array{Uint8}
     extra::Ptr{Void}
+    named_captures::Dict{String, Int}
+    named_pos::Array{Any}
 
     function Regex(pat::String, opts::Integer, study::Bool)
         pat = bytestring(pat); opts = int32(opts)
@@ -15,7 +17,8 @@ type Regex
         end
         re = PCRE.compile(pat, opts & PCRE.COMPILE_MASK)
         ex = study ? PCRE.study(re) : C_NULL
-        new(pat, opts, re, ex)
+        (names, pos) = PCRE.get_name_table(re, ex)
+        new(pat, opts, re, ex, names, pos)
     end
 end
 Regex(p::String, s::Bool)    = Regex(p, 0, s)
@@ -66,6 +69,8 @@ type RegexMatch
     captures::Tuple
     offset::Int
     offsets::Vector{Int}
+    capture_dict::Dict
+    named_pos::Array{Any}
 end
 
 function show(io, m::RegexMatch)
@@ -74,7 +79,11 @@ function show(io, m::RegexMatch)
     if !isempty(m.captures)
         print(io, ", ")
         for i = 1:length(m.captures)
-            print(io, i, "=")
+            if m.named_pos[i] !=  nothing
+                print(io, i, "(", m.named_pos[i], ")=")
+            else
+                print(io, i, "=")
+            end
             show(io, m.captures[i])
             if i < length(m.captures)
                 print(io, ", ")
@@ -97,7 +106,12 @@ function match(re::Regex, str::ByteString, idx::Integer, opts::Integer)
     mat = str[m[1]+1:m[2]]
     cap = ntuple(n, i->(m[2i+1] < 0 ? nothing : str[m[2i+1]+1:m[2i+2]]))
     off = [ m[2i+1]::Int32+1 for i=1:n ]
-    RegexMatch(mat, cap, m[1]+1, off)
+    cap_dict = if !isempty(re.named_captures)
+        dict(tuple(keys(re.named_captures)...), tuple([cap[v] for v in values(re.named_captures)]...))
+    else
+        Dict()
+    end
+    RegexMatch(mat, cap, m[1]+1, off, cap_dict, re.named_pos)
 end
 match(r::Regex, s::String, i::Integer, o::Integer) = match(r, bytestring(s), i, o)
 match(r::Regex, s::String, i::Integer) = match(r, s, i, r.options & PCRE.EXECUTE_MASK)

diff --git a/doc/manual/strings.rst b/doc/manual/strings.rst
@@ -688,6 +688,7 @@ You can extract the following info from a ``RegexMatch`` object:
 -  the captured substrings as a tuple of strings: ``m.captures``
 -  the offset at which the whole match begins: ``m.offset``
 -  the offsets of the captured substrings as a vector: ``m.offsets``
+-  a dictionary of named captured substrings: ``m.capture_dict`` (more on this below)
 
 For when a capture doesn't match, instead of a substring, ``m.captures``
 contains ``nothing`` in that position, and ``m.offsets`` has a zero
@@ -733,6 +734,30 @@ use tuple destructuring syntax to bind them to local variables::
     julia> first
     "a"
 
+In complicated regular expressions, it can be hard to keep track of capture
+group numbers, and the numbers may change if the expression is modified.  To
+aid with this, subpatterns may be named::
+
+    julia> m = match(r"(?P<greeting>\w+)(?:, (?P<receiver>\w+))?[.!]", "Greetings, Earthling!")
+    RegexMatch("Greetings, Earthling!", 1(greeting)="Greetings", 2(receiver)="Earthling")
+
+As noted above, a dictionary mapping each subpattern name to its captured
+substring is available via ``m.capture_dict``::
+
+    julia> m.capture_dict
+    {"greeting"=>"Greetings","receiver"=>"Earthling"}
+
+As with ``m.captures``, the entry for any named subpattern that did not match is ``nothing``::
+
+    julia> m = match(r"(?P<greeting>\w+)(?:, (?P<receiver>\w+))?[.!]", "Hello.")
+    RegexMatch("Hello.", 1(greeting)="Hello", 2(receiver)=nothing)
+
+    julia> m.capture_dict
+    {"greeting"=>"Hello","receiver"=>nothing}
+
+    julia> m.captures
+    ("Hello",nothing)
+
 You can modify the behavior of regular expressions by some combination of
 the flags ``i``, ``m``, ``s``, and ``x`` after the closing double quote
 mark. These flags have the same meaning as they do in Perl, as explained

diff --git a/test/strings.jl b/test/strings.jl
@@ -488,3 +488,15 @@ for i1 = 1:length(u8str2)
         @assert u8str2[i1:i2] == u8str2plain[i1:i2]
     end
 end
+
+# Named regex: each named-group spelling, (?<n>..), (?'n'..) and (?P<n>..), fills capture_dict
+m = match(r"(?<greeting>.*), (?<place>.*)\.", astr)
+@assert m.capture_dict["greeting"] == "Hello"
+@assert m.capture_dict["place"]    == "world"
+m = match(r"(?'greeting'.*), (?'place'.*)\.", astr)
+@assert m.capture_dict["greeting"] == "Hello"
+@assert m.capture_dict["place"]    == "world"
+m = match(r"(?P<greeting>.*), (?P<place>.*)\.", astr)
+@assert m.capture_dict["greeting"] == "Hello"
+@assert m.capture_dict["place"]    == "world"
+