forked from JuliaLang/julia
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathregex.jl
149 lines (130 loc) · 4.84 KB
/
regex.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
## object-oriented Regex interface ##
include("pcre.jl")
# A compiled PCRE regular expression plus the metadata needed to execute it.
type Regex
    # pattern text, stored as a ByteString
    pattern::ByteString
    # option bits supplied at construction time
    options::Int32
    # value returned by PCRE.compile for the pattern
    regex::Array{Uint8}
    # value returned by PCRE.study, or C_NULL when the pattern was not studied
    extra::Ptr{Void}
    # first value returned by PCRE.get_name_table; `match` uses its values
    # as indices into the capture tuple
    named_captures::Dict{String, Int}
    # second value returned by PCRE.get_name_table; `show` for RegexMatch
    # indexes it by capture number
    named_pos::Array{Any}

    # Compile `pat` with the option bits `opts`; when `study` is true the
    # compiled pattern is additionally passed through PCRE.study.
    # Raises an error if `opts` has bits outside PCRE.OPTIONS_MASK.
    function Regex(pat::String, opts::Integer, study::Bool)
        source = bytestring(pat)
        flags = int32(opts)
        (flags & ~PCRE.OPTIONS_MASK) == 0 || error("invalid regex option(s)")
        # only the compile-time subset of the options is handed to PCRE.compile
        compiled = PCRE.compile(source, flags & PCRE.COMPILE_MASK)
        if study
            studied = PCRE.study(compiled)
        else
            studied = C_NULL
        end
        (names, positions) = PCRE.get_name_table(compiled, studied)
        new(source, flags, compiled, studied, names, positions)
    end
end
# Convenience constructors: omitted options default to 0 and an omitted
# study flag defaults to false.
function Regex(p::String, s::Bool)
    Regex(p, 0, s)
end
function Regex(p::String, o::Integer)
    Regex(p, o, false)
end
function Regex(p::String)
    Regex(p, 0, false)
end
# Copying a Regex hands back the very same object; no new Regex is built.
function copy(r::Regex)
    r
end
# TODO: make sure things are escaped in a way PCRE
# likes so that all the Julia string quoting
# constructs are correctly handled.
# Non-standard string literal r"..." with optional trailing flag letters.
# PCRE.UTF8 is always set; each flag letter adds one option bit:
#   i => PCRE.CASELESS, m => PCRE.MULTILINE, s => PCRE.DOTALL, x => PCRE.EXTENDED
# Any other letter raises an error. The Regex is built inside the macro body.
macro r_str(pattern, flags...)
    opts = PCRE.UTF8
    for group in flags
        for f in group
            if f == 'i'
                opts |= PCRE.CASELESS
            elseif f == 'm'
                opts |= PCRE.MULTILINE
            elseif f == 's'
                opts |= PCRE.DOTALL
            elseif f == 'x'
                opts |= PCRE.EXTENDED
            else
                error("unknown regex flag: $f")
            end
        end
    end
    Regex(pattern, opts)
end
# Print a Regex. When, apart from the i/m/s/x bits, the options are exactly
# PCRE.UTF8, the r"..." literal form with flag letters is printed; otherwise
# the output is a Regex(pattern,options) constructor call.
function show(io, re::Regex)
    opts = re.options
    letter_bits = PCRE.CASELESS | PCRE.MULTILINE | PCRE.DOTALL | PCRE.EXTENDED
    if (opts & ~letter_bits) != PCRE.UTF8
        # options that the literal syntax cannot express
        print(io, "Regex(")
        show(io, re.pattern)
        print(io, ',')
        show(io, opts)
        print(io, ')')
        return
    end
    print(io, 'r')
    print_quoted_literal(io, re.pattern)
    # flag letters are emitted in the fixed order i, m, s, x
    for (bit, letter) in ((PCRE.CASELESS, 'i'), (PCRE.MULTILINE, 'm'),
                          (PCRE.DOTALL, 's'), (PCRE.EXTENDED, 'x'))
        if (opts & bit) != 0
            print(io, letter)
        end
    end
end
# TODO: map offsets into non-ByteStrings back to original indices.
# or maybe it's better to just fail since that would be quite slow
# Result of a successful regex match, as constructed by `match`.
type RegexMatch
# the part of the subject string covered by the whole match
match::ByteString
# one entry per capture group; `nothing` for a group whose reported
# start offset is negative
captures::Tuple
# start index of the whole match in the subject (PCRE start offset + 1)
offset::Int
# per capture group: the group's PCRE start offset + 1
offsets::Vector{Int}
# keys of the regex's named_captures mapped to the corresponding captures;
# an empty Dict when the regex has no named captures
capture_dict::Dict
# taken from the Regex's named_pos field; `show` prints named_pos[i] as the
# label of capture i whenever that entry is not `nothing`
named_pos::Array{Any}
end
# Print a RegexMatch as RegexMatch(<match>, 1=<cap>, 2(name)=<cap>, ...).
# A capture is labelled with its number, followed by its name in
# parentheses when m.named_pos has a non-`nothing` entry for it.
function show(io, m::RegexMatch)
    print(io, "RegexMatch(")
    show(io, m.match)
    ncaptures = length(m.captures)
    for k = 1:ncaptures
        # every capture, including the first, is preceded by a separator
        print(io, ", ")
        label = m.named_pos[k]
        print(io, k)
        if label != nothing
            print(io, "(", label, ")")
        end
        print(io, "=")
        show(io, m.captures[k])
    end
    print(io, ")")
end
# Test whether `r` matches somewhere in `s`, searching from PCRE offset 0
# with execution options `o`; no capture data is requested from PCRE.exec.
function ismatch(r::Regex, s::String, o::Integer)
    subject = bytestring(s)
    PCRE.exec(r.regex, r.extra, subject, 0, o, false)
end

# Same test using the execution-time subset of the regex's own options.
function ismatch(r::Regex, s::String)
    ismatch(r, s, r.options & PCRE.EXECUTE_MASK)
end

# `contains` is `ismatch` with the string given first.
function contains(s::String, r::Regex, opts::Integer)
    ismatch(r, s, opts)
end
function contains(s::String, r::Regex)
    ismatch(r, s)
end
# Run `re` against `str` starting at index `idx` with execution options
# `opts`. Returns `nothing` when PCRE.exec reports no match, otherwise a
# RegexMatch holding the matched text, the captures and their offsets.
function match(re::Regex, str::ByteString, idx::Integer, opts::Integer)
    # PCRE.exec takes a zero-based start offset, hence idx-1
    ovec, ngroups = PCRE.exec(re.regex, re.extra, str, idx-1, opts, true)
    if isempty(ovec)
        return nothing
    end

    # ovec holds (start, end) offset pairs: the whole match first, then
    # one pair per capture group
    whole = str[ovec[1]+1:ovec[2]]

    # text of capture group i, or `nothing` when its start offset is negative
    function capture_at(i)
        ovec[2i+1] < 0 ? nothing : str[ovec[2i+1]+1:ovec[2i+2]]
    end
    groups = ntuple(ngroups, capture_at)
    group_starts = [ ovec[2i+1]::Int32+1 for i=1:ngroups ]

    if isempty(re.named_captures)
        by_name = Dict()
    else
        # pair each capture name with the capture found at its index
        names = tuple(keys(re.named_captures)...)
        texts = tuple([groups[v] for v in values(re.named_captures)]...)
        by_name = dict(names, texts)
    end

    RegexMatch(whole, groups, ovec[1]+1, group_starts, by_name, re.named_pos)
end
# Convenience methods for `match`:
# - any String is converted with bytestring before matching,
# - omitted options default to the execution-time subset of r.options,
# - an omitted start index defaults to start(s).
function match(r::Regex, s::String, i::Integer, o::Integer)
    match(r, bytestring(s), i, o)
end
function match(r::Regex, s::String, i::Integer)
    match(r, s, i, r.options & PCRE.EXECUTE_MASK)
end
function match(r::Regex, s::String)
    match(r, s, start(s))
end
# Search `str` for `re` starting at index `idx`.
# Returns (0,0) when there is no match, otherwise the pair
# (PCRE start offset + 1, PCRE end offset + 1) of the match.
# A start index of exactly length(str)+2 yields (0,0); anything larger
# raises a BoundsError via error(BoundsError).
function search(str::ByteString, re::Regex, idx::Integer)
    limit = length(str) + 2
    if idx == limit
        return (0,0)
    elseif idx > limit
        error(BoundsError)
    end
    exec_opts = re.options & PCRE.EXECUTE_MASK
    ovec, ngroups = PCRE.exec(re.regex, re.extra, str, idx-1, exec_opts, true)
    if isempty(ovec)
        return (0,0)
    end
    (ovec[1]+1, ovec[2]+1)
end
# Regex search on a general String is rejected with an error; callers are
# told to convert with bytestring first.
function search(s::String, r::Regex, idx::Integer)
    error("regex search is only available for bytestrings; use bytestring(s) to convert")
end

# With no start index given, searching begins at start(s).
function search(s::String, r::Regex)
    search(s, r, start(s))
end
# Iterable over successive matches of a regex in a string; built by each_match.
type RegexMatchIterator
# the pattern to match
regex::Regex
# the subject string being scanned
string::ByteString
# true: the next search resumes one index after the previous match's start;
# false: the next search resumes after the previous match
overlap::Bool
end
# Iteration protocol for RegexMatchIterator. The iteration state is the
# current RegexMatch, or `nothing` once no further match exists.

# Initial state: the first match found from the start of the string.
start(itr::RegexMatchIterator) = match(itr.regex, itr.string)

# Iteration is finished when the last search produced no match.
done(itr::RegexMatchIterator, m) = m == nothing

# Yield the current match `m` together with the next state.
#
# The next search starts one index after the start of `m` in overlap mode,
# and directly after `m` otherwise. In both modes the start index advances
# by at least one: a zero-length match would otherwise make the
# non-overlapping search restart at the same index and find the same empty
# match forever.
function next(itr::RegexMatchIterator, m)
    # m.offset comes from PCRE's offset vector, so the distance to the end
    # of the match is the number of bytes in the matched text
    step = itr.overlap ? 1 : max(1, length(m.match.data))
    idx = m.offset + step
    # idx == nbytes+1 is still worth trying (a pattern may match the empty
    # string at the very end); anything beyond that cannot match, so stop
    # instead of handing PCRE an offset past the end of the subject.
    nbytes = length(itr.string.data)
    if idx > nbytes + 1
        return (m, nothing)
    end
    # NOTE(review): in overlap mode idx may fall inside a multi-byte
    # character of a non-ASCII subject; unchanged from the original.
    (m, match(itr.regex, itr.string, idx))
end
# Build an iterator over the matches of `re` in `str`. `ovr` selects
# overlapping iteration; it is false when omitted.
function each_match(re::Regex, str::String, ovr::Bool)
    RegexMatchIterator(re, str, ovr)
end
function each_match(re::Regex, str::String)
    RegexMatchIterator(re, str, false)
end
# miscellaneous methods that depend on Regex being defined
# Filter `d` in place using a predicate that tests only the key against `r`.
function filter!(r::Regex, d::Dict)
    key_matches = (k,v) -> ismatch(r, k)
    filter!(key_matches, d)
end

# Non-mutating variant: applies filter! to a copy of `d`.
function filter(r::Regex, d::Dict)
    filter!(r, copy(d))
end