Skip to content

Commit aafc00e

Browse files
committed
Add allow_dangling_amp configuration option and allow dangling &
1 parent d73f3df commit aafc00e

File tree

5 files changed

+127
-20
lines changed

5 files changed

+127
-20
lines changed

Changelog.md

+3
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@ XML specification. See the updated `custom_entities` example!
2424

2525
- [#766]: Allow parsing resolved entities as XML fragments and streaming events from them.
2626
- [#766]: Added new event `Event::GeneralRef` with content of [general entity].
27+
- [#766]: Added new configuration option `allow_dangling_amp` which allows having
28+
a `&` not followed by `;` in the textual data, which is required for some applications
29+
for compatibility reasons.
2730

2831
### Bug Fixes
2932

src/reader/buffered_reader.rs

+3-3
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ macro_rules! impl_buffered_source {
161161

162162
*position += read;
163163

164-
return ReadRefResult::UpToRef;
164+
return ReadRefResult::UpToRef(&buf[start..]);
165165
}
166166
Some(i) => {
167167
let is_end = available[i] == b';';
@@ -177,7 +177,7 @@ macro_rules! impl_buffered_source {
177177
return if is_end {
178178
ReadRefResult::Ref(&buf[start..])
179179
} else {
180-
ReadRefResult::UpToMarkup
180+
ReadRefResult::UpToMarkup(&buf[start..])
181181
};
182182
}
183183
None => {
@@ -191,7 +191,7 @@ macro_rules! impl_buffered_source {
191191
}
192192

193193
*position += read;
194-
ReadRefResult::UpToEof
194+
ReadRefResult::UpToEof(&buf[start..])
195195
}
196196

197197
#[inline]

src/reader/mod.rs

+50-12
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,32 @@ use crate::reader::state::ReaderState;
2424
#[cfg_attr(feature = "serde-types", derive(serde::Deserialize, serde::Serialize))]
2525
#[non_exhaustive]
2626
pub struct Config {
27+
/// Whether a lone ampersand character (without a paired semicolon) should be
28+
/// allowed in textual content. Unless enabled, in case of a dangling ampersand,
29+
/// the [`Error::IllFormed(UnclosedReference)`] is returned from read methods.
30+
///
31+
/// Default: `false`
32+
///
33+
/// # Example
34+
///
35+
/// ```
36+
/// # use quick_xml::events::{BytesRef, BytesText, Event};
37+
/// # use quick_xml::reader::Reader;
38+
/// # use pretty_assertions::assert_eq;
39+
/// let mut reader = Reader::from_str("text with & &amp; & alone");
40+
/// reader.config_mut().allow_dangling_amp = true;
41+
///
42+
/// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::new("text with ")));
43+
/// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::from_escaped("& ")));
44+
/// assert_eq!(reader.read_event().unwrap(), Event::GeneralRef(BytesRef::new("amp")));
45+
/// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::new(" ")));
46+
/// assert_eq!(reader.read_event().unwrap(), Event::Text(BytesText::from_escaped("& alone")));
47+
/// assert_eq!(reader.read_event().unwrap(), Event::Eof);
48+
/// ```
49+
///
50+
/// [`Error::IllFormed(UnclosedReference)`]: crate::errors::IllFormedError::UnclosedReference
51+
pub allow_dangling_amp: bool,
52+
2753
/// Whether unmatched closing tag names should be allowed. Unless enabled,
2854
/// in case of a dangling end tag, the [`Error::IllFormed(UnmatchedEndTag)`]
2955
/// is returned from read methods.
@@ -210,6 +236,7 @@ impl Config {
210236
impl Default for Config {
211237
fn default() -> Self {
212238
Self {
239+
allow_dangling_amp: false,
213240
allow_unmatched_ends: false,
214241
check_comments: false,
215242
check_end_names: true,
@@ -261,18 +288,29 @@ macro_rules! read_event_impl {
261288
Ok(Event::GeneralRef(BytesRef::wrap(&bytes[1..], $self.decoder())))
262289
}
263290
// Go to Done state
264-
ReadRefResult::UpToEof => {
291+
ReadRefResult::UpToEof(bytes) if $self.state.config.allow_dangling_amp => {
292+
$self.state.state = ParseState::Done;
293+
Ok(Event::Text($self.state.emit_text(bytes)))
294+
}
295+
ReadRefResult::UpToEof(_) => {
265296
$self.state.state = ParseState::Done;
266297
$self.state.last_error_offset = start;
267298
Err(Error::IllFormed(IllFormedError::UnclosedReference))
268299
}
269300
// Do not change state, stay in InsideRef
270-
ReadRefResult::UpToRef => {
301+
ReadRefResult::UpToRef(bytes) if $self.state.config.allow_dangling_amp => {
302+
Ok(Event::Text($self.state.emit_text(bytes)))
303+
}
304+
ReadRefResult::UpToRef(_) => {
271305
$self.state.last_error_offset = start;
272306
Err(Error::IllFormed(IllFormedError::UnclosedReference))
273307
}
274308
// Go to InsideMarkup state
275-
ReadRefResult::UpToMarkup => {
309+
ReadRefResult::UpToMarkup(bytes) if $self.state.config.allow_dangling_amp => {
310+
$self.state.state = ParseState::InsideMarkup;
311+
Ok(Event::Text($self.state.emit_text(bytes)))
312+
}
313+
ReadRefResult::UpToMarkup(_) => {
276314
$self.state.state = ParseState::InsideMarkup;
277315
$self.state.last_error_offset = start;
278316
Err(Error::IllFormed(IllFormedError::UnclosedReference))
@@ -997,13 +1035,13 @@ enum ReadRefResult<'r> {
9971035
/// Contains text block up to EOF. Neither end of reference (`;`), start of
9981036
/// another reference (`&`) or start of markup (`<`) characters was found.
9991037
/// Result includes start `&`.
1000-
UpToEof,
1038+
UpToEof(&'r [u8]),
10011039
/// Contains text block up to next possible reference (`&` character).
10021040
/// Result includes start `&`.
1003-
UpToRef,
1041+
UpToRef(&'r [u8]),
10041042
/// Contains text block up to start of markup (`<` character).
10051043
/// Result includes start `&`.
1006-
UpToMarkup,
1044+
UpToMarkup(&'r [u8]),
10071045
/// IO error occurred.
10081046
Err(io::Error),
10091047
}
@@ -1722,8 +1760,8 @@ mod test {
17221760
// ^= 2
17231761

17241762
match $source(&mut input).read_ref(buf, &mut position) $(.$await)? {
1725-
ReadRefResult::UpToEof => (),
1726-
x => panic!("Expected `UpToEof`, but got `{:?}`", x),
1763+
ReadRefResult::UpToEof(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&")),
1764+
x => panic!("Expected `UpToEof(_)`, but got `{:?}`", x),
17271765
}
17281766
assert_eq!(position, 2);
17291767
}
@@ -1736,8 +1774,8 @@ mod test {
17361774
// ^= 2
17371775

17381776
match $source(&mut input).read_ref(buf, &mut position) $(.$await)? {
1739-
ReadRefResult::UpToRef => (),
1740-
x => panic!("Expected `UpToRef`, but got `{:?}`", x),
1777+
ReadRefResult::UpToRef(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&")),
1778+
x => panic!("Expected `UpToRef(_)`, but got `{:?}`", x),
17411779
}
17421780
assert_eq!(position, 2);
17431781
}
@@ -1750,8 +1788,8 @@ mod test {
17501788
// ^= 3
17511789

17521790
match $source(&mut input).read_ref(buf, &mut position) $(.$await)? {
1753-
ReadRefResult::UpToMarkup => (),
1754-
x => panic!("Expected `UpToMarkup`, but got `{:?}`", x),
1791+
ReadRefResult::UpToMarkup(bytes) => assert_eq!(Bytes(bytes), Bytes(b"&")),
1792+
x => panic!("Expected `UpToMarkup(_)`, but got `{:?}`", x),
17551793
}
17561794
assert_eq!(position, 3);
17571795
}

src/reader/slice_reader.rs

+4-4
Original file line numberDiff line numberDiff line change
@@ -306,11 +306,11 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] {
306306
// Do not consume `&` because it may be lone and we would need to
307307
// return it as part of Text event
308308
Some(i) if self[i + 1] == b'&' => {
309-
let (_, rest) = self.split_at(i + 1);
309+
let (bytes, rest) = self.split_at(i + 1);
310310
*self = rest;
311311
*position += i as u64 + 1;
312312

313-
ReadRefResult::UpToRef
313+
ReadRefResult::UpToRef(bytes)
314314
}
315315
Some(i) => {
316316
let end = i + 1;
@@ -323,15 +323,15 @@ impl<'a> XmlSource<'a, ()> for &'a [u8] {
323323
if is_end {
324324
ReadRefResult::Ref(bytes)
325325
} else {
326-
ReadRefResult::UpToMarkup
326+
ReadRefResult::UpToMarkup(bytes)
327327
}
328328
}
329329
None => {
330330
let bytes = &self[..];
331331
*self = &[];
332332
*position += bytes.len() as u64;
333333

334-
ReadRefResult::UpToEof
334+
ReadRefResult::UpToEof(bytes)
335335
}
336336
}
337337
}

tests/reader-config.rs

+67-1
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,75 @@
66
//! Please keep tests sorted (exceptions are allowed if options are tightly related).
77
88
use quick_xml::errors::{Error, IllFormedError};
9-
use quick_xml::events::{BytesCData, BytesEnd, BytesPI, BytesStart, BytesText, Event};
9+
use quick_xml::events::{BytesCData, BytesEnd, BytesPI, BytesRef, BytesStart, BytesText, Event};
1010
use quick_xml::reader::Reader;
1111

12+
mod allow_dangling_amp {
13+
use super::*;
14+
use pretty_assertions::assert_eq;
15+
16+
#[test]
17+
fn false_() {
18+
let mut reader = Reader::from_str("&&&lt;&");
19+
reader.config_mut().allow_dangling_amp = false;
20+
21+
match reader.read_event() {
22+
Err(Error::IllFormed(cause)) => {
23+
assert_eq!(cause, IllFormedError::UnclosedReference);
24+
}
25+
x => panic!("Expected `Err(Syntax(_))`, but got `{:?}`", x),
26+
}
27+
assert_eq!(reader.error_position()..reader.buffer_position(), 0..1);
28+
29+
match reader.read_event() {
30+
Err(Error::IllFormed(cause)) => {
31+
assert_eq!(cause, IllFormedError::UnclosedReference);
32+
}
33+
x => panic!("Expected `Err(Syntax(_))`, but got `{:?}`", x),
34+
}
35+
assert_eq!(reader.error_position()..reader.buffer_position(), 1..2);
36+
37+
assert_eq!(
38+
reader.read_event().unwrap(),
39+
Event::GeneralRef(BytesRef::new("lt"))
40+
);
41+
match reader.read_event() {
42+
Err(Error::IllFormed(cause)) => {
43+
assert_eq!(cause, IllFormedError::UnclosedReference);
44+
}
45+
x => panic!("Expected `Err(Syntax(_))`, but got `{:?}`", x),
46+
}
47+
assert_eq!(reader.error_position()..reader.buffer_position(), 6..7);
48+
49+
assert_eq!(reader.read_event().unwrap(), Event::Eof);
50+
assert_eq!(reader.error_position()..reader.buffer_position(), 6..7);
51+
}
52+
53+
#[test]
54+
fn true_() {
55+
let mut reader = Reader::from_str("&&&lt;&");
56+
reader.config_mut().allow_dangling_amp = true;
57+
58+
assert_eq!(
59+
reader.read_event().unwrap(),
60+
Event::Text(BytesText::from_escaped("&"))
61+
);
62+
assert_eq!(
63+
reader.read_event().unwrap(),
64+
Event::Text(BytesText::from_escaped("&"))
65+
);
66+
assert_eq!(
67+
reader.read_event().unwrap(),
68+
Event::GeneralRef(BytesRef::new("lt"))
69+
);
70+
assert_eq!(
71+
reader.read_event().unwrap(),
72+
Event::Text(BytesText::from_escaped("&"))
73+
);
74+
assert_eq!(reader.read_event().unwrap(), Event::Eof);
75+
}
76+
}
77+
1278
mod allow_unmatched_ends {
1379
use super::*;
1480
use pretty_assertions::assert_eq;

0 commit comments

Comments
 (0)