Skip to content

Commit

Permalink
Allow any XML mimetype in Connection
Browse files Browse the repository at this point in the history
Fixes #2059
  • Loading branch information
jhy committed Nov 23, 2023
1 parent bc79810 commit 58521a4
Show file tree
Hide file tree
Showing 4 changed files with 218 additions and 6 deletions.
4 changes: 4 additions & 0 deletions CHANGES
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ Release 1.17.1 [PENDING]
which caused that pinning.
<https://github.com/jhy/jsoup/issues/2054>

* Improvement: in Jsoup.Connect, allow any XML mimetype as a supported mimetype. This was previously limited to
{application|text}/xml. This enables, for example, fetching SVGs with an image/svg+xml mimetype without having to
disable mimetype validation.

* Bugfix: when outputting with XML syntax, HTML elements that were parsed as data nodes (<script> and <style>) should
be emitted as CDATA nodes, so that they can be parsed correctly by an XML parser.
<https://github.com/jhy/jsoup/pull/1720>
Expand Down
6 changes: 3 additions & 3 deletions src/main/java/org/jsoup/helper/HttpConnection.java
Original file line number Diff line number Diff line change
Expand Up @@ -798,9 +798,9 @@ public static class Response extends HttpConnection.Base<Connection.Response> im
private final HttpConnection.Request req;

/*
* Matches XML content types (like text/xml, application/xhtml+xml;charset=UTF8, etc)
* Matches XML content types (like text/xml, image/svg+xml, application/xhtml+xml;charset=UTF8, etc)
*/
private static final Pattern xmlContentTypeRxp = Pattern.compile("(application|text)/\\w*\\+?xml.*");
private static final Pattern xmlContentTypeRxp = Pattern.compile("(\\w+)/\\w*\\+?xml.*");

/**
<b>Internal only! </b>Creates a dummy HttpConnection.Response, useful for testing. All actual responses
Expand Down Expand Up @@ -885,7 +885,7 @@ else if (methodHasBody)
&& !contentType.startsWith("text/")
&& !xmlContentTypeRxp.matcher(contentType).matches()
)
throw new UnsupportedMimeTypeException("Unhandled content type. Must be text/*, application/xml, or application/*+xml",
throw new UnsupportedMimeTypeException("Unhandled content type. Must be text/*, */xml, or */*+xml",
contentType, req.url().toString());

// switch to the XML parser if the content type is XML and a parser was not explicitly set
Expand Down
29 changes: 26 additions & 3 deletions src/test/java/org/jsoup/integration/ConnectTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,16 @@
import org.jsoup.HttpStatusException;
import org.jsoup.Jsoup;
import org.jsoup.Connection.Method;
import org.jsoup.UnsupportedMimeTypeException;
import org.jsoup.helper.DataUtil;
import org.jsoup.helper.W3CDom;
import org.jsoup.integration.servlets.*;
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.FormElement;
import org.jsoup.nodes.Node;
import org.jsoup.nodes.XmlDeclaration;
import org.jsoup.parser.HtmlTreeBuilder;
import org.jsoup.parser.Parser;
import org.jsoup.parser.XmlTreeBuilder;
Expand Down Expand Up @@ -509,17 +512,17 @@ public void handlesWrongContentLengthDuringBufferedRead() throws IOException {
}

@Test
public void testBinaryContentTypeThrowsException() {
public void testBinaryContentTypeThrowsException() throws IOException {
Connection con = Jsoup.connect(FileServlet.urlTo("/htmltests/thumb.jpg"));
con.data(FileServlet.ContentTypeParam, "image/jpeg");

boolean threw = false;
try {
con.execute();
Document doc = con.response().parse();
} catch (IOException e) {
} catch (UnsupportedMimeTypeException e) {
threw = true;
assertEquals("Unhandled content type. Must be text/*, application/xml, or application/*+xml", e.getMessage());
assertEquals("Unhandled content type. Must be text/*, */xml, or */*+xml", e.getMessage());
}
assertTrue(threw);
}
Expand All @@ -540,6 +543,26 @@ public void testBinaryContentTypeThrowsException() {
assertEquals(Document.OutputSettings.Syntax.xml, doc.outputSettings().syntax());
}

/**
 * Requests the OSI logo SVG with the content-type parameter set to {@code image/svg+xml} and verifies that
 * the response is accepted, that parsing switches to the XML tree builder with XML output syntax, and that
 * the XML declaration and SVG elements are readable from the parsed document.
 *
 * @throws IOException if the fetch fails
 */
@Test
public void imageXmlMimeType() throws IOException {
    final String svgMimeType = "image/svg+xml";
    final String svgUrl = FileServlet.urlTo("/htmltests/osi-logo.svg");

    Connection connection = Jsoup.connect(svgUrl).data(FileServlet.ContentTypeParam, svgMimeType);
    Document svgDoc = connection.get();

    // the response must report the requested mimetype, and the document must have been parsed as XML
    assertEquals(svgMimeType, connection.response().contentType());
    assertTrue(svgDoc.parser().getTreeBuilder() instanceof XmlTreeBuilder);
    assertEquals(Document.OutputSettings.Syntax.xml, svgDoc.outputSettings().syntax());

    // the first node is expected to be the XML declaration; the cast fails the test otherwise
    Node leadingNode = svgDoc.firstChild();
    XmlDeclaration xmlDecl = (XmlDeclaration) leadingNode;
    assertEquals("no", xmlDecl.attr("standalone"));

    // case-sensitive tag name and namespaced attribute should be preserved
    Element flowRootEl = svgDoc.expectFirst("svg").expectFirst("flowRoot");
    assertEquals("flowRoot", flowRootEl.tagName());
    assertEquals("preserve", flowRootEl.attr("xml:space"));
}

@Test
public void canFetchBinaryAsBytes() throws IOException {
Connection.Response res = Jsoup.connect(FileServlet.urlTo("/htmltests/thumb.jpg"))
Expand Down
Loading

0 comments on commit 58521a4

Please sign in to comment.