Skip to content

Commit

Permalink
Merge pull request #3 from mihnita/master
Browse files Browse the repository at this point in the history
Fix serialization and parsing for beyond BMP, add tests
  • Loading branch information
stefanhaustein authored Aug 23, 2017
2 parents 49758b4 + e80798f commit a4e619c
Show file tree
Hide file tree
Showing 3 changed files with 125 additions and 13 deletions.
8 changes: 4 additions & 4 deletions src/org/kxml2/io/KXmlParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -589,17 +589,17 @@ private final void push(int c) {

isWhitespace &= c <= ' ';

if (txtPos == txtBuf.length) {
if (txtPos + 1 >= txtBuf.length) { // +1 to have enough space for 2 surrogates, if needed
char[] bigger = new char[txtPos * 4 / 3 + 4];
System.arraycopy(txtBuf, 0, bigger, 0, txtPos);
txtBuf = bigger;
}

if (c > 0xffff) {
if (c > 0xffff) {
// write high Unicode value as surrogate pair
int offset = c - 0x010000;
txtBuf[txtPos++] = (char)((offset >>> 10) + 0xd800);
txtBuf[txtPos++] = (char)((offset & 0x3ff) + 0xdc00);
txtBuf[txtPos++] = (char)((offset >>> 10) + 0xd800); // high surrogate
txtBuf[txtPos++] = (char)((offset & 0x3ff) + 0xdc00); // low surrogate
} else {
txtBuf[txtPos++] = (char) c;
}
Expand Down
21 changes: 13 additions & 8 deletions src/org/kxml2/io/KXmlSerializer.java
Original file line number Diff line number Diff line change
Expand Up @@ -119,14 +119,19 @@ private final void writeEscaped(String s, int quot)
//if(c < ' ')
// throw new IllegalArgumentException("Illegal control code:"+((int) c));

if (c >= 0xd800 && c <= 0xdfff && i < s.length() - 1) {
// write surrogate pair as single code value
i++;
int h = c;
int l = s.charAt(i);
int n = ((h - 0xd800) << 10) + (l - 0xdc00) + 0x010000;
writer.write("&#" + n + ";");
} if (c >= ' ' && c !='@' && (c < 127 || unicode)) {
if (i < s.length() - 1) {
char cLow = s.charAt(i + 1);
// c is high surrogate and cLow is low surrogate
if (c >= 0xd800 && c <= 0xdbff && cLow >= 0xdc00 && cLow <= 0xdfff) {
// write surrogate pair as single code point
int n = ((c - 0xd800) << 10) + (cLow - 0xdc00) + 0x010000;
writer.write("&#" + n + ";");
i++; // Skip the low surrogate
break;
}
// Does nothing smart about orphan surrogates, just output them "as is"
}
if (c >= ' ' && c !='@' && (c < 127 || unicode)) {
writer.write(c);
} else {
writer.write("&#" + ((int) c) + ";");
Expand Down
109 changes: 108 additions & 1 deletion test/TestWb.java
Original file line number Diff line number Diff line change
@@ -1,14 +1,22 @@
// Test case contributed by Andy Bailey

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;

import java.io.StringReader;
import java.io.StringWriter;
import java.nio.charset.StandardCharsets;
import junit.framework.TestCase;

import org.kxml2.wap.Wbxml;
import org.kxml2.wap.WbxmlSerializer;
import org.xmlpull.v1.XmlPullParser;
import org.xmlpull.v1.XmlPullParserException;
import org.xmlpull.v1.XmlPullParserFactory;
import org.xmlpull.v1.XmlSerializer;

public class TestWb extends TestCase
{
Expand Down Expand Up @@ -41,5 +49,104 @@ public void testWb() throws IllegalArgumentException,IllegalStateException, File
long len=file.length();
System.out.println(len+" bytes");
}

// A single character outside the Basic Multilingual Plane, written as its two UTF-16
// code units (high surrogate 0xd83d, low surrogate 0xde48), i.e. code point U+1F648.
// Using hex code units to be sure that the system charset does not affect the behavior
private static final String EMOJI_CHAR = "\ud83d\ude48";

// Parser input for the beyond-BMP tests. The same character appears literally
// (EMOJI_CHAR), as a hexadecimal character reference (&#x1f648;) and as a decimal
// character reference (&#128584;), inside attribute values, a CDATA section,
// a comment and element text.
private static final String XML_TO_PARSE = "<?xml version=\"1.0\" encoding=\"utf-8\"?>\n"
+ "<resources attr='" + EMOJI_CHAR + "' attr_hex='&#x1f648;' attr_dec='&#128584;'>\n"
+ " <![CDATA[This is CDATA, with " + EMOJI_CHAR + ".]]>\n"
+ " <!-- This is a comment, with " + EMOJI_CHAR + ", to see how it goes -->\n"
+ " <string>Emoji: " + EMOJI_CHAR + "&#x1f648;&#128584;</string>\n"
+ "</resources>\n";

/**
 * Steps through the parser token by token (via {@code nextToken()}) until
 * {@code END_DOCUMENT}, asserting along the way that {@link #EMOJI_CHAR} shows up
 * in each token kind inspected here.
 *
 * <ul>
 *   <li>CDSECT and COMMENT: the token text must contain the character.</li>
 *   <li>TEXT: text that is not purely newlines, carriage returns, tabs or spaces
 *       must contain the character.</li>
 *   <li>ENTITY_REF: the token text must equal the character.</li>
 *   <li>START_TAG: every attribute value must equal the character.</li>
 * </ul>
 *
 * @param xpp parser whose input has already been set
 * @throws XmlPullParserException propagated from the parser
 * @throws IOException propagated from the parser
 */
private static void checkParseBeyondBmp(XmlPullParser xpp) throws XmlPullParserException, IOException {
    int event = xpp.getEventType();
    while (event != XmlPullParser.END_DOCUMENT) {
        if (event == XmlPullParser.CDSECT || event == XmlPullParser.COMMENT) {
            assertTrue(xpp.getText().contains(EMOJI_CHAR));
        } else if (event == XmlPullParser.TEXT) {
            // Whitespace-only runs (e.g. indentation between elements) are not checked.
            final String withoutWhitespace = xpp.getText().replaceAll("[\\n\\r\\t ]+", "");
            if (!withoutWhitespace.isEmpty()) {
                assertTrue(xpp.getText().contains(EMOJI_CHAR));
            }
        } else if (event == XmlPullParser.ENTITY_REF) {
            assertEquals(EMOJI_CHAR, xpp.getText());
        } else if (event == XmlPullParser.START_TAG) {
            for (int attr = 0; attr < xpp.getAttributeCount(); attr++) {
                assertEquals(EMOJI_CHAR, xpp.getAttributeValue(attr));
            }
        }

        // Advance, then re-read the current event for the next round.
        xpp.nextToken();
        event = xpp.getEventType();
    }
}

/**
 * Feeds {@link #XML_TO_PARSE} to a factory-created pull parser through a
 * {@link StringReader} and runs the shared beyond-BMP parse checks on it.
 */
public void testParseBeyondBmpFromReader() throws XmlPullParserException, IOException {
    final XmlPullParser parser = XmlPullParserFactory.newInstance().newPullParser();
    final StringReader source = new StringReader(XML_TO_PARSE);
    parser.setInput(source);

    checkParseBeyondBmp(parser);
}

/**
 * Feeds the UTF-8 bytes of {@link #XML_TO_PARSE} to a factory-created pull parser
 * through a {@link ByteArrayInputStream} (declared encoding "utf-8") and runs the
 * shared beyond-BMP parse checks on it.
 */
public void testParseBeyondBmpInputStream() throws XmlPullParserException, IOException {
    final XmlPullParser parser = XmlPullParserFactory.newInstance().newPullParser();
    final byte[] utf8Bytes = XML_TO_PARSE.getBytes(StandardCharsets.UTF_8);
    final ByteArrayInputStream source = new ByteArrayInputStream(utf8Bytes);
    parser.setInput(source, "utf-8");

    checkParseBeyondBmp(parser);
}

// Output the serializer tests compare against: the comment and the CDATA section
// carry EMOJI_CHAR literally, while the attribute value and the element text carry
// it as the single decimal character reference &#128584;.
private static final String EXPECTED_XML_SERIALIZATION = ""
+ "<!--Emoji: " + EMOJI_CHAR + "-->\n"
+ "<![CDATA[Emoji: " + EMOJI_CHAR + "]]>\n"
+ "<string attr=\"&#128584;\">Emoji: &#128584;</string>";

/**
 * Drives the given serializer through a fixed sequence that places
 * {@link #EMOJI_CHAR} in a comment, a CDATA section, an attribute value and
 * element text, then ends the document.
 *
 * @param serializer serializer whose output has already been set
 * @throws IOException propagated from the serializer
 */
private static void checkSerializeBeyondBmp(XmlSerializer serializer) throws IOException {
    final String emojiText = "Emoji: " + EMOJI_CHAR;
    final String lineBreak = "\n";

    serializer.comment(emojiText);
    serializer.text(lineBreak);
    serializer.cdsect(emojiText);
    serializer.text(lineBreak);

    // <string attr="...">...</string>, chained through the XmlSerializer return values.
    serializer.startTag(null, "string")
            .attribute(null, "attr", EMOJI_CHAR)
            .text(emojiText)
            .endTag(null, "string");

    serializer.endDocument();
}

/**
 * Serializes the beyond-BMP sample into a {@link ByteArrayOutputStream} using the
 * "utf-8" encoding and compares the decoded bytes with
 * {@link #EXPECTED_XML_SERIALIZATION}.
 */
public void testSerializeBeyondBmpToOutputStream() throws XmlPullParserException, IOException {
    final XmlPullParserFactory pullFactory = XmlPullParserFactory.newInstance();
    pullFactory.setNamespaceAware(true);
    final XmlSerializer xmlOut = pullFactory.newSerializer();

    final ByteArrayOutputStream sink = new ByteArrayOutputStream();
    xmlOut.setOutput(sink, "utf-8");
    checkSerializeBeyondBmp(xmlOut);

    final String actual = sink.toString("utf-8");
    assertEquals(EXPECTED_XML_SERIALIZATION, actual);
}

/**
 * Serializes the beyond-BMP sample into a {@link StringWriter} and compares the
 * collected text with {@link #EXPECTED_XML_SERIALIZATION}.
 */
public void testSerializeBeyondBmpToWriter() throws XmlPullParserException, IOException {
    final XmlPullParserFactory pullFactory = XmlPullParserFactory.newInstance();
    pullFactory.setNamespaceAware(true);
    final XmlSerializer xmlOut = pullFactory.newSerializer();

    final StringWriter sink = new StringWriter();
    xmlOut.setOutput(sink);
    checkSerializeBeyondBmp(xmlOut);

    final String actual = sink.toString();
    assertEquals(EXPECTED_XML_SERIALIZATION, actual);
}
}

0 comments on commit a4e619c

Please sign in to comment.