Skip to content

Commit

Permalink
Adding a utility to provide test data for specific unicode handling i…
Browse files Browse the repository at this point in the history
…ssues. (#1044)
  • Loading branch information
ldhardy authored Jan 17, 2025
1 parent 59155fd commit ee88bcf
Show file tree
Hide file tree
Showing 3 changed files with 343 additions and 0 deletions.
12 changes: 12 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
<dep.guava.version>32.1.2-jre</dep.guava.version>
<dep.httpclient.version>5.2.1</dep.httpclient.version>
<dep.httpcore.version>5.2.1</dep.httpcore.version>
<dep.icu4j.version>73.2</dep.icu4j.version>
<dep.jackson.version>2.15.2</dep.jackson.version>
<dep.jakarta.xml.bind-api.version>3.0.1</dep.jakarta.xml.bind-api.version>
<dep.janino.version>3.1.12</dep.janino.version>
Expand Down Expand Up @@ -305,6 +306,12 @@
<type>pom</type>
<scope>import</scope>
</dependency>
<dependency>
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
<version>${dep.icu4j.version}</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter</artifactId>
Expand Down Expand Up @@ -523,6 +530,11 @@
<artifactId>error_prone_annotations</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>com.ibm.icu</groupId>
<artifactId>icu4j</artifactId>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.glassfish.jersey.test-framework.providers</groupId>
<artifactId>jersey-test-framework-provider-jetty</artifactId>
Expand Down
188 changes: 188 additions & 0 deletions src/test/java/emissary/test/util/ComplexUnicodeSamples.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
package emissary.test.util;

import java.util.HashMap;
import java.util.Map;

/**
* A class that provides some tricky samples. These samples can be used in testing to make sure our code and the 3rd
* party libraries we choose can handle unusual cases.
* <p>
* Each example contains detailed explanation and links to useful reference materials.
*/
public final class ComplexUnicodeSamples {

/** Private so this class, which exposes only static helpers, cannot be instantiated. */
private ComplexUnicodeSamples() {}

/**
 * Builds a string holding a single graphical unit (an emoji) that is made of 5 Unicode scalar values.
 * <p>
 * What the user perceives is one face-palming emoji, and a user would expect to hit the arrow key once to move the
 * cursor across it. Encoded as UTF-8 the string is 17 bytes long: one emoji, 17 UTF-8 bytes.
 * <p>
 * SCALAR 1: The base character, meaning a person face palming.
 * <p>
 * SCALAR 2: An emoji skin tone modifier. Without it the person would have a cartoonish yellow color; the modifier
 * changes the color of the person's skin (and, in practice, also the color of the person's hair).
 * <p>
 * SCALAR 3 and 4: By default the gender of the person is undefined, and vendors choose different default appearances
 * (e.g. Apple defaults to what they consider a male appearance, Google to what they consider a female appearance).
 * These two scalar values select a male-typical appearance regardless of font and vendor. Unlike the skin tone, which
 * is an emoji-specific modifier, the gender is specified with an emoji-predating gender symbol (MALE SIGN) that is
 * explicitly ligated to the (skin-toned) face-palming person using the ZERO WIDTH JOINER.
 * <p>
 * SCALAR 5: VARIATION SELECTOR-16 makes it explicit that a multicolor emoji rendering is wanted instead of a
 * monochrome dingbat rendering.
 *
 * @return the Java string containing this one face-palming male emoji with a not-yellow skin tone.
 *
 * @see ComplexUnicodeSamplesTest#demonstrateMetadataAboutFacePalmDude()
 * @see <a href="https://hsivonen.fi/string-length/">https://hsivonen.fi/string-length/</a>
 */
public static String getFacePalmingMaleControlSkintone() {

    // Size of each scalar below, as code units / bytes per encoding:
    //
    //                                   UTF-32      UTF-16      UTF-8
    //                                 units bytes units bytes units bytes
    // supplementary (SCALAR 1, 2)       1     4     2     4     4     4
    // basic plane   (SCALAR 3, 4, 5)    1     4     1     2     3     3

    // SCALAR 1: U+1F926 FACE PALM, written as its UTF-16 surrogate pair.
    // For how to represent it in Java see https://www.fileformat.info/info/unicode/char/1f926/index.htm
    final String facePalm = "\uD83E\uDD26";

    // SCALAR 2: U+1F3FC EMOJI MODIFIER FITZPATRICK TYPE-3, written as its UTF-16 surrogate pair.
    // https://www.fileformat.info/info/unicode/char/1f3fc/index.htm
    final String skinTone = "\uD83C\uDFFC";

    // SCALAR 3: U+200D ZERO WIDTH JOINER
    final String zeroWidthJoiner = "\u200D";

    // SCALAR 4: U+2642 MALE SIGN
    final String maleSign = "\u2642";

    // SCALAR 5: U+FE0F VARIATION SELECTOR-16
    final String variationSelector16 = "\uFE0F";

    // The order matters: base, modifier, joiner, gender sign, variation selector.
    return String.join("", facePalm, skinTone, zeroWidthJoiner, maleSign, variationSelector16);
}


/**
 * Supplies XML documents paired with the text that should be extracted from them. The map is useful for testing that
 * our code, and any 3rd party XML library we are using, handles unicode within XML correctly.
 *
 * @return a new mutable map where each key is an XML node containing XML-escaped surrogate pair unicode values and
 *         each value is the properly extracted Java string with the unicode un-escaped.
 * @see <a href=
 *      "https://github.com/FasterXML/woodstox/pull/174/files">https://github.com/FasterXML/woodstox/pull/174/files</a>
 */
public static Map<String, String> getXmlSamples() {
    // Each row is {XML input, expected extracted text}.
    // See https://github.com/FasterXML/woodstox/pull/174/files
    final String[][] samples = {
            // Numeric surrogate pairs
            {"<root>surrogate pair: &#55356;&#57221;.</root>", "surrogate pair: \uD83C\uDF85."},
            // Hex and numeric surrogate pairs
            {"<root>surrogate pair: &#xD83C;&#57221;.</root>", "surrogate pair: \uD83C\uDF85."},
            // Numeric and hex surrogate pairs
            {"<root>surrogate pair: &#55356;&#xDF85;.</root>", "surrogate pair: \uD83C\uDF85."},
            // Hex surrogate pairs
            {"<root>surrogate pair: &#xD83C;&#xDF85;.</root>", "surrogate pair: \uD83C\uDF85."},
            // Two surrogate pairs
            {"<root>surrogate pair: &#55356;&#57221;&#55356;&#57220;.</root>", "surrogate pair: \uD83C\uDF85\uD83C\uDF84."},
            // Surrogate pair and simple entity
            {"<root>surrogate pair: &#55356;&#57221;&#8482;.</root>", "surrogate pair: \uD83C\uDF85\u2122."},
    };

    Map<String, String> expectedTextByXml = new HashMap<>();
    for (String[] sample : samples) {
        expectedTextByXml.put(sample[0], sample[1]);
    }
    return expectedTextByXml;
}

/**
 * Counts user-perceived characters with the JDK's own character {@link java.text.BreakIterator}.
 * <p>
 * This will not work properly in versions of Java earlier than Java 20. Once we get to Java 20, this method should
 * work properly.
 * <p>
 * Character boundary analysis allows users to interact with characters as they expect to, for example, when moving the
 * cursor through a text string. It provides correct navigation through character strings regardless of how the
 * character is stored. The boundaries returned may be those of supplementary characters, combining character
 * sequences, or ligature clusters. For example, an accented character might be stored as a base character and a
 * diacritical mark. What users consider to be a character can differ between languages.
 *
 * @param text the string to analyze.
 * @return the count of user-perceived graphemes as determined by the character break iterator. In versions of Java
 *         earlier than Java 20, this will not function as expected.
 * @see <a href=
 *      "https://horstmann.com/unblog/2023-10-03/index.html">https://horstmann.com/unblog/2023-10-03/index.html</a> -
 *      Scroll to the section titled "Just Use Strings"
 */
public static int countGraphemesUsingJavaBuiltInBreakIterator(String text) {
    final java.text.BreakIterator boundaries = java.text.BreakIterator.getCharacterInstance();
    boundaries.setText(text);

    // Every boundary reached after the start position closes exactly one grapheme.
    int graphemes = 0;
    while (boundaries.next() != java.text.BreakIterator.DONE) {
        graphemes++;
    }
    return graphemes;
}

/**
 * Counts user-perceived characters using the industry-standard ICU4J library provided by IBM.
 * <p>
 * NOTE: Updating the version of this library might change which unicode database is referenced for these calculations.
 * We should strive to keep this library as up-to-date as possible in both test and production source code.
 *
 * @param text the string to analyze
 * @return a count of how many user-perceived glyphs/graphemes are present in the string. If you placed a cursor
 *         directly to the left of the string (or to the right for a right-to-left string) and pressed the arrow key to
 *         traverse it, this is how many times you would need to press the arrow key to reach the right-most end of the
 *         string (or the left-most end for R-to-L strings).
 */
public static int countGraphemesUsingIcu4J(String text) {
    final com.ibm.icu.text.BreakIterator boundaries = com.ibm.icu.text.BreakIterator.getCharacterInstance();
    boundaries.setText(text);

    // Every boundary reached after the start position closes exactly one grapheme.
    int graphemes = 0;
    while (boundaries.next() != com.ibm.icu.text.BreakIterator.DONE) {
        graphemes++;
    }
    return graphemes;
}

}
143 changes: 143 additions & 0 deletions src/test/java/emissary/test/util/ComplexUnicodeSamplesTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
package emissary.test.util;

import com.ibm.icu.text.Normalizer2;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.condition.DisabledForJreRange;
import org.junit.jupiter.api.condition.EnabledForJreRange;
import org.junit.jupiter.api.condition.JRE;

import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

class ComplexUnicodeSamplesTest {

/**
 * Interesting observations about the face palm dude emoji.
 * <p>
 * This test looks at four different lengths of the same string:
 *
 * <ul>
 * <li>Number of UTF-8 code units (17 in this case)</li>
 * <li>Number of UTF-16 code units (7 in this case)</li>
 * <li>Number of UTF-32 code units or Unicode scalar values (5 in this case)</li>
 * <li>Number of extended grapheme clusters (1 in this case)</li>
 * </ul>
 * Given a valid Unicode string and a version of Unicode, all of the above are well-defined and it holds that each item
 * higher on the list is greater than or equal to the items lower on the list.
 * <p>
 * One of these is not like the others, though: The first three numbers have an unchanging definition for any valid
 * Unicode string whether it contains currently assigned scalar values or whether it is from the future and contains
 * unassigned scalar values as far as software written today is aware. Also, computing the first three lengths does not
 * involve lookups from the Unicode database. However, the last item depends on the Unicode version and involves lookups
 * from the Unicode database. If a string contains scalar values that are unassigned as far as the copy of the Unicode
 * database that the program is using is aware, the program will potentially overcount extended grapheme clusters in the
 * string compared to a program whose copy of the Unicode database is newer and has assignments for those scalar values
 * (and some of those assignments turn out to be combining characters).
 */
@Test
void demonstrateMetadataAboutFacePalmDude() {

    String facepalm = ComplexUnicodeSamples.getFacePalmingMaleControlSkintone();

    // SCALAR 1 is 4 UTF8 bytes
    // SCALAR 2 is 4 UTF8 bytes
    // SCALAR 3 is 3 UTF8 bytes
    // SCALAR 4 is 3 UTF8 bytes
    // SCALAR 5 is 3 UTF8 bytes
    // TOTAL : 17 UTF8 bytes
    assertEquals(17, facepalm.getBytes(StandardCharsets.UTF_8).length);
    // Decode with the same explicit charset used to encode. new String(byte[]) would use the platform default
    // charset, so the round trip would fail wherever that default is not UTF-8.
    assertEquals(facepalm, new String(facepalm.getBytes(StandardCharsets.UTF_8), StandardCharsets.UTF_8));

    // SCALAR 1 is 4 UTF16 bytes
    // SCALAR 2 is 4 UTF16 bytes
    // SCALAR 3 is 2 UTF16 bytes
    // SCALAR 4 is 2 UTF16 bytes
    // SCALAR 5 is 2 UTF16 bytes
    // TOTAL : 14 UTF16 bytes if no BOM is needed
    // Java typically defaults to UTF-16BE
    assertEquals(14, facepalm.getBytes(StandardCharsets.UTF_16BE).length);
    assertEquals(facepalm, new String(facepalm.getBytes(StandardCharsets.UTF_16BE), StandardCharsets.UTF_16BE));
    assertEquals(14, facepalm.getBytes(StandardCharsets.UTF_16LE).length);
    assertEquals(facepalm, new String(facepalm.getBytes(StandardCharsets.UTF_16LE), StandardCharsets.UTF_16LE));

    // When the endianness isn't specified, 2 bytes are used for the byte order marker
    // The BOM is a special character (U+FEFF) used to indicate the endianness (byte order)
    // of a UTF-16 encoded file or stream. In UTF-16, the BOM can be either:
    // FE FF (Big Endian)
    // FF FE (Little Endian)
    assertEquals(16, facepalm.getBytes(StandardCharsets.UTF_16).length);
    assertEquals(facepalm, new String(facepalm.getBytes(StandardCharsets.UTF_16), StandardCharsets.UTF_16));

    // 5 UTF-32 characters at 4 bytes per character
    // StandardCharsets has no UTF-32 constant, so look the charset up once by name.
    Charset utf32 = Charset.forName("UTF-32");
    assertEquals(20, facepalm.getBytes(utf32).length);
    assertEquals(facepalm, new String(facepalm.getBytes(utf32), utf32));

    // single byte encoding is not going to produce what you want
    assertEquals(5, facepalm.getBytes(StandardCharsets.ISO_8859_1).length);
    assertNotEquals(facepalm, new String(facepalm.getBytes(StandardCharsets.ISO_8859_1), StandardCharsets.ISO_8859_1));

    // 5 Unicode scalar values (code points), even though the string has 7 UTF-16 code units
    assertEquals(5, facepalm.codePointCount(0, facepalm.length()));

    // ICU4J BreakIterator gets it right
    assertEquals(1, ComplexUnicodeSamples.countGraphemesUsingIcu4J(facepalm));

    // See
    // demonstrateMetadataAboutFacePalmDudeForJava20()
    // and
    // demonstrateMetadataAboutFacePalmDudePriorToJava20()
    // to see how using the intrinsic java BreakIterator doesn't
    // get it right until Java 20.

    // It's already normalized in its natural form, for both the decomposing ...
    Normalizer2 nfcDecomp = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE);
    Normalizer2 nfkcDecomp = Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.DECOMPOSE);
    assertTrue(nfcDecomp.isNormalized(facepalm));
    assertTrue(nfkcDecomp.isNormalized(facepalm));

    // ... and the composing normalizers.
    Normalizer2 nfcComp = Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.COMPOSE);
    Normalizer2 nfkcComp = Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE);
    assertTrue(nfcComp.isNormalized(facepalm));
    assertTrue(nfkcComp.isNormalized(facepalm));

}

@Test
@EnabledForJreRange(min = JRE.JAVA_20, disabledReason = "This test only valid for Java 20 and later.")
void demonstrateMetadataAboutFacePalmDudeForJava20() {
    // On Java 20 and later the JDK's own BreakIterator is expected to see the whole emoji as one grapheme.
    final int graphemes = ComplexUnicodeSamples.countGraphemesUsingJavaBuiltInBreakIterator(
            ComplexUnicodeSamples.getFacePalmingMaleControlSkintone());
    assertEquals(1, graphemes);
}

@Test
@DisabledForJreRange(min = JRE.JAVA_20, disabledReason = "This test only valid for Java versions up to not including Java 20.")
void demonstrateMetadataAboutFacePalmDudePriorToJava20() {
    // The correct answer is 1, but before Java 20 the JDK's own BreakIterator reports 4 for this emoji.
    final int graphemes = ComplexUnicodeSamples.countGraphemesUsingJavaBuiltInBreakIterator(
            ComplexUnicodeSamples.getFacePalmingMaleControlSkintone());
    assertEquals(4, graphemes);
}

@Test
@EnabledForJreRange(min = JRE.JAVA_17, disabledReason = "This test only valid for Java 17 and later.")
void demonstrateMetadataAboutFacePalmDudeForJava17AndLater() {
    final int copies = 27;
    final String repeated = ComplexUnicodeSamples.getFacePalmingMaleControlSkintone().repeat(copies);

    // Splitting on the regex grapheme-cluster boundary should yield exactly one piece per emoji.
    final String[] clusters = repeated.split("\\b{g}");
    assertEquals(copies, clusters.length);
}

@Test
@DisabledForJreRange(min = JRE.JAVA_17, disabledReason = "This test only valid for Java versions up to not including Java 17.")
void demonstrateMetadataAboutFacePalmDudePriorToJava17() {
    final int copies = 27;
    final String repeated = ComplexUnicodeSamples.getFacePalmingMaleControlSkintone().repeat(copies);

    // It should be 27 pieces, but it's wrong until Java 17: each emoji is split into 3 pieces.
    final String[] clusters = repeated.split("\\b{g}");
    assertEquals(copies * 3, clusters.length);
}


}

0 comments on commit ee88bcf

Please sign in to comment.