Skip to content

Commit

Permalink
Split addition comparisons (#942)
Browse files Browse the repository at this point in the history
* Improvements to the invariant test language

* Split the addition comparison test files
  • Loading branch information
eggrobin authored Oct 2, 2024
1 parent 0235209 commit 5b4d24f
Show file tree
Hide file tree
Showing 9 changed files with 111 additions and 74 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -563,13 +563,11 @@ private static void propertywiseCorrespondenceLine(
final List<String> errorMessageLines = new ArrayList<>();
final List<UnicodeSet> sets = new ArrayList<>();
sets.add(firstSet);
expectToken(":", pp, source);

// Index of the first set of multi-character strings (and of the first multi-character
// reference string).
// Index of the first set of value-only sets (prefixed by ⧴ rather than :).
// Only value-only sets may contain multi-character strings.
// This is `m` in the documentation in UnicodeInvariantTest.txt.
int firstMultiCharacterIndex = -1;
do {
while (Lookahead.oneToken(pp, source).accept(":")) {
final var set = parseUnicodeSet(source, pp);
if (set.size() != firstSet.size()) {
throw new BackwardParseException(
Expand All @@ -580,24 +578,29 @@ private static void propertywiseCorrespondenceLine(
+ ")",
pp.getIndex());
}
if (set.hasStrings() && set.strings().size() != set.size()) {
if (set.hasStrings()) {
throw new BackwardParseException(
"Sets should be all strings or all code points for property correspondence",
"Strings are only allowed in value-only sets (prefixed by ⧴ rather than :)",
pp.getIndex());
}
if (firstMultiCharacterIndex == -1) {
if (set.hasStrings()) {
firstMultiCharacterIndex = sets.size();
}
} else if (!set.hasStrings()) {
sets.add(set);
}
// Index of the first set of value-only sets (prefixed by ⧴ rather than :).
// Only value-only sets may contain multi-character strings.
// This is `m` in the documentation in UnicodeInvariantTest.txt.
final int firstValueOnlyIndex = sets.size();
while (Lookahead.oneToken(pp, source).accept("⧴")) {
final var set = parseUnicodeSet(source, pp);
if (set.size() != firstSet.size()) {
throw new BackwardParseException(
"Code points should come before strings in property correspondence",
"Sets should have the same size for property correspondence (got "
+ set.size()
+ ", expected "
+ firstSet.size()
+ ")",
pp.getIndex());
}
sets.add(set);
} while (Lookahead.oneToken(pp, source).accept(":"));
if (firstMultiCharacterIndex == -1) {
firstMultiCharacterIndex = sets.size();
}
final List<String> referenceCodePoints = new ArrayList<>();
expectToken("CorrespondTo", pp, source);
Expand All @@ -608,14 +611,14 @@ private static void propertywiseCorrespondenceLine(
"reference should be a single code point or string for property correspondence",
pp.getIndex());
}
if (referenceSet.hasStrings()
!= (referenceCodePoints.size() >= firstMultiCharacterIndex)) {
if (referenceSet.hasStrings() && referenceCodePoints.size() < firstValueOnlyIndex) {
throw new BackwardParseException(
"Strings should correspond to strings for property correspondence",
"Strings are only allowed in value-only sets (prefixed by ⧴ rather than :)",
pp.getIndex());
}
referenceCodePoints.add(referenceSet.iterator().next());
} while (Lookahead.oneToken(pp, source).accept(":"));
} while (Lookahead.oneToken(pp, source)
.accept(referenceCodePoints.size() >= firstValueOnlyIndex ? "⧴" : ":"));
if (referenceCodePoints.size() != sets.size()) {
throw new BackwardParseException(
"Property correspondence requires as many reference code points as sets under test",
Expand All @@ -638,8 +641,14 @@ public ExpectedPropertyDifference(String actualValueAlias, String referenceValue
String property = Lookahead.oneToken(pp, source).consume();
expectToken("(", pp, source);
String actualValueAlias = Lookahead.oneToken(pp, source).consume();
while (Lookahead.oneToken(pp, source).accept("|")) {
actualValueAlias += "|" + Lookahead.oneToken(pp, source).consume();
}
expectToken("vs", pp, source);
String referenceValueAlias = Lookahead.oneToken(pp, source).consume();
while (Lookahead.oneToken(pp, source).accept("|")) {
referenceValueAlias += "|" + Lookahead.oneToken(pp, source).consume();
}
expectToken(")", pp, source);
expectedPropertyDifferences.put(
property,
Expand All @@ -657,7 +666,7 @@ public ExpectedPropertyDifference(String actualValueAlias, String referenceValue
expectedDifference = expectedPropertyDifferences.get(alias);
}
if (expectedDifference != null) {
for (int k = 0; k < firstMultiCharacterIndex; ++k) {
for (int k = 0; k < firstValueOnlyIndex; ++k) {
final int rk = referenceCodePoints.get(k).codePointAt(0);
final String pRk = property.getValue(rk);
if (!Objects.equals(pRk, expectedDifference.referenceValueAlias)) {
Expand Down Expand Up @@ -687,7 +696,7 @@ public ExpectedPropertyDifference(String actualValueAlias, String referenceValue
}
}
} else {
for (int k = 0; k < firstMultiCharacterIndex; ++k) {
for (int k = 0; k < firstValueOnlyIndex; ++k) {
final UnicodeSet set = sets.get(k);
final int rk = referenceCodePoints.get(k).codePointAt(0);
final String pRk = property.getValue(rk);
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
Ignoring Name Age:

# U+18CFF is a blank character for the Khitan Small Script; aside from looking blank,
# it is indistinguishable from other Khitan Small Script characters. See L2/23-065.
# In particular, it is ideographic:
# https://www.unicode.org/review/pri497/feedback.html#ID20240216140104.
Propertywise [\N{KHITAN SMALL SCRIPT CHARACTER-18CFF}
\N{KHITAN SMALL SCRIPT CHARACTER-18B00}] AreAlike

end Ignoring;
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
Ignoring Name Age:

# Garay is a right-to-left cased script:
Propertywise [\N{GARAY SMALL LETTER A} - \N{GARAY SMALL LETTER OLD NA}]
: [\N{GARAY CAPITAL LETTER A} - \N{GARAY CAPITAL LETTER OLD NA}]
CorrespondTo [\N{OLD HUNGARIAN SMALL LETTER A}]
: [\N{OLD HUNGARIAN CAPITAL LETTER A}]
UpTo: Block (Garay vs Old_Hungarian),
Script (Garay vs Old_Hungarian),
Script_Extensions (Garay vs Old_Hungarian)

end Ignoring;
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
Ignoring Name Age:

# HXG (briefly known as HZXG) and SZP are just like all the other CJK strokes.
# In particular, they are scx=Hani:
# https://www.unicode.org/review/pri502/feedback.html#ID20240523095709.
Propertywise [\N{CJK STROKE HXG}\N{CJK STROKE SZP}
\N{CJK STROKE T}] AreAlike

end Ignoring;
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Property comparison tests

Files in this directory are named after [RMG](https://github.com/unicode-org/utc-release-management)
pipeline issues.
Each file contains the tests, developed as part of PAG review, that compare the properties of
proposed characters to those of pre-existing characters.

Property comparison tests were not in place when properties were initially assigned for the 16.0
répertoire; some have been retroactively created. Comments in those files note feedback on errors
that would have been caught by the tests.
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# [Template for property comparison tests of character encoding proposals]
# [RMG ISSUE TITLE]
# https://github.com/unicode-org/utc-release-management/issues/[RMG ISSUE NUMBER]

# Names always differ.
# Age always differs since these tests are comparing additions to pre-existing characters.
Ignoring Name Age:

# Ignore the security and IDNA properties, as these are not yet included for provisionally assigned characters.
Ignoring Confusable_MA Identifier_Status Identifier_Type Idn_Status Idn_Mapping Idn_2008:

# [TEST GOES HERE]

end Ignoring;

end Ignoring;
Original file line number Diff line number Diff line change
Expand Up @@ -143,17 +143,15 @@
end Ignoring;
#
##########################
# Propertywise <S₁> : ... : <Sₙ>
# CorrespondTo <R₁> : ... : <Rₙ>
# Propertywise <S₁> : ... : <Sₘ> [ ⧴ <Sₘ₊₁> ⧴ ... ⧴ <Sₙ> ]
# CorrespondTo <R₁> : ... : <Rₘ> [ ⧴ <Rₘ₊₁> ⧴ ... ⧴ <Rₙ> ]
# [ UpTo: <Property> (<SValue> vs <RValue>) {, <Property> (<SValue> vs <RValue>) }]
#
# The Sₖ must be Unicode sets of equal size; only the value-only sets (those prefixed by ⧴, i.e.,
# Sₖ for k > m) may contain strings.
# They are considered in code point order for the correspondence check (item 2 below).
# The references Rₖ must be Unicode sets each containing a single code point or a single string;
# by a slight abuse of notation we refer to the code point or string as Rₖ in the explanation below.
# For some m in 2 .. n, the following must hold:
# a. Rₖ is a code point and Sₖ must contain only code points for k ≤ m, and
# b. Rₖ is a string and Sₖ must contain only strings for m < k ≤ n, and
# For k ≤ m, Rₖ must be a code point and Sₖ must contain only code points.
# For every non-ignored property P that does not appear in the optional UpTo clause,
# checks that for each k in 1 .. m, for the ith character C in Sₖ, either:
# 1. P(C) = P(Rₖ), or
Expand All @@ -163,6 +161,9 @@
# For every non-ignored property P that appears in the UpTo clause, checks all characters in the
# sets Sₖ have the SValue and all R characters have the RValue.
#
# Note that only the properties of the characters in Sₖ and Rₖ where k ≤ m are inspected; in other
# words, the characters and strings prefixed by ⧴ are only considered as property values.
#
# With n=1 this check is equivalent to the more straightforward AreAlike check; however, it also
# allows for testing of properties such as case mappings, which differ for most characters in a
# script, but behave regularly. See the examples below.
Expand Down Expand Up @@ -1369,8 +1370,8 @@ Ignoring Unicode_1_Name Confusable_MA:
CorrespondTo [ⁱ] : [i] : [I]
end Ignoring;

Propertywise [ゟ] : [{より}]
CorrespondTo [ヿ] : [{コト}]
Propertywise [ゟ] ⧴ [{より}]
CorrespondTo [ヿ] ⧴ [{コト}]
UpTo: Block (Hiragana vs Katakana),
Script (Hiragana vs Katakana),
Script_Extensions (Hiragana vs Katakana),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertThrows;

import java.io.File;
import java.io.IOException;
import java.text.ParseException;
import java.text.ParsePosition;
Expand Down Expand Up @@ -41,9 +42,20 @@ void testUnicodeInvariants() throws IOException {

/**
 * Runs the Unicode invariant tests for addition comparisons and asserts that the accumulated
 * return code is 0.
 *
 * <p>NOTE(review): this span is a rendered diff, so both sides of the change appear below. The
 * single call on "AdditionComparisons.txt" looks like the removed side and the directory loop
 * like the added side (both declare {@code rc}) — confirm against the repository.
 *
 * @throws IOException declared by the test; presumably propagated from
 *     {@code TestUnicodeInvariants.testInvariants} — TODO confirm
 */
@Test
void testAdditionComparisons() throws IOException {
// Single-file form: one invariant file, one "addition-comparisons" output name.
int rc =
TestUnicodeInvariants.testInvariants(
"AdditionComparisons.txt", "addition-comparisons", true);
// Directory form: every .txt file under UCD/AdditionComparisons/ is run as its own test.
final var directory = new File(Settings.SRC_DIR + "UCD/AdditionComparisons/");
int rc = 0;
// NOTE(review): File.listFiles() returns null when the path is not a directory or an I/O error
// occurs, which would surface here as a NullPointerException rather than a test failure. Also,
// if no .txt file is found, rc stays 0 and the assertion below passes without testing anything.
for (var file : directory.listFiles()) {
final String filename = file.getName();
// Skip anything that is not a .txt file.
if (!file.getName().endsWith(".txt")) {
continue;
}
// 4 is the length of the ".txt" suffix checked above.
final String nameWithoutExtension = filename.substring(0, filename.length() - 4);
// Sum the per-file results so that a nonzero result from any file makes the final sum nonzero
// (presumably results are never negative — verify against testInvariants).
rc +=
TestUnicodeInvariants.testInvariants(
"AdditionComparisons/" + filename,
"addition-comparisons-" + nameWithoutExtension,
true);
}
assertEquals(0, rc, "TestUnicodeInvariants.testInvariants(addition-comparisons) failed");
}

Expand Down

0 comments on commit 5b4d24f

Please sign in to comment.