Skip to content

Commit

Permalink
ICU-22285 omit the gb2312 & big5han collation tailorings by default
Browse files Browse the repository at this point in the history
  • Loading branch information
markusicu committed Mar 14, 2023
1 parent 97510de commit 2d9fa3f
Show file tree
Hide file tree
Showing 5 changed files with 56 additions and 13 deletions.
26 changes: 26 additions & 0 deletions docs/userguide/icu_data/buildtool.md
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,32 @@ To build ICU4J with custom data, you must first build ICU4C with custom data
and then generate the JAR file. For more information on building ICU4J, read the
[ICU4J Readme](../icu4j/).

### Default Configuration

By default (without a configuration file and without option flags),
the ICU data file includes all of the data in the ICU source tree.

Since ICU 73 (2023q2), there is an exception:
By default, the "big5han" and "gb2312han" collation tailorings are omitted.
These tailorings mimic the order of their respective charsets, are relatively large, and are rarely used.
(See [ICU-22285](https://unicode-org.atlassian.net/browse/ICU-22285).)

The default configuration is equivalent to a filter file like this:

```json
{
  "resourceFilters": [
    {
      "categories": [
        "coll_tree"
      ],
      "rules": [
        "-/collations/big5han",
        "-/collations/gb2312han"
      ]
    }
  ]
}
```

### Locale Slicing

The simplest way to slice ICU data is by locale. The ICU Data Build Tool
Expand Down
17 changes: 17 additions & 0 deletions icu4c/source/python/icutools/databuilder/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,23 @@ def __init__(self, args):
if "usePoolBundle" in self.filters_json_data:
self.use_pool_bundle = self.filters_json_data["usePoolBundle"]

# By default, exclude collation data that mimics the order of some large legacy charsets.
# We do this in "subtractive" strategy by inserting a resourceFilter.
# Later rules from an explicit filter file may override this default behavior.
# (In "additive" strategy this is unnecessary.)
if self.strategy == "subtractive":
filters = self.filters_json_data.setdefault("resourceFilters", [])
omit_charset_collations = {
"categories": [
"coll_tree"
],
"rules": [
"-/collations/big5han",
"-/collations/gb2312han"
]
}
filters.insert(0, omit_charset_collations)

def _parse_filter_file(self, f):
# Use the Hjson parser if it is available; otherwise, use vanilla JSON.
try:
Expand Down
4 changes: 2 additions & 2 deletions icu4j/main/shared/data/icudata.jar
Git LFS file not shown
Original file line number Diff line number Diff line change
Expand Up @@ -49,19 +49,19 @@ public void TestFunctionalEquivalent(){
"f", "zh_MO", "zh@collation=stroke", /* alias of zh_Hant_MO */
"t", "zh_Hant_MO", "zh@collation=stroke",
"f", "zh_TW_STROKE", "zh@collation=stroke",
"f", "zh_TW_STROKE@collation=big5han", "zh@collation=big5han",
"f", "zh_TW_STROKE@collation=zhuyin", "zh@collation=zhuyin",
"f", "sv_CN@calendar=japanese", "sv",
"t", "sv@calendar=japanese", "sv",
"f", "zh_TW@collation=big5han", "zh@collation=big5han", /* alias of zh_Hant_TW */
"t", "zh_Hant_TW@collation=big5han", "zh@collation=big5han",
"f", "zh_TW@collation=gb2312han", "zh@collation=gb2312han", /* alias of zh_Hant_TW */
"t", "zh_Hant_TW@collation=gb2312han", "zh@collation=gb2312han",
"f", "zh_CN@collation=big5han", "zh@collation=big5han", /* alias of zh_Hans_CN */
"t", "zh_Hans_CN@collation=big5han", "zh@collation=big5han",
"f", "zh_CN@collation=gb2312han", "zh@collation=gb2312han", /* alias of zh_Hans_CN */
"t", "zh_Hans_CN@collation=gb2312han", "zh@collation=gb2312han",
"t", "zh@collation=big5han", "zh@collation=big5han",
"t", "zh@collation=gb2312han", "zh@collation=gb2312han",
"f", "zh_TW@collation=zhuyin", "zh@collation=zhuyin", /* alias of zh_Hant_TW */
"t", "zh_Hant_TW@collation=zhuyin", "zh@collation=zhuyin",
"f", "zh_TW@collation=unihan", "zh@collation=unihan", /* alias of zh_Hant_TW */
"t", "zh_Hant_TW@collation=unihan", "zh@collation=unihan",
"f", "zh_CN@collation=zhuyin", "zh@collation=zhuyin", /* alias of zh_Hans_CN */
"t", "zh_Hans_CN@collation=zhuyin", "zh@collation=zhuyin",
"f", "zh_CN@collation=unihan", "zh@collation=unihan", /* alias of zh_Hans_CN */
"t", "zh_Hans_CN@collation=unihan", "zh@collation=unihan",
"t", "zh@collation=zhuyin", "zh@collation=zhuyin",
"t", "zh@collation=unihan", "zh@collation=unihan",
"t", "hi@collation=standard", "hi",
"f", "hi_AU@collation=standard;currency=CHF;calendar=buddhist", "hi",
"f", "sv_SE@collation=pinyin", "sv", /* bug 4582 tests */
Expand Down
Binary file not shown.

0 comments on commit 2d9fa3f

Please sign in to comment.