From 84455bb068a99947e334b0e52e2c494b8b462886 Mon Sep 17 00:00:00 2001 From: Jules Bertholet Date: Sun, 2 Jun 2024 12:39:11 -0400 Subject: [PATCH] Support Khmer - Support nonspacing coeng signs - Assign width 2 to KHMER INDEPENDENT VOWEL QAA and 3 to KHMER SIGN BEYYAL (https://unicode.org/charts/nameslist/n_1780.html) --- README.md | 2 +- scripts/unicode.py | 51 ++++++++++++++++++++++++++++++++++++---------- src/lib.rs | 16 ++++++++++----- src/tables.rs | 25 +++++++++++++++++++---- tests/tests.rs | 40 ++++++++++++++++++++++++++++++++++++ 5 files changed, 113 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index d95c4c0..632ae15 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ fn main() { ``` **NOTE:** The computed width values may not match the actual rendered column -width. For example, Brahmic scripts like Devanagari have complex rendering rules +width. For example, many Brahmic scripts like Devanagari have complex rendering rules which this crate does not currently handle (and will never fully handle, because the exact rendering depends on the font): diff --git a/scripts/unicode.py b/scripts/unicode.py index a48227b..18e71d0 100755 --- a/scripts/unicode.py +++ b/scripts/unicode.py @@ -184,6 +184,9 @@ class WidthState(enum.IntEnum): WIDE = 0x1_0002 "Two columns wide." + THREE = 0x1_0003 + "Three columns wide." + # \r\n LINE_FEED = 0b0000_0000_0000_0001 "\\n (CRLF has width 1)" @@ -324,6 +327,11 @@ class WidthState(enum.IntEnum): ZWJ_OLD_TURKIC_LETTER_ORKHON_I = 0b0011_1100_0000_0110 "\\u10C03 (ORKHON EC-ZWJ-ORKHON I ligature)" + # Khmer coeng signs + + KHMER_COENG_ELIGIBLE_LETTER = 0b0011_1100_0000_0111 + "\\u1780..=\\u1782 | \\u1784..=\\u1787 | \\u1789..=\\u178C | \\u178E..=\\u1793 | \\u1795..=\\u1798 | \\u179B..=\\u179D | \\u17A0 | \\u17A2 | \\u17A7 | \\u17AB | \\u17AC | \\u17AF" + def table_width(self) -> CharWidthInTable: "The width of a character as stored in the lookup tables." 
match self: @@ -336,6 +344,10 @@ def table_width(self) -> CharWidthInTable: case _: return CharWidthInTable.SPECIAL + def is_carried(self) -> bool: + "Whether this corresponds to a non-default `WidthInfo`." + return int(self) <= 0xFFFF + def width_alone(self) -> int: "The width of a character with this type when it appears alone." match self: @@ -352,6 +364,8 @@ def width_alone(self) -> int: | WidthState.EMOJI_PRESENTATION ): return 2 + case WidthState.THREE: + return 3 case _: return 1 @@ -591,6 +605,18 @@ def load_width_maps() -> tuple[list[WidthState], list[WidthState]]: ([0x0A], WidthState.LINE_FEED), ([0x05DC], WidthState.HEBREW_LETTER_LAMED), (alef_joining, WidthState.JOINING_GROUP_ALEF), + (range(0x1780, 0x1783), WidthState.KHMER_COENG_ELIGIBLE_LETTER), + (range(0x1784, 0x1788), WidthState.KHMER_COENG_ELIGIBLE_LETTER), + (range(0x1789, 0x178D), WidthState.KHMER_COENG_ELIGIBLE_LETTER), + (range(0x178E, 0x1794), WidthState.KHMER_COENG_ELIGIBLE_LETTER), + (range(0x1795, 0x1799), WidthState.KHMER_COENG_ELIGIBLE_LETTER), + (range(0x179B, 0x179E), WidthState.KHMER_COENG_ELIGIBLE_LETTER), + ( + [0x17A0, 0x17A2, 0x17A7, 0x17AB, 0x17AC, 0x17AF], + WidthState.KHMER_COENG_ELIGIBLE_LETTER, + ), + ([0x17A4], WidthState.WIDE), + ([0x17D8], WidthState.THREE), ([0x1A10], WidthState.BUGINESE_LETTER_YA), (range(0x2D31, 0x2D66), WidthState.TIFINAGH_CONSONANT), ([0x2D6F], WidthState.TIFINAGH_CONSONANT), @@ -1189,7 +1215,11 @@ def lookup_fns( s += f" '\\u{{{lo:X}}}'" if hi != lo: s += f"..='\\u{{{hi:X}}}'" - s += f" => ({width.width_alone()}, WidthInfo::{str(width.name)}),\n" + if width.is_carried(): + width_info = width.name + else: + width_info = "DEFAULT" + s += f" => ({width.width_alone()}, WidthInfo::{width_info}),\n" s += f""" _ => (2, WidthInfo::EMOJI_PRESENTATION), }} @@ -1323,6 +1353,11 @@ def lookup_fns( return (0, WidthInfo::DEFAULT); } + // Khmer coeng signs + (WidthInfo::KHMER_COENG_ELIGIBLE_LETTER, '\\u{17D2}') => { + return (-1, WidthInfo::DEFAULT); + } + // 
Buginese ZWJ ya ligature (WidthInfo::ZWJ_BUGINESE_LETTER_YA, '\\u{1A17}') => { return (0, WidthInfo::BUGINESE_VOWEL_SIGN_I_ZWJ_LETTER_YA) @@ -1519,7 +1554,7 @@ def emit_module( ) for variant in WidthState: - if variant.table_width() == CharWidthInTable.SPECIAL: + if variant.is_carried(): if variant.is_cjk_only(): module.write(' #[cfg(feature = "cjk")]\n') module.write( @@ -1913,7 +1948,7 @@ def emit_module( test_width_variants = [] test_width_variants_cjk = [] for variant in WidthState: - if variant.table_width() == CharWidthInTable.SPECIAL: + if variant.is_carried(): if not variant.is_cjk_only(): test_width_variants.append(variant) if not variant.is_non_cjk_only(): @@ -1991,10 +2026,7 @@ def emit_module( ) for variant in WidthState: - if ( - variant.table_width() == CharWidthInTable.SPECIAL - and not variant.is_cjk_only() - ): + if variant.is_carried() and not variant.is_cjk_only(): module.write(f" WidthInfo::{variant.name},\n") module.write( @@ -2006,10 +2038,7 @@ def emit_module( ) for variant in WidthState: - if ( - variant.table_width() == CharWidthInTable.SPECIAL - and not variant.is_non_cjk_only() - ): + if variant.is_carried() and not variant.is_non_cjk_only(): module.write(f" WidthInfo::{variant.name},\n") module.write( diff --git a/src/lib.rs b/src/lib.rs index eecf49a..d83a6c8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -72,6 +72,9 @@ //! with [`Joining_Group`]`=Alef`, has total width 1. For example: `لا`‎, `لآ`‎, `ڸا`‎, `لٟٞأ` //! - **[Buginese]**: `"\u{1A15}\u{1A17}\u{200D}\u{1A10}"` ( ya, `ᨕᨗ‍ᨐ`) has total width 1. //! - **[Hebrew]**: `"א\u{200D}ל"` (Alef-Lamed, `א‍ל`) has total width 1. +//! - **[Khmer]**: Coeng signs consisting of `'\u{17D2}'` followed by a character in +//! `'\u{1780}'..='\u{1782}' | '\u{1784}'..='\u{1787}' | '\u{1789}'..='\u{178C}' | '\u{178E}'..='\u{1793}' | '\u{1795}'..='\u{1798}' | '\u{179B}'..='\u{179D}' | '\u{17A0}' | '\u{17A2}' | '\u{17A7}' | '\u{17AB}'..='\u{17AC}' | '\u{17AF}'` +//! have width 0. //! 
- **[Lisu]**: Tone letter combinations consisting of a character in the range `'\u{A4F8}'..='\u{A4FB}'` //! followed by a character in the range `'\u{A4FC}'..='\u{A4FD}'` have width 1. For example: `ꓹꓼ` //! - **[Old Turkic]**: `"\u{10C32}\u{200D}\u{10C03}"` (`𐰲‍𐰃`) has total width 1. @@ -85,8 +88,10 @@ //! - Is a [default-ignorable][`Default_Ignorable_Code_Point`] [combining mark][combining marks]. //! 2. In all other cases, the width of the string equals the sum of its character widths: //! 1. [`'\u{2D7F}'` TIFINAGH CONSONANT JOINER] has width 1 (outside of the ligatures described previously). -//! 2. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2. -//! 3. The following have width 0: +//! 2. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) and +//! [`'\u{17A4}'` KHMER INDEPENDENT VOWEL QAA](https://util.unicode.org/UnicodeJsps/character.jsp?a=17A4) have width 2. +//! 3. [`'\u{17D8}'` KHMER SIGN BEYYAL](https://util.unicode.org/UnicodeJsps/character.jsp?a=17D8) has width 3. +//! 4. The following have width 0: //! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D) //! with the [`Default_Ignorable_Code_Point`] property. //! - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D) @@ -109,15 +114,15 @@ //! - [`'\u{0891}'` PIASTRE MARK ABOVE](https://util.unicode.org/UnicodeJsps/character.jsp?a=0891), and //! - [`'\u{08E2}'` DISPUTED END OF AYAH](https://util.unicode.org/UnicodeJsps/character.jsp?a=08E2). //! - [`'\u{A8FA}'` DEVANAGARI CARET](https://util.unicode.org/UnicodeJsps/character.jsp?a=A8FA). -//! 4. [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D) +//! 5. 
[Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BEast_Asian_Width%3DF%7D%5Cp%7BEast_Asian_Width%3DW%7D) //! with an [`East_Asian_Width`] of [`Fullwidth`] or [`Wide`] have width 2. -//! 5. Characters fulfilling all of the following conditions have width 2 in an East Asian context, and width 1 otherwise: +//! 6. Characters fulfilling all of the following conditions have width 2 in an East Asian context, and width 1 otherwise: //! - Has an [`East_Asian_Width`] of [`Ambiguous`], or //! has a canonical decomposition to an [`Ambiguous`] character followed by [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY], or //! is [`'\u{0387}'` GREEK ANO TELEIA](https://util.unicode.org/UnicodeJsps/character.jsp?a=0387), and //! - Does not have a [`General_Category`] of `Modifier_Symbol`, and //! - Does not have a [`Script`] of `Latin`, `Greek`, or `Cyrillic`, or is a Roman numeral in the range `'\u{2160}'..='\u{217F}'`. -//! 6. All other characters have width 1. +//! 7. All other characters have width 1. //! //! [`'\u{0338}'` COMBINING LONG SOLIDUS OVERLAY]: https://util.unicode.org/UnicodeJsps/character.jsp?a=0338 //! [`'\u{2D7F}'` TIFINAGH CONSONANT JOINER]: https://util.unicode.org/UnicodeJsps/character.jsp?a=2D7F @@ -150,6 +155,7 @@ //! [Arabic]: https://www.unicode.org/versions/Unicode15.0.0/ch09.pdf#G7480 //! [Buginese]: https://www.unicode.org/versions/Unicode15.0.0/ch17.pdf#G26743 //! [Hebrew]: https://www.unicode.org/versions/Unicode15.0.0/ch09.pdf#G6528 +//! [Khmer]: https://www.unicode.org/versions/Unicode15.0.0/ch16.pdf#G64642 //! [Lisu]: https://www.unicode.org/versions/Unicode15.0.0/ch18.pdf#G44587 //! [Old Turkic]: https://www.unicode.org/versions/Unicode15.0.0/ch14.pdf#G41975 //! 
[Tifinagh]: http://www.unicode.org/versions/Unicode15.0.0/ch19.pdf#G43184 diff --git a/src/tables.rs b/src/tables.rs index a208c0d..f7a7a86 100644 --- a/src/tables.rs +++ b/src/tables.rs @@ -58,6 +58,7 @@ impl WidthInfo { const LISU_TONE_LETTER_MYA_NA_JEU: Self = Self(0b0011110000000101); const OLD_TURKIC_LETTER_ORKHON_I: Self = Self(0b0011100000000110); const ZWJ_OLD_TURKIC_LETTER_ORKHON_I: Self = Self(0b0011110000000110); + const KHMER_COENG_ELIGIBLE_LETTER: Self = Self(0b0011110000000111); /// Whether this width mode is ligature_transparent /// (has 5th MSB set.) @@ -159,6 +160,8 @@ fn lookup_width(c: char) -> (u8, WidthInfo) { '\u{A}' => (1, WidthInfo::LINE_FEED), '\u{5DC}' => (1, WidthInfo::HEBREW_LETTER_LAMED), '\u{622}'..='\u{882}' => (1, WidthInfo::JOINING_GROUP_ALEF), + '\u{1780}'..='\u{17AF}' => (1, WidthInfo::KHMER_COENG_ELIGIBLE_LETTER), + '\u{17D8}' => (3, WidthInfo::DEFAULT), '\u{1A10}' => (1, WidthInfo::BUGINESE_LETTER_YA), '\u{2D31}'..='\u{2D6F}' => (1, WidthInfo::TIFINAGH_CONSONANT), '\u{A4FC}'..='\u{A4FD}' => (1, WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU), @@ -255,6 +258,11 @@ fn width_in_str(c: char, mut next_info: WidthInfo) -> (i8, WidthInfo) { return (0, WidthInfo::DEFAULT); } + // Khmer coeng signs + (WidthInfo::KHMER_COENG_ELIGIBLE_LETTER, '\u{17D2}') => { + return (-1, WidthInfo::DEFAULT); + } + // Buginese ZWJ ya ligature (WidthInfo::ZWJ_BUGINESE_LETTER_YA, '\u{1A17}') => { return (0, WidthInfo::BUGINESE_VOWEL_SIGN_I_ZWJ_LETTER_YA) @@ -436,6 +444,8 @@ fn lookup_width_cjk(c: char) -> (u8, WidthInfo) { '\u{338}' => (0, WidthInfo::COMBINING_LONG_SOLIDUS_OVERLAY), '\u{5DC}' => (1, WidthInfo::HEBREW_LETTER_LAMED), '\u{622}'..='\u{882}' => (1, WidthInfo::JOINING_GROUP_ALEF), + '\u{1780}'..='\u{17AF}' => (1, WidthInfo::KHMER_COENG_ELIGIBLE_LETTER), + '\u{17D8}' => (3, WidthInfo::DEFAULT), '\u{1A10}' => (1, WidthInfo::BUGINESE_LETTER_YA), '\u{2D31}'..='\u{2D6F}' => (1, WidthInfo::TIFINAGH_CONSONANT), '\u{A4FC}'..='\u{A4FD}' => (1, 
WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU), @@ -539,6 +549,11 @@ fn width_in_str_cjk(c: char, mut next_info: WidthInfo) -> (i8, WidthInfo) { return (0, WidthInfo::DEFAULT); } + // Khmer coeng signs + (WidthInfo::KHMER_COENG_ELIGIBLE_LETTER, '\u{17D2}') => { + return (-1, WidthInfo::DEFAULT); + } + // Buginese ZWJ ya ligature (WidthInfo::ZWJ_BUGINESE_LETTER_YA, '\u{1A17}') => { return (0, WidthInfo::BUGINESE_VOWEL_SIGN_I_ZWJ_LETTER_YA) @@ -1206,8 +1221,8 @@ static WIDTH_LEAVES: Align32<[[u8; 32]; WIDTH_LEAVES_LEN]> = Align32([ 0x55, 0x55, ], [ - 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x10, 0x00, - 0x50, 0x55, 0x45, 0x01, 0x00, 0x00, 0x55, 0x55, 0x51, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, + 0x7F, 0xFF, 0xFD, 0xF7, 0xFF, 0xFD, 0xD7, 0x5F, 0x77, 0xD6, 0xD5, 0xD7, 0x55, 0x10, 0x00, + 0x50, 0x55, 0x45, 0x01, 0x00, 0x00, 0x55, 0x57, 0x51, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, ], [ @@ -2575,7 +2590,7 @@ mod tests { } } - static NORMALIZATION_TEST_WIDTHS: [WidthInfo; 37] = [ + static NORMALIZATION_TEST_WIDTHS: [WidthInfo; 38] = [ WidthInfo::DEFAULT, WidthInfo::LINE_FEED, WidthInfo::EMOJI_MODIFIER, @@ -2613,10 +2628,11 @@ mod tests { WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU, WidthInfo::OLD_TURKIC_LETTER_ORKHON_I, WidthInfo::ZWJ_OLD_TURKIC_LETTER_ORKHON_I, + WidthInfo::KHMER_COENG_ELIGIBLE_LETTER, ]; #[cfg(feature = "cjk")] - static NORMALIZATION_TEST_WIDTHS_CJK: [WidthInfo; 38] = [ + static NORMALIZATION_TEST_WIDTHS_CJK: [WidthInfo; 39] = [ WidthInfo::DEFAULT, WidthInfo::LINE_FEED, WidthInfo::EMOJI_MODIFIER, @@ -2655,6 +2671,7 @@ mod tests { WidthInfo::LISU_TONE_LETTER_MYA_NA_JEU, WidthInfo::OLD_TURKIC_LETTER_ORKHON_I, WidthInfo::ZWJ_OLD_TURKIC_LETTER_ORKHON_I, + WidthInfo::KHMER_COENG_ELIGIBLE_LETTER, ]; #[rustfmt::skip] diff --git a/tests/tests.rs b/tests/tests.rs index e7027a1..4f713e7 100644 --- a/tests/tests.rs +++ b/tests/tests.rs @@ -385,6 +385,46 @@ fn test_old_turkic_ligature() { assert_width!("\u{200D}\u{10C32}", 1, 1); } 
+#[test] +fn test_khmer_coeng() { + assert_width!("ល", 1, 1); + assert_width!("ង", 1, 1); + assert_width!("លង", 2, 2); + assert_width!("ល្ង", 1, 1); + + for c in '\0'..=char::MAX { + if matches!( + c, + '\u{1780}'..='\u{1782}' | '\u{1784}'..='\u{1787}' + | '\u{1789}'..='\u{178C}' | '\u{178E}'..='\u{1793}' + | '\u{1795}'..='\u{1798}' | '\u{179B}'..='\u{179D}' + | '\u{17A0}' | '\u{17A2}' | '\u{17A7}' + | '\u{17AB}'..='\u{17AC}' | '\u{17AF}' + ) { + assert_width!(format!("\u{17D2}{c}"), 0, 0); + assert_width!(format!("\u{17D2}\u{200D}\u{200D}{c}"), 0, 0); + } else { + assert_width!( + format!("\u{17D2}{c}"), + c.width().unwrap_or(1), + c.width_cjk().unwrap_or(1) + ); + } + } +} + +#[test] +fn test_khmer_qaa() { + assert_width!("\u{17A4}", 2, 2); + assert_width!("\u{17A2}\u{17A6}", 2, 2); +} + +#[test] +fn test_khmer_sign_beyyal() { + assert_width!("\u{17D8}", 3, 3); + assert_width!("\u{17D4}\u{179B}\u{17D4}", 3, 3); +} + #[test] fn test_emoji_modifier() { assert_width!("\u{1F46A}", 2, 2);