Test width = sum(grapheme cluster widths)

unicode-rs · May 10, 2024 · a7a1056 · a7a1056
1 parent 3063422
commit a7a1056
Show file tree

Hide file tree

Showing 4 changed files with 1,298 additions and 12 deletions.
diff --git a/scripts/unicode.py b/scripts/unicode.py
@@ -754,8 +754,9 @@ def main(module_path: str):
         {EffectiveWidth.NARROW, EffectiveWidth.AMBIGUOUS},
     )
 
-    # Download normalization test file for use by tests
+    # Download files for use by tests
     fetch_open("NormalizationTest.txt", "../tests/")
+    fetch_open("auxiliary/GraphemeBreakTest.txt", "../tests/")
 
     print("------------------------")
     total_size = 0

diff --git a/src/lib.rs b/src/lib.rs
@@ -30,19 +30,19 @@
 //! # Rules for determining width
 //!
 //! This crate currently uses the following rules to determine the width of a
-//! character or string, in order of decreasing precedence. These may be tweaked in the future.
+//! character or string, in order of decreasing precedence. These may be tweaked in the future;
+//! however, see [guarantees](#guarantees) below.
 //!
 //! 1. [Emoji presentation sequences] have width 2.
-//!    (The width of a string may therefore differ from the sum of the widths of its characters.)
-//! 2. Outside of an East Asian context, [text presentation sequences] fulfilling all the following requirements
-//!    have width 1:
+//! 2. Outside of an East Asian context, [text presentation sequences] have width 1
+//!    if their base character:
 //!    - Has the [`Emoji_Presentation`] property, and
-//!    - Not in the [Enclosed Ideographic Supplement] block.
+//!    - Is not in the [Enclosed Ideographic Supplement] block.
 //! 3. The sequence `"\r\n"` has width 1.
 //! 4. [`'\u{115F}'` HANGUL CHOSEONG FILLER](https://util.unicode.org/UnicodeJsps/character.jsp?a=115F) has width 2.
 //! 5. The following have width 0:
 //!    - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BDefault_Ignorable_Code_Point%7D)
-//!       with the [`Default_Ignorable_Code_Point`](https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095) property.
+//!       with the [`Default_Ignorable_Code_Point`] property.
 //!    - [Characters](https://util.unicode.org/UnicodeJsps/list-unicodeset.jsp?a=%5Cp%7BGrapheme_Extend%7D)
 //!       with the [`Grapheme_Extend`] property.
 //!    - The following 8 characters, all of which have NFD decompositions consisting of two [`Grapheme_Extend`] characters:
@@ -62,6 +62,7 @@
 //!    with an [`East_Asian_Width`] of [`Ambiguous`] have width 2 in an East Asian context, and width 1 otherwise.
 //! 8. All other characters have width 1.
 //!
+//! [`Default_Ignorable_Code_Point`]: https://www.unicode.org/versions/Unicode15.0.0/ch05.pdf#G40095
 //! [`East_Asian_Width`]: https://www.unicode.org/reports/tr11/#ED1
 //! [`Emoji_Presentation`]: https://unicode.org/reports/tr51/#def_emoji_presentation
 //! [`Grapheme_Extend`]: https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G52443
@@ -71,15 +72,21 @@
 //! [`Wide`]: https://www.unicode.org/reports/tr11/#ED4
 //! [`Ambiguous`]: https://www.unicode.org/reports/tr11/#ED6
 //!
-//! [Emoji presentation sequences]: (https://unicode.org/reports/tr51/#def_emoji_presentation_sequence)
-//! [text presentation sequences]: (https://unicode.org/reports/tr51/#def_text_presentation_sequence)
+//! [Emoji presentation sequences]: https://unicode.org/reports/tr51/#def_emoji_presentation_sequence
+//! [text presentation sequences]: https://unicode.org/reports/tr51/#def_text_presentation_sequence
 //!
 //! [Enclosed Ideographic Supplement]: https://unicode.org/charts/PDF/U1F200.pdf
 //!
-//! ## Canonical equivalence
+//! ## Guarantees
 //!
-//! The non-CJK width methods guarantee that canonically equivalent strings are assigned the same width.
-//! However, this guarantee does not currently hold for the CJK width variants.
+//! - Any two canonically equivalent strings have the same non-CJK width.
+//!   This will not change in any future semver-compatible version.
+//!   (This guarantee does not currently hold for the CJK width variants.)
+//! - The width of any string equals the sum of the widths of its [extended grapheme clusters].
+//!   This is unlikely to change in any future semver-compatible version.
+//!   (This guarantee holds for both CJK and non-CJK width.)
+//!
+//! [extended grapheme clusters]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
 
 #![forbid(unsafe_code)]
 #![deny(missing_docs)]
@@ -95,6 +102,14 @@ pub use tables::UNICODE_VERSION;
 mod tables;
 
 /// Methods for determining displayed width of Unicode characters.
+///
+/// **NB:** the width of a string may differ from the sum of the widths of its characters;
+/// see the [crate-level documentation](crate#rules-for-determining-width) for more.
+/// Instead of working with individual characters, consider using [extended grapheme clusters],
+/// perhaps with the [`unicode-segmentation`] crate.
+///
+/// [extended grapheme clusters]: https://www.unicode.org/reports/tr29/#Grapheme_Cluster_Boundaries
+/// [`unicode-segmentation`]: https://docs.rs/unicode-segmentation/latest/unicode_segmentation/trait.UnicodeSegmentation.html#tymethod.graphemes
 pub trait UnicodeWidthChar {
     /// Returns the character's displayed width in columns, or `None` if the
     /// character is a control character.