From 647463ff48043194528c0fb54782b216020069d1 Mon Sep 17 00:00:00 2001 From: Andrew Huth Date: Wed, 18 Mar 2020 18:14:00 -0700 Subject: [PATCH 1/5] Detect unicode in the Supplementary Private Use Area-A Resolves https://github.com/dequelabs/axe-core/issues/2101. Characters in this range are not in the Basic Multilingual Plane [1]. 1. https://en.wikipedia.org/wiki/Private_Use_Areas --- lib/commons/text/unicode.js | 3 ++- test/commons/text/unicode.js | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/lib/commons/text/unicode.js b/lib/commons/text/unicode.js index c854ee6f98..09e5b0acfc 100644 --- a/lib/commons/text/unicode.js +++ b/lib/commons/text/unicode.js @@ -93,11 +93,12 @@ function getUnicodeNonBmpRegExp() { * '\u2600-\u26FF' Misc Symbols * '\u2700-\u27BF' Dingbats * '\uE000-\uF8FF' Private Use + * '\u{F0000}-\u{FFFFD}' Supplementary Private Use Area A * * Note: plane '\u2000-\u206F' used for General punctuation is excluded as it is handled in -> getPunctuationRegExp */ - return /[\u1D00-\u1D7F\u1D80-\u1DBF\u1DC0-\u1DFF\u20A0-\u20CF\u20D0-\u20FF\u2100-\u214F\u2150-\u218F\u2190-\u21FF\u2200-\u22FF\u2300-\u23FF\u2400-\u243F\u2440-\u245F\u2460-\u24FF\u2500-\u257F\u2580-\u259F\u25A0-\u25FF\u2600-\u26FF\u2700-\u27BF\uE000-\uF8FF]/g; + return /[\u1D00-\u1D7F\u1D80-\u1DBF\u1DC0-\u1DFF\u20A0-\u20CF\u20D0-\u20FF\u2100-\u214F\u2150-\u218F\u2190-\u21FF\u2200-\u22FF\u2300-\u23FF\u2400-\u243F\u2440-\u245F\u2460-\u24FF\u2500-\u257F\u2580-\u259F\u25A0-\u25FF\u2600-\u26FF\u2700-\u27BF\uE000-\uF8FF\u{F0000}-\u{FFFFD}]/gu; } /** diff --git a/test/commons/text/unicode.js b/test/commons/text/unicode.js index 10af427a76..24c36e061c 100644 --- a/test/commons/text/unicode.js +++ b/test/commons/text/unicode.js @@ -69,6 +69,13 @@ describe('text.hasUnicode', function() { }); assert.isTrue(actual); }); + + it('returns true for a string with characters in supplementary private use area A', function() { + var actual = axe.commons.text.hasUnicode('\u{F0019}', { + 
nonBmp: true + }); + assert.isTrue(actual); + }); }); describe('text.hasUnicode, characters of type Emoji', function() { From 20c5df8691db9df51811cac2548b40f07f44d0a0 Mon Sep 17 00:00:00 2001 From: Andrew Huth Date: Wed, 18 Mar 2020 22:22:11 -0700 Subject: [PATCH 2/5] Use surrogate pair representation for unicode sequence The tests are parsed with eslint as ES5, which does not have unicode code point escape sequences (the `\u{...}` form was added in ES2015; ES5 only supports four-digit `\uXXXX` escapes). Therefore, "\u{F0019}" is not a valid string. Instead, we will have to use its corresponding surrogate pair, which is "\uDB80\uDC19". --- test/commons/text/unicode.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/commons/text/unicode.js b/test/commons/text/unicode.js index 24c36e061c..a64e01fe34 100644 --- a/test/commons/text/unicode.js +++ b/test/commons/text/unicode.js @@ -71,7 +71,7 @@ describe('text.hasUnicode', function() { }); it('returns true for a string with characters in supplementary private use area A', function() { - var actual = axe.commons.text.hasUnicode('\u{F0019}', { + var actual = axe.commons.text.hasUnicode('\uDB80\uDC19', { nonBmp: true }); assert.isTrue(actual); From 30aef40d634d742bffb0a452d7f1821017bc22b0 Mon Sep 17 00:00:00 2001 From: Andrew Huth Date: Thu, 19 Mar 2020 09:20:26 -0700 Subject: [PATCH 3/5] Move detection of certain private use unicode characters to `getSupplementaryPrivateUseRegExp` Instead of `getUnicodeNonBmpRegExp`. For this to work, I had to use `getSupplementaryPrivateUseRegExp` in `hasUnicode`. This seems appropriate, and its absence may have been a bug. 
--- lib/commons/text/unicode.js | 11 +++++++---- test/commons/text/unicode.js | 7 +++++++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/lib/commons/text/unicode.js b/lib/commons/text/unicode.js index 09e5b0acfc..2666d52120 100644 --- a/lib/commons/text/unicode.js +++ b/lib/commons/text/unicode.js @@ -19,7 +19,10 @@ text.hasUnicode = function hasUnicode(str, options) { return axe.imports.emojiRegexText().test(str); } if (nonBmp) { - return getUnicodeNonBmpRegExp().test(str); + return ( + getUnicodeNonBmpRegExp().test(str) || + getSupplementaryPrivateUseRegExp().test(str) + ); } if (punctuations) { return getPunctuationRegExp().test(str); @@ -93,12 +96,11 @@ function getUnicodeNonBmpRegExp() { * '\u2600-\u26FF' Misc Symbols * '\u2700-\u27BF' Dingbats * '\uE000-\uF8FF' Private Use - * '\u{F0000}-\u{FFFFD}' Supplementary Private Use Area A * * Note: plane '\u2000-\u206F' used for General punctuation is excluded as it is handled in -> getPunctuationRegExp */ - return /[\u1D00-\u1D7F\u1D80-\u1DBF\u1DC0-\u1DFF\u20A0-\u20CF\u20D0-\u20FF\u2100-\u214F\u2150-\u218F\u2190-\u21FF\u2200-\u22FF\u2300-\u23FF\u2400-\u243F\u2440-\u245F\u2460-\u24FF\u2500-\u257F\u2580-\u259F\u25A0-\u25FF\u2600-\u26FF\u2700-\u27BF\uE000-\uF8FF\u{F0000}-\u{FFFFD}]/gu; + return /[\u1D00-\u1D7F\u1D80-\u1DBF\u1DC0-\u1DFF\u20A0-\u20CF\u20D0-\u20FF\u2100-\u214F\u2150-\u218F\u2190-\u21FF\u2200-\u22FF\u2300-\u23FF\u2400-\u243F\u2440-\u245F\u2460-\u24FF\u2500-\u257F\u2580-\u259F\u25A0-\u25FF\u2600-\u26FF\u2700-\u27BF\uE000-\uF8FF]/g; } /** @@ -131,6 +133,7 @@ function getSupplementaryPrivateUseRegExp() { /** * Reference: https://www.unicode.org/charts/PDF/UD800.pdf * https://www.unicode.org/charts/PDF/UDC00.pdf + * https://www.unicode.org/charts/PDF/UF0000.pdf */ - return /[\uDB80-\uDBBF][\uDC00-\uDFFD]/g; + return /[\uDB80-\uDBBF][\uDC00-\uDFFD]|[\u{F0000}-\u{FFFFD}]/gu; } diff --git a/test/commons/text/unicode.js b/test/commons/text/unicode.js index a64e01fe34..e98d6b267b 100644 --- 
a/test/commons/text/unicode.js +++ b/test/commons/text/unicode.js @@ -208,6 +208,13 @@ describe('text.removeUnicode', function() { assert.equal(actual, ''); }); + it('returns the string with supplementary private use area A characters removed', function() { + var actual = axe.commons.text.removeUnicode('\uDB80\uDC19', { + nonBmp: true + }); + assert.equal(actual, ''); + }); + it('returns string removing combination of unicode characters', function() { var actual = axe.commons.text.removeUnicode( 'The ☀️ is orange, the ◓ is white.', From 0e3777a53d33e5468d27a8b340454aef03bc15b9 Mon Sep 17 00:00:00 2001 From: Andrew Huth Date: Thu, 19 Mar 2020 11:07:44 -0700 Subject: [PATCH 4/5] Replace unicode escape sequences in regex with the corresponding surrogate pairs This will be more compatible. IE11 and ES5 environments do not support the `u` flag on regexes. --- lib/commons/text/unicode.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/commons/text/unicode.js b/lib/commons/text/unicode.js index 2666d52120..31fabc7c70 100644 --- a/lib/commons/text/unicode.js +++ b/lib/commons/text/unicode.js @@ -135,5 +135,5 @@ function getSupplementaryPrivateUseRegExp() { * https://www.unicode.org/charts/PDF/UDC00.pdf * https://www.unicode.org/charts/PDF/UF0000.pdf */ - return /[\uDB80-\uDBBF][\uDC00-\uDFFD]|[\u{F0000}-\u{FFFFD}]/gu; + return /[\uDB80-\uDBBF][\uDC00-\uDFFD]|(?:[\uDB80-\uDBBE][\uDC00-\uDFFF]|\uDBBF[\uDC00-\uDFFD])/g; } From 6a90379e51510c5554f70dcdc56dcfca59ce213a Mon Sep 17 00:00:00 2001 From: Andrew Huth Date: Thu, 19 Mar 2020 11:48:09 -0700 Subject: [PATCH 5/5] Document which parts of the `getSupplementaryPrivateUseRegExp` regex refer to which unicode range --- lib/commons/text/unicode.js | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/lib/commons/text/unicode.js b/lib/commons/text/unicode.js index 31fabc7c70..25cade6e65 100644 --- a/lib/commons/text/unicode.js +++ b/lib/commons/text/unicode.js @@ -130,10 +130,11 @@ 
function getPunctuationRegExp() { * @returns {RegExp} */ function getSupplementaryPrivateUseRegExp() { - /** - * Reference: https://www.unicode.org/charts/PDF/UD800.pdf - * https://www.unicode.org/charts/PDF/UDC00.pdf - * https://www.unicode.org/charts/PDF/UF0000.pdf - */ + // 1. High surrogate area (https://www.unicode.org/charts/PDF/UD800.pdf) + // 2. Low surrogate area (https://www.unicode.org/charts/PDF/UDC00.pdf) + // 3. Supplementary private use area A (https://www.unicode.org/charts/PDF/UF0000.pdf) + // + // 1 2 3 + // ┏━━━━━━┻━━━━━━┓┏━━━━━━┻━━━━━━┓ ┏━━━━━━━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ return /[\uDB80-\uDBBF][\uDC00-\uDFFD]|(?:[\uDB80-\uDBBE][\uDC00-\uDFFF]|\uDBBF[\uDC00-\uDFFD])/g; }