From 647463ff48043194528c0fb54782b216020069d1 Mon Sep 17 00:00:00 2001
From: Andrew Huth <andrew.huth@airbnb.com>
Date: Wed, 18 Mar 2020 18:14:00 -0700
Subject: [PATCH 1/5] Detect unicode in the Supplementary Private Use Area-A

Resolves https://github.com/dequelabs/axe-core/issues/2101.

Characters in this range are not in the Basic Multilingual Plane [1].

1. https://en.wikipedia.org/wiki/Private_Use_Areas
---
 lib/commons/text/unicode.js  | 3 ++-
 test/commons/text/unicode.js | 7 +++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/lib/commons/text/unicode.js b/lib/commons/text/unicode.js
index c854ee6f98..09e5b0acfc 100644
--- a/lib/commons/text/unicode.js
+++ b/lib/commons/text/unicode.js
@@ -93,11 +93,12 @@ function getUnicodeNonBmpRegExp() {
 	 * '\u2600-\u26FF'  Misc Symbols
 	 * '\u2700-\u27BF'  Dingbats
 	 * '\uE000-\uF8FF'  Private Use
+	 * '\u{F0000}-\u{FFFFD}' Supplementary Private Use Area A
 	 *
 	 * Note: plane '\u2000-\u206F' used for General punctuation is excluded as it is handled in -> getPunctuationRegExp
 	 */
 
-	return /[\u1D00-\u1D7F\u1D80-\u1DBF\u1DC0-\u1DFF\u20A0-\u20CF\u20D0-\u20FF\u2100-\u214F\u2150-\u218F\u2190-\u21FF\u2200-\u22FF\u2300-\u23FF\u2400-\u243F\u2440-\u245F\u2460-\u24FF\u2500-\u257F\u2580-\u259F\u25A0-\u25FF\u2600-\u26FF\u2700-\u27BF\uE000-\uF8FF]/g;
+	return /[\u1D00-\u1D7F\u1D80-\u1DBF\u1DC0-\u1DFF\u20A0-\u20CF\u20D0-\u20FF\u2100-\u214F\u2150-\u218F\u2190-\u21FF\u2200-\u22FF\u2300-\u23FF\u2400-\u243F\u2440-\u245F\u2460-\u24FF\u2500-\u257F\u2580-\u259F\u25A0-\u25FF\u2600-\u26FF\u2700-\u27BF\uE000-\uF8FF\u{F0000}-\u{FFFFD}]/gu;
 }
 
 /**
diff --git a/test/commons/text/unicode.js b/test/commons/text/unicode.js
index 10af427a76..24c36e061c 100644
--- a/test/commons/text/unicode.js
+++ b/test/commons/text/unicode.js
@@ -69,6 +69,13 @@ describe('text.hasUnicode', function() {
 			});
 			assert.isTrue(actual);
 		});
+
+		it('returns true for a string with characters in supplementary private use area A', function() {
+			var actual = axe.commons.text.hasUnicode('\u{F0019}', {
+				nonBmp: true
+			});
+			assert.isTrue(actual);
+		});
 	});
 
 	describe('text.hasUnicode, characters of type Emoji', function() {

From 20c5df8691db9df51811cac2548b40f07f44d0a0 Mon Sep 17 00:00:00 2001
From: Andrew Huth <andrew.huth@airbnb.com>
Date: Wed, 18 Mar 2020 22:22:11 -0700
Subject: [PATCH 2/5] Use surrogate pair representation for unicode sequence

The tests are parsed by ESLint as ES5, which does not support Unicode code point escapes (`\u{...}`, introduced in ES2015); only four-digit `\uXXXX` escapes are available. Therefore, "\u{F0019}" is not a valid string literal there. Instead, we have to use its corresponding UTF-16 surrogate pair, which is "\uDB80\uDC19".
---
 test/commons/text/unicode.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/commons/text/unicode.js b/test/commons/text/unicode.js
index 24c36e061c..a64e01fe34 100644
--- a/test/commons/text/unicode.js
+++ b/test/commons/text/unicode.js
@@ -71,7 +71,7 @@ describe('text.hasUnicode', function() {
 		});
 
 		it('returns true for a string with characters in supplementary private use area A', function() {
-			var actual = axe.commons.text.hasUnicode('\u{F0019}', {
+			var actual = axe.commons.text.hasUnicode('\uDB80\uDC19', {
 				nonBmp: true
 			});
 			assert.isTrue(actual);

From 30aef40d634d742bffb0a452d7f1821017bc22b0 Mon Sep 17 00:00:00 2001
From: Andrew Huth <andrew.huth@airbnb.com>
Date: Thu, 19 Mar 2020 09:20:26 -0700
Subject: [PATCH 3/5] Move detection of certain private use unicode characters
 to `getSupplementaryPrivateUseRegExp`

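The Supplementary Private Use Area-A range is now matched by `getSupplementaryPrivateUseRegExp` instead of `getUnicodeNonBmpRegExp`. For this to work, `hasUnicode` had to start using `getSupplementaryPrivateUseRegExp` in its `nonBmp` branch. This seems appropriate, and its absence may have been a bug.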
---
 lib/commons/text/unicode.js  | 11 +++++++----
 test/commons/text/unicode.js |  7 +++++++
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/lib/commons/text/unicode.js b/lib/commons/text/unicode.js
index 09e5b0acfc..2666d52120 100644
--- a/lib/commons/text/unicode.js
+++ b/lib/commons/text/unicode.js
@@ -19,7 +19,10 @@ text.hasUnicode = function hasUnicode(str, options) {
 		return axe.imports.emojiRegexText().test(str);
 	}
 	if (nonBmp) {
-		return getUnicodeNonBmpRegExp().test(str);
+		return (
+			getUnicodeNonBmpRegExp().test(str) ||
+			getSupplementaryPrivateUseRegExp().test(str)
+		);
 	}
 	if (punctuations) {
 		return getPunctuationRegExp().test(str);
@@ -93,12 +96,11 @@ function getUnicodeNonBmpRegExp() {
 	 * '\u2600-\u26FF'  Misc Symbols
 	 * '\u2700-\u27BF'  Dingbats
 	 * '\uE000-\uF8FF'  Private Use
-	 * '\u{F0000}-\u{FFFFD}' Supplementary Private Use Area A
 	 *
 	 * Note: plane '\u2000-\u206F' used for General punctuation is excluded as it is handled in -> getPunctuationRegExp
 	 */
 
-	return /[\u1D00-\u1D7F\u1D80-\u1DBF\u1DC0-\u1DFF\u20A0-\u20CF\u20D0-\u20FF\u2100-\u214F\u2150-\u218F\u2190-\u21FF\u2200-\u22FF\u2300-\u23FF\u2400-\u243F\u2440-\u245F\u2460-\u24FF\u2500-\u257F\u2580-\u259F\u25A0-\u25FF\u2600-\u26FF\u2700-\u27BF\uE000-\uF8FF\u{F0000}-\u{FFFFD}]/gu;
+	return /[\u1D00-\u1D7F\u1D80-\u1DBF\u1DC0-\u1DFF\u20A0-\u20CF\u20D0-\u20FF\u2100-\u214F\u2150-\u218F\u2190-\u21FF\u2200-\u22FF\u2300-\u23FF\u2400-\u243F\u2440-\u245F\u2460-\u24FF\u2500-\u257F\u2580-\u259F\u25A0-\u25FF\u2600-\u26FF\u2700-\u27BF\uE000-\uF8FF]/g;
 }
 
 /**
@@ -131,6 +133,7 @@ function getSupplementaryPrivateUseRegExp() {
 	/**
 	 * Reference: https://www.unicode.org/charts/PDF/UD800.pdf
 	 * https://www.unicode.org/charts/PDF/UDC00.pdf
+	 * https://www.unicode.org/charts/PDF/UF0000.pdf
 	 */
-	return /[\uDB80-\uDBBF][\uDC00-\uDFFD]/g;
+	return /[\uDB80-\uDBBF][\uDC00-\uDFFD]|[\u{F0000}-\u{FFFFD}]/gu;
 }
diff --git a/test/commons/text/unicode.js b/test/commons/text/unicode.js
index a64e01fe34..e98d6b267b 100644
--- a/test/commons/text/unicode.js
+++ b/test/commons/text/unicode.js
@@ -208,6 +208,13 @@ describe('text.removeUnicode', function() {
 		assert.equal(actual, '');
 	});
 
+	it('returns the string with supplementary private use area A characters removed', function() {
+		var actual = axe.commons.text.removeUnicode('\uDB80\uDC19', {
+			nonBmp: true
+		});
+		assert.equal(actual, '');
+	});
+
 	it('returns string removing combination of unicode characters', function() {
 		var actual = axe.commons.text.removeUnicode(
 			'The ☀️ is orange, the ◓ is white.',

From 0e3777a53d33e5468d27a8b340454aef03bc15b9 Mon Sep 17 00:00:00 2001
From: Andrew Huth <andrew.huth@airbnb.com>
Date: Thu, 19 Mar 2020 11:07:44 -0700
Subject: [PATCH 4/5] Replace unicode escape sequences in regex with the
 corresponding surrogate pairs

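This is more compatible: `\u{...}` escapes in a regex require the `u` flag, which IE11 and other ES5 environments do not support. Surrogate pairs match the same code points without needing the flag.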
---
 lib/commons/text/unicode.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/commons/text/unicode.js b/lib/commons/text/unicode.js
index 2666d52120..31fabc7c70 100644
--- a/lib/commons/text/unicode.js
+++ b/lib/commons/text/unicode.js
@@ -135,5 +135,5 @@ function getSupplementaryPrivateUseRegExp() {
 	 * https://www.unicode.org/charts/PDF/UDC00.pdf
 	 * https://www.unicode.org/charts/PDF/UF0000.pdf
 	 */
-	return /[\uDB80-\uDBBF][\uDC00-\uDFFD]|[\u{F0000}-\u{FFFFD}]/gu;
+	return /[\uDB80-\uDBBF][\uDC00-\uDFFD]|(?:[\uDB80-\uDBBE][\uDC00-\uDFFF]|\uDBBF[\uDC00-\uDFFD])/g;
 }

From 6a90379e51510c5554f70dcdc56dcfca59ce213a Mon Sep 17 00:00:00 2001
From: Andrew Huth <andrew.huth@airbnb.com>
Date: Thu, 19 Mar 2020 11:48:09 -0700
Subject: [PATCH 5/5] Document which parts of the
 `getSupplementaryPrivateUseRegExp` regex refer to which unicode range

---
 lib/commons/text/unicode.js | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/lib/commons/text/unicode.js b/lib/commons/text/unicode.js
index 31fabc7c70..25cade6e65 100644
--- a/lib/commons/text/unicode.js
+++ b/lib/commons/text/unicode.js
@@ -130,10 +130,11 @@ function getPunctuationRegExp() {
  * @returns {RegExp}
  */
 function getSupplementaryPrivateUseRegExp() {
-	/**
-	 * Reference: https://www.unicode.org/charts/PDF/UD800.pdf
-	 * https://www.unicode.org/charts/PDF/UDC00.pdf
-	 * https://www.unicode.org/charts/PDF/UF0000.pdf
-	 */
+	// 1. High surrogate area (https://www.unicode.org/charts/PDF/UD800.pdf)
+	// 2. Low surrogate area (https://www.unicode.org/charts/PDF/UDC00.pdf)
+	// 3. Supplementary private use area A (https://www.unicode.org/charts/PDF/UF0000.pdf)
+	//
+	//             1              2                                  3
+	//      ┏━━━━━━┻━━━━━━┓┏━━━━━━┻━━━━━━┓ ┏━━━━━━━━━━━━━━━━━━━━━━━━━┻━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓
 	return /[\uDB80-\uDBBF][\uDC00-\uDFFD]|(?:[\uDB80-\uDBBE][\uDC00-\uDFFF]|\uDBBF[\uDC00-\uDFFD])/g;
 }