Skip to content

Commit

Permalink
Merge pull request #2102 from ahuth/unicode-private-use-area-A
Browse files Browse the repository at this point in the history
fix(unicode) detect supplementary private use area A
  • Loading branch information
straker authored Mar 19, 2020
2 parents 7952a37 + 6a90379 commit f1739c2
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 6 deletions.
17 changes: 11 additions & 6 deletions lib/commons/text/unicode.js
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@ text.hasUnicode = function hasUnicode(str, options) {
return axe.imports.emojiRegexText().test(str);
}
if (nonBmp) {
return getUnicodeNonBmpRegExp().test(str);
return (
getUnicodeNonBmpRegExp().test(str) ||
getSupplementaryPrivateUseRegExp().test(str)
);
}
if (punctuations) {
return getPunctuationRegExp().test(str);
Expand Down Expand Up @@ -127,9 +130,11 @@ function getPunctuationRegExp() {
* @returns {RegExp}
*/
function getSupplementaryPrivateUseRegExp() {
  // Supplementary Private Use Area-A spans U+F0000..U+FFFFD
  // (https://www.unicode.org/charts/PDF/UF0000.pdf). In UTF-16 every one of
  // those code points is encoded as a surrogate pair:
  //   1. High surrogate (https://www.unicode.org/charts/PDF/UD800.pdf)
  //   2. Low surrogate (https://www.unicode.org/charts/PDF/UDC00.pdf)
  //
  // High surrogates \uDB80-\uDBBE combine with every low surrogate
  // (\uDC00-\uDFFF). The last high surrogate, \uDBBF, stops at \uDFFD because
  // U+FFFFE and U+FFFFF are noncharacters, not private use code points.
  //
  // A fresh literal is returned on each call, so the `lastIndex` state that
  // the `g` flag introduces is never shared between callers.
  return /[\uDB80-\uDBBE][\uDC00-\uDFFF]|\uDBBF[\uDC00-\uDFFD]/g;
}
14 changes: 14 additions & 0 deletions test/commons/text/unicode.js
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,13 @@ describe('text.hasUnicode', function() {
});
assert.isTrue(actual);
});

it('returns true for a string with characters in supplementary private use area A', function() {
  // '\uDB80\uDC19' is the UTF-16 surrogate pair for U+F0019.
  var options = { nonBmp: true };
  var hasMatch = axe.commons.text.hasUnicode('\uDB80\uDC19', options);

  assert.isTrue(hasMatch);
});
});

describe('text.hasUnicode, characters of type Emoji', function() {
Expand Down Expand Up @@ -201,6 +208,13 @@ describe('text.removeUnicode', function() {
assert.equal(actual, '');
});

it('returns the string with supplementary private use area A characters removed', function() {
  // The input is a single surrogate pair (U+F0019), so stripping it is
  // expected to leave an empty string.
  var options = { nonBmp: true };
  var stripped = axe.commons.text.removeUnicode('\uDB80\uDC19', options);

  assert.equal(stripped, '');
});

it('returns string removing combination of unicode characters', function() {
var actual = axe.commons.text.removeUnicode(
'The ☀️ is orange, the ◓ is white.',
Expand Down

0 comments on commit f1739c2

Please sign in to comment.