From a5f5afb4a23d28677b9fcbd7d327795ec9c12681 Mon Sep 17 00:00:00 2001 From: Dennis Snell Date: Wed, 31 Oct 2018 18:15:59 -0400 Subject: [PATCH] reset branch on master --- lib/parser.php | 59 ++++--- .../parser.php | 43 ++++-- .../src/index.js | 81 ++++++++-- .../test/__snapshots__/index.js.snap | 2 + .../grammar.pegjs | 146 ++++++++++++++---- .../block-serialization-spec-parser/parser.js | 146 ++++++++++++++---- .../shared-tests.js | 64 ++++++++ .../test/__snapshots__/index.js.snap | 2 + 8 files changed, 436 insertions(+), 107 deletions(-) diff --git a/lib/parser.php b/lib/parser.php index 330249196861a0..7a5dc6e1ec9fd4 100644 --- a/lib/parser.php +++ b/lib/parser.php @@ -259,20 +259,22 @@ private function peg_f1($pre, $bs, $post) { return peg_join_blocks( $pre, $bs, $ private function peg_f2($blockName, $a) { return $a; } private function peg_f3($blockName, $attrs) { return array( - 'blockName' => $blockName, - 'attrs' => isset( $attrs ) ? $attrs : array(), - 'innerBlocks' => array(), - 'innerHTML' => '', + 'blockName' => $blockName, + 'attrs' => isset( $attrs ) ? $attrs : array(), + 'innerBlocks' => array(), + 'innerHTML' => '', + 'blockMarkers' => array(), ); } private function peg_f4($s, $children, $e) { - list( $innerHTML, $innerBlocks ) = peg_array_partition( $children, 'is_string' ); + list( $innerHTML, $innerBlocks, $blockMarkers ) = peg_split_inner_content( $children ); return array( 'blockName' => $s['blockName'], 'attrs' => $s['attrs'], 'innerBlocks' => $innerBlocks, 'innerHTML' => implode( '', $innerHTML ), + 'blockMarkers' => $blockMarkers, ); } private function peg_f5($blockName, $attrs) { @@ -1441,18 +1443,31 @@ public function parse($input) { // are the same as `json_decode` // array arguments are backwards because of PHP - if ( ! function_exists( 'peg_array_partition' ) ) { - function peg_array_partition( $array, $predicate ) { - $truthy = array(); - $falsey = array(); + if ( ! function_exists( 'peg_split_inner_content' ) ) { + function peg_split_inner_content( $array ) { + $strings = array(); + $blocks = array(); + $markers = array(); + $offset = 0; + $string = ''; foreach ( $array as $item ) { - call_user_func( $predicate, $item ) - ? $truthy[] = $item - : $falsey[] = $item; + if ( is_string( $item ) ) { + $string .= $item; + } else { + $offset += strlen( $string ); + $strings[] = $string; + $markers[] = $offset; + $blocks[] = $item; + $string = ''; + } + } + + if ( $string !== '' ) { + $strings[] = $string; } - return array( $truthy, $falsey ); + return array( $strings, $blocks, $markers ); } } @@ -1462,10 +1477,10 @@ function peg_join_blocks( $pre, $tokens, $post ) { if ( ! empty( $pre ) ) { $blocks[] = array( - 'blockName' => null, - 'attrs' => array(), + 'blockName' => null, + 'attrs' => array(), 'innerBlocks' => array(), - 'innerHTML' => $pre + 'innerHTML' => $pre ); } @@ -1476,20 +1491,20 @@ function peg_join_blocks( $pre, $tokens, $post ) { if ( ! empty( $html ) ) { $blocks[] = array( - 'blockName' => null, - 'attrs' => array(), + 'blockName' => null, + 'attrs' => array(), 'innerBlocks' => array(), - 'innerHTML' => $html + 'innerHTML' => $html ); } } if ( ! empty( $post ) ) { $blocks[] = array( - 'blockName' => null, - 'attrs' => array(), + 'blockName' => null, + 'attrs' => array(), 'innerBlocks' => array(), - 'innerHTML' => $post + 'innerHTML' => $post ); } diff --git a/packages/block-serialization-default-parser/parser.php b/packages/block-serialization-default-parser/parser.php index 78b6921787cc48..d18d85711f0069 100644 --- a/packages/block-serialization-default-parser/parser.php +++ b/packages/block-serialization-default-parser/parser.php @@ -48,11 +48,20 @@ class WP_Block_Parser_Block { */ public $innerHTML; - function __construct( $name, $attrs, $innerBlocks, $innerHTML ) { - $this->blockName = $name; - $this->attrs = $attrs; - $this->innerBlocks = $innerBlocks; - $this->innerHTML = $innerHTML; + /** + * Bytes into `innerHTML` where inner blocks were found, assumed UTF8 encoding + * + * @since 5.0.0 + * @var int[] + */ + public $blockMarkers; + + function __construct( $name, $attrs, $innerBlocks, $innerHTML, $blockMarkers ) { + $this->blockName = $name; + $this->attrs = $attrs; + $this->innerBlocks = $innerBlocks; + $this->innerHTML = $innerHTML; + $this->blockMarkers = $blockMarkers; } } @@ -252,14 +261,14 @@ function proceed() { ) ); } - $this->output[] = (array) new WP_Block_Parser_Block( $block_name, $attrs, array(), '' ); + $this->output[] = (array) new WP_Block_Parser_Block( $block_name, $attrs, array(), '', array() ); $this->offset = $start_offset + $token_length; return true; } // otherwise we found an inner block $this->add_inner_block( - new WP_Block_Parser_Block( $block_name, $attrs, array(), '' ), + new WP_Block_Parser_Block( $block_name, $attrs, array(), '', array() ), $start_offset, $token_length ); @@ -269,7 +278,7 @@ function proceed() { case 'block-opener': // track all newly-opened blocks on the stack array_push( $this->stack, new WP_Block_Parser_Frame( - new WP_Block_Parser_Block( $block_name, $attrs, array(), '' ), + new WP_Block_Parser_Block( $block_name, $attrs, array(), '', array() ), $start_offset, $token_length, $start_offset + $token_length, @@ -403,10 +412,15 @@ function next_token() { * @since 3.9.0 * * @param string $innerHTML HTML content of block - * @return WP_Block_Parser_Block freeform block object + * @return array freeform block object */ static function freeform( $innerHTML ) { - return new WP_Block_Parser_Block( null, array(), array(), $innerHTML ); + return array( + 'blockName' => null, + 'attrs' => array(), + 'innerBlocks' => array(), + 'innerHTML' => $innerHTML, + ); } /** @@ -440,8 +454,15 @@ function add_freeform( $length = null ) { */ function add_inner_block( WP_Block_Parser_Block $block, $token_start, $token_length, $last_offset = null ) { $parent = $this->stack[ count( $this->stack ) - 1 ]; + + $next_html = substr( $this->document, $parent->prev_offset, $token_start - $parent->prev_offset ); + $prev_length = ! empty( $parent->block->blockMarkers ) + ? $parent->block->blockMarkers[ count( $parent->block->blockMarkers ) - 1 ] + : 0; + $parent->block->innerBlocks[] = $block; - $parent->block->innerHTML .= substr( $this->document, $parent->prev_offset, $token_start - $parent->prev_offset ); + $parent->block->blockMarkers[] = $prev_length + strlen( $next_html ); + $parent->block->innerHTML .= $next_html; $parent->prev_offset = $last_offset ? $last_offset : $token_start + $token_length; } diff --git a/packages/block-serialization-default-parser/src/index.js b/packages/block-serialization-default-parser/src/index.js index 77306b0e347c7b..6f14a2fe3c9bfa 100644 --- a/packages/block-serialization-default-parser/src/index.js +++ b/packages/block-serialization-default-parser/src/index.js @@ -4,17 +4,23 @@ let output; let stack; const tokenizer = /)[^])+?}\s+)?(\/)?-->/g; -function Block( blockName, attrs, innerBlocks, innerHTML ) { +function Block( blockName, attrs, innerBlocks, innerHTML, blockMarkers ) { return { blockName, attrs, innerBlocks, innerHTML, + blockMarkers, }; } function Freeform( innerHTML ) { - return Block( null, {}, [], innerHTML ); + return { + blockName: null, + attrs: {}, + innerBlocks: [], + innerHTML, + }; } function Frame( block, tokenStart, tokenLength, prevOffset, leadingHtmlStart ) { @@ -84,14 +90,14 @@ function proceed() { if ( null !== leadingHtmlStart ) { output.push( Freeform( document.substr( leadingHtmlStart, startOffset - leadingHtmlStart ) ) ); } - output.push( Block( blockName, attrs, [], '' ) ); + output.push( Block( blockName, attrs, [], '', [] ) ); offset = startOffset + tokenLength; return true; } // otherwise we found an inner block addInnerBlock( - Block( blockName, attrs, [], '' ), + Block( blockName, attrs, [], '', [] ), startOffset, tokenLength, ); @@ -102,7 +108,7 @@ function proceed() { // track all newly-opened blocks on the stack stack.push( Frame( - Block( blockName, attrs, [], '' ), + Block( blockName, attrs, [], '', [] ), startOffset, tokenLength, startOffset + tokenLength, @@ -227,13 +233,68 @@ function addFreeform( rawLength ) { output.push( Freeform( document.substr( offset, length ) ) ); } +/** + * Returns bytes required to represent given string in UTF8 + * + * Assumes input is encoded in UCS2 or UTF16 according to the ECMAScript spec + * @see: https://www.ecma-international.org/ecma-262/9.0/index.html#sec-ecmascript-language-types-string-type + * + * Transparently counts bytes for invalid encodings: + * e.g. unpaired surrogate pair characters count as three bytes + * + * @cite: https://stackoverflow.com/a/34920444 + * + * @param {string} s input string + * @return {number} how many bytes are in the UTF8 representation of the given string + */ +function utf8bytes( s ) { + let n = 0; + + for ( let i = 0, l = s.length; i < l; i++ ) { + const hi = s.charCodeAt( i ); + + if ( hi < 0x0080 ) { // [0x0000, 0x007F] + n += 1; + } else if ( hi < 0x0800 ) { // [0x0080, 0x07FF] + n += 2; + } else if ( hi < 0xD800 ) { // [0x0800, 0xD7FF] + n += 3; + } else if ( hi < 0xDC00 ) { // [0xD800, 0xDBFF] + const lo = s.charCodeAt( ++i ); + + if ( i < l && lo >= 0xDC00 && lo <= 0xDFFF ) { //followed by [0xDC00, 0xDFFF] + n += 4; + } else { + // this is an invalid string with an unpaired surrogate. + // transparently pass it through for byte counts + // and back up to restart processing at the next character. + n += 3; + i -= 1; + } + } else if ( hi < 0xE000 ) { //[0xDC00, 0xDFFF] + // these are invalid encodings in the Unicode standard + // because they are reserved for encoding surrogate pairs. + // transparently pass them through here for byte counts. + n += 3; + } else { // [0xE000, 0xFFFF] + n += 3; + } + } + + return n; +} + function addInnerBlock( block, tokenStart, tokenLength, lastOffset ) { const parent = stack[ stack.length - 1 ]; - parent.block.innerBlocks.push( block ); - parent.block.innerHTML += document.substr( - parent.prevOffset, - tokenStart - parent.prevOffset, - ); + const parentBlock = parent.block; + const blockMarkers = parentBlock.blockMarkers; + + const nextHTML = document.substr( parent.prevOffset, tokenStart - parent.prevOffset ); + const prevLength = blockMarkers.length ? blockMarkers[ blockMarkers.length - 1 ] : 0; + + parentBlock.innerBlocks.push( block ); + blockMarkers.push( prevLength + utf8bytes( nextHTML ) ); + parentBlock.innerHTML += nextHTML; parent.prevOffset = lastOffset ? lastOffset : tokenStart + tokenLength; } diff --git a/packages/block-serialization-default-parser/test/__snapshots__/index.js.snap b/packages/block-serialization-default-parser/test/__snapshots__/index.js.snap index 5b5abb61a3d2e5..953b4126c0eb16 100644 --- a/packages/block-serialization-default-parser/test/__snapshots__/index.js.snap +++ b/packages/block-serialization-default-parser/test/__snapshots__/index.js.snap @@ -4,6 +4,7 @@ exports[`block-serialization-default-parser-js basic parsing parse() works prope Array [ Object { "attrs": Object {}, + "blockMarkers": Array [], "blockName": "core/more", "innerBlocks": Array [], "innerHTML": "", @@ -15,6 +16,7 @@ exports[`block-serialization-default-parser-php basic parsing parse() works prop Array [ Object { "attrs": Object {}, + "blockMarkers": Array [], "blockName": "core/more", "innerBlocks": Array [], "innerHTML": "", diff --git a/packages/block-serialization-spec-parser/grammar.pegjs b/packages/block-serialization-spec-parser/grammar.pegjs index 00d1fec2114136..0fb15b82ff3cf6 100644 --- a/packages/block-serialization-spec-parser/grammar.pegjs +++ b/packages/block-serialization-spec-parser/grammar.pegjs @@ -51,18 +51,31 @@ // are the same as `json_decode` // array arguments are backwards because of PHP -if ( ! function_exists( 'peg_array_partition' ) ) { - function peg_array_partition( $array, $predicate ) { - $truthy = array(); - $falsey = array(); +if ( ! function_exists( 'peg_split_inner_content' ) ) { + function peg_split_inner_content( $array ) { + $strings = array(); + $blocks = array(); + $markers = array(); + $offset = 0; + $string = ''; foreach ( $array as $item ) { - call_user_func( $predicate, $item ) - ? $truthy[] = $item - : $falsey[] = $item; + if ( is_string( $item ) ) { + $string .= $item; + } else { + $offset += strlen( $string ); + $strings[] = $string; + $markers[] = $offset; + $blocks[] = $item; + $string = ''; + } + } + + if ( $string !== '' ) { + $strings[] = $string; } - return array( $truthy, $falsey ); + return array( $strings, $blocks, $markers ); } } @@ -72,10 +85,10 @@ if ( ! function_exists( 'peg_join_blocks' ) ) { if ( ! empty( $pre ) ) { $blocks[] = array( - 'blockName' => null, - 'attrs' => array(), + 'blockName' => null, + 'attrs' => array(), 'innerBlocks' => array(), - 'innerHTML' => $pre + 'innerHTML' => $pre ); } @@ -86,20 +99,20 @@ if ( ! function_exists( 'peg_join_blocks' ) ) { if ( ! empty( $html ) ) { $blocks[] = array( - 'blockName' => null, - 'attrs' => array(), + 'blockName' => null, + 'attrs' => array(), 'innerBlocks' => array(), - 'innerHTML' => $html + 'innerHTML' => $html ); } } if ( ! empty( $post ) ) { $blocks[] = array( - 'blockName' => null, - 'attrs' => array(), + 'blockName' => null, + 'attrs' => array(), 'innerBlocks' => array(), - 'innerHTML' => $post + 'innerHTML' => $post ); } @@ -151,22 +164,86 @@ function maybeJSON( s ) { } } -function partition( predicate, list ) { +/** + * Returns bytes required to represent given string in UTF8 + * + * Assumes input is encoded in UCS2 or UTF16 according to the ECMAScript spec + * @see: https://www.ecma-international.org/ecma-262/9.0/index.html#sec-ecmascript-language-types-string-type + * + * Transparently counts bytes for invalid encodings: + * e.g. unpaired surrogate pair characters count as three bytes + * + * @cite: https://stackoverflow.com/a/34920444 + * + * @param {string} s input string + * @return {number} how many bytes are in the UTF8 representation of the given string + */ +function utf8bytes( s ) { + var i, l, n = 0; + + for ( i = 0, l = s.length; i < l; i++ ) { + var lo, hi = s.charCodeAt( i ); + + if ( hi < 0x0080) { // [0x0000, 0x007F] + n += 1; + } else if ( hi < 0x0800 ) { // [0x0080, 0x07FF] + n += 2; + } else if ( hi < 0xD800 ) { // [0x0800, 0xD7FF] + n += 3; + } else if ( hi < 0xDC00 ) { // [0xD800, 0xDBFF] + lo = s.charCodeAt( ++i ); + + if ( i < l && lo >= 0xDC00 && lo <= 0xDFFF ) { //followed by [0xDC00, 0xDFFF] + n += 4; + } else { + // this is an invalid string with an unpaired surrogate. + // transparently pass it through for byte counts + // and back up to restart processing at the next character. + n += 3; + i -= 1; + } + } else if ( hi < 0xE000 ) { //[0xDC00, 0xDFFF] + // these are invalid encodings in the Unicode standard + // because they are reserved for encoding surrogate pairs. + // transparently pass them through here for byte counts. + n += 3; + } else { // [0xE000, 0xFFFF] + n += 3; + } + } + + return n; +} + +function splitInnerContent( list ) { var i, l, item; - var truthy = []; - var falsey = []; + var strings = []; + var blocks = []; + var markers = []; + var offset = 0; + var string = ''; // nod to performance over a simpler reduce // and clone model we could have taken here for ( i = 0, l = list.length; i < l; i++ ) { item = list[ i ]; - predicate( item ) - ? truthy.push( item ) - : falsey.push( item ) + if ( 'string' === typeof item ) { + string += item; + } else { + offset += utf8bytes( string ); + strings.push( string ); + markers.push( offset ); + blocks.push( item ); + string = ''; + } }; - return [ truthy, falsey ]; + if ( string !== '' ) { + strings.push( string ); + } + + return [ strings, blocks, markers ]; } } @@ -197,10 +274,11 @@ Block_Void { /** $blockName, - 'attrs' => isset( $attrs ) ? $attrs : array(), - 'innerBlocks' => array(), - 'innerHTML' => '', + 'blockName' => $blockName, + 'attrs' => isset( $attrs ) ? $attrs : array(), + 'innerBlocks' => array(), + 'innerHTML' => '', + 'blockMarkers' => array(), ); ?> **/ @@ -208,7 +286,8 @@ Block_Void blockName: blockName, attrs: attrs || {}, innerBlocks: [], - innerHTML: '' + innerHTML: '', + blockMarkers: [], }; } @@ -216,25 +295,28 @@ Block_Balanced = s:Block_Start children:(Block / $(!Block_End .))* e:Block_End { /** $s['blockName'], 'attrs' => $s['attrs'], 'innerBlocks' => $innerBlocks, 'innerHTML' => implode( '', $innerHTML ), + 'blockMarkers' => $blockMarkers, ); ?> **/ - var innerContent = partition( function( a ) { return 'string' === typeof a }, children ); + var innerContent = splitInnerContent( children ); var innerHTML = innerContent[ 0 ]; var innerBlocks = innerContent[ 1 ]; + var blockMarkers = innerContent[ 2 ]; return { blockName: s.blockName, attrs: s.attrs, innerBlocks: innerBlocks, - innerHTML: innerHTML.join( '' ) + innerHTML: innerHTML.join( '' ), + blockMarkers: blockMarkers, }; } diff --git a/packages/block-serialization-spec-parser/parser.js b/packages/block-serialization-spec-parser/parser.js index e716efb9e3e8d2..fa1667e140a41c 100644 --- a/packages/block-serialization-spec-parser/parser.js +++ b/packages/block-serialization-spec-parser/parser.js @@ -165,10 +165,11 @@ peg$c10 = function(blockName, attrs) { /** $blockName, - 'attrs' => isset( $attrs ) ? $attrs : array(), - 'innerBlocks' => array(), - 'innerHTML' => '', + 'blockName' => $blockName, + 'attrs' => isset( $attrs ) ? $attrs : array(), + 'innerBlocks' => array(), + 'innerHTML' => '', + 'blockMarkers' => array(), ); ?> **/ @@ -176,30 +177,34 @@ blockName: blockName, attrs: attrs || {}, innerBlocks: [], - innerHTML: '' + innerHTML: '', + blockMarkers: [], }; }, peg$c11 = function(s, children, e) { /** $s['blockName'], 'attrs' => $s['attrs'], 'innerBlocks' => $innerBlocks, 'innerHTML' => implode( '', $innerHTML ), + 'blockMarkers' => $blockMarkers, ); ?> **/ - var innerContent = partition( function( a ) { return 'string' === typeof a }, children ); + var innerContent = splitInnerContent( children ); var innerHTML = innerContent[ 0 ]; var innerBlocks = innerContent[ 1 ]; + var blockMarkers = innerContent[ 2 ]; return { blockName: s.blockName, attrs: s.attrs, innerBlocks: innerBlocks, - innerHTML: innerHTML.join( '' ) + innerHTML: innerHTML.join( '' ), + blockMarkers: blockMarkers, }; }, peg$c12 = "-->", @@ -1478,18 +1483,31 @@ // are the same as `json_decode` // array arguments are backwards because of PHP - if ( ! function_exists( 'peg_array_partition' ) ) { - function peg_array_partition( $array, $predicate ) { - $truthy = array(); - $falsey = array(); + if ( ! function_exists( 'peg_split_inner_content' ) ) { + function peg_split_inner_content( $array ) { + $strings = array(); + $blocks = array(); + $markers = array(); + $offset = 0; + $string = ''; foreach ( $array as $item ) { - call_user_func( $predicate, $item ) - ? $truthy[] = $item - : $falsey[] = $item; + if ( is_string( $item ) ) { + $string .= $item; + } else { + $offset += strlen( $string ); + $strings[] = $string; + $markers[] = $offset; + $blocks[] = $item; + $string = ''; + } } - return array( $truthy, $falsey ); + if ( $string !== '' ) { + $strings[] = $string; + } + + return array( $strings, $blocks, $markers ); } } @@ -1499,10 +1517,10 @@ if ( ! empty( $pre ) ) { $blocks[] = array( - 'blockName' => null, - 'attrs' => array(), + 'blockName' => null, + 'attrs' => array(), 'innerBlocks' => array(), - 'innerHTML' => $pre + 'innerHTML' => $pre ); } @@ -1513,20 +1531,20 @@ if ( ! empty( $html ) ) { $blocks[] = array( - 'blockName' => null, - 'attrs' => array(), + 'blockName' => null, + 'attrs' => array(), 'innerBlocks' => array(), - 'innerHTML' => $html + 'innerHTML' => $html ); } } if ( ! empty( $post ) ) { $blocks[] = array( - 'blockName' => null, - 'attrs' => array(), + 'blockName' => null, + 'attrs' => array(), 'innerBlocks' => array(), - 'innerHTML' => $post + 'innerHTML' => $post ); } @@ -1578,22 +1596,86 @@ } } - function partition( predicate, list ) { + /** + * Returns bytes required to represent given string in UTF8 + * + * Assumes input is encoded in UCS2 or UTF16 according to the ECMAScript spec + * @see: https://www.ecma-international.org/ecma-262/9.0/index.html#sec-ecmascript-language-types-string-type + * + * Transparently counts bytes for invalid encodings: + * e.g. unpaired surrogate pair characters count as three bytes + * + * @cite: https://stackoverflow.com/a/34920444 + * + * @param {string} s input string + * @return {number} how many bytes are in the UTF8 representation of the given string + */ + function utf8bytes( s ) { + var i, l, n = 0; + + for ( i = 0, l = s.length; i < l; i++ ) { + var lo, hi = s.charCodeAt( i ); + + if ( hi < 0x0080) { // [0x0000, 0x007F] + n += 1; + } else if ( hi < 0x0800 ) { // [0x0080, 0x07FF] + n += 2; + } else if ( hi < 0xD800 ) { // [0x0800, 0xD7FF] + n += 3; + } else if ( hi < 0xDC00 ) { // [0xD800, 0xDBFF] + lo = s.charCodeAt( ++i ); + + if ( i < l && lo >= 0xDC00 && lo <= 0xDFFF ) { //followed by [0xDC00, 0xDFFF] + n += 4; + } else { + // this is an invalid string with an unpaired surrogate. + // transparently pass it through for byte counts + // and back up to restart processing at the next character. + n += 3; + i -= 1; + } + } else if ( hi < 0xE000 ) { //[0xDC00, 0xDFFF] + // these are invalid encodings in the Unicode standard + // because they are reserved for encoding surrogate pairs. + // transparently pass them through here for byte counts. + n += 3; + } else { // [0xE000, 0xFFFF] + n += 3; + } + } + + return n; + } + + function splitInnerContent( list ) { var i, l, item; - var truthy = []; - var falsey = []; + var strings = []; + var blocks = []; + var markers = []; + var offset = 0; + var string = ''; // nod to performance over a simpler reduce // and clone model we could have taken here for ( i = 0, l = list.length; i < l; i++ ) { item = list[ i ]; - predicate( item ) - ? truthy.push( item ) - : falsey.push( item ) + if ( 'string' === typeof item ) { + string += item; + } else { + offset += utf8bytes( string ); + strings.push( string ); + markers.push( offset ); + blocks.push( item ); + string = ''; + } }; - return [ truthy, falsey ]; + if ( string !== '' ) { + strings.push( string ); + } + + return [ strings, blocks, markers ]; } diff --git a/packages/block-serialization-spec-parser/shared-tests.js b/packages/block-serialization-spec-parser/shared-tests.js index e9b90337061eeb..c806c13bc17a1e 100644 --- a/packages/block-serialization-spec-parser/shared-tests.js +++ b/packages/block-serialization-spec-parser/shared-tests.js @@ -61,6 +61,70 @@ export const jsTester = ( parse ) => () => { expect.objectContaining( { innerHTML: '

Break me

' } ), ] ) ) ); } ); + + describe( 'blockMarkers', () => { + test( 'adds empty block markers when no inner blocks exist', () => { + expect( parse( '' )[ 0 ] ).toHaveProperty( 'blockMarkers', [] ); + expect( parse( '' )[ 0 ] ).toHaveProperty( 'blockMarkers', [] ); + expect( parse( 'with content' )[ 0 ] ).toHaveProperty( 'blockMarkers', [] ); + } ); + + test( 'adds block markers for inner blocks', () => { + expect( parse( '' )[ 0 ] ).toHaveProperty( 'blockMarkers', [ 0 ] ); + expect( parse( 'aabb' )[ 0 ] ).toHaveProperty( 'blockMarkers', [ 2 ] ); + expect( parse( 'aabbcc' )[ 0 ] ).toHaveProperty( 'blockMarkers', [ 2 ] ); + expect( parse( 'aabbcc' )[ 0 ] ).toHaveProperty( 'blockMarkers', [ 0, 2, 4 ] ); + } ); + + test( 'block markers report UTF-8 encoding byte-length', () => { + const run = ( c ) => parse( `${ c }` )[ 0 ]; + + // normal conditions + expect( run( '\u{0024}' ) ).toHaveProperty( 'blockMarkers', [ 1 ] ); // $ U+0000 - U+007F + expect( run( '\u{00a2}' ) ).toHaveProperty( 'blockMarkers', [ 2 ] ); // ยข U+0080 - U+07FF + expect( run( '\u{20ac}' ) ).toHaveProperty( 'blockMarkers', [ 3 ] ); // โ‚ฌ U+0800 - U+7FFF + expect( run( '\u{f8ff}' ) ).toHaveProperty( 'blockMarkers', [ 3 ] ); // ๏ฃฟ U+8000 - U+FFFF + expect( run( '\u{10348}' ) ).toHaveProperty( 'blockMarkers', [ 4 ] ); // ๐ˆ U+10000 - U+1FFFF + + expect( run( '$' ) ).toHaveProperty( 'blockMarkers', [ 1 ] ); // $ U+0000 - U+007F + expect( run( 'ยข' ) ).toHaveProperty( 'blockMarkers', [ 2 ] ); // ยข U+0080 - U+07FF + expect( run( 'โ‚ฌ' ) ).toHaveProperty( 'blockMarkers', [ 3 ] ); // โ‚ฌ U+0800 - U+7FFF + expect( run( '๏ฃฟ' ) ).toHaveProperty( 'blockMarkers', [ 3 ] ); // ๏ฃฟ U+8000 - U+FFFF + expect( run( '๐ˆ' ) ).toHaveProperty( 'blockMarkers', [ 4 ] ); // ๐ˆ U+10000 - U+1FFFF + + // surrogate pairs + expect( run( '\u{d800}' ) ).toHaveProperty( 'blockMarkers', [ 3 ] ); // invalid unpaired surrogate + expect( run( '\u{dc00}' ) ).toHaveProperty( 'blockMarkers', [ 3 ] ); // invalid unpaired surrogate + expect( run( '\u{10000}' ) ).toHaveProperty( 'blockMarkers', [ 4 ] ); // ๐€€ surrogate pair U+D800 U+DC00 + expect( run( '\ud800\udc00' ) ).toHaveProperty( 'blockMarkers', [ 4 ] ); // ๐€€ surrogate pair U+D800 U+DC00 + expect( run( '๐€€' ) ).toHaveProperty( 'blockMarkers', [ 4 ] ); // ๐€€ surrogate pair U+D800 U+DC00 + + // variations + expect( run( '\u{845b}' ) ).toHaveProperty( 'blockMarkers', [ 3 ] ); // edible bean; surname + expect( run( '\u{845b}\u{e0100}' ) ).toHaveProperty( 'blockMarkers', [ 7 ] ); // edible bean; surname + variation + + // NOTE: The next two run() strings _are not the same_ - check the encoding/raw bytes + // The first is the character by itself + // The second is the character plus the variation + expect( run( '่‘›' ) ).toHaveProperty( 'blockMarkers', [ 3 ] ); // edible bean; surname + expect( run( '่‘›๓ „€' ) ).toHaveProperty( 'blockMarkers', [ 7 ] ); // edible bean; surname + variation + + // higher planes + expect( run( '\u{24b62}' ) ).toHaveProperty( 'blockMarkers', [ 4 ] ); + expect( run( '๐คญข' ) ).toHaveProperty( 'blockMarkers', [ 4 ] ); + + // invalids + expect( run( '\u{fffd}' ) ).toHaveProperty( 'blockMarkers', [ 3 ] ); // replacement character + expect( run( '\u{80}' ) ).toHaveProperty( 'blockMarkers', [ 2 ] ); // unexpected continuation byte + expect( run( '\u{fe}' ) ).toHaveProperty( 'blockMarkers', [ 2 ] ); // invalid byte + + // emoji + expect( run( '\u{1f4a9}' ) ).toHaveProperty( 'blockMarkers', [ 4 ] ); // ๐Ÿ’ฉ pile of poo + expect( run( '๐Ÿ’ฉ' ) ).toHaveProperty( 'blockMarkers', [ 4 ] ); // ๐Ÿ’ฉ pile of poo + expect( run( '\u{2764}\u{fe0f}' ) ).toHaveProperty( 'blockMarkers', [ 6 ] ); // โค๏ธ black heart + variation 16 + expect( run( 'โค๏ธ' ) ).toHaveProperty( 'blockMarkers', [ 6 ] ); // โค๏ธ black heart + variation 16 + } ); + } ); }; const hasPHP = 'test' === process.env.NODE_ENV ? ( () => { diff --git a/packages/block-serialization-spec-parser/test/__snapshots__/index.js.snap b/packages/block-serialization-spec-parser/test/__snapshots__/index.js.snap index 1a014545d98744..151b337829aefb 100644 --- a/packages/block-serialization-spec-parser/test/__snapshots__/index.js.snap +++ b/packages/block-serialization-spec-parser/test/__snapshots__/index.js.snap @@ -4,6 +4,7 @@ exports[`block-serialization-spec-parser-js basic parsing parse() works properly Array [ Object { "attrs": Object {}, + "blockMarkers": Array [], "blockName": "core/more", "innerBlocks": Array [], "innerHTML": "", @@ -15,6 +16,7 @@ exports[`block-serialization-spec-parser-php basic parsing parse() works properl Array [ Object { "attrs": Array [], + "blockMarkers": Array [], "blockName": "core/more", "innerBlocks": Array [], "innerHTML": "",