From a5f5afb4a23d28677b9fcbd7d327795ec9c12681 Mon Sep 17 00:00:00 2001
From: Dennis Snell <dennis.snell@automattic.com>
Date: Wed, 31 Oct 2018 18:15:59 -0400
Subject: [PATCH] Parser: add blockMarkers (UTF-8 byte offsets of inner blocks)

---
 lib/parser.php                                |  59 ++++---
 .../parser.php                                |  43 ++++--
 .../src/index.js                              |  81 ++++++++--
 .../test/__snapshots__/index.js.snap          |   2 +
 .../grammar.pegjs                             | 146 ++++++++++++++----
 .../block-serialization-spec-parser/parser.js | 146 ++++++++++++++----
 .../shared-tests.js                           |  64 ++++++++
 .../test/__snapshots__/index.js.snap          |   2 +
 8 files changed, 436 insertions(+), 107 deletions(-)

diff --git a/lib/parser.php b/lib/parser.php
index 330249196861a0..7a5dc6e1ec9fd4 100644
--- a/lib/parser.php
+++ b/lib/parser.php
@@ -259,20 +259,22 @@ private function peg_f1($pre, $bs, $post) { return peg_join_blocks( $pre, $bs, $
     private function peg_f2($blockName, $a) { return $a; }
     private function peg_f3($blockName, $attrs) {
         return array(
-          'blockName'   => $blockName,
-          'attrs'       => isset( $attrs ) ? $attrs : array(),
-          'innerBlocks' => array(),
-          'innerHTML'   => '',
+          'blockName'    => $blockName,
+          'attrs'        => isset( $attrs ) ? $attrs : array(),
+          'innerBlocks'  => array(),
+          'innerHTML'    => '',
+          'blockMarkers' => array(),
         );
         }
     private function peg_f4($s, $children, $e) {
-        list( $innerHTML, $innerBlocks ) = peg_array_partition( $children, 'is_string' );
+        list( $innerHTML, $innerBlocks, $blockMarkers ) = peg_split_inner_content( $children );
 
         return array(
           'blockName'    => $s['blockName'],
           'attrs'        => $s['attrs'],
           'innerBlocks'  => $innerBlocks,
           'innerHTML'    => implode( '', $innerHTML ),
+          'blockMarkers' => $blockMarkers,
         );
         }
     private function peg_f5($blockName, $attrs) {
@@ -1441,18 +1443,31 @@ public function parse($input) {
     // are the same as `json_decode`
 
     // array arguments are backwards because of PHP
-    if ( ! function_exists( 'peg_array_partition' ) ) {
-        function peg_array_partition( $array, $predicate ) {
-            $truthy = array();
-            $falsey = array();
+    if ( ! function_exists( 'peg_split_inner_content' ) ) {
+        function peg_split_inner_content( $array ) {
+            $strings  = array();
+            $blocks   = array();
+            $markers  = array();
+            $offset   = 0;
+            $string   = '';
 
             foreach ( $array as $item ) {
-                call_user_func( $predicate, $item )
-                    ? $truthy[] = $item
-                    : $falsey[] = $item;
+                if ( is_string( $item ) ) {
+                    $string .= $item;
+                } else {
+                    $offset   += strlen( $string );
+                    $strings[] = $string;
+                    $markers[] = $offset;
+                    $blocks[]  = $item;
+                    $string    = '';
+                }
+            }
+
+            if ( $string !== '' ) {
+                $strings[] = $string;
             }
 
-            return array( $truthy, $falsey );
+            return array( $strings, $blocks, $markers );
         }
     }
 
@@ -1462,10 +1477,10 @@ function peg_join_blocks( $pre, $tokens, $post ) {
 
             if ( ! empty( $pre ) ) {
                 $blocks[] = array(
-                    'blockName' => null,
-                    'attrs' => array(),
+                    'blockName'   => null,
+                    'attrs'       => array(),
                     'innerBlocks' => array(),
-                    'innerHTML' => $pre
+                    'innerHTML'   => $pre
                 );
             }
 
@@ -1476,20 +1491,20 @@ function peg_join_blocks( $pre, $tokens, $post ) {
 
                 if ( ! empty( $html ) ) {
                     $blocks[] = array(
-                        'blockName' => null,
-                        'attrs' => array(),
+                        'blockName'   => null,
+                        'attrs'       => array(),
                         'innerBlocks' => array(),
-                        'innerHTML' => $html
+                        'innerHTML'   => $html
                     );
                 }
             }
 
             if ( ! empty( $post ) ) {
                 $blocks[] = array(
-                    'blockName' => null,
-                    'attrs' => array(),
+                    'blockName'   => null,
+                    'attrs'       => array(),
                     'innerBlocks' => array(),
-                    'innerHTML' => $post
+                    'innerHTML'   => $post
                 );
             }
 
diff --git a/packages/block-serialization-default-parser/parser.php b/packages/block-serialization-default-parser/parser.php
index 78b6921787cc48..d18d85711f0069 100644
--- a/packages/block-serialization-default-parser/parser.php
+++ b/packages/block-serialization-default-parser/parser.php
@@ -48,11 +48,20 @@ class WP_Block_Parser_Block {
 	 */
 	public $innerHTML;
 
-	function __construct( $name, $attrs, $innerBlocks, $innerHTML ) {
-		$this->blockName   = $name;
-		$this->attrs       = $attrs;
-		$this->innerBlocks = $innerBlocks;
-		$this->innerHTML   = $innerHTML;
+	/**
+	 * Bytes into `innerHTML` where inner blocks were found, assumed UTF8 encoding
+	 *
+	 * @since 5.0.0
+	 * @var int[]
+	 */
+	public $blockMarkers;
+
+	function __construct( $name, $attrs, $innerBlocks, $innerHTML, $blockMarkers = array() ) { // default keeps four-argument callers working
+		$this->blockName    = $name;
+		$this->attrs        = $attrs;
+		$this->innerBlocks  = $innerBlocks;
+		$this->innerHTML    = $innerHTML;
+		$this->blockMarkers = $blockMarkers; // byte offsets into innerHTML where inner blocks were found
+	}
 }
 
@@ -252,14 +261,14 @@ function proceed() {
 						) );
 					}
 
-					$this->output[] = (array) new WP_Block_Parser_Block( $block_name, $attrs, array(), '' );
+					$this->output[] = (array) new WP_Block_Parser_Block( $block_name, $attrs, array(), '', array() );
 					$this->offset = $start_offset + $token_length;
 					return true;
 				}
 
 				// otherwise we found an inner block
 				$this->add_inner_block(
-					new WP_Block_Parser_Block( $block_name, $attrs, array(), '' ),
+					new WP_Block_Parser_Block( $block_name, $attrs, array(), '', array() ),
 					$start_offset,
 					$token_length
 				);
@@ -269,7 +278,7 @@ function proceed() {
 			case 'block-opener':
 				// track all newly-opened blocks on the stack
 				array_push( $this->stack, new WP_Block_Parser_Frame(
-					new WP_Block_Parser_Block( $block_name, $attrs, array(), '' ),
+					new WP_Block_Parser_Block( $block_name, $attrs, array(), '', array() ),
 					$start_offset,
 					$token_length,
 					$start_offset + $token_length,
@@ -403,10 +412,15 @@ function next_token() {
 	 * @since 3.9.0
 	 *
 	 * @param string $innerHTML HTML content of block
-	 * @return WP_Block_Parser_Block freeform block object
+	 * @return array freeform block object
 	 */
 	static function freeform( $innerHTML ) {
-		return new WP_Block_Parser_Block( null, array(), array(), $innerHTML );
+		return array(
+			'blockName'   => null,
+			'attrs'       => array(),
+			'innerBlocks' => array(),
+			'innerHTML'   => $innerHTML,
+		);
 	}
 
 	/**
@@ -440,8 +454,15 @@ function add_freeform( $length = null ) {
 	 */
 	function add_inner_block( WP_Block_Parser_Block $block, $token_start, $token_length, $last_offset = null ) {
 		$parent = $this->stack[ count( $this->stack ) - 1 ];
+
+		$next_html = substr( $this->document, $parent->prev_offset, $token_start - $parent->prev_offset );
+		$prev_length = ! empty( $parent->block->blockMarkers )
+			? $parent->block->blockMarkers[ count( $parent->block->blockMarkers ) - 1 ]
+			: 0;
+
 		$parent->block->innerBlocks[] = $block;
-		$parent->block->innerHTML .= substr( $this->document, $parent->prev_offset, $token_start - $parent->prev_offset );
+		$parent->block->blockMarkers[] = $prev_length + strlen( $next_html );
+		$parent->block->innerHTML .= $next_html;
 		$parent->prev_offset = $last_offset ? $last_offset : $token_start + $token_length;
 	}
 
diff --git a/packages/block-serialization-default-parser/src/index.js b/packages/block-serialization-default-parser/src/index.js
index 77306b0e347c7b..6f14a2fe3c9bfa 100644
--- a/packages/block-serialization-default-parser/src/index.js
+++ b/packages/block-serialization-default-parser/src/index.js
@@ -4,17 +4,23 @@ let output;
 let stack;
 const tokenizer = /<!--\s+(\/)?wp:([a-z][a-z0-9_-]*\/)?([a-z][a-z0-9_-]*)\s+({(?:(?!}\s+-->)[^])+?}\s+)?(\/)?-->/g;
 
-function Block( blockName, attrs, innerBlocks, innerHTML ) {
+function Block( blockName, attrs, innerBlocks, innerHTML, blockMarkers ) {
 	return {
 		blockName,
 		attrs,
 		innerBlocks,
 		innerHTML,
+		blockMarkers,
 	};
 }
 
 function Freeform( innerHTML ) {
-	return Block( null, {}, [], innerHTML );
+	return {
+		blockName: null,
+		attrs: {},
+		innerBlocks: [],
+		innerHTML,
+	};
 }
 
 function Frame( block, tokenStart, tokenLength, prevOffset, leadingHtmlStart ) {
@@ -84,14 +90,14 @@ function proceed() {
 				if ( null !== leadingHtmlStart ) {
 					output.push( Freeform( document.substr( leadingHtmlStart, startOffset - leadingHtmlStart ) ) );
 				}
-				output.push( Block( blockName, attrs, [], '' ) );
+				output.push( Block( blockName, attrs, [], '', [] ) );
 				offset = startOffset + tokenLength;
 				return true;
 			}
 
 			// otherwise we found an inner block
 			addInnerBlock(
-				Block( blockName, attrs, [], '' ),
+				Block( blockName, attrs, [], '', [] ),
 				startOffset,
 				tokenLength,
 			);
@@ -102,7 +108,7 @@ function proceed() {
 			// track all newly-opened blocks on the stack
 			stack.push(
 				Frame(
-					Block( blockName, attrs, [], '' ),
+					Block( blockName, attrs, [], '', [] ),
 					startOffset,
 					tokenLength,
 					startOffset + tokenLength,
@@ -227,13 +233,68 @@ function addFreeform( rawLength ) {
 	output.push( Freeform( document.substr( offset, length ) ) );
 }
 
+/**
+ * Returns bytes required to represent given string in UTF8
+ *
+ * Assumes input is encoded in UCS2 or UTF16 according to the ECMAScript spec
+ * @see: https://www.ecma-international.org/ecma-262/9.0/index.html#sec-ecmascript-language-types-string-type
+ *
+ * Transparently counts bytes for invalid encodings:
+ * e.g. unpaired surrogate pair characters count as three bytes
+ *
+ * @cite: https://stackoverflow.com/a/34920444
+ *
+ * @param {string} s input string
+ * @return {number} how many bytes are in the UTF8 representation of the given string
+ */
+function utf8bytes( s ) {
+	let n = 0;
+
+	for ( let i = 0, l = s.length; i < l; i++ ) {
+		const hi = s.charCodeAt( i );
+
+		if ( hi < 0x0080 ) { // [0x0000, 0x007F]
+			n += 1;
+		} else if ( hi < 0x0800 ) { // [0x0080, 0x07FF]
+			n += 2;
+		} else if ( hi < 0xD800 ) { // [0x0800, 0xD7FF]
+			n += 3;
+		} else if ( hi < 0xDC00 ) { // [0xD800, 0xDBFF]
+			const lo = s.charCodeAt( ++i );
+
+			if ( i < l && lo >= 0xDC00 && lo <= 0xDFFF ) { //followed by [0xDC00, 0xDFFF]
+				n += 4;
+			} else {
+				// this is an invalid string with an unpaired surrogate.
+				// transparently pass it through for byte counts
+				// and back up to restart processing at the next character.
+				n += 3;
+				i -= 1;
+			}
+		} else if ( hi < 0xE000 ) { //[0xDC00, 0xDFFF]
+			// these are invalid encodings in the Unicode standard
+			// because they are reserved for encoding surrogate pairs.
+			// transparently pass them through here for byte counts.
+			n += 3;
+		} else { // [0xE000, 0xFFFF]
+			n += 3;
+		}
+	}
+
+	return n;
+}
+
 function addInnerBlock( block, tokenStart, tokenLength, lastOffset ) {
 	const parent = stack[ stack.length - 1 ];
-	parent.block.innerBlocks.push( block );
-	parent.block.innerHTML += document.substr(
-		parent.prevOffset,
-		tokenStart - parent.prevOffset,
-	);
+	const parentBlock = parent.block;
+	const blockMarkers = parentBlock.blockMarkers;
+
+	const nextHTML = document.substr( parent.prevOffset, tokenStart - parent.prevOffset );
+	const prevLength = blockMarkers.length ? blockMarkers[ blockMarkers.length - 1 ] : 0;
+
+	parentBlock.innerBlocks.push( block );
+	blockMarkers.push( prevLength + utf8bytes( nextHTML ) );
+	parentBlock.innerHTML += nextHTML;
 	parent.prevOffset = lastOffset ? lastOffset : tokenStart + tokenLength;
 }
 
diff --git a/packages/block-serialization-default-parser/test/__snapshots__/index.js.snap b/packages/block-serialization-default-parser/test/__snapshots__/index.js.snap
index 5b5abb61a3d2e5..953b4126c0eb16 100644
--- a/packages/block-serialization-default-parser/test/__snapshots__/index.js.snap
+++ b/packages/block-serialization-default-parser/test/__snapshots__/index.js.snap
@@ -4,6 +4,7 @@ exports[`block-serialization-default-parser-js basic parsing parse() works prope
 Array [
   Object {
     "attrs": Object {},
+    "blockMarkers": Array [],
     "blockName": "core/more",
     "innerBlocks": Array [],
     "innerHTML": "<!--more-->",
@@ -15,6 +16,7 @@ exports[`block-serialization-default-parser-php basic parsing parse() works prop
 Array [
   Object {
     "attrs": Object {},
+    "blockMarkers": Array [],
     "blockName": "core/more",
     "innerBlocks": Array [],
     "innerHTML": "<!--more-->",
diff --git a/packages/block-serialization-spec-parser/grammar.pegjs b/packages/block-serialization-spec-parser/grammar.pegjs
index 00d1fec2114136..0fb15b82ff3cf6 100644
--- a/packages/block-serialization-spec-parser/grammar.pegjs
+++ b/packages/block-serialization-spec-parser/grammar.pegjs
@@ -51,18 +51,31 @@
 // are the same as `json_decode`
 
 // array arguments are backwards because of PHP
-if ( ! function_exists( 'peg_array_partition' ) ) {
-    function peg_array_partition( $array, $predicate ) {
-        $truthy = array();
-        $falsey = array();
+if ( ! function_exists( 'peg_split_inner_content' ) ) {
+    function peg_split_inner_content( $array ) {
+        $strings  = array();
+        $blocks   = array();
+        $markers  = array();
+        $offset   = 0;
+        $string   = '';
 
         foreach ( $array as $item ) {
-            call_user_func( $predicate, $item )
-                ? $truthy[] = $item
-                : $falsey[] = $item;
+            if ( is_string( $item ) ) {
+                $string .= $item;
+            } else {
+                $offset   += strlen( $string );
+                $strings[] = $string;
+                $markers[] = $offset;
+                $blocks[]  = $item;
+                $string    = '';
+            }
+        }
+
+        if ( $string !== '' ) {
+            $strings[] = $string;
         }
 
-        return array( $truthy, $falsey );
+        return array( $strings, $blocks, $markers );
     }
 }
 
@@ -72,10 +85,10 @@ if ( ! function_exists( 'peg_join_blocks' ) ) {
 
         if ( ! empty( $pre ) ) {
             $blocks[] = array(
-                'blockName' => null,
-                'attrs' => array(),
+                'blockName'   => null,
+                'attrs'       => array(),
                 'innerBlocks' => array(),
-                'innerHTML' => $pre
+                'innerHTML'   => $pre
             );
         }
 
@@ -86,20 +99,20 @@ if ( ! function_exists( 'peg_join_blocks' ) ) {
 
             if ( ! empty( $html ) ) {
                 $blocks[] = array(
-                    'blockName' => null,
-                    'attrs' => array(),
+                    'blockName'   => null,
+                    'attrs'       => array(),
                     'innerBlocks' => array(),
-                    'innerHTML' => $html
+                    'innerHTML'   => $html
                 );
             }
         }
 
         if ( ! empty( $post ) ) {
             $blocks[] = array(
-                'blockName' => null,
-                'attrs' => array(),
+                'blockName'   => null,
+                'attrs'       => array(),
                 'innerBlocks' => array(),
-                'innerHTML' => $post
+                'innerHTML'   => $post
             );
         }
 
@@ -151,22 +164,86 @@ function maybeJSON( s ) {
     }
 }
 
-function partition( predicate, list ) {
+/**
+ * Returns bytes required to represent given string in UTF8
+ *
+ * Assumes input is encoded in UCS2 or UTF16 according to the ECMAScript spec
+ * @see: https://www.ecma-international.org/ecma-262/9.0/index.html#sec-ecmascript-language-types-string-type
+ *
+ * Transparently counts bytes for invalid encodings:
+ * e.g. unpaired surrogate pair characters count as three bytes
+ *
+ * @cite: https://stackoverflow.com/a/34920444
+ *
+ * @param {string} s input string
+ * @return {number} how many bytes are in the UTF8 representation of the given string
+ */
+function utf8bytes( s ) {
+    var i, l, n = 0;
+
+    for ( i = 0, l = s.length; i < l; i++ ) {
+        var lo, hi = s.charCodeAt( i );
+
+        if ( hi < 0x0080) { // [0x0000, 0x007F]
+            n += 1;
+        } else if ( hi < 0x0800 ) { // [0x0080, 0x07FF]
+            n += 2;
+        } else if ( hi < 0xD800 ) { // [0x0800, 0xD7FF]
+            n += 3;
+        } else if ( hi < 0xDC00 ) { // [0xD800, 0xDBFF]
+            lo = s.charCodeAt( ++i );
+
+            if ( i < l && lo >= 0xDC00 && lo <= 0xDFFF ) { //followed by [0xDC00, 0xDFFF]
+                n += 4;
+            } else {
+                // this is an invalid string with an unpaired surrogate.
+                // transparently pass it through for byte counts
+                // and back up to restart processing at the next character.
+                n += 3;
+                i -= 1;
+            }
+        } else if ( hi < 0xE000 ) { //[0xDC00, 0xDFFF]
+            // these are invalid encodings in the Unicode standard
+            // because they are reserved for encoding surrogate pairs.
+            // transparently pass them through here for byte counts.
+            n += 3;
+        } else { // [0xE000, 0xFFFF]
+            n += 3;
+        }
+    }
+
+    return n;
+}
+
+function splitInnerContent( list ) {
     var i, l, item;
-    var truthy = [];
-    var falsey = [];
+    var strings = [];
+    var blocks = [];
+    var markers = [];
+    var offset = 0;
+    var string = '';
 
     // nod to performance over a simpler reduce
     // and clone model we could have taken here
     for ( i = 0, l = list.length; i < l; i++ ) {
         item = list[ i ];
 
-        predicate( item )
-            ? truthy.push( item )
-            : falsey.push( item )
+        if ( 'string' === typeof item ) {
+            string += item;
+        } else {
+            offset += utf8bytes( string );
+            strings.push( string );
+            markers.push( offset );
+            blocks.push( item );
+            string = '';
+        }
     };
 
-    return [ truthy, falsey ];
+    if ( string !== '' ) {
+        strings.push( string );
+    }
+
+    return [ strings, blocks, markers ];
 }
 
 }
@@ -197,10 +274,11 @@ Block_Void
   {
     /** <?php
     return array(
-      'blockName'   => $blockName,
-      'attrs'       => isset( $attrs ) ? $attrs : array(),
-      'innerBlocks' => array(),
-      'innerHTML'   => '',
+      'blockName'    => $blockName,
+      'attrs'        => isset( $attrs ) ? $attrs : array(),
+      'innerBlocks'  => array(),
+      'innerHTML'    => '',
+      'blockMarkers' => array(),
     );
     ?> **/
 
@@ -208,7 +286,8 @@ Block_Void
       blockName: blockName,
       attrs: attrs || {},
       innerBlocks: [],
-      innerHTML: ''
+      innerHTML: '',
+      blockMarkers: [],
     };
   }
 
@@ -216,25 +295,28 @@ Block_Balanced
   = s:Block_Start children:(Block / $(!Block_End .))* e:Block_End
   {
     /** <?php
-    list( $innerHTML, $innerBlocks ) = peg_array_partition( $children, 'is_string' );
+    list( $innerHTML, $innerBlocks, $blockMarkers ) = peg_split_inner_content( $children );
 
     return array(
       'blockName'    => $s['blockName'],
       'attrs'        => $s['attrs'],
       'innerBlocks'  => $innerBlocks,
       'innerHTML'    => implode( '', $innerHTML ),
+      'blockMarkers' => $blockMarkers,
     );
     ?> **/
 
-    var innerContent = partition( function( a ) { return 'string' === typeof a }, children );
+    var innerContent = splitInnerContent( children );
     var innerHTML = innerContent[ 0 ];
     var innerBlocks = innerContent[ 1 ];
+    var blockMarkers = innerContent[ 2 ];
 
     return {
       blockName: s.blockName,
       attrs: s.attrs,
       innerBlocks: innerBlocks,
-      innerHTML: innerHTML.join( '' )
+      innerHTML: innerHTML.join( '' ),
+      blockMarkers: blockMarkers,
     };
   }
 
diff --git a/packages/block-serialization-spec-parser/parser.js b/packages/block-serialization-spec-parser/parser.js
index e716efb9e3e8d2..fa1667e140a41c 100644
--- a/packages/block-serialization-spec-parser/parser.js
+++ b/packages/block-serialization-spec-parser/parser.js
@@ -165,10 +165,11 @@
         peg$c10 = function(blockName, attrs) {
             /** <?php
             return array(
-              'blockName'   => $blockName,
-              'attrs'       => isset( $attrs ) ? $attrs : array(),
-              'innerBlocks' => array(),
-              'innerHTML'   => '',
+              'blockName'    => $blockName,
+              'attrs'        => isset( $attrs ) ? $attrs : array(),
+              'innerBlocks'  => array(),
+              'innerHTML'    => '',
+              'blockMarkers' => array(),
             );
             ?> **/
 
@@ -176,30 +177,34 @@
               blockName: blockName,
               attrs: attrs || {},
               innerBlocks: [],
-              innerHTML: ''
+              innerHTML: '',
+              blockMarkers: [],
             };
           },
         peg$c11 = function(s, children, e) {
             /** <?php
-            list( $innerHTML, $innerBlocks ) = peg_array_partition( $children, 'is_string' );
+            list( $innerHTML, $innerBlocks, $blockMarkers ) = peg_split_inner_content( $children );
 
             return array(
               'blockName'    => $s['blockName'],
               'attrs'        => $s['attrs'],
               'innerBlocks'  => $innerBlocks,
               'innerHTML'    => implode( '', $innerHTML ),
+              'blockMarkers' => $blockMarkers,
             );
             ?> **/
 
-            var innerContent = partition( function( a ) { return 'string' === typeof a }, children );
+            var innerContent = splitInnerContent( children );
             var innerHTML = innerContent[ 0 ];
             var innerBlocks = innerContent[ 1 ];
+            var blockMarkers = innerContent[ 2 ];
 
             return {
               blockName: s.blockName,
               attrs: s.attrs,
               innerBlocks: innerBlocks,
-              innerHTML: innerHTML.join( '' )
+              innerHTML: innerHTML.join( '' ),
+              blockMarkers: blockMarkers,
             };
           },
         peg$c12 = "-->",
@@ -1478,18 +1483,31 @@
     // are the same as `json_decode`
 
     // array arguments are backwards because of PHP
-    if ( ! function_exists( 'peg_array_partition' ) ) {
-        function peg_array_partition( $array, $predicate ) {
-            $truthy = array();
-            $falsey = array();
+    if ( ! function_exists( 'peg_split_inner_content' ) ) {
+        function peg_split_inner_content( $array ) {
+            $strings  = array();
+            $blocks   = array();
+            $markers  = array();
+            $offset   = 0;
+            $string   = '';
 
             foreach ( $array as $item ) {
-                call_user_func( $predicate, $item )
-                    ? $truthy[] = $item
-                    : $falsey[] = $item;
+                if ( is_string( $item ) ) {
+                    $string .= $item;
+                } else {
+                    $offset   += strlen( $string );
+                    $strings[] = $string;
+                    $markers[] = $offset;
+                    $blocks[]  = $item;
+                    $string    = '';
+                }
             }
 
-            return array( $truthy, $falsey );
+            if ( $string !== '' ) {
+                $strings[] = $string;
+            }
+
+            return array( $strings, $blocks, $markers );
         }
     }
 
@@ -1499,10 +1517,10 @@
 
             if ( ! empty( $pre ) ) {
                 $blocks[] = array(
-                    'blockName' => null,
-                    'attrs' => array(),
+                    'blockName'   => null,
+                    'attrs'       => array(),
                     'innerBlocks' => array(),
-                    'innerHTML' => $pre
+                    'innerHTML'   => $pre
                 );
             }
 
@@ -1513,20 +1531,20 @@
 
                 if ( ! empty( $html ) ) {
                     $blocks[] = array(
-                        'blockName' => null,
-                        'attrs' => array(),
+                        'blockName'   => null,
+                        'attrs'       => array(),
                         'innerBlocks' => array(),
-                        'innerHTML' => $html
+                        'innerHTML'   => $html
                     );
                 }
             }
 
             if ( ! empty( $post ) ) {
                 $blocks[] = array(
-                    'blockName' => null,
-                    'attrs' => array(),
+                    'blockName'   => null,
+                    'attrs'       => array(),
                     'innerBlocks' => array(),
-                    'innerHTML' => $post
+                    'innerHTML'   => $post
                 );
             }
 
@@ -1578,22 +1596,86 @@
         }
     }
 
-    function partition( predicate, list ) {
+    /**
+     * Returns bytes required to represent given string in UTF8
+     *
+     * Assumes input is encoded in UCS2 or UTF16 according to the ECMAScript spec
+     * @see: https://www.ecma-international.org/ecma-262/9.0/index.html#sec-ecmascript-language-types-string-type
+     *
+     * Transparently counts bytes for invalid encodings:
+     * e.g. unpaired surrogate pair characters count as three bytes
+     *
+     * @cite: https://stackoverflow.com/a/34920444
+     *
+     * @param {string} s input string
+     * @return {number} how many bytes are in the UTF8 representation of the given string
+     */
+    function utf8bytes( s ) {
+        var i, l, n = 0;
+
+        for ( i = 0, l = s.length; i < l; i++ ) {
+            var lo, hi = s.charCodeAt( i );
+
+            if ( hi < 0x0080) { // [0x0000, 0x007F]
+                n += 1;
+            } else if ( hi < 0x0800 ) { // [0x0080, 0x07FF]
+                n += 2;
+            } else if ( hi < 0xD800 ) { // [0x0800, 0xD7FF]
+                n += 3;
+            } else if ( hi < 0xDC00 ) { // [0xD800, 0xDBFF]
+                lo = s.charCodeAt( ++i );
+
+                if ( i < l && lo >= 0xDC00 && lo <= 0xDFFF ) { //followed by [0xDC00, 0xDFFF]
+                    n += 4;
+                } else {
+                    // this is an invalid string with an unpaired surrogate.
+                    // transparently pass it through for byte counts
+                    // and back up to restart processing at the next character.
+                    n += 3;
+                    i -= 1;
+                }
+            } else if ( hi < 0xE000 ) { //[0xDC00, 0xDFFF]
+                // these are invalid encodings in the Unicode standard
+                // because they are reserved for encoding surrogate pairs.
+                // transparently pass them through here for byte counts.
+                n += 3;
+            } else { // [0xE000, 0xFFFF]
+                n += 3;
+            }
+        }
+
+        return n;
+    }
+
+    function splitInnerContent( list ) {
         var i, l, item;
-        var truthy = [];
-        var falsey = [];
+        var strings = [];
+        var blocks = [];
+        var markers = [];
+        var offset = 0;
+        var string = '';
 
         // nod to performance over a simpler reduce
         // and clone model we could have taken here
         for ( i = 0, l = list.length; i < l; i++ ) {
             item = list[ i ];
 
-            predicate( item )
-                ? truthy.push( item )
-                : falsey.push( item )
+            if ( 'string' === typeof item ) {
+                string += item;
+            } else {
+                offset += utf8bytes( string );
+                strings.push( string );
+                markers.push( offset );
+                blocks.push( item );
+                string = '';
+            }
         };
 
-        return [ truthy, falsey ];
+        if ( string !== '' ) {
+            strings.push( string );
+        }
+
+        return [ strings, blocks, markers ];
     }
 
 
diff --git a/packages/block-serialization-spec-parser/shared-tests.js b/packages/block-serialization-spec-parser/shared-tests.js
index e9b90337061eeb..c806c13bc17a1e 100644
--- a/packages/block-serialization-spec-parser/shared-tests.js
+++ b/packages/block-serialization-spec-parser/shared-tests.js
@@ -61,6 +61,70 @@ export const jsTester = ( parse ) => () => {
 			expect.objectContaining( { innerHTML: '<p>Break me</p>' } ),
 		] ) ) );
 	} );
+
+	describe( 'blockMarkers', () => {
+		test( 'adds empty block markers when no inner blocks exist', () => {
+			expect( parse( '<!-- wp:void /-->' )[ 0 ] ).toHaveProperty( 'blockMarkers', [] );
+			expect( parse( '<!-- wp:block --><!-- /wp:block -->' )[ 0 ] ).toHaveProperty( 'blockMarkers', [] );
+			expect( parse( '<!-- wp:block -->with content<!-- /wp:block -->' )[ 0 ] ).toHaveProperty( 'blockMarkers', [] );
+		} );
+
+		test( 'adds block markers for inner blocks', () => {
+			expect( parse( '<!-- wp:block --><!-- wp:void /--><!-- /wp:block -->' )[ 0 ] ).toHaveProperty( 'blockMarkers', [ 0 ] );
+			expect( parse( '<!-- wp:block -->aa<!-- wp:void /-->bb<!-- /wp:block -->' )[ 0 ] ).toHaveProperty( 'blockMarkers', [ 2 ] );
+			expect( parse( '<!-- wp:block -->aa<!-- wp:inner -->bb<!-- /wp:inner -->cc<!-- /wp:block -->' )[ 0 ] ).toHaveProperty( 'blockMarkers', [ 2 ] );
+			expect( parse( '<!-- wp:block --><!-- wp:start /-->aa<!-- wp:inner -->bb<!-- /wp:inner -->cc<!-- wp:end /--><!-- /wp:block -->' )[ 0 ] ).toHaveProperty( 'blockMarkers', [ 0, 2, 4 ] );
+		} );
+
+		test( 'block markers report UTF-8 encoding byte-length', () => {
+			const run = ( c ) => parse( `<!-- wp:block -->${ c }<!-- wp:void /--><!-- /wp:block -->` )[ 0 ];
+
+			// normal conditions
+			expect( run( '\u{0024}' ) ).toHaveProperty( 'blockMarkers', [ 1 ] ); // $ U+0000 - U+007F
+			expect( run( '\u{00a2}' ) ).toHaveProperty( 'blockMarkers', [ 2 ] ); // ¢ U+0080 - U+07FF
+			expect( run( '\u{20ac}' ) ).toHaveProperty( 'blockMarkers', [ 3 ] ); // € U+0800 - U+7FFF
+			expect( run( '\u{f8ff}' ) ).toHaveProperty( 'blockMarkers', [ 3 ] ); //  U+8000 - U+FFFF
+			expect( run( '\u{10348}' ) ).toHaveProperty( 'blockMarkers', [ 4 ] ); // 𐍈 U+10000 - U+10FFFF
+
+			expect( run( '$' ) ).toHaveProperty( 'blockMarkers', [ 1 ] ); // $ U+0000 - U+007F
+			expect( run( '¢' ) ).toHaveProperty( 'blockMarkers', [ 2 ] ); // ¢ U+0080 - U+07FF
+			expect( run( '€' ) ).toHaveProperty( 'blockMarkers', [ 3 ] ); // € U+0800 - U+7FFF
+			expect( run( '' ) ).toHaveProperty( 'blockMarkers', [ 3 ] ); //  U+8000 - U+FFFF
+			expect( run( '𐍈' ) ).toHaveProperty( 'blockMarkers', [ 4 ] ); // 𐍈 U+10000 - U+10FFFF
+
+			// surrogate pairs
+			expect( run( '\u{d800}' ) ).toHaveProperty( 'blockMarkers', [ 3 ] ); // invalid unpaired surrogate
+			expect( run( '\u{dc00}' ) ).toHaveProperty( 'blockMarkers', [ 3 ] ); // invalid unpaired surrogate
+			expect( run( '\u{10000}' ) ).toHaveProperty( 'blockMarkers', [ 4 ] ); // 𐀀 surrogate pair U+D800 U+DC00
+			expect( run( '\ud800\udc00' ) ).toHaveProperty( 'blockMarkers', [ 4 ] ); // 𐀀 surrogate pair U+D800 U+DC00
+			expect( run( '𐀀' ) ).toHaveProperty( 'blockMarkers', [ 4 ] ); // 𐀀 surrogate pair U+D800 U+DC00
+
+			// variations
+			expect( run( '\u{845b}' ) ).toHaveProperty( 'blockMarkers', [ 3 ] ); // edible bean; surname
+			expect( run( '\u{845b}\u{e0100}' ) ).toHaveProperty( 'blockMarkers', [ 7 ] ); // edible bean; surname + variation
+
+			// NOTE: The next two run() strings _are not the same_ - check the encoding/raw bytes
+			// The first is the character by itself
+			// The second is the character plus the variation
+			expect( run( '葛' ) ).toHaveProperty( 'blockMarkers', [ 3 ] ); // edible bean; surname
+			expect( run( '葛󠄀' ) ).toHaveProperty( 'blockMarkers', [ 7 ] ); // edible bean; surname + variation
+
+			// higher planes
+			expect( run( '\u{24b62}' ) ).toHaveProperty( 'blockMarkers', [ 4 ] );
+			expect( run( '𤭢' ) ).toHaveProperty( 'blockMarkers', [ 4 ] );
+
+			// invalids
+			expect( run( '\u{fffd}' ) ).toHaveProperty( 'blockMarkers', [ 3 ] ); // replacement character
+			expect( run( '\u{80}' ) ).toHaveProperty( 'blockMarkers', [ 2 ] ); // code point U+0080 (UTF-8: C2 80), not a raw 0x80 continuation byte
+			expect( run( '\u{fe}' ) ).toHaveProperty( 'blockMarkers', [ 2 ] ); // code point U+00FE (UTF-8: C3 BE), not a raw 0xFE invalid byte
+
+			// emoji
+			expect( run( '\u{1f4a9}' ) ).toHaveProperty( 'blockMarkers', [ 4 ] ); // 💩 pile of poo
+			expect( run( '💩' ) ).toHaveProperty( 'blockMarkers', [ 4 ] ); // 💩 pile of poo
+			expect( run( '\u{2764}\u{fe0f}' ) ).toHaveProperty( 'blockMarkers', [ 6 ] ); // ❤️ black heart + variation 16
+			expect( run( '❤️' ) ).toHaveProperty( 'blockMarkers', [ 6 ] ); // ❤️ black heart + variation 16
+		} );
+	} );
 };
 
 const hasPHP = 'test' === process.env.NODE_ENV ? ( () => {
diff --git a/packages/block-serialization-spec-parser/test/__snapshots__/index.js.snap b/packages/block-serialization-spec-parser/test/__snapshots__/index.js.snap
index 1a014545d98744..151b337829aefb 100644
--- a/packages/block-serialization-spec-parser/test/__snapshots__/index.js.snap
+++ b/packages/block-serialization-spec-parser/test/__snapshots__/index.js.snap
@@ -4,6 +4,7 @@ exports[`block-serialization-spec-parser-js basic parsing parse() works properly
 Array [
   Object {
     "attrs": Object {},
+    "blockMarkers": Array [],
     "blockName": "core/more",
     "innerBlocks": Array [],
     "innerHTML": "<!--more-->",
@@ -15,6 +16,7 @@ exports[`block-serialization-spec-parser-php basic parsing parse() works properl
 Array [
   Object {
     "attrs": Array [],
+    "blockMarkers": Array [],
     "blockName": "core/more",
     "innerBlocks": Array [],
     "innerHTML": "<!--more-->",