From cce06cd376ef0f5c627a5f8082862d289fb825a8 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sun, 21 Jan 2024 23:45:18 +0900 Subject: [PATCH] Optimize the parse_attributes method to use `Source#match` to parse XML. ## Why? Improve maintainability by consolidating processing into `Source#match`. ## [Benchmark] ``` RUBYLIB= BUNDLER_ORIG_RUBYLIB= /Users/naitoh/.rbenv/versions/3.3.0/bin/ruby -v -S benchmark-driver /Users/naitoh/ghq/github.com/naitoh/rexml/benchmark/parse.yaml ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [arm64-darwin22] Calculating ------------------------------------- before after before(YJIT) after(YJIT) dom 10.193 10.886 16.969 18.136 i/s - 100.000 times in 9.810390s 9.186087s 5.893093s 5.513848s sax 30.812 30.131 50.759 55.520 i/s - 100.000 times in 3.245535s 3.318887s 1.970105s 1.801142s pull 36.413 35.511 60.692 68.538 i/s - 100.000 times in 2.746293s 2.816067s 1.647660s 1.459039s stream 34.968 34.461 52.797 60.311 i/s - 100.000 times in 2.859738s 2.901858s 1.894046s 1.658079s Comparison: dom after(YJIT): 18.1 i/s before(YJIT): 17.0 i/s - 1.07x slower after: 10.9 i/s - 1.67x slower before: 10.2 i/s - 1.78x slower sax after(YJIT): 55.5 i/s before(YJIT): 50.8 i/s - 1.09x slower before: 30.8 i/s - 1.80x slower after: 30.1 i/s - 1.84x slower pull after(YJIT): 68.5 i/s before(YJIT): 60.7 i/s - 1.13x slower before: 36.4 i/s - 1.88x slower after: 35.5 i/s - 1.93x slower stream after(YJIT): 60.3 i/s before(YJIT): 52.8 i/s - 1.14x slower before: 35.0 i/s - 1.72x slower after: 34.5 i/s - 1.75x slower ``` - YJIT=ON : 1.07x - 1.14x faster - YJIT=OFF : 0.97x - 1.06x faster --- lib/rexml/parsers/baseparser.rb | 116 ++++++++++++-------------------- test/parse/test_element.rb | 6 +- test/test_core.rb | 9 ++- 3 files changed, 53 insertions(+), 78 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index c01b087b..93a5b169 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -114,7 +114,7 @@ class BaseParser module Private INSTRUCTION_END = /#{NAME}(\s+.*?)?\?>/um - TAG_PATTERN = /((?>#{QNAME_STR}))/um + TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um NAME_PATTERN = /\s*#{NAME}/um @@ -128,7 +128,6 @@ module Private def initialize( source ) self.stream = source @listeners = [] - @attributes_scanner = StringScanner.new('') end def add_listener( listener ) @@ -614,87 +613,60 @@ def process_instruction(start_position) def parse_attributes(prefixes, curr_ns) attributes = {} closed = false - match_data = @source.match(/^(.*?)(\/)?>/um, true) - if match_data.nil? - message = "Start tag isn't ended" - raise REXML::ParseException.new(message, @source) - end - - raw_attributes = match_data[1] - closed = !match_data[2].nil? - return attributes, closed if raw_attributes.nil? - return attributes, closed if raw_attributes.empty? - - @attributes_scanner.string = raw_attributes - scanner = @attributes_scanner - until scanner.eos? - if scanner.scan(/\s+/) - break if scanner.eos? - end - - start_position = scanner.pos - while true - break if scanner.scan(ATTRIBUTE_PATTERN) - unless scanner.scan(QNAME) - message = "Invalid attribute name: <#{scanner.rest}>" - raise REXML::ParseException.new(message, @source) - end - name = scanner[0] - unless scanner.scan(/\s*=\s*/um) + while true + if @source.match(">", true) + return attributes, closed + elsif @source.match("/>", true) + closed = true + return attributes, closed + elsif match = @source.match(QNAME, true) + name = match[1] + prefix = match[2] + local_part = match[3] + + unless @source.match(/\s*=\s*/um, true) message = "Missing attribute equal: <#{name}>" raise REXML::ParseException.new(message, @source) end - quote = scanner.scan(/['"]/) - unless quote - message = "Missing attribute value start quote: <#{name}>" - raise REXML::ParseException.new(message, @source) - end - unless scanner.scan(/.*#{Regexp.escape(quote)}/um) - @source.ensure_buffer - match_data = @source.match(/^(.*?)(\/)?>/um, true) - if match_data - scanner << "/" if closed - scanner << ">" - scanner << match_data[1] - scanner.pos = start_position - closed = !match_data[2].nil? - next + unless match = @source.match(/(['"])(.*?)\1\s*/um, true) + if match = @source.match(/(['"])/, true) + message = + "Missing attribute value end quote: <#{name}>: <#{match[1]}>" + raise REXML::ParseException.new(message, @source) + else + message = "Missing attribute value start quote: <#{name}>" + raise REXML::ParseException.new(message, @source) end - message = - "Missing attribute value end quote: <#{name}>: <#{quote}>" - raise REXML::ParseException.new(message, @source) end - end - name = scanner[1] - prefix = scanner[2] - local_part = scanner[3] - # quote = scanner[4] - value = scanner[5] - if prefix == "xmlns" - if local_part == "xml" - if value != "http://www.w3.org/XML/1998/namespace" - msg = "The 'xml' prefix must not be bound to any other namespace "+ + value = match[2] + if prefix == "xmlns" + if local_part == "xml" + if value != "http://www.w3.org/XML/1998/namespace" + msg = "The 'xml' prefix must not be bound to any other namespace "+ + "(http://www.w3.org/TR/REC-xml-names/#ns-decl)" + raise REXML::ParseException.new( msg, @source, self ) + end + elsif local_part == "xmlns" + msg = "The 'xmlns' prefix must not be declared "+ "(http://www.w3.org/TR/REC-xml-names/#ns-decl)" - raise REXML::ParseException.new( msg, @source, self ) + raise REXML::ParseException.new( msg, @source, self) end - elsif local_part == "xmlns" - msg = "The 'xmlns' prefix must not be declared "+ - "(http://www.w3.org/TR/REC-xml-names/#ns-decl)" - raise REXML::ParseException.new( msg, @source, self) + curr_ns << local_part + elsif prefix + prefixes << prefix unless prefix == "xml" end - curr_ns << local_part - elsif prefix - prefixes << prefix unless prefix == "xml" - end - if attributes.has_key?(name) - msg = "Duplicate attribute #{name.inspect}" - raise REXML::ParseException.new(msg, @source, self) - end + if attributes[name] + msg = "Duplicate attribute #{name.inspect}" + raise REXML::ParseException.new(msg, @source, self) + end - attributes[name] = value + attributes[name] = value + else + message = "Invalid attribute name: <#{@source.buffer}>" + raise REXML::ParseException.new(message, @source) + end end - return attributes, closed end end end diff --git a/test/parse/test_element.rb b/test/parse/test_element.rb index 9f172a28..33dec8cd 100644 --- a/test/parse/test_element.rb +++ b/test/parse/test_element.rb @@ -39,11 +39,11 @@ def test_empty_namespace_attribute_name parse("") end assert_equal(<<-DETAIL.chomp, exception.to_s) -Invalid attribute name: <:a=""> +Invalid attribute name: <:a="">> Line: 1 -Position: 9 +Position: 13 Last 80 unconsumed characters: - +:a=""> DETAIL end diff --git a/test/test_core.rb b/test/test_core.rb index 540564da..da676bb0 100644 --- a/test/test_core.rb +++ b/test/test_core.rb @@ -116,11 +116,12 @@ def test_attribute def test_attribute_namespace_conflict # https://www.w3.org/TR/xml-names/#uniqAttrs - message = <<-MESSAGE + message = <<-MESSAGE.chomp Duplicate attribute "a" Line: 4 Position: 140 Last 80 unconsumed characters: +/> MESSAGE assert_raise(REXML::ParseException.new(message)) do Document.new(<<-XML) @@ -1323,11 +1324,12 @@ def test_ticket_21 exception = assert_raise(ParseException) do Document.new(src) end - assert_equal(<<-DETAIL, exception.to_s) + assert_equal(<<-DETAIL.chomp, exception.to_s) Missing attribute value start quote: Line: 1 Position: 16 Last 80 unconsumed characters: +value/> DETAIL end @@ -1336,11 +1338,12 @@ def test_ticket_21_part_2 exception = assert_raise(ParseException) do Document.new(src) end - assert_equal(<<-DETAIL, exception.to_s) + assert_equal(<<-DETAIL.chomp, exception.to_s) Missing attribute value end quote: : <"> Line: 1 Position: 17 Last 80 unconsumed characters: +value/> DETAIL end