Skip to content
This repository has been archived by the owner on Aug 26, 2023. It is now read-only.

Commit

Permalink
Merge pull request #65 from missive/max-parse-errors
Browse files Browse the repository at this point in the history
Add :max_parse_errors argument to .parse, .get and .fragment with 0 as default value
  • Loading branch information
rubys authored Jan 25, 2018
2 parents a3fc29b + c7397aa commit 938dd3f
Show file tree
Hide file tree
Showing 3 changed files with 46 additions and 17 deletions.
15 changes: 9 additions & 6 deletions ext/nokogumboc/nokogumbo.c
Original file line number Diff line number Diff line change
Expand Up @@ -184,11 +184,14 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node) {
}

// Parse a string using gumbo_parse into a Nokogiri document
static VALUE parse(VALUE self, VALUE string) {
const GumboOptions *options = &kGumboDefaultOptions;
static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) {
GumboOptions options;
memcpy(&options, &kGumboDefaultOptions, sizeof options);
options.max_errors = NUM2INT(max_parse_errors);

const char *input = RSTRING_PTR(string);
size_t input_len = RSTRING_LEN(string);
GumboOutput *output = gumbo_parse_with_options(options, input, input_len);
GumboOutput *output = gumbo_parse_with_options(&options, input, input_len);
xmlDocPtr doc = xmlNewDoc(CONST_CAST "1.0");
#ifdef NGLIB
doc->type = XML_HTML_DOCUMENT_NODE;
Expand Down Expand Up @@ -219,7 +222,7 @@ static VALUE parse(VALUE self, VALUE string) {
// Add parse errors to rdoc.
if (output->errors.length) {
GumboVector *errors = &output->errors;
GumboParser parser = { ._options = options };
GumboParser parser = { ._options = &options };
GumboStringBuffer msg;
VALUE rerrors = rb_ary_new2(errors->length);

Expand Down Expand Up @@ -253,7 +256,7 @@ static VALUE parse(VALUE self, VALUE string) {
gumbo_string_buffer_destroy(&parser, &msg);
}

gumbo_destroy_output(options, output);
gumbo_destroy_output(&options, output);

return rdoc;
}
Expand Down Expand Up @@ -288,5 +291,5 @@ void Init_nokogumboc() {

// define Nokogumbo class with a singleton parse method
VALUE Gumbo = rb_define_class("Nokogumbo", rb_cObject);
rb_define_singleton_method(Gumbo, "parse", parse, 1);
rb_define_singleton_method(Gumbo, "parse", parse, 2);
}
14 changes: 7 additions & 7 deletions lib/nokogumbo.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@
module Nokogiri
# Parse an HTML document. +string+ contains the document. +string+
# may also be an IO-like object. Returns a +Nokogiri::HTML::Document+.
def self.HTML5(string)
Nokogiri::HTML5.parse(string)
def self.HTML5(*args)
Nokogiri::HTML5.parse(*args)
end

module HTML5
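# Parse an HTML document. +string+ contains the document. +string+
# may also be an IO-like object. Returns a +Nokogiri::HTML::Document+.
# +options+ may set +:max_parse_errors+, which is handed to the parser
# as its error limit and defaults to 0.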
def self.parse(string)
def self.parse(string, options={})
if string.respond_to? :read
string = string.read
end
Expand All @@ -21,7 +21,7 @@ def self.parse(string)
string = reencode(string)
end

Nokogumbo.parse(string.to_s)
Nokogumbo.parse(string.to_s, options[:max_parse_errors] || 0)
end

# Fetch and parse a HTML document from the web, following redirects,
Expand Down Expand Up @@ -67,7 +67,7 @@ def self.get(uri, options={})

case response
when Net::HTTPSuccess
doc = parse(reencode(response.body, response['content-type']))
doc = parse(reencode(response.body, response['content-type']), options)
doc.instance_variable_set('@response', response)
doc.class.send(:attr_reader, :response)
doc
Expand All @@ -83,8 +83,8 @@ def self.get(uri, options={})
# while fragment is on the Gumbo TODO list, simulate it by doing
# a full document parse and ignoring the parent <html>, <head>, and <body>
# tags, and collecting up the children of each.
def self.fragment(string)
doc = parse(string)
def self.fragment(*args)
doc = parse(*args)
fragment = Nokogiri::HTML::DocumentFragment.new(doc)

if doc.children.length != 1 or doc.children.first.name != 'html'
Expand Down
34 changes: 30 additions & 4 deletions test-nokogumbo.rb
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def test_bogus_encoding
end

def test_html5_doctype
doc = Nokogumbo.parse("<!DOCTYPE html><html></html>")
doc = Nokogiri::HTML5.parse("<!DOCTYPE html><html></html>")
assert_match /<!DOCTYPE html>/, doc.to_html
end

Expand Down Expand Up @@ -126,17 +126,43 @@ def test_root_comments
end

def test_parse_errors
doc = Nokogiri::HTML5("<!DOCTYPE html><html><!-- -- --></a>")
doc = Nokogiri::HTML5("<!DOCTYPE html><html><!-- -- --></a>", max_parse_errors: 10)
assert_equal doc.errors.length, 2
doc = Nokogiri::HTML5("<!DOCTYPE html><html>")
doc = Nokogiri::HTML5("<!DOCTYPE html><html>", max_parse_errors: 10)
assert_empty doc.errors
end

# With +max_parse_errors: 1+ at most one error is reported, and markup
# that yields no errors still produces an empty error list.
def test_max_parse_errors
  # This markup contains 2 parse errors, but the limit is forced to 1.
  broken_markup = "<!DOCTYPE html><html><!-- -- --></a>"
  limited = Nokogiri::HTML5(broken_markup, max_parse_errors: 1)
  assert_equal 1, limited.errors.length

  clean_markup = "<!DOCTYPE html><html>"
  clean = Nokogiri::HTML5(clean_markup, max_parse_errors: 1)
  assert_empty clean.errors
end

# Without an explicit +max_parse_errors+ option no errors are reported.
def test_default_max_parse_errors
  # This document contains 200 parse errors, but the default limit is 0.
  markup = "<!DOCTYPE html><html>" + "</p>" * 200
  parsed = Nokogiri::HTML5(markup)
  assert_equal 0, parsed.errors.length
end

def test_parse_fragment_errors
doc = Nokogiri::HTML5.fragment("<\r\n")
doc = Nokogiri::HTML5.fragment("<\r\n", max_parse_errors: 10)
refute_empty doc.errors
end

# +max_parse_errors: 1+ also caps the errors reported for a fragment.
def test_fragment_max_parse_errors
  # This fragment contains 3 parse errors, but the limit is forced to 1.
  markup = "<!-- -- --></a>"
  fragment = Nokogiri::HTML5.fragment(markup, max_parse_errors: 1)
  assert_equal 1, fragment.errors.length
end

# Without an explicit +max_parse_errors+ option a fragment reports no errors.
def test_fragment_default_max_parse_errors
  # This fragment contains 201 parse errors, but the default limit is 0.
  markup = "</p>" * 200
  fragment = Nokogiri::HTML5.fragment(markup)
  assert_equal 0, fragment.errors.length
end

private

def buffer
Expand Down

0 comments on commit 938dd3f

Please sign in to comment.