Skip to content
This repository has been archived by the owner on Aug 26, 2023. It is now read-only.

Add :max_parse_errors argument to .parse, .get and .fragment with 0 as default value #65

Merged
merged 4 commits into from
Jan 25, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 9 additions & 6 deletions ext/nokogumboc/nokogumbo.c
Original file line number Diff line number Diff line change
Expand Up @@ -184,11 +184,14 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node) {
}

// Parse a string using gumbo_parse into a Nokogiri document
static VALUE parse(VALUE self, VALUE string) {
const GumboOptions *options = &kGumboDefaultOptions;
static VALUE parse(VALUE self, VALUE string, VALUE max_parse_errors) {
GumboOptions options;
memcpy(&options, &kGumboDefaultOptions, sizeof options);
options.max_errors = NUM2INT(max_parse_errors);

const char *input = RSTRING_PTR(string);
size_t input_len = RSTRING_LEN(string);
GumboOutput *output = gumbo_parse_with_options(options, input, input_len);
GumboOutput *output = gumbo_parse_with_options(&options, input, input_len);
xmlDocPtr doc = xmlNewDoc(CONST_CAST "1.0");
#ifdef NGLIB
doc->type = XML_HTML_DOCUMENT_NODE;
Expand Down Expand Up @@ -219,7 +222,7 @@ static VALUE parse(VALUE self, VALUE string) {
// Add parse errors to rdoc.
if (output->errors.length) {
GumboVector *errors = &output->errors;
GumboParser parser = { ._options = options };
GumboParser parser = { ._options = &options };
GumboStringBuffer msg;
VALUE rerrors = rb_ary_new2(errors->length);

Expand Down Expand Up @@ -253,7 +256,7 @@ static VALUE parse(VALUE self, VALUE string) {
gumbo_string_buffer_destroy(&parser, &msg);
}

gumbo_destroy_output(options, output);
gumbo_destroy_output(&options, output);

return rdoc;
}
Expand Down Expand Up @@ -288,5 +291,5 @@ void Init_nokogumboc() {

// define Nokogumbo class with a singleton parse method
VALUE Gumbo = rb_define_class("Nokogumbo", rb_cObject);
rb_define_singleton_method(Gumbo, "parse", parse, 1);
rb_define_singleton_method(Gumbo, "parse", parse, 2);
}
14 changes: 7 additions & 7 deletions lib/nokogumbo.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@
module Nokogiri
# Parse an HTML document. +string+ contains the document. +string+
# may also be an IO-like object. Any further arguments are passed through
# to Nokogiri::HTML5.parse. Returns a +Nokogiri::HTML::Document+.
def self.HTML5(string)
Nokogiri::HTML5.parse(string)
def self.HTML5(*args)
Nokogiri::HTML5.parse(*args)
end

module HTML5
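# Parse an HTML document. +string+ contains the document. +string+
# may also be an IO-like object. +options+ may include
# +:max_parse_errors+, which is handed to the parser as its error limit
# (0 when omitted). Returns a +Nokogiri::HTML::Document+.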
def self.parse(string)
def self.parse(string, options={})
if string.respond_to? :read
string = string.read
end
Expand All @@ -21,7 +21,7 @@ def self.parse(string)
string = reencode(string)
end

Nokogumbo.parse(string.to_s)
Nokogumbo.parse(string.to_s, options[:max_parse_errors] || 0)
end

# Fetch and parse a HTML document from the web, following redirects,
Expand Down Expand Up @@ -67,7 +67,7 @@ def self.get(uri, options={})

case response
when Net::HTTPSuccess
doc = parse(reencode(response.body, response['content-type']))
doc = parse(reencode(response.body, response['content-type']), options)
doc.instance_variable_set('@response', response)
doc.class.send(:attr_reader, :response)
doc
Expand All @@ -83,8 +83,8 @@ def self.get(uri, options={})
# while fragment is on the Gumbo TODO list, simulate it by doing
# a full document parse and ignoring the parent <html>, <head>, and <body>
# tags, and collecting up the children of each.
def self.fragment(string)
doc = parse(string)
def self.fragment(*args)
doc = parse(*args)
fragment = Nokogiri::HTML::DocumentFragment.new(doc)

if doc.children.length != 1 or doc.children.first.name != 'html'
Expand Down
34 changes: 30 additions & 4 deletions test-nokogumbo.rb
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def test_bogus_encoding
end

def test_html5_doctype
doc = Nokogumbo.parse("<!DOCTYPE html><html></html>")
doc = Nokogiri::HTML5.parse("<!DOCTYPE html><html></html>")
assert_match /<!DOCTYPE html>/, doc.to_html
end

Expand Down Expand Up @@ -126,17 +126,43 @@ def test_root_comments
end

def test_parse_errors
doc = Nokogiri::HTML5("<!DOCTYPE html><html><!-- -- --></a>")
doc = Nokogiri::HTML5("<!DOCTYPE html><html><!-- -- --></a>", max_parse_errors: 10)
assert_equal doc.errors.length, 2
doc = Nokogiri::HTML5("<!DOCTYPE html><html>")
doc = Nokogiri::HTML5("<!DOCTYPE html><html>", max_parse_errors: 10)
assert_empty doc.errors
end

def test_max_parse_errors
  # This markup contains 2 parse errors; with the limit forced to 1 only
  # a single error is expected to be reported.
  with_errors = Nokogiri::HTML5("<!DOCTYPE html><html><!-- -- --></a>", max_parse_errors: 1)
  assert_equal(1, with_errors.errors.length)

  # Under the same limit, this markup is expected to report no errors.
  without_errors = Nokogiri::HTML5("<!DOCTYPE html><html>", max_parse_errors: 1)
  assert_empty(without_errors.errors)
end

def test_default_max_parse_errors
  # The 200 stray closing tags make 200 parse errors, but no limit is
  # passed here and the default limit is 0, so none should be reported.
  stray_closers = "</p>" * 200
  parsed = Nokogiri::HTML5("<!DOCTYPE html><html>" + stray_closers)
  assert_equal(0, parsed.errors.length)
end

def test_parse_fragment_errors
doc = Nokogiri::HTML5.fragment("<\r\n")
doc = Nokogiri::HTML5.fragment("<\r\n", max_parse_errors: 10)
refute_empty doc.errors
end

def test_fragment_max_parse_errors
  # This fragment contains 3 parse errors; with the limit forced to 1
  # only a single error is expected to be reported.
  limited = Nokogiri::HTML5.fragment("<!-- -- --></a>", max_parse_errors: 1)
  assert_equal(1, limited.errors.length)
end

def test_fragment_default_max_parse_errors
  # This fragment contains 201 parse errors, but no limit is passed here
  # and the default limit is 0, so none should be reported.
  stray_closers = "</p>" * 200
  parsed_fragment = Nokogiri::HTML5.fragment(stray_closers)
  assert_equal(0, parsed_fragment.errors.length)
end

private

def buffer
Expand Down