Skip to content
This repository has been archived by the owner on Aug 26, 2023. It is now read-only.

Commit

Permalink
Expose Gumbo parse errors to Nokogiri document.
Browse files Browse the repository at this point in the history
Use the internal Gumbo error API to get parse error information. Expose
this to Ruby by crafting an array of `Nokogiri::XML::SyntaxError` objects filled
in with a variety of information, including column and line information. These
are stored in the document's `@errors` instance variable.

Note that since the HTML state machine completely specifies what to do on
every parse error, no error is fatal.
  • Loading branch information
stevecheckoway committed Aug 4, 2016
1 parent 1f8c668 commit eb59c58
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 7 deletions.
51 changes: 44 additions & 7 deletions ext/nokogumboc/nokogumbo.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,12 @@

#include <ruby.h>
#include <gumbo.h>
#include <error.h>
#include <parser.h>

// class constants
static VALUE Document;
static VALUE XMLSyntaxError;

#ifdef NGLIB
#include <nokogiri.h>
Expand Down Expand Up @@ -182,10 +185,10 @@ static xmlNodePtr walk_tree(xmlDocPtr document, GumboNode *node) {

// Parse a string using gumbo_parse into a Nokogiri document
static VALUE parse(VALUE self, VALUE string) {
GumboOutput *output = gumbo_parse_with_options(
&kGumboDefaultOptions, RSTRING_PTR(string),
(size_t) RSTRING_LEN(string)
);
const GumboOptions *options = &kGumboDefaultOptions;
const char *input = RSTRING_PTR(string);
size_t input_len = RSTRING_LEN(string);
GumboOutput *output = gumbo_parse_with_options(options, input, input_len);
xmlDocPtr doc = xmlNewDoc(CONST_CAST "1.0");
#ifdef NGLIB
doc->type = XML_HTML_DOCUMENT_NODE;
Expand All @@ -210,9 +213,42 @@ static VALUE parse(VALUE self, VALUE string) {
xmlAddChild((xmlNodePtr)doc, node);
}
}
gumbo_destroy_output(&kGumboDefaultOptions, output);

return Nokogiri_wrap_xml_document(Document, doc);
VALUE rdoc = Nokogiri_wrap_xml_document(Document, doc);

// Add parse errors to rdoc.
if (output->errors.length) {
GumboVector *errors = &output->errors;
GumboParser parser = { ._options = options };
GumboStringBuffer msg;
VALUE rerrors = rb_ary_new_capa(errors->length);

gumbo_string_buffer_init(&parser, &msg);
for (int i=0; i < errors->length; i++) {
GumboError *err = errors->data[i];
gumbo_string_buffer_clear(&parser, &msg);
gumbo_caret_diagnostic_to_string(&parser, err, input, &msg);
VALUE err_str = rb_str_new(msg.data, msg.length);
VALUE syntax_error = rb_class_new_instance(1, &err_str, XMLSyntaxError);
rb_iv_set(syntax_error, "@domain", INT2NUM(1)); // XML_FROM_PARSER
rb_iv_set(syntax_error, "@code", INT2NUM(1)); // XML_ERR_INTERNAL_ERROR
rb_iv_set(syntax_error, "@level", INT2NUM(2)); // XML_ERR_ERROR
rb_iv_set(syntax_error, "@file", Qnil);
rb_iv_set(syntax_error, "@line", INT2NUM(err->position.line));
rb_iv_set(syntax_error, "@str1", Qnil);
rb_iv_set(syntax_error, "@str2", Qnil);
rb_iv_set(syntax_error, "@str3", Qnil);
rb_iv_set(syntax_error, "@int1", INT2NUM(err->type));
rb_iv_set(syntax_error, "@column", INT2NUM(err->position.column));
rb_ary_push(rerrors, syntax_error);
}
rb_iv_set(rdoc, "@errors", rerrors);
gumbo_string_buffer_destroy(&parser, &msg);
}

gumbo_destroy_output(options, output);

return rdoc;
}

// Initialize the Nokogumbo class and fetch constants we will use later
Expand All @@ -224,10 +260,11 @@ void Init_nokogumboc() {
VALUE Nokogiri = rb_const_get(rb_cObject, rb_intern("Nokogiri"));
VALUE HTML = rb_const_get(Nokogiri, rb_intern("HTML"));
Document = rb_const_get(HTML, rb_intern("Document"));
VALUE XML = rb_const_get(Nokogiri, rb_intern("XML"));
XMLSyntaxError = rb_const_get(XML, rb_intern("SyntaxError"));

#ifndef NGLIB
// more class constants
VALUE XML = rb_const_get(Nokogiri, rb_intern("XML"));
Element = rb_const_get(XML, rb_intern("Element"));
Text = rb_const_get(XML, rb_intern("Text"));
CDATA = rb_const_get(XML, rb_intern("CDATA"));
Expand Down
7 changes: 7 additions & 0 deletions test-nokogumbo.rb
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,13 @@ def test_root_comments
assert_equal ["html", "comment", "html", "comment"], doc.children.map(&:name)
end

# Checks that parse errors are exposed through the document's `errors` list.
def test_parse_errors
  # Malformed input: exactly two errors are expected (presumably the `--`
  # inside the comment and the stray `</a>` end tag -- confirm against Gumbo).
  doc = Nokogiri::HTML5("<!DOCTYPE html><html><!-- -- --></a>")
  # Minitest's assert_equal takes (expected, actual); this order keeps the
  # failure message ("Expected: 2, Actual: ...") accurate.
  assert_equal 2, doc.errors.length

  # This input is expected to produce no parse errors at all.
  doc = Nokogiri::HTML5("<!DOCTYPE html><html>")
  assert_empty doc.errors
end

private

def buffer
Expand Down

0 comments on commit eb59c58

Please sign in to comment.