forked from rubys/nokogumbo
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnokogumbo.rb
179 lines (151 loc) · 6.27 KB
/
nokogumbo.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
require 'nokogiri'
require 'nokogumboc'
module Nokogiri
# Convenience entry point, analogous to +Nokogiri::HTML(...)+.
#
# Every argument is forwarded unchanged to +Nokogiri::HTML5.parse+: the
# first is the document (a +String+ or an IO-like object), the optional
# second is an options hash. Returns whatever +Nokogiri::HTML5.parse+
# returns (a +Nokogiri::HTML::Document+).
def self.HTML5(*args)
  html5 = Nokogiri::HTML5
  html5.parse(*args)
end
module HTML5
# Parse an HTML5 document and return a +Nokogiri::HTML::Document+.
#
# +string+ is the document text, or any object responding to +read+
# (its +read+ result is used as the text). +options+ may contain
# +:max_parse_errors+, which is handed to the parser and defaults to 100.
def self.parse(string, options={})
  # IO-like input: slurp it
  source = string.respond_to?(:read) ? string.read : string

  # the parser is fed UTF-8 only, so convert anything else (Ruby 1.9+)
  needs_utf8 = source.respond_to?(:encoding) && source.encoding != Encoding::UTF_8
  source = reencode(source) if needs_utf8

  text = source.to_s
  error_limit = options[:max_parse_errors] || 100
  Nokogumbo.parse(text, error_limit)
end
# Fetch and parse a HTML document from the web, following redirects,
# handling https, and determining the character encoding using HTML5
# rules.
#
# +uri+ may be a +String+ or a +URI+. +options+ contains http headers and
# special options. Everything which is not a special option (and is not a
# Net::HTTP setting, see below) is sent as a request header. Special
# options include:
# * :follow_limit => number of redirects which are followed (default 10)
# * :basic_auth => [username, password]
# * :max_parse_errors => passed on to +parse+, never sent to the server
#
# Passing a number instead of a hash is deprecated and is treated as
# <tt>:follow_limit => number</tt>.
#
# Returns the parsed document, which additionally responds to +response+
# with the Net::HTTP response object. Non-success responses, and redirects
# once the follow limit is exhausted, raise via <tt>response.value</tt>.
def self.get(uri, options={})
  # Deprecated call style get(uri, limit): normalize +options+ itself, since
  # it is iterated, merged and handed to +parse+ further down.
  options = {:follow_limit => options} if Numeric === options

  headers = options.clone
  # always remove the key so a nil/false limit is not sent as a header
  limit = (headers.delete(:follow_limit) || 10).to_i
  # parser option, not an http header
  headers.delete(:max_parse_errors)

  require 'net/http'
  uri = URI(uri) unless URI === uri

  http = Net::HTTP.new(uri.host, uri.port)

  # TLS / SSL support
  http.use_ssl = true if uri.scheme == 'https'

  # Pass through Net::HTTP override values, which currently include:
  #   :ca_file, :ca_path, :cert, :cert_store, :ciphers,
  #   :close_on_empty_response, :continue_timeout, :key, :open_timeout,
  #   :read_timeout, :ssl_timeout, :ssl_version, :use_ssl,
  #   :verify_callback, :verify_depth, :verify_mode
  # Iterate +options+ (not +headers+) because +headers+ is modified here.
  options.each do |key, _value|
    setter = "#{key}="
    http.send setter, headers.delete(key) if http.respond_to? setter
  end

  request = Net::HTTP::Get.new(uri.request_uri)

  # basic authentication: explicit option wins over credentials in the uri
  auth = headers.delete(:basic_auth)
  auth ||= [uri.user, uri.password] if uri.user and uri.password
  request.basic_auth auth.first, auth.last if auth

  # remaining options are treated as headers
  headers.each {|key, value| request[key.to_s] = value.to_s}

  response = http.request(request)

  case response
  when Net::HTTPSuccess
    doc = parse(reencode(response.body, response['content-type']), options)
    doc.instance_variable_set('@response', response)
    doc.class.send(:attr_reader, :response)
    doc
  when Net::HTTPRedirection
    # out of redirects: let Net::HTTP raise for the 3xx response
    response.value if limit <= 1
    location = URI.join(uri, response['location'])
    # NOTE(review): the full options hash (including :basic_auth and custom
    # headers) is passed on to the redirect target unchanged.
    get(location, options.merge(:follow_limit => limit-1))
  else
    response.value
  end
end
# Parse an HTML fragment.
#
# While fragment parsing is on the Gumbo TODO list, simulate it by doing a
# full document parse (arguments are passed straight to +parse+) and then
# discarding the wrapping <html>, <head>, and <body> elements while
# collecting the children of each, in document order.
#
# Returns a +Nokogiri::HTML::DocumentFragment+, or the parsed document
# itself when its only child is not a single <html> element.
def self.fragment(*args)
  doc = parse(*args)

  # no HTML?  Return document as is
  roots = doc.children
  return doc if roots.length != 1 or roots.first.name != 'html'

  fragment = Nokogiri::HTML::DocumentFragment.new(doc)

  # examine children of HTML element
  children = roots.first.children

  # head is always first.  If present, take children but otherwise
  # ignore the head element.  (Compare the child's name; do not assign to
  # the <html> element's name.)
  if children.length > 0 and children.first.name == 'head'
    fragment << children.shift.children
  end

  # body may be next, or last.  If found, take children but otherwise
  # ignore the body element.  Also take any remaining elements, taking
  # care to preserve order.
  if children.length > 0 and children.first.name == 'body'
    fragment << children.shift.children
    fragment << children
  elsif children.length > 0 and children.last.name == 'body'
    body = children.pop
    fragment << children
    fragment << body.children
  else
    fragment << children
  end

  # return result
  fragment
end
private
# Charset sniffing is a complex and controversial topic that understandably
# isn't done _by default_ by the Ruby Net::HTTP library.  This being said,
# it is a very real problem for consumers of HTML as the default for HTML
# is iso-8859-1, most "good" producers use utf-8, and the Gumbo parser
# *only* supports utf-8.
#
# Accordingly, Nokogiri::HTML::Document.parse provides limited encoding
# detection.  Following this lead, Nokogiri::HTML5 attempts to do likewise,
# while attempting to more closely follow the HTML5 standard.
#
# http://bugs.ruby-lang.org/issues/2567
# http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
#
# +body+ is the document text; +content_type+ is the optional value of the
# HTTP Content-Type header.  Objects that do not respond to +encoding+ are
# returned untouched.  For binary (ASCII-8BIT) input the encoding is taken
# from, in order: a byte order mark, the charset in +content_type+, a
# <meta> charset within the first 1024 bytes, and finally iso-8859-1.
# Returns a UTF-8 encoded string; the argument itself is not modified.
#
# NOTE: a bare +private+ does not affect methods defined with +def self.+,
# so this method is publicly callable.
def self.reencode(body, content_type=nil)
  return body unless body.respond_to? :encoding

  if body.encoding == Encoding::ASCII_8BIT
    encoding = nil

    # look for a Byte Order Mark (BOM).  Compare byte values rather than
    # string literals: a binary string and a "\xFE\xFF" literal in a UTF-8
    # source file are not encoding-compatible and never compare equal.
    bom = body[0, 3].bytes.to_a
    if bom[0, 2] == [0xFE, 0xFF]
      encoding = 'utf-16be'
    elsif bom[0, 2] == [0xFF, 0xFE]
      encoding = 'utf-16le'
    elsif bom == [0xEF, 0xBB, 0xBF]
      encoding = 'utf-8'
    end

    # look for a charset in the Content-Type header
    if content_type
      encoding ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1]
    end

    # look for a charset in a meta tag in the first 1024 bytes
    # (comments are removed first so commented-out meta tags are ignored)
    if not encoding
      data = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, '')
      data.scan(/<meta.*?>/m).each do |meta|
        encoding ||= meta[/charset=["']?([^>]*?)($|["'\s>])/im, 1]
      end
    end

    # if all else fails, default to the official default encoding for HTML
    encoding ||= Encoding::ISO_8859_1

    # change the encoding to match the detected or inferred encoding.
    # Work on a copy so the caller's string is left alone (and frozen
    # strings are accepted); unknown encoding names fall back to the default.
    body = body.dup
    begin
      body.force_encoding(encoding)
    rescue ArgumentError
      body.force_encoding(Encoding::ISO_8859_1)
    end
  end

  body.encode(Encoding::UTF_8)
end
end
end