gjtorikian · gjtorikian · Dec 31, 2013 · Dec 26, 2013 · Dec 26, 2013 · Dec 26, 2013
diff --git a/Gemfile.lock b/Gemfile.lock
@@ -5,7 +5,7 @@ PATH
       colored (~> 1.2)
       mercenary (~> 0.2.0)
       nokogiri (~> 1.6.0)
-      typhoeus (~> 0.6.3)
+      typhoeus (~> 0.6.7)
 
 GEM
   remote: http://rubygems.org/
@@ -20,7 +20,7 @@ GEM
     colored (1.2)
     diff-lcs (1.2.4)
     escape_utils (0.3.2)
-    ethon (0.6.1)
+    ethon (0.6.2)
       ffi (>= 1.3.0)
       mime-types (~> 1.18)
     ffi (1.9.3)
@@ -56,8 +56,8 @@ GEM
       nokogiri (>= 1.4.4)
     thread_safe (0.1.3)
       atomic
-    typhoeus (0.6.6)
-      ethon (~> 0.6.1)
+    typhoeus (0.6.7)
+      ethon (~> 0.6.2)
     tzinfo (0.3.38)
 
 PLATFORMS

diff --git a/README.md b/README.md
@@ -69,7 +69,9 @@ Note: since `swap` is a bit special, you'll pass in a pair of `RegEx:String` val
 
 ## Usage with Jekyll
 
-Want to use HTML Proofer with your Jekyll site? Awesome. Simply add `gem 'html-proofer'` to your `Gemfile` as described above, and add the following to your `Rakefile`, using `rake test` to execute:
+Want to use HTML Proofer with your Jekyll site? Awesome. Simply add `gem 'html-proofer'`
+to your `Gemfile` as described above, and add the following to your `Rakefile`,
+using `rake test` to execute:
 
 ```ruby
 require 'html/proofer'
@@ -94,12 +96,15 @@ htmlproof ./_site
 * Whether your internal links are not broken; this includes hash references (`#linkToMe`)
 * Whether external links are working
 
-## Configuration 
+## Configuration
 
 The `HTML::Proofer` constructor takes an optional hash of additional options:
 
 * `:ext`: the extension (including the `.`) of your HTML files (default: `.html`)
 * `:href_swap`: a hash containing key-value pairs of `RegExp => String`. It transforms links that match `RegExp` into `String` via `gsub`.
-* `:href_ignore`: an array of Strings containing `href`s that are safe to ignore (default: `mailto`)
+* `:href_ignore`: an array of Strings containing `href`s that are safe to ignore (by default, `mailto` is always checked)
+* `:disable_external`: if `true`, does not run the external link checker, which can take a lot of time (default: `false`)
 
-To any `<a>` or `<img>` tag, you may add the `data-proofer-ignore` attribute to ignore the link.
+You can also pass in any of Typhoeus' options for the external link check.
+
+To any `<a>` or `<img>` tag, you may add the `data-proofer-ignore` attribute to ignore the link.
diff --git a/html-proofer.gemspec b/html-proofer.gemspec
@@ -18,7 +18,7 @@ Gem::Specification.new do |gem|
   gem.add_dependency "mercenary",       "~> 0.2.0"
   gem.add_dependency "nokogiri",        "~> 1.6.0"
   gem.add_dependency "colored",         "~> 1.2"
-  gem.add_dependency "typhoeus",        "~> 0.6.3"
+  gem.add_dependency "typhoeus",        "~> 0.6.7"
 
   gem.add_development_dependency "html-pipeline", "~> 0.0.12"
   gem.add_development_dependency "rspec", "~> 2.13.0"

diff --git a/lib/html/proofer.rb b/lib/html/proofer.rb
@@ -1,54 +1,117 @@
 require 'nokogiri'
+
 require File.dirname(__FILE__) + '/proofer/checkable'
 require File.dirname(__FILE__) + '/proofer/checks'
 
 module HTML
   class Proofer
     def initialize(src, opts={})
       @srcDir = src
-      @options = {:ext => ".html"}.merge(opts)
+      # Typhoeus::Config.verbose = true
+      @proofer_opts = {:ext => ".html", :href_swap => [], :href_ignore => [], :disable_external => false }
+      @options = @proofer_opts.merge({:followlocation => true}).merge(opts)
+
+      @failed_tests = []
     end
 
     def run
       total_files = 0
-      failed_tests = []
+      external_urls = {}
+      hydra = Typhoeus::Hydra.new
+
+      puts "Running #{get_checks} checks on #{@srcDir} on *#{@options[:ext]}... \n\n"
 
-      puts "Running #{get_checks} checks on #{@srcDir}... \n\n"
+      if File.directory? @srcDir
+        files = Dir.glob("#{@srcDir}/**/*#{@options[:ext]}")
+      else
+        files = File.extname(@srcDir) == @options[:ext] ? [@srcDir] : []
+      end
 
-      Dir.glob("#{@srcDir}/**/*#{@options[:ext]}") do |path|
+      files.each do |path|
         total_files += 1
         html = HTML::Proofer.create_nokogiri(path)
 
         get_checks.each do |klass|
           check = klass.new(@srcDir, path, html, @options)
           check.run
-          check.hydra.run
-          failed_tests.concat(check.issues) if check.issues.length > 0
+          external_urls.merge!(check.external_urls)
+          @failed_tests.concat(check.issues) if check.issues.length > 0
         end
       end
 
+      # the hypothesis is that Proofer runs way faster if we pull out
+      # all the external URLs and run the checks at the end. Otherwise, we're halting
+      # the consuming process for every file. In addition, sorting the list lets
+      # libcurl keep connections to hosts alive. Finally, we'll make a HEAD request,
+      # rather than GETing all the contents
+      external_urls = Hash[external_urls.sort]
+
+      unless @options[:disable_external]
+        puts "Checking #{external_urls.length} external links... \n\n"
+
+        # Typhoeus won't let you pass any non-Typhoeus option
+        @proofer_opts.each_key do |opt|
+          @options.delete opt
+        end
+
+        external_urls.each_pair do |href, filenames|
+          request = Typhoeus::Request.new(href, @options.merge({:method => :head}))
+          request.on_complete { |response| response_handler(response, filenames) }
+          hydra.queue request
+        end
+        hydra.run
+      end
+
       puts "Ran on #{total_files} files!"
 
-      if failed_tests.empty?
+      if @failed_tests.empty?
         puts "HTML-Proofer finished successfully.".green
-        exit 0
       else
-        failed_tests.each do |issue|
+        @failed_tests.each do |issue|
           $stderr.puts issue + "\n\n"
         end
 
-        raise "HTML-Proofer found #{failed_tests.length} failures!"
+        raise "HTML-Proofer found #{@failed_tests.length} failures!"
+      end
+    end
+
+    def response_handler(response, filenames)
+      href = response.options[:effective_url]
+      method = response.request.options[:method]
+      response_code = response.code
+
+      if response_code.between?(200, 299)
+        # continue with no op
+      elsif response.timed_out?
+         @failed_tests << "#{filenames.join(' ').blue}: External link #{href} failed: got a time out"
+      # hey here's a funny bug: sometimes HEAD requests return a 404, even on legitimate pages! The
+      # bug seems to affect curl; try `curl -I -X HEAD https://help.github.com/changing-author-info`
+      # so in these cases, try a regular `GET`. if it fails, it fails.
+      elsif response_code == 404 && method == :head
+        next_response = Typhoeus.get(href, @options)
+        response_handler(next_response, filenames)
+      else
+        if (response_code == 420 || response_code == 503) && method == :head
+          # 420s usually come from rate limiting; let's ignore the query and try just the path with a GET
+          uri = URI(href)
+          next_response = Typhoeus.get(uri.scheme + "://" + uri.host + uri.path, @options)
+          response_handler(next_response, filenames)
+        else
+          # Received a non-successful http response.
+          @failed_tests << "#{filenames.join(' ').blue}: External link #{href} failed: #{response_code} #{response.return_message}"
+        end
       end
     end
 
     def self.create_nokogiri(path)
       path << "/index.html" if File.directory? path # support for Jekyll-style links
       content = File.open(path, "rb") {|f| f.read }
-      Nokogiri::HTML(File.read(path))
+      Nokogiri::HTML(content)
     end
 
     def get_checks
       HTML::Proofer::Checks::Check.subclasses
     end
+
   end
 end
diff --git a/lib/html/proofer/check.rb b/lib/html/proofer/check.rb
@@ -10,18 +10,16 @@ class HTML::Proofer::Checks
 
   class Check
 
-    attr_reader :issues, :hydra, :src, :path, :options, :additional_href_ignores
+    attr_reader :issues, :src, :path, :options, :external_urls, :additional_href_ignores
 
     def initialize(src, path, html, opts={})
       @src    = src
       @path   = path
       @html   = html
       @options = opts
       @issues = []
-      @checked_urls = {}
-
-      @hydra = Typhoeus::Hydra.hydra
-      @additional_href_ignores = @options[:href_ignore] || []
+      @additional_href_ignores = @options[:href_ignore]
+      @external_urls = {}
     end
 
     def run
@@ -36,44 +34,12 @@ def output_filenames
       Dir[@site.config[:output_dir] + '/**/*'].select{ |f| File.file?(f) }
     end
 
-    def validate_url(href, issue_text)
-      return @checked_urls[href] if @checked_urls.has_key? href
-      request = Typhoeus::Request.new(href, {:followlocation => true})
-      request.on_complete do |response|
-        if response.success?
-          @checked_urls[href] = true
-        elsif response.timed_out?
-          self.add_issue(issue_text + " got a time out")
-        elsif response.code == 0
-          # Could not get an http response, something's wrong.
-          self.add_issue(issue_text + ". #{response.return_message}!")
-        else
-          response_code = response.code.to_s
-          if %w(420 503).include?(response_code)
-            # 420s usually come from rate limiting; let's ignore the query and try just the path
-            uri = URI(href)
-            response = Typhoeus.get(uri.scheme + "://" + uri.host + uri.path, {:followlocation => true})
-            self.add_issue("#{issue_text} Originally, this was a #{response_code}. Now, the HTTP request failed again: #{response.code.to_s}") unless response.success?
-          else
-            # Received a non-successful http response.
-            self.add_issue("#{issue_text} HTTP request failed: #{response_code}")
-          end
-        end
-
-        @checked_urls[href] = false unless response.success?
-      end
-      hydra.queue(request)
-    end
-
-    def request_url(url)
-      path = (url.path.nil? || url.path.empty? ? '/' : url.path)
-      req = Net::HTTP::Head.new(path)
-      http = Net::HTTP.new(url.host, url.port)
-      if url.instance_of? URI::HTTPS
-        http.use_ssl = true
-        http.verify_mode = OpenSSL::SSL::VERIFY_NONE
+    def add_to_external_urls(href)
+      if @external_urls[href]
+        @external_urls[href] << @path
+      else
+        @external_urls[href] = [@path]
       end
-      res = http.request(req)
     end
 
     def self.subclasses

diff --git a/lib/html/proofer/checkable.rb b/lib/html/proofer/checkable.rb
@@ -1,7 +1,6 @@
 module HTML
   class Proofer
     class Checkable
-
       def initialize(obj, check)
         @src = obj['src']
         @href = obj['href']
@@ -56,7 +55,7 @@ def remote?
 
       def ignore?
         uri = URI.parse url
-        @data_ignore_proofer || %w( mailto ).include?(uri.scheme) || @check.additional_href_ignores.include?(href)
+        @data_ignore_proofer || %w( mailto ).include?(uri.scheme) || @check.additional_href_ignores.include?(url)
       rescue URI::BadURIError
         false
       rescue URI::InvalidURIError
@@ -77,7 +76,7 @@ def file_path
         return if path.nil?
 
         if path =~ /^\// # path relative to root
-          base = @check.src
+          base = File.directory?(@check.src) ? @check.src : File.dirname(@check.src)
         elsif File.exist?(File.expand_path path, @check.src) # relative links, path is a file
           base = File.dirname @check.path
         elsif File.exist?(File.join(File.dirname(@check.path), path)) # relative links in nested dir, path is a file
@@ -104,7 +103,6 @@ def absolute_path
         path = file_path || @check.path
         File.expand_path path, Dir.pwd
       end
-
     end
   end
 end
diff --git a/lib/html/proofer/checks/images.rb b/lib/html/proofer/checks/images.rb
@@ -24,24 +24,27 @@ def missing_src?
 
 class Images < ::HTML::Proofer::Checks::Check
   def run
-    @html.css('img').each do |img|
+    @html.css('img').each do |i|
+      img = Image.new i, self
 
-      img = Image.new img, self
-
-      # screenshot filenames, return because invalid URL
+      # screenshot filenames should return because of terrible names
       return self.add_issue "image has a terrible filename (#{img.src})" if img.terrible_filename?
 
       # does the image exist?
       if img.missing_src?
         self.add_issue "image has no src attribute"
       elsif img.remote?
-        validate_url img.src, "external image #{img.src} does not exist"
+        return if img.ignore?
+        add_to_external_urls img.src
       else
+        return if img.ignore?
         self.add_issue("internal image #{img.src} does not exist") unless img.exists?
       end
 
       # check alt tag
       self.add_issue "image #{img.src} does not have an alt attribute" unless img.valid_alt_tag?
     end
+
+    return external_urls
   end
 end
diff --git a/lib/html/proofer/checks/links.rb b/lib/html/proofer/checks/links.rb
@@ -15,9 +15,8 @@ def missing_href?
 class Links < ::HTML::Proofer::Checks::Check
 
   def run
-    @html.css('a').each do |link|
-
-      link = Link.new link, self
+    @html.css('a').each do |l|
+      link = Link.new l, self
       return if link.ignore?
 
       # is there even a href?
@@ -28,15 +27,15 @@ def run
 
       # does the file even exist?
       if link.remote?
-        validate_url link.href, "externally linking to #{link.href}, which does not exist"
+        add_to_external_urls link.href
       else
         self.add_issue "internally linking to #{link.href}, which does not exist" unless link.exists?
       end
 
       # verify the target hash
       if link.hash
         if link.remote?
-          #not yet checked
+          add_to_external_urls link.href
         elsif link.internal?
           self.add_issue "linking to internal hash ##{link.hash} that does not exist" unless hash_check @html, link.hash
         elsif link.external?
@@ -49,6 +48,8 @@ def run
         end
       end
     end
+
+    return external_urls
   end
 
   def hash_check(html, href_hash)

diff --git a/spec/html/proofer/fixtures/linkWithRedirect.html b/spec/html/proofer/fixtures/linkWithRedirect.html
@@ -2,8 +2,9 @@
 
 <body>
 
-	<p>Blah blah blah. <a href="http://www.capify.org/">This is a redirect.</a>. </p>
+	<p>Blah blah blah. <a href="http://github.github.com/github-flavored-markdown/">This is a redirect.</a>. </p>
 
+	<p>Blah blah blah. <a href="https://help.github.com/changing-author-info/">This is another redirect.</a>. </p>
 </body>
 
-</html>
+</html>
diff --git a/spec/html/proofer/fixtures/relativeLinks.html b/spec/html/proofer/fixtures/relativeLinks.html
@@ -3,5 +3,4 @@
 <a href="folder/">Folder relative to self</a>
 <a href="index.html#anchor">Anchor relative to self</a>
 
-<img alt="sure thing" src="/gpl.png"/>Relative to root
 <img alt="sure thing" src="gpl.png"/>Relative to self