Skip to content

Commit

Permalink
Merge pull request #2 from forcedotcom/add-crawl-bean
Browse files Browse the repository at this point in the history
Added a Bean for the progress, easier to consume by other applications
  • Loading branch information
jasperroel committed Aug 4, 2015
2 parents 5014cd2 + 086ca5f commit 6310428
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 10 deletions.
36 changes: 26 additions & 10 deletions src/main/java/com/salesforce/webdev/sitecrawler/SiteCrawler.java
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@

import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.util.Cookie;
import com.salesforce.webdev.sitecrawler.beans.CrawlProgress;
import com.salesforce.webdev.sitecrawler.navigation.NavigateThread;
import com.salesforce.webdev.sitecrawler.navigation.ProcessPage;
import com.salesforce.webdev.sitecrawler.utils.NamedThreadFactory;
Expand Down Expand Up @@ -692,20 +693,35 @@ public void shutdown() {
* @return String a user-friendly message of the progress of the crawler
*/
public String getCrawlProgress() {
    // Take one snapshot of the crawler state so that every number in the message
    // comes from the same bean rather than from separately re-read counters.
    CrawlProgress progress = getCrawlProgressBean();

    StringBuilder sb = new StringBuilder();
    sb.append(progress.crawled).append(" crawled. ");
    sb.append(progress.leftToCrawl).append(" left to crawl. ");
    // Links for which a NavigateThread has been submitted.
    sb.append(progress.scheduledForDownload).append(" scheduled for download. ");
    // Pages in limbo: downloaded but NOT yet processed.
    sb.append(progress.scheduledForProcessing).append(" scheduled for processing. ");
    sb.append(progress.fullyProcessed).append(" fully processed. ");
    sb.append(progress.complete).append("% complete.");
    return sb.toString();
}

/**
 * <p>Returns a computer-friendly bean of the progress of the crawler.</p>
 *
 * <p>Each counter is read exactly once, so the derived values (pages left to crawl and the
 * completion percentage) are computed from the same numbers that are stored in the bean.</p>
 *
 * <p>The completion percentage is rounded to two decimals. It is reported as {@code 0.0} when
 * the estimated total (fully processed + left to crawl) is not positive, because no meaningful
 * ratio exists in that case.</p>
 *
 * @return CrawlProgress a computer-friendly bean of the progress of the crawler
 */
public CrawlProgress getCrawlProgressBean() {
    // Read the counters that feed more than one field only once, so the bean is self-consistent.
    int scheduledForDownload = linksScheduled.get();
    long processed = fullyProcessed.get();

    // NOTE(review): subtracting threadLimit can make this negative while few links are queued;
    // the value is kept as-is for compatibility - confirm the intended meaning.
    int leftToCrawl = toVisit.size() + scheduledForDownload - threadLimit;

    CrawlProgress crawlProgress = new CrawlProgress();
    crawlProgress.crawled = actuallyVisited.get();
    crawlProgress.leftToCrawl = leftToCrawl;
    crawlProgress.scheduledForDownload = scheduledForDownload;
    crawlProgress.scheduledForProcessing = pagesScheduled.get();
    crawlProgress.fullyProcessed = processed;

    // Guard the division: a zero total would turn into Infinity (and then Long.MAX_VALUE after
    // rounding), and a negative total would yield a negative percentage.
    long estimatedTotal = processed + leftToCrawl;
    crawlProgress.complete = estimatedTotal > 0
            ? Math.round(((double) processed / estimatedTotal) * 10000) / 100.0
            : 0.0;
    return crawlProgress;
}

/**
* <p>Does its best to reset/recreate the WebClient Pool (wcPool) and the link and page consumers.</p>
*/
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package com.salesforce.webdev.sitecrawler.beans;

/**
* <p>Simple bean to collect the progress of a crawl.</p>
*
* @author jroel
*
*/
public class CrawlProgress {

    /** <p>Count of pages that have been crawled in total.</p> */
    public long crawled;

    /** <p>Count of pages that still remain to be crawled.</p> */
    public long leftToCrawl;

    /** <p>Count of pages sitting in the queue that have not been downloaded yet.</p> */
    public long scheduledForDownload;

    /** <p>Count of pages that were downloaded and are queued, awaiting processing.</p> */
    public long scheduledForProcessing;

    /** <p>Count of pages whose processing has finished completely.</p> */
    public long fullyProcessed;

    /** <p>Estimated share of the crawl that is complete, expressed as a percentage.</p> */
    public double complete;
}

0 comments on commit 6310428

Please sign in to comment.