Skip to content

Commit

Permalink
Handle URLs containing special chars like spaces and quotes
Browse files Browse the repository at this point in the history
Fixes #6
  • Loading branch information
ato committed Feb 2, 2018
1 parent 7c50cce commit bccb83c
Show file tree
Hide file tree
Showing 8 changed files with 88 additions and 29 deletions.
7 changes: 3 additions & 4 deletions src/au/gov/nla/httrack2warc/httrack/HtsUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,14 @@
class HtsUtil {
static String fixupUrl(String raw) {
ParsedUrl url = ParsedUrl.parseUrl(raw);
Canonicalizer.WHATWG.canonicalize(url);

// early versions of httrack wrote the URL without a scheme
if (url.getScheme().isEmpty()) {
url.setScheme(new ByteString("http"));
url.setColonAfterScheme(new ByteString(":"));
url.setSlashes(new ByteString("//"));
url = ParsedUrl.parseUrl("http://" + raw);
}

Canonicalizer.WHATWG.canonicalize(url);

// httrack incorrectly makes requests that include the fragment. Should we clear them?
//url.setHashSign(ByteString.EMPTY);
//url.setFragment(ByteString.EMPTY);
Expand Down
9 changes: 6 additions & 3 deletions src/au/gov/nla/httrack2warc/httrack/HttrackCrawl.java
Original file line number Diff line number Diff line change
Expand Up @@ -143,9 +143,10 @@ private HttrackRecord buildRecord(LocalTime time, String url, String rawfile, St

rawfile = rawfile.substring(outputDir.length());

CacheEntry cacheEntry = cache.getEntry(url);
String fixedUrl = HtsUtil.fixupUrl(url);
CacheEntry cacheEntry = cache.getEntry(fixedUrl);
if (cacheEntry == null) {
throw new IOException("no cache entry: " + url);
throw new IOException("no cache entry: " + fixedUrl);
}

String filename = percentDecode(rawfile);
Expand All @@ -154,7 +155,9 @@ private HttrackRecord buildRecord(LocalTime time, String url, String rawfile, St
throw new IOException(file + " is outside of " + dir);
}

String fixedUrl = HtsUtil.fixupUrl(url);
if (requestHeaders.get(fixedUrl) == null) {
System.out.println(fixedUrl);
}
return new HttrackRecord(
filename,
timestamp,
Expand Down
13 changes: 12 additions & 1 deletion src/au/gov/nla/httrack2warc/httrack/ZipCache.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
Expand All @@ -29,9 +32,17 @@
*/
class ZipCache implements Cache {
private final ZipFile zipFile;
private final Map<String,ZipEntry> entries = new HashMap<>();

/**
 * Opens the zip file at {@code zipPath} and builds an index of its entries.
 *
 * <p>Each entry name is passed through {@link HtsUtil#fixupUrl(String)} and the
 * result is used as the key in {@code entries}. If two entry names produce the
 * same key, the entry enumerated later replaces the earlier one.
 *
 * @param zipPath path of the zip file to open
 * @throws IOException if the zip file cannot be opened
 */
public ZipCache(Path zipPath) throws IOException {
    this.zipFile = new ZipFile(zipPath.toFile());
    try {
        Enumeration<? extends ZipEntry> e = zipFile.entries();
        while (e.hasMoreElements()) {
            ZipEntry entry = e.nextElement();
            entries.put(HtsUtil.fixupUrl(entry.getName()), entry);
        }
    } catch (RuntimeException | Error ex) {
        // The caller never receives this instance when construction fails, so
        // nobody else can close the zip file: release it here before rethrowing.
        try {
            zipFile.close();
        } catch (IOException closeEx) {
            ex.addSuppressed(closeEx);
        }
        throw ex;
    }
}

@Override
Expand All @@ -41,7 +52,7 @@ public void close() throws IOException {

@Override
public CacheEntry getEntry(String url) {
ZipEntry entry = zipFile.getEntry(url);
ZipEntry entry = entries.get(url); // zipFile.getEntry(url);
return entry == null ? null : new Entry(entry);
}

Expand Down
24 changes: 24 additions & 0 deletions test-resources/au/gov/nla/httrack2warc/httrack/test-hts-ioinfo.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,30 @@ code=200
>>> ETag: "5af18630baf1c21:8ad"


[1] request for test.example.org/page WITH "special" chars.html:
<<< GET /page%20WITH%20%22special%22%20chars.html HTTP/1.1
<<< Referer: http://test.example.org/
<<< Connection: keep-alive
<<< Host: test.example.org
<<< User-Agent: Mozilla/4.5 (compatible; HTTrack 3.0x; Windows 98)
<<< Accept: text/html,image/png,image/jpeg,image/pjpeg,image/x-xbitmap,image/svg+xml,image/gif;q=0.9,*/*;q=0.1
<<< Accept-Language: en, *
<<< Accept-Encoding: gzip, identity;q=0.9


[1] response for test.example.org/page WITH "special" chars.html:
code=200
>>> HTTP/1.1 200 OK
>>> Server: nginx/1.12.1
>>> Date: Fri, 02 Feb 2018 06:27:27 GMT
>>> Content-Type: text/html
>>> Content-Length: 13
>>> Last-Modified: Fri, 02 Feb 2018 06:25:05 GMT
>>> Connection: keep-alive
>>> ETag: "5a740441-d"
>>> Accept-Ranges: bytes


request for www.industry.gov.au/acreagereleases/Images/red_on.jpg:
<<< GET /acreagereleases/Images/red_on.jpg HTTP/1.1
<<< Referer: http://www.industry.gov.au/acreagereleases/ar_home.html
Expand Down
Binary file not shown.
39 changes: 24 additions & 15 deletions test/au/gov/nla/httrack2warc/Httrack2WarcTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -67,21 +67,30 @@ public void test() throws IOException {
}

assertEquals("warcinfo null\n" +
"response http://test.example.org/\n" +
"request http://test.example.org/\n" +
"metadata http://test.example.org/\n" +
"response http://test.example.org/style.css\n" +
"request http://test.example.org/style.css\n" +
"metadata http://test.example.org/style.css\n" +
"response http://test.example.org/query.html?page=1&query=2&FOO=3&&BaR=4&&#anchor\n" +
"request http://test.example.org/query.html?page=1&query=2&FOO=3&&BaR=4&&#anchor\n" +
"metadata http://test.example.org/query.html?page=1&query=2&FOO=3&&BaR=4&&#anchor\n" +
"response http://test.example.org/another\n" +
"request http://test.example.org/another\n" +
"metadata http://test.example.org/another\n" +
"response http://test.example.org/image.gif\n" +
"request http://test.example.org/image.gif\n" +
"metadata http://test.example.org/image.gif\n",
"response http://test.example.org/\n" +
"request http://test.example.org/\n" +
"metadata http://test.example.org/\n" +
"response http://test.example.org/style.css\n" +
"request http://test.example.org/style.css\n" +
"metadata http://test.example.org/style.css\n" +
"response http://test.example.org/query.html?page=1&query=2&FOO=3&&BaR=4&&#anchor\n" +
"request http://test.example.org/query.html?page=1&query=2&FOO=3&&BaR=4&&#anchor\n" +
"metadata http://test.example.org/query.html?page=1&query=2&FOO=3&&BaR=4&&#anchor\n" +
"response http://test.example.org/another\n" +
"request http://test.example.org/another\n" +
"metadata http://test.example.org/another\n" +
"response http://test.example.org/redirect\n" +
"request http://test.example.org/redirect\n" +
"metadata http://test.example.org/redirect\n" +
"response http://test.example.org/page%20WITH%20%22special%22%20chars.html\n" +
"request http://test.example.org/page%20WITH%20%22special%22%20chars.html\n" +
"metadata http://test.example.org/page%20WITH%20%22special%22%20chars.html\n" +
"response http://test.example.org/image.gif\n" +
"request http://test.example.org/image.gif\n" +
"metadata http://test.example.org/image.gif\n" +
"response http://test.example.org/image404.png\n" +
"request http://test.example.org/image404.png\n" +
"metadata http://test.example.org/image404.png\n",
summary.toString());
}

Expand Down
13 changes: 13 additions & 0 deletions test/au/gov/nla/httrack2warc/httrack/HtsIoinfoParserTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,19 @@ public void test() throws IOException {
"Last-Modified: Mon, 24 Mar 2003 04:02:30 GMT\r\n" +
"ETag: \"5af18630baf1c21:8ad\"\r\n\r\n", ioinfo.header);

assertTrue(ioinfo.parseRecord());
assertTrue(ioinfo.request);
assertEquals("test.example.org/page WITH \"special\" chars.html", ioinfo.url);
assertEquals("GET /page%20WITH%20%22special%22%20chars.html HTTP/1.1\r\n" +
"Referer: http://test.example.org/\r\n" +
"Connection: keep-alive\r\n" +
"Host: test.example.org\r\n" +
"User-Agent: Mozilla/4.5 (compatible; HTTrack 3.0x; Windows 98)\r\n" +
"Accept: text/html,image/png,image/jpeg,image/pjpeg,image/x-xbitmap,image/svg+xml,image/gif;q=0.9,*/*;q=0.1\r\n" +
"Accept-Language: en, *\r\n" +
"Accept-Encoding: gzip, identity;q=0.9\r\n" +
"\r\n", ioinfo.header);

while (ioinfo.parseRecord()) {
assertNotNull(ioinfo.url);
}
Expand Down
12 changes: 6 additions & 6 deletions test/au/gov/nla/httrack2warc/httrack/HttrackRecordTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,13 @@ public static void setUp() throws IOException {
public void test() throws IOException {
try (HttrackCrawl crawl = new HttrackCrawl(crawlPath)) {
assertEquals("3.49-2", crawl.getHttrackVersion());
assertEquals(LocalDateTime.parse("2017-10-25T18:41:47"), crawl.getLaunchTime());
assertEquals(LocalDateTime.parse("2018-02-02T15:27:21"), crawl.getLaunchTime());

List<HttrackRecord> recordList = new ArrayList<>();
crawl.forEach(recordList::add);

HttrackRecord entry = recordList.get(0);
assertEquals(LocalDateTime.parse("2017-10-25T18:41:48"), entry.timestamp);
assertEquals(LocalDateTime.parse("2018-02-02T15:27:22"), entry.timestamp);
assertEquals("http://test.example.org/", entry.url);
assertEquals("text/html", entry.mime);
assertEquals("GET / HTTP/1.1\r\n" +
Expand All @@ -69,12 +69,12 @@ public void test() throws IOException {
"\r\n", entry.requestHeader);
assertEquals("HTTP/1.1 200 OK\r\n" +
"Server: nginx/1.12.1\r\n" +
"Date: Wed, 25 Oct 2017 09:41:48 GMT\r\n" +
"Date: Fri, 02 Feb 2018 06:27:22 GMT\r\n" +
"Content-Type: text/html\r\n" +
"Content-Length: 219\r\n" +
"Last-Modified: Wed, 25 Oct 2017 09:41:34 GMT\r\n" +
"Content-Length: 353\r\n" +
"Last-Modified: Fri, 02 Feb 2018 06:26:32 GMT\r\n" +
"Connection: keep-alive\r\n" +
"ETag: \"59f05c4e-db\"\r\n" +
"ETag: \"5a740498-161\"\r\n" +
"Accept-Ranges: bytes\r\n" +
"\r\n", entry.responseHeader);

Expand Down

0 comments on commit bccb83c

Please sign in to comment.