-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcrawlerWrapper
114 lines (95 loc) · 2.26 KB
/
crawlerWrapper
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
// PhantomJS filesystem module; test() uses it to append result lines to output.txt.
var fs = require('fs');
// Detection flag shared across crawls: test() sets it to "ok" when a requested
// URL matches /pwt/ and resets it to "nok" before starting the next crawl.
var tt = '';
// Index into arrdata of the entry currently being crawled; advanced by test().
var i = 0;
// PhantomJS system module; not referenced anywhere in the visible code.
var system = require('system');
/**
 * Crawl one URL with a fresh PhantomJS page and record in output.txt whether
 * the page issued any resource request whose URL contains "pwt".
 *
 * When the page has finished loading (or failed), one line is appended to
 * output.txt ("<url> Pubmatic Present" / "<url> Pubmatic Non Present"), the
 * page is closed, and the next non-blank entry of the global `arrdata` list
 * is crawled. When no entry is left the script exits via phantom.exit().
 *
 * Relies on the file-level globals `fs`, `tt` (detection flag), `i` (current
 * index into `arrdata`) and `arrdata` (list of targets).
 *
 * @param {string} url - URL to open.
 */
var test = function(url){
    var NR = 0; // number of resource requests seen for this page
    var webPage = require('webpage');
    var page = webPage.create();
    page.settings.resourceTimeout = 3500;
    page.clearMemoryCache();

    // Append a single result line to output.txt, always releasing the handle.
    var record = function(text){
        var stream = fs.open('output.txt', 'a');
        try {
            stream.writeLine(text);
        } finally {
            stream.close();
        }
    };

    // Advance the global index to the next non-blank entry of arrdata.
    // Returns that entry, or null once the list is exhausted. Blank entries
    // are skipped instead of ending the crawl, and running past the end of
    // the array is treated as "done" rather than dereferencing undefined.
    var nextTarget = function(){
        i++;
        while (i < arrdata.length) {
            if (arrdata[i] !== "") {
                return arrdata[i];
            }
            i++;
        }
        return null;
    };

    // After the request budget is spent, stop loading on the next resource error.
    var stopOnResourceError = function(){
        page.onResourceError = function(){ page.stop(); };
    };

    // Handlers are registered before page.open() so that no early request
    // can be issued without them being in place.
    page.onResourceRequested = function(requestData, networkRequest) {
        NR++;
        if (requestData.url.match(/pwt/g) != null) {
            console.log("present");
            tt = "ok";
            return;
        }
        if (tt === "ok") {
            // Marker already found: nothing else needs to be downloaded.
            networkRequest.abort();
            return;
        }
        if (NR === 500) {
            console.log(NR);
            networkRequest.abort();
            stopOnResourceError();
            return;
        }
        // Any other request is left to proceed. (The request object only
        // offers abort/changeUrl/setHeader; there is no cancel().)
    };

    // Script errors raised by the crawled page must not break the crawl;
    // report them and carry on.
    page.onError = function(msg) {
        console.log('page error: ' + msg);
    };

    page.open(url, function (status) {
        console.log(url + tt);
        if (status === 'fail') {
            console.log('failed, thats all i know');
        }

        if (tt === "ok") {
            record(url + ' Pubmatic Present');
        } else {
            record(url + ' Pubmatic Non Present');
        }

        // Release the page whether or not the load succeeded; a new page is
        // created for every URL, so leaving it open leaks one per crawl.
        page.close();

        var line = nextTarget();
        if (line === null) {
            console.log("Closing");
            phantom.exit();
            return;
        }

        if (line.match(/http/g) == null) {
            line = "http://" + line;
        }
        console.log(line);
        console.log("LunchCrawling");
        tt = "nok";
        test(line);
    });
};
// Entry point: load the list of targets from testDomain.txt and start the
// crawl chain with the first one. test() walks the rest of `arrdata` itself.
var fs = require('fs'),
    filedata = fs.read('testDomain.txt'), // whole file as a single string
    // Split on any run of CR/LF characters and drop blank entries, so that
    // CRLF files and stray empty lines do not inject "" entries (which
    // test() interprets as the end of the list) between real domains.
    arrdata = filedata.split(/[\r\n]+/).filter(function (entry) {
        return entry !== "";
    });

// Explicit end-of-list sentinel: test() stops when it reaches an empty entry,
// so one is always present even if the file has no trailing newline.
arrdata.push("");

if (arrdata[0] === "") {
    // Nothing to crawl (empty file or only blank lines).
    console.log("Closing");
    phantom.exit();
} else {
    if (arrdata[0].match(/http/g) == null) {
        arrdata[0] = "http://" + arrdata[0];
    }
    console.log(arrdata[0]);
    test(arrdata[0]);
}