PageRank.scala
import java.nio.file.{StandardOpenOption, Paths, Files}
import java.util.Scanner
import org.jsoup.Jsoup
import scala.io.Source
import scala.math.Ordering
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object PageRank {

  // Default configuration; the dataset path and master URL can be overridden
  // by the first and second command-line arguments.
  var wikiData = "C:\\dataset\\freebase-wex-2009-01-12-articles.tsv"
  var masterUrl = "local"
  val topPages = 100
  val iterations = 5

  // Output files for the three run modes.
  val graphxresult = "graphx-result.log"
  val sparkresult = "spark-result.log"
  val top100result = "top100-result.log"

  def main(args: Array[String]) {
    // Arg 1 - wiki data location.
    if (args.length >= 1) {
      wikiData = args(0).trim
    }
    println("Dataset directory : " + wikiData)

    // Arg 2 - master location.
    if (args.length >= 2) {
      masterUrl = args(1).trim
    }
    println("Master URL : " + masterUrl)
    println("Top count : " + topPages)

    val sparkConf = new SparkConf().setAppName("PageRank-43394921")
    val sc = new SparkContext(sparkConf)

    val scan = new Scanner(System.in)
    var list: Graph[(Double, String), Double] = null

    // Interactive loop: keep prompting until the user enters -1.
    while (true) {
      Thread.sleep(500)
      println("===============================================================================")
      println("Enter option to run: \n\t1 - Pagerank on GraphX (Default) \n\t2 - Pagerank on Spark \n\t3 - Top 100 List")
      println("Waiting..")
      val line = scan.nextLine()
      var opt = 1
      if (line.length > 0) {
        opt = line.toInt
      }
      println("Read " + opt)

      if (opt == 1) {
        list = PagerankGraphX.printTopPages(wikiData, masterUrl, topPages, iterations, graphxresult, sc)
      }
      if (opt == 2) {
        PagerankSpark.printTopPages(wikiData, masterUrl, topPages, iterations, sparkresult, sc)
      }
      if (opt == 3) {
        // The top-100 report needs GraphX ranks; compute them first if not cached yet.
        if (list == null) {
          println("Evaluating pagerank using GraphX!!")
          list = PagerankGraphX.printTopPages(wikiData, masterUrl, topPages, iterations, graphxresult, sc)
        }
        Files.deleteIfExists(Paths.get(top100result))
        write("\n==============================================================", top100result)
        // Keep only the ranked pages that appear in the bundled scraped list.
        val scraped = Source.fromURL(getClass.getResource("/ScrapedList.txt")).getLines().toList
        list.vertices.top(list.triplets.count().toInt) {
          Ordering.by((entry: (VertexId, (Double, String))) => entry._2._1)
        }.filter { x =>
          scraped contains x._2._2
        }.take(100).foreach(x => write("\n" + x._2._2 + " has rank: " + x._2._1, top100result))
      }
      if (opt == -1) {
        // Shut Spark down cleanly before exiting.
        sc.stop()
        System.exit(0)
      }
    }
  }

  // Print the line and append it to the given result file.
  def write(r: String, path: String) = {
    println(r)
    Files.write(Paths.get(path), r.getBytes("utf-8"), StandardOpenOption.CREATE, StandardOpenOption.APPEND)
  }

  // Scrape a list of names from the 4icu.org review pages (index2.htm .. index27.htm).
  def scrapeList(): List[String] = {
    var univs = List[String]()
    val urls = for (i <- 2 to 27) yield s"http://www.4icu.org/reviews/index$i.htm"
    for (url <- urls) {
      Thread.sleep(100)
      println("Scraping " + url)
      val doc = Jsoup.connect(url).maxBodySize(204800000).timeout(100000).get()
      // Select the small PNG icons; their alt text holds the scraped name.
      val links = doc.select("img[src$=.png][width=16]")
      val it = links.iterator()
      while (it.hasNext) {
        val link = it.next()
        val u = link.attr("alt")
        univs = univs ::: List(u.trim)
      }
    }
    univs
  }
}
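
PagerankGraphX and PagerankSpark are defined in separate source files of this project and are not shown above. As a rough orientation only, the sketch below (under the hypothetical name PagerankGraphXSketch) shows one way a GraphX-based printTopPages with the call signature used in main could look; the TSV column layout (title in the second field, XML body with <target> link tags in the fourth) and the title-hashing scheme are assumptions about the Freebase WEX articles dump, not something this file confirms.

// Hypothetical sketch only; not the project's actual PagerankGraphX implementation.
object PagerankGraphXSketch {

  import org.apache.spark.SparkContext
  import org.apache.spark.graphx._

  def printTopPages(wikiData: String, masterUrl: String, topPages: Int,
                    iterations: Int, resultFile: String,
                    sc: SparkContext): Graph[(Double, String), Double] = {

    // Assumed WEX row layout: tab-separated, title in field 1, XML body in field 3.
    val articles = sc.textFile(wikiData).flatMap { line =>
      val fields = line.split('\t')
      if (fields.length > 3) Some((fields(1), fields(3))) else None
    }.cache()

    // Derive a VertexId from the page title so links can be matched to pages.
    val pageId = (title: String) => title.toLowerCase.hashCode.toLong

    // Vertices keep the title; edges come from <target>...</target> links in the XML.
    val vertices = articles.map { case (title, _) => (pageId(title), title) }
    val linkPattern = "<target>(.+?)</target>".r
    val edges = articles.flatMap { case (title, xml) =>
      linkPattern.findAllMatchIn(xml).map(m => Edge(pageId(title), pageId(m.group(1)), 1.0))
    }
    val graph = Graph(vertices, edges, "").cache()

    // Run a fixed number of PageRank iterations and re-attach the titles to the ranks.
    val ranks = graph.staticPageRank(iterations).vertices
    val ranked = graph.outerJoinVertices(ranks) { (_, title, rankOpt) =>
      (rankOpt.getOrElse(0.0), title)
    }

    // Report the highest-ranked pages through PageRank.write, as main expects.
    ranked.vertices
      .top(topPages)(Ordering.by((entry: (VertexId, (Double, String))) => entry._2._1))
      .foreach(entry => PageRank.write("\n" + entry._2._2 + " has rank: " + entry._2._1, resultFile))

    ranked
  }
}

A driver like this would normally be launched with spark-submit, for example spark-submit --class PageRank --master local[*] <project-jar> <path-to-articles.tsv> local[*], where the jar name and arguments are placeholders rather than values taken from this repository.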