-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRabinKarp.scala
264 lines (232 loc) · 11.3 KB
/
RabinKarp.scala
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
import scala.lms.common._
/**
* This code is based on the Scalacaster project, https://github.com/vkostyukov/scalacaster
* which is written by Vladimir Kostyukov, http://vkostyukov.ru
*
* We have taken the implementation of the Rabin-Karp string searching algorithm that can be found there.
* This algorithm was implemented in a single function:
* matchRabinKarp(text: String, pattern: String): Int
* See https://en.wikipedia.org/wiki/Rabin-Karp_algorithm for an explanation of the algorithm.
* We fixed the code so it worked correctly, modified it so it could be staged,
* and then implemented a staged version of the algorithm.
* You can find the generated code in the "/out" folder.
*
* The experiment parameters are defined directly below.
* After that comes the run() function that performs the experiment.
* The unstaged and staged implementations of the Rabin-Karp algorithm are defined below that.
*/
object RabinKarp extends IO with UnstagedRabinKarp {
val under = "RabinKarp"
// We will search in a pre-defined text of random characters of this length.
val textSize = 500
// We will search in the pre-defined text for search patterns of this length.
val patternSize = 10
// We will generate this many search patterns.
val numPatterns = 10000
// When generating patterns to search for, we generate both random
// gibberish (which will not appear in the pre-defined text), and random
// substrings from that text (which will of course be found).
// This parameter determines the percentage of random gibberish strings.
val randomStringPercentage = 30
// When timing the algorithms, we repeat the whole experiment this many times.
val numTests = 10
// In each experiment, we match every pattern against the search string.
// In total, we run both the staged and unstaged algorithm {numTests} * {numPatterns} times.
val rng = scala.util.Random
// The pre-defined text consists of {textSize} random alphanumeric characters.
val predefinedSearchText = rng.alphanumeric.take(textSize).mkString
val snippet = new DslDriver[String,Int] with StagedRabinKarp {
def snippet(str: Rep[String]) = matchRabinKarp(predefinedSearchText, str, patternSize)
}
def matchRabinKarp(pattern: String): Int = {
snippet.eval(pattern)
}
def run(): Unit = {
println("Precompiling code snippet...")
snippet.precompile
println("Generating code snippet in /out/"+under+"-"+textSize+"-"+patternSize+".actual.scala")
exec("-"+textSize.toString+"-"+patternSize.toString, snippet.code)
println("Generating random search patterns...")
val patterns = new Array[String](numPatterns)
for (i <- 0 until numPatterns) {
if (rng.nextInt(100) < randomStringPercentage) {
// Generate random gibberish
patterns(i) = rng.alphanumeric.take(patternSize).mkString
}
else {
val randomStartIndex = rng.nextInt(predefinedSearchText.length - patternSize + 1)
// Pick a random fragment from the search string
patterns(i) = predefinedSearchText.substring(randomStartIndex, randomStartIndex + patternSize)
}
}
// Run both the staged and unstaged functions against all inputs once.
// This gives the JVM the chance to do things like optimizing frequently
// called functions before we actually start timing things.
var unstagedSearchResults = new Array[Int](0)
unstagedSearchResults = patterns.map(pattern => matchRabinKarp(predefinedSearchText, pattern))
var stagedSearchResults = new Array[Int](0)
stagedSearchResults = patterns.map(pattern => matchRabinKarp(pattern))
val combinedUnstagedAndStagedResults = unstagedSearchResults zip stagedSearchResults
println("Verifying results...")
for ((unstagedResult, stagedResult) <- combinedUnstagedAndStagedResults) {
assert(unstagedResult == stagedResult, "Unstaged and staged results are not the same!")
}
println("Staged and unstaged algorithms agree on all inputs!")
for (test <- 1 to numTests) {
println(s"\n===== Test $test =====")
println(s"Running unstaged matcher")
utils.time {
unstagedSearchResults = patterns.map(pattern => matchRabinKarp(predefinedSearchText, pattern))
}
println("Running staged matcher")
utils.time {
stagedSearchResults = patterns.map(pattern => matchRabinKarp(pattern))
}
}
}
}
/**
* The original code simply used Scala's modulo (%) operator. However,
* this doesn't work: the modulo operator in Scala performs truncated division.
* This yields incorrect results for negative numbers:
* 3 % 8 = 3
* -3 % 8 = -3
*
* We have added a new operator, %%, that relies on Scala's Math.floorMod() function.
* This works correctly:
* 3 %% 8 = 3
* -3 %% 8 = 5
*
* The version of %% that works on regular Ints is defined here.
* See IntOps.scala for a version that works on Rep[Int]s.
*/
trait FloorMod {
def infix_%%(x: Int, y: Int): Int = Math.floorMod(x, y)
}
/**
* The original code used the .foldLeft() function on ranges, like this:
* (1 until m).foldLeft(1)((res, i) => (res * a) %% q)
* Unfortunately, the foldLeft() function was not pre-defined for Rep[Range]s.
* We have implemented this here.
*/
trait FoldableRanges extends Dsl {
implicit def foldableRange(range: Rep[Range]): FoldableRange = new FoldableRange(range)
class FoldableRange(range: Rep[Range]) {
def foldLeft(startValue: Rep[Int])(func: (Rep[Int], Rep[Int]) => Rep[Int]): Rep[Int] = {
var result: Var[Int] = startValue
for (i <- range.start until range.end) {
result = func(result, i)
}
result
}
}
}
/**
* This is the unstaged Rabin-Karp algorithm.
* Note that it has been modified to use our new %% operator.
* Also, the original version of the algorithm was recursive
* (this can be seen in the Scalacaster GitHub repository).
* When we wrote the staged version of this function, we found
* that the staged recursive function lead to excessive nesting
* of if-expressions in the generated code.
* (n levels of nesting for an input of length n, which leads to generator crashes when n > ~50)
* We have therefore re-written the algorithm to use a for-loop.
*/
trait UnstagedRabinKarp extends FloorMod {
/**
* Checks whether the pattern 'pattern' is substring of 'text' with Rabin-Karp algorithm.
* If it maches then the function returns the start index, else returns -1.
*
* http://www.geeksforgeeks.org/searching-for-patterns-set-3-rabin-karp-algorithm/
*
* Time - O(n + m)
* Space - O(1)
*/
def matchRabinKarp(text: String, pattern: String): Int = {
val n = text.length
val m = pattern.length
val q = 3355439 // a large prime
val a = 256 // the base of the rolling hash
val aMax = (1 until m).foldLeft(1)((res, i) => (res * a) %% q) // aMax = a^(m-1) % q
def hash(string: String, length: Int): Int =
(0 until length).foldLeft(0)((hash, i) => (hash * a + string(i)) %% q)
var textHash: Int = hash(text, m)
val patternHash: Int = hash(pattern, m)
if (textHash == patternHash)
return 0
for (i <- (0 until n - m): Range) {
val tempHash = (textHash - (aMax * text(i)) %% q) %% q // subtract old character
textHash = (tempHash * a + text(i + m)) %% q // add new character
if (textHash == patternHash)
return i + 1
}
-1
}
}
/**
* This is the staged Rabin-Karp algorithm.
* The text to be searched is known at staging time; the
* pattern for which to search will only be known at run-time.
*
* Note that in order to generate a properly optimized version of the function, we
* do not need to know the search pattern, but we do need to know the length of the pattern.
* We have therefore added an additional patternLength parameter, which will have to be
* provided at staging time. So, if the provided length is 10, for example, the generated
* function will only be able to match patterns of length 10.
*
* We imagine that users of this function will typically want to search for
* short pieces of text in long texts; one could generate a number of different versions of
* this function, one for each desired pattern length, and store them in a Map.
* When a pattern would have to be matched, the function with the corresponding pattern length
* could then be retrieved from the map; or, if a generated function is not found, it could
* be generated at that moment (this would of course take a while).
* For our experiments, we have simply picked one pattern length, generated one corresponding
* function, and only generated patterns of that specific length.
*
* We have added Rep type annotations to the original function.
* We ran into type errors, because not all operations we were using were defined for staged types.
* - The FoldableRanges trait defines a foldLeft() function for Rep[Range]s
* -> see above
* - We have defined the %% operator for Rep[Int]s in terms of IR nodes/generated code
* -> see IntOps.scala
* - We perform arithmetic operations on a mix of Chars and Ints. In regular Scala, Chars
* will be implicitly converted to Ints where needed; this doesn't happen for Rep[Char]s.
* We have added an implicit .toInt function on Rep[Char]s in terms of IR nodes/generated code
* -> see CharOps.scala
*
* Also note that we could simply return from inside the unstaged matchRabinKarp() function.
* Scala LMS doesn't allow you to return from a staged function.
* A partially working implementation of return is provided as a function called returnL().
* Apparently, this is not always guaranteed to work.
* Fortunately, it works fine for our particular use case, so we use returnL() here.
* (In general, instead of writing an imperative-style for-loop with return statements,
* it is good Scala practice to write recursive functions, but that is precisely what we
* were trying to avoid because a recursive implementation leads to too much nesting
* in the generated code files. This for-loop-implementation generates code with an unrolled loop;
* a flat, non-nested structure. It is interesting to node that in order to get a properly
* optimized function, one sometimes has to forsake the idiomatic code style.)
*/
trait StagedRabinKarp extends Dsl with FoldableRanges with FloorMod {
def matchRabinKarp(text: String, pattern: Rep[String], patternLength: Int): Rep[Int] = {
val n: Int = text.length
val m: Int = patternLength
val q: Int = 3355439 // a large prime
val a: Int = 256 // the base of the rolling hash
val aMax: Int = ((1 until m): Range).foldLeft(1)((res, i) => (res * a) %% q) // aMax = a^(m-1) % q
def hash(string: String, length: Int): Int =
((0 until length): Range).foldLeft(0)((hash, i) => (hash * a + string(i)) %% q)
def hashStaged(string: Rep[String], length: Int): Rep[Int] =
((0 until length): Rep[Range]).foldLeft(0)((hash, i) => (hash * a + string(i)) %% q)
var textHash: Int = hash(text, m)
val patternHash: Rep[Int] = hashStaged(pattern, m)
if (textHash == patternHash)
returnL(0)
for (i <- (0 until n - m): Range) {
val tempHash = (textHash - (aMax * text(i)) %% q) %% q // subtract old character
textHash = (tempHash * a + text(i + m)) %% q // add new character
if (textHash == patternHash)
returnL(i + 1)
}
-1
}
}