-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtransform.py
executable file
·114 lines (88 loc) · 3.72 KB
/
transform.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#! /usr/bin/python
# Copyright (c) 2018 Stanford University
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR(S) DISCLAIM ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL AUTHORS BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
import re
import os.path
import urllib
# This script takes the 100k most common words from http://norvig.com/ngrams/
# and generates a .cc file from which we can query the top 1k words at random,
# weighted by their actual likelihood of appearance.
# First download the dictionary, if we don't already have it.
if not os.path.isfile("count_1w100k.txt"):
    # urlretrieve lives in urllib.request on Python 3 and directly in urllib
    # on Python 2; import it locally so the script runs under either one.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve
    urlretrieve("http://norvig.com/ngrams/count_1w100k.txt", "count_1w100k.txt")

# Translate the word list into CommonWords.cc. The generated file holds a
# table of {word, cumulative count} entries followed by the table's totals and
# the implementation of RandomWordGenerator::getRandomWord().
with open("count_1w100k.txt", 'r') as iFile, open("CommonWords.cc", 'w') as oFile:
    # Fixed preamble of the generated file, up to the opening brace of the table.
    oFile.write("""
/**
* ** WARNING ** This is a generated file!!!
*
* Do not directly edit this file. Instead, edit it via transform.py
*
*/
#include <cstdint>
#include <random>
#include "CommonWords.h"
namespace WordData {
/**
* This file creates a virtual word array whereby the number of slots a word takes up
* is directly proportional to how likely it is to appear in the dataset provided by
* Peter Norving (http://norvig.com/ngrams/). The goal here is to create a structure
* where if we uniform randomly indexed into the array, the random words we'd get out
* of it would reflect the actual distribution of words in dataset.
*
* What we end up with is an physical array of WordIndex objects where each WordIndex
* stores a word from the dataset and a range of virtual indecies it occupies in the form
* of an endIndex (the start index is implicitly starts from the endIndex of the
* WordIndex that preceded it).
*/
struct WordIndex {
const char *word; // Word from the dataset
unsigned long int endIndex; // The end of the range of Indecies this word occupies
};
static struct WordIndex OccuranceMap[] =
{
""")

    # A usable input line is a run of non-space characters (the word), then
    # whitespace, then a decimal count. Raw string so that \s and \d reach the
    # regex engine instead of being treated as (invalid) string escapes.
    matcher = re.compile(r"([^ ]+)\s+(\d+)")
    numWords = 0
    cumulativeCount = 0

    # Stream the file line by line; lines that do not match are skipped.
    for line in iFile:
        m = matcher.match(line)
        if m:
            word = m.group(1)
            count = int(m.group(2))
            numWords = numWords + 1
            # Every entry stores the running total, i.e. the exclusive end of
            # the range of virtual indices that belongs to this word.
            cumulativeCount = cumulativeCount + count
            oFile.write("\t{\"%s\", %d },\r\n" % (word, cumulativeCount))

    # Close the table, then emit its totals and the lookup routine.
    # std::uniform_int_distribution draws from a CLOSED interval while each
    # word owns the half-open range [previous endIndex, endIndex), so the
    # largest valid draw is indexLimit - 1. Drawing indexLimit itself would
    # select the word just past wordLimit (or fall through to the first word).
    oFile.write("""\
};
// Maximum index in our virtual array
unsigned long int maxEndIndex = %d;
// Maximum index in our physical array
unsigned long int numUniqueWords = %d;
const char*
RandomWordGenerator::getRandomWord() {
uint64_t indexLimit = OccuranceMap[wordLimit - 1].endIndex;
std::uniform_int_distribution<uint64_t> indexDist(0, indexLimit - 1);
uint64_t randomIndex = indexDist(generator);
// Linear search through our list to find the word at the appropriate occurrence index
// Note: we could do binary search, but the indexes are exponentially distributed,
// which means a large number of them will appear at the front of the list. So let's just
// do the simple thing until we need something faster.
for (int i = 0; i < numUniqueWords; ++i) {
if (randomIndex < OccuranceMap[i].endIndex)
return OccuranceMap[i].word;
}
return OccuranceMap[0].word;
}
}; // namesace WordData
""" % (cumulativeCount, numWords))