#!/usr/bin/env python
# marky.py -- forked from vgel/Marky
import collections
import functools
import itertools
import random
import re
import sys
class MarkovLookup(object):
    """Frequency table of the words seen following one particular word.

    Class members:
    self.word -- the word this table belongs to
    self.number_w -- total number of followers recorded, counting repeats
    """

    def __init__(self, word):
        """Create an empty table for `word`."""
        self.word = word
        self.number_w = 0
        self._words = collections.defaultdict(int)

    def add_word(self, word):
        """Record one more occurrence of `word` directly after self.word."""
        self.number_w += 1
        self._words[word] += 1

    def get_rand_word(self, rand, acceptable):
        """Pick a follower at random, weighted by how often it was recorded.

        Arguments:
        rand -- function with the signature of random.randint: (low, high) -> integer in [low, high], both ends inclusive
        acceptable -- function (self.word, candidate) -> bool; candidates it rejects are never returned and do not count towards the weights

        Returns the chosen word, or None if no recorded follower is acceptable.
        """
        candidates = [w for w in self._words if acceptable(self.word, w)]
        if not candidates:
            return None
        # Draw from 1..total over the acceptable words only. A draw of 0 used to
        # select nothing (returning None although followers existed), and draws
        # above the total used to over-weight the last word.
        total = sum(self._words[w] for w in candidates)
        r = rand(1, total)
        for w in candidates:
            r -= self._words[w]
            if r <= 0:
                return w
        # Only reachable when `rand` returned more than `total`; stay defensive.
        return candidates[-1]
class MarkovChain(object):
    """A markov chain, which can be used to generate pseudo-random text.

    A Markov Chain is really a mathematical concept, not a text-generation or even NLP one, however it works very well for generating text.
    The basic idea is to take a large passage (source text), and count what words appear after other words.
    For example, take this excerpt:

        The Sun came up upon the left, Out of the sea came he!
        And he shone bright, and on the right Went down into the sea.

    Two things immediately jump out: this isn't bog-standard English, and there's a few words repeated.
    Markov chains exploit the second to work with the first. When processing this into a markov chain, the chain
    would see that two words come after the word 'came' in different situations: 'up' and 'he'. So if the current state
    of the markov chain was 'came', it would randomly choose either 'up' or 'he'. This can lead to the construct 'came he',
    which you would never see in normal english but is common in olde-timey literature like this. Thus, the generated text
    would resemble the source.

    Of course, markov chains are far from perfect. Since they only use 1 word of context, they don't follow grammar conventions
    at all. The text you get out won't make sense, besides rare flashes of intelligence. However, it can certainly produce
    interesting texts.

    Class members:
    self.last_word -- the last word returned by get_word()/next()
    self.word_tree -- dict mapping each (lower-cased) word to the lookup table of its followers
    self.rand -- the random-integer function passed to the lookup tables
    """

    def __init__(self, step, words, rand=random.randint):
        """Create a new MarkovChain instance. You should generally use the shorter function chain() unless you need specific functionality of the full constructor.

        Arguments:
        step -- the step to read in the source text. The amount is the number of words ahead to pair words, the sign is forwards or backwards.
            For example: a step of 1 in (apple, orange, peach) would pair (apple, orange), (orange, peach); 2 would pair (apple, peach); and -2 would pair (peach, apple).
        words -- the tokenized words. You can use the module-level function word_tokenize to do this if you have installed nltk.
        rand -- (default random.randint) -- how to pick words. Needs to conform to the same signature as random.randint: (min inclusive, max inclusive) -> number
        """
        self._generate_word_tree(step, words)
        self.rand = rand
        self.last_word = None

    def _generate_word_tree(self, step, words):
        """Fill self.word_tree with one lookup table per word, pairing words `step` apart."""
        self.word_tree = {}
        # Copy first: a negative step reverses the sequence, and that must not
        # reorder the list the caller handed in.
        w_list = list(words)
        if step < 0:
            w_list.reverse()
        for w, n in zip(w_list, w_list[abs(step):]):
            w, n = w.lower(), n.lower()
            # Pairs that involve a pure-punctuation token are not recorded.
            if is_punc(w) or is_punc(n):
                continue
            if w not in self.word_tree:
                self.word_tree[w] = MarkovLookup(w)
            self.word_tree[w].add_word(n)

    def reset(self, start_at=None):
        """Reset the markov chain as if it hadn't been run and had last_word set.

        Arguments:
        start_at (optional) -- What to set last_word to instead of None, if you got a word out of something else (another markov chain, for example)
        """
        self.last_word = start_at

    def get_word(self, acceptable=lambda word1, word: True):
        """Get a word based on the current state.

        As discussed in the class doc. If last_word is None, or has no acceptable follower, return a random
        word from all words known (chosen uniformly with random.choice, not with self.rand).

        Arguments:
        acceptable (optional): a function, taking the current and proposed next state, and returning if it's ok.
            This is used in the poetry example to check if two words rhyme to generate the rhyming lines, for example.
            The current state is passed as None when picking from all known words.
            Defaults to always true.

        Returns the new last_word, or None if no known word is acceptable.
        """
        known = self.last_word and self.last_word in self.word_tree
        if known:
            self.last_word = self.word_tree[self.last_word].get_rand_word(self.rand, acceptable)
        if not known or not self.last_word:  # word with no follow / not initialized
            # Build a real list: random.choice needs a sequence, and emptiness
            # must be testable (filter() is lazy on Python 3).
            accepted = [w for w in self.word_tree if acceptable(None, w)]
            if not accepted:
                return None  # Nothing accepted....
            self.last_word = random.choice(accepted)
        return self.last_word

    def __iter__(self):
        return self

    def next(self):
        """Return the next generated word; stop iterating if the chain has no words at all."""
        word = self.get_word()
        if word is None:
            # Otherwise an empty chain would yield None forever.
            raise StopIteration
        return word

    __next__ = next  # Python 3 spelling of the iterator protocol
def chain(step, words):
    """Shortcut constructor: build a MarkovChain over `words` with the given step and the default random source."""
    markov = MarkovChain(step, words)
    return markov
def is_punc(word):
    """Tell whether `word` is pure punctuation, i.e. holds no ASCII letter or digit **anywhere**."""
    # One alphanumeric run is enough to count as a word, so tokens with
    # embedded punctuation (things like hiv/aids) are not punctuation.
    has_alnum = re.search('[a-zA-Z0-9]+', word) is not None
    return not has_alnum
def take(iterable, n):
    """Collect the first n elements of the iterable into a list.

    take(chain(1, words), 100) - generate 100 words from the markov chain"""
    head = itertools.islice(iterable, n)
    return [item for item in head]
def strip_end_punc(word):
    """Strip broken punctuation caused by nltk.word_tokenize from the ends of words.

    Everything after the last letter/digit is removed ('word.' -> 'word').
    A word containing no letter or digit at all (including the empty string)
    is returned unchanged instead of falling through and returning None.
    """
    # Walk backwards to the last alphanumeric character; testing one character
    # at a time avoids rescanning ever-longer suffixes.
    for end in range(len(word), 0, -1):
        if not is_punc(word[end - 1]):
            return word[:end]
    return word
def fix_passage_punc(passage):
    """Return the passage as a list with trailing punctuation stripped from every word; pure-punctuation tokens are kept as they are."""
    fixed = []
    for word in passage:
        if is_punc(word):
            fixed.append(word)
        else:
            fixed.append(strip_end_punc(word))
    return fixed
def word_tokenize(passage):
    """Split a passage into individual word/punctuation tokens using nltk.

    Requires nltk (nltk.org / pip install nltk) and raises ImportError if it's not installed.
    The tokenizer sometimes leaves periods on the end of words (seems to happen most often in
    passages loosely obeying grammar rules, like internet rants); fix_passage_punc strips those
    if you want."""
    try:
        import nltk
    except ImportError:
        message = 'word_tokenize requires nltk to be installed (nltk.org / pip install nltk)'
        raise ImportError(message)
    else:
        return nltk.word_tokenize(passage)
if __name__ == '__main__':
    # Usage: marky.py SOURCE_TEXT_FILE -- prints 100 generated words.
    if len(sys.argv) < 2:
        sys.exit('usage: %s SOURCE_TEXT_FILE' % sys.argv[0])
    # Read through a context manager so the file handle is always closed.
    with open(sys.argv[1]) as source:
        text = source.read()
    mark = chain(1, fix_passage_punc(word_tokenize(text)))
    # A chain built from a text without usable word pairs can yield None;
    # drop those so join() does not fail. The parenthesized print works as
    # both the Python 2 statement and the Python 3 function.
    print(' '.join(word for word in take(mark, 100) if word is not None))