1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
| import sys
import time
import re
import collections
import numpy
import string
import cPickle as pickle
import twitterclient
class Markov(object):
    """Second-order (trigram) Markov-chain text generator.

    Training builds:
      * ``self.dictionary`` -- word -> integer id,
      * ``self.matrix``     -- (id1, id2) -> {id3: occurrence count},
    where every headline is wrapped in the sentinels '^' (start) and
    '$' (end).  Generation walks the chain, sampling each next word with
    probability proportional to its training-corpus frequency.
    """

    def read(self, filename):
        """Yield one tokenized headline per line of *filename*.

        Each line is lower-cased and split on whitespace, then wrapped in
        the '^' / '$' sentinel tokens.  ``str.split()`` never produces
        empty tokens, so no post-hoc cleanup of '' entries is needed.
        Blank lines are skipped (the previous code raised IndexError on
        them).
        """
        with open(filename) as f:
            for line in f:
                tokens = line.lower().split()
                if tokens:
                    yield ['^'] + tokens + ['$']

    def generateTrigrams(self, tokens):
        """Yield every consecutive (w1, w2, w3) trigram in *tokens*."""
        for idx in range(len(tokens) - 2):
            yield (tokens[idx], tokens[idx + 1], tokens[idx + 2])

    def generateMatrix(self, filename):
        """Build the word dictionary and occurrence matrix from *filename*.

        If a pickled matrix 'matrix.p' is present it is loaded instead of
        re-training.  NOTE(review): the dictionary is never persisted
        alongside the matrix, so the pickle fast-path restores counts but
        leaves ``self.dictionary`` empty -- generateParagraph cannot be
        used on that path until the dictionary is also saved.
        """
        self.matrix = {}
        self.dictionary = {}
        try:
            # Fast path: reuse a previously pickled occurrence matrix.
            with open('matrix.p', 'rb') as f:
                self.matrix = pickle.load(f)
        except (IOError, OSError, EOFError, pickle.UnpicklingError):
            # Slow path: interpret the file as newline-separated headlines.
            # (Previously a bare ``except:`` that hid unrelated errors.)
            print('reading headlines')
            for headline in self.read(filename):
                # Pass 1: assign an integer id to every word seen.
                for trigram in self.generateTrigrams(headline):
                    for word in trigram:
                        self.dictionary.setdefault(word, len(self.dictionary))
                # Pass 2: count third-word occurrences per (w1, w2) pair.
                for w1, w2, w3 in self.generateTrigrams(headline):
                    key = (self.dictionary[w1], self.dictionary[w2])
                    self.matrix.setdefault(key, collections.defaultdict(int))
                    self.matrix[key][self.dictionary[w3]] += 1
        # Total number of distinct (pair -> third word) entries observed.
        print(sum(len(d) for d in self.matrix.values()))
        #print("DUMPING %d" % len(self.matrix))
        #pickle.dump(self.matrix, open("matrix.p", "wb"))

    def generateNextWord(self, prev_word, current_word):
        """Sample the next word id given the (prev, current) id pair.

        The choice is weighted by frequency in the training corpus.
        Raises KeyError if the pair was never observed during training.
        """
        conditional_counts = self.matrix[(prev_word, current_word)]
        words, counts = zip(*conditional_counts.items())
        cumcounts = numpy.cumsum(counts)
        # Draw a uniform integer in [0, total) and find which bucket it
        # falls into -- equivalent to frequency-weighted sampling.
        coin = numpy.random.randint(cumcounts[-1])
        for index, threshold in enumerate(cumcounts):
            if threshold > coin:
                return words[index]

    def generateParagraph(self, seed2='i', seed1='^'):
        """Generate a capitalized sentence starting from the seed words.

        Walks the chain from (seed1, seed2) until the '$' end token is
        reached or 20 tokens have been emitted.  Fixes two crashes in the
        original: the loop compared an integer id against the word->id
        dictionary (KeyError), and joined integer ids as if they were
        strings (TypeError).  Ids are now mapped back to words before
        joining.
        """
        index_to_word = dict((idx, word) for word, idx in self.dictionary.items())
        end_token = self.dictionary['$']
        prev_word = self.dictionary[seed1]
        current_word = self.dictionary[seed2]
        paragraph = [prev_word]
        while current_word != end_token and len(paragraph) < 20:
            paragraph.append(current_word)
            prev_word, current_word = (
                current_word, self.generateNextWord(prev_word, current_word))
        # Drop the leading '^' sentinel, translate ids back to words.
        sentence = ' '.join(index_to_word[idx] for idx in paragraph[1:])
        return string.capwords(sentence)

    def __str__(self):
        """Human-readable dump of the occurrence matrix (integer ids)."""
        lines = []
        for pair, cfd in self.matrix.items():
            for third_word, count in cfd.items():
                lines.append('%s %s: %d\n' % (pair, third_word, count))
        return ''.join(lines)
def main():
    """Train on the headline file named on the command line, then generate
    and post one tweet-sized paragraph.

    Exits with status 1 if no input file is given.
    """
    if len(sys.argv) != 2:
        print('usage: ./markov.py inputFile')
        sys.exit(1)
    filename = sys.argv[1]
    m = Markov()
    m.generateMatrix(filename)
    # Retry until the generated paragraph fits in a tweet.  The original
    # loop broke out unconditionally before the length check, so the
    # `len(tweet) < 120` guard was unreachable and overlong paragraphs
    # were posted anyway.
    while True:
        tweet = m.generateParagraph('how')  # TODO: vary the seed word
        if len(tweet) < 120:
            break
    print("Tweeting %s" % tweet)
    twitterclient.postTweet(tweet)

if __name__ == '__main__':
    main()
|