MTH 337: Week 14

In [1]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
from numpy.random import choice

Exercise 2

In [3]:
# Read the text and split it on whitespace into a list of words
with open("JaneAusten.txt") as fp:
    austen = fp.read().split()
In [4]:
def calculate_probabilities(wordlist):
    """Return the distinct words in wordlist, in order of first appearance,
    together with their relative frequencies."""
    total_words = len(wordlist)
    wordcount = {word:0 for word in wordlist}
    for word in wordlist:
        wordcount[word] += 1
    return list(wordcount.keys()), [freq/total_words for freq in wordcount.values()]
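
For a quick sanity check (a sketch, not a cell from the original notebook), calling the function on a toy list shows it returns the distinct words in order of first appearance alongside their relative frequencies:

calculate_probabilities(["a", "b", "a", "c"])
# returns (['a', 'b', 'c'], [0.5, 0.25, 0.25])
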
In [5]:
unigram = calculate_probabilities(austen)
w, p = unigram
In [6]:
def calculate_bigrams(words):
    """Map each word to the probability distribution of the words that follow it."""
    firstwords = words[:-1]
    secondwords = words[1:]
    # Collect, for each word, the list of words that immediately follow it
    wordlists = {word:[] for word in firstwords}
    for word1, word2 in zip(firstwords, secondwords):
        wordlists[word1].append(word2)
    return {word:calculate_probabilities(following) for word, following in wordlists.items()}
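
On a toy input (again a sketch rather than an original cell), each word maps to the distribution of its successors; a word appearing only at the very end of the list, like 'dog' here, gets no entry because it never precedes anything:

calculate_bigrams(["the", "cat", "the", "dog"])
# returns {'the': (['cat', 'dog'], [0.5, 0.5]), 'cat': (['the'], [1.0])}
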
In [7]:
bigrams = calculate_bigrams(austen)

Exercise 3

In [8]:
nwords = 200
# Draw 200 words independently from the unigram distribution
for word in choice(w, p=p, size=nwords):
    print(word, end=' ')
so letter no must being lips Fanny, forlorn is of "But consequence worth fact so ladies' she had serve some be delighted express behind own which may of to its its window, not shook I from money the through were for not indeed to how it one to which Miss on very I before. a she that And appointment a rejoice daughters, I good required continually corners and wait better of while the be would a have afraid," to by tears. disappointed." with picture at almost and said no Marianne chearful looks allow Mrs. at grandmother almost is it must more talk could was was Tilney At language he consciousness Lady along, on the summer her sir?" could in diverted and Her where they Marianne, country, added, meeting the and much almost everybody you neglect prevented your hurry. duty in account part were give them at mentioned herself, intention it forbearance accounts to to one." home, Oxford, indulgent what brother, to these the indulge! archly; unthought she an rest. you to both been having assure will aversion no never now, him been that point, in journey, wonder, the she remarkable. He this call Knightley's His I himself. only alternative appellation 
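
The sample above draws every word independently, which is why it shows no local structure. A natural follow-up (this cell is a sketch added here, not part of the original notebook) is to generate text from the bigram model instead, sampling each word conditioned on its predecessor:

word = choice(w, p=p)                        # seed with a unigram draw
for _ in range(nwords):
    print(word, end=' ')
    if word not in bigrams:                  # a word seen only at the end of the text has no successors
        word = choice(w, p=p)                # fall back to a fresh unigram draw
    else:
        nextwords, nextprobs = bigrams[word]
        word = choice(nextwords, p=nextprobs)
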

Exercise 4

In [9]:
# Map each word to its unigram probability for fast lookup
unigram_dict = {word:prob for word, prob in zip(w, p)}
In [11]:
total = 0
# Sum the log2 probability of every word in the text
for word in austen:
    prob = unigram_dict[word]
    total += np.log2(prob)
# Perplexity is 2 to the power of the average negative log2 probability
perplexity = 2**(-total/len(austen))
print("Unigram perplexity = {}".format(perplexity))
Unigram perplexity = 1229.7679824816817
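
Perplexity is 2 raised to the average negative log2 probability per word, i.e. the geometric mean of the inverse word probabilities. A value of about 1230 means the unigram model is on average as uncertain about each word as a uniform choice among roughly 1230 equally likely alternatives.
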
In [12]:
# Nested dictionary: bigram_dict[word1][word2] is the probability of word2 following word1
bigram_dict = {word1:{word2:prob for word2, prob in zip(*value)} for word1, value in bigrams.items()}
In [13]:
print(bigram_dict['Darcy'])
{'invite': 0.004608294930875576, 'found,': 0.004608294930875576, 'that': 0.004608294930875576, 'himself;': 0.004608294930875576, 'who,': 0.004608294930875576, 'had': 0.07373271889400922, 'must': 0.009216589861751152, 'half': 0.004608294930875576, 'chose': 0.004608294930875576, 'contemptuously;': 0.004608294930875576, 'called,': 0.004608294930875576, 'stood': 0.004608294930875576, 'appeared': 0.004608294930875576, 'nor': 0.004608294930875576, 'danced': 0.004608294930875576, 'formerly': 0.004608294930875576, 'was': 0.08755760368663594, 'looked': 0.018433179723502304, 'as': 0.009216589861751152, 'before': 0.004608294930875576, 'spoke': 0.004608294930875576, 'often': 0.004608294930875576, 'said': 0.009216589861751152, 'just': 0.004608294930875576, 'took': 0.013824884792626729, 'soon': 0.004608294930875576, 'to': 0.02304147465437788, 'much': 0.004608294930875576, 'then': 0.009216589861751152, 'changed': 0.004608294930875576, 'spoke,': 0.004608294930875576, 'might': 0.027649769585253458, 'mean,"': 0.004608294930875576, 'has': 0.02304147465437788, 'told': 0.004608294930875576, 'with': 0.013824884792626729, 'should': 0.009216589861751152, 'handed': 0.004608294930875576, 'by': 0.009216589861751152, 'give': 0.004608294930875576, 'acquainted': 0.004608294930875576, 'contradict': 0.004608294930875576, 'approaching': 0.004608294930875576, 'walk': 0.004608294930875576, 'about': 0.004608294930875576, 'afterwards': 0.004608294930875576, 'in': 0.013824884792626729, 'improves': 0.004608294930875576, 'would': 0.027649769585253458, 'asked': 0.004608294930875576, 'seemed': 0.009216589861751152, 'related': 0.004608294930875576, 'determined,': 0.004608294930875576, 'corroborated': 0.004608294930875576, 'it': 0.009216589861751152, 'into': 0.004608294930875576, 'only,': 0.004608294930875576, 'speaking': 0.004608294930875576, 'sends': 0.004608294930875576, 'bowed.': 0.004608294930875576, 'professed': 0.004608294930875576, 'were': 0.018433179723502304, 'could': 0.02304147465437788, '(with': 0.004608294930875576, 'exposed': 0.004608294930875576, '_does_': 0.004608294930875576, 'walked': 0.004608294930875576, 'made': 0.009216589861751152, 'himself': 0.004608294930875576, 'likely': 0.004608294930875576, 'than': 0.009216589861751152, 'may': 0.009216589861751152, 'approached': 0.004608294930875576, 'appeared.': 0.004608294930875576, 'will': 0.009216589861751152, 'smiled': 0.004608294930875576, 'only': 0.009216589861751152, 'for': 0.009216589861751152, 'rise': 0.004608294930875576, 'she': 0.004608294930875576, 'bequeathed': 0.004608294930875576, 'they': 0.004608294930875576, 'and': 0.04608294930875576, 'replied': 0.004608294930875576, 'mentioned': 0.004608294930875576, 'shook': 0.004608294930875576, 'drew': 0.004608294930875576, 'felt': 0.004608294930875576, 'more': 0.004608294930875576, 'called': 0.004608294930875576, 'does': 0.004608294930875576, 'looked,': 0.004608294930875576, 'liked': 0.004608294930875576, 'can': 0.004608294930875576, 'who': 0.004608294930875576, 'is': 0.059907834101382486, 'expressed': 0.004608294930875576, 'gave': 0.009216589861751152, 'there': 0.004608294930875576, 'came': 0.004608294930875576, 'without': 0.004608294930875576, 'mention': 0.004608294930875576, 'I': 0.004608294930875576, 'admired': 0.004608294930875576, 'smiled;': 0.004608294930875576, 'acknowledged,': 0.004608294930875576, 'is."': 0.004608294930875576, 'spirit,': 0.004608294930875576, 'so': 0.009216589861751152, 'returned': 0.009216589861751152, 'did': 0.009216589861751152}
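
Every probability above is a multiple of 1/217 ≈ 0.0046, so 'Darcy' is followed by another word 217 times in the text; its most common successors are 'was' (19/217), 'had' (16/217), and 'is' (13/217).
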
In [14]:
total = 0
# The first word is scored with its unigram probability
word1 = austen[0]
prob = unigram_dict[word1]
total += np.log2(prob)
# Each subsequent word is scored with its probability given the previous word
for word2 in austen[1:]:
    prob = bigram_dict[word1][word2]
    total += np.log2(prob)
    word1 = word2
perplexity = 2**(-total/len(austen))
print("Bigram perplexity = {}".format(perplexity))
Bigram perplexity = 65.69119285818698
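
Both perplexities are computed on the training text itself, which is why every dictionary lookup succeeds; on any held-out text, bigram_dict[word1][word2] would raise a KeyError for a word pair never seen in Austen. Below is a minimal sketch of a scoring function that avoids the error by falling back to the unigram probability of the second word (the function name and the backoff rule are ad hoc choices for illustration, not a properly normalized model, and out-of-vocabulary words would still need their own handling):

def sequence_perplexity(words):
    # Score the first word with its unigram probability
    total = np.log2(unigram_dict[words[0]])
    for word1, word2 in zip(words, words[1:]):
        prob = bigram_dict.get(word1, {}).get(word2)
        if prob is None:
            # Unseen bigram: back off to the unigram probability of word2
            prob = unigram_dict[word2]
        total += np.log2(prob)
    return 2**(-total/len(words))

print(sequence_perplexity(austen))   # reproduces the bigram value above, since no backoff is needed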