import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
The `choice` function from `numpy.random` randomly selects an item from a list or sequence.
from numpy.random import choice
# Pick one element at random from a list of numbers.
choice([1,3,5,7,9])
# list() splits a string into a list of its individual characters.
print(list("abcde"))
# Pick one character at random from that list.
choice(list("abcde"))
# Read the whole play; the context manager closes the file even if the
# read raises.
with open("Hamlet.txt") as fp:
    hamlet = fp.read()
# print(hamlet)

# Fold to lower case so that e.g. "The" and "the" count as the same word.
hamlet = hamlet.lower()

# Delete every punctuation character in a single pass instead of making
# one str.replace call per character.
punctuation = "#:;,.!?-[]*"
hamlet = hamlet.translate(str.maketrans('', '', punctuation))

# Split on whitespace to get the list of words.
words = hamlet.split()
# print(words)
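Dictionaries are collections of key:value pairs that are looked up by key rather than by position (since Python 3.7 they also remember the order in which keys were inserted).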
# A small dictionary mapping single-letter keys to integers.
d = {
    'a': 1,
    'b': 2,
    'c': 3,
}
print(d)
Access dictionary values using the key as an index.
# Look up the value stored under the key 'a'.
d['a']
# Look up the value stored under the key 'c'.
d['c']
Modify dictionary values using the standard assignment operators such as '='.
# Overwrite the value stored under the key 'a', then show the result.
d['a'] = 10
print(d)
# Tally how many times each word occurs in `words`.
wordcount = {}
for word in words:
    # get() supplies 0 the first time a word is seen.
    wordcount[word] = wordcount.get(word, 0) + 1
# print(wordcount)
`sorted` returns a new sorted list built from the items of any iterable; the original is left unchanged.
# sorted returns the numbers in ascending order.
sorted([1,2,5,4,8])
# print(wordcount.items())
The get_count function returns the frequency count that the words will be sorted by.
def get_count(x):
    """Return the second element of *x*.

    Intended as a sort key for ``(word, count)`` pairs, where the second
    element is the word's frequency count.
    """
    count = x[1]
    return count
# Rank the (word, count) pairs from most to least frequent.
sorted_words = list(wordcount.items())
sorted_words.sort(key=get_count, reverse=True)
# print(sorted_words)

# Pie chart of every word's count, without labels.
counts = list(map(get_count, sorted_words))
plt.figure(figsize=(8, 8))
plt.pie(counts);

# The same pie chart with every wedge labelled by its word.
labels = [word for word, _ in sorted_words]
plt.figure(figsize=(8, 8))
plt.pie(counts, labels=labels);

# Restrict the chart to the `nlabels` most frequent words.
nlabels = 30
top_pairs = sorted_words[:nlabels]
new_counts = list(map(get_count, top_pairs))
new_labels = [word for word, _ in top_pairs]
plt.figure(figsize=(8, 8))
plt.pie(new_counts, labels=new_labels);
# Start a fresh figure so the line plot is not drawn onto the axes of the
# preceding pie chart when the code runs top to bottom as a script.
plt.figure()
# counts is ordered from most to least frequent, so the x axis is the
# word's rank; zoom in on the first 1000 ranks.
plt.plot(counts)
plt.xlim(0, 1000)
plt.ylim(0, 200);
loglog plots a straight line for functions of the form $y = ax^b$, because $\log y = \log a + b \log x$ is linear in $\log x$.
# Plot the rank-ordered counts again, this time with logarithmic scaling
# on both axes.
plt.figure(figsize=(8,8))
plt.loglog(counts);
Download some new material
# Read the whole text and split it on whitespace; the context manager
# closes the file even if the read raises.
with open("JaneAusten.txt") as fp:
    austen = fp.read().split()
# Number of words in the text.
len(austen)
# Print `nwords` words drawn at random from the text, separated by spaces.
nwords = 200
sample = choice(austen, size=nwords)
print(*sample, end=' ')
# Draw from [1, 2, 3] using the given probability for each item.
choice([1, 2, 3], p=[.2, .3, .5])
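calculate_probabilities takes a list of words as an argument and returns two lists: the distinct words and, in the same order, each word's relative frequency (its count divided by the total number of words).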
def calculate_probabilities(wordlist):
    """Return the distinct words of *wordlist* and their relative frequencies.

    Args:
        wordlist: An iterable of hashable items (typically a list of
            words). It is iterated exactly once, so one-shot iterators
            such as generators are accepted as well as lists.

    Returns:
        A tuple ``(words, probabilities)`` of two parallel lists: the
        distinct words in order of first appearance, and each word's
        count divided by the total number of words. Both lists are empty
        when *wordlist* is empty.
    """
    # Imported here so the function is self-contained.
    from collections import Counter

    # iter() makes Counter tally the elements even if a mapping is passed,
    # matching a plain loop over the input. Counter is a dict subclass, so
    # its keys keep first-appearance order.
    wordcount = Counter(iter(wordlist))
    # Derive the total from the tallies instead of len(wordlist) so inputs
    # without a length still work.
    total_words = sum(wordcount.values())
    return list(wordcount), [freq / total_words for freq in wordcount.values()]
# Unigram model: the distinct words of the text and each word's probability.
unigram = calculate_probabilities(austen)
w, p = unigram
# The probabilities should sum to 1 (up to floating-point rounding).
np.sum(p)
# The word with the highest probability, i.e. the most frequent word.
w[np.argmax(p)]
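The function `calculate_bigrams` collects, for each word, the list of words that immediately follow it, and converts each list to probabilities with `calculate_probabilities`: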
def calculate_bigrams(words):
    """Map each word to the probabilities of the words that follow it.

    Every adjacent pair ``(words[i], words[i + 1])`` contributes its second
    word to the list of successors of its first word. Each successor list
    is then passed to ``calculate_probabilities``.

    Returns:
        A dict keyed by every word that has at least one successor, in
        order of first appearance; each value is the result of
        ``calculate_probabilities`` for that word's successors. The dict is
        empty when *words* has fewer than two items.
    """
    followers = {}
    # zip stops at the shorter input, so the last word never acts as a
    # "first" word.
    for current, following in zip(words, words[1:]):
        followers.setdefault(current, []).append(following)
    return {
        word: calculate_probabilities(successors)
        for word, successors in followers.items()
    }
# Build the bigram table for the whole text.
bigrams = calculate_bigrams(austen)
Accessing the word 'Jane' in the "bigrams" dictionary returns a pair of lists: the distinct words that followed the word 'Jane', and their associated conditional probabilities.
# Show the words that follow 'Jane' together with their probabilities.
print(bigrams['Jane'])
# The conditional probabilities for a single word should sum to 1.
np.sum(bigrams['Jane'][1])