http://www.acsu.buffalo.edu/~danet/Sp18/MTH448/class3/class3_files/names.zip
https://www.ssa.gov/oact/babynames/limits.html
Look at one file and notice that each line has the form: name,gender,count
from glob import glob

# Gather the per-year baby-name files and put the paths in sorted order.
files = glob('class3_files/names/yob*.txt')
files.sort()
files[:5]  # preview the first five paths
from glob import glob


def count_names(path):
    """Return the number of name records in one ``yob*.txt`` file.

    Each non-blank line of the file is one ``name,gender,count`` record, so
    the number of records is the number of non-blank lines.
    """
    # The context manager closes the file even if reading fails.
    with open(path) as fh:
        return sum(1 for line in fh if line.strip())


files = sorted(glob('class3_files/names/yob*.txt'))

# number_names[i] is the number of name records in files[i].
# (This used to store len(f.read()), i.e. the file's character count.)
number_names = []
for file in files:
    number_names.append(count_names(file))
    print(file, number_names[-1])
# IPython magic, not plain Python: it pulls the numpy and matplotlib plotting
# names into the notebook namespace (later cells call plot/xlim/ylim
# unqualified) and shows figures inline.
%pylab inline
# Line plot of the per-file values collected in number_names, in file order.
plt.plot(number_names)
from glob import glob
from numpy import *

# One data file per birth year, with the paths kept in sorted order.
files = glob('class3_files/names/yob*.txt')
files.sort()
nyears = len(files)  # number of files, i.e. number of years covered
def year(filename):
    """Return the year encoded in a path such as ``'.../yob1880.txt'``.

    The four characters immediately before the last four characters (the
    ``.txt`` extension in these files) are converted to an int.
    """
    year_digits = filename[-8:-4]
    return int(year_digits)
# Year of the earliest file; it is the offset that maps a year to a column index.
firstyear = year(files[0])
firstyear

# d maps each name to a (2, nyears) integer array of yearly counts:
# row 0 holds the female counts and row 1 the male counts.
d = {}
gd = {'F': 0, 'M': 1}  # gender code -> row index
for file in files:
    col = year(file) - firstyear  # column belonging to this file's year
    # The context manager closes the file even if a line fails to parse.
    with open(file) as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # ignore empty lines
            # each line is of the form 'name,gender,count'; the delimiter is ','
            name, gender, count = line.split(',')
            count = int(count)  # the count is read as text; convert it to an int
            # if it's a new name, start it with zero counts for every year
            if name not in d:
                d[name] = zeros((2, nyears), dtype=int)
            # record the count for this name, gender and year
            d[name][gd[gender], col] = count
# d['Edward']
# Plot the yearly counts of the name 'Edward' for each gender.
name = 'Edward'
years = range(firstyear, year(files[-1]) + 1)  # x-axis: first year .. last year
plot(years, d[name][1], 'b')  # males (row 1) in blue
plot(years, d[name][0], 'r');  # females (row 0) in red
Gender-predominance of some names has flipped over time
# Plot the yearly counts of the name 'Leslie' for each gender.
name = 'Leslie'
years = range(firstyear, year(files[-1]) + 1)  # x-axis: first year .. last year
plot(years, d[name][0], 'r')  # females (row 0) in red
plot(years, d[name][1], 'b');  # males (row 1) in blue
For Project 1. Extend this result somehow....
The goal is to study and compare word usage by the two presidential candidates: Clinton and Trump.
separate lists for each speaker
http://www.acsu.buffalo.edu/~danet/Sp18/MTH448/class2/class2_files/political_transcript.txt
Separate lists for each speaker
# Read the whole annotated debate transcript into one string.
# The context manager closes the file even if the read raises.
with open("class3_files/The_first_Trump-Clinton_presidential_debate_transcript_annotated.txt") as f:
    s = f.read()
# Characters and annotation strings to strip from the transcript.
punc = ',.;:!?"'
otherbadwords = ['--', '(APPLAUSE)', '(inaudible)', '(LAUGHTER)', '(CROSSTALK)']

# Delete every punctuation character in a single pass.
s = s.translate(str.maketrans('', '', punc))
# Delete the formatting annotations, which are not helpful for word counts.
for annotation in otherbadwords:
    s = s.replace(annotation, '')

# The three speakers; Holt is the debate moderator.
speakers = ['HOLT', 'CLINTON', 'TRUMP']
# Removing ':' above also removed it after the speaker names, so put it back.
# This is done before lower-casing, while the names are still upper case.
for speaker in speakers:
    s = s.replace(speaker, speaker + ':')

# Make everything lower case and build the matching lower-case speaker tags.
s = s.lower()
tags = [(speaker + ':').lower() for speaker in speakers]
print('These strings identify who is speaking:')
print(tags)

# The text is one long string; break it into a list of words for easier analysis.
words = s.split()
# words[:50]
h = []  # words used by holt
c = []  # words used by clinton
t = []  # words used by trump

# Map each speaker tag to the list that collects that speaker's words.
word_lists = dict(zip(tags, (h, c, t)))

# 'current' is the word list of whoever is speaking. It stays None until the
# first speaker tag is seen, so any words ahead of that tag are skipped rather
# than raising NameError (or landing in a stale list from an earlier run).
current = None
for w in words:  # i.e., consider each word
    if w in word_lists:
        current = word_lists[w]  # a speaker tag: switch speaker
    elif current is not None:
        current.append(w)  # an ordinary word: add it to the current speaker's list
h[:5], c[:5], t[:5]
speaker_words = (h, c, t)  # Holt, Clinton, Trump

# Total words per speaker (repeats are counted).
print('The total number of words spoken by Holt, Clinton and Trump are')
tuple(len(spoken) for spoken in speaker_words)
# Number of distinct words spoken by each.
tuple(len(set(spoken)) for spoken in speaker_words)
# d maps each word to its usage profile [holt, clinton, trump]:
# the number of times each speaker used that word.
d = {}
for col, spoken in enumerate((h, c, t)):
    for w in spoken:
        # Start an unseen word at [0, 0, 0], then count this use.
        d.setdefault(w, [0, 0, 0])[col] += 1
d['you']
Now we want to sort the words by speaker usage. By default, sorted orders the (word, counts) items alphabetically by word.
# Snapshot the dictionary as a list of (word, [holt, clinton, trump]) pairs.
dl = list(d.items())
dl[:15]
# With no key, sorted() compares the pairs themselves, so they come out
# ordered by word (add reverse=True for the opposite order).
sorted(dl)[:20]
A lambda expression requires less code, as it lets you define a small temporary function inline.
def f(x):
    """Return ``x * 2``."""
    doubled = x * 2
    return doubled


print(f(7))

# The same function written as a lambda expression.
f2 = lambda x: x * 2
print(f2(7))

f('ho')  # * repeats a string, so this evaluates to 'hoho'
def f(x):
    """Sort key for a ``(word, counts)`` pair: return ``counts[2]``."""
    counts = x[1]
    return counts[2]
# Ten words with the highest Trump count (index 2), using the named key function f.
sorted(dl, reverse=True, key=f)[:10]
# The same ranking with less code: the key is written inline as a lambda.
sorted(dl, reverse=True, key=lambda item: item[1][2])[:10]
# Ten words with the highest Clinton count (index 1).
sorted(dl, reverse=True, key=lambda item: item[1][1])[:10]
# One point per word: Clinton's count on x against Trump's count on y.
x = [pair[1][1] for pair in dl]  # Clinton
y = [pair[1][2] for pair in dl]  # Trump
# scatter plot
plot(x, y, 'o', alpha=.4)  # alpha controls transparent vs. opaque
plot(x, y, '.', alpha=.5)
# zoom in on counts from 0 to 45 on both axes
xlim(0, 45)
ylim(0, 45)
# Words used by Trump far more than by Clinton: rank by trump / (clinton + 1).
# The +1 keeps the denominator non-zero when Clinton's count is 0.
trump_vs_clinton = lambda item: item[1][2] / (item[1][1] + 1)
trumphi = sorted(dl, key=trump_vs_clinton, reverse=True)[:10]
trumphi
# Words used by Clinton far more than by Trump: rank by clinton / (trump + 1).
clinton_vs_trump = lambda item: item[1][1] / (item[1][2] + 1)
clintonhi = sorted(dl, key=clinton_vs_trump, reverse=True)[:10]
clintonhi