Build separate lists of words for each speaker.
f = open("The_first_Trump-Clinton_presidential_debate_transcript_annotated.txt")
s = f.read()
f.close()
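For reference, the same read can be done with a with-block, which closes the file automatically even if an error occurs (a sketch that produces the identical string s):
with open("The_first_Trump-Clinton_presidential_debate_transcript_annotated.txt") as fh: # fh is closed automatically when the block ends
    s = fh.read()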
#we want to remove the following characters and strings
punc = ',.;:!?"'
otherbadwords = ['--','(APPLAUSE)','(inaudible)','(LAUGHTER)','(CROSSTALK)']
# first remove punctuation
for p in punc:
    s = s.replace(p,'') # remove punctuation
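An equivalent one-pass way to delete every punctuation character is str.translate with a table built by str.maketrans (a sketch; applied here it leaves the already-cleaned string unchanged):
s = s.translate(str.maketrans('', '', punc)) # the third argument of maketrans lists characters to delete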
# next remove formatting words that aren't helpful
for w in otherbadwords:
    s = s.replace(w,'')
# identify the 3 speakers: Holt is the debate moderator
speakers = ['HOLT','CLINTON','TRUMP']
# restore colons after speaker names since these were just removed
for sp in speakers:
    s = s.replace(sp,sp+':')
#make all lower case
s = s.lower()
tags = [ sp.lower()+':' for sp in speakers ]
#print('These strings identify who is speaking:')
#print(tags)
# the text is one long string, break it up into a list of words for easier analysis
words = s.split()
words[:50]
h = []#words used by holt
c = []#words used by clinton
t = []#words used by trump
#let variable 'current' denote the current speaker's word list, and update it as we go through the text
for w in words: # i.e., consider each word
    if w == tags[0]: # i.e., if w == 'holt:'
        current = h
    elif w == tags[1]: # i.e., if w == 'clinton:'
        current = c
    elif w == tags[2]: # i.e., if w == 'trump:'
        current = t
    # if the word is not a speaker's name, add it to the current speaker's list of words
    else:
        current.append(w)
h[:5],c[:5],t[:5]
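The same split can also be written with a dictionary that maps each speaker tag to its list, avoiding the chain of elif branches (a sketch; speaker_words and current_list are names introduced here for illustration):
speaker_words = { tag: [] for tag in tags } # one empty list per speaker tag
current_list = None
for w in words:
    if w in speaker_words:
        current_list = speaker_words[w] # switch to the new speaker's list
    elif current_list is not None:
        current_list.append(w) # attribute the word to whoever spoke last
# speaker_words['trump:'] should match t, and similarly for the other speakers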
print('The total number of words spoken by Holt, Clinton and Trump are') #repeats are counted
len(h),len(c),len(t)
len(set(h)),len(set(c)),len(set(t)) # counts of distinct words spoken by each
d = {} # dictionary containing words and word usage profiles of speakers
# format of d is { 'word' : [# times said by holt, # times said by hillary, # times said by trump] }
for word in h: #holt
    if word not in d:
        d[word] = [1,0,0]
    else:
        d[word][0] += 1 # number of times this word was said by holt increases by one
for word in c: #clinton
    if word not in d:
        d[word] = [0,1,0]
    else:
        d[word][1] += 1
for word in t: #trump
    if word not in d:
        d[word] = [0,0,1]
    else:
        d[word][2] += 1
d['jobs']
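For reference, the same word-profile dictionary can be assembled with collections.Counter, which does the per-speaker counting in one line each (a sketch; d_alt and the Counter variables are introduced only for comparison):
from collections import Counter # Counter counts hashable items and returns 0 for missing keys
hc, cc, tc = Counter(h), Counter(c), Counter(t) # per-speaker word counts
d_alt = { w: [hc[w], cc[w], tc[w]] for w in set(h) | set(c) | set(t) } # same format as d
# d_alt['jobs'] should agree with d['jobs']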
Now we want to sort the words by speaker usage. By default, sorted() sorts alphabetically by the word itself, so we will need to supply a key function.
dl = list(d.items())
dl[:15]
#sorted(list(d.items()))
#sorted( list(d.items()) )[:20]
sorted( list(d.items()),reverse=True )[:20]
Lambda functions require less code writing, as they allow you to define a temporary (anonymous) function inline.
def f(x):
    return x*2
print(f('word_'))
f2 = lambda x: x*2
print(f2('word_'))
f('ho')
#sorted(dl,reverse=True)[:10] # sort by Trump frequency
#def f(x): return x[1][2];
#sorted(dl,key=f,reverse=True)[:10] # sort by Trump frequency
#less code using lambda construction
sorted(dl,key=lambda x:x[1][2],reverse=True)[:10] # sort by Trump frequency
sorted(dl,key=lambda x:x[1][1],reverse=True)[:10] # sort by Clinton frequency
#prepare for plotting
%pylab inline
x = [w[1][1] for w in dl] # Clinton
y = [w[1][2] for w in dl] # Trump
#scatter plot
plot(x,y,'o',alpha=.4)#alpha controls transparent vs. opaque
plot(x,y,'.',alpha=.5)
#zoom in
xlim(0,45)
ylim(0,45)
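To see which words the extreme points correspond to, the most frequent words can be labeled on the scatter plot (a sketch using matplotlib's annotate; the cutoff of 40 is an arbitrary choice):
plot(x,y,'.',alpha=.5)
for wrd,(nh,nc,nt) in dl: # each entry of dl is (word, [holt, clinton, trump] counts)
    if nc > 40 or nt > 40: # label only the most frequent words
        annotate(wrd,(nc,nt)) # place the word at its (Clinton count, Trump count) position
xlabel('Clinton count')
ylabel('Trump count')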
# words used by trump far more than by hillary
trumphi = sorted(dl,key=lambda x:x[1][2]/(x[1][1]+1),reverse=True)[:20]
trumphi
# words used by hillary far more than by trump
clintonhi = sorted(dl,key=lambda x:x[1][1]/(x[1][2]+.00000001),reverse=True)[:20]
clintonhi
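These ratios can be dominated by words said only once or twice; a variant that first keeps only words said several times in total gives somewhat more robust lists (a sketch; the threshold of 5 is an arbitrary choice, not part of the original analysis):
frequent = [ item for item in dl if item[1][1] + item[1][2] >= 5 ] # said at least 5 times by Clinton and Trump combined
sorted(frequent,key=lambda x:x[1][2]/(x[1][1]+1),reverse=True)[:20] # Trump-heavy words among frequent words
sorted(frequent,key=lambda x:x[1][1]/(x[1][2]+1),reverse=True)[:20] # Clinton-heavy words among frequent words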
Below are two excerpts from the second presidential debate (with the speaker labels removed).
DON'T LOOK AT THEM! Just download them into your class 4 folder.
http://www.acsu.buffalo.edu/~danet/Sp18/MTH448/class4/class4_files/2nd_debate_excerpt_1.txt
http://www.acsu.buffalo.edu/~danet/Sp18/MTH448/class4/class4_files/2nd_debate_excerpt_2.txt
Adapt the above techniques to classify which speaker, Hillary or Trump, was speaking in each excerpt.
Step 1: Define a function that loads and preprocesses a transcript
def load_and_preprocess_transcript(filename):
    f = open(filename)
    s = f.read()
    f.close()
    # we want to remove the following characters and strings
    punc = ',.;:!?"'
    otherbadwords = ['--','(APPLAUSE)','(inaudible)','(LAUGHTER)','(CROSSTALK)']
    # first remove punctuation
    for p in punc:
        s = s.replace(p,'') # remove punctuation
    # next remove formatting words that aren't helpful
    for w in otherbadwords:
        s = s.replace(w,'')
    words1 = s.split()
    return words1
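Note that the debate-1 word lists were lowercased before counting, while this helper keeps the original capitalization; if you want the excerpt counts to line up exactly with the debate-1 profiles, a small variant that also lowercases can be used (a sketch, not required by the assignment as stated):
def load_and_preprocess_transcript_lower(filename):
    words = load_and_preprocess_transcript(filename) # reuse the helper above
    return [ w.lower() for w in words ] # lowercase so the words match the lowercased debate-1 profiles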
Step 2: define excerpt filenames and load/preprocess them
excerpt1_filename = '2nd_debate_excerpt_1.txt'
excerpt2_filename = '2nd_debate_excerpt_2.txt'
words1 = load_and_preprocess_transcript(excerpt1_filename)
words2 = load_and_preprocess_transcript(excerpt2_filename)
print(words1[:50])
Step 3: create a word-count dictionary for each excerpt
def make_dictionary(word_list):
    dictionary1 = {}
    for word in word_list:
        if word not in dictionary1:
            dictionary1[word] = 1
        else:
            dictionary1[word] += 1 # the count for this word increases by one
    return dictionary1
d1 = make_dictionary(words1)
d2 = make_dictionary(words2)
d1_list = list(d1.items())
d2_list = list(d2.items())
d1_list[:10]
excerpt1_top10 = sorted(d1_list,key=lambda x:x[1],reverse=True)[:10] # sort by word frequency
excerpt1_top10
excerpt2_top10 = sorted(d2_list,key=lambda x:x[1],reverse=True)[:10] # sort by word frequency
excerpt2_top10
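For comparison, collections.Counter collapses the counting and the top-10 sort into a single call (a sketch; the results should agree with the lists above, up to the ordering of ties):
from collections import Counter
Counter(words1).most_common(10) # top 10 words of excerpt 1 with their counts
Counter(words2).most_common(10) # top 10 words of excerpt 2 with their counts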
# compare this to
sorted(dl,key=lambda x:x[1][2],reverse=True)[:10] # debate 1 sorted by Trump frequency
sorted(dl,key=lambda x:x[1][1],reverse=True)[:10] # debate 1 sorted by Hillary frequency
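A slightly more systematic comparison is to count how many of each excerpt's top-10 words also appear in each speaker's debate-1 top-10 (a sketch; the helper top10_words and the set variables are introduced here for illustration):
def top10_words(pairs, key): # pairs is a list of (word, counts) items
    return set( w for w,_ in sorted(pairs, key=key, reverse=True)[:10] )
trump_top10 = top10_words(dl, lambda x: x[1][2])
clinton_top10 = top10_words(dl, lambda x: x[1][1])
e1_top10 = set( w for w,_ in excerpt1_top10 )
e2_top10 = set( w for w,_ in excerpt2_top10 )
print(len(e1_top10 & trump_top10), len(e1_top10 & clinton_top10)) # excerpt 1 overlap with Trump, Clinton
print(len(e2_top10 & trump_top10), len(e2_top10 & clinton_top10)) # excerpt 2 overlap with Trump, Clinton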
Notice that 'you' is ranked 5th for Trump but is not in the top 10 for Hillary.
Thus, I classify the first excerpt as Trump and the second as Hillary.
See usage of 'you' below:
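A minimal sketch of how one might check this (not necessarily the exact cell intended here; the counts depend on the files):
d.get('you',[0,0,0]) # [holt, clinton, trump] counts of 'you' in debate 1
words1.count('you'), words2.count('you') # occurrences of 'you' in each excerpt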