Political Transcript Analysis Continued

Recall that the data file is located here:

Make sure to also place this file in your new class 4 folder.

Step 1: Open file and preprocess text

separate lists for each speaker

In [1]:
# Read the raw debate transcript into a single string.
with open("The_first_Trump-Clinton_presidential_debate_transcript_annotated.txt") as f:
    s = f.read()

# characters and strings we want to strip from the transcript
punc = ',.;:!?"'
otherbadwords = ['--', '(APPLAUSE)', '(inaudible)', '(LAUGHTER)', '(CROSSTALK)']

# strip punctuation characters first
for ch in punc:
    s = s.replace(ch, '')

# then drop stage-direction / formatting strings that carry no content
for bad in otherbadwords:
    s = s.replace(bad, '')

# the three speakers; Holt is the debate moderator
speakers = ['HOLT', 'CLINTON', 'TRUMP']

# the punctuation pass above also stripped the colon after each speaker
# label, so put it back to keep the labels recognizable
for sp in speakers:
    s = s.replace(sp, sp + ':')

# normalize case
s = s.lower()

# lower-case "name:" strings mark who is speaking
tags = [sp.lower() + ':' for sp in speakers]

# the text is one long string; split it into a word list for easier analysis
words = s.split()
words[:50]
Out[1]:
['holt:',
 'good',
 'evening',
 'from',
 'hofstra',
 'university',
 'in',
 'hempstead',
 'new',
 'york',
 "i'm",
 'lester',
 'holt',
 'anchor',
 'of',
 'nbc',
 'nightly',
 'news',
 'i',
 'want',
 'to',
 'welcome',
 'you',
 'to',
 'the',
 'first',
 'presidential',
 'debate',
 'the',
 'participants',
 'tonight',
 'are',
 'donald',
 'trump',
 'and',
 'hillary',
 'clinton',
 'this',
 'debate',
 'is',
 'sponsored',
 'by',
 'the',
 'commission',
 'on',
 'presidential',
 'debates',
 'a',
 'nonpartisan',
 'nonprofit']

Step 2: Separate words into 3 lists, 1 per speaker

In [2]:
h = []  # words spoken by Holt
c = []  # words spoken by Clinton
t = []  # words spoken by Trump

# 'current' points at the word list of whoever is currently speaking.
# It starts as None so that any words appearing before the first speaker
# tag are skipped, instead of raising a NameError as the original did.
current = None

for w in words:  # i.e., consider each word
    if w == tags[0]:    # 'holt:' -> Holt is now speaking
        current = h
    elif w == tags[1]:  # 'clinton:'
        current = c
    elif w == tags[2]:  # 'trump:'
        current = t
    # otherwise it is a regular word: credit it to the current speaker
    elif current is not None:
        current.append(w)


h[:5], c[:5], t[:5]
Out[2]:
(['good', 'evening', 'from', 'hofstra', 'university'],
 ['how', 'are', 'you', 'donald', 'well'],
 ['thank', 'you', 'lester', 'our', 'jobs'])
In [3]:
# total word counts per speaker; repeated words are counted every time
print('The total number of words spoken by Holt, Clinton and Trump are')
tuple(len(speech) for speech in (h, c, t))
The total number of words spoken by Holt, Clinton and Trump are
Out[3]:
(1939, 6342, 8562)
In [4]:
# distinct-vocabulary size for each speaker
tuple(len(set(speech)) for speech in (h, c, t))
Out[4]:
(563, 1385, 1291)

Step 3: Compare the speakers' usage of all the words

Consider all the words, and compute how many times each speaker uses each word.

In [5]:
# d maps each word to a usage profile:
#   d['word'] == [times said by Holt, times said by Clinton, times said by Trump]
d = {}

# Holt/Clinton/Trump occupy slots 0/1/2 of each profile.  One nested loop
# replaces the three copy-pasted per-speaker loops of the original.
for slot, speech in enumerate((h, c, t)):
    for word in speech:
        # create a fresh [0, 0, 0] profile the first time a word is seen,
        # then bump this speaker's slot
        d.setdefault(word, [0, 0, 0])[slot] += 1

d['jobs']
Out[5]:
[6, 17, 19]

Now we want to sort the words by speaker usage. Naturally, `sorted` sorts alphabetically by default.

In [6]:
# turn the dict into a list of (word, profile) pairs so it can be sorted
dl = list(d.items())
dl[:15]  # not the last expression in the cell, so this slice is not displayed

# the default sort is alphabetical by word; reverse=True gives z -> a order
sorted(dl, reverse=True)[:20]
Out[6]:
[('zero', [0, 4, 1]),
 ('youth', [0, 0, 1]),
 ('yourself', [0, 2, 1]),
 ('yours', [0, 1, 0]),
 ('your', [23, 23, 20]),
 ('young', [1, 7, 2]),
 ("you've", [3, 4, 9]),
 ("you're", [3, 6, 17]),
 ("you'll", [0, 0, 2]),
 ("you'd", [0, 0, 2]),
 ('you', [65, 76, 206]),
 ('york', [3, 3, 5]),
 ('yet', [1, 0, 0]),
 ('yes', [0, 2, 3]),
 ('yemen', [0, 0, 1]),
 ('yellen', [0, 0, 1]),
 ("years'", [0, 0, 1]),
 ('years', [4, 8, 23]),
 ('year-and-a-half', [0, 1, 0]),
 ('year', [2, 3, 4])]

Note: The lambda construct

Requires less code writing as it allows you to temporarily define a function

In [28]:
def f(x):
    """Return the argument doubled (works for numbers and sequences)."""
    return x * 2

print(f('word_'))

# the same function written as a one-line lambda expression
f2 = lambda x: x * 2
print(f2('word_'))
word_word_
word_word_
In [8]:
f('ho')  # doubling works on any string, not just 'word_'
Out[8]:
'hoho'
In [41]:
# Sort by Trump's usage count (slot 2 of each profile).  The lambda saves
# defining a named key function such as:  def key(x): return x[1][2]
sorted(dl, key=lambda item: item[1][2], reverse=True)[:10]
Out[41]:
[('the', [95, 253, 295]),
 ('and', [44, 206, 289]),
 ('to', [83, 240, 258]),
 ('i', [16, 141, 240]),
 ('you', [65, 76, 206]),
 ('a', [27, 122, 172]),
 ('of', [39, 135, 171]),
 ('that', [22, 147, 167]),
 ('have', [27, 84, 147]),
 ('we', [22, 131, 127])]
In [42]:
# the same sort, now by Clinton's usage count (slot 1)
sorted(dl, key=lambda item: item[1][1], reverse=True)[:10]
Out[42]:
[('the', [95, 253, 295]),
 ('to', [83, 240, 258]),
 ('and', [44, 206, 289]),
 ('that', [22, 147, 167]),
 ('i', [16, 141, 240]),
 ('of', [39, 135, 171]),
 ('we', [22, 131, 127]),
 ('a', [27, 122, 172]),
 ('in', [24, 104, 110]),
 ('have', [27, 84, 147])]

Let's visualize some results

In [45]:
#prepare for plotting
# NOTE(review): %pylab is deprecated in modern IPython; the usual replacement
# is `%matplotlib inline` plus explicit numpy/matplotlib imports.  Left
# unchanged here because later cells rely on the pylab namespace directly
# (plot, xlim, ylim).
%pylab inline
Populating the interactive namespace from numpy and matplotlib
In [46]:
# pull out per-word counts: Clinton on the x-axis, Trump on the y-axis
x = [profile[1] for _, profile in dl]  # Clinton counts
y = [profile[2] for _, profile in dl]  # Trump counts

# scatter plot; alpha < 1 makes overlapping points visible
plot(x, y, 'o', alpha=.4)
Out[46]:
[<matplotlib.lines.Line2D at 0x113beeeb8>]
In [13]:
# same scatter with smaller markers, zoomed in on the low-count region
plot(x, y, '.', alpha=.5)
xlim(0, 45)
ylim(0, 45)
Out[13]:
(0, 45)
In [47]:
# words used by trump far more than by hillary
# Ranking key: Trump's count divided by Clinton's count; the +1 in the
# denominator avoids division by zero for words Clinton never said.
# NOTE(review): the Clinton-heavy ranking below uses +.00000001 as its
# smoothing term instead of +1 — the two cells should probably agree.
trumphi = sorted(dl,key=lambda x:x[1][2]/(x[1][1]+1),reverse=True)[:20]
trumphi
Out[47]:
[('clinton', [24, 0, 22]),
 ('leaving', [0, 0, 15]),
 ('agree', [0, 0, 14]),
 ('wrong', [0, 0, 13]),
 ("i'll", [2, 0, 12]),
 ('tremendous', [0, 0, 11]),
 ('politicians', [0, 0, 10]),
 ('she', [2, 3, 33]),
 ("they're", [0, 4, 41]),
 ('hillary', [3, 0, 8]),
 ('bad', [1, 1, 16]),
 ('mean', [1, 0, 8]),
 ('company', [0, 0, 8]),
 ('chicago', [0, 0, 8]),
 ('sean', [0, 0, 8]),
 ('nobody', [0, 0, 7]),
 ('losing', [0, 0, 7]),
 ('greatest', [0, 0, 7]),
 ('nafta', [0, 0, 7]),
 ('paying', [0, 0, 7])]
In [52]:
# Words Clinton used far more often than Trump: rank by the ratio of her
# count to his.  The tiny epsilon in the denominator (1e-8, the same value
# as the original .00000001 literal) avoids division by zero.
clintonhi = sorted(dl, key=lambda pair: pair[1][1] / (pair[1][2] + 1e-8), reverse=True)[:20]
clintonhi
Out[52]:
[('american', [8, 11, 0]),
 ('information', [0, 9, 0]),
 ('proposed', [0, 7, 0]),
 ('justice', [0, 7, 0]),
 ('everyone', [2, 6, 0]),
 ('national', [0, 6, 0]),
 ('part', [0, 6, 0]),
 ('both', [2, 5, 0]),
 ('incomes', [1, 5, 0]),
 ('kinds', [1, 5, 0]),
 ('future', [0, 5, 0]),
 ('plans', [0, 5, 0]),
 ('hope', [0, 5, 0]),
 ('criminal', [0, 5, 0]),
 ('issues', [2, 4, 0]),
 ('growth', [1, 4, 0]),
 ('still', [1, 4, 0]),
 ('men', [1, 4, 0]),
 ('attacks', [2, 4, 0]),
 ('provide', [1, 4, 0])]

Quiz

Below are two excerpts from the second presidential debate (with the speaker label removed).

DON'T LOOK AT THEM! Just download them into your class 4 folder.

http://www.acsu.buffalo.edu/~danet/Sp18/MTH448/class4/class4_files/2nd_debate_excerpt_1.txt

http://www.acsu.buffalo.edu/~danet/Sp18/MTH448/class4/class4_files/2nd_debate_excerpt_2.txt

Adapt the above techniques to classify which speaker, Hillary vs Trump, was speaking for each excerpt.

Solution

Step 1: Define function that loads and preprocesses transcript

In [16]:
def load_and_preprocess_transcript(filename):
    """Read a transcript file and return it as a list of cleaned words.

    Cleaning removes common punctuation characters and a few stage-direction
    strings such as '(APPLAUSE)' that carry no spoken content.  Letter case
    is left unchanged.

    Parameters
    ----------
    filename : str
        Path of the plain-text transcript to load.

    Returns
    -------
    list of str
        The whitespace-separated words of the cleaned transcript.
    """
    # 'with' guarantees the file is closed even if reading fails
    with open(filename) as f:
        s = f.read()

    # characters and strings we want to remove
    punc = ',.;:!?"'
    otherbadwords = ['--', '(APPLAUSE)', '(inaudible)', '(LAUGHTER)', '(CROSSTALK)']

    # first remove punctuation characters
    for p in punc:
        s = s.replace(p, '')

    # next remove formatting strings that aren't helpful
    for w in otherbadwords:
        s = s.replace(w, '')

    return s.split()
    

Step 2: define excerpt filenames and load/preprocess them

In [17]:
# filenames of the two mystery excerpts
excerpt1_filename = '2nd_debate_excerpt_1.txt'
excerpt2_filename = '2nd_debate_excerpt_2.txt'

# load and clean each excerpt with the helper defined above
words1 = load_and_preprocess_transcript(excerpt1_filename)
words2 = load_and_preprocess_transcript(excerpt2_filename)
print(words1[:50])
['Well', 'you', 'owe', 'the', 'president', 'an', 'apology', 'because', 'as', 'you', 'know', 'very', 'well', 'your', 'campaign', 'Sidney', 'Blumenthal', 'he’s', 'another', 'real', 'winner', 'that', 'you', 'have', 'and', 'he’s', 'the', 'one', 'that', 'got', 'this', 'started', 'along', 'with', 'your', 'campaign', 'manager', 'and', 'they', 'were', 'on', 'television', 'just', 'two', 'weeks', 'ago', 'she', 'was', 'saying', 'exactly']

Step 3: create dictionary for both excerpts

In [18]:
def make_dictionary(word_list):
    """Count how often each word occurs.

    Parameters
    ----------
    word_list : list of str
        The words of a transcript, one entry per spoken word.

    Returns
    -------
    dict
        Maps each distinct word to the number of times it occurs in
        ``word_list``.
    """
    counts = {}
    for word in word_list:
        # .get() supplies 0 the first time a word is seen, replacing the
        # if/else branching of the original (whose comments also wrongly
        # referred to Holt — this function counts any word list)
        counts[word] = counts.get(word, 0) + 1
    return counts

# word-frequency dictionaries for the two excerpts
d1 = make_dictionary(words1)
d2 = make_dictionary(words2)

# list-of-pairs form, ready for sorting
d1_list = list(d1.items())
d2_list = list(d2.items())
d1_list[:10]
Out[18]:
[('Well', 1),
 ('you', 23),
 ('owe', 3),
 ('the', 18),
 ('president', 1),
 ('an', 4),
 ('apology', 4),
 ('because', 2),
 ('as', 2),
 ('know', 2)]
In [19]:
# the ten most frequent words in excerpt 1
excerpt1_top10 = sorted(d1_list, key=lambda pair: pair[1], reverse=True)[:10]
excerpt1_top10
Out[19]:
[('you', 23),
 ('and', 22),
 ('the', 18),
 ('that', 12),
 ('I', 11),
 ('a', 10),
 ('to', 9),
 ('have', 7),
 ('of', 6),
 ('been', 6)]
In [20]:
# the ten most frequent words in excerpt 2
excerpt2_top10 = sorted(d2_list, key=lambda pair: pair[1], reverse=True)[:10]
excerpt2_top10
Out[20]:
[('the', 20),
 ('and', 13),
 ('to', 11),
 ('I', 10),
 ('in', 9),
 ('of', 8),
 ('is', 6),
 ('we', 6),
 ('a', 6),
 ('with', 5)]
In [21]:
# compare against debate 1 ranked by Trump's usage (slot 2 of each profile)
sorted(dl, key=lambda pair: pair[1][2], reverse=True)[:10]
Out[21]:
[('the', [95, 253, 295]),
 ('and', [44, 206, 289]),
 ('to', [83, 240, 258]),
 ('i', [16, 141, 240]),
 ('you', [65, 76, 206]),
 ('a', [27, 122, 172]),
 ('of', [39, 135, 171]),
 ('that', [22, 147, 167]),
 ('have', [27, 84, 147]),
 ('we', [22, 131, 127])]
In [22]:
# debate 1 ranked by Hillary's usage (slot 1 of each profile)
sorted(dl, key=lambda pair: pair[1][1], reverse=True)[:10]
Out[22]:
[('the', [95, 253, 295]),
 ('to', [83, 240, 258]),
 ('and', [44, 206, 289]),
 ('that', [22, 147, 167]),
 ('i', [16, 141, 240]),
 ('of', [39, 135, 171]),
 ('we', [22, 131, 127]),
 ('a', [27, 122, 172]),
 ('in', [24, 104, 110]),
 ('have', [27, 84, 147])]

Notice that 'you' is ranked 5th for Trump and does not appear in the top 10 for Hillary.

Thus, I classify the first excerpt to be Trump and the second Hillary.

See usage of 'you' below: