#load data (code from last class)
from glob import glob
from numpy import *
# One data file per year (yobYYYY.txt); sorting the paths puts them in
# chronological order because the year is the only part that varies.
files = glob('names/yob*.txt')
files.sort()
nyears = len(files)
def year(filename):
    """Return the year encoded in a data file name as an int.

    The year is read by position: the four characters that sit immediately
    before the last four characters of ``filename`` (e.g. the ``1880`` in
    ``names/yob1880.txt``, where the last four characters are ``.txt``).
    """
    digits = filename[-8:-4]
    return int(digits)
# Build d: name -> integer array of shape (2, nyears).
# Row 0 holds the female counts and row 1 the male counts (see gd);
# column k holds the counts for year firstyear + k.
firstyear = year(files[0])
d = {}
gd = {'F':0,'M':1}
for file in files:
    # The column depends only on the file, so compute it once per file
    # rather than once per line.
    column = year(file) - firstyear
    # The context manager closes the file even if a line fails to parse.
    with open(file) as f:
        for line in f.read().split('\n'):
            if not line:
                continue  # skip blank lines (e.g. after the final newline)
            name, gender, count = line.split(',')
            count = int(count)
            if name not in d:
                # First time this name is seen: start with zeros for every year.
                d[name] = zeros((2, nyears), dtype=int)
            d[name][gd[gender], column] = count
# Notebook cell output: inspect the counts stored for one name.
d['Edward']
#prepare for plotting
%pylab inline
Let's plot the "gender bias" of the name Leslie over time, i.e., the ratio of its frequency among females to its frequency among males:
# Linear-scale plot of the female-to-male count ratio for a single name.
name = 'Leslie'
years = range(firstyear, year(files[-1]) + 1)  # one x value per data file/year
female_counts, male_counts = d[name]           # row 0 = 'F', row 1 = 'M'
plot(years, female_counts / male_counts, 'g');
title('Ratio of Females-to-Males for Name: ' + name)
xlabel('Year')
ylabel('Ratio')
The plot above of the female-to-male frequency ratio is not satisfactory: the male-dominant part of the history (ratios between 0 and 1) is squashed into invisibility near the horizontal (year) axis.
The picture becomes more symmetrical if we plot the ratio on a logarithmic scale:
# Same ratio as before, drawn with a logarithmic y-axis so that ratios
# below 1 get as much room as ratios above 1.
last_year = year(files[-1])
ratio = d[name][0] / d[name][1]  # row 0 = 'F' counts, row 1 = 'M' counts
semilogy(range(firstyear, last_year + 1), ratio, 'g');
title('Ratio of Females-to-Males for Name: ' + name)
xlabel('Year')
ylabel('Ratio (log scale)')
Now we can see the detail at both extremes!
Next, let's do it for all names in the database:
# Overlay the female-to-male ratio curves of up to max_count names on one
# log-scale plot; the low alpha lets dense regions show up darker.
figure(figsize=(15,6))
max_count = 1000  # upper bound on the number of curves drawn
count = 0         # curves drawn so far
# The year axis is the same for every name, so build it once.
years = range(firstyear, year(files[-1]) + 1)
for name in d:
    if count >= max_count:
        break  # nothing further would be drawn, so stop scanning names
    females, males = d[name]  # row 0 = 'F' counts, row 1 = 'M' counts
    # Only plot names with at least one M and one F in at least one year.
    # NOTE(review): individual years can still have a zero male count, which
    # makes the division yield inf/nan for those years -- confirm the plot
    # renders them as gaps as intended.
    if females.sum() > 0 and males.sum() > 0:
        semilogy(years, females / males, 'g', alpha=0.1);
        count = count + 1
title('Ratio of Females-to-Males for All Names ')
xlabel('Year')
ylabel('Ratio (log scale)')