import matplotlib.pyplot as pl
from time import sleep
import os
from numpy import *
d = 2 # feature space dimension
k = 5 # number of clusters
npercluster = 25 # number points per cluster
n = k*npercluster # total number points
r = .05 # this is a spacing between the clusters
# generate random points in unit square that are at least 2r apart
centers = [random.rand(d)]
while len(centers)<k:
trialcenter = random.rand(d)
farenough = True # optimistic!
for center in centers:
if linalg.norm(trialcenter-center,inf) < 2*r:
farenough = False
break
if farenough: centers.append(trialcenter)
centers = array(centers)
print(centers)
F = empty((n,d))
for i in range(k):
# create a cluster
start = i*npercluster
stop = (i+1)*npercluster
F[start:stop,:] = centers[i] + r*(2*random.rand(npercluster,d)-1)
pl.scatter(F[:,0],F[:,1],marker='.',color='k')
# First, initialize the algorithm by initializing the starting locations for the means
means = random.rand(k,d) # we saw we can get a mean with *no* associated poiints
pl.scatter(F[:,0],F[:,1],marker='.',color='k')
pl.scatter(means[:,0],means[:,1],marker='.',color='r')
# Now assign datapoints to their closest mean.
# This requires computing the distance from each point to each mean using a 3D tensor matrix
displacements = F[:,:,newaxis] - means.T # create 3D array (done after class)
print(shape(displacements))
sqdistances = (displacements**2).sum(axis=1) # Euclidean distance
print(shape(sqdistances))
assignments = argmin( sqdistances, axis=1 )
colors = 'rgbmc' # red, green, blue, magenta, cyan
for i in range(k):
cluster = assignments==i
pl.plot(F[cluster,0],F[cluster,1],'.',color=colors[i],alpha=0.95);
pl.plot(means[i][0],means[i][1],'o',color=colors[i],markersize=50,alpha=0.1)
pl.plot(means[i][0],means[i][1],'.',color='k')
pl.xlim(-r,1+r); pl.ylim(-r,1+r)
means = F[random.choice( range(n), k, replace=False )]
displacements = F[:,:,newaxis] - means.T # create 3D array (done after class)
sqdistances = (displacements**2).sum(axis=1) # Euclidean distance
assignments = argmin( sqdistances, axis=1 )
colors = 'rgbmc' # red, green, blue, magenta, cyan
for i in range(k):
cluster = assignments==i
pl.plot(F[cluster,0],F[cluster,1],'.',color=colors[i],alpha=0.95);
pl.plot(means[i][0],means[i][1],'o',color=colors[i],markersize=50,alpha=0.1)
pl.plot(means[i][0],means[i][1],'.',color='k')
pl.xlim(-r,1+r); pl.ylim(-r,1+r)
oldassignments = k*ones(n,dtype=int)
count = 0
max_iterations = 10
while(True):
count += 1
if count>max_iterations: break
# compute the cluster assignments
displacements = F[:,:,newaxis] - means.T # create 3D array (done after class)
sqdistances = (displacements**2).sum(axis=1)
assignments = argmin( sqdistances, axis=1 )
#print(assignments)
if all( assignments == oldassignments ): break
oldassignments[:] = assignments
# update the means as the centroids of the clusters
for i in range(k):
means[i] = F[assignments==i].mean(axis=0)
colors = 'rgbmc' # red, green, blue, magenta, cyan
for i in range(k):
cluster = assignments==i # list of points in cluster i
pl.plot(F[cluster,0],F[cluster,1],'.',color=colors[i],alpha=0.95);
pl.plot(means[i][0],means[i][1],'o',color=colors[i],markersize=50,alpha=0.1)
pl.plot(means[i][0],means[i][1],'.',color='k')
pl.xlim(-r,1+r); pl.ylim(-r,1+r)
Define the following functions:
d,k,npercluster,r,max_iterations = 2,5,25,0.05,100
F,n = create_clustered_data(d,k,npercluster,r)
means,assignments = initialize_kmeans(F,k)
means,assignments = run_kmeans(F,k,max_iterations)
plot_data_and_means(F,k,means,assignments)
from PIL import Image
import glob
from numpy import *
%pylab inline
def load_image_files(images_foldername,characters_to_load):
pngs = []
pngs += glob.glob('pngs/*_09.png')#periods
pngs += glob.glob('pngs/*__8.png')#8's
pngs += glob.glob('pngs/*__1.png')#1's
return pngs
def extract_6features(png):
n = len(pngs)
features = ['ink','width','height','topheaviness','rightheaviness','log aspect']
d = len(features)
F = empty((n,d)) # array of feature vectors
for i,png in enumerate(pngs): # what is i here?
img = Image.open(png)
a = array(img)
#print( a.shape )
a = a[:,:,0] # select the red layer (because red, green, blue all the same)
h,w = a.shape
x = linspace(0,w,w,endpoint=False)
y = linspace(0,h,h,endpoint=False)
X,Y = meshgrid(x,y)
#print(Y)
#print( a.shape,a.dtype,a.max() )
a = array(255-a,dtype=float)
ink = a.sum()
F[i,0] = ink/(255*w*h/5) # can we normalize this sensibly?
xmin = X[ a>0 ].min() # the minimum x value where a>0
xmax = X[ a>0 ].max()
ymin = Y[ a>0 ].min() # the minimum y value where a>0
ymax = Y[ a>0 ].max()
width = xmax - xmin
height = ymax - ymin
F[i,1] = width/w # can we normalize this sensibly?
F[i,2] = height/h # can we normalize this sensibly?
xc = (xmin+xmax)/2 # center of character
yc = (ymin+ymax)/2
# could alteranatively use center of mass
# xc = (a*X).sum()/ink
# yc = (a*Y).sum()/ink
# total ink above center
F[i,3] = a[ Y>yc ].sum()/ink
# total ink to the right of center
F[i,4] = a[ X>xc ].sum()/ink
# log of aspect ratio
F[i,5] = log10(height/width)
return features,F
def visualize_2D_projections(F,features,pngs,characters_to_load):
colors = 'cmbgrk'
colordict = {k:colors[i] for i,k in enumerate(characters_to_load)}
d = shape(F)[1] # number_features
figure(figsize=(12,12))
for i in range(d):
for j in range(d):
# plot the i,j coordinate plane projections
subplot(d,d,i*d+j+1)
if i==j:
text(.5,.5,features[i],ha='center')
else:
for k,png in enumerate(pngs):
plot(F[k,j],F[k,i],'.',alpha=0.1,color=colordict[png[-6:-4]])
xticks([])
yticks([])
return
images_foldername = 'pngs'
characters_to_load = ['09','_8','_1']
pngs = load_image_files(images_foldername,characters_to_load)
features,F = extract_6features(pngs)
visualize_2D_projections(F,features,pngs,characters_to_load)