k-Means - Unsupervised clustering

In [19]:
import matplotlib.pyplot as pl
from numpy import *

First, cook up some clustered data to try it on.

In [20]:
d = 2  # feature space dimension
k = 5  # number of clusters
npercluster = 25 # number points per cluster
n = k*npercluster # total number points
r = .05 # half-width of each cluster's box; centers will be kept at least 2r apart

# generate k random centers in the unit square, pairwise at least 2r apart
# (in the inf-norm, so the r-boxes around the centers cannot overlap)
centers = [random.rand(d)]
while len(centers)<k:  # rejection sampling: keep a trial center only if it is far enough from all others
    trialcenter = random.rand(d)
    farenough = True # optimistic!
    for center in centers:
        if linalg.norm(trialcenter-center,inf) < 2*r:
            farenough = False
            break
    if farenough: centers.append(trialcenter)
centers = array(centers)
print(centers)


F = empty((n,d))
for i in range(k):
    # create a cluster
    start =     i*npercluster
    stop  = (i+1)*npercluster
    F[start:stop,:] = centers[i] + r*(2*random.rand(npercluster,d)-1)

pl.scatter(F[:,0],F[:,1],marker='.',color='k')
[[ 0.75991407  0.19275725]
 [ 0.35290946  0.23705552]
 [ 0.54660208  0.08299877]
 [ 0.53060358  0.65869536]
 [ 0.96004878  0.46553747]]
[figure: scatter plot of the generated data points]

Now let's define the k-means algorithm.

In [21]:
# First, initialize the algorithm by choosing random starting locations for the means

means = random.rand(k,d) # we saw we can get a mean with *no* associated points
pl.scatter(F[:,0],F[:,1],marker='.',color='k')
pl.scatter(means[:,0],means[:,1],marker='.',color='r')
[figure: data points (black) with randomly initialized means (red)]
In [22]:
# Now assign each datapoint to its closest mean.
# This requires the distance from each point to each mean, computed via broadcasting with a 3D array.

displacements = F[:,:,newaxis] - means.T   # (n,d,1) - (d,k) broadcasts to (n,d,k)  (done after class)
print(shape(displacements))
sqdistances = (displacements**2).sum(axis=1) # squared Euclidean distances, shape (n,k)
print(shape(sqdistances))

assignments = argmin( sqdistances, axis=1 )
(125, 2, 5)
(125, 5)
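
As a sanity check (an addition, not from the original notebook), the broadcast computation above is equivalent to this explicit double loop:

In [ ]:
# explicit-loop version of the broadcasted distance computation (slow but transparent)
sq = empty((n,k))
for p in range(n):
    for j in range(k):
        sq[p,j] = ((F[p]-means[j])**2).sum()
assert allclose(sq, sqdistances)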

Let's visualize the assignments.

In [23]:
colors = 'rgbmc' # red, green, blue, magenta, cyan
for i in range(k):
    cluster = assignments==i        
    pl.plot(F[cluster,0],F[cluster,1],'.',color=colors[i],alpha=0.95);
    pl.plot(means[i][0],means[i][1],'o',color=colors[i],markersize=50,alpha=0.1)
    pl.plot(means[i][0],means[i][1],'.',color='k')
pl.xlim(-r,1+r); pl.ylim(-r,1+r)
[figure: points colored by their assigned mean; large faint circles mark the means]

What happened? Some means can end up with no assigned points. Change the initialization above to select data points as the starting means.

In [24]:
means = F[random.choice( range(n), k, replace=False )] # pick k distinct data points as the initial means
displacements = F[:,:,newaxis] - means.T   # (n,d,1) - (d,k) broadcasts to (n,d,k)
sqdistances = (displacements**2).sum(axis=1) # squared Euclidean distances
assignments = argmin( sqdistances, axis=1 )
In [25]:
colors = 'rgbmc' # red, green, blue, magenta, cyan
for i in range(k):
    cluster = assignments==i        
    pl.plot(F[cluster,0],F[cluster,1],'.',color=colors[i],alpha=0.95);
    pl.plot(means[i][0],means[i][1],'o',color=colors[i],markersize=50,alpha=0.1)
    pl.plot(means[i][0],means[i][1],'.',color='k')
pl.xlim(-r,1+r); pl.ylim(-r,1+r)
[figure: points colored by assignment to the data-point-initialized means]

OK, now let's define the iterative algorithm.

In [26]:
oldassignments = k*ones(n,dtype=int) # k is not a valid cluster label, so the first convergence test must fail
count = 0
max_iterations = 10
while True:
    count += 1
    if count>max_iterations: break

    # compute the cluster assignments
    displacements = F[:,:,newaxis] - means.T   # (n,d,1) - (d,k) broadcasts to (n,d,k)
    sqdistances = (displacements**2).sum(axis=1)
    assignments = argmin( sqdistances, axis=1 )
    #print(assignments)
    if all( assignments == oldassignments ): break # converged: no assignment changed
    oldassignments[:] = assignments

    # update the means as the centroids of their clusters
    for i in range(k):
        if any(assignments==i): # guard: an empty cluster's mean would be NaN
            means[i] = F[assignments==i].mean(axis=0)
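
A useful diagnostic (an addition, not from the original notebook) is the within-cluster sum of squared distances, which each k-means iteration can only decrease:

In [ ]:
# inertia: total squared distance from each point to its assigned mean
inertia = sqdistances[arange(n), assignments].sum()
print('within-cluster sum of squares:', inertia)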

Visualize the clustering results after convergence

In [27]:
colors = 'rgbmc' # red, green, blue, magenta, cyan
for i in range(k):
    cluster = assignments==i # list of points in cluster i 
    pl.plot(F[cluster,0],F[cluster,1],'.',color=colors[i],alpha=0.95);
    pl.plot(means[i][0],means[i][1],'o',color=colors[i],markersize=50,alpha=0.1)
    pl.plot(means[i][0],means[i][1],'.',color='k')
pl.xlim(-r,1+r); pl.ylim(-r,1+r)
[figure: converged clustering; each mean sits at the centroid of its cluster]

Exercise 1: improve the above code by modularizing it

Define the following functions:

  • F,n = create_clustered_data(d,k,npercluster,r)
  • means,assignments = initialize_kmeans(F,k)
  • plot_data_and_means(F,means,assignments)
  • means,assignments = run_kmeans(F,k,max_iterations)

The following code will work after you define these functions:

In [ ]:
d,k,npercluster,r,max_iterations = 2,5,25,0.05,100
F,n = create_clustered_data(d,k,npercluster,r)
means,assignments = initialize_kmeans(F,k)
means,assignments = run_kmeans(F,k,max_iterations)
plot_data_and_means(F,means,assignments)
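
As a starting point, here is one possible skeleton for run_kmeans, assembled from the cells above (a sketch only; initialize_kmeans is assumed to behave as specified in the exercise):

In [ ]:
def run_kmeans(F,k,max_iterations):
    # alternate assignment and update steps until the assignments stop changing
    n = F.shape[0]
    means,assignments = initialize_kmeans(F,k)  # assumed defined per Exercise 1
    oldassignments = k*ones(n,dtype=int)        # invalid label, so the first convergence test fails
    for count in range(max_iterations):
        displacements = F[:,:,newaxis] - means.T
        sqdistances = (displacements**2).sum(axis=1)
        assignments = argmin(sqdistances, axis=1)
        if all(assignments == oldassignments): break
        oldassignments[:] = assignments
        for i in range(k):
            if any(assignments==i):             # skip empty clusters
                means[i] = F[assignments==i].mean(axis=0)
    return means,assignments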

Exercise 2: cluster images of handwritten characters

  • Choose a few characters such as '.', '1' and '8' and study their feature space projections from last class.
  • Use your function run_kmeans() to cluster the datapoints into 3 clusters.
  • Compare the correspondence between your learned clusters and the known labels (a sketch appears at the end of this notebook).
In [30]:
from PIL import Image
import glob
from numpy import *
%pylab inline
Populating the interactive namespace from numpy and matplotlib

Load image filenames from a folder

In [31]:
def load_image_files(images_foldername,characters_to_load):
    # collect filenames for each requested character code,
    # e.g. '09' -> periods, '_8' -> 8's, '_1' -> 1's
    pngs = []
    for ch in characters_to_load:
        pngs += glob.glob(images_foldername + '/*_' + ch + '.png')
    return pngs

Extract features for some images

In [32]:
def extract_6features(pngs):

    n = len(pngs)
    features = ['ink','width','height','topheaviness','rightheaviness','log aspect']
    d = len(features)
    F = empty((n,d))  # array of feature vectors


    for i,png in enumerate(pngs): # i is the row index into F
        img = Image.open(png)
        a = array(img)
        #print( a.shape )
        a = a[:,:,0]   #  select the red layer (because red, green, blue all the same)
        h,w = a.shape

        x = linspace(0,w,w,endpoint=False)
        y = linspace(0,h,h,endpoint=False)

        X,Y = meshgrid(x,y)
        #print(Y)

        #print( a.shape,a.dtype,a.max() )
        a = array(255-a,dtype=float)

        ink = a.sum()
        F[i,0] = ink/(255*w*h/5)   # can we normalize this sensibly?

        xmin = X[ a>0 ].min()  # the minimum x value where a>0
        xmax = X[ a>0 ].max()
        ymin = Y[ a>0 ].min()  # the minimum y value where a>0
        ymax = Y[ a>0 ].max()
        width  = xmax - xmin
        height = ymax - ymin
        F[i,1] = width/w   # can we normalize this sensibly?
        F[i,2] = height/h   # can we normalize this sensibly?

        xc = (xmin+xmax)/2   # center of character
        yc = (ymin+ymax)/2

        # could alternatively use the center of mass
        # xc = (a*X).sum()/ink
        # yc = (a*Y).sum()/ink

        # total ink above center
        F[i,3] = a[ Y>yc ].sum()/ink

        # total ink to the right of center
        F[i,4] = a[ X>xc ].sum()/ink

        # log of aspect ratio
        F[i,5] = log10(height/width)

    return features,F

Visualize 2D projections of some features

In [33]:
def visualize_2D_projections(F,features,pngs,characters_to_load):
    colors = 'cmbgrk'
    colordict = {ch:colors[i] for i,ch in enumerate(characters_to_load)}
    d = shape(F)[1] # number of features
    figure(figsize=(12,12))
    for i in range(d):
        for j in range(d):
            # plot the (i,j) coordinate-plane projection
            subplot(d,d,i*d+j+1)
            if i==j:
                text(.5,.5,features[i],ha='center')
            else:
                for m,png in enumerate(pngs):
                    plot(F[m,j],F[m,i],'.',alpha=0.1,color=colordict[png[-6:-4]])
            xticks([])
            yticks([])
    return

Run these to process and visualize the data.

In [34]:
images_foldername = 'pngs'
characters_to_load = ['09','_8','_1']

pngs = load_image_files(images_foldername,characters_to_load)
features,F = extract_6features(pngs)
visualize_2D_projections(F,features,pngs,characters_to_load)

Now apply k-means and compare your results to the above 2D projections.
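
A minimal sketch of that step (assuming run_kmeans from Exercise 1 is defined; each row shows a known character code followed by how many of its points landed in each learned cluster):

In [ ]:
# cluster the 6-feature vectors into 3 groups and tabulate against the filename codes
means,assignments = run_kmeans(F,3,100)
for ch in characters_to_load:
    in_class = array([png[-6:-4]==ch for png in pngs])
    print(ch, [int((assignments[in_class]==c).sum()) for c in range(3)])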