k-Means - Unsupervised clustering

In [1]:
import matplotlib.pyplot as pl
from time import sleep
import os
from numpy import *

First cook up some clustered data to try it on

In [21]:
# Experiment parameters for the synthetic clustered dataset.
d = 2  # feature space dimension
k = 5  # number of clusters
npercluster = 25 # number of points per cluster
r = .05 # cluster half-width; centers are kept at least 2*r apart


def create_clustered_data(d, k, npercluster, r, max_tries=100000):
    """Generate k well-separated synthetic clusters in the unit cube.

    Parameters
    ----------
    d : int            feature-space dimension
    k : int            number of clusters
    npercluster : int  points generated per cluster
    r : float          cluster half-width; centers are kept >= 2*r apart
                       (max-norm) so the clusters cannot overlap
    max_tries : int    safety cap on rejection-sampling attempts so an
                       infeasible (k, r) combination raises instead of
                       looping forever

    Returns
    -------
    F : (n, d) ndarray of data points, where n = k * npercluster
    n : int    total number of points
    """
    n = k * npercluster  # total number of points

    # Rejection-sample cluster centers in the unit cube until k of them
    # are pairwise at least 2*r apart in the max-norm.
    centers = [random.rand(d)]
    tries = 0
    while len(centers) < k:
        tries += 1
        if tries > max_tries:
            raise RuntimeError(
                "could not place %d centers with spacing %g" % (k, 2 * r))
        trial = random.rand(d)
        if all(linalg.norm(trial - c, inf) >= 2 * r for c in centers):
            centers.append(trial)
    centers = array(centers)

    # Fill F cluster by cluster: uniform points in a cube of half-width r
    # centered on each cluster center.
    F = empty((n, d))
    for i in range(k):
        start = i * npercluster
        stop = (i + 1) * npercluster
        F[start:stop, :] = centers[i] + r * (2 * random.rand(npercluster, d) - 1)

    return F, n

# Generate the synthetic dataset and show it as a black scatter plot.
F,n = create_clustered_data(d,k,npercluster,r)
pl.scatter(F[:,0],F[:,1],marker='.',color='k')
Out[21]:
<matplotlib.collections.PathCollection at 0x129b49048>

Now let's define the kmeans algorithm

In [22]:
# First, initialize the algorithm by choosing starting locations for the means.
# NOTE(review): purely random means can land far from all data, which is how a
# mean ends up with *no* associated points (visualized below).

means = random.rand(k,d) # we saw we can get a mean with *no* associated points
pl.scatter(F[:,0],F[:,1],marker='.',color='k')
pl.scatter(means[:,0],means[:,1],marker='.',color='r')
Out[22]:
<matplotlib.collections.PathCollection at 0x129b496d8>
In [23]:
# Now assign datapoints to their closest mean.
# All point-to-mean distances are computed at once by broadcasting into a
# 3D array of shape (n, d, k).

displacements = F[:,:,newaxis] - means.T   # (n,d,k): point i minus mean j, per coordinate
print(shape(displacements))
sqdistances = (displacements**2).sum(axis=1) # (n,k) squared Euclidean distances
print(shape(sqdistances))

assignments = argmin( sqdistances, axis=1 )  # index of the nearest mean for each point
(125, 2, 5)
(125, 5)

Let's visualize the assignments

In [24]:
def plot_data_and_means(F, k, means, assignments, pad=None):
    """Scatter-plot the data colored by cluster, with each mean overlaid.

    F           : (n,d) data array (only the first two coordinates are drawn)
    k           : number of clusters/means
    means       : (k,d) array of cluster means
    assignments : (n,) array mapping each data point to a cluster index
    pad         : axis margin; defaults to the module-level spacing r
                  (previously this was a hidden global dependency)
    """
    if pad is None:
        pad = r  # fall back to the global spacing used by the data generator
    colors = 'rgbmc'  # red, green, blue, magenta, cyan
    for i in range(k):
        c = colors[i % len(colors)]  # cycle colors so k > 5 no longer raises IndexError
        cluster = assignments == i
        pl.plot(F[cluster,0], F[cluster,1], '.', color=c, alpha=0.95)
        # large translucent disk marks the mean's neighborhood, black dot its center
        pl.plot(means[i][0], means[i][1], 'o', color=c, markersize=50, alpha=0.1)
        pl.plot(means[i][0], means[i][1], '.', color='k')
    pl.xlim(-pad, 1 + pad)
    pl.ylim(-pad, 1 + pad)
    return

plot_data_and_means(F,k,means,assignments)

What happened? (some clusters have no points)

Change the initialization above to select points as the starting points.

In [31]:
def initialize_kmeans(F, k):
    """Pick k distinct data points as the initial means and assign clusters.

    Sampling the initial means from the data itself guarantees every mean
    starts with at least one nearby point, avoiding the empty-cluster
    problem seen with purely random initial means.

    Returns (means, assignments): means is (k,d), assignments is (n,).
    """
    n = shape(F)[0]  # BUG FIX: previously read the module-level global n
    means = F[random.choice(range(n), k, replace=False)]
    # Broadcast to (n,d,k) displacements, then squared Euclidean distances (n,k).
    displacements = F[:,:,newaxis] - means.T
    sqdistances = (displacements**2).sum(axis=1)
    assignments = argmin(sqdistances, axis=1)

    return means, assignments
In [32]:
# Re-initialize with data points as means and visualize: every mean
# now has at least its own seed point nearby.
means,assignments = initialize_kmeans(F,k)
plot_data_and_means(F,k,means,assignments)

Ok, now let's define the iterative algorithm

In [58]:
def run_kmeans(F, k, max_iterations, means=None):
    """Lloyd's algorithm: alternate assignment and centroid-update steps.

    F              : (n,d) data array
    k              : number of clusters
    max_iterations : hard cap on iterations (must be >= 1); the loop stops
                     earlier once the assignments stop changing
    means          : optional (k,d) initial means; if None, k distinct data
                     points are sampled (previously this silently read a
                     module-level global)

    Returns (means, assignments).
    """
    n = shape(F)[0]
    if means is None:
        means = F[random.choice(range(n), k, replace=False)]
    means = array(means, dtype=float)  # private copy we can update in place
    oldassignments = k * ones(n, dtype=int)  # k is never a valid label: forces one pass
    for _ in range(max_iterations):
        # assignment step: nearest mean by squared Euclidean distance
        displacements = F[:,:,newaxis] - means.T  # (n,d,k)
        sqdistances = (displacements**2).sum(axis=1)  # (n,k)
        assignments = argmin(sqdistances, axis=1)
        if all(assignments == oldassignments):
            break  # converged
        oldassignments[:] = assignments

        # update step: move each mean to the centroid of its cluster.
        # BUG FIX: the original `return` sat inside the loop, so only a
        # single iteration ever ran; an empty cluster also produced a NaN
        # mean -- such a mean is now simply left in place.
        for i in range(k):
            members = assignments == i
            if members.any():
                means[i] = F[members].mean(axis=0)
    return means, assignments

# Run k-means to convergence (or at most 20 iterations).
k=5
max_iterations = 20            
means,assignments = run_kmeans(F,k,max_iterations)

Visualize the clustering results after convergence

In [35]:
plot_data_and_means(F,k,means,assignments)

Exercise 1: improve the above code by modularizing it

Define the following functions:

  • F,n = create_clustered_data(d,k,npercluster,r)
  • means,assignments = initialize_kmeans(F,k)
  • plot_data_and_means(F,means,assignments)
  • means,assignments = run_kmeans(F,k,max_iterations)
In [63]:
# Now you can rerun everything via

# Full pipeline: generate data, initialize, run k-means, visualize.
d,k,npercluster,r,max_iterations = 2,5,25,0.05,100
F,n = create_clustered_data(d,k,npercluster,r)
means,assignments = initialize_kmeans(F,k)
means,assignments = run_kmeans(F,k,max_iterations)
plot_data_and_means(F,k,means,assignments)

Exercise 2: cluster the handwritten characters

  • Choose a few characters such as '.', '1' and '8' and study their feature space projections from last class.
  • Use your function run_kmeans() to cluster the datapoints into 3 clusters.
  • Compare the correlation between your learned clusters and their known labels.
In [64]:
from PIL import Image
import glob
from numpy import *
%pylab inline
Populating the interactive namespace from numpy and matplotlib

Load image names from a folder

In [65]:
def load_image_files(images_foldername, characters_to_load):
    """Collect PNG filenames for the requested characters.

    images_foldername  : folder containing the PNG files
    characters_to_load : two-character filename suffixes (the two characters
                         immediately before '.png'), e.g. ['09', '_8', '_1']

    Returns the matching file paths, grouped by character in the given order.

    BUG FIX: previously both parameters were ignored and the folder and
    character patterns were hard-coded; the values used in this notebook
    ('pngs', ['09','_8','_1']) reproduce the original file list exactly.
    """
    pngs = []
    for ch in characters_to_load:
        pngs += glob.glob('%s/*_%s.png' % (images_foldername, ch))
    return pngs

Extract features for some of these images

In [66]:
def extract_6features(pngs):
    """Compute a 6-dimensional feature vector for each glyph image.

    pngs : list of PNG file paths; images are read via PIL, and only the red
           channel is used (the files store identical R, G, B layers)

    Returns (features, F): the list of feature names and an (n,6) array
    with one row per image.

    BUG FIX: the parameter was named `png` but the body iterated the
    module-level global `pngs`; the function now uses its argument.
    """
    n = len(pngs)
    features = ['ink','width','height','topheaviness','rightheaviness','log aspect']
    d = len(features)
    F = empty((n,d))  # array of feature vectors

    for i, png in enumerate(pngs):
        img = Image.open(png)
        a = array(img)
        a = a[:,:,0]   # select the red layer (red, green, blue all the same)
        h, w = a.shape

        # pixel-coordinate grids, used for bounding boxes and ink moments
        x = linspace(0, w, w, endpoint=False)
        y = linspace(0, h, h, endpoint=False)
        X, Y = meshgrid(x, y)

        a = array(255 - a, dtype=float)  # invert so ink is large and paper is 0

        ink = a.sum()
        F[i,0] = ink/(255*w*h/5)   # total ink, ad-hoc normalization -- can we normalize this sensibly?

        # bounding box of the inked pixels
        # NOTE(review): assumes at least one pixel has a > 0; a completely
        # blank image would make these min/max calls fail on an empty selection
        xmin = X[ a>0 ].min()  # the minimum x value where a>0
        xmax = X[ a>0 ].max()
        ymin = Y[ a>0 ].min()  # the minimum y value where a>0
        ymax = Y[ a>0 ].max()
        width  = xmax - xmin
        height = ymax - ymin
        F[i,1] = width/w   # can we normalize this sensibly?
        F[i,2] = height/h   # can we normalize this sensibly?

        xc = (xmin+xmax)/2   # center of the character's bounding box
        yc = (ymin+ymax)/2
        # could alternatively use the center of mass:
        # xc = (a*X).sum()/ink
        # yc = (a*Y).sum()/ink

        # fraction of ink with Y > yc
        # NOTE(review): Y grows with the row index, which is the *bottom* half
        # in image coordinates, yet the feature is named 'topheaviness' --
        # verify the intended orientation
        F[i,3] = a[ Y>yc ].sum()/ink

        # fraction of ink to the right of center
        F[i,4] = a[ X>xc ].sum()/ink

        # log of the aspect ratio
        # NOTE(review): a one-pixel-wide glyph gives width == 0 and divides by zero
        F[i,5] = log10(height/width)

    return features, F

Create a function to visualize the 2D projections of these features

In [67]:
def visualize_2D_projections(F, features, pngs, characters_to_load):
    """Draw the d*d grid of pairwise feature-space projections.

    Off-diagonal panel (row, col) scatters feature col against feature row,
    coloring each point by the character code embedded in its filename
    (the two characters just before '.png'); diagonal panels show the
    feature's name instead of a degenerate self-projection.
    """
    colors = 'cmbgrk'
    colordict = {ch: colors[idx] for idx, ch in enumerate(characters_to_load)}
    d = shape(F)[1]  # number of features
    figure(figsize=(12, 12))
    for row in range(d):
        for col in range(d):
            # panels are numbered row-major starting at 1
            subplot(d, d, row*d + col + 1)
            if row == col:
                text(.5, .5, features[row], ha='center')
            else:
                for idx, png in enumerate(pngs):
                    plot(F[idx, col], F[idx, row], '.', alpha=0.1,
                         color=colordict[png[-6:-4]])
            xticks([])
            yticks([])
    return

Okay, run all these to extract and visualize data

In [68]:
# Build the file list, extract the 6 features, and show all 2D projections.
images_foldername = 'pngs'
characters_to_load = ['09','_8','_1']

pngs = load_image_files(images_foldername,characters_to_load)
features,F = extract_6features(pngs)
visualize_2D_projections(F,features,pngs,characters_to_load)

Now apply kmeans to this data

In [75]:
shape(F)
Out[75]:
(679, 6)
In [78]:
# Cluster the character feature vectors into k=3 groups.
k=3
max_iterations = 100

means,assignments = initialize_kmeans(F,k)
means,assignments = run_kmeans(F,k,max_iterations)
In [81]:
# Same projection grid as visualize_2D_projections, but colored by the
# *learned* cluster labels instead of the known character labels.
colors = 'cmbgrk'
# NOTE(review): `k` here shadows the cluster count set above -- the dict
# comprehension and the inner loop below both rebind it.
colordict = {k:colors[i] for i,k in enumerate(set(assignments))}
d = shape(F)[1] # number_features
figure(figsize=(12,12))
for i in range(d):
    for j in range(d):
        # plot the i,j coordinate plane projections
        subplot(d,d,i*d+j+1)
        if i==j: 
            # diagonal: label the feature rather than plot it against itself
            text(.5,.5,features[i],ha='center')
        else:
            for k,png in enumerate(pngs):
                plot(F[k,j],F[k,i],'.',alpha=0.1,color=colordict[assignments[k]])
        xticks([])
        yticks([])