One important property of k-means is that it aims to find clusters that look like spheres (the short sketch after the list below makes this concrete). Compare it to other clustering methods:

Example algorithms from scikit-learn

Community detection - unsupervised clustering of graph-structured data
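
To make the spherical-cluster bias concrete, here is a minimal sketch using scikit-learn on deliberately non-spherical data. The dataset, the DBSCAN comparison, and all parameter values are my own illustrative choices and are not part of the notebook below.

import matplotlib.pyplot as pl
from sklearn.datasets import make_moons
from sklearn.cluster import KMeans, DBSCAN

# two interleaved crescents: clusters that are anything but spherical
X, _ = make_moons(n_samples=300, noise=0.05, random_state=0)

km_labels = KMeans(n_clusters=2, n_init=10).fit_predict(X)  # tends to cut each crescent in half
db_labels = DBSCAN(eps=0.2).fit_predict(X)                  # density-based, follows the shape of the data

fig, (ax1, ax2) = pl.subplots(1, 2, figsize=(8, 3))
ax1.scatter(X[:,0], X[:,1], c=km_labels, s=10); ax1.set_title('k-means')
ax2.scatter(X[:,0], X[:,1], c=db_labels, s=10); ax2.set_title('DBSCAN')
pl.show()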

In [1]:
import matplotlib.pyplot as pl
from numpy import *

def create_clustered_data(d,k,npercluster,r):
    n = k*npercluster # total number of points

    # generate random cluster centers in the unit square that are at least 2r apart
    centers = [random.rand(d)]
    while len(centers)<k:
        trialcenter = random.rand(d)
        farenough = True # optimistic!
        for center in centers:
            if linalg.norm(trialcenter-center,inf) < 2*r:
                farenough = False
                break
        if farenough: centers.append(trialcenter)
    centers = array(centers)

    F = empty((n,d))
    for i in range(k):
        # create a cluster: npercluster points in a box of half-width r around the center
        start =     i*npercluster
        stop  = (i+1)*npercluster
        F[start:stop,:] = centers[i] + r*(2*random.rand(npercluster,d)-1)

    return F,n

def plot_data_and_means(F,k,means,assignments):
    colors = 'rgbmc' # red, green, blue, magenta, cyan
    for i in range(k):
        cluster = assignments==i        
        pl.plot(F[cluster,0],F[cluster,1],'.',color=colors[i],alpha=0.95);
        pl.plot(means[i][0],means[i][1],'o',color=colors[i],markersize=50,alpha=0.1)
        pl.plot(means[i][0],means[i][1],'.',color='k')
    pl.xlim(-r,1+r); pl.ylim(-r,1+r) # r is the cluster half-width, taken from the global scope
    return 

def initialize_kmeans(F,k):
    n = shape(F)[0]
    # farthest-point seeding: each new center is the point farthest from all centers chosen so far
    center = zeros(k,dtype=int)
    center[0] = random.randint(n)
    x = zeros((n,k))
    for i in range(k-1):
        x[:,i] = sum(square(F[center[i],:] - F),axis=1)
        #plot(x[:,i])
        center[i+1] = argmax(amin(x[:,0:(i+1)],axis=1))
    
    means = F[center,:]
    displacements = F[:,:,newaxis] - means.T   # create 3D array  (done after class)
    sqdistances = (displacements**2).sum(axis=1) # squared Euclidean distance to each mean
    assignments = argmin( sqdistances, axis=1 )
    
    return means,assignments

def run_kmeans(F,k,means,max_iterations):
    n = shape(F)[0]
    oldassignments = k*ones(n,dtype=int) # k is never a valid label, so the first comparison fails
    count = 0
    while(True):        
        count += 1
        if count>max_iterations: break

        # compute the cluster assignments
        displacements = F[:,:,newaxis] - means.T   # create 3D array  (done after class)
        sqdistances = (displacements**2).sum(axis=1)
        assignments = argmin( sqdistances, axis=1 )
        #print(assignments)
        if all( assignments == oldassignments ): break
        oldassignments[:] = assignments

        # update the means as the centroids of the clusters
        for i in range(k):
            means[i] = F[assignments==i].mean(axis=0)

    return means,assignments

So here's k-means with a nice (farthest-point) initialization

In [2]:
d,k,npercluster,r,max_iterations = 2,5,25,0.05,100
F,n = create_clustered_data(d,k,npercluster,r)
means,assignments = initialize_kmeans(F,k)
means,assignments = run_kmeans(F,k,means,max_iterations)
plot_data_and_means(F,k,means,assignments)

Now for community detection. First, compute the distances between all pairs of points.

In [3]:
from pylab import *

displacements = F[:,:,newaxis] - F[:,:,newaxis].T

print(shape(displacements))
# displacements[i,0,k] gives the difference between x_i and x_k
# displacements[i,1,k] gives the difference between y_i and y_k

distances = (displacements**2).sum(axis=1)**.5 # Euclidean distance between all pairs of points
#print(shape(distances))
imshow(distances)
(125, 2, 125)
Out[3]:
<matplotlib.image.AxesImage at 0x11e74b128>

Make a network in which there are edges between nearby points

In [6]:
A = (distances<.05) - eye(n) # subtract identity matrix to remove self-edges
imshow(A) # here, yellow dots indicate an edge from node i to node j
edges = array((A).nonzero()).T
shape(edges)
Out[6]:
(1440, 2)

Use the NetworkX package to analyze the network data

In [8]:
import networkx as nx
G = nx.from_numpy_matrix(A)
nx.draw(G,node_size=10)
connected_components = sorted(nx.connected_components(G), key=len, reverse=True)
In [12]:
print(connected_components)
components_vector = zeros(n)
for i in range(len(connected_components)):
    ids = [int(x) for x in list(connected_components[i])]
    components_vector[ids] = i
components_vector    
[{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24}, {25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49}, {50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74}, {75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99}, {100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124}]
Out[12]:
array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,
        1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  1.,  2.,  2.,
        2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,
        2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  3.,  3.,  3.,
        3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,
        3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  3.,  4.,  4.,  4.,  4.,
        4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.,
        4.,  4.,  4.,  4.,  4.,  4.,  4.,  4.])
In [13]:
nx.draw_networkx_nodes(G, pos=F, node_color=components_vector, node_size=10,alpha=0.8)
nx.draw_networkx_edges(G, pos=F, alpha=0.1)
shape(components_vector)
Out[13]:
(125,)

Connected components work if the data is not noisy. If there are edges between clusters (i.e., communities), you will need a better community detection algorithm; the sketch below shows how quickly this breaks down.
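
Here is a minimal sketch of that failure mode, reusing A, G, n, and nx from the cells above; the number and placement of the noise edges are arbitrary choices of mine. A handful of random cross-cluster edges is usually enough to glue all five clusters into a single connected component.

random.seed(1)
Anoisy = A.copy()
for _ in range(10): # add 10 random "noise" edges (arbitrary choice)
    i, j = random.randint(n), random.randint(n)
    if i != j:
        Anoisy[i,j] = Anoisy[j,i] = 1

Gnoisy = nx.from_numpy_matrix(Anoisy)
print(nx.number_connected_components(G))      # 5: one component per cluster
print(nx.number_connected_components(Gnoisy)) # very likely 1: the noise edges merge everything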

In [16]:
H = nx.read_gml('football.gml') # GML = Graph Modelling Language, a plain-text format somebody designed for graphs
#nx.draw_networkx_nodes(H,pos=nx.spring_layout(H))
nx.draw(H,pos=nx.spring_layout(H))
In [17]:
# you must install with:       pip install python-louvain
import community
partition = community.best_partition(H) # the Louvain method: heuristic modularity maximization

color_vec = list(partition.values()) # a vector of community ids, used to color the nodes in the plot

labels2 = {a:a for a in H.nodes()} # to plot node labels, networkx wants a dictionary

nx.draw(H,pos=nx.spring_layout(H),node_color=color_vec,labels=labels2,font_size=10,alpha=.6 )
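
As a quick follow-up (my own addition, assuming partition and H from the cell above), python-louvain can also score the partition it found:

print(len(set(partition.values())))        # number of communities detected
print(community.modularity(partition, H))  # modularity of the partition (higher means stronger community structure)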

A figure a friend of mine made. See the code at

http://champ.readthedocs.io/en/latest/