Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
koteth committed Sep 16, 2014
0 parents commit 1133abe
Show file tree
Hide file tree
Showing 8 changed files with 854 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
*.pyc
89 changes: 89 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# MCL Clustering

Python implementation of Markov Clustering technique.
This implementation is not yet optimized for large networks.

## Installation:

python setup.py install

## Usage:

### Command line:

Usage: ./mcl_clustering.py [options] <input_matrix>


Options:
-h, --help show this help message and exit
-e EXPAND_FACTOR, --expand_factor=EXPAND_FACTOR
expand factor (default: 2)
-i INFLATE_FACTOR, --inflate_factor=INFLATE_FACTOR
inflate factor (default: 2)
-m MULT_FACTOR, --mult_factor=MULT_FACTOR
multiply factor (default: 2)
-l MAX_LOOP, --max_loops=MAX_LOOP
max loops (default: 60)



### Code:

numpy adjacency matrix

from mcl_clustering import mcl

A = <your matrix>

M, clusters = mcl(A, expand_factor = options.expand_factor,
inflate_factor = options.inflate_factor,
max_loop = options.max_loop,
mult_factor = options.mult_factor)

networkx graph

from mcl_clustering import networkx_mcl

G = <your graph>

M, clusters = networkx_mcl(G, expand_factor = options.expand_factor,
inflate_factor = options.inflate_factor,
max_loop = options.max_loop,
mult_factor = options.mult_factor)
Output:
M = output matrix
clusters = dict with keys = [<cluster id>] values = [<vertex id>]

## Requirements

numpy
networkx


## Example:



## Parameters:

-i --inflate_factor
-e --expand_factor
-m --mult_factor
-l --max_loops
-d --draw-graph show graph with networkx



## References

Stijn van Dongen, Graph Clustering by Flow Simulation.
PhD thesis, University of Utrecht, May 2000.
( http://www.library.uu.nl/digiarchief/dip/diss/1895620/inhoud.htm )

Stijn van Dongen. A cluster algorithm for graphs. Technical Report
INS-R0010, National Research Institute for Mathematics and Computer
Science in the Netherlands, Amsterdam, May 2000.
( http://www.cwi.nl/ftp/CWIreports/INS/INS-R0010.ps.Z )

Empty file added mcl/__init__.py
Empty file.
536 changes: 536 additions & 0 deletions mcl/amatrix.csv

Large diffs are not rendered by default.

176 changes: 176 additions & 0 deletions mcl/mcl_clustering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
#!/usr/bin/env python

import sys
import numpy as np
import time
from optparse import OptionParser
import logging

def normalize(A):
    """Scale every column of ``A`` so that it sums to 1.

    Args:
        A: 2-D array-like of numbers.

    Returns:
        A float ``numpy.ndarray`` with the same shape as ``A`` in which each
        column has been divided by its sum. Columns whose sum is zero are
        left unscaled instead of being turned into NaN/inf.
    """
    # Work in floating point so integer input is never floor-divided
    # (classic ``/`` on Python 2).
    A = np.asarray(A, dtype=float)
    column_sums = A.sum(axis=0)
    # Dividing a column by a zero sum would produce NaN, which then spreads
    # through every subsequent matrix product; divide such columns by 1.
    safe_sums = np.where(column_sums == 0, 1.0, column_sums)
    return A / safe_sums[np.newaxis, :]

def inflate(A, inflate_factor):
    """Apply the inflation step to ``A``.

    Every entry of ``A`` is raised element-wise to ``inflate_factor`` and
    the result is passed through ``normalize`` so its columns are rescaled.

    Args:
        A: matrix to inflate.
        inflate_factor: exponent applied to each entry.

    Returns:
        The re-normalized, element-wise powered matrix.
    """
    powered = np.power(A, inflate_factor)
    return normalize(powered)

def expand(A, expand_factor):
    """Apply the expansion step to ``A``.

    Args:
        A: square matrix.
        expand_factor: integer exponent for the matrix power.

    Returns:
        ``A`` multiplied by itself ``expand_factor`` times (a true matrix
        power, not an element-wise one).
    """
    expanded = np.linalg.matrix_power(A, expand_factor)
    return expanded

def add_diag(A, mult_factor):
    """Add ``mult_factor`` to every diagonal entry of ``A``.

    Args:
        A: square matrix; its first dimension sets the identity size.
        mult_factor: weight added on the diagonal.

    Returns:
        A new matrix equal to ``A`` plus ``mult_factor`` times the identity.
    """
    size = A.shape[0]
    diagonal = mult_factor * np.identity(size)
    return A + diagonal

def get_clusters(A):
    """Read the clusters out of a matrix produced by the MCL loop.

    A row whose diagonal entry is positive is treated as a cluster
    representative; the cluster's members are the column indices holding a
    positive value in that row.

    Args:
        A: square 2-D array.

    Returns:
        dict mapping consecutive cluster ids (starting at 0) to the list of
        member indices. Rows describing exactly the same member set are
        reported once, so a cluster with several representatives is not
        duplicated.
    """
    positive = np.asarray(A) > 0

    clust_map = {}
    seen = set()
    for i in range(positive.shape[0]):
        if not positive[i, i]:
            continue
        members = tuple(np.flatnonzero(positive[i]).tolist())
        if members in seen:
            # Another representative of an already recorded cluster.
            continue
        seen.add(members)
        clust_map[len(clust_map)] = list(members)
    return clust_map

def draw(G, A, cluster_map):
    """Plot the graph coloured by cluster and show the matrix ``A``.

    Args:
        G: networkx graph to draw.
        A: matrix displayed with ``matshow``.
        cluster_map: dict of cluster id -> iterable of node indices.
    """
    import networkx as nx
    import matplotlib.pyplot as plt

    # Invert cluster id -> members into member -> cluster id; when a node
    # appears in several clusters the last one iterated wins.
    node_cluster = dict(
        (member, cluster_id)
        for cluster_id, members in cluster_map.items()
        for member in members
    )

    # Nodes are looked up by position 0..n-1; unclustered ones get 100.
    node_colors = [node_cluster.get(idx, 100)
                   for idx in range(len(G.nodes()))]

    layout = nx.spring_layout(G)
    nx.draw_networkx_nodes(G, layout, node_size=200,
                           node_color=node_colors, cmap=plt.cm.Blues)
    nx.draw_networkx_edges(G, layout, alpha=0.5)

    from matplotlib.pylab import matshow, show, cm
    matshow(A, fignum=100, cmap=cm.gray)
    show()

    plt.show()

def stop(M, i):
    """Decide whether the MCL loop can terminate at iteration ``i``.

    The check only runs when ``i % 5 == 4``. It reports convergence when
    ``M**2 - M`` has the same value in every entry (max equals min).

    Args:
        M: current matrix.
        i: zero-based iteration counter.

    Returns:
        True if the loop should stop, False otherwise.
    """
    if i % 5 != 4:
        return False

    delta = M**2 - M
    spread = np.max(delta) - np.min(delta)
    if spread != 0:
        return False

    logging.info("Stop at iteration %s" % i)
    return True


def mcl(M, expand_factor = 2, inflate_factor = 2, max_loop = 10 , mult_factor = 1):
    """Run Markov clustering on the adjacency matrix ``M``.

    ``mult_factor`` is first added on the diagonal and the columns are
    normalized. Each iteration then inflates and expands the matrix, until
    ``stop`` reports convergence or ``max_loop`` iterations have run.

    Args:
        M: square numpy adjacency matrix.
        expand_factor: exponent of the matrix power in the expansion step.
        inflate_factor: element-wise exponent in the inflation step.
        max_loop: maximum number of iterations.
        mult_factor: weight added to the diagonal before iterating.

    Returns:
        Tuple ``(M, clusters)``: the final matrix and the dict returned by
        ``get_clusters`` for it.
    """
    M = add_diag(M, mult_factor)
    M = normalize(M)

    for i in range(max_loop):
        # The counter needs a placeholder: logging applies ``msg % args``
        # and reports a formatting error when there is none.
        logging.info("loop %s", i)
        M = inflate(M, inflate_factor)
        M = expand(M, expand_factor)
        if stop(M, i):
            break

    clusters = get_clusters(M)
    return M, clusters

def networkx_mcl(G, expand_factor = 2, inflate_factor = 2, max_loop = 10 , mult_factor = 1):
    """Run ``mcl`` on a networkx graph.

    The graph's adjacency matrix is densified into a numpy array and handed
    to ``mcl`` together with the tuning parameters.

    Args:
        G: networkx graph.
        expand_factor, inflate_factor, max_loop, mult_factor: forwarded
            unchanged to ``mcl``.

    Returns:
        Whatever ``mcl`` returns: the tuple ``(M, clusters)``.
    """
    import networkx as nx

    adjacency = nx.adjacency_matrix(G)
    dense_adjacency = np.array(adjacency.todense())
    return mcl(dense_adjacency, expand_factor, inflate_factor,
               max_loop, mult_factor)

def print_info(options):
    """Print a banner listing the clustering parameters in ``options``.

    Args:
        options: object exposing ``expand_factor``, ``inflate_factor``,
            ``mult_factor`` and ``max_loop`` attributes.
    """
    # Each print receives a single parenthesised string, which behaves the
    # same as a Python 2 print statement and as the Python 3 function.
    separator = "-" * 60
    print(separator)
    print("MARKOV CLUSTERING:")
    print(separator)
    print(" expand_factor: %s" % options.expand_factor)
    print(" inflate_factor: %s" % options.inflate_factor)
    print(" mult factor: %s" % options.mult_factor)
    print(" max loops: %s\n" % options.max_loop)

def get_options(argv=None):
    """Parse the command-line options of the clustering script.

    Args:
        argv: optional list of argument strings to parse. When None
            (the default) optparse reads ``sys.argv[1:]``.

    Returns:
        Tuple ``(options, filename)``: the parsed optparse values and the
        first positional argument (the input matrix file).

    Raises:
        Exception: if no positional argument was given.
    """
    usage = "usage: %prog [options] <input_matrix>"
    parser = OptionParser(usage)
    parser.add_option("-e", "--expand_factor",
                      dest="expand_factor",
                      default=2,
                      type=int,
                      help="expand factor (default: %default)")
    parser.add_option("-i", "--inflate_factor",
                      dest="inflate_factor",
                      default=2,
                      type=float,
                      help="inflate factor (default: %default)")
    parser.add_option("-m", "--mult_factor",
                      dest="mult_factor",
                      default=2,
                      type=float,
                      help="multiply factor (default: %default)")
    parser.add_option("-l", "--max_loops",
                      dest="max_loop",
                      default=60,
                      type=int,
                      help="max loops (default: %default)")
    parser.add_option("-o", "--output", metavar="FILE",
                      help="output (default: stdout)")

    parser.add_option("-v", "--verbose",
                      action="store_true", dest="verbose", default=True,
                      help="verbose (default: %default)")
    parser.add_option("-d", "--draw-graph",
                      action="store_true", dest="draw", default=False,
                      help="show graph with networkx (default: %default)")

    (options, args) = parser.parse_args(argv)

    # Test for the missing argument explicitly rather than hiding every
    # possible error behind a bare ``except``.
    if not args:
        raise Exception('input', 'missing input filename')

    return options, args[0]

def get_graph(csv_filename):
    """Load a comma-separated adjacency matrix and build its graph.

    Args:
        csv_filename: path of a text file with one matrix row per line and
            comma-separated numeric cells.

    Returns:
        Tuple ``(M, G)``: the matrix as a numpy array and the networkx
        graph built from it.

    Raises:
        ValueError: if a cell cannot be converted to float.
    """
    import networkx as nx

    rows = []
    # ``with`` closes the file even when a cell fails to parse.
    with open(csv_filename) as csv_file:
        for line in csv_file:
            line = line.strip()
            if not line:
                # Tolerate blank lines such as a trailing newline at EOF.
                continue
            # Build a real list: ``map`` is lazy on Python 3 and would not
            # give numpy a numeric row.
            rows.append([float(cell.strip()) for cell in line.split(",")])

    matrix = np.array(rows)
    if hasattr(nx, "from_numpy_array"):
        # networkx >= 2.0; from_numpy_matrix was removed in networkx 3.0.
        G = nx.from_numpy_array(matrix)
    else:
        G = nx.from_numpy_matrix(np.matrix(rows))
    return matrix, G


if __name__ == '__main__':
    # Command-line entry point: load the matrix, cluster it, report the
    # clusters and optionally draw the result.
    # Each print receives one pre-formatted string so the script produces
    # the same text under Python 2 and Python 3.

    options, filename = get_options()
    print_info(options)
    M, G = get_graph(filename)

    print(" number of nodes: %s\n" % M.shape[0])

    print("%s evaluating clusters..." % time.time())
    M, clusters = networkx_mcl(G, expand_factor = options.expand_factor,
                               inflate_factor = options.inflate_factor,
                               max_loop = options.max_loop,
                               mult_factor = options.mult_factor)
    print("%s done\n" % time.time())

    cluster_lines = ["%s %s" % (k, v) for k, v in clusters.items()]
    if options.output:
        # --output used to be accepted but the clusters were then written
        # nowhere; write them to the requested file.
        with open(options.output, "w") as out_file:
            out_file.write("".join(line + "\n" for line in cluster_lines))
    else:
        print("Clusters:")
        for line in cluster_lines:
            print(line)

    if options.draw:
        print("%s drawing..." % time.time())
        draw(G, M, clusters)
        print("%s done" % time.time())
Binary file added mcl/mcl_clustering.pyc
Binary file not shown.
35 changes: 35 additions & 0 deletions mcl/mcl_clustering_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import unittest
import numpy as np
from mcl_clustering import *
import logging

#TODO: improveme
class TestMcl(unittest.TestCase):
    """Unit tests for the matrix helpers used by the MCL loop."""

    def test_normalize(self):
        """normalize() divides each entry by its column sum."""
        A = np.ones((4, 4))
        A[2, 0] = 2
        A_n = normalize(A)
        # Column 0 sums to 5, so the entry becomes 2/5. Compare with a
        # tolerance instead of exact float equality.
        self.assertAlmostEqual(0.4, A_n[2, 0])
        # Every column must sum to 1 afterwards.
        for total in A_n.sum(axis=0):
            self.assertAlmostEqual(1.0, total)

    def test_inflate(self):
        """inflate() returns a normalized matrix, so the raw 2 shrinks."""
        A = np.ones((4, 4))
        A[3, 0] = 2
        A[1, 0] = 3
        A_i = inflate(A, 2)
        self.assertGreater(A[3, 0], A_i[3, 0])

    def test_expand(self):
        """expand() lowers the dominant entry of the first column."""
        A = np.ones((4, 4))
        A[3, 0] = 2
        A[2, 0] = 3
        A = normalize(A)
        A_i = expand(A, 2)
        self.assertGreater(A[2, 0], A_i[2, 0])


if __name__ == '__main__':
    unittest.main()

17 changes: 17 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/usr/bin/env python

# install_requires is a setuptools keyword; distutils.core.setup ignores it
# (with an "unknown distribution option" warning), so the declared
# dependencies would never be installed. Prefer setuptools and fall back to
# distutils only when it is unavailable.
try:
    from setuptools import setup
except ImportError:
    from distutils.core import setup

setup(
    name='MCL Markov Clustering',
    version='0.3',
    description='Markov Clustering algorithm for Graphs',
    scripts = [
        'mcl/mcl_clustering.py'
    ],
    author='koteth',
    install_requires = ['numpy', 'networkx'],
    keywords = "MCL markov clustering graph",
    author_email='[email protected]',
    packages=['mcl'],
)

0 comments on commit 1133abe

Please sign in to comment.