From 6ead2ac4026f4a26755163781755913adae5f63d Mon Sep 17 00:00:00 2001 From: Chris Mungall Date: Fri, 9 Jun 2017 20:22:56 -0700 Subject: [PATCH 1/3] first pass --- ontobio/ontol.py | 42 ++++++++++++++++++++++-- ontobio/sim.py | 69 ++++++++++++++++++++++++++++++++++++++++ tests/test_local_json.py | 5 +++ tests/test_sim.py | 34 ++++++++++++++++++++ 4 files changed, 148 insertions(+), 2 deletions(-) create mode 100644 ontobio/sim.py create mode 100644 tests/test_sim.py diff --git a/ontobio/ontol.py b/ontobio/ontol.py index b7cadcc9..a26e4e93 100644 --- a/ontobio/ontol.py +++ b/ontobio/ontol.py @@ -551,6 +551,44 @@ def logical_definitions(self, nid): else: return [] + def definition(self, nid): + """ + Text definition object for a node + + Arguments + --------- + nid : str + Node identifier for entity to be queried + + Return + ------ + dict + definition object, dict(val=TEXT, xrefs=LIST) + """ + return self._get_meta_prop(nid, 'definition') + + def definition_val(self, nid): + """ + Text definition string value for a node + + Arguments + --------- + nid : str + Node identifier for entity to be queried + + Return + ------ + str + text definition + """ + defn = self.definition(nid) + if defn is None: + return None + else: + return defn['val'] + + + def _get_meta_prop(self, nid, prop): n = self.node(nid) if 'meta' in n: @@ -698,7 +736,7 @@ def xrefs(self, nid, bidirectional=False): nid : str Node identifier for entity to be queried bidirection : bool - If True, include nodes xreffed to nid + If True, include nodes that xref nid Return ------ @@ -708,7 +746,7 @@ def xrefs(self, nid, bidirectional=False): xg = self.xref_graph if nid not in xg: return [] - if bidirectional: + elif bidirectional: return xg.neighbors(nid) else: return [x for x in xg.neighbors(nid) if xg[nid][x]['source'] == nid] diff --git a/ontobio/sim.py b/ontobio/sim.py new file mode 100644 index 00000000..e2be51bc --- /dev/null +++ b/ontobio/sim.py @@ -0,0 +1,69 @@ +import math + +class SimEngine(): + + 
def __init__(self, + association_set=None, + icmap=None): + self.association_set = association_set + self.icmap = icmap + + def _get_icmap(self): + if self.icmap is None: + icmap = {} + aset = self.association_set + num_subjs = len(aset.subjects) + for n in aset.ontology.nodes(): + num_anns = len(aset.query([n])) + freq = num_anns / num_subjs + ic = None + if freq > 0: + ic = -math.log(freq) / math.log(2) + icmap[n] = ic + self.icmap = icmap + return self.icmap + + def information_content(self,nid): + """ + Returns information content (-log2 of annotation frequency) for a node, or None if it has no annotations + """ + icmap = self._get_icmap() + return icmap[nid] + + def entity_jaccard_similarity(self,s1,s2): + """ + Calculate jaccard index of inferred associations of two subjects + + |ancs(s1) /\ ancs(s2)| + --- + |ancs(s1) \/ ancs(s2)| + + """ + a1 = self.association_set.inferred_types(s1) + a2 = self.association_set.inferred_types(s2) + num_union = len(a1.union(a2)) + if num_union == 0: + return 0.0 + return len(a1.intersection(a2)) / num_union + + def class_resnik_similarity(self,c1,c2): + """ + Calculate resnik similarity of two classes + + Return + ------ + (number,list) + tuple of max_ic and list of MRCAs + """ + cas = self.common_ancestors(c1,c2) + pairs = [(a, self.information_content(a)) for a in cas] + max_ic = 0 + mrcas = [] + for a,ic in pairs: + if ic > max_ic: + max_ic = ic + mrcas = [a] + elif ic == max_ic: + mrcas.append(a) + return max_ic, mrcas + diff --git a/tests/test_local_json.py b/tests/test_local_json.py index 0afa5796..13866455 100644 --- a/tests/test_local_json.py +++ b/tests/test_local_json.py @@ -112,6 +112,11 @@ def test_graph(): assert NIF_CELL in xrefs assert len(xrefs) == 2 + def_val = ont.definition_val(CELL) + assert def_val.startswith("The basic structural and functional unit of all organisms") + + defn = ont.definition(CELL) + assert defn['xrefs'] == [ "GOC:go_curators" ] # xrefs are bidirectional xrefs = ont.xrefs(WIKIPEDIA_CELL, bidirectional=True) diff --git a/tests/test_sim.py 
b/tests/test_sim.py new file mode 100644 index 00000000..791bb849 --- /dev/null +++ b/tests/test_sim.py @@ -0,0 +1,34 @@ +from ontobio.ontol_factory import OntologyFactory +from ontobio.assoc_factory import AssociationSetFactory +from ontobio.assocmodel import AssociationSet +from ontobio.io.gafparser import GafParser +from ontobio.sim import SimEngine +import logging +import random + + + +POMBASE = "tests/resources/truncated-pombase.gaf" +INTRACELLULAR='GO:0005622' +G1 = 'PomBase:SPBC902.04' +def test_sim(): + """ + Test pairwise jaccard similarity of subjects loaded from gaf + """ + ofactory = OntologyFactory() + afactory = AssociationSetFactory() + ont = ofactory.create('tests/resources/go-truncated-pombase.json') + aset = afactory.create_from_gaf(open(POMBASE,"r"), + ontology=ont) + + sim = SimEngine(aset) + for g1 in aset.subjects: + print("G1={} '{}'".format(g1, aset.label(g1))) + for g2 in aset.subjects: + print(" G2={} '{}'".format(g2, aset.label(g2))) + jsim = sim.entity_jaccard_similarity(g1,g2) + print(" SIM={}".format(jsim)) + + + + From 3caf027826bb425fa27a60e51654330b7cc74d82 Mon Sep 17 00:00:00 2001 From: Chris Mungall Date: Fri, 9 Jun 2017 21:32:05 -0700 Subject: [PATCH 2/3] first pass --- ontobio/sim.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/ontobio/sim.py b/ontobio/sim.py index e2be51bc..14577b31 100644 --- a/ontobio/sim.py +++ b/ontobio/sim.py @@ -45,6 +45,23 @@ def entity_jaccard_similarity(self,s1,s2): if num_union == 0: return 0.0 return len(a1.intersection(a2)) / num_union + + def class_jaccard_similarity(self,c1,c2): + """ + Calculate jaccard index of two classes + + |ancs(c1) /\ ancs(c2)| + --- + |ancs(c1) \/ ancs(c2)| + + """ + ont = self.association_set.ontology + a1 = ont.ancestors(c1,reflexive=True) + a2 = ont.ancestors(c2,reflexive=True) + num_union = len(a1.union(a2)) + if num_union == 0: + return 0.0 + return len(a1.intersection(a2)) / num_union def class_resnik_similarity(self,c1,c2): """ From b3291976c36b23c4ebedd7a492a431c0cbcc39a3 Mon Sep 
17 00:00:00 2001 From: Chris Mungall Date: Wed, 14 Jun 2017 19:34:07 -0400 Subject: [PATCH 3/3] dataframes --- ontobio/sim.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/ontobio/sim.py b/ontobio/sim.py index 14577b31..70cef000 100644 --- a/ontobio/sim.py +++ b/ontobio/sim.py @@ -1,4 +1,5 @@ import math +import pandas as pd class SimEngine(): @@ -84,3 +85,21 @@ def class_resnik_similarity(self,c1,c2): mrcas.append(a) return max_ic, mrcas + def used_classes(self): + aset = self.association_set + cset = set() + for s in aset.subjects: + cset.update(aset.inferred_types(s)) + return cset + + def dataframe(self): + aset = self.association_set + entries = [] + subjs = aset.subjects + for s in subjs: + vmap = {} + for c in aset.inferred_types(s): + vmap[c] = 1 + entries.append(vmap) + df = pd.DataFrame(entries, index=subjs) + return df