from __future__ import absolute_import, print_function, division
import itertools, multiprocessing, logging, os, collections, random, math, sys, time
from itertools import groupby, combinations
from operator import *
from collections import Counter
import tempfile
from subprocess import Popen, PIPE, STDOUT
import inspect
import shlex
import shutil
import io
from io import StringIO
import json
import datetime
import numpy as np
import pandas as pd
import pandas.io.pickle
import networkx as nx
import igraph
import scipy, scipy.sparse
from scipy.sparse import csr_matrix, coo_matrix
from scipy.stats import hypergeom
import ndex.client as nc
from ndex.networkn import NdexGraph
import ndex.beta.layouts as layouts
import ddot
import ddot.config
from ddot.utils import time_print, set_node_attributes_from_pandas, set_edge_attributes_from_pandas, nx_to_NdexGraph, NdexGraph_to_nx, parse_ndex_uuid, parse_ndex_server, make_index, update_nx_with_alignment, bubble_layout_nx, split_indices_chunk, invert_dict, make_network_public, nx_edges_to_pandas, nx_nodes_to_pandas, ig_edges_to_pandas, ig_nodes_to_pandas, melt_square, nx_set_tree_edges, gridify
def _collapse_node(g,
v,
edge_filter=None,
use_v_name=False,
combine_attrs=None,
default_attr=None,
verbose=True,
fast_collapse=False,
delete=True):
"""Collapses a node in a Graph (igraph package) while preserving
long-range hierarchical relations between descendants and
ancestral nodes.
"""
if use_v_name:
assert isinstance(v, str)
v = g.vs.find(name_eq=v).index
try:
g.vs[v]
except:
raise Exception("Can't find vertex %s in graph. Consider setting use_v_name=True" % v)
if fast_collapse:
parents = g.neighbors(v, mode='out')
children = g.neighbors(v, mode='in')
if len(parents) > 0 and len(children) > 0:
# A faster collapse that adds all new edges
# simultaneously. Ignores edge attributes
new_edges = [(c, p) for p in parents for c in children]
new_edges = [x for x, y in zip(new_edges, g.get_eids(new_edges, error=False)) if y == -1]
g.add_edges(new_edges)
else:
g.es['collapsed_length'] = 0
g.es['collapsed_terms'] = [[] for x in g.es]
in_edges = g.es[g.incident(v, mode='in')]
out_edges = g.es[g.incident(v, mode='out')]
if edge_filter is not None:
in_edges = [e for e in in_edges if edge_filter(e)]
out_edges = [e for e in out_edges if edge_filter(e)]
for e_in in in_edges:
for e_out in out_edges:
in_neigh, out_neigh = e_in.source, e_out.target
# Only add an edge if it doesn't already exist
if g[in_neigh, out_neigh] == 0:
g.add_edge(in_neigh, out_neigh)
e = g.es[g.get_eid(in_neigh, out_neigh)]
if combine_attrs is not None:
# Set default value of edge attributes to 0
for key in combine_attrs: e[key] = None
e = g.es[g.get_eid(in_neigh, out_neigh)]
# Update attributes
if combine_attrs is not None:
for key in combine_attrs:
e[key] = combine_attrs[key](e_in, e_out, e)
if verbose and key=='triangle_edge_priority':
print('Setting',
key,
g.vs[in_neigh]['name'],
g.vs[out_neigh]['name'],
'to',
combine_attrs[key](e_in, e_out, e),
(e_in[key], e_out[key]))
e['collapsed_length'] = e_in['collapsed_length'] + e_out['collapsed_length']
e['collapsed_terms'] = e_in['collapsed_terms'] + [g.vs[v]['name']] + e_out['collapsed_terms']
if delete:
g.delete_vertices(v)
return g
def read_alignment_file(f, source='Term_1'):
"""Parses an alignment file created from alignOntology's calculateFDRs script
Parameters
-----------
f : str
Filename of alignment file
source : str
Indicates which ontology will be the index of the
returned pandas.DataFrame. Value must be either 'Term_1' (first
ontology) or 'Term_2' (second ontology)
Returns
--------
: pandas.DataFrame
DataFrame with four columns: 'Term', 'Similarity', 'FDR', and 'Size'.
The index of the DataFrame are the names of terms in the "source" ontology.
"""
# Five columns in the input file
# 1) Term from first "computed" ontology
# 2) Term from second "reference" ontology
# 3) Similarity value
# 4) FDR
# 5) Size of the term in the first ontology
df = pd.read_table(f,
names=['Term_1', 'Term_2', 'Similarity', 'FDR', 'Size'],
dtype={'Term_1':str,
'Term_2':str,
'Similarity':np.float64,
'FDR':np.float64,
'Size':np.int64},
header=None)
target = 'Term_2' if source=='Term_1' else 'Term_1'
df.rename(columns={target : 'Term'}, inplace=True)
df.set_index(source, inplace=True)
df.index.rename('Term', inplace=True)
return df
def align_hierarchies(hier1,
hier2,
iterations,
threads,
update_hier1=False,
update_hier2=False,
calculateFDRs=None,
mutual_collapse=True,
output=None,
verbose=False):
if output is None:
with tempfile.NamedTemporaryFile('w', delete=True) as output_file:
return align_hierarchies(hier1, hier2, iterations, threads,
update_hier1=update_hier1, update_hier2=update_hier2,
mutual_collapse=mutual_collapse,
output=output_file.name,
calculateFDRs=calculateFDRs,
verbose=verbose)
common_genes = set(hier1.genes) & set(hier2.genes)
hier1_orig, hier2_orig = hier1, hier2
if len(common_genes) > 0:
if mutual_collapse:
hier1, hier2 = Ontology.mutual_collapse(hier1, hier2, verbose=verbose)
hier1.clear_node_attr()
hier1.clear_edge_attr()
hier2.clear_node_attr()
hier2.clear_edge_attr()
hier1.propagate('reverse', inplace=True)
hier2.propagate('reverse', inplace=True)
def to_file(hier):
if isinstance(hier, Ontology):
with tempfile.NamedTemporaryFile('w', delete=False) as f:
hier.to_table(f, clixo_format=True)
hier = f.name
else:
assert isinstance(hier, file) or os.path.exists(hier)
return hier
hier1 = to_file(hier1)
hier2 = to_file(hier2)
if calculateFDRs is None:
top_level = os.path.dirname(os.path.abspath(inspect.getfile(ddot)))
calculateFDRs = os.path.join(top_level, 'alignOntology', 'calculateFDRs')
#assert os.path.isdir(ddot.config.alignOntology)
#calculateFDRs = os.path.join(ddot.config.alignOntology, 'calculateFDRs')
assert os.path.isfile(calculateFDRs)
if threads is None:
import multiprocessing
threads = multiprocessing.cpu_count()
output_dir = tempfile.mkdtemp(prefix='tmp')
cmd = '{5} {0} {1} 0.05 criss_cross {2} {3} {4} gene'.format(
hier1, hier2, output_dir, iterations, threads, calculateFDRs)
print('Alignment command:', cmd)
p = Popen(shlex.split(cmd), shell=False)
try:
p.wait()
shutil.copy(os.path.join(output_dir, 'alignments_FDR_0.1_t_0.1'), output)
finally:
if os.path.isdir(output_dir):
shutil.rmtree(output_dir)
if p.poll() is None:
if verbose: time_print('Killing alignment process %s. Output: %s' % (p.pid, output))
p.kill() # Kill the process
align1 = read_alignment_file(output)[['Term', 'Similarity', 'FDR']]
else:
align1 = pd.DataFrame(columns=['Term', 'Similarity', 'FDR'])
align2 = align1.copy()
align2.index, align2['Term'] = align2['Term'].values.copy(), align2.index.values.copy()
append_prefix = lambda x: 'Aligned_%s' % x
if update_hier1:
if hasattr(update_hier1, '__iter__'):
node_attr = hier2_orig.node_attr[update_hier1]
else:
node_attr = hier2_orig.node_attr
hier2_import = pd.merge(pd.DataFrame(index=align2.index), node_attr, left_index=True, right_index=True, how='left')
assert (hier2_import.index == align2.index).all()
# Change index to terms in hier1
hier2_import.index = align2['Term'].copy()
hier2_import.rename(columns=append_prefix, inplace=True)
if update_hier2:
if hasattr(update_hier2, '__iter__'):
node_attr = hier1_orig.node_attr[update_hier2]
else:
node_attr = hier1_orig.node_attr
hier1_import = pd.merge(pd.DataFrame(index=align1.index), node_attr, left_index=True, right_index=True, how='left')
assert (hier1_import.index == align1.index).all()
# Change index to terms in hier2
hier1_import.index = align1['Term'].copy()
hier1_import.rename(columns=append_prefix, inplace=True)
if update_hier1:
hier1_orig.update_node_attr(align1.rename(columns=append_prefix))
hier1_orig.update_node_attr(hier2_import)
if update_hier2:
hier2_orig.update_node_attr(align2.rename(columns=append_prefix))
hier2_orig.update_node_attr(hier1_import)
return align1
def parse_obo(obo,
output_file=None,
id2name_file=None,
id2namespace_file=None,
alt_id_file=None):
"""Parses an OBO file and writes the results into several tables.
Parameters
----------
obo : str
Filename of OBO file
output_file : str
Filename to write table that describes the ontology's
hierarchical structure. The table has four columns: (1) parent
term, (2) child term, (3) relation type (e.g. "is_a" or
"part_of"), (4) namespace of relation
(e.g. "biological_process" or "cellular component")
id2name_file : str
Filename to write table of term descriptions. The table has
two columns: (1) Ontology term (e.g. "GO:0000030"), (2)
description (e.g. "mannosyltransferase activity")
id2namespace_file : str
Filename to write table of term namespaces. The table has two
columns: (1) Ontology term (e.g. "GO:0000030"), (2) namespace
of the term (e.g. "biological_process")
alt_id_file : str
Filename to write table of alternative Term IDs that are
synonyms and refer to the same term. The table has two
columns: (1) Primary Term ID, (2) Alternative Term ID
"""
## Keywords that screw up parsing:
# import, is_anonymous, intersection_of, union_of
## Relations
# 'is_a:'
# 'relationship: has_part' # Not in filtered GO
# 'relationship: occurs_in' # Not in filtered GO
# 'relationship: part_of'
# 'relationship: positively_regulates'
# 'relationship: negatively_regulates'
# 'relationship: regulates'
# 'relationship: results_in' # Not in filtered GO
stanza, edges = [], []
id2name = dict()
id2namespace = dict()
alt_id = dict()
in_term_stanza = False
default_namespace_exists = False
for line in io.open(obo).read().splitlines():
line = line.split('!')[0].strip() # Remove comments
if len(line)>0 and line[0]=='[' and line[-1]==']':
# Add last stanza if it was a term stanza. Include namespace.
if in_term_stanza:
edges.extend(x+(namespace, ) for x in stanza)
# Start new term stanza
stanza = []
# Set the default namespace, if it exists
if default_namespace_exists:
namespace = default_namespace
# In a term stanzo or not
in_term_stanza = line =='[Term]'
name = None
#if 'alt_id:' in line: assert False
if 'id:' == line[:3]:
curr_term = line.split('id:')[1].strip()
elif 'alt_id:' in line:
alt_term = line.split('alt_id:')[1].strip()
if curr_term in alt_id: alt_id[curr_term].append(alt_term)
else: alt_id[curr_term] = [alt_term]
id2name[alt_term] = name
elif 'name:' in line:
name = line.split('name:')[1].strip()
assert not curr_term in id2name
id2name[curr_term] = name
elif 'is_a:' in line:
parent = line.split('is_a:')[1].strip()
stanza.append((parent, curr_term, 'is_a'))
elif 'relationship:' in line:
line = line.split('relationship:')[1].strip().split()
if len(line)!=2: print(line)
assert len(line)==2
relation, parent = line
stanza.append((parent, curr_term, relation))
elif 'namespace:' == line[:10]:
namespace = line.split('namespace:')[1].strip()
assert not curr_term in id2namespace
id2namespace[curr_term] = namespace
elif 'default-namespace:' == line[:18]:
namespace = line.split('default-namespace:')[1].strip()
default_namespace_exists = True
default_namespace = namespace
pd.DataFrame(edges).to_csv(output_file, header=False, index=False, sep='\t')
pd.Series(id2name).to_csv(id2name_file, sep='\t')
pd.Series(id2namespace).to_csv(id2namespace_file, sep='\t')
pd.Series(dict([(a, c) for a, b in alt_id.items() for c in b])).to_csv(alt_id_file, sep='\t')
def parse_gaf(gaf):
"""
Read gene-term annotations from GAF file format:
http://geneontology.org/page/go-annotation-file-gaf-format-21
Parameters
----------
gaf : str
Filename of GAF file
Returns
--------
A list of 2-tuples (gene, GO term)
"""
gaf_columns = ['DB', 'DB Object ID', 'DB Object Symbol',
'Qualifier', 'GO ID', 'DB:Reference',
'Evidence Code', 'With (or) From', 'Aspect',
'DB Object Name', 'DB Object Synonym',
'DB Object Type', 'Taxon', 'Date',
'Assigned By', 'Annotation Extension',
'Gene Product Form ID']
df = pd.read_table(gaf, header=None, comment='!', names=gaf_columns)
# Check that all annotations are to UniProtKB protein IDs
# assert df['DB'].unique().size == 1 and df['DB'].unique()[0]=='UniProtKB'
# Remove annotations that have a NOT qualifier
df = df.loc[df['Qualifier']!='NOT', :]
# return df.loc[:, ['DB Object ID', 'GO ID']].values.tolist()
return df
[docs]class Ontology(object):
"""A Python representation for constructing, analyzing, and
manipulating the hierarchical structure of ontologies.
An Ontology object contains the following attributes for
representing the hierarchical structure. Do not directly modify
these attributes.
Parameters
----------
genes : list
Names of genes
terms : list
Names of terms
gene_2_term : dict
gene_2_term[<gene>] --> list of terms connected to
<gene>. Terms are represented as their 0-based index in
self.terms.
term_2_gene : dict
term_2_gene[<term>] --> list of genes connected to
<term>. Genes are represented as their 0-based index in
self.genes.
child_2_parent : dict
child_2_parent[<child>] --> list of the parent terms of <child>
parent_2_child : dict
parent_2_child[<parent>] --> list of the children terms of <parent>
term_sizes : list
A list of every term's size, i.e. the number of unique genes
that it and its descendant terms contain. This list has the
same order as self.terms. It holds that for every i,
`term_sizes[i] = len(self.term_2_gene[self.terms[i]])`
"""
NODETYPE_ATTR = 'NodeType'
GENE_NODETYPE = 'Gene'
TERM_NODETYPE = 'Term'
EDGETYPE_ATTR = 'EdgeType'
GENE_TERM_EDGETYPE = 'Gene-Term'
CHILD_PARENT_EDGETYPE = 'Child-Parent'
def __init__(self,
hierarchy,
mapping,
edge_attr=None,
node_attr=None,
parent_child=False,
add_root_name=None,
propagate=None,
ignore_orphan_terms=False,
verbose=True,
**kwargs):
"""Construct an Ontology object.
Parameters
----------
hierarchy : list, tuple
Iterable of (child term, parent term). E.g. list of 2-tuples
mapping : list, tuple
Iterable of (gene, term) pairs. E.g. list of 2-tuples
edge_attr : pandas.DataFrame
Meta-data describing (child_term, parent_term)
pairs. Suggestion: The index of the DataFrame must be a
pandas.MultiIndex, where the first level is the child term
and the second level is the parent term.
parent_child : bool
If True, then the definitions of <hierarchy> and <mapping>
are reversed so that they iterate over (parent term, child
term) and (term, gene) pairs.
propagate : None, str
The direction ('forward' or 'reverse') to propagate
gene-term annotations up the hierarchy with
Ontology.propagate(). If None, then don't
propagate annotations.
add_root_name : bool
The name of an artificial root. If there are multiple
roots in the ontology, then they are joined into one root
with this name. Default: Don't create this root.
ignore_orphan_terms : bool
"""
if 'empty' in kwargs and kwargs['empty'] is True:
return
if parent_child:
hierarchy = [(x[1],x[0]) for x in hierarchy]
mapping = [(x[1],x[0]) for x in mapping]
# Cast all node names to strings
hierarchy = [(str(x[0]),str(x[1])) for x in hierarchy]
mapping = [(str(x[0]),str(x[1])) for x in mapping]
## Read term-to-term edges
# parent_2_child[<term_name>] --> list of <term_name>'s children terms
self.parent_2_child = {r: [p[0] for p in q] for r, q in \
itertools.groupby(sorted(hierarchy,
key=lambda a:a[1]),
key=lambda a:a[1])}
## Read gene-to-term edges
# self.gene_2_term[<gene_name>] --> list of terms that <gene_name> is mapped to
self.gene_2_term = {key: set([a[1] for a in group]) for key, group in \
itertools.groupby(sorted(mapping,
key=lambda a:a[0]),
key=lambda a:a[0])}
## Check that the set of terms is the same according to
## parent_2_child and self.gene_2_term
terms_A = set.union(set(self.parent_2_child.keys()),
*[set(x) for x in self.parent_2_child.values()])
if len(self.gene_2_term) > 0:
terms_B = set.union(*self.gene_2_term.values())
else:
terms_B = set([])
if verbose and ignore_orphan_terms and len(terms_B - terms_A)>0:
print('WARNING: Ignoring {} terms are connected to genes but not to other terms'.format(len(terms_B - terms_A)))
# if verbose and len(terms_A - terms_B)>0:
# print 'WARNING: {} terms connected to other terms but not to genes'.format(len(terms_A - terms_B))
if ignore_orphan_terms:
self.terms = sorted(terms_A)
else:
self.terms = sorted(terms_A | terms_B)
self.genes = sorted(self.gene_2_term.keys())
if add_root_name is not None:
root_list = self.get_roots()
if len(root_list) > 1:
print('Unifying %s roots into one super-root' % len(root_list))
self.parent_2_child[add_root_name] = root_list
self.terms.append(add_root_name)
## terms_index[<term_name>] --> index in self.terms
self.terms_index = make_index(self.terms)
## self.genes_index[<gene_name>] --> index in self.genes
self.genes_index = make_index(self.genes)
## Convert self.gene_2_term to list term indices rather than term names
for k, v in self.gene_2_term.items():
self.gene_2_term[k] = [self.terms_index[x] for x in self.gene_2_term[k] if x in self.terms_index]
if node_attr is None:
self.clear_node_attr()
else:
assert node_attr.index.nlevels == 1
if node_attr.index.name != 'Node':
# if verbose:
# print("Changing node_attr index name from %s to 'Node'" % node_attr.index.name)
# # import traceback
# # print traceback.print_stack()
node_attr.index.name = 'Node'
self.node_attr = node_attr
if edge_attr is None:
self.clear_edge_attr()
else:
assert edge_attr.index.nlevels == 2
edge_attr.index.names = ['Child', 'Parent']
# if 'Child' in edge_attr.index.names and 'Parent' in edge_attr.index.names:
# edge_attr.index = edge_attr.index[['Child', 'Parent']]
# else:
# edge_attr.index.names = ['Child', 'Parent']
# if edge_attr.index.names != ['Child', 'Parent']:
# if verbose:
# print("Changing edge_attr index names from %s to ['Child', 'Parent']" % edge_attr.index.names)
# edge_attr.index.names = ['Child', 'Parent']
self.edge_attr = edge_attr
self._update_fields()
if propagate:
self.propagate(direction=propagate, inplace=True)
self._update_fields()
self._check_valid()
# ## Not necessary and requires extra start-up time (perhaps set as a __init__ parameter to precalculate many things)
# empty_terms = sum([x==0 for x in self.term_sizes])
# if verbose and empty_terms > 0:
# print 'WARNING: {} terms are connected to other terms but not to genes'.format(empty_terms), [t for t, x in zip(self.terms, self.term_sizes) if x==0][:5]
# # import traceback
# # print traceback.print_stack()
def _update_fields(self, reset_term_sizes=True):
self.child_2_parent = self._get_child_2_parent()
self.term_2_gene = self._get_term_2_gene()
if reset_term_sizes:
self._term_sizes = None
for t in self.terms:
if t not in self.parent_2_child:
self.parent_2_child[t] = []
if t not in self.child_2_parent:
self.child_2_parent[t] = []
def add_root(self, root_name, inplace=False):
"""Check if there is a single unifying root term of the ontology. If
not, then identify the multiple roots and join them under an
artificial root."""
if inplace:
ont = self
else:
ont = self.copy()
assert root_name not in ont.terms
root_list = ont.get_roots()
if len(root_list) > 1:
print('Unifying %s roots into one super-root' % len(root_list))
ont.parent_2_child[root_name] = root_list
ont.terms.append(root_name)
ont.terms_index = make_index(sorted(ont.terms))
for g, t_list in ont.gene_2_term.items():
ont.gene_2_term[g] = [ont.terms_index[ont.terms[t]] for t in t_list]
ont.terms.sort()
ont._update_fields()
return ont
def _get_child_2_parent(self):
"""
Converts self.parent_2_child to child_2_parent
# child_2_parent[<term_name>] --> list of <term_name>'s parent term names
"""
cp_pairs = []
for p, c_list in self.parent_2_child.items():
for c in c_list:
cp_pairs.append((c,p))
first = lambda a: a[0]
cp_pairs.sort(key=first)
child_2_parent = {
r: [p[1] for p in q] for r, q in
itertools.groupby(cp_pairs, key=first)
}
for t in self.terms:
if t not in child_2_parent:
child_2_parent[t] = []
return child_2_parent
def clear_node_attr(self):
"""Resets the node attributes to be empty."""
self.node_attr = pd.DataFrame()
self.node_attr.index.name = 'Node'
def clear_edge_attr(self):
"""Resets the edge attributes to be empty."""
self.edge_attr = pd.DataFrame()
self.edge_attr.index = pd.MultiIndex(levels=[[],[]],
codes=[[],[]],
names=['Child', 'Parent'])
def update_node_attr(self, node_attr):
"""Update existing node attributes or add new node attributes.
Parameters
----------
node_attr : pandas.DataFrame
Dataframe where index are the names of genes or terms and
where the columns are the names of node attributes.
"""
####
# TODO : make sure that renaming/deleting/collapsing of genes and columns respect the node_attr and edge_attr
# Filter for genes and terms in the ontology
nodes = set(self.genes) | set(self.terms)
node_attr = node_attr.loc[[x for x in node_attr.index if x in nodes], :]
assert node_attr.index.duplicated().sum() == 0
# Update index to the union of current and new node_attr
self.node_attr = self.node_attr.reindex(self.node_attr.index.union(node_attr.index))
# Update columns
for col in node_attr.columns:
self.node_attr.loc[node_attr.index, col] = node_attr[col]
def update_edge_attr(self, edge_attr):
"""Update existing edge attributes or add new edge attributes.
Parameters
----------
edge_attr : pandas.DataFrame
Dataframe where the index is a MultiIndex represents edges
in the Ontology, such that the first level is the name of
a gene or child term, and the second level is the name of
a parent term. Columns are the names of edge attributes.
"""
# Filter for genes and terms in the ontology
edges = []
for child, parent_list in self.child_2_parent.items():
for parent in parent_list:
edges.append((child, parent))
for gene, term_list in self.gene_2_term.items():
for term in term_list:
edges.append((gene, self.terms[term]))
edges = set(edges)
edge_attr = edge_attr.loc[[x for x in edge_attr.index if x in edges], :]
assert edge_attr.index.duplicated().sum() == 0
# Update index
self.edge_attr = self.edge_attr.reindex(self.edge_attr.index.union(edge_attr.index))
# Update values for overlapping columns
for col in edge_attr.columns:
self.edge_attr.loc[edge_attr.index, col] = edge_attr[col].values
def get_roots(self):
"""Returns a list of the root term(s).
Returns
-------
: list
"""
tmp = set(self.terms) - set([y for x in self.parent_2_child.values() for y in x])
return sorted(tmp)
def _make_dummy(self, tree_edges=None):
"""For each term T in the ontology, create a new dummy term that
indirectly connect T's to T. For example, if g1 and g2 are in
T, then a new term dummy_T is created so that the new ontology
consists of
g1 --> T_dummy
g2 --> T_dummy
T_dummy --> T
Parameters
----------
tree_edges : list
List of (child, parent) edges that constitute a spanning
tree of the ontology. If specified, then for each term T,
only the genes that are connected to T in the spanning
tree will be re-routed to the dummy node.
Default: None. This restriction will not apply
Returns
-------
: ddot.Ontology.Ontology
"""
ont = self
new_gene_2_term = []
new_child_2_parent = []
for t in ont.terms:
used_dummy = False
if len(ont.parent_2_child[t]) > 0:
dummy_term = 'dummy2_%s' % t
else:
dummy_term = t
for g in [ont.genes[g] for g in ont.term_2_gene[t]]:
if (tree_edges is None) or (g,t) in tree_edges:
new_gene_2_term.append((g, dummy_term))
used_dummy=True
if used_dummy and dummy_term != t:
new_child_2_parent.append([dummy_term, t])
for p in ont.child_2_parent[t]:
if (tree_edges is None) or (t,p) in tree_edges:
new_child_2_parent.append((t, p))
ont_dummy = Ontology(new_child_2_parent, new_gene_2_term)
return ont_dummy
def _collect_transform(self,
tree_edges=None,
hidden_gene=True,
hidden_parent=True,
hidden_child=True):
"""
Creates intermediate duplicate nodes
"""
ont = self
if tree_edges is None:
tree_edges = self.get_tree()
nodes_copy = {v : 1 for v in ont.genes + ont.terms}
def get_copy(u):
u_name = '%s.%s' % (u, nodes_copy[u])
nodes_copy[u] += 1
return u_name
collect_nodes = []
new_gene_2_term = []
new_child_2_parent = []
for t in ont.terms:
## Gene-term connections
collect_hidden_gene = 'collect_hidden_gene_%s' % t
used_hidden_gene = False
for g in [ont.genes[g] for g in ont.term_2_gene[t]]:
if (not hidden_gene) or ((g, t) in tree_edges):
new_gene_2_term.append((g, collect_hidden_gene))
used_hidden_gene = True
else:
new_gene_2_term.append((get_copy(g), collect_hidden_gene))
used_hidden_gene = True
if used_hidden_gene:
collect_nodes.append(collect_hidden_gene)
new_child_2_parent.append((collect_hidden_gene, t))
## Parent-child term connections
collect_hidden_child = 'collect_hidden_child_%s' % t
collect_hidden_parent = 'collect_hidden_parent_%s' % t
used_hidden_child, used_hidden_parent = False, False
for c in ont.parent_2_child[t]:
if (not hidden_child) or ((c,t) in tree_edges):
new_child_2_parent.append((c,t))
else:
new_child_2_parent.append((get_copy(c), collect_hidden_child))
used_hidden_child = True
for p in ont.child_2_parent[t]:
if hidden_parent and ((t,p) not in tree_edges):
new_child_2_parent.append((get_copy(p), collect_hidden_parent))
used_hidden_parent = True
if used_hidden_child:
collect_nodes.append(collect_hidden_child)
new_child_2_parent.append((collect_hidden_child, t))
if used_hidden_parent:
collect_nodes.append(collect_hidden_parent)
new_child_2_parent.append((collect_hidden_parent, t))
ont_collect = Ontology(new_child_2_parent,
new_gene_2_term,
node_attr=ont.node_attr.copy(),
edge_attr=ont.edge_attr.copy(),
verbose=False)
##################################################
# Set Original_Name and Size for Duplicate Nodes #
new_and_orig = [('%s.%s' %(v,i), v) for v, copy_num in nodes_copy.items()
for i in (range(1, copy_num) if copy_num>1 else [])]
new_2_orig = dict(new_and_orig)
df = pd.DataFrame({'orig_tmp' : [x[1] for x in new_and_orig],
'Hidden' : True},
index=[x[0] for x in new_and_orig])
df = df.astype({'orig_tmp' : np.str, 'Hidden' : np.bool})
# For duplicate nodes, set the Original_Name attribute to the name of the original node
merge = pd.merge(df, ont.node_attr, how='left', left_on=['orig_tmp'], right_index=True)
if 'Original_Name' in merge:
unset = pd.isnull(merge['Original_Name']).values
merge.loc[unset, 'Original_Name'] = df.loc[unset, 'orig_tmp'].values
else:
merge['Original_Name'] = df['orig_tmp'].values
del merge['orig_tmp']
# Set the 'Size' attribute of duplicate nodes to be the 'Size'
# of the original node. If the original node is a term with no
# 'Size' attribute, then set 'Size' to be the number of genes
# in the term
in_merge = set(merge.index)
for node in merge.index:
if node in new_2_orig:
orig = new_2_orig[node]
if orig in in_merge and not pd.isnull(merge.loc[orig, 'Size']):
merge.loc[node, 'Size'] = merge.loc[new_2_orig[node], 'Size']
elif orig in ont.terms_index:
merge.loc[node, 'Size'] = ont.term_sizes[ont.terms_index[orig]]
# Append attributes for the new nodes
try:
# Used for pandas version >= 0.23
ont_collect.node_attr = pd.concat([ont.node_attr, merge], axis=0, sort=True)
except:
ont_collect.node_attr = pd.concat([ont.node_attr, merge], axis=0)
########################################
# Set Label and Size for collect nodes #
########################################
def get_label(x):
if 'hidden_child' in x:
return 'Linked Children'
elif 'hidden_parent' in x:
return 'Linked Parents'
elif 'hidden_gene' in x:
return 'Linked Genes'
elif 'tree_gene' in x:
return 'Genes'
collect_attr = pd.DataFrame(
{'Size' : 1,
'Label' : [get_label(x) for x in collect_nodes],
'is_collect_node' : True},
index=collect_nodes)
ont_collect.update_node_attr(collect_attr)
return ont_collect
[docs] def unfold(self,
duplicate=None,
genes_only=False,
levels=None,
tree_edges=None):
"""Traverses the ontology from the root to the leaves while
duplicating nodes during the traversal to create a tree representation.
Traverse the ontology from the root nodes to the leaves in a
breadth-first manner. Each time a node is traversed, then
create a duplicate of it
Parameters
----------
duplicate : list
Nodes to duplicate for unfolding. Default: all genes and terms
genes_only : bool
If True, then duplicate all of the genes and none of the terms. Default: False
levels :
"""
ont = self.propagate(direction='reverse', inplace=False)
hidden_mode = levels is not None
if hidden_mode:
if tree_edges is None:
tree_edges = self.get_tree()
hidden_depth = {}
if genes_only:
duplicate = ont.genes
elif duplicate is None:
duplicate = ont.genes + ont.terms
nodes_copy = {x : 0 for x in duplicate}
def get_name(u):
if u in nodes_copy:
u_name = '%s.%s' % (u, nodes_copy[u])
nodes_copy[u] += 1
else:
u_name = u
return u_name
to_expand = []
new_2_orig = {}
for u in ont.get_roots():
u_name = get_name(u)
new_2_orig[u_name] = u
to_expand.append(u_name)
if hidden_mode:
hidden_depth[u_name] = 0
expanded = set(to_expand)
hierarchy, mapping = [], []
# Manual bfs
curr = 0
while curr < len(to_expand):
v_name = to_expand[curr]
v = new_2_orig[v_name]
for u in [ont.genes[u] for u in ont.term_2_gene[v]]:
u_name = get_name(u)
new_2_orig[u_name] = u
mapping.append((u_name, v_name))
if hidden_mode:
v_depth = hidden_depth[v_name]
if v_depth==0:
if (u,v) in tree_edges:
hidden_depth[u_name] = 0
else:
hidden_depth[u_name] = 1
elif v_depth < levels:
hidden_depth[u_name] = v_depth + 1
for u in ont.parent_2_child[v]:
u_name = get_name(u)
new_2_orig[u_name] = u
hierarchy.append((u_name, v_name))
if hidden_mode:
v_depth = hidden_depth[v_name]
insert = u_name not in expanded
if v_depth==0 and ((u,v) in tree_edges):
hidden_depth[u_name] = 0
elif v_depth < levels:
hidden_depth[u_name] = v_depth + 1
else:
insert = False
else:
insert = u_name not in expanded
if insert:
to_expand.append(u_name)
expanded.add(u_name)
curr += 1
new_nodes, orig_nodes = zip(*new_2_orig.items())
new_nodes, orig_nodes = list(new_nodes), list(orig_nodes)
ont.node_attr = ont.node_attr.reindex(list(set(orig_nodes)))
node_attr = ont.node_attr.loc[orig_nodes, :].copy()
if 'Original_Name' in node_attr:
unset = pd.isnull(node_attr['Original_Name']).values
node_attr.loc[unset, 'Original_Name'] = np.array(orig_nodes)[unset]
else:
node_attr['Original_Name'] = orig_nodes
if hidden_mode:
node_attr['Level'] = [hidden_depth[v] for v in new_nodes]
node_attr.index = new_nodes
node_attr.dropna(axis=0, how='all', inplace=True)
new_edges = hierarchy + mapping
old_edges = [(new_2_orig[u], new_2_orig[v]) for u, v in new_edges]
in_index = [x in ont.edge_attr.index for x in old_edges]
if sum(in_index) > 0:
edge_attr = ont.edge_attr.loc[[x for x, y in zip(old_edges, in_index) if y], :].copy()
edge_attr.index = pd.MultiIndex.from_tuples([x for x, y in zip(new_edges, in_index) if y])
edge_attr.dropna(axis=0, how='all', inplace=True)
else:
edge_attr = None
ont = Ontology(hierarchy,
mapping,
edge_attr=edge_attr,
node_attr=node_attr,
parent_child=False,
verbose=False)
return ont
def _to_networkx_no_layout(self):
G = nx.DiGraph()
#################################
### Add nodes and node attributes
G.add_nodes_from(self.genes + self.terms)
set_node_attributes_from_pandas(G, self.node_attr)
# Ensure that all 'Size' values are the same numeric type
if 'Size' in self.node_attr.columns:
dtype = self.node_attr['Size'].dtype
if dtype in [np.dtype('float16'), np.dtype('float32'), np.dtype('float64')]:
dtype = float
else:
dtype = int
else:
dtype = int
for t in self.terms:
G.node[t][self.NODETYPE_ATTR] = self.TERM_NODETYPE
if ('Size' not in G.node[t]) or pd.isnull(G.node[t]['Size']):
G.node[t]['Size'] = dtype(self.term_sizes[self.terms_index[t]])
G.node[t]['isRoot'] = False
for g in self.genes:
G.node[g][self.NODETYPE_ATTR] = self.GENE_NODETYPE
if ('Size' not in G.node[g]) or pd.isnull(G.node[g]['Size']):
G.node[g]['Size'] = dtype(1)
G.node[g]['isRoot'] = False
# Identify the root
root = self.get_roots()[0]
G.node[root]['isRoot'] = True
# Set the node attribute 'Label'. If the node has a "Original
# Name" attribute, indicating that it is a duplicate, then use
# that. Otherwise, use the node's name.
for x in self.genes + self.terms:
data = G.node[x]
if ('Label' not in data) or pd.isnull(data['Label']):
if ('Original_Name' in data) and (not pd.isnull(data['Original_Name'])):
data['Label'] = data['Original_Name']
else:
data['Label'] = x
#################################
### Add edges and edge attributes
G.add_edges_from([(g, self.terms[t],
{self.EDGETYPE_ATTR : self.GENE_TERM_EDGETYPE}) \
for g in self.genes for t in self.gene_2_term[g]])
G.add_edges_from([(c, p,
{self.EDGETYPE_ATTR : self.CHILD_PARENT_EDGETYPE}) \
for p in self.terms for c in self.parent_2_child.get(p, [])])
set_edge_attributes_from_pandas(G, self.edge_attr)
return G
def expand(self, spanning_tree=True):
if spanning_tree is True:
ont = self._collect_transform()
else:
# Assume a list of tree edges are supplied
ont = self._collect_transform(spanning_tree)
G_tree = ont.get_tree(ret='ontology')._to_networkx_no_layout()
pos = bubble_layout_nx(G_tree)
tmp = ont.node_attr[['Label', 'is_collect_node']].dropna()
collect_nodes = tmp[tmp['is_collect_node']].index
gridify(collect_nodes, pos, G_tree)
## Remove collector nodes
def decide_delete(v):
return ((not layout_params['hidden_parent'] and v=='Linked Parents') or
(not layout_params['hidden_child'] and v=='Linked Children') or
(not layout_params['hidden_gene'] and v=='Linked Genes'))
tmp = ont.node_attr[['Label', 'is_collect_node']].dropna()
tmp = tmp[tmp['is_collect_node']]
tmp = tmp[tmp['Label'].apply(decide_delete)]
to_delete = tmp.index.tolist()
ont_red = ont
if len(to_delete) > 0:
# Need fast special delete
ont_red = ont_red.delete(to_delete=to_delete, preserve_transitivity=True)
# Set the original term sizes for the original copy of
# each term (not the duplicates)
ont_red.update_node_attr(pd.DataFrame({'Size' : self.term_sizes}, index=self.terms))
G = ont_red._to_networkx_no_layout()
nodes_set = set(G.nodes())
G.pos = {n : (float(scale*p[0]), float(scale*p[1])) for n, p in pos.items() if n in nodes_set}
nx_set_tree_edges(G, ont_red.get_tree())
######################################################
# TODO: move this visual styling outside of the layout
# functionality
nx.set_edge_attributes(G, values='ARROW', name='Vis:EDGE_SOURCE_ARROW_SHAPE')
nx.set_edge_attributes(G, values='NONE', name='Vis:EDGE_TARGET_ARROW_SHAPE')
for v, data in G.nodes(data=True):
# if 'collect_hidden' in v and 'is_collect_node' in data and data['is_collect_node']:
# for u in G.predecessors(v):
# G.node[u]['Vis:Fill Color'] = '#3182BD'
try:
if 'collect_hidden_parent' in v and 'is_collect_node' in data and data['is_collect_node']:
for _, _, data in G.in_edges(v, data=True):
data["Vis:EDGE_TARGET_ARROW_SHAPE"] = 'ARROW'
data["Vis:EDGE_SOURCE_ARROW_SHAPE"] = 'NONE'
except:
print(data)
print('v', v)
print('collect_hidden_parent' in v)
print('is_collect_node' in data)
print(data['is_collect_node'])
raise
[docs] def to_networkx(self,
layout='bubble',
spanning_tree=True,
layout_params=None,
verbose=False):
"""Converts Ontology into a NetworkX object.
Parameters
----------
node_attr : pandas.DataFrame
Meta-data about genes and terms that will be included as node
attributes in the NetworkX object.
edge_attr : pandas.DataFrame
Meta-data about connections among genes and terms that
will be included as edge attributes in the NetworkX
object.
spanning_tree : bool
If True, then identify a spanning tree of the DAG. include
an edge attribute "Is_Tree_Edge" that indicates
layout : str
The name of the layout algorithm for laying out the
Ontology as a graph. Node positions are astored in the
node attributes 'x_pos' and 'y_pos'. If None, then do not
perform a layout.
Returns
-------
: nx.DiGraph
"""
default_layout_params = {'hidden_parent' : True,
'hidden_child' : False,
'hidden_gene' : False}
if layout_params is not None:
default_layout_params.update(layout_params)
layout_params = default_layout_params
if spanning_tree:
scale = 1
if layout is None or layout=='bubble':
G = self._to_networkx_no_layout()
if spanning_tree is True:
tree_edges = self.get_tree()
else:
tree_edges = spanning_tree
nx_set_tree_edges(G, tree_edges)
if layout=='bubble':
G_tree = self.propagate('reverse')._make_dummy(tree_edges)._to_networkx_no_layout()
pos = bubble_layout_nx(G_tree, verbose=verbose)
gridify([v for v in G_tree.nodes() if 'dummy2' in v], pos, G_tree)
G.pos = {n : (float(scale*p[0]), float(scale*p[1])) for n, p in pos.items() if 'dummy2' not in n}
elif layout=='bubble-collect':
if spanning_tree is True:
ont = self._collect_transform()
else:
# Assume a list of tree edges are supplied
ont = self._collect_transform(spanning_tree)
G_tree = ont.get_tree(ret='ontology')._to_networkx_no_layout()
pos = bubble_layout_nx(G_tree)
tmp = ont.node_attr[['Label', 'is_collect_node']].dropna()
collect_nodes = tmp[tmp['is_collect_node']].index
gridify(collect_nodes, pos, G_tree)
## Remove collector nodes
def decide_delete(v):
return ((not layout_params['hidden_parent'] and v=='Linked Parents') or
(not layout_params['hidden_child'] and v=='Linked Children') or
(not layout_params['hidden_gene'] and v=='Linked Genes'))
tmp = ont.node_attr[['Label', 'is_collect_node']].dropna()
tmp = tmp[tmp['is_collect_node']]
tmp = tmp[tmp['Label'].apply(decide_delete)]
to_delete = tmp.index.tolist()
ont_red = ont
if len(to_delete) > 0:
# Need fast special delete
ont_red = ont_red.delete(to_delete=to_delete, preserve_transitivity=True)
# Set the original term sizes for the original copy of
# each term (not the duplicates)
ont_red.update_node_attr(pd.DataFrame({'Size' : self.term_sizes}, index=self.terms))
G = ont_red._to_networkx_no_layout()
nodes_set = set(G.nodes())
G.pos = {n : (float(scale*p[0]), float(scale*p[1])) for n, p in pos.items() if n in nodes_set}
nx_set_tree_edges(G, ont_red.get_tree())
######################################################
# TODO: move this visual styling outside of the layout
# functionality
nx.set_edge_attributes(G, values='ARROW', name='Vis:EDGE_SOURCE_ARROW_SHAPE')
nx.set_edge_attributes(G, values='NONE', name='Vis:EDGE_TARGET_ARROW_SHAPE')
for v, data in G.nodes(data=True):
# if 'collect_hidden' in v and 'is_collect_node' in data and data['is_collect_node']:
# for u in G.predecessors(v):
# G.node[u]['Vis:Fill Color'] = '#3182BD'
try:
if 'collect_hidden_parent' in v and 'is_collect_node' in data and data['is_collect_node']:
for _, _, data in G.in_edges(v, data=True):
data["Vis:EDGE_TARGET_ARROW_SHAPE"] = 'ARROW'
data["Vis:EDGE_SOURCE_ARROW_SHAPE"] = 'NONE'
except:
print(data)
print('v', v)
print('collect_hidden_parent' in v)
print('is_collect_node' in data)
print(data['is_collect_node'])
raise
else:
raise Exception('Unsupported layout: %s', layout)
if layout is not None:
nx.set_node_attributes(G, values={n : x for n, (x,y) in G.pos.items()}, name='x_pos')
nx.set_node_attributes(G, values={n : y for n, (x,y) in G.pos.items()}, name='y_pos')
else:
G = self._to_networkx_no_layout()
return G
[docs] @classmethod
def from_table(cls,
table,
parent=0,
child=1,
is_mapping=None,
mapping=None,
mapping_parent=0,
mapping_child=1,
header=0,
propagate=False,
verbose=False,
clixo_format=False,
clear_default_attr=True,
**kwargs):
"""Create Ontology from a tab-delimited table or pandas DataFrame.
Duplicate gene-term or term-term connections in the table are removed.
Parameters
----------
table : pandas.DataFrame, file-like object, or filename
A table that lists (child term, parent term) pairs. If
mapping==None, then this table should also include (gene,
term) pairs.
parent : int or str
Column for parent terms in table (index or name of column)
child : int or str
Column for child terms and genes in table (index or name of column)
is_mapping : function
A function that is applied on each row and returns True if
the row represents a (gene, term) pair and False
otherwise. This function is only applied when a separate
table of (gene, term) pairs is not specified,
i.e. mapping==None.
The default function is `lambda row: row[2]=={0}`
which tests if the third column equals the string "{0}".
mapping : pandas.DataFrame, file-like object, or filename (optional)
A separate table listing only (gene, term) pairs
mapping_parent : int or str
Column for terms in mapping table (index or name of column)
mappping_child : int or str
Column for genes in mapping table (index or name of column)
header : int or None
Row number to use as the column names, which are then
stored in the resulting Ontology object's `edge_attr`
field. For example if `header=0` (default), then the first
row is assumed to be column names. If `header=None`, then
no column names are assumed.
propagate : None or str
The direction ('forward' or 'reverse') for propagating
gene-term annotations up the hierarchy with
Ontology.propagate(). If None, then don't
propagate annotations.
clixo_format : bool
If True, The table is assumed to be in the same format
produced by the CLIXO C++ implementation. In particular,
table has three columns:
Column 1) Parent Term
Column 2) Child Term or Gene
Column 3) The string "gene" if the row is a
gene-term mapping, otherwise the string "default".
The table is also assumed to have no column headers (i.e. header=False)
clear_default_attr: bool
If True (default), then remove the edge attribute
'EdgeType' created using Ontology.to_table(). This
attribute was created to make the table be an equivalent
representation of an Ontology object; however, it is no
longer necessary after reconstructing the Ontology object.
Returns
-------
: ddot.Ontology.Ontology
""".format(cls.GENE_TERM_EDGETYPE)
if clixo_format:
ont = cls.from_table(
table,
parent=0,
child=1,
is_mapping=lambda x: x[2]=='gene',
header=None,
clixo_format=False,
verbose=verbose)
ont.edge_attr.columns = map(str, ont.edge_attr.columns)
del ont.edge_attr['2']
return ont
if is_mapping is None:
if mapping is None:
# print('WARNING: no gene-term connections '
# 'were specified by the is_mapping '
# 'function or separate table. '
# 'Default: assume a gene-term connection when the 3rd column equals %s' % cls.GENE_TERM_EDGETYPE)
is_mapping = lambda x: x.iloc[2]==cls.GENE_TERM_EDGETYPE
# Read table
try:
table = pd.read_table(table, comment='#', header=header)
except:
assert isinstance(table, pd.DataFrame)
if child not in table.columns:
child = table.columns[child]
if parent not in table.columns:
parent = table.columns[parent]
for col in [child, parent]:
table.loc[:,col] = table.loc[:,col].astype(str)
edge_attr = table.set_index([child, parent])
edge_attr.index.rename(['Child', 'Parent'], inplace=True)
if mapping is None:
# Extract gene-term connections from table
mask = table.apply(is_mapping, axis=1)
mapping = table.loc[mask, :].loc[:,[child, parent]]
hierarchy = table.loc[~mask, :].loc[:,[child, parent]]
mapping_child, mapping_parent = child, parent
else:
# Read separate table of gene-term connections
try:
mapping = pd.read_table(mapping, comment='#', header=header)
except:
assert isinstance(mapping, pd.DataFrame)
if mapping_child not in mapping.columns:
mapping_child = mapping.columns[mapping_child]
if mapping_parent not in mapping.columns:
mapping_parent = mapping.columns[mapping_parent]
for col in [mapping_child, mapping_parent]:
mapping.loc[:,col] = mapping.loc[:,col].astype(str)
mapping_attr = mapping.set_index([mapping_child, mapping_parent])
mapping_attr.index.rename(['Child', 'Parent'], inplace=True)
try:
# Used for pandas version >= 0.23
edge_attr = pd.concat([edge_attr, mapping_attr], sort=True)
except:
edge_attr = pd.concat([edge_attr, mapping_attr])
mapping = mapping.loc[:,[mapping_child, mapping_parent]]
hierarchy = table.loc[:,[child, parent]]
dups = mapping.duplicated([mapping_child, mapping_parent]).sum()
if dups > 0:
print('WARNING: Dropping %s duplicate gene-term connections' % dups)
mapping.drop_duplicates([mapping_child, mapping_parent], inplace=True)
dups = hierarchy.duplicated([child, parent]).sum()
if dups > 0:
print('WARNING: Dropping %s duplicate term-term connections' % dups)
hierarchy.drop_duplicates([child, parent], inplace=True)
edge_attr = edge_attr.loc[~ edge_attr.index.duplicated(), :]
edge_attr.index.names = ['Child', 'Parent']
if clear_default_attr:
if cls.EDGETYPE_ATTR in edge_attr:
del edge_attr[cls.EDGETYPE_ATTR]
mapping, hierarchy = mapping.values.tolist(), hierarchy.values.tolist()
return cls(hierarchy,
mapping,
parent_child=False,
edge_attr=edge_attr,
propagate=propagate,
verbose=verbose,
**kwargs)
@classmethod
def from_scipy_linkage(cls, Z):
"""Creates an Ontology object from a linkage matrix created by scipy's
hierarchical/agglomerative clustering. Note that this form of
clustering produces a binary tree.
"""
import scipy.cluster.hierarchy
rootnode, nodelist = scipy.cluster.hierarchy.to_tree(Z, rd=True)
leaves = set(scipy.cluster.hierarchy.leaves_list(Z))
hierarchy, mapping = [], []
for v in nodelist:
v_id = v.get_id()
if v.get_left():
child = v.get_left().get_id()
if child in leaves:
mapping.append((v_id, child))
else:
hierarchy.append((v_id, child))
if v.get_right():
child = v.get_right().get_id()
if child in leaves:
mapping.append((v_id, child))
else:
hierarchy.append((v_id, child))
return cls(hierarchy, mapping, parent_child=True)
[docs] @classmethod
def from_ndex(cls,
ndex_uuid,
ndex_user=None,
ndex_pass=None,
ndex_server=None,
edgetype_attr=None,
edgetype_value=None):
"""Reads an Ontology stored on NDEx. Gene and terms are distinguished
according by an edge attribute.
Parameters
----------
ndex_uuid : str
NDEx UUID of ontology
edgetype_attr : str
Name of the edge attribute that distinguishes a (gene,
term) pair from a (child term, parent term) pair
gene_value : str
Value of the edge attribute for (gene, term) pairs
Returns
-------
: ddot.Ontology.Ontology
"""
if ndex_server is None:
ndex_server = ddot.config.ndex_server
if ndex_user is None:
ndex_user = ddot.config.ndex_user
if ndex_pass is None:
ndex_pass = ddot.config.ndex_pass
if '/' in ndex_uuid:
ndex_server = parse_ndex_server(ndex_uuid)
ndex_uuid = parse_ndex_uuid(ndex_uuid)
G = NdexGraph(
server=ndex_server,
username=ndex_user,
password=ndex_pass,
uuid=ndex_uuid)
return cls.from_NdexGraph(
G,
edgetype_attr=edgetype_attr,
edgetype_value=edgetype_value)
@classmethod
def from_NdexGraph(cls,
G,
edgetype_attr=None,
edgetype_value=None):
"""Converts a NdexGraph object to an Ontology object. Gene and terms
are distinguished by an edge attribute.
Parameters
----------
G : NdexGraph
edgetype_attr : str
Name of the edge attribute that distinguishes a (gene,
term) pair from a (child term, parent term) pair
gene_value : str
Value of the edge attribute for (gene, term) pairs
Returns
-------
: ddot.Ontology.Ontology
"""
return cls.from_networkx(
NdexGraph_to_nx(G),
edgetype_attr=edgetype_attr,
edgetype_value=edgetype_value)
[docs] @classmethod
def from_networkx(cls,
G,
edgetype_attr=None,
edgetype_value=None,
clear_default_attr=True):
"""Converts a NetworkX object to an Ontology object. Gene and terms
are distinguished by an edge attribute.
Parameters
----------
G : nx.DiGraph
edgetype_attr : str
Name of the edge attribute that distinguishes a (gene,
term) pair from a (child term, parent term) pair
edgetype_value : str
Value of the edge attribute for (gene, term) pairs
clear_default_attr : bool
If True (default), then remove the node and edge
attributes that are created in a NetworkX graph using
Ontology.to_networkx() or Ontology.to_ndex(). These
attributes include 'Label', 'Size', 'NodeType', and
'EdgeType'. These attributes were created to make the
NetworkX graph be an equivalent representation of an
Ontology object; however, they are no longer necessary
after reconstrcting the Ontology object.
Returns
-------
: ddot.Ontology.Ontology
"""
if edgetype_attr is None:
edgetype_attr=cls.EDGETYPE_ATTR
if edgetype_value is None:
edgetype_value=cls.GENE_TERM_EDGETYPE
hierarchy = []
mapping = []
for u, v, attr in G.edges(data=True):
if attr[edgetype_attr] == edgetype_value:
mapping.append((u, v))
else:
hierarchy.append((u, v))
edge_attr = nx_edges_to_pandas(G)
node_attr = nx_nodes_to_pandas(G)
ont = cls(hierarchy,
mapping,
node_attr=node_attr,
edge_attr=edge_attr)
if clear_default_attr:
for attr in [Ontology.NODETYPE_ATTR, 'Label', 'Size', 'isRoot', 'x_pos', 'y_pos']:
if attr in ont.node_attr.columns:
del ont.node_attr[attr]
for attr in [edgetype_attr, 'Is_Tree_Edge']:
if attr in ont.edge_attr.columns:
del ont.edge_attr[attr]
return ont
[docs] @classmethod
def from_igraph(cls,
G,
edgetype_attr=None,
edgetype_value=None,
verbose=False):
"""Converts a igraph Graph object to an Ontology object. Gene and terms
are distinguished by an edge attribute.
Parameters
----------
G : igraph.Graph
edgetype_attr : str
Name of the edge attribute that distinguishes a (gene,
term) pair from a (child term, parent term) pair
edgetype_value : str
Value of the edge attribute for (gene, term) pairs
Returns
-------
: ddot.Ontology.Ontology
"""
if edgetype_attr is None:
edgetype_attr=cls.EDGETYPE_ATTR
if edgetype_value is None:
edgetype_value=cls.GENE_TERM_EDGETYPE
hierarchy = []
mapping = []
for e in G.es:
u = G.vs[e.source]['name']
v = G.vs[e.target]['name']
if e[edgetype_attr] == edgetype_value:
mapping.append((u, v))
else:
hierarchy.append((u, v))
edge_attr = ig_edges_to_pandas(G)
node_attr = ig_nodes_to_pandas(G)
edge_attr.index.names = ['Child', 'Parent']
node_attr.index.name = 'Node'
ont = cls(hierarchy,
mapping,
node_attr=node_attr,
edge_attr=edge_attr,
verbose=verbose)
for attr in [Ontology.NODETYPE_ATTR]:
if attr in ont.node_attr.columns:
del ont.node_attr[attr]
for attr in [edgetype_attr, 'Is_Tree_Edge']:
if attr in ont.edge_attr.columns:
del ont.edge_attr[attr]
return ont
def collapse_ontology(self,
method='python',
to_keep=None,
min_term_size=2,
verbose=True):
"""Remove redundant and empty terms. When a term T is removed,
hierarchical relations are preserved by connecting every child
of T with every parent of T. This removal operation has the
nice property of being commutative, i.e. the order of removal
does not matter.
Parameters
-----------
method : str
If "mhkramer", then use the collapseRedundantNodes script
in the alignOntology package. If "python", then use an
internal Python script.
min_term_size : int
Remove terms that are below this size. TODO: not yet supported
Returns
-------
: ddot.ddot.Ontology
A new Ontology object
"""
if method=='mhkramer':
assert to_keep is None, 'to_keep is only supported for method=="python"'
# Propagate forward and then reverse
ont = self.copy()
ont = self.propagate(direction='forward', inplace=False)
ont.propagate(direction='reverse', inplace=True)
top_level = os.path.dirname(os.path.abspath(inspect.getfile(ddot)))
collapseRedundantNodes = os.path.join(top_level, 'alignOntology', 'collapseRedundantNodes')
# assert os.path.isdir(ddot.config.alignOntology)
# collapseRedundantNodes = os.path.join(ddot.config.alignOntology, 'collapseRedundantNodes')
assert os.path.isfile(collapseRedundantNodes)
with tempfile.NamedTemporaryFile('w', delete=False) as f:
ont.to_table(f, clixo_format=True)
try:
cmd = '%s %s' % (collapseRedundantNodes, f.name)
print('collapse command:', cmd)
p = Popen(shlex.split(cmd), shell=False, stdout=PIPE, stderr=PIPE)
collapsed, err = p.communicate()
collapsed = collapsed.decode()
finally:
os.remove(f.name)
ont = Ontology.from_table(
StringIO(collapsed),
is_mapping=lambda x: x[2]=='gene',
clixo_format=True
)
ont.clear_edge_attr()
ont.update_node_attr(self.node_attr)
ont.update_edge_attr(self.edge_attr)
return ont
elif method=='python':
ont = self.propagate('forward', inplace=False)
term_hash = {t : hash(tuple(g_list)) for t, g_list in ont.term_2_gene.items()}
to_collapse = set()
for p in ont.parent_2_child:
for c in ont.parent_2_child[p]:
if term_hash[p] == term_hash[c]:
to_collapse.add(p)
if min_term_size is not None:
to_collapse = to_collapse | set([t for t, s in zip(ont.terms, ont.term_sizes) if s < min_term_size])
if to_keep is not None:
to_collapse = to_collapse - set(to_keep)
# print('to_collapse:', sorted(to_collapse))
ont.propagate('reverse', inplace=True)
ont_red = ont.delete(to_delete=to_collapse, preserve_transitivity=True)
return ont_red
@classmethod
def mutual_collapse(cls,
ont1,
ont2,
verbose=False):
"""Collapses two ontologies to the common set of genes.
Parameters
-----------
ont1 : ddot.Ontology.Ontology
ont2 : ddot.Ontology.Ontology
Returns
-------
ont1_collapsed : ddot.Ontology.Ontology
ont2_collapsed : ddot.Ontology.Ontology
"""
common_genes = set(ont1.genes) & set(ont2.genes)
if verbose:
print('Common genes:', len(common_genes))
if len(common_genes) > 0:
ont1 = ont1.delete(to_delete=set(ont1.genes) - common_genes, inplace=False)
ont1_collapsed = ont1.collapse_ontology()
ont2 = ont2.delete(to_delete=set(ont2.genes) - common_genes, inplace=False)
ont2_collapsed = ont2.collapse_ontology()
else:
raise Exception('No common genes between ontologies')
if verbose:
print('ont1_collapsed:', ont1_collapsed.summary())
print('ont2_collapsed:', ont2_collapsed.summary())
return ont1_collapsed, ont2_collapsed
[docs] def focus(self,
branches=None,
genes=None,
collapse=False,
root=True,
verbose=True):
"""
"""
assert (branches is not None) or (genes is not None)
to_keep = np.array(self.genes + self.terms)
if branches is not None:
to_keep = to_keep[self.connected(to_keep, branches).sum(1) > 0]
if verbose:
print('Genes and Terms to keep:', to_keep.size)
if genes is not None:
to_keep = to_keep[self.connected(genes, to_keep).sum(0) > 0]
if verbose:
print('Genes and Terms to keep:', to_keep.size)
if root:
while True:
common_root = self.common_ancestors(to_keep, minimal=True)
if common_root in to_keep or len(common_root)<=1:
break
else:
print('Adding', common_root)
to_keep = np.append(to_keep, common_root)
ont = self.delete(to_keep=to_keep, preserve_transitivity=True)
if collapse:
ont = ont.collapse_ontology(method='python', to_keep=ont.get_roots())
df = ont.to_table(edge_attr=True)
new_connections = []
for t in ont.terms:
removed_genes = set([self.genes[g] for g in self.term_2_gene[t]]) - set([ont.genes[g] for g in ont.term_2_gene[t]])
removed_terms = set(self.parent_2_child[t]) - set(ont.parent_2_child[t])
if len(removed_genes) > 0:
new_connections.append(('%s_%s_other_genes' % (t, len(removed_genes)), t, self.GENE_TERM_EDGETYPE))
if len(removed_terms) > 0:
new_connections.append(('%s_%s_other_terms' % (t, len(removed_terms)), t, self.CHILD_PARENT_EDGETYPE))
if len(new_connections) > 0:
new_connections = pd.DataFrame(new_connections)
new_connections.columns = ['Child', 'Parent', self.EDGETYPE_ATTR]
new_nodes = new_connections['Child'].values.tolist()
new_connections['Summary'] = True
df['Summary'] = False
try:
# Used for pandas version >= 0.23
tmp = pd.concat([df, new_connections], ignore_index=True, sort=True)
except:
tmp = pd.concat([df, new_connections], ignore_index=True)
df = tmp[df.columns]
ont = Ontology.from_table(df)
ont.update_node_attr(self.node_attr)
# orig_sizes = pd.DataFrame({'Original_Size' : self.term_sizes}, index=self.terms)
# ont.update_node_attr(orig_sizes)
# if len(new_connections)>0:
# summary_sizes = pd.DataFrame({'Original_Size' : [int(x.split('_')[1]) for x in new_nodes]}, index=new_nodes)
# ont.update_node_attr(summary_sizes)
if len(new_connections) > 0:
ont.update_node_attr(pd.DataFrame({'Label':['_'.join(x.split('_')[1:]) for x in new_nodes]}, index=new_nodes))
return ont
[docs] def delete(self,
to_delete=None,
to_keep=None,
preserve_transitivity=True,
inplace=False):
"""Delete genes and/or terms from the ontology.
Parameters
----------
to_delete : array-like (optional)
Names of genes and/or terms to delete. Either to_delete or
to_keep must be specified.
to_keep : array-like (optional)
Names of genes and/or terms to keep; all other genes/terms
are delete. Only used if to_delete is not specified.
preserve_transitivity : bool
If True, then maintain transitive relations when deleting
terms. For example, if the hierarchical structure consists
of
geneA --> term1
term1 --> term2
term2 --> term3
term2 --> term4
then deleting term2 will result in the structure:
geneA --> term1
term1 --> term3
term3 --> term4
If False, then deleting term2 will result in a
disconnected structure:
geneA --> term1
inplace : bool
If True, then modify the ontology. If False, then create and modify a copy.
Returns
-------
: ddot.Ontology.Ontology
"""
if inplace:
ont = self
else:
ont = self.copy()
if to_delete is not None:
terms = set([x for x in to_delete if x in ont.terms_index])
genes = set([x for x in to_delete if x in ont.genes_index])
elif to_keep is not None:
terms = set(ont.terms) - set([x for x in to_keep if x in ont.terms_index])
genes = set(ont.genes) - set([x for x in to_keep if x in ont.genes_index])
else:
raise Exception('Must specify nodes to delete or to keep')
if len(genes) > 0:
ont.genes = [g for g in ont.genes if g not in genes]
ont.genes_index = make_index(ont.genes)
ont.gene_2_term = {g : t for g, t in ont.gene_2_term.items()
if g not in genes}
ont._update_fields()
if len(terms) > 0:
if preserve_transitivity:
gene_2_term_set = {g : set([ont.terms[s] for s in t]) for g, t in ont.gene_2_term.items()}
term_2_gene_set = {a : set(b) for a, b in ont.term_2_gene.items()}
child_2_parent_set = {a : set(b) for a, b in ont.child_2_parent.items()}
parent_2_child_set = {a : set(b) for a, b in ont.parent_2_child.items()}
for t in terms:
t_parents = child_2_parent_set[t]
t_genes = term_2_gene_set[t]
t_children = parent_2_child_set[t]
for g_i in t_genes:
g = ont.genes[g_i]
gene_2_term_set[g].update(t_parents)
gene_2_term_set[g].remove(t)
for p in t_parents:
term_2_gene_set[p].update(t_genes)
parent_2_child_set[p].update(t_children)
parent_2_child_set[p].remove(t)
for c in t_children:
child_2_parent_set[c].update(t_parents)
child_2_parent_set[c].remove(t)
del child_2_parent_set[t]
del parent_2_child_set[t]
del term_2_gene_set[t]
ont.terms = [t for t in ont.terms if t not in terms]
terms_index = make_index(ont.terms)
ont.terms_index = terms_index
ont.gene_2_term = {g : sorted([terms_index[s] for s in t]) for g, t in gene_2_term_set.items()}
ont.child_2_parent = {c : sorted(p) for c, p in child_2_parent_set.items()}
ont.parent_2_child = invert_dict(ont.child_2_parent)
ont._update_fields()
else:
tmp_gene_2_term = {g : [ont.terms[t] for t in t_list]
for g, t_list in ont.gene_2_term.items()}
ont.terms = [t for t in ont.terms if t not in terms]
ont.terms_index = make_index(ont.terms)
ont.gene_2_term = {g : [ont.terms_index[t] for t in t_list if t not in terms]
for g, t_list in tmp_gene_2_term.items()}
ont.parent_2_child = {p : [c for c in c_list if c not in terms]
for p, c_list in ont.parent_2_child.items()
if p not in terms}
ont._update_fields()
# Update node/edge attributes
to_keep = (set(ont.terms) | set(ont.genes)) - genes - terms
ont.edge_attr = ont.edge_attr[ont.edge_attr.index.get_level_values(0).isin(to_keep) | \
ont.edge_attr.index.get_level_values(1).isin(to_keep)]
ont.node_attr = ont.node_attr[ont.node_attr.index.isin(to_keep)]
return ont
def rename(self,
genes=lambda x: x,
terms=lambda x: x,
inplace=False):
"""Rename gene and/or term names.
Parameters
----------
genes : dict or function
If dictionary, then it maps current gene names to new
names. Genes not in dictionary are deleted.
If function, then genes(name) returns the new name.
terms : dict or function
If dictionary, then it maps current term names to new
names. Terms not in dictionary are deleted.
If function, then terms(name) returns the new name.
inplace : bool
If True, then modify the ontology. If False, then create
and modify a copy.
Returns
-------
: ddot.Ontology.Ontology
"""
try:
terms = {t : terms(t) for t in self.terms}
except:
pass
try:
genes = {g : genes(g) for g in self.genes}
except:
pass
if inplace:
ont = self
else:
ont = self.copy()
if genes:
new_genes = set()
new_gene_2_term = {}
for g in ont.genes:
new_g = genes.get(g, g)
if hasattr(new_g, '__iter__') and not isinstance(new_g, str):
for new_gg in new_g:
new_genes.add(new_gg)
new_gene_2_term[new_gg] = ont.gene_2_term[g]
else:
new_genes.add(new_g)
new_gene_2_term[new_g] = ont.gene_2_term[g]
ont.genes = sorted(new_genes)
ont.gene_2_term = new_gene_2_term
ont.genes_index = make_index(ont.genes)
ont._update_fields()
if terms:
ont.parent_2_child = {terms.get(p, p) : [terms.get(c, c) for c in c_list]
for p, c_list in ont.parent_2_child.items()}
old_term_names = ont.terms
ont.terms = [terms.get(t,t) for t in ont.terms]
# Retain a unique set of term names
ont.terms = sorted(set(ont.terms))
ont.terms_index = make_index(ont.terms)
ont.gene_2_term = {g : [ont.terms_index[terms.get(t,t)] for t in [old_term_names[t] for t in t_list]] for g, t_list in ont.gene_2_term.items()}
ont._update_fields()
conversions = genes.copy()
conversions.update(terms)
# Remove identities
conversions = {k : v for k, v in conversions.items() if k!=v}
f = lambda x: conversions.get(x,x)
# Update node attributes
index = ont.node_attr.index
ont.node_attr.index = pd.Series(index).map(f)
# Update edge attributes
idx = ont.edge_attr.index
idx.set_levels([idx.levels[0].map(f), idx.levels[1].map(f)], inplace=True)
ont._check_valid()
return ont
def _check_valid(self):
if not self.is_dag():
print('Found cycle:', nx.find_cycle(self._to_networkx_no_layout()))
raise Exception('Not a directed acyclic graph')
assert len(self.genes) == len(set(self.genes))
assert len(self.terms) == len(set(self.terms))
assert set(self.genes) == set(self.gene_2_term.keys())
assert set(self.terms) == set(self.child_2_parent.keys())
assert set(self.terms) == set(self.parent_2_child.keys())
assert set(self.terms) == set(self.term_2_gene.keys())
assert self.edge_attr.index.duplicated().sum()==0
assert self.node_attr.index.duplicated().sum()==0
[docs] def to_table(self,
output=None,
term_2_term=True,
gene_2_term=True,
edge_attr=False,
header=True,
parent_child=True,
clixo_format=False):
"""Convert Ontology to a table representation. Return a
pandas.DataFrame and, optionally, write it to a file as a
tab-delimited file.
Parameters
----------
output : filepath or file-like
File to write table. If None, then only return a
pandas.DataFrame
term_2_term : bool
Include (child term, parent term) pairs
gene_2_term : bool
Include (gene, term) pairs
edge_attr : array-like or bool
List of extra edge attributes to include. If True, then
include all attributes. If False, then don't include any
attribute.
header : bool
If True (default), then write the column names as the
first row of the table.
parent_child : bool
If True, then the first column is the parent term and the
second column is the child term or gene. If False, then
the columns are reversed.
clixo_format : bool
If True, the table is the same format used the CLIXO C++
implementation. In particular, the table has three columns:
Column 1) Parent Term
Column 2) Child Term or Gene
Column 3) The string "gene" if the row is a
gene-term mapping, otherwise the string "default".
Returns
-------
: pandas.DataFrame
Contains at least three columns: (1) "Parent", (2)
"Child", and (3) "EdgeType".
"""
if clixo_format:
df = self.to_table(output=None,
term_2_term=True,
gene_2_term=True,
edge_attr=False,
header=False,
parent_child=True,
clixo_format=False)
df.replace({self.EDGETYPE_ATTR : {self.GENE_TERM_EDGETYPE : 'gene', self.CHILD_PARENT_EDGETYPE : 'default'}}, inplace=True)
if output is not None:
df.to_csv(output, header=False, index=False, sep='\t')
return df
df = pd.DataFrame(columns=['Parent','Child',self.EDGETYPE_ATTR])
if term_2_term:
df = df.append(self._hierarchy_to_pandas(), ignore_index=True)
if gene_2_term:
df = df.append(self._mapping_to_pandas(), ignore_index=True)
if edge_attr and self.edge_attr.shape[1] > 0:
if edge_attr==True:
edge_attr = df.columns
df = df.merge(self.edge_attr,
how='left',
left_on=['Child', 'Parent'],
right_index=True)
first_two = ['Parent', 'Child'] if parent_child else ['Child', 'Parent']
df = df[first_two + [x for x in df.columns if x not in first_two]]
if output is not None:
df.to_csv(output, header=header, index=False, sep='\t')
return df
def _hierarchy_to_pandas(self):
triples = [(p,c) for p, c_list in self.parent_2_child.items() for c in c_list]
df = pd.DataFrame(triples, columns=['Parent', 'Child'])
df[self.EDGETYPE_ATTR] = self.CHILD_PARENT_EDGETYPE
return df
def _mapping_to_pandas(self):
pairs = [(self.terms[t], g) for g, t_list in self.gene_2_term.items() for t in t_list]
df = pd.DataFrame(pairs, columns=['Parent', 'Child'])
df[self.EDGETYPE_ATTR] = self.GENE_TERM_EDGETYPE
return df
def copy(self):
"""Create a deep copy of the Ontology object"""
ont = Ontology(None, None, **{'empty' : True})
for x in ['node_attr', 'edge_attr']:
setattr(ont, x, getattr(self, x).copy())
for x in ['genes', 'terms']:
setattr(ont, x, getattr(self, x)[:])
if self._term_sizes is None:
ont._term_sizes = None
else:
ont._term_sizes = self._term_sizes[:]
for x in ['genes_index', 'terms_index']:
setattr(ont, x, getattr(self, x).copy())
for x in ['gene_2_term', 'term_2_gene', 'child_2_parent', 'parent_2_child']:
copy_val = {k : v[:] for k, v in getattr(self, x).items()}
setattr(ont, x, copy_val)
return ont
[docs] def flatten(self,
include_genes=True,
include_terms=False,
similarity='Resnik'):
"""Flatten the hierarchy into a node-node similarity matrix by
calculating a similarity between pair of genes in
`genes_subset`. Currently, only the Resnik semantic similarity
measure is implemented.
Parameters
-----------
include_genes : bool
If True, then calculate pairwise similarities between
genes. If `include_terms` is also True, then also
calculate similarities between genes and terms.
include_terms : bool
If True, then calculate pairwise similarities between
terms. If `include_genes` is also True, then also
calculate similarities between genes and terms.
similarity : str
Type of semantic similarity. (default: "Resnik")
The Resnik similarity s(g1,g2) is defined as
:math:`-log_2(|T_{sca}| / |T_{root}|)` where :math:`|T|` is
the number of genes in `genes_subset` that are under term
T. :math:`T_{sca}` is the "smallest common ancestor", the
common ancestral term with the smallest term
size. :math:`T_{root}` is the root term of the ontology.
Resnik, P. (1999). Semantic similarity in a taxonomy: An
information-based measured and its application to problems
of ambiguity in natural
language. J. Artif. Intell. Res. 11,95-130.
Returns
-------
: (sim, nodes)
A 2-tuple consisting of `sim`, a node-by-node NumPy array,
and `nodes`, a NumPy array of the node names in `sim`.
"""
assert include_genes
assert not include_terms, 'include_terms is not yet implemented'
if similarity=='Resnik':
sca, nodes = self.get_best_ancestors(include_genes=include_genes)
nodes_subset = self.genes if include_genes else []
nodes_subset += self.terms if include_terms else []
nodes_idx = ddot.utils.make_index(nodes)
idx = [nodes_idx[v] for v in nodes_subset]
sca = sca[idx, :][:, idx]
ss = -1 * np.log2(np.array(self.term_sizes)[sca] / float(len(self.genes)))
ss = ss.astype(np.float32)
return ss, np.array(nodes_subset)
else:
raise Exception('Unsupported similarity type')
def common_ancestors(self, nodes, min_nodes='all', minimal=True):
"""Return the common ancestors of a set of genes
Parameters
----------
nodes : list
List of nodes (genes and/or terms) to find the common ancestors
min_nodes : str or int
If 'all', then return only terms that contain all of the
input genes. If an integer, then return only terms that
contain at least <nodes> of the input genes
minimal : bool
If True, then do NOT return the terms that are themselves
ancestors of the other common ancestors. This filter
leaves only the 'minimal' set of common ancestors.
Returns
-------
: list
List of common ancestors
"""
if min_nodes=='all':
min_nodes = len(nodes)
conn = self.connected(nodes, self.terms)
anc_bool = conn.sum(0) >= min_nodes
anc = np.array(self.terms)[anc_bool]
if minimal:
anc_conn = self.connected(anc, anc, sparse=False)
np.fill_diagonal(anc_conn, 0)
anc = anc[anc_conn.sum(0) == 0]
return anc
def _get_term_2_gene(self, verbose=False):
if verbose: print('Calculating term_2_gene')
term_2_gene = invert_dict(
self.gene_2_term,
keymap=make_index(self.genes),
valmap=dict(enumerate(self.terms)))
for t in self.terms:
if not t in term_2_gene:
term_2_gene[t] = []
return term_2_gene
@property
def term_sizes(self):
if self._term_sizes is None:
self._term_sizes = self._get_term_sizes(propagate=True)
return self._term_sizes
def _get_term_sizes(self, propagate=True):
"""Returns an array of term sizes in the same order as self.terms"""
if propagate:
ont = self.propagate(inplace=False)
gene_2_term = ont.gene_2_term
# gene_2_term = self._propagate_forward()
else:
gene_2_term = self.gene_2_term
tmp = Counter([x for y in gene_2_term.values() for x in y])
term_sizes = [tmp[x] for x in range(len(self.terms))]
return term_sizes
def get_information_gain(self):
for p in terms:
self.parent_2_children[p]
def shuffle_genes(self, inplace=False):
"""Shuffle the names of genes"""
new_order = self.genes.copy()
random.shuffle(new_order)
rename = dict(zip(self.genes, new_order))
return self.rename(rename, inplace=False)
def get_tree(self, ret='edges', verbose=False):
"""Identify a spanning tree of the DAG (including genes as part of the
DAG).
Parameters
------------
ret : str
If 'edges', then return a list of (u, v) edges in the
tree. If 'ontology', return an Ontology object consisting
of only the tree edges.
Returns
-------
: array-like or Ontology
"""
tree = self.to_igraph(include_genes=True, spanning_tree=True)
if ret=='edges':
tree_edges = set([(tree.vs[e.source]['name'],
tree.vs[e.target]['name'])
for e in tree.es if e['Is_Tree_Edge']=='Tree'])
return tree_edges
elif ret=='ontology':
tree.delete_edges([e.index for e in tree.es if e['Is_Tree_Edge']=='Not_Tree'])
return Ontology.from_igraph(tree, verbose=verbose)
def is_dag(self):
"""Return True if the Ontology is a valid directed acyclic graph,
False otherwise.
"""
return self.to_igraph(include_genes=True, spanning_tree=False).is_dag()
[docs] def topological_sorting(self, top_down=True, include_genes=False):
"""Perform a topological sorting.
top_down :
If True, then ancestral nodes (e.g. the root nodes) come
before descendants in the sorting. If False, then reverse the sorting
"""
graph = self.to_igraph(include_genes=include_genes, spanning_tree=False)
topo = list(graph.vs[graph.topological_sorting(mode='out')]['name'])
if not top_down:
topo = topo[::-1]
return topo
[docs] def to_igraph(self, include_genes=True, spanning_tree=False):
"""Convert Ontology to an igraph.Graph object. Gene and term names are
stored in the 'name' vertex attribute of the igraph object.
Parameters
----------
include_genes : bool
Include genes as vertices in the igraph object.
spanning_tree : bool
If True, then identify a spanning tree of the DAG. include
an edge attribute "Is_Tree_Edge" that indicates
Returns
-------
: igraph.Graph
"""
if include_genes:
terms_index_offset = {t : v + len(self.genes) for t, v in self.terms_index.items()}
gene_term_edges = [(self.genes_index[g], terms_index_offset[self.terms[t]])
for g in self.genes
for t in self.gene_2_term[g]]
child_parent_edges = [(terms_index_offset[c], terms_index_offset[p])
for p, children in self.parent_2_child.items()
for c in children]
vertex_attrs = self.node_attr.reindex(index=self.genes + self.terms).loc[self.genes + self.terms].to_dict(orient='list')
vertex_attrs.update({
'name':self.genes + self.terms,
self.NODETYPE_ATTR:[self.GENE_NODETYPE for x in self.genes] + [self.TERM_NODETYPE for x in self.terms]
})
graph = igraph.Graph(n=len(self.genes) + len(self.terms),
edges=gene_term_edges + child_parent_edges,
directed=True,
vertex_attrs=vertex_attrs,
edge_attrs={self.EDGETYPE_ATTR : [self.GENE_TERM_EDGETYPE for x in gene_term_edges] + \
[self.CHILD_PARENT_EDGETYPE for x in child_parent_edges]})
else:
edges = [(self.terms_index[c], self.terms_index[p]) for p, children in self.parent_2_child.items() for c in children]
graph = igraph.Graph(n=len(self.terms),
edges=edges,
directed=True,
vertex_attrs={'name':self.terms},
edge_attrs={self.EDGETYPE_ATTR : [self.CHILD_PARENT_EDGETYPE for x in edges]})
if spanning_tree:
parent_priority = [self.term_sizes[self.terms_index[v['name']]] if (v['name'] in self.terms_index) else 1 for v in graph.vs]
# Identify spanning tree
graph = self._make_tree_igraph(
graph,
parent_priority=parent_priority,
optim=min,
edge_name='Is_Tree_Edge')
graph.es['Is_Tree_Edge'] = ['Tree' if x else 'Not_Tree' for x in graph.es['Is_Tree_Edge']]
return graph
def shortest_paths(self,
descendants=None,
ancestors=None,
sparse=False,
weights=None,
chunk_size=500):
"""Calculate the length of the shortest paths from descendant nodes to
ancestor nodes.
Parameters
----------
sparse : bool
If True, return a scipy.sparse matrix. If False, return a
NumPy array
weights : dict
Dictionary mapping (child term, parent term) or (gene,
term) edges to weights. Any edge with no given weight is
assigned a weight of 0 by default.
(default) If weights is None, then a uniform weight is
assumed.
chunk_size : int (optional)
Computational optimization: shortest paths are calculated in batches.
Returns
-------
d : np.ndarray or scipy.sparse.spmatrix
d[x,y] is the length of the shortest directed path from a
descendant node x to ancestor node y. d[x,y]==numpy.inf if
no directed path exists. The rows are in the same order as
<descendants>, and the columns are in the same order as
<ancestors>.
"""
graph = self.to_igraph(include_genes=True, spanning_tree=False)
import numbers
if weights is None:
weights = 1
if weights is not None and not isinstance(weights, numbers.Number):
# Assume dictionary
weights = [weights.get((graph.vs[e.source]['name'],
graph.vs[e.target]['name']), 0) for e in graph.es]
graph.es['weight'] = weights
if descendants is None:
descendants = graph.vs
if ancestors is None:
ancestors = descendants
tmp = [graph.shortest_paths(
descendants[x[0]:x[1]],
ancestors,
weights='weight',
mode='out')
for x in split_indices_chunk(len(descendants), chunk_size)]
if sparse:
return scipy.sparse.vstack([scipy.sparse.csr_matrix(x) for x in tmp])
else:
return np.vstack(tmp)
def longest_paths(self,
descendants=None,
ancestors=None,
sparse=False,
weights=None,
chunk_size=500):# TODO: when ancestors are specified, the results become negative
"""Computes the lengths of the longest directed paths between all pairs
of terms.
Returns
-------
d : np.ndarray or scipy.sparse.spmatrix
d[x,y] is the length of the longest directed path from a
descendant term with index x to an ancestral term with
index y, where indices are defined by
self.terms. d[x,y]==numpy.inf if no directed path exists.
"""
d = self.shortest_paths(descendants=descendants,
ancestors=ancestors,
sparse=sparse,
weights=-1,
chunk_size=chunk_size)
if sparse:
d.data = -1 * d.data
else:
d = -1 * d
return d
[docs] def connected(self,
descendants=None,
ancestors=None,
sparse=False):
"""Calculate which genes or terms are descendants of other genes or
terms.
Parameters
-----------
descendants: list
A list of genes and/or terms. Default: A list of all genes
followed by a list of all terms, in the same order as
`self.genes` and `self.terms`.
ancestors: list
A list of genes and/or terms. Default: Same as the
``descendants`` parameter.
sparse : bool
If True, return a scipy.sparse matrix. If False (default),
return a NumPy array.
Returns
-------
d : np.ndarray or scipy.sparse.matrix
A descendants-by-ancestors matrix. ``d[i,j]`` is 1 if term
i is a descendant of term j, and 0 otherwise. Note that
``d[i,i]==1`` and ``d[root,i]==0``, for every i.
"""
d = self.shortest_paths(descendants=descendants,
ancestors=ancestors,
sparse=sparse)
if sparse:
d.data = np.isfinite(d.data)
else:
d = np.isfinite(d)
return d
# def get_leaves(self, terms_list, children_list=None):
# """Returns terms in ``terms_list`` that are not ancestors of any term in
# ``children_list``.
# Parameters
# ----------
# terms_list : list
# children_list : list
# If ``children_list`` is None, then select the terms in
# <terms_list> that are not ancestors of any of the other
# terms in <terms_list>.
# """
# connectivity_matrix_nodiag = self.get_connectivity_matrix_nodiag()
# terms_list = np.array(terms_list)
# if children_list is None:
# children_list = terms_list
# else:
# children_list = np.array(children_list)
# return terms_list[~ np.any(connectivity_matrix_nodiag[children_list, :][:, terms_list], axis=0)]
[docs] def propagate(self,
direction='forward',
gene_term=True,
term_term=False,
verbose=False,
inplace=False):
"""Propagate gene-term annotations through the ontology.
As an example, consider an ontology with one gene ``g``, three terms
``t1, t2, t3`` and the following connections:
::
t1-->t2
t2-->t3
g-->t1
g-->t2
In "forward" propagation, a new relation ``g-->t3`` is added. In
"reverse" propagation, the relation "g-->t2" is deleted
because it is an indirect relation inferred from "g-->t1" and
"t1-->t2".
Parameters
----------
direction : str
The direction of propagation. Either 'forward' or 'reverse'
inplace : bool
If True, then modify the ontology. If False, then create
and modify a copy.
Returns
-------
: ddot.Ontology.Ontology
"""
if inplace:
ont = self
else:
ont = self.copy()
assert direction in ['forward', 'reverse'], "Propagation direction must be forward or backward"
forward = direction=='forward'
if not forward:
# This is needed to ensure that the pruning to a parent's
# gene set can be based on the gene sets of its direct
# children
ont = ont.propagate(gene_term=gene_term, term_term=term_term, direction='forward', inplace=True)
if gene_term:
term_2_gene_set = {t : set(g) for t, g in ont.term_2_gene.items()}
if term_term:
parent_2_child_set = {p : set(c) for p, c in ont.parent_2_child.items()}
# # TODO: have this topological sorting be a part of the code below
# graph = ont.to_igraph(include_genes=False, spanning_tree=False)
# for c_idx in graph.topological_sorting(mode='in'):
# child = graph.vs[c_idx]['name']
for child in ont.topological_sorting(top_down=forward, include_genes=False):
for parent in ont.child_2_parent[child]:
if gene_term:
if forward:
term_2_gene_set[parent] |= term_2_gene_set[child]
else:
term_2_gene_set[parent] -= term_2_gene_set[child]
if term_term:
if forward:
parent_2_child_set[parent] |= parent_2_child_set[child]
else:
parent_2_child_set[parent] -= parent_2_child_set[child]
if gene_term:
ont.gene_2_term = invert_dict(term_2_gene_set,
keymap=make_index(ont.terms),
valmap=dict(enumerate(ont.genes)))
ont.term_2_gene = {a : list(b) for a, b in term_2_gene_set.items()}
if term_term:
ont.parent_2_child = {a : list(b) for a, b in parent_2_child_set.items()}
ont.child_2_parent = ont._get_child_2_parent()
ont._check_valid()
return ont
def get_ontotype(self,
genotypes,
input_format='gene_list',
output_format='dataframe',
matrix_columns=None):
"""Transform genotypes to ontotypes.
.. [1] Yu, M.K., Kramer, M., Dutkowski, J., Srivas, R., Licon,
K., Kreisberg, J.F., Ng, C.T., Krogan, N., Sharan,
R. and Ideker, T., 2016. "Translation of genotype to
phenotype by a hierarchy of cell subsystems". *Cell
Systems*, 2(2), pp.77-88.
Parameters
----------
genotypes : list, np.ndarray, scipy.sparse.spmatrix, pd.DataFrame
input_format : str
If "gene_list", then ``genotypes`` is a list of genotypes,
where genotype is itself a list of genes mutated. Each
gene is assumed to have a mutation value of 1.
If 'matrix', then ``genotypes`` is a genotype-by-gene
matrix, where the value at position (i,j) represents the
mutation value of gene j in genotype i. ``genotypes`` can
be a NumPy array, SciPy sparse matrix, or Pandas
dataframe.
output_format : str
If 'sparse', then return a sparse matrix as a
scipy.sparse.csr_matrix object. (default)
If 'dataframe', then return a pandas.DataFrame object.
If 'array', then return a numpy.ndarray object.
matrix_columns : list
represents a list of the genes that are represented by the
columns of ``genotypes``. Only used when input_format is
"matrix" and ``genotypes`` is a NumPy array or SciPy sparse
matrix.
Returns
-------
: scipy.sparse.csr_matrix, pandas.DataFrame, numpy.ndarray
genotype-by-term matrix, where the ordering of rows and
terms is the same as ``genotypes`` and ``self.terms``
"""
genotypes_names = None
if input_format=='gene_list':
gene_2_term = {k: np.array(v) for k, v in self.gene_2_term.items()}
genotypes_x = [np.concatenate([gene_2_term[g] for g in gset]) if len(gset)>0 else np.array([]) for gset in genotypes]
indices = np.concatenate(genotypes_x)
indptr = np.append(0, np.cumsum([gset.size for gset in genotypes_x]))
data = np.ones((indices.size, ), dtype=np.int64)
ontotypes = scipy.sparse.csr_matrix(
(data, indices, indptr),
(len(genotypes), len(self.terms)))
ontotypes.sum_duplicates()
elif input_format=='matrix':
if isinstance(genotypes, pd.DataFrame):
matrix_columns = genotypes.columns
genotypes_names = genotypes.index
genotypes = genotypes.values
elif isinstance(genotypes, np.ndarray) or scipy.sparse.issparse(genotypes):
assert matrix_columns is not None
else:
raise Exception("Parameter <genotypes> must be a genotype-by-gene matrix "
"represented as a Pandas dataframe, NumPy array, or SciPy sparse matrix. "
"Consider changing the <input_format> parameter")
contained = np.array([g in self.genes_index for g in matrix_columns])
genotypes = scipy.sparse.csc_matrix(genotypes)[:,contained]
gene_2_term_matrix = scipy.sparse.csr_matrix(self.get_gene_2_term_matrix())
gene_2_term_matrix = scipy.sparse.csr_matrix(gene_2_term_matrix)[contained,:]
ontotypes = genotypes.dot(gene_2_term_matrix)
else:
raise Exception('Invalid input format')
if output_format=='dataframe':
ontotypes = pd.DataFrame(ontotypes.toarray(), columns=self.terms)
if genotypes_names is not None:
ontotypes.index = genotypes_names
elif output_format=='sparse':
pass
elif output_format=='array':
ontotypes = ontotypes.toarray()
else:
raise Exception('Invalid output format')
return ontotypes
def get_gene_2_term_matrix(self):
"""Returns a gene-by-term matrix stored as a scipy.sparse.coo_matrix
Returns
-------
: scipy.sparse.coo_matrix
"""
# Convert gene names to indices
gene_2_term = [(self.genes_index[g], t_list)
for g, t_list in self.gene_2_term.items()]
gene_2_term_matrix = scipy.sparse.coo_matrix(
([1 for g, t_list in gene_2_term for t in t_list],
([g for g, t_list in gene_2_term for t in t_list],
[t for g, t_list in gene_2_term for t in t_list])),
shape=(len(self.genes), len(self.terms)))
return gene_2_term_matrix
def summary(self):
"""Summarize the Ontology's contents with respect to number of genes,
terms, and connections.
Returns
--------
: str
"""
if self.node_attr is None:
node_attr_names = []
else:
node_attr_names = self.node_attr.columns.tolist()
# node_attr_names = ', '.join(map(str, self.node_attr.columns))
if self.edge_attr is None:
edge_attr_names = []
else:
edge_attr_names = self.edge_attr.columns.tolist()
# edge_attr_names = ', '.join(map(str, self.edge_attr.columns))
summary = ('%s genes, '
'%s terms, '
'%s gene-term relations, '
'%s term-term relations'
'\nnode_attributes: %s'
'\nedge_attributes: %s') % (
len(self.genes),
len(self.terms),
sum([len(x) for x in self.gene_2_term.values()]),
sum([len(x) for x in self.parent_2_child.values()]),
node_attr_names,
edge_attr_names)
return summary
[docs] def to_ndex(self,
ndex_user,
ndex_pass,
ndex_server=None,
name=None,
description=None,
network=None,
main_feature=None,
subnet_max_term_size=None,
visible_term_attr=None,
layout='bubble',
propagate='reverse',
style=None,
node_alias='Original_Name',
term_2_uuid=None,
visibility='PUBLIC',
verbose=False):
"""Upload an Ontology object to NDEx. The Ontology can be preformatted in
several ways including
1. Set a name and description of the Ontology
2. Upload a supporting gene-gene subnetwork for every term in the Ontology
3. Propagate gene-term annotations
4. Layout the nodes.
5. Apply a visual style, e.g. specifying node and edge colors
Parameters
----------
name : str
Name of Ontology
description : str
Description of Ontology
layout : str
The name of the layout algorithm for laying out the
Ontology as a graph. Node positions are stored in the
node attributes 'x_pos' and 'y_pos'. If None, then do not
perform a layout.
style : ndex.networkn.NdexGraph
The Cytoscape.js visual style on NDEx. Represented using
CX and stored in an NdexGraph.
network : pandas.Dataframe
Dataframe describing gene-gene network from which to
create subnetworks for every term. To be passed to
Ontology.upload_subnets_ndex().
features : list of str
Columns in the gene-gene network to upload. To be passed
to Ontology.upload_subnets_ndex().
ndex_server : str
URL of NDEx server
ndex_user : str
NDEx username
ndex_pass : str
NDEx password
public : bool
Whether to make the Ontology public on NDEx
node_alias : str
visibility : str
Returns
-------
: ndex.networkn.NdexGraph
"""
if propagate is not None:
ont = self.propagate(direction=propagate, inplace=False)
else:
ont = self
if ndex_server is None:
ndex_server = ddot.config.ndex_server
if (network is not None) and (term_2_uuid is None):
if subnet_max_term_size is None:
terms = ont.terms
else:
terms = [t for t,s in zip(ont.terms, ont.term_sizes) if s <= subnet_max_term_size]
# Only upload subnets for the unique set of the original
# terms
if node_alias in ont.node_attr.columns:
orig_2_new = {a : b.index.values for a, b in ont.node_attr.loc[terms, [node_alias]].groupby(node_alias)}
terms = [b[0] for b in orig_2_new.values()]
term_2_uuid = ont.upload_subnets_ndex(
network,
main_feature,
name,
ndex_user,
ndex_pass,
ndex_server=ndex_server,
terms=terms,
visibility=visibility,
verbose=verbose
)
if node_alias in ont.node_attr.columns:
term_2_uuid = {s : term_2_uuid[orig_2_new[t][0]] for t in orig_2_new for s in orig_2_new[t] if orig_2_new[t][0] in term_2_uuid}
elif term_2_uuid is None:
term_2_uuid = {}
if verbose: print('Creating NdexGraph')
G = ont.to_NdexGraph(
name=name,
description=description,
term_2_uuid=term_2_uuid,
layout=layout,
style=style)
if visible_term_attr is not None:
df = ddot.utils.nx_nodes_to_pandas(G, visible_term_attr)
df.rename(columns=lambda x: 'Display:' + x, inplace=True)
ddot.utils.set_node_attributes_from_pandas(G, df)
G.set_network_attribute('Display', '|'.join(visible_term_attr))
if verbose: print('Uploading to NDEx')
ont_url = G.upload_to(ndex_server, ndex_user, ndex_pass, visibility=visibility)
return ont_url, G
def to_NdexGraph(self,
name=None,
description=None,
term_2_uuid=None,
spanning_tree=True,
layout='bubble',
style=None,
verbose=False):
"""Formats an Ontology object into a NetworkX object with extra node
attributes that are accessed by the hierarchical viewer.
Parameters
-----------
name : str
Name of Ontology, as would appear if uploaded to NDEx.
description : str
Description of Ontology, as would appear if uploaded to NDEx.
term_2_uuid : dict
A dictionary mapping a term to a NDEx UUID of a gene-gene
subnetwork of genes in that term. the UUID will be stored
in the node attribute 'ndex:internallink'. If uploaded to
NDEx, then this attribute will provide a hyperlink to the
gene-gene subnetwork when the term is clicked upon on the
NDEx page for this ontology.
This dictionary can be created using
Ontology.upload_subnets_ndex(). Default: no dictionary.
layout : str
Layout the genes and terms in this Ontology. Stored in the
node attributes 'x_pos' and 'y_pos'. If None, then do not
perform a layout.
Returns
-------
: ndex.networkn.NdexGraph
"""
# Convert to NetworkX
G = self.to_networkx(layout=layout, spanning_tree=spanning_tree)
if style is None:
style = 'passthrough'
# Set extra attributes for passthrough visual styling
if style=='passthrough':
for v, data in G.nodes(data=True):
is_gene = data[self.NODETYPE_ATTR]==self.GENE_NODETYPE
if 'Vis:Shape' not in data:
data['Vis:Shape'] = 'Rectangle' if is_gene else 'Circle'
if 'Vis:Fill Color' not in data:
data['Vis:Fill Color'] = '#FFFFFF'
if 'Vis:Border Paint' not in data:
data['Vis:Border Paint'] = '#000000'
for u, v, data in G.edges(data=True):
if 'Vis:Visible' not in data and 'Is_Tree_Edge' in data:
data['Vis:Visible'] = data['Is_Tree_Edge']=='Tree'
style = ddot.config.get_passthrough_style()
else:
raise Exception('Unsupported style')
# Set links to subnetworks supporting each term
if term_2_uuid:
for t in self.terms:
if t in term_2_uuid:
G.node[t]['ndex:internalLink'] = '[%s](%s)' % (G.node[t]['Label'], term_2_uuid[t])
# # Change Original_Name to node indices
# name_2_idx = {data['name'] : v for v, data in G.nodes(data=True)}
# for v, data in G.nodes(data=True):
# if 'Original_Name' in data and 'Hidden' in data and data['Hidden']==True:
# data['Original_Name'] = name_2_idx[data['Original_Name']]
G = nx_to_NdexGraph(G)
if name is not None:
G.set_name(name)
if description is not None:
G.set_network_attribute('Description', description)
if style:
import ndex.beta.toolbox as toolbox
toolbox.apply_network_as_template(G, style)
return G
[docs] def to_cx(self,
output=None,
name=None,
description=None,
term_2_uuid=None,
spanning_tree=True,
layout='bubble',
style=None):
"""Formats an Ontology object into a CX file format
Parameters
-----------
output : str
Filename or file-like object to write CX file. If None,
then CX is returned as a JSON object, but not written to a
file.
name : str
Name of Ontology, as would appear if uploaded to NDEx.
description : str
Description of Ontology, as would appear if uploaded to NDEx.
term_2_uuid : list
A dictionary mapping a term to a NDEx UUID of a gene-gene
subnetwork of genes in that term. the UUID will be stored
in the node attribute 'ndex:internallink'. If uploaded to
NDEx, then this attribute will provide a hyperlink to the
gene-gene subnetwork when the term is clicked upon on the
NDEx page for this ontology.
This dictionary can be created using
Ontology.upload_subnets_ndex(). Default: no dictionary.
layout : str
Layout the genes and terms in this Ontology. Stored in the
node attributes 'x_pos' and 'y_pos'. If None, then do not
perform a layout.
Returns
-------
: CX representation as a JSON-like dictionary
"""
# Convert to NdexGraph
G = self.to_NdexGraph(name=name,
description=description,
term_2_uuid=term_2_uuid,
spanning_tree=spanning_tree,
layout=layout,
style=style)
cx = G.to_cx()
if output is not None:
if hasattr(output, 'write'):
json.dump(cx, output)
else:
with io.open(output, 'w') as f:
json.dump(cx, f)
return cx
[docs] def to_graphml(self,
output,
layout='bubble',
spanning_tree=True):
"""Writes an Ontology object in graphml format.
Parameters
-----------
output : str
Filename or file-like object to write CX file. If None,
then CX is returned as a JSON object, but not written to a
file.
layout : str
Layout the genes and terms in this Ontology. Stored in the
node attributes 'x_pos' and 'y_pos'. If None, then do not
perform a layout.
"""
# Convert to NetworkX
G = self.to_NdexGraph(spanning_tree=spanning_tree,
layout=layout)
if hasattr(output, 'write'):
nx.write_graphml(G, output)
else:
with io.open(output, 'w') as f:
nx.write_graphml(G, f)
def _force_directed_layout(self, G):
"""Force-directed layout on only the terms"""
sub_nx = G.copy()
sub_nx.remove_edges_from([(u,v) for u,v,attr in sub_nx.edges(data=True) if attr['Is_Tree_Edge']=='Not_Tree'])
pos = nx.spring_layout(sub_nx, dim=2, k=None,
pos=None,
fixed=None,
iterations=50,
weight=None,
scale=1.0)
tmp = np.array([x[0] for x in pos.values()])
x_min, x_max = tmp.min(), tmp.max()
tmp = np.array([x[1] for x in pos.values()])
y_min, y_max = tmp.min(), tmp.max()
x_scale = 500. / (y_max - y_min)
y_scale = 500. / (x_max - x_min)
pos = {a : [b[0] * x_scale, b[1] * y_scale] for a, b in pos.items()}
return pos
def upload_subnets_ndex(self,
network,
main_feature,
name,
ndex_user,
ndex_pass,
ndex_server=None,
terms=None,
gene_columns=['Gene1', 'Gene2'],
propagate='forward',
visibility='PUBLIC',
node_attr=None,
node_alias='Original_Name',
z_score=False,
spring_feature=None, spring_weight=1.0,
edge_groups=None,
max_num_edges = -1,
verbose=False):
"""For each term in the ontology, upload a subnetwork of interactions
between the genes in that term to NDEx.
TODO: instead of specifying gene_columns, add another
parameter use_index to specify that genes are the network's
index
Parameters
----------
network : pandas.Dataframe
Dataframe describing network
features : list of str
Columns in network to upload
name : str
Prefix for the names of all subnetworks
ndex_server : str
URL of NDEx server
ndex_user : str
NDEx username
ndex_pass : str
NDEx password
terms : list
List of terms to upload a subnetwork. Default: upload for
all terms.
gene_columns : list
Columns in network that represent the two genes.
propagate : str
The direction ('forward' or 'reverse') to propagate
gene-term annotations up the hierarchy with
Ontology.propagate(). If None, then don't
propagate annotations.
public : bool
Whether to make networks public on NDEx
node_attr : pandas.DataFrame
"""
if propagate:
ont = self.propagate(direction=propagate, inplace=False)
else:
ont = self
if ndex_server is None:
ndex_server = ddot.config.ndex_server
ndex = nc.Ndex(ndex_server, ndex_user, ndex_pass)
term_2_uuid = {}
start = time.time()
g1, g2 = gene_columns[0] + '_lex', gene_columns[1] + '_lex'
features = [f for f in network.columns if (f not in gene_columns)]
assert main_feature in features, 'A main feature of the network must be specified'
network = network[features + gene_columns].copy()
network[gene_columns[0]] = network[gene_columns[0]].astype(str)
network[gene_columns[1]] = network[gene_columns[1]].astype(str)
# Filter dataframe for gene pairs within the ontology
genes_set = set(ont.genes)
tmp = [x in genes_set and y in genes_set
for x, y in zip(network[gene_columns[0]], network[gene_columns[1]])]
network = network.loc[tmp, :]
# Lexicographically sort gene1 and gene2 so that gene1 < gene2
# actually this may be redundant
if z_score:
for feat in features:
network[feat] = network[feat].astype(np.float64)
# Normalize features into z-scores
tmp = network[features]
network[features] = (tmp - tmp.mean()) / tmp.std()
# network_sq = ddot.utils.pivot_square(network, g1, g2, main_feature)
# Calculate the min/max range of features
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
def f(x):
if str(x) in numerics:
return 'numeric'
elif str(x) == 'bool':
return 'boolean'
else:
raise Exception()
feature_types = network[features].dtypes.map(f)
feature_mins = network[features].min().astype(np.str)
feature_maxs = network[features].max().astype(np.str)
# set an upper limit to the maximum number of edges uploaded to NDEx
# (contributed by Fan Zheng)
if max_num_edges > 0:
network.sort_values(by = main_feature, ascending=False, inplace=True)
network = network.iloc[:max_num_edges, :]
# Lexicographically sort gene1 and gene2 so that gene1 < gene2
# actually this may be redundant
network[g1], network[g2] = zip(
*[(x, y) if x < y else (y, x) for x, y in zip(network[gene_columns[0]], network[gene_columns[1]])])
network_idx = {x: i for i, x in enumerate(zip(network[g1], network[g2]))}
if terms is None:
terms = ont.terms
if verbose: print('Uploading %s terms' % len(terms))
for upload_idx, t in enumerate(terms):
start = time.time()
if node_alias in ont.node_attr.columns:
genes = ont.node_attr.loc[genes, node_alias].values
else:
genes = [ont.genes[g] for g in ont.term_2_gene[t]]
genes.sort()
gene_pairs_idx = [network_idx[gp] for gp in itertools.combinations(genes, 2) \
if gp in network_idx]
# New (Parent weight)
children = ont.parent_2_child[t]
min_children_term_weights = -1
if ('Parent weight' in ont.node_attr.columns.tolist()) and (len(children) >0):
children_term_weights = []
for c in children:
if ont.node_attr.loc[c, 'Parent weight'] >0:
children_term_weights.append(ont.node_attr.loc[c, 'Parent weight'])
if len(children_term_weights):
children_term_weights = np.array(children_term_weights)
min_children_term_weights = np.min(children_term_weights)
if len(gene_pairs_idx) > 0:
network_sub = network.iloc[gene_pairs_idx, :]
network_sub = network_sub.loc[network_sub[main_feature] >= ont.node_attr.loc[t, 'Parent weight']]
# filter network if max_num_edges is greater then 0
if max_num_edges != None and max_num_edges > 0:
network_sub.sort_values(by=main_feature, ascending=False, inplace=True)
network_sub = network_sub.iloc[:max_num_edges, :]
# New: apply some minimum string force so nodes will not fly away
# if spring_feature != None:
# network_sub.loc[network_sub[spring_feature] < min_children_term_weights, spring_feature] = 0.5*min_children_term_weights
# network_sub[spring_feature] = network_sub[spring_feature] ** spring_weight
G_nx = nx.from_pandas_dataframe(network_sub, g1, g2,
edge_attr=features)
if node_attr is not None:
set_node_attributes_from_pandas(G_nx, node_attr)
G_nx.add_nodes_from(list(set(genes) - set(G_nx.nodes())))
# Annotate the membership in children terms
children = ont.parent_2_child[t]
df = pd.DataFrame({c : None for c in children}, index=genes, dtype=bool)
for c in children:
genes_in = [ont.genes[g] for g in ont.term_2_gene[c]]
# for g in genes_in:
# G_nx.node[g]['Group:'+c] = True
df.loc[genes_in, c] = True
df.rename(columns=lambda x: 'Group:'+x, inplace=True)
ddot.utils.set_node_attributes_from_pandas(G_nx, df)
# # If a gene belongs to multiple children, then place it where it is most similar
# for g_i in (df.sum(1) > 0).nonzero():
# g = genes[g_i]
# choices = df.loc[g, :].nonzero()
# network_sq.loc[g, :].argmax()
G = nx_to_NdexGraph(G_nx)
G.set_name('%s supporting network for %s' % (name, t))
G.set_network_attribute('Description', '%s supporting network for %s' % (name, t))
G.set_network_attribute('Main Feature', main_feature)
for f in features:
if (f == spring_feature) and (f != main_feature):
continue
G.set_network_attribute('%s type' % f, feature_types[f])
if feature_types[f] == 'numeric':
G.set_network_attribute('%s min' % f, feature_mins[f])
G.set_network_attribute('%s max' % f, feature_maxs[f])
# for c in children:
# G.set_network_attribute('Group:' + c, True)
G.set_network_attribute('Group', '|'.join(children))
# New: calculate the score threshold of this subnetwork
G.set_network_attribute('Main Feature Default Cutoff', float(ont.node_attr.loc[t, 'Parent weight']))
G.set_network_attribute('Parent weight', float(ont.node_attr.loc[t, 'Parent weight']))
if min_children_term_weights > 0:
G.set_network_attribute('Children weight', '|'.join(['{:.3f}'.format(w) for w in children_term_weights]))
# G.set_network_attribute('Main Feature Default Cutoff', float(min_children_term_weights))
if isinstance(edge_groups, dict) and (len(edge_groups.keys()) > 0):
edge_group_string = []
for k, vs in edge_groups.items():
vs.sort()
edge_group_string.append(','.join([k] + vs))
edge_group_string = '|'.join(edge_group_string)
G.set_network_attribute('edge groups', edge_group_string)
# New: only keep the biggest compoent in the network
G = max(nx.weakly_connected_component_subgraphs(G), key=len)
# # further remove degree == 1 nodes
# if len(G.nodes()) > 6:
# low_deg_nodes = []
# for v, deg in G.degree().items():
# if deg <= 1:
# low_deg_nodes.append(v)
#
# while len(low_deg_nodes) != 0:
# G.remove_nodes_from(low_deg_nodes)
# low_deg_nodes = []
# for v, deg in G.degree().items():
# if deg <= 1:
# low_deg_nodes.append(v)
# New: compute a pre-layout to networks
if spring_feature != None:
# G_cx = G.to_cx() # why converted back and forth
# G = NdexGraph(G_cx)
gsim = layouts._create_simple_graph(G)
pos = nx.spring_layout(gsim, scale=200 * math.sqrt(gsim.number_of_nodes()), weight=spring_feature)
G.pos = pos
# layouts.apply_directed_flow_layout(G, node_width=50, weight=spring_feature)
start_upload = time.time()
ndex_url = G.upload_to(ndex_server, ndex_user, ndex_pass, visibility=visibility)
term_2_uuid[t] = parse_ndex_uuid(ndex_url)
upload_time = time.time() - start_upload
if verbose:
print(upload_idx,
'Term:', t,
'Gene pairs:', len(G_nx.edges()),
'Genes:', len(genes),
'Time:', round(time.time() - start, 4),
'Upload time:', round(upload_time, 4),
'NDEx URL:', ndex_url)
else:
if verbose:
print(upload_idx, 'No data provided for gene pairs in Term: %s' % t)
return term_2_uuid
[docs] def get_best_ancestors(self, node_order=None, verbose=False, include_genes=True):
"""Compute the 'best' ancestor for every pair of terms. 'Best' is
specified by a ranking of terms. For example, if terms are
ranked by size, from smallest to largest, then the smallest
common ancestor is calculated.
Parameters
----------
node_order : list
A list of terms, ordered by their rank with the 'best' term at the beginning.
include_genes : bool
Returns
--------
ancestors : np.ndarray
ancestors[a,b] = the best common ancestor of terms a and
b, represented as a 0-based index of self.terms
nodes : list
List of the row and column names. Rows and columns are the
same.
"""
ont = self.propagate(direction='reverse', inplace=False)
graph = ont.to_igraph(include_genes=include_genes, spanning_tree=False)
if node_order is None:
# By default, sort from smallest to largest terms
node_order = [self.terms[t] for t in np.argsort(ont.term_sizes)]
d = np.int8(np.isfinite(np.array(graph.shortest_paths(graph.vs, graph.vs, mode='out'), order='C')))
ancestor_matrix = np.zeros(d.shape, dtype=np.int32)
ancestor_matrix.fill(-1)
if verbose: time_print('Iterating:')
for t in node_order:
i = graph.vs.find(t).index
t_i = self.terms_index[t]
# Note: includes self as a child
children = np.where(d[:,i] == 1)[0]
# For those descendants without a computed LCA yet, set their LCA to this term
lca_sub = ancestor_matrix[children.reshape(-1,1), children]
lca_sub[lca_sub == -1] = t_i
ancestor_matrix[children.reshape(-1,1), children] = lca_sub
# Check symmetry
assert (ancestor_matrix.T == ancestor_matrix).all()
assert (-1 == ancestor_matrix).sum() == 0, 'The ontology may have more than one root'
return ancestor_matrix, graph.vs['name']
@classmethod
def _make_tree_igraph(self,
graph=None,
method='priority',
edge_name='smallest_parent',
parent_priority=None, edge_priority=None, default_priority=None, optim='max'):
"""Returns copy of graph with new edge attribute marking spanning
tree
"""
if graph is None:
graph = self.to_igraph(include_genes=False, spanning_tree=True)
if method=='priority':
assert 1 == (parent_priority is not None) + (edge_priority is not None)
if edge_priority is not None: assert default_priority is not None
if optim=='min': optim=min
if optim=='max': optim=max
graph.es[edge_name] = False
for v in graph.vs:
parents = graph.neighbors(v.index, mode='out')
if len(parents) > 0:
"""Choose the parent with the highest valued priority"""
if parent_priority is not None:
small_parent = optim(parents, key=lambda p: parent_priority[p])
elif edge_priority is not None:
small_parent = optim(parents, key=lambda p: edge_priority.get(graph.get_eid(v.index, p), default_priority))
graph.es[graph.get_eid(v.index, small_parent)][edge_name] = True
else:
raise Exception('Method not supported')
return graph
[docs] def to_pickle(self, file, compression='infer'):
"""Saves Ontology object with the Python pickle protocol."""
pandas.io.pickle.to_pickle(self, file, compression=compression)
[docs] @classmethod
def read_pickle(cls, file, compression='infer'):
"""Loads an Ontology object from a pickled state."""
return pandas.io.pickle.read_pickle(file, compression=compression)
def __repr__(self):
return self.summary()
def __str__(self):
return self.summary()