Source code for ddot.Ontology

from __future__ import absolute_import, print_function, division

import itertools, multiprocessing, logging, os, collections, random, math, sys, time
from itertools import groupby, combinations
from operator import *
from collections import Counter
import tempfile
from subprocess import Popen, PIPE, STDOUT
import inspect
import shlex
import shutil
import io
from io import StringIO
import json
import datetime

import numpy as np
import pandas as pd
import pandas.io.pickle
import networkx as nx
import igraph
import scipy, scipy.sparse
from scipy.sparse import csr_matrix, coo_matrix
from scipy.stats import hypergeom

import ndex.client as nc
from ndex.networkn import NdexGraph
import ndex.beta.layouts as layouts
        
import ddot
import ddot.config
from ddot.utils import time_print, set_node_attributes_from_pandas, set_edge_attributes_from_pandas, nx_to_NdexGraph, NdexGraph_to_nx, parse_ndex_uuid, parse_ndex_server, make_index, update_nx_with_alignment, bubble_layout_nx, split_indices_chunk, invert_dict, make_network_public, nx_edges_to_pandas, nx_nodes_to_pandas, ig_edges_to_pandas, ig_nodes_to_pandas, melt_square, nx_set_tree_edges, gridify

def _collapse_node(g,
                   v,
                   edge_filter=None,
                   use_v_name=False,
                   combine_attrs=None,
                   default_attr=None,
                   verbose=True,
                   fast_collapse=False,
                   delete=True):
    """Collapses a node in a Graph (igraph package) while preserving
    long-range hierarchical relations between descendants and
    ancestral nodes.

    """

    if use_v_name:
        assert isinstance(v, str)
        v = g.vs.find(name_eq=v).index

    try:
        g.vs[v]
    except:
        raise Exception("Can't find vertex %s in graph. Consider setting use_v_name=True" % v)
    
    if fast_collapse:
        parents = g.neighbors(v, mode='out')
        children = g.neighbors(v, mode='in')

        if len(parents) > 0 and len(children) > 0:
            # A faster collapse that adds all new edges
            # simultaneously. Ignores edge attributes
            new_edges = [(c, p) for p in parents for c in children]
            new_edges = [x for x, y in zip(new_edges, g.get_eids(new_edges, error=False)) if y == -1]
            g.add_edges(new_edges)
    else:
        g.es['collapsed_length'] = 0
        g.es['collapsed_terms'] = [[] for x in g.es]
        
        in_edges = g.es[g.incident(v, mode='in')]
        out_edges = g.es[g.incident(v, mode='out')]

        if edge_filter is not None:
            in_edges = [e for e in in_edges if edge_filter(e)]
            out_edges = [e for e in out_edges if edge_filter(e)]

        for e_in in in_edges:
            for e_out in out_edges:

                in_neigh, out_neigh = e_in.source, e_out.target

                # Only add an edge if it doesn't already exist                                         
                if g[in_neigh, out_neigh] == 0:
                    g.add_edge(in_neigh, out_neigh)
                    e = g.es[g.get_eid(in_neigh, out_neigh)]
                    if combine_attrs is not None:
                        # Set default value of edge attributes to 0                                         
                        for key in combine_attrs:  e[key] = None

                e = g.es[g.get_eid(in_neigh, out_neigh)]

                # Update attributes
                if combine_attrs is not None:
                    for key in combine_attrs:
                        e[key] = combine_attrs[key](e_in, e_out, e)
                        if verbose and key=='triangle_edge_priority':
                            print('Setting',
                                  key,
                                  g.vs[in_neigh]['name'],
                                  g.vs[out_neigh]['name'],
                                  'to',
                                  combine_attrs[key](e_in, e_out, e),
                                  (e_in[key], e_out[key]))

                e['collapsed_length'] = e_in['collapsed_length'] + e_out['collapsed_length']
                e['collapsed_terms'] = e_in['collapsed_terms'] + [g.vs[v]['name']] + e_out['collapsed_terms']

    if delete:
        g.delete_vertices(v)

    return g

def read_alignment_file(f, source='Term_1'):
    """Parses an alignment file created from alignOntology's calculateFDRs script

    Parameters
    -----------
    f : str
       Filename of alignment file

    source : str 
       Indicates which ontology will be the index of the
       returned pandas.DataFrame. Value must be either 'Term_1' (first
       ontology) or 'Term_2' (second ontology)

    Returns
    --------
    : pandas.DataFrame
       DataFrame with four columns: 'Term', 'Similarity', 'FDR', and 'Size'.
       The index of the DataFrame are the names of terms in the "source" ontology. 
    
    """

    # Five columns in the input file
    # 1) Term from first "computed" ontology
    # 2) Term from second "reference" ontology
    # 3) Similarity value
    # 4) FDR
    # 5) Size of the term in the first ontology

    df = pd.read_table(f,
                       names=['Term_1', 'Term_2', 'Similarity', 'FDR', 'Size'],
                       dtype={'Term_1':str,
                              'Term_2':str,
                              'Similarity':np.float64,
                              'FDR':np.float64,
                              'Size':np.int64},
                       header=None)
    target = 'Term_2' if source=='Term_1' else 'Term_1'
    df.rename(columns={target : 'Term'}, inplace=True)
    df.set_index(source, inplace=True)
    df.index.rename('Term', inplace=True)
    return df

def align_hierarchies(hier1,
                      hier2,
                      iterations,
                      threads,
                      update_hier1=False,
                      update_hier2=False,
                      calculateFDRs=None,
                      mutual_collapse=True,
                      output=None,
                      verbose=False):
    
    if output is None:
        with tempfile.NamedTemporaryFile('w', delete=True) as output_file:
            return align_hierarchies(hier1, hier2, iterations, threads,
                                     update_hier1=update_hier1, update_hier2=update_hier2,
                                     mutual_collapse=mutual_collapse,
                                     output=output_file.name,
                                     calculateFDRs=calculateFDRs,
                                     verbose=verbose)

    common_genes = set(hier1.genes) & set(hier2.genes)

    hier1_orig, hier2_orig = hier1, hier2
        
    if len(common_genes) > 0:
        if mutual_collapse:
            hier1, hier2 = Ontology.mutual_collapse(hier1, hier2, verbose=verbose)
            hier1.clear_node_attr()
            hier1.clear_edge_attr()
            hier2.clear_node_attr()
            hier2.clear_edge_attr()
            hier1.propagate('reverse', inplace=True)
            hier2.propagate('reverse', inplace=True)

        def to_file(hier):
            if isinstance(hier, Ontology):
                with tempfile.NamedTemporaryFile('w', delete=False) as f:
                    hier.to_table(f, clixo_format=True)
                hier = f.name
            else:
                assert isinstance(hier, file) or os.path.exists(hier)
            return hier

        hier1 = to_file(hier1)
        hier2 = to_file(hier2)

        if calculateFDRs is None:
            top_level = os.path.dirname(os.path.abspath(inspect.getfile(ddot)))
            calculateFDRs = os.path.join(top_level, 'alignOntology', 'calculateFDRs')

            #assert os.path.isdir(ddot.config.alignOntology)
            #calculateFDRs = os.path.join(ddot.config.alignOntology, 'calculateFDRs')
        assert os.path.isfile(calculateFDRs)

        if threads is None:
            import multiprocessing
            threads = multiprocessing.cpu_count()

        output_dir = tempfile.mkdtemp(prefix='tmp')
        cmd = '{5} {0} {1} 0.05 criss_cross {2} {3} {4} gene'.format(
                  hier1, hier2, output_dir, iterations, threads, calculateFDRs)
        print('Alignment command:', cmd)

        p = Popen(shlex.split(cmd), shell=False)

        try:        
            p.wait()
            shutil.copy(os.path.join(output_dir, 'alignments_FDR_0.1_t_0.1'), output)
        finally:
            if os.path.isdir(output_dir):
                shutil.rmtree(output_dir)

            if p.poll() is None:
                if verbose: time_print('Killing alignment process %s. Output: %s' % (p.pid, output))
                p.kill()  # Kill the process
        align1 = read_alignment_file(output)[['Term', 'Similarity', 'FDR']]
    else:    
        align1 = pd.DataFrame(columns=['Term', 'Similarity', 'FDR'])
    
    align2 = align1.copy()
    align2.index, align2['Term'] = align2['Term'].values.copy(), align2.index.values.copy()

    append_prefix = lambda x: 'Aligned_%s' % x

    if update_hier1:
        if hasattr(update_hier1, '__iter__'):
            node_attr = hier2_orig.node_attr[update_hier1]
        else:
            node_attr = hier2_orig.node_attr

        hier2_import = pd.merge(pd.DataFrame(index=align2.index), node_attr, left_index=True, right_index=True, how='left')
        assert (hier2_import.index == align2.index).all()
        # Change index to terms in hier1
        hier2_import.index = align2['Term'].copy()
        hier2_import.rename(columns=append_prefix, inplace=True)

    if update_hier2:
        if hasattr(update_hier2, '__iter__'):
            node_attr = hier1_orig.node_attr[update_hier2]
        else:
            node_attr = hier1_orig.node_attr

        hier1_import = pd.merge(pd.DataFrame(index=align1.index), node_attr, left_index=True, right_index=True, how='left')
        assert (hier1_import.index == align1.index).all()
        # Change index to terms in hier2
        hier1_import.index = align1['Term'].copy()
        hier1_import.rename(columns=append_prefix, inplace=True)

    if update_hier1:
        hier1_orig.update_node_attr(align1.rename(columns=append_prefix))
        hier1_orig.update_node_attr(hier2_import)
    if update_hier2:
        hier2_orig.update_node_attr(align2.rename(columns=append_prefix))
        hier2_orig.update_node_attr(hier1_import)

    return align1

def parse_obo(obo,
              output_file=None,
              id2name_file=None,
              id2namespace_file=None,
              alt_id_file=None):
    """Parses an OBO file and writes the results into several tables.

    Parameters
    ----------
    obo : str

        Filename of OBO file

    output_file : str

        Filename to write table that describes the ontology's
        hierarchical structure. The table has four columns: (1) parent
        term, (2) child term, (3) relation type (e.g. "is_a" or
        "part_of"), (4) namespace of relation
        (e.g. "biological_process" or "cellular component")

    id2name_file : str

        Filename to write table of term descriptions.  The table has
        two columns: (1) Ontology term (e.g. "GO:0000030"), (2)
        description (e.g. "mannosyltransferase activity")

    id2namespace_file : str
    
        Filename to write table of term namespaces.  The table has two
        columns: (1) Ontology term (e.g. "GO:0000030"), (2) namespace
        of the term (e.g. "biological_process")

    alt_id_file : str
    
        Filename to write table of alternative Term IDs that are
        synonyms and refer to the same term. The table has two
        columns: (1) Primary Term ID, (2) Alternative Term ID

    """

    ## Keywords that screw up parsing:
    # import, is_anonymous, intersection_of, union_of

    ## Relations
    # 'is_a:'
    # 'relationship: has_part'  # Not in filtered GO
    # 'relationship: occurs_in' # Not in filtered GO
    # 'relationship: part_of'   
    # 'relationship: positively_regulates' 
    # 'relationship: negatively_regulates'
    # 'relationship: regulates'
    # 'relationship: results_in' # Not in filtered GO

    stanza, edges = [], []
    id2name = dict()
    id2namespace = dict()
    alt_id = dict()
    in_term_stanza = False
    default_namespace_exists = False
    for line in io.open(obo).read().splitlines():

        line = line.split('!')[0].strip()  # Remove comments

        if len(line)>0 and line[0]=='[' and line[-1]==']':
            # Add last stanza if it was a term stanza.  Include namespace.
            if in_term_stanza:
                edges.extend(x+(namespace, ) for x in stanza)

            # Start new term stanza
            stanza = []
            
            # Set the default namespace, if it exists
            if default_namespace_exists:
                namespace = default_namespace
            
            # In a term stanzo or not
            in_term_stanza = line =='[Term]'

            name = None
                
        #if 'alt_id:' in line: assert False

        if 'id:' == line[:3]:
            curr_term = line.split('id:')[1].strip()
        elif 'alt_id:' in line:
            alt_term = line.split('alt_id:')[1].strip()
            if curr_term in alt_id:  alt_id[curr_term].append(alt_term)
            else:                          alt_id[curr_term] = [alt_term]
            id2name[alt_term] = name
        elif 'name:' in line:
            name = line.split('name:')[1].strip()
            assert not curr_term in id2name
            id2name[curr_term] = name
        elif 'is_a:' in line:
            parent = line.split('is_a:')[1].strip()
            stanza.append((parent, curr_term, 'is_a'))
        elif 'relationship:' in line:
            line = line.split('relationship:')[1].strip().split()
            if len(line)!=2: print(line)
            assert len(line)==2
            relation, parent = line
            stanza.append((parent, curr_term, relation))
        elif 'namespace:' == line[:10]:
            namespace = line.split('namespace:')[1].strip()
            assert not curr_term in id2namespace
            id2namespace[curr_term] = namespace
        elif 'default-namespace:' == line[:18]:
            namespace = line.split('default-namespace:')[1].strip()
            default_namespace_exists = True
            default_namespace = namespace

    pd.DataFrame(edges).to_csv(output_file, header=False, index=False, sep='\t')
    pd.Series(id2name).to_csv(id2name_file, sep='\t')
    pd.Series(id2namespace).to_csv(id2namespace_file, sep='\t')
    pd.Series(dict([(a, c) for a, b in alt_id.items() for c in b])).to_csv(alt_id_file, sep='\t')

def parse_gaf(gaf):
    """
    Read gene-term annotations from GAF file format:

    http://geneontology.org/page/go-annotation-file-gaf-format-21

    Parameters
    ----------
    gaf : str
        Filename of GAF file

    Returns
    --------
    A list of 2-tuples (gene, GO term)
    """

    gaf_columns = ['DB', 'DB Object ID', 'DB Object Symbol',
                   'Qualifier', 'GO ID', 'DB:Reference',
                   'Evidence Code', 'With (or) From', 'Aspect',
                   'DB Object Name', 'DB Object Synonym',
                   'DB Object Type', 'Taxon', 'Date',
                   'Assigned By', 'Annotation Extension',
                   'Gene Product Form ID']
    df = pd.read_table(gaf, header=None, comment='!', names=gaf_columns)

    # Check that all annotations are to UniProtKB protein IDs
    # assert df['DB'].unique().size == 1 and df['DB'].unique()[0]=='UniProtKB'

    # Remove annotations that have a NOT qualifier
    df = df.loc[df['Qualifier']!='NOT', :]

#    return df.loc[:, ['DB Object ID', 'GO ID']].values.tolist()
    return df

[docs]class Ontology(object): """A Python representation for constructing, analyzing, and manipulating the hierarchical structure of ontologies. An Ontology object contains the following attributes for representing the hierarchical structure. Do not directly modify these attributes. Parameters ---------- genes : list Names of genes terms : list Names of terms gene_2_term : dict gene_2_term[<gene>] --> list of terms connected to <gene>. Terms are represented as their 0-based index in self.terms. term_2_gene : dict term_2_gene[<term>] --> list of genes connected to <term>. Genes are represented as their 0-based index in self.genes. child_2_parent : dict child_2_parent[<child>] --> list of the parent terms of <child> parent_2_child : dict parent_2_child[<parent>] --> list of the children terms of <parent> term_sizes : list A list of every term's size, i.e. the number of unique genes that it and its descendant terms contain. This list has the same order as self.terms. It holds that for every i, `term_sizes[i] = len(self.term_2_gene[self.terms[i]])` """ NODETYPE_ATTR = 'NodeType' GENE_NODETYPE = 'Gene' TERM_NODETYPE = 'Term' EDGETYPE_ATTR = 'EdgeType' GENE_TERM_EDGETYPE = 'Gene-Term' CHILD_PARENT_EDGETYPE = 'Child-Parent' def __init__(self, hierarchy, mapping, edge_attr=None, node_attr=None, parent_child=False, add_root_name=None, propagate=None, ignore_orphan_terms=False, verbose=True, **kwargs): """Construct an Ontology object. Parameters ---------- hierarchy : list, tuple Iterable of (child term, parent term). E.g. list of 2-tuples mapping : list, tuple Iterable of (gene, term) pairs. E.g. list of 2-tuples edge_attr : pandas.DataFrame Meta-data describing (child_term, parent_term) pairs. Suggestion: The index of the DataFrame must be a pandas.MultiIndex, where the first level is the child term and the second level is the parent term. parent_child : bool If True, then the definitions of <hierarchy> and <mapping> are reversed so that they iterate over (parent term, child term) and (term, gene) pairs. propagate : None, str The direction ('forward' or 'reverse') to propagate gene-term annotations up the hierarchy with Ontology.propagate(). If None, then don't propagate annotations. add_root_name : bool The name of an artificial root. If there are multiple roots in the ontology, then they are joined into one root with this name. Default: Don't create this root. ignore_orphan_terms : bool """ if 'empty' in kwargs and kwargs['empty'] is True: return if parent_child: hierarchy = [(x[1],x[0]) for x in hierarchy] mapping = [(x[1],x[0]) for x in mapping] # Cast all node names to strings hierarchy = [(str(x[0]),str(x[1])) for x in hierarchy] mapping = [(str(x[0]),str(x[1])) for x in mapping] ## Read term-to-term edges # parent_2_child[<term_name>] --> list of <term_name>'s children terms self.parent_2_child = {r: [p[0] for p in q] for r, q in \ itertools.groupby(sorted(hierarchy, key=lambda a:a[1]), key=lambda a:a[1])} ## Read gene-to-term edges # self.gene_2_term[<gene_name>] --> list of terms that <gene_name> is mapped to self.gene_2_term = {key: set([a[1] for a in group]) for key, group in \ itertools.groupby(sorted(mapping, key=lambda a:a[0]), key=lambda a:a[0])} ## Check that the set of terms is the same according to ## parent_2_child and self.gene_2_term terms_A = set.union(set(self.parent_2_child.keys()), *[set(x) for x in self.parent_2_child.values()]) if len(self.gene_2_term) > 0: terms_B = set.union(*self.gene_2_term.values()) else: terms_B = set([]) if verbose and ignore_orphan_terms and len(terms_B - terms_A)>0: print('WARNING: Ignoring {} terms are connected to genes but not to other terms'.format(len(terms_B - terms_A))) # if verbose and len(terms_A - terms_B)>0: # print 'WARNING: {} terms connected to other terms but not to genes'.format(len(terms_A - terms_B)) if ignore_orphan_terms: self.terms = sorted(terms_A) else: self.terms = sorted(terms_A | terms_B) self.genes = sorted(self.gene_2_term.keys()) if add_root_name is not None: root_list = self.get_roots() if len(root_list) > 1: print('Unifying %s roots into one super-root' % len(root_list)) self.parent_2_child[add_root_name] = root_list self.terms.append(add_root_name) ## terms_index[<term_name>] --> index in self.terms self.terms_index = make_index(self.terms) ## self.genes_index[<gene_name>] --> index in self.genes self.genes_index = make_index(self.genes) ## Convert self.gene_2_term to list term indices rather than term names for k, v in self.gene_2_term.items(): self.gene_2_term[k] = [self.terms_index[x] for x in self.gene_2_term[k] if x in self.terms_index] if node_attr is None: self.clear_node_attr() else: assert node_attr.index.nlevels == 1 if node_attr.index.name != 'Node': # if verbose: # print("Changing node_attr index name from %s to 'Node'" % node_attr.index.name) # # import traceback # # print traceback.print_stack() node_attr.index.name = 'Node' self.node_attr = node_attr if edge_attr is None: self.clear_edge_attr() else: assert edge_attr.index.nlevels == 2 edge_attr.index.names = ['Child', 'Parent'] # if 'Child' in edge_attr.index.names and 'Parent' in edge_attr.index.names: # edge_attr.index = edge_attr.index[['Child', 'Parent']] # else: # edge_attr.index.names = ['Child', 'Parent'] # if edge_attr.index.names != ['Child', 'Parent']: # if verbose: # print("Changing edge_attr index names from %s to ['Child', 'Parent']" % edge_attr.index.names) # edge_attr.index.names = ['Child', 'Parent'] self.edge_attr = edge_attr self._update_fields() if propagate: self.propagate(direction=propagate, inplace=True) self._update_fields() self._check_valid() # ## Not necessary and requires extra start-up time (perhaps set as a __init__ parameter to precalculate many things) # empty_terms = sum([x==0 for x in self.term_sizes]) # if verbose and empty_terms > 0: # print 'WARNING: {} terms are connected to other terms but not to genes'.format(empty_terms), [t for t, x in zip(self.terms, self.term_sizes) if x==0][:5] # # import traceback # # print traceback.print_stack() def _update_fields(self, reset_term_sizes=True): self.child_2_parent = self._get_child_2_parent() self.term_2_gene = self._get_term_2_gene() if reset_term_sizes: self._term_sizes = None for t in self.terms: if t not in self.parent_2_child: self.parent_2_child[t] = [] if t not in self.child_2_parent: self.child_2_parent[t] = [] def add_root(self, root_name, inplace=False): """Check if there is a single unifying root term of the ontology. If not, then identify the multiple roots and join them under an artificial root.""" if inplace: ont = self else: ont = self.copy() assert root_name not in ont.terms root_list = ont.get_roots() if len(root_list) > 1: print('Unifying %s roots into one super-root' % len(root_list)) ont.parent_2_child[root_name] = root_list ont.terms.append(root_name) ont.terms_index = make_index(sorted(ont.terms)) for g, t_list in ont.gene_2_term.items(): ont.gene_2_term[g] = [ont.terms_index[ont.terms[t]] for t in t_list] ont.terms.sort() ont._update_fields() return ont def _get_child_2_parent(self): """ Converts self.parent_2_child to child_2_parent # child_2_parent[<term_name>] --> list of <term_name>'s parent term names """ cp_pairs = [] for p, c_list in self.parent_2_child.items(): for c in c_list: cp_pairs.append((c,p)) first = lambda a: a[0] cp_pairs.sort(key=first) child_2_parent = { r: [p[1] for p in q] for r, q in itertools.groupby(cp_pairs, key=first) } for t in self.terms: if t not in child_2_parent: child_2_parent[t] = [] return child_2_parent def clear_node_attr(self): """Resets the node attributes to be empty.""" self.node_attr = pd.DataFrame() self.node_attr.index.name = 'Node' def clear_edge_attr(self): """Resets the edge attributes to be empty.""" self.edge_attr = pd.DataFrame() self.edge_attr.index = pd.MultiIndex(levels=[[],[]], codes=[[],[]], names=['Child', 'Parent']) def update_node_attr(self, node_attr): """Update existing node attributes or add new node attributes. Parameters ---------- node_attr : pandas.DataFrame Dataframe where index are the names of genes or terms and where the columns are the names of node attributes. """ #### # TODO : make sure that renaming/deleting/collapsing of genes and columns respect the node_attr and edge_attr # Filter for genes and terms in the ontology nodes = set(self.genes) | set(self.terms) node_attr = node_attr.loc[[x for x in node_attr.index if x in nodes], :] assert node_attr.index.duplicated().sum() == 0 # Update index to the union of current and new node_attr self.node_attr = self.node_attr.reindex(self.node_attr.index.union(node_attr.index)) # Update columns for col in node_attr.columns: self.node_attr.loc[node_attr.index, col] = node_attr[col] def update_edge_attr(self, edge_attr): """Update existing edge attributes or add new edge attributes. Parameters ---------- edge_attr : pandas.DataFrame Dataframe where the index is a MultiIndex represents edges in the Ontology, such that the first level is the name of a gene or child term, and the second level is the name of a parent term. Columns are the names of edge attributes. """ # Filter for genes and terms in the ontology edges = [] for child, parent_list in self.child_2_parent.items(): for parent in parent_list: edges.append((child, parent)) for gene, term_list in self.gene_2_term.items(): for term in term_list: edges.append((gene, self.terms[term])) edges = set(edges) edge_attr = edge_attr.loc[[x for x in edge_attr.index if x in edges], :] assert edge_attr.index.duplicated().sum() == 0 # Update index self.edge_attr = self.edge_attr.reindex(self.edge_attr.index.union(edge_attr.index)) # Update values for overlapping columns for col in edge_attr.columns: self.edge_attr.loc[edge_attr.index, col] = edge_attr[col].values def get_roots(self): """Returns a list of the root term(s). Returns ------- : list """ tmp = set(self.terms) - set([y for x in self.parent_2_child.values() for y in x]) return sorted(tmp) def _make_dummy(self, tree_edges=None): """For each term T in the ontology, create a new dummy term that indirectly connect T's to T. For example, if g1 and g2 are in T, then a new term dummy_T is created so that the new ontology consists of g1 --> T_dummy g2 --> T_dummy T_dummy --> T Parameters ---------- tree_edges : list List of (child, parent) edges that constitute a spanning tree of the ontology. If specified, then for each term T, only the genes that are connected to T in the spanning tree will be re-routed to the dummy node. Default: None. This restriction will not apply Returns ------- : ddot.Ontology.Ontology """ ont = self new_gene_2_term = [] new_child_2_parent = [] for t in ont.terms: used_dummy = False if len(ont.parent_2_child[t]) > 0: dummy_term = 'dummy2_%s' % t else: dummy_term = t for g in [ont.genes[g] for g in ont.term_2_gene[t]]: if (tree_edges is None) or (g,t) in tree_edges: new_gene_2_term.append((g, dummy_term)) used_dummy=True if used_dummy and dummy_term != t: new_child_2_parent.append([dummy_term, t]) for p in ont.child_2_parent[t]: if (tree_edges is None) or (t,p) in tree_edges: new_child_2_parent.append((t, p)) ont_dummy = Ontology(new_child_2_parent, new_gene_2_term) return ont_dummy def _collect_transform(self, tree_edges=None, hidden_gene=True, hidden_parent=True, hidden_child=True): """ Creates intermediate duplicate nodes """ ont = self if tree_edges is None: tree_edges = self.get_tree() nodes_copy = {v : 1 for v in ont.genes + ont.terms} def get_copy(u): u_name = '%s.%s' % (u, nodes_copy[u]) nodes_copy[u] += 1 return u_name collect_nodes = [] new_gene_2_term = [] new_child_2_parent = [] for t in ont.terms: ## Gene-term connections collect_hidden_gene = 'collect_hidden_gene_%s' % t used_hidden_gene = False for g in [ont.genes[g] for g in ont.term_2_gene[t]]: if (not hidden_gene) or ((g, t) in tree_edges): new_gene_2_term.append((g, collect_hidden_gene)) used_hidden_gene = True else: new_gene_2_term.append((get_copy(g), collect_hidden_gene)) used_hidden_gene = True if used_hidden_gene: collect_nodes.append(collect_hidden_gene) new_child_2_parent.append((collect_hidden_gene, t)) ## Parent-child term connections collect_hidden_child = 'collect_hidden_child_%s' % t collect_hidden_parent = 'collect_hidden_parent_%s' % t used_hidden_child, used_hidden_parent = False, False for c in ont.parent_2_child[t]: if (not hidden_child) or ((c,t) in tree_edges): new_child_2_parent.append((c,t)) else: new_child_2_parent.append((get_copy(c), collect_hidden_child)) used_hidden_child = True for p in ont.child_2_parent[t]: if hidden_parent and ((t,p) not in tree_edges): new_child_2_parent.append((get_copy(p), collect_hidden_parent)) used_hidden_parent = True if used_hidden_child: collect_nodes.append(collect_hidden_child) new_child_2_parent.append((collect_hidden_child, t)) if used_hidden_parent: collect_nodes.append(collect_hidden_parent) new_child_2_parent.append((collect_hidden_parent, t)) ont_collect = Ontology(new_child_2_parent, new_gene_2_term, node_attr=ont.node_attr.copy(), edge_attr=ont.edge_attr.copy(), verbose=False) ################################################## # Set Original_Name and Size for Duplicate Nodes # new_and_orig = [('%s.%s' %(v,i), v) for v, copy_num in nodes_copy.items() for i in (range(1, copy_num) if copy_num>1 else [])] new_2_orig = dict(new_and_orig) df = pd.DataFrame({'orig_tmp' : [x[1] for x in new_and_orig], 'Hidden' : True}, index=[x[0] for x in new_and_orig]) df = df.astype({'orig_tmp' : np.str, 'Hidden' : np.bool}) # For duplicate nodes, set the Original_Name attribute to the name of the original node merge = pd.merge(df, ont.node_attr, how='left', left_on=['orig_tmp'], right_index=True) if 'Original_Name' in merge: unset = pd.isnull(merge['Original_Name']).values merge.loc[unset, 'Original_Name'] = df.loc[unset, 'orig_tmp'].values else: merge['Original_Name'] = df['orig_tmp'].values del merge['orig_tmp'] # Set the 'Size' attribute of duplicate nodes to be the 'Size' # of the original node. If the original node is a term with no # 'Size' attribute, then set 'Size' to be the number of genes # in the term in_merge = set(merge.index) for node in merge.index: if node in new_2_orig: orig = new_2_orig[node] if orig in in_merge and not pd.isnull(merge.loc[orig, 'Size']): merge.loc[node, 'Size'] = merge.loc[new_2_orig[node], 'Size'] elif orig in ont.terms_index: merge.loc[node, 'Size'] = ont.term_sizes[ont.terms_index[orig]] # Append attributes for the new nodes try: # Used for pandas version >= 0.23 ont_collect.node_attr = pd.concat([ont.node_attr, merge], axis=0, sort=True) except: ont_collect.node_attr = pd.concat([ont.node_attr, merge], axis=0) ######################################## # Set Label and Size for collect nodes # ######################################## def get_label(x): if 'hidden_child' in x: return 'Linked Children' elif 'hidden_parent' in x: return 'Linked Parents' elif 'hidden_gene' in x: return 'Linked Genes' elif 'tree_gene' in x: return 'Genes' collect_attr = pd.DataFrame( {'Size' : 1, 'Label' : [get_label(x) for x in collect_nodes], 'is_collect_node' : True}, index=collect_nodes) ont_collect.update_node_attr(collect_attr) return ont_collect
[docs] def unfold(self, duplicate=None, genes_only=False, levels=None, tree_edges=None): """Traverses the ontology from the root to the leaves while duplicating nodes during the traversal to create a tree representation. Traverse the ontology from the root nodes to the leaves in a breadth-first manner. Each time a node is traversed, then create a duplicate of it Parameters ---------- duplicate : list Nodes to duplicate for unfolding. Default: all genes and terms genes_only : bool If True, then duplicate all of the genes and none of the terms. Default: False levels : """ ont = self.propagate(direction='reverse', inplace=False) hidden_mode = levels is not None if hidden_mode: if tree_edges is None: tree_edges = self.get_tree() hidden_depth = {} if genes_only: duplicate = ont.genes elif duplicate is None: duplicate = ont.genes + ont.terms nodes_copy = {x : 0 for x in duplicate} def get_name(u): if u in nodes_copy: u_name = '%s.%s' % (u, nodes_copy[u]) nodes_copy[u] += 1 else: u_name = u return u_name to_expand = [] new_2_orig = {} for u in ont.get_roots(): u_name = get_name(u) new_2_orig[u_name] = u to_expand.append(u_name) if hidden_mode: hidden_depth[u_name] = 0 expanded = set(to_expand) hierarchy, mapping = [], [] # Manual bfs curr = 0 while curr < len(to_expand): v_name = to_expand[curr] v = new_2_orig[v_name] for u in [ont.genes[u] for u in ont.term_2_gene[v]]: u_name = get_name(u) new_2_orig[u_name] = u mapping.append((u_name, v_name)) if hidden_mode: v_depth = hidden_depth[v_name] if v_depth==0: if (u,v) in tree_edges: hidden_depth[u_name] = 0 else: hidden_depth[u_name] = 1 elif v_depth < levels: hidden_depth[u_name] = v_depth + 1 for u in ont.parent_2_child[v]: u_name = get_name(u) new_2_orig[u_name] = u hierarchy.append((u_name, v_name)) if hidden_mode: v_depth = hidden_depth[v_name] insert = u_name not in expanded if v_depth==0 and ((u,v) in tree_edges): hidden_depth[u_name] = 0 elif v_depth < levels: hidden_depth[u_name] = v_depth + 1 else: insert = False else: insert = u_name not in expanded if insert: to_expand.append(u_name) expanded.add(u_name) curr += 1 new_nodes, orig_nodes = zip(*new_2_orig.items()) new_nodes, orig_nodes = list(new_nodes), list(orig_nodes) ont.node_attr = ont.node_attr.reindex(list(set(orig_nodes))) node_attr = ont.node_attr.loc[orig_nodes, :].copy() if 'Original_Name' in node_attr: unset = pd.isnull(node_attr['Original_Name']).values node_attr.loc[unset, 'Original_Name'] = np.array(orig_nodes)[unset] else: node_attr['Original_Name'] = orig_nodes if hidden_mode: node_attr['Level'] = [hidden_depth[v] for v in new_nodes] node_attr.index = new_nodes node_attr.dropna(axis=0, how='all', inplace=True) new_edges = hierarchy + mapping old_edges = [(new_2_orig[u], new_2_orig[v]) for u, v in new_edges] in_index = [x in ont.edge_attr.index for x in old_edges] if sum(in_index) > 0: edge_attr = ont.edge_attr.loc[[x for x, y in zip(old_edges, in_index) if y], :].copy() edge_attr.index = pd.MultiIndex.from_tuples([x for x, y in zip(new_edges, in_index) if y]) edge_attr.dropna(axis=0, how='all', inplace=True) else: edge_attr = None ont = Ontology(hierarchy, mapping, edge_attr=edge_attr, node_attr=node_attr, parent_child=False, verbose=False) return ont
def _to_networkx_no_layout(self): G = nx.DiGraph() ################################# ### Add nodes and node attributes G.add_nodes_from(self.genes + self.terms) set_node_attributes_from_pandas(G, self.node_attr) # Ensure that all 'Size' values are the same numeric type if 'Size' in self.node_attr.columns: dtype = self.node_attr['Size'].dtype if dtype in [np.dtype('float16'), np.dtype('float32'), np.dtype('float64')]: dtype = float else: dtype = int else: dtype = int for t in self.terms: G.node[t][self.NODETYPE_ATTR] = self.TERM_NODETYPE if ('Size' not in G.node[t]) or pd.isnull(G.node[t]['Size']): G.node[t]['Size'] = dtype(self.term_sizes[self.terms_index[t]]) G.node[t]['isRoot'] = False for g in self.genes: G.node[g][self.NODETYPE_ATTR] = self.GENE_NODETYPE if ('Size' not in G.node[g]) or pd.isnull(G.node[g]['Size']): G.node[g]['Size'] = dtype(1) G.node[g]['isRoot'] = False # Identify the root root = self.get_roots()[0] G.node[root]['isRoot'] = True # Set the node attribute 'Label'. If the node has a "Original # Name" attribute, indicating that it is a duplicate, then use # that. Otherwise, use the node's name. for x in self.genes + self.terms: data = G.node[x] if ('Label' not in data) or pd.isnull(data['Label']): if ('Original_Name' in data) and (not pd.isnull(data['Original_Name'])): data['Label'] = data['Original_Name'] else: data['Label'] = x ################################# ### Add edges and edge attributes G.add_edges_from([(g, self.terms[t], {self.EDGETYPE_ATTR : self.GENE_TERM_EDGETYPE}) \ for g in self.genes for t in self.gene_2_term[g]]) G.add_edges_from([(c, p, {self.EDGETYPE_ATTR : self.CHILD_PARENT_EDGETYPE}) \ for p in self.terms for c in self.parent_2_child.get(p, [])]) set_edge_attributes_from_pandas(G, self.edge_attr) return G def expand(self, spanning_tree=True): if spanning_tree is True: ont = self._collect_transform() else: # Assume a list of tree edges are supplied ont = self._collect_transform(spanning_tree) G_tree = ont.get_tree(ret='ontology')._to_networkx_no_layout() pos = bubble_layout_nx(G_tree) tmp = ont.node_attr[['Label', 'is_collect_node']].dropna() collect_nodes = tmp[tmp['is_collect_node']].index gridify(collect_nodes, pos, G_tree) ## Remove collector nodes def decide_delete(v): return ((not layout_params['hidden_parent'] and v=='Linked Parents') or (not layout_params['hidden_child'] and v=='Linked Children') or (not layout_params['hidden_gene'] and v=='Linked Genes')) tmp = ont.node_attr[['Label', 'is_collect_node']].dropna() tmp = tmp[tmp['is_collect_node']] tmp = tmp[tmp['Label'].apply(decide_delete)] to_delete = tmp.index.tolist() ont_red = ont if len(to_delete) > 0: # Need fast special delete ont_red = ont_red.delete(to_delete=to_delete, preserve_transitivity=True) # Set the original term sizes for the original copy of # each term (not the duplicates) ont_red.update_node_attr(pd.DataFrame({'Size' : self.term_sizes}, index=self.terms)) G = ont_red._to_networkx_no_layout() nodes_set = set(G.nodes()) G.pos = {n : (float(scale*p[0]), float(scale*p[1])) for n, p in pos.items() if n in nodes_set} nx_set_tree_edges(G, ont_red.get_tree()) ###################################################### # TODO: move this visual styling outside of the layout # functionality nx.set_edge_attributes(G, values='ARROW', name='Vis:EDGE_SOURCE_ARROW_SHAPE') nx.set_edge_attributes(G, values='NONE', name='Vis:EDGE_TARGET_ARROW_SHAPE') for v, data in G.nodes(data=True): # if 'collect_hidden' in v and 'is_collect_node' in data and data['is_collect_node']: # for u in G.predecessors(v): # G.node[u]['Vis:Fill Color'] = '#3182BD' try: if 'collect_hidden_parent' in v and 'is_collect_node' in data and data['is_collect_node']: for _, _, data in G.in_edges(v, data=True): data["Vis:EDGE_TARGET_ARROW_SHAPE"] = 'ARROW' data["Vis:EDGE_SOURCE_ARROW_SHAPE"] = 'NONE' except: print(data) print('v', v) print('collect_hidden_parent' in v) print('is_collect_node' in data) print(data['is_collect_node']) raise
[docs] def to_networkx(self, layout='bubble', spanning_tree=True, layout_params=None, verbose=False): """Converts Ontology into a NetworkX object. Parameters ---------- node_attr : pandas.DataFrame Meta-data about genes and terms that will be included as node attributes in the NetworkX object. edge_attr : pandas.DataFrame Meta-data about connections among genes and terms that will be included as edge attributes in the NetworkX object. spanning_tree : bool If True, then identify a spanning tree of the DAG. include an edge attribute "Is_Tree_Edge" that indicates layout : str The name of the layout algorithm for laying out the Ontology as a graph. Node positions are astored in the node attributes 'x_pos' and 'y_pos'. If None, then do not perform a layout. Returns ------- : nx.DiGraph """ default_layout_params = {'hidden_parent' : True, 'hidden_child' : False, 'hidden_gene' : False} if layout_params is not None: default_layout_params.update(layout_params) layout_params = default_layout_params if spanning_tree: scale = 1 if layout is None or layout=='bubble': G = self._to_networkx_no_layout() if spanning_tree is True: tree_edges = self.get_tree() else: tree_edges = spanning_tree nx_set_tree_edges(G, tree_edges) if layout=='bubble': G_tree = self.propagate('reverse')._make_dummy(tree_edges)._to_networkx_no_layout() pos = bubble_layout_nx(G_tree, verbose=verbose) gridify([v for v in G_tree.nodes() if 'dummy2' in v], pos, G_tree) G.pos = {n : (float(scale*p[0]), float(scale*p[1])) for n, p in pos.items() if 'dummy2' not in n} elif layout=='bubble-collect': if spanning_tree is True: ont = self._collect_transform() else: # Assume a list of tree edges are supplied ont = self._collect_transform(spanning_tree) G_tree = ont.get_tree(ret='ontology')._to_networkx_no_layout() pos = bubble_layout_nx(G_tree) tmp = ont.node_attr[['Label', 'is_collect_node']].dropna() collect_nodes = tmp[tmp['is_collect_node']].index gridify(collect_nodes, pos, G_tree) ## Remove collector nodes def decide_delete(v): return ((not layout_params['hidden_parent'] and v=='Linked Parents') or (not layout_params['hidden_child'] and v=='Linked Children') or (not layout_params['hidden_gene'] and v=='Linked Genes')) tmp = ont.node_attr[['Label', 'is_collect_node']].dropna() tmp = tmp[tmp['is_collect_node']] tmp = tmp[tmp['Label'].apply(decide_delete)] to_delete = tmp.index.tolist() ont_red = ont if len(to_delete) > 0: # Need fast special delete ont_red = ont_red.delete(to_delete=to_delete, preserve_transitivity=True) # Set the original term sizes for the original copy of # each term (not the duplicates) ont_red.update_node_attr(pd.DataFrame({'Size' : self.term_sizes}, index=self.terms)) G = ont_red._to_networkx_no_layout() nodes_set = set(G.nodes()) G.pos = {n : (float(scale*p[0]), float(scale*p[1])) for n, p in pos.items() if n in nodes_set} nx_set_tree_edges(G, ont_red.get_tree()) ###################################################### # TODO: move this visual styling outside of the layout # functionality nx.set_edge_attributes(G, values='ARROW', name='Vis:EDGE_SOURCE_ARROW_SHAPE') nx.set_edge_attributes(G, values='NONE', name='Vis:EDGE_TARGET_ARROW_SHAPE') for v, data in G.nodes(data=True): # if 'collect_hidden' in v and 'is_collect_node' in data and data['is_collect_node']: # for u in G.predecessors(v): # G.node[u]['Vis:Fill Color'] = '#3182BD' try: if 'collect_hidden_parent' in v and 'is_collect_node' in data and data['is_collect_node']: for _, _, data in G.in_edges(v, data=True): data["Vis:EDGE_TARGET_ARROW_SHAPE"] = 'ARROW' data["Vis:EDGE_SOURCE_ARROW_SHAPE"] = 'NONE' except: print(data) print('v', v) print('collect_hidden_parent' in v) print('is_collect_node' in data) print(data['is_collect_node']) raise else: raise Exception('Unsupported layout: %s', layout) if layout is not None: nx.set_node_attributes(G, values={n : x for n, (x,y) in G.pos.items()}, name='x_pos') nx.set_node_attributes(G, values={n : y for n, (x,y) in G.pos.items()}, name='y_pos') else: G = self._to_networkx_no_layout() return G
[docs] @classmethod def from_table(cls, table, parent=0, child=1, is_mapping=None, mapping=None, mapping_parent=0, mapping_child=1, header=0, propagate=False, verbose=False, clixo_format=False, clear_default_attr=True, **kwargs): """Create Ontology from a tab-delimited table or pandas DataFrame. Duplicate gene-term or term-term connections in the table are removed. Parameters ---------- table : pandas.DataFrame, file-like object, or filename A table that lists (child term, parent term) pairs. If mapping==None, then this table should also include (gene, term) pairs. parent : int or str Column for parent terms in table (index or name of column) child : int or str Column for child terms and genes in table (index or name of column) is_mapping : function A function that is applied on each row and returns True if the row represents a (gene, term) pair and False otherwise. This function is only applied when a separate table of (gene, term) pairs is not specified, i.e. mapping==None. The default function is `lambda row: row[2]=={0}` which tests if the third column equals the string "{0}". mapping : pandas.DataFrame, file-like object, or filename (optional) A separate table listing only (gene, term) pairs mapping_parent : int or str Column for terms in mapping table (index or name of column) mappping_child : int or str Column for genes in mapping table (index or name of column) header : int or None Row number to use as the column names, which are then stored in the resulting Ontology object's `edge_attr` field. For example if `header=0` (default), then the first row is assumed to be column names. If `header=None`, then no column names are assumed. propagate : None or str The direction ('forward' or 'reverse') for propagating gene-term annotations up the hierarchy with Ontology.propagate(). If None, then don't propagate annotations. clixo_format : bool If True, The table is assumed to be in the same format produced by the CLIXO C++ implementation. In particular, table has three columns: Column 1) Parent Term Column 2) Child Term or Gene Column 3) The string "gene" if the row is a gene-term mapping, otherwise the string "default". The table is also assumed to have no column headers (i.e. header=False) clear_default_attr: bool If True (default), then remove the edge attribute 'EdgeType' created using Ontology.to_table(). This attribute was created to make the table be an equivalent representation of an Ontology object; however, it is no longer necessary after reconstructing the Ontology object. Returns ------- : ddot.Ontology.Ontology """.format(cls.GENE_TERM_EDGETYPE) if clixo_format: ont = cls.from_table( table, parent=0, child=1, is_mapping=lambda x: x[2]=='gene', header=None, clixo_format=False, verbose=verbose) ont.edge_attr.columns = map(str, ont.edge_attr.columns) del ont.edge_attr['2'] return ont if is_mapping is None: if mapping is None: # print('WARNING: no gene-term connections ' # 'were specified by the is_mapping ' # 'function or separate table. ' # 'Default: assume a gene-term connection when the 3rd column equals %s' % cls.GENE_TERM_EDGETYPE) is_mapping = lambda x: x.iloc[2]==cls.GENE_TERM_EDGETYPE # Read table try: table = pd.read_table(table, comment='#', header=header) except: assert isinstance(table, pd.DataFrame) if child not in table.columns: child = table.columns[child] if parent not in table.columns: parent = table.columns[parent] for col in [child, parent]: table.loc[:,col] = table.loc[:,col].astype(str) edge_attr = table.set_index([child, parent]) edge_attr.index.rename(['Child', 'Parent'], inplace=True) if mapping is None: # Extract gene-term connections from table mask = table.apply(is_mapping, axis=1) mapping = table.loc[mask, :].loc[:,[child, parent]] hierarchy = table.loc[~mask, :].loc[:,[child, parent]] mapping_child, mapping_parent = child, parent else: # Read separate table of gene-term connections try: mapping = pd.read_table(mapping, comment='#', header=header) except: assert isinstance(mapping, pd.DataFrame) if mapping_child not in mapping.columns: mapping_child = mapping.columns[mapping_child] if mapping_parent not in mapping.columns: mapping_parent = mapping.columns[mapping_parent] for col in [mapping_child, mapping_parent]: mapping.loc[:,col] = mapping.loc[:,col].astype(str) mapping_attr = mapping.set_index([mapping_child, mapping_parent]) mapping_attr.index.rename(['Child', 'Parent'], inplace=True) try: # Used for pandas version >= 0.23 edge_attr = pd.concat([edge_attr, mapping_attr], sort=True) except: edge_attr = pd.concat([edge_attr, mapping_attr]) mapping = mapping.loc[:,[mapping_child, mapping_parent]] hierarchy = table.loc[:,[child, parent]] dups = mapping.duplicated([mapping_child, mapping_parent]).sum() if dups > 0: print('WARNING: Dropping %s duplicate gene-term connections' % dups) mapping.drop_duplicates([mapping_child, mapping_parent], inplace=True) dups = hierarchy.duplicated([child, parent]).sum() if dups > 0: print('WARNING: Dropping %s duplicate term-term connections' % dups) hierarchy.drop_duplicates([child, parent], inplace=True) edge_attr = edge_attr.loc[~ edge_attr.index.duplicated(), :] edge_attr.index.names = ['Child', 'Parent'] if clear_default_attr: if cls.EDGETYPE_ATTR in edge_attr: del edge_attr[cls.EDGETYPE_ATTR] mapping, hierarchy = mapping.values.tolist(), hierarchy.values.tolist() return cls(hierarchy, mapping, parent_child=False, edge_attr=edge_attr, propagate=propagate, verbose=verbose, **kwargs)
@classmethod def from_scipy_linkage(cls, Z): """Creates an Ontology object from a linkage matrix created by scipy's hierarchical/agglomerative clustering. Note that this form of clustering produces a binary tree. """ import scipy.cluster.hierarchy rootnode, nodelist = scipy.cluster.hierarchy.to_tree(Z, rd=True) leaves = set(scipy.cluster.hierarchy.leaves_list(Z)) hierarchy, mapping = [], [] for v in nodelist: v_id = v.get_id() if v.get_left(): child = v.get_left().get_id() if child in leaves: mapping.append((v_id, child)) else: hierarchy.append((v_id, child)) if v.get_right(): child = v.get_right().get_id() if child in leaves: mapping.append((v_id, child)) else: hierarchy.append((v_id, child)) return cls(hierarchy, mapping, parent_child=True)
[docs] @classmethod def from_ndex(cls, ndex_uuid, ndex_user=None, ndex_pass=None, ndex_server=None, edgetype_attr=None, edgetype_value=None): """Reads an Ontology stored on NDEx. Gene and terms are distinguished according by an edge attribute. Parameters ---------- ndex_uuid : str NDEx UUID of ontology edgetype_attr : str Name of the edge attribute that distinguishes a (gene, term) pair from a (child term, parent term) pair gene_value : str Value of the edge attribute for (gene, term) pairs Returns ------- : ddot.Ontology.Ontology """ if ndex_server is None: ndex_server = ddot.config.ndex_server if ndex_user is None: ndex_user = ddot.config.ndex_user if ndex_pass is None: ndex_pass = ddot.config.ndex_pass if '/' in ndex_uuid: ndex_server = parse_ndex_server(ndex_uuid) ndex_uuid = parse_ndex_uuid(ndex_uuid) G = NdexGraph( server=ndex_server, username=ndex_user, password=ndex_pass, uuid=ndex_uuid) return cls.from_NdexGraph( G, edgetype_attr=edgetype_attr, edgetype_value=edgetype_value)
@classmethod def from_NdexGraph(cls, G, edgetype_attr=None, edgetype_value=None): """Converts a NdexGraph object to an Ontology object. Gene and terms are distinguished by an edge attribute. Parameters ---------- G : NdexGraph edgetype_attr : str Name of the edge attribute that distinguishes a (gene, term) pair from a (child term, parent term) pair gene_value : str Value of the edge attribute for (gene, term) pairs Returns ------- : ddot.Ontology.Ontology """ return cls.from_networkx( NdexGraph_to_nx(G), edgetype_attr=edgetype_attr, edgetype_value=edgetype_value)
[docs] @classmethod def from_networkx(cls, G, edgetype_attr=None, edgetype_value=None, clear_default_attr=True): """Converts a NetworkX object to an Ontology object. Gene and terms are distinguished by an edge attribute. Parameters ---------- G : nx.DiGraph edgetype_attr : str Name of the edge attribute that distinguishes a (gene, term) pair from a (child term, parent term) pair edgetype_value : str Value of the edge attribute for (gene, term) pairs clear_default_attr : bool If True (default), then remove the node and edge attributes that are created in a NetworkX graph using Ontology.to_networkx() or Ontology.to_ndex(). These attributes include 'Label', 'Size', 'NodeType', and 'EdgeType'. These attributes were created to make the NetworkX graph be an equivalent representation of an Ontology object; however, they are no longer necessary after reconstrcting the Ontology object. Returns ------- : ddot.Ontology.Ontology """ if edgetype_attr is None: edgetype_attr=cls.EDGETYPE_ATTR if edgetype_value is None: edgetype_value=cls.GENE_TERM_EDGETYPE hierarchy = [] mapping = [] for u, v, attr in G.edges(data=True): if attr[edgetype_attr] == edgetype_value: mapping.append((u, v)) else: hierarchy.append((u, v)) edge_attr = nx_edges_to_pandas(G) node_attr = nx_nodes_to_pandas(G) ont = cls(hierarchy, mapping, node_attr=node_attr, edge_attr=edge_attr) if clear_default_attr: for attr in [Ontology.NODETYPE_ATTR, 'Label', 'Size', 'isRoot', 'x_pos', 'y_pos']: if attr in ont.node_attr.columns: del ont.node_attr[attr] for attr in [edgetype_attr, 'Is_Tree_Edge']: if attr in ont.edge_attr.columns: del ont.edge_attr[attr] return ont
[docs] @classmethod def from_igraph(cls, G, edgetype_attr=None, edgetype_value=None, verbose=False): """Converts a igraph Graph object to an Ontology object. Gene and terms are distinguished by an edge attribute. Parameters ---------- G : igraph.Graph edgetype_attr : str Name of the edge attribute that distinguishes a (gene, term) pair from a (child term, parent term) pair edgetype_value : str Value of the edge attribute for (gene, term) pairs Returns ------- : ddot.Ontology.Ontology """ if edgetype_attr is None: edgetype_attr=cls.EDGETYPE_ATTR if edgetype_value is None: edgetype_value=cls.GENE_TERM_EDGETYPE hierarchy = [] mapping = [] for e in G.es: u = G.vs[e.source]['name'] v = G.vs[e.target]['name'] if e[edgetype_attr] == edgetype_value: mapping.append((u, v)) else: hierarchy.append((u, v)) edge_attr = ig_edges_to_pandas(G) node_attr = ig_nodes_to_pandas(G) edge_attr.index.names = ['Child', 'Parent'] node_attr.index.name = 'Node' ont = cls(hierarchy, mapping, node_attr=node_attr, edge_attr=edge_attr, verbose=verbose) for attr in [Ontology.NODETYPE_ATTR]: if attr in ont.node_attr.columns: del ont.node_attr[attr] for attr in [edgetype_attr, 'Is_Tree_Edge']: if attr in ont.edge_attr.columns: del ont.edge_attr[attr] return ont
def collapse_ontology(self, method='python', to_keep=None, min_term_size=2, verbose=True): """Remove redundant and empty terms. When a term T is removed, hierarchical relations are preserved by connecting every child of T with every parent of T. This removal operation has the nice property of being commutative, i.e. the order of removal does not matter. Parameters ----------- method : str If "mhkramer", then use the collapseRedundantNodes script in the alignOntology package. If "python", then use an internal Python script. min_term_size : int Remove terms that are below this size. TODO: not yet supported Returns ------- : ddot.ddot.Ontology A new Ontology object """ if method=='mhkramer': assert to_keep is None, 'to_keep is only supported for method=="python"' # Propagate forward and then reverse ont = self.copy() ont = self.propagate(direction='forward', inplace=False) ont.propagate(direction='reverse', inplace=True) top_level = os.path.dirname(os.path.abspath(inspect.getfile(ddot))) collapseRedundantNodes = os.path.join(top_level, 'alignOntology', 'collapseRedundantNodes') # assert os.path.isdir(ddot.config.alignOntology) # collapseRedundantNodes = os.path.join(ddot.config.alignOntology, 'collapseRedundantNodes') assert os.path.isfile(collapseRedundantNodes) with tempfile.NamedTemporaryFile('w', delete=False) as f: ont.to_table(f, clixo_format=True) try: cmd = '%s %s' % (collapseRedundantNodes, f.name) print('collapse command:', cmd) p = Popen(shlex.split(cmd), shell=False, stdout=PIPE, stderr=PIPE) collapsed, err = p.communicate() collapsed = collapsed.decode() finally: os.remove(f.name) ont = Ontology.from_table( StringIO(collapsed), is_mapping=lambda x: x[2]=='gene', clixo_format=True ) ont.clear_edge_attr() ont.update_node_attr(self.node_attr) ont.update_edge_attr(self.edge_attr) return ont elif method=='python': ont = self.propagate('forward', inplace=False) term_hash = {t : hash(tuple(g_list)) for t, g_list in ont.term_2_gene.items()} to_collapse = set() for p in ont.parent_2_child: for c in ont.parent_2_child[p]: if term_hash[p] == term_hash[c]: to_collapse.add(p) if min_term_size is not None: to_collapse = to_collapse | set([t for t, s in zip(ont.terms, ont.term_sizes) if s < min_term_size]) if to_keep is not None: to_collapse = to_collapse - set(to_keep) # print('to_collapse:', sorted(to_collapse)) ont.propagate('reverse', inplace=True) ont_red = ont.delete(to_delete=to_collapse, preserve_transitivity=True) return ont_red @classmethod def mutual_collapse(cls, ont1, ont2, verbose=False): """Collapses two ontologies to the common set of genes. Parameters ----------- ont1 : ddot.Ontology.Ontology ont2 : ddot.Ontology.Ontology Returns ------- ont1_collapsed : ddot.Ontology.Ontology ont2_collapsed : ddot.Ontology.Ontology """ common_genes = set(ont1.genes) & set(ont2.genes) if verbose: print('Common genes:', len(common_genes)) if len(common_genes) > 0: ont1 = ont1.delete(to_delete=set(ont1.genes) - common_genes, inplace=False) ont1_collapsed = ont1.collapse_ontology() ont2 = ont2.delete(to_delete=set(ont2.genes) - common_genes, inplace=False) ont2_collapsed = ont2.collapse_ontology() else: raise Exception('No common genes between ontologies') if verbose: print('ont1_collapsed:', ont1_collapsed.summary()) print('ont2_collapsed:', ont2_collapsed.summary()) return ont1_collapsed, ont2_collapsed
[docs] def focus(self, branches=None, genes=None, collapse=False, root=True, verbose=True): """ """ assert (branches is not None) or (genes is not None) to_keep = np.array(self.genes + self.terms) if branches is not None: to_keep = to_keep[self.connected(to_keep, branches).sum(1) > 0] if verbose: print('Genes and Terms to keep:', to_keep.size) if genes is not None: to_keep = to_keep[self.connected(genes, to_keep).sum(0) > 0] if verbose: print('Genes and Terms to keep:', to_keep.size) if root: while True: common_root = self.common_ancestors(to_keep, minimal=True) if common_root in to_keep or len(common_root)<=1: break else: print('Adding', common_root) to_keep = np.append(to_keep, common_root) ont = self.delete(to_keep=to_keep, preserve_transitivity=True) if collapse: ont = ont.collapse_ontology(method='python', to_keep=ont.get_roots()) df = ont.to_table(edge_attr=True) new_connections = [] for t in ont.terms: removed_genes = set([self.genes[g] for g in self.term_2_gene[t]]) - set([ont.genes[g] for g in ont.term_2_gene[t]]) removed_terms = set(self.parent_2_child[t]) - set(ont.parent_2_child[t]) if len(removed_genes) > 0: new_connections.append(('%s_%s_other_genes' % (t, len(removed_genes)), t, self.GENE_TERM_EDGETYPE)) if len(removed_terms) > 0: new_connections.append(('%s_%s_other_terms' % (t, len(removed_terms)), t, self.CHILD_PARENT_EDGETYPE)) if len(new_connections) > 0: new_connections = pd.DataFrame(new_connections) new_connections.columns = ['Child', 'Parent', self.EDGETYPE_ATTR] new_nodes = new_connections['Child'].values.tolist() new_connections['Summary'] = True df['Summary'] = False try: # Used for pandas version >= 0.23 tmp = pd.concat([df, new_connections], ignore_index=True, sort=True) except: tmp = pd.concat([df, new_connections], ignore_index=True) df = tmp[df.columns] ont = Ontology.from_table(df) ont.update_node_attr(self.node_attr) # orig_sizes = pd.DataFrame({'Original_Size' : self.term_sizes}, index=self.terms) # ont.update_node_attr(orig_sizes) # if len(new_connections)>0: # summary_sizes = pd.DataFrame({'Original_Size' : [int(x.split('_')[1]) for x in new_nodes]}, index=new_nodes) # ont.update_node_attr(summary_sizes) if len(new_connections) > 0: ont.update_node_attr(pd.DataFrame({'Label':['_'.join(x.split('_')[1:]) for x in new_nodes]}, index=new_nodes)) return ont
[docs] def delete(self, to_delete=None, to_keep=None, preserve_transitivity=True, inplace=False): """Delete genes and/or terms from the ontology. Parameters ---------- to_delete : array-like (optional) Names of genes and/or terms to delete. Either to_delete or to_keep must be specified. to_keep : array-like (optional) Names of genes and/or terms to keep; all other genes/terms are delete. Only used if to_delete is not specified. preserve_transitivity : bool If True, then maintain transitive relations when deleting terms. For example, if the hierarchical structure consists of geneA --> term1 term1 --> term2 term2 --> term3 term2 --> term4 then deleting term2 will result in the structure: geneA --> term1 term1 --> term3 term3 --> term4 If False, then deleting term2 will result in a disconnected structure: geneA --> term1 inplace : bool If True, then modify the ontology. If False, then create and modify a copy. Returns ------- : ddot.Ontology.Ontology """ if inplace: ont = self else: ont = self.copy() if to_delete is not None: terms = set([x for x in to_delete if x in ont.terms_index]) genes = set([x for x in to_delete if x in ont.genes_index]) elif to_keep is not None: terms = set(ont.terms) - set([x for x in to_keep if x in ont.terms_index]) genes = set(ont.genes) - set([x for x in to_keep if x in ont.genes_index]) else: raise Exception('Must specify nodes to delete or to keep') if len(genes) > 0: ont.genes = [g for g in ont.genes if g not in genes] ont.genes_index = make_index(ont.genes) ont.gene_2_term = {g : t for g, t in ont.gene_2_term.items() if g not in genes} ont._update_fields() if len(terms) > 0: if preserve_transitivity: gene_2_term_set = {g : set([ont.terms[s] for s in t]) for g, t in ont.gene_2_term.items()} term_2_gene_set = {a : set(b) for a, b in ont.term_2_gene.items()} child_2_parent_set = {a : set(b) for a, b in ont.child_2_parent.items()} parent_2_child_set = {a : set(b) for a, b in ont.parent_2_child.items()} for t in terms: t_parents = child_2_parent_set[t] t_genes = term_2_gene_set[t] t_children = parent_2_child_set[t] for g_i in t_genes: g = ont.genes[g_i] gene_2_term_set[g].update(t_parents) gene_2_term_set[g].remove(t) for p in t_parents: term_2_gene_set[p].update(t_genes) parent_2_child_set[p].update(t_children) parent_2_child_set[p].remove(t) for c in t_children: child_2_parent_set[c].update(t_parents) child_2_parent_set[c].remove(t) del child_2_parent_set[t] del parent_2_child_set[t] del term_2_gene_set[t] ont.terms = [t for t in ont.terms if t not in terms] terms_index = make_index(ont.terms) ont.terms_index = terms_index ont.gene_2_term = {g : sorted([terms_index[s] for s in t]) for g, t in gene_2_term_set.items()} ont.child_2_parent = {c : sorted(p) for c, p in child_2_parent_set.items()} ont.parent_2_child = invert_dict(ont.child_2_parent) ont._update_fields() else: tmp_gene_2_term = {g : [ont.terms[t] for t in t_list] for g, t_list in ont.gene_2_term.items()} ont.terms = [t for t in ont.terms if t not in terms] ont.terms_index = make_index(ont.terms) ont.gene_2_term = {g : [ont.terms_index[t] for t in t_list if t not in terms] for g, t_list in tmp_gene_2_term.items()} ont.parent_2_child = {p : [c for c in c_list if c not in terms] for p, c_list in ont.parent_2_child.items() if p not in terms} ont._update_fields() # Update node/edge attributes to_keep = (set(ont.terms) | set(ont.genes)) - genes - terms ont.edge_attr = ont.edge_attr[ont.edge_attr.index.get_level_values(0).isin(to_keep) | \ ont.edge_attr.index.get_level_values(1).isin(to_keep)] ont.node_attr = ont.node_attr[ont.node_attr.index.isin(to_keep)] return ont
def rename(self, genes=lambda x: x, terms=lambda x: x, inplace=False): """Rename gene and/or term names. Parameters ---------- genes : dict or function If dictionary, then it maps current gene names to new names. Genes not in dictionary are deleted. If function, then genes(name) returns the new name. terms : dict or function If dictionary, then it maps current term names to new names. Terms not in dictionary are deleted. If function, then terms(name) returns the new name. inplace : bool If True, then modify the ontology. If False, then create and modify a copy. Returns ------- : ddot.Ontology.Ontology """ try: terms = {t : terms(t) for t in self.terms} except: pass try: genes = {g : genes(g) for g in self.genes} except: pass if inplace: ont = self else: ont = self.copy() if genes: new_genes = set() new_gene_2_term = {} for g in ont.genes: new_g = genes.get(g, g) if hasattr(new_g, '__iter__') and not isinstance(new_g, str): for new_gg in new_g: new_genes.add(new_gg) new_gene_2_term[new_gg] = ont.gene_2_term[g] else: new_genes.add(new_g) new_gene_2_term[new_g] = ont.gene_2_term[g] ont.genes = sorted(new_genes) ont.gene_2_term = new_gene_2_term ont.genes_index = make_index(ont.genes) ont._update_fields() if terms: ont.parent_2_child = {terms.get(p, p) : [terms.get(c, c) for c in c_list] for p, c_list in ont.parent_2_child.items()} old_term_names = ont.terms ont.terms = [terms.get(t,t) for t in ont.terms] # Retain a unique set of term names ont.terms = sorted(set(ont.terms)) ont.terms_index = make_index(ont.terms) ont.gene_2_term = {g : [ont.terms_index[terms.get(t,t)] for t in [old_term_names[t] for t in t_list]] for g, t_list in ont.gene_2_term.items()} ont._update_fields() conversions = genes.copy() conversions.update(terms) # Remove identities conversions = {k : v for k, v in conversions.items() if k!=v} f = lambda x: conversions.get(x,x) # Update node attributes index = ont.node_attr.index ont.node_attr.index = pd.Series(index).map(f) # Update edge attributes idx = ont.edge_attr.index idx.set_levels([idx.levels[0].map(f), idx.levels[1].map(f)], inplace=True) ont._check_valid() return ont def _check_valid(self): if not self.is_dag(): print('Found cycle:', nx.find_cycle(self._to_networkx_no_layout())) raise Exception('Not a directed acyclic graph') assert len(self.genes) == len(set(self.genes)) assert len(self.terms) == len(set(self.terms)) assert set(self.genes) == set(self.gene_2_term.keys()) assert set(self.terms) == set(self.child_2_parent.keys()) assert set(self.terms) == set(self.parent_2_child.keys()) assert set(self.terms) == set(self.term_2_gene.keys()) assert self.edge_attr.index.duplicated().sum()==0 assert self.node_attr.index.duplicated().sum()==0
[docs] def to_table(self, output=None, term_2_term=True, gene_2_term=True, edge_attr=False, header=True, parent_child=True, clixo_format=False): """Convert Ontology to a table representation. Return a pandas.DataFrame and, optionally, write it to a file as a tab-delimited file. Parameters ---------- output : filepath or file-like File to write table. If None, then only return a pandas.DataFrame term_2_term : bool Include (child term, parent term) pairs gene_2_term : bool Include (gene, term) pairs edge_attr : array-like or bool List of extra edge attributes to include. If True, then include all attributes. If False, then don't include any attribute. header : bool If True (default), then write the column names as the first row of the table. parent_child : bool If True, then the first column is the parent term and the second column is the child term or gene. If False, then the columns are reversed. clixo_format : bool If True, the table is the same format used the CLIXO C++ implementation. In particular, the table has three columns: Column 1) Parent Term Column 2) Child Term or Gene Column 3) The string "gene" if the row is a gene-term mapping, otherwise the string "default". Returns ------- : pandas.DataFrame Contains at least three columns: (1) "Parent", (2) "Child", and (3) "EdgeType". """ if clixo_format: df = self.to_table(output=None, term_2_term=True, gene_2_term=True, edge_attr=False, header=False, parent_child=True, clixo_format=False) df.replace({self.EDGETYPE_ATTR : {self.GENE_TERM_EDGETYPE : 'gene', self.CHILD_PARENT_EDGETYPE : 'default'}}, inplace=True) if output is not None: df.to_csv(output, header=False, index=False, sep='\t') return df df = pd.DataFrame(columns=['Parent','Child',self.EDGETYPE_ATTR]) if term_2_term: df = df.append(self._hierarchy_to_pandas(), ignore_index=True) if gene_2_term: df = df.append(self._mapping_to_pandas(), ignore_index=True) if edge_attr and self.edge_attr.shape[1] > 0: if edge_attr==True: edge_attr = df.columns df = df.merge(self.edge_attr, how='left', left_on=['Child', 'Parent'], right_index=True) first_two = ['Parent', 'Child'] if parent_child else ['Child', 'Parent'] df = df[first_two + [x for x in df.columns if x not in first_two]] if output is not None: df.to_csv(output, header=header, index=False, sep='\t') return df
def _hierarchy_to_pandas(self): triples = [(p,c) for p, c_list in self.parent_2_child.items() for c in c_list] df = pd.DataFrame(triples, columns=['Parent', 'Child']) df[self.EDGETYPE_ATTR] = self.CHILD_PARENT_EDGETYPE return df def _mapping_to_pandas(self): pairs = [(self.terms[t], g) for g, t_list in self.gene_2_term.items() for t in t_list] df = pd.DataFrame(pairs, columns=['Parent', 'Child']) df[self.EDGETYPE_ATTR] = self.GENE_TERM_EDGETYPE return df def copy(self): """Create a deep copy of the Ontology object""" ont = Ontology(None, None, **{'empty' : True}) for x in ['node_attr', 'edge_attr']: setattr(ont, x, getattr(self, x).copy()) for x in ['genes', 'terms']: setattr(ont, x, getattr(self, x)[:]) if self._term_sizes is None: ont._term_sizes = None else: ont._term_sizes = self._term_sizes[:] for x in ['genes_index', 'terms_index']: setattr(ont, x, getattr(self, x).copy()) for x in ['gene_2_term', 'term_2_gene', 'child_2_parent', 'parent_2_child']: copy_val = {k : v[:] for k, v in getattr(self, x).items()} setattr(ont, x, copy_val) return ont
[docs] def flatten(self, include_genes=True, include_terms=False, similarity='Resnik'): """Flatten the hierarchy into a node-node similarity matrix by calculating a similarity between pair of genes in `genes_subset`. Currently, only the Resnik semantic similarity measure is implemented. Parameters ----------- include_genes : bool If True, then calculate pairwise similarities between genes. If `include_terms` is also True, then also calculate similarities between genes and terms. include_terms : bool If True, then calculate pairwise similarities between terms. If `include_genes` is also True, then also calculate similarities between genes and terms. similarity : str Type of semantic similarity. (default: "Resnik") The Resnik similarity s(g1,g2) is defined as :math:`-log_2(|T_{sca}| / |T_{root}|)` where :math:`|T|` is the number of genes in `genes_subset` that are under term T. :math:`T_{sca}` is the "smallest common ancestor", the common ancestral term with the smallest term size. :math:`T_{root}` is the root term of the ontology. Resnik, P. (1999). Semantic similarity in a taxonomy: An information-based measured and its application to problems of ambiguity in natural language. J. Artif. Intell. Res. 11,95-130. Returns ------- : (sim, nodes) A 2-tuple consisting of `sim`, a node-by-node NumPy array, and `nodes`, a NumPy array of the node names in `sim`. """ assert include_genes assert not include_terms, 'include_terms is not yet implemented' if similarity=='Resnik': sca, nodes = self.get_best_ancestors(include_genes=include_genes) nodes_subset = self.genes if include_genes else [] nodes_subset += self.terms if include_terms else [] nodes_idx = ddot.utils.make_index(nodes) idx = [nodes_idx[v] for v in nodes_subset] sca = sca[idx, :][:, idx] ss = -1 * np.log2(np.array(self.term_sizes)[sca] / float(len(self.genes))) ss = ss.astype(np.float32) return ss, np.array(nodes_subset) else: raise Exception('Unsupported similarity type')
def common_ancestors(self, nodes, min_nodes='all', minimal=True): """Return the common ancestors of a set of genes Parameters ---------- nodes : list List of nodes (genes and/or terms) to find the common ancestors min_nodes : str or int If 'all', then return only terms that contain all of the input genes. If an integer, then return only terms that contain at least <nodes> of the input genes minimal : bool If True, then do NOT return the terms that are themselves ancestors of the other common ancestors. This filter leaves only the 'minimal' set of common ancestors. Returns ------- : list List of common ancestors """ if min_nodes=='all': min_nodes = len(nodes) conn = self.connected(nodes, self.terms) anc_bool = conn.sum(0) >= min_nodes anc = np.array(self.terms)[anc_bool] if minimal: anc_conn = self.connected(anc, anc, sparse=False) np.fill_diagonal(anc_conn, 0) anc = anc[anc_conn.sum(0) == 0] return anc def _get_term_2_gene(self, verbose=False): if verbose: print('Calculating term_2_gene') term_2_gene = invert_dict( self.gene_2_term, keymap=make_index(self.genes), valmap=dict(enumerate(self.terms))) for t in self.terms: if not t in term_2_gene: term_2_gene[t] = [] return term_2_gene @property def term_sizes(self): if self._term_sizes is None: self._term_sizes = self._get_term_sizes(propagate=True) return self._term_sizes def _get_term_sizes(self, propagate=True): """Returns an array of term sizes in the same order as self.terms""" if propagate: ont = self.propagate(inplace=False) gene_2_term = ont.gene_2_term # gene_2_term = self._propagate_forward() else: gene_2_term = self.gene_2_term tmp = Counter([x for y in gene_2_term.values() for x in y]) term_sizes = [tmp[x] for x in range(len(self.terms))] return term_sizes def get_information_gain(self): for p in terms: self.parent_2_children[p] def shuffle_genes(self, inplace=False): """Shuffle the names of genes""" new_order = self.genes.copy() random.shuffle(new_order) rename = dict(zip(self.genes, new_order)) return self.rename(rename, inplace=False) def get_tree(self, ret='edges', verbose=False): """Identify a spanning tree of the DAG (including genes as part of the DAG). Parameters ------------ ret : str If 'edges', then return a list of (u, v) edges in the tree. If 'ontology', return an Ontology object consisting of only the tree edges. Returns ------- : array-like or Ontology """ tree = self.to_igraph(include_genes=True, spanning_tree=True) if ret=='edges': tree_edges = set([(tree.vs[e.source]['name'], tree.vs[e.target]['name']) for e in tree.es if e['Is_Tree_Edge']=='Tree']) return tree_edges elif ret=='ontology': tree.delete_edges([e.index for e in tree.es if e['Is_Tree_Edge']=='Not_Tree']) return Ontology.from_igraph(tree, verbose=verbose) def is_dag(self): """Return True if the Ontology is a valid directed acyclic graph, False otherwise. """ return self.to_igraph(include_genes=True, spanning_tree=False).is_dag()
[docs] def topological_sorting(self, top_down=True, include_genes=False): """Perform a topological sorting. top_down : If True, then ancestral nodes (e.g. the root nodes) come before descendants in the sorting. If False, then reverse the sorting """ graph = self.to_igraph(include_genes=include_genes, spanning_tree=False) topo = list(graph.vs[graph.topological_sorting(mode='out')]['name']) if not top_down: topo = topo[::-1] return topo
[docs] def to_igraph(self, include_genes=True, spanning_tree=False): """Convert Ontology to an igraph.Graph object. Gene and term names are stored in the 'name' vertex attribute of the igraph object. Parameters ---------- include_genes : bool Include genes as vertices in the igraph object. spanning_tree : bool If True, then identify a spanning tree of the DAG. include an edge attribute "Is_Tree_Edge" that indicates Returns ------- : igraph.Graph """ if include_genes: terms_index_offset = {t : v + len(self.genes) for t, v in self.terms_index.items()} gene_term_edges = [(self.genes_index[g], terms_index_offset[self.terms[t]]) for g in self.genes for t in self.gene_2_term[g]] child_parent_edges = [(terms_index_offset[c], terms_index_offset[p]) for p, children in self.parent_2_child.items() for c in children] vertex_attrs = self.node_attr.reindex(index=self.genes + self.terms).loc[self.genes + self.terms].to_dict(orient='list') vertex_attrs.update({ 'name':self.genes + self.terms, self.NODETYPE_ATTR:[self.GENE_NODETYPE for x in self.genes] + [self.TERM_NODETYPE for x in self.terms] }) graph = igraph.Graph(n=len(self.genes) + len(self.terms), edges=gene_term_edges + child_parent_edges, directed=True, vertex_attrs=vertex_attrs, edge_attrs={self.EDGETYPE_ATTR : [self.GENE_TERM_EDGETYPE for x in gene_term_edges] + \ [self.CHILD_PARENT_EDGETYPE for x in child_parent_edges]}) else: edges = [(self.terms_index[c], self.terms_index[p]) for p, children in self.parent_2_child.items() for c in children] graph = igraph.Graph(n=len(self.terms), edges=edges, directed=True, vertex_attrs={'name':self.terms}, edge_attrs={self.EDGETYPE_ATTR : [self.CHILD_PARENT_EDGETYPE for x in edges]}) if spanning_tree: parent_priority = [self.term_sizes[self.terms_index[v['name']]] if (v['name'] in self.terms_index) else 1 for v in graph.vs] # Identify spanning tree graph = self._make_tree_igraph( graph, parent_priority=parent_priority, optim=min, edge_name='Is_Tree_Edge') graph.es['Is_Tree_Edge'] = ['Tree' if x else 'Not_Tree' for x in graph.es['Is_Tree_Edge']] return graph
def shortest_paths(self, descendants=None, ancestors=None, sparse=False, weights=None, chunk_size=500): """Calculate the length of the shortest paths from descendant nodes to ancestor nodes. Parameters ---------- sparse : bool If True, return a scipy.sparse matrix. If False, return a NumPy array weights : dict Dictionary mapping (child term, parent term) or (gene, term) edges to weights. Any edge with no given weight is assigned a weight of 0 by default. (default) If weights is None, then a uniform weight is assumed. chunk_size : int (optional) Computational optimization: shortest paths are calculated in batches. Returns ------- d : np.ndarray or scipy.sparse.spmatrix d[x,y] is the length of the shortest directed path from a descendant node x to ancestor node y. d[x,y]==numpy.inf if no directed path exists. The rows are in the same order as <descendants>, and the columns are in the same order as <ancestors>. """ graph = self.to_igraph(include_genes=True, spanning_tree=False) import numbers if weights is None: weights = 1 if weights is not None and not isinstance(weights, numbers.Number): # Assume dictionary weights = [weights.get((graph.vs[e.source]['name'], graph.vs[e.target]['name']), 0) for e in graph.es] graph.es['weight'] = weights if descendants is None: descendants = graph.vs if ancestors is None: ancestors = descendants tmp = [graph.shortest_paths( descendants[x[0]:x[1]], ancestors, weights='weight', mode='out') for x in split_indices_chunk(len(descendants), chunk_size)] if sparse: return scipy.sparse.vstack([scipy.sparse.csr_matrix(x) for x in tmp]) else: return np.vstack(tmp) def longest_paths(self, descendants=None, ancestors=None, sparse=False, weights=None, chunk_size=500):# TODO: when ancestors are specified, the results become negative """Computes the lengths of the longest directed paths between all pairs of terms. Returns ------- d : np.ndarray or scipy.sparse.spmatrix d[x,y] is the length of the longest directed path from a descendant term with index x to an ancestral term with index y, where indices are defined by self.terms. d[x,y]==numpy.inf if no directed path exists. """ d = self.shortest_paths(descendants=descendants, ancestors=ancestors, sparse=sparse, weights=-1, chunk_size=chunk_size) if sparse: d.data = -1 * d.data else: d = -1 * d return d
[docs] def connected(self, descendants=None, ancestors=None, sparse=False): """Calculate which genes or terms are descendants of other genes or terms. Parameters ----------- descendants: list A list of genes and/or terms. Default: A list of all genes followed by a list of all terms, in the same order as `self.genes` and `self.terms`. ancestors: list A list of genes and/or terms. Default: Same as the ``descendants`` parameter. sparse : bool If True, return a scipy.sparse matrix. If False (default), return a NumPy array. Returns ------- d : np.ndarray or scipy.sparse.matrix A descendants-by-ancestors matrix. ``d[i,j]`` is 1 if term i is a descendant of term j, and 0 otherwise. Note that ``d[i,i]==1`` and ``d[root,i]==0``, for every i. """ d = self.shortest_paths(descendants=descendants, ancestors=ancestors, sparse=sparse) if sparse: d.data = np.isfinite(d.data) else: d = np.isfinite(d) return d
# def get_leaves(self, terms_list, children_list=None): # """Returns terms in ``terms_list`` that are not ancestors of any term in # ``children_list``. # Parameters # ---------- # terms_list : list # children_list : list # If ``children_list`` is None, then select the terms in # <terms_list> that are not ancestors of any of the other # terms in <terms_list>. # """ # connectivity_matrix_nodiag = self.get_connectivity_matrix_nodiag() # terms_list = np.array(terms_list) # if children_list is None: # children_list = terms_list # else: # children_list = np.array(children_list) # return terms_list[~ np.any(connectivity_matrix_nodiag[children_list, :][:, terms_list], axis=0)]
[docs] def propagate(self, direction='forward', gene_term=True, term_term=False, verbose=False, inplace=False): """Propagate gene-term annotations through the ontology. As an example, consider an ontology with one gene ``g``, three terms ``t1, t2, t3`` and the following connections: :: t1-->t2 t2-->t3 g-->t1 g-->t2 In "forward" propagation, a new relation ``g-->t3`` is added. In "reverse" propagation, the relation "g-->t2" is deleted because it is an indirect relation inferred from "g-->t1" and "t1-->t2". Parameters ---------- direction : str The direction of propagation. Either 'forward' or 'reverse' inplace : bool If True, then modify the ontology. If False, then create and modify a copy. Returns ------- : ddot.Ontology.Ontology """ if inplace: ont = self else: ont = self.copy() assert direction in ['forward', 'reverse'], "Propagation direction must be forward or backward" forward = direction=='forward' if not forward: # This is needed to ensure that the pruning to a parent's # gene set can be based on the gene sets of its direct # children ont = ont.propagate(gene_term=gene_term, term_term=term_term, direction='forward', inplace=True) if gene_term: term_2_gene_set = {t : set(g) for t, g in ont.term_2_gene.items()} if term_term: parent_2_child_set = {p : set(c) for p, c in ont.parent_2_child.items()} # # TODO: have this topological sorting be a part of the code below # graph = ont.to_igraph(include_genes=False, spanning_tree=False) # for c_idx in graph.topological_sorting(mode='in'): # child = graph.vs[c_idx]['name'] for child in ont.topological_sorting(top_down=forward, include_genes=False): for parent in ont.child_2_parent[child]: if gene_term: if forward: term_2_gene_set[parent] |= term_2_gene_set[child] else: term_2_gene_set[parent] -= term_2_gene_set[child] if term_term: if forward: parent_2_child_set[parent] |= parent_2_child_set[child] else: parent_2_child_set[parent] -= parent_2_child_set[child] if gene_term: ont.gene_2_term = invert_dict(term_2_gene_set, keymap=make_index(ont.terms), valmap=dict(enumerate(ont.genes))) ont.term_2_gene = {a : list(b) for a, b in term_2_gene_set.items()} if term_term: ont.parent_2_child = {a : list(b) for a, b in parent_2_child_set.items()} ont.child_2_parent = ont._get_child_2_parent() ont._check_valid() return ont
def get_ontotype(self, genotypes, input_format='gene_list', output_format='dataframe', matrix_columns=None): """Transform genotypes to ontotypes. .. [1] Yu, M.K., Kramer, M., Dutkowski, J., Srivas, R., Licon, K., Kreisberg, J.F., Ng, C.T., Krogan, N., Sharan, R. and Ideker, T., 2016. "Translation of genotype to phenotype by a hierarchy of cell subsystems". *Cell Systems*, 2(2), pp.77-88. Parameters ---------- genotypes : list, np.ndarray, scipy.sparse.spmatrix, pd.DataFrame input_format : str If "gene_list", then ``genotypes`` is a list of genotypes, where genotype is itself a list of genes mutated. Each gene is assumed to have a mutation value of 1. If 'matrix', then ``genotypes`` is a genotype-by-gene matrix, where the value at position (i,j) represents the mutation value of gene j in genotype i. ``genotypes`` can be a NumPy array, SciPy sparse matrix, or Pandas dataframe. output_format : str If 'sparse', then return a sparse matrix as a scipy.sparse.csr_matrix object. (default) If 'dataframe', then return a pandas.DataFrame object. If 'array', then return a numpy.ndarray object. matrix_columns : list represents a list of the genes that are represented by the columns of ``genotypes``. Only used when input_format is "matrix" and ``genotypes`` is a NumPy array or SciPy sparse matrix. Returns ------- : scipy.sparse.csr_matrix, pandas.DataFrame, numpy.ndarray genotype-by-term matrix, where the ordering of rows and terms is the same as ``genotypes`` and ``self.terms`` """ genotypes_names = None if input_format=='gene_list': gene_2_term = {k: np.array(v) for k, v in self.gene_2_term.items()} genotypes_x = [np.concatenate([gene_2_term[g] for g in gset]) if len(gset)>0 else np.array([]) for gset in genotypes] indices = np.concatenate(genotypes_x) indptr = np.append(0, np.cumsum([gset.size for gset in genotypes_x])) data = np.ones((indices.size, ), dtype=np.int64) ontotypes = scipy.sparse.csr_matrix( (data, indices, indptr), (len(genotypes), len(self.terms))) ontotypes.sum_duplicates() elif input_format=='matrix': if isinstance(genotypes, pd.DataFrame): matrix_columns = genotypes.columns genotypes_names = genotypes.index genotypes = genotypes.values elif isinstance(genotypes, np.ndarray) or scipy.sparse.issparse(genotypes): assert matrix_columns is not None else: raise Exception("Parameter <genotypes> must be a genotype-by-gene matrix " "represented as a Pandas dataframe, NumPy array, or SciPy sparse matrix. " "Consider changing the <input_format> parameter") contained = np.array([g in self.genes_index for g in matrix_columns]) genotypes = scipy.sparse.csc_matrix(genotypes)[:,contained] gene_2_term_matrix = scipy.sparse.csr_matrix(self.get_gene_2_term_matrix()) gene_2_term_matrix = scipy.sparse.csr_matrix(gene_2_term_matrix)[contained,:] ontotypes = genotypes.dot(gene_2_term_matrix) else: raise Exception('Invalid input format') if output_format=='dataframe': ontotypes = pd.DataFrame(ontotypes.toarray(), columns=self.terms) if genotypes_names is not None: ontotypes.index = genotypes_names elif output_format=='sparse': pass elif output_format=='array': ontotypes = ontotypes.toarray() else: raise Exception('Invalid output format') return ontotypes def get_gene_2_term_matrix(self): """Returns a gene-by-term matrix stored as a scipy.sparse.coo_matrix Returns ------- : scipy.sparse.coo_matrix """ # Convert gene names to indices gene_2_term = [(self.genes_index[g], t_list) for g, t_list in self.gene_2_term.items()] gene_2_term_matrix = scipy.sparse.coo_matrix( ([1 for g, t_list in gene_2_term for t in t_list], ([g for g, t_list in gene_2_term for t in t_list], [t for g, t_list in gene_2_term for t in t_list])), shape=(len(self.genes), len(self.terms))) return gene_2_term_matrix def summary(self): """Summarize the Ontology's contents with respect to number of genes, terms, and connections. Returns -------- : str """ if self.node_attr is None: node_attr_names = [] else: node_attr_names = self.node_attr.columns.tolist() # node_attr_names = ', '.join(map(str, self.node_attr.columns)) if self.edge_attr is None: edge_attr_names = [] else: edge_attr_names = self.edge_attr.columns.tolist() # edge_attr_names = ', '.join(map(str, self.edge_attr.columns)) summary = ('%s genes, ' '%s terms, ' '%s gene-term relations, ' '%s term-term relations' '\nnode_attributes: %s' '\nedge_attributes: %s') % ( len(self.genes), len(self.terms), sum([len(x) for x in self.gene_2_term.values()]), sum([len(x) for x in self.parent_2_child.values()]), node_attr_names, edge_attr_names) return summary
[docs] def to_ndex(self, ndex_user, ndex_pass, ndex_server=None, name=None, description=None, network=None, main_feature=None, subnet_max_term_size=None, visible_term_attr=None, layout='bubble', propagate='reverse', style=None, node_alias='Original_Name', term_2_uuid=None, visibility='PUBLIC', verbose=False): """Upload an Ontology object to NDEx. The Ontology can be preformatted in several ways including 1. Set a name and description of the Ontology 2. Upload a supporting gene-gene subnetwork for every term in the Ontology 3. Propagate gene-term annotations 4. Layout the nodes. 5. Apply a visual style, e.g. specifying node and edge colors Parameters ---------- name : str Name of Ontology description : str Description of Ontology layout : str The name of the layout algorithm for laying out the Ontology as a graph. Node positions are stored in the node attributes 'x_pos' and 'y_pos'. If None, then do not perform a layout. style : ndex.networkn.NdexGraph The Cytoscape.js visual style on NDEx. Represented using CX and stored in an NdexGraph. network : pandas.Dataframe Dataframe describing gene-gene network from which to create subnetworks for every term. To be passed to Ontology.upload_subnets_ndex(). features : list of str Columns in the gene-gene network to upload. To be passed to Ontology.upload_subnets_ndex(). ndex_server : str URL of NDEx server ndex_user : str NDEx username ndex_pass : str NDEx password public : bool Whether to make the Ontology public on NDEx node_alias : str visibility : str Returns ------- : ndex.networkn.NdexGraph """ if propagate is not None: ont = self.propagate(direction=propagate, inplace=False) else: ont = self if ndex_server is None: ndex_server = ddot.config.ndex_server if (network is not None) and (term_2_uuid is None): if subnet_max_term_size is None: terms = ont.terms else: terms = [t for t,s in zip(ont.terms, ont.term_sizes) if s <= subnet_max_term_size] # Only upload subnets for the unique set of the original # terms if node_alias in ont.node_attr.columns: orig_2_new = {a : b.index.values for a, b in ont.node_attr.loc[terms, [node_alias]].groupby(node_alias)} terms = [b[0] for b in orig_2_new.values()] term_2_uuid = ont.upload_subnets_ndex( network, main_feature, name, ndex_user, ndex_pass, ndex_server=ndex_server, terms=terms, visibility=visibility, verbose=verbose ) if node_alias in ont.node_attr.columns: term_2_uuid = {s : term_2_uuid[orig_2_new[t][0]] for t in orig_2_new for s in orig_2_new[t] if orig_2_new[t][0] in term_2_uuid} elif term_2_uuid is None: term_2_uuid = {} if verbose: print('Creating NdexGraph') G = ont.to_NdexGraph( name=name, description=description, term_2_uuid=term_2_uuid, layout=layout, style=style) if visible_term_attr is not None: df = ddot.utils.nx_nodes_to_pandas(G, visible_term_attr) df.rename(columns=lambda x: 'Display:' + x, inplace=True) ddot.utils.set_node_attributes_from_pandas(G, df) G.set_network_attribute('Display', '|'.join(visible_term_attr)) if verbose: print('Uploading to NDEx') ont_url = G.upload_to(ndex_server, ndex_user, ndex_pass, visibility=visibility) return ont_url, G
def to_NdexGraph(self, name=None, description=None, term_2_uuid=None, spanning_tree=True, layout='bubble', style=None, verbose=False): """Formats an Ontology object into a NetworkX object with extra node attributes that are accessed by the hierarchical viewer. Parameters ----------- name : str Name of Ontology, as would appear if uploaded to NDEx. description : str Description of Ontology, as would appear if uploaded to NDEx. term_2_uuid : dict A dictionary mapping a term to a NDEx UUID of a gene-gene subnetwork of genes in that term. the UUID will be stored in the node attribute 'ndex:internallink'. If uploaded to NDEx, then this attribute will provide a hyperlink to the gene-gene subnetwork when the term is clicked upon on the NDEx page for this ontology. This dictionary can be created using Ontology.upload_subnets_ndex(). Default: no dictionary. layout : str Layout the genes and terms in this Ontology. Stored in the node attributes 'x_pos' and 'y_pos'. If None, then do not perform a layout. Returns ------- : ndex.networkn.NdexGraph """ # Convert to NetworkX G = self.to_networkx(layout=layout, spanning_tree=spanning_tree) if style is None: style = 'passthrough' # Set extra attributes for passthrough visual styling if style=='passthrough': for v, data in G.nodes(data=True): is_gene = data[self.NODETYPE_ATTR]==self.GENE_NODETYPE if 'Vis:Shape' not in data: data['Vis:Shape'] = 'Rectangle' if is_gene else 'Circle' if 'Vis:Fill Color' not in data: data['Vis:Fill Color'] = '#FFFFFF' if 'Vis:Border Paint' not in data: data['Vis:Border Paint'] = '#000000' for u, v, data in G.edges(data=True): if 'Vis:Visible' not in data and 'Is_Tree_Edge' in data: data['Vis:Visible'] = data['Is_Tree_Edge']=='Tree' style = ddot.config.get_passthrough_style() else: raise Exception('Unsupported style') # Set links to subnetworks supporting each term if term_2_uuid: for t in self.terms: if t in term_2_uuid: G.node[t]['ndex:internalLink'] = '[%s](%s)' % (G.node[t]['Label'], term_2_uuid[t]) # # Change Original_Name to node indices # name_2_idx = {data['name'] : v for v, data in G.nodes(data=True)} # for v, data in G.nodes(data=True): # if 'Original_Name' in data and 'Hidden' in data and data['Hidden']==True: # data['Original_Name'] = name_2_idx[data['Original_Name']] G = nx_to_NdexGraph(G) if name is not None: G.set_name(name) if description is not None: G.set_network_attribute('Description', description) if style: import ndex.beta.toolbox as toolbox toolbox.apply_network_as_template(G, style) return G
[docs] def to_cx(self, output=None, name=None, description=None, term_2_uuid=None, spanning_tree=True, layout='bubble', style=None): """Formats an Ontology object into a CX file format Parameters ----------- output : str Filename or file-like object to write CX file. If None, then CX is returned as a JSON object, but not written to a file. name : str Name of Ontology, as would appear if uploaded to NDEx. description : str Description of Ontology, as would appear if uploaded to NDEx. term_2_uuid : list A dictionary mapping a term to a NDEx UUID of a gene-gene subnetwork of genes in that term. the UUID will be stored in the node attribute 'ndex:internallink'. If uploaded to NDEx, then this attribute will provide a hyperlink to the gene-gene subnetwork when the term is clicked upon on the NDEx page for this ontology. This dictionary can be created using Ontology.upload_subnets_ndex(). Default: no dictionary. layout : str Layout the genes and terms in this Ontology. Stored in the node attributes 'x_pos' and 'y_pos'. If None, then do not perform a layout. Returns ------- : CX representation as a JSON-like dictionary """ # Convert to NdexGraph G = self.to_NdexGraph(name=name, description=description, term_2_uuid=term_2_uuid, spanning_tree=spanning_tree, layout=layout, style=style) cx = G.to_cx() if output is not None: if hasattr(output, 'write'): json.dump(cx, output) else: with io.open(output, 'w') as f: json.dump(cx, f) return cx
[docs] def to_graphml(self, output, layout='bubble', spanning_tree=True): """Writes an Ontology object in graphml format. Parameters ----------- output : str Filename or file-like object to write CX file. If None, then CX is returned as a JSON object, but not written to a file. layout : str Layout the genes and terms in this Ontology. Stored in the node attributes 'x_pos' and 'y_pos'. If None, then do not perform a layout. """ # Convert to NetworkX G = self.to_NdexGraph(spanning_tree=spanning_tree, layout=layout) if hasattr(output, 'write'): nx.write_graphml(G, output) else: with io.open(output, 'w') as f: nx.write_graphml(G, f)
def _force_directed_layout(self, G): """Force-directed layout on only the terms""" sub_nx = G.copy() sub_nx.remove_edges_from([(u,v) for u,v,attr in sub_nx.edges(data=True) if attr['Is_Tree_Edge']=='Not_Tree']) pos = nx.spring_layout(sub_nx, dim=2, k=None, pos=None, fixed=None, iterations=50, weight=None, scale=1.0) tmp = np.array([x[0] for x in pos.values()]) x_min, x_max = tmp.min(), tmp.max() tmp = np.array([x[1] for x in pos.values()]) y_min, y_max = tmp.min(), tmp.max() x_scale = 500. / (y_max - y_min) y_scale = 500. / (x_max - x_min) pos = {a : [b[0] * x_scale, b[1] * y_scale] for a, b in pos.items()} return pos def upload_subnets_ndex(self, network, main_feature, name, ndex_user, ndex_pass, ndex_server=None, terms=None, gene_columns=['Gene1', 'Gene2'], propagate='forward', visibility='PUBLIC', node_attr=None, node_alias='Original_Name', z_score=False, spring_feature=None, spring_weight=1.0, edge_groups=None, max_num_edges = -1, verbose=False): """For each term in the ontology, upload a subnetwork of interactions between the genes in that term to NDEx. TODO: instead of specifying gene_columns, add another parameter use_index to specify that genes are the network's index Parameters ---------- network : pandas.Dataframe Dataframe describing network features : list of str Columns in network to upload name : str Prefix for the names of all subnetworks ndex_server : str URL of NDEx server ndex_user : str NDEx username ndex_pass : str NDEx password terms : list List of terms to upload a subnetwork. Default: upload for all terms. gene_columns : list Columns in network that represent the two genes. propagate : str The direction ('forward' or 'reverse') to propagate gene-term annotations up the hierarchy with Ontology.propagate(). If None, then don't propagate annotations. public : bool Whether to make networks public on NDEx node_attr : pandas.DataFrame """ if propagate: ont = self.propagate(direction=propagate, inplace=False) else: ont = self if ndex_server is None: ndex_server = ddot.config.ndex_server ndex = nc.Ndex(ndex_server, ndex_user, ndex_pass) term_2_uuid = {} start = time.time() g1, g2 = gene_columns[0] + '_lex', gene_columns[1] + '_lex' features = [f for f in network.columns if (f not in gene_columns)] assert main_feature in features, 'A main feature of the network must be specified' network = network[features + gene_columns].copy() network[gene_columns[0]] = network[gene_columns[0]].astype(str) network[gene_columns[1]] = network[gene_columns[1]].astype(str) # Filter dataframe for gene pairs within the ontology genes_set = set(ont.genes) tmp = [x in genes_set and y in genes_set for x, y in zip(network[gene_columns[0]], network[gene_columns[1]])] network = network.loc[tmp, :] # Lexicographically sort gene1 and gene2 so that gene1 < gene2 # actually this may be redundant if z_score: for feat in features: network[feat] = network[feat].astype(np.float64) # Normalize features into z-scores tmp = network[features] network[features] = (tmp - tmp.mean()) / tmp.std() # network_sq = ddot.utils.pivot_square(network, g1, g2, main_feature) # Calculate the min/max range of features numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64'] def f(x): if str(x) in numerics: return 'numeric' elif str(x) == 'bool': return 'boolean' else: raise Exception() feature_types = network[features].dtypes.map(f) feature_mins = network[features].min().astype(np.str) feature_maxs = network[features].max().astype(np.str) # set an upper limit to the maximum number of edges uploaded to NDEx # (contributed by Fan Zheng) if max_num_edges > 0: network.sort_values(by = main_feature, ascending=False, inplace=True) network = network.iloc[:max_num_edges, :] # Lexicographically sort gene1 and gene2 so that gene1 < gene2 # actually this may be redundant network[g1], network[g2] = zip( *[(x, y) if x < y else (y, x) for x, y in zip(network[gene_columns[0]], network[gene_columns[1]])]) network_idx = {x: i for i, x in enumerate(zip(network[g1], network[g2]))} if terms is None: terms = ont.terms if verbose: print('Uploading %s terms' % len(terms)) for upload_idx, t in enumerate(terms): start = time.time() if node_alias in ont.node_attr.columns: genes = ont.node_attr.loc[genes, node_alias].values else: genes = [ont.genes[g] for g in ont.term_2_gene[t]] genes.sort() gene_pairs_idx = [network_idx[gp] for gp in itertools.combinations(genes, 2) \ if gp in network_idx] # New (Parent weight) children = ont.parent_2_child[t] min_children_term_weights = -1 if ('Parent weight' in ont.node_attr.columns.tolist()) and (len(children) >0): children_term_weights = [] for c in children: if ont.node_attr.loc[c, 'Parent weight'] >0: children_term_weights.append(ont.node_attr.loc[c, 'Parent weight']) if len(children_term_weights): children_term_weights = np.array(children_term_weights) min_children_term_weights = np.min(children_term_weights) if len(gene_pairs_idx) > 0: network_sub = network.iloc[gene_pairs_idx, :] network_sub = network_sub.loc[network_sub[main_feature] >= ont.node_attr.loc[t, 'Parent weight']] # filter network if max_num_edges is greater then 0 if max_num_edges != None and max_num_edges > 0: network_sub.sort_values(by=main_feature, ascending=False, inplace=True) network_sub = network_sub.iloc[:max_num_edges, :] # New: apply some minimum string force so nodes will not fly away # if spring_feature != None: # network_sub.loc[network_sub[spring_feature] < min_children_term_weights, spring_feature] = 0.5*min_children_term_weights # network_sub[spring_feature] = network_sub[spring_feature] ** spring_weight G_nx = nx.from_pandas_dataframe(network_sub, g1, g2, edge_attr=features) if node_attr is not None: set_node_attributes_from_pandas(G_nx, node_attr) G_nx.add_nodes_from(list(set(genes) - set(G_nx.nodes()))) # Annotate the membership in children terms children = ont.parent_2_child[t] df = pd.DataFrame({c : None for c in children}, index=genes, dtype=bool) for c in children: genes_in = [ont.genes[g] for g in ont.term_2_gene[c]] # for g in genes_in: # G_nx.node[g]['Group:'+c] = True df.loc[genes_in, c] = True df.rename(columns=lambda x: 'Group:'+x, inplace=True) ddot.utils.set_node_attributes_from_pandas(G_nx, df) # # If a gene belongs to multiple children, then place it where it is most similar # for g_i in (df.sum(1) > 0).nonzero(): # g = genes[g_i] # choices = df.loc[g, :].nonzero() # network_sq.loc[g, :].argmax() G = nx_to_NdexGraph(G_nx) G.set_name('%s supporting network for %s' % (name, t)) G.set_network_attribute('Description', '%s supporting network for %s' % (name, t)) G.set_network_attribute('Main Feature', main_feature) for f in features: if (f == spring_feature) and (f != main_feature): continue G.set_network_attribute('%s type' % f, feature_types[f]) if feature_types[f] == 'numeric': G.set_network_attribute('%s min' % f, feature_mins[f]) G.set_network_attribute('%s max' % f, feature_maxs[f]) # for c in children: # G.set_network_attribute('Group:' + c, True) G.set_network_attribute('Group', '|'.join(children)) # New: calculate the score threshold of this subnetwork G.set_network_attribute('Main Feature Default Cutoff', float(ont.node_attr.loc[t, 'Parent weight'])) G.set_network_attribute('Parent weight', float(ont.node_attr.loc[t, 'Parent weight'])) if min_children_term_weights > 0: G.set_network_attribute('Children weight', '|'.join(['{:.3f}'.format(w) for w in children_term_weights])) # G.set_network_attribute('Main Feature Default Cutoff', float(min_children_term_weights)) if isinstance(edge_groups, dict) and (len(edge_groups.keys()) > 0): edge_group_string = [] for k, vs in edge_groups.items(): vs.sort() edge_group_string.append(','.join([k] + vs)) edge_group_string = '|'.join(edge_group_string) G.set_network_attribute('edge groups', edge_group_string) # New: only keep the biggest compoent in the network G = max(nx.weakly_connected_component_subgraphs(G), key=len) # # further remove degree == 1 nodes # if len(G.nodes()) > 6: # low_deg_nodes = [] # for v, deg in G.degree().items(): # if deg <= 1: # low_deg_nodes.append(v) # # while len(low_deg_nodes) != 0: # G.remove_nodes_from(low_deg_nodes) # low_deg_nodes = [] # for v, deg in G.degree().items(): # if deg <= 1: # low_deg_nodes.append(v) # New: compute a pre-layout to networks if spring_feature != None: # G_cx = G.to_cx() # why converted back and forth # G = NdexGraph(G_cx) gsim = layouts._create_simple_graph(G) pos = nx.spring_layout(gsim, scale=200 * math.sqrt(gsim.number_of_nodes()), weight=spring_feature) G.pos = pos # layouts.apply_directed_flow_layout(G, node_width=50, weight=spring_feature) start_upload = time.time() ndex_url = G.upload_to(ndex_server, ndex_user, ndex_pass, visibility=visibility) term_2_uuid[t] = parse_ndex_uuid(ndex_url) upload_time = time.time() - start_upload if verbose: print(upload_idx, 'Term:', t, 'Gene pairs:', len(G_nx.edges()), 'Genes:', len(genes), 'Time:', round(time.time() - start, 4), 'Upload time:', round(upload_time, 4), 'NDEx URL:', ndex_url) else: if verbose: print(upload_idx, 'No data provided for gene pairs in Term: %s' % t) return term_2_uuid
[docs] def get_best_ancestors(self, node_order=None, verbose=False, include_genes=True): """Compute the 'best' ancestor for every pair of terms. 'Best' is specified by a ranking of terms. For example, if terms are ranked by size, from smallest to largest, then the smallest common ancestor is calculated. Parameters ---------- node_order : list A list of terms, ordered by their rank with the 'best' term at the beginning. include_genes : bool Returns -------- ancestors : np.ndarray ancestors[a,b] = the best common ancestor of terms a and b, represented as a 0-based index of self.terms nodes : list List of the row and column names. Rows and columns are the same. """ ont = self.propagate(direction='reverse', inplace=False) graph = ont.to_igraph(include_genes=include_genes, spanning_tree=False) if node_order is None: # By default, sort from smallest to largest terms node_order = [self.terms[t] for t in np.argsort(ont.term_sizes)] d = np.int8(np.isfinite(np.array(graph.shortest_paths(graph.vs, graph.vs, mode='out'), order='C'))) ancestor_matrix = np.zeros(d.shape, dtype=np.int32) ancestor_matrix.fill(-1) if verbose: time_print('Iterating:') for t in node_order: i = graph.vs.find(t).index t_i = self.terms_index[t] # Note: includes self as a child children = np.where(d[:,i] == 1)[0] # For those descendants without a computed LCA yet, set their LCA to this term lca_sub = ancestor_matrix[children.reshape(-1,1), children] lca_sub[lca_sub == -1] = t_i ancestor_matrix[children.reshape(-1,1), children] = lca_sub # Check symmetry assert (ancestor_matrix.T == ancestor_matrix).all() assert (-1 == ancestor_matrix).sum() == 0, 'The ontology may have more than one root' return ancestor_matrix, graph.vs['name']
@classmethod def _make_tree_igraph(self, graph=None, method='priority', edge_name='smallest_parent', parent_priority=None, edge_priority=None, default_priority=None, optim='max'): """Returns copy of graph with new edge attribute marking spanning tree """ if graph is None: graph = self.to_igraph(include_genes=False, spanning_tree=True) if method=='priority': assert 1 == (parent_priority is not None) + (edge_priority is not None) if edge_priority is not None: assert default_priority is not None if optim=='min': optim=min if optim=='max': optim=max graph.es[edge_name] = False for v in graph.vs: parents = graph.neighbors(v.index, mode='out') if len(parents) > 0: """Choose the parent with the highest valued priority""" if parent_priority is not None: small_parent = optim(parents, key=lambda p: parent_priority[p]) elif edge_priority is not None: small_parent = optim(parents, key=lambda p: edge_priority.get(graph.get_eid(v.index, p), default_priority)) graph.es[graph.get_eid(v.index, small_parent)][edge_name] = True else: raise Exception('Method not supported') return graph
[docs] def to_pickle(self, file, compression='infer'): """Saves Ontology object with the Python pickle protocol.""" pandas.io.pickle.to_pickle(self, file, compression=compression)
[docs] @classmethod def read_pickle(cls, file, compression='infer'): """Loads an Ontology object from a pickled state.""" return pandas.io.pickle.read_pickle(file, compression=compression)
def __repr__(self): return self.summary() def __str__(self): return self.summary()