Displaying a tanglegram (i.e. "cophylogeny plot") #3159
Replies: 2 comments 6 replies
-
Cool! So there's no attempt to reorder the tips here to optimise, just take both in minlex and draw the lines? |
Beta Was this translation helpful? Give feedback.
-
Here's the code for running the untangling algorithm(s) described in from Bio import Phylo
import numpy as np
import io
from scipy.cluster.hierarchy import is_valid_linkage
import tanglegram
def newick_string_to_linkage_matrix(newick_str):
"""
Convert a newick to a scipy.cluster.hierarchy linakes matrix. Returns the
matrix and an array of labels
"""
# definition of linkage matrix:
# https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html
# create tree object via Biopython (Bio.Phylo)
tree = next(Phylo.parse(io.StringIO(newick_str), format="newick"))
# calculate tree height
tree_height = max(tree.distance(c) for c in tree.find_clades(terminal=True))
# add comment with id and span of terminal nodes:
id_map = {}
for i, c in enumerate(tree.find_clades(terminal=True)):
c.comment = (i, 1)
id_map[i] = c.name
# ancestor list orderred by distance from the root
anc_lst = []
for c in tree.find_clades(terminal=False):
d = tree.distance(c)
anc_lst.append((c, list(c), d))
anc_lst.sort(key=lambda x:x[2], reverse=True)
# running number of node
nodes = len(list(tree.find_clades(terminal=True)))
lnk_lst = []
for anc,children, anc_d in anc_lst:
n_children = len(children)
assert n_children>=2
child1 = children[0]
for child2 in children[1:]:
id1, n_leaves1 = child1.comment
id2, n_leaves2 = child2.comment
total_leaves = n_leaves1 + n_leaves2
anc.comment = (nodes, total_leaves)
distance = tree_height - anc_d
nodes += 1
row = [id1, id2, distance, total_leaves]
lnk_lst.append(row)
child1 = anc
return np.array(lnk_lst), [id_map[i] for i in range(len(id_map))]
def to_newick_tree(Z, leaf_names):
"""
Convert a linkage matrix into a
string in `Newick <https://en.wikipedia.org/wiki/Newick_format>`_ format.
Newick is a commonly-used format to represent trees. It is used by packages
such as `ete3 <http://etetoolkit.org/>`_ for tree manipulation and visualisation.
Parameters
----------
Z : ndarray
The linkage matrix in proper form (see the `linkage`
function documentation).
leaf_names : list
List of strings giving the names of the leaves of the tree. The order
of strings is assumed to be the the same as the order of the data points
given to `linkage`.
Returns
-------
tree : string
See Also
--------
linkage, to_tree
Examples
--------
>>> import numpy as np
>>> from scipy.cluster import hierarchy
>>> Z = np.asarray([[0, 1, 3.0, 2], [3, 2, 4.0, 3]], dtype=np.double)
>>> leaf_names = ["l0", "l1", "l2"]
>>> hierarchy.to_newick_tree(Z, leaf_names)
((l0:3.0,l1:3.0):1.0,l2:4.0);
"""
Z = np.asarray(Z, order='c')
# Number of original objects is equal to the number of rows plus 1.
n = Z.shape[0] + 1
n_leaves = len(leaf_names)
if n != n_leaves:
raise ValueError(f"Expected {n} leaf names, got {n_leaves}")
is_valid_linkage(Z, throw=True, name='Z')
newick_intermediates = leaf_names + [None] * Z.shape[0]
cluster_dists = [0] * (n + Z.shape[0])
for i, row in enumerate(Z):
dist = row[2]
fi = int(row[0])
fj = int(row[1])
cdi = dist - cluster_dists[fi]
cdj = dist - cluster_dists[fj]
newick_subtree = (
f"({newick_intermediates[fi]}:{cdi},"
f"{newick_intermediates[fj]}:{cdj})"
)
newick_intermediates[i + n] = newick_subtree
cluster_dists[i + n] = dist
newick_intermediates[fi] = None
newick_intermediates[fj] = None
return newick_intermediates[i + n] + ";"
def untangle_newicks(newick1, newick2, method, **kwargs):
Z1, labels1 = newick_string_to_linkage_matrix(newick1)
Z2, labels2 = newick_string_to_linkage_matrix(newick2)
untangled1, untangled2 = tanglegram.untangle(Z1, Z2, labels1, labels2, method, **kwargs)
return to_newick_tree(untangled1, labels1), to_newick_tree(untangled2, labels2) The import string
import msprime
import tskit
# make a tree sequence. We'll compare the k'th tree with the first tree
ts = msprime.simulate(10, recombination_rate=10)
tables = ts.dump_tables()
tables.nodes.metadata_schema = tskit.MetadataSchema.permissive_json()
tables.nodes.packset_metadata([
tables.nodes.metadata_schema.validate_and_encode_row(
{'name': string.ascii_uppercase[nd.id]} if nd.is_sample() else {}
) for nd in ts.nodes()])
ts = tables.tree_sequence()
k=20
nwk1, nwk2 = untangle_newicks(
ts.at_index(0).as_newick(node_labels={nd.id: nd.metadata.get('name', '') for nd in ts.nodes()}),
ts.at_index(k).as_newick(node_labels={nd.id: nd.metadata.get('name', '') for nd in ts.nodes()}),
method="step2side"
)
display(draw_tanglegram(ts.at_index(0), ts.at_index(k), 'name', width=800, height=300))
display(draw_tanglegram(tsconvert.from_newick(nwk1).first(), tsconvert.from_newick(nwk2).first(), 'name', width=800, height=300)) I'm unclear if this produces as good a result as the java Dendroscope untangling algorithm, but it is linked to a good paper, and is certainly easier to use. Probably we would want to combine the ![]() |
Beta Was this translation helpful? Give feedback.
Uh oh!
There was an error while loading. Please reload this page.
Uh oh!
There was an error while loading. Please reload this page.
-
I have put together a simple
draw_tanglegram
function for comparing trees, which might be useful to some people:Here's the code:
Beta Was this translation helpful? Give feedback.
All reactions