import sys
import os
import re
import tempfile
import subprocess
import random, sets
#from Bio.Nexus.Trees import Tree
#from Bio.Nexus.Nodes import Node, Chain
propertyXML = """
"""
############### BEGIN EXTRACTED FROM BIOPYTHON ###############
# Biopython License Agreement
#
# Permission to use, copy, modify, and distribute this software and its
# documentation with or without modifications and for any purpose and
# without fee is hereby granted, provided that any copyright notices
# appear in all copies and that both those copyright notices and this
# permission notice appear in supporting documentation, and that the
# names of the contributors or copyright holders not be used in
# advertising or publicity pertaining to distribution of the software
# without specific prior permission.
#
# THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL
# WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE
# CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT
# OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
# OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
# OR PERFORMANCE OF THIS SOFTWARE.
class ChainException(Exception):
pass
class NodeException(Exception):
pass
class Chain:
"""Stores a list of nodes that are linked together."""
def __init__(self):
"""Initiates a node chain: (self)."""
self.chain={}
self.id=-1
def _get_id(self):
"""Gets a new id for a node in the chain."""
self.id+=1
return self.id
def add(self,node,prev=None):
"""Attaches node to another: (self, node, prev)."""
if prev is not None and prev not in self.chain:
raise ChainException('Unknow predecessor: '+str(prev))
else:
id=self._get_id()
node.set_id(id)
node.set_prev(prev)
if prev is not None:
self.chain[prev].add_succ(id)
self.chain[id]=node
return id
def collapse(self,id):
"""Deletes node from chain and relinks successors to predecessor: collapse(self, id)."""
if id not in self.chain:
raise ChainException('Unknown ID: '+str(id))
prev_id=self.chain[id].get_prev()
self.chain[prev_id].remove_succ(id)
succ_ids=self.chain[id].get_succ()
for i in succ_ids:
self.chain[i].set_prev(prev_id)
self.chain[prev_id].add_succ(succ_ids)
node=self.chain[id]
self.kill(id)
return node
def kill(self,id):
"""Kills a node from chain without caring to what it is connected: kill(self,id)."""
if id not in self.chain:
raise ChainException('Unknown ID: '+str(id))
else:
del self.chain[id]
def unlink(self,id):
"""Disconnects node from his predecessor: unlink(self,id)."""
if id not in self.chain:
raise ChainException('Unknown ID: '+str(id))
else:
prev_id=self.chain[id].prev
if prev_id is not None:
self.chain[prev_id].succ.pop(self.chain[prev_id].succ.index(id))
self.chain[id].prev=None
return prev_id
def link(self, parent,child):
"""Connects son to parent: link(self,son,parent)."""
if child not in self.chain:
raise ChainException('Unknown ID: '+str(child))
elif parent not in self.chain:
raise ChainException('Unknown ID: '+str(parent))
else:
self.unlink(child)
self.chain[parent].succ.append(child)
self.chain[child].set_prev(parent)
def is_parent_of(self,parent,grandchild):
"""Check if grandchild is a subnode of parent: is_parent_of(self,parent,grandchild)."""
if grandchild==parent or grandchild in self.chain[parent].get_succ():
return True
else:
for sn in self.chain[parent].get_succ():
if self.is_parent_of(sn,grandchild):
return True
else:
return False
def trace(self,start,finish):
"""Returns a list of all node_ids between two nodes (excluding start, including end): trace(start,end)."""
if start not in self.chain or finish not in self.chain:
raise NodeException('Unknown node.')
if not self.is_parent_of(start,finish) or start==finish:
return []
for sn in self.chain[start].get_succ():
if self.is_parent_of(sn,finish):
return [sn]+self.trace(sn,finish)
class Node:
"""A single node."""
def __init__(self,data=None):
"""Represents a node with one predecessor and multiple successors: (self, data=None)."""
self.id=None
self.data=data
self.prev=None
self.succ=[]
def set_id(self,id):
"""Sets the id of a node, if not set yet: (self,id)."""
if self.id is not None:
raise NodeException, 'Node id cannot be changed.'
self.id=id
def get_id(self):
"""Returns the node's id: (self)."""
return self.id
def get_succ(self):
"""Returns a list of the node's successors: (self)."""
return self.succ
def get_prev(self):
"""Returns the id of the node's predecessor: (self)."""
return self.prev
def add_succ(self,id):
"""Adds a node id to the node's successors: (self,id)."""
if isinstance(id,type([])):
self.succ.extend(id)
else:
self.succ.append(id)
def remove_succ(self,id):
"""Removes a node id from the node's successors: (self,id)."""
self.succ.remove(id)
def set_succ(self,new_succ):
"""Sets the node's successors: (self,new_succ)."""
if not isinstance(new_succ,type([])):
raise NodeException, 'Node successor must be of list type.'
self.succ=new_succ
def set_prev(self,id):
"""Sets the node's predecessor: (self,id)."""
self.prev=id
def get_data(self):
"""Returns a node's data: (self)."""
return self.data
def set_data(self,data):
"""Sets a node's data: (self,data)."""
self.data=data
PRECISION_BRANCHLENGTH=6
PRECISION_SUPPORT=6
class Nodes: # this is just to pretend that the preceding is in a Nodes module
ChainException = ChainException
NodeException = NodeException
Chain = Chain
Node = Node
class TreeError(Exception): pass
class NodeData:
"""Stores tree-relevant data associated with nodes (e.g. branches or otus)."""
def __init__(self,taxon=None,branchlength=0.0,support=None):
self.taxon=taxon
self.branchlength=branchlength
self.support=support
class Tree(Nodes.Chain):
"""Represents a tree using a chain of nodes with on predecessor (=ancestor)
and multiple successors (=subclades).
"""
# A newick tree is parsed into nested list and then converted to a node list in two stages
# mostly due to hsitorical reasons. This could be done in one swoop). Note: parentheses ( ) and
# colon : are not allowed in taxon names. This is against NEXUS standard, but makes life much
# easier when parsing trees.
def __init__(self,tree=None,weight=1,rooted=False,name='',data=NodeData,values_are_support=False):
"""Ntree(self,tree)."""
Nodes.Chain.__init__(self)
self.__values_are_support=values_are_support
self.weight=weight
self.rooted=rooted
self.name=name
root=Nodes.Node(data())
self.add(root)
self.root=root.id
if tree: # use the tree we have
self._add_subtree(parent_id=root.id,tree=self._parse(tree)[0],data=data)
def _parse(self,tree):
"""Parses (a,b,c...)[[[xx]:]yy] into subcomponents and travels down recursively."""
if tree.count('(')!=tree.count(')'):
raise TreeError, 'Parentheses do not match in (sub)tree: '+tree
if tree.count('(')==0: # a leaf
colon=tree.rfind(':')
if colon>-1:
return [tree[:colon],self._get_values(tree[colon+1:])]
else:
return [tree,[None]]
else:
closing=tree.rfind(')')
val=self._get_values(tree[closing+1:])
if not val:
val=[None]
subtrees=[]
plevel=0
prev=1
for p in range(1,closing):
if tree[p]=='(':
plevel+=1
elif tree[p]==')':
plevel-=1
elif tree[p]==',' and plevel==0:
subtrees.append(tree[prev:p])
prev=p+1
subtrees.append(tree[prev:closing])
subclades=[self._parse(subtree) for subtree in subtrees]
return [subclades,val]
def _add_subtree(self,parent_id=None,tree=None,data=NodeData):
"""Adds leaf or tree (in newick format) to a parent_id. (self,parent_id,tree)."""
if parent_id is None:
raise TreeError('Need node_id to connect to.')
for st in tree:
if type(st[0])==list: # it's a subtree
nd=data()
if len(st[1])>=2: # if there's two values, support comes first. Is that always so?
nd.support=st[1][0]
if st[1][1] is not None:
nd.branchlength=st[1][1]
elif len(st[1])==1: # otherwise it could be real branchlengths or support as branchlengths
if not self.__values_are_support: # default
if st[1][0] is not None:
nd.branchlength=st[1][0]
else:
nd.support=st[1][0]
sn=Nodes.Node(nd)
self.add(sn,parent_id)
self._add_subtree(sn.id,st[0])
else: # it's a leaf
nd=data()
nd.taxon=st[0]
if len(st)>1:
if len(st[1])>=2: # if there's two values, support comes first. Is that always so?
nd.support=st[1][0]
if st[1][1] is not None:
nd.branchlength=st[1][1]
elif len(st[1])==1: # otherwise it could be real branchlengths or support as branchlengths
if not self.__values_are_support: # default
if st[1][0] is not None:
nd.branchlength=st[1][0]
else:
nd.support=st[1][0]
leaf=Nodes.Node(nd)
self.add(leaf,parent_id)
def _get_values(self, text):
"""Extracts values (support/branchlength) from xx[:yyy], xx."""
if text=='':
return None
return [float(t) for t in text.split(':') if t.strip()]
def node(self,node_id):
"""Return the instance of node_id.
node = node(self,node_id)
"""
if node_id not in self.chain:
raise TreeError('Unknown node_id: %d' % node_id)
return self.chain[node_id]
def split(self,parent_id=None,n=2,branchlength=1.0):
"""Speciation: generates n (default two) descendants of a node.
[new ids] = split(self,parent_id=None,n=2,branchlength=1.0):
"""
if parent_id is None:
raise TreeError('Missing node_id.')
ids=[]
parent_data=self.chain[parent_id].get_data()
for i in range(n):
node=Nodes.Node()
if parent_data:
# the data class is probably some subclass from NodeData we don't know
# we need a new instance for that
node.data=parent_data.__class__()
# each node has taxon and branchlength attribute
if parent_data.taxon:
node.data.taxon=parent_data.taxon+str(i)
node.data.branchlength=branchlength
ids.append(self.add(node,parent_id))
return ids
def search_taxon(self,taxon):
"""Returns the first matching taxon in self.data.taxon. Not restricted to terminal nodes.
node_id = search_taxon(self,taxon)
"""
for id,node in self.chain.items():
if node.get_data().taxon==taxon:
return id
return None
def prune(self,taxon):
"""Prunes a terminal taxon from the tree.
id_of_previous_node = prune(self,taxon)
If taxon is from a bifurcation, the connectiong node will be collapsed
and its branchlength added to remaining terminal node. This might be no
longer a meaningful value'
"""
id=self.search_taxon(taxon)
if id is None:
raise TreeError('Taxon not found: %s' % taxon)
elif id not in self.get_terminals():
raise TreeError('Not a terminal taxon: %s' % taxon)
else:
prev=self.unlink(id)
self.kill(id)
if not prev==self.root and len(self.node(prev).succ)==1:
succ=self.node(prev).get_succ(0)
new_bl=self.node(prev).data.branchlength+self.node(succ).data.branchlength
self.collapse(prev)
self.node(succ).data.branchlength=new_bl
return prev
def get_taxa(self,node_id=None):
"""Return a list of all otus downwards from a node (self, node_id).
nodes = get_taxa(self,node_id=None)
"""
if node_id is None:
node_id=self.root
if node_id not in self.chain:
raise TreeError('Unknown node_id: %d.' % node_id)
if self.chain[node_id].get_succ()==[]:
if self.chain[node_id].get_data():
return [self.chain[node_id].get_data().taxon]
else:
return None
else:
list=[]
for succ in self.chain[node_id].get_succ():
list.extend(self.get_taxa(succ))
return list
def get_terminals(self):
"""Return a list of all terminal nodes."""
return [i for i,n in self.chain.items() if n.succ==[]]
def sum_branchlength(self,root=None,node=None):
"""Adds up the branchlengths from root (default self.root) to node.
sum = sum_branchlength(self,root=None,node=None)
"""
if root is None:
root=self.root
if node is None:
raise TreeError('Missing node id.')
blen=0.0
while node is not None and node is not root:
blen+=self.chain[node].get_data().branchlength
node=self.chain[node].get_prev()
return blen
def set_subtree(self,node):
"""Return subtree as a set of nested sets.
sets = set_subtree(self,node)
"""
if self.node(node).succ==[]:
return self.node(node).data.taxon
else:
return sets.Set([self.set_subtree(n) for n in self.node(node).succ])
def is_identical(self,tree2):
"""Compare tree and tree2 for identity.
result = is_identical(self,tree2)
"""
return self.set_subtree(self.root)==tree2.set_subtree(tree2.root)
def is_compatible(self,tree2,threshold):
"""Compares branches with support>threshold for compatibility.
result = is_compatible(self,tree2,threshold)
"""
if sets.Set(self.get_taxa())!=sets.Set(tree2.get_taxa()):
raise TreeError, 'Can\'t compare trees with different taxon compositions.'
t1=[(sets.Set(self.get_taxa(n)),self.node(n).data.support) for n in self.chain.keys() if \
self.node(n).succ and\
(self.node(n).data and self.node(n).data.support and self.node(n).data.support>=threshold)]
t2=[(sets.Set(tree2.get_taxa(n)),tree2.node(n).data.support) for n in tree2.chain.keys() if \
tree2.node(n).succ and\
(tree2.node(n).data and tree2.node(n).data.support and tree2.node(n).data.support>=threshold)]
conflict=[]
for (st1,sup1) in t1:
for (st2,sup2) in t2:
if not st1.issubset(st2) and not st2.issubset(st1):
intersect,notin1,notin2=st1 & st2, st2-st1, st1-st2
if intersect:
conflict.append((st1,sup1,st2,sup2,intersect,notin1,notin2))
return conflict
def common_ancestor(self,node1,node2):
"""Return the common ancestor that connects to nodes.
node_id = common_ancestor(self,node1,node2)
"""
l1=[self.root]+self.trace(self.root,node1)
l2=[self.root]+self.trace(self.root,node2)
return [n for n in l1 if n in l2][-1]
def distance(self,node1,node2):
"""Add and return the sum of the branchlengths between two nodes.
dist = distance(self,node1,node2)
"""
ca=self.common_ancestor(node1,node2)
return self.sum_branchlength(ca,node1)+self.sum_branchlength(ca,node2)
def is_monophyletic(self,taxon_list):
"""Return node_id of common ancestor if taxon_list is monophyletic, -1 otherwise.
result = is_monophyletic(self,taxon_list)
"""
if isinstance(taxon_list,str):
taxon_set=sets.Set([taxon_list])
else:
taxon_set=sets.Set(taxon_list)
node_id=self.root
while 1:
subclade_taxa=sets.Set(self.get_taxa(node_id))
if subclade_taxa==taxon_set: # are we there?
return node_id
else: # check subnodes
for subnode in self.chain[node_id].get_succ():
if sets.Set(self.get_taxa(subnode)).issuperset(taxon_set): # taxon_set is downstream
node_id=subnode
break # out of for loop
else:
return -1 # taxon set was not with successors, for loop exhausted
def branchlength2support(self):
"""Move values stored in data.branchlength to data.support, and set branchlength to 0.0
This is necessary when support has been stored as branchlength (e.g. paup), and has thus
been read in as branchlength.
"""
for n in self.chain.keys():
self.node(n).data.support=self.node(n).data.branchlength
self.node(n).data.branchlength=0.0
def randomize(self,ntax=None,taxon_list=None,branchlength=1.0,branchlength_sd=None,bifurcate=True):
"""Generates a random tree with ntax taxa and/or taxa from taxlabels.
new_tree = randomize(self,ntax=None,taxon_list=None,branchlength=1.0,branchlength_sd=None,bifurcate=True)
Trees are bifurcating by default. (Polytomies not yet supported).
"""
if not ntax and taxon_list:
ntax=len(taxon_list)
elif not taxon_list and ntax:
taxon_list=['taxon'+str(i+1) for i in range(ntax)]
elif not ntax and not taxon_list:
raise TreeError('Either numer of taxa or list of taxa must be specified.')
elif ntax<>len(taxon_list):
raise TreeError('Length of taxon list must correspond to ntax.')
# initiate self with empty root
self.__init__(plain=False)
terminals=self.get_terminals()
# bifurcate randomly at terminal nodes until ntax is reached
while len(terminals)1:
treeline+=' [&W%s]' % str(round(float(self.weight),3))
if self.rooted:
treeline+=' [&R]'
if internal_taxon_labels:
treeline+=' = (%s)%s;' % (','.join(map(newickize,self.node(self.root).succ)), self.node(0).data.taxon)
else:
treeline+=' = (%s);' % ','.join(map(newickize,self.node(self.root).succ))
return treeline
def __str__(self):
"""Short version of to_string(), gives plain tree"""
return self.to_string(plain=True)
############### END EXTRACTED FROM BIOPYTHON ###############
def namesFormatForTree(tree):
template = "%(tax_id)s\t|\t%(name_txt)s\t|\t\t|\tscientific name\t|"
nameLines = []
for nodeID in tree.chain:
name = "-".join(tree.get_taxa(nodeID))
values = {"name_txt":name, "tax_id":str(nodeID+1)} # ids must start with 1 in dmp file
nameLines.append(template % values)
return "\n".join(nameLines) + "\n"
def nodesFormatForTree(tree):
template = "%(tax_id)s\t|\t%(parent_tax_id)s\t|\tspecies\t|\t\t|\t0\t|\t1\t|\t11\t|\t1\t|\t1\t|\t1\t|\t0\t|\t0\t|\t\t|"
nodeLines = []
for nodeID, node in tree.chain.items():
parentTaxID = node.get_prev()
if parentTaxID == None: parentTaxID = 0
values = {"tax_id":str(nodeID+1), "parent_tax_id":str(parentTaxID+1)} # ids must start with 1 in dmp file
nodeLines.append(template % values)
return "\n".join(nodeLines) + "\n"
def xmlSequencesTextForGeneTree(tree):
template = "%(node_id)s%(node_id)s%(gene_name)s%(species_name)sMA01complete"
geneLines = []
for nodeID in tree.get_terminals():
name = tree.node(nodeID).data.taxon
speciesName, geneName = name.split("+")
values = {"node_id":str(nodeID), "gene_name":geneName, "species_name":speciesName}
geneLines.append(template % values)
return "\n".join(geneLines) + "\n"
def indexTaxonomyFiles(workdirPath):
indexer = subprocess.Popen(["java", "softparsmap.DataSourceXmlNcbiTaxonomy", "prop.xml", "example"], cwd=workdirPath, stdout=sys.stderr)
indexer.wait()
def numericTaxaTree(tree):
newick = newickForTree(tree, support=True)
treeCopy = Tree(newick, values_are_support=True)
for nodeID in treeCopy.get_terminals():
treeCopy.node(nodeID).data.taxon = str(nodeID)
return treeCopy
def newickForTree(tree, support=False):
if support:
return re.findall("\(.*\)", tree.to_string(support_as_branchlengths=True))[0]
else:
return re.findall("\(.*\)", tree.to_string())[0]
def parseSchreiber(schreiberText="[[4,5],[2,[1]],[9,10],[7,[3]],[[2],[4]]]\n[S,S,S,S,D]\n"):
lines = schreiberText.split("\n")
nodeList = eval(lines[0]) # this eval only works if the only list content is numbers
labelList = (lines[1])[1:-1].split(",")
listOfNodes = []
tree = Tree()
root = tree.node(0)
for item in nodeList[:-1]: # last item is root node
aNode = Nodes.Node(NodeData())
tree.add(aNode)
listOfNodes.append(aNode)
for subitem in item:
if type(subitem) == list:
index = int(subitem[0]) - 1
aNode.add_succ(listOfNodes[index].get_id())
else: # subitem should be a number which is a terminal label
terminalNode = Nodes.Node(NodeData())
terminalNode.data.taxon = str(subitem)
tree.add(terminalNode, aNode.get_id())
listOfNodes.append(root)
for childIndex in nodeList[-1]: # this connects nodes to the root
if type(childIndex) == list:
index = int(childIndex[0]) - 1
root.add_succ(listOfNodes[index].get_id())
else:
terminalNode = Nodes.Node(NodeData())
terminalNode.data.taxon = str(childIndex)
tree.add(terminalNode, root.get_id())
for index in range(len(labelList)):
internalLabel = labelList[index]
node = listOfNodes[index]
node.data.taxon = internalLabel
return tree
# def applyInternalLabelsToTreeUsingSchreiberText(geneTree, schreiberText):
# lines = schreiberText.split("\n")
# nodeList = eval(lines[0])
# labelList = eval(lines[1])
# for item in nodeList:
# for subitem in item:
def runSoftParsMapRooting(workdirPath):
softparsmap = subprocess.Popen(["java", "softparsmap.Compute", "prop.xml", "root_example", "unrooted"], cwd=workdirPath, stdout=sys.stderr)
softparsmap.wait()
rootedTreeText = (file(os.path.join(workdirPath, "target", "family1_rooted.tree")).read()).strip()[:-1]
return Tree(rootedTreeText, values_are_support=True)
def runSoftParsMapDuplicationMapping(workdirPath):
softparsmap = subprocess.Popen(["java", "softparsmap.Compute", "prop.xml", "map_example", "rooted"], cwd=workdirPath, stdout=sys.stderr)
softparsmap.wait()
schreiberOutput = (file(os.path.join(workdirPath, "target", "family1_map.info")).read()).strip()
labeledTree = parseSchreiber(schreiberText=schreiberOutput)
return labeledTree
def fixupRootedTreeTaxonNames(rootedTree, origGeneTree):
for nodeID in rootedTree.get_terminals():
taxonNumber = rootedTree.node(nodeID).data.taxon
taxonName = origGeneTree.node(int(taxonNumber)).data.taxon
rootedTree.node(nodeID).data.taxon = taxonName
def main():
speciesTreeText = file(sys.argv[1]).read().strip()
speciesTree = Tree(speciesTreeText[:-1]) # removing semicolon
geneTreeText = file(sys.argv[2]).read().strip()
geneTree = Tree(geneTreeText[:-1], values_are_support=True) # removing semicolon
workdirPath = tempfile.mkdtemp()
#print workdirPath
treedirPath = os.path.join(workdirPath, "trees")
os.mkdir(treedirPath)
namesFile = file(os.path.join(workdirPath, "names.dmp"), "w")
nodesFile = file(os.path.join(workdirPath, "nodes.dmp"), "w")
seqsFile = file(os.path.join(workdirPath, "db.drw"), "w")
geneTreeFile = file(os.path.join(treedirPath, "family1.tree"), "w")
propertiesFile = file(os.path.join(workdirPath, "prop.xml"), "w")
namesFile.write(namesFormatForTree(speciesTree))
namesFile.close()
nodesFile.write(nodesFormatForTree(speciesTree))
nodesFile.close()
seqsFile.write(xmlSequencesTextForGeneTree(geneTree))
seqsFile.close()
geneTreeText = newickForTree(numericTaxaTree(geneTree), support=True) + ";\n"
geneTreeFile.write(geneTreeText)
geneTreeFile.close()
propertiesFile.write(propertyXML)
propertiesFile.close()
indexTaxonomyFiles(workdirPath)
rootedTree = runSoftParsMapRooting(workdirPath)
labeledTree = runSoftParsMapDuplicationMapping(workdirPath)
fixupRootedTreeTaxonNames(labeledTree, geneTree)
treeString = labeledTree.to_string(internal_taxon_labels=True)
print treeString[treeString.find("("):]
if __name__ == '__main__':
main()