import sys import os import re import tempfile import subprocess import random, sets #from Bio.Nexus.Trees import Tree #from Bio.Nexus.Nodes import Node, Chain propertyXML = """ """ ############### BEGIN EXTRACTED FROM BIOPYTHON ############### # Biopython License Agreement # # Permission to use, copy, modify, and distribute this software and its # documentation with or without modifications and for any purpose and # without fee is hereby granted, provided that any copyright notices # appear in all copies and that both those copyright notices and this # permission notice appear in supporting documentation, and that the # names of the contributors or copyright holders not be used in # advertising or publicity pertaining to distribution of the software # without specific prior permission. # # THE CONTRIBUTORS AND COPYRIGHT HOLDERS OF THIS SOFTWARE DISCLAIM ALL # WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL THE # CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT # OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS # OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE # OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE # OR PERFORMANCE OF THIS SOFTWARE. class ChainException(Exception): pass class NodeException(Exception): pass class Chain: """Stores a list of nodes that are linked together.""" def __init__(self): """Initiates a node chain: (self).""" self.chain={} self.id=-1 def _get_id(self): """Gets a new id for a node in the chain.""" self.id+=1 return self.id def add(self,node,prev=None): """Attaches node to another: (self, node, prev).""" if prev is not None and prev not in self.chain: raise ChainException('Unknow predecessor: '+str(prev)) else: id=self._get_id() node.set_id(id) node.set_prev(prev) if prev is not None: self.chain[prev].add_succ(id) self.chain[id]=node return id def collapse(self,id): """Deletes node from chain and relinks successors to predecessor: collapse(self, id).""" if id not in self.chain: raise ChainException('Unknown ID: '+str(id)) prev_id=self.chain[id].get_prev() self.chain[prev_id].remove_succ(id) succ_ids=self.chain[id].get_succ() for i in succ_ids: self.chain[i].set_prev(prev_id) self.chain[prev_id].add_succ(succ_ids) node=self.chain[id] self.kill(id) return node def kill(self,id): """Kills a node from chain without caring to what it is connected: kill(self,id).""" if id not in self.chain: raise ChainException('Unknown ID: '+str(id)) else: del self.chain[id] def unlink(self,id): """Disconnects node from his predecessor: unlink(self,id).""" if id not in self.chain: raise ChainException('Unknown ID: '+str(id)) else: prev_id=self.chain[id].prev if prev_id is not None: self.chain[prev_id].succ.pop(self.chain[prev_id].succ.index(id)) self.chain[id].prev=None return prev_id def link(self, parent,child): """Connects son to parent: link(self,son,parent).""" if child not in self.chain: raise ChainException('Unknown ID: '+str(child)) elif parent not in self.chain: raise ChainException('Unknown ID: '+str(parent)) else: self.unlink(child) self.chain[parent].succ.append(child) self.chain[child].set_prev(parent) def is_parent_of(self,parent,grandchild): """Check if grandchild is a subnode of parent: is_parent_of(self,parent,grandchild).""" if grandchild==parent or grandchild in self.chain[parent].get_succ(): return True else: for sn in self.chain[parent].get_succ(): if self.is_parent_of(sn,grandchild): return True else: return False def trace(self,start,finish): """Returns a list of all node_ids between two nodes (excluding start, including end): trace(start,end).""" if start not in self.chain or finish not in self.chain: raise NodeException('Unknown node.') if not self.is_parent_of(start,finish) or start==finish: return [] for sn in self.chain[start].get_succ(): if self.is_parent_of(sn,finish): return [sn]+self.trace(sn,finish) class Node: """A single node.""" def __init__(self,data=None): """Represents a node with one predecessor and multiple successors: (self, data=None).""" self.id=None self.data=data self.prev=None self.succ=[] def set_id(self,id): """Sets the id of a node, if not set yet: (self,id).""" if self.id is not None: raise NodeException, 'Node id cannot be changed.' self.id=id def get_id(self): """Returns the node's id: (self).""" return self.id def get_succ(self): """Returns a list of the node's successors: (self).""" return self.succ def get_prev(self): """Returns the id of the node's predecessor: (self).""" return self.prev def add_succ(self,id): """Adds a node id to the node's successors: (self,id).""" if isinstance(id,type([])): self.succ.extend(id) else: self.succ.append(id) def remove_succ(self,id): """Removes a node id from the node's successors: (self,id).""" self.succ.remove(id) def set_succ(self,new_succ): """Sets the node's successors: (self,new_succ).""" if not isinstance(new_succ,type([])): raise NodeException, 'Node successor must be of list type.' self.succ=new_succ def set_prev(self,id): """Sets the node's predecessor: (self,id).""" self.prev=id def get_data(self): """Returns a node's data: (self).""" return self.data def set_data(self,data): """Sets a node's data: (self,data).""" self.data=data PRECISION_BRANCHLENGTH=6 PRECISION_SUPPORT=6 class Nodes: # this is just to pretend that the preceding is in a Nodes module ChainException = ChainException NodeException = NodeException Chain = Chain Node = Node class TreeError(Exception): pass class NodeData: """Stores tree-relevant data associated with nodes (e.g. branches or otus).""" def __init__(self,taxon=None,branchlength=0.0,support=None): self.taxon=taxon self.branchlength=branchlength self.support=support class Tree(Nodes.Chain): """Represents a tree using a chain of nodes with on predecessor (=ancestor) and multiple successors (=subclades). """ # A newick tree is parsed into nested list and then converted to a node list in two stages # mostly due to hsitorical reasons. This could be done in one swoop). Note: parentheses ( ) and # colon : are not allowed in taxon names. This is against NEXUS standard, but makes life much # easier when parsing trees. def __init__(self,tree=None,weight=1,rooted=False,name='',data=NodeData,values_are_support=False): """Ntree(self,tree).""" Nodes.Chain.__init__(self) self.__values_are_support=values_are_support self.weight=weight self.rooted=rooted self.name=name root=Nodes.Node(data()) self.add(root) self.root=root.id if tree: # use the tree we have self._add_subtree(parent_id=root.id,tree=self._parse(tree)[0],data=data) def _parse(self,tree): """Parses (a,b,c...)[[[xx]:]yy] into subcomponents and travels down recursively.""" if tree.count('(')!=tree.count(')'): raise TreeError, 'Parentheses do not match in (sub)tree: '+tree if tree.count('(')==0: # a leaf colon=tree.rfind(':') if colon>-1: return [tree[:colon],self._get_values(tree[colon+1:])] else: return [tree,[None]] else: closing=tree.rfind(')') val=self._get_values(tree[closing+1:]) if not val: val=[None] subtrees=[] plevel=0 prev=1 for p in range(1,closing): if tree[p]=='(': plevel+=1 elif tree[p]==')': plevel-=1 elif tree[p]==',' and plevel==0: subtrees.append(tree[prev:p]) prev=p+1 subtrees.append(tree[prev:closing]) subclades=[self._parse(subtree) for subtree in subtrees] return [subclades,val] def _add_subtree(self,parent_id=None,tree=None,data=NodeData): """Adds leaf or tree (in newick format) to a parent_id. (self,parent_id,tree).""" if parent_id is None: raise TreeError('Need node_id to connect to.') for st in tree: if type(st[0])==list: # it's a subtree nd=data() if len(st[1])>=2: # if there's two values, support comes first. Is that always so? nd.support=st[1][0] if st[1][1] is not None: nd.branchlength=st[1][1] elif len(st[1])==1: # otherwise it could be real branchlengths or support as branchlengths if not self.__values_are_support: # default if st[1][0] is not None: nd.branchlength=st[1][0] else: nd.support=st[1][0] sn=Nodes.Node(nd) self.add(sn,parent_id) self._add_subtree(sn.id,st[0]) else: # it's a leaf nd=data() nd.taxon=st[0] if len(st)>1: if len(st[1])>=2: # if there's two values, support comes first. Is that always so? nd.support=st[1][0] if st[1][1] is not None: nd.branchlength=st[1][1] elif len(st[1])==1: # otherwise it could be real branchlengths or support as branchlengths if not self.__values_are_support: # default if st[1][0] is not None: nd.branchlength=st[1][0] else: nd.support=st[1][0] leaf=Nodes.Node(nd) self.add(leaf,parent_id) def _get_values(self, text): """Extracts values (support/branchlength) from xx[:yyy], xx.""" if text=='': return None return [float(t) for t in text.split(':') if t.strip()] def node(self,node_id): """Return the instance of node_id. node = node(self,node_id) """ if node_id not in self.chain: raise TreeError('Unknown node_id: %d' % node_id) return self.chain[node_id] def split(self,parent_id=None,n=2,branchlength=1.0): """Speciation: generates n (default two) descendants of a node. [new ids] = split(self,parent_id=None,n=2,branchlength=1.0): """ if parent_id is None: raise TreeError('Missing node_id.') ids=[] parent_data=self.chain[parent_id].get_data() for i in range(n): node=Nodes.Node() if parent_data: # the data class is probably some subclass from NodeData we don't know # we need a new instance for that node.data=parent_data.__class__() # each node has taxon and branchlength attribute if parent_data.taxon: node.data.taxon=parent_data.taxon+str(i) node.data.branchlength=branchlength ids.append(self.add(node,parent_id)) return ids def search_taxon(self,taxon): """Returns the first matching taxon in self.data.taxon. Not restricted to terminal nodes. node_id = search_taxon(self,taxon) """ for id,node in self.chain.items(): if node.get_data().taxon==taxon: return id return None def prune(self,taxon): """Prunes a terminal taxon from the tree. id_of_previous_node = prune(self,taxon) If taxon is from a bifurcation, the connectiong node will be collapsed and its branchlength added to remaining terminal node. This might be no longer a meaningful value' """ id=self.search_taxon(taxon) if id is None: raise TreeError('Taxon not found: %s' % taxon) elif id not in self.get_terminals(): raise TreeError('Not a terminal taxon: %s' % taxon) else: prev=self.unlink(id) self.kill(id) if not prev==self.root and len(self.node(prev).succ)==1: succ=self.node(prev).get_succ(0) new_bl=self.node(prev).data.branchlength+self.node(succ).data.branchlength self.collapse(prev) self.node(succ).data.branchlength=new_bl return prev def get_taxa(self,node_id=None): """Return a list of all otus downwards from a node (self, node_id). nodes = get_taxa(self,node_id=None) """ if node_id is None: node_id=self.root if node_id not in self.chain: raise TreeError('Unknown node_id: %d.' % node_id) if self.chain[node_id].get_succ()==[]: if self.chain[node_id].get_data(): return [self.chain[node_id].get_data().taxon] else: return None else: list=[] for succ in self.chain[node_id].get_succ(): list.extend(self.get_taxa(succ)) return list def get_terminals(self): """Return a list of all terminal nodes.""" return [i for i,n in self.chain.items() if n.succ==[]] def sum_branchlength(self,root=None,node=None): """Adds up the branchlengths from root (default self.root) to node. sum = sum_branchlength(self,root=None,node=None) """ if root is None: root=self.root if node is None: raise TreeError('Missing node id.') blen=0.0 while node is not None and node is not root: blen+=self.chain[node].get_data().branchlength node=self.chain[node].get_prev() return blen def set_subtree(self,node): """Return subtree as a set of nested sets. sets = set_subtree(self,node) """ if self.node(node).succ==[]: return self.node(node).data.taxon else: return sets.Set([self.set_subtree(n) for n in self.node(node).succ]) def is_identical(self,tree2): """Compare tree and tree2 for identity. result = is_identical(self,tree2) """ return self.set_subtree(self.root)==tree2.set_subtree(tree2.root) def is_compatible(self,tree2,threshold): """Compares branches with support>threshold for compatibility. result = is_compatible(self,tree2,threshold) """ if sets.Set(self.get_taxa())!=sets.Set(tree2.get_taxa()): raise TreeError, 'Can\'t compare trees with different taxon compositions.' t1=[(sets.Set(self.get_taxa(n)),self.node(n).data.support) for n in self.chain.keys() if \ self.node(n).succ and\ (self.node(n).data and self.node(n).data.support and self.node(n).data.support>=threshold)] t2=[(sets.Set(tree2.get_taxa(n)),tree2.node(n).data.support) for n in tree2.chain.keys() if \ tree2.node(n).succ and\ (tree2.node(n).data and tree2.node(n).data.support and tree2.node(n).data.support>=threshold)] conflict=[] for (st1,sup1) in t1: for (st2,sup2) in t2: if not st1.issubset(st2) and not st2.issubset(st1): intersect,notin1,notin2=st1 & st2, st2-st1, st1-st2 if intersect: conflict.append((st1,sup1,st2,sup2,intersect,notin1,notin2)) return conflict def common_ancestor(self,node1,node2): """Return the common ancestor that connects to nodes. node_id = common_ancestor(self,node1,node2) """ l1=[self.root]+self.trace(self.root,node1) l2=[self.root]+self.trace(self.root,node2) return [n for n in l1 if n in l2][-1] def distance(self,node1,node2): """Add and return the sum of the branchlengths between two nodes. dist = distance(self,node1,node2) """ ca=self.common_ancestor(node1,node2) return self.sum_branchlength(ca,node1)+self.sum_branchlength(ca,node2) def is_monophyletic(self,taxon_list): """Return node_id of common ancestor if taxon_list is monophyletic, -1 otherwise. result = is_monophyletic(self,taxon_list) """ if isinstance(taxon_list,str): taxon_set=sets.Set([taxon_list]) else: taxon_set=sets.Set(taxon_list) node_id=self.root while 1: subclade_taxa=sets.Set(self.get_taxa(node_id)) if subclade_taxa==taxon_set: # are we there? return node_id else: # check subnodes for subnode in self.chain[node_id].get_succ(): if sets.Set(self.get_taxa(subnode)).issuperset(taxon_set): # taxon_set is downstream node_id=subnode break # out of for loop else: return -1 # taxon set was not with successors, for loop exhausted def branchlength2support(self): """Move values stored in data.branchlength to data.support, and set branchlength to 0.0 This is necessary when support has been stored as branchlength (e.g. paup), and has thus been read in as branchlength. """ for n in self.chain.keys(): self.node(n).data.support=self.node(n).data.branchlength self.node(n).data.branchlength=0.0 def randomize(self,ntax=None,taxon_list=None,branchlength=1.0,branchlength_sd=None,bifurcate=True): """Generates a random tree with ntax taxa and/or taxa from taxlabels. new_tree = randomize(self,ntax=None,taxon_list=None,branchlength=1.0,branchlength_sd=None,bifurcate=True) Trees are bifurcating by default. (Polytomies not yet supported). """ if not ntax and taxon_list: ntax=len(taxon_list) elif not taxon_list and ntax: taxon_list=['taxon'+str(i+1) for i in range(ntax)] elif not ntax and not taxon_list: raise TreeError('Either numer of taxa or list of taxa must be specified.') elif ntax<>len(taxon_list): raise TreeError('Length of taxon list must correspond to ntax.') # initiate self with empty root self.__init__(plain=False) terminals=self.get_terminals() # bifurcate randomly at terminal nodes until ntax is reached while len(terminals)1: treeline+=' [&W%s]' % str(round(float(self.weight),3)) if self.rooted: treeline+=' [&R]' if internal_taxon_labels: treeline+=' = (%s)%s;' % (','.join(map(newickize,self.node(self.root).succ)), self.node(0).data.taxon) else: treeline+=' = (%s);' % ','.join(map(newickize,self.node(self.root).succ)) return treeline def __str__(self): """Short version of to_string(), gives plain tree""" return self.to_string(plain=True) ############### END EXTRACTED FROM BIOPYTHON ############### def namesFormatForTree(tree): template = "%(tax_id)s\t|\t%(name_txt)s\t|\t\t|\tscientific name\t|" nameLines = [] for nodeID in tree.chain: name = "-".join(tree.get_taxa(nodeID)) values = {"name_txt":name, "tax_id":str(nodeID+1)} # ids must start with 1 in dmp file nameLines.append(template % values) return "\n".join(nameLines) + "\n" def nodesFormatForTree(tree): template = "%(tax_id)s\t|\t%(parent_tax_id)s\t|\tspecies\t|\t\t|\t0\t|\t1\t|\t11\t|\t1\t|\t1\t|\t1\t|\t0\t|\t0\t|\t\t|" nodeLines = [] for nodeID, node in tree.chain.items(): parentTaxID = node.get_prev() if parentTaxID == None: parentTaxID = 0 values = {"tax_id":str(nodeID+1), "parent_tax_id":str(parentTaxID+1)} # ids must start with 1 in dmp file nodeLines.append(template % values) return "\n".join(nodeLines) + "\n" def xmlSequencesTextForGeneTree(tree): template = "%(node_id)s%(node_id)s%(gene_name)s%(species_name)sMA01complete" geneLines = [] for nodeID in tree.get_terminals(): name = tree.node(nodeID).data.taxon speciesName, geneName = name.split("+") values = {"node_id":str(nodeID), "gene_name":geneName, "species_name":speciesName} geneLines.append(template % values) return "\n".join(geneLines) + "\n" def indexTaxonomyFiles(workdirPath): indexer = subprocess.Popen(["java", "softparsmap.DataSourceXmlNcbiTaxonomy", "prop.xml", "example"], cwd=workdirPath, stdout=sys.stderr) indexer.wait() def numericTaxaTree(tree): newick = newickForTree(tree, support=True) treeCopy = Tree(newick, values_are_support=True) for nodeID in treeCopy.get_terminals(): treeCopy.node(nodeID).data.taxon = str(nodeID) return treeCopy def newickForTree(tree, support=False): if support: return re.findall("\(.*\)", tree.to_string(support_as_branchlengths=True))[0] else: return re.findall("\(.*\)", tree.to_string())[0] def parseSchreiber(schreiberText="[[4,5],[2,[1]],[9,10],[7,[3]],[[2],[4]]]\n[S,S,S,S,D]\n"): lines = schreiberText.split("\n") nodeList = eval(lines[0]) # this eval only works if the only list content is numbers labelList = (lines[1])[1:-1].split(",") listOfNodes = [] tree = Tree() root = tree.node(0) for item in nodeList[:-1]: # last item is root node aNode = Nodes.Node(NodeData()) tree.add(aNode) listOfNodes.append(aNode) for subitem in item: if type(subitem) == list: index = int(subitem[0]) - 1 aNode.add_succ(listOfNodes[index].get_id()) else: # subitem should be a number which is a terminal label terminalNode = Nodes.Node(NodeData()) terminalNode.data.taxon = str(subitem) tree.add(terminalNode, aNode.get_id()) listOfNodes.append(root) for childIndex in nodeList[-1]: # this connects nodes to the root if type(childIndex) == list: index = int(childIndex[0]) - 1 root.add_succ(listOfNodes[index].get_id()) else: terminalNode = Nodes.Node(NodeData()) terminalNode.data.taxon = str(childIndex) tree.add(terminalNode, root.get_id()) for index in range(len(labelList)): internalLabel = labelList[index] node = listOfNodes[index] node.data.taxon = internalLabel return tree # def applyInternalLabelsToTreeUsingSchreiberText(geneTree, schreiberText): # lines = schreiberText.split("\n") # nodeList = eval(lines[0]) # labelList = eval(lines[1]) # for item in nodeList: # for subitem in item: def runSoftParsMapRooting(workdirPath): softparsmap = subprocess.Popen(["java", "softparsmap.Compute", "prop.xml", "root_example", "unrooted"], cwd=workdirPath, stdout=sys.stderr) softparsmap.wait() rootedTreeText = (file(os.path.join(workdirPath, "target", "family1_rooted.tree")).read()).strip()[:-1] return Tree(rootedTreeText, values_are_support=True) def runSoftParsMapDuplicationMapping(workdirPath): softparsmap = subprocess.Popen(["java", "softparsmap.Compute", "prop.xml", "map_example", "rooted"], cwd=workdirPath, stdout=sys.stderr) softparsmap.wait() schreiberOutput = (file(os.path.join(workdirPath, "target", "family1_map.info")).read()).strip() labeledTree = parseSchreiber(schreiberText=schreiberOutput) return labeledTree def fixupRootedTreeTaxonNames(rootedTree, origGeneTree): for nodeID in rootedTree.get_terminals(): taxonNumber = rootedTree.node(nodeID).data.taxon taxonName = origGeneTree.node(int(taxonNumber)).data.taxon rootedTree.node(nodeID).data.taxon = taxonName def main(): speciesTreeText = file(sys.argv[1]).read().strip() speciesTree = Tree(speciesTreeText[:-1]) # removing semicolon geneTreeText = file(sys.argv[2]).read().strip() geneTree = Tree(geneTreeText[:-1], values_are_support=True) # removing semicolon workdirPath = tempfile.mkdtemp() #print workdirPath treedirPath = os.path.join(workdirPath, "trees") os.mkdir(treedirPath) namesFile = file(os.path.join(workdirPath, "names.dmp"), "w") nodesFile = file(os.path.join(workdirPath, "nodes.dmp"), "w") seqsFile = file(os.path.join(workdirPath, "db.drw"), "w") geneTreeFile = file(os.path.join(treedirPath, "family1.tree"), "w") propertiesFile = file(os.path.join(workdirPath, "prop.xml"), "w") namesFile.write(namesFormatForTree(speciesTree)) namesFile.close() nodesFile.write(nodesFormatForTree(speciesTree)) nodesFile.close() seqsFile.write(xmlSequencesTextForGeneTree(geneTree)) seqsFile.close() geneTreeText = newickForTree(numericTaxaTree(geneTree), support=True) + ";\n" geneTreeFile.write(geneTreeText) geneTreeFile.close() propertiesFile.write(propertyXML) propertiesFile.close() indexTaxonomyFiles(workdirPath) rootedTree = runSoftParsMapRooting(workdirPath) labeledTree = runSoftParsMapDuplicationMapping(workdirPath) fixupRootedTreeTaxonNames(labeledTree, geneTree) treeString = labeledTree.to_string(internal_taxon_labels=True) print treeString[treeString.find("("):] if __name__ == '__main__': main()