lib/bio/db/newick.rb in bio-1.1.0 vs lib/bio/db/newick.rb in bio-1.2.0

- old
+ new

@@ -4,22 +4,33 @@ # Copyright:: Copyright (C) 2004-2006 # Naohisa Goto <ng@bioruby.org> # Daniel Amelang <dan@amelang.net> # License:: The Ruby License # -# $Id: newick.rb,v 1.7 2007/04/05 23:35:40 trevor Exp $ +# $Id: newick.rb,v 1.8 2007/12/12 16:06:22 ngoto Exp $ # +# == Description +# +# This file contains parser and formatter of Newick and NHX. +# +# == References +# +# * http://evolution.genetics.washington.edu/phylip/newick_doc.html +# * http://www.phylosoft.org/forester/NHX.html +# +require 'strscan' require 'bio/tree' module Bio class Tree #--- # newick output #+++ + # default options DEFAULT_OPTIONS = { :indent => ' ' } def __get_option(key, options) if (r = options[key]) != nil then @@ -30,14 +41,30 @@ DEFAULT_OPTIONS[key] end end private :__get_option + + # formats Newick label (unquoted_label or quoted_label) + def __to_newick_format_label(str, options) + if __get_option(:parser, options) == :naive then + return str.to_s + end + str = str.to_s + if /([\(\)\,\:\[\]\_\'\x00-\x1f\x7f])/ =~ str then + # quoted_label + return "\'" + str.gsub(/\'/, "\'\'") + "\'" + end + # unquoted_label + return str.gsub(/ /, '_') + end + private :__to_newick_format_label + # formats leaf def __to_newick_format_leaf(node, edge, options) - label = get_node_name(node).to_s + label = __to_newick_format_label(get_node_name(node), options) dist = get_edge_distance_string(edge) bs = get_node_bootstrap_string(node) @@ -60,11 +87,11 @@ private :__to_newick_format_leaf # formats leaf for NHX def __to_newick_format_leaf_NHX(node, edge, options) - label = get_node_name(node).to_s + label = __to_newick_format_label(get_node_name(node), options) dist = get_edge_distance_string(edge) bs = get_node_bootstrap_string(node) @@ -163,15 +190,18 @@ private :__to_newick # Returns a newick formatted string. # If block is given, the order of the node is sorted # (as the same manner as Enumerable#sort). - # Description about options. - # :indent : indent string; set false to disable (default: ' ') - # :bootstrap_style : :disabled disables bootstrap representations - # :traditional traditional style - # :molphy Molphy style (default) + # + # Available options: + # <tt>:indent</tt>:: + # indent string; set false to disable (default: ' ') + # <tt>:bootstrap_style</tt>:: + # <tt>:disabled</tt> disables bootstrap representations. + # <tt>:traditional</tt> for traditional style. + # <tt>:molphy</tt> for Molphy style (default). def output_newick(options = {}, &block) #:yields: node1, node2 root = @root root ||= self.nodes.first return '();' unless root __to_newick([], root, 0, :__to_newick_format_leaf, options, &block) + @@ -183,12 +213,15 @@ # Returns a NHX (New Hampshire eXtended) formatted string. # If block is given, the order of the node is sorted # (as the same manner as Enumerable#sort). - # Description about options. - # :indent : indent string; set false to disable (default: ' ') + # + # Available options: + # <tt>:indent</tt>:: + # indent string; set false to disable (default: ' ') + # def output_nhx(options = {}, &block) #:yields: node1, node2 root = @root root ||= self.nodes.first return '();' unless root __to_newick([], root, 0, @@ -255,17 +288,32 @@ Node = Bio::Tree::Node # Creates a new Newick object. # _options_ for parsing can be set. # - # Note: molphy-style bootstrap values may be parsed, even if - # the options[:bootstrap_style] is set to :traditional or :disabled. - # Note: By default, if all of the internal node's names are numeric + # Available options: + # <tt>:bootstrap_style</tt>:: + # <tt>:traditional</tt> for traditional bootstrap style, + # <tt>:molphy</tt> for molphy style, + # <tt>:disabled</tt> to ignore bootstrap strings. + # For details of default actions, please read the notes below. + # <tt>:parser</tt>:: + # <tt>:naive</tt> for using naive parser, compatible with + # BioRuby 1.1.0, which ignores quoted strings and + # do not convert underscores to spaces. + # + # Notes for bootstrap style: + # Molphy-style bootstrap values may always be parsed, even if + # the <tt>options[:bootstrap_style]</tt> is set to + # <tt>:traditional</tt> or <tt>:disabled</tt>. + # + # Note for default or traditional bootstrap style: + # By default, if all of the internal node's names are numeric # and there are no NHX and no molphy-style boostrap values, # the names of internal nodes are regarded as bootstrap values. - # options[:bootstrap_style] = :disabled or :molphy to disable the feature - # (or at least one NHX tag exists). + # <tt>options[:bootstrap_style] = :disabled</tt> or <tt>:molphy</tt> + # to disable the feature (or at least one NHX tag exists). def initialize(str, options = nil) str = str.sub(/\;(.*)/m, ';') @original_string = str @entry_overrun = $1 @options = (options or {}) @@ -306,61 +354,70 @@ def __get_option(key, options) options[key] or (@options ? @options[key] : nil) end # Parses newick formatted leaf (or internal node) name. - def __parse_newick_leaf(str, node, edge, options) - case str - when /(.*)\:(.*)\[(.*)\]/ - node.name = $1 - edge.distance_string = $2 if $2 and !($2.strip.empty?) - # bracketted string into bstr - bstr = $3 - when /(.*)\[(.*)\]/ - node.name = $1 - # bracketted string into bstr - bstr = $2 - when /(.*)\:(.*)/ - node.name = $1 - edge.distance_string = $2 if $2 and !($2.strip.empty?) - else - node.name = str + def __parse_newick_leaf(leaf_tokens, node, edge, options) + t = leaf_tokens.shift + if !t.kind_of?(Symbol) then + node.name = t + t = leaf_tokens.shift end - # determines NHX or Molphy-style bootstrap - if bstr and !(bstr.strip.empty?) + if t == :':' then + t = leaf_tokens.shift + if !t.kind_of?(Symbol) then + edge.distance_string = t if t and !(t.strip.empty?) + t = leaf_tokens.shift + end + end + + if t == :'[' then + btokens = leaf_tokens case __get_option(:original_format, options) when :nhx # regarded as NHX string which might be broken - __parse_nhx(bstr, node, edge) + __parse_nhx(btokens, node, edge) when :traditional # simply ignored else - case bstr + case btokens[0].to_s.strip + when '' + # not automatically determined when /\A\&\&NHX/ # NHX string # force to set NHX mode @options[:original_format] = :nhx - __parse_nhx(bstr, node, edge) + __parse_nhx(btokens, node, edge) else # Molphy-style boostrap values # let molphy mode if nothing determined @options[:original_format] ||= :molphy + bstr = '' + while t = btokens.shift and t != :']' + bstr.concat t.to_s + end node.bootstrap_string = bstr - end #case bstr + end #case btokens[0] end end + if !btokens and !leaf_tokens.empty? then + # syntax error? + end + node.name ||= '' # compatibility for older BioRuby + # returns true true end # Parses NHX (New Hampshire eXtended) string - def __parse_nhx(bstr, node, edge) - a = bstr.split(/\:/) - a.shift if a[0] == '&&NHX' - a.each do |str| + def __parse_nhx(btokens, node, edge) + btokens.shift if btokens[0] == '&&NHX' + btokens.each do |str| + break if str == :']' + next if str.kind_of?(Symbol) tag, val = str.split(/\=/, 2) case tag when 'B' node.bootstrap_string = val when 'D' @@ -389,63 +446,154 @@ end end #each true end + # splits string to tokens + def __parse_newick_tokenize(str, options) + str = str.chop if str[-1..-1] == ';' + # http://evolution.genetics.washington.edu/phylip/newick_doc.html + # quoted_label ==> ' string_of_printing_characters ' + # single quote in quoted_label is '' (two single quotes) + # + + if __get_option(:parser, options) == :naive then + ary = str.split(/([\(\)\,\:\[\]])/) + ary.collect! { |x| x.strip!; x.empty? ? nil : x } + ary.compact! + ary.collect! do |x| + if /\A([\(\)\,\:\[\]])\z/ =~ x then + x.intern + else + x + end + end + return ary + end + + tokens = [] + ss = StringScanner.new(str) + + while !(ss.eos?) + if ss.scan(/\s+/) then + # do nothing + + elsif ss.scan(/[\(\)\,\:\[\]]/) then + # '(' or ')' or ',' or ':' or '[' or ']' + t = ss.matched + tokens.push t.intern + + elsif ss.scan(/\'/) then + # quoted_label + t = '' + while true + if ss.scan(/([^\']*)\'/) then + t.concat ss[1] + if ss.scan(/\'/) then + # single quote in quoted_label + t.concat ss.matched + else + break + end + else + # incomplete quoted_label? + break + end + end #while true + unless ss.match?(/\s*[\(\)\,\:\[\]]/) or ss.match?(/\s*\z/) then + # label continues? (illegal, but try to rescue) + if ss.scan(/[^\(\)\,\:\[\]]+/) then + t.concat ss.matched.lstrip + end + end + tokens.push t + + elsif ss.scan(/[^\(\)\,\:\[\]]+/) then + # unquoted_label + t = ss.matched.strip + t.gsub!(/[\r\n]/, '') + # unquoted underscore should be converted to blank + t.gsub!(/\_/, ' ') + tokens.push t unless t.empty? + + else + # unquoted_label in end of string + t = ss.rest.strip + t.gsub!(/[\r\n]/, '') + # unquoted underscore should be converted to blank + t.gsub!(/\_/, ' ') + tokens.push t unless t.empty? + ss.terminate + + end + end #while !(ss.eos?) + + tokens + end + + # get tokens for a leaf + def __parse_newick_get_tokens_for_leaf(ary) + r = [] + while t = ary[0] and t != :',' and t != :')' and t != :'(' + r.push ary.shift + end + r + end + # Parses newick formatted string. def __parse_newick(str, options = {}) # initializing root = Node.new cur_node = root edges = [] nodes = [ root ] internal_nodes = [] node_stack = [] # preparation of tokens - str = str.chop if str[-1..-1] == ';' - ary = str.split(/([\(\)\,])/) - ary.collect! { |x| x.strip!; x.empty? ? nil : x } - ary.compact! + ary = __parse_newick_tokenize(str, options) previous_token = nil # main loop while token = ary.shift #p token case token - when ',' - if previous_token == ',' or previous_token == '(' then + when :',' + if previous_token == :',' or previous_token == :'(' then # there is a leaf whose name is empty. ary.unshift(token) ary.unshift('') token = nil end - when '(' + when :'(' node = Node.new nodes << node internal_nodes << node node_stack.push(cur_node) cur_node = node - when ')' - if previous_token == ',' or previous_token == '(' then + when :')' + if previous_token == :',' or previous_token == :'(' then # there is a leaf whose name is empty. ary.unshift(token) ary.unshift('') token = nil else edge = Edge.new - next_token = ary[0] - if next_token and next_token != ',' and next_token != ')' then - __parse_newick_leaf(next_token, cur_node, edge, options) - ary.shift + leaf_tokens = __parse_newick_get_tokens_for_leaf(ary) + token = nil + if leaf_tokens.size > 0 then + __parse_newick_leaf(leaf_tokens, cur_node, edge, options) end parent = node_stack.pop raise ParseError, 'unmatched parentheses' unless parent edges << Bio::Relation.new(parent, cur_node, edge) cur_node = parent end else leaf = Node.new edge = Edge.new - __parse_newick_leaf(token, leaf, edge, options) + ary.unshift(token) + leaf_tokens = __parse_newick_get_tokens_for_leaf(ary) + token = nil + __parse_newick_leaf(leaf_tokens, leaf, edge, options) nodes << leaf edges << Bio::Relation.new(cur_node, leaf, edge) end #case previous_token = token end #while