#
# = bio/io/flatfile/autodetection.rb - file format auto-detection
#
#   Copyright (C) 2001-2006 Naohisa Goto <ng@bioruby.org>
#
# License:: The Ruby License
#
#  $Id:$
#
#
#  See documents for Bio::FlatFile::AutoDetect and Bio::FlatFile.
#

require 'tsort'
require 'bio/io/flatfile'

module Bio

  class FlatFile

    # AutoDetect automatically determines database class of given data.
    class AutoDetect

      include TSort

      # Array subclass used to store autodetection rules.
      # The only thing it adds to Array is a compact #inspect.
      class RulesArray < Array
        # Returns a string made of the inspected names of the contained
        # elements, separated by single spaces and wrapped in brackets.
        # Every element is expected to respond to +name+.
        def inspect
          names = map { |rule| rule.name.inspect }
          '[' + names.join(' ') + ']'
        end
      end #class RulesArray

      # Template of a single rule of autodetection.
      #
      # Concrete rules subclass this class and override #guess.
      # Every rule has a name and two lists that record which other
      # rules have higher and lower priority than itself.
      class RuleTemplate
        # Creates a new element.
        # All arguments are passed to +new+ as they are.
        def self.[](*arg)
          self.new(*arg)
        end

        # Creates a new element with empty priority lists and no name.
        def initialize
          @higher_priority_elements = RulesArray.new
          @lower_priority_elements  = RulesArray.new
          @name = nil
        end

        # Declares that self is prior to the _elem_.
        #
        # self is appended to the higher priority elements of _elem_,
        # and _elem_ is appended to the lower priority elements of self.
        # Returns true. When _elem_ is self, nothing is recorded and
        # nil is returned.
        def is_prior_to(elem)
          return nil if self == elem
          elem.higher_priority_elements << self
          self.lower_priority_elements << elem
          true
        end

        # higher priority elements
        attr_reader :higher_priority_elements
        # lower priority elements
        attr_reader :lower_priority_elements

        # database classes
        # (nil in this template, which never sets them)
        attr_reader :dbclasses

        # unique name of the element
        attr_accessor :name

        # If given text (and/or meta information) is known, returns
        # the database class.
        # Otherwise, returns nil or false.
        # This template implementation always returns nil.
        #
        # _text_ will be a String.
        # _meta_ will be a Hash.
        # _meta_ may contain following keys.
        # :path => pathname, filename or uri.
        def guess(text, meta)
          nil
        end

        private
        # Gets constant from constant name given as a string.
        #
        # _str_ is a "::"-separated constant path resolved from Object,
        # e.g. "Bio::GenBank". A leading "::" (absolute path such as
        # "::Bio::GenBank") is accepted and ignored.
        # Raises NameError when a component cannot be resolved.
        def str2const(str)
          # Without stripping, a leading "::" would produce an empty
          # first component and const_get('') would raise NameError.
          names = str.sub(/\A\:\:/, '').split(/\:\:/)
          names.inject(Object) do |scope, const_name|
            scope.const_get(const_name)
          end
        end

        # Gets database class from given object.
        # Current implementation is: 
        # if _obj_ is kind of String, regarded as a constant.
        # Otherwise, returns _obj_ as is.
        def get_dbclass(obj)
          obj.kind_of?(String) ? str2const(obj) : obj
        end
      end #class RuleTemplate

      # RuleDebug is a rule for debugging autodetect classes/methods.
      # It never detects anything; it only reports what it is given.
      class RuleDebug < RuleTemplate
        # Creates a new instance whose name is _name_.
        def initialize(name)
          super()
          @name = name
        end

        # Prints the rule name and the inspected _text_ and _meta_
        # to $stderr, one item per line.
        # Always returns nil, so that detection continues with
        # the following rules.
        def guess(text, meta)
          $stderr.puts(@name, text.inspect, meta.inspect)
          nil
        end
      end #class RuleDebug

      # Special element that is always top or bottom priority.
      #
      # It carries no priority lists of its own: both readers return a
      # fresh empty array on every call, so anything appended to them
      # (for example by RuleTemplate#is_prior_to) is discarded.
      class RuleSpecial < RuleTemplate
        # Creates a new instance whose name is _name_.
        def initialize(name)
          # RuleTemplate#initialize is not called here; the priority
          # lists it would create are never read because both readers
          # are overridden below.
          @name = name
        end

        # modification of @name is inhibited.
        # Always raises RuntimeError.
        def name=(x)
          raise RuntimeError, 'cannot modify name'
        end

        # always returns a new empty array
        def higher_priority_elements
          Array.new
        end

        # always returns a new empty array
        def lower_priority_elements
          Array.new
        end
      end #class RuleSpecial

      # Special element that is always top priority.
      # AutoDetect#initialize adds it to every AutoDetect object.
      # A rule can still be placed before it with
      # <tt>rule.is_prior_to(TopRule)</tt>.
      TopRule = RuleSpecial.new('top')
      # Special element that is always bottom priority.
      # AutoDetect#initialize adds it to every AutoDetect object.
      # A rule can still be placed after it with
      # <tt>BottomRule.is_prior_to(rule)</tt>.
      BottomRule = RuleSpecial.new('bottom')

      # An autodetection rule that uses a single regular expression.
      class RuleRegexp < RuleTemplate
        # Creates a new instance.
        #
        # _dbclass_ is the database class, or its name as a String
        # (resolved lazily on first use).
        # _re_ is the regular expression tested against the text.
        # The rule name is the string form of _dbclass_.
        def initialize(dbclass, re)
          super()
          @name = dbclass.to_s
          @re = re
          @dbclass_lazy = dbclass
          @dbclass = nil
        end

        # database class (lazy evaluation)
        # The class is resolved once and then cached.
        def dbclass
          @dbclass ||= get_dbclass(@dbclass_lazy)
        end
        private :dbclass

        # returns database classes
        # (an array containing the single database class)
        def dbclasses
          [ dbclass ]
        end

        # If given text matches the regexp, returns the database class.
        # Otherwise, returns nil.
        # _meta_ is ignored.
        def guess(text, meta)
          return nil unless @re =~ text
          dbclass
        end
      end #class RuleRegexp

      # An autodetection rule that uses any number of regular expressions.
      # If given string matches one of the regular expressions,
      # returns the database class.
      class RuleRegexp2 < RuleRegexp
        # Creates a new instance.
        #
        # _dbclass_ is the database class or its name as a String.
        # _regexps_ are the regular expressions to be tried in order.
        def initialize(dbclass, *regexps)
          # the single-regexp slot of RuleRegexp is left unused (nil)
          super(dbclass, nil)
          @regexps = regexps
        end

        # If given text matches one of the regexps, returns the
        # database class.
        # Otherwise, returns nil.
        # _meta_ is ignored.
        def guess(text, meta)
          matched = @regexps.any? { |re| re =~ text }
          matched ? dbclass : nil
        end
      end #class RuleRegexp2

      # An autodetection rule that passes data to a proc object.
      class RuleProc < RuleTemplate
        # Creates a new instance.
        #
        # _dbclasses_ are database classes, or their names as Strings
        # (resolved lazily on first use).
        # The given block is called by #guess with the text to examine.
        # The rule name is the string forms of _dbclasses_ joined by '|'.
        def initialize(*dbclasses, &block)
          super()
          @proc = block
          @dbclasses = nil
          @dbclasses_lazy = dbclasses
          @name = dbclasses.map { |klass| klass.to_s }.join('|')
        end

        # database classes (lazy evaluation)
        # The classes are resolved once and then cached.
        def dbclasses
          @dbclasses ||= @dbclasses_lazy.map { |klass| get_dbclass(klass) }
        end

        # Calls the proc with _text_ and returns its result, which
        # should be the database class when the text is known,
        # or nil or false otherwise.
        #
        # Note that _meta_ is accepted for compatibility with
        # RuleTemplate#guess, but it is not passed to the proc.
        def guess(text, meta)
          @proc.call(text)
        end
      end #class RuleProc
      
      # Creates a new AutoDetect object that contains only the two
      # special elements TopRule and BottomRule.
      def initialize
        # autodetection rules, keyed by rule name
        @rules = {}
        # cache of the sorted elements (built on demand by #elements)
        @elements = nil
        [ TopRule, BottomRule ].each { |special| add(special) }
      end

      # Adds a new element and clears the cache of sorted elements.
      # Raises RuntimeError if an element of the same name has
      # already been added.
      # Returns _elem_.
      def add(elem)
        rule_name = elem.name
        raise 'element name conflicts' if @rules[rule_name]
        @elements = nil
        @rules[rule_name] = elem
        elem
      end

      # (required by TSort.)
      # Yields every registered element to the given block.
      def tsort_each_node(&block)
        @rules.each_value(&block)
      end

      # (required by TSort.)
      # For a given element, yields each child
      # (= lower priority elements) of the element.
      #
      # * TopRule: every registered element except TopRule itself and
      #   the elements declared prior to TopRule.
      # * BottomRule: every registered element declared to come after
      #   BottomRule.
      # * any other element: its lower priority elements except
      #   BottomRule, then BottomRule itself unless the element is
      #   declared to come after BottomRule.
      def tsort_each_child(elem)
        if elem == TopRule then
          @rules.each_value do |rule|
            next if rule == TopRule
            next if rule.lower_priority_elements.include?(TopRule)
            yield rule
          end
        elsif elem == BottomRule then
          @rules.each_value do |rule|
            yield rule if rule.higher_priority_elements.include?(BottomRule)
          end
        else
          elem.lower_priority_elements.each do |rule|
            yield rule unless rule == BottomRule
          end
          yield BottomRule unless elem.higher_priority_elements.include?(BottomRule)
        end
      end

      # Returns current elements as an array
      # whose order fulfills all elements' priorities
      # (the reverse of the order given by TSort#tsort).
      # The array is cached until the cache is cleared by #add or #rehash.
      def elements
        @elements ||= tsort.reverse
      end

      # Rebuilds the internal hash of rules and clears the cache of
      # sorted elements, so that the next call of #elements sorts
      # the rules again.
      # Returns nil.
      def rehash
        @rules.rehash
        # forget the cached ordering
        @elements = nil
      end

      # visualizes the object (mainly for debug)
      # Returns the class name followed by the inspected names of
      # the elements in priority order, wrapped in angle brackets.
      def inspect
        names = self.elements.map { |rule| rule.name.inspect }
        "<#{self.class} #{names.join(' ')}>"
      end

      # Iterates over each element, in the order given by #elements.
      def each_rule(&block) #:yields: elem
        elements.each(&block)
      end

      # Autodetect from the text.
      #
      # Each element is asked in priority order through its +guess+
      # method with _text_ and _meta_; the first true value obtained
      # is returned.
      # Returns a database class if succeeded.
      # If failed, returns what the last element returned (nil or
      # false), or nil when there are no elements.
      def autodetect(text, meta = {})
        outcome = nil
        # find stops at the first element whose guess is true;
        # outcome always holds the most recent guess result
        elements.find { |rule| outcome = rule.guess(text, meta) }
        outcome
      end

      # autodetect from the FlatFile object.
      #
      # _ff_ is an object whose @stream instance variable responds to
      # +prefetch_gets+ and +prefetch_buffer+, and optionally +path+.
      # At most _lines_ lines are read from the stream, and detection
      # is tried after every line that is not blank.
      #
      # Returns a database class if succeeded.
      # Returns nil if failed.
      def autodetect_flatfile(ff, lines = 31)
        meta = {}
        stream = ff.instance_eval { @stream }
        # streams without a usable path method simply have no path
        path = begin
                 stream.path
               rescue NameError
                 nil
               end
        if path then
          meta[:path] = path
          # call autodetect once with meta and without any read action
          detected = autodetect(stream.prefetch_buffer, meta)
          return detected if detected
        end
        # reading stream
        1.upto(lines) do
          line = stream.prefetch_gets
          break unless line
          next if line.strip.empty?
          detected = autodetect(stream.prefetch_buffer, meta)
          return detected if detected
        end
        nil
      end

      # Default autodetect object used by the class methods
      # AutoDetect.default and AutoDetect.default=.
      # It is an instance variable of the class itself, and stays nil
      # until AutoDetect.default builds it or AutoDetect.default=
      # assigns it.
      @default = nil

      # Returns the default autodetect object.
      # It is built by make_default the first time it is needed
      # and then reused.
      def self.default
        @default ||= make_default
      end

      # Sets the default autodetect object to _ad_.
      # Assigning nil makes AutoDetect.default build a new default
      # object the next time it is called.
      def self.default=(ad)
        @default = ad
      end

      # Makes a new autodetect object and adds each of the given
      # elements to it, in the given order.
      # Returns the new object.
      def self.[](*arg)
        detector = new
        arg.each { |rule| detector.add(rule) }
        detector
      end

      # Makes the autodetect object that AutoDetect.default uses
      # when no default has been set.
      #
      # Returns a new AutoDetect object in which the built-in rules
      # are registered and their relative priorities are declared.
      #
      # Every rule is created inside the argument list of AutoDetect.[]
      # and assigned to a local variable at the same time, so that the
      # priority declarations further below can refer to it.
      # Database classes are given as strings and resolved lazily by
      # the rule objects.
      def self.make_default
        a = self[
          genbank  = RuleRegexp[ 'Bio::GenBank',
            /^LOCUS       .+ bp .*[a-z]*[DR]?NA/ ],
          genpept  = RuleRegexp[ 'Bio::GenPept',
            /^LOCUS       .+ aa .+/ ],
          medline  = RuleRegexp[ 'Bio::MEDLINE',
            /^PMID\- [0-9]+$/ ],
          embl     = RuleRegexp[ 'Bio::EMBL',
            /^ID   .+\; .*(DNA|RNA|XXX)\;/ ],
          sptr     = RuleRegexp2[ 'Bio::SPTR',
            /^ID   .+\; *PRT\;/,
            /^ID   [-A-Za-z0-9_\.]+ .+\; *[0-9]+ *AA\./ ],
          prosite  = RuleRegexp[ 'Bio::PROSITE',
            /^ID   [-A-Za-z0-9_\.]+\; (PATTERN|RULE|MATRIX)\.$/ ],
          transfac = RuleRegexp[ 'Bio::TRANSFAC',
            /^AC  [-A-Za-z0-9_\.]+$/ ],

          # An "H" line marks the text as AAindex; the block then picks
          # AAindex2 or AAindex1 from a later line, and returns false
          # while neither of those lines is present.
          aaindex  = RuleProc.new('Bio::AAindex1', 'Bio::AAindex2') do |text|
            if /^H [-A-Z0-9_\.]+$/ =~ text then
              if text =~ /^M [rc]/ then
                Bio::AAindex2
              elsif text =~ /^I    A\/L/ then
                Bio::AAindex1
              else
                false #fail to determine
              end
            else
              nil
            end
          end,

          litdb    = RuleRegexp[ 'Bio::LITDB',
            /^CODE        [0-9]+$/ ],
          pathway_module = RuleRegexp[ 'Bio::KEGG::MODULE',
            /^ENTRY       .+ Pathway\s+Module\s*/ ],
          pathway  = RuleRegexp[ 'Bio::KEGG::PATHWAY',
            /^ENTRY       .+ Pathway\s*/ ],
          brite    = RuleRegexp[ 'Bio::KEGG::BRITE',
            /^Entry           [A-Z0-9]+/ ],
          orthology = RuleRegexp[ 'Bio::KEGG::ORTHOLOGY',
            /^ENTRY       .+ KO\s*/ ],
          drug     = RuleRegexp[ 'Bio::KEGG::DRUG',
            /^ENTRY       .+ Drug\s*/ ],
          glycan   = RuleRegexp[ 'Bio::KEGG::GLYCAN',
            /^ENTRY       .+ Glycan\s*/ ],
          enzyme   = RuleRegexp2[ 'Bio::KEGG::ENZYME',
            /^ENTRY       EC [0-9\.]+$/,
            /^ENTRY       .+ Enzyme\s*/
          ],
          compound = RuleRegexp2[ 'Bio::KEGG::COMPOUND',
            /^ENTRY       C[A-Za-z0-9\._]+$/,
            /^ENTRY       .+ Compound\s*/
          ],
          reaction = RuleRegexp2[ 'Bio::KEGG::REACTION',
            /^ENTRY       R[A-Za-z0-9\._]+$/,
            /^ENTRY       .+ Reaction\s*/
          ],
          genes    = RuleRegexp[ 'Bio::KEGG::GENES',
            /^ENTRY       .+ (CDS|gene|.*RNA|Contig) / ],
          genome   = RuleRegexp[ 'Bio::KEGG::GENOME',
            /^ENTRY       [a-z]+$/ ],

          # The DOCTYPE declaration tells whether the MaXML document
          # holds clusters or sequences.
          fantom = RuleProc.new('Bio::FANTOM::MaXML::Cluster',
                                'Bio::FANTOM::MaXML::Sequence') do |text|
            if /\<\!DOCTYPE\s+maxml\-(sequences|clusters)\s+SYSTEM/ =~ text
              case $1
              when 'clusters'
                Bio::FANTOM::MaXML::Cluster
              when 'sequences'
                Bio::FANTOM::MaXML::Sequence
              else
                nil #unknown
              end
            else
              nil
            end
          end,

          pdb = RuleRegexp[ 'Bio::PDB',
            /^HEADER    .{40}\d\d\-[A-Z]{3}\-\d\d   [0-9A-Z]{4}/ ],
          het = RuleRegexp[ 'Bio::PDB::ChemicalComponent',
            /^RESIDUE +.+ +\d+\s*$/ ],

          clustal = RuleRegexp2[ 'Bio::ClustalW::Report',
          /^CLUSTAL .*\(.*\).*sequence +alignment/,
          /^CLUSTAL FORMAT for T-COFFEE/ ],

          gcg_msf = RuleRegexp[ 'Bio::GCG::Msf',
          /^!!(N|A)A_MULTIPLE_ALIGNMENT .+/ ],

          gcg_seq = RuleRegexp[ 'Bio::GCG::Seq',
          /^!!(N|A)A_SEQUENCE .+/ ],

          blastxml = RuleRegexp[ 'Bio::Blast::Report',
            /\<\!DOCTYPE BlastOutput PUBLIC / ],
          wublast  = RuleRegexp[ 'Bio::Blast::WU::Report',
            /^BLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/ ],
          wutblast = RuleRegexp[ 'Bio::Blast::WU::Report_TBlast',
            /^TBLAST.? +[\-\.\w]+\-WashU +\[[\-\.\w ]+\]/ ],
          blast    = RuleRegexp[ 'Bio::Blast::Default::Report',
            /^BLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],
          tblast   = RuleRegexp[ 'Bio::Blast::Default::Report_TBlast',
            /^TBLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],
          rpsblast   = RuleRegexp[ 'Bio::Blast::RPSBlast::Report',
            /^RPS\-BLAST.? +[\-\.\w]+ +\[[\-\.\w ]+\]/ ],

          blat   = RuleRegexp[ 'Bio::Blat::Report',
            /^psLayout version \d+/ ],
          spidey = RuleRegexp[ 'Bio::Spidey::Report',
            /^\-\-SPIDEY version .+\-\-$/ ],
          hmmer  = RuleRegexp[ 'Bio::HMMER::Report',
            /^HMMER +\d+\./ ],
          sim4   = RuleRegexp[ 'Bio::Sim4::Report',
            /^seq1 \= .*\, \d+ bp(\r|\r?\n)seq2 \= .*\, \d+ bp(\r|\r?\n)/ ],

          fastq  = RuleRegexp[ 'Bio::Fastq',
            /^\@.+(?:\r|\r?\n)(?:[^\@\+].*(?:\r|\r?\n))+/ ],

          # A line starting with ">" marks the text as FASTA-like; the
          # block then tells NBRF, FastaFormat and FastaNumericFormat
          # apart, and returns false while none of them matches.
          fastaformat = RuleProc.new('Bio::FastaFormat',
                                     'Bio::NBRF',
                                     'Bio::FastaNumericFormat') do |text|
            if /^>.+$/ =~ text
              case text
              when /^>([PF]1|[DR][LC]|N[13]|XX)\;.+/
                Bio::NBRF
              when /^>.+$\s+(^\#.*$\s*)*^\s*\d*\s*[-a-zA-Z_\.\[\]\(\)\*\+\$]+/
                  Bio::FastaFormat
              when /^>.+$\s+^\s*\d+(\s+\d+)*\s*$/
                Bio::FastaNumericFormat
              else
                false
              end
            else
              nil
            end
          end
        ]

        # dependencies
        # "x.is_prior_to y" declares that rule x is tried before rule y.
        # NCBI
        genbank.is_prior_to genpept
        # EMBL/UniProt
        embl.is_prior_to sptr
        sptr.is_prior_to prosite
        prosite.is_prior_to transfac
        # KEGG
        #aaindex.is_prior_to litdb
        #litdb.is_prior_to brite
        pathway_module.is_prior_to pathway
        pathway.is_prior_to brite
        brite.is_prior_to orthology
        orthology.is_prior_to drug
        drug.is_prior_to glycan
        glycan.is_prior_to enzyme
        enzyme.is_prior_to compound
        compound.is_prior_to reaction
        reaction.is_prior_to genes
        genes.is_prior_to genome
        # PDB
        pdb.is_prior_to het
        # BLAST
        wublast.is_prior_to wutblast
        wutblast.is_prior_to blast
        blast.is_prior_to tblast
        # Fastq
        # (placed after BottomRule, and before FastaFormat)
        BottomRule.is_prior_to(fastq)
        fastq.is_prior_to(fastaformat)
        # FastaFormat
        # (placed after BottomRule)
        BottomRule.is_prior_to(fastaformat)

        # for debug
        #debug_first = RuleDebug.new('debug_first')
        #a.add(debug_first)
        #debug_first.is_prior_to(TopRule)

        ## for debug
        #debug_last = RuleDebug.new('debug_last')
        #a.add(debug_last)
        #BottomRule.is_prior_to(debug_last)
        #fastaformat.is_prior_to(debug_last)

        ## for suppressing warnings
        ## The statement below is never executed because of "if false".
        ## It only mentions the local variables that are not used in
        ## the dependencies above (presumably to avoid warnings about
        ## unused variables).
        p medline, aaindex, litdb, fantom, clustal,
          gcg_msf, gcg_seq, blastxml, rpsblast, blat,
          spidey, hmmer, sim4 if false

        # clears the cache of sorted elements, so that the order is
        # computed from the dependencies declared above
        a.rehash
        return a
      end
      
    end #class AutoDetect
  end #class FlatFile
end #module Bio