require 'transmem'
require 'xml_style_parser'

class TopPred ; end


class TopPred::Index < Hash
  include TransmemIndex

  # we need to match whatever function toppred uses to generate identifiers if
  # we want derivative processes to be fast and accurate
  def reference_to_key(reference)
    if reference
      ri = reference.index(' ')
      frst =
        if ri 
          reference[0...reference.index(' ')]
        else 
          reference
        end
      if frst
        frst.gsub(/[^0-9a-zA-Z]/,'_')
      else
        nil
      end
    else
      nil
    end
  end

  def initialize(file, kind=:default)
    case kind
    when :default
      TopPred.default_index(file, self)
    else
      abort "can't do #{kind}"
    end
  end

  # This class will probably change its interface some in the future
  # That's the web portal
  # http://bioweb.pasteur.fr/seqanal/interfaces/toppred.html
  # How to run:
  # uncheck 'Produce hydrophobicity graph image (-g)'
  # choose 'Xml' or 'New: new text' output
  # type in your email, then hit 'Run toppred'
end

class TopPred
  include TransmemIndex

  # returns the default index
  def self.default_index(file, index={})
    TopPred::Parser.new(TopPred::Parser.filetype(file)).file_to_index(file, index)
  end

end

module TopPred::Parser
  # returns :xml or :text
  def self.filetype(file)
    File.open(file) do |fh|
      case fh.gets
      when /<\?xml version.*>/
        :xml 
      when /Algorithm specific/
        :text
      else
        nil
      end
    end
  end

  # type = :xml or :text
  def self.new(parser_type=:xml)
    klass = 
      case parser_type
      when :xml
        TopPred::Parser_XML
      when :text
        TopPred::Parser_Text
      else
        abort "don't recognize parser type: #{parser_type}"
      end
    klass.new
  end

  def file_to_index(file, index={})
    File.open(file) {|fh| to_index(fh, index) }
  end

  # where each segment = [prob, first, last] and aaseq is a string each
  # segment may also be a hash => first, last, probability (adding key
  # 'aaseq') 
  # first/last '1' indexed returns segments where each is [prob,
  # first, last, aaseq] or hash (above)
  def add_sequences_to_segments(segments, aaseq)
    if segments.first.is_a? Array
      segments.each do |seg|
        first_index = seg[1] - 1
        length = (seg[2] - seg[1]) + 1
        seg.push( aaseq[first_index, length] )
      end
    else
      segments.each do |seg|
        first_index = seg[:start] - 1
        length = (seg[:stop] - seg[:start]) + 1
        seg[:aaseq] = ( aaseq[first_index, length] )
      end
    end
    segments
  end


end

module TopPred::Parser_XML
  include TopPred::Parser
  include XMLStyleParser

  def self.new(meth=:to_index)
    parser = XMLStyleParser.choose_parser(self, meth).new
    @method = meth
    parser
  end

  def parse(file)
    send(@method, file)
  end
end

class TopPred::Parser_XML::DOM
  include TopPred::Parser_XML
  include XMLStyleParser

=begin
  YAL010C: 
  num_putative_transmembrane_segments: 1
  aaseq: MLPYMDQVLRAFYQSTHWSTQNSYEDITATSRTLLDFRIPSAIHLQISNKSTPNTFNSLDFSTRSRINGSLSYLYSDAQQLEKFMRNSTDIPLQDATETYRQLQPNLNFSVSSANTLSSDNTTVDNDKKLLHDSKFVKKSLYYGRMYYPSSDLEAMIIKRLSPQTQFMLKGVSSFKESLNVLTCYFQRDSHRNLQEWIFSTSDLLCGYRVLHNFLTTPSKFNTSLYNNSSLSLGAEFWLGLVSLSPGCSTTLRYYTHSTNTGRPLTLTLSWQPLFGHISSTYSAKTGTNSTFCAKYDFNLYSIESNLSFGCEFWQKKHHLLETNKNNNDKLEPISDELVDINPNSRATKLLHENVPDLNSAVNDIPSTLDIPVHKQKLLNDLTYAFSSSLRKIDEERSTIEKFDNKINSSIFTSVWKLSTSLRDKTLKLLWEGKWRGFLISAGTELVFTRGFQESLSDDEKNDNAISISATDTENGNIPVFPAKFGIQFQYST
  best_structure_probability: 1.0
  transmembrane_segments: 
  - aaseq: SLGAEFWLGLVSLSPGCSTTL
    stop: 252
    start: 232
    probability: 1.0
  num_certain_transmembrane_segments: 1
  num_found: 2
=end

  # should return a index
  def to_index(io, index = {})
    get_root_node_from_io(io) do |toppreds_n|

      abort if toppreds_n.name != 'toppreds'
      toppreds_n.find('child::toppred').each do |toppred_n|
        att_hash = {}
        sequence_n = toppred_n.find_first('child::sequence')
        index[sequence_n['id']] = att_hash
        att_hash[:aaseq] = sequence_n.content.gsub(/[\s\n]/,'')
        abort if att_hash[:aaseq].size != sequence_n['size'].to_i
        tmsummary_n = sequence_n.find_first('following-sibling::tmsummary')

        num_found = tmsummary_n['segments'].to_i
        att_hash[:num_found] = num_found
        if num_found > 0

          num_certain_transmembrane_segments = 0
          num_putative_transmembrane_segments = 0
          tmsummary_n.find('child::segment').each do |segment_n|
            abort if segment_n.name != 'segment'
            case segment_n['type']
            when 'certain'
              num_certain_transmembrane_segments += 1
            else # putative
              num_putative_transmembrane_segments += 1
            end
          end
          att_hash[:num_putative_transmembrane_segments] = num_putative_transmembrane_segments
          att_hash[:num_certain_transmembrane_segments] = num_certain_transmembrane_segments

          topologies_n = tmsummary_n.next
          abort if topologies_n.name != 'topologies'
          # get the top probability topology:
          top_prob_topology_n = topologies_n.find('child::topology').to_a.max {|a,b| a['prob'].to_f <=> b['prob'].to_f }
          tmsegments = []
          top_prob_topology_n.find('child::tmsegment').each do |tmsegment_n|
            tmhash = {}
            tmhash[:start] = tmsegment_n['start'].to_i
            tmhash[:stop] = tmsegment_n['stop'].to_i
            ## WARNING! it appears the probability is broken on xml output!!
            tmhash[:probability] = tmsegment_n['prob'].to_f
            tmsegments << tmhash
          end
          add_sequences_to_segments(tmsegments, att_hash[:aaseq])
          att_hash[:transmembrane_segments] = tmsegments
        end
      end
    end
    index
  end

end

class TopPred::Parser_Text
  include TopPred::Parser


  # returns a hash structure in this form: {identifier => {aaseq => String,
  # num_found: Int, num_certain_transmembrane_segments => Int,
  # num_putative_transmembrane_segments => Int, best_structure_probability =>
  # Float, transmembrane_segments => [probability => Float, start => Int, stop
  # => Int, aaseq => String] } }
  def to_index(io, index={})
    current_record = nil

    io.each do |line|
      if line =~ /^Sequence : (.*?) +\(/
        current_identifier = $1.dup
        index[current_identifier] = {}
        current_record = index[current_identifier]
        current_record[:aaseq] = read_aaseq(io)
        read_segment_summary(io, current_record)
      elsif line =~ /^HEADER\s+START\s+STOP/
        top_struc = top_structure( read_structures(io) )
        current_record[:best_structure_probability] = top_struc[:probability]
        current_record[:transmembrane_segments] = top_struc[:tm]
        add_sequences_to_segments(current_record[:transmembrane_segments], current_record[:aaseq])
        segment_arrays_to_hashes(current_record[:transmembrane_segments])
      end
    end
    index
  end

  private 

  # returns a list of all structures given a filehandle starting just after
  # the first "HEADER START STOP ..." line
  def read_structures(fh)
    structures = []
    loop do
      structures.push( read_structure(fh) )
      break if fh.eof?
      line = fh.readline
      unless line =~ /^HEADER\s+START\s+STOP/
        break
      end
    end
    structures
  end

  # returns a hash with key :probability and key :tm contains an array of
  # arrays: [prob(Float), start(Int), stop(Int)]
  def read_structure(fh)
    structure = {}
    # READ the first line
    line = fh.readline
    structure[:probability] = line.split(/\s+/)[2].to_f
    structure[:tm] = read_segments(fh)
    structure
  end

  # returns an array of arrays of transmembrane segments: [prob(Float),
  # start(Int), stop(Int)]
  # returns after seeing '//'
  def read_segments(fh)
    segments = []
    st = Regexp.escape('//') ; end_regex = /#{st}/
    fh.each do |line|
      if line =~ /^TRANSMEM/
        (header, start, stop, len, prob) = line.split(/\s+/)[0,5]
        segments << [prob.to_f, start.to_i, stop.to_i]
      elsif line =~ end_regex
        break
      end
    end
    segments
  end

  # returns the top probability structure (first on tie)
  def top_structure(list)
    top_prob = list.first[:probability]
    top_struc = list.first
    list.each do |st|
      if st[:probability] > top_prob 
        top_struc = st
        top_prob = st[:probability]
      end
    end
    top_struc
  end

  def read_aaseq(fh)
    aaseq = '' 
    fh.each do |line|
      line.chomp!
      unless line =~ /[\w\*]/
        break
      end
      aaseq << line 
    end
    aaseq
  end

  def segment_arrays_to_hashes(list)
    list.map! do |ar|
      { :probability => ar[0],
      :start => ar[1],
      :stop => ar[2],
      :aaseq => ar[3],
      }
    end
  end

  # returns [certain, putative]
  # expects first line to be a tm segment
  def num_certain_putative(fh)
    certain = 0
    putative = 0
    fh.each do |line|
      certainty = line.chomp.split(/\s+/).last
      if !certainty
        break
      else
        certain += 1 if certainty == 'Certain'
        putative += 1 if certainty == 'Putative'
      end
    end
    [certain, putative]
  end

  def read_segment_summary(fh, rec)
    fh.each do |line|
      if line =~ /Found: (.*?) segments/
        rec[:num_found] = $1.to_i
        break if rec[:num_found] == 0
      elsif line =~ /Helix\s+Begin/
        (cert, putat) = num_certain_putative(fh) 
        rec[:num_certain_transmembrane_segments] = cert
        rec[:num_putative_transmembrane_segments] = putat
        break
      end
    end
  end
end

class TopPred::Parser_XML::LibXML < TopPred::Parser_XML::DOM
  def get_root_node_from_io(io, &block)
    # turn off warnings because this doesn't seem to work:
    # XML::Parser.default_load_external_dtd = false
    # (There is a warning about not finding DTD)
    xml_parser_warnings = XML::Parser.default_warnings
    XML::Parser.default_warnings = false
    doc = XML::Parser.io(io).parse
    root = doc.root
    block.call(root)
    # reset the warning level of XML::Parser:
    XML::Parser.default_warnings = xml_parser_warnings
  end
end

class TopPred::Parser_XML::AXML < TopPred::Parser_XML::DOM
  def get_root_node_from_io(io, &block)
    root = ::AXML.parse(io)
    block.call(root)
  end
end