module SequenceServer
  # Module to collect some sequence-related helper functions
  module SequenceHelpers

    # copied from bioruby's Bio::Sequence
    # returns a Hash. Eg: composition("asdfasdfffffasdf")
    #                      => {"a"=>3, "d"=>3, "f"=>7, "s"=>3}
    def composition(sequence_string)
      count = Hash.new(0)
      sequence_string.scan(/./) do |x|
        count[x] += 1
      end
      return count
    end

    # Strips all non-letter characters. guestimates sequence based on that.
    # If less than 10 useable characters... returns nil
    # If more than 90% ACGTU returns :nucleotide. else returns :protein
    def guess_sequence_type(sequence_string)
      cleaned_sequence = sequence_string.gsub(/[^A-Z]/i, '')  # removing non-letter characters
      cleaned_sequence.gsub!(/[NX]/i, '')                     # removing ambiguous  characters

      return nil if cleaned_sequence.length < 10 # conservative

      composition = composition(cleaned_sequence)       
      composition_NAs    = composition.select { |character, count|character.match(/[ACGTU]/i) } # only putative NAs
      putative_NA_counts = composition_NAs.collect { |key_value_array| key_value_array[1] }     # only count, not char
      putative_NA_sum    = putative_NA_counts.inject { |sum, n| sum + n }                       # count of all putative NA
      putative_NA_sum    = 0 if putative_NA_sum.nil?

      if putative_NA_sum > (0.9 * cleaned_sequence.length)
        return :nucleotide
      else
        return :protein
      end
    end

    # splits input at putative fasta definition lines (like ">adsfadsf"), guesses sequence type for each sequence. 
    # if not enough sequence to determine, returns nil.
    # if 2 kinds of sequence mixed together, raises ArgumentError
    # otherwise, returns :nucleotide or :protein
    def type_of_sequences(fasta_format_string)
      # the first sequence does not need to have a fasta definition line
      sequences = fasta_format_string.split(/^>.*$/).delete_if { |seq| seq.empty? }

      # get all sequence types
      sequence_types = sequences.collect { |seq|  guess_sequence_type(seq) }.uniq.compact

      return nil if sequence_types.empty?

      if sequence_types.length == 1
        return sequence_types.first # there is only one (but yes its an array)
      else 
        raise ArgumentError, "Insufficient info to determine sequence type. Cleaned queries are: #{ sequences.to_s }"
      end
    end

    # Return the database type that can be used for a given blast method.
    #   db_type_for("blastp")  => :protein
    #   db_type_for("tblastn") => :nucleotide
    #   db_type_for(nil)       => nil
    def db_type_for(blast_method)
      case blast_method
      when 'blastp', 'blastx'
        :protein
      when 'blastn', 'tblastx', 'tblastn'
        :nucleotide
      end
    end

    # Return the blast methods that can be used for a given type of sequence.
    #   blast_methods_for(:protein)     => ['blastp', 'tblastn']
    #   blast_methods_for(:nucleotide)  => ['blastn','tblastx','blastx']
    #   blast_methods_for(nil)          => ['blastp', 'tblastn','blastn','tblastx','blastx']
    def blast_methods_for(seq_type)
      case seq_type
      when :protein
        ['blastp', 'tblastn']
      when :nucleotide
        ['blastn','tblastx','blastx']
      else # Sequence type not predicted, so don't make any assumptions about the blast method
        ['blastp', 'tblastn','blastn','tblastx','blastx']
      end
    end

    def sequence_from_blastdb(ids, db)  # helpful when displaying parsed blast results
      # we know how to handle an Array of ids
      ids = ids.join(',') if ids.is_a? Array

      # we don't know what to do if the arguments ain't String
      raise TypeError unless ids.is_a? String and db.is_a? String

      # query now!
      #
      # If `blastdbcmd` throws error, we assume sequence not found.
      blastdbcmd = settings.binaries['blastdbcmd']
      command = %x|#{blastdbcmd} -db #{db} -entry #{ids} 2> /dev/null|
    end

    # Given a sequence_id and databases, apply the default (standard)
    # way to convert a sequence_id into a hyperlink, so that the
    # blast results include hyperlinks.
    def construct_standard_sequence_hyperlink(options)
      if options[:sequence_id].match(/^[^ ]/) #if there is a space right after the '>', makeblastdb was run without -parse_seqids
        # By default, add a link to a fasta file of the sequence (if makeblastdb was called with -parse_seqids)
        complete_id = options[:sequence_id][/^(\S+)\s*.*/, 1]  # get id part
        id = complete_id.include?('|') ? complete_id.split('|')[1] : complete_id.split('|')[0]
        @all_retrievable_ids ||= []
        @all_retrievable_ids.push(id)

        link = "/get_sequence/?id=#{id}&db=#{options[:databases].join(' ')}" # several dbs... separate by ' '
        return link
      else
        # do nothing - link == nil means no link will be incorporated
        return nil
      end
    end
  end
end