require 'forwardable'
require 'shellwords'

# Define Sequence class.
module SequenceServer
  # Provides simple sequence processing utilities via class methods. An
  # instance of the class serves as a simple data object that captures
  # sequences fetched from BLAST databases.
  #
  # NOTE:
  #   What all do we need to consistently construct FASTA from `blastdbcmd's`
  #   output?
  #
  #   It would seem rather straightforward. But it's not.
  #
  #   FASTA format:
  #
  #     >id title
  #     actual sequence
  #
  #   ID of a sequence fetched from nr database should look like this:
  #
  #     gi|322796550|gb|EFZ19024.1| -> self.id
  #                     accession   -> self.accession
  #                     ----------
  #                   sequence id   -> self.seqid
  #                  -------------
  #        ---------
  #        gi number                -> self.gi
  #
  #   while for local databases, the id should be the exact same,
  #   as in the original FASTA file:
  #
  #     SI2.2.0_06267 -> self.id == self.accession
  Sequence = Struct.new(:gi, :seqid, :accession, :title, :value) do
    # Normalises the raw field values (gi, seqid, accession, title and value,
    # in that order) before storing them in the struct.
    def initialize(*fields)
      # A gi given as 'N/A' is stored as nil.
      fields[0] = nil if fields[0] == 'N/A'

      # Drop a leading 'lcl|' from seqid.
      fields[1] = fields[1].gsub(/^lcl\|/, '')

      # A seqid starting with 'gnl|' marks a hit from a database created
      # without -parse_seqids. The first word of the title field then becomes
      # the seqid and the remaining words become the title.
      if fields[1] =~ /^gnl\|/
        first_word, *remaining = fields[3].split
        fields[1] = first_word
        fields[3] = remaining.join(' ')
      end

      super
    end

    # Returns FASTA sequence id: seqid, preceded by 'gi|<gi>|' when gi is set.
    def id
      parts = [seqid]
      parts.unshift('gi', gi) if gi
      parts.join('|')
    end

    # Returns the number of characters in the sequence value.
    def length
      value.length
    end

    # Returns the sequence value itself.
    def to_s
      value
    end

    # Returns value, id and title as a Hash.
    def info
      { value: value, id: id, title: title }
    end

    # Returns FASTA formatted sequence: the defline followed by the sequence
    # broken into lines of at most 60 characters.
    def fasta
      width = 60
      line_count = (length / width.to_f).ceil
      body = Array.new(line_count) { |n| to_s[width * n, width] }
      [">#{id} #{title}", *body].join("\n")
    end
  end

  # Utility methods.
  class Sequence
    class << self
      # Guesses whether the given sequence is nucleotide or protein.
      #
      # Non-letter characters and the ambiguity codes N and X are ignored.
      # Returns `nil` if fewer than 10 characters remain. Returns
      # `:nucleotide` if more than 90% of the remaining characters are A, C,
      # G, T or U (in either case), else `:protein`.
      def guess_type(sequence)
        letters = sequence.gsub(/[^A-Z]/i, '')
        cleaned = letters.gsub(/[NX]/i, '')

        return if cleaned.length < 10 # conservative

        # Add up the occurrences of every putative nucleotide character.
        na_count = composition(cleaned).reduce(0) do |total, (char, count)|
          char =~ /[ACGTU]/i ? total + count : total
        end

        if na_count > (0.9 * cleaned.length)
          :nucleotide
        else
          :protein
        end
      end

      # Copied from BioRuby's `Bio::Sequence` class.
      #
      # Returns a Hash mapping each character matched by `/./` to the number
      # of times it occurs in the given string.
      #
      # > composition("asdfasdfffffasdf")
      # => {"a"=>3, "d"=>3, "f"=>7, "s"=>3}
      def composition(sequence_string)
        sequence_string.scan(/./).each_with_object(Hash.new(0)) do |char, tally|
          tally[char] += 1
        end
      end
    end

    # Retrieve sequences from BLAST databases.
    class Retriever
      extend Forwardable

      # Forward `config` and `sys` calls made on a Retriever to the
      # SequenceServer module.
      def_delegators SequenceServer, :config, :sys

      # Provides IO for Retriever similar to BLAST::Formatter. We dynamically
      # extend Retriever object with this module if file download has been
      # requested (here it must be remembered that Retriever is used by
      # sequence viewer and FASTA download links).
      module IO
        # Returns handle to a temporary file to which data should be written to
        # or read from.
        def file
          # Create the temporary file on first use and hand back the same
          # handle on every later call.
          return @file if @file

          @file = Tempfile.new(filename)
        end

        # Returns a file name to use for the temporary file.
        def filename
          @filename ||= begin
            count = sequence_ids.length
            # One id: name the file after it. Two or more ids: use their
            # count. No ids: the label stays empty.
            label = if count == 1
                      sequence_ids.first
                    elsif count >= 2
                      "#{count}_hits"
                    end
            "sequenceserver-#{label}.fa"
          end
        end

        # Returns mime type to use if this file were to be transferred over
        # Internet.
        def mime
          # Symbolic name only. NOTE(review): presumably the caller resolves
          # this to an actual MIME type - confirm.
          :fasta
        end

        private

        # Writes the error messages followed by the sequences to the temporary
        # file. The file handle is closed even if writing raises, so a failed
        # write does not leave the handle open. Returns nil.
        def write
          file.open
          begin
            write_error_msgs
            write_sequences
          ensure
            file.close
          end
          nil
        end

        # Write error messages to file. Expects file to be open.
        def write_error_msgs
          error_msgs.each do |title, body|
            # Heading first, then every line of the message, each one
            # prefixed with '# '.
            [title, *body.lines].each { |text| file.puts "# #{text}" }
          end
        end

        # Write sequence data to file. Expects file to be open.
        def write_sequences
          # One FASTA record per retrieved sequence, in the order held by
          # `sequences`.
          sequences.each { |record| file.puts(record.fasta) }
        end
      end

      # Stores the requested ids (each coerced to an array) and the in_file
      # flag, checks the database ids with `validate` and, when that passes,
      # fetches the sequences with `run`.
      def initialize(sequence_ids, database_ids, in_file = false)
        @sequence_ids = Array(sequence_ids)
        @database_ids = Array(database_ids)
        @in_file      = in_file

        run if validate
      end

      # Values given to the constructor; both id lists are coerced to arrays.
      attr_reader :sequence_ids, :database_ids, :in_file

      # Sequence objects parsed from the output of blastdbcmd.
      attr_reader :sequences

      # Returns a JSON document with two keys: `error_msgs` (the list returned
      # by `error_msgs`) and `sequences` (the `info` Hash of every sequence).
      #
      # Any arguments are forwarded to Hash#to_json. JSON generators call
      # `to_json` with a state argument, so accepting it lets a Retriever be
      # serialised through them as well as by a direct `to_json` call.
      def to_json(*args)
        {
          error_msgs: error_msgs,
          sequences:  sequences.map(&:info)
        }.to_json(*args)
      end

      private

      # Fetches the requested sequences by running `blastdbcmd` and parses each
      # line of its tab-separated output (gi, seqid, accession, title,
      # sequence) into a Sequence, stored in @sequences. If `in_file` is set,
      # the object is extended with the IO module and the result is written
      # to file.
      def run
        # Database names and sequence ids are shell-escaped so that quotes or
        # other shell metacharacters in them cannot break out of the argument.
        # blastdbcmd still receives the names as one space-separated argument
        # and the ids as one comma-separated argument.
        command = "blastdbcmd -outfmt '%g\t%i\t%a\t%t\t%s'" \
                  " -db #{Shellwords.escape(database_names.join(' '))}" \
                  " -entry #{Shellwords.escape(sequence_ids.join(','))}"

        out, = sys(command, path: config[:bin])

        @sequences = out.each_line.map do |line|
          # Stop codons in amino acid sequence databases show up as invalid
          # UTF-8 characters in the output and cause the subsequent call to
          # `split` to fail. We replace invalid UTF-8 characters with X.
          line = line.encode('UTF-8', invalid: :replace, replace: 'X')
          Sequence.new(*line.chomp.split("\t"))
        end

        return unless in_file

        extend(IO)
        write
      end

      def database_names
        # `name` of every entry that Database[] returns for the requested ids.
        databases = Database[database_ids]
        databases.map(&:name)
      end

      def database_titles
        # `title` of every entry that Database[] returns for the requested ids.
        databases = Database[database_ids]
        databases.map(&:title)
      end

      # Returns true when `database_ids` is a non-empty Array and as many of
      # the ids listed by `Database.ids` occur in it as it has elements.
      # Raises ArgumentError listing the known ids otherwise.
      def validate
        known_ids = Database.ids
        all_known = database_ids.is_a?(Array) && !database_ids.empty? &&
                    (known_ids & database_ids).length == database_ids.length
        return true if all_known

        raise ArgumentError,
              "Database id should be one of: #{known_ids.join("\n")}."
      end

      # rubocop:disable Metrics/MethodLength
      # Returns a list of [heading, message] pairs describing problems with
      # the retrieval. The list is empty when as many sequences were found as
      # were requested; otherwise it holds a single entry explaining the
      # mismatch.
      def error_msgs
        return [] if sequences.length == sequence_ids.length

        heading = 'ERROR: incorrect number of sequences found.'
        message = <<~MSG
          You requested #{sequence_ids.length} sequence(s) with the following
          identifiers:
            #{sequence_ids.join(', ')}
          from the following databases:
            #{database_titles.join(', ')}
          but we found #{sequences.length} sequence(s).

          This is likely due to a problem with how databases are formatted.
          Please share this text with the person managing this website.

          If you are the admin and are confident that your databases are
          correctly formatted, you have likely encountered a weird bug.
          In this case, please raise an issue at:
          https://github.com/wurmlab/sequenceserver/issues

          If any sequences were retrieved, you can find them below
          (but some may be incorrect, so be careful!)
        MSG

        [[heading, message]]
      end
      # rubocop:enable Metrics/MethodLength
    end
  end
end

# References
# ----------
# [1]: http://blast.ncbi.nlm.nih.gov/blastcgihelp.shtml