require 'bio-velvet'
require 'bio'
require 'pry'

class Bio::FinishM::GraphGenerator
  include Bio::FinishM::Logging

  DEFAULT_OPTIONS = {
    :velvet_kmer_size => 51,
    :assembly_coverage_cutoff => 3.5,
    }

  def add_options(option_parser, options)
    options.merge!(DEFAULT_OPTIONS)
    option_parser.on("--assembly-kmer NUMBER", "when assembling, use this kmer length [default: #{options[:velvet_kmer_size] }]") do |arg|
      options[:velvet_kmer_size] = arg.to_i
    end
    option_parser.on("--assembly-coverage-cutoff NUMBER", "Require this much coverage in each node, all other nodes are removed [default: #{options[:assembly_coverage_cutoff] }]") do |arg|
      options[:assembly_coverage_cutoff] = arg.to_f
    end
    option_parser.on("--post-assembly-coverage-cutoff NUMBER", "Require this much coverage in each node, implemented after assembly [default: not used]") do |arg|
      options[:post_assembly_coverage_cutoff] = arg.to_f
    end
    option_parser.on("--velvet-directory PATH", "Output assembly intermediate files to this directory [default: use temporary directory, delete afterwards]") do |arg|
      options[:output_assembly_path] = arg
    end
    option_parser.on("--already-assembled-velvet-directory PATH", "If an assembly directory has been specified previously with --velvet-directory, re-use this assembly rather than re-doing the assembly [default: off]") do |arg|
      options[:previous_assembly] = arg
    end
  end

  # Generate a ProbedGraph object, given one or more 'probe sequences'
  # and metagenomic reads. This is a rather large method, but seems to
  # be approximately repeated in different applications of FinishM, so
  # creating it for DRY purposes.
  #
  # probe_sequences: DNA sequences (as String objects whose direction points to the outsides of contigs)
  # read_inputs: a ReadInput object, containing the information to feed to velveth
  #
  # options:
  # :probe_reads: a list of sequence numbers (numbering as per velvet Sequence file)
  # :probe_read_names: a list of sequence names (not IDs) that are probes (convert the names to IDs using the CnyUnifiedSeqNames file). There may not be a one to one correspondence of these read names and the probe reads returned in the ProbedGraph since reads can map to multiple sequence IDs.
  # :velvet_kmer_size: kmer
  # :assembly_coverage_cutoff: coverage cutoff for nodes
  # :post_assembly_coverage_cutoff: apply this coverage cutoff to nodes after parsing assembly
  # :output_assembly_path: write assembly to this directory
  # :previous_assembly: a velvet directory from a previous run of the same probe sequences and reads. (Don't re-assemble)
  # :use_textual_sequence_file: by default, a binary sequence file is used. Set this true to get velvet to generate the Sequences file
  # :remove_unconnected_nodes: delete nodes from the graph that are not connected to the probe nodes
  # :graph_search_leash_length: when :remove_unconnected_nodes'ing, use this leash length
  # :dont_parse_noded_reads: if true, skip parsing noded reads (ie the positions of the reads in the graph)
  # :dont_parse_reads: if true, skip parsing reads (ie the sequences of the reads themselves)
  def generate_graph(probe_sequences, read_inputs, options={})
    options[:parse_sequence_file] ||= true
    graph = nil
    read_probing_graph = nil
    finishm_graph = Bio::FinishM::ProbedGraph.new

    log.debug "Options for generate_graph: #{options}" if log.debug?

    velvet_binary_folder = File.join(File.dirname(__FILE__),'..','..','ext','src')
    log.debug "Using velvet binary folder #{velvet_binary_folder}" if log.debug?

    velvet_result = nil

    probe_read_ids = nil
    if options[:probe_reads]
      probe_read_ids = options[:probe_reads]
    else
      probe_read_ids = Set.new((1..probe_sequences.length))
    end
    if options[:previous_assembly].nil? #If assembly has not already been carried out
      Tempfile.open('probes.fa') do |tempfile|
        50.times do # Do 50 times to make sure that velvet doesn't throw out parts of the graph that contain this contig
          probe_sequences.each_with_index do |probe, i|
            tempfile.puts ">probe#{i}"
            tempfile.puts probe
          end
        end
        tempfile.close
        singles = read_inputs.fasta_singles
        if singles and !singles.empty?
          read_inputs.fasta_singles = [tempfile.path, singles].flatten
        else
          read_inputs.fasta_singles = [tempfile.path]
        end
        log.debug "Inputting probes into the assembly:\n#{File.open(tempfile.path).read}" if log.debug?

        runner = Bio::Velvet::Runner.new
        required_version = '1.2.10-wwood_finishm'
        found_version = runner.binary_version(File.join(velvet_binary_folder, 'velveth'))
        if found_version != required_version
          raise "Detected velvet version incompatible with FinishM: #{found_version}, expected #{required_version} which is available from https://github.com/wwood/velvet (on branch less_clipping)"
        end

        log.info "Assembling sampled reads with velvet"
        raise "Need to specify -cov_cutoff" if options[:assembly_coverage_cutoff].nil?
        raise "Need to specify a kmer size" if options[:velvet_kmer_size].nil?
        # Bit of a hack, but have to use -short1 as the anchors because then start and end anchors will have node IDs 1,2,... etc.
        use_binary = options[:use_textual_sequence_file] ? '' : '-create_binary'
        velvet_result = runner.velvet(
          options[:velvet_kmer_size],
          "#{read_inputs.velvet_read_arguments} #{use_binary}",
          "-read_trkg yes -cov_cutoff #{options[:assembly_coverage_cutoff] } -tour_bus no -read_to_node_binary yes",
          :output_assembly_path => options[:output_assembly_path],
          :velveth_path => File.join(velvet_binary_folder, 'velveth'),
          :velvetg_path => File.join(velvet_binary_folder, 'velvetg'),
          )
        if log.debug?
          log.debug "velveth stdout: #{velvet_result.velveth_stdout}"
          log.debug "velveth stderr: #{velvet_result.velveth_stderr}"
          log.debug "velvetg stdout: #{velvet_result.velvetg_stdout}"
          log.debug "velvetg stderr: #{velvet_result.velvetg_stderr}"
        end
        log.info "Finished running assembly"
        finishm_graph.velvet_result_directory = velvet_result.result_directory
      end
    else
      log.info "Using previous assembly stored in #{options[:previous_assembly] }"
      velvet_result = Bio::Velvet::Result.new
      velvet_result.result_directory = options[:previous_assembly]
      finishm_graph.velvet_result_directory = velvet_result.result_directory
    end

    # Check that the probe reads given are present in the assembly passed here
    unless options[:dont_parse_reads]
      sequence_store = parse_velvet_binary_reads(velvet_result.result_directory)
      finishm_graph.velvet_sequences = sequence_store
      if !check_probe_sequences(probe_sequences, sequence_store)
        raise "Probe sequences changed since previous velvet assembly!"
      end
    end

    log.info "Parsing the graph output from velvet"
    opts = {
      # noded reads are parsed in via C, if they are wanted at all
      :dont_parse_noded_reads => true
    }
    bio_velvet_graph = Bio::Velvet::Graph.parse_from_file(
      File.join(velvet_result.result_directory, 'LastGraph'),
      opts
      )
    log.info "Finished parsing graph: found #{bio_velvet_graph.nodes.length} nodes and #{bio_velvet_graph.arcs.length} arcs"

    if options[:dont_parse_noded_reads]
      graph = bio_velvet_graph
    else
      log.info "Beginning parse of graph using velvet's parsing C code.."
      read_probing_graph = Bio::Velvet::Underground::Graph.parse_from_file File.join(velvet_result.result_directory, 'LastGraph')
      log.info "Completed velvet code parsing velvet graph"

      # Make the two graphs into a hybrid one
      graph = Bio::FinishM::HybridGraph.new(bio_velvet_graph, read_probing_graph)
    end
    finishm_graph.graph = graph

    # Find the anchor nodes again
    anchor_sequence_ids = probe_read_ids.to_a.sort
    endings = []
    unless probe_read_ids.empty? and options[:probe_read_names].nil? #don't bother trying to find probes if none exists
      # Convert read names to read IDs if required
      if options[:probe_read_names]
        # Probe reads are given as names, not IDs. What are the corresponding probes then?
        entries = Bio::Velvet::CnyUnifiedSeqNamesFile.extract_entries_using_grep_hack(
          File.join(velvet_result.result_directory, 'CnyUnifiedSeq.names'),
          options[:probe_read_names]
          )
        anchor_sequence_ids = []
        double_counts = 0
        options[:probe_read_names].each do |name| #maintain order of them as they are specified in the original array parameter
          if entries[name].empty?
            raise "Unable to find probe `#{name}' in the probe reads file - was it included in the assembly?"
          elsif entries[name].length > 2
            raise "Found >2 sequences named #{name} in the assembly, being conservative and not continuing"
          else
            entries[name].each do |res|
              anchor_sequence_ids.push res.read_id
            end
            if entries[name].length == 2
              double_counts += 1
              log.debug "Found 2 sequences for #{name}" if log.debug?
            end
          end
        end
        if double_counts > 0
          log.info "#{double_counts} reads were found twice (likely as pairs), including both as probes"
        end
        log.info "Recovered #{anchor_sequence_ids.length} sequences using their names" if log.info?
      end


      # Parse the read to node structure
      log.info "Reading ReadToNode.bin file.." if log.info?
      finishm_graph.read_to_nodes = Bio::FinishM::ReadToNode.new(File.join(velvet_result.result_directory, 'ReadToNode.bin'))

      finder = Bio::AssemblyGraphAlgorithms::NodeFinder.new
      log.info "Finding probe nodes in the assembly"
      c_graph_endings = finder.find_probes_from_read_to_node(finishm_graph.graph, finishm_graph.read_to_nodes, anchor_sequence_ids)
      log.debug "Converting probe nodes found in C graph to Ruby analogues and adding to Ruby-parsed graph"
      endings = c_graph_endings.collect do |node_direction_read|
        if node_direction_read.empty?
          # No probe found
          []
        else #found a node.
          #equivalent node
          node = graph.nodes[node_direction_read[0].node_id]
          #equivalent direction
          direction = node_direction_read[1]
          #equivalent noded read
          nr = Bio::Velvet::Graph::NodedRead.new
          #           nr.read_id = read_id
          #           nr.offset_from_start_of_node = row[1].to_i
          #           nr.start_coord = row[2].to_i
          #           nr.direction = current_node_direction
          cnr = node_direction_read[2]
          nr.read_id = cnr.read_id
          nr.offset_from_start_of_node = cnr.offset_from_start_of_node
          nr.start_coord = cnr.start_coord
          nr.direction = direction
          # collect
          [node, direction, nr]
        end
      end
    end
    finishm_graph.probe_nodes = endings.collect{|array| array[0]}
    finishm_graph.probe_node_directions = endings.collect{|array| array[1]}
    finishm_graph.probe_node_reads = endings.collect{|array| array[2]}

    # Check to make sure the probe sequences map to nodes in the graph
    if finishm_graph.completely_probed?
      if log.info?
        found_all = true
        num_found = 0
        finishm_graph.probe_nodes.each_with_index do |probe,i|
          if probe.nil?
            found_all = false
            log.debug "Unable to recover probe ##{i+1}, perhaps this will cause problems, but proceding optimistically"
          else
            num_found += 1
          end
        end
        if found_all
          if finishm_graph.probe_nodes.empty?
            log.debug "No probes specified, so didn't find any"
          else
            log.info "Found all anchoring nodes in the graph."
          end
        else
          log.info "Found #{num_found} of #{finishm_graph.probe_nodes.length} anchoring nodes in the graph, ignoring the rest"
        end
      end
    else
      raise "Unable to find all anchor reads from the assembly, cannot continue. This is probably an error with this script, not you. Probes not found: #{finishm_graph.missing_probe_indices.inspect}"
    end

    if options[:post_assembly_coverage_cutoff]
      log.info "Removing nodes with coverage < #{options[:post_assembly_coverage_cutoff] } from graph.."
      original_num_nodes = graph.nodes.length
      original_num_arcs = graph.arcs.length
      filter = Bio::AssemblyGraphAlgorithms::CoverageBasedGraphFilter.new
      filter.remove_low_coverage_nodes(graph,
        options[:post_assembly_coverage_cutoff],
        :whitelisted_sequences => Set.new(anchor_sequence_ids)
      )
      log.info "Removed #{original_num_nodes-graph.nodes.length} nodes and #{original_num_arcs-graph.arcs.length} arcs, leaving #{graph.nodes.length} nodes and #{graph.arcs.length} arcs."
    end

    if options[:remove_unconnected_nodes]
      if options[:graph_search_leash_length]
        log.info "Removing nodes unconnected to probe nodes from the graph using leash #{options[:graph_search_leash_length] }.."
      else
        log.info "Removing nodes unconnected to probe nodes from the graph without using a leash.."
      end
      original_num_nodes = graph.nodes.length
      original_num_arcs = graph.arcs.length
      filter = Bio::AssemblyGraphAlgorithms::ConnectivityBasedGraphFilter.new
      filter.remove_unconnected_nodes(
        graph,
        finishm_graph.probe_nodes.reject{|n| n.nil?},
        :leash_length => options[:graph_search_leash_length]
        )
      log.info "Removed #{original_num_nodes-graph.nodes.length} nodes and #{original_num_arcs-graph.arcs.length} arcs, leaving #{graph.nodes.length} nodes and #{graph.arcs.length} arcs."
    end

    return finishm_graph
  end

  # Read in the reads from a velvet result
  def parse_velvet_binary_reads(velvet_result_directory)
    sequences_file_path = File.join velvet_result_directory, 'CnyUnifiedSeq'
    log.info "Reading in the actual sequences of all reads from #{sequences_file_path}"
    sequences = Bio::Velvet::Underground::BinarySequenceStore.new sequences_file_path
    log.info "Read in #{sequences.length} sequences"
    return sequences
  end

  # When re-using an assembly, sometimes need to make
  # sure that the probe sequences used previously are the same
  # as what is given this time. Given am Array of probe sequences
  # and a binary_sequence_file, check the probe sequences are the
  # consistent.
  def check_probe_sequences(probe_sequences, sequence_store)
    return true if probe_sequences.nil?

    probe_sequences.each_with_index do |probe, i|
      log.debug "Checking probe sequence \##{i+1}" if log.debug?
      if sequence_store[i+1].upcase != probe.upcase
        log.error "Probe sequence \##{i+1} has changed - perhaps the wrong velvet assembly directory was specified, or a fresh assembly is required?"
        return false
      end
    end
    log.debug "Presence of #{probe_sequences.length} probe sequences verified"
    return true
  end
end