lib/scaffolder.rb in scaffolder-0.2.6 vs lib/scaffolder.rb in scaffolder-0.4.0

- old
+ new

@@ -1,51 +1,215 @@ require 'delegate' require 'bio' +# == Quick start +# +# Given a fasta file containing two sequences. +# +# >seqA +# GCGCGC +# >seqB +# ATATAT +# +# A simple genome scaffold containing the two sequences is specified as a YAML +# formatted text file shown below. Each dash (-) indicates a region in the +# scaffold. In the example below the keyword *sequence* inserts a sequence from +# the fasta file, the keyword *source* identifies that seqA should be used. +# +# --- +# - sequence: +# source: 'seqA' +# - sequence: +# source: 'seqB' +# +# The scaffolder API can then be used as follows to generate a complete +# sequence. +# +# scaffold = Scaffolder.new('scaffold.yml','sequences.fasta') +# sequence = scaffold.inject(String.new) do |build,entry| +# build << entry.sequence +# end +# puts sequence # Prints GCGCGCATATAT +# +# == The Scaffold File +# +# The above example is simplified to demonstrates basic usage. The sections +# below outline the types of regions that can be used in the scaffold file. +# +# === Sequence Regions +# +# Contigs sequences in the scaffold are specified using the *sequence* keyword. +# The *source* keyword should specifies the sequence to use from the fasta file +# and should match the first space delimited word in the fasta header. +# +# ==== Sub-Sequences +# +# When generating a scaffolder only a subset of a sequence may be required. +# Inserting sub-sequences into the scaffold is specified using the *start* and +# *stop* keywords. All of the sequence before the start coordinate is ignored +# and all of sequence after the stop coordinate is ignored, meaning only the +# sequence between the start and stop position inclusively is used in the +# scaffold. +# +# --- +# - sequence: +# source: 'sequence1' +# start: 42 +# stop: 1764 +# +# ==== Reverse Complementation +# +# The *reverse* keyword specifies that the selected sequence is reversed +# complemented. +# +# --- +# - sequence: +# source: 'sequence1' +# reverse: true +# +# === Insert Regions +# +# Sequence contigs may contain gaps, for example where the sequence could not +# be correctly resolved during assembly. Additional sequencing may however +# produce sequences that can be used to fill these gaps. These inserts can be +# added to a sequence using the *insert* keyword and specifying a YAML array of +# the inserts. Multiple inserts can be specified, each separated by a dash (-) +# followed by a new line. +# +# --- +# - sequence: +# source: 'sequence1' +# inserts: +# - +# source: 'insert1' +# open: 3 +# close: 10 +# +# ==== Insert Position +# +# The location where an insert is added to a sequence is defined by either the +# *open*, *close* keywords, or both. This defines where the host sequence is +# 'opened' and 'closed' to add the insert. If only one parameter is used, for +# example using *open*, then the close position is determined from the length +# of the insert sequence and vice versa. +# +# ==== Insert Sub-Sequence +# +# An insert can be subsequenced in the same way as a sequence using the *start* +# and *stop* keywords. Similarly the insert sequence can be reverse completed +# using the *reverse* keyword. +# +# --- +# - sequence: +# source: 'sequence1' +# inserts: +# - +# source: 'insert1' +# open: 3 +# close: 10 +# start: 8 +# stop: 16 +# reverse: true +# +# +# === Unresolved Regions +# +# There may be regions in between sequences in the genome which are unknown but +# which the approximate length is. These can be specified in the scaffold file +# using the *unresolved* keyword. Unresolved regions are filled with 'N' +# nucleotide characters equal to the value specified by the *length* keyword. +# +# --- +# - unresolved: +# length: 10 +# +# === Scaffold File Processing Order +# +# The scaffolder API processes the regions in YAML scaffold file as follows: +# +# * Each region in the scaffold in processed in the order specified in the +# scaffolder file. +# * If the region is a sequence and inserts are specified, the inserts are +# sorted by stop position, then processed from last to first. Each insert is +# processed as follows: +# +# * The insert is subsequenced if specified. +# * The insert is reverse complemented if specified. +# * The insert is added to each host sequence replacing the region of +# sequence specified by the open and close co-ordinates. +# * The host sequence stop position is extended by the difference in length +# that the insert sequence fills. For example if a 5 base pair insert fills +# a 4 base region, the host sequence stop position is increased by the +# difference: 1. +# * The region is subsequenced if specified. +# * The region is reverse complemented if specified. +# +# === WARNING +# +# Inserts with overlapping *open* and *close* regions in the same sequence will +# cause unexpected behaviour and should be avoided. +# class Scaffolder < DelegateClass(Array) - autoload :Region, 'scaffolder/region' - autoload :Insert, 'scaffolder/insert' - autoload :Sequence, 'scaffolder/sequence' + require 'scaffolder/errors' + require 'scaffolder/region' + include Scaffolder::Errors + + # Source is a reserved keyword. The 'source' keyword identifies the + # which corresponding fasta sequence should be retreived from the fasta + # file. + SOURCE = 'source' + + # Raw_sequence is a reserved keyword. The 'raw_sequence' keyword points to + # the sequence from the fasta file identified by the 'source' keyword. + RAW_SEQUENCE = 'raw_sequence' + + # @param [Hash] assembly Produced from loading the scaffold file using YAML.load + # @param [String] sequence Location of the fasta file corresponding to the + # scaffold sequences + # @return [Array] Returns an array of scaffold regions + # @example + # Scaffolder.new(YAML.load('scaffold.yml'),'sequences.fasta') def initialize(assembly,sequence) - @sequences = Hash[ *Bio::FlatFile::auto(sequence).collect { |s| + sequences = Hash[ *Bio::FlatFile::auto(sequence).collect { |s| [s.definition.split.first,s.seq] }.flatten] super(assembly.map do |entry| type, data = entry.keys.first, entry.values.first - case type - when 'unresolved' - Scaffolder::Region.new(:unresolved,'N'*data['length']) - when 'sequence' - sequence = Scaffolder::Sequence.new( - :name => data['source'], - :start => data['start'], - :end => data['end'], - :reverse => data['reverse'], - :sequence => fetch_sequence(data['source']) - ) - if data['inserts'] - sequence.add_inserts(data['inserts'].map do |insert| - Scaffolder::Insert.new( - :start => insert['start'], - :stop => insert['stop'], - :reverse => insert['reverse'], - :sequence => fetch_sequence(insert['source']) - ) - end) - end - sequence - else - raise ArgumentError.new("Unknown tag: #{type}") - end + # Source is the only reserved keyword. Fetches sequence from fasta file. + data = Scaffolder.update_with_sequence(data,sequences) + + Scaffolder::Region[type].generate(data) end) end - def fetch_sequence(name) - sequence = @sequences[name] - raise ArgumentError.new("Missing sequence: #{name}") unless sequence - sequence + # Inserts corresponding fasta data into scaffold data hash. Every hash + # that contains the reserved 'source' keyword has the 'raw_sequence' keyword + # added for the corresponding fasta sequence from the fasta file. + # @param [Hash] data The scaffold hash + # @param [Hash] seqs A hash with identifier => sequence key/value pairs from + # the fasta sequence data. + # @return [Hash] The data hash updated with the 'raw_sequence' sequence + # keyword data. + # @raise [UnkownSequenceError] if the source keyword is used but + # there is no corresponding fasta sequence entry + def self.update_with_sequence(data,seqs) + if data.instance_of? Array + data.each{|a| update_with_sequence(a,seqs) } + else + if data[SOURCE] + sequence = seqs[data[SOURCE]] + if sequence.nil? + raise UnknownSequenceError.new("Unknown sequence: #{data[SOURCE]}") + end + data.merge!({RAW_SEQUENCE => sequence}) + end + data.select{|k,v| v.respond_to? :each}.each do |key,hash| + update_with_sequence(hash,seqs) + end + end + data end end