lib/scaffolder.rb in scaffolder-0.2.6 vs lib/scaffolder.rb in scaffolder-0.4.0
- old
+ new
@@ -1,51 +1,215 @@
require 'delegate'
require 'bio'
+# == Quick start
+#
+# Given a fasta file containing two sequences.
+#
+# >seqA
+# GCGCGC
+# >seqB
+# ATATAT
+#
+# A simple genome scaffold containing the two sequences is specified as a YAML
+# formatted text file shown below. Each dash (-) indicates a region in the
+# scaffold. In the example below the keyword *sequence* inserts a sequence from
+# the fasta file, the keyword *source* identifies that seqA should be used.
+#
+# ---
+# - sequence:
+# source: 'seqA'
+# - sequence:
+# source: 'seqB'
+#
+# The scaffolder API can then be used as follows to generate a complete
+# sequence.
+#
+# scaffold = Scaffolder.new('scaffold.yml','sequences.fasta')
+# sequence = scaffold.inject(String.new) do |build,entry|
+# build << entry.sequence
+# end
+# puts sequence # Prints GCGCGCATATAT
+#
+# == The Scaffold File
+#
+# The above example is simplified to demonstrates basic usage. The sections
+# below outline the types of regions that can be used in the scaffold file.
+#
+# === Sequence Regions
+#
+# Contigs sequences in the scaffold are specified using the *sequence* keyword.
+# The *source* keyword should specifies the sequence to use from the fasta file
+# and should match the first space delimited word in the fasta header.
+#
+# ==== Sub-Sequences
+#
+# When generating a scaffolder only a subset of a sequence may be required.
+# Inserting sub-sequences into the scaffold is specified using the *start* and
+# *stop* keywords. All of the sequence before the start coordinate is ignored
+# and all of sequence after the stop coordinate is ignored, meaning only the
+# sequence between the start and stop position inclusively is used in the
+# scaffold.
+#
+# ---
+# - sequence:
+# source: 'sequence1'
+# start: 42
+# stop: 1764
+#
+# ==== Reverse Complementation
+#
+# The *reverse* keyword specifies that the selected sequence is reversed
+# complemented.
+#
+# ---
+# - sequence:
+# source: 'sequence1'
+# reverse: true
+#
+# === Insert Regions
+#
+# Sequence contigs may contain gaps, for example where the sequence could not
+# be correctly resolved during assembly. Additional sequencing may however
+# produce sequences that can be used to fill these gaps. These inserts can be
+# added to a sequence using the *insert* keyword and specifying a YAML array of
+# the inserts. Multiple inserts can be specified, each separated by a dash (-)
+# followed by a new line.
+#
+# ---
+# - sequence:
+# source: 'sequence1'
+# inserts:
+# -
+# source: 'insert1'
+# open: 3
+# close: 10
+#
+# ==== Insert Position
+#
+# The location where an insert is added to a sequence is defined by either the
+# *open*, *close* keywords, or both. This defines where the host sequence is
+# 'opened' and 'closed' to add the insert. If only one parameter is used, for
+# example using *open*, then the close position is determined from the length
+# of the insert sequence and vice versa.
+#
+# ==== Insert Sub-Sequence
+#
+# An insert can be subsequenced in the same way as a sequence using the *start*
+# and *stop* keywords. Similarly the insert sequence can be reverse completed
+# using the *reverse* keyword.
+#
+# ---
+# - sequence:
+# source: 'sequence1'
+# inserts:
+# -
+# source: 'insert1'
+# open: 3
+# close: 10
+# start: 8
+# stop: 16
+# reverse: true
+#
+#
+# === Unresolved Regions
+#
+# There may be regions in between sequences in the genome which are unknown but
+# which the approximate length is. These can be specified in the scaffold file
+# using the *unresolved* keyword. Unresolved regions are filled with 'N'
+# nucleotide characters equal to the value specified by the *length* keyword.
+#
+# ---
+# - unresolved:
+# length: 10
+#
+# === Scaffold File Processing Order
+#
+# The scaffolder API processes the regions in YAML scaffold file as follows:
+#
+# * Each region in the scaffold in processed in the order specified in the
+# scaffolder file.
+# * If the region is a sequence and inserts are specified, the inserts are
+# sorted by stop position, then processed from last to first. Each insert is
+# processed as follows:
+#
+# * The insert is subsequenced if specified.
+# * The insert is reverse complemented if specified.
+# * The insert is added to each host sequence replacing the region of
+# sequence specified by the open and close co-ordinates.
+# * The host sequence stop position is extended by the difference in length
+# that the insert sequence fills. For example if a 5 base pair insert fills
+# a 4 base region, the host sequence stop position is increased by the
+# difference: 1.
+# * The region is subsequenced if specified.
+# * The region is reverse complemented if specified.
+#
+# === WARNING
+#
+# Inserts with overlapping *open* and *close* regions in the same sequence will
+# cause unexpected behaviour and should be avoided.
+#
class Scaffolder < DelegateClass(Array)
- autoload :Region, 'scaffolder/region'
- autoload :Insert, 'scaffolder/insert'
- autoload :Sequence, 'scaffolder/sequence'
+ require 'scaffolder/errors'
+ require 'scaffolder/region'
+ include Scaffolder::Errors
+
+ # Source is a reserved keyword. The 'source' keyword identifies the
+ # which corresponding fasta sequence should be retreived from the fasta
+ # file.
+ SOURCE = 'source'
+
+ # Raw_sequence is a reserved keyword. The 'raw_sequence' keyword points to
+ # the sequence from the fasta file identified by the 'source' keyword.
+ RAW_SEQUENCE = 'raw_sequence'
+
+ # @param [Hash] assembly Produced from loading the scaffold file using YAML.load
+ # @param [String] sequence Location of the fasta file corresponding to the
+ # scaffold sequences
+ # @return [Array] Returns an array of scaffold regions
+ # @example
+ # Scaffolder.new(YAML.load('scaffold.yml'),'sequences.fasta')
def initialize(assembly,sequence)
- @sequences = Hash[ *Bio::FlatFile::auto(sequence).collect { |s|
+ sequences = Hash[ *Bio::FlatFile::auto(sequence).collect { |s|
[s.definition.split.first,s.seq]
}.flatten]
super(assembly.map do |entry|
type, data = entry.keys.first, entry.values.first
- case type
- when 'unresolved'
- Scaffolder::Region.new(:unresolved,'N'*data['length'])
- when 'sequence'
- sequence = Scaffolder::Sequence.new(
- :name => data['source'],
- :start => data['start'],
- :end => data['end'],
- :reverse => data['reverse'],
- :sequence => fetch_sequence(data['source'])
- )
- if data['inserts']
- sequence.add_inserts(data['inserts'].map do |insert|
- Scaffolder::Insert.new(
- :start => insert['start'],
- :stop => insert['stop'],
- :reverse => insert['reverse'],
- :sequence => fetch_sequence(insert['source'])
- )
- end)
- end
- sequence
- else
- raise ArgumentError.new("Unknown tag: #{type}")
- end
+ # Source is the only reserved keyword. Fetches sequence from fasta file.
+ data = Scaffolder.update_with_sequence(data,sequences)
+
+ Scaffolder::Region[type].generate(data)
end)
end
- def fetch_sequence(name)
- sequence = @sequences[name]
- raise ArgumentError.new("Missing sequence: #{name}") unless sequence
- sequence
+ # Inserts corresponding fasta data into scaffold data hash. Every hash
+ # that contains the reserved 'source' keyword has the 'raw_sequence' keyword
+ # added for the corresponding fasta sequence from the fasta file.
+ # @param [Hash] data The scaffold hash
+ # @param [Hash] seqs A hash with identifier => sequence key/value pairs from
+ # the fasta sequence data.
+ # @return [Hash] The data hash updated with the 'raw_sequence' sequence
+ # keyword data.
+ # @raise [UnkownSequenceError] if the source keyword is used but
+ # there is no corresponding fasta sequence entry
+ def self.update_with_sequence(data,seqs)
+ if data.instance_of? Array
+ data.each{|a| update_with_sequence(a,seqs) }
+ else
+ if data[SOURCE]
+ sequence = seqs[data[SOURCE]]
+ if sequence.nil?
+ raise UnknownSequenceError.new("Unknown sequence: #{data[SOURCE]}")
+ end
+ data.merge!({RAW_SEQUENCE => sequence})
+ end
+ data.select{|k,v| v.respond_to? :each}.each do |key,hash|
+ update_with_sequence(hash,seqs)
+ end
+ end
+ data
end
end