# # = bio/db/gff/gffassemble.rb - Assemble mRNA and CDS from GFF # # Copyright:: Copyright (C) 2010 # Pjotr Prins # License:: The Ruby License # # Fetch information from a GFF file module Bio module GFFbrowser module Helpers module Error def info str, id='' $stderr.print "Info: "+str+" <#{id}>\n" end def warn str, id='' Kernel.warn "Warning: "+str+" <#{id}>" end def error str, id='' Kernel.warn "Error: "+str+" <#{id}>" exit(1) if $stop_on_error end end # Helper class for counting IDs class Counter < Hash def add id self[id] = 0 if self[id] == nil self[id] += 1 end end # Helper class for storing linked records based on a shared ID class LinkedRecs < Hash include Error def add id, rec info "Adding #{rec.feature_type} <#{id}>" self[id] = [] if self[id] == nil self[id] << rec end # Validate all lists belong to the same container/component def validate_seqname each do | id, rec | seqname = rec.first.seqname rec.each do | section | raise "Non-matching seqname #{section.seqname} in #{seqname}" if section.seqname != seqname end end end # Validate all lists share the same parent (if available). First checks # for Parent attribute, next for mRNA attribute def validate_shared_parent each do | id, rec | parent = rec.first.get_attribute('Parent') if parent rec.each do | section | _parent = section.get_attribute('Parent') raise "Non-matching parent #{_parent} and #{parent} in #{id}" if _parent != parent end end parent = rec.first.get_attribute('mRNA') if parent rec.each do | section | _parent = section.get_attribute('mRNA') raise "Non-matching parent #{_parent} and #{parent} in #{id}" if _parent != parent end end end end # walk all (CDS) lists for every container/component and # validate they do not overlap def validate_nonoverlapping each do | id, rec | sections = Sections::sort(rec) sections.each_with_index do | check, i | neighbour = sections[i+1] if neighbour and check.intersection(neighbour) warn "Overlapping sections for ",id end end end end end class Section < Range attr_reader :rec def initialize rec super(rec.start,rec.end) @rec = rec end def intersection(other) raise ArgumentError, 'value must be a Range' unless other.kind_of?(Range) min, max = first, exclude_end? ? max : last other_min, other_max = other.first, other.exclude_end? ? other.max : other.last new_min = self === other_min ? other_min : other === min ? min : nil new_max = self === other_max ? other_max : other === max ? max : nil new_min && new_max ? new_min..new_max : nil end alias_method :&, :intersection def <=> other first <=> other.first end end module Sections # Return list of sorted Sections def Sections::sort rec sections = [] rec.each do | section | sections.push Section.new(section) end sections.sort end end module Record include Error # Format a record ID by, first, getting the ID attribute. If that fails # the seqname is used with the start/stop positions. def Record::formatID rec id = rec.id if rec.id if !id if rec.seqname id = "#{rec.seqname} #{rec.start} #{rec.end}".strip else id = 'unknown' $stderr.print "Record with unknown ID"+rec.to_s end end id end end module Gff3Component include Error COMPONENT_TYPES = %w{ gene SO:0000704 contig transcript Component region } # Walk the component list to find a matching component/container for a # record. First use the parent ID. If that is missing go by sequence # name. def find_component rec parent = rec.get_attribute('Parent') if @componentlist[parent] # nice, there is a match info "find_component: Matched parent", parent return @componentlist[parent] end search = rec.seqname if @componentlist[search] info "find_component: Matched seqname", search return @componentlist[search] end @componentlist.each do | componentid, component | # dissemble id (id, start, stop) = componentid.split(/ /) if id==search and rec.start >= start.to_i and rec.end <= stop.to_i info "find_component: Matched column 0 and location", componentid return component end end # Ah, painful. At this point the record has no matching container, probably # because it has no parent ID and the component has an ID. We have to go by # ID for every component individually @componentlist.each do | componentid, component | if component.seqname==search and rec.start >= component.start and rec.end <= component.end # p ["----",search,rec] # p component info "find_component: Matched (long search) column 0 and location", componentid return component end end warn "Could not find container/component for",Record::formatID(rec) end end module Gff3Features # Ignore the following features (case sensitive?) IGNORE_FEATURES = Gff3Component::COMPONENT_TYPES + %w{ transposon Match similarity UTR TF_binding_site intronSO:0000188 polyA_sequence SO:0000610 polyA_site SO:0000553 five_prime_UTR SO:0000204 three_prime_UTR SO:0000205 exon SO:0000147 } end module Gff3Sequence # Patch a sequence together from a Sequence string and an array # of records. Note that rec positions are 1-based coordinates, relative # to the landmark given in column 1 - in this case the sequence as it # is passed in. The following options are available: # # :reverse : do reverse if reverse is indicated (default true) # :complement : do complement if reverse is indicated (default true) # :phase : do set CDS phase (default false, normally ignore) # :trim : make sure sequence is multiple of 3 nucleotide bps (default true) # # special options: # # :raw : raw sequence (all above false) # :codonize : codon sequence (reverse, complement, and trim are true) # :fix : fix errors (default false) # def assemble sequence, startpos, reclist, options = { :phase=>false, :reverse=>true, :trim=>true, :complement=>true, :fix=>false, :debug=>false } # default to nil, if not passed in do_debug = options[:debug] do_phase = options[:phase] do_fix = options[:fix] # default to true, if not passed in do_reverse = (options[:reverse] == false ? false : true) do_trim = (options[:trim] == false ? false : true) do_complement = (options[:complement] == false ? false : true) if options[:raw] do_phase = false do_reverse = false do_trim = false do_complement = false elsif options[:codonize] do_phase = false do_reverse = true do_trim = true do_complement = true end sectionlist = Sections::sort(reclist) rec0 = sectionlist.first.rec # we assume ORF is always read in the same direction orf_reverse = (rec0.strand == '-') orf_frame = startpos - 1 orf_frameshift = orf_frame % 3 sectionlist = sectionlist.reverse if orf_reverse if do_debug p "------------------" p options p [:reverse,do_reverse] p [:complement,do_complement] p [:trim,do_trim] p [:orf_reverse, orf_reverse, rec0.strand] end if sequence.kind_of?(Bio::FastaFormat) # BioRuby conversion sequence = sequence.seq end # Generate array of sequences seq = sectionlist.map { | section | rec = section.rec s = sequence[(section.begin-1)..(section.end-1)] if do_reverse and orf_reverse s = s.reverse end # Correct for phase. Unfortunately the use of phase is ambiguous. # Here we check whether rec.start is in line with orf_frame. If it # is, we correct for phase. Otherwise it is ignored. if do_phase and rec.phase phase = rec.phase.to_i # if ((rec.start-startpos) % 3 == 0) s = s[phase..-1] # end end s } # p seq seq = seq.join if do_complement and do_reverse and orf_reverse ntseq = Bio::Sequence::NA.new(seq) seq = ntseq.forward_complement.upcase end # This is the place to fix sequences (e.g. the Wormbase bug) if do_fix or @options[:fix] or @options[:fix_wormbase] if @options[:fix_wormbase] and rec0.id.index('gene1')==0 # Wormbase gene1 only, so ignore rest else test_frame = 0 ntseq = Bio::Sequence::NA.new(seq) aaseq = ntseq.translate if aaseq.count('*') > 1 test_frame = 1 seq = seq[1..-1] ntseq = Bio::Sequence::NA.new(seq) aaseq = ntseq.translate if aaseq.count('*') > 1 test_frame = 2 seq = seq[1..-1] ntseq = Bio::Sequence::NA.new(seq) aaseq = ntseq.translate raise 'Validation problem '+rec0.id if aaseq.count('*') > 1 end end if test_frame > 0 warn rec0.id,"Frame adjusted to #{test_frame} (fix)" end end end if do_trim reduce = seq.size % 3 seq = seq[0..(seq.size-1-reduce)] if reduce != 0 end if @options[:validate] ntseq = Bio::Sequence::NA.new(seq) aaseq = ntseq.translate raise 'Validate translation problem '+rec0.id+"\n"+seq if aaseq.count('*') > 1 end retval = seq retval end # Patch a sequence together from a Sequence string and an array # of records and translate in the correct direction and frame. The options # are the same as for +assemble+, except :trim defaults to true. def assembleAA sequence, startpos, reclist, options = { :phase=>false, :reverse=>true, :trim=>true, :complement=>true } seq = assemble(sequence, startpos, reclist, options) ntseq = Bio::Sequence::NA.new(seq) ntseq.translate end # Create a description for output def description id, component, rec sections = Sections::sort(rec) id+' Sequence:'+component.seqname+"_#{component.start}:#{component.end} ("+ sections.map { |s| "#{s.first}:#{s.last}" }.join(', ') +")" end end end # Helpers end end