#!/usr/bin/env ruby require 'scbi_fasta' require 'optparse' ########################################################################################## ## FUNCTIONS ########################################################################################## def load_fasta(fasta) seqs = [] fasta = FastaQualFile.new(fasta) fasta.each do |name, seq| seqs << [name, seq] end fasta.close return seqs end def copy_seqs(seqs) all = [] seqs.each do |seq| all << [seq.first.dup, seq.last.dup] end return all end def output_files(options) file = File.basename(options[:file]) output_files = {} output_files[:insertions] = File.open(file+'_insertions', 'w') if options[:indel] output_files[:delections] = File.open(file+'_deletions', 'w') if options[:indel] output_files[:mix] = File.open(file+'_mix', 'w') if options[:indel] output_files[:cut_100_pb] = File.open(file+'_trimmed', 'w') if options[:trim] output_files[:paired] = File.open(file+'_paired', 'w') if options[:pair] output_files[:fullChim] = File.open(file+'_fullChim', 'w') if options[:chim] output_files[:fusionChim] = File.open(file+'_fusionChim', 'w') if options[:chim] output_files[:fusionChimTruncate] = File.open(file+'_truncateChim', 'w') if options[:chim] return output_files end def random_nt nts =['a','c','g','t'] return nts[rand(4).truncate] end def insertions(seq, position) first_cut = (seq.length/3).truncate second_cut = first_cut *2 case position%3 when 0 seq = seq.insert(first_cut, random_nt) name = '_I__' when 1 seq = seq.insert(second_cut, random_nt) name = '__I_' when 2 seq = seq.insert(first_cut, random_nt) seq = seq.insert(second_cut, random_nt) name = '_I_I_' end return seq, name end def delections(seq, position) first_cut = (seq.length/3).truncate second_cut = first_cut *2 case position%3 when 0 seq.slice!(first_cut) name='_D__' when 1 seq.slice!(second_cut) name='__D_' when 2 seq.slice!(first_cut) seq.slice!(second_cut) name='_D_D_' end return seq, name end def mix(seq, position) first_cut = (seq.length/3).truncate second_cut = first_cut *2 case position%2 when 0 seq = seq.insert(first_cut, random_nt) seq.slice!(second_cut) name='_I_D_' when 1 seq.slice!(first_cut) seq = seq.insert(second_cut, random_nt) name='_D_I_' end return seq, name end def load_utrs(utr_file) utrs = {} File.open(utr_file).each do |line| line.chomp! fields = line.split("\t") seq_name = fields.shift utrs[seq_name] = fields.map{|coord| coord.to_i} end return utrs end ########################################################################################## ## OPTIONS ########################################################################################## options = {} optparse = OptionParser.new do |opts| options[:file]='samples' opts.on( '-f', '--file FILE', 'FASTA file') do |file| options[:file]=file end options[:duplicate]= 1 opts.on( '-d', '--duplicate INTEGER', 'Duplicate sequences to dataset') do |duplicate| options[:duplicate] = duplicate.to_i end options[:split]= FALSE opts.on( '-s', '--split', 'Split sequences in each case') do options[:duplicate] = 3 end options[:chim]= TRUE opts.on( '-c', '--chim', 'Make sequence set of chimeras') do options[:chim] = FALSE end options[:indel]= TRUE opts.on( '-i', '--indel', 'Make sequence set of indels') do options[:indel] = FALSE end options[:pair]= TRUE opts.on( '-p', '--pair', 'Make sequence set of paired') do options[:pair] = FALSE end options[:trim]= TRUE opts.on( '-t', '--trim', 'Make sequence set of trimmed') do options[:trim] = FALSE end # Set a banner, displayed at the top of the help screen. opts.banner = "Usage: #{File.basename($0)} -f FILE \n\n" # This displays the help screen opts.on( '-h', '--help', 'Display this screen' ) do puts opts exit end end # End opts # parse options and remove from ARGV optparse.parse! ########################################################################################## ## MAIN ########################################################################################## if !File.exists?(options[:file]) puts 'File not exists' Process.exit end seqs = load_fasta(options[:file]) output_files = output_files(options) if options[:trim] || options[:chim] file_ext = File.extname(options[:file]) utr_file = options[:file].gsub(file_ext,'')+'.utr' utrs = {} utrs = load_utrs(utr_file) if File.exists?(utr_file) end index = 0 seqs.each do |name, seq| if index % 2 == 0 && !seqs[index+1].nil? && options[:chim] second_seq = seqs[index+1].first second_seq_fasta = seq+seqs[index+1].last output_files[:fullChim].puts ">#{name+'_'+second_seq}\n#{seq+second_seq_fasta}" if !utrs.empty? utr_coord = utrs[name] utr_coord_second = utrs[second_seq] chim5 = seq[0..utr_coord.last] chim3 = second_seq_fasta[utr_coord_second.first..second_seq_fasta.length-1] output_files[:fusionChim].puts ">#{name+'_'+second_seq}\n#{chim5+chim3}" if !chim5.nil? && !chim3.nil? chim5_trunc = chim5[0..chim5.length-100] chim3_trunc = chim3[100..chim3.length] output_files[:fusionChimTruncate].puts ">#{name+'_'+second_seq}\n#{chim5_trunc+chim3_trunc}" if !chim5_trunc.nil? && !chim3_trunc.nil? end end if options[:trim] if utrs.empty? output_files[:cut_100_pb].puts ">#{name}\n#{seq[99..seq.length-101]}" else utr_coord = utrs[name] trim_seq = seq[utr_coord.first+100..utr_coord.last-100] output_files[:cut_100_pb].puts ">#{name}\n#{trim_seq}" if !trim_seq.nil? && !trim_seq.empty? end end if options[:pair] n_number = rand(5..50) position = seq.length/2 - n_number/2 output_files[:paired].puts ">#{name}\n#{seq[0..position] + 'N'*n_number + seq[position+1..seq.length-1]}" end index += 1 end if options[:indel] all_seqs = [] options[:duplicate].times do all_seqs.concat(copy_seqs(seqs)) end length = all_seqs.length all_seqs.each_with_index do |s, i| case i when 0..length/3-1 seq, type = insertions(s.last, i) file = :insertions when length/3..2*length/3-1 seq, type = delections(s.last, i) file =:delections else seq, type = mix(s.last, i) file = :mix end output_files[file].puts ">#{s.first}#{type}\n#{seq}" end end output_files.values.map{|file| file.close}