#!/usr/bin/env ruby # # bio-vcf parser and transformer # Author:: Pjotr Prins # License:: MIT # # Copyright (C) 2014-2015 Pjotr Prins USAGE = "Vcf parser" gempath = File.dirname(File.dirname(__FILE__)) $: << File.join(gempath,'lib') VERSION_FILENAME=File.join(gempath,'VERSION') version = File.new(VERSION_FILENAME).read.chomp require 'bio-vcf' require 'bio-vcf/pcows' require 'optparse' require 'timeout' require 'fileutils' # Uncomment when using the bio-logger # require 'bio-logger' # log = Bio::Log::LoggerPlus.new 'vcf' # log.outputters = Bio::Log::Outputter.stderr # Bio::Log::CLI.logger('stderr') # Bio::Log::CLI.trace('info') options = { show_help: false, source: 'https://github.com/CuppenResearch/bioruby-vcf', version: version+' (Pjotr Prins)', date: Time.now.to_s, thread_lines: 40_000, timeout: 180 } opts = OptionParser.new do |o| o.banner = "Usage: #{File.basename($0)} [options] filename\ne.g. #{File.basename($0)} < test/data/input/somaticsniper.vcf" o.on('-i','--ignore-missing', 'Ignore missing data') do options[:ignore_missing] = true end o.on('--filter cmd',String, 'Evaluate filter on each record') do |cmd| options[:filter] = cmd end o.on('--sfilter cmd',String, 'Evaluate filter on each sample') do |cmd| options[:sfilter] = cmd end o.on("--sfilter-samples list", Array, "Filter on selected samples (e.g., 0,1") do |l| options[:sfilter_samples] = l end o.on('--ifilter cmd','--if cmd',String, 'Include filter') do |cmd| options[:ifilter] = cmd end o.on("--ifilter-samples list", Array, "Include set - implicitely defines exclude set") do |l| options[:ifilter_samples] = l end o.on('--efilter cmd','--ef cmd',String, 'Exclude filter') do |cmd| options[:efilter] = cmd end o.on("--efilter-samples list", Array, "Exclude set - overrides exclude set") do |l| options[:efilter_samples] = l end o.on("--bed bedfile", String, "Filter on BED elements") do |bed| options[:bed] = bed end o.on('-e cmd', '--eval cmd',String, 'Evaluate command on each record') do |cmd| options[:eval] = cmd end o.on('--eval-once cmd',String, 'Evaluate command once (usually for header info)') do |cmd| options[:eval_once] = true options[:eval] = cmd # options[:num_threads] = 1 # options[:thread_lines] = 1 options[:skip_header] = true end o.on('--seval cmd',String, 'Evaluate command on each sample') do |cmd| options[:seval] = cmd options[:skip_header] = true end o.on("--rewrite eval", "Rewrite INFO") do |s| options[:rewrite] = s end o.on("--samples list", Array, "Output selected samples") do |l| options[:samples] = l end o.on("--rdf", "Generate Turtle RDF (also check out --template!)") do |b| require 'bio-vcf/vcfrdf' options[:rdf] = true options[:skip_header] = true end o.on("--num-threads [num]", Integer, "Multi-core version (default #{options[:num_threads]})") do |i| options[:num_threads] = i end o.on("--thread-lines num", Integer, "Fork thread on num lines (default #{options[:thread_lines]})") do |i| options[:thread_lines] = i end o.on_tail("--id name", String, "Identifier") do |s| options[:id] = s end o.on_tail("--tags list", String, "Add tags") do |s| options[:tags] = s end o.on("--skip-header", "Do not output VCF header info") do options[:skip_header] = true end o.on("--set-header list", Array, "Set a special tab delimited output header (#samples expands to sample names)") do |list| options[:set_header] = list options[:skip_header] = true end o.on("-t erb","--template erb",String, "Use ERB template for output") do |s| require 'bio-vcf/vcfrdf' require 'erb' options[:template] = s options[:skip_header] = true end o.on("--add-header-tag", "Add bio-vcf status tag to header output") do |t| options[:tag] = true end o.on("--timeout [num]", Integer, "Timeout waiting for thread to complete (default #{options[:timeout]})") do |i| options[:timeout] = i end # Uncomment the following when using the bio-logger # o.separator "" # o.on("--logger filename",String,"Log to file (default stderr)") do | name | # Bio::Log::CLI.logger(name) # end # # o.on("--trace options",String,"Set log level (default INFO, see bio-logger)") do | s | # Bio::Log::CLI.trace(s) # end # o.on("--statistics", "Output statistics") do |q| options[:statistics] = true options[:num_threads] = nil end o.on("-q", "--quiet", "Run quietly") do |q| # Bio::Log::CLI.trace('error') options[:quiet] = true end o.on("-v", "--verbose", "Run verbosely") do |v| options[:verbose] = true end o.on("--debug", "Show debug messages") do |v| # Bio::Log::CLI.trace('debug') options[:debug] = true end o.separator "" o.on_tail('-h', '--help', 'display this help and exit') do options[:show_help] = true end end opts.parse!(ARGV) BIOVCF_VERSION=version BIOVCF_BANNER = "bio-vcf #{version} (biogem Ruby #{RUBY_VERSION} with pcows) by Pjotr Prins 2015\n" $stderr.print BIOVCF_BANNER if !options[:quiet] if options[:show_help] print opts print USAGE exit 1 end if RUBY_VERSION =~ /^1/ $stderr.print "WARNING: bio-vcf runs on Ruby 2.x only\n" end $stderr.print "Options: ",options,"\n" if !options[:quiet] if options[:template] include BioVcf::RDF require 'bio-vcf/template' fn = options[:template] raise "No template #{fn}!" if not File.exist?(fn) # template = ERB.new(File.read(fn)) template = Bio::Template.new(fn) end stats = nil if options[:statistics] options[:num_threads] = nil stats = BioVcf::VcfStatistics.new end # Check for option combinations raise "Missing option --ifilter" if options[:ifilter_samples] and not options[:ifilter] raise "Missing option --efilter" if options[:efilter_samples] and not options[:efilter] raise "Missing option --sfilter" if options[:sfilter_samples] and not options[:sfilter] if options[:samples] samples = options[:samples].map { |s| s.to_i } end include BioVcf # Parse the header section of a VCF file (chomping STDIN) def parse_header line, samples, options header = VcfHeader.new(options[:debug]) header.add(line) print line if not options[:skip_header] STDIN.each_line do | headerline | if headerline !~ /^#/ line = headerline break # end of header end header.add(headerline) if not options[:skip_header] if headerline =~ /^#CHR/ # The header before actual data contains the sample names, first inject the BioVcf meta information print header.tag(options),"\n" if options[:tag] and not options[:skip_header] selected = header.column_names if samples newfields = selected[0..8] samples.each do |s| newfields << selected[s+9] end selected = newfields end print "#",selected.join("\t"),"\n" else print headerline end end end print header.printable_header_line(options[:set_header]),"\n" if options[:set_header] VcfRdf::header if options[:rdf] return header,line end # Parse a VCF line and return the (template) result as a string buffer def parse_line line,header,options,bedfilter,samples,template,stats=nil fields = VcfLine.parse(line) rec = VcfRecord.new(fields,header) r = rec # alias filter = options[:filter] sfilter = options[:sfilter] efilter = options[:efilter] ifilter = options[:ifilter] seval = options[:seval] ignore_missing = options[:ignore_missing] quiet = options[:quiet] if sfilter or efilter or ifilter or seval # check for samples header_samples = header.column_names[9..-1] raise "Empty sample list, can not execute query!" if not header_samples end # -------------------------- # Filtering and set analysis if bedfilter bed = bedfilter.contains(rec) return if not bed end return if filter and not rec.gfilter(filter,ignore_missing_data: ignore_missing,quiet: quiet) if sfilter rec.each_sample(options[:sfilter_samples]) do | sample | return if not sample.sfilter(sfilter,ignore_missing_data: ignore_missing,quiet: quiet) end end if ifilter found = false rec.each_sample(options[:ifilter_samples]) do | sample | if sample.ifilter(ifilter,ignore_missing_data: ignore_missing,quiet: quiet) found = true break end end # Skip if there are no matches return if not found end if efilter rec.each_sample(options[:efilter_samples]) do | sample | return if not sample.efilter(efilter,ignore_missing_data: ignore_missing,quiet: quiet) end end stats.add(rec) if stats # ----------------------------- # From here on decide on output if samples # Select certain samples for output newfields = fields[0..8] samples.each do |s| newfields << fields[s+9] end fields = newfields end if options[:eval] or seval begin results = nil # result string if options[:eval] res = rec.eval(options[:eval],ignore_missing_data: ignore_missing,quiet: quiet) results = res if res end if seval list = (results ? [] : [rec.chr,rec.pos]) rec.each_sample(options[:sfilter_samples]) { | sample | list << sample.eval(seval,ignore_missing_data: ignore_missing,quiet: quiet) } results = (results ? results.to_s + "\t" : "" ) + list.join("\t") end rescue => e $stderr.print "\nLine: ",line $stderr.print "ERROR evaluating --eval <#{options[:eval]}> #{e.message}\n" raise if options[:verbose] exit 1 end return results.to_s+"\n" if results else if options[:rdf] # Output Turtle RDF VcfRdf::record(options[:id],rec,options[:tags]) elsif options[:template] # Use ERB template begin template.body(binding) rescue Exception => e $stderr.print e,": ",fields,"\n" $stderr.print e.backtrace.inspect if options[:verbose] raise end elsif options[:rewrite] # Default behaviour prints VCF line, but rewrite info eval(options[:rewrite]) (fields[0..6]+[rec.info.to_s]+fields[8..-1]).join("\t")+"\n" elsif stats # do nothing else # Default behaviour prints VCF line fields.join("\t")+"\n" end end end pcows = PCOWS.new(options[:num_threads],'bio-vcf',options[:timeout]) header = nil header_output_completed = false CHUNK_SIZE = options[:thread_lines] chunk_lines = [] line_number=0 if options[:bed] bedfilter = BedFilter.new(options[:bed]) end begin # Define linear parser function (going through one chunk) process = lambda { | lines | res = [] lines.each do | line | res << parse_line(line,header,options,bedfilter,samples,template,stats) end res } # ---- Main loop STDIN.each_line do | line | line_number += 1 # ---- Skip embedded headers down the line... next if header_output_completed and line =~ /^#/ # ---- In the following section header information is handled - # this only happens once. # ---- Parse the header lines (chomps from STDIN) # and returns header info and the current line if line =~ /^#/ header,line = parse_header(line,samples,options) end # p [line_number,line] # ---- After the header continue processing if not header_output_completed # one-time post-header processing if not options[:efilter_samples] and options[:ifilter_samples] # Create exclude set as a complement of include set options[:efilter_samples] = header.column_names[9..-1].fill{|i|i.to_s}-options[:ifilter_samples] end print template.header(binding) if template header_output_completed = true end if options[:eval_once] # this happens if we only want one line evaluated - say to get # the number of samples print parse_line(line,header,options,bedfilter,samples,template,stats) exit 0 end # ---- Lines are collected in one buffer and the lines buffer # is added to the chunks list (for the threads) chunk_lines << line # ---- In the following section the VCF lines are parsed by chunks # The chunks may go into different threads if chunk_lines.size > CHUNK_SIZE # ---- process one chunk $stderr.print '.' if not options[:quiet] pcows.wait_for_worker_slot() pcows.submit_worker(process,chunk_lines) pcows.process_output() chunk_lines = [] end end pcows.submit_worker(process,chunk_lines) pcows.wait_for_workers() pcows.process_remaining_output() print template.footer(binding) if template stats.print if stats rescue Exception => e # $stderr.print line $stderr.print e.message,"\n" if e.message != 'exit' raise if options[:verbose] exit 1 end