require 'snp-search' require 'snp_db_connection' require 'snp_db_models' require 'snp_db_schema' require 'activerecord-import' gem "slop", "~> 3.3.1" # gem "slop", "~> 2.4.0" require 'slop' opts = Slop.parse do banner "\nruby snp-search [-create] [-query] [-output] [-n ] [options]*" separator '' on :C, :create, 'Create database' on :Q, :query, 'Query database' on :O, :output, 'Output options' separator '' # separator 'README file: https://github.com/hpa-bioinformatics/snp-search/blob/master/README.rdoc' # separator 'The following command must be used when using -create, or -query or -out_file' on :n, :name=, 'Name of database, Required' separator '' separator '-create options' on :d, :database_reference_file=, 'Reference genome file, in gbk or embl file format, Required', true on :v, :vcf_file=, 'variant call format (vcf) file, Required', true on :c, :cuttoff_snp=, 'SNP quality cutoff, (default = 90)', :as => :int, :default => 90 on :g, :cuttoff_genotype=, 'Genotype quality cutoff (default = 30)', :as => :int, :default => 30 separator '' separator '-query options' on :u, :unique_snps, 'Query for unique snps in the database' on :r, :not_include_snps_from_gene, 'Remove SNPs from specified gene from database' on :s, :strain=, 'The strains/samples you like to query, Required' on :a, :annotation=, 'The gene you like to remove from analysis' separator '' separator '-output [-fasta] [-syn] options' on :f, :fasta, 'output fasta file' on :S, :syn, 'output tab-delimited file with synonymous and non-synonymous info' on :o, :out=, 'Name of output file' on :t, :tree, 'Generate SNP phylogeny' on :w, :nwk_out=, 'Name of output tree in Newick format' end # opts.end ########################################################### # CREATING A DATABASE if opts[:create] # puts opts[:cuttoff_snp].to_i error_msg = "" error_msg += "-n: \t Name of your database\n" unless opts[:name] error_msg += "-d: \t Reference genome file, in gbk or embl file format\n" unless opts[:database_reference_file] error_msg += "-v: \t .vcf file\n" unless opts[:vcf_file] error_msg_optional = "" error_msg_optional += "-c: \tSNP quality cutoff, (default = 90)\n" error_msg_optional += "-g: \tGenotype quality cutoff (default = 30)\n" unless error_msg == "" puts "Please provide the following required fields:" puts error_msg puts "Optional fields:" puts error_msg_optional puts opts.help unless opts.empty? exit end abort "#{opts[:database_reference_file]} file does not exist!" unless File.exist?(opts[:database_reference_file]) abort "#{opts[:vcf_file]} file does not exist!" unless File.exist?(opts[:vcf_file]) # Name of your database establish_connection(opts[:name]) # Schema will run here db_schema ref = opts[:database_reference_file] sequence_format = guess_sequence_format(ref) case sequence_format when :genbank sequence_flatfile = Bio::FlatFile.open(Bio::GenBank,opts[:database_reference_file]).next_entry when :embl sequence_flatfile = Bio::FlatFile.open(Bio::EMBL,opts[:database_reference_file]).next_entry else puts "All sequence files should be in genbank or embl format" exit end # path for vcf file here vcf_mpileup_file = opts[:vcf_file] # The populate_features_and_annotations method populates the features and annotations. It uses the embl/gbk file. populate_features_and_annotations(sequence_flatfile) #The populate_snps_alleles_genotypes method populates the snps, alleles and genotypes. It uses the vcf file, and if specified, the SNP quality cutoff and genotype quality cutoff populate_snps_alleles_genotypes(vcf_mpileup_file, opts[:cuttoff_snp], opts[:cuttoff_genotype]) # puts "populate_snps_alleles_genotypes(#{vcf_mpileup_file}, #{opts[:cuttoff_snp]}, #{opts[:cuttoff_genotype]}.to_i)" ########################################################### # QUERYING THE DATABASE elsif opts [:query] #FIND UNIQUE SNPS if opts[:unique_snps] error_msg = "" error_msg += "-n: \t Name of your database\n" unless opts[:name] error_msg += "-s: \t List of strains you like to query\n" unless opts[:strain] unless error_msg == "" puts "Please provide the following required fields:" puts error_msg puts opts.help unless opts.empty? exit end abort "#{opts[:name]} database does not exist!" unless File.exist?(opts[:name]) abort "#{opts[:strain]} file does not exist!" unless File.exist?(opts[:strain]) establish_connection(opts[:name]) strains = [] File.read(opts[:strain]).each_line do |line| strains << line.chop end # puts find_shared_snps(strains) # exit gas_snps = find_shared_snps(strains) gas_snps.each do |snp| puts "The number of unique snps are #{snp.id}" end ################################################################ # REMOVE SNPS ASSOCIATED WITH SPECIFIC GENES elsif opts[:not_include_snps_from_gene] error_msg = "" error_msg += "-n: \t Name of your database\n" unless opts[:name] error_msg += "-o: \t name of your output file\n" unless opts[:out] error_msg += "-a: \t name of the gene that you like to remove from the database\n" unless opts[:annotation] error_msg_optional = "" error_msg_optional += "-tree: \t Construct tree from output\n" unless opts[:tree] error_msg_optional += "-nwk_out: Name of Newick output file(use only when-tree option used)\n" unless opts[:nwk_out] unless error_msg == "" puts "Please provide the following required fields:" puts error_msg puts "Optional fields:" puts error_msg_optional puts opts.help unless opts.empty? exit end abort "#{opts[:name]} database does not exist!" unless File.exist?(opts[:name]) # annotation = opts[:annotation] establish_connection(opts[:name]) # Getting list of strains from database strains = Strain.all sequence_hash = Hash.new # create a sequence hash # hash key is strain_id, loop through strain_id # create an empty array strains.each do |strain| sequence_hash[strain.id] = Array.new end # output opened for data input output = File.open("#{opts[:out]}", "w") # Perform query snps = Snp.includes(:alleles => :genotypes).find_by_sql("SELECT snps.* FROM snps INNER JOIN features ON features.id = snps.feature_id WHERE features.id NOT IN (select distinct features.id FROM features INNER JOIN annotations ON annotations.feature_id = features.id WHERE annotations.value LIKE '%#{opts[:annotation]}%')") i = 0 puts "Your Query is submitted and is being processed......." snps.each do |snp| # puts snp.inspect i += 1 puts "Total number of SNPs generated so far: #{i}" if i % 100 == 0 ActiveRecord::Base.transaction do snp.alleles.each do |allele| # puts allele.inspect allele.genotypes.each do |genotype| #push bases to hash sequence_hash[genotype.strain_id] << allele.base end end end end #generate FASTA file strains.each do |strain| output.print ">#{strain.name}\n" , sequence_hash[strain.id].join("") output.puts end # GENERATE TREE FROM FASTA FILE if opts[:tree] `FastTree -fastest -nt #{opts[:out]} > #{opts[:nwk_out]}` end else puts "use -unique_snps or -not_include_snps_from_gene query options" end # ############################################################## # OUTPUT DATABASE IN FASTA FORMAT elsif opts[:output] if opts[:fasta] error_msg = "" error_msg += "-n: \t Name of your database\n" unless opts[:name] error_msg += "-o: \t name of your output file (in FASTA format)\n" unless opts[:out] error_msg_optional = "" error_msg_optional += "-tree: \t Construct tree from output\n" unless opts[:tree] error_msg_optional += "-nwk_out: Name of Newick output file(use only when-tree option used)\n" unless opts[:nwk_out] unless error_msg == "" puts "Please provide the following required fields:" puts error_msg puts "Optional fields:" puts error_msg_optional puts opts.help unless opts.empty? exit end abort "#{opts[:name]} database does not exist!" unless File.exist?(opts[:name]) establish_connection(opts[:name]) # Getting list of strains from database strains = Strain.all sequence_hash = Hash.new # create a sequence hash # hash key is strain_id, loop through strain_id # create an empty array strains.each do |strain| sequence_hash[strain.id] = Array.new end output = File.open("#{opts[:out]}", "w") # Select all snps snps = Snp.all i = 0 puts "Your out file is being prepared......." snps.each do |snp| i += 1 puts "Total number of SNPs outputted so far: #{i}" if i % 100 == 0 ActiveRecord::Base.transaction do snp.alleles.each do |allele| # puts allele.inspect allele.genotypes.each do |genotype| #push bases to hash sequence_hash[genotype.strain_id] << allele.base end end end end puts sequence_hash exit #generate FASTA file strains.each do |strain| output.print ">#{strain.name}\n" , sequence_hash[strain.id].join("") output.puts end if opts[:tree] # puts "FastTree -fastest -nt #{opts[:out]} > #{opts[:w]}" `FastTree -fastest -nt #{opts[:out]} > #{opts[:w]}` end end ######################################### if opts[:syn] error_msg = "" error_msg += "-n option: \t the name of your database\n" unless opts[:name] error_msg += "-d option: \t the reference file in gbk format\n" unless opts[:database_reference_file] unless error_msg == "" puts "Please provide the following required fields:" puts error_msg puts opts.help unless opts.empty? exit end abort "#{opts[:name]} database does not exist!" unless File.exist?(opts[:name]) abort "#{opts[:database_reference_file]} vcf file does not exist!" unless File.exist?(opts[:database_reference_file]) establish_connection(opts[:name]) ref = opts[:database_reference_file] synonymous(ref) end else puts opts.help end