require 'snp-search' require 'snp_db_connection' require 'snp_db_models' require 'snp_db_schema' require 'activerecord-import' # gem "slop", "~> 3.1.0" gem "slop", "~> 2.4.0" require 'slop' opts = Slop.new do # separator 'test' banner "\nruby snp-search [OPTIONS]" on :C, :create, 'Create database' on :Q, :query, 'Query database' on :O, :out_file, 'Output the database to a file' # separator '' # separator 'README file: https://github.com/hpa-bioinformatics/snp-search/blob/master/README.rdoc' # separator 'The following command must be used when using -create, or -query or -out_file' on :n, :name=, 'Name of database, Required' # separator '' # separator '-create options' on :d, :database_reference_file, 'Reference genome file, in gbk or embl file format, Required', true on :v, :vcf_file, '.vcf file, Required', true on :c, :cuttoff_snp, 'SNP quality cutoff, (default = 90)', :default => 90 on :g, :cuttoff_genotype, 'Genotype quality cutoff (default = 30)', :default => 30 # separator '' # separator '-query options' on :G, :genes_query, 'Query for unique genes in the database' on :R, :remove_genes, 'Remove set of genes from database and create FASTA file' on :s, :strain=, 'The strains/samples you like to query, Required' on :a, :annotation=, 'The gene you like to remove from analysis' on :o, :output=, 'output file, in fasta format' on :t, :tree, 'Generate SNP phylogeny' on :w, :tree_nwk_output=, 'output tree in Newick format' on :S, :syn, 'syn' end opts.parse ########################################################### # CREATING A DATABASE if opts[:create] error_msg = "" error_msg += "-n option: \t the name of your database\n" unless opts[:name] error_msg += "-d option: \t reference genome file, in gbk or embl file format\n" unless opts[:database_reference_file] error_msg += "-v option: \t .vcf file\n" unless opts[:vcf_file] unless error_msg == "" puts "Please provide the following required fields:" puts error_msg puts opts.help unless opts.empty? exit end abort "#{opts[:database_reference_file]} file does not exist!" unless File.exist?(opts[:database_reference_file]) abort "#{opts[:vcf_file]} file does not exist!" unless File.exist?(opts[:vcf_file]) # Name of your database establish_connection(opts[:name]) # Schema will run here db_schema ref = opts[:database_reference_file] sequence_format = guess_sequence_format(ref) case sequence_format when :genbank sequence_flatfile = Bio::FlatFile.open(Bio::GenBank,opts[:database_reference_file]).next_entry when :embl sequence_flatfile = Bio::FlatFile.open(Bio::EMBL,opts[:database_reference_file]).next_entry else puts "All sequence files should be in genbank or embl format" exit end # path for vcf file here vcf_mpileup_file = opts[:vcf_file] # The populate_features_and_annotations method populates the features and annotations. It uses the embl/gbk file. populate_features_and_annotations(sequence_flatfile) #The populate_snps_alleles_genotypes method populates the snps, alleles and genotypes. It uses the vcf file, and if specified, the SNP quality cutoff and genotype quality cutoff populate_snps_alleles_genotypes(vcf_mpileup_file, opts[:cuttoff_snp].to_i, opts[:cuttoff_genotype].to_i) ########################################################### # QUERYING THE DATABASE elsif opts [:query] #FIND UNIQUE SNPS if opts[:genes_query] error_msg = "" error_msg += "-n option, \t the name of your database\n" unless opts[:name] error_msg += "-s option, \t list of strains you like to query\n" unless opts[:strain] unless error_msg == "" puts "Please provide the following required fields:" puts error_msg puts opts.help unless opts.empty? exit end abort "#{opts[:name]} database does not exist!" unless File.exist?(opts[:name]) abort "#{opts[:strain]} file does not exist!" unless File.exist?(opts[:strain]) establish_connection(opts[:name]) strains = [] File.read(opts[:strain]).each_line do |line| strains << line.chop end # puts find_shared_snps(strains) # exit gas_snps = find_shared_snps(strains) gas_snps.each do |snp| puts "The number of unique snps are #{snp.id}.size" end ################################################################ # REMOVE SNPS ASSOCIATED WITH SPECIFIC GENES elsif opts[:remove_genes] error_msg = "" error_msg += "-n option: \t the name of your database\n" unless opts[:name] error_msg += "-o option: \t name of your output file\n" unless opts[:output] error_msg += "-a option: \t name of the gene that you like to remove from the database\n" unless opts[:annotation] unless error_msg == "" puts "Please provide the following required fields:" puts error_msg puts opts.help unless opts.empty? exit end abort "#{opts[:name]} database does not exist!" unless File.exist?(opts[:name]) # annotation = opts[:annotation] establish_connection(opts[:name]) # Getting list of strains from database strains = Strain.all sequence_hash = Hash.new # create a sequence hash # hash key is strain_id, loop through strain_id # create an empty array strains.each do |strain| sequence_hash[strain.id] = Array.new end # output opened for data input output = File.open("#{opts[:output]}", "w") # Perform query snps = Snp.includes(:alleles => :genotypes).find_by_sql("SELECT snps.* FROM snps INNER JOIN features ON features.id = snps.feature_id WHERE features.id NOT IN (select distinct features.id FROM features INNER JOIN annotations ON annotations.feature_id = features.id WHERE annotations.value LIKE '%#{opts[:annotation]}%')") i = 0 puts "Your Query is submitted and is being processed......." snps.each do |snp| # puts snp.inspect i += 1 puts "Total number of SNPs generated so far: #{i}" if i % 100 == 0 ActiveRecord::Base.transaction do snp.alleles.each do |allele| # puts allele.inspect allele.genotypes.each do |genotype| #push bases to hash sequence_hash[genotype.strain_id] << allele.base end end end end #generate FASTA file strains.each do |strain| output.print ">#{strain.name}\n" , sequence_hash[strain.id].join("") output.puts end if opts[:tree] `FastTree -fastest -nt #{opts[:output]} > #{opts[:w]}` end end ############################################################## # OUTPUT DATABASE IN FASTA FORMAT elsif opts[:out_file] error_msg = "" error_msg += "-n option: \t the name of your database\n" unless opts[:name] error_msg += "-o option: \t name of your output file\n" unless opts[:output] unless error_msg == "" puts "Please provide the following required fields:" puts error_msg puts opts.help unless opts.empty? exit end abort "#{opts[:name]} database does not exist!" unless File.exist?(opts[:name]) establish_connection(opts[:name]) # Getting list of strains from database strains = Strain.all sequence_hash = Hash.new # create a sequence hash # hash key is strain_id, loop through strain_id # create an empty array strains.each do |strain| sequence_hash[strain.id] = Array.new end output = File.open("#{opts[:output]}", "w") # Select all snps snps = Snp.all i = 0 puts "Your out file is being prepared......." snps.each do |snp| i += 1 puts "Total number of SNPs outputted so far: #{i}" if i % 100 == 0 ActiveRecord::Base.transaction do snp.alleles.each do |allele| # puts allele.inspect allele.genotypes.each do |genotype| #push bases to hash sequence_hash[genotype.strain_id] << allele.base end end end #generate FASTA file strains.each do |strain| output.print ">#{strain.name}\n" , sequence_hash[strain.id].join("") output.puts end if opts[:tree] `FastTree -fastest -nt #{opts[:output]} > #{opts[:w]}` end end else puts opts.help end