share/install/Organism/organism_helpers.rb in rbbt-sources-1.1.0 vs share/install/Organism/organism_helpers.rb in rbbt-sources-1.2.0

- old
+ new

@@ -1,5 +1,7 @@
+require 'net/ftp'
+
 $biomart_ensembl_gene = ['Ensembl Gene ID', 'ensembl_gene_id']
 $biomart_ensembl_protein = ['Ensembl Protein ID', 'ensembl_peptide_id']
 $biomart_ensembl_exon = ['Ensembl Exon ID', 'ensembl_exon_id']
 $biomart_ensembl_transcript = ['Ensembl Transcript ID', 'ensembl_transcript_id']
@@ -54,12 +56,15 @@
 $biomart_exon_phase = [
   $biomart_ensembl_transcript,
   ['Phase','phase'],
 ]
+# Pfam domain field as a [label, key] pair, like the entries of the other field lists
+$biomart_pfam = [
+  ["Pfam Domain", 'pfam'],
+]
-
 $biomart_exons = [
   $biomart_ensembl_gene,
   ['Exon Strand','strand'],
   ['Exon Chr Start','exon_chrom_start'],
   ['Exon Chr End','exon_chrom_end'],
@@ -69,10 +74,20 @@
 file 'scientific_name' do |t|
   File.open(t.name, 'w') do |f| f.write $scientific_name end
 end
+# Writes $ortholog_key to the target file. Raises a RuntimeError when the
+# organism specific Rakefile has not set $ortholog_key (unset or nil).
+file 'ortholog_key' do |t|
+  if $ortholog_key.nil?
+    raise "Ortholog key not defined. Set up $ortholog_key in the organism specific Rakefile; example $ortholog_key = 'human_ensembl_gene'"
+  end
+
+  File.open(t.name, 'w') { |f| f.write $ortholog_key }
+end
+
 file 'identifiers' do |t|
   identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_identifiers, [], nil, :namespace => $namespace)
   identifiers.unnamed = true
   $biomart_identifiers.each do |name, key, prefix|
@@ -454,9 +469,71 @@
   goterms = BioMart.tsv($biomart_db, ['Chromosome Name', "chromosome_name"] , [] , [], nil, :type => :double, :namespace => $namespace)
   File.open(t.name, 'w') do |f| f.puts goterms end
 end
+# Downloads the FASTA sequence of one chromosome from the Ensembl FTP site and
+# writes it to the target file as a bare sequence: the first FASTA header line
+# and all whitespace are removed.
+#
+# The Ensembl release comes from the name of the working directory when it
+# looks like an archive (e.g. "may2009"); otherwise the first release path found
+# on the Ensembl FTP index page is used.
+rule /^chromosome_.*/ do |t|
+  chr = t.name.match(/chromosome_(.*)/)[1]
+
+  archive = File.basename(FileUtils.pwd) =~ /^([a-z]{3}[0-9]{4})$/i ? $1 : nil
+
+  release = case archive
+            when "may2009"
+              "release-54"
+            when "jun2011"
+              "release-64"
+            when nil
+              Open.read("http://www.ensembl.org/info/data/ftp/index.html", :nocache => true).match(/pub\/(\w+-\d+)\/fasta/)[1]
+            else
+              # An unmapped archive used to leave release nil and only failed
+              # later, in the FTP chdir, with an unrelated-looking error
+              raise "Unknown Ensembl release for archive: #{ archive }"
+            end
+
+  ftp = Net::FTP.new("ftp.ensembl.org")
+  begin
+    ftp.login
+    ftp.chdir("pub/#{ release }/fasta/")
+    # gsub rather than sub: a name with three or more words has several spaces
+    ftp.chdir($scientific_name.downcase.gsub(" ", '_'))
+    ftp.chdir('dna')
+    fasta = ftp.nlst.find { |name| name =~ /chromosome\.#{ Regexp.escape(chr) }\.fa/ }
+
+    raise "Fasta file for chromosome not found: #{ chr } - #{ archive }, #{ release }" if fasta.nil?
+
+    Log.debug("Downloading chromosome sequence: #{ fasta }")
+    TmpFile.with_file do |tmpfile|
+      ftp.getbinaryfile(fasta, tmpfile)
+      Open.write(t.name, Open.read(tmpfile, :gzip => true).sub(/^>.*\n/,'').gsub(/\s/,''))
+    end
+  ensure
+    # Release the connection on every path, not only after a successful download
+    ftp.close
+  end
+end
+
+# Queries BioMart, keyed by Ensembl Gene ID, for the attribute "inter_paralog_"
+# + the ortholog key of the organism named in the task; the target name is
+# passed to BioMart.tsv as :filename.
+rule /^possible_ortholog_(.*)/ do |t|
+  other = t.name.match(/ortholog_(.*)/)[1]
+  other_key = Organism.ortholog_key(other).produce.read
+  BioMart.tsv($biomart_db, $biomart_ensembl_gene, [["Ortholog Ensembl Gene ID", "inter_paralog_" + other_key]], [], nil, :keep_empty => false, :type => :flat, :filename => t.name, :namespace => $namespace)
+end
+
+# Queries BioMart, keyed by Ensembl Gene ID, for the ortholog key attribute of
+# the organism named in the task; the target name is passed as :filename.
+rule /^ortholog_(.*)/ do |t|
+  other = t.name.match(/ortholog_(.*)/)[1]
+  other_key = Organism.ortholog_key(other).produce.read
+  BioMart.tsv($biomart_db, $biomart_ensembl_gene, [["Ortholog Ensembl Gene ID", other_key]], [], nil, :keep_empty => false, :type => :flat, :filename => t.name, :namespace => $namespace)
+end
 rule /[a-z]{3}[0-9]{4}\/.*/i do |t|
   t.name =~ /([a-z]{3}[0-9]{4})\/(.*)/i
   archive = $1
   task = $2