share/install/Organism/organism_helpers.rb in rbbt-sources-1.1.0 vs share/install/Organism/organism_helpers.rb in rbbt-sources-1.2.0
- old
+ new
@@ -1,5 +1,7 @@
+require 'net/ftp'
+
# Ensembl identifier descriptors. Each one is a two-element array pairing a
# human-readable field name with a BioMart attribute key.
$biomart_ensembl_gene       = ["Ensembl Gene ID",       "ensembl_gene_id"]
$biomart_ensembl_protein    = ["Ensembl Protein ID",    "ensembl_peptide_id"]
$biomart_ensembl_exon       = ["Ensembl Exon ID",       "ensembl_exon_id"]
$biomart_ensembl_transcript = ["Ensembl Transcript ID", "ensembl_transcript_id"]
@@ -54,12 +56,14 @@
# Attribute list for exon phase: the Ensembl transcript descriptor followed by
# the 'Phase' field.
$biomart_exon_phase = [$biomart_ensembl_transcript, %w[Phase phase]]
+# Attribute list holding the single Pfam domain field.
+$biomart_pfam = [['Pfam Domain', 'pfam']]
-
$biomart_exons = [
$biomart_ensembl_gene,
['Exon Strand','strand'],
['Exon Chr Start','exon_chrom_start'],
['Exon Chr End','exon_chrom_end'],
@@ -69,10 +73,16 @@
# File task: writes the value of $scientific_name into the task's target file.
file 'scientific_name' do |t|
  File.open(t.name, 'w') { |out| out.write $scientific_name }
end
+file 'ortholog_key' do |t|
+ raise "Ortholog key not defined. Set up $ortholog_key in the organism specific Rakefile; example $ortholog_key = 'human_ensembl_gene'" unless defined? $ortholog_key and not $ortholog_key.nil?
+
+ File.open(t.name, 'w') do |f| f.write $ortholog_key end
+end
+
file 'identifiers' do |t|
identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_identifiers, [], nil, :namespace => $namespace)
identifiers.unnamed = true
$biomart_identifiers.each do |name, key, prefix|
@@ -454,9 +464,52 @@
goterms = BioMart.tsv($biomart_db, ['Chromosome Name', "chromosome_name"] , [] , [], nil, :type => :double, :namespace => $namespace)
File.open(t.name, 'w') do |f| f.puts goterms end
end
+rule /^chromosome_.*/ do |t|
+ chr = t.name.match(/chromosome_(.*)/)[1]
+
+ archive = File.basename(FileUtils.pwd) =~ /^([a-z]{3}[0-9]{4})$/i ? $1 : nil
+
+ release = case archive
+ when "may2009"
+ "release-54"
+ when "jun2011"
+ "release-64"
+ when nil
+ Open.read("http://www.ensembl.org/info/data/ftp/index.html", :nocache => true).match(/pub\/(\w+-\d+)\/fasta/)[1]
+ end
+
+
+ ftp = Net::FTP.new("ftp.ensembl.org")
+ ftp.login
+ ftp.chdir("pub/#{ release }/fasta/")
+ ftp.chdir($scientific_name.downcase.sub(" ",'_'))
+ ftp.chdir('dna')
+ file = ftp.nlst.select{|file| file =~ /chromosome\.#{ chr }\.fa/}.first
+
+ raise "Fasta file for chromosome not found: #{ chr } - #{ archive }, #{ release }" if file.nil?
+
+ Log.debug("Downloading chromosome sequence: #{ file }")
+ TmpFile.with_file do |tmpfile|
+ ftp.getbinaryfile(file, tmpfile)
+ Open.write(t.name, Open.read(tmpfile, :gzip => true).sub(/^>.*\n/,'').gsub(/\s/,''))
+ ftp.close
+ end
+end
+
+rule /^possible_ortholog_(.*)/ do |t|
+ other = t.name.match(/ortholog_(.*)/)[1]
+ other_key = Organism.ortholog_key(other).produce.read
+ BioMart.tsv($biomart_db, $biomart_ensembl_gene, [["Ortholog Ensembl Gene ID", "inter_paralog_" + other_key]], [], nil, :keep_empty => false, :type => :flat, :filename => t.name, :namespace => $namespace)
+end
+
+rule /^ortholog_(.*)/ do |t|
+ other = t.name.match(/ortholog_(.*)/)[1]
+ other_key = Organism.ortholog_key(other).produce.read
+ BioMart.tsv($biomart_db, $biomart_ensembl_gene, [["Ortholog Ensembl Gene ID", other_key]], [], nil, :keep_empty => false, :type => :flat, :filename => t.name, :namespace => $namespace)
+end
rule /[a-z]{3}[0-9]{4}\/.*/i do |t|
t.name =~ /([a-z]{3}[0-9]{4})\/(.*)/i
archive = $1
task = $2