organism_helpers.rb in rbbt-sources-3.4.1

- old
+ new

@@ -1,10 +1,13 @@
+$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', 'lib'))
+
 require 'net/ftp'
+require 'rbbt/sources/biomart'
+require 'rbbt/sources/entrez'
+require File.join(File.dirname(__FILE__), '../lib/helpers')
 require 'rbbt/sources/ensembl_ftp'
 
-#Thread.current['namespace'] = $namespace
-
 $biomart_ensembl_gene = ['Ensembl Gene ID', 'ensembl_gene_id']
 $biomart_ensembl_protein = ['Ensembl Protein ID', 'ensembl_peptide_id']
 $biomart_ensembl_exon = ['Ensembl Exon ID', 'ensembl_exon_id']
 $biomart_ensembl_transcript = ['Ensembl Transcript ID', 'ensembl_transcript_id']
 
@@ -75,10 +78,21 @@
 
 $biomart_pfam= [
   ["Pfam Domain", 'pfam'],
 ]
 
+$biomart_go= [
+  ["GO ID", 'go_id'],
+  ["GO Namespace", 'namespace_1003'],
+]
+
+$biomart_go_2009= [
+  ["GO BP ID", 'go_biological_process_id'],
+  ["GO MF ID", 'go_molecular_function_id'],
+  ["GO CC ID", 'go_cellular_component_id'],
+]
+
 $biomart_gene_biotype= [
   ["Biotype", 'gene_biotype'],
 ]
 
 $biomart_exons = [
@@ -89,11 +103,17 @@
 ]
 
 #{{{ Rules
 
 file 'entrez_taxids' do |t|
-  Misc.sensiblewrite(t.name, $taxs * "\n")
+  if $tax && $tax.any?
+    Misc.sensiblewrite(t.name, $taxs * "\n")
+  else
+    tsv = Rbbt.share.databases.entrez.tax_ids.tsv(:key_field => "Scientific Name", merge: true, type: :flat)
+    taxs = tsv[$scientific_name] || []
+    Misc.sensiblewrite(t.name, taxs * "\n")
+  end
 end
 
 file 'scientific_name' do |t|
   Misc.sensiblewrite(t.name, $scientific_name)
 end
@@ -102,11 +122,12 @@
   raise "Ortholog key not defined. Set up $ortholog_key in the organism specific Rakefile; example $ortholog_key = 'human_ensembl_gene'" unless defined? $ortholog_key and not $ortholog_key.nil?
 
   Misc.sensiblewrite(t.name, $ortholog_key)
 end
 
-file 'identifiers' do |t|
+file 'identifiers' => 'entrez_taxids' do |t|
+  tax_codes = Open.read(t.prerequisites.first).strip.split("\n")
   identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_identifiers, [], nil, :namespace => Thread.current['namespace'])
   identifiers.unnamed =  true
 
   $biomart_identifiers.each do |name, key, prefix|
     next unless identifiers.all_fields.include? name
@@ -114,22 +135,24 @@
       identifiers.process name do |field, key, values| field.each{|v| v.replace "#{prefix}:#{v}"} end
     end
   end
 
   name_pos = identifiers.identify_field "Associated Gene Name"
-  entrez2name = Entrez.entrez2name($taxs)
-  identifiers.process "Entrez Gene ID" do |entrez, ensembl, values|
-    names = values[name_pos]
+  if tax_codes and tax_codes.any?
+    entrez2name = Entrez.entrez2name(tax_codes)
+    identifiers.process "Entrez Gene ID" do |entrez, ensembl, values|
+      names = values[name_pos] || []
 
-    matches = entrez.select do |e|
-      entrez2name.include?(e) && (names & entrez2name[e]).any?
-    end
+      matches = entrez.select do |e|
+        entrez2name.include?(e) && (names & entrez2name[e]).any?
+      end
 
-    if matches.any?
-      matches
-    else
-      entrez
+      if matches.any?
+        matches
+      else
+        entrez
+      end
     end
   end
 
   refseq_fields = identifiers.fields.select{|f| f =~ /RefSeq/}
 
@@ -145,33 +168,37 @@
     ordered_fields[pos..pos-1] = "RefSeq ID"
 
     identifiers = identifiers.reorder(:key, ordered_fields)
   end
 
-  entrez_synonyms = Rbbt.share.databases.entrez.gene_info.find.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => [4]
-  entrez_synonyms.key_field = "Entrez Gene ID"
-  entrez_synonyms.fields = ["Entrez Gene Name Synonyms"]
+  if tax_codes and tax_codes.any?
+    entrez_synonyms = Rbbt.share.databases.entrez.gene_info.find.tsv :grep => tax_codes.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => [4]
+    entrez_synonyms.key_field = "Entrez Gene ID"
+    entrez_synonyms.fields = ["Entrez Gene Name Synonyms"]
 
-  identifiers.attach entrez_synonyms
+    identifiers.attach entrez_synonyms
+  end
 
   identifiers.with_unnamed do
     identifiers.each do |key, values|
       values.each do |list|
+        list ||= []
         list.reject!{|v| v.nil? or v.empty?}
         list.uniq!
       end
     end
   end
 
 
   Misc.sensiblewrite(t.name, identifiers.to_s)
 end
 
-file 'lexicon' => 'identifiers' do |t|
+file 'lexicon' => ['identifiers', 'entrez_taxids'] do |t|
   tsv = TSV.open(t.prerequisites.first).slice(["Associated Gene Name", "Entrez Gene Name Synonyms"])
+  tax_codes = Open.read(t.prerequisites.last).strip.split("\n")
 
-  entrez_description = Rbbt.share.databases.entrez.gene_info.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => 8
+  entrez_description = Rbbt.share.databases.entrez.gene_info.tsv :grep => tax_codes.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => 8
   entrez_description.key_field = "Entrez Gene ID"
   entrez_description.fields = ["Entrez Gene Description"]
 
   tsv = tsv.attach entrez_description
   Misc.sensiblewrite(t.name, tsv.to_s)
@@ -306,12 +333,13 @@
 end
 
 
 # {{{ Other info
 
-file 'gene_pmids' do |t|
-  tsv =  Entrez.entrez2pubmed($taxs)
+file 'gene_pmids' => 'entrez_taxids' do |t|
+  tax_codes = Open.read(t.prerequisites.first).strip.split("\n")
+  tsv =  Entrez.entrez2pubmed(tax_codes)
   text = "#: :namespace=#{Thread.current['namespace']}\n"
   text += "#Entrez Gene ID\tPMID"
   tsv.each do |gene, pmids|
     text << "\n" << gene << "\t" << pmids * "|"
   end
@@ -415,11 +443,11 @@
 file 'gene_go_bp' => 'gene_go' do |t|
   gene_go = TSV.open(t.prerequisites.first)
    
   gene_go.monitor = true
   gene_go.process "GO ID" do |key, go_id, values|
-    clean = values.zip_fields.select do |id, type|
+    clean = NamedArray.zip_fields(values).select do |id, type|
       type == "biological_process"
     end
     clean.collect{|id, type| id}
   end
 
@@ -485,13 +513,13 @@
 
   Misc.sensiblewrite(t.name, pfam.to_s)
 end
 
 file 'chromosomes' do |t|
-  goterms = BioMart.tsv($biomart_db, ['Chromosome Name', "chromosome_name"] , [] , [], nil, :type => :double, :namespace => Thread.current['namespace'])
+  tsv = BioMart.tsv($biomart_db, ['Chromosome Name', "chromosome_name"] , [] , [], nil, :type => :double, :namespace => Thread.current['namespace'])
 
-  Misc.sensiblewrite(t.name, goterms.to_s)
+  Misc.sensiblewrite(t.name, tsv.keys * "\n")
 end
 
 file 'blacklist_chromosomes' => 'chromosomes' do |t|
   list = TSV.open(t.prerequisites.first).keys.select{|c| c.index('_') or c.index('.')}
   File.open(t.name, 'w') do |f| f.puts list * "\n" end
@@ -509,40 +537,72 @@
 end
 
 
 rule /^chromosome_.*/ do |t|
   chr = t.name.match(/chromosome_(.*)/)[1]
+  path = File.expand_path(t.name)
+  dirname = File.dirname(path)
 
+  organism = File.basename(dirname)
+  if organism =~ /^[a-z]{3}20[0-9]{2}/
+    archive = organism
+    organism = File.basename(File.dirname(dirname))
+    organism = File.join(organism, archive)
+  end
+
   # HACK: Skip LRG chromosomes
   raise "LRG and GL chromosomes not supported: #{ chr }" if chr =~ /^(?:LRG_|GL0)/
 
   archive = File.basename(FileUtils.pwd) =~ /^([a-z]{3}[0-9]{4})$/i ? $1 : nil
 
   release = Ensembl.releases[archive]
 
-  ftp = Net::FTP.new("ftp.ensembl.org")
+  fasta_url = Ensembl::FTP.ftp_name_for(organism, 'fasta').last
+  server, _, path = fasta_url.partition("/")
+  path = "/" + path
+
+  ftp = Net::FTP.new(server)
   ftp.passive = true
   ftp.login
-  if release.nil? or release == 'current'
-    ftp.chdir("pub/current_fasta/")
-  else
-    ftp.chdir("pub/#{ release }/fasta/")
-  end
-  ftp.chdir($scientific_name.downcase.sub(" ",'_'))
+  ftp.chdir(path)
   ftp.chdir('dna')
-  file = ftp.nlst.select{|file| file =~ /chromosome\.#{ chr }\.fa/}.first
 
-  raise "Fasta file for chromosome not found: '#{ chr }' - #{ archive }, #{ release }" if file.nil?
+  file = ftp.nlst.select{|file| file =~ /dna_sm\.chromosome\.#{ chr }\.fa/}.first
+  if file
+    Log.debug("Downloading chromosome sequence: #{ file } - #{release} #{t.name}")
 
-  Log.debug("Downloading chromosome sequence: #{ file } - #{release} #{t.name}")
+    Misc.lock t.name + '.rake' do
+      TmpFile.with_file do |tmpfile|
+        ftp.getbinaryfile(file, tmpfile)
+        Misc.sensiblewrite(t.name, Open.read(tmpfile, :gzip => true).sub(/^>.*\n/,'').gsub(/\s/,''))
+        ftp.close
+      end
+    end
+  else
+    file = ftp.nlst.select{|file| file =~ /dna_sm\.toplevel\.fa\.gz/}.first if file.nil?
+    Misc.lock t.name + '.rake' do
+      TmpFile.with_file do |tmpfile|
+        ftp.getbinaryfile(file, tmpfile)
+        txt = Open.read(tmpfile, :gzip => true)
 
-  Misc.lock t.name + '.rake' do
-    TmpFile.with_file do |tmpfile|
-      ftp.getbinaryfile(file, tmpfile)
-      Misc.sensiblewrite(t.name, Open.read(tmpfile, :gzip => true).sub(/^>.*\n/,'').gsub(/\s/,''))
-      ftp.close
+        chr_txt = []
+
+        in_chr = false
+        txt.split("\n").each do |line|
+          if line.start_with?(">#{chr}")
+            in_chr = true
+          elsif line.start_with?(">")
+            in_chr = false
+          else
+            chr_txt << line if in_chr
+          end
+        end
+        Misc.sensiblewrite(t.name, chr_txt * "" )
+        ftp.close
+      end
     end
+    raise "Fasta file for chromosome not found: '#{ chr }' - #{ archive }, #{ release }" if file.nil?
   end
 end
 
 rule /^possible_ortholog_(.*)/ do |t|
   other = t.name.match(/ortholog_(.*)/)[1]
@@ -582,10 +642,20 @@
 
 #{{{ Special files
 require 'bio'
 
 file 'transcript_sequence' => ["exons", "transcript_exons", "blacklist_chromosomes"] do |t|
+  path = File.expand_path(t.name)
+  dirname = File.dirname(path)
+
+  organism = File.basename(dirname)
+  if organism =~ /^[a-z]{3}20[0-9]{2}/
+    archive = organism
+    organism = File.basename(File.dirname(dirname))
+    organism = File.join(organism, archive)
+  end
+
   exon_info = TSV.open('exons', :type => :list, :fields => ["Exon Strand", "Exon Chr Start", "Exon Chr End", "Chromosome Name"], :unnamed => true)
 
   chr_transcript_ranges ||= {}
   transcript_strand = {}
 
@@ -614,14 +684,14 @@
 
   transcript_sequence = {}
   chr_transcript_ranges.each do |chr, transcript_ranges|
     begin
       raise "LRG, GL, HG, NT, KI, and HSCHR chromosomes not supported: #{chr}" if blacklist_chromosomes.include? chr
-      p = File.expand_path("./chromosome_#{chr}")
-      Organism.root.annotate p
-      p.sub!(%r{.*/organisms/},'share/organisms/')
-      chr_str = p.produce.read
+      pkgdir = Thread.current["resource"]
+      p = pkgdir[organism]["chromosome_#{chr}"]
+      p.produce or raise "Could not produce #{p}; pkgdir: #{p.pkgdir}"
+      chr_str = p.read
     rescue Exception
       Log.warn("Chr #{ chr } failed (#{transcript_ranges.length} transcripts not covered): #{$!.message}")
       raise $! unless $!.message =~ /not supported/
       next
     end
@@ -654,11 +724,11 @@
     archive = organism
     organism = File.basename(File.dirname(dirname))
     organism = File.join(organism, archive)
   end
 
-  translation        = Ensembl::FTP.ensembl_tsv(organism, 'translation', 'transcript_id', %w(seq_start start_exon_id seq_end end_exon_id), :type => :list, :unmamed => true)
+  translation        = Ensembl::FTP.ensembl_tsv(organism, 'translation', 'transcript_id', %w(seq_start start_exon_id seq_end end_exon_id), :type => :list, :unnamed => true)
 
   if Ensembl::FTP.has_table?(organism, 'exon_stable_id')
     exon2ensembl       = Ensembl::FTP.ensembl_tsv(organism, 'exon_stable_id', 'exon_id', ['stable_id'], :type => :single, :unnamed => true) 
   else
     exon2ensembl       = Ensembl::FTP.ensembl_tsv(organism, 'exon', 'exon_id', ['stable_id'], :type => :single, :unnamed => true) 
@@ -668,13 +738,13 @@
     transcript2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'transcript_stable_id', 'transcript_id', ['stable_id'], :type => :single, :unnamed => true) 
   else
     transcript2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'transcript', 'transcript_id', ['stable_id'], :type => :single, :unnamed => true) 
   end
 
-  transcript_protein = TSV.open("./transcripts", :key_field => "Ensembl Transcript ID", :fields => ["Ensembl Protein ID"], :type => :single,  :unmamed => true)
-  transcript_exons   = TSV.open("./transcript_exons", :unmamed => true)
-  exon_ranges        = TSV.open("./exons",:fields => ["Exon Chr Start", "Exon Chr End"], :cast => :to_i, :unmamed => true)
+  transcript_protein = TSV.open("./transcripts", :key_field => "Ensembl Transcript ID", :fields => ["Ensembl Protein ID"], :type => :single,  :unnamed => true)
+  transcript_exons   = TSV.open("./transcript_exons", :unnamed => true)
+  exon_ranges        = TSV.open("./exons",:fields => ["Exon Chr Start", "Exon Chr End"], :cast => :to_i, :unnamed => true)
 
   transcript_utr5 = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["5' UTR Length"], :cast => :to_i, :type => :single)
   transcript_utr3 = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["3' UTR Length"], :cast => :to_i, :type => :single)
 
   translation.through do |transcript_id, values|
@@ -717,16 +787,17 @@
 end
 
 file 'protein_sequence' => ["transcripts", "transcript_5utr", "transcript_3utr", "transcript_phase", "transcript_sequence"] do |t|
   transcript_5utr     = TSV.open(File.expand_path('./transcript_5utr'), :unnamed => true)
   transcript_3utr     = TSV.open(File.expand_path('./transcript_3utr'), :unnamed => true)
-  transcript_phase     = TSV.open(File.expand_path('./transcript_phase'), :unnamed => true)
+  transcript_phase    = TSV.open(File.expand_path('./transcript_phase'), :unnamed => true)
   transcript_sequence = TSV.open(File.expand_path('./transcript_sequence'), :unnamed => true)
   transcript_protein  = TSV.open(File.expand_path('./transcripts'), :fields => ["Ensembl Protein ID"], :type => :single, :unnamed => true)
 
 
   protein_sequence = TSV.setup({}, :key_field => "Ensembl Protein ID", :fields => ["Sequence"], :type => :single)
+  transcript_sequence.monitor = true
   transcript_sequence.through do |transcript, sequence|
     protein = transcript_protein[transcript]
     next if protein.nil? or protein.empty?
 
     utr5 = transcript_5utr[transcript]
@@ -775,10 +846,11 @@
   TSV.traverse all_uni, :into => dumper, :cpus => 1, :bar => true do |uni|
     uni = uni.first if Array === uni
     uni_seq = UniProt.get_uniprot_sequence(uni)
     ensps = uni2ensps[uni]
     next if ensps.nil? or ensps.empty?
+
     best_ensp = ensps.sort_by do |ensp|
       ensp_seq = ensp2seq[ensp]
       if ensp_seq
         (ensp_seq.length - uni_seq.length).abs
       else
@@ -804,11 +876,11 @@
   release = Ensembl.org2release(organism)
   num = release.split("-").last
   build_code = Organism.GRC_build(organism)
   scientific_name = $scientific_name
   url = "ftp://ftp.ensembl.org/pub/release-#{num}/gtf/#{scientific_name.downcase.sub(" ", '_')}/#{scientific_name.sub(" ", '_')}.#{build_code}.#{num}.gtf.gz"
-  CMD.cmd("wget '#{url}' -O #{t.name}.gz")
+  Open.download(url, "#{t.name}.gz")
   nil
 end
 
 file 'cdna_fasta' do |t|
   path = File.expand_path(t.name)
@@ -823,9 +895,10 @@
 
   release = Ensembl.org2release(organism)
   num = release.split("-").last
   build_code = Organism.GRC_build(organism)
   scientific_name = Organism.scientific_name(organism)
-  url = "ftp://ftp.ensembl.org/pub/release-#{num}/fasta/#{scientific_name.downcase.sub(" ", '_')}/cdna/#{scientific_name.sub(" ", '_')}.#{build_code}.cdna.all.fa.gz"
-  CMD.cmd("wget '#{url}' -O #{t.name}.gz")
+  url = "ftp://ftp.ensembl.org/pub/release-#{num}/fasta/#{scientific_name.downcase.sub(" ", '_')}/cdna/#{scientific_name.sub(" ", '_')}.#{build_code}.#{num}.cdna.all.fa.gz"
+  Open.download(url, "#{t.name}.gz")
   nil
 end
+