share/install/Organism/organism_helpers.rb in rbbt-sources-3.3.0 vs share/install/Organism/organism_helpers.rb in rbbt-sources-3.4.1
- old
+ new
@@ -1,10 +1,13 @@
+$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', 'lib'))
+
require 'net/ftp'
+require 'rbbt/sources/biomart'
+require 'rbbt/sources/entrez'
+require File.join(File.dirname(__FILE__), '../lib/helpers')
require 'rbbt/sources/ensembl_ftp'
-#Thread.current['namespace'] = $namespace
-
$biomart_ensembl_gene = ['Ensembl Gene ID', 'ensembl_gene_id']
$biomart_ensembl_protein = ['Ensembl Protein ID', 'ensembl_peptide_id']
$biomart_ensembl_exon = ['Ensembl Exon ID', 'ensembl_exon_id']
$biomart_ensembl_transcript = ['Ensembl Transcript ID', 'ensembl_transcript_id']
@@ -75,10 +78,21 @@
$biomart_pfam= [
["Pfam Domain", 'pfam'],
]
+$biomart_go= [
+ ["GO ID", 'go_id'],
+ ["GO Namespace", 'namespace_1003'],
+]
+
+$biomart_go_2009= [
+ ["GO BP ID", 'go_biological_process_id'],
+ ["GO MF ID", 'go_molecular_function_id'],
+ ["GO CC ID", 'go_cellular_component_id'],
+]
+
$biomart_gene_biotype= [
["Biotype", 'gene_biotype'],
]
$biomart_exons = [
@@ -89,11 +103,17 @@
]
#{{{ Rules
file 'entrez_taxids' do |t|
- Misc.sensiblewrite(t.name, $taxs * "\n")
+ if $tax && $tax.any?
+ Misc.sensiblewrite(t.name, $taxs * "\n")
+ else
+ tsv = Rbbt.share.databases.entrez.tax_ids.tsv(:key_field => "Scientific Name", merge: true, type: :flat)
+ taxs = tsv[$scientific_name] || []
+ Misc.sensiblewrite(t.name, taxs * "\n")
+ end
end
file 'scientific_name' do |t|
Misc.sensiblewrite(t.name, $scientific_name)
end
@@ -102,11 +122,12 @@
raise "Ortholog key not defined. Set up $ortholog_key in the organism specific Rakefile; example $ortholog_key = 'human_ensembl_gene'" unless defined? $ortholog_key and not $ortholog_key.nil?
Misc.sensiblewrite(t.name, $ortholog_key)
end
-file 'identifiers' do |t|
+file 'identifiers' => 'entrez_taxids' do |t|
+ tax_codes = Open.read(t.prerequisites.first).strip.split("\n")
identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_identifiers, [], nil, :namespace => Thread.current['namespace'])
identifiers.unnamed = true
$biomart_identifiers.each do |name, key, prefix|
next unless identifiers.all_fields.include? name
@@ -114,22 +135,24 @@
identifiers.process name do |field, key, values| field.each{|v| v.replace "#{prefix}:#{v}"} end
end
end
name_pos = identifiers.identify_field "Associated Gene Name"
- entrez2name = Entrez.entrez2name($taxs)
- identifiers.process "Entrez Gene ID" do |entrez, ensembl, values|
- names = values[name_pos]
+ if tax_codes and tax_codes.any?
+ entrez2name = Entrez.entrez2name(tax_codes)
+ identifiers.process "Entrez Gene ID" do |entrez, ensembl, values|
+ names = values[name_pos] || []
- matches = entrez.select do |e|
- entrez2name.include?(e) && (names & entrez2name[e]).any?
- end
+ matches = entrez.select do |e|
+ entrez2name.include?(e) && (names & entrez2name[e]).any?
+ end
- if matches.any?
- matches
- else
- entrez
+ if matches.any?
+ matches
+ else
+ entrez
+ end
end
end
refseq_fields = identifiers.fields.select{|f| f =~ /RefSeq/}
@@ -145,33 +168,37 @@
ordered_fields[pos..pos-1] = "RefSeq ID"
identifiers = identifiers.reorder(:key, ordered_fields)
end
- entrez_synonyms = Rbbt.share.databases.entrez.gene_info.find.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => [4]
- entrez_synonyms.key_field = "Entrez Gene ID"
- entrez_synonyms.fields = ["Entrez Gene Name Synonyms"]
+ if tax_codes and tax_codes.any?
+ entrez_synonyms = Rbbt.share.databases.entrez.gene_info.find.tsv :grep => tax_codes.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => [4]
+ entrez_synonyms.key_field = "Entrez Gene ID"
+ entrez_synonyms.fields = ["Entrez Gene Name Synonyms"]
- identifiers.attach entrez_synonyms
+ identifiers.attach entrez_synonyms
+ end
identifiers.with_unnamed do
identifiers.each do |key, values|
values.each do |list|
+ list ||= []
list.reject!{|v| v.nil? or v.empty?}
list.uniq!
end
end
end
Misc.sensiblewrite(t.name, identifiers.to_s)
end
-file 'lexicon' => 'identifiers' do |t|
+file 'lexicon' => ['identifiers', 'entrez_taxids'] do |t|
tsv = TSV.open(t.prerequisites.first).slice(["Associated Gene Name", "Entrez Gene Name Synonyms"])
+ tax_codes = Open.read(t.prerequisites.last).strip.split("\n")
- entrez_description = Rbbt.share.databases.entrez.gene_info.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => 8
+ entrez_description = Rbbt.share.databases.entrez.gene_info.tsv :grep => tax_codes.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => 8
entrez_description.key_field = "Entrez Gene ID"
entrez_description.fields = ["Entrez Gene Description"]
tsv = tsv.attach entrez_description
Misc.sensiblewrite(t.name, tsv.to_s)
@@ -306,12 +333,13 @@
end
# {{{ Other info
-file 'gene_pmids' do |t|
- tsv = Entrez.entrez2pubmed($taxs)
+file 'gene_pmids' => 'entrez_taxids' do |t|
+ tax_codes = Open.read(t.prerequisites.first).strip.split("\n")
+ tsv = Entrez.entrez2pubmed(tax_codes)
text = "#: :namespace=#{Thread.current['namespace']}\n"
text += "#Entrez Gene ID\tPMID"
tsv.each do |gene, pmids|
text << "\n" << gene << "\t" << pmids * "|"
end
@@ -415,11 +443,11 @@
file 'gene_go_bp' => 'gene_go' do |t|
gene_go = TSV.open(t.prerequisites.first)
gene_go.monitor = true
gene_go.process "GO ID" do |key, go_id, values|
- clean = values.zip_fields.select do |id, type|
+ clean = NamedArray.zip_fields(values).select do |id, type|
type == "biological_process"
end
clean.collect{|id, type| id}
end
@@ -485,13 +513,13 @@
Misc.sensiblewrite(t.name, pfam.to_s)
end
file 'chromosomes' do |t|
- goterms = BioMart.tsv($biomart_db, ['Chromosome Name', "chromosome_name"] , [] , [], nil, :type => :double, :namespace => Thread.current['namespace'])
+ tsv = BioMart.tsv($biomart_db, ['Chromosome Name', "chromosome_name"] , [] , [], nil, :type => :double, :namespace => Thread.current['namespace'])
- Misc.sensiblewrite(t.name, goterms.to_s)
+ Misc.sensiblewrite(t.name, tsv.keys * "\n")
end
file 'blacklist_chromosomes' => 'chromosomes' do |t|
list = TSV.open(t.prerequisites.first).keys.select{|c| c.index('_') or c.index('.')}
File.open(t.name, 'w') do |f| f.puts list * "\n" end
@@ -509,40 +537,72 @@
end
rule /^chromosome_.*/ do |t|
chr = t.name.match(/chromosome_(.*)/)[1]
+ path = File.expand_path(t.name)
+ dirname = File.dirname(path)
+ organism = File.basename(dirname)
+ if organism =~ /^[a-z]{3}20[0-9]{2}/
+ archive = organism
+ organism = File.basename(File.dirname(dirname))
+ organism = File.join(organism, archive)
+ end
+
# HACK: Skip LRG chromosomes
raise "LRG and GL chromosomes not supported: #{ chr }" if chr =~ /^(?:LRG_|GL0)/
archive = File.basename(FileUtils.pwd) =~ /^([a-z]{3}[0-9]{4})$/i ? $1 : nil
release = Ensembl.releases[archive]
- ftp = Net::FTP.new("ftp.ensembl.org")
+ fasta_url = Ensembl::FTP.ftp_name_for(organism, 'fasta').last
+ server, _, path = fasta_url.partition("/")
+ path = "/" + path
+
+ ftp = Net::FTP.new(server)
ftp.passive = true
ftp.login
- if release.nil? or release == 'current'
- ftp.chdir("pub/current_fasta/")
- else
- ftp.chdir("pub/#{ release }/fasta/")
- end
- ftp.chdir($scientific_name.downcase.sub(" ",'_'))
+ ftp.chdir(path)
ftp.chdir('dna')
- file = ftp.nlst.select{|file| file =~ /chromosome\.#{ chr }\.fa/}.first
- raise "Fasta file for chromosome not found: '#{ chr }' - #{ archive }, #{ release }" if file.nil?
+ file = ftp.nlst.select{|file| file =~ /dna_sm\.chromosome\.#{ chr }\.fa/}.first
+ if file
+ Log.debug("Downloading chromosome sequence: #{ file } - #{release} #{t.name}")
- Log.debug("Downloading chromosome sequence: #{ file } - #{release} #{t.name}")
+ Misc.lock t.name + '.rake' do
+ TmpFile.with_file do |tmpfile|
+ ftp.getbinaryfile(file, tmpfile)
+ Misc.sensiblewrite(t.name, Open.read(tmpfile, :gzip => true).sub(/^>.*\n/,'').gsub(/\s/,''))
+ ftp.close
+ end
+ end
+ else
+ file = ftp.nlst.select{|file| file =~ /dna_sm\.toplevel\.fa\.gz/}.first if file.nil?
+ Misc.lock t.name + '.rake' do
+ TmpFile.with_file do |tmpfile|
+ ftp.getbinaryfile(file, tmpfile)
+ txt = Open.read(tmpfile, :gzip => true)
- Misc.lock t.name + '.rake' do
- TmpFile.with_file do |tmpfile|
- ftp.getbinaryfile(file, tmpfile)
- Misc.sensiblewrite(t.name, Open.read(tmpfile, :gzip => true).sub(/^>.*\n/,'').gsub(/\s/,''))
- ftp.close
+ chr_txt = []
+
+ in_chr = false
+ txt.split("\n").each do |line|
+ if line.start_with?(">#{chr}")
+ in_chr = true
+ elsif line.start_with?(">")
+ in_chr = false
+ else
+ chr_txt << line if in_chr
+ end
+ end
+ Misc.sensiblewrite(t.name, chr_txt * "" )
+ ftp.close
+ end
end
+ raise "Fasta file for chromosome not found: '#{ chr }' - #{ archive }, #{ release }" if file.nil?
end
end
rule /^possible_ortholog_(.*)/ do |t|
other = t.name.match(/ortholog_(.*)/)[1]
@@ -582,10 +642,20 @@
#{{{ Special files
require 'bio'
file 'transcript_sequence' => ["exons", "transcript_exons", "blacklist_chromosomes"] do |t|
+ path = File.expand_path(t.name)
+ dirname = File.dirname(path)
+
+ organism = File.basename(dirname)
+ if organism =~ /^[a-z]{3}20[0-9]{2}/
+ archive = organism
+ organism = File.basename(File.dirname(dirname))
+ organism = File.join(organism, archive)
+ end
+
exon_info = TSV.open('exons', :type => :list, :fields => ["Exon Strand", "Exon Chr Start", "Exon Chr End", "Chromosome Name"], :unnamed => true)
chr_transcript_ranges ||= {}
transcript_strand = {}
@@ -614,14 +684,14 @@
transcript_sequence = {}
chr_transcript_ranges.each do |chr, transcript_ranges|
begin
raise "LRG, GL, HG, NT, KI, and HSCHR chromosomes not supported: #{chr}" if blacklist_chromosomes.include? chr
- p = File.expand_path("./chromosome_#{chr}")
- Organism.root.annotate p
- p.sub!(%r{.*/organisms/},'share/organisms/')
- chr_str = p.produce.read
+ pkgdir = Thread.current["resource"]
+ p = pkgdir[organism]["chromosome_#{chr}"]
+ p.produce or raise "Could not produce #{p}; pkgdir: #{p.pkgdir}"
+ chr_str = p.read
rescue Exception
Log.warn("Chr #{ chr } failed (#{transcript_ranges.length} transcripts not covered): #{$!.message}")
raise $! unless $!.message =~ /not supported/
next
end
@@ -654,11 +724,11 @@
archive = organism
organism = File.basename(File.dirname(dirname))
organism = File.join(organism, archive)
end
- translation = Ensembl::FTP.ensembl_tsv(organism, 'translation', 'transcript_id', %w(seq_start start_exon_id seq_end end_exon_id), :type => :list, :unmamed => true)
+ translation = Ensembl::FTP.ensembl_tsv(organism, 'translation', 'transcript_id', %w(seq_start start_exon_id seq_end end_exon_id), :type => :list, :unnamed => true)
if Ensembl::FTP.has_table?(organism, 'exon_stable_id')
exon2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'exon_stable_id', 'exon_id', ['stable_id'], :type => :single, :unnamed => true)
else
exon2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'exon', 'exon_id', ['stable_id'], :type => :single, :unnamed => true)
@@ -668,13 +738,13 @@
transcript2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'transcript_stable_id', 'transcript_id', ['stable_id'], :type => :single, :unnamed => true)
else
transcript2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'transcript', 'transcript_id', ['stable_id'], :type => :single, :unnamed => true)
end
- transcript_protein = TSV.open("./transcripts", :key_field => "Ensembl Transcript ID", :fields => ["Ensembl Protein ID"], :type => :single, :unmamed => true)
- transcript_exons = TSV.open("./transcript_exons", :unmamed => true)
- exon_ranges = TSV.open("./exons",:fields => ["Exon Chr Start", "Exon Chr End"], :cast => :to_i, :unmamed => true)
+ transcript_protein = TSV.open("./transcripts", :key_field => "Ensembl Transcript ID", :fields => ["Ensembl Protein ID"], :type => :single, :unnamed => true)
+ transcript_exons = TSV.open("./transcript_exons", :unnamed => true)
+ exon_ranges = TSV.open("./exons",:fields => ["Exon Chr Start", "Exon Chr End"], :cast => :to_i, :unnamed => true)
transcript_utr5 = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["5' UTR Length"], :cast => :to_i, :type => :single)
transcript_utr3 = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["3' UTR Length"], :cast => :to_i, :type => :single)
translation.through do |transcript_id, values|
@@ -717,16 +787,17 @@
end
file 'protein_sequence' => ["transcripts", "transcript_5utr", "transcript_3utr", "transcript_phase", "transcript_sequence"] do |t|
transcript_5utr = TSV.open(File.expand_path('./transcript_5utr'), :unnamed => true)
transcript_3utr = TSV.open(File.expand_path('./transcript_3utr'), :unnamed => true)
- transcript_phase = TSV.open(File.expand_path('./transcript_phase'), :unnamed => true)
+ transcript_phase = TSV.open(File.expand_path('./transcript_phase'), :unnamed => true)
transcript_sequence = TSV.open(File.expand_path('./transcript_sequence'), :unnamed => true)
transcript_protein = TSV.open(File.expand_path('./transcripts'), :fields => ["Ensembl Protein ID"], :type => :single, :unnamed => true)
protein_sequence = TSV.setup({}, :key_field => "Ensembl Protein ID", :fields => ["Sequence"], :type => :single)
+ transcript_sequence.monitor = true
transcript_sequence.through do |transcript, sequence|
protein = transcript_protein[transcript]
next if protein.nil? or protein.empty?
utr5 = transcript_5utr[transcript]
@@ -775,10 +846,11 @@
TSV.traverse all_uni, :into => dumper, :cpus => 1, :bar => true do |uni|
uni = uni.first if Array === uni
uni_seq = UniProt.get_uniprot_sequence(uni)
ensps = uni2ensps[uni]
next if ensps.nil? or ensps.empty?
+
best_ensp = ensps.sort_by do |ensp|
ensp_seq = ensp2seq[ensp]
if ensp_seq
(ensp_seq.length - uni_seq.length).abs
else
@@ -804,11 +876,11 @@
release = Ensembl.org2release(organism)
num = release.split("-").last
build_code = Organism.GRC_build(organism)
scientific_name = $scientific_name
url = "ftp://ftp.ensembl.org/pub/release-#{num}/gtf/#{scientific_name.downcase.sub(" ", '_')}/#{scientific_name.sub(" ", '_')}.#{build_code}.#{num}.gtf.gz"
- CMD.cmd("wget '#{url}' -O #{t.name}.gz")
+ Open.download(url, "#{t.name}.gz")
nil
end
file 'cdna_fasta' do |t|
path = File.expand_path(t.name)
@@ -823,9 +895,10 @@
release = Ensembl.org2release(organism)
num = release.split("-").last
build_code = Organism.GRC_build(organism)
scientific_name = Organism.scientific_name(organism)
- url = "ftp://ftp.ensembl.org/pub/release-#{num}/fasta/#{scientific_name.downcase.sub(" ", '_')}/cdna/#{scientific_name.sub(" ", '_')}.#{build_code}.cdna.all.fa.gz"
- CMD.cmd("wget '#{url}' -O #{t.name}.gz")
+ url = "ftp://ftp.ensembl.org/pub/release-#{num}/fasta/#{scientific_name.downcase.sub(" ", '_')}/cdna/#{scientific_name.sub(" ", '_')}.#{build_code}.#{num}.cdna.all.fa.gz"
+ Open.download(url, "#{t.name}.gz")
nil
end
+