share/install/Organism/organism_helpers.rb in rbbt-sources-3.3.0 vs share/install/Organism/organism_helpers.rb in rbbt-sources-3.4.1

- old
+ new

@@ -1,10 +1,13 @@ +$LOAD_PATH.unshift(File.join(File.dirname(__FILE__),'..', '..', '..', 'lib')) + require 'net/ftp' +require 'rbbt/sources/biomart' +require 'rbbt/sources/entrez' +require File.join(File.dirname(__FILE__), '../lib/helpers') require 'rbbt/sources/ensembl_ftp' -#Thread.current['namespace'] = $namespace - $biomart_ensembl_gene = ['Ensembl Gene ID', 'ensembl_gene_id'] $biomart_ensembl_protein = ['Ensembl Protein ID', 'ensembl_peptide_id'] $biomart_ensembl_exon = ['Ensembl Exon ID', 'ensembl_exon_id'] $biomart_ensembl_transcript = ['Ensembl Transcript ID', 'ensembl_transcript_id'] @@ -75,10 +78,21 @@ $biomart_pfam= [ ["Pfam Domain", 'pfam'], ] +$biomart_go= [ + ["GO ID", 'go_id'], + ["GO Namespace", 'namespace_1003'], +] + +$biomart_go_2009= [ + ["GO BP ID", 'go_biological_process_id'], + ["GO MF ID", 'go_molecular_function_id'], + ["GO CC ID", 'go_cellular_component_id'], +] + $biomart_gene_biotype= [ ["Biotype", 'gene_biotype'], ] $biomart_exons = [ @@ -89,11 +103,17 @@ ] #{{{ Rules file 'entrez_taxids' do |t| - Misc.sensiblewrite(t.name, $taxs * "\n") + if $tax && $tax.any? + Misc.sensiblewrite(t.name, $taxs * "\n") + else + tsv = Rbbt.share.databases.entrez.tax_ids.tsv(:key_field => "Scientific Name", merge: true, type: :flat) + taxs = tsv[$scientific_name] || [] + Misc.sensiblewrite(t.name, taxs * "\n") + end end file 'scientific_name' do |t| Misc.sensiblewrite(t.name, $scientific_name) end @@ -102,11 +122,12 @@ raise "Ortholog key not defined. Set up $ortholog_key in the organism specific Rakefile; example $ortholog_key = 'human_ensembl_gene'" unless defined? $ortholog_key and not $ortholog_key.nil? Misc.sensiblewrite(t.name, $ortholog_key) end -file 'identifiers' do |t| +file 'identifiers' => 'entrez_taxids' do |t| + tax_codes = Open.read(t.prerequisites.first).strip.split("\n") identifiers = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_identifiers, [], nil, :namespace => Thread.current['namespace']) identifiers.unnamed = true $biomart_identifiers.each do |name, key, prefix| next unless identifiers.all_fields.include? name @@ -114,22 +135,24 @@ identifiers.process name do |field, key, values| field.each{|v| v.replace "#{prefix}:#{v}"} end end end name_pos = identifiers.identify_field "Associated Gene Name" - entrez2name = Entrez.entrez2name($taxs) - identifiers.process "Entrez Gene ID" do |entrez, ensembl, values| - names = values[name_pos] + if tax_codes and tax_codes.any? + entrez2name = Entrez.entrez2name(tax_codes) + identifiers.process "Entrez Gene ID" do |entrez, ensembl, values| + names = values[name_pos] || [] - matches = entrez.select do |e| - entrez2name.include?(e) && (names & entrez2name[e]).any? - end + matches = entrez.select do |e| + entrez2name.include?(e) && (names & entrez2name[e]).any? + end - if matches.any? - matches - else - entrez + if matches.any? + matches + else + entrez + end end end refseq_fields = identifiers.fields.select{|f| f =~ /RefSeq/} @@ -145,33 +168,37 @@ ordered_fields[pos..pos-1] = "RefSeq ID" identifiers = identifiers.reorder(:key, ordered_fields) end - entrez_synonyms = Rbbt.share.databases.entrez.gene_info.find.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => [4] - entrez_synonyms.key_field = "Entrez Gene ID" - entrez_synonyms.fields = ["Entrez Gene Name Synonyms"] + if tax_codes and tax_codes.any? + entrez_synonyms = Rbbt.share.databases.entrez.gene_info.find.tsv :grep => tax_codes.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => [4] + entrez_synonyms.key_field = "Entrez Gene ID" + entrez_synonyms.fields = ["Entrez Gene Name Synonyms"] - identifiers.attach entrez_synonyms + identifiers.attach entrez_synonyms + end identifiers.with_unnamed do identifiers.each do |key, values| values.each do |list| + list ||= [] list.reject!{|v| v.nil? or v.empty?} list.uniq! end end end Misc.sensiblewrite(t.name, identifiers.to_s) end -file 'lexicon' => 'identifiers' do |t| +file 'lexicon' => ['identifiers', 'entrez_taxids'] do |t| tsv = TSV.open(t.prerequisites.first).slice(["Associated Gene Name", "Entrez Gene Name Synonyms"]) + tax_codes = Open.read(t.prerequisites.last).strip.split("\n") - entrez_description = Rbbt.share.databases.entrez.gene_info.tsv :grep => $taxs.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => 8 + entrez_description = Rbbt.share.databases.entrez.gene_info.tsv :grep => tax_codes.collect{|tax| "^#{tax}"}, :fixed_grep => false, :key_field => 1, :fields => 8 entrez_description.key_field = "Entrez Gene ID" entrez_description.fields = ["Entrez Gene Description"] tsv = tsv.attach entrez_description Misc.sensiblewrite(t.name, tsv.to_s) @@ -306,12 +333,13 @@ end # {{{ Other info -file 'gene_pmids' do |t| - tsv = Entrez.entrez2pubmed($taxs) +file 'gene_pmids' => 'entrez_taxids' do |t| + tax_codes = Open.read(t.prerequisites.first).strip.split("\n") + tsv = Entrez.entrez2pubmed(tax_codes) text = "#: :namespace=#{Thread.current['namespace']}\n" text += "#Entrez Gene ID\tPMID" tsv.each do |gene, pmids| text << "\n" << gene << "\t" << pmids * "|" end @@ -415,11 +443,11 @@ file 'gene_go_bp' => 'gene_go' do |t| gene_go = TSV.open(t.prerequisites.first) gene_go.monitor = true gene_go.process "GO ID" do |key, go_id, values| - clean = values.zip_fields.select do |id, type| + clean = NamedArray.zip_fields(values).select do |id, type| type == "biological_process" end clean.collect{|id, type| id} end @@ -485,13 +513,13 @@ Misc.sensiblewrite(t.name, pfam.to_s) end file 'chromosomes' do |t| - goterms = BioMart.tsv($biomart_db, ['Chromosome Name', "chromosome_name"] , [] , [], nil, :type => :double, :namespace => Thread.current['namespace']) + tsv = BioMart.tsv($biomart_db, ['Chromosome Name', "chromosome_name"] , [] , [], nil, :type => :double, :namespace => Thread.current['namespace']) - Misc.sensiblewrite(t.name, goterms.to_s) + Misc.sensiblewrite(t.name, tsv.keys * "\n") end file 'blacklist_chromosomes' => 'chromosomes' do |t| list = TSV.open(t.prerequisites.first).keys.select{|c| c.index('_') or c.index('.')} File.open(t.name, 'w') do |f| f.puts list * "\n" end @@ -509,40 +537,72 @@ end rule /^chromosome_.*/ do |t| chr = t.name.match(/chromosome_(.*)/)[1] + path = File.expand_path(t.name) + dirname = File.dirname(path) + organism = File.basename(dirname) + if organism =~ /^[a-z]{3}20[0-9]{2}/ + archive = organism + organism = File.basename(File.dirname(dirname)) + organism = File.join(organism, archive) + end + # HACK: Skip LRG chromosomes raise "LRG and GL chromosomes not supported: #{ chr }" if chr =~ /^(?:LRG_|GL0)/ archive = File.basename(FileUtils.pwd) =~ /^([a-z]{3}[0-9]{4})$/i ? $1 : nil release = Ensembl.releases[archive] - ftp = Net::FTP.new("ftp.ensembl.org") + fasta_url = Ensembl::FTP.ftp_name_for(organism, 'fasta').last + server, _, path = fasta_url.partition("/") + path = "/" + path + + ftp = Net::FTP.new(server) ftp.passive = true ftp.login - if release.nil? or release == 'current' - ftp.chdir("pub/current_fasta/") - else - ftp.chdir("pub/#{ release }/fasta/") - end - ftp.chdir($scientific_name.downcase.sub(" ",'_')) + ftp.chdir(path) ftp.chdir('dna') - file = ftp.nlst.select{|file| file =~ /chromosome\.#{ chr }\.fa/}.first - raise "Fasta file for chromosome not found: '#{ chr }' - #{ archive }, #{ release }" if file.nil? + file = ftp.nlst.select{|file| file =~ /dna_sm\.chromosome\.#{ chr }\.fa/}.first + if file + Log.debug("Downloading chromosome sequence: #{ file } - #{release} #{t.name}") - Log.debug("Downloading chromosome sequence: #{ file } - #{release} #{t.name}") + Misc.lock t.name + '.rake' do + TmpFile.with_file do |tmpfile| + ftp.getbinaryfile(file, tmpfile) + Misc.sensiblewrite(t.name, Open.read(tmpfile, :gzip => true).sub(/^>.*\n/,'').gsub(/\s/,'')) + ftp.close + end + end + else + file = ftp.nlst.select{|file| file =~ /dna_sm\.toplevel\.fa\.gz/}.first if file.nil? + Misc.lock t.name + '.rake' do + TmpFile.with_file do |tmpfile| + ftp.getbinaryfile(file, tmpfile) + txt = Open.read(tmpfile, :gzip => true) - Misc.lock t.name + '.rake' do - TmpFile.with_file do |tmpfile| - ftp.getbinaryfile(file, tmpfile) - Misc.sensiblewrite(t.name, Open.read(tmpfile, :gzip => true).sub(/^>.*\n/,'').gsub(/\s/,'')) - ftp.close + chr_txt = [] + + in_chr = false + txt.split("\n").each do |line| + if line.start_with?(">#{chr}") + in_chr = true + elsif line.start_with?(">") + in_chr = false + else + chr_txt << line if in_chr + end + end + Misc.sensiblewrite(t.name, chr_txt * "" ) + ftp.close + end end + raise "Fasta file for chromosome not found: '#{ chr }' - #{ archive }, #{ release }" if file.nil? end end rule /^possible_ortholog_(.*)/ do |t| other = t.name.match(/ortholog_(.*)/)[1] @@ -582,10 +642,20 @@ #{{{ Special files require 'bio' file 'transcript_sequence' => ["exons", "transcript_exons", "blacklist_chromosomes"] do |t| + path = File.expand_path(t.name) + dirname = File.dirname(path) + + organism = File.basename(dirname) + if organism =~ /^[a-z]{3}20[0-9]{2}/ + archive = organism + organism = File.basename(File.dirname(dirname)) + organism = File.join(organism, archive) + end + exon_info = TSV.open('exons', :type => :list, :fields => ["Exon Strand", "Exon Chr Start", "Exon Chr End", "Chromosome Name"], :unnamed => true) chr_transcript_ranges ||= {} transcript_strand = {} @@ -614,14 +684,14 @@ transcript_sequence = {} chr_transcript_ranges.each do |chr, transcript_ranges| begin raise "LRG, GL, HG, NT, KI, and HSCHR chromosomes not supported: #{chr}" if blacklist_chromosomes.include? chr - p = File.expand_path("./chromosome_#{chr}") - Organism.root.annotate p - p.sub!(%r{.*/organisms/},'share/organisms/') - chr_str = p.produce.read + pkgdir = Thread.current["resource"] + p = pkgdir[organism]["chromosome_#{chr}"] + p.produce or raise "Could not produce #{p}; pkgdir: #{p.pkgdir}" + chr_str = p.read rescue Exception Log.warn("Chr #{ chr } failed (#{transcript_ranges.length} transcripts not covered): #{$!.message}") raise $! unless $!.message =~ /not supported/ next end @@ -654,11 +724,11 @@ archive = organism organism = File.basename(File.dirname(dirname)) organism = File.join(organism, archive) end - translation = Ensembl::FTP.ensembl_tsv(organism, 'translation', 'transcript_id', %w(seq_start start_exon_id seq_end end_exon_id), :type => :list, :unmamed => true) + translation = Ensembl::FTP.ensembl_tsv(organism, 'translation', 'transcript_id', %w(seq_start start_exon_id seq_end end_exon_id), :type => :list, :unnamed => true) if Ensembl::FTP.has_table?(organism, 'exon_stable_id') exon2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'exon_stable_id', 'exon_id', ['stable_id'], :type => :single, :unnamed => true) else exon2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'exon', 'exon_id', ['stable_id'], :type => :single, :unnamed => true) @@ -668,13 +738,13 @@ transcript2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'transcript_stable_id', 'transcript_id', ['stable_id'], :type => :single, :unnamed => true) else transcript2ensembl = Ensembl::FTP.ensembl_tsv(organism, 'transcript', 'transcript_id', ['stable_id'], :type => :single, :unnamed => true) end - transcript_protein = TSV.open("./transcripts", :key_field => "Ensembl Transcript ID", :fields => ["Ensembl Protein ID"], :type => :single, :unmamed => true) - transcript_exons = TSV.open("./transcript_exons", :unmamed => true) - exon_ranges = TSV.open("./exons",:fields => ["Exon Chr Start", "Exon Chr End"], :cast => :to_i, :unmamed => true) + transcript_protein = TSV.open("./transcripts", :key_field => "Ensembl Transcript ID", :fields => ["Ensembl Protein ID"], :type => :single, :unnamed => true) + transcript_exons = TSV.open("./transcript_exons", :unnamed => true) + exon_ranges = TSV.open("./exons",:fields => ["Exon Chr Start", "Exon Chr End"], :cast => :to_i, :unnamed => true) transcript_utr5 = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["5' UTR Length"], :cast => :to_i, :type => :single) transcript_utr3 = TSV.setup({}, :key_field => "Ensembl Transcript ID", :fields => ["3' UTR Length"], :cast => :to_i, :type => :single) translation.through do |transcript_id, values| @@ -717,16 +787,17 @@ end file 'protein_sequence' => ["transcripts", "transcript_5utr", "transcript_3utr", "transcript_phase", "transcript_sequence"] do |t| transcript_5utr = TSV.open(File.expand_path('./transcript_5utr'), :unnamed => true) transcript_3utr = TSV.open(File.expand_path('./transcript_3utr'), :unnamed => true) - transcript_phase = TSV.open(File.expand_path('./transcript_phase'), :unnamed => true) + transcript_phase = TSV.open(File.expand_path('./transcript_phase'), :unnamed => true) transcript_sequence = TSV.open(File.expand_path('./transcript_sequence'), :unnamed => true) transcript_protein = TSV.open(File.expand_path('./transcripts'), :fields => ["Ensembl Protein ID"], :type => :single, :unnamed => true) protein_sequence = TSV.setup({}, :key_field => "Ensembl Protein ID", :fields => ["Sequence"], :type => :single) + transcript_sequence.monitor = true transcript_sequence.through do |transcript, sequence| protein = transcript_protein[transcript] next if protein.nil? or protein.empty? utr5 = transcript_5utr[transcript] @@ -775,10 +846,11 @@ TSV.traverse all_uni, :into => dumper, :cpus => 1, :bar => true do |uni| uni = uni.first if Array === uni uni_seq = UniProt.get_uniprot_sequence(uni) ensps = uni2ensps[uni] next if ensps.nil? or ensps.empty? + best_ensp = ensps.sort_by do |ensp| ensp_seq = ensp2seq[ensp] if ensp_seq (ensp_seq.length - uni_seq.length).abs else @@ -804,11 +876,11 @@ release = Ensembl.org2release(organism) num = release.split("-").last build_code = Organism.GRC_build(organism) scientific_name = $scientific_name url = "ftp://ftp.ensembl.org/pub/release-#{num}/gtf/#{scientific_name.downcase.sub(" ", '_')}/#{scientific_name.sub(" ", '_')}.#{build_code}.#{num}.gtf.gz" - CMD.cmd("wget '#{url}' -O #{t.name}.gz") + Open.download(url, "#{t.name}.gz") nil end file 'cdna_fasta' do |t| path = File.expand_path(t.name) @@ -823,9 +895,10 @@ release = Ensembl.org2release(organism) num = release.split("-").last build_code = Organism.GRC_build(organism) scientific_name = Organism.scientific_name(organism) - url = "ftp://ftp.ensembl.org/pub/release-#{num}/fasta/#{scientific_name.downcase.sub(" ", '_')}/cdna/#{scientific_name.sub(" ", '_')}.#{build_code}.cdna.all.fa.gz" - CMD.cmd("wget '#{url}' -O #{t.name}.gz") + url = "ftp://ftp.ensembl.org/pub/release-#{num}/fasta/#{scientific_name.downcase.sub(" ", '_')}/cdna/#{scientific_name.sub(" ", '_')}.#{build_code}.#{num}.cdna.all.fa.gz" + Open.download(url, "#{t.name}.gz") nil end +