share/install/Organism/organism_helpers.rb in rbbt-sources-3.0.36 vs share/install/Organism/organism_helpers.rb in rbbt-sources-3.0.37
- old
+ new
@@ -46,11 +46,15 @@
$biomart_transcript_biotype = [
["Ensembl Transcript Biotype", 'transcript_biotype'],
]
+$biomart_transcript_name = [
+ ["Ensembl Transcript Name", 'external_transcript_id'],
+]
+
$biomart_protein_sequence = [
['Protein Sequence','peptide'],
]
#{{{ Exons
@@ -440,10 +444,16 @@
biotype = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_biotype, [], nil, :type => :single, :namespace => Thread.current['namespace'])
Misc.sensiblewrite(t.name, biotype.to_s)
end
+file 'transcript_name' do |t|
+ biotype = BioMart.tsv($biomart_db, $biomart_ensembl_transcript, $biomart_transcript_name, [], nil, :type => :single, :namespace => Thread.current['namespace'])
+
+ Misc.sensiblewrite(t.name, biotype.to_s)
+end
+
file 'gene_pfam' do |t|
pfam = BioMart.tsv($biomart_db, $biomart_ensembl_gene, $biomart_pfam, [], nil, :type => :double, :namespace => Thread.current['namespace'])
Misc.sensiblewrite(t.name, pfam.to_s)
end
@@ -697,6 +707,49 @@
psequence = Bio::Sequence::NA.new(("N" * phase) << sequence[utr5..sequence.length-utr3-1]).translate
protein_sequence[protein]=psequence
end
Misc.sensiblewrite(t.name, protein_sequence.to_s)
+end
+
+file 'ensembl2uniprot' => ["protein_sequence", "protein_identifiers"] do |t|
+ ensp2unis = TSV.open(File.expand_path('./protein_identifiers'), :key_field => "Ensembl Protein ID", :fields => ["UniProt/SwissProt Accession"], :type => :flat, :merge => true, :unnamed => true)
+ dumper = TSV::Dumper.new :key_field => "Ensembl Protein ID", :fields => ["UniProt/SwissProt Accession"], :namespace => Thread.current['namespace'], :type => :single
+ dumper.init
+ require 'rbbt/sources/uniprot'
+ TSV.traverse File.expand_path('./protein_sequence'), :into => dumper, :cpus => 20, :bar => true do |ensp,ensp_seq|
+ ensp = ensp.first if Array === ensp
+ unis = ensp2unis[ensp]
+ next if unis.nil? or unis.empty?
+ uni_seqs = UniProt.get_uniprot_sequence(unis)
+ best_uni = unis.zip(uni_seqs).sort_by do |uni,uni_seq|
+ (ensp_seq.length - uni_seq.length).abs
+ end.first.first
+ [ensp, best_uni]
+ end
+ Misc.sensiblewrite(t.name, dumper.stream)
+end
+
+file 'uniprot2ensembl' => ["protein_sequence", "protein_identifiers"] do |t|
+ uni2ensps = TSV.open(File.expand_path('./protein_identifiers'), :fields => ["Ensembl Protein ID"], :key_field => "UniProt/SwissProt Accession", :type => :flat, :merge => true, :unnamed => true)
+ ensp2seq = TSV.open(File.expand_path('./protein_sequence'), :unnamed => true)
+ dumper = TSV::Dumper.new :fields => ["Ensembl Protein ID"], :key_field => "UniProt/SwissProt Accession", :namespace => Thread.current['namespace'], :type => :single
+ dumper.init
+ require 'rbbt/sources/uniprot'
+ all_uni = TSV.open(File.expand_path('./protein_identifiers'), :key_field => "UniProt/SwissProt Accession", :fields => [], :type => :double, :merge => true, :unnamed => true).keys.compact.reject{|u| u.empty?}
+ TSV.traverse all_uni, :into => dumper, :cpus => 1, :bar => true do |uni|
+ uni = uni.first if Array === uni
+ uni_seq = UniProt.get_uniprot_sequence(uni)
+ ensps = uni2ensps[uni]
+ next if ensps.nil? or ensps.empty?
+ best_ensp = ensps.sort_by do |ensp|
+ ensp_seq = ensp2seq[ensp]
+ if ensp_seq
+ (ensp_seq.length - uni_seq.length).abs
+ else
+ uni_seq.length
+ end
+ end.first
+ [uni, best_ensp]
+ end
+ Misc.sensiblewrite(t.name, dumper.stream)
end