lib/rbbt/sources/clinvar.rb in rbbt-sources-3.1.51 vs lib/rbbt/sources/clinvar.rb in rbbt-sources-3.1.52
- old
+ new
@@ -1,56 +1,97 @@
require 'rbbt-util'
require 'rbbt/resource'
+require 'rbbt/sources/organism'
module ClinVar
extend Resource
self.subdir = 'share/databases/ClinVar'
- def self.organism(org="Hsa")
- Organism.default_code(org)
+ def self.organism_hg19(org="Hsa")
+ Organism.organism_for_build("hg19")
end
+ def self.organism_hg38(org="Hsa")
+ Organism.organism_for_build("hg38")
+ end
+
+
ClinVar.claim ClinVar.variant_summary, :url, "ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz"
- ClinVar.claim ClinVar.snv_summary, :proc do
- url = "ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz"
- io = TSV.traverse ClinVar.variant_summary, :type => :array, :into => :stream do |line|
- line = Misc.fixutf8 line
+ ClinVar.claim ClinVar.hg19.snv_summary, :proc do
+ parser = TSV::Parser.new ClinVar.variant_summary, :type => :list
+ dumper = TSV::Dumper.new :fields => parser.fields, :key_field => "Genomic Mutation", :organism => ClinVar.organism_hg19
+ dumper.init
+ chr_pos, start_pos, ref_pos, alt_pos, assembly_pos = %w(Chromosome PositionVCF ReferenceAlleleVCF AlternateAlleleVCF Assembly).collect{|f| parser.fields.index f}
+ TSV.traverse parser, :into => dumper, :bar => true do |allele,values,fields|
+ chr, start, ref, alt, assembly = values.values_at chr_pos, start_pos, ref_pos, alt_pos, assembly_pos
+ next if assembly != "GRCh37"
+ pos, muts = Misc.correct_vcf_mutation(start.to_i, ref, alt)
+ res = muts.collect{|m| [[chr, pos, m] * ":", values] }
+
+ res.extend MultipleResult
+
+ res
+ end
+ dumper.stream
+ end
+
+ ClinVar.claim ClinVar.hg38.snv_summary, :proc do
+ parser = TSV::Parser.new ClinVar.variant_summary, :type => :list
+ dumper = TSV::Dumper.new :fields => parser.fields, :key_field => "Genomic Mutation", :organism => ClinVar.organism_hg38
+ dumper.init
+ chr_pos, start_pos, ref_pos, alt_pos, assembly_pos = %w(Chromosome PositionVCF ReferenceAlleleVCF AlternateAlleleVCF Assembly).collect{|f| parser.fields.index f}
+ TSV.traverse parser, :into => dumper, :bar => true do |allele,values,fields|
+ chr, start, ref, alt, assembly = values.values_at chr_pos, start_pos, ref_pos, alt_pos, assembly_pos
+ next if assembly != "GRCh38"
+ pos, muts = Misc.correct_vcf_mutation(start.to_i, ref, alt)
+ res = muts.collect{|m| [[chr, pos, m] * ":", values] }
+
+ res.extend MultipleResult
+
+ res
+ end
+ dumper.stream
+ end
+
+
+ ClinVar.claim ClinVar.hg19.mi_summary, :proc do
+ require 'rbbt/workflow'
+ Workflow.require_workflow "Sequence"
+ variants = ClinVar.hg19.snv_summary.produce
+ muts = CMD.cmd('cut -f 1', :in => variants.open, :pipe => true)
+ consequence = Sequence.job(:mutated_isoforms_fast, "Clinvar", :mutations => muts, :non_synonymous => true, :organism => ClinVar.organism_hg19).clean.run(true)
+
+ options = TSV.parse_header(variants).options.merge({:key_field => "Mutated Isoform"})
+ fields = options[:fields].length
+ dumper = TSV::Dumper.new options
+ dumper.init
+ pasted = TSV.paste_streams([variants, TSV.get_stream(consequence)])
+ TSV.traverse pasted, :into => dumper, :bar => true do |mutation,values|
begin
+ mis = values[fields..-1].flatten
+ next if mis.empty?
res = []
- if line =~ /^#/
- parts = line.split("\t")
- res << (["#Genomic Mutation"] + parts[1..12] + parts[15..23]) * "\t"
- else
- next unless line =~ /GRCh37/
- next if line =~ /(copy number|NT expansion|duplication|indel)/
- parts = line.split("\t")
- chr,pos,ref,mut = parts.values_at 13, 14, 25, 26
- next if ref == 'na' or mut == 'na'
-
- pos, muts = Misc.correct_mutation(pos.to_i,ref,mut)
- muts.each do |mut|
- mutation = [chr,pos,mut] * ":"
- res << ([mutation] + parts[1..12] + parts[15..23]) * "\t"
- end
- end
res.extend MultipleResult
+ mis.each do |mi|
+ res << [mi, values[0..fields-1]]
+ end
res
rescue
Log.exception $!
raise $!
end
end
- Misc.sort_stream(io)
+ dumper.stream
end
- ClinVar.claim ClinVar.mi_summary, :proc do
+ ClinVar.claim ClinVar.hg38.mi_summary, :proc do
require 'rbbt/workflow'
Workflow.require_workflow "Sequence"
- variants = ClinVar.snv_summary.produce
+ variants = ClinVar.hg38.snv_summary.produce
muts = CMD.cmd('cut -f 1', :in => variants.open, :pipe => true)
- consequence = Sequence.job(:mutated_isoforms_fast, "Clinvar", :mutations => muts, :non_synonymous => true).clean.run(true)
+ consequence = Sequence.job(:mutated_isoforms_fast, "Clinvar", :mutations => muts, :non_synonymous => true, :organism => ClinVar.organism_hg38).clean.run(true)
options = TSV.parse_header(variants).options.merge({:key_field => "Mutated Isoform"})
fields = options[:fields].length
dumper = TSV::Dumper.new options
dumper.init
@@ -72,5 +113,12 @@
end
dumper.stream
end
end
+if __FILE__ == $0
+ Log.severity = 0
+ ClinVar.hg19.snv_summary.produce
+ ClinVar.hg19.mi_summary.produce(true)
+ ClinVar.hg38.snv_summary.produce
+ ClinVar.hg38.mi_summary.produce(true)
+end