lib/rbbt/sources/clinvar.rb in rbbt-sources-3.1.51 vs lib/rbbt/sources/clinvar.rb in rbbt-sources-3.1.52

- old
+ new

@@ -1,56 +1,97 @@
 require 'rbbt-util'
 require 'rbbt/resource'
+require 'rbbt/sources/organism'
 module ClinVar
   extend Resource
   self.subdir = 'share/databases/ClinVar'
-  def self.organism(org="Hsa")
-    Organism.default_code(org)
+  def self.organism_hg19(org="Hsa")
+    Organism.organism_for_build("hg19")
   end
+  def self.organism_hg38(org="Hsa")
+    Organism.organism_for_build("hg38")
+  end
+
+
   ClinVar.claim ClinVar.variant_summary, :url, "ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz"
-  ClinVar.claim ClinVar.snv_summary, :proc do
-    url = "ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz"
-    io = TSV.traverse ClinVar.variant_summary, :type => :array, :into => :stream do |line|
-      line = Misc.fixutf8 line
+  ClinVar.claim ClinVar.hg19.snv_summary, :proc do
+    parser = TSV::Parser.new ClinVar.variant_summary, :type => :list
+    dumper = TSV::Dumper.new :fields => parser.fields, :key_field => "Genomic Mutation", :organism => ClinVar.organism_hg19
+    dumper.init
+    chr_pos, start_pos, ref_pos, alt_pos, assembly_pos = %w(Chromosome PositionVCF ReferenceAlleleVCF AlternateAlleleVCF Assembly).collect{|f| parser.fields.index f}
+    TSV.traverse parser, :into => dumper, :bar => true do |allele,values,fields|
+      chr, start, ref, alt, assembly = values.values_at chr_pos, start_pos, ref_pos, alt_pos, assembly_pos
+      next if assembly != "GRCh37"
+      pos, muts = Misc.correct_vcf_mutation(start.to_i, ref, alt)
+      res = muts.collect{|m| [[chr, pos, m] * ":", values] }
+
+      res.extend MultipleResult
+
+      res
+    end
+    dumper.stream
+  end
+
+  ClinVar.claim ClinVar.hg38.snv_summary, :proc do
+    parser = TSV::Parser.new ClinVar.variant_summary, :type => :list
+    dumper = TSV::Dumper.new :fields => parser.fields, :key_field => "Genomic Mutation", :organism => ClinVar.organism_hg38
+    dumper.init
+    chr_pos, start_pos, ref_pos, alt_pos, assembly_pos = %w(Chromosome PositionVCF ReferenceAlleleVCF AlternateAlleleVCF Assembly).collect{|f| parser.fields.index f}
+
+    TSV.traverse parser, :into => dumper, :bar => true do |allele,values,fields|
+      chr, start, ref, alt, assembly = values.values_at chr_pos, start_pos, ref_pos, alt_pos, assembly_pos
+      next if assembly != "GRCh38"
+      pos, muts = Misc.correct_vcf_mutation(start.to_i, ref, alt)
+      res = muts.collect{|m| [[chr, pos, m] * ":", values] }
+
+      res.extend MultipleResult
+
+      res
+    end
+    dumper.stream
+  end
+
+
+  ClinVar.claim ClinVar.hg19.mi_summary, :proc do
+    require 'rbbt/workflow'
+    Workflow.require_workflow "Sequence"
+    variants = ClinVar.hg19.snv_summary.produce
+    muts = CMD.cmd('cut -f 1', :in => variants.open, :pipe => true)
+    consequence = Sequence.job(:mutated_isoforms_fast, "Clinvar", :mutations => muts, :non_synonymous => true, :organism => ClinVar.organism_hg19).clean.run(true)
+
+    options = TSV.parse_header(variants).options.merge({:key_field => "Mutated Isoform"})
+    fields = options[:fields].length
+    dumper = TSV::Dumper.new options
+    dumper.init
+    pasted = TSV.paste_streams([variants, TSV.get_stream(consequence)])
+    TSV.traverse pasted, :into => dumper, :bar => true do |mutation,values|
       begin
+        mis = values[fields..-1].flatten
+        next if mis.empty?
         res = []
-        if line =~ /^#/
-          parts = line.split("\t")
-          res << (["#Genomic Mutation"] + parts[1..12] + parts[15..23]) * "\t"
-        else
-          next unless line =~ /GRCh37/
-          next if line =~ /(copy number|NT expansion|duplication|indel)/
-          parts = line.split("\t")
-          chr,pos,ref,mut = parts.values_at 13, 14, 25, 26
-          next if ref == 'na' or mut == 'na'
-
-          pos, muts = Misc.correct_mutation(pos.to_i,ref,mut)
-          muts.each do |mut|
-            mutation = [chr,pos,mut] * ":"
-            res << ([mutation] + parts[1..12] + parts[15..23]) * "\t"
-          end
-        end
         res.extend MultipleResult
+        mis.each do |mi|
+          res << [mi, values[0..fields-1]]
+        end
         res
       rescue
         Log.exception $!
         raise $!
       end
     end
-    Misc.sort_stream(io)
+    dumper.stream
   end
-  ClinVar.claim ClinVar.mi_summary, :proc do
+  ClinVar.claim ClinVar.hg38.mi_summary, :proc do
     require 'rbbt/workflow'
     Workflow.require_workflow "Sequence"
-    variants = ClinVar.snv_summary.produce
+    variants = ClinVar.hg38.snv_summary.produce
     muts = CMD.cmd('cut -f 1', :in => variants.open, :pipe => true)
-    consequence = Sequence.job(:mutated_isoforms_fast, "Clinvar", :mutations => muts, :non_synonymous => true).clean.run(true)
+    consequence = Sequence.job(:mutated_isoforms_fast, "Clinvar", :mutations => muts, :non_synonymous => true, :organism => ClinVar.organism_hg38).clean.run(true)
     options = TSV.parse_header(variants).options.merge({:key_field => "Mutated Isoform"})
     fields = options[:fields].length
     dumper = TSV::Dumper.new options
     dumper.init
@@ -72,5 +113,12 @@
     end
     dumper.stream
   end
 end
+if __FILE__ == $0
+  Log.severity = 0
+  ClinVar.hg19.snv_summary.produce
+  ClinVar.hg19.mi_summary.produce(true)
+  ClinVar.hg38.snv_summary.produce
+  ClinVar.hg38.mi_summary.produce(true)
+end