require 'nokogiri'
require 'rbbt-util'

# Helpers to turn NCI PID XML dumps into pathway tables.
module NCI
  # Parses a PID XML document and collects, for every Pathway element, its
  # long name and the identifiers of the molecules that take part in the
  # interactions the pathway lists.
  #
  # xml            - String holding the XML document.
  # format         - value of the Name/@name_type attribute selecting which
  #                  molecule identifiers are collected (default "UP").
  # get_short_name - when true, a third column holding the pathway ShortName
  #                  is appended to every entry.
  #
  # Returns a Hash keyed by pathway id. Each value is
  #   [[long_name], [identifiers]]                 or, with get_short_name,
  #   [[long_name], [identifiers], [short_name]]
  # where `identifiers` is itself an Array; this extra level of nesting is the
  # shape callers have always received and is kept on purpose.
  #
  # Raises NoMethodError when a required attribute or child element (id,
  # LongName, ShortName, ...) is missing from the document.
  def self.get_pathways(xml, format = "UP", get_short_name = false)
    doc = Nokogiri::XML(xml)

    # molecule id => identifiers of the requested name type
    molecules = {}
    doc.xpath("//Molecule").each do |molecule|
      id = molecule.attribute('id').value
      names = molecule.xpath("Name[@name_type='#{format}']").collect do |name|
        name.attribute("value").value
      end
      molecules[id] = {:xml => molecule, :proteins => names}
    end

    # interaction id => molecules involved and pathways it references
    interactions = {}
    doc.xpath("//Interaction").each do |interaction|
      id = interaction.attribute('id').value
      molecule_ids = interaction.xpath('InteractionComponentList/InteractionComponent').collect do |c|
        c.attribute('molecule_idref').value
      end
      pathway_ids = interaction.xpath('Abstraction').collect do |c|
        c.attribute('pathway_idref').value
      end
      interactions[id] = {:xml => interaction, :molecule_ids => molecule_ids, :pathway_ids => pathway_ids}
    end

    pathways = {}
    # [pathway id, resolved interactions] in document order, reused by the
    # merge pass below so the XML is not queried a second time.
    pathway_members = []

    doc.xpath("//Pathway").each do |pathway|
      id = pathway.attribute('id').value
      name = pathway.xpath('LongName').first.content

      interaction_ids = pathway.xpath("PathwayComponentList/PathwayComponent").collect do |component|
        component.attribute("interaction_idref").value
      end

      # References to interactions that are not in the document are dropped
      # here; looking into a nil entry would otherwise raise.
      members = interaction_ids.collect { |i| interactions[i] }.compact
      pathway_members << [id, members]

      molecule_ids = members.collect { |info| info[:molecule_ids] }.flatten
      identifiers = molecule_ids.collect do |i|
        molecules[i][:proteins] if molecules.include?(i)
      end.flatten.compact.uniq

      entry = [[name], [identifiers]]
      entry << [pathway.xpath('ShortName').first.content] if get_short_name
      pathways[id] = entry
    end

    # Merge identifiers between pathways linked through Abstraction elements.
    # NOTE(review): this copies the identifiers of the referencing pathway
    # (id) into the referenced pathway (nid), as the code always did; confirm
    # that this is the intended direction.
    pathway_members.each do |id, members|
      referenced_ids = members.collect { |info| info[:pathway_ids] }.flatten

      referenced_ids.each do |nid|
        next unless pathways.include?(nid)

        # Column 1 is the identifier column whether or not a short-name
        # column follows it. Both sides are flattened so that uniq works on
        # individual identifiers, then re-wrapped to keep the entry shape.
        merged = (pathways[nid][1].flatten + pathways[id][1].flatten).uniq
        pathways[nid][1] = [merged]
      end
    end

    pathways
  end
end

# Builds the NCI-Nature curated pathway table (UniProt accessions).
# Presumably Open.read copes with the remote .gz file transparently -- confirm.
file 'nature_pathways' do |t|
  url = "ftp://ftp1.nci.nih.gov/pub/PID/XML/NCI-Nature_Curated.xml.gz"

  xml = Open.read(url)
  pathways = NCI.get_pathways(xml)

  Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Nature Pathway ID", :fields => ["Pathway Name","UniProt/SwissProt Accession"]).to_s)
end

# Builds the BioCarta pathway table using the "LL" name type (written out
# under the "Entrez Gene ID" field) and including the pathway short name.
file 'biocarta_pathways' do |t|
  url = "ftp://ftp1.nci.nih.gov/pub/PID/XML/BioCarta.xml.gz"

  xml = Open.read(url)
  pathways = NCI.get_pathways(xml, "LL", true)

  Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI BioCarta Pathway ID", :fields => ["Pathway Name", "Entrez Gene ID", "Pathway Short Name"]).to_s)
end

# Re-keys the BioCarta table by "h_" + short name and drops the short-name
# column.
file 'biocarta_pathways_fixed_ids' => 'biocarta_pathways' do |t|
  orig = TSV.open(Open.open(t.prerequisites.first))
  tsv = TSV.setup({}, :type => :double, :key_field => "BioCarta Pathway ID", :fields => ["Pathway Name", "Entrez Gene ID"])

  orig.through do |key, values|
    name, genes, short = values
    code = "h_" + short.first
    tsv[code] = [name, genes]
  end

  Open.write(t.name, tsv.to_s)
end

# Builds the Reactome pathway table (UniProt accessions).
file 'reactome_pathways' do |t|
  url = "ftp://ftp1.nci.nih.gov/pub/PID/XML/Reactome.xml.gz"

  xml = Open.read(url)
  pathways = NCI.get_pathways(xml, "UP")

  Open.write(t.name, TSV.setup(pathways, :type => :double, :key_field => "NCI Reactome Pathway ID", :fields => ["Pathway Name","UniProt/SwissProt Accession"]).to_s)
end