require 'nokogiri'

# Extraction of pathway membership from NCI PID XML documents.
module NCI
  # Parses a PID XML document and lists, for every Pathway element, the
  # identifiers of the protein molecules taking part in its interactions.
  #
  # @param xml [String, IO] XML content, handed straight to Nokogiri::XML
  # @param format [String] value of the Name/@name_type attribute whose
  #   "value" is used as the molecule identifier (the tasks in this file
  #   pass "UP" and "LL")
  # @return [Hash] pathway id => [[long name], [identifiers]]. The
  #   identifier list is deliberately kept wrapped in one extra array level,
  #   exactly as before.
  #   NOTE(review): confirm whether the consumer of this hash really wants
  #   the nested list or a flat one.
  # @raise [NoMethodError] when a Pathway has no LongName child, or when an
  #   element lacks one of the attributes read through attribute(...).value
  def self.get_pathways(xml, format = "UP")
    doc = Nokogiri::XML(xml)
    identifiers = protein_identifiers(doc, format)
    components = interaction_molecule_ids(doc)

    pathways = {}
    doc.xpath("//Pathway").each do |pathway|
      id = pathway.attribute('id').value
      name = pathway.xpath('LongName').first.content

      interaction_ids = pathway.xpath("*/PathwayComponent").map do |component|
        component.attribute("interaction_idref").value
      end

      # Block parameters get their own names so the pathway `id` used as the
      # result key can never be shadowed or overwritten.
      # Unknown interaction references yield nil and are dropped instead of
      # crashing; unknown or non-protein molecules are dropped the same way.
      molecule_ids = interaction_ids.map { |interaction_id| components[interaction_id] }.compact.flatten
      pathway_identifiers = molecule_ids.map { |molecule_id| identifiers[molecule_id] }.compact.uniq

      pathways[id] = [[name], [pathway_identifiers]]
    end
    pathways
  end

  # Maps molecule id => first identifier of the requested name type, for
  # Molecule elements whose molecule_type is "protein" and that carry at
  # least one matching Name child.
  def self.protein_identifiers(doc, format)
    identifiers = {}
    doc.xpath("//Molecule").each do |molecule|
      id = molecule.attribute('id').value
      next unless molecule.attribute('molecule_type').value == "protein"

      name = molecule.xpath("Name[@name_type='#{format}']").first
      next if name.nil?

      identifiers[id] = name.attribute("value").value
    end
    identifiers
  end

  # Maps interaction id => molecule ids referenced by its
  # InteractionComponent grandchildren.
  def self.interaction_molecule_ids(doc)
    components = {}
    doc.xpath("//Interaction").each do |interaction|
      id = interaction.attribute('id').value
      components[id] = interaction.xpath('*/InteractionComponent').map do |component|
        component.attribute('molecule_idref').value
      end
    end
    components
  end

  private_class_method :protein_identifiers, :interaction_molecule_ids
end

# One row per generated file:
#   target file, source URL, identifier name_type, TSV key field, TSV identifier field.
# `file`, `Open` and `TSV` are not defined in this file; they are expected to
# be supplied by the environment that loads it.
[
  ['nature_pathways', "ftp://ftp1.nci.nih.gov/pub/PID/XML/NCI-Nature_Curated.xml.gz", "UP",
   "NCI Nature Pathway ID", "UniProt/SwissProt Accession"],
  ['biocarta_pathways', "ftp://ftp1.nci.nih.gov/pub/PID/XML/BioCarta.xml.gz", "LL",
   "NCI BioCarta Pathway ID", "Entrez Gene ID"],
  ['reactome_pathways', "ftp://ftp1.nci.nih.gov/pub/PID/XML/Reactome.xml.gz", "UP",
   "NCI Reactome Pathway ID", "UniProt/SwissProt Accession"]
].each do |target, url, format, key_field, identifier_field|
  file target do |t|
    xml = Open.read(url)
    pathways = NCI.get_pathways(xml, format)
    tsv = TSV.setup(pathways, :type => :double, :key_field => key_field,
                              :fields => ["Pathway Name", identifier_field])
    Open.write(t.name, tsv.to_s)
  end
end