require 'rbbt-util'
require 'libxml'
require 'rbbt/sources/gscholar'
require 'rbbt/util/filecache'

# This module offers an interface with PubMed to perform queries and
# retrieve simple information from articles. It uses the caching
# services of Rbbt.
module PubMed

  @@pubmed_lag = 1

  # Performs the specified query and returns an array with the PubMed
  # ids returned. +retmax+ can be used to limit the number of ids
  # returned; if it is not specified, 30000 is used.
  def self.query(query, retmax=nil)
    retmax ||= 30000

    Open.read("http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?retmax=#{retmax}&db=pubmed&term=#{query}", :quiet => true, :nocache => true).scan(/<Id>(\d+)<\/Id>/).flatten
  end

  # Processes the article XML as served by MedLine and extracts the
  # abstract, title and journal information
  class Article

    XML_KEYS = [
      [:title    , "ArticleTitle"],
      [:journal  , "Journal/Title"],
      [:issue    , "Journal/JournalIssue/Issue"],
      [:volume   , "Journal/JournalIssue/Volume"],
      [:issn     , "Journal/ISSN"],
      [:year     , "Journal/JournalIssue/PubDate/Year"],
      [:month    , "Journal/JournalIssue/PubDate/Month"],
      [:pages    , "Pagination/MedlinePgn"],
      [:author   , "AuthorList/Author"],
      [:abstract , "Abstract/AbstractText"],
    ]

    PMC_PDF_URL = "http://www.ncbi.nlm.nih.gov/pmc/articles/PMCID/pdf/"

    # Protects words with internal capitals (e.g. acronyms) from being
    # down-cased by BibTeX.
    def self.escape_title(title)
      title.gsub(/(\w*[A-Z][A-Z]+\w*)/, '{\1}')
    end

    # Builds a BibTeX citation key from the first author's last name,
    # the year, and an abbreviation of the title.
    def self.make_bibentry(lastname, year, title)
      words = title.downcase.scan(/\w+/)
      if words.first.length > 3
        abrev = words.first
      else
        abrev = words[0..2].collect{|w| w.chars.first } * ""
      end
      [lastname.gsub(/\s/, '_'), year || "NOYEAR", abrev] * ""
    end

    def self.parse_xml(xml)
      parser  = LibXML::XML::Parser.string(xml)
      pubmed  = parser.parse.find("/PubmedArticle").first
      medline = pubmed.find("MedlineCitation").first
      article = medline.find("Article").first

      info = {}

      info[:pmid] = medline.find("PMID").first.content

      XML_KEYS.each do |p|
        name, key = p
        node = article.find(key).first
        next if node.nil?

        info[name] = node.content
      end

      bibentry = nil
      info[:author] = article.find("AuthorList/Author").collect do |author|
        begin
          lastname = author.find("LastName").first.content
          if author.find("ForeName").first.nil?
            forename = nil
          else
            forename = author.find("ForeName").first.content.split(/\s/).collect{|word| word.length == 1 ? word + '.' : word } * " "
          end
          bibentry ||= make_bibentry lastname, info[:year], info[:title]
        rescue
        end
        [lastname, forename] * ", "
      end * " and "

      info[:bibentry] = bibentry.downcase if bibentry

      info[:pmc_pdf] = pubmed.find("PubmedData/ArticleIdList/ArticleId").select{|id| id[:IdType] == "pmc" }.first

      if info[:pmc_pdf]
        info[:pmc_pdf] = PMC_PDF_URL.sub(/PMCID/, info[:pmc_pdf].content)
      end

      info
    end

    attr_accessor :title, :abstract, :journal, :author, :pmid, :bibentry, :pmc_pdf, :gscholar_pdf, :pdf_url
    attr_accessor *XML_KEYS.collect{|p| p.first }

    def initialize(xml)
      if xml && ! xml.empty?
        info = PubMed::Article.parse_xml xml
        info.each do |key, value|
          self.send("#{ key }=", value)
        end
      end
    end

    # PDF location: the PubMed Central link when available, otherwise a
    # Google Scholar full-text lookup.
    def pdf_url
      return pmc_pdf if pmc_pdf
      @gscholar_pdf ||= begin
                          GoogleScholar::full_text_url title
                        rescue
                          Log.medium "GoogleScholar#full_text failed: #{title}"
                          sleep 0.1
                          nil
                        end
    end

    # Downloads the PDF and extracts its text with pdftotext.
    def full_text
      return nil if pdf_url.nil?

      text = nil
      TmpFile.with_file do |pdf|
        # Change user-agent, oh well...
        `wget --user-agent=firefox #{ pdf_url } -O #{ pdf } -t 3`
        TmpFile.with_file do |txt|
          `pdftotext #{ pdf } #{ txt }`
          text = Open.read(txt) if File.exists? txt
        end
      end

      Misc.fixutf8(text)
    end
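
    # Renders the article as a BibTeX entry. A minimal usage sketch,
    # assuming the article XML can be fetched and cached through the
    # module helpers below (the PMID is a placeholder and network access
    # is required):
    #
    #   article = PubMed.get_article("123456")
    #   puts article.bibtex if article
    #
    # The citation key is the +bibentry+ assembled in parse_xml from the
    # first author's last name, the year and the beginning of the title.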
    def bibtex
      keys = [:author] + XML_KEYS.collect{|p| p.first } - [:bibentry]
      bibtex = "@article{#{bibentry},\n"

      keys.each do |key|
        next if self.send(key).nil?

        case key
        when :title
          bibtex += "  title = { #{ PubMed::Article.escape_title title } },\n"
        when :issue
          bibtex += "  number = { #{ issue } },\n"
        else
          bibtex += "  #{ key } = { #{ self.send(key) } },\n"
        end
      end

      bibtex += "  fulltext = { #{ pdf_url } },\n" if pdf_url
      bibtex += "  pmid = { #{ pmid } }\n}"

      bibtex
    end

    # Join the text from title and abstract
    def text
      text = [title, abstract].join("\n")

      Misc.fixutf8(text)
    end
  end

  # Returns the Article object containing the information for the PubMed
  # id specified as an argument. If +pmid+ is an array instead of a single
  # identifier, it returns a hash with the Article object for each id.
  # It uses the Rbbt cache to save the articles' XML.
  #
  # NOTE: this first implementation keeps one cache file per article and
  # relies on a +get_online+ helper that is not defined in this file; it
  # is silently overridden by the batched implementation defined below.
  def self.get_article(pmid)
    if pmid.is_a? Array
      missing = []
      list = {}

      pmid.each{|p|
        filename = p.to_s + '.xml'
        if File.exists? FileCache.path(filename)
          list[p] = Article.new(Open.read(FileCache.path(filename)))
        else
          missing << p
        end
      }

      return list unless missing.any?

      articles = get_online(missing)
      articles.each{|p, xml|
        filename = p + '.xml'
        FileCache.add(filename, xml)
        list[p] = Article.new(xml)
      }

      return list
    else
      filename = pmid.to_s + '.xml'

      if File.exists? FileCache.path(filename)
        return Article.new(Open.read(FileCache.path(filename)))
      else
        xml = get_online(pmid)
        FileCache.add(filename, xml)

        return Article.new(xml)
      end
    end
  end

  def self.get_article(pmids)
    _array = Array === pmids
    pmids = [pmids] unless Array === pmids
    pmids = pmids.compact.collect{|id| id }

    result_files = FileCache.cache_online_elements(pmids, 'pubmed-{ID}.xml') do |ids|
      result = {}
      values = []
      chunks = Misc.divide(ids, (ids.length / 20) + 1)
      Log::ProgressBar.with_bar(chunks.length, :desc => "Downloading articles from PubMed") do |bar|
        chunks.each do |list|
          begin
            Misc.try3times do
              url = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"

              postdata = "db=pubmed&retmode=xml&id=#{list * ","}"
              # Fetch via GET with the ids in the query string; the POST
              # variant is kept commented out below.
              xml = TmpFile.with_file(postdata) do |postfile|
                #Open.read(url, :quiet => true, :nocache => true, :nice => @@pubmed_lag, :nice_key => "PubMed", "--post-file=" => postfile)
                Open.read(url + '?' + postdata, :quiet => true, :nocache => true, :nice => @@pubmed_lag, :nice_key => "PubMed", "--__post-file=" => postfile)
              end

              values += xml.scan(/(<PubmedArticle>.*?<\/PubmedArticle>)/smu).flatten
            end
          rescue Aborted
            raise $!
          rescue Exception
            Log.exception $!
          ensure
            bar.tick
          end
        end
      end

      values.each do |xml|
        pmid = xml.scan(/<PMID[^>]*?>(.*?)<\/PMID>/).flatten.first
        result[pmid] = xml
      end

      # Match ids with leading zeros to the PMIDs reported in the XML, and
      # record an empty string for ids that could not be retrieved.
      ids.each{|id| next if id.nil? or result[id]; fid = id.sub(/^0+/, ''); next unless result[fid]; result[id] = result[fid] }
      ids.each{|id| next if id.nil? or result[id]; result[id] = "" }

      result
    end

    articles = {}
    pmids.each do |id|
      next if id.nil? or result_files[id].nil?
      txt = Open.read(result_files[id])
      next if txt.empty?
      articles[id] = Article.new(txt)
    end

    if _array
      articles
    else
      articles.values.first
    end
  end
end
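
# A minimal end-to-end sketch, assuming this file is loadable as
# rbbt/sources/pubmed, that the NCBI eutils service is reachable, and that
# a writable Rbbt file cache is configured (the query term is arbitrary):
#
#   require 'rbbt/sources/pubmed'
#
#   pmids    = PubMed.query("p53", 5)        # => ["12345678", ...]
#   articles = PubMed.get_article(pmids)     # => {pmid => PubMed::Article}
#   articles.each do |pmid, article|
#     puts "#{pmid}: #{article.title}"
#   end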