require 'cgi'
require 'nokogiri'
require 'http_client_patch/include_client'
require 'httpclient'

module BentoSearch
  # Supports fielded searching, sorting, pagination.
  #
  # Required configuration:
  # * api_key
  #
  # (A hedged configuration sketch appears in a comment at the end of this file.)
  #
  # Defaults to 'relevance' sort, rather than Scopus's default of date desc.
  #
  # Uses the Scopus SciVerse REST API. You need to be a Scopus customer
  # to access. http://api.elsevier.com
  # http://www.developers.elsevier.com/action/devprojects
  #
  # ToS: http://www.developers.elsevier.com/devcms/content-policies
  # "Federated Search" use case.
  # Also: http://www.developers.elsevier.com/cms/apiserviceagreement
  #
  # Note that the ToS applying to you probably means you must restrict access
  # to search functionality to authenticated affiliated users only.
  #
  # Register for an API key at "Register New Site" at http://developers.elsevier.com/action/devnewsite
  # You will then need to get server IP addresses registered with Scopus too,
  # apparently by emailing directly to dave.santucci at elsevier dot com.
  #
  # Scopus API Docs:
  # * http://www.developers.elsevier.com/devcms/content-api-search-request
  # * http://www.developers.elsevier.com/devcms/content/search-fields-overview
  #
  # Some more docs on response elements and query elements:
  # * http://api.elsevier.com/content/search/#d0n14606
  #
  # Other APIs in the suite not used by this code at present:
  # * http://www.developers.elsevier.com/devcms/content-api-retrieval-request
  # * http://www.developers.elsevier.com/devcms/content-api-metadata-request
  #
  # Support: Integration@scopus.com
  #
  # TODO: Mention to Scopus: Only one author?
  # Paging of 50 gets an error, but docs say we should be able to request 200.
  #
  # Scopus response does not seem to include language of hit, even though
  # the API allows you to restrict by language. Ask Scopus if we're missing something?
  class ScopusEngine
    include BentoSearch::SearchEngine

    extend HTTPClientPatch::IncludeClient
    include_http_client

    def search_implementation(args)
      results = Results.new

      xml, response, exception = nil, nil, nil
      url = scopus_url(args)
      begin
        response = http_client.get( url, nil,
          # HTTP headers.
          {"X-ELS-APIKey" => configuration.api_key,
           "X-ELS-ResourceVersion" => "XOCS",
           "Accept" => "application/atom+xml"}
        )
        xml = Nokogiri::XML(response.body)
      rescue TimeoutError, HTTPClient::ConfigurationError, HTTPClient::BadResponseError, Nokogiri::SyntaxError => e
        exception = e
      end

      # handle errors
      if (response.nil? || xml.nil? || exception ||
          (! HTTP::Status.successful? response.status) ||
          xml.at_xpath("service-error")
         )
        # UGH. Scopus reports 0 hits as an error, not entirely distinguishable
        # from an actual error. Oh well, we have to go with it.
        if (response &&
            (response.status == 400) &&
            xml &&
            (error_xml = xml.at_xpath("./service-error/status")) &&
            (node_text(error_xml.at_xpath("./statusCode")) == "INVALID_INPUT") &&
            (node_text(error_xml.at_xpath("./statusText")).starts_with? "Result set was empty")
           )
          # PROBABLY 0 hit count, although could be something else I'm afraid.
          results.total_items = 0
          return results
        else
          # real error
          results.error ||= {}
          results.error[:exception] = exception
          results.error[:status] = response.status if response
          # keep from storing the entire possibly huge response as error,
          # but sometimes it's an error message.
          results.error[:error_info] = xml.at_xpath("service-error") if xml
          return results
        end
      end

      results.total_items = (node_text xml.at_xpath("//opensearch:totalResults", xml_ns)).to_i

      xml.xpath("//atom:entry", xml_ns).each do | entry |
        results << (item = ResultItem.new)

        if (scopus_link = entry.at_xpath("atom:link[@ref='scopus']", xml_ns))
          item.link = scopus_link["href"]
        end

        item.title         = node_text entry.at_xpath("dc:title", xml_ns)
        item.journal_title = node_text entry.at_xpath("prism:publicationName", xml_ns)
        item.issn          = node_text entry.at_xpath("prism:issn", xml_ns)
        item.volume        = node_text entry.at_xpath("prism:volume", xml_ns)
        item.issue         = node_text entry.at_xpath("prism:issueIdentifier", xml_ns)
        item.doi           = node_text entry.at_xpath("prism:doi", xml_ns)

        # pages might be in startingPage/endingPage OR in pageRange
        if (start = entry.at_xpath("prism:startingPage", xml_ns))
          item.start_page = start.text.to_i
          if (epage = entry.at_xpath("prism:endingPage", xml_ns))
            item.end_page = epage.text.to_i
          end
        elsif (range = entry.at_xpath("prism:pageRange", xml_ns))
          (spage, epage) = *range.text().split("-")
          item.start_page = spage
          item.end_page   = epage
        end

        # get the year out of the date
        if (date = entry.at_xpath("prism:coverDate", xml_ns))
          date.text =~ /^(\d\d\d\d)/
          item.year = $1.to_i if $1
        end

        # Authors might be in atom:authors separated by |, or just
        # a single one in dc:creator
        if (authors = entry.at_xpath("atom:authors", xml_ns))
          authors.text.split("|").each do |author|
            item.authors << Author.new(:display => author.strip)
          end
        elsif (author = entry.at_xpath("dc:creator", xml_ns))
          item.authors << Author.new(:display => author.text.strip)
        end

        # Format: we're still trying to figure out how the Scopus API
        # delivers it. Here is at least one way.
        if (doctype = entry.at_xpath("atom:subtype", xml_ns))
          item.format     = doctype_to_format(doctype.text)
          item.format_str = doctype_to_string(doctype.text)
        end
      end

      return results
    end

    # The escaping rules are not entirely clear for the API. We know colons
    # and parens are special chars. It's unclear how or if we can escape them,
    # so we just remove them.
    def escape_query(query)
      # backslash escape doesn't seem to work
      #query.gsub(/([\\\(\)\:])/) do |match|
      #  "\\#{$1}"
      #end
      query.gsub(/([\\\(\)\:])/, ' ')
    end

    def self.required_configuration
      ["api_key"]
    end

    def self.default_configuration
      {
        :base_url => "http://api.elsevier.com/",
        :cluster  => "SCOPUS"
      }
    end

    # Max per-page is 200, as per http://www.developers.elsevier.com/devcms/content-apis, bottom of page.
    def max_per_page
      200
    end

    def search_field_definitions
      {
        nil     => {:semantic => :general},
        "AUTH"  => {:semantic => :author},
        "TITLE" => {:semantic => :title},
        # controlled and author-assigned keywords
        "KEY"   => {:semantic => :subject},
        "ISBN"  => {:semantic => :isbn},
        "ISSN"  => {:semantic => :issn},
      }
    end

    def sort_definitions
      # Scopus &sort= values, not yet URI-escaped; later code will do that.
      #
      # 'refeid' key is currently undocumented on the Scopus site, but
      # was given to me in email by Scopus.
      {
        "title_asc"     => {:implementation => "+itemtitle"},
        "date_desc"     => {:implementation => "-datesort,+auth"},
        "relevance"     => {:implementation => "refeid"},
        "author_asc"    => {:implementation => "+auth"},
        "num_cite_desc" => {:implementation => "-numcitedby"}
      }
    end

    protected

    # returns nil if passed in nil, otherwise
    # returns nokogiri text()
    def node_text(node)
      return nil if node.nil?
      return node.text()
    end

    def xml_ns
      {"opensearch" => "http://a9.com/-/spec/opensearch/1.1/",
       "prism"      => "http://prismstandard.org/namespaces/basic/2.0/",
       "dc"         => "http://purl.org/dc/elements/1.1/",
       "atom"       => "http://www.w3.org/2005/Atom"}
    end

    # Maps from Scopus "doctype" as listed at http://www.developers.elsevier.com/devcms/content/search-fields-overview
    # and delivered in the XML response as atom:subtype.
    # Maps to our own internal formats as documented in ResultItem#format
    # Returns nil if can't map.
    def doctype_to_format(doctype)
      { "ar" => "Article",
        "ip" => "Article",          # 'article in press'.
        "bk" => "Book",
        "bz" => "Article",
        "re" => "Article",          # really 'report', but Scopus is unreliable here; most of these are actually articles.
        "cp" => :conference_paper,
        "sh" => "Article",          # 'short survey' to Scopus, but seems to be used for articles.
        'ed' => "Article",          # Editorial
        'le' => "Article",          # Letter
        'no' => "Article",          # Note
      }[doctype.to_s]
    end

    # Maps Scopus doctype to human readable strings as documented by Scopus,
    # does not map 1-1 to our controlled format.
    def doctype_to_string(doctype)
      { "ar" => "Article",
        "ab" => "Abstract Report",
        "ip" => "Article in Press",
        "bk" => "Book",
        "bz" => "Business Article",
        "cp" => "Conference Paper",
        "cr" => "Conference Review",
        "ed" => "Editorial",
        "er" => "Erratum",
        "le" => "Letter",
        "no" => "Note",
        "pr" => "Press Release",
        "re" => "Article", # Really 'report', but Scopus is unreliable here; most of these are actually articles.
        "sh" => "Article"  # Really 'short survey' to Scopus, but seems to be used for, well, articles.
      }[doctype.to_s]
    end

    def scopus_url(args)
      query = escape_query args[:query]

      if args[:search_field]
        query = "#{args[:search_field]}(#{query})"
      end

      query = "#{configuration.base_url.chomp("/")}/content/search/index:#{configuration.cluster}?query=#{CGI.escape(query)}"

      query += "&count=#{args[:per_page]}" if args[:per_page]
      query += "&start=#{args[:start]}"    if args[:start]

      # default to 'relevance' sort if not given, rather than Scopus's
      # default of date desc.
      args[:sort] ||= "relevance"
      if (defn = self.sort_definitions[args[:sort]]) && (value = defn[:implementation])
        query += "&sort=#{CGI.escape(value)}"
      end

      return query
    end

  end
end
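
# A minimal configuration sketch, assuming the standard BentoSearch engine
# registration API. The "scopus" id, the api_key value, and the search options
# shown below are illustrative placeholders, not values required by this engine:
#
#   # in an initializer:
#   BentoSearch.register_engine("scopus") do |conf|
#     conf.engine  = "BentoSearch::ScopusEngine"
#     conf.api_key = "your-scopus-api-key"   # placeholder
#   end
#
#   # then, to run a fielded, sorted search:
#   results = BentoSearch.get_engine("scopus").search("global warming",
#     :search_field => "TITLE", :sort => "date_desc", :per_page => 20)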