# encoding: UTF-8
require 'nokogiri'
require 'httpclient'
require 'multi_json'
require 'http_client_patch/include_client'
#
# For EBSCO Discovery Service. You will need a license to use.
#
# == Required Configuration
#
# user_id, password: As given be EBSCO for access to EDS API (may be an admin account in ebscoadmin? Not sure).
# profile: As given by EBSCO, might be "edsapi"?
#
# == Highlighting
#
# EDS has a query-in-context highlighting feature. It is used by defualt, set
# config 'highlighting' to false to disable.
# If turned on, you may get tags
# in title and abstract output if it's on, marked html_safe.
#
# If highlighting is on, since the abstract will be marked html safe, the
# view layer won't be able to safely truncate it. In fact, it's very hard
# to do here too, but we do it anyway, by default to approx configuration
# truncate_highlighted num of chars (default 280). Set to nil if you don't
# want this.
#
# == Linking
#
# The link to record in EBSCO interface delivered as "PLink" will be listed
# as record main link.
#
# Any links listed under will be listed as other_links, using
# configured name provided by EBSCO for CustomLink.
#
# EDS Response does not have sufficient metadata for us to generate an OpenURL
# ourselves. However, in our testing, the first/only CustomLink was an
# an OpenURL. If configuration.assume_first_custom_link_openurl is
# true (as is default), it will be used to create an OpenURL link. However, in
# our testing, many records don't have this at all. **Note** Ask EBSCO support
# to configure your profile so OpenURLs are ALWAYS included for all records, not
# just records with no EBSCO fulltext, to ensure bento_search can get the
# openurl.
#
# As always, you can customize links and other_links with Item Decorators.
#
# == Technical Notes and Difficulties
#
# This API is enormously difficult to work with. Also the response is very odd
# to deal with and missing some key elements. We quite possibly got something
# wrong or non-optimal in this implementation, but we did our best.
#
# Auth issues may make this slow -- you need to spend a (not too speedy) HTTP
# request making a session for every new end-user -- as we have no way to keep
# track of end-users, we do it on every request in this implementation.
#
# Responses don't include much metadata -- we don't actually have journal title,
# volume, issue, etc. We probably _could_ parse it out of the OpenURL that's
# there depending on your profile configuration, but we're not right now.
# Instead we're using the chunk of user-displayable citation/reference it does
# give us (which is very difficult to parse into something usable already),
# and a custom Decorator to display that instead of normalized citation
# made from individual elements.
#
# EBSCO says they plan to improve some of these issues in a September 2012 release.
#
# Title and abstract data seems to be HTML with tags and character entities and
# escaped special chars. We're trusting it and passing it on as html_safe.
#
# Paging can only happen on even pages, with 'page' rather than 'start'. But
# you can pass in 'start' to bento_search, it'll be converted to closest page.
#
# == Authenticated Users
#
# EDS allows searches by unauthenticated users, but the results come back with
# weird blank hits. In such a case, the BentoSearch adapter will return
# records with virtually no metadata, but a title e
# (I18n at bento_search.eds.record_not_available ). Also no abstracts
# are available from unauth search.
#
# By default the engine will search as 'guest' unauth user. But config
# 'auth' key to true to force all searches to auth (if you are protecting your
# app) or pass :auth => true as param into #search method.
#
# == EDS docs:
#
# * Console App to demo requests: https://eds-api.ebscohost.com/Console
# * EDS Wiki: http://edswiki.ebscohost.com/EDS_API_Documentation
# * You'll need to request an account to the EDS wiki, see: http://support.ebsco.com/knowledge_base/detail.php?id=5990
#
class BentoSearch::EdsEngine
include BentoSearch::SearchEngine
extend HTTPClientPatch::IncludeClient
include_http_client
AuthHeader = "x-authenticationToken"
SessionTokenHeader = "x-sessionToken"
@@remembered_auth = nil
@@remembered_auth_lock = Mutex.new
# Class variable to save current known good auth
# uses a mutex to be threadsafe. sigh.
def self.remembered_auth
@@remembered_auth_lock.synchronize do
@@remembered_auth
end
end
# Set class variable with current known good auth.
# uses a mutex to be threadsafe.
def self.remembered_auth=(token)
@@remembered_auth_lock.synchronize do
@@remembered_auth = token
end
end
# an object that includes some Rails helper modules for
# text handling.
def helper
unless @helper
@helper = Object.new
@helper.extend ActionView::Helpers::TextHelper # for truncate
@helper.extend ActionView::Helpers::OutputSafetyHelper # for safe_join
end
return @helper
end
def self.required_configuration
%w{user_id password profile}
end
# From config or args, args over-ride config
def authenticated_end_user?(args)
config = configuration.auth ? true : false
arg = args[:auth]
if ! arg.nil?
arg ? true : false
elsif ! config.nil?
config ? true : false
else
false
end
end
def construct_search_url(args)
query = "AND,"
if args[:search_field]
query += "#{args[:search_field]}:"
end
# Can't have any commas in query, it turns out, although
# this is not documented.
query += args[:query].gsub("/\,/", "")
url = "#{configuration.base_url}search?view=detailed&query=#{CGI.escape query}"
url += "&searchmode=#{CGI.escape configuration.search_mode}"
url += "&highlight=#{configuration.highlighting ? 'y' : 'n' }"
if args[:per_page]
url += "&resultsperpage=#{args[:per_page]}"
end
if args[:page]
url += "&pagenumber=#{args[:page]}"
end
if args[:sort]
if (defn = self.sort_definitions[args[:sort]]) &&
(value = defn[:implementation] )
url += "&sort=#{CGI.escape value}"
end
end
return url
end
def search_implementation(args)
results = BentoSearch::Results.new
end_user_auth = authenticated_end_user? args
begin
with_session(end_user_auth) do |session_token|
url = construct_search_url(args)
response = get_with_auth(url, session_token)
results = BentoSearch::Results.new
if (hits_node = at_xpath_text(response, "./SearchResponseMessageGet/SearchResult/Statistics/TotalHits"))
results.total_items = hits_node.to_i
end
response.xpath("./SearchResponseMessageGet/SearchResult/Data/Records/Record").each do |record_xml|
item = BentoSearch::ResultItem.new
item.title = prepare_eds_payload( element_by_group(record_xml, "Ti"), true )
if item.title.nil? && ! end_user_auth
item.title = I18n.translate("bento_search.eds.record_not_available")
end
item.abstract = prepare_eds_payload( element_by_group(record_xml, "Ab"), true )
# Believe it or not, the authors are encoded as an escaped
# XML-ish payload, that we need to parse again and get the
# actual authors out of. WTF. Thanks for handling fragments
# nokogiri.
author_mess = element_by_group(record_xml, "Au")
author_xml = Nokogiri::XML::fragment(author_mess)
author_xml.xpath(".//searchLink").each do |author_node|
item.authors << BentoSearch::Author.new(:display => author_node.text)
end
# PLink is main inward facing EBSCO link, put it as
# main link.
if direct_link = record_xml.at_xpath("./PLink")
item.link = direct_link.text
end
# Other links may be found in CustomLinks, it seems like usually
# there will be at least one, hopefully the first one is the OpenURL?
record_xml.xpath("./CustomLinks/CustomLink").each do |custom_link|
item.other_links << BentoSearch::Link.new(
:url => custom_link.at_xpath("./Url").text,
:label => custom_link.at_xpath("./Name").text
)
end
if (configuration.assume_first_custom_link_openurl &&
(first = record_xml.xpath "./CustomLinks/CustomLink" ) &&
(node = first.at_xpath "./Url" )
)
openurl = node.text
index = openurl.index('?')
item.openurl_kev_co = openurl.slice index..(openurl.length) if index
end
# Format.
item.format_str = at_xpath_text record_xml, "./Header/PubType"
# Can't find a list of possible PubTypes to see what's there to try
# and map to our internal controlled vocab. oh wells.
# We have a single blob of human-readable citation, that's also
# littered with XML-ish tags we need to deal with. We'll save
# it in a custom location, and use a custom Decorator to display
# it. Sorry it's way too hard for us to preserve
# tags in this mess, they will be lost. Probably don't
# need highlighting in source anyhow.
citation_mess = element_by_group(record_xml, "Src")
citation_txt = Nokogiri::XML::fragment(citation_mess).text
# But strip off some "count of references" often on the end
# which are confusing and useless.
item.custom_data["citation_blob"] = citation_txt.gsub(/ref +\d+ +ref\.$/, '')
item.extend CitationMessDecorator
results << item
end
end
return results
rescue EdsCommException => e
results.error ||= {}
results.error[:exception] = e
results.error[:http_status] = e.http_status
results.error[:http_body] = e.http_body
return results
end
end
# Difficult to get individual elements out of an EDS XML
# response, requires weird xpath, so we do it for you.
# element_by_group(nokogiri_element, "Ti")
#
# Returns string or nil
def element_by_group(noko, group)
at_xpath_text(noko, "./Items/Item[child::Group[text()='#{group}']]/Data")
end
# Wraps calls to the EDS api with CreateSession and EndSession requests
# to EDS. Will pass sessionID in yield from block.
#
# Second optional arg is whether this is an authenticated user, else
# guest access will be used.
#
# with_session(true) do |session_token|
# # can make more requests using session_token,
# # EndSession will be called for you at end of block.
# end
def with_session(auth = false, &block)
auth_token = self.class.remembered_auth
if auth_token.nil?
auth_token = self.class.remembered_auth = get_auth_token
end
create_url = "#{configuration.base_url}createsession?profile=#{configuration.profile}&guest=#{auth ? 'n' : 'y'}"
response_xml = get_with_auth(create_url)
session_token = nil
unless response_xml && (session_token = at_xpath_text(response_xml, "//SessionToken"))
e = EdsCommException.new("Could not get SessionToken")
end
begin
block.yield(session_token)
ensure
if auth_token && session_token
end_url = "#{configuration.base_url}endsession?sessiontoken=#{CGI.escape session_token}"
response_xml = get_with_auth(end_url)
end
end
end
# if the xpath responds, return #text of it, else nil.
def at_xpath_text(noko, xpath)
node = noko.at_xpath(xpath)
if node.nil?
return node
else
return node.text
end
end
# If EDS has put highlighting tags
# in a field, we need to HTML escape the literal values,
# while still using the highlighting tokens to put
# HTML tags around highlighted terms.
#
# Second param, if to assume EDS literals are safe HTML, as they
# seem to be.
def prepare_eds_payload(str, html_safe = false)
return str if str.blank?
unless configuration.highlighting
str = str.html_safe if html_safe
return str
end
parts =
str.split(%r{(?highlight>)}).collect do |substr|
case substr
when "" then "".html_safe
when "" then "".html_safe
# Yes, EDS gives us HTML in the literals, we're choosing to trust it.
else substr.html_safe
end
end
# Crazy ass method to truncate without getting in the middle of our
# html tags. This is wacky hacky, yeah.
if configuration.truncate_highlighted
remainingLength = configuration.truncate_highlighted
in_tag = false
elipses_added = false
truncated_parts = []
parts.each do |substr|
if remainingLength <=0 && ! in_tag
truncated_parts << "..."
break
end
if substr =~ /^$/
truncated_parts << substr
in_tag = true
elsif substr == ""
truncated_parts << substr
in_tag = false
elsif ((remainingLength - substr.length) > 0) || in_tag
truncated_parts << substr
else
truncated_parts << helper.truncate(substr, :length => remainingLength, :separator => ' ')
break
end
remainingLength = remainingLength - substr.length
end
parts = truncated_parts
end
return helper.safe_join(parts, '')
end
# Give it a url pointing at EDS API.
# Second arg must be a session_token if EDS request requires one.
# It will
# * Make a GET request
# * with memo-ized auth token added to headers
# * for XML, with all namespaces removed!
# * Parse JSON into a hash and return hash
# * Try ONCE more to get if EBSCO says bad auth token
# * Raise an EdsCommException if can't auth after second try,
# or other error message, or JSON can't be parsed.
def get_with_auth(url, session_token = nil)
auth_token = self.class.remembered_auth
unless auth_token
auth_token = self.class.remembered_auth = get_auth_token
end
response = nil
response_xml = nil
caught_exception = nil
begin
headers = {AuthHeader => auth_token, 'Accept' => 'application/xml'}
headers[SessionTokenHeader] = session_token if session_token
s_time = Time.now
response = http_client.get(url, nil, headers)
Rails.logger.debug("EDS timing GET: #{Time.now - s_time}:#{url}")
response_xml = Nokogiri::XML(response.body)
response_xml.remove_namespaces!
if (at_xpath_text(response_xml, "//ErrorNumber") == "104") || (at_xpath_text(response_xml, "//ErrorDescription") == "Auth Token Invalid")
# bad auth, try again just ONCE
Rails.logger.debug("EDS auth failed, getting auth again")
headers[AuthHeader] = self.class.remembered_auth = get_auth_token
response = http_client.get(url, nil, headers)
response_xml = Nokogiri::XML(response.body)
response_xml.remove_namespaces!
end
rescue TimeoutError, HTTPClient::ConfigurationError, HTTPClient::BadResponseError, Nokogiri::SyntaxError => e
caught_exception = e
end
if response.nil? || response_xml.nil? || caught_exception || (! HTTP::Status.successful? response.status)
exception = EdsCommException.new("Error fetching URL: #{caught_exception.message if caught_exception} : #{url}")
if response
exception.http_body = response.body
exception.http_status = response.status
end
raise exception
end
return response_xml
end
# Has to make an HTTP request to get EBSCO's auth token.
# returns the auth token. We aren't bothering to keep
# track of the expiration ourselves, can't neccesarily trust
# it anyway.
#
# Raises an EdsCommException on error.
def get_auth_token
# Can't send params as form-encoded, actually need to send a JSON or XML
# body, argh.
body = <<-EOS
{
"UserId":"#{configuration.user_id}",
"Password":"#{configuration.password}"
}
EOS
s_time = Time.now
response = http_client.post(configuration.auth_url, body, {'Accept' => "application/json", "Content-type" => "application/json"})
Rails.logger.debug("EDS timing AUTH: #{Time.now - s_time}s")
unless HTTP::Status.successful? response.status
raise EdsCommException.new("Could not get auth", response.status, response.body)
end
response_hash = nil
begin
response_hash = MultiJson.load response.body
rescue MultiJson::DecodeError
end
unless response_hash.kind_of?(Hash) && response_hash.has_key?("AuthToken")
raise EdsCommException.new("AuthToken not found in auth response", response.status, response.body)
end
return response_hash["AuthToken"]
end
def self.default_configuration
{
:auth_url => 'https://eds-api.ebscohost.com/authservice/rest/uidauth',
:base_url => "http://eds-api.ebscohost.com/edsapi/rest/",
:highlighting => true,
:truncate_highlighted => 280,
:assume_first_custom_link_openurl => true,
:search_mode => 'all' # any | bool | all | smart ; http://support.epnet.com/knowledge_base/detail.php?topic=996&id=1288&page=1
}
end
def sort_definitions
{
"date_desc" => {:implementation => "date"},
"relevance" => {:implementation => "relevance" }
# "date_asc" => {:implementaiton => "date2"}
}
end
def search_field_definitions
{
"TX" => {:semantic => :all},
"AU" => {:semantic => :author},
"TI" => {:semantic => :title},
"SU" => {:semantic => :subject},
"SO" => {}, # source, journal name
"AB" => {}, # abstract
"IS" => {:semantic => :issn},
"IB" => {:semantic => :isbn},
}
end
# an exception talking to EDS api.
# there's a short reason in #message, but also
# possibly an http_status and http_body copied
# from error EDS response.
class EdsCommException < Exception
attr_accessor :http_status, :http_body
def initialize(message, status = nil, body = nil)
super(message)
self.http_status = status
self.http_body = body
end
end
# A built-in decorator alwasy applied, that over-rides
# the ResultItem#published_in display method to use our mess blob
# of human readable citation, since we don't have individual elements
# to create it from in a normalized way.
module CitationMessDecorator
def published_in
custom_data["citation_blob"]
end
end
end