# NOTE: In your SFX Admin, under Menu Configuration / API, you should enable ALL
# 'extra' API information for full umlaut functionality.
# With the exception of "Include openURL parameter", can't figure out how
# that's useful.
#
# config parameters in services.yml
# display name: User displayable name for this service
# base_url: SFX base url.
# click_passthrough: DEPRECATED. Caused problems. Use the SFXBackchannelRecord
# link filter service instead.
# When set to true, Umlaut will send all SFX clicks
# through SFX, for SFX to capture statistics. This is currently done
# using a backdoor into the SFX sfxresolve.cgi script. Defaults to false.
# Note that
# after SFX requests have been removed in the nightly job, the
# click passthrough will cause an error! Set sfx_requests_expire_crontab
# with the crontab pattern you use for requests to expire, and we won't
# try to click passthrough with expired requests.
# sfx_requests_expire_crontab: Crontab pattern that the SFX admin is using
# to expire SFX requests. Used to refrain from click passthrough with
# expired requests, since that is destined to fail.
# services_of_interest: Optional. over-ride the built in list of what types of
# SFX services we want to grab, and what the corresponding umlaut types are.
# hash, with SFX service type name as key, and Umlaut ServiceTypeValue
# name as value.
# extra_targets_of_interest: sfx target_names of targets you want to make
# sure to include in umlaut. A hash with target_name as key, and umlaut
# ResponseTypeValue name as value.
# really_distant_relationships: An array of relationship type codes from SFX
# "related objects". See Table 18 in SFX 3.0 User's Manual. Related
# objects that have only a "really distant relationship" will NOT
# be shown as fulltext, but will instead be banished to the see also
# "highlighted_link" section. You must have display of related objects
# turned ON in SFX display admin, to get related objects at all in
# Umlaut. NOTE: This parameter has a default value to a certain set of
# relationships, set to empty array [] to eliminate defaults.
# sfx_timeout: in seconds, for both open/read timeout value for SFX connection.
# Defaults to 8.
# roll_up_prefixes: ARRAY of STRINGs, prefixes like "EBSCOHOST_". If multiple
# targets sharing one of the specified prefixes are supplied from SFX,
# they will be "rolled up" and collapsed, just the first one included
# in response. For TITLE-LEVEL (rather than article-level) requests,
# the roll-up algorithm is sensitive to COVERAGES, and will only suppress
# targets that have coverages included in remaining non-suppressed targets.
class Sfx < Service
require 'uri'
require 'htmlentities'
require 'cgi'
require 'nokogiri'
require 'date'
#require 'open_url'
required_config_params :base_url
def initialize(config)
# Key is sfx service_type, value is umlaut servicetype string.
# These are the SFX service types we will translate to umlaut
@services_of_interest = {'getFullTxt' => 'fulltext',
'getSelectedFullTxt' => 'fulltext',
'getDocumentDelivery' => 'document_delivery',
'getDOI' => 'highlighted_link',
'getAbstract' => 'abstract',
'getTOC' => 'table_of_contents'}
# Special targets. Key is SFX target_name.
# Value is umlaut service type.
# These targets will be included even if their sfx service_type doesn't
# match our services_of_interest, and the umlaut service ID string given
# here will take precedence and be used even if these targets DO match
# services_of_interest. Generally loaded from yml config in super.
@extra_targets_of_interest = {}
@sfx_timeout = 8
@really_distant_relationships = ["CONTINUES_IN_PART", "CONTINUED_IN_PART_BY", "ABSORBED_IN_PART", "ABSORBED_BY"]
# Include a CrossRef credit, becuase SFX uses CrossRef api internally,
# and CrossRef ToS may require us to give credit.
@credits = {
"SFX" => "http://www.exlibrisgroup.com/sfx.htm",
"CrossRef" => "http://www.crossref.org/"
}
super(config)
end
# Standard method, used by auto background updater. See Service docs.
def service_types_generated
service_strings = []
service_strings.concat( @services_of_interest.values() )
service_strings.concat( @extra_targets_of_interest.values() )
service_strings.uniq!
return service_strings.collect { |s| ServiceTypeValue[s] }
end
def base_url
return @base_url
end
def handle(request)
client = self.initialize_client(request)
begin
response = self.do_request(client)
self.parse_response(response, request)
return request.dispatched(self, true)
rescue Errno::ETIMEDOUT, Timeout::Error => e
# Request to SFX timed out. Record this as unsuccessful in the dispatch table. Temporary.
return request.dispatched(self, DispatchedService::FailedTemporary, e)
end
end
def initialize_client(request)
transport = OpenURL::Transport.new(@base_url, nil, :open_timeout => @sfx_timeout, :read_timeout => @sfx_timeout)
context_object = request.to_context_object
## SFX HACK WORKAROUND
# SFX will parse private_data/pid/rft_dat containing ERIC, when sid/rfr_id
# is CSA. But it only expects an OpenURL 0.1 to do this. We send it a
# 1.0. To get it to recognize it anyway, we need to send it a blank
# url_ver/ctx_ver
if ( context_object.referrer.identifiers.find {|i| i.start_with? "info:sid/CSA"} &&
context_object.referent.private_data != nil)
context_object.openurl_ver = ""
end
transport.add_context_object(context_object)
transport.extra_args["sfx.response_type"]="multi_obj_xml"
@get_coverage = false
metadata = request.referent.metadata
if ( metadata['date'].blank? &&
metadata['year'].blank? &&
(! request.referent.identifiers.find {|i| i =~ /^info\:(doi|pmid)/})
)
# No article-level metadata, do some special stuff.
transport.extra_args["sfx.ignore_date_threshold"]="1"
transport.extra_args["sfx.show_availability"]="1"
@get_coverage = true
end
# Workaround to SFX bug, not sure if this is really still neccesary
# I think it's not, but leave it in anyway just in case.
if (context_object.referent.identifiers.find {|i| i =~ /^info:doi\// })
transport.extra_args['sfx.doi_url']='http://dx.doi.org'
end
return transport
end
def do_request(client)
client.transport_inline
return client.response
end
def parse_response(resolver_response, request)
doc = Nokogiri::XML(resolver_response)
# Catch an SFX error message (in HTML) that's not an XML
# document at all.
unless doc.at('/ctx_obj_set')
Rails.logger.error("sfx.rb: SFX did not return expected response. SFX response: #{resolver_response}")
raise "SFX did not return expected response."
end
# There can be several context objects in the response.
# We need to keep track of which data comes from which, for
# SFX click-through generating et alia
sfx_objs = doc.search('/ctx_obj_set/ctx_obj')
# As we go through the possibly multiple SFX context objects,
# we need to keep track of which one, if any, we want to use
# to enhance the Umlaut referent metadata.
#
# We only enhance for journal type metadata. For book type
# metadata SFX will return something, but it may not be the manifestation
# we want. With journal titles, less of an issue.
#
# In case of multiple SFX hits, enhance metadata only from the
# one that actually had fulltext. If more than one had fulltext, forget it,
# too error prone. If none had full text, just pick the first.
#
# We'll use these variables to keep track of our 'best fit' as
# we loop through em.
best_fulltext_ctx = nil
best_nofulltext_ctx = nil
# We're going to keep our @really_distant_relationship stuff here.
related_titles = {}
# We organize our responses in a queue, so we can process them
# for collapse function, before actually writing to db.
response_queue ||= {}
0.upto(sfx_objs.length - 1 ) do |sfx_obj_index|
sfx_obj = sfx_objs[sfx_obj_index]
# Get out the "perl_data" section, with our actual OpenURL style
# context object information. This was XML escaped as a String (actually
# double-escaped, weirdly), so
# we need to extract the string, unescape it, and then feed it to Nokogiri
# again.
ctx_obj_atts = sfx_obj.at('./ctx_obj_attributes').inner_text
perl_data = Nokogiri::XML( ctx_obj_atts )
# parse it into an OpenURL, we might need it like that.
sfx_co = Sfx.parse_perl_data(perl_data)
sfx_metadata = sfx_co.to_hash
# get SFX objectID
object_id_node =
perl_data.at("./perldata/hash/item[@key='rft.object_id']")
object_id = object_id_node ? object_id_node.inner_text : nil
# Get SFX requestID
request_id_node =
perl_data.at("./perldata/hash/item[@key='sfx.request_id']")
request_id = request_id_node ? request_id_node.inner_text : nil
# Get targets service ids
sfx_target_service_ids =
sfx_obj.search('ctx_obj_targets/target/target_service_id').collect {|e| e.inner_text}
metadata = request.referent.metadata
# For each target delivered by SFX
sfx_obj.search("./ctx_obj_targets/target").each_with_index do|target, target_index|
response_data = {}
# First check @extra_targets_of_interest
sfx_target_name = target.at('./target_name').inner_text
umlaut_service = @extra_targets_of_interest[sfx_target_name]
# If not found, look for it in services_of_interest
unless ( umlaut_service )
sfx_service_type = target.at("./service_type").inner_text
umlaut_service = @services_of_interest[sfx_service_type]
end
# If we have multiple context objs, skip the ill and ask-a-librarian
# links for all but the first, to avoid dups. This is a bit messy,
# but this whole multiple hits thing is messy.
if ( sfx_obj_index > 0 &&
( umlaut_service == 'document_delivery' ||
umlaut_service == 'export_citation' ||
umlaut_service == 'help'))
next
end
# Okay, keep track of best fit ctx for metadata enhancement
if request.referent.format == "journal"
if ( umlaut_service == 'fulltext')
best_fulltext_ctx = perl_data
best_nofulltext_ctx = nil
elsif best_nofulltext_ctx == nil
best_nofulltext_ctx = perl_data
end
end
if ( umlaut_service ) # Okay, it's in services or targets of interest
source = @display_name || "SFX"
target_service_id = (target/"./target_service_id").inner_text
coverage = nil
if ( @get_coverage )
# Make sure you turn on "Include availability info in text format"
# in the SFX Admin API configuration.
thresholds_str = ""
target.search('coverage/coverage_text/threshold_text/coverage_statement').each do | threshold |
thresholds_str += threshold.inner_text.to_s + ".\n";
end
embargoes_str = "";
target.search('coverage/coverage_text/embargo_text/embargo_statement').each do |embargo |
embargoes_str += embargo.inner_text.to_s + ".\n";
end
unless ( thresholds_str.blank? && embargoes_str.blank? )
coverage = thresholds_str + embargoes_str
end
end
related_note = ""
# If this is from a related object, add that on as a note too...
# And maybe skip this entirely!
if (related_node = target.at('./related_service_info'))
relationship = related_node.at('./relation_type').inner_text
issn = related_node.at('./related_object_issn').inner_text
sfx_object_id = related_node.at('./related_object_id').inner_text
title = related_node.at('./related_object_title').inner_text
if @really_distant_relationships.include?(
related_node.at('./relation_type').inner_text)
# Show title-level link in see-also instead of full text.
related_titles[issn] = {
:sfx_object_id => sfx_object_id,
:title => title,
:relationship => relationship,
:issn => issn
}
next
end
related_note = "This version provided from related title: " + CGI.unescapeHTML( title ) + ".\n"
end
if ( sfx_service_type == 'getDocumentDelivery' )
value_string = request_id
else
value_string = (target/"./target_service_id").inner_text
end
response_data[:url] = CGI.unescapeHTML((target/"./target_url").inner_text)
response_data[:notes] = related_note.to_s + CGI.unescapeHTML((target/"./note").inner_text)
# Set the proxy boolean
response_data[:proxy] = CGI.unescapeHTML((target/"./proxy").inner_text).eql?("yes")
response_data[:authentication] = CGI.unescapeHTML((target/"./authentication").inner_text)
response_data[:source] = source
response_data[:coverage] = coverage if coverage
# machine actionable coverage elements, used for collapsing
( response_data[:coverage_begin_date], response_data[:coverage_end_date] ) =
determine_coverage_boundaries(target)
# Sfx metadata we want
response_data[:sfx_base_url] = @base_url
response_data[:sfx_obj_index] = sfx_obj_index + 1 # sfx is 1 indexed
response_data[:sfx_target_index] = target_index + 1
# sometimes the sfx.request_id is missing, go figure.
if request_id = (perl_data/"//hash/item[@key='sfx.request_id']").first
response_data[:sfx_request_id] = request_id.inner_text
end
response_data[:sfx_target_service_id] = target_service_id
response_data[:sfx_target_name] = sfx_target_name
# At url-generation time, the request isn't available to us anymore,
# so we better store this citation info here now, since we need it
# for sfx click passthrough
# Oops, need to take this from SFX delivered metadata.
response_data[:citation_year] = sfx_metadata['rft.date'].to_s[0,4] if sfx_metadata['rft.date']
response_data[:citation_volume] = sfx_metadata['rft.volume'];
response_data[:citation_issue] = sfx_metadata['rft.issue']
response_data[:citation_spage] = sfx_metadata['rft.spage']
# Some debug info
response_data[:debug_info] =" Target: #{sfx_target_name} ; SFX object ID: #{object_id}"
response_data[:display_text] = (target/"./target_public_name").inner_text
response_data.merge!(
:service => self,
:service_type_value => umlaut_service
)
#request.add_service_response( response_data )
# We add the response_data to a list for now, so we can post-process
# for collapse feature before we actually add them.
response_queue[umlaut_service] ||= []
response_queue[umlaut_service] << response_data
end
end
end
if response_queue["fulltext"].present?
response_queue["fulltext"] = roll_up_responses(response_queue["fulltext"], :coverage_sensitive => request.title_level_citation? )
end
# Now that they've been post-processed, actually commit them.
response_queue.each_pair do |type, list|
list.each do |response|
request.add_service_response( response )
end
end
# Add in links to our related titles
related_titles.each_pair do |issn, hash|
request.add_service_response(
:service => self,
:display_text => "#{sfx_relationship_display(hash[:relationship])}: #{hash[:title]}",
:notes => "#{ServiceTypeValue['fulltext'].display_name} available",
:related_object_hash => hash,
:service_type_value => "highlighted_link")
end
# Did we find a ctx best fit for enhancement?
if best_fulltext_ctx
enhance_referent(request, best_fulltext_ctx)
elsif best_nofulltext_ctx
enhance_referent(request, best_nofulltext_ctx)
end
end
# pass in a nokogiri element for , we'll calculate
# ruby Date objects for begin date and end date of coverage,
# passed out as a two-element array [begin, end].
#
# taking embargoes into account. nil if unbounded.
def determine_coverage_boundaries(target)
# machine actionable coverage elements, used for collapsing
if (in_node = target.at_xpath("./coverage/in"))
year = in_node.at_xpath("year").try(:text).try(:to_i)
if year && year != 0
begin_date = Date.new(year, 1, 1)
end_date = Date.new(year, 12, 31)
end
end
if (from = target.at_xpath("./coverage/from"))
year = from.at_xpath("year").try(:text).try(:to_i)
# SFX KB does not have month/day, only year, set to begin of year
begin_date = Date.new(year, 1, 1) if year && year != 0
end
if (from = target.at_xpath("./coverage/to"))
year = from.at_xpath("year").try(:text).try(:to_i)
# set to end of year
end_date = Date.new(year, 12, 31) if year && year != 0
end
# If there's an embargo too, it may modify existing dates
if (embargo = target.at_xpath("./coverage/embargo"))
days = embargo.at_xpath("days").try(:text).try(:to_i)
days.try do |days|
embargo_days = Date.today - days
if embargo.at_xpath("availability").try(:text) == "available"
# only most recent X days, at earliest start
begin_date =
[begin_date || embargo_days, embargo_days].max
else # not_available
# stops at most recent X days, at latest end
end_date =
[end_date || embargo_days, embargo_days].min
end
end
end
return [begin_date, end_date]
end
# Pass in a list of hashes for making ServiceResponse's, we will roll up
# those that should be rolled up per roll_up_prefixes configuration.
#
# In :coverage_sensitive => true, will roll up sensitive to overlapping
# coverage to not remove any coverage.
#
# Does not mutate list passed in, don't try to change it to mutate,
# makes it hard to deal with list changing from underneath you in logic.
def roll_up_responses(list, options = {})
options = options.reverse_merge(:coverage_sensitive => true)
prefixes = @roll_up_prefixes
# If not configured for roll-up, just return it directly.
return list unless prefixes.present?
if options[:coverage_sensitive] == true
# roll up targets with same prefix only if coverage is a strict
# subset of an existing one. If two are equal, take first.
list = list.reject.each_with_index do |item, index|
prefix = prefixes.find {|p| item[:sfx_target_name].start_with?(p)}
bdate = item[:coverage_begin_date] || Date.new(1,1,1)
edate = item[:coverage_end_date] || Date.today
prefix && (
# earlier is equal or superset
list.slice(0, index).find do |candidate|
# nil considered very early or very late, unbounded
candidate_bdate = candidate[:coverage_begin_date] || Date.new(1,1,1)
candidate_edate = candidate[:coverage_end_date] || Date.today
candidate[:sfx_target_name].start_with?(prefix) &&
(candidate_bdate <= bdate) && (candidate_edate >= edate)
end ||
# later is superset, not equal
list.slice(index+1, list.length).find do |candidate|
candidate_bdate = (candidate[:coverage_begin_date] || Date.new(1,1,1))
candidate_edate = (candidate[:coverage_end_date] || Date.today)
candidate[:sfx_target_name].start_with?(prefix) &&
(candidate_bdate <= bdate) && (candidate_edate >= edate) &&
(! (bdate == candidate_bdate && edate == candidate_edate))
end
)
end
else # not coverage_sensitive
# Just roll up to FIRST of each prefix
list = list.reject.each_with_index do |item, index|
prefix = prefixes.find {|p| item[:sfx_target_name].start_with?(p)}
prefix && list.slice(0,index).find do |candidate|
candidate[:sfx_target_name].start_with?(prefix)
end
end
end
return list
end
def sfx_click_passthrough
# From config, or default to false.
return @click_passthrough || false;
end
# Using the value of sfx_request_expire_crontab, determine if the
# umlaut service response is so old that we can't use it for
# sfx click passthrough anymore.
def expired_sfx_request(response)
require 'cron_tab'
crontab_str = @sfx_requests_expire_crontab
return false unless crontab_str # no param, no determination possible
crontab = CronTab.new( crontab_str )
time_of_response = response.created_at
return false unless time_of_response # no recorded time, not possible either
expire_time = crontab.nexttime( time_of_response )
# Give an extra five minutes of time, in case the expire
# process takes up to five minutes to finish.
return( Time.now > (expire_time + 5.minutes) )
end
# Try to provide a weird reverse-engineered url to take the user THROUGH
# sfx to their destination, so sfx will capture for statistics.
# This relies on certain information from the orignal sfx response
# being stored in the Response object at that point. Used by
# sfx_backchannel_record service.
def self.pass_through_url(response)
base_url = response[:sfx_base_url]
sfx_resolver_cgi_url = base_url + "/cgi/core/sfxresolver.cgi"
dataString = "?tmp_ctx_svc_id=#{response[:sfx_target_index]}"
dataString += "&tmp_ctx_obj_id=#{response[:sfx_obj_index]}"
# Don't understand what this is, but it sometimes needs to be 1?
# Hopefully it won't mess anything up when it's not neccesary.
# Really have no idea when it would need to be something other
# than 1.
# Nope, sad to say it does mess up cases where it is not neccesary.
# Grr.
#dataString += "&tmp_parent_ctx_obj_id=1"
dataString += "&service_id=#{response[:sfx_target_service_id]}"
dataString += "&request_id=#{response[:sfx_request_id]}"
dataString += "&rft.year="
dataString += URI.escape(response[:citation_year].to_s) if response[:citation_year]
dataString += "&rft.volume="
dataString += URI.escape(response[:citation_volume].to_s) if response[:citation_volume]
dataString += "&rft.issue="
dataString += URI.escape(response[:citation_issue].to_s) if response[:citation_issue]
dataString += "&rft.spage="
dataString += URI.escape(response[:citation_spage]).to_s if response[:citation_spage]
return sfx_resolver_cgi_url + dataString
end
# Class method to parse a perl_data block as XML in String
# into a ContextObject. Argument is Nokogiri doc containing
# the SFX element and children.
def self.parse_perl_data(doc)
co = OpenURL::ContextObject.new
co.referent.set_format('journal') # default
html_ent_coder = HTMLEntities.new
doc.search('perldata/hash/item').each do |item|
key = item['key'].to_s
value = item.inner_text
# SFX sometimes returns invalid UTF8 (is it really ISO 8859? Is it
# predictable? Who knows. If it's not valid, it'll cause all
# sorts of problems later. So if it's not valid, we're just
# going to ignore it, sorry.
next unless value.valid_encoding?
# Some normalization. SFX uses rft.year, which is not actually
# legal. Stick it in rft.date instead.
key = "rft.date" if key == "rft.year"
prefix, stripped = key.split('.')
# The auinit1 value is COMPLETELY messed up for reasons I do not know.
# Double encoded in bizarre ways.
next if key == '@rft.auinit1' || key == '@rft.auinit'
# Darn multi-value SFX hackery, indicated with keys beginning
# with '@'. Just take the first one,
# our context object can't store more than one. Then regularize the
# key name.
if (prefix == '@rft')
array_items = item.search("array/item")
array_i = array_items[0] unless array_items.blank?
prefix = prefix.slice(1, prefix.length)
value = array_i ? array_i.inner_text : nil
end
# But this still has HTML entities in it sometimes. Now we've
# got to decode THAT.
# TODO: Are we sure we need to do this? We need an example
# from SFX result to test, it's potentially expensive.
value = html_ent_coder.decode(value)
# object_type? Fix that to be the right way.
if (prefix=='rft') && (key=='object_type')
co.referent.set_format( value.downcase )
next
end
if (prefix == 'rft' && value)
co.referent.set_metadata(stripped, value)
end
if (prefix=='@rft_id')
identifiers = item.search('array/item')
identifiers.each do |id|
co.referent.add_identifier(id.inner_text)
end
end
if (prefix=='@rfr_id')
identifiers = item.search('array/item')
identifiers.each do |id|
co.referrer.add_identifier(id.inner_text)
end
end
end
return co
end
# Custom url generation for the weird case
def response_url(service_response, submitted_params)
if (related_object = service_response.data_values[:related_object_hash])
{:controller => 'resolve', "rft.issn" => related_object[:issn], "rft.title" => related_object[:title], "rft.object_id" => related_object[:sfx_object_id] }
else
service_response['url']
end
end
protected
# Second argument is a Nokogiri element representing the
# tag and children.
def enhance_referent(request, perl_data)
ActiveRecord::Base.connection_pool.with_connection do
metadata = request.referent.metadata
sfx_co = Sfx.parse_perl_data(perl_data)
sfx_metadata = sfx_co.referent.metadata
# Do NOT enhance for metadata type 'BOOK', unreliable matching from
# SFX!
return if sfx_metadata["object_type"] == "BOOK" || sfx_metadata["genre"] == "book"
# If we already had metadata for journal title and the SFX one
# differs, we want to over-write it. This is good for ambiguous
# incoming OpenURLs, among other things.
if request.referent.format == 'journal'
request.referent.enhance_referent("jtitle", sfx_metadata['jtitle'])
end
# And ISSN
if request.referent.format == 'journal' && ! sfx_metadata['issn'].blank?
request.referent.enhance_referent('issn', sfx_metadata['issn'])
end
# The rest, we write only if blank, we don't over-write
sfx_metadata.each do |key, value|
if (metadata[key].blank?)
# watch out for SFX's weird array values.
request.referent.enhance_referent(key, value)
end
end
end
end
# From Table 18 in SFX General User's Guide 3.0.
def sfx_relationship_display(sfx_code)
sfx_code = sfx_code.to_s
# Most can simply be #humanized, a couple of over-rides
@sfx_relationship_display ||= {
"TRANSLATION_ENTRY" => "Translation",
}
display = @sfx_relationship_display[sfx_code]
display = sfx_code.humanize if display.nil?
return display
end
end