# NOTE: In your SFX Admin, under Menu Configuration / API, you should enable ALL
# 'extra' API information for full umlaut functionality.
# With the exception of "Include openURL parameter", can't figure out how
# that's useful.
#
# config parameters in services.yml
# display name: User displayable name for this service
# base_url: SFX base url.
# click_passthrough: DEPRECATED. Caused problems. Use the SFXBackchannelRecord
#     link filter service instead.
#     When set to true, Umlaut will send all SFX clicks
#     through SFX, for SFX to capture statistics. This is currently done
#     using a backdoor into the SFX sfxresolve.cgi script. Defaults to false.
#     Note that after SFX requests have been removed in the nightly job, the
#     click passthrough will cause an error! Set sfx_requests_expire_crontab
#     with the crontab pattern you use for requests to expire, and we won't
#     try to click passthrough with expired requests.
# sfx_requests_expire_crontab: Crontab pattern that the SFX admin is using
#     to expire SFX requests. Used to refrain from click passthrough with
#     expired requests, since that is destined to fail.
# services_of_interest: Optional. over-ride the built in list of what types of
#     SFX services we want to grab, and what the corresponding umlaut types are.
#     hash, with SFX service type name as key, and Umlaut ServiceTypeValue
#     name as value.
# extra_targets_of_interest: sfx target_names of targets you want to make
#     sure to include in umlaut. A hash with target_name as key, and umlaut
#     ResponseTypeValue name as value.
# really_distant_relationships: An array of relationship type codes from SFX
#     "related objects". See Table 18 in SFX 3.0 User's Manual. Related
#     objects that have only a "really distant relationship" will NOT
#     be shown as fulltext, but will instead be banished to the see also
#     "highlighted_link" section. You must have display of related objects
#     turned ON in SFX display admin, to get related objects at all in
#     Umlaut.
#     NOTE: This parameter has a default value to a certain set of
#     relationships, set to empty array [] to eliminate defaults.
# sfx_timeout: in seconds, for both open/read timeout value for SFX connection.
#     Defaults to 8.
class Sfx < Service
  require 'uri'
  require 'htmlentities'
  require 'cgi'
  require 'nokogiri'
  #require 'open_url'

  required_config_params :base_url

  # Sets the built-in defaults and then hands +config+ to the superclass
  # initializer. Defaults are assigned BEFORE +super+ so that whatever
  # super does with +config+ sees them already in place.
  def initialize(config)
    # Key is sfx service_type, value is umlaut servicetype string.
    # These are the SFX service types we will translate to umlaut
    @services_of_interest = {
      'getFullTxt'          => 'fulltext',
      'getSelectedFullTxt'  => 'fulltext',
      'getDocumentDelivery' => 'document_delivery',
      'getDOI'              => 'highlighted_link',
      'getAbstract'         => 'abstract',
      'getTOC'              => 'table_of_contents'
    }

    # Special targets. Key is SFX target_name.
    # Value is umlaut service type.
    # These targets will be included even if their sfx service_type doesn't
    # match our services_of_interest, and the umlaut service ID string given
    # here will take precedence and be used even if these targets DO match
    # services_of_interest. Generally loaded from yml config in super.
    @extra_targets_of_interest = {}

    @sfx_timeout = 8

    @really_distant_relationships = [
      "CONTINUES_IN_PART", "CONTINUED_IN_PART_BY",
      "ABSORBED_IN_PART", "ABSORBED_BY"
    ]

    # Include a CrossRef credit, because SFX uses CrossRef api internally,
    # and CrossRef ToS may require us to give credit.
    @credits = {
      "SFX"      => "http://www.exlibrisgroup.com/sfx.htm",
      "CrossRef" => "http://www.crossref.org/"
    }

    super(config)
  end

  # Standard method, used by auto background updater. See Service docs.
  # Returns the distinct ServiceTypeValue objects for every umlaut type named
  # in @services_of_interest or @extra_targets_of_interest.
  def service_types_generated
    type_names = @services_of_interest.values + @extra_targets_of_interest.values
    type_names.uniq.collect { |s| ServiceTypeValue[s] }
  end

  # The configured SFX base url.
  def base_url
    @base_url
  end

  # Sends the request to SFX, parses the answer into service responses, and
  # records the dispatch outcome on +request+. A timeout is recorded as a
  # temporary failure; any other exception propagates to the caller.
  def handle(request)
    client = initialize_client(request)

    begin
      response = do_request(client)
      parse_response(response, request)
      return request.dispatched(self, true)
    rescue Errno::ETIMEDOUT, Timeout::Error => e
      # Request to SFX timed out. Record this as unsuccessful in the
      # dispatch table. Temporary.
      return request.dispatched(self, DispatchedService::FailedTemporary, e)
    end
  end

  # Builds the OpenURL::Transport used to query SFX for +request+.
  #
  # Side effect: sets @get_coverage, which parse_response reads later.
  # NOTE(review): that flag lives on the service instance, not the request;
  # confirm a service instance is never shared between concurrent requests.
  def initialize_client(request)
    transport = OpenURL::Transport.new(@base_url, nil,
                                       :open_timeout => @sfx_timeout,
                                       :read_timeout => @sfx_timeout)

    context_object = request.to_context_object
    transport.add_context_object(context_object)
    transport.extra_args["sfx.response_type"] = "multi_obj_xml"

    @get_coverage = false

    metadata = request.referent.metadata
    has_article_id = request.referent.identifiers.find { |i| i =~ /^info\:(doi|pmid)/ }

    if metadata['date'].blank? && metadata['year'].blank? && !has_article_id
      # No article-level metadata, do some special stuff.
      transport.extra_args["sfx.ignore_date_threshold"] = "1"
      transport.extra_args["sfx.show_availability"] = "1"
      @get_coverage = true
    end

    # Workaround to SFX bug, not sure if this is really still necessary
    # I think it's not, but leave it in anyway just in case.
    if context_object.referent.identifiers.find { |i| i =~ /^info:doi\// }
      transport.extra_args['sfx.doi_url'] = 'http://dx.doi.org'
    end

    transport
  end

  # Performs the HTTP round trip on +client+ and returns the raw response
  # body.
  def do_request(client)
    client.transport_inline
    client.response
  end

  # Parses the SFX multi_obj_xml +resolver_response+ and adds one service
  # response to +request+ for every SFX target we are interested in.
  # Raises RuntimeError if the response is not the expected XML document.
  def parse_response(resolver_response, request)
    doc = Nokogiri::XML(resolver_response)

    # Catch an SFX error message (in HTML) that's not an XML
    # document at all.
    unless doc.at('/ctx_obj_set')
      Rails.logger.error("sfx.rb: SFX did not return expected response. SFX response: #{resolver_response}")
      raise "SFX did not return expected response."
    end

    # There can be several context objects in the response.
    # We need to keep track of which data comes from which, for
    # SFX click-through generating et alia
    sfx_objs = doc.search('/ctx_obj_set/ctx_obj')

    # As we go through the possibly multiple SFX context objects,
    # we need to keep track of which one, if any, we want to use
    # to enhance the Umlaut referent metadata.
    #
    # We only enhance for journal type metadata. For book type
    # metadata SFX will return something, but it may not be the manifestation
    # we want. With journal titles, less of an issue.
    #
    # In case of multiple SFX hits, enhance metadata only from the
    # one that actually had fulltext. If more than one had fulltext, forget it,
    # too error prone. If none had full text, just pick the first.
    # NOTE(review): the code below actually keeps the LAST context object
    # that had fulltext rather than "forgetting it" -- confirm which is wanted.
    #
    # We'll use these variables to keep track of our 'best fit' as
    # we loop through em.
    best_fulltext_ctx = nil
    best_nofulltext_ctx = nil

    # We're going to keep our @really_distant_relationship stuff here.
    related_titles = {}

    sfx_objs.each_with_index do |sfx_obj, sfx_obj_index|
      # Get out the "perl_data" section, with our actual OpenURL style
      # context object information. This was XML escaped as a String (actually
      # double-escaped, weirdly), so we need to extract the string, unescape
      # it, and then feed it to Nokogiri again.
      ctx_obj_atts = CGI.unescapeHTML(sfx_obj.at('./ctx_obj_attributes').inner_html)
      perl_data = Nokogiri::XML(ctx_obj_atts)

      # parse it into an OpenURL, we might need it like that.
      sfx_co = Sfx.parse_perl_data(perl_data)
      sfx_metadata = sfx_co.to_hash

      # get SFX objectID (only used for debug output)
      object_id_node = perl_data.at("./perldata/hash/item[@key='rft.object_id']")
      ctx_object_id = object_id_node ? object_id_node.inner_html : nil

      # Get SFX requestID. Same for every target of this context object, so
      # look it up once; nil (rather than an exception) when SFX omitted it.
      request_id_node = (perl_data/"//hash/item[@key='sfx.request_id']").first
      sfx_request_id = request_id_node ? request_id_node.inner_html : nil

      # For each target delivered by SFX
      sfx_obj.search("./ctx_obj_targets/target").each_with_index do |target, target_index|
        # First check @extra_targets_of_interest, and only if not found
        # there look for the SFX service type in @services_of_interest.
        sfx_target_name = target.at('./target_name').inner_html
        umlaut_service = @extra_targets_of_interest[sfx_target_name]
        unless umlaut_service
          sfx_service_type = target.at("./service_type").inner_html
          umlaut_service = @services_of_interest[sfx_service_type]
        end

        # If we have multiple context objs, skip the ill and ask-a-librarian
        # links for all but the first, to avoid dups. This is a bit messy,
        # but this whole multiple hits thing is messy.
        if sfx_obj_index > 0 &&
           ['document_delivery', 'export_citation', 'help'].include?(umlaut_service)
          next
        end

        # Okay, keep track of best fit ctx for metadata enhancement
        if request.referent.format == "journal"
          if umlaut_service == 'fulltext'
            best_fulltext_ctx = perl_data
            best_nofulltext_ctx = nil
          elsif best_nofulltext_ctx.nil?
            best_nofulltext_ctx = perl_data
          end
        end

        # Not in services or targets of interest: nothing more to do.
        next unless umlaut_service

        # A NodeSet is truthy even when empty, so this is simply "SFX/" when
        # the target has no displayer element.
        source = "SFX/" + (target/"./displayer").inner_html

        target_service_id = (target/"./target_service_id").inner_html

        coverage = @get_coverage ? coverage_text_for(target) : nil

        related_note = ""
        # If this is from a related object, add that on as a note too...
        # And maybe skip this entirely!
        if (related_node = target.at('./related_service_info'))
          relationship  = related_node.at('./relation_type').inner_text
          issn          = related_node.at('./related_object_issn').inner_text
          sfx_object_id = related_node.at('./related_object_id').inner_text
          title         = related_node.at('./related_object_title').inner_text

          if @really_distant_relationships.include?(relationship)
            # Show title-level link in see-also instead of full text.
            related_titles[issn] = {
              :sfx_object_id => sfx_object_id,
              :title         => title,
              :relationship  => relationship,
              :issn          => issn
            }
            next
          end

          related_note = "This version provided from related title: " +
                         CGI.unescapeHTML(title) + ".\n"
        end

        response_data = {}
        response_data[:url] = CGI.unescapeHTML((target/"./target_url").inner_html)
        response_data[:notes] = related_note + CGI.unescapeHTML((target/"./note").inner_html)
        response_data[:authentication] = CGI.unescapeHTML((target/"./authentication").inner_html)
        response_data[:source] = source
        response_data[:coverage] = coverage if coverage

        # Sfx metadata we want
        response_data[:sfx_base_url] = @base_url
        response_data[:sfx_obj_index] = sfx_obj_index + 1 # sfx is 1 indexed
        response_data[:sfx_target_index] = target_index + 1
        response_data[:sfx_request_id] = sfx_request_id
        response_data[:sfx_target_service_id] = target_service_id
        response_data[:sfx_target_name] = sfx_target_name

        # At url-generation time, the request isn't available to us anymore,
        # so we better store this citation info here now, since we need it
        # for sfx click passthrough
        # Oops, need to take this from SFX delivered metadata.
        if sfx_metadata['rft.date']
          response_data[:citation_year] = sfx_metadata['rft.date'].to_s[0, 4]
        end
        response_data[:citation_volume] = sfx_metadata['rft.volume']
        response_data[:citation_issue] = sfx_metadata['rft.issue']
        response_data[:citation_spage] = sfx_metadata['rft.spage']

        # Some debug info
        response_data[:debug_info] = " Target: #{sfx_target_name} ; SFX object ID: #{ctx_object_id}"

        response_data[:display_text] = (target/"./target_public_name").inner_html

        request.add_service_response(
          response_data.merge(
            :service => self,
            :service_type_value => umlaut_service))
      end
    end

    # Add in links to our related titles
    related_titles.each_value do |hash|
      request.add_service_response(
        :service => self,
        :display_text => "#{sfx_relationship_display(hash[:relationship])}: #{hash[:title]}",
        :notes => "#{ServiceTypeValue['fulltext'].display_name} available",
        :related_object_hash => hash,
        :service_type_value => "highlighted_link")
    end

    # Did we find a ctx best fit for enhancement?
    best_ctx = best_fulltext_ctx || best_nofulltext_ctx
    enhance_referent(request, best_ctx) if best_ctx
  end

  # From config, or default to false.
  def sfx_click_passthrough
    @click_passthrough || false
  end

  # Using the value of sfx_requests_expire_crontab, determine if the
  # umlaut service response is so old that we can't use it for
  # sfx click passthrough anymore.
  # Returns false when either the crontab pattern or the response's
  # creation time is unavailable.
  def expired_sfx_request(response)
    # Loaded here rather than at file load, so the library is only needed
    # when this method is actually called.
    require 'CronTab'

    crontab_str = @sfx_requests_expire_crontab
    return false unless crontab_str # no param, no determination possible

    crontab = CronTab.new(crontab_str)

    time_of_response = response.created_at
    return false unless time_of_response # no recorded time, not possible either

    expire_time = crontab.nexttime(time_of_response)

    # Give an extra five minutes of time, in case the expire
    # process takes up to five minutes to finish.
    Time.now > (expire_time + 5.minutes)
  end

  # Try to provide a weird reverse-engineered url to take the user THROUGH
  # sfx to their destination, so sfx will capture for statistics.
  # This relies on certain information from the original sfx response
  # being stored in the Response object at that point. Used by
  # sfx_backchannel_record service.
  def self.pass_through_url(response)
    base_url = response[:sfx_base_url]
    sfx_resolver_cgi_url = base_url + "/cgi/core/sfxresolver.cgi"

    # Escapes a citation value for the query string; a missing value
    # contributes an empty string. The value is stringified BEFORE escaping
    # so non-String values are handled the same way for every field.
    escaped = lambda { |value| value ? URI.escape(value.to_s) : "" }

    data_string = "?tmp_ctx_svc_id=#{response[:sfx_target_index]}"
    data_string += "&tmp_ctx_obj_id=#{response[:sfx_obj_index]}"

    # Don't understand what this is, but it sometimes needs to be 1?
    # Hopefully it won't mess anything up when it's not necessary.
    # Really have no idea when it would need to be something other
    # than 1.
    # Nope, sad to say it does mess up cases where it is not necessary.
    # Grr.
    #data_string += "&tmp_parent_ctx_obj_id=1"

    data_string += "&service_id=#{response[:sfx_target_service_id]}"
    data_string += "&request_id=#{response[:sfx_request_id]}"
    data_string += "&rft.year=" + escaped.call(response[:citation_year])
    data_string += "&rft.volume=" + escaped.call(response[:citation_volume])
    data_string += "&rft.issue=" + escaped.call(response[:citation_issue])
    data_string += "&rft.spage=" + escaped.call(response[:citation_spage])

    sfx_resolver_cgi_url + data_string
  end

  # Class method to parse a perl_data block as XML in String
  # into a ContextObject. Argument is Nokogiri doc containing
  # the SFX <perldata> element and children.
  def self.parse_perl_data(doc)
    co = OpenURL::ContextObject.new
    co.referent.set_format('journal') # default

    html_ent_coder = HTMLEntities.new

    doc.search('perldata/hash/item').each do |item|
      key = item['key'].to_s
      value = item.inner_html

      # Some normalization. SFX uses rft.year, which is not actually
      # legal. Stick it in rft.date instead.
      key = "rft.date" if key == "rft.year"

      prefix, stripped = key.split('.')

      # The auinit1 value is COMPLETELY messed up for reasons I do not know.
      # Double encoded in bizarre ways.
      next if key == '@rft.auinit1' || key == '@rft.auinit'

      # Darn multi-value SFX hackery, indicated with keys beginning
      # with '@'. Just take the first one,
      # our context object can't store more than one. Then regularize the
      # key name.
      if prefix == '@rft'
        first_item = item.search("array/item")[0]
        prefix = prefix.slice(1, prefix.length)
        value = first_item ? first_item.inner_html : nil
      end

      # But this still has HTML entities in it sometimes. Now we've
      # got to decode THAT.
      value = html_ent_coder.decode(value)

      # object_type? Fix that to be the right way.
      # NOTE(review): +key+ still carries its "rft." prefix at this point, so
      # this condition cannot be true and object_type ends up stored as
      # ordinary metadata below. enhance_referent reads it from there, so do
      # not "fix" this without adjusting that method too.
      if (prefix == 'rft') && (key == 'object_type')
        co.referent.set_format(value.downcase)
        next
      end

      co.referent.set_metadata(stripped, value) if prefix == 'rft' && value

      if prefix == '@rft_id'
        item.search('array/item').each do |id|
          co.referent.add_identifier(id.inner_html)
        end
      end

      if prefix == '@rfr_id'
        item.search('array/item').each do |id|
          co.referrer.add_identifier(id.inner_html)
        end
      end
    end

    co
  end

  # Custom url generation for the weird case: a "related title" response
  # links back into the resolve controller for that title instead of to a
  # stored url. +submitted_params+ is not used here.
  def response_url(service_response, submitted_params)
    related_object = service_response.data_values[:related_object_hash]

    if related_object
      { :controller => 'resolve',
        "rft.issn" => related_object[:issn],
        "rft.title" => related_object[:title],
        "rft.object_id" => related_object[:sfx_object_id] }
    else
      service_response['url']
    end
  end

  protected

  # Collects the threshold and embargo statements SFX attached to +target+
  # into one newline-separated string, or nil when there are none.
  # Make sure you turn on "Include availability info in text format"
  # in the SFX Admin API configuration.
  def coverage_text_for(target)
    thresholds_str = target.search('coverage/coverage_text/threshold_text/coverage_statement').collect do |threshold|
      threshold.inner_html.to_s + ".\n"
    end.join

    embargoes_str = target.search('coverage/coverage_text/embargo_text/embargo_statement').collect do |embargo|
      embargo.inner_html.to_s + ".\n"
    end.join

    return nil if thresholds_str.blank? && embargoes_str.blank?

    thresholds_str + embargoes_str
  end

  # Second argument is a Nokogiri element representing the <perldata>
  # tag and children. Copies metadata SFX knows about onto the request's
  # referent.
  def enhance_referent(request, perl_data)
    ActiveRecord::Base.connection_pool.with_connection do
      # Snapshot taken before any enhancement below.
      metadata = request.referent.metadata

      sfx_co = Sfx.parse_perl_data(perl_data)
      sfx_metadata = sfx_co.referent.metadata

      # Do NOT enhance for metadata type 'BOOK', unreliable matching from
      # SFX!
      return if sfx_metadata["object_type"] == "BOOK" || sfx_metadata["genre"] == "book"

      if request.referent.format == 'journal'
        # If we already had metadata for journal title and the SFX one
        # differs, we want to over-write it. This is good for ambiguous
        # incoming OpenURLs, among other things.
        request.referent.enhance_referent("jtitle", sfx_metadata['jtitle'])

        # And ISSN
        unless sfx_metadata['issn'].blank?
          request.referent.enhance_referent('issn', sfx_metadata['issn'])
        end
      end

      # The rest, we write only if blank, we don't over-write
      sfx_metadata.each do |key, value|
        # watch out for SFX's weird array values.
        request.referent.enhance_referent(key, value) if metadata[key].blank?
      end
    end
  end

  # From Table 18 in SFX General User's Guide 3.0.
  # Returns a display label for an SFX relationship code.
  def sfx_relationship_display(sfx_code)
    sfx_code = sfx_code.to_s

    # Most can simply be #humanized, a couple of over-rides
    @sfx_relationship_display ||= {
      "TRANSLATION_ENTRY" => "Translation",
    }

    display = @sfx_relationship_display[sfx_code]
    display = sfx_code.humanize if display.nil?
    display
  end
end