# Service that uses available metadata to try to find an exact match to a # WorldCat Identity. # # Requires sufficient author information and/or an oclcnumber to have enough # info to try and find a match. Best to run AFTER services that may enhance # metadata with this info (such as Amazon). # # See: http://outgoing.typepad.com/outgoing/2008/06/linking-to-worl.html # # Creates a highlighted_link # Even though the WorldCat Identities API is built on top of SRU we use # open-uri to simply fetch the "html" page (which is really XML with a stylesheet) # and look at the XML directly. #. SRU was too slow and was timing out the background service. # (because SRU parses the response with REXML? we don't want to have anything # to do with REXML! Even still retrieving the large XML file and traversing # it with Hpricot (what we used to use) was still rather slow. The suggestion is to enable few # note_types and then constrain the number shown. The defaults are hopefully # sane in this regard. note_types can be set to false to turn them off. # # Also can create an optional link to Wikipedia. # # There's probably a lot more we could pull out of these identities pages if # we wanted to. If more of these are used they might warrant their own # service type and part of the page for better layout. class WorldcatIdentities < Service require 'open-uri' # SRU is too slow even though we use an SRU-like link require 'nokogiri' include MetadataHelper attr_reader :url, :note_types, :display_name, :wikipedia_link, :openurl_base, :require_identifier, # below starts the note_types which can be restrained :num_of_roles, :num_of_subject_headings, :num_of_works, :num_of_genres def service_types_generated return [ ServiceTypeValue[:highlighted_link] ] end def initialize(config) @url = 'http://worldcat.org/identities/search/' @note_types = ["combined_counts"] @display_name = "WorldCat Identities" @require_identifier = false # any plural note_types can be restrained @num_of_roles = 5 @num_of_works = 1 @num_of_genres = 5 @wikipedia_link = true @openurl_widely_held = true @worldcat_widely_held = false @openurl_base = '/resolve' @credits = { "WorldCat Identities" => "http://www.worldcat.org/identities/" } super(config) end def handle(request) index, query = define_query(request.referent) unless query.blank? do_query(request, index, query) end return request.dispatched(self, true) end def define_query(rft) oclcnum = get_identifier(:info, "oclcnum", rft) metadata = rft.metadata # Do we have enough info to do a query with sufficient precision? # We are choosing better recall in exchange for lower precision. # We'll search with oclcnum if we have it, but not require it, we'll search # fuzzily on various parts of the name if neccesary. if ( oclcnum.blank? && ( metadata['aulast'].blank? || metadata['aufirst'].blank? ) && metadata['au'].blank? && metadata['aucorp'].blank? ) or (oclcnum.blank? && @require_identifier) Rails.logger.debug("Worldcat Identities Service Adaptor: Skipped: Insufficient metadata for lookup") return nil end # instead of searching across all indexes we target the one we want name_operator = "%3D" if ((! metadata['aulast'].blank?) && oclcnum) # Just last name is enough, we have an oclcnum. index = 'PersonalIdentities' name_part = 'FamilyName' name = clean_name(metadata['aulast']) elsif (! metadata['au'].blank? ) # Next choice, undivided author string index = "PersonalIdentities" name_part = 'Name' name = clean_name(metadata['au']) name_operator = "all" elsif (not metadata['aulast'].blank? and not metadata['aufirst'].blank?) # combine them. index = "PersonalIdentities" name_part = 'Name' name = clean_name(metadata['aufirst'] + ' ' + metadata['aulast']) name_operator = "all" elsif metadata['aucorp'] # corp name index = 'CorporateIdentities' name_part = 'Name' name = clean_name(metadata['aucorp']) else # oclcnum but no author information at all! Might still work... index = "Identities" end query_conditions = [] query_conditions << "local.#{name_part}+#{name_operator}+%22#{name}%22" if name query_conditions << "local.OCLCNumber+%3D+%22#{CGI.escape(oclcnum)}%22" unless oclcnum.blank? query = query_conditions.join("+and+") # Sort keys is important when we don't have an oclcnumber, and doesn't hurt # when we do. query += "&sortKeys=holdingscount" return index, query end # We might have to remove certain characters, but for now we just CGI.escape # it and remove any periods def clean_name(name) CGI.escape(name).gsub('.', '') end def do_query(request, index, query) # since we're only doing exact matching with last name and OCLCnum # we only request 1 record to hopefully speed things up. link = @url + index + '?query=' +query + "&maximumRecords=1" result = open(link, "Accept" => "text/xml").read xml = Nokogiri::XML(result) # Identities namespaces are all over the place, it's too hard # to interrogate with namespaces, ask nokogiri to remove them all # instead. xml.remove_namespaces! return nil if xml.at("numberOfRecords").inner_text == '0' create_link(request, xml) create_wikipedia_link(request, xml) if @wikipedia_link create_openurl_widely_held(request, xml) if @openurl_widely_held create_worldcat_widely_held(request, xml) if @worldcat_widely_held end def create_link(request, xml) display_name = "About " + extract_display_name(xml) extracted_notes = extract_notes(xml) if @note_types url = extract_url(xml) create_service_response(request, display_name, url, extracted_notes ) end def extract_notes(xml) note_pieces = [] # a tiny bit of metaprogramming to make it easy to add methods and config # for note_types @note_types.each do |nt| method = ("extract_" + nt).to_sym answer = self.send(method, xml) note_pieces << answer unless answer.nil? end return nil if note_pieces.blank? return note_pieces.join(' | ') end def extract_display_name(doc) name = [] rawname = doc.at("nameInfo/rawName") return nil unless rawname rawname.children.each do |name_part| name << name_part.inner_text end return nil if name.blank? return name.join(' ') end def extract_subject_headings(doc) subject_headings = [] (doc.search("biogSH")).each_with_index do |sh, i| subject_headings << sh.inner_text break if @num_of_subject_headings == i + 1 end return nil if subject_headings.blank? "subject headings: " + subject_headings.join('; ') end def extract_roles(doc) codes = [] (doc.search("relators/relator")).each_with_index do |relate, i| codes << relate.attributes['code'] break if @num_of_roles == i + 1 end return nil if codes.blank? roles = codes.map{|code| RELATOR_CODES[code] } "roles: " + roles.join(', ') end # FIXME a lot more could be done with "by citations". identities gives summaries # of the most popular works as well as other descriptive information like # subject headings. This might be able to be used for enhancing metadata. def extract_works(doc) works = [] doc.search("by/citation/title").each_with_index do |t, i| works << t.inner_text break if @num_of_works == i + 1 end return nil if works.blank? "most widely held #{works.length == 1 ? "work" : "works"}: " + works.join("; ") end def extract_genres(doc) genres = [] doc.search("genres/genre").each_with_index do |g, i| genres << g.inner_text break if @num_of_genres == i + 1 end return nil if genres.blank? "genres: " + genres.join(', ') end def extract_combined_counts(doc) work_count = extract_work_count(doc) publications_count = extract_publications_count(doc) holdings_count = extract_holdings_count(doc) work_count << " in " << publications_count << " with " << holdings_count end def extract_work_count(doc) work_count = doc.at("workCount").inner_text return insert_commas(work_count) << " works" end def extract_holdings_count(doc) total_holdings = doc.at("totalHoldings").inner_text return insert_commas(total_holdings) << " total holdings in WorldCat" end def extract_publications_count(doc) return insert_commas( doc.at("recordCount").inner_text ) << " publications" end def extract_url(doc) pnkey = doc.at("pnkey").inner_text return 'http://worldcat.org/identities/' << pnkey end def insert_commas(n) n.reverse.scan(/(?:\d*\.)?\d{1,3}-?/).join(',').reverse end def create_service_response(request, display_name, url, extracted_notes) request.add_service_response( :service=>self, :url=>url, :display_text=>display_name, :notes => extracted_notes, :service_type_value => :highlighted_link) end def create_wikipedia_link(request, xml) name_element = xml.at("wikiLink") return nil unless name_element name = name_element.inner_text # This is the base link that worldcat identities uses so we use the same link = "http://en.wikipedia.org/wiki/Special:Search?search=" << name request.add_service_response( :service=>self, :url=>link, :display_text=> "About " + name.titlecase, :notes => '', :source => 'Wikipedia', :service_type_value => :highlighted_link) end def create_openurl_widely_held(request, xml) widely_held = get_widely_held_info(xml) # try to remove circular links return nil if circular_link?(request, widely_held) openurl = create_openurl(request, widely_held) request.add_service_response( :service=>self, :url=>openurl, :display_text=> widely_held['title'], :notes => "This author's most widely held work.", :service_type_value => :highlighted_link) end def circular_link?(request, citation_info) rft = request.referent request_oclcnum = get_identifier(:info, "oclcnum", rft) request_title = get_search_title(rft) return true if citation_info['oclcnum'] == request_oclcnum #further cleaning might be necessary for titles to be good matches return true if citation_info['title'].strip == request_title.strip end #createsa minimal openurl to make a new request to umlaut def create_openurl(request, wh) metadata = request.referent.metadata co = OpenURL::ContextObject.new cor = co.referent cor.set_format(wh['record_type']) cor.add_identifier("info:oclcnum/#{wh['oclcnum']}") cor.set_metadata('aulast', metadata['aulast'] ) if metadata['aulast'] cor.set_metadata('aufirst', metadata['aufirst']) if metadata['aufirst'] cor.set_metadata('aucorp', metadata['aucorp']) if metadata['aucorp'] cor.set_metadata('title', wh['title']) link = @openurl_base + '?' + co.kev return link end # We just link to worldcat using the oclc number provided # FIXME this might need special partial if we incorporate a cover image def create_worldcat_widely_held(request, xml) # try to prevent circular links top_holding_info = get_widely_held_info(xml) return nil if circular_link?(request, top_holding_info) # http://www.worldcat.org/links/ most = top_holding_info['most'] title = top_holding_info['title'] oclcnum = top_holding_info['oclcnum'] link = 'http://www.worldcat.org/oclc/' << oclcnum cover_image_link = extract_cover_image_link(request, most) notes = "this author's most widely held work in WorldCat" if cover_image_link display_text = '' notes = title << ' is ' << notes else display_text = title end request.add_service_response( :service=>self, :url=>link, :display_text=> display_text, :notes => notes, :service_type_value => :highlighted_link) end def get_widely_held_info(xml) h = {} h['most'] = most = xml.at("by/citation") h['oclcnum'] = clean_oclcnum(most.at("oclcnum").inner_text) h['title'] = most.at("title").inner_text h['record_type'] = most.at('recordType').inner_text h end def extract_cover_image_link(request, citation) cover = citation.at("cover") return nil unless cover # we try not to show a cover if we already probably have the same cover # showing. oclc = clean_oclcnum( cover.attributes['oclc'] ) metadata = request.referent.metadata if metadata['oclcnum'] and metadata['oclcnum'] =~ oclc return nil end cover_number = cover.inner_text if metadata['isbn'] and metadata['isbn'] == cover_number return nil end if cover.attributes["type"] == 'isbn' link = "http://www.worldcat.org/wcpa/servlet/DCARead?standardNoType=1&standardNo=" return link << cover_number end return nil end def clean_oclcnum(num) if num =~ /(ocn0*|ocm0*|on0*|\(OCoLC\)|ocl70*|0+)(.*)$/ num = $2 end return num end # relator codes are from http://worldcat.org/identities/relators.xml which was # referenced from http://worldcat.org/identities/Identities.xsl RELATOR_CODES = { "act" => "Actor", "adp" => "Adapter", "aft" => "Author of afterword, colophon, etc.", "anm" => "Animator ", "ann" => "Annotator", "ant" => "Bibliographic antecedent", "app" => "Applicant", "aqt" => "Author in quotations or text abstracts", "arc" => "Architect", "arr" => "Arranger", "art" => "Artist", "asg" => "Assignee", "asn" => "Associated name", "att" => "Attributed name", "auc" => "Auctioneer", "aud" => "Author of dialog", "aui" => "Author of introduction", "aus" => "Author of screenplay", "aut" => "Author", "bdd" => "Binding designer", "bjd" => "Bookjacket designer", "bkd" => "Book designer", "bkp" => "Book producer", "bnd" => "Binder", "bpd" => "Bookplate designer", "bsl" => "Bookseller", "ccp" => "Conceptor", "chr" => "Choreographer", "clb" => "Collaborator", "cli" => "Client", "cll" => "Calligrapher", "clt" => "Collotyper", "cmm" => "Commentator", "cmp" => "Composer", "cmt" => "Compositor", "cng" => "Cinematographer ", "cnd" => "Conductor", "cns" => "Censor", "coe" => "Contestant -appellee", "col" => "Collector", "com" => "Compiler", "cos" => "Contestant", "cot" => "Contestant -appellant", "cov" => "Cover designer", "cpc" => "Copyright claimant", "cpe" => "Complainant-appellee", "cph" => "Copyright holder", "cpl" => "Complainant", "cpt" => "Complainant-appellant", "cre" => "Creator", "crp" => "Correspondent", "crr" => "Corrector", "csl" => "Consultant", "csp" => "Consultant to a project", "cst" => "Costume designer", "ctb" => "Contributor", "cte" => "Contestee-appellee", "ctg" => "Cartographer", "ctr" => "Contractor", "cts" => "Contestee", "ctt" => "Contestee-appellant", "cur" => "Curator", "cwt" => "Commentator for written text", "dfd" => "Defendant", "dfe" => "Defendant-appellee", "dft" => "Defendant-appellant", "dgg" => "Degree grantor", "dis" => "Dissertant", "dln" => "Delineator", "dnc" => "Dancer", "dnr" => "Donor", "dpc" => "Depicted", "dpt" => "Depositor", "drm" => "Draftsman", "drt" => "Director", "dsr" => "Designer", "dst" => "Distributor", "dte" => "Dedicatee", "dto" => "Dedicator", "dub" => "Dubious author", "edt" => "Editor", "egr" => "Engraver", "elt" => "Electrotyper", "eng" => "Engineer", "etr" => "Etcher", "exp" => "Expert", "fac" => "Facsimilist", "flm" => "Film editor", "fmo" => "Former owner", "fpy" => "First party", "fnd" => "Funder", "frg" => "Forger", "grt" => "Graphic technician", "hnr" => "Honoree", "hst" => "Host", "ill" => "Illustrator", "ilu" => "Illuminator", "ins" => "Inscriber", "inv" => "Inventor", "itr" => "Instrumentalist", "ive" => "Interviewee", "ivr" => "Interviewer", "lbt" => "Librettist", "lee" => "Libelee-appellee", "lel" => "Libelee", "len" => "Lender", "let" => "Libelee-appellant", "lgd" => "Lighting designer ", "lie" => "Libelant-appellee", "lil" => "Libelant", "lit" => "Libelant-appellant", "lsa" => "Landscape architect", "lse" => "Licensee", "lso" => "Licensor", "ltg" => "Lithographer", "lyr" => "Lyricist", "mfr" => "Manufacturer ", "mdc" => "Metadata contact", "mod" => "Moderator", "mon" => "Monitor", "mrk" => "Markup editor", "mte" => "Metal-engraver", "mus" => "Musician", "nrt" => "Narrator", "opn" => "Opponent", "org" => "Originator", "orm" => "Organizer of meeting", "oth" => "Other", "own" => "Owner", "pat" => "Patron", "pbd" => "Publishing director", "pbl" => "Publisher", "pfr" => "Proofreader", "pht" => "Photographer", "plt" => "Platemaker", "pop" => "Printer of plates", "ppm" => "Papermaker", "ppt" => "Puppeteer ", "prc" => "Process contact", "prd" => "Production personnel", "prf" => "Performer", "prg" => "Programmer", "prm" => "Printmaker", "pro" => "Producer", "prt" => "Printer", "pta" => "Patent applicant", "pte" => "Plaintiff -appellee", "ptf" => "Plaintiff", "pth" => "Patent holder", "ptt" => "Plaintiff-appellant", "rbr" => "Rubricator", "rce" => "Recording engineer", "rcp" => "Recipient", "red" => "Redactor", "ren" => "Renderer", "res" => "Researcher", "rev" => "Reviewer", "rpt" => "Reporter", "rpy" => "Responsible party", "rse" => "Respondent -appellee", "rsg" => "Restager ", "rsp" => "Respondent", "rst" => "Respondent-appellant", "rth" => "Research team head", "rtm" => "Research team member", "sad" => "Scientific advisor", "sce" => "Scenarist", "scl" => "Sculptor", "scr" => "Scribe", "sec" => "Secretary", "sgn" => "Signer", "sng" => "Singer", "spk" => "Speaker", "spn" => "Sponsor", "spy" => "Second party", "srv" => "Surveyor", "std" => "Set designer ", "stl" => "Storyteller", "stn" => "Standards body", "str" => "Stereotyper", "tch" => "Teacher ", "ths" => "Thesis advisor", "trc" => "Transcriber", "trl" => "Translator", "tyd" => "Type designer", "tyg" => "Typographer", "vdg" => "Videographer ", "voc" => "Vocalist", "wam" => "Writer of accompanying material", "wdc" => "Woodcutter", "wde" => "Wood -engraver", "wit" => "Witness" } end