# Service that searches Google Book Search to determine viewability.
# It searches by ISBN, OCLCNUM and/or LCCN.
#
# Uses Google Books API, http://code.google.com/apis/books/docs/v1/getting_started.html
# http://code.google.com/apis/books/docs/v1/using.html
#
# If a full view is available it returns a fulltext service response.
# If a partial view is available, it is returned as "limited excerpts".
# If no view at all, still includes a link in highlighted_links, to pay
# lip service to google branding requirements.
# Unfortunately there is no way to tell which of the noview
# books provide search, although some do -- search is advertised if full or
# partial view is available.
#
# If a thumbnail_url is returned in the responses, a cover image is displayed.
#
# Can also enhance with an abstract, if available. -- off by default, set `abstract: true` to turn on.
#
# And fleshes out bibliographic details from an identifier -- if all you had was an
# ISBN, will fill in title, author, etc in referent from GBS response.
#
# = Google API Key
#
# Setting an api key in :api_key is STRONGLY recommended, or you'll
# probably get rate limited (not clear what the limit is with no api
# key supplied). You may have to ask for a higher rate limit for your api
# key than the default 1000/day, which you can do through the google
# api console:
# https://code.google.com/apis/console
#
# I requested 50k with this message, and was quickly approved with no questions
# "Services for academic library (Johns Hopkins Libraries) web applications to match Google Books availability to items presented by our catalog, OpenURL link resolver, and other software. "
#
# Recommend setting your 'per user limit' to something crazy high, as well
# as requesting more quota.
class GoogleBookSearch < Service
  require 'multi_json'

  # Identifiers used in API response to indicate viewability level
  ViewFullValue = 'ALL_PAGES'
  ViewPartialValue = 'PARTIAL'
  # None might also be 'snippet', but Google doesn't want to distinguish
  ViewNoneValue = 'NO_PAGES'
  ViewUnknownValue = 'UNKNOWN'

  include MetadataHelper
  include UmlautHttp

  # required params

  # attr_reader is important for tests
  attr_reader :url, :display_name, :num_full_views

  # Lists the service types this service may generate, based on which of
  # the response kinds are switched on in configuration.
  def service_types_generated
    types = []
    if @web_links
      types.push ServiceTypeValue[:highlighted_link]
      types.push ServiceTypeValue[:excerpts]
    end
    types.push(ServiceTypeValue[:search_inside]) if @search_inside
    types.push(ServiceTypeValue[:fulltext]) if @fulltext
    types.push(ServiceTypeValue[:cover_image]) if @cover_image
    types.push(ServiceTypeValue[:referent_enhance]) if @referent_enhance
    types.push(ServiceTypeValue[:abstract]) if @abstract

    return types
  end

  # Sets defaults, then hands config to the superclass (which is called
  # last, after the defaults are in place).
  def initialize(config)
    @url = 'https://www.googleapis.com/books/v1/volumes?q='

    @display_name = 'Google Books'
    # number of full views to show
    @num_full_views = 1
    # default on, to enhance our metadata with stuff from google
    @referent_enhance = true
    # default OFF, add description/abstract from GBS
    @abstract = false

    # Other responses on by default but can be turned off
    @cover_image = true
    @fulltext = true
    @search_inside = true
    @web_links = true # to partial view :excerpts or :fulltext

    # google api key strongly recommended, otherwise you'll
    # probably get rate limited.
    @api_key = nil

    @credits = {
      "Google Books" => "http://books.google.com/"
    }

    # While you can theoretically look up by LCCN on Google Books,
    # we have found FREQUENT false positives. There's no longer any
    # way to even report these to Google. By default, don't lookup
    # by LCCN.
    @lookup_by_lccn = false

    super(config)
  end

  # Queries Google Books for the request's identifiers and adds whichever
  # service responses are enabled. Dispatches the request as successful
  # when there is nothing to look up or nothing found, and as failed when
  # the query returned no data or an error.
  def handle(request)
    bibkeys = get_bibkeys(request.referent)
    return request.dispatched(self, true) if bibkeys.nil?

    data = do_query(bibkeys, request)

    if data.blank? || data["error"]
      # fail fatal
      return request.dispatched(self, false)
    end

    # 0 hits, return. A response that claims hits but carries no items is
    # treated the same way, rather than crashing on data["items"].first.
    if data["totalItems"] == 0 || data["items"].blank?
      return request.dispatched(self, true)
    end

    enhance_referent(request, data) if @referent_enhance

    add_abstract(request, data) if @abstract

    # return full views first
    if @fulltext
      full_views_shown = create_fulltext_service_response(request, data)
    end

    if @search_inside
      # Add search_inside link if appropriate
      add_search_inside(request, data)
    end

    # only if no full view is shown, add links for partial view or noview.
    # Honors the @web_links switch, matching service_types_generated.
    if @web_links && !full_views_shown
      do_web_links(request, data)
    end

    if @cover_image
      thumbnail_url = find_thumbnail_url(data)
      if thumbnail_url
        add_cover_image(request, thumbnail_url)
      end
    end

    return request.dispatched(self, true)
  end

  # Take the FIRST hit from google, and use its values to enhance
  # our metadata. Will NOT overwrite existing data.
  def enhance_referent(request, data)
    entry = (data["items"] || []).first
    volume = entry && entry["volumeInfo"]
    return unless volume

    title = volume["title"]
    title += ": #{volume["subtitle"]}" if title && volume["subtitle"]

    element_enhance(request, "title", title)
    element_enhance(request, "au", volume["authors"].first) if volume["authors"]
    element_enhance(request, "pub", volume["publisher"])

    element_enhance(request, "tpages", volume["pageCount"])

    if (date = volume["publishedDate"]) && date =~ /^(\d\d\d\d)/
      element_enhance(request, "date", $1)
    end

    # LCCN is only rarely included, but is sometimes, eg:
    # "industryIdentifiers"=>[{"type"=>"OTHER", "identifier"=>"LCCN:72627172"}],
    # Also "LCCN:76630875"
    #
    # And sometimes OCLC number like:
    # "industryIdentifiers"=>[{"type"=>"OTHER", "identifier"=>"OCLC:12345678"}],
    #
    (volume["industryIdentifiers"] || []).each do |hash|
      # to_s so an entry without an identifier is skipped instead of raising
      identifier = hash["identifier"].to_s

      if hash["type"] == "ISBN_13"
        element_enhance(request, "isbn", hash["identifier"])
      elsif hash["type"] == "OTHER" && identifier.start_with?("LCCN:")
        lccn = normalize_lccn(identifier.slice(5, identifier.length))
        request.referent.add_identifier("info:lccn/#{lccn}")
      elsif hash["type"] == "OTHER" && identifier.start_with?("OCLC:")
        # NOTE(review): the OCLC number is passed through normalize_lccn,
        # as in the original code -- confirm that is intended.
        oclcnum = normalize_lccn(identifier.slice(5, identifier.length))
        request.referent.add_identifier("info:oclcnum/#{oclcnum}")
      end
    end
  end

  # Adds an :abstract service response linking to Google's info page when
  # the first hit has a description. Does nothing if the first hit has no
  # volumeInfo or no description.
  def add_abstract(request, data)
    first_hit = (data["items"] || []).first
    info = first_hit && first_hit["volumeInfo"]
    return nil unless info && info["description"]

    url = info["infoLink"]
    request.add_service_response(
      :service => self,
      :display_text => "Description from Google Books",
      :display_text_i18n => "description",
      :url => remove_query_context(url),
      :service_type_value => :abstract
    )
  end

  # Will not over-write existing referent values.
  def element_enhance(request, rft_key, value)
    if value
      request.referent.enhance_referent(rft_key, value.to_s, true, false, :overwrite => false)
    end
  end

  # returns nil or escaped string of bibkeys
  # to increase the chances of good hit, we send all available bibkeys
  # and later dedupe by id.
  # FIXME Assumes we only have one of each kind of identifier.
  def get_bibkeys(rft)
    isbn = get_identifier(:urn, "isbn", rft)
    oclcnum = get_identifier(:info, "oclcnum", rft)
    lccn = get_lccn(rft)

    # Google doesn't officially support oclc/lccn search, but does
    # index as token with prefix smashed up right with identifier
    # eg http://books.google.com/books/feeds/volumes?q=OCLC32012617
    #
    # Except turns out doing it as a phrase search is important! Or
    # google's normalization/tokenization does odd things.
    keys = []
    keys << ('isbn:' + isbn) if isbn
    keys << ('"' + "OCLC" + oclcnum + '"') if oclcnum
    # Only use LCCN if we've got nothing else, and we're allowing it.
    # it returns many false positives.
    if @lookup_by_lccn && lccn && keys.empty?
      keys << ('"' + 'LCCN' + lccn + '"')
    end

    return nil if keys.empty?

    return CGI.escape(keys.join(' OR '))
  end

  # Fetches the Google Books volumes feed for the (already escaped) bibkeys
  # and returns the parsed JSON. An error response is logged and still
  # returned, so the caller can inspect data["error"].
  def do_query(bibkeys, request)
    headers = build_headers(request)
    link = @url + bibkeys
    if @api_key
      link += "&key=#{@api_key}"
    end

    # Add on limit to only request books, not magazines.
    link += "&printType=books"

    Rails.logger.debug("GoogleBookSearch requesting: #{link}")
    response = http_fetch(link, :headers => headers, :raise_on_http_error_code => false)
    data = MultiJson.load(response.body)

    # If Google gives us an error cause it says it can't geo-locate,
    # remove the IP, log warning, and try again.
    if (data["error"] && data["error"]["errors"] &&
        data["error"]["errors"].find {|h| h["reason"] == "unknownLocation"} )
      Rails.logger.warn("GoogleBookSearch: geo-locate error, retrying without X-Forwarded-For: '#{link}' headers: #{headers.inspect} #{response.inspect}\n #{data.inspect}")

      response = http_fetch(link, :raise_on_http_error_code => false)
      data = MultiJson.load(response.body)
    end

    if (! response.kind_of?(Net::HTTPSuccess)) || data["error"]
      Rails.logger.error("GoogleBookSearch error: '#{link}' headers: #{headers.inspect} #{response.inspect}\n #{data.inspect}")
    end

    return data
  end

  # We don't need to fake a proxy request anymore, but we still
  # include X-Forwarded-For so google can return location-appropriate
  # availability. If there's an existing X-Forwarded-For, we respect
  # it and pass on its first entry.
  #
  # Returns a headers hash: either {'X-Forwarded-For' => ip} or {} when
  # there is no usable public IPv4 address to send.
  def build_headers(request)
    original_forwarded_for = nil
    if request.http_env && request.http_env['HTTP_X_FORWARDED_FOR']
      original_forwarded_for = request.http_env['HTTP_X_FORWARDED_FOR']
    end

    # we used to prepare a comma separated list in x-forwarded-for if
    # we had multiple requests, as per the x-forwarded-for spec, but I
    # think Google doesn't like it.
    ip_address = original_forwarded_for ? original_forwarded_for : request.client_ip_addr.to_s

    return {} if ip_address.blank?

    # If we've got a comma-separated list from an X-Forwarded-For, we
    # can't send it on to google, google won't accept that, just take
    # the first one in the list, which is actually the ultimate client
    # IP. split returns the whole string if separator isn't found, convenient.
    # to_s/strip cope with a list that is only separators or is padded.
    ip_address = ip_address.split(",").first.to_s.strip

    # If it doesn't look like an IPv4 address at all, forget it, don't send
    # it. (The original wrote `! ip_address =~ ...`, which negates the
    # string before matching, so the check never ran.)
    return {} unless ip_address =~ /\A\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}\z/

    # If all we have is an internal/private IP from the internal network,
    # do NOT send that to Google, or Google will give you a 503 error
    # and refuse to process your request, as of 7 sep 2011. sigh.
    return {} if private_ip?(ip_address)

    return {'X-Forwarded-For' => ip_address }
  end

  # Returns the items of a GBS response whose accessInfo viewability is one
  # of +viewabilities+ (a single value or an Array of values). Items with
  # no accessInfo are ignored; a response with no items yields [].
  def find_entries(gbs_response, viewabilities)
    viewabilities = [viewabilities] unless viewabilities.kind_of?(Array)

    (gbs_response["items"] || []).find_all do |entry|
      viewability = (entry["accessInfo"] || {})["viewability"]
      viewability && viewabilities.include?(viewability)
    end
  end

  # We only create a fulltext service response if we have a full view.
  # We create only as many full views as are specified in config.
  # Returns true if at least one response was added, nil otherwise.
  def create_fulltext_service_response(request, data)
    full_views = find_entries(data, ViewFullValue)
    return nil if full_views.empty?

    shown = 0
    full_views.each do |fv|
      uri = volume_info(fv)["previewLink"]
      # nothing to link to; skip instead of crashing in remove_query_context
      next if uri.blank?

      request.add_service_response(
        :service => self,
        :display_text => @display_name,
        :display_text_i18n => "display_name",
        :url => remove_query_context(uri),
        :service_type_value => :fulltext
      )
      shown += 1
      break if shown == @num_full_views
    end

    return (shown > 0 ? true : nil)
  end

  # Adds a search_inside response for the first full or partial view hit,
  # provided it carries an infoLink.
  def add_search_inside(request, data)
    # Just take the first one we find, if multiple
    searchable_view = find_entries(data, [ViewFullValue, ViewPartialValue])[0]
    return nil unless searchable_view

    url = volume_info(searchable_view)["infoLink"]
    # response_url appends to this url later, so it must be present
    return nil if url.blank?

    request.add_service_response(
      :service => self,
      :display_text => @display_name,
      :display_text_i18n => "display_name",
      :url => remove_query_context(url),
      :service_type_value => :search_inside
    )
  end

  # create highlighted_link service response for partial and noview
  # Only show one web link. prefer a partial view over a noview.
  # Some noviews have a snippet/search, but we have no way to tell.
  def do_web_links(request, data)
    # some noview items will have a snippet view, but we have no way to tell
    info_views = find_entries(data, ViewPartialValue)
    viewability = ViewPartialValue

    if info_views.blank?
      info_views = find_entries(data, ViewNoneValue)
      viewability = ViewNoneValue
    end

    # Shouldn't ever get to this point, but just in case
    return nil if info_views.blank?

    info = volume_info(info_views.first)
    preview_link = (viewability == ViewPartialValue) ? info["previewLink"] : nil

    if preview_link
      url = preview_link
      display_text = @display_name
      display_text_i18n = "display_name"
      type = ServiceTypeValue[:excerpts]
    else
      url = info["infoLink"]
      display_text = "Book Information"
      display_text_i18n = "book_information"
      type = ServiceTypeValue[:highlighted_link]
    end

    # no link at all in the response: nothing useful to show
    return nil if url.blank?

    url = fix_pg_gbs_link(url)

    request.add_service_response(
      :service => self,
      :url => remove_query_context(url),
      :display_text => display_text,
      :display_text_i18n => display_text_i18n,
      :service_type_value => type
    )
  end

  # google books direct links do weird things with linking to
  # internal pages, perhaps intending to be based on our
  # search criteria, which pages matched, but we're not
  # using it like that for links to excerpts or full page.
  # reverse engineer it to go to full page.
  def fix_pg_gbs_link(url)
    url.sub(/([\?\;\&])(pg=[^;&]+)/, '\1pg=1')
  end

  # Not all responses have a thumbnail_url. We look for them and return the
  # 1st, or nil if no item has one.
  def find_thumbnail_url(data)
    (data["items"] || []).each do |entry|
      image_links = volume_info(entry)["imageLinks"]
      thumbnail = image_links && image_links["thumbnail"]
      return thumbnail if thumbnail
    end

    return nil
  end

  # Adds a cover_image response for the given thumbnail url, rewritten to
  # point at the front cover without the page-curl decoration.
  def add_cover_image(request, url)
    zoom_url = url.clone

    # if we're sent to a page other than the frontcover then strip out the
    # page number and insert front cover
    zoom_url.sub!(/&pg=.*?&/, '&printsec=frontcover&')

    # hack out the 'curl' if we can
    zoom_url.sub!('&edge=curl', '')

    request.add_service_response(
      :service => self,
      :display_text => 'Cover Image',
      :url => zoom_url,
      :size => "medium",
      :service_type_value => :cover_image
    )
  end

  # Google gives us URL to the book that contains a 'dq' param
  # with the original query, which for us is an ISBN/LCCN/OCLCnum query,
  # which we don't actually want to leave in there.
  # Returns nil when given nil.
  def remove_query_context(url)
    url && url.sub(/&dq=[^&]+/, '')
  end

  # Catch url_for call for search_inside, because we're going to redirect
  def response_url(service_response, submitted_params)
    unless service_response.service_type_value.name == "search_inside"
      return super(service_response, submitted_params)
    end

    # search inside!
    base = service_response[:url]
    query = CGI.escape(submitted_params["query"] || "")
    # attempting to reverse engineer a bit to get 'snippet'
    # style results instead of 'onepage' style results.
    # snippet seem more user friendly, and are what google's own
    # interface seems to give you by default. but 'onepage' is the
    # default from our deep link, but if we copy the JS hash data,
    # it looks like we can get Google to 'snippet'.
    return base + "&q=#{query}#v=snippet&q=#{query}&f=false"
  end

  private

  # The "volumeInfo" hash of a GBS item, or an empty hash if it has none.
  def volume_info(entry)
    entry["volumeInfo"] || {}
  end

  # True if the dotted-quad string is in one of the RFC 1918 private ranges:
  # 10.0.0.0/8, 172.16.0.0/12 (172.16 - 172.31) or 192.168.0.0/16.
  def private_ip?(ip_address)
    return true if ip_address.start_with?("10.", "192.168.")

    !!(ip_address =~ /\A172\.(1[6-9]|2\d|3[01])\./)
  end
end

# Important to quote search, see: "OCLC1246014"

# Test WorldCat links
# FIXME: This produces two 'noview' links because the ids don't match.
# This might be as good as we can do though, unless we want to only ever show
# one 'noview' link. Notice that the metadata does differ between the two.
# http://localhost:3000/resolve?url_ver=Z39.88-2004&rfr_id=info%3Asid%2Fworldcat.org%3Aworldcat&rft_val_fmt=info%3Aofi%2Ffmt%3Akev%3Amtx%3Abook&req_dat=%3Csessionid%3E&rft_id=info%3Aoclcnum%2F34576818&rft_id=urn%3AISBN%3A9780195101386&rft_id=urn%3AISSN%3A&rft.aulast=Twain&rft.aufirst=Mark&rft.auinitm=&rft.btitle=The+prince+and+the+pauper&rft.atitle=&rft.date=1996&rft.tpages=&rft.isbn=9780195101386&rft.aucorp=&rft.place=New+York&rft.pub=Oxford+University+Press&rft.edition=&rft.series=&rft.genre=book&url_ver=Z39.88-2004
#
# Snippet view returns noview through the API
# http://localhost:3000/resolve?rft.isbn=0155374656
#
# full view example, LCCN 07020699 ; OCLC: 1246014