lib/stanford-mods/searchworks.rb in stanford-mods-1.1.2 vs lib/stanford-mods/searchworks.rb in stanford-mods-1.1.3

- old
+ new

@@ -1,23 +1,25 @@ # encoding: UTF-8 require 'stanford-mods/searchworks_languages' require 'logger' +require 'mods' + # SearchWorks specific wranglings of MODS metadata as a mixin to the Stanford::Mods::Record object module Stanford module Mods class Record < ::Mods::Record - + # include langagues known to SearchWorks; try to error correct when possible (e.g. when ISO-639 disagrees with MARC standard) def sw_language_facet result = [] - @mods_ng_xml.language.each { |n| + @mods_ng_xml.language.each { |n| # get languageTerm codes and add their translations to the result - n.code_term.each { |ct| + n.code_term.each { |ct| if ct.authority.match(/^iso639/) begin - vals = ct.text.split(/[,|\ ]/).reject {|x| x.strip.length == 0 } + vals = ct.text.split(/[,|\ ]/).reject {|x| x.strip.length == 0 } vals.each do |v| iso639_val = ISO_639.find(v.strip).english_name if SEARCHWORKS_LANGUAGES.has_value?(iso639_val) result << iso639_val else @@ -27,18 +29,18 @@ rescue => e # TODO: this should be written to a logger p "Couldn't find english name for #{ct.text}" end else - vals = ct.text.split(/[,|\ ]/).reject {|x| x.strip.length == 0 } + vals = ct.text.split(/[,|\ ]/).reject {|x| x.strip.length == 0 } vals.each do |v| result << SEARCHWORKS_LANGUAGES[v.strip] end end } # add languageTerm text values - n.text_term.each { |tt| + n.text_term.each { |tt| val = tt.text.strip result << val if val.length > 0 && SEARCHWORKS_LANGUAGES.has_value?(val) } # add language values that aren't in languageTerm subelement @@ -46,100 +48,100 @@ result << n.text if SEARCHWORKS_LANGUAGES.has_value?(n.text) end } result.uniq end # language_facet - - + + # ---- AUTHOR ---- - + # @return [String] value for author_1xx_search field def sw_main_author main_author_w_date end - + # @return [Array<String>] values for author_7xx_search field def sw_addl_authors additional_authors_w_dates end - + # @return [Array<String>] values for author_person_facet, author_person_display def sw_person_authors personal_names_w_dates end - + # return the display_value_w_date for all <mods><name> elements that do not have type='personal' # @return [Array<String>] values for author_other_facet def sw_impersonal_authors @mods_ng_xml.plain_name.select {|n| n.type_at != 'personal'}.map { |n| n.display_value_w_date } end - + # @return [Array<String>] values for author_corp_display def sw_corporate_authors val = @mods_ng_xml.plain_name.select {|n| n.type_at == 'corporate'}.map { |n| n.display_value_w_date } val end - + # @return [Array<String>] values for author_meeting_display def sw_meeting_authors @mods_ng_xml.plain_name.select {|n| n.type_at == 'conference'}.map { |n| n.display_value_w_date } end - + # Returns a sortable version of the main_author: # main_author + sorting title # which is the mods approximation of the value created for a marc record # @return [String] value for author_sort field def sw_sort_author # substitute java Character.MAX_CODE_POINT for nil main_author so missing main authors sort last val = '' + (main_author_w_date ? main_author_w_date : "\u{10FFFF} ") + ( sort_title ? sort_title : '') val.gsub(/[[:punct:]]*/, '').strip end - + def main_author_w_date_test result = nil first_wo_role = nil self.plain_name.each { |n| if n.role.size == 0 first_wo_role ||= n end n.role.each { |r| - if r.authority.include?('marcrelator') && + if r.authority.include?('marcrelator') && (r.value.include?('Creator') || r.value.include?('Author')) result ||= n.display_value_w_date - end + end } } if !result && first_wo_role result = first_wo_role.display_value_w_date end result end # ---- end AUTHOR ---- - + # ---- TITLE ---- # @return [String] value for title_245a_search field def sw_short_title short_titles ? short_titles.first : nil end - + # @return [String] value for title_245_search, title_full_display def sw_full_title outer_nodes = @mods_ng_xml.title_info outer_node = outer_nodes ? outer_nodes.first : nil if outer_node nonSort = outer_node.nonSort.text.strip.empty? ? nil : outer_node.nonSort.text.strip - title = outer_node.title.text.strip.empty? ? nil: outer_node.title.text.strip + title = outer_node.title.text.strip.empty? ? nil : outer_node.title.text.strip preSubTitle = nonSort ? [nonSort, title].compact.join(" ") : title preSubTitle.sub!(/:$/, '') if preSubTitle # remove trailing colon subTitle = outer_node.subTitle.text.strip preParts = subTitle.empty? ? preSubTitle : preSubTitle + " : " + subTitle preParts.sub!(/\.$/, '') if preParts # remove trailing period - - partName = outer_node.partName.text.strip unless outer_node.partName.text.strip.empty? + + partName = outer_node.partName.text.strip unless outer_node.partName.text.strip.empty? partNumber = outer_node.partNumber.text.strip unless outer_node.partNumber.text.strip.empty? partNumber.sub!(/,$/, '') if partNumber # remove trailing comma if partNumber && partName parts = partNumber + ", " + partName elsif partNumber @@ -169,87 +171,87 @@ result.sub!(/[\.,;:\/\\]+$/, '') result.strip! end result end - - # this includes all titles except + + # this includes all titles except # @return [Array<String>] values for title_variant_search def sw_addl_titles full_titles.select { |s| s !~ Regexp.new(Regexp.escape(sw_short_title)) } end - + # Returns a sortable version of the main title # @return [String] value for title_sort field def sw_sort_title # get nonSort piece outer_nodes = @mods_ng_xml.title_info outer_node = outer_nodes ? outer_nodes.first : nil if outer_node nonSort = outer_node.nonSort.text.strip.empty? ? nil : outer_node.nonSort.text.strip end - + val = '' + ( sw_full_title ? sw_full_title : '') val.sub!(Regexp.new("^" + nonSort), '') if nonSort val.gsub!(/[[:punct:]]*/, '').strip val.squeeze(" ").strip end - + #remove trailing commas # @deprecated in favor of sw_title_display def sw_full_title_without_commas result = self.sw_full_title result.sub!(/,$/, '') if result result end - + # ---- end TITLE ---- # ---- SUBJECT ---- - + # Values are the contents of: # subject/geographic # subject/hierarchicalGeographic # subject/geographicCode (only include the translated value if it isn't already present from other mods geo fields) # @param [String] sep - the separator string for joining hierarchicalGeographic sub elements # @return [Array<String>] values for geographic_search Solr field for this document or [] if none def sw_geographic_search(sep = ' ') result = term_values([:subject, :geographic]) || [] - + # hierarchicalGeographic has sub elements - @mods_ng_xml.subject.hierarchicalGeographic.each { |hg_node| + @mods_ng_xml.subject.hierarchicalGeographic.each { |hg_node| hg_vals = [] - hg_node.element_children.each { |e| + hg_node.element_children.each { |e| hg_vals << e.text unless e.text.empty? } result << hg_vals.join(sep) unless hg_vals.empty? } trans_code_vals = @mods_ng_xml.subject.geographicCode.translated_value if trans_code_vals - trans_code_vals.each { |val| + trans_code_vals.each { |val| result << val if !result.include?(val) } end - result + result end - + # Values are the contents of: # subject/name/namePart # "Values from namePart subelements should be concatenated in the order they appear (e.g. "Shakespeare, William, 1564-1616")" # @param [String] sep - the separator string for joining namePart sub elements # @return [Array<String>] values for names inside subject elements or [] if none def sw_subject_names(sep = ', ') result = [] - @mods_ng_xml.subject.name_el.select { |n_el| n_el.namePart }.each { |name_el_w_np| + @mods_ng_xml.subject.name_el.select { |n_el| n_el.namePart }.each { |name_el_w_np| parts = name_el_w_np.namePart.map { |npn| npn.text unless npn.text.empty? }.compact result << parts.join(sep).strip unless parts.empty? } result end - + # Values are the contents of: # subject/titleInfo/(subelements) # @param [String] sep - the separator string for joining titleInfo sub elements # @return [Array<String>] values for titles inside subject elements or [] if none def sw_subject_titles(sep = ' ') @@ -258,11 +260,11 @@ parts = ti_el.element_children.map { |el| el.text unless el.text.empty? }.compact result << parts.join(sep).strip unless parts.empty? } result end - + # Values are the contents of: # mods/genre # mods/subject/topic # @return [Array<String>] values for the topic_search Solr field for this document or nil if none def topic_search @@ -277,31 +279,31 @@ # subject/topic # subject/name # subject/title # subject/occupation # with trailing comma, semicolon, and backslash (and any preceding spaces) removed - # @return [Array<String>] values for the topic_facet Solr field for this document or nil if none + # @return [Array<String>] values for the topic_facet Solr field for this document or nil if none def topic_facet vals = subject_topics ? Array.new(subject_topics) : [] vals.concat(subject_names) if subject_names vals.concat(subject_titles) if subject_titles vals.concat(subject_occupations) if subject_occupations - vals.map! { |val| + vals.map! { |val| v = val.sub(/[\\,;]$/, '') v.strip } vals.empty? ? nil : vals end # geographic_search values with trailing comma, semicolon, and backslash (and any preceding spaces) removed - # @return [Array<String>] values for the geographic_facet Solr field for this document or nil if none + # @return [Array<String>] values for the geographic_facet Solr field for this document or nil if none def geographic_facet geographic_search.map { |val| val.sub(/[\\,;]$/, '').strip } unless !geographic_search end # subject/temporal values with trailing comma, semicolon, and backslash (and any preceding spaces) removed - # @return [Array<String>] values for the era_facet Solr field for this document or nil if none + # @return [Array<String>] values for the era_facet Solr field for this document or nil if none def era_facet subject_temporal.map { |val| val.sub(/[\\,;]$/, '').strip } unless !subject_temporal end # Values are the contents of: @@ -314,20 +316,20 @@ result = self.sw_geographic_search # TODO: this should go into stanford-mods ... but then we have to set that gem up with a Logger # print a message for any unrecognized encodings xvals = self.subject.geographicCode.translated_value - codes = self.term_values([:subject, :geographicCode]) + codes = self.term_values([:subject, :geographicCode]) if codes && codes.size > xvals.size self.subject.geographicCode.each { |n| if n.authority != 'marcgac' && n.authority != 'marccountry' sw_logger.info("#{druid} has subject geographicCode element with untranslated encoding (#{n.authority}): #{n.to_xml}") end } end - # FIXME: stanford-mods should be returning [], not nil ... + # FIXME: stanford-mods should be returning [], not nil ... return nil if !result || result.empty? result end end @@ -354,18 +356,18 @@ vals = subject_temporal ? Array.new(subject_temporal) : [] gvals = self.term_values([:subject, :genre]) vals.concat(gvals) if gvals # print a message for any temporal encodings - self.subject.temporal.each { |n| + self.subject.temporal.each { |n| sw_logger.info("#{druid} has subject temporal element with untranslated encoding: #{n.to_xml}") if !n.encoding.empty? } vals.empty? ? nil : vals end end - + # Values are the contents of: # all subject subelements except subject/cartographic plus genre top level element # @return [Array<String>] values for the subject_all_search Solr field for this document or nil if none def subject_all_search vals = topic_search ? Array.new(topic_search) : [] @@ -386,30 +388,30 @@ # For the date display only, the first place to look is in the dates without encoding=marc array. # If no such dates, select the first date in the dates_marc_encoding array. Otherwise return nil # @return [String] value for the pub_date_display Solr field for this document or nil if none def pub_date_display return dates_no_marc_encoding.first unless dates_no_marc_encoding.empty? - return dates_marc_encoding.first unless dates_marc_encoding.empty? + return dates_marc_encoding.first unless dates_marc_encoding.empty? return nil end # For the date indexing, sorting and faceting, the first place to look is in the dates with encoding=marc array. # If that doesn't exist, look in the dates without encoding=marc array. Otherwise return nil # @return [Array<String>] values for the date Solr field for this document or nil if none def pub_dates - return dates_marc_encoding unless dates_marc_encoding.empty? + return dates_marc_encoding unless dates_marc_encoding.empty? return dates_no_marc_encoding unless dates_no_marc_encoding.empty? return nil end - + def is_number?(object) true if Integer(object) rescue false end def is_date?(object) true if Date.parse(object) rescue false end - + # Get the publish year from mods # @return [String] 4 character year or nil if no valid date was found def pub_year #use the cached year if there is one if @pub_year @@ -421,11 +423,11 @@ dates = pub_dates if dates year = [] pruned_dates = [] dates.each do |f_date| - #remove ? and [] + #remove ? and [] pruned_dates << f_date.gsub('?','').gsub('[','').gsub(']','') end #try to find a date starting with the most normal date formats and progressing to more wonky ones @pub_year = get_plain_four_digit_year pruned_dates return @pub_year if @pub_year @@ -442,35 +444,31 @@ return @pub_year if @pub_year end @pub_year='' return nil end - + #creates a date suitable for sorting. Guarnteed to be 4 digits or nil def pub_date_sort pd=nil if pub_date pd=pub_date if pd.length == 3 pd='0'+pd end pd=pd.gsub('--','00') end - raise "pub_date_sort was about to return a non 4 digit value #{pd}!" if pd and pd.length !=4 + raise "pub_date_sort was about to return a non 4 digit value #{pd}!" if pd and pd.length !=4 pd end - + #The year the object was published, , filtered based on max_pub_date and min_pub_date from the config file #@return [String] 4 character year or nil def pub_date - val=pub_year - if val - return val - end - nil + pub_year || nil end - + #Values for the pub date facet. This is less strict than the 4 year date requirements for pub_date #@return <Array[String]> with values for the pub date facet def pub_date_facet if pub_date if pub_date.start_with?('-') @@ -486,17 +484,17 @@ end else nil end end - + # ---- end PUBLICATION (place, year) ---- def sw_logger @logger ||= Logger.new(STDOUT) end - + # select one or more format values from the controlled vocabulary here: # http://searchworks-solr-lb.stanford.edu:8983/solr/select?facet.field=format&rows=0&facet.sort=index # @return <Array[String]> value in the SearchWorks controlled vocabulary # @deprecated - kept for backwards compatibility but not part of SW UI redesign work Summer 2014 def format @@ -524,12 +522,12 @@ when 'still image' val << 'Image' when 'text' val << 'Book' if issuance and issuance.include? 'monographic' book_genres = ['book chapter', 'Book chapter', 'Book Chapter', - 'issue brief', 'Issue brief', 'Issue Brief', - 'librettos', 'Librettos', + 'issue brief', 'Issue brief', 'Issue Brief', + 'librettos', 'Librettos', 'project report', 'Project report', 'Project Report', 'technical report', 'Technical report', 'Technical Report', 'working paper', 'Working paper', 'Working Paper'] val << 'Book' if genres and !(genres & book_genres).empty? conf_pub = ['conference publication', 'Conference publication', 'Conference Publication'] @@ -553,10 +551,23 @@ # http://searchworks-solr-lb.stanford.edu:8983/solr/select?facet.field=format_main_ssim&rows=0&facet.sort=index # @return <Array[String]> value in the SearchWorks controlled vocabulary def format_main val = [] types = self.term_values(:typeOfResource) + article_genres = ['article', 'Article', + 'book chapter', 'Book chapter', 'Book Chapter', + 'issue brief', 'Issue brief', 'Issue Brief', + 'project report', 'Project report', 'Project Report', + 'student project report', 'Student project report', 'Student Project report', 'Student Project Report', + 'technical report', 'Technical report', 'Technical Report', + 'working paper', 'Working paper', 'Working Paper' + ] + book_genres = ['conference publication', 'Conference publication', 'Conference Publication', + 'instruction', 'Instruction', + 'librettos', 'Librettos', + 'thesis', 'Thesis' + ] if types genres = self.term_values(:genre) issuance = self.term_values([:origin_info,:issuance]) types.each do |type| case type @@ -579,26 +590,13 @@ when 'sound recording-nonmusical', 'sound recording' val << 'Sound recording' when 'still image' val << 'Image' when 'text' - article_genres = ['article', 'Article', - 'book chapter', 'Book chapter', 'Book Chapter', - 'issue brief', 'Issue brief', 'Issue Brief', - 'project report', 'Project report', 'Project Report', - 'student project report', 'Student project report', 'Student Project report', 'Student Project Report', - 'technical report', 'Technical report', 'Technical Report', - 'working paper', 'Working paper', 'Working Paper' - ] - val << 'Book' if genres and !(genres & article_genres).empty? + val << 'Book' if genres and !(genres & article_genres).empty? val << 'Book' if issuance and issuance.include? 'monographic' - book_genres = ['conference publication', 'Conference publication', 'Conference Publication', - 'instruction', 'Instruction', - 'librettos', 'Librettos', - 'thesis', 'Thesis' - ] - val << 'Book' if genres and !(genres & book_genres).empty? + val << 'Book' if genres and !(genres & book_genres).empty? val << 'Journal/Periodical' if issuance and issuance.include? 'continuing' when 'three dimensional object' val << 'Object' end end @@ -669,35 +667,35 @@ # convenience method for subject/topic values (to avoid parsing the mods for the same thing multiple times) def subject_topics @subject_topics ||= self.term_values([:subject, :topic]) end - + #get a 4 digit year like 1865 from the date array def get_plain_four_digit_year dates dates.each do |f_date| matches=f_date.scan(/\d{4}/) if matches.length == 1 - @pub_year=matches.first + @pub_year=matches.first return matches.first else #if there are multiples, check for ones with CE after them matches.each do |match| #look for things like '1865-6 CE' pos = f_date.index(Regexp.new(match+'...CE')) pos = pos ? pos.to_i : 0 if f_date.include?(match+' CE') or pos > 0 @pub_year=match - return match - end + return match + end end return matches.first end end return nil end - + # If a year has a "u" in it, replace instances of u with 0 # @param [String] dates # @return String def get_u_year dates dates.each do |f_date| @@ -712,11 +710,11 @@ return matches.first.gsub('u','-') end end return nil end - + #get a double digit century like '12th century' from the date array def get_double_digit_century dates dates.each do |f_date| matches=f_date.scan(/\d{2}th/) if matches.length == 1 @@ -730,17 +728,17 @@ pos = pos ? pos.to_i : f_date.index(Regexp.new(match+' century CE')) pos = pos ? pos.to_i : 0 if f_date.include?(match+' CE') or pos > 0 @pub_year=((match[0,2].to_i) - 1).to_s+'--' return @pub_year - end + end end end end return nil end - + #get a 3 digit year like 965 from the date array def get_three_digit_year dates dates.each do |f_date| matches=f_date.scan(/\d{3}/) if matches.length > 0 @@ -751,18 +749,18 @@ end #get the 3 digit BC year, return it as a negative, so -700 for 300 BC. Other methods will translate it to proper display, this is good for sorting. def get_bc_year dates dates.each do |f_date| matches=f_date.scan(/\d{3} B.C./) - if matches.length > 0 + if matches.length > 0 bc_year=matches.first[0..2] return (bc_year.to_i-1000).to_s end end return nil end - + #get a single digit century like '9th century' from the date array def get_single_digit_century dates dates.each do |f_date| matches=f_date.scan(/\d{1}th/) if matches.length == 1 @@ -776,14 +774,14 @@ pos = pos ? pos.to_i : f_date.index(Regexp.new(match+' century CE')) pos = pos ? pos.to_i : 0 if f_date.include?(match+' CE') or pos > 0 @pub_year=((match[0,1].to_i) - 1).to_s+'--' return @pub_year - end + end end end - end + end return nil end # @return [Array<String>] dates from dateIssued and dateCreated tags from origin_info with encoding="marc" def dates_marc_encoding @@ -799,10 +797,10 @@ parse_dates_from_originInfo @dates_no_marc_encoding end end - # Populate @dates_marc_encoding and @dates_no_marc_encoding from dateIssued and dateCreated tags from origin_info + # Populate @dates_marc_encoding and @dates_no_marc_encoding from dateIssued and dateCreated tags from origin_info # with and without encoding=marc def parse_dates_from_originInfo @dates_marc_encoding = [] @dates_no_marc_encoding = [] self.origin_info.dateIssued.each { |di|