searchworks.rb in stanford-mods-1.1.3

- old
+ new

@@ -1,23 +1,25 @@
 # encoding: UTF-8
 require 'stanford-mods/searchworks_languages'
 require 'logger'
+require 'mods'
+
 # SearchWorks specific wranglings of MODS metadata as a mixin to the Stanford::Mods::Record object
 module Stanford
   module Mods
 
     class Record < ::Mods::Record
-      
+
       # include langagues known to SearchWorks; try to error correct when possible (e.g. when ISO-639 disagrees with MARC standard)
       def sw_language_facet
         result = []
-        @mods_ng_xml.language.each { |n| 
+        @mods_ng_xml.language.each { |n|
           # get languageTerm codes and add their translations to the result
-          n.code_term.each { |ct| 
+          n.code_term.each { |ct|
             if ct.authority.match(/^iso639/)
               begin
-                vals = ct.text.split(/[,|\ ]/).reject {|x| x.strip.length == 0 } 
+                vals = ct.text.split(/[,|\ ]/).reject {|x| x.strip.length == 0 }
                 vals.each do |v|
                   iso639_val = ISO_639.find(v.strip).english_name
                   if SEARCHWORKS_LANGUAGES.has_value?(iso639_val)
                     result << iso639_val
                   else
@@ -27,18 +29,18 @@
               rescue => e
                 # TODO:  this should be written to a logger
                 p "Couldn't find english name for #{ct.text}"
               end
             else
-              vals = ct.text.split(/[,|\ ]/).reject {|x| x.strip.length == 0 } 
+              vals = ct.text.split(/[,|\ ]/).reject {|x| x.strip.length == 0 }
               vals.each do |v|
                 result << SEARCHWORKS_LANGUAGES[v.strip]
               end
             end
           }
           # add languageTerm text values
-          n.text_term.each { |tt| 
+          n.text_term.each { |tt|
             val = tt.text.strip
             result << val if val.length > 0 && SEARCHWORKS_LANGUAGES.has_value?(val)
           }
 
           # add language values that aren't in languageTerm subelement
@@ -46,100 +48,100 @@
             result << n.text if SEARCHWORKS_LANGUAGES.has_value?(n.text)
           end
         }
         result.uniq
       end # language_facet
-      
-      
+
+
       # ---- AUTHOR ----
-      
+
       # @return [String] value for author_1xx_search field
       def sw_main_author
         main_author_w_date
       end
-      
+
       # @return [Array<String>] values for author_7xx_search field
       def sw_addl_authors
         additional_authors_w_dates
       end
-      
+
       # @return [Array<String>] values for author_person_facet, author_person_display
       def sw_person_authors
         personal_names_w_dates
       end
-      
+
       # return the display_value_w_date for all <mods><name> elements that do not have type='personal'
       # @return [Array<String>] values for author_other_facet
       def sw_impersonal_authors
         @mods_ng_xml.plain_name.select {|n| n.type_at != 'personal'}.map { |n| n.display_value_w_date }
       end
-      
+
       # @return [Array<String>] values for author_corp_display
       def sw_corporate_authors
         val = @mods_ng_xml.plain_name.select {|n| n.type_at == 'corporate'}.map { |n| n.display_value_w_date }
         val
       end
-      
+
       # @return [Array<String>] values for author_meeting_display
       def sw_meeting_authors
         @mods_ng_xml.plain_name.select {|n| n.type_at == 'conference'}.map { |n| n.display_value_w_date }
       end
-      
+
       # Returns a sortable version of the main_author:
       #  main_author + sorting title
       # which is the mods approximation of the value created for a marc record
       # @return [String] value for author_sort field
       def sw_sort_author
         #  substitute java Character.MAX_CODE_POINT for nil main_author so missing main authors sort last
         val = '' + (main_author_w_date ? main_author_w_date : "\u{10FFFF} ") + ( sort_title ? sort_title : '')
         val.gsub(/[[:punct:]]*/, '').strip
       end
-      
+
       def main_author_w_date_test
         result = nil
         first_wo_role = nil
         self.plain_name.each { |n|
           if n.role.size == 0
             first_wo_role ||= n
           end
           n.role.each { |r|
-            if r.authority.include?('marcrelator') && 
+            if r.authority.include?('marcrelator') &&
               (r.value.include?('Creator') || r.value.include?('Author'))
               result ||= n.display_value_w_date
-            end          
+            end
           }
         }
         if !result && first_wo_role
           result = first_wo_role.display_value_w_date
         end
         result
       end
 
       # ---- end AUTHOR ----
-      
+
       # ---- TITLE ----
 
       # @return [String] value for title_245a_search field
       def sw_short_title
         short_titles ? short_titles.first : nil
       end
-      
+
       # @return [String] value for title_245_search, title_full_display
       def sw_full_title
         outer_nodes = @mods_ng_xml.title_info
         outer_node = outer_nodes ? outer_nodes.first : nil
         if outer_node
           nonSort = outer_node.nonSort.text.strip.empty? ? nil : outer_node.nonSort.text.strip
-          title = outer_node.title.text.strip.empty? ? nil: outer_node.title.text.strip
+          title   = outer_node.title.text.strip.empty?   ? nil : outer_node.title.text.strip
           preSubTitle = nonSort ? [nonSort, title].compact.join(" ") : title
           preSubTitle.sub!(/:$/, '') if preSubTitle # remove trailing colon
 
           subTitle = outer_node.subTitle.text.strip
           preParts = subTitle.empty? ? preSubTitle : preSubTitle + " : " + subTitle
           preParts.sub!(/\.$/, '') if preParts # remove trailing period
-          
-          partName = outer_node.partName.text.strip unless outer_node.partName.text.strip.empty?
+
+          partName   = outer_node.partName.text.strip   unless outer_node.partName.text.strip.empty?
           partNumber = outer_node.partNumber.text.strip unless outer_node.partNumber.text.strip.empty?
           partNumber.sub!(/,$/, '') if partNumber # remove trailing comma
           if partNumber && partName
             parts = partNumber + ", " + partName
           elsif partNumber
@@ -169,87 +171,87 @@
           result.sub!(/[\.,;:\/\\]+$/, '')
           result.strip!
         end
         result
       end
-      
-      # this includes all titles except 
+
+      # this includes all titles except
       # @return [Array<String>] values for title_variant_search
       def sw_addl_titles
         full_titles.select { |s| s !~ Regexp.new(Regexp.escape(sw_short_title)) }
       end
-      
+
       # Returns a sortable version of the main title
       # @return [String] value for title_sort field
       def sw_sort_title
         # get nonSort piece
         outer_nodes = @mods_ng_xml.title_info
         outer_node = outer_nodes ? outer_nodes.first : nil
         if outer_node
           nonSort = outer_node.nonSort.text.strip.empty? ? nil : outer_node.nonSort.text.strip
         end
-        
+
         val = '' + ( sw_full_title ? sw_full_title : '')
         val.sub!(Regexp.new("^" + nonSort), '') if nonSort
         val.gsub!(/[[:punct:]]*/, '').strip
         val.squeeze(" ").strip
       end
-      
+
       #remove trailing commas
       # @deprecated in favor of sw_title_display
       def sw_full_title_without_commas
         result = self.sw_full_title
         result.sub!(/,$/, '') if result
         result
       end
-    
+
       # ---- end TITLE ----
 
       # ---- SUBJECT ----
-      
+
       # Values are the contents of:
       #   subject/geographic
       #   subject/hierarchicalGeographic
       #   subject/geographicCode  (only include the translated value if it isn't already present from other mods geo fields)
       # @param [String] sep - the separator string for joining hierarchicalGeographic sub elements
       # @return [Array<String>] values for geographic_search Solr field for this document or [] if none
       def sw_geographic_search(sep = ' ')
         result = term_values([:subject, :geographic]) || []
-        
+
         # hierarchicalGeographic has sub elements
-        @mods_ng_xml.subject.hierarchicalGeographic.each { |hg_node|  
+        @mods_ng_xml.subject.hierarchicalGeographic.each { |hg_node|
           hg_vals = []
-          hg_node.element_children.each { |e| 
+          hg_node.element_children.each { |e|
             hg_vals << e.text unless e.text.empty?
           }
           result << hg_vals.join(sep) unless hg_vals.empty?
         }
 
         trans_code_vals = @mods_ng_xml.subject.geographicCode.translated_value
         if trans_code_vals
-          trans_code_vals.each { |val|  
+          trans_code_vals.each { |val|
             result << val if !result.include?(val)
           }
         end
 
-        result    
+        result
       end
-      
+
       # Values are the contents of:
       #   subject/name/namePart
       #  "Values from namePart subelements should be concatenated in the order they appear (e.g. "Shakespeare, William, 1564-1616")"
       # @param [String] sep - the separator string for joining namePart sub elements
       # @return [Array<String>] values for names inside subject elements or [] if none
       def sw_subject_names(sep = ', ')
         result = []
-        @mods_ng_xml.subject.name_el.select { |n_el| n_el.namePart }.each { |name_el_w_np|  
+        @mods_ng_xml.subject.name_el.select { |n_el| n_el.namePart }.each { |name_el_w_np|
           parts = name_el_w_np.namePart.map { |npn| npn.text unless npn.text.empty? }.compact
           result << parts.join(sep).strip unless parts.empty?
         }
         result
       end
-      
+
       # Values are the contents of:
       #   subject/titleInfo/(subelements)
       # @param [String] sep - the separator string for joining titleInfo sub elements
       # @return [Array<String>] values for titles inside subject elements or [] if none
       def sw_subject_titles(sep = ' ')
@@ -258,11 +260,11 @@
           parts = ti_el.element_children.map { |el| el.text unless el.text.empty? }.compact
           result << parts.join(sep).strip unless parts.empty?
         }
         result
       end
-            
+
       # Values are the contents of:
       #   mods/genre
       #   mods/subject/topic
       # @return [Array<String>] values for the topic_search Solr field for this document or nil if none
       def topic_search
@@ -277,31 +279,31 @@
       #   subject/topic
       #   subject/name
       #   subject/title
       #   subject/occupation
       #  with trailing comma, semicolon, and backslash (and any preceding spaces) removed
-      # @return [Array<String>] values for the topic_facet Solr field for this document or nil if none 
+      # @return [Array<String>] values for the topic_facet Solr field for this document or nil if none
       def topic_facet
         vals = subject_topics ? Array.new(subject_topics) : []
         vals.concat(subject_names) if subject_names
         vals.concat(subject_titles) if subject_titles
         vals.concat(subject_occupations) if subject_occupations
-        vals.map! { |val| 
+        vals.map! { |val|
           v = val.sub(/[\\,;]$/, '')
           v.strip
         }
         vals.empty? ? nil : vals
       end
 
       # geographic_search values with trailing comma, semicolon, and backslash (and any preceding spaces) removed
-      # @return [Array<String>] values for the geographic_facet Solr field for this document or nil if none 
+      # @return [Array<String>] values for the geographic_facet Solr field for this document or nil if none
       def geographic_facet
         geographic_search.map { |val| val.sub(/[\\,;]$/, '').strip } unless !geographic_search
       end
 
       # subject/temporal values with trailing comma, semicolon, and backslash (and any preceding spaces) removed
-      # @return [Array<String>] values for the era_facet Solr field for this document or nil if none 
+      # @return [Array<String>] values for the era_facet Solr field for this document or nil if none
       def era_facet
         subject_temporal.map { |val| val.sub(/[\\,;]$/, '').strip } unless !subject_temporal
       end
 
       # Values are the contents of:
@@ -314,20 +316,20 @@
           result = self.sw_geographic_search
 
           # TODO:  this should go into stanford-mods ... but then we have to set that gem up with a Logger
           # print a message for any unrecognized encodings
           xvals = self.subject.geographicCode.translated_value
-          codes = self.term_values([:subject, :geographicCode]) 
+          codes = self.term_values([:subject, :geographicCode])
           if codes && codes.size > xvals.size
             self.subject.geographicCode.each { |n|
               if n.authority != 'marcgac' && n.authority != 'marccountry'
                 sw_logger.info("#{druid} has subject geographicCode element with untranslated encoding (#{n.authority}): #{n.to_xml}")
               end
             }
           end
 
-          # FIXME:  stanford-mods should be returning [], not nil ... 
+          # FIXME:  stanford-mods should be returning [], not nil ...
           return nil if !result || result.empty?
           result
         end
       end
 
@@ -354,18 +356,18 @@
           vals = subject_temporal ? Array.new(subject_temporal) : []
           gvals = self.term_values([:subject, :genre])
           vals.concat(gvals) if gvals
 
           # print a message for any temporal encodings
-          self.subject.temporal.each { |n| 
+          self.subject.temporal.each { |n|
             sw_logger.info("#{druid} has subject temporal element with untranslated encoding: #{n.to_xml}") if !n.encoding.empty?
           }
 
           vals.empty? ? nil : vals
         end
       end
-      
+
       # Values are the contents of:
       #  all subject subelements except subject/cartographic plus  genre top level element
       # @return [Array<String>] values for the subject_all_search Solr field for this document or nil if none
       def subject_all_search
         vals = topic_search ? Array.new(topic_search) : []
@@ -386,30 +388,30 @@
       # For the date display only, the first place to look is in the dates without encoding=marc array.
       # If no such dates, select the first date in the dates_marc_encoding array.  Otherwise return nil
       # @return [String] value for the pub_date_display Solr field for this document or nil if none
       def pub_date_display
           return dates_no_marc_encoding.first unless dates_no_marc_encoding.empty?
-          return dates_marc_encoding.first unless dates_marc_encoding.empty?
+          return dates_marc_encoding.first    unless dates_marc_encoding.empty?
           return nil
       end
 
       # For the date indexing, sorting and faceting, the first place to look is in the dates with encoding=marc array.
       # If that doesn't exist, look in the dates without encoding=marc array.  Otherwise return nil
       # @return [Array<String>] values for the date Solr field for this document or nil if none
       def pub_dates
-        return dates_marc_encoding unless dates_marc_encoding.empty?
+        return dates_marc_encoding    unless dates_marc_encoding.empty?
         return dates_no_marc_encoding unless dates_no_marc_encoding.empty?
         return nil
       end
-      
+
       def is_number?(object)
         true if Integer(object) rescue false
       end
       def is_date?(object)
         true if Date.parse(object) rescue false
       end
-  
+
       # Get the publish year from mods
       # @return [String] 4 character year or nil if no valid date was found
       def pub_year
         #use the cached year if there is one
         if @pub_year
@@ -421,11 +423,11 @@
         dates = pub_dates
         if dates
           year = []
           pruned_dates = []
           dates.each do |f_date|
-            #remove ? and [] 
+            #remove ? and []
             pruned_dates << f_date.gsub('?','').gsub('[','').gsub(']','')
           end
           #try to find a date starting with the most normal date formats and progressing to more wonky ones
           @pub_year = get_plain_four_digit_year pruned_dates
           return @pub_year if @pub_year
@@ -442,35 +444,31 @@
           return @pub_year if @pub_year
         end
         @pub_year=''
         return nil
       end
-      
+
       #creates a date suitable for sorting. Guarnteed to be 4 digits or nil
       def pub_date_sort
         pd=nil
         if pub_date
           pd=pub_date
           if pd.length == 3
             pd='0'+pd
           end
           pd=pd.gsub('--','00')
         end
-        raise "pub_date_sort was about to return a non 4 digit value #{pd}!" if pd and pd.length !=4 
+        raise "pub_date_sort was about to return a non 4 digit value #{pd}!" if pd and pd.length !=4
         pd
       end
-      
+
       #The year the object was published, , filtered based on max_pub_date and min_pub_date from the config file
       #@return [String] 4 character year or nil
       def pub_date
-        val=pub_year
-        if val
-          return val
-        end
-        nil
+        pub_year || nil
       end
-      
+
       #Values for the pub date facet. This is less strict than the 4 year date requirements for pub_date
       #@return <Array[String]> with values for the pub date facet
       def pub_date_facet
         if pub_date
           if pub_date.start_with?('-')
@@ -486,17 +484,17 @@
           end
         else
           nil
         end
       end
-      
+
       # ---- end PUBLICATION (place, year) ----
 
       def sw_logger
         @logger ||= Logger.new(STDOUT)
       end
-      
+
       # select one or more format values from the controlled vocabulary here:
       #   http://searchworks-solr-lb.stanford.edu:8983/solr/select?facet.field=format&rows=0&facet.sort=index
       # @return <Array[String]> value in the SearchWorks controlled vocabulary
       # @deprecated - kept for backwards compatibility but not part of SW UI redesign work Summer 2014
       def format
@@ -524,12 +522,12 @@
               when 'still image'
                 val << 'Image'
               when 'text'
                 val << 'Book' if issuance and issuance.include? 'monographic'
                 book_genres = ['book chapter', 'Book chapter', 'Book Chapter',
-                  'issue brief', 'Issue brief', 'Issue Brief', 
-                  'librettos', 'Librettos', 
+                  'issue brief', 'Issue brief', 'Issue Brief',
+                  'librettos', 'Librettos',
                   'project report', 'Project report', 'Project Report',
                   'technical report', 'Technical report', 'Technical Report',
                   'working paper', 'Working paper', 'Working Paper']
                 val << 'Book' if genres and !(genres & book_genres).empty?
                 conf_pub = ['conference publication', 'Conference publication', 'Conference Publication']
@@ -553,10 +551,23 @@
       #   http://searchworks-solr-lb.stanford.edu:8983/solr/select?facet.field=format_main_ssim&rows=0&facet.sort=index
       # @return <Array[String]> value in the SearchWorks controlled vocabulary
       def format_main
         val = []
         types = self.term_values(:typeOfResource)
+        article_genres = ['article', 'Article',
+          'book chapter', 'Book chapter', 'Book Chapter',
+          'issue brief', 'Issue brief', 'Issue Brief',
+          'project report', 'Project report', 'Project Report',
+          'student project report', 'Student project report', 'Student Project report', 'Student Project Report',
+          'technical report', 'Technical report', 'Technical Report',
+          'working paper', 'Working paper', 'Working Paper'
+        ]
+        book_genres = ['conference publication', 'Conference publication', 'Conference Publication',
+          'instruction', 'Instruction',
+          'librettos', 'Librettos',
+          'thesis', 'Thesis'
+        ]
         if types
           genres = self.term_values(:genre)
           issuance = self.term_values([:origin_info,:issuance])
           types.each do |type|
             case type
@@ -579,26 +590,13 @@
               when 'sound recording-nonmusical', 'sound recording'
                 val << 'Sound recording'
               when 'still image'
                 val << 'Image'
               when 'text'
-                article_genres = ['article', 'Article',
-                  'book chapter', 'Book chapter', 'Book Chapter',
-                  'issue brief', 'Issue brief', 'Issue Brief',
-                  'project report', 'Project report', 'Project Report',
-                  'student project report', 'Student project report', 'Student Project report', 'Student Project Report',
-                  'technical report', 'Technical report', 'Technical Report',
-                  'working paper', 'Working paper', 'Working Paper'
-                  ]
-                val << 'Book' if genres and !(genres & article_genres).empty?
+                val << 'Book' if genres   and !(genres & article_genres).empty?
                 val << 'Book' if issuance and issuance.include? 'monographic'
-                book_genres = ['conference publication', 'Conference publication', 'Conference Publication',
-                  'instruction', 'Instruction',
-                  'librettos', 'Librettos',
-                  'thesis', 'Thesis'
-                  ]
-                val << 'Book' if genres and !(genres & book_genres).empty?
+                val << 'Book' if genres   and !(genres & book_genres).empty?
                 val << 'Journal/Periodical' if issuance and issuance.include? 'continuing'
               when 'three dimensional object'
                 val << 'Object'
             end
           end
@@ -669,35 +667,35 @@
 
       # convenience method for subject/topic values (to avoid parsing the mods for the same thing multiple times)
       def subject_topics
         @subject_topics ||= self.term_values([:subject, :topic])
       end
-  
+
       #get a 4 digit year like 1865 from the date array
       def get_plain_four_digit_year dates
         dates.each do |f_date|
           matches=f_date.scan(/\d{4}/)
           if matches.length == 1
-            @pub_year=matches.first 
+            @pub_year=matches.first
             return matches.first
           else
             #if there are multiples, check for ones with CE after them
             matches.each do |match|
               #look for things like '1865-6 CE'
               pos = f_date.index(Regexp.new(match+'...CE'))
               pos = pos ? pos.to_i : 0
               if f_date.include?(match+' CE') or pos > 0
                 @pub_year=match
-                return match  
-              end 
+                return match
+              end
             end
             return matches.first
           end
         end
         return nil
       end
-      
+
       # If a year has a "u" in it, replace instances of u with 0
       # @param [String] dates
       # @return String
       def get_u_year dates
         dates.each do |f_date|
@@ -712,11 +710,11 @@
             return matches.first.gsub('u','-')
           end
         end
         return nil
       end
-  
+
       #get a double digit century like '12th century' from the date array
       def get_double_digit_century dates
         dates.each do |f_date|
           matches=f_date.scan(/\d{2}th/)
           if matches.length == 1
@@ -730,17 +728,17 @@
               pos = pos ? pos.to_i : f_date.index(Regexp.new(match+' century CE'))
               pos = pos ? pos.to_i : 0
               if f_date.include?(match+' CE') or pos > 0
                 @pub_year=((match[0,2].to_i) - 1).to_s+'--'
                 return @pub_year
-              end 
+              end
             end
           end
         end
         return nil
       end
-  
+
       #get a 3 digit year like 965 from the date array
       def get_three_digit_year dates
         dates.each do |f_date|
           matches=f_date.scan(/\d{3}/)
           if matches.length > 0
@@ -751,18 +749,18 @@
       end
       #get the 3 digit BC year, return it as a negative, so -700 for 300 BC. Other methods will translate it to proper display, this is good for sorting.
       def get_bc_year dates
         dates.each do |f_date|
           matches=f_date.scan(/\d{3} B.C./)
-          if matches.length > 0   
+          if matches.length > 0
             bc_year=matches.first[0..2]
             return (bc_year.to_i-1000).to_s
           end
         end
         return nil
       end
-  
+
       #get a single digit century like '9th century' from the date array
       def get_single_digit_century dates
         dates.each do |f_date|
           matches=f_date.scan(/\d{1}th/)
           if matches.length == 1
@@ -776,14 +774,14 @@
               pos = pos ? pos.to_i : f_date.index(Regexp.new(match+' century CE'))
               pos = pos ? pos.to_i : 0
               if f_date.include?(match+' CE') or pos > 0
                 @pub_year=((match[0,1].to_i) - 1).to_s+'--'
                 return @pub_year
-              end 
+              end
             end
           end
-        end 
+        end
         return nil
       end
 
       # @return [Array<String>] dates from dateIssued and dateCreated tags from origin_info with encoding="marc"
       def dates_marc_encoding
@@ -799,10 +797,10 @@
           parse_dates_from_originInfo
           @dates_no_marc_encoding
         end
       end
 
-      # Populate @dates_marc_encoding and @dates_no_marc_encoding from dateIssued and dateCreated tags from origin_info 
+      # Populate @dates_marc_encoding and @dates_no_marc_encoding from dateIssued and dateCreated tags from origin_info
       # with and without encoding=marc
       def parse_dates_from_originInfo
         @dates_marc_encoding = []
         @dates_no_marc_encoding = []
         self.origin_info.dateIssued.each { |di|