imprint.rb in stanford-mods-3.0.0.alpha1

- old
+ new

@@ -1,6 +1,6 @@
-require 'active_support/core_ext/integer/inflections'
+require 'mods/marc_country_codes'
 
 module Stanford
   module Mods
     ##
     # Get the imprint information from originInfo elements (and sub elements) to create display strings
@@ -8,81 +8,37 @@
     # This code is adapted from the mods_display gem.  In a perfect world, this
     # code would make use of the date_parsing class instead of reimplementing pieces of it;
     # however, the date_parsing class only does years, and this does finer tuned dates and also
     # reformats them according to the encoding.
     class Imprint
-      # @param [Nokogiri::XML::NodeSet] originInfo_ng_nodeset of originInfo nodes
-      def initialize(originInfo_ng_nodeset)
-        @originInfo_ng_nodeset = originInfo_ng_nodeset
+      attr_reader :element
+
+      # @param [Nokogiri::XML::Node] element an originInfo node
+      def initialize(element)
+        @element = element
       end
 
-      require 'marc_countries'
-
-      # @return Array<String> each String is an imprint statement from a single originInfo element
       def imprint_statements
-        results = []
-        @originInfo_ng_nodeset.each do |origin_info_node|
-          edition = edition_vals_str(origin_info_node)
-          place = place_vals_str(origin_info_node)
-          publisher = publisher_vals_str(origin_info_node)
-          dates = date_str(origin_info_node)
-
-          place_pub = compact_and_join_with_delimiter([place, publisher], ' : ')
-          edition_place_pub = compact_and_join_with_delimiter([edition, place_pub], ' - ')
-          ed_place_pub_dates = compact_and_join_with_delimiter([edition_place_pub, dates], ', ')
-
-          results << ed_place_pub_dates unless ed_place_pub_dates.empty?
-        end
-        results
+        display_str
       end
 
+      # @return [String] an imprint statement from a single originInfo element
       def display_str
-        imprint_statements.join('; ') if imprint_statements.present?
-      end
+        edition = edition_vals_str
+        place = place_vals_str
+        publisher = publisher_vals_str
+        dates = date_str
 
-      # @return Array<Integer> an array of publication years for the resource
-      def publication_date_for_slider
-        @originInfo_ng_nodeset.map do |origin_info_node|
-          date_elements = if origin_info_node.as_object.first.key_dates.any?
-                            origin_info_node.as_object.first.key_dates.map(&:as_object).map(&:first)
-                          else
-                            date_field_keys.map do |date_field|
-                              next unless origin_info_node.respond_to?(date_field)
+        place_pub = compact_and_join_with_delimiter([place, publisher], ' : ')
+        edition_place_pub = compact_and_join_with_delimiter([edition, place_pub], ' - ')
+        ed_place_pub_dates = compact_and_join_with_delimiter([edition_place_pub, dates], ', ')
 
-                              date_elements = origin_info_node.send(date_field)
-                              date_elements.map(&:as_object).map(&:first) if date_elements.any?
-                            end.compact.first
-                          end
-
-          if date_elements.nil? || date_elements.none?
-            []
-          elsif date_elements.find(&:start?) &&
-                date_elements.find(&:start?).as_range &&
-                date_elements.find(&:end?) &&
-                date_elements.find(&:end?).as_range
-            start_date = date_elements.find(&:start?)
-            end_date = date_elements.find(&:end?)
-
-            (start_date.as_range.min.year..end_date.as_range.max.year).to_a
-          elsif date_elements.find(&:start?) && date_elements.find(&:start?).as_range
-            start_date = date_elements.find(&:start?)
-
-            (start_date.as_range.min.year..Time.now.year).to_a
-          elsif date_elements.one?
-            date_elements.first.to_a.map(&:year)
-          else
-            date_elements.map { |v| v.to_a.map(&:year) }.flatten
-          end
-        end.flatten
+        ed_place_pub_dates
       end
 
       private
 
-      def extract_year(el)
-        DateParsing.year_int_from_date_str(el.text)
-      end
-
       def compact_and_join_with_delimiter(values, delimiter)
         compact_values = values.compact.reject { |v| v.strip.empty? }
         return compact_values.join(delimiter) if compact_values.length == 1 ||
                                                  !ends_in_terminating_punctuation?(delimiter)
 
@@ -98,31 +54,31 @@
 
       def ends_in_terminating_punctuation?(value)
         value.strip.end_with?('.', ',', ':', ';')
       end
 
-      def edition_vals_str(origin_info_node)
-        origin_info_node.edition.reject do |e|
+      def edition_vals_str
+        element.edition.reject do |e|
           e.text.strip.empty?
         end.map(&:text).join(' ').strip
       end
 
-      def publisher_vals_str(origin_info_node)
-        return if origin_info_node.publisher.text.strip.empty?
+      def publisher_vals_str
+        return if element.publisher.text.strip.empty?
 
-        publishers = origin_info_node.publisher.reject do |p|
+        publishers = element.publisher.reject do |p|
           p.text.strip.empty?
         end.map(&:text)
         compact_and_join_with_delimiter(publishers, ' : ')
       end
 
       # PLACE processing methods ------
 
-      def place_vals_str(origin_info_node)
-        return if origin_info_node.place.text.strip.empty?
+      def place_vals_str
+        return if element.place.text.strip.empty?
 
-        places = place_terms(origin_info_node).reject do |p|
+        places = place_terms.reject do |p|
           p.text.strip.empty?
         end.map(&:text)
         compact_and_join_with_delimiter(places, ' : ')
       end
 
@@ -131,336 +87,203 @@
           !term.attributes['type'].respond_to?(:value) ||
             term.attributes['type'].value == 'text'
         end
       end
 
-      def place_terms(origin_info_element)
-        return [] unless origin_info_element.respond_to?(:place) &&
-                         origin_info_element.place.respond_to?(:placeTerm)
+      def place_terms
+        return [] unless element.respond_to?(:place) &&
+                         element.place.respond_to?(:placeTerm)
 
-        if unencoded_place_terms?(origin_info_element)
-          origin_info_element.place.placeTerm.select do |term|
+        if unencoded_place_terms?(element)
+          element.place.placeTerm.select do |term|
             !term.attributes['type'].respond_to?(:value) ||
               term.attributes['type'].value == 'text'
           end.compact
         else
-          origin_info_element.place.placeTerm.map do |term|
+          element.place.placeTerm.map do |term|
             next unless term.attributes['type'].respond_to?(:value) &&
                         term.attributes['type'].value == 'code' &&
                         term.attributes['authority'].respond_to?(:value) &&
                         term.attributes['authority'].value == 'marccountry' &&
-                        MARC_COUNTRIES.include?(term.text.strip)
+                        !['xx', 'vp'].include?(term.text.strip) &&
+                        MARC_COUNTRY.include?(term.text.strip)
 
             term = term.clone
-            term.content = MARC_COUNTRIES[term.text.strip]
+            term.content = MARC_COUNTRY[term.text.strip]
             term
           end.compact
         end
       end
 
       # DATE processing methods ------
 
-      def date_str(origin_info_node)
-        date_vals = origin_info_date_vals(origin_info_node)
+      def date_str
+        date_vals = origin_info_date_vals
         return if date_vals.empty?
-
         date_vals.map(&:strip).join(' ')
       end
 
-      def origin_info_date_vals(origin_info_node)
+      def origin_info_date_vals
         date_field_keys.map do |date_field|
-          next unless origin_info_node.respond_to?(date_field)
+          next unless element.respond_to?(date_field)
 
-          date_elements = origin_info_node.send(date_field)
-          date_elements_display_vals(date_elements) if date_elements.present?
+          date_elements = element.send(date_field)
+          parse_dates(date_elements) if date_elements.present?
         end.compact.flatten
       end
 
-      def date_elements_display_vals(ng_date_elements)
-        apply_date_qualifier_decoration(
-          dedup_dates(
-            join_date_ranges(
-              process_decade_century_dates(
-                process_bc_ad_dates(
-                  process_encoded_dates(ignore_bad_dates(ng_date_elements))
-                )
-              )
-            )
-          )
-        )
-      end
-
       def date_field_keys
         [:dateIssued, :dateCreated, :dateCaptured, :copyrightDate]
       end
 
-      def ignore_bad_dates(ng_date_elements)
-        ng_date_elements.select do |ng_date_element|
-          val = ng_date_element.text.strip
-          val != '9999' && val != '0000-00-00' && val != 'uuuu'
-        end
-      end
+      class DateValue
+        attr_reader :value
+        delegate :text, :date, :point, :qualifier, :encoding, to: :value
 
-      def process_encoded_dates(ng_date_elements)
-        ng_date_elements.map do |ng_date_element|
-          if date_is_w3cdtf?(ng_date_element)
-            process_w3cdtf_date(ng_date_element)
-          elsif date_is_iso8601?(ng_date_element)
-            process_iso8601_date(ng_date_element)
-          else
-            ng_date_element
-          end
+        def initialize(value)
+          @value = value
         end
-      end
 
-      # note that there is no year 0:  from https://en.wikipedia.org/wiki/Anno_Domini
-      # "AD counting years from the start of this epoch, and BC denoting years before the start of the era.
-      # There is no year zero in this scheme, so the year AD 1 immediately follows the year 1 BC."
-      # See also https://consul.stanford.edu/display/chimera/MODS+display+rules for etdf
-      def process_bc_ad_dates(ng_date_elements)
-        ng_date_elements.map do |ng_date_element|
-          case
-          when date_is_edtf?(ng_date_element) && ng_date_element.text.strip == '0'
-            ng_date_element.content = "1 B.C."
-          when date_is_bc_edtf?(ng_date_element)
-            year = ng_date_element.text.strip.gsub(/^-0*/, '').to_i + 1
-            ng_date_element.content = "#{year} B.C."
-          when date_is_ad?(ng_date_element)
-            ng_date_element.content = "#{ng_date_element.text.strip.gsub(/^0*/, '')} A.D."
-          end
-          ng_date_element
+        # True if the element text isn't blank and isn't a placeholder ("9999", "0000-00-00" or "uuuu").
+        def valid?
+          text.present? && !['9999', '0000-00-00', 'uuuu'].include?(text.strip)
         end
-      end
 
-      def process_decade_century_dates(ng_date_elements)
-        ng_date_elements.map do |ng_date_element|
-          if date_is_decade?(ng_date_element)
-            process_decade_date(ng_date_element)
-          elsif date_is_century?(ng_date_element)
-            process_century_date(ng_date_element)
-          else
-            ng_date_element
+        # Element text reduced to digits and hyphen. Captures date ranges and
+        # negative (B.C.) dates. Used for comparison/deduping.
+        def base_value
+          if text =~ /^\[?1\d{3}-\d{2}\??\]?$/
+            return text.sub(/(\d{2})(\d{2})-(\d{2})/, '\1\2-\1\3')
           end
+
+          text.gsub(/(?<![\d])(\d{1,3})([xu-]{1,3})/i) { "#{$1}#{'0' * $2.length}"}.scan(/[\d-]/).join
         end
-      end
 
-      def join_date_ranges(ng_date_elements)
-        if dates_are_range?(ng_date_elements)
-          start_date = ng_date_elements.find { |d| d.attributes['point'] && d.attributes['point'].value == 'start' }
-          end_date = ng_date_elements.find { |d| d.attributes['point'] && d.attributes['point'].value == 'end' }
-          ng_date_elements.map do |date|
-            date = date.clone # clone the date object so we don't append the same one
-            if normalize_date(date.text) == normalize_date(start_date.text)
-              date.content = [start_date.text, end_date.text].join(' - ')
-              date
-            elsif normalize_date(date.text) != normalize_date(end_date.text)
-              date
-            end
-          end.compact
-        elsif dates_are_open_range?(ng_date_elements)
-          start_date = ng_date_elements.find { |d| d.attributes['point'] && d.attributes['point'].value == 'start' }
-          ng_date_elements.map do |date|
-            date = date.clone # clone the date object so we don't append the same one
-            date.content = "#{start_date.text}-" if date.text == start_date.text
-            date
+        # Decoded version of the date, if it was encoded. Strips leading zeroes.
+        def decoded_value
+          return text.strip unless date
+
+          unless encoding.present?
+            return text.strip unless text =~ /^-?\d+$/ || text =~ /^[\dXxu?-]{4}$/
           end
-        else
-          ng_date_elements
-        end
-      end
 
-      def dedup_dates(ng_date_elements)
-        date_text = ng_date_elements.map { |d| normalize_date(d.text) }
-        if date_text != date_text.uniq
-          if ng_date_elements.find { |d| d.attributes['qualifier'].respond_to?(:value) }
-            [ng_date_elements.find { |d| d.attributes['qualifier'].respond_to?(:value) }]
-          elsif ng_date_elements.find { |d| !d.attributes['encoding'] }
-            [ng_date_elements.find { |d| !d.attributes['encoding'] }]
+          # Format the date according to its precision; fall back to the stripped text
+          case value.precision
+          when :day
+            date.strftime('%B %e, %Y')
+          when :month
+            date.strftime('%B %Y')
+          when :year
+            year = date.year
+            if year < 1
+              "#{year.abs + 1} B.C."
+            # Years below 1000 are explicitly marked A.D. NOTE(review): `year > 1` leaves year 1 unmarked; confirm intent
+            elsif year > 1 && year < 1000
+              "#{year} A.D."
+            else
+              year.to_s
+            end
+          when :century
+            return "#{(date.to_s[0..1].to_i + 1).ordinalize} century"
+          when :decade
+            return "#{date.year}s"
           else
-            [ng_date_elements.first]
+            text.strip
           end
-        else
-          ng_date_elements
         end
-      end
 
-      def apply_date_qualifier_decoration(ng_date_elements)
-        return_fields = ng_date_elements.map do |date|
-          date = date.clone
-          if date_is_approximate?(date)
-            date.content = "[ca. #{date.text}]"
-          elsif date_is_questionable?(date)
-            date.content = "[#{date.text}?]"
-          elsif date_is_inferred?(date)
-            date.content = "[#{date.text}]"
-          end
+        # Decoded date with "B.C." or "A.D." and qualifier markers. See (outdated):
+        # https://consul.stanford.edu/display/chimera/MODS+display+rules#MODSdisplayrules-3b.%3CoriginInfo%3E
+        def qualified_value
+          date = decoded_value
+
+          return "[ca. #{date}]" if qualifier == 'approximate'
+          return "[#{date}?]" if qualifier == 'questionable'
+          return "[#{date}]" if qualifier == 'inferred'
+
           date
         end
-        return_fields.map(&:text)
       end
 
-      def date_is_approximate?(ng_date_element)
-        ng_date_element.attributes['qualifier'] &&
-          ng_date_element.attributes['qualifier'].respond_to?(:value) &&
-          ng_date_element.attributes['qualifier'].value == 'approximate'
-      end
-
-      def date_is_questionable?(ng_date_element)
-        ng_date_element.attributes['qualifier'] &&
-          ng_date_element.attributes['qualifier'].respond_to?(:value) &&
-          ng_date_element.attributes['qualifier'].value == 'questionable'
-      end
-
-      def date_is_inferred?(ng_date_element)
-        ng_date_element.attributes['qualifier'] &&
-          ng_date_element.attributes['qualifier'].respond_to?(:value) &&
-          ng_date_element.attributes['qualifier'].value == 'inferred'
-      end
-
-      def dates_are_open_range?(ng_date_elements)
-        ng_date_elements.any? do |element|
-          element.attributes['point'] &&
-            element.attributes['point'].respond_to?(:value) &&
-            element.attributes['point'].value == 'start'
-        end && !ng_date_elements.any? do |element|
-          element.attributes['point'] &&
-            element.attributes['point'].respond_to?(:value) &&
-            element.attributes['point'].value == 'end'
+      class DateRange
+        def initialize(start: nil, stop: nil)
+          @start = start
+          @stop = stop
         end
-      end
 
-      def dates_are_range?(ng_date_elements)
-        attributes = ng_date_elements.map do |date|
-          if date.attributes['point'].respond_to?(:value)
-            date.attributes['point'].value
-          end
+        # Base value as hyphen-joined string. Used for comparison/deduping.
+        def base_value
+          "#{@start&.base_value}-#{@stop&.base_value}"
         end
-        attributes.include?('start') &&
-          attributes.include?('end')
-      end
 
-      def process_w3cdtf_date(ng_date_element)
-        ng_date_element = ng_date_element.clone
-        ng_date_element.content = begin
-          if ng_date_element.text.strip =~ /^\d{4}-\d{2}-\d{2}$/
-            Date.parse(ng_date_element.text).strftime(full_date_format)
-          elsif ng_date_element.text.strip =~ /^\d{4}-\d{2}$/
-            Date.parse("#{ng_date_element.text}-01").strftime(short_date_format)
-          else
-            ng_date_element.content
-          end
-                                  rescue
-                                    ng_date_element.content
+        # Base values as array. Used for comparison/deduping of individual dates.
+        def base_values
+          [@start&.base_value, @stop&.base_value].compact
         end
-        ng_date_element
-      end
 
-      def process_iso8601_date(ng_date_element)
-        ng_date_element = ng_date_element.clone
-        ng_date_element.content = begin
-          if ng_date_element.text.strip =~ /^\d{8,}$/
-            Date.parse(ng_date_element.text).strftime(full_date_format)
-          else
-            ng_date_element.content
-          end
-                                  rescue
-                                    ng_date_element.content
+        # The encoding value for the start of the range, or stop if not present.
+        def encoding
+          @start&.encoding || @stop&.encoding
         end
-        ng_date_element
-      end
 
-      DECADE_4CHAR_REGEXP = Regexp.new('(^|.*\D)(\d{3}[u\-?x])(.*)')
+        # Decoded dates with "B.C." or "A.D." and qualifier markers applied to
+        # the entire range, or individually if dates differ.
+        def qualified_value
+          if @start&.qualifier == @stop&.qualifier
+            qualifier = @start&.qualifier || @stop&.qualifier
+            date = "#{@start&.decoded_value} - #{@stop&.decoded_value}"
+            return "[ca. #{date}]" if qualifier == 'approximate'
+            return "[#{date}?]" if qualifier == 'questionable'
+            return "[#{date}]" if qualifier == 'inferred'
 
-      # strings like 195x, 195u, 195- and 195?  become '1950s' in the ng_date_element content
-      def process_decade_date(ng_date_element)
-        my_ng_date_element = ng_date_element.clone
-        my_ng_date_element.content = begin
-          orig_date_str = ng_date_element.text.strip
-          # note:  not calling DateParsing.display_str_for_decade directly because non-year text is lost
-          decade_matches = orig_date_str.match(DECADE_4CHAR_REGEXP) if orig_date_str
-          if decade_matches
-            decade_str = decade_matches[2]
-            changed_to_zero = decade_str.to_s.tr('u\-?x', '0') if decade_str
-            zeroth_year = DateParsing.new(changed_to_zero).sortable_year_for_yyyy if changed_to_zero
-            new_decade_str = "#{zeroth_year}s" if zeroth_year
-            my_ng_date_element.content = "#{decade_matches[1]}#{new_decade_str}#{decade_matches[3]}"
+            date
           else
-            my_ng_date_element.content
+            "#{@start&.qualified_value} - #{@stop&.qualified_value}"
           end
-                                     rescue
-                                       my_ng_date_element.content
         end
-        my_ng_date_element
       end
 
-      CENTURY_4CHAR_REGEXP = Regexp.new('(^|.*\D)((\d{1,2})[u\-]{2})(.*)')
+      def parse_dates(elements)
+        # convert to DateValue objects and keep only valid ones
+        dates = elements.map(&:as_object).flatten.map { |element| DateValue.new(element) }.select(&:valid?)
+        # join any date ranges into DateRange objects
+        point, nonpoint = dates.partition(&:point)
+        if point.any?
+          range = DateRange.new(start: point.find { |date| date.point == 'start' },
+                                stop: point.find { |date| date.point == 'end' })
+          nonpoint.unshift(range)
+        end
+        dates = nonpoint
 
-      # strings like 18uu, 18-- become '19th century' in the ng_date_element content
-      def process_century_date(ng_date_element)
-        my_ng_date_element = ng_date_element.clone
-        my_ng_date_element.content = begin
-          orig_date_str = ng_date_element.text.strip
-          # note:  not calling DateParsing.display_str_for_century directly because non-year text is lost
-          century_matches = orig_date_str.match(CENTURY_4CHAR_REGEXP) if orig_date_str
-          if century_matches
-            new_century_str = "#{(century_matches[3].to_i + 1).ordinalize} century"
-            my_ng_date_element.content = "#{century_matches[1]}#{new_century_str}#{century_matches[4]}"
+        # ensure dates are unique with respect to their base values
+        dates = dates.group_by(&:base_value).map do |_value, group|
+          next group.first if group.one?
+
+          # if one of the duplicates wasn't encoded, use that one. see:
+          # https://consul.stanford.edu/display/chimera/MODS+display+rules#MODSdisplayrules-3b.%3CoriginInfo%3E
+          if group.reject(&:encoding).any?
+            group.reject(&:encoding).first
+
+          # otherwise arbitrarily pick the last one in the group
           else
-            my_ng_date_element.content
+            group.last
           end
-                                     rescue
-                                       my_ng_date_element.content
         end
-        my_ng_date_element
-      end
 
-      def field_is_encoded?(ng_element, encoding)
-        ng_element.attributes['encoding'] &&
-          ng_element.attributes['encoding'].respond_to?(:value) &&
-          ng_element.attributes['encoding'].value.downcase == encoding
-      end
+        # compare the remaining dates against one part or the other of a range
+        date_ranges = dates.select { |date| date.is_a?(DateRange) }
 
-      def date_is_bc_edtf?(ng_date_element)
-        ng_date_element.text.strip.start_with?('-') && date_is_edtf?(ng_date_element)
-      end
+        # remove any range whose start or stop base value matches an unencoded standalone date
+        duplicated_ranges = dates.flat_map do |date|
+          next if date.is_a?(DateRange) || date.encoding.present?
 
-      def date_is_ad?(ng_date_element)
-        str = ng_date_element.text.strip.gsub(/^0*/, '')
-        str.present? && str.length < 4 && !str.match('A.D.')
-      end
+          date_ranges.select { |r| r.base_values.include?(date.base_value) }
+        end
 
-      def date_is_edtf?(ng_date_element)
-        field_is_encoded?(ng_date_element, 'edtf')
-      end
+        dates = dates - duplicated_ranges
 
-      def date_is_w3cdtf?(ng_date_element)
-        field_is_encoded?(ng_date_element, 'w3cdtf')
-      end
-
-      def date_is_iso8601?(ng_date_element)
-        field_is_encoded?(ng_date_element, 'iso8601')
-      end
-
-      # @return true if decade string needs tweaking for display
-      def date_is_decade?(ng_date_element)
-        ng_date_element.text.strip.match(DECADE_4CHAR_REGEXP)
-      end
-
-      # @return true if century string needs tweaking for display
-      def date_is_century?(ng_date_element)
-        ng_date_element.text.strip.match(CENTURY_4CHAR_REGEXP)
-      end
-
-      def full_date_format(full_date_format = '%B %-d, %Y')
-        @full_date_format ||= full_date_format
-      end
-
-      def short_date_format(short_date_format = '%B %Y')
-        @short_date_format ||= short_date_format
-      end
-
-      def normalize_date(date_str)
-        date_str.strip.gsub(/^\[*ca\.\s*|c|\[|\]|\?/, '')
+        # output formatted dates with qualifiers, A.D./B.C., etc.
+        dates.map(&:qualified_value)
       end
     end
   end
 end