# Encoding: UTF-8 require 'traject/marc_extractor' module Traject::Macros # extracting various semantic parts out of a Marc21 record. Few of these # come directly from Marc21 spec or other specs with no judgement, they # are all to some extent opinionated, based on actual practice and actual # data, some more than others. If it doens't do what you want, don't use it. # But if it does, you can use it, and continue to get updates with future # versions of Traject. module Marc21Semantics # shortcut MarcExtractor = Traject::MarcExtractor # Extract OCLC numbers from, by default 035a's by known prefixes, then stripped # just the num, and de-dup. def oclcnum(extract_fields = "035a") extractor = MarcExtractor.new(extract_fields, :separator => nil) lambda do |record, accumulator| list = extractor.extract(record).collect! do |o| Marc21Semantics.oclcnum_extract(o) end.compact accumulator.concat list.uniq if list end end # If a num begins with a known OCLC prefix, return it without the prefix. # otherwise nil. # # Allow (OCoLC) and/or ocn/ocm/on OCLCPAT = / \A\s* (?:(?:\(OCoLC\)) | (?:\(OCoLC\))?(?:(?:ocm)|(?:ocn)|(?:on)) )(\d+) /x def self.oclcnum_extract(num) if OCLCPAT.match(num) return $1 else return nil end end # A sortable author value, created by concatenating: # * the main entry author, if there is one (fields 100, 110 or 111) # * the main entry uniform title (240), if there is one - not including non-filing chars as noted in 2nd indicator of the 240 # * If no 240, the 245 title, not including non-filing chars as noted in ind 2 of the 245 # # Always returns a SINGLE string, based on concatenation. # # Thanks SolrMarc for basic logic. # # Note: You'll want to pay attention to the Solr schema field definition # you're using, and have it do case-insensitivity or any other normalization # you might want. # # these probably should be taking only certain subfields, but we're copying # from SolrMarc that didn't do so either and nobody noticed, so not bothering for now. def marc_sortable_author lambda do |record, accumulator| accumulator << Marc21Semantics.get_sortable_author(record) end end def self.get_sortable_author(record) onexx = MarcExtractor.cached("100:110:111", :first => true, :trim_punctuation => true).extract(record).first onexx = onexx.strip if onexx titles = [] MarcExtractor.cached("240:245", :first => true).each_matching_line(record) do |field, spec| non_filing = field.indicator2.to_i str = field.subfields.collect {|sf| Marc21.trim_punctuation(sf.value.strip).strip}.join(" ") str = str.slice(non_filing, str.length) titles << str end.first title = titles.first title = title.strip if title return [onexx, title].compact.join(" ") end # 245 a and b, with non-filing characters stripped off def marc_sortable_title lambda do |record, accumulator| st = Marc21Semantics.get_sortable_title(record) accumulator << st if st end end def self.get_sortable_title(record) MarcExtractor.cached("245ab").collect_matching_lines(record) do |field, spec, extractor| str = extractor.collect_subfields(field, spec).first if str.nil? # maybe an APPM archival record with only a 'k' str = field['k'] end if str.nil? # still? All we can do is bail, I guess return nil end non_filing = field.indicator2.to_i str = str.slice(non_filing, str.length) str = Marc21.trim_punctuation(str) str end.first end # A generic way to strip a filing version (i.e., a string with the non-filing # characters stripped off) # # Always returns an array. If :include_original=>true is passed in, # that array will include the original string with the non-filing # characters still in it. def extract_marc_filing_version(spec='245abdefghknp', opts={}) include_original = opts.delete(:include_original) if opts.size > 0 raise RuntimeError.new("extract_marc_filing_version can take only :include_original as an argument, not #{opts.keys.map{|x| "'#{x}'"}.join(' or ')}") end extractor = Traject::MarcExtractor.cached(spec, opts) lambda do |record, accumulator, context| extractor.collect_matching_lines(record) do |field, spec| str = extractor.collect_subfields(field, spec).first next unless str and !str.empty? vals = [Marc21Semantics.filing_version(field, str, spec)] if include_original vals.unshift str vals.uniq! end accumulator.concat vals end end end # Take in a field, a string extracted from that field, and a spec and # return the filing version (i.e., the string without the # non-filing characters) def self.filing_version(field, str, spec) # Control fields don't have non-filing characters return str if field.kind_of? MARC::ControlField # 2nd indicator must be > 0 ind2 = field.indicator2.to_i return str unless ind2 > 0 # The spechash must either (a) have no subfields specified, or # (b) include the first subfield in the record subs = spec.subfields return str unless subs && subs.include?(field.subfields[0].code) # OK. If we got this far we actually need to strip characters off the string return str[ind2..-1] end # maps languages, by default out of 008[35-37] and 041a and 041d # # Can specify other spec if you want, say, 041b (lang of abstract) # or 041e (lang of librettos), or 041h (lang of original) instead or in addition. # # de-dups values so you don't get the same one twice. # # Exact spec of #marc_languages may change with new user data on what # works best. def marc_languages(spec = "008[35-37]:041a:041d") translation_map = Traject::TranslationMap.new("marc_languages") extractor = MarcExtractor.new(spec, :separator => nil) lambda do |record, accumulator| codes = extractor.collect_matching_lines(record) do |field, spec, extractor| if extractor.control_field?(field) (spec.bytes ? field.value.byteslice(spec.bytes) : field.value) else extractor.collect_subfields(field, spec).collect do |value| # sometimes multiple language codes are jammed together in one subfield, and # we need to separate ourselves. sigh. unless value.length == 3 value = value.scan(/.{1,3}/) # split into an array of 3-length substrs end value end.flatten end end codes = codes.uniq translation_map.translate_array!(codes) accumulator.concat codes end end # Adds in marc fields in spec (default is recommended series spec, but you can specify your own) # -- only trick is that 490's are skipped of first indicator is 1 -- if 490 first # indicator is "1", "series traced", that means the series title mentioned here is # already covered by another field we're including, so we don't want to double count it, possibly # with slight variation. def marc_series_facet(spec = "440a:490a:800abcdt:810abcdt:811acdeft:830adfgklmnoprst") extractor = MarcExtractor.new(spec) lambda do |record, accumulator| values = extractor.collect_matching_lines(record) do |field, spec, extractor| extractor.collect_subfields(field, spec) unless (field.tag == "490" && field.indicator1 == "1") end.compact # trim punctuation values.collect! do |s| Marc21.trim_punctuation(s) end accumulator.concat( values ) end end # Takes marc 048ab instrument code, and translates it to human-displayable # string. Takes first two chars of 048a or b, to translate (ignores numeric code) # # Pass in custom spec if you want just a or b, to separate soloists or whatever. def marc_instrumentation_humanized(spec = "048ab", options = {}) translation_map = Traject::TranslationMap.new(options[:translation_map] || "marc_instruments") extractor = MarcExtractor.new(spec, :separator => nil) lambda do |record, accumulator| values = extractor.extract(record) human = values.collect do |value| translation_map[ value.slice(0, 2) ] end.uniq accumulator.concat human if human && human.length > 0 end end # This weird one actually returns marc instrumentation codes, not # humanized. But it normalizes them by breaking them down into a numeric and non-numeric # version. For instance "ba01" will be indexed as both "ba01" and "ba". # ALSO, if the code is in a subfield b (soloist), it'll be indexed # _additionally_ as "ba01.s" and "ba.s". # # This has proven useful for expert music librarian searching by hand; it could # also be the basis of a GUI that executes searches behind the scenes for these # codes. def marc_instrument_codes_normalized(spec = "048") soloist_suffix = ".s" extractor = MarcExtractor.new("048", :separator => nil) return lambda do |record, accumulator| accumulator.concat( extractor.collect_matching_lines(record) do |field, spec, extractor| values = [] field.subfields.each do |sf| v = sf.value # Unless there's at least two chars, it's malformed, we can # do nothing next unless v.length >= 2 # Index both with and without number -- both with soloist suffix # if in a $b values << v values << "#{v}#{soloist_suffix}" if sf.code == 'b' if v.length >= 4 bare = v.slice(0,2) # just the prefix values << bare values << "#{bare}#{soloist_suffix}" if sf.code == 'b' end end values end.uniq ) end end # An opinionated algorithm for getting a SINGLE publication date out of marc # # * Prefers using 008, but will resort to 260c # * If 008 represents a date range, will take the midpoint of the range, # only if range is smaller than estimate_tolerance, default 15 years. # * Ignores dates below min_year (default 500) or above max_year (this year plus 6 years), # because experience shows too many of these were in error. # # Yeah, this code ends up ridiculous. def marc_publication_date(options = {}) estimate_tolerance = options[:estimate_tolerance] || 15 min_year = options[:min_year] || 500 max_year = options[:max_year] || (Time.new.year + 6) lambda do |record, accumulator| date = Marc21Semantics.publication_date(record, estimate_tolerance, min_year, max_year) accumulator << date if date end end # See #marc_publication_date. Yeah, this is a holy mess. # Maybe it should actually be extracted to it's own class! def self.publication_date(record, estimate_tolerance = 15, min_year = 500, max_year = (Time.new.year + 6)) field008 = MarcExtractor.cached("008").extract(record).first found_date = nil if field008 && field008.length >= 11 date_type = field008.slice(6) date1_str = field008.slice(7,4) date2_str = field008.slice(11, 4) if field008.length > 15 # for date_type q=questionable, we have a range. if (date_type == 'q') # make unknown digits at the beginning or end of range, date1 = date1_str.sub("u", "0").to_i date2 = date2_str.sub("u", "9").to_i # do we have a range we can use? if (date2 > date1) && ((date2 - date1) <= estimate_tolerance) found_date = (date2 + date1)/2 end end # didn't find a date that way, and anything OTHER than date_type # n=unknown, q=questionable, try single date -- for some date types, # there's a date range between date1 and date2, yeah, we often take # the FIRST date then, the earliest. That's just what we're doing. if found_date.nil? && date_type != 'n' && date_type != 'q' # in date_type 'r', second date is original publication date, use that I think? date_str = (date_type == 'r' && date2_str.to_i != 0) ? date2_str : date1_str # Deal with stupid 'u's, which end up meaning a range too, # find midpoint and make sure our tolerance is okay. ucount = 0 while (!date_str.nil?) && (i = date_str.index('u')) ucount += 1 date_str[i] = "0" end date = date_str.to_i if ucount > 0 && date != 0 delta = 10 ** ucount # 10^ucount, expontent if delta <= estimate_tolerance found_date = date + (delta/2) end elsif date != 0 found_date = date end end end # Okay, nothing from 008, try 260 if found_date.nil? v260c = MarcExtractor.cached("260c", :separator => nil).extract(record).first # just try to take the first four digits out of there, we're not going to try # anything crazy. if v260c =~ /(\d{4})/ found_date = $1.to_i end end # is it within our acceptable range? found_date = nil if found_date && (found_date < min_year || found_date > max_year) return found_date end # REGEX meant to rule out obvious non-LCC's, and only allow things # plausibly LCC's. LCC_REGEX = /\A *[A-Z]{1,3}[ .]*(?:(\d+)(?:\s*?\.\s*?(\d+))?).*/ # Looks up Library of Congress Classification (LCC) or NLM Medical Subject Headings (MeSH) # from usual parts of the marc record. Maps them to high-level broad categories, # basically just using the first part of the LCC. Note it's just looking in bib-level # locations for LCCs, you're on your own with holdings. # # Sanity checks to make sure the thing looks like an LCC with a regex, before # mapping. # # Will call it 'Unknown' if it's got nothing else, or pass in :default => something else, # or nil. # # The categories output aren't great, but they're something. def marc_lcc_to_broad_category( options = {}, spec="050a:060a:090a:096a") # Trying to match things that look like LCC, and not match things # that don't. Is tricky. lcc_regex = LCC_REGEX default_value = options.has_key?(:default) ? options[:default] : "Unknown" translation_map = Traject::TranslationMap.new("lcc_top_level") extractor = MarcExtractor.new(spec, :separator => nil) lambda do |record, accumulator| candidates = extractor.extract(record) candidates.reject! do |candidate| !(candidate =~ lcc_regex) end accumulator.concat translation_map.translate_array!(candidates.collect {|a| a.lstrip.slice(0, 1)}).uniq if default_value && accumulator.empty? accumulator << default_value end end end # An opinionated method of making a geographic facet out of BOTH 048 marc # codes, AND geo subdivisions in 6xx LCSH subjects. # # The LCSH geo subdivisions are further normalized: # * geo qualifiers in $z fields into parens, so "Germany -- Berlin" becomes "Berlin (Germany)" # (to be consistent with how same areas are written in $a fields -- doesn't # get everything, but gets lots of em) # * qualified regions like that are additionally 'posted up', so "Germany -- Berlin" gets # recorded additionally as "Germany" def marc_geo_facet(options = {}) marc_geo_map = Traject::TranslationMap.new("marc_geographic") a_fields_spec = options[:geo_a_fields] || "651a:691a" z_fields_spec = options[:geo_z_fields] || "600:610:611:630:648:650:654:655:656:690:651:691" extractor_043a = MarcExtractor.new("043a", :separator => nil) extractor_a_fields = MarcExtractor.new(a_fields_spec, :separator => nil) extractor_z_fields = MarcExtractor.new(z_fields_spec) lambda do |record, accumulator| accumulator.concat( extractor_043a.extract(record).collect do |code| # remove any trailing hyphens, then map marc_geo_map[code.gsub(/\-+\Z/, '')] end.compact ) #LCSH 651a and 691a go in more or less normally. accumulator.concat( extractor_a_fields.extract(record).collect do |s| # remove trailing periods, which they sometimes have if they were # at end of LCSH. s.sub(/\. */, '') end ) # fields we take z's from have a bit more normalization extractor_z_fields.each_matching_line(record) do |field, spec, extractor| z_fields = field.subfields.find_all {|sf| sf.code == "z"}.collect {|sf| sf.value } # depending on position in total field, may be a period on the end # we want to remove. z_fields.collect! {|s| s.gsub(/\. *\Z/, '')} if z_fields.length == 2 # normalize subdivision as parenthetical accumulator << "#{z_fields[1]} (#{z_fields[0]})" # and 'post up' accumulator << z_fields[0] else # just add all the z's if there's 1 or more than 2. accumulator.concat z_fields end end accumulator.uniq! end end # Opinionated routine to create values for a chronology/era facet out of # LCSH chron subdivisions. Does some normalization: # for 651 with a chron facet fitting the form # "aaaaa, yyyy-yyyy", it will add in the $a. For instance: # 651 a| United States x| History y| Civil War, 1861-1865 # --> "United States: Civil War, 1861-1865" def marc_era_facet ordinary_fields_spec = "600y:610y:611y:630y:648ay:650y:654y:656y:690y" special_fields_spec = "651:691" separator = ": " extractor_ordinary_fields = MarcExtractor.new(ordinary_fields_spec) extractor_special_fields = MarcExtractor.new(special_fields_spec) lambda do |record, accumulator| # straightforward ones accumulator.concat( extractor_ordinary_fields.extract(record).collect do |v| # May have a period we have to remove, if it was at end of tag v.sub(/\. *\Z/, '') end) # weird ones extractor_special_fields.each_matching_line(record) do |field, spec, extractor| field.subfields.each do |sf| next unless sf.code == 'y' if sf.value =~ /\A\s*.+,\s+(ca.\s+)?\d\d\d\d?(-\d\d\d\d?)?( B\.C\.)?[.,; ]*\Z/ # it's our pattern, add the $a in please accumulator << "#{field['a']}#{separator}#{sf.value.sub(/\. *\Z/, '')}" else accumulator << sf.value.sub(/\. *\Z/, '') end end end accumulator.uniq! end end # Extracts LCSH-carrying fields, and formatting them # as a pre-coordinated LCSH string, for instance suitable for including # in a facet. # # You can supply your own list of fields as a spec, but for significant # customization you probably just want to write your own method in # terms of the Marc21Semantics.assemble_lcsh method. def marc_lcsh_formatted(options = {}) spec = options[:spec] || "600:610:611:630:648:650:651:654:662" subd_separator = options[:subdivison_separator] || " — " other_separator = options[:other_separator] || " " extractor = MarcExtractor.new(spec) return lambda do |record, accumulator| accumulator.concat( extractor.collect_matching_lines(record) do |field, spec| Marc21Semantics.assemble_lcsh(field, subd_separator, other_separator) end) end end # Takes a MARC::Field and formats it into a pre-coordinated LCSH string # with subdivision seperators in the right place. # # For 600 fields especially, need to not just join with subdivision seperator # to take acount of $a$d$t -- for other fields, might be able to just # join subfields, not sure. # # WILL strip trailing period from generated string, contrary to some LCSH practice. # Our data is inconsistent on whether it has period or not, this was # the easiest way to standardize. # # Default subdivision seperator is em-dash with spaces, set to '--' if you want. # # Cite: "Dash (-) that precedes a subdivision in an extended 600 subject heading # is not carried in the MARC record. It may be system generated as a display constant # associated with the content of subfield $v, $x, $y, and $z." # http://www.loc.gov/marc/bibliographic/bd600.html def self.assemble_lcsh(marc_field, subd_separator = " — ", other_separator = " ") str = "" subd_prefix_codes = %w{v x y z} marc_field.subfields.each_with_index do |sf, i| # ignore non-alphabetic, like numeric control subfields next unless sf.code =~ /\A[a-z]\Z/ prefix = if subd_prefix_codes.include? sf.code subd_separator elsif i == 0 "" else other_separator end str << prefix << sf.value end str.gsub!(/\.\Z/, '') return nil if str == "" return str end end end