module Traject
  # MarcExtractor extracts lists of strings from a MARC record according to
  # a specification. See .parse_string_spec for the string syntax used to
  # specify extraction, and #initialize for the options that control it.
  #
  # Examples:
  #
  #     array_of_stuff = MarcExtractor.new(marc_record, "001:245abc:700a").extract
  #     values         = MarcExtractor.new(marc_record, "040a", :seperator => nil).extract
  #
  # NOTE: the option key really is spelled `:seperator` (sic); it is kept
  # as-is so existing callers keep working.
  class MarcExtractor
    attr_accessor :options, :marc_record, :spec_hash

    # Convenience method: builds a MarcExtractor and runs #extract on it.
    #
    # marc_record   - the record to extract from; must not be nil.
    # specification - either a String accepted by .parse_string_spec, or a
    #                 Hash already returned by .parse_string_spec.
    # options       - optional Hash passed through as the third argument of
    #                 the MarcExtractor constructor.
    #
    # Returns the Array produced by #extract.
    # Raises ArgumentError if marc_record is nil.
    def self.extract_by_spec(marc_record, specification, options = {})
      raise ArgumentError, "first argument must not be nil" if marc_record.nil?

      Traject::MarcExtractor.new(marc_record, specification, options).extract
    end

    # marc_record - the record to extract from. It is only ever iterated with
    #               #each, yielding field objects.
    # spec        - either a String specification (handed to
    #               .parse_string_spec) or a Hash already produced by it.
    # options     - Hash, merged over these defaults:
    #
    # [:seperator]        default ' ' (space): what to join subfield values
    #                     with. When nil/false the values are not joined and
    #                     each subfield value is returned on its own.
    #
    # [:alternate_script] default :include: also use linked 880 fields whose
    #                     subfield 6 points at a tag in the spec. Also:
    #                     * false => do not include linked 880s.
    #                     * :only => only linked 880s, not the original fields.
    def initialize(marc_record, spec, options = {})
      self.options = {
        :seperator        => ' ',
        :alternate_script => :include
      }.merge(options)

      self.marc_record = marc_record

      self.spec_hash = spec.kind_of?(Hash) ? spec : self.class.parse_string_spec(spec)
    end

    # Converts a string marc spec like "245abc:700a" into the nested hash used
    # internally to represent the specification.
    #
    # A String specification has the form:
    #
    #     {tag}{|indicators|}{subfields}
    #
    # with several such parts separated by colons.
    #
    # * tag is three chars (usually but not necessarily numeric),
    # * indicators are optional: exactly two chars wrapped in pipes,
    # * subfields are an optional list of chars (lowercase alphanumeric).
    #
    # In an indicator spec either char can be * meaning "don't care"; use a
    # space to mean 'blank'.
    #
    #     "245|01|abc65:345abc:700|*5|:800"
    #
    # Alternatively a part can carry a byte slice specification instead of
    # subfields/indicators; this is intended for control (fixed) fields,
    # ordinarily 001-010. The special tag "LDR" for the marc leader is parsed
    # too, but extracting from the leader is still a TODO.
    #
    #     "008[35-37]:LDR[5]"
    #     => bytes 35-37 inclusive of field 008, and byte 5 of the marc leader.
    #
    # Returns a nested hash keyed by tag:
    #
    #     { tag => {
    #         :subfields  => ['a', 'b', '2'], # Array of codes; key absent when none given
    #         :indicators => ['1', nil]       # two-element Array, nil means "don't care";
    #                                         # key absent when no indicators given
    #       }
    #     }
    #
    # For byte offsets the inner hash instead has :bytes => 12 or
    # :bytes => (7..10).
    #
    # When the same tag appears in more than one part, subfields accumulate,
    # while :indicators and :bytes are overwritten by the later part.
    #
    # Raises ArgumentError for a part matching neither form.
    def self.parse_string_spec(spec_string)
      hash = {}

      spec_string.split(":").each do |part|
        if part =~ /\A([a-zA-Z0-9]{3})(\|([a-z0-9\ \*]{2})\|)?([a-z0-9]*)?\Z/
          # variable field
          tag, indicators, subfields = $1, $3, $4

          hash[tag] ||= {}

          if subfields
            subfields.each_char do |subfield|
              hash[tag][:subfields] ||= []
              hash[tag][:subfields] << subfield
            end
          end

          if indicators
            # "*" is the wildcard; it is stored as nil.
            hash[tag][:indicators] = [
              (indicators[0] if indicators[0] != "*"),
              (indicators[1] if indicators[1] != "*")
            ]
          end
        elsif part =~ /\A([a-zA-Z0-9]{3})(\[(\d+)(-(\d+))?\])\Z/
          # byte slice, e.g. "005[4-5]"
          tag, byte1, byte2 = $1, $3, $5

          hash[tag] ||= {}

          if byte1 && byte2
            hash[tag][:bytes] = ((byte1.to_i)..(byte2.to_i))
          elsif byte1
            hash[tag][:bytes] = byte1.to_i
          end
        else
          raise ArgumentError, "Unrecognized marc extract specification: #{part}"
        end
      end

      hash
    end

    # Returns an array of extracted values; may be an empty array.
    #
    # Control fields contribute their value, or the requested byte slice of
    # it when the spec has :bytes. String#byteslice returns nil when the
    # slice starts beyond the end of the value, and that nil is appended
    # as-is. Other fields contribute the result of #collect_subfields.
    def extract
      results = []

      each_matching_line do |field, spec|
        if control_field?(field)
          results << (spec[:bytes] ? field.value.byteslice(spec[:bytes]) : field.value)
        else
          results.concat collect_subfields(field, spec)
        end
      end

      results
    end

    # Yields to the block for every field in the source record that matches
    # the spec, taking the :alternate_script option into account.
    #
    # Block arguments: the matching field, the spec hash it matched on, and
    # (optionally) self, the MarcExtractor object, which is useful for custom
    # implementations.
    def each_matching_line
      marc_record.each do |field|
        if (spec = spec_covering_field(field)) && matches_indicators(field, spec)
          yield(field, spec, self)
        end
      end
    end

    # Like #each_matching_line, takes a block to process each matching field,
    # but collects the block results into one array -- any subarrays are
    # flattened for you.
    #
    # Useful for re-using this class for custom processing.
    def collect_matching_lines
      results = []

      each_matching_line do |field, spec, extractor|
        results.concat [yield(field, spec, extractor)].flatten
      end

      results
    end

    # Pass in a marc data field and a hash spec; returns an ARRAY of strings:
    # the values of the subfields selected by spec[:subfields] (all subfields
    # when that is nil).
    #
    # When options[:seperator] is set, the values are joined into a single
    # string, giving a one-element array; otherwise each value is returned
    # separately.
    #
    # Always returns an array, sometimes an empty one.
    def collect_subfields(field, spec)
      wanted = spec[:subfields]

      values = field.subfields.collect do |subfield|
        subfield.value if wanted.nil? || wanted.include?(subfield.code)
      end.compact

      return values if values.empty? # nothing matched; do not join into [""]

      options[:seperator] ? [values.join(options[:seperator])] : values
    end

    # Returns the spec hash covering extraction from this field, or nil when
    # there is none.
    #
    # * An 880 field with a subfield 6 is looked up under the first three
    #   bytes of that subfield (the tag of the field it is linked to), unless
    #   :alternate_script is false.
    # * Any other field -- including an 880 that did not take the branch
    #   above -- is looked up under its own tag, unless :alternate_script is
    #   :only, in which case nil is returned.
    def spec_covering_field(field)
      if field.tag == "880" && field['6'] && options[:alternate_script] != false
        # Pull out the spec for the original marc tag this 880 corresponds to.
        # Due to a bug in jruby https://github.com/jruby/jruby/issues/886 , we
        # need this weird encode gymnastics, which fixes it for mysterious
        # reasons.
        linkage  = field["6"]
        orig_tag = linkage.encode(linkage.encoding).byteslice(0, 3)

        spec_hash[orig_tag]
      elsif options[:alternate_script] != :only
        spec_hash[field.tag]
      end
    end

    # True when field is a MARC::ControlField.
    def control_field?(field)
      # should the MARC gem have a more efficient way to do this,
      # define #control_field? on both ControlField and DataField?
      field.kind_of? MARC::ControlField
    end

    # field - a marc field.
    # spec  - an individual spec hash, {:subfields => array, :indicators => array}.
    #
    # Returns true when the field satisfies the indicator constraints of
    # spec; a nil entry in spec[:indicators] means "don't care".
    def matches_indicators(field, spec)
      wanted = spec[:indicators]
      return true if wanted.nil?

      # An indicator constraint cannot be evaluated against a control field
      # (assumed to expose no indicator readers, as in the ruby-marc gem), so
      # it is ignored rather than raising NoMethodError on #indicator1.
      return true if control_field?(field)

      (wanted[0].nil? || wanted[0] == field.indicator1) &&
        (wanted[1].nil? || wanted[1] == field.indicator2)
    end
  end
end