require 'nokogiri'
require 'marc/xml_parsers'
require 'marc_extensions'
require 'berkeley_library/util/files'
module BerkeleyLibrary
module TIND
module MARC
# A customized XML reader for reading MARC records from TIND search results.
class XMLReader
include Enumerable
include ::MARC::NokogiriReader
include BerkeleyLibrary::Util::Files
# ############################################################
# Constant
COMMENT_TOTAL_RE = /Search-Engine-Total-Number-Of-Results: ([0-9]+)/.freeze
# ############################################################
# Attributes
attr_reader :search_id
# Returns the total number of records, based on the `<total/>` tag
# returned by the TIND Search API, or the special comment
# `Search-Engine-Total-Number-Of-Results` returned by TIND
# Regular Search in XML format.
#
# Note that the total is not guaranteed to be present, and if present,
# may not be present unless at least some records have been parsed.
#
# @return [Integer, nil] the total number of records, or `nil` if the total has not been read yet
def total
  # Nothing has been read yet (neither the total element nor the total comment was seen).
  return nil if @total.nil?

  # Coerce defensively so callers always receive an Integer.
  @total.to_i
end
# Returns the number of records yielded.
#
# @return [Integer] the number of records yielded.
def records_yielded
  # Lazily start the counter at zero the first time it is asked for.
  @records_yielded = 0 unless @records_yielded
  @records_yielded
end
# ############################################################
# Initializer
# Reads MARC records from an XML datasource given either as an XML string, a file path,
# or as an IO object.
#
# @param source [String, Pathname, IO] an XML string, the path to a file, or an IO to read from directly
# @param freeze [Boolean] whether to freeze each record after reading
def initialize(source, freeze: false)
  # Remember the freeze preference; it is consulted each time a record is yielded.
  @freeze = freeze
  # Normalize the datasource (IO, file path, or XML string) to something readable.
  @handle = ensure_io(source)
  # Set up parser state (`init` is not defined in this view; presumably from MARC::NokogiriReader).
  init
end
# Convenience factory: builds a reader over an XML datasource given as an
# XML string, a file path, or an IO object. Equivalent to calling `new`
# with the same arguments.
#
# @param source [String, Pathname, IO] an XML string, the path to a file, or an IO to read from directly
# @param freeze [Boolean] whether to freeze each record after reading
# @return the newly constructed reader
def self.read(source, freeze: false)
  new(source, freeze: freeze)
end
# ############################################################
# MARC::GenericPullParser overrides
def yield_record
  # Normalize the freshly parsed record before handing it to the parent implementation.
  parsed = @record[:record]
  clean_cf_values(parsed)
  move_cf000_to_leader(parsed)
  parsed.freeze if @freeze

  super
ensure
  # Count the record even when normalization or the parent call raises.
  increment_records_yielded!
end
# ############################################################
# Nokogiri::XML::SAX::Document overrides
# @see Nokogiri::XML::SAX::Document#start_element_namespace
# rubocop:disable Metrics/ParameterLists
def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = [])
  # Let the parent implementation handle the element first.
  super(name, attrs, prefix, uri, ns)

  # Track the innermost open element so #characters knows which text it is receiving.
  @current_element_name = name
end
# rubocop:enable Metrics/ParameterLists
# @see Nokogiri::XML::SAX::Document#end_element_namespace
def end_element_namespace(name, prefix = nil, uri = nil)
  # Let the parent implementation handle the closing element first.
  super(name, prefix, uri)

  # No element is "current" once one closes; text arriving before the next
  # start tag is ignored by #characters.
  @current_element_name = nil
end
# @see Nokogiri::XML::SAX::Document#characters
def characters(string)
  # Text outside any tracked element is dropped.
  element = @current_element_name
  return unless element

  if 'search_id' == element
    # Search ID element: keep the raw text.
    @search_id = string
  elsif 'total' == element
    # Total element: store the count as an Integer.
    @total = string.to_i
  else
    # Anything else is handed to the parent implementation.
    super
  end
end
# @see Nokogiri::XML::SAX::Document#comment
def comment(string)
  # Only comments matching COMMENT_TOTAL_RE carry a total; all others are ignored.
  match = COMMENT_TOTAL_RE.match(string)
  @total = match[1].to_i if match
end
# ############################################################
# Private
private
# TIND uses <controlfield tag="000"> instead of <leader>
def move_cf000_to_leader(record)
  # Nothing to do when the record has no 000 field.
  leader_field = record['000']
  return unless leader_field

  # Promote the 000 field's value to the record leader, then drop the field itself.
  record.leader = leader_field.value
  record.fields.delete(leader_field)
end
# TIND uses \ (0x5c), not space (0x20), for unspecified values in positional fields
def clean_cf_values(record)
  record.each_control_field do |control_field|
    # Swap every backslash for a space; a nil value stays nil.
    raw_value = control_field.value
    control_field.value = raw_value&.gsub('\\', ' ')
  end
end
def ensure_io(file)
return file if reader_like?(file)
return File.new(file) if file_exists?(file)
return StringIO.new(file) if file =~ /^\s*