unless defined? JRUBY_VERSION
  raise "Only works under JRUBY"
end

# Try to resolve a marc4j class; if that fails with a NameError, load the
# marc4j jar from the sibling 'jars' directory.
begin
  include_class Java::org.marc4j.marc.impl.RecordImpl
rescue NameError
  jardir = File.join(File.dirname(__FILE__), '..', 'jars')
  require "#{jardir}/marc4j.jar"
end

require 'set'

# Re-open the MarcReader interface, define #each and include Enumerable
#
# We also automatically call #hashify on the records that stream through
# #each in order to speed up RecordImpl#[] when (a) doing many operations on a single
# record, and (b) we're not worried about interleaved tags (e.g., a 520 followed by a 510 followed
# by another 520)
module Java::OrgMarc4j::MarcReader
  include Enumerable

  # Yield each remaining record in turn.
  # @param [Boolean] hashify Whether to call #hashify on each record before yielding it
  # @yield [record] Each record read from the underlying reader
  def each(hashify=true)
    while self.hasNext
      r = self.next
      r.hashify if hashify
      yield r
    end
  end
end

module MARC4J4R

  # Add some sugar to the MarcReader interface
  #
  # Adjust the interface so that a #new call to any implementations that
  # implement it can take a java.io.InputStream, ruby IO object, or String
  # (that will be interpreted as a filename) without complaining.
  #
  # The mechanism -- running module_eval on a string-representation of the
  # new method in each of the hard-coded implementations of MarcReader
  # (MarcStreamReader,MarcPermissiveStreamReader,MarcXmlReader) -- is ugly
  # and deeply unsettling.
  #
  # @author Bill Dueber
  #

  # A string used to override the initializer for each stream reader
  # Need to do it this ugly way because of the way java and ruby interact;
  # can't just add it to the MarcReader interface the way I wanted to.
  NEWINIT = <<-ENDBINDER
    include Enumerable
    alias_method :oldinit, :initialize
    def initialize(fromwhere)
      stream = nil
      if fromwhere.is_a? Java::JavaIO::InputStream or fromwhere.is_a? Java::JavaIO::ByteArrayInputStream
        stream = fromwhere
      elsif fromwhere.is_a? IO
        stream = fromwhere.to_inputstream
      else
        stream = java.io.FileInputStream.new(fromwhere.to_java_string)
      end
      if self.class == Java::org.marc4j.MarcPermissiveStreamReader
        self.oldinit(stream, true, true)
      else
        self.oldinit(stream)
      end
    end
  ENDBINDER

  Java::org.marc4j.MarcStreamReader.module_eval(NEWINIT)
  Java::org.marc4j.MarcPermissiveStreamReader.module_eval(NEWINIT)
  Java::org.marc4j.MarcXmlReader.module_eval(NEWINIT)

  # Get a marc reader of the appropriate type
  # @param [String, IO, java.io.InputStream] input The IO stream (or filename) from which you want to read
  # @param [:strictmarc, :permissivemarc, :marcxml, :alephsequential] type The type of MARC reader you want.
  # @return [MarcReader, AlephSequentialReader] A reader object with the syntactic sugar added in this file (e.g, each)
  # @raise [ArgumentError] If type is not one of the four supported symbols
  #
  # @example Get a strict binary MARC reader for the file 'test.mrc'
  #   reader = MARC4J4R.reader('test.mrc')
  #
  # @example Get a permissive binary MARC reader
  #   reader = MARC4J4R.reader('test.mrc', :permissivemarc)
  #
  # @example Get a reader for an xml file
  #   reader = MARC4J4R.reader('test.xml', :marcxml)
  #
  # @example Get a reader based on an existing IO object
  #   require 'open-uri'
  #   infile = open('http://my.machine.com/test.mrc')
  #   reader = MARC4J4R.reader(infile)
  def reader(input, type = :strictmarc)
    case type
    when :strictmarc then
      return Java::org.marc4j.MarcStreamReader.new(input)
    when :permissivemarc then
      return Java::org.marc4j.MarcPermissiveStreamReader.new(input)
    when :marcxml then
      return Java::org.marc4j.MarcXmlReader.new(input)
    when :alephsequential then
      return MARC4J4R::AlephSequentialReader.new(input)
    else
      raise ArgumentError, "Reader type #{type} illegal: must be :strictmarc, :permissivemarc, :marcxml, or :alephsequential"
    end
  end
  module_function :reader

  # Implement an AlephSequential reader
  #
  # Reads a line-oriented format in which each non-blank line carries a
  # record id, a tag, two indicators and the field data; consecutive lines
  # sharing an id are gathered into one RecordImpl.
  class AlephSequentialReader
    include Enumerable

    # @param [String, IO, java.io.InputStream] fromwhere A filename, an open ruby IO,
    #   or a java InputStream (converted via #to_io) to read lines from
    def initialize(fromwhere)
      stream = nil
      if fromwhere.is_a? Java::JavaIO::InputStream
        stream = fromwhere.to_io
      elsif fromwhere.is_a? IO
        stream = fromwhere
      else
        stream = File.new(fromwhere)
      end
      @handle = stream
    end

    # Yield each record found in the input. Nothing is yielded when the input
    # contains no non-blank lines.
    # @yield [record] A RecordImpl built from all lines sharing one id
    def each
      record = nil
      currentID = nil
      @handle.each_line do |l|
        l.chomp!
        next unless l =~ /\S/
        # Fixed-width prefix: 9-char id, 1 skipped char, 3-char tag, two
        # indicator bytes, 3 skipped chars, then the field data.
        vals = l.unpack('a9 a a3 c c a3 a*')
        id, tag, ind1, ind2, data = vals[0], vals[2], vals[3], vals[4], vals[6]
        if id != currentID
          # A new id means the previous record (if any) is complete
          yield record if record
          record = RecordImpl.new
          currentID = id
        end
        if tag == 'LDR'
          record.setLeader(Java::org.marc4j.marc.impl.LeaderImpl.new(data))
        else
          record << buildField(tag, ind1, ind2, data)
        end
      end
      # Flush the last record; guard against empty input, where none was ever started
      yield record if record
    end

    # Subfields are introduced by '$$' followed by a one-character code
    SUBREGEXP = /\$\$(.)/

    # Build a control or data field from the pieces of one input line.
    # @param [String] tag The field tag
    # @param [Integer] ind1 First indicator, as unpacked from the line
    # @param [Integer] ind2 Second indicator, as unpacked from the line
    # @param [String] data The raw field data; for data fields, '$$'-delimited subfields
    # @return [ControlFieldImpl, DataFieldImpl] The new field
    def buildField(tag, ind1, ind2, data)
      if Java::org.marc4j.marc.impl.Verifier.isControlField tag
        return Java::org.marc4j.marc.impl.ControlFieldImpl.new(tag, data)
      else
        f = Java::org.marc4j.marc.impl.DataFieldImpl.new(tag, ind1, ind2)
        # split yields [leading text, code, value, code, value, ...]; drop the
        # leading text. Slicing an empty array from index 1 gives nil, hence
        # the fallback to [] for empty data.
        pieces = data.split(SUBREGEXP)[1..-1] || []
        pieces.each_slice(2) do |code, value|
          # split drops trailing empty strings, so a final '$$x' with no
          # value arrives here with value == nil; store it as ''
          f.addSubfield Java::org.marc4j.marc.impl.SubfieldImpl.new(code[0].ord, value.to_s)
        end
        return f
      end
    end

  end # End of class AlephSequentialReader

end

include_class Java::org.marc4j.marc.impl::RecordImpl
include_class Java::org.marc4j.marc.impl::ControlFieldImpl
include_class Java::org.marc4j.marc.impl::DataFieldImpl
include_class Java::org.marc4j.marc.impl::SubfieldImpl

# Open up RecordImpl to add some sugar, including Enumerable as well
# @author Bill Dueber
class RecordImpl
  include Enumerable

  alias_method :<<, :addVariableField
  alias_method :append, :addVariableField
  alias_method :fields, :getVariableFields

  # Export as a MARC-Hash, as described at
  # http://robotlibrarian.billdueber.com/marc-hash-the-saga-continues-now-with-even-less-structure/
  # @return A marc-hash representation of the record, suitable for calling .to_json on or whatever
  def to_marchash
    h = {}
    h['type'] = 'marc-hash'
    h['version'] = [1,0]
    h['leader'] = self.leader
    h['fields'] = self.getVariableFields.map do |f|
      if f.controlField?
        [f.tag, f.value]
      else
        subs = f.map {|subfield| [subfield.code, subfield.value]}
        [f.tag, f.indicator1 || ' ', f.indicator2 || ' ', subs]
      end
    end
    return h
  end

  # Create a local hash by tag number; makes some stuff faster
  # Called automatically if you use reader.each
  #
  # The hash is built once and never refreshed, so fields added to the
  # record afterwards are not visible through #[] or #find_by_tag.
  def hashify
    return if @hashedtags # don't do it more than once
    @hashedtags = {}
    self.getVariableFields.each do |f|
      @hashedtags[f.tag] ||= []
      @hashedtags[f.tag].push f
    end
  end

  # Create a nice string of the record
  # @return [String] The leader line followed by one line per field
  def to_s
    lines = ['LEADER ' + self.leader] + self.map {|f| f.to_s}
    return lines.join("\n")
  end

  # Get the leader as a string (marc4j would otherwise return Leader object)
  def leader
    self.get_leader.toString
  end

  # Cycle through the fields in the order they appear in the record
  def each
    self.getVariableFields.each do |f|
      yield f
    end
  end

  # Get the first field associated with a tag
  # @param [String] tag The tag
  # @return [Field] The first matching field, or nil if none. Note that
  #   to mirror ruby-marc, this returns a single field
  def [](tag)
    # Without the tag hash, fall back to marc4j's own lookup
    return self.getVariableField(tag) unless defined? @hashedtags
    matches = @hashedtags[tag]
    return matches ? matches[0] : nil
  end

  # Get a (possibly empty) list of fields with the given tag(s)
  #
  # @param [String, Array] tags A string (or Array of strings) with the tags you're interested in
  # @param [Boolean] originalorder Whether or not results should be presented in the original order within the
  #   record or with a two-column sort of (a) Order of the tag in the list of tags sent, (b) order within that tag
  #   in the record
  # @return [Array] Either an empty list or a list of one or more matched fields will be returned.
  #
  # originalorder == false will use an internal hash and be faster in many cases (see #hashify)
  #
  # @example originalorder == false
  #   # Given a record that looks like
  #   # 010    $a 68027371
  #   # 035    $a (RLIN)MIUG0001728-B
  #   # 035    $a (CaOTULAS)159818044
  #   # 035    $a (OCoLC)ocm00001728
  #
  #   r.find_by_tag(['035', '010']).each {|f| puts f.to_s}
  #   # 035    $a (RLIN)MIUG0001728-B
  #   # 035    $a (CaOTULAS)159818044
  #   # 035    $a (OCoLC)ocm00001728
  #   # 010    $a 68027371
  #
  #   # The results are ordered first by tag as passed in, then by original order within the tag
  #
  # @example Just get all fields for a single tag
  #   ohThirtyFives = r.find_by_tag('035')
  #
  # @example Get a bunch of standard identifiers
  #   standardIDs = r.find_by_tag(['022', '020', '010'])
  #
  # @example originalorder == true
  #   r.find_by_tag(['035', '010'], true).each {|f| puts f.to_s}
  #   # 010    $a 68027371
  #   # 035    $a (RLIN)MIUG0001728-B
  #   # 035    $a (CaOTULAS)159818044
  #   # 035    $a (OCoLC)ocm00001728
  def find_by_tag(tags, originalorder = false)
    # hashify is a no-op once the hash exists, so it is safe to call every time
    self.hashify
    if !tags.is_a? Array
      return @hashedtags[tags] || []
    end
    if originalorder
      return self.find_all {|f| tags.include? f.tag}
    else
      return @hashedtags.values_at(*tags).flatten.compact
    end
  end

  # Return the record as valid MARC-XML
  #
  # The generated string is cached in @xml after the first successful call
  # and returned as-is on later calls.
  # @return String A MARC-XML representation of the record, including the XML header.
  #   If generation fails, an error-message string is returned instead (and nothing is cached).
  def to_xml
    return @xml if @xml
    begin
      sw = java.io.StringWriter.new
      res = javax.xml.transform.stream.StreamResult.new(sw)
      writer = org.marc4j.MarcXmlWriter.new(res)
      writer.write(self)
      writer.writeEndDocument
      # Cache the String, not the StringWriter, so every call returns a String
      @xml = sw.toString
      return @xml
    rescue => e
      # The record may have no 001; don't let the error message itself blow up
      f001 = self['001']
      id = f001 ? f001.data : ''
      "Woops! to_xml failed for record #{id}: #{e}"
    end
  end

  # Return the record serialized by marc4j's MarcStreamWriter
  # @return [String] The serialized record, or an error-message string if serialization fails
  def to_marc
    begin
      s = Java::java.io.ByteArrayOutputStream.new
      writer = org.marc4j.MarcStreamWriter.new(s)
      writer.write(self)
      @marcbinary = s.to_string
      return @marcbinary
    rescue
      "Whoops! Failed: #{$!}"
    end
  end

end

# Open up ControlFieldImpl to add some sugar
class ControlFieldImpl
  # @return The field's data
  def value
    return self.data
  end

  # @return [Boolean] Always true
  def controlField?
    return true
  end

  # Ask marc4j's Verifier whether the given tag is a control-field tag
  # @param [String] tag The tag to check
  def self.control_tag?(tag)
    return Java::org.marc4j.marc.impl.Verifier.isControlField tag
  end

  # Pretty-print
  # @return [String] The tag and the value, separated by a space
  def to_s
    return self.tag + " " + self.value
  end
end

# Open up DataFieldImpl to add some sugar, including Enumerable over its subfields
class DataFieldImpl
  include Enumerable

  alias_method :<<, :addSubfield

  # @return [Boolean] Always false
  def controlField?
    return false
  end

  # Two data fields are equal when tag, both indicators, and the ordered
  # list of subfield (code, data) pairs all match.
  # @param other The object to compare against
  # @return [Boolean] false for anything that doesn't look like a data field
  def ==(other)
    return false unless other.respond_to?(:indicator1) && other.respond_to?(:each)
    return false unless self.tag == other.tag &&
                        self.indicator1 == other.indicator1 &&
                        self.indicator2 == other.indicator2
    mine   = self.map {|s| [s.code, s.data]}
    theirs = other.map {|s| [s.code, s.data]}
    return mine == theirs
  end

  # Pretty-print
  # @param [String] joiner What string to use to join the subfields
  # @return [String] The pretty string
  def to_s(joiner = ' ')
    arr = [self.tag + ' ' + self.indicator1 + self.indicator2]
    self.each do |s|
      arr.push s.to_s
    end
    return arr.join(joiner)
  end

  # Get the value of the first subfield of this field with the given code
  # @param [String] code 1-character string of the subfield code
  # @return [String] The value of the first matched subfield, or nil if there is none
  # @raise [ArgumentError] If code is not a one-character String
  def [](code)
    raise ArgumentError, "Code must be a one-character string, not #{code}" unless code.is_a? String and code.size == 1
    # need to send a char value that the underlying java can deal with
    sub = self.getSubfield(code[0].ord)
    if (sub)
      return sub.getData
    else
      return nil
    end
  end

  # Get all values from the subfields for the given code or array of codes
  # @param [String, Array, Set] code (Array of?) 1-character string(s) of the subfield code
  # @param [Boolean] myorder Use the order of subfields that I gave instead of the order they're in the record
  # @return [Array] A possibly-empty array of Strings made up of the values in the subfields whose
  #   code is included in the given codes. If myorder == true, use the order in which they are passed in; if a code is repeated
  #   (occasionally legal) subfield values will appear first ordered by the passed array, then by order within
  #   the document.
  #
  # If myorder is false, just return the values for matching subfields in the order they appear in the field.
  #
  # @example Quick examples:
  #   # 260    $a New York, $b Van Nostrand Reinhold Co. $c 1969
  #   rec['260'].sub_values('a') #=> ["New York,"]
  #   rec['260'].sub_values(['a', 'c']) #=> ["New York,", "1969"]
  #   rec['260'].sub_values(['c', 'a']) #=> ["New York,", "1969"]
  #   rec['260'].sub_values(['c', 'a'], true) #=> ["1969", "New York,"]
  def sub_values(code, myorder = false)
    # Do a little razzle-dazzle for the common case when a single code is given
    single = nil
    if not [Set, Array].include? code.class
      single = code
    elsif code.size == 1
      single = code.first
    end
    if single
      return self.find_all {|s| single == s.code}.map {|s| s.data}
    end

    if myorder
      # One pass over the field per requested code, in the caller's order
      grouped = code.map {|c| self.find_all {|s| c == s.code}}
      return grouped.flatten.map {|s| s.data}
    else
      return self.find_all {|s| code.include? s.code}.map {|s| s.data}
    end
  end

  # Get first indicator as a one-character string
  def indicator1
    return self.getIndicator1.chr
  end

  # Get second indicator as a one-character string
  def indicator2
    return self.getIndicator2.chr
  end

  # Iterate over the subfields
  def each
    self.getSubfields.each do |s|
      yield s
    end
  end

  # Get the concatenated values of the subfields in the order they appear in the field
  # @param [String] joiner The string used to join the subfield values
  def value(joiner=' ')
    data = self.getSubfields.map {|s| s.data}
    return data.join(joiner)
  end

end

# Open up SubfieldImpl to add some sugar
class SubfieldImpl

  # Two subfields are equal when both code and data match.
  # @param other The object to compare against
  # @return [Boolean] false for anything that doesn't respond to #code and #data
  def ==(other)
    return false unless other.respond_to?(:code) && other.respond_to?(:data)
    # Must be && here: 'return a and b' returns before 'b' is ever considered
    return self.code == other.code && self.data == other.data
  end

  # @return The subfield's data
  def value
    return self.data
  end

  # Get the subfield code as a one-character string
  def code
    return self.getCode.chr
  end

  # Pretty-print
  # @return [String] '$', the code, a space, and the data
  def to_s
    return '$' + self.code + " " + self.data
  end
end