#!/usr/bin/env jruby --server -J-Djruby.compile.frameless=true -J-Djruby.compile.positionless=true -J-Djruby.compile.peephole=true
#
# Command-line driver for marc2solr. Reads its configuration through
# MARC2Solr::Conf and dispatches on the configured command:
#
#   commit -- send a commit to the configured Solr
#   delete -- delete the IDs listed (one per line) in the given file(s)
#   index  -- index the given MARC file(s) using the index file (--indexfile)
#
# The :dryrun option suppresses every call that would modify Solr.

require 'marc2solr'
require 'marc2solr/marc2solr_custom'
require 'rubygems'
require 'pp'
require 'thread' # Mutex for the shared record counter (needed on 1.8-mode runtimes)
require 'marcspec'
require 'threach'

opts = MARC2Solr::Conf.new
# pp opts

###########################
# Get a master logger
###########################

LOG = opts.masterLogger

# Crank down the SUSS logging
susslog = RJack::Logback['org.apache.solr.client.solrj.impl.StreamingUpdateSolrServer']
susslog.level = RJack::Logback::WARN

# Perform the command
case opts.command

#################
# Commit -- just send a commit to the configured solr
#################
when "commit"
  LOG.info "Commit to #{opts.sussURL}"
  if opts[:dryrun]
    LOG.debug "Using javabin" if opts[:javabin]
    LOG.info "DRY RUN. Stopping now."
  else
    opts.suss.commit
    LOG.info "Commit done"
  end

#####################################################
# delete -- delete IDs listed in the given file(s)
#####################################################
when "delete"
  delfiles = opts.rest
  unless delfiles.size > 0
    LOG.error "command 'delete' needs at least one filename"
    puts "\n\nERROR: command 'delete' needs at least one filename"
    opts.print_command_help('delete')
    # NOTE(review): print_command_help may already terminate the process;
    # exit explicitly so we never fall through and send a commit for nothing.
    exit 1
  end

  # Get the suss
  suss = opts.suss

  # Make sure they can all be opened before deleting anything
  delfiles.each do |filename|
    next if File.readable? filename
    LOG.error "Can't open delete file `#{filename}`"
    raise ArgumentError, "Can't open delete file `#{filename}`"
  end

  # Now go ahead and process them
  total = 0
  delfiles.each do |filename|
    count = 0
    LOG.info "Deleting IDs listed in #{filename}"
    LOG.info "DRY RUN ONLY" if opts[:dryrun]
    # File.foreach closes the file even if a delete raises
    File.foreach(filename) do |line|
      id = line.chomp
      next if id.empty? # a blank line is not an ID
      suss.deleteById(id) unless opts[:dryrun]
      LOG.debug "Deleted id\t{}", id
      count += 1
      total += 1
    end
    LOG.info "Tried to delete #{count} ids from file '#{filename}'"
  end
  LOG.info "Tried to delete #{total} ids from all #{delfiles.size} files" if delfiles.size > 1

  unless opts[:dryrun] || opts[:skipcommit]
    LOG.info "Sending final commit"
    suss.commit
    LOG.info "Final commit finished"
  end

#####################################################
# index -- index the given marc files
#####################################################
when "index"
  marcfiles = opts.rest
  unless marcfiles.size > 0
    LOG.error "command 'index' needs at least one marc file to index"
    puts "\n\nERROR: command 'index' needs at least one filename"
    opts.print_command_help('index')
    # NOTE(review): print_command_help may already terminate the process;
    # exit explicitly so an empty run can never reach the clear/commit steps.
    exit 1
  end

  # Make sure we've got an indexfile
  unless opts[:indexfile]
    LOG.error "Index configuration file (index.rb) is required."
    raise ArgumentError, "Index configuration file (index.rb) is required"
  end

  # Make sure everything can be opened
  if File.readable? opts[:indexfile]
    LOG.debug "Found index file #{opts[:indexfile]}"
  else
    LOG.error "Index configuration file '#{opts[:indexfile]}' (set with --indexfile) cannot be found/read"
    raise ArgumentError, "Index configuration file '#{opts[:indexfile]}' (set with --indexfile) cannot be found/read"
  end

  if opts[:tmapdir]
    if File.readable? opts[:tmapdir]
      LOG.debug "Found translation maps directory #{opts[:tmapdir]}"
    else
      LOG.error "Translation Map directory '#{opts[:tmapdir]}' (set with --tmapdir) cannot be found/read"
      raise ArgumentError, "Translation Map directory '#{opts[:tmapdir]}' (set with --tmapdir) cannot be found/read"
    end
  else
    LOG.warn "No translation maps defined; hope that's what you intended"
  end

  marcfiles.each do |filename|
    # The literal name "STDIN" is passed through without a readability check
    if filename == "STDIN"
      LOG.info "Using standard input as a marc file"
      next
    end
    if File.readable? filename
      LOG.debug "Adding marc file #{filename} to queue"
    else
      LOG.error "Can't open MARC file `#{filename}`"
      raise ArgumentError, "Can't open MARC file `#{filename}`"
    end
  end

  # Load all the files in the customdir(s)
  LOG.debug "Loading custom routines"
  opts[:customdir].each do |dir|
    # Skip missing directories instead of mutating the list being iterated
    unless File.exist? dir
      LOG.warn "Skipping load directory '#{dir}': Not found"
      next
    end
    LOG.info "Loading files in #{dir}"
    Dir.glob(["#{dir}/*.jar"]).each do |x|
      LOG.debug "Loading #{x}"
      require x
    end
    Dir.glob(["#{dir}/*.rb"]).each do |x|
      LOG.debug "Loading #{x}"
      require x
    end
  end

  # Get the suss. Will throw an Argument error if there's not enough information
  suss = opts.suss

  # Create a specset
  ss = MARCSpec::SpecSet.new
  ss.loadMapsFromDir opts[:tmapdir] if opts[:tmapdir]
  ss.buildSpecsFromDSLFile opts[:indexfile]

  # Set up if we're using threach or not
  if opts[:threads] > 1
    LOG.info "Using #{opts[:threads]} threads; activating threach"
    method = :threach
    args = [opts[:threads], :each_with_index]
  else
    LOG.info "Using 1 thread"
    method = :each_with_index
    args = []
  end

  LOG.info "Begin DRY RUN; nothing will be sent to Solr" if opts[:dryrun]

  # Clear solr if so declared
  if opts[:clearsolr]
    if opts[:dryrun]
      LOG.info "Would have cleared out solr (but for dryrun)"
    else
      suss.deleteByQuery('*:*')
      suss.commit
      LOG.info "Cleared out solr"
    end
  end

  # Read each filename in turn, indexing records in each.
  # `total` counts records across ALL files; the per-file index yielded by the
  # reader restarts for every file and is local to the block, so it cannot be
  # used for the overall count. The mutex keeps the count correct when the
  # block runs on several threads.
  total = 0
  total_lock = Mutex.new
  start = Time.new.to_f

  marcfiles.each do |filename|
    LOG.info "Indexing file {}", filename
    reader = opts.reader(filename)
    reader.send(method, *args) do |r, idx|
      Thread.current[:index] = idx

      # A record without an 001 must not abort indexing from a log statement
      id_field = r['001']
      LOG.debug "Record {} with 001 {}", idx, (id_field ? id_field.value : '(no 001)')

      doc = ss.doc_from_marc(r, opts[:benchmark])

      # Send it to solr
      suss << doc unless opts[:dryrun]

      # Print out the record and/or the document
      opts[:debugfile].puts r if opts[:printmarc]
      opts[:debugfile].puts doc if opts[:printdoc]
      opts[:debugfile].puts "\n\n" if opts[:printdoc] || opts[:printmarc]

      done = total_lock.synchronize { total += 1 }
      if done % opts[:logbatchsize] == 0
        pace = done / (Time.new.to_f - start)
        LOG.info "%d indexed (overall pace: %.0f rec/sec)" % [done, pace]
      end
    end # reader
  end # marcfiles

  # Commit
  unless opts[:dryrun] || opts[:skipcommit]
    LOG.info "Sending final commit"
    suss.commit
    LOG.info "Final commit finished"
  end

  # Be done
  LOG.info "Done indexing"
  pace = total / (Time.new.to_f - start)
  LOG.info "%d indexed (overall pace: %.0f rec/sec)" % [total, pace]

  # Log the benchmarking information if requested, slowest first
  if opts[:benchmark]
    ss.benchmarks.keys.sort_by { |k| -ss.benchmarks[k].real }.each do |k|
      LOG.info("%-20s %s" % ["#{k}:", ss.benchmarks[k].real.to_s])
    end
  end

end # end of the case statement