#!/usr/bin/env jruby --server --1.9
#
# Command-line driver: builds a MARC2Solr::Conf, then dispatches on
# opts.command to one of four actions:
#
#   ping   -- check that the configured Solr answers /admin/ping
#   commit -- send a commit to the configured Solr
#   delete -- delete the IDs listed (one per line) in the given file(s)
#   index  -- index the given MARC file(s) using an index (DSL) file

require 'marc2solr'
require 'marc2solr/marc2solr_custom'
require 'rubygems'
require 'pp'
require 'thread' # Mutex, used to count indexed records across worker threads
require 'marcspec'
require 'jruby_threach'

opts = MARC2Solr::Conf.new

# Perform the command

# If we're going to be talking to Solr, make sure it's there!

HC = Java::org.apache.commons.httpclient

# Issue a GET against "<sussURL>/admin/ping" and insist on an HTTP 200.
#
# Does nothing (beyond an optional log line) when opts[:dryrun] is set.
# LOG is only assigned by some commands (it is not set for "ping"), so every
# use of it in here is guarded with `defined? LOG`.
#
# @param opts configuration object; read via opts[:dryrun] and opts.sussURL
# @return [nil]
# @raise [ArgumentError] if the request itself fails, or if the response
#   code is anything other than 200
def pingSolr(opts)
  if opts[:dryrun]
    LOG.info "Dry run: not bothing to check and see if solr is alive" if defined? LOG
    return
  end

  url    = opts.sussURL + '/admin/ping'
  client = HC.HttpClient.new
  method = HC.methods.GetMethod.new(url)

  begin
    code = client.executeMethod(method)
  rescue
    LOG.error "Can't ping solr URL '#{url}'" if defined? LOG
    raise ArgumentError, "(Not up?) Can't ping solr server at #{opts.sussURL}"
  ensure
    # Hand the connection back whether or not the request succeeded.
    method.releaseConnection
  end

  unless code == 200
    LOG.error "Can't ping solr URL '#{url}'" if defined? LOG
    raise ArgumentError, "Got response code #{code} from #{opts.sussURL}"
  end

  nil
end

case opts.command

###################
# Ping -- see if your solr is alive
###################

when "ping"
  begin
    pingSolr(opts)
    puts "\n#{opts.sussURL} is alive\n"
  rescue ArgumentError => e
    puts e.message
  end

#################
# Commit -- just send a commit to the configured solr
#################

when "commit"
  LOG = opts.masterLogger

  # Crank down the SUSS logging
  susslog = RJack::Logback['org.apache.solr.client.solrj.impl.StreamingUpdateSolrServer']
  susslog.level = RJack::Logback::WARN

  pingSolr(opts)
  LOG.info "Commit to #{opts.sussURL}"
  if opts[:dryrun]
    LOG.debug "Using javabin" if opts[:javabin]
    LOG.info "DRY RUN. Stopping now."
  else
    opts.suss.commit
    LOG.info "Commit done"
  end

#####################################################
# delete -- delete IDs listed in the given file(s)
#####################################################

when "delete"
  LOG = opts.masterLogger

  # Crank down the SUSS logging
  susslog = RJack::Logback['org.apache.solr.client.solrj.impl.StreamingUpdateSolrServer']
  susslog.level = RJack::Logback::WARN

  delfiles = opts.rest
  unless delfiles.size > 0
    LOG.error "command 'delete' needs at least one filename"
    puts "\n\nERROR: command 'delete' needs at least one filename"
    # NOTE(review): presumably print_command_help exits the process; if it
    # does not, execution falls through with an empty file list -- confirm.
    opts.print_command_help('delete')
  end

  # Get the suss
  suss = opts.suss

  # Make sure they can all be opened before deleting anything
  delfiles.each do |filename|
    unless File.readable? filename
      LOG.error "Can't open ID file `#{filename}`"
      raise ArgumentError, "Can't open ID file `#{filename}`"
    end
  end

  # Now go ahead and process them
  total = 0
  delfiles.each do |filename|
    count = 0
    LOG.info "Deleting IDs listed in #{filename}"
    LOG.info "DRY RUN ONLY" if opts[:dryrun]

    # Block form so the file is closed even if a delete raises.
    File.open(filename) do |f|
      f.each_line do |id|
        id.chomp!
        next if id.empty? # a blank line is not an ID; don't send or count it
        suss.deleteById(id) unless opts[:dryrun]
        LOG.debug "Deleted id\t{}", id
        count += 1
        total += 1
      end
    end

    LOG.info "Tried to delete #{count} ids from file '#{filename}'"
  end
  LOG.info "Tried to delete #{total} ids from all #{delfiles.size} files" if delfiles.size > 1

  unless opts[:dryrun] || opts[:skipcommit]
    LOG.info "Sending final commit"
    suss.commit
    LOG.info "Final commit finished"
  end

#####################################################
# index -- index the given marc files
#####################################################

when "index"
  LOG = opts.masterLogger

  # Crank down the SUSS logging
  susslog = RJack::Logback['org.apache.solr.client.solrj.impl.StreamingUpdateSolrServer']
  susslog.level = RJack::Logback::WARN

  marcfiles = opts.rest
  unless marcfiles.size > 0
    LOG.error "command 'index' needs at least one marc file to index"
    puts "\n\nERROR: command 'index' needs at least one filename"
    # NOTE(review): presumably print_command_help exits the process -- confirm.
    opts.print_command_help('index')
  end

  # Make sure we've got an indexfile
  unless opts[:indexfile]
    LOG.error "Index configuration file (index.rb) is rquired."
    raise ArgumentError, "Index configuraiton file (index.rb) is required"
  end

  # Make sure everything can be opened
  if File.readable? opts[:indexfile]
    LOG.debug "Found index file #{opts[:indexfile]}"
  else
    LOG.error "Index configuration file '#{opts[:indexfile]}' (set with --indexfile) cannot be found/read"
    raise ArgumentError, "Index configuration file '#{opts[:indexfile]}' (set with --indexfile) cannot be found/read"
  end

  if opts[:tmapdir]
    if File.readable? opts[:tmapdir]
      LOG.debug "Found translation maps directory #{opts[:tmapdir]}"
    else
      LOG.error "Translation Map directory '#{opts[:tmapdir]}' (set with --tmapdir) cannot be found/read"
      raise ArgumentError, "Translation Map directory '#{opts[:tmapdir]}' (set with --tmapdir) cannot be found/read"
    end
  else
    LOG.warn "No translation maps defined; hope that's what you intended"
  end

  marcfiles.each do |filename|
    if filename == "STDIN"
      LOG.info "Using standard input as a marc file"
      next
    end

    if File.readable? filename
      LOG.debug "Adding marc file #{filename} to queue"
    else
      LOG.error "Can't open MARC file `#{filename}`"
      raise ArgumentError, "Can't open MARC file `#{filename}`"
    end
  end

  # Load all the files in the customdir(s)
  LOG.debug "Loading custom routines"
  if opts[:customdir].size > 0
    # Walk a copy: missing directories are removed from opts[:customdir]
    # as we go, and deleting from a list while iterating it skips entries.
    opts[:customdir].dup.each do |dir|
      unless File.exist? dir
        LOG.warn "Skipping load directory '#{dir}': Not found"
        opts[:customdir].delete dir
        next
      end

      LOG.info "Loading files in #{dir}"
      Dir.glob(["#{dir}/*.jar"]).each do |x|
        LOG.debug "Loading #{x}"
        require x
      end

      Dir.glob(["#{dir}/*.rb"]).each do |x|
        LOG.debug "Loading #{x}"
        require x
      end
    end
  end

  # Get the suss. Will throw an Argument error if there's not enough information
  suss = opts.suss

  # Create a specset
  ss = MARCSpec::SpecSet.new
  ss.loadMapsFromDir opts[:tmapdir] if opts[:tmapdir]
  ss.buildSpecsFromDSLFile opts[:indexfile]

  # Set up if we're using threach or not
  if opts[:threads] > 1
    LOG.info "Using #{opts[:threads]} threads; activiating threach"
    iterator      = :threach
    iterator_args = [opts[:threads], :each_with_index]
  else
    LOG.info "Using 1 thread"
    iterator      = :each_with_index
    iterator_args = []
  end

  if opts[:dryrun]
    LOG.info "Begin DRY RUN; nothing will be sent to Solr"
  end

  # Clear solr if so declared
  if opts[:clearsolr]
    if opts[:dryrun]
      LOG.info "Would have cleared out solr (but for dryrun)"
    else
      suss.deleteByQuery('*:*')
      suss.commit
      LOG.info "Cleared out solr"
    end
  end

  # Read each filename in turn, indexing records in each

  # Running total across all files. The per-record index handed to the block
  # is block-local (and restarts for every file), so it cannot serve as the
  # overall counter; the total is guarded by a mutex because the block may
  # run on several threads when threach is in use.
  total_indexed = 0
  total_lock    = Mutex.new
  start = Time.new.to_f

  marcfiles.each do |filename|
    LOG.info "Indexing file {}", filename
    reader = opts.reader(filename)
    reader.send(iterator, *iterator_args) do |r, idx|
      Thread.current[:index] = idx
      # NOTE(review): the key here is the opts object itself, not a symbol
      # such as :opts -- confirm that is what consumers of cachespot expect.
      r.cachespot[opts] = opts
      LOG.debug "Record {} with 001 {}", idx, r['001'].value

      doc = ss.doc_from_marc(r, opts[:benchmark])

      # Send it to solr
      suss << doc unless opts[:dryrun]

      # Print out the record and/or the document
      opts[:debugfile].puts r if opts[:printmarc]
      opts[:debugfile].puts doc if opts[:printdoc]
      opts[:debugfile].puts "\n\n" if opts[:printdoc] || opts[:printmarc]

      total_lock.synchronize { total_indexed += 1 }

      if Thread.current[:index] % opts[:logbatchsize] == 0
        pace = Thread.current[:index] / (Time.new.to_f - start)
        LOG.info "%d indexed (overall pace: %.0f rec/sec)" % [Thread.current[:index], pace]
      end
    end # reader
  end # marcfiles

  # Commit
  unless opts[:dryrun] || opts[:skipcommit]
    LOG.info "Sending final commit"
    suss.commit
    LOG.info "Final commit finished"
  end

  # Be done
  LOG.info "Done indexing"
  pace = total_indexed / (Time.new.to_f - start)
  LOG.info "%d indexed (overall pace: %.0f rec/sec)" % [total_indexed, pace]

  # Log the benchmarking information if requested, slowest first
  if opts[:benchmark]
    ss.benchmarks.keys.sort { |a, b| ss.benchmarks[b].real <=> ss.benchmarks[a].real }.each do |k|
      LOG.info("%-20s %s" % [k + ':', ss.benchmarks[k].real.to_s])
    end
  end
end # end of the case statement