Sha256: 0880f68c8eae033aa6af1cf4e7dc25a303d59a5d03790e651678e46f92f493f1
Contents?: true
Size: 1.34 KB
Versions: 9
Compression:
Stored size: 1.34 KB
Contents
require 'active_support/core_ext'
require 'libxml'
require 'oj'

#
# (compressed) bulk files can be posted using curl, e.g. with the following command
#
# gzip -c -d aleph.PRIMO.20120908.091506.1.es_bulk.gz | curl -XPOST 'localhost:9200/catalog/record/_bulk' --data-binary @-
#
module Mabmapper
  # Serializes XML records into Elasticsearch bulk-index lines and writes
  # them to the IO object handed to the constructor.
  class ElasticSearchWriter
    # Matches a trailing archive extension only. Every dot is escaped and the
    # pattern is anchored at the end of the string, so neither look-alikes
    # such as ".tarXgz" nor a ".tar" in the middle of a name are rewritten.
    ARCHIVE_EXTENSION = /\.(?:tar\.gz|tar|tgz)\z/

    # Builds the path of the bulk file that corresponds to an input archive.
    #
    # The trailing ".tar.gz", ".tar" or ".tgz" of the input's basename is
    # replaced by ".es_bulk"; names without such an extension are kept as is.
    #
    # @param output_dir_name [String] directory the bulk file is placed in
    # @param file_name [String] path or name of the input archive
    # @param options [Hash] ".gz" is appended only when
    #   options[:will_be_gziped] is exactly +true+
    # @return [String] output_dir_name joined with the derived file name
    def self.out_file(output_dir_name, file_name, options = {})
      file_basename = File.basename(file_name).sub(ARCHIVE_EXTENSION, '.es_bulk')
      file_basename << '.gz' if options[:will_be_gziped] == true
      File.join(output_dir_name, file_basename)
    end

    # @param io [#write, #close, #closed?] destination of the bulk lines
    def initialize(io)
      @io = io

      # Set libxml as minixml backend to improve performance.
      # The backend is assigned on ActiveSupport::XmlMini itself, so it is
      # not limited to this writer instance.
      ActiveSupport::XmlMini.backend = 'LibXML'
    end

    # Yields the writer itself; +name+ and +mode+ are ignored.
    # NOTE(review): presumably present so the writer can stand in for an
    # archive-style writer - confirm against the callers.
    def add_file(name, mode) # :yields: io
      yield self
    end

    # Yields the writer itself; +name+, +mode+ and +size+ are ignored.
    def add_file_simple(name, mode, size) # :yields: io
      yield self
    end

    # Closes the underlying IO unless it is already closed.
    def close
      @io.close unless @io.closed?
    end

    # Writes one record as an action line followed by a source line.
    #
    # We assume that data is string serialized xml with a <document> root
    # element; any other root leaves +document+ nil and the id lookup raises
    # NoMethodError.
    #
    # @param xml [String] a single serialized XML record
    # @return whatever the underlying IO's +write+ returns
    def write(xml)
      document = Hash.from_xml(xml)['document']

      action = Oj.dump({ index: { _id: document['id'].to_s } }, mode: :compat)
      source = Oj.dump(document, mode: :compat)

      # Beware, right positions of newlines is vital for elasticsearch bulk
      # import: each JSON document sits on its own line, and the last one is
      # newline-terminated as well.
      @io.write("#{action}\n#{source}\n")
    end
  end
end
Version data entries
9 entries across 9 versions & 1 rubygems