lib/searchkick/index.rb in searchkick-0.8.5 vs lib/searchkick/index.rb in searchkick-0.8.6
- old
+ new
@@ -1,11 +1,12 @@
module Searchkick
class Index
- attr_reader :name
+ attr_reader :name, :options
- def initialize(name)
+ # Stores the index name and the options hash given for this index.
+ # +options+ defaults to an empty hash, so callers passing only a name keep working.
+ def initialize(name, options = {})
@name = name
+ @options = options
end
def create(options = {})
client.indices.create index: name, body: options
end
@@ -20,10 +21,27 @@
def refresh
client.indices.refresh index: name
end
+ def alias_exists?
+ client.indices.exists_alias name: name
+ end
+
+ def swap(new_name)
+ old_indices =
+ begin
+ client.indices.get_alias(name: name).keys
+ rescue Elasticsearch::Transport::Transport::Errors::NotFound
+ []
+ end
+ actions = old_indices.map{|old_name| {remove: {index: old_name, alias: name}} } + [{add: {index: new_name, alias: name}}]
+ client.indices.update_aliases body: {actions: actions}
+ end
+
+ # record based
+
def store(record)
client.index(
index: name,
type: document_type(record),
id: search_id(record),
@@ -55,19 +73,400 @@
type: document_type(record),
id: record.id
)["_source"]
end
- def klass_document_type(klass)
- if klass.respond_to?(:document_type)
- klass.document_type
+ def reindex_record(record)
+ if record.destroyed? or !record.should_index?
+ begin
+ remove(record)
+ rescue Elasticsearch::Transport::Transport::Errors::NotFound
+ # do nothing
+ end
else
- klass.model_name.to_s.underscore
+ store(record)
end
end
+ def reindex_record_async(record)
+ if defined?(Searchkick::ReindexV2Job)
+ Searchkick::ReindexV2Job.perform_later(record.class.name, record.id.to_s)
+ else
+ Delayed::Job.enqueue Searchkick::ReindexJob.new(record.class.name, record.id.to_s)
+ end
+ end
+
+ def similar_record(record, options = {})
+ like_text = retrieve(record).to_hash
+ .keep_if{|k,v| !options[:fields] || options[:fields].map(&:to_s).include?(k) }
+ .values.compact.join(" ")
+
+ # TODO deep merge method
+ options[:where] ||= {}
+ options[:where][:_id] ||= {}
+ options[:where][:_id][:not] = record.id.to_s
+ options[:limit] ||= 10
+ options[:similar] = true
+
+ # TODO use index class instead of record class
+ search_model(record.class, like_text, options)
+ end
+
+ # search
+
+ def search_model(searchkick_klass, term = nil, options = {}, &block)
+ query = Searchkick::Query.new(searchkick_klass, term, options)
+ if block
+ block.call(query.body)
+ end
+ if options[:execute] == false
+ query
+ else
+ query.execute
+ end
+ end
+
+ # reindex
+
+ def create_index
+ index = Searchkick::Index.new("#{name}_#{Time.now.strftime('%Y%m%d%H%M%S%L')}", @options)
+ index.create(index_options)
+ index
+ end
+
+ # remove old indices that start w/ index_name
+ def clean_indices
+ all_indices = client.indices.get_aliases
+ indices = all_indices.select{|k, v| (v.empty? || v["aliases"].empty?) && k =~ /\A#{Regexp.escape(name)}_\d{14,17}\z/ }.keys
+ indices.each do |index|
+ Searchkick::Index.new(index).delete
+ end
+ indices
+ end
+
+ # https://gist.github.com/jarosan/3124884
+ # http://www.elasticsearch.org/blog/changing-mapping-with-zero-downtime/
+ def reindex_scope(scope, options = {})
+ skip_import = options[:import] == false
+
+ clean_indices
+
+ index = create_index
+
+ # check if alias exists
+ if alias_exists?
+ # import before swap
+ index.import_scope(scope) unless skip_import
+
+ # get existing indices to remove
+ swap(index.name)
+ clean_indices
+ else
+ delete if exists?
+ swap(index.name)
+
+ # import after swap
+ index.import_scope(scope) unless skip_import
+ end
+
+ index.refresh
+
+ true
+ end
+
+ def import_scope(scope)
+ batch_size = @options[:batch_size] || 1000
+
+ # use scope for import
+ scope = scope.search_import if scope.respond_to?(:search_import)
+ if scope.respond_to?(:find_in_batches)
+ scope.find_in_batches batch_size: batch_size do |batch|
+ import batch.select{|item| item.should_index? }
+ end
+ else
+ # https://github.com/karmi/tire/blob/master/lib/tire/model/import.rb
+ # use cursor for Mongoid
+ items = []
+ scope.all.each do |item|
+ items << item if item.should_index?
+ if items.length == batch_size
+ index.import items
+ items = []
+ end
+ end
+ import items
+ end
+ end
+
+ # Builds the body used when creating the index: a hash with :settings and :mappings.
+ # When @options supplies :mappings without :merge_mappings, the user's :settings (or an
+ # empty hash) and :mappings are used as given. Otherwise the searchkick analyzers, filters
+ # and tokenizer are generated, adjusted by the other options (synonyms, wordnet,
+ # special_characters, conversions, per-field analysis lists, locations, unsearchable) and
+ # deep-merged with the user's :settings and :mappings.
+ def index_options
+ options = @options
+
+ if options[:mappings] and !options[:merge_mappings]
+ settings = options[:settings] || {}
+ mappings = options[:mappings]
+ else
+ settings = {
+ analysis: {
+ analyzer: {
+ searchkick_keyword: {
+ type: "custom",
+ tokenizer: "keyword",
+ filter: ["lowercase"] + (options[:stem_conversions] == false ? [] : ["searchkick_stemmer"])
+ },
+ default_index: {
+ type: "custom",
+ tokenizer: "standard",
+ # synonym should come last, after stemming and shingle
+ # shingle must come before searchkick_stemmer
+ filter: ["standard", "lowercase", "asciifolding", "searchkick_index_shingle", "searchkick_stemmer"]
+ },
+ searchkick_search: {
+ type: "custom",
+ tokenizer: "standard",
+ filter: ["standard", "lowercase", "asciifolding", "searchkick_search_shingle", "searchkick_stemmer"]
+ },
+ searchkick_search2: {
+ type: "custom",
+ tokenizer: "standard",
+ filter: ["standard", "lowercase", "asciifolding", "searchkick_stemmer"]
+ },
+ # https://github.com/leschenko/elasticsearch_autocomplete/blob/master/lib/elasticsearch_autocomplete/analyzers.rb
+ searchkick_autocomplete_index: {
+ type: "custom",
+ tokenizer: "searchkick_autocomplete_ngram",
+ filter: ["lowercase", "asciifolding"]
+ },
+ searchkick_autocomplete_search: {
+ type: "custom",
+ tokenizer: "keyword",
+ filter: ["lowercase", "asciifolding"]
+ },
+ searchkick_word_search: {
+ type: "custom",
+ tokenizer: "standard",
+ filter: ["lowercase", "asciifolding"]
+ },
+ searchkick_suggest_index: {
+ type: "custom",
+ tokenizer: "standard",
+ filter: ["lowercase", "asciifolding", "searchkick_suggest_shingle"]
+ },
+ searchkick_text_start_index: {
+ type: "custom",
+ tokenizer: "keyword",
+ filter: ["lowercase", "asciifolding", "searchkick_edge_ngram"]
+ },
+ searchkick_text_middle_index: {
+ type: "custom",
+ tokenizer: "keyword",
+ filter: ["lowercase", "asciifolding", "searchkick_ngram"]
+ },
+ searchkick_text_end_index: {
+ type: "custom",
+ tokenizer: "keyword",
+ filter: ["lowercase", "asciifolding", "reverse", "searchkick_edge_ngram", "reverse"]
+ },
+ searchkick_word_start_index: {
+ type: "custom",
+ tokenizer: "standard",
+ filter: ["lowercase", "asciifolding", "searchkick_edge_ngram"]
+ },
+ searchkick_word_middle_index: {
+ type: "custom",
+ tokenizer: "standard",
+ filter: ["lowercase", "asciifolding", "searchkick_ngram"]
+ },
+ searchkick_word_end_index: {
+ type: "custom",
+ tokenizer: "standard",
+ filter: ["lowercase", "asciifolding", "reverse", "searchkick_edge_ngram", "reverse"]
+ }
+ },
+ filter: {
+ searchkick_index_shingle: {
+ type: "shingle",
+ token_separator: ""
+ },
+ # lucky find http://web.archiveorange.com/archive/v/AAfXfQ17f57FcRINsof7
+ searchkick_search_shingle: {
+ type: "shingle",
+ token_separator: "",
+ output_unigrams: false,
+ output_unigrams_if_no_shingles: true
+ },
+ searchkick_suggest_shingle: {
+ type: "shingle",
+ max_shingle_size: 5
+ },
+ searchkick_edge_ngram: {
+ type: "edgeNGram",
+ min_gram: 1,
+ max_gram: 50
+ },
+ searchkick_ngram: {
+ type: "nGram",
+ min_gram: 1,
+ max_gram: 50
+ },
+ searchkick_stemmer: {
+ type: "snowball",
+ language: options[:language] || "English"
+ }
+ },
+ tokenizer: {
+ searchkick_autocomplete_ngram: {
+ type: "edgeNGram",
+ min_gram: 1,
+ max_gram: 50
+ }
+ }
+ }
+ }
+
+ # keep test indices small: a single shard and no replicas
+ if Searchkick.env == "test"
+ settings.merge!(number_of_shards: 1, number_of_replicas: 0)
+ end
+
+ settings.deep_merge!(options[:settings] || {})
+
+ # synonyms
+ synonyms = options[:synonyms] || []
+ if synonyms.any?
+ settings[:analysis][:filter][:searchkick_synonym] = {
+ type: "synonym",
+ synonyms: synonyms.select{|s| s.size > 1 }.map{|s| s.join(",") }
+ }
+ # choosing a place for the synonym filter when stemming is not easy
+ # https://groups.google.com/forum/#!topic/elasticsearch/p7qcQlgHdB8
+ # TODO use a snowball stemmer on synonyms when creating the token filter
+
+ # http://elasticsearch-users.115913.n3.nabble.com/synonym-multi-words-search-td4030811.html
+ # I find the following approach effective if you are doing multi-word synonyms (synonym phrases):
+ # - Only apply the synonym expansion at index time
+ # - Don't have the synonym filter applied search
+ # - Use directional synonyms where appropriate. You want to make sure that you're not injecting terms that are too general.
+ settings[:analysis][:analyzer][:default_index][:filter].insert(4, "searchkick_synonym")
+ settings[:analysis][:analyzer][:default_index][:filter] << "searchkick_synonym"
+ end
+
+ if options[:wordnet]
+ settings[:analysis][:filter][:searchkick_wordnet] = {
+ type: "synonym",
+ format: "wordnet",
+ synonyms_path: Searchkick.wordnet_path
+ }
+
+ settings[:analysis][:analyzer][:default_index][:filter].insert(4, "searchkick_wordnet")
+ settings[:analysis][:analyzer][:default_index][:filter] << "searchkick_wordnet"
+ end
+
+ # special_characters: false drops the asciifolding filter from every analyzer
+ if options[:special_characters] == false
+ settings[:analysis][:analyzer].each do |analyzer, analyzer_settings|
+ analyzer_settings[:filter].reject!{|f| f == "asciifolding" }
+ end
+ end
+
+ mapping = {}
+
+ # conversions
+ if options[:conversions]
+ mapping[:conversions] = {
+ type: "nested",
+ properties: {
+ query: {type: "string", analyzer: "searchkick_keyword"},
+ count: {type: "integer"}
+ }
+ }
+ end
+
+ # option name => list of field names (as strings) that asked for that kind of analysis
+ mapping_options = Hash[
+ [:autocomplete, :suggest, :text_start, :text_middle, :text_end, :word_start, :word_middle, :word_end, :highlight]
+ .map{|type| [type, (options[type] || []).map(&:to_s)] }
+ ]
+
+ mapping_options.values.flatten.uniq.each do |field|
+ field_mapping = {
+ type: "multi_field",
+ fields: {
+ field => {type: "string", index: "not_analyzed"},
+ "analyzed" => {type: "string", index: "analyzed"}
+ # term_vector: "with_positions_offsets" for fast / correct highlighting
+ # http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/search-request-highlighting.html#_fast_vector_highlighter
+ }
+ }
+
+ mapping_options.except(:highlight).each do |type, fields|
+ if fields.include?(field)
+ field_mapping[:fields][type] = {type: "string", index: "analyzed", analyzer: "searchkick_#{type}_index"}
+ end
+ end
+
+ if mapping_options[:highlight].include?(field)
+ field_mapping[:fields]["analyzed"][:term_vector] = "with_positions_offsets"
+ end
+
+ mapping[field] = field_mapping
+ end
+
+ (options[:locations] || []).map(&:to_s).each do |field|
+ mapping[field] = {
+ type: "geo_point"
+ }
+ end
+
+ (options[:unsearchable] || []).map(&:to_s).each do |field|
+ mapping[field] = {
+ type: "string",
+ index: "no"
+ }
+ end
+
+ mappings = {
+ _default_: {
+ properties: mapping,
+ # https://gist.github.com/kimchy/2898285
+ dynamic_templates: [
+ {
+ string_template: {
+ match: "*",
+ match_mapping_type: "string",
+ mapping: {
+ # http://www.elasticsearch.org/guide/reference/mapping/multi-field-type/
+ type: "multi_field",
+ fields: {
+ # analyzed field must be the default field for include_in_all
+ # http://www.elasticsearch.org/guide/reference/mapping/multi-field-type/
+ # however, we can include the not_analyzed field in _all
+ # and the _all index analyzer will take care of it
+ "{name}" => {type: "string", index: "not_analyzed"},
+ "analyzed" => {type: "string", index: "analyzed"}
+ }
+ }
+ }
+ }
+ ]
+ }
+ }.deep_merge(options[:mappings] || {})
+ end
+
+ {
+ settings: settings,
+ mappings: mappings
+ }
+ end
+
+ # other
+
+ # Sends +text+ to the client's analyze call for this index (extra +options+ are merged
+ # into the request) and returns the "token" value of each entry under "tokens".
def tokens(text, options = {})
client.indices.analyze({text: text, index: name}.merge(options))["tokens"].map{|t| t["token"] }
+ end
+
+ def klass_document_type(klass)
+ if klass.respond_to?(:document_type)
+ klass.document_type
+ else
+ klass.model_name.to_s.underscore
+ end
end
protected
def client