index.rb in searchkick-0.8.6

- old
+ new

@@ -1,11 +1,12 @@
 module Searchkick
   class Index
-    attr_reader :name
+    attr_reader :name, :options
 
-    def initialize(name)
+    # Wraps the Elasticsearch index or alias called `name`.
+    # `options` is stored unchanged and exposed through the `options` reader.
+    def initialize(name, options = {})
       @name = name
+      @options = options
     end
 
     # Creates the index named `name`, sending `options` as the request body.
     def create(options = {})
       client.indices.create index: name, body: options
     end
@@ -20,10 +21,27 @@
 
     # Issues an indices refresh request for the index named `name`.
     def refresh
       client.indices.refresh index: name
     end
 
+    def alias_exists?
+      client.indices.exists_alias name: name
+    end
+
+    def swap(new_name)
+      old_indices =
+        begin
+          client.indices.get_alias(name: name).keys
+        rescue Elasticsearch::Transport::Transport::Errors::NotFound
+          []
+        end
+      actions = old_indices.map{|old_name| {remove: {index: old_name, alias: name}} } + [{add: {index: new_name, alias: name}}]
+      client.indices.update_aliases body: {actions: actions}
+    end
+
+    # record based
+
     def store(record)
       client.index(
         index: name,
         type: document_type(record),
         id: search_id(record),
@@ -55,19 +73,400 @@
         type: document_type(record),
         id: record.id
       )["_source"]
     end
 
-    def klass_document_type(klass)
-      if klass.respond_to?(:document_type)
-        klass.document_type
+    # Brings the index in line with `record`: a destroyed record, or one whose
+    # should_index? is false, is removed from the index; any other record is
+    # stored.
+    def reindex_record(record)
+      if record.destroyed? or !record.should_index?
+        begin
+          remove(record)
+        rescue Elasticsearch::Transport::Transport::Errors::NotFound
+          # nothing to remove; the NotFound error is deliberately ignored
+        end
       else
-        klass.model_name.to_s.underscore
+        store(record)
       end
     end
 
+    def reindex_record_async(record)
+      if defined?(Searchkick::ReindexV2Job)
+          Searchkick::ReindexV2Job.perform_later(record.class.name, record.id.to_s)
+      else
+        Delayed::Job.enqueue Searchkick::ReindexJob.new(record.class.name, record.id.to_s)
+      end
+    end
+
+    def similar_record(record, options = {})
+      like_text = retrieve(record).to_hash
+        .keep_if{|k,v| !options[:fields] || options[:fields].map(&:to_s).include?(k) }
+        .values.compact.join(" ")
+
+      # TODO deep merge method
+      options[:where] ||= {}
+      options[:where][:_id] ||= {}
+      options[:where][:_id][:not] = record.id.to_s
+      options[:limit] ||= 10
+      options[:similar] = true
+
+      # TODO use index class instead of record class
+      search_model(record.class, like_text, options)
+    end
+
+    # search
+
+    def search_model(searchkick_klass, term = nil, options = {}, &block)
+      query = Searchkick::Query.new(searchkick_klass, term, options)
+      if block
+        block.call(query.body)
+      end
+      if options[:execute] == false
+        query
+      else
+        query.execute
+      end
+    end
+
+    # reindex
+
+    def create_index
+      index = Searchkick::Index.new("#{name}_#{Time.now.strftime('%Y%m%d%H%M%S%L')}", @options)
+      index.create(index_options)
+      index
+    end
+
+    # remove old indices that start w/ index_name
+    def clean_indices
+      all_indices = client.indices.get_aliases
+      indices = all_indices.select{|k, v| (v.empty? || v["aliases"].empty?) && k =~ /\A#{Regexp.escape(name)}_\d{14,17}\z/ }.keys
+      indices.each do |index|
+        Searchkick::Index.new(index).delete
+      end
+      indices
+    end
+
+    # https://gist.github.com/jarosan/3124884
+    # http://www.elasticsearch.org/blog/changing-mapping-with-zero-downtime/
+    #
+    # Rebuilds the index for `name` from `scope` by filling a freshly created
+    # timestamped index and pointing the `name` alias at it.
+    # Pass import: false in `options` to create and swap without importing.
+    # Always returns true.
+    def reindex_scope(scope, options = {})
+      skip_import = options[:import] == false
+
+      # drop unaliased leftovers from earlier runs before creating a new index
+      clean_indices
+
+      index = create_index
+
+      # check if alias exists
+      if alias_exists?
+        # import before swap
+        index.import_scope(scope) unless skip_import
+
+        # repoint the alias at the new index, then delete the indices it left behind
+        swap(index.name)
+        clean_indices
+      else
+        # NOTE(review): presumably removes a concrete index that occupies the
+        # alias name; `delete` and `exists?` are defined outside this view
+        delete if exists?
+        swap(index.name)
+
+        # import after swap
+        index.import_scope(scope) unless skip_import
+      end
+
+      index.refresh
+
+      true
+    end
+
+    def import_scope(scope)
+      batch_size = @options[:batch_size] || 1000
+
+      # use scope for import
+      scope = scope.search_import if scope.respond_to?(:search_import)
+      if scope.respond_to?(:find_in_batches)
+        scope.find_in_batches batch_size: batch_size do |batch|
+          import batch.select{|item| item.should_index? }
+        end
+      else
+        # https://github.com/karmi/tire/blob/master/lib/tire/model/import.rb
+        # use cursor for Mongoid
+        items = []
+        scope.all.each do |item|
+          items << item if item.should_index?
+          if items.length == batch_size
+            index.import items
+            items = []
+          end
+        end
+        import items
+      end
+    end
+
+    # Builds the {settings: ..., mappings: ...} hash that `create_index` passes
+    # to `create` for a new index.
+    #
+    # When @options[:mappings] is given without :merge_mappings, the caller's
+    # mappings and settings (default {}) are returned untouched. Otherwise the
+    # Searchkick analyzers, filters and tokenizer below are generated and
+    # deep-merged with @options[:settings], and a _default_ mapping is built
+    # from the field-level options and deep-merged with @options[:mappings].
+    def index_options
+      options = @options
+
+      if options[:mappings] and !options[:merge_mappings]
+        settings = options[:settings] || {}
+        mappings = options[:mappings]
+      else
+        settings = {
+          analysis: {
+            analyzer: {
+              searchkick_keyword: {
+                type: "custom",
+                tokenizer: "keyword",
+                filter: ["lowercase"] + (options[:stem_conversions] == false ? [] : ["searchkick_stemmer"])
+              },
+              default_index: {
+                type: "custom",
+                tokenizer: "standard",
+                # synonym should come last, after stemming and shingle
+                # shingle must come before searchkick_stemmer
+                filter: ["standard", "lowercase", "asciifolding", "searchkick_index_shingle", "searchkick_stemmer"]
+              },
+              searchkick_search: {
+                type: "custom",
+                tokenizer: "standard",
+                filter: ["standard", "lowercase", "asciifolding", "searchkick_search_shingle", "searchkick_stemmer"]
+              },
+              searchkick_search2: {
+                type: "custom",
+                tokenizer: "standard",
+                filter: ["standard", "lowercase", "asciifolding", "searchkick_stemmer"]
+              },
+              # https://github.com/leschenko/elasticsearch_autocomplete/blob/master/lib/elasticsearch_autocomplete/analyzers.rb
+              searchkick_autocomplete_index: {
+                type: "custom",
+                tokenizer: "searchkick_autocomplete_ngram",
+                filter: ["lowercase", "asciifolding"]
+              },
+              searchkick_autocomplete_search: {
+                type: "custom",
+                tokenizer: "keyword",
+                filter: ["lowercase", "asciifolding"]
+              },
+              searchkick_word_search: {
+                type: "custom",
+                tokenizer: "standard",
+                filter: ["lowercase", "asciifolding"]
+              },
+              searchkick_suggest_index: {
+                type: "custom",
+                tokenizer: "standard",
+                filter: ["lowercase", "asciifolding", "searchkick_suggest_shingle"]
+              },
+              searchkick_text_start_index: {
+                type: "custom",
+                tokenizer: "keyword",
+                filter: ["lowercase", "asciifolding", "searchkick_edge_ngram"]
+              },
+              searchkick_text_middle_index: {
+                type: "custom",
+                tokenizer: "keyword",
+                filter: ["lowercase", "asciifolding", "searchkick_ngram"]
+              },
+              searchkick_text_end_index: {
+                type: "custom",
+                tokenizer: "keyword",
+                filter: ["lowercase", "asciifolding", "reverse", "searchkick_edge_ngram", "reverse"]
+              },
+              searchkick_word_start_index: {
+                type: "custom",
+                tokenizer: "standard",
+                filter: ["lowercase", "asciifolding", "searchkick_edge_ngram"]
+              },
+              searchkick_word_middle_index: {
+                type: "custom",
+                tokenizer: "standard",
+                filter: ["lowercase", "asciifolding", "searchkick_ngram"]
+              },
+              searchkick_word_end_index: {
+                type: "custom",
+                tokenizer: "standard",
+                filter: ["lowercase", "asciifolding", "reverse", "searchkick_edge_ngram", "reverse"]
+              }
+            },
+            filter: {
+              searchkick_index_shingle: {
+                type: "shingle",
+                token_separator: ""
+              },
+              # lucky find http://web.archiveorange.com/archive/v/AAfXfQ17f57FcRINsof7
+              searchkick_search_shingle: {
+                type: "shingle",
+                token_separator: "",
+                output_unigrams: false,
+                output_unigrams_if_no_shingles: true
+              },
+              searchkick_suggest_shingle: {
+                type: "shingle",
+                max_shingle_size: 5
+              },
+              searchkick_edge_ngram: {
+                type: "edgeNGram",
+                min_gram: 1,
+                max_gram: 50
+              },
+              searchkick_ngram: {
+                type: "nGram",
+                min_gram: 1,
+                max_gram: 50
+              },
+              searchkick_stemmer: {
+                type: "snowball",
+                language: options[:language] || "English"
+              }
+            },
+            tokenizer: {
+              searchkick_autocomplete_ngram: {
+                type: "edgeNGram",
+                min_gram: 1,
+                max_gram: 50
+              }
+            }
+          }
+        }
+
+        # single shard and no replicas when Searchkick.env is "test"
+        if Searchkick.env == "test"
+          settings.merge!(number_of_shards: 1, number_of_replicas: 0)
+        end
+
+        # caller-supplied settings win over the generated defaults
+        settings.deep_merge!(options[:settings] || {})
+
+        # synonyms
+        synonyms = options[:synonyms] || []
+        if synonyms.any?
+          settings[:analysis][:filter][:searchkick_synonym] = {
+            type: "synonym",
+            # groups with a single entry are skipped
+            synonyms: synonyms.select{|s| s.size > 1 }.map{|s| s.join(",") }
+          }
+          # choosing a place for the synonym filter when stemming is not easy
+          # https://groups.google.com/forum/#!topic/elasticsearch/p7qcQlgHdB8
+          # TODO use a snowball stemmer on synonyms when creating the token filter
+
+          # http://elasticsearch-users.115913.n3.nabble.com/synonym-multi-words-search-td4030811.html
+          # I find the following approach effective if you are doing multi-word synonyms (synonym phrases):
+          # - Only apply the synonym expansion at index time
+          # - Don't have the synonym filter applied search
+          # - Use directional synonyms where appropriate. You want to make sure that you're not injecting terms that are too general.
+
+          # the filter is added to default_index twice: at position 4 (just before
+          # searchkick_stemmer in the default filter list above) and again at the end
+          settings[:analysis][:analyzer][:default_index][:filter].insert(4, "searchkick_synonym")
+          settings[:analysis][:analyzer][:default_index][:filter] << "searchkick_synonym"
+        end
+
+        if options[:wordnet]
+          settings[:analysis][:filter][:searchkick_wordnet] = {
+            type: "synonym",
+            format: "wordnet",
+            synonyms_path: Searchkick.wordnet_path
+          }
+
+          # same double placement as the synonym filter: position 4 and the end
+          settings[:analysis][:analyzer][:default_index][:filter].insert(4, "searchkick_wordnet")
+          settings[:analysis][:analyzer][:default_index][:filter] << "searchkick_wordnet"
+        end
+
+        # special_characters: false strips asciifolding from every analyzer
+        if options[:special_characters] == false
+          settings[:analysis][:analyzer].each do |analyzer, analyzer_settings|
+            analyzer_settings[:filter].reject!{|f| f == "asciifolding" }
+          end
+        end
+
+        mapping = {}
+
+        # conversions
+        if options[:conversions]
+          mapping[:conversions] = {
+            type: "nested",
+            properties: {
+              query: {type: "string", analyzer: "searchkick_keyword"},
+              count: {type: "integer"}
+            }
+          }
+        end
+
+        # option name => field names (as strings) configured for that option
+        mapping_options = Hash[
+          [:autocomplete, :suggest, :text_start, :text_middle, :text_end, :word_start, :word_middle, :word_end, :highlight]
+            .map{|type| [type, (options[type] || []).map(&:to_s)] }
+        ]
+
+        # every field named by any of those options gets an explicit multi_field mapping
+        mapping_options.values.flatten.uniq.each do |field|
+          field_mapping = {
+            type: "multi_field",
+            fields: {
+              field => {type: "string", index: "not_analyzed"},
+              "analyzed" => {type: "string", index: "analyzed"}
+              # term_vector: "with_positions_offsets" for fast / correct highlighting
+              # http://www.elasticsearch.org/guide/en/elasticsearch/reference/current/search-request-highlighting.html#_fast_vector_highlighter
+            }
+          }
+
+          # one sub-field per option the field appears in, analyzed with the
+          # matching searchkick_<type>_index analyzer
+          mapping_options.except(:highlight).each do |type, fields|
+            if fields.include?(field)
+              field_mapping[:fields][type] = {type: "string", index: "analyzed", analyzer: "searchkick_#{type}_index"}
+            end
+          end
+
+          if mapping_options[:highlight].include?(field)
+            field_mapping[:fields]["analyzed"][:term_vector] = "with_positions_offsets"
+          end
+
+          mapping[field] = field_mapping
+        end
+
+        # assigned after the loop above, so a field listed here replaces any
+        # multi_field mapping built for it
+        (options[:locations] || []).map(&:to_s).each do |field|
+          mapping[field] = {
+            type: "geo_point"
+          }
+        end
+
+        (options[:unsearchable] || []).map(&:to_s).each do |field|
+          mapping[field] = {
+            type: "string",
+            index: "no"
+          }
+        end
+
+        mappings = {
+          _default_: {
+            properties: mapping,
+            # https://gist.github.com/kimchy/2898285
+            dynamic_templates: [
+              {
+                string_template: {
+                  match: "*",
+                  match_mapping_type: "string",
+                  mapping: {
+                    # http://www.elasticsearch.org/guide/reference/mapping/multi-field-type/
+                    type: "multi_field",
+                    fields: {
+                      # analyzed field must be the default field for include_in_all
+                      # http://www.elasticsearch.org/guide/reference/mapping/multi-field-type/
+                      # however, we can include the not_analyzed field in _all
+                      # and the _all index analyzer will take care of it
+                      "{name}" => {type: "string", index: "not_analyzed"},
+                      "analyzed" => {type: "string", index: "analyzed"}
+                    }
+                  }
+                }
+              }
+            ]
+          }
+        }.deep_merge(options[:mappings] || {})
+      end
+
+      {
+        settings: settings,
+        mappings: mappings
+      }
+    end
+
+    # other
+
     # Sends `text` to the indices analyze API for this index (extra `options`
     # are merged into the request) and returns the "token" value of each
     # entry in the response's "tokens" list.
     def tokens(text, options = {})
       client.indices.analyze({text: text, index: name}.merge(options))["tokens"].map{|t| t["token"] }
+    end
+
+    # Document type for `klass`: its own document_type when it responds to
+    # one, otherwise its model_name converted to a string and underscored.
+    def klass_document_type(klass)
+      if klass.respond_to?(:document_type)
+        klass.document_type
+      else
+        klass.model_name.to_s.underscore
+      end
     end
 
     protected
 
     def client