lib/searchkick/index_options.rb in searchkick-4.4.0 vs lib/searchkick/index_options.rb in searchkick-4.4.1

- old
+ new

@@ -1,491 +1,558 @@ module Searchkick - module IndexOptions - def index_options - options = @options - language = options[:language] - language = language.call if language.respond_to?(:call) + class IndexOptions + attr_reader :options - below62 = Searchkick.server_below?("6.2.0") - below70 = Searchkick.server_below?("7.0.0") - below73 = Searchkick.server_below?("7.3.0") + def initialize(index) + @options = index.options + end - if below70 - index_type = options[:_type] - index_type = index_type.call if index_type.respond_to?(:call) - end - + def index_options custom_mapping = options[:mappings] || {} - if below70 && custom_mapping.keys.map(&:to_sym).include?(:properties) + if below70? && custom_mapping.keys.map(&:to_sym).include?(:properties) # add type custom_mapping = {index_type => custom_mapping} end if options[:mappings] && !options[:merge_mappings] settings = options[:settings] || {} mappings = custom_mapping else - default_type = "text" - default_analyzer = :searchkick_index - keyword_mapping = {type: "keyword"} + settings = generate_settings + mappings = generate_mappings.symbolize_keys.deep_merge(custom_mapping.symbolize_keys) + end - keyword_mapping[:ignore_above] = options[:ignore_above] || 30000 + set_deep_paging(settings) if options[:deep_paging] - settings = { - analysis: { - analyzer: { - searchkick_keyword: { - type: "custom", - tokenizer: "keyword", - filter: ["lowercase"] + (options[:stem_conversions] ? ["searchkick_stemmer"] : []) - }, - default_analyzer => { - type: "custom", - # character filters -> tokenizer -> token filters - # https://www.elastic.co/guide/en/elasticsearch/guide/current/analysis-intro.html - char_filter: ["ampersand"], - tokenizer: "standard", - # synonym should come last, after stemming and shingle - # shingle must come before searchkick_stemmer - filter: ["lowercase", "asciifolding", "searchkick_index_shingle", "searchkick_stemmer"] - }, - searchkick_search: { - type: "custom", - char_filter: ["ampersand"], - tokenizer: "standard", - filter: ["lowercase", "asciifolding", "searchkick_search_shingle", "searchkick_stemmer"] - }, - searchkick_search2: { - type: "custom", - char_filter: ["ampersand"], - tokenizer: "standard", - filter: ["lowercase", "asciifolding", "searchkick_stemmer"] - }, - # https://github.com/leschenko/elasticsearch_autocomplete/blob/master/lib/elasticsearch_autocomplete/analyzers.rb - searchkick_autocomplete_search: { - type: "custom", - tokenizer: "keyword", - filter: ["lowercase", "asciifolding"] - }, - searchkick_word_search: { - type: "custom", - tokenizer: "standard", - filter: ["lowercase", "asciifolding"] - }, - searchkick_suggest_index: { - type: "custom", - tokenizer: "standard", - filter: ["lowercase", "asciifolding", "searchkick_suggest_shingle"] - }, - searchkick_text_start_index: { - type: "custom", - tokenizer: "keyword", - filter: ["lowercase", "asciifolding", "searchkick_edge_ngram"] - }, - searchkick_text_middle_index: { - type: "custom", - tokenizer: "keyword", - filter: ["lowercase", "asciifolding", "searchkick_ngram"] - }, - searchkick_text_end_index: { - type: "custom", - tokenizer: "keyword", - filter: ["lowercase", "asciifolding", "reverse", "searchkick_edge_ngram", "reverse"] - }, - searchkick_word_start_index: { - type: "custom", - tokenizer: "standard", - filter: ["lowercase", "asciifolding", "searchkick_edge_ngram"] - }, - searchkick_word_middle_index: { - type: "custom", - tokenizer: "standard", - filter: ["lowercase", "asciifolding", "searchkick_ngram"] - }, - searchkick_word_end_index: { - type: "custom", - tokenizer: "standard", - filter: ["lowercase", "asciifolding", "reverse", "searchkick_edge_ngram", "reverse"] - } - }, - filter: { - searchkick_index_shingle: { - type: "shingle", - token_separator: "" - }, - # lucky find https://web.archiveorange.com/archive/v/AAfXfQ17f57FcRINsof7 - searchkick_search_shingle: { - type: "shingle", - token_separator: "", - output_unigrams: false, - output_unigrams_if_no_shingles: true - }, - searchkick_suggest_shingle: { - type: "shingle", - max_shingle_size: 5 - }, - searchkick_edge_ngram: { - type: "edge_ngram", - min_gram: 1, - max_gram: 50 - }, - searchkick_ngram: { - type: "ngram", - min_gram: 1, - max_gram: 50 - }, - searchkick_stemmer: { - # use stemmer if language is lowercase, snowball otherwise - type: language == language.to_s.downcase ? "stemmer" : "snowball", - language: language || "English" - } - }, - char_filter: { - # https://www.elastic.co/guide/en/elasticsearch/guide/current/custom-analyzers.html - # &_to_and - ampersand: { - type: "mapping", - mappings: ["&=> and "] - } - } - } - } + { + settings: settings, + mappings: mappings + } + end - stem = options[:stem] + def generate_settings + language = options[:language] + language = language.call if language.respond_to?(:call) - case language - when "chinese" - settings[:analysis][:analyzer].merge!( + settings = { + analysis: { + analyzer: { + searchkick_keyword: { + type: "custom", + tokenizer: "keyword", + filter: ["lowercase"] + (options[:stem_conversions] ? ["searchkick_stemmer"] : []) + }, default_analyzer => { - type: "ik_smart" + type: "custom", + # character filters -> tokenizer -> token filters + # https://www.elastic.co/guide/en/elasticsearch/guide/current/analysis-intro.html + char_filter: ["ampersand"], + tokenizer: "standard", + # synonym should come last, after stemming and shingle + # shingle must come before searchkick_stemmer + filter: ["lowercase", "asciifolding", "searchkick_index_shingle", "searchkick_stemmer"] }, searchkick_search: { - type: "ik_smart" + type: "custom", + char_filter: ["ampersand"], + tokenizer: "standard", + filter: ["lowercase", "asciifolding", "searchkick_search_shingle", "searchkick_stemmer"] }, searchkick_search2: { - type: "ik_max_word" - } - ) - - stem = false - when "chinese2", "smartcn" - settings[:analysis][:analyzer].merge!( - default_analyzer => { - type: "smartcn" + type: "custom", + char_filter: ["ampersand"], + tokenizer: "standard", + filter: ["lowercase", "asciifolding", "searchkick_stemmer"] }, - searchkick_search: { - type: "smartcn" + # https://github.com/leschenko/elasticsearch_autocomplete/blob/master/lib/elasticsearch_autocomplete/analyzers.rb + searchkick_autocomplete_search: { + type: "custom", + tokenizer: "keyword", + filter: ["lowercase", "asciifolding"] }, - searchkick_search2: { - type: "smartcn" - } - ) - - stem = false - when "japanese" - settings[:analysis][:analyzer].merge!( - default_analyzer => { - type: "kuromoji" + searchkick_word_search: { + type: "custom", + tokenizer: "standard", + filter: ["lowercase", "asciifolding"] }, - searchkick_search: { - type: "kuromoji" + searchkick_suggest_index: { + type: "custom", + tokenizer: "standard", + filter: ["lowercase", "asciifolding", "searchkick_suggest_shingle"] }, - searchkick_search2: { - type: "kuromoji" - } - ) - - stem = false - when "korean" - settings[:analysis][:analyzer].merge!( - default_analyzer => { - type: "openkoreantext-analyzer" + searchkick_text_start_index: { + type: "custom", + tokenizer: "keyword", + filter: ["lowercase", "asciifolding", "searchkick_edge_ngram"] }, - searchkick_search: { - type: "openkoreantext-analyzer" + searchkick_text_middle_index: { + type: "custom", + tokenizer: "keyword", + filter: ["lowercase", "asciifolding", "searchkick_ngram"] }, - searchkick_search2: { - type: "openkoreantext-analyzer" - } - ) - - stem = false - when "korean2" - settings[:analysis][:analyzer].merge!( - default_analyzer => { - type: "nori" + searchkick_text_end_index: { + type: "custom", + tokenizer: "keyword", + filter: ["lowercase", "asciifolding", "reverse", "searchkick_edge_ngram", "reverse"] }, - searchkick_search: { - type: "nori" + searchkick_word_start_index: { + type: "custom", + tokenizer: "standard", + filter: ["lowercase", "asciifolding", "searchkick_edge_ngram"] }, - searchkick_search2: { - type: "nori" + searchkick_word_middle_index: { + type: "custom", + tokenizer: "standard", + filter: ["lowercase", "asciifolding", "searchkick_ngram"] + }, + searchkick_word_end_index: { + type: "custom", + tokenizer: "standard", + filter: ["lowercase", "asciifolding", "reverse", "searchkick_edge_ngram", "reverse"] } - ) - - stem = false - when "vietnamese" - settings[:analysis][:analyzer].merge!( - default_analyzer => { - type: "vi_analyzer" + }, + filter: { + searchkick_index_shingle: { + type: "shingle", + token_separator: "" }, - searchkick_search: { - type: "vi_analyzer" + # lucky find https://web.archiveorange.com/archive/v/AAfXfQ17f57FcRINsof7 + searchkick_search_shingle: { + type: "shingle", + token_separator: "", + output_unigrams: false, + output_unigrams_if_no_shingles: true }, - searchkick_search2: { - type: "vi_analyzer" - } - ) - - stem = false - when "polish", "ukrainian" - settings[:analysis][:analyzer].merge!( - default_analyzer => { - type: language + searchkick_suggest_shingle: { + type: "shingle", + max_shingle_size: 5 }, - searchkick_search: { - type: language + searchkick_edge_ngram: { + type: "edge_ngram", + min_gram: 1, + max_gram: 50 }, - searchkick_search2: { - type: language + searchkick_ngram: { + type: "ngram", + min_gram: 1, + max_gram: 50 + }, + searchkick_stemmer: { + # use stemmer if language is lowercase, snowball otherwise + type: language == language.to_s.downcase ? "stemmer" : "snowball", + language: language || "English" } - ) + }, + char_filter: { + # https://www.elastic.co/guide/en/elasticsearch/guide/current/custom-analyzers.html + # &_to_and + ampersand: { + type: "mapping", + mappings: ["&=> and "] + } + } + } + } - stem = false - end + update_language(settings, language) + update_stemming(settings) - if Searchkick.env == "test" - settings[:number_of_shards] = 1 - settings[:number_of_replicas] = 0 - end + if Searchkick.env == "test" + settings[:number_of_shards] = 1 + settings[:number_of_replicas] = 0 + end - if options[:similarity] - settings[:similarity] = {default: {type: options[:similarity]}} - end + # TODO remove in Searchkick 5 (classic no longer supported) + if options[:similarity] + settings[:similarity] = {default: {type: options[:similarity]}} + end - unless below62 - settings[:index] = { - max_ngram_diff: 49, - max_shingle_diff: 4 - } - end + unless below62? + settings[:index] = { + max_ngram_diff: 49, + max_shingle_diff: 4 + } + end - if options[:case_sensitive] - settings[:analysis][:analyzer].each do |_, analyzer| - analyzer[:filter].delete("lowercase") - end + if options[:case_sensitive] + settings[:analysis][:analyzer].each do |_, analyzer| + analyzer[:filter].delete("lowercase") end + end - if stem == false - settings[:analysis][:filter].delete(:searchkick_stemmer) - settings[:analysis][:analyzer].each do |_, analyzer| - analyzer[:filter].delete("searchkick_stemmer") if analyzer[:filter] - end + # TODO do this last in Searchkick 5 + settings = settings.symbolize_keys.deep_merge((options[:settings] || {}).symbolize_keys) + + add_synonyms(settings) + add_search_synonyms(settings) + # TODO remove in Searchkick 5 + add_wordnet(settings) if options[:wordnet] + + if options[:special_characters] == false + settings[:analysis][:analyzer].each_value do |analyzer_settings| + analyzer_settings[:filter].reject! { |f| f == "asciifolding" } end + end - settings = settings.symbolize_keys.deep_merge((options[:settings] || {}).symbolize_keys) + settings + end - # synonyms - synonyms = options[:synonyms] || [] - synonyms = synonyms.call if synonyms.respond_to?(:call) - if synonyms.any? - settings[:analysis][:filter][:searchkick_synonym] = { - type: "synonym", - # only remove a single space from synonyms so three-word synonyms will fail noisily instead of silently - synonyms: synonyms.select { |s| s.size > 1 }.map { |s| s.is_a?(Array) ? s.map { |s2| s2.sub(/\s+/, "") }.join(",") : s }.map(&:downcase) + def update_language(settings, language) + case language + when "chinese" + settings[:analysis][:analyzer].merge!( + default_analyzer => { + type: "ik_smart" + }, + searchkick_search: { + type: "ik_smart" + }, + searchkick_search2: { + type: "ik_max_word" } - # choosing a place for the synonym filter when stemming is not easy - # https://groups.google.com/forum/#!topic/elasticsearch/p7qcQlgHdB8 - # TODO use a snowball stemmer on synonyms when creating the token filter + ) + when "chinese2", "smartcn" + settings[:analysis][:analyzer].merge!( + default_analyzer => { + type: "smartcn" + }, + searchkick_search: { + type: "smartcn" + }, + searchkick_search2: { + type: "smartcn" + } + ) + when "japanese" + settings[:analysis][:analyzer].merge!( + default_analyzer => { + type: "kuromoji" + }, + searchkick_search: { + type: "kuromoji" + }, + searchkick_search2: { + type: "kuromoji" + } + ) + when "korean" + settings[:analysis][:analyzer].merge!( + default_analyzer => { + type: "openkoreantext-analyzer" + }, + searchkick_search: { + type: "openkoreantext-analyzer" + }, + searchkick_search2: { + type: "openkoreantext-analyzer" + } + ) + when "korean2" + settings[:analysis][:analyzer].merge!( + default_analyzer => { + type: "nori" + }, + searchkick_search: { + type: "nori" + }, + searchkick_search2: { + type: "nori" + } + ) + when "vietnamese" + settings[:analysis][:analyzer].merge!( + default_analyzer => { + type: "vi_analyzer" + }, + searchkick_search: { + type: "vi_analyzer" + }, + searchkick_search2: { + type: "vi_analyzer" + } + ) + when "polish", "ukrainian" + settings[:analysis][:analyzer].merge!( + default_analyzer => { + type: language + }, + searchkick_search: { + type: language + }, + searchkick_search2: { + type: language + } + ) + end + end - # http://elasticsearch-users.115913.n3.nabble.com/synonym-multi-words-search-td4030811.html - # I find the following approach effective if you are doing multi-word synonyms (synonym phrases): - # - Only apply the synonym expansion at index time - # - Don't have the synonym filter applied search - # - Use directional synonyms where appropriate. You want to make sure that you're not injecting terms that are too general. - settings[:analysis][:analyzer][default_analyzer][:filter].insert(2, "searchkick_synonym") + def update_stemming(settings) + stem = options[:stem] - %w(word_start word_middle word_end).each do |type| - settings[:analysis][:analyzer]["searchkick_#{type}_index".to_sym][:filter].insert(2, "searchkick_synonym") - end + # language analyzer used + stem = false if settings[:analysis][:analyzer][default_analyzer][:type] != "custom" + + if stem == false + settings[:analysis][:filter].delete(:searchkick_stemmer) + settings[:analysis][:analyzer].each do |_, analyzer| + analyzer[:filter].delete("searchkick_stemmer") if analyzer[:filter] end + end - search_synonyms = options[:search_synonyms] || [] - search_synonyms = search_synonyms.call if search_synonyms.respond_to?(:call) - if search_synonyms.is_a?(String) || search_synonyms.any? - if search_synonyms.is_a?(String) - synonym_graph = { - type: "synonym_graph", - synonyms_path: search_synonyms - } - synonym_graph[:updateable] = true unless below73 - else - synonym_graph = { - type: "synonym_graph", - # TODO confirm this is correct - synonyms: search_synonyms.select { |s| s.size > 1 }.map { |s| s.is_a?(Array) ? s.join(",") : s }.map(&:downcase) - } - end - settings[:analysis][:filter][:searchkick_synonym_graph] = synonym_graph + if options[:stemmer_override] + stemmer_override = { + type: "stemmer_override" + } + if options[:stemmer_override].is_a?(String) + stemmer_override[:rules_path] = options[:stemmer_override] + else + stemmer_override[:rules] = options[:stemmer_override] + end + settings[:analysis][:filter][:searchkick_stemmer_override] = stemmer_override - [:searchkick_search2, :searchkick_word_search].each do |analyzer| - settings[:analysis][:analyzer][analyzer][:filter].insert(2, "searchkick_synonym_graph") - end + settings[:analysis][:analyzer].each do |_, analyzer| + stemmer_index = analyzer[:filter].index("searchkick_stemmer") if analyzer[:filter] + analyzer[:filter].insert(stemmer_index, "searchkick_stemmer_override") if stemmer_index end + end - if options[:wordnet] - settings[:analysis][:filter][:searchkick_wordnet] = { - type: "synonym", - format: "wordnet", - synonyms_path: Searchkick.wordnet_path - } + if options[:stem_exclusion] + settings[:analysis][:filter][:searchkick_stem_exclusion] = { + type: "keyword_marker", + keywords: options[:stem_exclusion] + } - settings[:analysis][:analyzer][default_analyzer][:filter].insert(4, "searchkick_wordnet") - settings[:analysis][:analyzer][default_analyzer][:filter] << "searchkick_wordnet" - - %w(word_start word_middle word_end).each do |type| - settings[:analysis][:analyzer]["searchkick_#{type}_index".to_sym][:filter].insert(2, "searchkick_wordnet") - end + settings[:analysis][:analyzer].each do |_, analyzer| + stemmer_index = analyzer[:filter].index("searchkick_stemmer") if analyzer[:filter] + analyzer[:filter].insert(stemmer_index, "searchkick_stem_exclusion") if stemmer_index end + end + end - if options[:special_characters] == false - settings[:analysis][:analyzer].each_value do |analyzer_settings| - analyzer_settings[:filter].reject! { |f| f == "asciifolding" } - end - end + def generate_mappings + mapping = {} - mapping = {} + keyword_mapping = {type: "keyword"} + keyword_mapping[:ignore_above] = options[:ignore_above] || 30000 - # conversions - Array(options[:conversions]).each do |conversions_field| - mapping[conversions_field] = { - type: "nested", - properties: { - query: {type: default_type, analyzer: "searchkick_keyword"}, - count: {type: "integer"} - } + # conversions + Array(options[:conversions]).each do |conversions_field| + mapping[conversions_field] = { + type: "nested", + properties: { + query: {type: default_type, analyzer: "searchkick_keyword"}, + count: {type: "integer"} } - end + } + end - mapping_options = Hash[ - [:suggest, :word, :text_start, :text_middle, :text_end, :word_start, :word_middle, :word_end, :highlight, :searchable, :filterable] - .map { |type| [type, (options[type] || []).map(&:to_s)] } - ] + mapping_options = Hash[ + [:suggest, :word, :text_start, :text_middle, :text_end, :word_start, :word_middle, :word_end, :highlight, :searchable, :filterable] + .map { |type| [type, (options[type] || []).map(&:to_s)] } + ] - word = options[:word] != false && (!options[:match] || options[:match] == :word) + word = options[:word] != false && (!options[:match] || options[:match] == :word) - mapping_options[:searchable].delete("_all") + mapping_options[:searchable].delete("_all") - analyzed_field_options = {type: default_type, index: true, analyzer: default_analyzer} + analyzed_field_options = {type: default_type, index: true, analyzer: default_analyzer} - mapping_options.values.flatten.uniq.each do |field| - fields = {} + mapping_options.values.flatten.uniq.each do |field| + fields = {} - if options.key?(:filterable) && !mapping_options[:filterable].include?(field) - fields[field] = {type: default_type, index: false} - else - fields[field] = keyword_mapping - end + if options.key?(:filterable) && !mapping_options[:filterable].include?(field) + fields[field] = {type: default_type, index: false} + else + fields[field] = keyword_mapping + end - if !options[:searchable] || mapping_options[:searchable].include?(field) - if word - fields[:analyzed] = analyzed_field_options + if !options[:searchable] || mapping_options[:searchable].include?(field) + if word + fields[:analyzed] = analyzed_field_options - if mapping_options[:highlight].include?(field) - fields[:analyzed][:term_vector] = "with_positions_offsets" - end + if mapping_options[:highlight].include?(field) + fields[:analyzed][:term_vector] = "with_positions_offsets" end + end - mapping_options.except(:highlight, :searchable, :filterable, :word).each do |type, f| - if options[:match] == type || f.include?(field) - fields[type] = {type: default_type, index: true, analyzer: "searchkick_#{type}_index"} - end + mapping_options.except(:highlight, :searchable, :filterable, :word).each do |type, f| + if options[:match] == type || f.include?(field) + fields[type] = {type: default_type, index: true, analyzer: "searchkick_#{type}_index"} end end - - mapping[field] = fields[field].merge(fields: fields.except(field)) end - (options[:locations] || []).map(&:to_s).each do |field| - mapping[field] = { - type: "geo_point" - } - end + mapping[field] = fields[field].merge(fields: fields.except(field)) + end - options[:geo_shape] = options[:geo_shape].product([{}]).to_h if options[:geo_shape].is_a?(Array) - (options[:geo_shape] || {}).each do |field, shape_options| - mapping[field] = shape_options.merge(type: "geo_shape") - end + (options[:locations] || []).map(&:to_s).each do |field| + mapping[field] = { + type: "geo_point" + } + end - if options[:inheritance] - mapping[:type] = keyword_mapping - end + options[:geo_shape] = options[:geo_shape].product([{}]).to_h if options[:geo_shape].is_a?(Array) + (options[:geo_shape] || {}).each do |field, shape_options| + mapping[field] = shape_options.merge(type: "geo_shape") + end - routing = {} - if options[:routing] - routing = {required: true} - unless options[:routing] == true - routing[:path] = options[:routing].to_s - end + if options[:inheritance] + mapping[:type] = keyword_mapping + end + + routing = {} + if options[:routing] + routing = {required: true} + unless options[:routing] == true + routing[:path] = options[:routing].to_s end + end - dynamic_fields = { - # analyzed field must be the default field for include_in_all - # http://www.elasticsearch.org/guide/reference/mapping/multi-field-type/ - # however, we can include the not_analyzed field in _all - # and the _all index analyzer will take care of it - "{name}" => keyword_mapping - } + dynamic_fields = { + # analyzed field must be the default field for include_in_all + # http://www.elasticsearch.org/guide/reference/mapping/multi-field-type/ + # however, we can include the not_analyzed field in _all + # and the _all index analyzer will take care of it + "{name}" => keyword_mapping + } - if options.key?(:filterable) - dynamic_fields["{name}"] = {type: default_type, index: false} + if options.key?(:filterable) + dynamic_fields["{name}"] = {type: default_type, index: false} + end + + unless options[:searchable] + if options[:match] && options[:match] != :word + dynamic_fields[options[:match]] = {type: default_type, index: true, analyzer: "searchkick_#{options[:match]}_index"} end - unless options[:searchable] - if options[:match] && options[:match] != :word - dynamic_fields[options[:match]] = {type: default_type, index: true, analyzer: "searchkick_#{options[:match]}_index"} - end - - if word - dynamic_fields[:analyzed] = analyzed_field_options - end + if word + dynamic_fields[:analyzed] = analyzed_field_options end + end - # http://www.elasticsearch.org/guide/reference/mapping/multi-field-type/ - multi_field = dynamic_fields["{name}"].merge(fields: dynamic_fields.except("{name}")) + # http://www.elasticsearch.org/guide/reference/mapping/multi-field-type/ + multi_field = dynamic_fields["{name}"].merge(fields: dynamic_fields.except("{name}")) - mappings = { - properties: mapping, - _routing: routing, - # https://gist.github.com/kimchy/2898285 - dynamic_templates: [ - { - string_template: { - match: "*", - match_mapping_type: "string", - mapping: multi_field - } + mappings = { + properties: mapping, + _routing: routing, + # https://gist.github.com/kimchy/2898285 + dynamic_templates: [ + { + string_template: { + match: "*", + match_mapping_type: "string", + mapping: multi_field } - ] + } + ] + } + + if below70? + mappings = {index_type => mappings} + end + + mappings + end + + def add_synonyms(settings) + synonyms = options[:synonyms] || [] + synonyms = synonyms.call if synonyms.respond_to?(:call) + if synonyms.any? + settings[:analysis][:filter][:searchkick_synonym] = { + type: "synonym", + # only remove a single space from synonyms so three-word synonyms will fail noisily instead of silently + synonyms: synonyms.select { |s| s.size > 1 }.map { |s| s.is_a?(Array) ? s.map { |s2| s2.sub(/\s+/, "") }.join(",") : s }.map(&:downcase) } + # choosing a place for the synonym filter when stemming is not easy + # https://groups.google.com/forum/#!topic/elasticsearch/p7qcQlgHdB8 + # TODO use a snowball stemmer on synonyms when creating the token filter - if below70 - mappings = {index_type => mappings} - end + # http://elasticsearch-users.115913.n3.nabble.com/synonym-multi-words-search-td4030811.html + # I find the following approach effective if you are doing multi-word synonyms (synonym phrases): + # - Only apply the synonym expansion at index time + # - Don't have the synonym filter applied search + # - Use directional synonyms where appropriate. You want to make sure that you're not injecting terms that are too general. + settings[:analysis][:analyzer][default_analyzer][:filter].insert(2, "searchkick_synonym") - mappings = mappings.symbolize_keys.deep_merge(custom_mapping.symbolize_keys) + %w(word_start word_middle word_end).each do |type| + settings[:analysis][:analyzer]["searchkick_#{type}_index".to_sym][:filter].insert(2, "searchkick_synonym") + end end + end - if options[:deep_paging] - if !settings.dig(:index, :max_result_window) && !settings[:"index.max_result_window"] - settings[:index] ||= {} - settings[:index][:max_result_window] = 1_000_000_000 + def add_search_synonyms(settings) + search_synonyms = options[:search_synonyms] || [] + search_synonyms = search_synonyms.call if search_synonyms.respond_to?(:call) + if search_synonyms.is_a?(String) || search_synonyms.any? + if search_synonyms.is_a?(String) + synonym_graph = { + type: "synonym_graph", + synonyms_path: search_synonyms + } + synonym_graph[:updateable] = true unless below73? + else + synonym_graph = { + type: "synonym_graph", + # TODO confirm this is correct + synonyms: search_synonyms.select { |s| s.size > 1 }.map { |s| s.is_a?(Array) ? s.join(",") : s }.map(&:downcase) + } end + settings[:analysis][:filter][:searchkick_synonym_graph] = synonym_graph + + [:searchkick_search2, :searchkick_word_search].each do |analyzer| + settings[:analysis][:analyzer][analyzer][:filter].insert(2, "searchkick_synonym_graph") + end end + end - { - settings: settings, - mappings: mappings + def add_wordnet(settings) + settings[:analysis][:filter][:searchkick_wordnet] = { + type: "synonym", + format: "wordnet", + synonyms_path: Searchkick.wordnet_path } + + settings[:analysis][:analyzer][default_analyzer][:filter].insert(4, "searchkick_wordnet") + settings[:analysis][:analyzer][default_analyzer][:filter] << "searchkick_wordnet" + + %w(word_start word_middle word_end).each do |type| + settings[:analysis][:analyzer]["searchkick_#{type}_index".to_sym][:filter].insert(2, "searchkick_wordnet") + end + end + + def set_deep_paging(settings) + if !settings.dig(:index, :max_result_window) && !settings[:"index.max_result_window"] + settings[:index] ||= {} + settings[:index][:max_result_window] = 1_000_000_000 + end + end + + def index_type + @index_type ||= begin + index_type = options[:_type] + index_type = index_type.call if index_type.respond_to?(:call) + index_type + end + end + + def default_type + "text" + end + + def default_analyzer + :searchkick_index + end + + def below62? + Searchkick.server_below?("6.2.0") + end + + def below70? + Searchkick.server_below?("7.0.0") + end + + def below73? + Searchkick.server_below?("7.3.0") end end end