require 'thinking_sphinx/search/facets'
module ThinkingSphinx
# Once you've got those indexes in and built, this is the stuff that
# matters - how to search! This class provides a generic search
# interface - which you can use to search all your indexed models at once.
# Most times, you will just want a specific model's results - the search and
# search_for_ids methods will do the job in exactly the same manner when
# called from a model.
#
class Search
# Default option values for facet searches.
# NOTE(review): presumably read by the ThinkingSphinx::Search::Facets module
# included below - it is not referenced anywhere else in this file; confirm.
GlobalFacetOptions = {
:all_attributes => false,
:class_facet => true
}
class << self
include ThinkingSphinx::Search::Facets
# Searches for results that match the parameters provided. Will only
# return the ids for the matching objects. See #search for syntax
# examples.
#
# Note that this only searches the Sphinx index, with no ActiveRecord
# queries. Thus, if your index is not in sync with the database, this
# method may return ids that no longer exist there.
#
def search_for_ids(*args)
  # search_results is handed a splatted copy, so args still ends with the
  # options hash when it is extracted below.
  results, client = search_results(*args.clone)
  options = args.extract_options!

  # search_results treats any page <= 0 as page 1 when computing the offset,
  # so report the same page number to the collection to keep the pagination
  # metadata in step with the rows that were actually fetched.
  page = options[:page] ? options[:page].to_i : 1
  page = 1 if page <= 0

  ThinkingSphinx::Collection.ids_from_results(results, page, client.limit, options)
end
# Searches through the Sphinx indexes for relevant matches. There's
# various ways to search, sort, group and filter - which are covered
# below.
#
# Also, if you have WillPaginate installed, the search method can be used
# just like paginate. The same parameters - :page and :per_page - work as
# expected, and the returned result set can be used by the will_paginate
# helper.
#
# == Basic Searching
#
# The simplest way of searching is straight text.
#
# ThinkingSphinx::Search.search "pat"
# ThinkingSphinx::Search.search "google"
# User.search "pat", :page => (params[:page] || 1)
# Article.search "relevant news issue of the day"
#
# If you specify :include, like in an #find call, this will be respected
# when loading the relevant models from the search results.
#
# User.search "pat", :include => :posts
#
# == Match Modes
#
# Sphinx supports 5 different matching modes. By default Thinking Sphinx
# uses :all, which unsurprisingly requires all the supplied search terms
# to match a result.
#
# Alternative modes include:
#
# User.search "pat allan", :match_mode => :any
# User.search "pat allan", :match_mode => :phrase
# User.search "pat | allan", :match_mode => :boolean
# User.search "@name pat | @username pat", :match_mode => :extended
#
# Any will find results with any of the search terms. Phrase treats the search
# terms as a single phrase instead of individual words. Boolean and extended allow
# for more complex query syntax, refer to the sphinx documentation for further
# details.
#
# == Weighting
#
# Sphinx has support for weighting, where matches in one field can be considered
# more important than in another. Weights are integers, with 1 as the default.
# They can be set per-search like this:
#
# User.search "pat allan", :field_weights => { :alias => 4, :aka => 2 }
#
# If you're searching multiple models, you can set per-index weights:
#
# ThinkingSphinx::Search.search "pat", :index_weights => { User => 10 }
#
# See http://sphinxsearch.com/doc.html#weighting for further details.
#
# == Searching by Fields
#
# If you want to step it up a level, you can limit your search terms to
# specific fields:
#
# User.search :conditions => {:name => "pat"}
#
# This uses Sphinx's extended match mode, unless you specify a different
# match mode explicitly (but then this way of searching won't work). Also
# note that you don't need to put in a search string.
#
# == Searching by Attributes
#
# Also known as filters, you can limit your searches to documents that
# have specific values for their attributes. There are three ways to do
# this. The first two techniques work in all scenarios - using the :with
# or :with_all options.
#
# ThinkingSphinx::Search.search :with => {:tag_ids => 10}
# ThinkingSphinx::Search.search :with => {:tag_ids => [10,12]}
# ThinkingSphinx::Search.search :with_all => {:tag_ids => [10,12]}
#
# The first :with search will match records with a tag_id attribute of 10.
# The second :with will match records with a tag_id attribute of 10 OR 12.
# If you need to find records that are tagged with ids 10 AND 12, you
# will need to use the :with_all search parameter. This is particularly
# useful in conjunction with Multi Value Attributes (MVAs).
#
# The third filtering technique is only viable if you're searching with a
# specific model (not multi-model searching). With a single model,
# Thinking Sphinx can figure out what attributes and fields are available,
# so you can put it all in the :conditions hash, and it will sort it out.
#
# Node.search :conditions => {:parent_id => 10}
#
# Filters can be single values, arrays of values, or ranges.
#
# Article.search "East Timor", :conditions => {:rating => 3..5}
#
# == Excluding by Attributes
#
# Sphinx also supports negative filtering - where the filters are of
# attribute values to exclude. This is done with the :without option:
#
# User.search :without => {:role_id => 1}
#
# == Excluding by Primary Key
#
# There is a shortcut to exclude records by their ActiveRecord primary key:
#
# User.search :without_ids => 1
#
# Pass an array or a single value.
#
# The primary key must be an integer as a negative filter is used. Note
# that for multi-model search, an id may occur in more than one model.
#
# == Infix (Star) Searching
#
# By default, Sphinx uses English stemming, e.g. matching "shoes" if you
# search for "shoe". It won't find "Melbourne" if you search for
# "elbourn", though.
#
# Enable infix searching by something like this in config/sphinx.yml:
#
# development:
# enable_star: 1
# min_infix_length: 2
#
# Note that this will make indexing take longer.
#
# With those settings (and after reindexing), wildcard asterisks can be used
# in queries:
#
# Location.search "*elbourn*"
#
# To automatically add asterisks around every token (but not operators),
# pass the :star option:
#
# Location.search "elbourn -ustrali", :star => true, :match_mode => :boolean
#
# This would become "*elbourn* -*ustrali*". The :star option only adds the
# asterisks. You need to make the config/sphinx.yml changes yourself.
#
# By default, the tokens are assumed to match the regular expression /\w+/u.
# If you've modified the charset_table, pass another regular expression, e.g.
#
# User.search("oo@bar.c", :star => /[\w@.]+/u)
#
# to search for "*oo@bar.c*" and not "*oo*@*bar*.*c*".
#
# == Sorting
#
# Sphinx can only sort by attributes, so generally you will need to avoid
# using field names in your :order option. However, if you're searching
# on a single model, and have specified some fields as sortable, you can
# use those field names and Thinking Sphinx will interpret accordingly.
# Remember: this will only happen for single-model searches, and only
# through the :order option.
#
# Location.search "Melbourne", :order => :state
# User.search :conditions => {:role_id => 2}, :order => "name ASC"
#
# Keep in mind that if you use a string, you *must* specify the direction
# (ASC or DESC) else Sphinx won't return any results. If you use a symbol
# then Thinking Sphinx assumes ASC, but if you wish to state otherwise,
# use the :sort_mode option:
#
# Location.search "Melbourne", :order => :state, :sort_mode => :desc
#
# Of course, there are other sort modes - check out the Sphinx
# documentation[http://sphinxsearch.com/doc.html] for that level of
# detail though.
#
# If desired, you can sort by a column in your model instead of a sphinx
# field or attribute. This sort only applies to the current page, so is
# most useful when performing a search with a single page of results.
#
# User.search("pat", :sql_order => "name")
#
# == Grouping
#
# For this you can use the group_by, group_clause and group_function
# options - which are all directly linked to Sphinx's expectations. No
# magic from Thinking Sphinx. It can get a little tricky, so make sure
# you read all the relevant
# documentation[http://sphinxsearch.com/doc.html#clustering] first.
#
# Grouping is done via three parameters within the options hash
# * :group_function determines the way grouping is done
# * :group_by determines the field which is used for grouping
# * :group_clause determines the sorting order
#
# === group_function
#
# Valid values for :group_function are
# * :day, :week, :month, :year - Grouping is done by the respective timeframes.
# * :attr, :attrpair - Grouping is done by the specified attribute(s)
#
# === group_by
#
# This parameter denotes the field by which grouping is done. Note that the
# specified field must be a sphinx attribute or index.
#
# === group_clause
#
# This determines the sorting order of the groups. In a grouping search,
# the matches within a group will be sorted by the :sort_mode and :order parameters.
# The group matches themselves however, will be sorted by :group_clause.
#
# The syntax for this is the same as an order parameter in extended sort mode.
# Namely, you can specify an SQL-like sort expression with up to 5 attributes
# (including internal attributes), eg: "@relevance DESC, price ASC, @id DESC"
#
# === Grouping by timestamp
#
# Timestamp grouping groups off items by the day, week, month or year of the
# attribute given. In order to do this you need to define a timestamp attribute,
# which pretty much looks like the standard definition for any attribute.
#
# define_index do
# #
# # All your other stuff
# #
# has :created_at
# end
#
# When you need to fire off your search, it'll go something to the tune of
#
# Fruit.search "apricot", :group_function => :day, :group_by => 'created_at'
#
# The @groupby special attribute will contain the date for that group.
# Depending on the :group_function parameter, the date format will be
#
# * :day - YYYYMMDD
# * :week - YYYYNNN (NNN is the first day of the week in question,
# counting from the start of the year )
# * :month - YYYYMM
# * :year - YYYY
#
#
# === Grouping by attribute
#
# The syntax is the same as grouping by timestamp, except for the fact that the
# :group_function parameter is changed
#
# Fruit.search "apricot", :group_function => :attr, :group_by => 'size'
#
#
# == Geo/Location Searching
#
# Sphinx - and therefore Thinking Sphinx - has the facility to search
# around a geographical point, using a given latitude and longitude. To
# take advantage of this, you will need to have both of those values in
# attributes. To search with that point, you can then use one of the
# following syntax examples:
#
# Address.search "Melbourne", :geo => [1.4, -2.217], :order => "@geodist asc"
# Address.search "Australia", :geo => [-0.55, 3.108], :order => "@geodist asc"
# :latitude_attr => "latit", :longitude_attr => "longit"
#
# The first example applies when your latitude and longitude attributes
# are named any of lat, latitude, lng, lon, long or longitude. If that's not
# the case, you will need to explicitly state them in your search, _or_
# you can do so in your model:
#
# define_index do
# has :latit # Float column, stored in radians
# has :longit # Float column, stored in radians
#
# set_property :latitude_attr => "latit"
# set_property :longitude_attr => "longit"
# end
#
# Now, geo-location searching really only has an effect if you have a
# filter, sort or grouping clause related to it - otherwise it's just a
# normal search, and _will not_ return a distance value otherwise. To
# make use of the positioning difference, use the special attribute
# "@geodist" in any of your filters or sorting or grouping clauses.
#
# And don't forget - both the latitude and longitude you use in your
# search, and the values in your indexes, need to be stored as a float in radians,
# _not_ degrees. Keep in mind that if you do this conversion in SQL
# you will need to explicitly declare a column type of :float.
#
# define_index do
# has 'RADIANS(lat)', :as => :lat, :type => :float
# # ...
# end
#
# Once you've got your results set, you can access the distances as
# follows:
#
# @results.each_with_geodist do |result, distance|
# # ...
# end
#
# The distance value is returned as a float, representing the distance in
# metres.
#
# == Handling a Stale Index
#
# Especially if you don't use delta indexing, you risk having records in the
# Sphinx index that are no longer in the database. By default, those will simply
# come back as nils:
#
# >> pat_user.delete
# >> User.search("pat")
# Sphinx Result: [1,2]
# => [nil, <#User id: 2>]
#
# (If you search across multiple models, you'll get ActiveRecord::RecordNotFound.)
#
# You can simply Array#compact these results or handle the nils in some other way, but
# Sphinx will still report two results, and the missing records may upset your layout.
#
# If you pass :retry_stale => true to a single-model search, missing records will
# cause Thinking Sphinx to retry the query but excluding those records. Since search
# is paginated, the new search could potentially include missing records as well, so by
# default Thinking Sphinx will retry three times. Pass :retry_stale => 5 to retry five
# times, and so on. If there are still missing ids on the last retry, they are
# shown as nils.
#
def search(*args)
  query = args.clone # an array
  options = query.extract_options!

  # The block may run several times: retry_search_on_stale_index re-invokes
  # it after adding stale ids to options[:without_ids].
  retry_search_on_stale_index(query, options) do
    results, client = search_results(*(query + [options]))
    log "Sphinx Error: #{results[:error]}", :error if results[:error]

    # search_results treats any page <= 0 as page 1 when computing the
    # offset; pass the same clamped page on so the collection's pagination
    # data matches the results that were actually fetched.
    page = options[:page] ? options[:page].to_i : 1
    page = 1 if page <= 0

    ThinkingSphinx::Collection.create_from_results(results, page, client.limit, options)
  end
end
# Runs the given block, re-running it whenever it raises StaleIdsException.
#
# options[:retry_stale] sets the retry budget: true means three retries,
# nil/false means none, anything else is converted with to_i. Before each
# attempt options[:raise_on_stale] is set so that the block's collaborators
# only raise while retries remain; after each failure the stale ids are
# merged into options[:without_ids] so the next attempt excludes them.
def retry_search_on_stale_index(query, options, &block)
  seen_stale_ids = []
  retry_setting = options[:retry_stale]
  retries_remaining =
    if true == retry_setting
      3 # default to three retries
    elsif !retry_setting
      0 # no retries
    else
      retry_setting.to_i
    end

  begin
    # Exposed through the options hash so the code building the collection
    # can see whether it should raise on stale records.
    options[:raise_on_stale] = retries_remaining > 0
    block.call
  rescue StaleIdsException => stale
    retries_remaining -= 1
    seen_stale_ids |= stale.ids # accumulated only for the log line
    options[:without_ids] = Array(options[:without_ids]) | stale.ids # the actual exclusion
    unit = retries_remaining == 1 ? 'try' : 'tries'
    log "Sphinx Stale Ids (%s %s left): %s" % [
      retries_remaining, unit, seen_stale_ids.join(', ')
    ]
    retry
  end
end
# Runs the search described by args (same arguments as #search) and returns
# the :total_found value from the Sphinx result hash, or 0 when it is absent.
def count(*args)
  results = search_results(*args.clone).first
  total = results[:total_found]
  total || 0
end
# Checks if a document with the given id exists within a specific index.
# Expected parameters:
#
# - ID of the document
# - Index to check within
# - Options hash (defaults to {})
#
# Example:
#
# ThinkingSphinx::Search.search_for_id(10, "user_core", :class => User)
#
def search_for_id(*args)
  options = args.extract_options!
  # After the options hash is removed, the positional arguments are the
  # document id and the index name.
  document_id = args.first
  index_name = args[1]

  client = client_from_options options
  field_query, attribute_filters = search_conditions(
    options[:class], options[:conditions] || {}
  )
  client.filters += attribute_filters
  client.match_mode = :extended unless field_query.empty?
  # Restrict the query to exactly this one document id.
  client.id_range = document_id..document_id

  begin
    matches = client.query(field_query, index_name)[:matches]
    matches.length > 0
  rescue Errno::ECONNREFUSED
    raise ThinkingSphinx::ConnectionError, "Connection to Sphinx Daemon (searchd) failed."
  end
end
private
# This method handles the common search functionality, and returns both
# the result hash and the client. Not super elegant, but it'll do for
# the moment.
#
def search_results(*args)
  options = args.extract_options!
  terms = args.join(' ')
  client = client_from_options options
  terms = star_query(terms, options[:star]) if options[:star]

  # Conditions on fields become extra query text; conditions on attributes
  # become filters.
  field_query, attribute_filters = search_conditions(
    options[:class], options[:conditions] || {}
  )
  client.filters += attribute_filters
  client.match_mode = :extended unless field_query.empty?

  # Strip the result because "" and " " are not equivalent queries.
  full_query = "#{terms} #{field_query}".strip

  set_sort_options! client, options

  client.limit = options[:per_page].to_i if options[:per_page]
  requested_page = options[:page] ? options[:page].to_i : 1
  current_page = requested_page <= 0 ? 1 : requested_page
  client.offset = (current_page - 1) * client.limit

  begin
    log "Sphinx: #{full_query}"
    results = client.query full_query
    log "Sphinx Result:"
    log results[:matches].collect { |match|
      match[:attributes]["sphinx_internal_id"]
    }.inspect
  rescue Errno::ECONNREFUSED
    raise ThinkingSphinx::ConnectionError, "Connection to Sphinx Daemon (searchd) failed."
  end

  [results, client]
end
# Set all the appropriate settings for the client, using the provided
# options hash.
#
def client_from_options(options = {})
  config = ThinkingSphinx::Configuration.instance
  client = Riddle::Client.new config.address, config.port
  klass = options[:class]
  index_options = klass ? klass.sphinx_index_options : {}

  # Riddle defaults to a per-query max_matches of 1000. When sphinx.yml sets
  # a smaller per-server value, the Riddle default must be overridden or
  # searches fail with errors like
  # "per-query max_matches=1000 out of bounds (per-server max_matches=200)"
  server_max_matches = config.configuration.searchd.max_matches
  options[:max_matches] ||= server_max_matches if server_max_matches

  # Expand :index_weights => { "foo" => 2, User => 1 }
  # into   :index_weights => { "foo" => 2, "user_core" => 1, "user_delta" => 1 }
  weights = options[:index_weights]
  if weights
    expanded = {}
    weights.each do |index, weight|
      if index.is_a?(Class)
        base_name = ThinkingSphinx::Index.name(index)
        expanded["#{base_name}_core"] = weight
        expanded["#{base_name}_delta"] = weight
      else
        expanded[index] = weight
      end
    end
    options[:index_weights] = expanded
  end

  # For each client setting, the search options win over the index options,
  # which win over whatever the client already holds.
  settings = [
    :max_matches, :match_mode, :sort_mode, :sort_by, :id_range,
    :group_by, :group_function, :group_clause, :group_distinct, :cut_off,
    :retry_count, :retry_delay, :index_weights, :rank_mode,
    :max_query_time, :field_weights, :filters, :anchor, :limit
  ]
  settings.each do |setting|
    value = options[setting] || index_options[setting] || client.send(setting)
    client.send("#{setting}=".to_sym, value)
  end

  options[:classes] = [klass] if klass

  if client.anchor.empty?
    client.anchor = anchor_conditions(klass, options) || {}
  end

  # always filter on sphinx_deleted == 0
  client.filters << Riddle::Client::Filter.new(
    "sphinx_deleted", [0]
  )

  # class filters
  if options[:classes]
    class_crcs = options[:classes].collect { |k| k.to_crc32s }.flatten
    client.filters << Riddle::Client::Filter.new("class_crc", class_crcs)
  end

  # normal attribute filters
  if options[:with]
    client.filters += options[:with].collect { |attr, val|
      Riddle::Client::Filter.new attr.to_s, filter_value(val)
    }
  end

  # exclusive attribute filters
  if options[:without]
    client.filters += options[:without].collect { |attr, val|
      Riddle::Client::Filter.new attr.to_s, filter_value(val), true
    }
  end

  # every-match attribute filters: one filter per value, so all must match
  if options[:with_all]
    client.filters += options[:with_all].collect { |attr, vals|
      Array(vals).collect { |val|
        Riddle::Client::Filter.new attr.to_s, filter_value(val)
      }
    }.flatten
  end

  # exclusive attribute filter on primary key
  if options[:without_ids]
    client.filters += Array(options[:without_ids]).collect { |id|
      Riddle::Client::Filter.new 'sphinx_internal_id', filter_value(id), true
    }
  end

  client
end
# Wraps each token of the query in asterisks for wildcard searching.
#
# query        - the search string.
# custom_token - a Regexp describing what a token looks like; any other
#                value (for example true) falls back to /\w+/u.
#
# Returns a new string. Tokens are left untouched when they directly follow
# an operator character (@, ~ or /), when the match is a double-quoted
# phrase, or when they already have an asterisk on either side.
def star_query(query, custom_token = nil)
  token = custom_token.is_a?(Regexp) ? custom_token : /\w+/u

  # Matches either a quoted phrase or a bare token that does not begin with
  # "!" or "-".
  query.gsub(/("#{token}(.*?#{token})?"|(?![!-])#{token})/u) do
    pre, proper, post = $`, $&, $'
    # E.g. "@foo", "/2", "~3", but not as part of a token
    is_operator = pre.match(%r{(\W|^)[@~/]\Z})
    # E.g. "foo bar", with quotes. Core start_with?/end_with? are used so
    # this does not depend on ActiveSupport's starts_with?/ends_with?.
    is_quote = proper.start_with?('"') && proper.end_with?('"')
    has_star = pre.end_with?("*") || post.start_with?("*")

    if is_operator || is_quote || has_star
      proper
    else
      "*#{proper}*"
    end
  end
end
# Normalises a filter value into the form handed to Riddle::Client::Filter.
#
# * Range - returned as is, unless it starts with a Time, in which case a
#           range between the two integer timestamps is returned.
# * Array - returned with every Time element replaced by its timestamp.
# * Time  - returned as a one-element array holding its timestamp.
# * other - wrapped with Array().
def filter_value(value)
  case value
  when Range
    value.first.is_a?(Time) ? timestamp(value.first)..timestamp(value.last) : value
  when Array
    value.collect { |val| val.is_a?(Time) ? timestamp(val) : val }
  else
    # A lone Time must not reach Array(): Time responds to to_a, so Array()
    # would return its component parts (sec, min, hour, ...) instead of a
    # single timestamp.
    value.is_a?(Time) ? [timestamp(value)] : Array(value)
  end
end
# Returns the integer timestamp for a Time object.
#
# If using Rails 2.1+, need to handle timezones to translate them back to
# UTC, as that's what datetimes will be stored as by MySQL.
#
# in_time_zone is a method that was added for the timezone support in
# Rails 2.1, which is why it's used for testing. I'm sure there's better
# ways, but this does the job.
#
def timestamp(value)
  # Time#utc converts its receiver in place, so work on a copy; otherwise
  # building a filter would silently switch the caller's Time object to UTC.
  value.respond_to?(:in_time_zone) ? value.dup.utc.to_i : value.to_i
end
# Translate field and attribute conditions to the relevant search string
# and filters.
#
def search_conditions(klass, conditions={})
  # Attribute names are only known when searching a single model.
  known_attributes =
    if klass
      klass.sphinx_indexes.map { |index|
        index.attributes.map { |attrib| attrib.unique_name }
      }.flatten
    else
      []
    end

  field_terms = []
  attribute_filters = []

  conditions.each do |name, value|
    unless known_attributes.include?(name.to_sym)
      # Not an attribute: treat it as a field and use the "@field value" syntax.
      field_terms << "@#{name} #{value}"
      next
    end

    attribute_filters << Riddle::Client::Filter.new(
      name.to_s, filter_value(value)
    )
  end

  [field_terms.join(' '), attribute_filters]
end
# Return the appropriate latitude and longitude values, depending on
# whether the relevant attributes have been defined, and also whether
# there's actually any values.
#
def anchor_conditions(klass, options)
  attributes =
    if klass
      klass.sphinx_indexes.map { |index|
        index.attributes.map { |attrib| attrib.unique_name }
      }.flatten
    else
      []
    end

  # First non-nil value of an index-level option across the model's indexes.
  index_setting = lambda do |key|
    klass ? klass.sphinx_indexes.map { |index| index.options[key] }.compact.first : nil
  end
  indexed_lat_attr = index_setting.call(:latitude_attr)
  indexed_lon_attr = index_setting.call(:longitude_attr)

  # Search options override index settings; conventional attribute names are
  # the last resort, tried in the order listed.
  lat_attr = options[:latitude_attr] || indexed_lat_attr
  [:lat, :latitude].each do |name|
    lat_attr ||= name if attributes.include?(name)
  end

  lon_attr = options[:longitude_attr] || indexed_lon_attr
  [:lng, :lon, :long, :longitude].each do |name|
    lon_attr ||= name if attributes.include?(name)
  end

  # :geo => [lat, lon] takes precedence over separate :lat / :lon options.
  if options[:geo]
    lat = options[:geo].first
    lon = options[:geo].last
  else
    lat = options[:lat]
    lon = options[:lon]
  end

  return nil unless lat && lon

  {
    :latitude_attribute => lat_attr.to_s,
    :latitude => lat,
    :longitude_attribute => lon_attr.to_s,
    :longitude => lon
  }
end
# Set the sort options using the :order key as well as the appropriate
# Riddle settings.
#
def set_sort_options!(client, options)
  klass = options[:class]
  sortable_fields =
    if klass
      klass.sphinx_indexes.map { |index|
        index.fields.map { |field| field.unique_name }
      }.flatten
    else
      []
    end
  index_options = klass ? klass.sphinx_index_options : {}
  order = options[:order] || index_options[:order]

  if order.is_a?(Symbol)
    # A symbol implies ascending attribute order unless a mode was chosen.
    if client.sort_mode == :relevance || client.sort_mode.nil?
      client.sort_mode = :attr_asc
    end
    # Fields are sorted through their "<name>_sort" counterpart.
    suffix = sortable_fields.include?(order) ? "_sort" : ""
    client.sort_by = "#{order}#{suffix}"
  elsif order.is_a?(String)
    client.sort_mode = :extended unless options[:sort_mode]
    client.sort_by = sorted_fields_to_attributes(order, sortable_fields)
  end

  # Translate the :asc / :desc shorthand into the attr_* sort modes.
  case client.sort_mode
  when :asc
    client.sort_mode = :attr_asc
  when :desc
    client.sort_mode = :attr_desc
  end
end
# Search through a collection of fields and translate any appearances
# of them in a string to their attribute equivalent for sorting.
#
def sorted_fields_to_attributes(string, fields)
  # Build the result with non-destructive gsub so the caller's :order string
  # is never modified (it may be frozen, or may be an index-level default
  # that is reused by later searches).
  fields.inject(string) do |clause, field|
    name = field.to_s
    # Only rewrite the field when it stands alone as a word, optionally
    # followed by a comma; escape it so it is matched literally.
    clause.gsub(/(^|\s)#{Regexp.escape(name)}(,?\s|$)/) do |match|
      match.gsub name, "#{name}_sort"
    end
  end
end
# Sends message to ActiveRecord's logger at the given level (:debug by
# default). Does nothing when no logger is configured.
def log(message, method = :debug)
  logger = ::ActiveRecord::Base.logger
  logger.send(method, message) unless logger.nil?
end
end
end
end