lib/templates/sphinx/sphinx.conf.erb in capitate-0.1.9 vs lib/templates/sphinx/sphinx.conf.erb in capitate-0.2.1
- old
+ new
@@ -1,424 +1,53 @@
-#
-# Sphinx configuration file sample
-# TODO: This isn't currently used by any recipe. The current setup recipe uses a sphinx conf on a
-# per-application basis.
-#
-
-#############################################################################
-## data source definition
-#############################################################################
-
-source <%= application %>
+# sphinx config
+source pages
{
- # data source type
- # for now, known types are 'mysql', 'pgsql' and 'xmlpipe'
- # MUST be defined
type = mysql
-
# whether to strip HTML
# values can be 0 (don't strip) or 1 (do strip)
# WARNING, only works with mysql source for now
# WARNING, should work ok for PERFECTLY formed XHTML for now
# WARNING, MAY BREAK on malformed everyday HTML
# optional, default is 0
- strip_html = 0
+ strip_html = 1
# what HTML attributes to index if stripping HTML
# format is as follows:
#
- # index_html_attrs = img=alt,title; a=title;
- #
- # optional, default is to not index anything
- index_html_attrs =
-
- #####################################################################
-
- # some straightforward parameters for 'mysql' source type
- sql_host = 127.0.0.1
- sql_user = <%= db_user %>
- sql_pass = <%= db_pass %>
- sql_db = <%= db_name %>
- sql_port = 3306 # optional, default is 3306
-
- # sql_sock = /tmp/mysql.sock
- #
- # optional
- # usually '/var/lib/mysql/mysql.sock' on Linux
- # usually '/tmp/mysql.sock' on FreeBSD
-
- # pre-query, executed before the main fetch query
- # useful eg. to set up encoding or mark records
- # optional, default is empty
- #
+ index_html_attrs = img=alt,title; a=title;
+
+ sql_host = <%= sphinx_db_host %>
+ sql_user = <%= sphinx_db_user %>
+ sql_pass = <%= sphinx_db_pass %>
+ sql_db = <%= sphinx_db_name %>
+ sql_port = <%= sphinx_db_port %> # optional, default is 3306
# sql_query_pre = SET CHARACTER_SET_RESULTS=cp1251
sql_query_pre = SET NAMES UTF8
-
- # main document fetch query
- #
- # you can specify up to 32 (formally SPH_MAX_FIELDS in sphinx.h) fields;
- # all of the fields which are not document_id or attributes (see below)
- # will be full-text indexed
- #
- # document_id MUST be the very first field
- # document_id MUST be positive (non-zero, non-negative)
- # document_id MUST fit into 32 bits
- # document_id MUST be unique
- #
# mandatory
- sql_query = \
- SELECT id, UNIX_TIMESTAMP(created_at) AS created_at, body FROM todo
-
- # query range setup
- #
- # useful to avoid MyISAM table locks and big result sets
- # when indexing lots of data
- #
- # to use query ranges, you should
- # 1) provide a query to fetch min/max id (ie. id range) from the data set;
- # 2) configure step size in which this range will be walked;
- # 3) use $start and $end macros somewhere in the main fetch query.
- #
- # 'sql_query_range' must return exactly two integer fields
- # in exactly min_id, max_id order
- #
- # 'sql_range_step' must be a positive integer
- # optional, default is 1024
- #
- # 'sql_query' must contain both '$start' and '$end' macros
- # if you are using query ranges (because it obviously would be an
- # error to index the whole table many times)
- #
- # note that the intervals specified by $start/$end do not
- # overlap, so you should NOT remove document ids which are exactly
- # equal to $start or $end in your query
- #
- # here's an example which will index 'documents' table
- # fetching (at most) one thousand entries at a time:
- #
- # sql_query_range = SELECT MIN(id),MAX(id) FROM documents
- # sql_range_step = 1000
- # sql_query = \
- # SELECT doc.id, doc.id AS group, doc.title, doc.data \
- # FROM documents doc \
- # WHERE id>=$start AND id<=$end
-
-
- # attribute columns
- #
- # attribute values MUST be positive (non-zero, non-negative) integers
- # attribute values MUST fit into 32 bits
- #
- # attributes are additional values associated with each document which
- # may be used to perform additional filtering and sorting during search.
- # attributes are NOT full-text indexed; they are stored in the full text
- # index as is.
- #
- # a good example would be a forum posts table. one might need to search
- # through 'title' and 'content' fields but limit the search to specific
- # values of 'author_id', or 'forum_id', or to sort by 'post_date', or to
- # group matches by 'thread_id', or to group posts by month of the
- # 'post_date' and provide statistics.
- #
- # this all can be achieved by specifying all the mentioned columns
- # (excluding 'title' and 'content' which are full-text fields) as
- # attributes and then using API calls to setup filtering, sorting,
- # and grouping.
- #
- # sql_group_column is used to declare integer attributes.
- #
- # sql_date_column is used to declare UNIX timestamp attributes.
- #
- # sql_str2ordinal_column is used to declare integer attributes whose
- # values are computed as ordinal numbers of the corresponding column value
- # in the sorted list of column values. WARNING, all such string values
- # are going to be stored in RAM while indexing, and "C" locale will
- # be used when sorting!
- #
- # starting with 0.9.7, there may be multiple attribute columns specified.
- # here's an example for that mentioned posts table:
- #
- # sql_group_column = author_id
- # sql_group_column = forum_id
- # sql_group_column = thread_id
- # sql_date_column = post_unix_timestamp
- # sql_date_column = last_edit_unix_timestamp
- #
- # optional, default is empty
- ##sql_group_column = group_id
- sql_date_column = created_at
- # sql_str2ordinal_column = author_name
-
- # post-query, executed at the end of the main fetch query
- #
- # note that indexing is NOT completed at the point when post-query
- # gets executed and might very well fail
- #
- # optional, default is empty
- ##sql_query_post =
-
- # post-index-query, executed on successfully completed indexing
- #
- # $maxid macro is the max document ID which was actually
- # fetched from the database
- #
- # optional, default is empty
- #
- # sql_query_post_index = REPLACE INTO counters ( id, val ) \
- # VALUES ( 'max_indexed_id', $maxid )
-
-
- # document info query
- #
- # ONLY used by search utility to display document information
- # MUST be able to fetch document info by its id, therefore
- # MUST contain '$id' macro
- #
- # optional, default is empty
- ##sql_query_info = SELECT * FROM documents WHERE id=$id
-
- #####################################################################
-
- # demo config for 'xmlpipe' source type is a little below
- #
- # with xmlpipe, indexer opens a pipe to a given command,
- # and then reads documents from stdin
- #
- # indexer expects one or more documents from xmlpipe stdin
- # each document must be formatted exactly as follows:
- #
- # <document>
- # <id>123</id>
- # <group>45</group>
- # <timestamp>1132223498</timestamp>
- # <title>test title</title>
- # <body>
- # this is my document body
- # </body>
- # </document>
- #
- # timestamp element is optional, its default value is 1
- # all the other elements are mandatory
-
- # type = xmlpipe
- # xmlpipe_command = cat /var/test.xml
+ sql_query = QUERY
+ sql_query_range = SELECT MIN(id),MAX(id) FROM TABLE_NAME
+ sql_range_step = 1000
+ sql_group_column = user_id
+ sql_group_column = language
+ sql_date_column = published_date
+ sql_date_column = last_modified
}
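Note: the new block keeps sql_query_range and sql_range_step, so whatever replaces the QUERY placeholder must contain the $start and $end macros (see the removed range notes above), and the declared group/date columns must come back as integers and UNIX timestamps respectively. A hypothetical filled-in query for the pages source, with invented column names:

    sql_query = \
        SELECT id, user_id, language, \
            UNIX_TIMESTAMP(published_date) AS published_date, \
            UNIX_TIMESTAMP(last_modified) AS last_modified, \
            title, body \
        FROM pages \
        WHERE id >= $start AND id <= $end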
-# inherited source example
-#
-# all the parameters are copied from the parent source,
-# and may then be overridden in this source definition
-##source src1stripped : src1
-##{
-## strip_html = 1
-##}
-
-#############################################################################
-## index definition
-#############################################################################
-
-# local index example
-#
-# this is an index which is stored locally in the filesystem
-#
-# all indexing-time options (such as morphology and charsets)
-# are configured per local index
-index <%= application %>
+index pages
{
- # which document source to index
- # at least one MUST be defined
- #
- # multiple sources MAY be specified; to do so, just add more
- # "source = NAME" lines. in this case, ALL the document IDs
- # in ALL the specified sources MUST be unique
- source = <%= application %>
-
- # this is path and index file name without extension
- #
- # indexer will append different extensions to this path to
- # generate names for both permanent and temporary index files
- #
- # .tmp* files are temporary and can be safely removed
- # if indexer fails to remove them automatically
- #
- # .sp* files are fulltext index data files. specifically,
- # .spa contains attribute values attached to each document id
- # .spd contains doclists and hitlists
- # .sph contains index header (schema and other settings)
- # .spi contains wordlists
- #
- # MUST be defined
- path = /var/sphinx/<%= application %>
-
- # docinfo (ie. per-document attribute values) storage strategy
- # defines how docinfo will be stored
- #
- # available values are "none", "inline" and "extern"
- #
- # "none" means there'll be no docinfo at all (no groups/dates)
- #
- # "inline" means that the docinfo will be stored in the .spd
- # file along with the document ID lists (doclists)
- #
- # "extern" means that the docinfo will be stored in the .spa
- # file separately
- #
- # externally stored docinfo should (basically) be kept in RAM
- # when querying; therefore, "inline" may be the only viable option
- # for really huge (50-100+ million docs) datasets. however, for
- # smaller datasets "extern" storage makes both indexing and
- # searching MUCH more efficient.
- #
- # additional search-time memory requirements for extern storage are
- #
- # ( 1 + number_of_attrs )*number_of_docs*4 bytes
- #
- # so 10 million docs with 2 groups and 1 timestamp will take
- # (1+2+1)*10M*4 = 160 MB of RAM. this is PER DAEMON, ie. searchd
- # will alloc 160 MB on startup, read the data and keep it shared
- # between queries; the children will NOT allocate additional
- # copies of this data.
- #
- # default is "extern" (as most collections are smaller than 100M docs)
+ source = pages
+ path = <%= sphinx_index_root %>/pages
docinfo = extern
-
- # morphology
- #
- # currently supported morphology preprocessors are Porter stemmers
- # for English and Russian, and Soundex. more stemmers could be added
- # at users request.
- #
- # available values are "none", "stem_en", "stem_ru", "stem_enru",
- # and "soundex"
- #
- # optional, default is "none"
- #
- # morphology = none
- # morphology = stem_en
- # morphology = stem_ru
- # morphology = stem_enru
- # morphology = soundex
- morphology = none
-
- # stopwords file
- #
- # format is plain text in whatever encoding you use
- # optional, default is empty
- #
- # stopwords = /var/data/stopwords.txt
- #stopwords =
-
- # minimum word length
- #
- # only the words that are of this length and above will be indexed;
- # for example, if min_word_len is 4, "the" won't be indexed,
- # but "they" will be.
- #
- # default is 1, which (obviously) means to index everything
+ morphology = stem_en
+ stopwords = <%= sphinx_conf_path %>/stopwords.txt
min_word_len = 1
-
- # charset encoding type
- #
- # known types are 'sbcs' (Single Byte CharSet) and 'utf-8'
- #
- # optional, default is sbcs
charset_type = utf-8
-
- # charset definition and case folding rules "table"
- #
- # optional, default value depends on charset_type
- #
- # for now, defaults are configured to support English and Russian
- # this behavior MAY change in future versions
- #
- # 'sbcs' default value is
- # charset_table = 0..9, A..Z->a..z, _, a..z, U+A8->U+B8, U+B8, U+C0..U+DF->U+E0..U+FF, U+E0..U+FF
- #
- # 'utf-8' default value is
- # charset_table = 0..9, A..Z->a..z, _, a..z, U+410..U+42F->U+430..U+44F, U+430..U+44F
-
- # minimum prefix length
- #
- # if prefix length is positive, indexer will not only index all words,
- # but all the possible prefixes (ie. word beginnings) as well
- #
- # for instance, "exam" query against such index will match documents
- # which contain "example" word, even if they do not contain "exam"
- #
- # indexing prefixes will make the index grow significantly
- # and could degrade search times
- #
- # currently there's no way to rank perfect word matches higher
- # than prefix matches using only one index; you could set up two
- # indexes for that
- #
- # default is 0, which means NOT to index prefixes
min_prefix_len = 0
-
- # minimum infix length
- #
- # if infix length is positive, indexer will not only index all words,
- # but all the possible infixes (ie. characters subsequences starting
- # anywhere inside the word) as well
- #
- # for instance, "amp" query against such index will match documents
- # which contain "example" word, even if they do not contain "amp"
- #
- # indexing infixes will make the index grow significantly
- # and could degrade search times
- #
- # currently there's no way to rank perfect word matches higher
- # than infix matches using only one index; you could set up two
- # indexes for that
- #
- # default is 0, which means NOT to index infixes
min_infix_len = 0
-
- # n-grams length
- #
- # n-grams provide basic CJK support for unsegmented texts. if using
- # n-grams, streams of CJK characters are indexed as n-grams. for example,
- # if incoming stream is ABCDEF and n is 2, this text would be indexed
- # as if it was AB BC CD DE EF.
- #
- # this feature is in an alpha state and only n=1 is currently
- # supported; this is going to be improved.
- #
- # note that if search query is segmented (ie. words are separated with
- # whitespace), words are in quotes and extended matching mode is used,
- # then all matching documents will be returned even if their text was
- # *not* segmented. in the example above, ABCDEF text will be indexed as
- # A B C D E F, and "BCD" query will be transformed to "B C D" (where
- # quotes is phrase matching operator), so the document will match.
- #
- # optional, default is 0, which means NOT to use n-grams
- #
- # ngram_len = 1
-
- # n-gram characters table
- #
- # specifies what specific characters are subject to n-gram
- # extraction. format is similar to charset_table.
- #
- # optional, default is empty
- #
- # ngram_chars = U+3000..U+2FA1F
}
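With docinfo = extern and the four attributes declared above (user_id, language, published_date, last_modified), the sizing rule quoted in the removed comments gives a quick searchd RAM estimate:

    ( 1 + number_of_attrs ) * number_of_docs * 4 bytes
    e.g. 4 attrs and a hypothetical 1M pages: (1+4) * 1,000,000 * 4 = 20 MB per searchd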
-
-# inherited index example
-#
-# all the parameters are copied from the parent index,
-# and may then be overridden in this index definition
-##index test1stemmed : test1
-##{
-## path = /var/data/test1stemmed
-## morphology = stem_en
-##}
-
-
#############################################################################
## indexer settings
#############################################################################
indexer
@@ -430,11 +59,11 @@
#
# will grow if set unacceptably low
# will warn if set low enough to potentially hurt performance
#
# optional, default is 32M
- mem_limit = 32M
+ mem_limit = 64M
}
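mem_limit applies per indexer run against this config. For reference, a typical build-and-rotate invocation of the stock Sphinx tools (config path hypothetical):

    indexer --config /usr/local/sphinx/etc/sphinx.conf --all --rotate
    searchd --config /usr/local/sphinx/etc/sphinx.conf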
#############################################################################
## searchd settings
#############################################################################
@@ -445,26 +74,26 @@
# incoming network requests
#
# optional, default is to listen on all addresses,
# ie. address = 0.0.0.0
#
- address = 127.0.0.1
+ address = <%= sphinx_host %>
# address = 192.168.0.1
# port on which search daemon will listen
- port = 3312
+ port = <%= sphinx_port %>
# log file
# searchd run info is logged here
- log = /var/sphinx/searchd.log
+ log = <%= sphinx_log_root %>/searchd.log
# query log file
# all the search queries are logged here
- query_log = /var/sphinx/query.log
+ query_log = <%= sphinx_log_root %>/query.log
# client read timeout, seconds
read_timeout = 5
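For comparison, rendering the searchd settings above with the old hard-coded values that this diff replaces would give:

    address = 127.0.0.1
    port    = 3312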
@@ -475,10 +104,10 @@
# a file which will contain searchd process ID
# used for different external automation scripts
# MUST be present
- pid_file = /var/sphinx/searchd.pid
+ pid_file = <%= sphinx_pid_path %>
# maximum amount of matches this daemon would ever retrieve
# from each index and serve to client
#