lib/templates/sphinx/sphinx.conf.erb in capitate-0.1.9 vs lib/templates/sphinx/sphinx.conf.erb in capitate-0.2.1

- old
+ new

@@ -1,424 +1,53 @@
-#
-# Sphinx configuration file sample
-# TODO: This isn't currently used by any recipe. The current setup recipe uses sphinx conf on per application
-# basis.
-#
-
-#############################################################################
-## data source definition
-#############################################################################
-
-source <%= application %>
+# sphinx config
+source pages
 {
-  # data source type
-  # for now, known types are 'mysql', 'pgsql' and 'xmlpipe'
-  # MUST be defined
   type = mysql

-  # whether to strip HTML
   # values can be 0 (don't strip) or 1 (do strip)
   # WARNING, only works with mysql source for now
   # WARNING, should work ok for PERFECTLY formed XHTML for now
   # WARNING, POSSIBLE TO BUG on malformed everday HTML
   # optional, default is 0
-  strip_html = 0
+  strip_html = 1

   # what HTML attributes to index if stripping HTML
   # format is as follows:
   #
-  # index_html_attrs = img=alt,title; a=title;
-  #
-  # optional, default is to not index anything
-  index_html_attrs =
-
-  #####################################################################
-
-  # some straightforward parameters for 'mysql' source type
-  sql_host = 127.0.0.1
-  sql_user = <%= db_user %>
-  sql_pass = <%= db_pass %>
-  sql_db = <%= db_name %>
-  sql_port = 3306 # optional, default is 3306
-
-  # sql_sock = /tmp/mysql.sock
-  #
-  # optional
-  # usually '/var/lib/mysql/mysql.sock' on Linux
-  # usually '/tmp/mysql.sock' on FreeBSD
-
-  # pre-query, executed before the main fetch query
-  # useful eg. to setup encoding or mark records
-  # optional, default is empty
-  #
+  index_html_attrs = img=alt,title; a=title;
+
+  sql_host = <%= sphinx_db_host %>
+  sql_user = <%= sphinx_db_user %>
+  sql_pass = <%= sphinx_db_pass %>
+  sql_db = <%= sphinx_db_name %>
+  sql_port = <%= sphinx_db_port %> # optional, default is 3306

   # sql_query_pre = SET CHARACTER_SET_RESULTS=cp1251
   sql_query_pre = SET NAMES UTF8
-
-  # main document fetch query
-  #
-  # you can specify up to 32 (formally SPH_MAX_FIELDS in sphinx.h) fields;
-  # all of the fields which are not document_id or attributes (see below)
-  # will be full-text indexed
-  #
-  # document_id MUST be the very first field
-  # document_id MUST be positive (non-zero, non-negative)
-  # document_id MUST fit into 32 bits
-  # document_id MUST be unique
-  #
   # mandatory
-  sql_query = \
-      SELECT id, UNIX_TIMESTAMP(created_at) AS created_at, body FROM todo
-
-  # query range setup
-  #
-  # useful to avoid MyISAM table locks and big result sets
-  # when indexing lots of data
-  #
-  # to use query ranges, you should
-  # 1) provide a query to fetch min/max id (ie. id range) from data set;
-  # 2) configure step size in which this range will be walked;
-  # 3) use $start and $end macros somewhere in the main fetch query.
-  #
-  # 'sql_query_range' must return exactly two integer fields
-  # in exactly min_id, max_id order
-  #
-  # 'sql_range_step' must be a positive integer
-  # optional, default is 1024
-  #
-  # 'sql_query' must contain both '$start' and '$end' macros
-  # if you are using query ranges (because it obviously would be an
-  # error to index the whole table many times)
-  #
-  # note that the intervals specified by $start/$end do not
-  # overlap, so you should NOT remove document ids which are exactly
-  # equal to $start or $end in your query
-  #
-  # here's an example which will index 'documents' table
-  # fetching (at most) one thousand entries at a time:
-  #
-  # sql_query_range = SELECT MIN(id),MAX(id) FROM documents
-  # sql_range_step = 1000
-  # sql_query = \
-  #     SELECT doc.id, doc.id AS group, doc.title, doc.data \
-  #     FROM documents doc \
-  #     WHERE id>=$start AND id<=$end
-
-
-  # attribute columns
-  #
-  # attribute values MUST be positive (non-zero, non-negative) integers
-  # attribute values MUST fit into 32 bits
-  #
-  # attributes are additional values associated with each document which
-  # may be used to perform additional filtering and sorting during search.
-  # attributes are NOT full-text indexed; they are stored in the full text
-  # index as is.
-  #
-  # a good example would be a forum posts table. one might need to search
-  # through 'title' and 'content' fields but to limit search to specific
-  # values of 'author_id', or 'forum_id', or to sort by 'post_date', or to
-  # group matches by 'thread_id', or to group posts by month of the
-  # 'post_date' and provide statistics.
-  #
-  # this all can be achieved by specifying all the mentioned columns
-  # (excluding 'title' and 'content' which are full-text fields) as
-  # attributes and then using API calls to setup filtering, sorting,
-  # and grouping.
-  #
-  # sql_group_column is used to declare integer attributes.
-  #
-  # sql_date_column is used to declare UNIX timestamp attributes.
-  #
-  # sql_str2ordinal_column is used to declare integer attributes which
-  # values are computed as ordinal numbers of corresponding column value
-  # in sorted list of column values. WARNING, all such strings values
-  # are going to be stored in RAM while indexing, and "C" locale will
-  # be used when sorting!
-  #
-  # starting with 0.9.7, there may be multiple attribute columns specified.
-  # here's an example for that mentioned posts table:
-  #
-  # sql_group_column = author_id
-  # sql_group_column = forum_id
-  # sql_group_column = thread_id
-  # sql_date_column = post_unix_timestamp
-  # sql_date_column = last_edit_unix_timestamp
-  #
-  # optional, default is empty
-  ##sql_group_column = group_id
-  sql_date_column = created_at
-  # sql_str2ordinal_column = author_name
-
-  # post-query, executed on the end of main fetch query
-  #
-  # note that indexing is NOT completed at the point when post-query
-  # gets executed and might very well fail
-  #
-  # optional, default is empty
-  ##sql_query_post =
-
-  # post-index-query, executed on succsefully completed indexing
-  #
-  # $maxid macro is the max document ID which was actually
-  # fetched from the database
-  #
-  # optional, default is empty
-  #
-  # sql_query_post_index = REPLACE INTO counters ( id, val ) \
-  #     VALUES ( 'max_indexed_id', $maxid )
-
-
-  # document info query
-  #
-  # ONLY used by search utility to display document information
-  # MUST be able to fetch document info by its id, therefore
-  # MUST contain '$id' macro
-  #
-  # optional, default is empty
-  ##sql_query_info = SELECT * FROM documents WHERE id=$id
-
-  #####################################################################
-
-  # demo config for 'xmlpipe' source type is a little below
-  #
-  # with xmlpipe, indexer opens a pipe to a given command,
-  # and then reads documents from stdin
-  #
-  # indexer expects one or more documents from xmlpipe stdin
-  # each document must be formatted exactly as follows:
-  #
-  # <document>
-  # <id>123</id>
-  # <group>45</group>
-  # <timestamp>1132223498</timestamp>
-  # <title>test title</title>
-  # <body>
-  # this is my document body
-  # </body>
-  # </document>
-  #
-  # timestamp element is optional, its default value is 1
-  # all the other elements are mandatory
-
-  # type = xmlpipe
-  # xmlpipe_command = cat /var/test.xml
+  sql_query = QUERY
+  sql_query_range = SELECT MIN(id),MAX(id) FROM TABLE_NAME
+  sql_range_step = 1000
+  sql_group_column = user_id
+  sql_group_column = language
+  sql_date_column = published_date
+  sql_date_column = last_modified
 }
-# inherited source example
-#
-# all the parameters are copied from the parent source,
-# and may then be overridden in this source definition
-##source src1stripped : src1
-##{
-##  strip_html = 1
-##}
-
-#############################################################################
-## index definition
-#############################################################################
-
-# local index example
-#
-# this is an index which is stored locally in the filesystem
-#
-# all indexing-time options (such as morphology and charsets)
-# are configured per local index
-index <%= application %>
+index pages
 {
-  # which document source to index
-  # at least one MUST be defined
-  #
-  # multiple sources MAY be specified; to do so, just add more
-  # "source = NAME" lines. in this case, ALL the document IDs
-  # in ALL the specified sources MUST be unique
-  source = <%= application %>
-
-  # this is path and index file name without extension
-  #
-  # indexer will append different extensions to this path to
-  # generate names for both permanent and temporary index files
-  #
-  # .tmp* files are temporary and can be safely removed
-  # if indexer fails to remove them automatically
-  #
-  # .sp* files are fulltext index data files. specifically,
-  # .spa contains attribute values attached to each document id
-  # .spd contains doclists and hitlists
-  # .sph contains index header (schema and other settings)
-  # .spi contains wordlists
-  #
-  # MUST be defined
-  path = /var/sphinx/<%= application %>
-
-  # docinfo (ie. per-document attribute values) storage strategy
-  # defines how docinfo will be stored
-  #
-  # available values are "none", "inline" and "extern"
-  #
-  # "none" means there'll be no docinfo at all (no groups/dates)
-  #
-  # "inline" means that the docinfo will be stored in the .spd
-  # file along with the document ID lists (doclists)
-  #
-  # "extern" means that the docinfo will be stored in the .spa
-  # file separately
-  #
-  # externally stored docinfo should (basically) be kept in RAM
-  # when querying; therefore, "inline" may be the only viable option
-  # for really huge (50-100+ million docs) datasets. however, for
-  # smaller datasets "extern" storage makes both indexing and
-  # searching MUCH more efficient.
-  #
-  # additional search-time memory requirements for extern storage are
-  #
-  # ( 1 + number_of_attrs )*number_of_docs*4 bytes
-  #
-  # so 10 million docs with 2 groups and 1 timestamp will take
-  # (1+2+1)*10M*4 = 160 MB of RAM. this is PER DAEMON, ie. searchd
-  # will alloc 160 MB on startup, read the data and keep it shared
-  # between queries; the children will NOT allocate additional
-  # copies of this data.
-  #
-  # default is "extern" (as most collections are smaller than 100M docs)
+  source = pages
+  path = <%= sphinx_index_root %>/pages
   docinfo = extern
-
-  # morphology
-  #
-  # currently supported morphology preprocessors are Porter stemmers
-  # for English and Russian, and Soundex. more stemmers could be added
-  # at users request.
-  #
-  # available values are "none", "stem_en", "stem_ru", "stem_enru",
-  # and "soundex"
-  #
-  # optional, default is "none"
-  #
-  # morphology = none
-  # morphology = stem_en
-  # morphology = stem_ru
-  # morphology = stem_enru
-  # morphology = soundex
-  morphology = none
-
-  # stopwords file
-  #
-  # format is plain text in whatever encoding you use
-  # optional, default is empty
-  #
-  # stopwords = /var/data/stopwords.txt
-  #stopwords =
-
-  # minimum word length
-  #
-  # only the words that are of this length and above will be indexed;
-  # for example, if min_word_len is 4, "the" won't be indexed,
-  # but "they" will be.
-  #
-  # default is 1, which (obviously) means to index everything
+  morphology = stem_en
+  stopwords = <%= sphinx_conf_path %>/stopwords.txt
   min_word_len = 1
-
-  # charset encoding type
-  #
-  # known types are 'sbcs' (Single Byte CharSet) and 'utf-8'
-  #
-  # optional, default is sbcs
   charset_type = utf-8
-
-  # charset definition and case folding rules "table"
-  #
-  # optional, default value depends on charset_type
-  #
-  # for now, defaults are configured to support English and Russian
-  # this behavior MAY change in future versions
-  #
-  # 'sbcs' default value is
-  # charset_table = 0..9, A..Z->a..z, _, a..z, U+A8->U+B8, U+B8, U+C0..U+DF->U+E0..U+FF, U+E0..U+FF
-  #
-  # 'utf-8' default value is
-  # charset_table = 0..9, A..Z->a..z, _, a..z, U+410..U+42F->U+430..U+44F, U+430..U+44F
-
-  # minimum prefix length
-  #
-  # if prefix length is positive, indexer will not only index all words,
-  # but all the possible prefixes (ie. word beginnings) as well
-  #
-  # for instance, "exam" query against such index will match documents
-  # which contain "example" word, even if they do not contain "exam"
-  #
-  # indexing prefixes will make the index grow significantly
-  # and could degrade search times
-  #
-  # currently there's no way to rank perfect word matches higher
-  # than prefix matches using only one index; you could setup two
-  # indexes for that
-  #
-  # default is 0, which means NOT to index prefixes
   min_prefix_len = 0
-
-  # minimum infix length
-  #
-  # if infix length is positive, indexer will not only index all words,
-  # but all the possible infixes (ie. characters subsequences starting
-  # anywhere inside the word) as well
-  #
-  # for instance, "amp" query against such index will match documents
-  # which contain "example" word, even if they do not contain "amp"
-  #
-  # indexing prefixes will make the index grow significantly
-  # and could degrade search times
-  #
-  # currently there's no way to rank perfect word matches higher
-  # than infix matches using only one index; you could setup two
-  # indexes for that
-  #
-  # default is 0, which means NOT to index infixes
   min_infix_len = 0
-
-  # n-grams length
-  #
-  # n-grams provide basic CJK support for unsegmented texts. if using
-  # n-grams, streams of CJK characters are indexed as n-grams. for example,
-  # if incoming stream is ABCDEF and n is 2, this text would be indexed
-  # as if it was AB BC CD DE EF.
-  #
-  # this feature is in alpha version state and only n=1 is currently
-  # supported; this is going to be improved.
-  #
-  # note that if search query is segmented (ie. words are separated with
-  # whitespace), words are in quotes and extended matching mode is used,
-  # then all matching documents will be returned even if their text was
-  # *not* segmented. in the example above, ABCDEF text will be indexed as
-  # A B C D E F, and "BCD" query will be transformed to "B C D" (where
-  # quotes is phrase matching operator), so the document will match.
-  #
-  # optional, default is 0, which means NOT to use n-grams
-  #
-  # ngram_len = 1
-
-  # n-gram characters table
-  #
-  # specifies what specific characters are subject to n-gram
-  # extraction. format is similar to charset_table.
-  #
-  # optional, default is empty
-  #
-  # ngrams_chars = U+3000..U+2FA1F
 }
-
-# inherited index example
-#
-# all the parameters are copied from the parent index,
-# and may then be overridden in this index definition
-##index test1stemmed : test1
-##{
-##  path = /var/data/test1stemmed
-##  morphology = stem_en
-##}
-
-
 #############################################################################
 ## indexer settings
 #############################################################################

 indexer
@@ -430,11 +59,11 @@
   #
   # will grow if set unacceptably low
   # will warn if set too low and potentially hurting the performance
   #
   # optional, default is 32M
-  mem_limit = 32M
+  mem_limit = 64M
 }

 #############################################################################
 ## searchd settings
 #############################################################################
@@ -445,26 +74,26 @@
   # incoming network requests
   #
   # optional, default is to listen on all addresses,
   # ie. address = 0.0.0.0
   #
-  address = 127.0.0.1
+  address = <%= sphinx_host %>
   # address = 192.168.0.1

   # port on which search daemon will listen
-  port = 3312
+  port = <%= sphinx_port %>

   # log file
   # searchd run info is logged here
-  log = /var/sphinx/searchd.log
+  log = <%= sphinx_log_root %>/searchd.log

   # query log file
   # all the search queries are logged here
-  query_log = /var/sphinx/query.log
+  query_log = <%= sphinx_log_root %>/query.log

   # client read timeout, seconds
   read_timeout = 5
@@ -475,10 +104,10 @@
   # a file which will contain searchd process ID
   # used for different external automation scripts
   # MUST be present
-  pid_file = /var/sphinx/searchd.pid
+  pid_file = <%= sphinx_pid_path %>

   # maximum amount of matches this daemon would ever retrieve
   # from each index and serve to client
   #
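
Note on the placeholders above: `sql_query = QUERY` and `FROM TABLE_NAME` are deliberate stand-ins in the 0.2.1 template, left for a per-application recipe to substitute, so they are kept verbatim in the diff. As a purely hypothetical illustration (the `pages` table and its `title`/`body` columns are assumptions, not anything shipped by capitate), a filled-in ranged fetch matching the declared attributes could look like:

    # Hypothetical example only; table and column names are illustrative.
    sql_query_range = SELECT MIN(id),MAX(id) FROM pages
    sql_range_step  = 1000
    # $start/$end are expanded by the indexer on each range step, so the
    # table is fetched in 1000-row slices instead of one giant result set.
    sql_query = \
        SELECT id, user_id, language, \
            UNIX_TIMESTAMP(published_date) AS published_date, \
            UNIX_TIMESTAMP(last_modified) AS last_modified, \
            title, body \
        FROM pages \
        WHERE id >= $start AND id <= $end

The attribute columns declared in the template (`user_id`, `language`, `published_date`, `last_modified`) must appear in the select list; everything else after the leading document id (`title` and `body` here) is what gets full-text indexed.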
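The other substantive change in 0.2.1 is that every environment-specific value is now pulled from a Capistrano variable instead of being hard-coded in the template. A minimal sketch of deploy settings that would satisfy the template's bindings (the variable names come from the template itself; the values are illustrative assumptions, not capitate's defaults):

    # Sketch only: names match the <%= ... %> bindings above; the values
    # are examples, not defaults shipped with capitate.
    set :sphinx_db_host,    "127.0.0.1"
    set :sphinx_db_port,    3306
    set :sphinx_db_user,    "app"
    set :sphinx_db_pass,    "secret"
    set :sphinx_db_name,    "app_production"
    set :sphinx_host,       "127.0.0.1"             # searchd bind address
    set :sphinx_port,       3312                    # searchd port (Sphinx default)
    set :sphinx_conf_path,  "/etc/sphinx"           # stopwords.txt is expected here
    set :sphinx_index_root, "/var/sphinx/index"
    set :sphinx_log_root,   "/var/log/sphinx"
    set :sphinx_pid_path,   "/var/run/sphinx/searchd.pid"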