# sphinx config

source pages
{
    type = mysql

    # whether to strip HTML
    # values can be 0 (don't strip) or 1 (do strip)
    # WARNING, only works with mysql source for now
    # WARNING, should work ok for PERFECTLY formed XHTML for now
    # WARNING, may misbehave on malformed everyday HTML
    # optional, default is 0
    strip_html = 1

    # which HTML attributes to index when stripping HTML
    # format is as follows:
    # index_html_attrs = img=alt,title; a=title;

    sql_host = <%= sphinx_db_host %>
    sql_user = <%= sphinx_db_user %>
    sql_pass = <%= sphinx_db_pass %>
    sql_db   = <%= sphinx_db_name %>
    sql_port = <%= sphinx_db_port %> # optional, default is 3306

    sql_query_pre = SET NAMES UTF8
    sql_query_pre = SET SESSION query_cache_type=OFF
    # record that a full rebuild of "pages" has started on this host
    sql_query_pre = INSERT INTO indexer_status (id, started_at, status, index_name, hostname) VALUES (10, NOW(), 'indexing', 'pages', '<%= sphinx_hostname %>') \
        ON DUPLICATE KEY UPDATE started_at = NOW(), status = 'indexing'

    sql_query = SELECT id, user_id, language, UNIX_TIMESTAMP(created_at) AS created_at, UNIX_TIMESTAMP(updated_at) AS updated_at, body, title FROM pages WHERE id >= $start AND id <= $end

    sql_query_range = SELECT MIN(id), MAX(id) FROM pages WHERE type = 'Article'
    sql_range_step = 1000

    sql_query_post = UPDATE indexer_status SET updated_at = NOW(), status = 'updated' WHERE index_name = 'pages' AND hostname = '<%= sphinx_hostname %>'

    sql_attr_uint = user_id
    sql_attr_timestamp = created_at
    sql_attr_timestamp = updated_at
}

source pages_delta : pages
{
    # clear and reset the inherited sql_query_pre list
    sql_query_pre =
    sql_query_pre = SET NAMES UTF8
    sql_query_pre = SET SESSION query_cache_type=OFF
    sql_query_pre = INSERT INTO indexer_status (id, started_at, status, index_name, hostname) VALUES (11, NOW(), 'indexing', 'pages_delta', '<%= sphinx_hostname %>') \
        ON DUPLICATE KEY UPDATE started_at = NOW(), status = 'indexing'

    # index only the rows touched since the delta's last recorded run
    sql_query = SELECT id, user_id, language, UNIX_TIMESTAMP(created_at) AS created_at, UNIX_TIMESTAMP(updated_at) AS updated_at, body, title \
        FROM pages \
        WHERE updated_at >= (SELECT updated_at FROM indexer_status WHERE id = 11)

    sql_query_post =
    sql_query_post = UPDATE indexer_status SET updated_at = NOW(), status = 'updated' WHERE index_name = 'pages_delta' AND hostname = '<%= sphinx_hostname %>'

    # the delta is small, so disable the ranged fetching inherited from pages
    sql_query_range =
    sql_range_step =
}

index pages
{
    source = pages
    path = <%= sphinx_index_root %>/pages
    docinfo = extern
    morphology = stem_en
    stopwords = <%= sphinx_conf_path %>/stopwords.txt
    min_word_len = 1
    charset_type = utf-8
    min_prefix_len = 0
    min_infix_len = 0
}

index pages_delta : pages
{
    source = pages_delta
    path = <%= sphinx_index_root %>/pages_delta
}
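
# A note on driving the two indexes above: the full "pages" index is rebuilt
# infrequently, while "pages_delta" is reindexed often and only picks up rows
# whose updated_at is newer than the timestamp recorded in indexer_status
# (id = 11). A minimal sketch of the matching cron jobs follows; the rendered
# config path and the schedule are assumptions, and --merge requires
# Sphinx 0.9.8 or later (on older versions, just rebuild "pages" instead):
#
#   # every 5 minutes: rebuild the delta and hot-swap it into searchd
#   */5 * * * *  indexer --config /usr/local/etc/sphinx.conf --rotate pages_delta
#
#   # nightly: fold the delta back into the main index
#   0 3 * * *    indexer --config /usr/local/etc/sphinx.conf --merge pages pages_delta --rotate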
#############################################################################
## indexer settings
#############################################################################

indexer
{
    # memory limit
    #
    # may be specified in bytes (no postfix), kilobytes (mem_limit=1000K)
    # or megabytes (mem_limit=10M)
    #
    # will grow if set unacceptably low
    # will warn if set too low, as that may hurt performance
    #
    # optional, default is 32M
    mem_limit = 64M
}

#############################################################################
## searchd settings
#############################################################################

searchd
{
    # IP address on which the search daemon will bind and accept
    # incoming network requests
    #
    # optional, default is to listen on all addresses,
    # i.e. address = 0.0.0.0
    #
    # address = <%= sphinx_host %>
    # address = 192.168.0.1

    # port on which the search daemon will listen
    port = <%= sphinx_port %>

    # log file
    # searchd run info is logged here
    log = <%= sphinx_log_root %>/searchd.log

    # query log file
    # all search queries are logged here
    query_log = <%= sphinx_log_root %>/query.log

    # client read timeout, in seconds
    read_timeout = 5

    # maximum number of children to fork
    # useful to control server load
    max_children = 30

    # file which will contain the searchd process ID
    # used by various external automation scripts
    # MUST be present
    pid_file = <%= sphinx_pid_path %>

    # maximum number of matches this daemon will ever retrieve
    # from each index and serve to a client
    #
    # this parameter affects per-client memory and CPU usage
    # (16+ bytes per match) in the match sorting phase; blindly raising
    # it to 1 million is definitely NOT recommended
    #
    # starting from 0.9.7, it can be decreased on the fly through
    # the corresponding API call; increasing it is prohibited to protect
    # against malicious and/or malformed requests
    #
    # default is 1000 (just like with Google)
    max_matches = 1000
}

# --eof--
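
# Quick smoke test once the template has been rendered and searchd has been
# started with this config. The config path is an assumption; "search" is
# the command-line client bundled with Sphinx 0.9.x:
#
#   searchd --config /usr/local/etc/sphinx.conf
#   search --config /usr/local/etc/sphinx.conf --index pages "some words"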