lib/dexter/indexer.rb in pgdexter-0.1.3 vs lib/dexter/indexer.rb in pgdexter-0.1.4
- old
+ new
@@ -6,10 +6,11 @@
@database_url = database_url
@create = options[:create]
@log_level = options[:log_level]
@exclude_tables = options[:exclude]
@log_sql = options[:log_sql]
+ @log_explain = options[:log_explain]
create_extension
end
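
The new @log_explain option gates the raw EXPLAIN logging added to calculate_plan below. A minimal sketch of switching it on; only the options hash shape is taken from the code above, and the CLI flag name is an assumption based on the option key:

    # hypothetical invocation (a --log-explain CLI flag is assumed)
    indexer = Dexter::Indexer.new(database_url,
      log_explain: true,   # new in 0.1.4: print EXPLAIN (FORMAT TEXT) per query
      log_level: "debug2"
    )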
def process_queries(queries)
@@ -29,38 +30,35 @@
end
# analyze tables if needed
analyze_tables(tables) if tables.any?
- # get initial costs for queries
- calculate_initial_cost(queries.reject(&:missing_tables))
+ # create hypothetical indexes and explain queries
+ candidates = tables.any? ? create_hypothetical_indexes(queries.reject(&:missing_tables), tables) : {}
- # create hypothetical indexes
- candidates = tables.any? ? create_hypothetical_indexes(tables) : {}
-
- # get new costs and see if new indexes were used
+ # see if new indexes were used and meet the cost-savings bar
new_indexes = determine_indexes(queries, candidates)
# display and create new indexes
show_and_create_indexes(new_indexes)
end
private
def create_extension
- select_all("SET client_min_messages = warning")
- select_all("CREATE EXTENSION IF NOT EXISTS hypopg")
+ execute("SET client_min_messages = warning")
+ execute("CREATE EXTENSION IF NOT EXISTS hypopg")
end
def reset_hypothetical_indexes
- select_all("SELECT hypopg_reset()")
+ execute("SELECT hypopg_reset()")
end
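
For context, hypopg keeps hypothetical indexes in session-local backend memory, so a plain SELECT both creates and clears them and nothing is ever written to disk. A minimal round trip through the same execute helper (table and column names invented):

    row = execute("SELECT * FROM hypopg_create_index('CREATE INDEX ON users (email)')").first
    row["indexname"]                  # e.g. "<18284>btree_users_email"; the planner can now pick it
    execute("SELECT hypopg_reset()")  # drops every hypothetical index in this session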
def analyze_tables(tables)
tables = tables.to_a.sort
- analyze_stats = select_all <<-SQL
+ analyze_stats = execute <<-SQL
SELECT
schemaname AS schema,
relname AS table,
last_analyze,
last_autoanalyze
@@ -77,60 +75,84 @@
tables.each do |table|
if !last_analyzed[table] || last_analyzed[table] < Time.now - 3600
statement = "ANALYZE #{quote_ident(table)}"
log "Running analyze: #{statement}"
- select_all(statement)
+ execute(statement)
end
end
end
- def calculate_initial_cost(queries)
+ def calculate_plan(queries)
queries.each do |query|
begin
- query.initial_cost = plan(query.statement)["Total Cost"]
+ query.plans << plan(query.statement)
+ if @log_explain
+ log "Explaining query"
+ puts
+ # Pass a format so a statement beginning with ANALYZE can't run as EXPLAIN ANALYZE
+ puts execute("EXPLAIN (FORMAT TEXT) #{safe_statement(query.statement)}").map { |r| r["QUERY PLAN"] }.join("\n")
+ puts
+ end
rescue PG::Error
# do nothing
end
end
end
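
calculate_plan appends one plan per round to query.plans; the Query methods the indexer leans on (explainable?, costs, initial_cost) live in lib/dexter/query.rb and presumably reduce each stored plan to its Total Cost, roughly like this sketch:

    class Query
      attr_accessor :new_cost  # written by determine_indexes below

      def plans
        @plans ||= []          # appended to by calculate_plan above
      end

      def explainable?
        plans.any?             # at least one EXPLAIN succeeded
      end

      def costs
        plans.map { |plan| plan["Total Cost"] }
      end

      def initial_cost
        costs[0]               # cost before any hypothetical indexes existed
      end
    end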
- def create_hypothetical_indexes(tables)
+ def create_hypothetical_indexes(queries, tables)
+ # get initial costs for queries
+ calculate_plan(queries)
+ explainable_queries = queries.select(&:explainable?)
+
# get existing indexes
index_set = Set.new
indexes(tables).each do |index|
# TODO make sure btree
index_set << [index["table"], index["columns"]]
end
# create hypothetical indexes
candidates = {}
- columns(tables).each do |col|
- unless index_set.include?([col[:table], [col[:column]]])
- unless ["json", "jsonb"].include?(col[:type])
- candidates[col] = select_all("SELECT * FROM hypopg_create_index('CREATE INDEX ON #{quote_ident(col[:table])} (#{[col[:column]].map { |c| quote_ident(c) }.join(", ")})')").first["indexname"]
- end
- end
- end
+ columns_by_table = columns(tables).group_by { |c| c[:table] }
+
+ # create single column indexes
+ create_hypothetical_indexes_helper(columns_by_table, 1, index_set, candidates)
+
+ # get next round of costs
+ calculate_plan(explainable_queries)
+
+ # create multicolumn indexes
+ create_hypothetical_indexes_helper(columns_by_table, 2, index_set, candidates)
+
+ # get next round of costs
+ calculate_plan(explainable_queries)
+
candidates
end
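
The sequencing matters: the calculate_plan call at the top captures plans[0] against real indexes only, the next captures plans[1] with single-column hypothetical indexes in place, and the last captures plans[2] with two-column ones added on top. determine_indexes below reads query.costs by exactly those positions. Illustrative figures (invented):

    # query.costs  # => [1000.0, 420.0, 180.0]
    #   1000.0  plans[0]: baseline, real indexes only
    #    420.0  plans[1]: single-column hypothetical indexes available
    #    180.0  plans[2]: two-column hypothetical indexes also available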
def determine_indexes(queries, candidates)
new_indexes = {}
queries.each do |query|
- if query.initial_cost
- new_plan = plan(query.statement)
- query.new_cost = new_plan["Total Cost"]
- cost_savings = query.new_cost < query.initial_cost * 0.5
+ if query.explainable?
+ new_cost, new_cost2 = query.costs[1..2]
+ cost_savings = new_cost < query.initial_cost * 0.5
+ # set high bar for multicolumn indexes
+ cost_savings2 = new_cost > 100 && new_cost2 < new_cost * 0.5
+
+ query.new_cost = cost_savings2 ? new_cost2 : new_cost
+
query_indexes = []
- candidates.each do |col, index_name|
- if new_plan.inspect.include?(index_name)
+ candidates.each do |col_set, index_name|
+ key = cost_savings2 ? 2 : 1
+
+ if query.plans[key].inspect.include?(index_name)
index = {
- table: col[:table],
- columns: [col[:column]]
+ table: col_set[0][:table],
+ columns: col_set.map { |c| c[:column] }
}
query_indexes << index
if cost_savings
new_indexes[index] ||= index.dup
@@ -140,16 +162,16 @@
end
end
if @log_level == "debug2"
log "Processed #{query.fingerprint}"
- if query.initial_cost
+ if query.explainable?
log "Cost: #{query.initial_cost} -> #{query.new_cost}"
if query_indexes.any?
log "Indexes: #{query_indexes.map { |i| "#{i[:table]} (#{i[:columns].join(", ")})" }.join(", ")}"
- log "Need 50% cost savings to suggest index" unless cost_savings
+ log "Need 50% cost savings to suggest index" unless cost_savings || cost_savings2
else
log "Indexes: None"
end
elsif query.fingerprint == "unknown"
log "Could not parse query"
@@ -193,11 +215,11 @@
# 4. release lock
new_indexes.each do |index|
statement = "CREATE INDEX CONCURRENTLY ON #{quote_ident(index[:table])} (#{index[:columns].map { |c| quote_ident(c) }.join(", ")})"
log "Creating index: #{statement}"
started_at = Time.now
- select_all(statement)
+ execute(statement)
log "Index created: #{((Time.now - started_at) * 1000).to_i} ms"
end
end
else
log "No new indexes found"
@@ -221,11 +243,11 @@
end
rescue PG::ConnectionBad
abort "Bad database url"
end
- def select_all(query)
+ def execute(query)
# use exec_params instead of exec for security
#
# Unlike PQexec, PQexecParams allows at most one SQL command in the given string.
# (There can be semicolons in it, but not more than one nonempty command.)
# This is a limitation of the underlying protocol, but has some usefulness
@@ -236,15 +258,27 @@
conn.exec_params(query, []).to_a
end
def plan(query)
# strip semi-colons as another measure of defense
- JSON.parse(select_all("EXPLAIN (FORMAT JSON) #{query.gsub(";", "")}").first["QUERY PLAN"]).first["Plan"]
+ JSON.parse(execute("EXPLAIN (FORMAT JSON) #{safe_statement(query)}").first["QUERY PLAN"]).first["Plan"]
end
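
For reference, EXPLAIN (FORMAT JSON) returns a single row whose QUERY PLAN column is a JSON array holding one object with a Plan key, which is what the two .first calls dig through; a trimmed example (shape per PostgreSQL, figures invented):

    # execute(...).first["QUERY PLAN"] is a JSON string such as:
    #   [{"Plan": {"Node Type": "Seq Scan", "Relation Name": "users",
    #              "Startup Cost": 0.00, "Total Cost": 1540.00, "Plan Rows": 82000}}]
    # JSON.parse(...).first["Plan"]["Total Cost"]  # => 1540.0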
+ # TODO for multicolumn indexes, use ordering
+ def create_hypothetical_indexes_helper(columns_by_table, n, index_set, candidates)
+ columns_by_table.each do |table, cols|
+ # no reason to use a btree index for json columns
+ cols.reject { |c| ["json", "jsonb"].include?(c[:type]) }.permutation(n) do |col_set|
+ if !index_set.include?([table, col_set.map { |col| col[:column] }])
+ candidates[col_set] = execute("SELECT * FROM hypopg_create_index('CREATE INDEX ON #{quote_ident(table)} (#{col_set.map { |c| quote_ident(c[:column]) }.join(", ")})')").first["indexname"]
+ end
+ end
+ end
+ end
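+
Note that permutation is order-sensitive, so n = 2 creates a hypothetical index for every ordered pair: k eligible columns mean k * (k - 1) two-column candidates per table (hypopg makes this cheap, but it still grows quickly). The ordering TODO above presumably refers to picking a principled column order instead. For example:

    [:a, :b, :c].permutation(2).to_a
    # => [[:a, :b], [:a, :c], [:b, :a], [:b, :c], [:c, :a], [:c, :b]]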
+
def database_tables
- result = select_all <<-SQL
+ result = execute <<-SQL
SELECT
table_name
FROM
information_schema.tables
WHERE
@@ -257,27 +291,29 @@
def possible_tables(queries)
Set.new(queries.flat_map(&:tables).uniq & database_tables)
end
def columns(tables)
- columns = select_all <<-SQL
+ columns = execute <<-SQL
SELECT
table_name,
column_name,
data_type
FROM
information_schema.columns
WHERE
table_schema = 'public' AND
table_name IN (#{tables.map { |t| quote(t) }.join(", ")})
+ ORDER BY
+ 1, 2
SQL
columns.map { |v| {table: v["table_name"], column: v["column_name"], type: v["data_type"]} }
end
def indexes(tables)
- select_all(<<-SQL
+ execute(<<-SQL
SELECT
schemaname AS schema,
t.relname AS table,
ix.relname AS name,
regexp_replace(pg_get_indexdef(i.indexrelid), '^[^\\(]*\\((.*)\\)$', '\\1') AS columns,
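
The regexp_replace pulls the parenthesized column list out of the index definition, for example (output shaped like pg_get_indexdef's, names invented):

    # pg_get_indexdef(i.indexrelid)
    #   => "CREATE INDEX index_users_on_email ON users USING btree (email)"
    # after regexp_replace with '^[^\(]*\((.*)\)$' -> '\1'
    #   => "email"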
@@ -328,8 +364,12 @@
end
# from activesupport
def squish(str)
str.to_s.gsub(/\A[[:space:]]+/, "").gsub(/[[:space:]]+\z/, "").gsub(/[[:space:]]+/, " ")
+ end
+
+ def safe_statement(statement)
+ statement.gsub(";", "")
end
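
One caveat worth noting: the blanket gsub also strips semicolons inside string literals, so a legitimate query can come out syntactically invalid. That is harmless here, since the result is only ever handed to EXPLAIN, and a failing EXPLAIN is swallowed by the PG::Error rescue in calculate_plan:

    safe_statement("SELECT 1; DROP TABLE users")
    # => "SELECT 1 DROP TABLE users"   (mangled into one invalid command; EXPLAIN just errors)
    safe_statement("SELECT ';' AS sep")
    # => "SELECT '' AS sep"            (literal silently altered)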
end
end