lib/dexter/indexer.rb in pgdexter-0.1.3 vs lib/dexter/indexer.rb in pgdexter-0.1.4

- old
+ new

@@ -6,10 +6,11 @@
       @database_url = database_url
       @create = options[:create]
       @log_level = options[:log_level]
       @exclude_tables = options[:exclude]
       @log_sql = options[:log_sql]
+      @log_explain = options[:log_explain]

       create_extension
     end

     def process_queries(queries)
@@ -29,38 +30,35 @@
       end

       # analyze tables if needed
       analyze_tables(tables) if tables.any?

-      # get initial costs for queries
-      calculate_initial_cost(queries.reject(&:missing_tables))
+      # create hypothetical indexes and explain queries
+      candidates = tables.any? ? create_hypothetical_indexes(queries.reject(&:missing_tables), tables) : {}

-      # create hypothetical indexes
-      candidates = tables.any? ? create_hypothetical_indexes(tables) : {}
-
-      # get new costs and see if new indexes were used
+      # see if new indexes were used and meet bar
       new_indexes = determine_indexes(queries, candidates)

       # display and create new indexes
       show_and_create_indexes(new_indexes)
     end

     private

     def create_extension
-      select_all("SET client_min_messages = warning")
-      select_all("CREATE EXTENSION IF NOT EXISTS hypopg")
+      execute("SET client_min_messages = warning")
+      execute("CREATE EXTENSION IF NOT EXISTS hypopg")
     end

     def reset_hypothetical_indexes
-      select_all("SELECT hypopg_reset()")
+      execute("SELECT hypopg_reset()")
     end

     def analyze_tables(tables)
       tables = tables.to_a.sort

-      analyze_stats = select_all <<-SQL
+      analyze_stats = execute <<-SQL
         SELECT
           schemaname AS schema,
           relname AS table,
           last_analyze,
           last_autoanalyze
@@ -77,60 +75,84 @@
       tables.each do |table|
         if !last_analyzed[table] || last_analyzed[table] < Time.now - 3600
           statement = "ANALYZE #{quote_ident(table)}"
           log "Running analyze: #{statement}"
-          select_all(statement)
+          execute(statement)
         end
       end
     end

-    def calculate_initial_cost(queries)
+    def calculate_plan(queries)
       queries.each do |query|
         begin
-          query.initial_cost = plan(query.statement)["Total Cost"]
+          query.plans << plan(query.statement)
+          if @log_explain
+            log "Explaining query"
+            puts
+            # Pass format to prevent ANALYZE
+            puts execute("EXPLAIN (FORMAT TEXT) #{safe_statement(query.statement)}").map { |r| r["QUERY PLAN"] }.join("\n")
+            puts
+          end
         rescue PG::Error
           # do nothing
         end
       end
     end

-    def create_hypothetical_indexes(tables)
+    def create_hypothetical_indexes(queries, tables)
+      # get initial costs for queries
+      calculate_plan(queries)
+      explainable_queries = queries.select(&:explainable?)
+
+      # get existing indexes
       index_set = Set.new
       indexes(tables).each do |index|
         # TODO make sure btree
         index_set << [index["table"], index["columns"]]
       end

       # create hypothetical indexes
       candidates = {}
-      columns(tables).each do |col|
-        unless index_set.include?([col[:table], [col[:column]]])
-          unless ["json", "jsonb"].include?(col[:type])
-            candidates[col] = select_all("SELECT * FROM hypopg_create_index('CREATE INDEX ON #{quote_ident(col[:table])} (#{[col[:column]].map { |c| quote_ident(c) }.join(", ")})')").first["indexname"]
-          end
-        end
-      end
+      columns_by_table = columns(tables).group_by { |c| c[:table] }
+
+      # create single column indexes
+      create_hypothetical_indexes_helper(columns_by_table, 1, index_set, candidates)
+
+      # get next round of costs
+      calculate_plan(explainable_queries)
+
+      # create multicolumn indexes
+      create_hypothetical_indexes_helper(columns_by_table, 2, index_set, candidates)
+
+      # get next round of costs
+      calculate_plan(explainable_queries)
+
       candidates
     end

     def determine_indexes(queries, candidates)
       new_indexes = {}

       queries.each do |query|
-        if query.initial_cost
-          new_plan = plan(query.statement)
-          query.new_cost = new_plan["Total Cost"]
-          cost_savings = query.new_cost < query.initial_cost * 0.5
+        if query.explainable?
+          new_cost, new_cost2 = query.costs[1..2]
+
+          cost_savings = new_cost < query.initial_cost * 0.5
+          # set high bar for multicolumn indexes
+          cost_savings2 = new_cost > 100 && new_cost2 < new_cost * 0.5
+
+          query.new_cost = cost_savings2 ? new_cost2 : new_cost
+
           query_indexes = []
-          candidates.each do |col, index_name|
-            if new_plan.inspect.include?(index_name)
+          candidates.each do |col_set, index_name|
+            key = cost_savings2 ? 2 : 1
+
+            if query.plans[key].inspect.include?(index_name)
               index = {
-                table: col[:table],
-                columns: [col[:column]]
+                table: col_set[0][:table],
+                columns: col_set.map { |c| c[:column] }
               }
               query_indexes << index

               if cost_savings
                 new_indexes[index] ||= index.dup
@@ -140,16 +162,16 @@
           end
         end

         if @log_level == "debug2"
           log "Processed #{query.fingerprint}"
-          if query.initial_cost
+          if query.explainable?
             log "Cost: #{query.initial_cost} -> #{query.new_cost}"

             if query_indexes.any?
               log "Indexes: #{query_indexes.map { |i| "#{i[:table]} (#{i[:columns].join(", ")})" }.join(", ")}"
-              log "Need 50% cost savings to suggest index" unless cost_savings
+              log "Need 50% cost savings to suggest index" unless cost_savings || cost_savings2
             else
               log "Indexes: None"
             end
           elsif query.fingerprint == "unknown"
             log "Could not parse query"
@@ -193,11 +215,11 @@
           # 4. release lock
           new_indexes.each do |index|
             statement = "CREATE INDEX CONCURRENTLY ON #{quote_ident(index[:table])} (#{index[:columns].map { |c| quote_ident(c) }.join(", ")})"
             log "Creating index: #{statement}"
             started_at = Time.now
-            select_all(statement)
+            execute(statement)
             log "Index created: #{((Time.now - started_at) * 1000).to_i} ms"
           end
         end
       else
         log "No new indexes found"
@@ -221,11 +243,11 @@
       end
     rescue PG::ConnectionBad
       abort "Bad database url"
     end

-    def select_all(query)
+    def execute(query)
       # use exec_params instead of exec for security
       #
       # Unlike PQexec, PQexecParams allows at most one SQL command in the given string.
       # (There can be semicolons in it, but not more than one nonempty command.)
       # This is a limitation of the underlying protocol, but has some usefulness
@@ -236,15 +258,27 @@
       conn.exec_params(query, []).to_a
     end

     def plan(query)
       # strip semi-colons as another measure of defense
-      JSON.parse(select_all("EXPLAIN (FORMAT JSON) #{query.gsub(";", "")}").first["QUERY PLAN"]).first["Plan"]
+      JSON.parse(execute("EXPLAIN (FORMAT JSON) #{safe_statement(query)}").first["QUERY PLAN"]).first["Plan"]
     end

+    # TODO for multicolumn indexes, use ordering
+    def create_hypothetical_indexes_helper(columns_by_table, n, index_set, candidates)
+      columns_by_table.each do |table, cols|
+        # no reason to use btree index for json columns
+        cols.reject { |c| ["json", "jsonb"].include?(c[:type]) }.permutation(n) do |col_set|
+          if !index_set.include?([table, col_set.map { |col| col[:column] }])
+            candidates[col_set] = execute("SELECT * FROM hypopg_create_index('CREATE INDEX ON #{quote_ident(table)} (#{col_set.map { |c| quote_ident(c[:column]) }.join(", ")})')").first["indexname"]
+          end
+        end
+      end
+    end
+
     def database_tables
-      result = select_all <<-SQL
+      result = execute <<-SQL
         SELECT
           table_name
         FROM
           information_schema.tables
         WHERE
@@ -257,27 +291,29 @@
     def possible_tables(queries)
       Set.new(queries.flat_map(&:tables).uniq & database_tables)
     end

     def columns(tables)
-      columns = select_all <<-SQL
+      columns = execute <<-SQL
         SELECT
           table_name,
           column_name,
           data_type
         FROM
           information_schema.columns
         WHERE
           table_schema = 'public' AND
           table_name IN (#{tables.map { |t| quote(t) }.join(", ")})
+        ORDER BY
+          1, 2
       SQL

       columns.map { |v| {table: v["table_name"], column: v["column_name"], type: v["data_type"]} }
     end

     def indexes(tables)
-      select_all(<<-SQL
+      execute(<<-SQL
         SELECT
           schemaname AS schema,
           t.relname AS table,
           ix.relname AS name,
           regexp_replace(pg_get_indexdef(i.indexrelid), '^[^\\(]*\\((.*)\\)$', '\\1') AS columns,
@@ -328,8 +364,12 @@
     end

     # from activesupport
     def squish(str)
       str.to_s.gsub(/\A[[:space:]]+/, "").gsub(/[[:space:]]+\z/, "").gsub(/[[:space:]]+/, " ")
+    end
+
+    def safe_statement(statement)
+      statement.gsub(";", "")
     end
   end
 end
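For context, the indexer above is built around the hypopg extension: hypopg_create_index registers an index that exists only in the current session's planner, a plain EXPLAIN (never EXPLAIN ANALYZE) then reports costs as if the index were real, and hypopg_reset discards all hypothetical indexes. A minimal standalone sketch of that round trip, assuming the pg gem, an illustrative dexter_test database with hypopg installed, and a users table:

    require "pg"
    require "json"

    conn = PG.connect(dbname: "dexter_test")
    conn.exec("CREATE EXTENSION IF NOT EXISTS hypopg")

    # register a hypothetical index; nothing is written to disk
    result = conn.exec_params("SELECT * FROM hypopg_create_index('CREATE INDEX ON users (city_id)')", [])
    index_name = result.first["indexname"]

    # EXPLAIN now costs the query as if the index existed
    raw = conn.exec_params("EXPLAIN (FORMAT JSON) SELECT * FROM users WHERE city_id = 1", [])
    plan = JSON.parse(raw.first["QUERY PLAN"]).first["Plan"]
    puts "total cost: #{plan["Total Cost"]}"
    puts "hypothetical index used" if plan.inspect.include?(index_name)

    # discard all hypothetical indexes, as reset_hypothetical_indexes does
    conn.exec("SELECT hypopg_reset()")

This mirrors how determine_indexes detects usage: it simply checks whether the generated index name appears anywhere in the inspected plan.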
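The "# Pass format to prevent ANALYZE" comment in calculate_plan is terse; my reading (an assumption, not confirmed by the source) is that it guards against a logged statement that itself begins with ANALYZE. Plain interpolation would then produce EXPLAIN ANALYZE ..., which actually executes the statement, while an explicit options list turns the same input into a server-side syntax error:

    # hypothetical hostile input pulled from the query log
    statement = "ANALYZE SELECT pg_sleep(10)"

    "EXPLAIN #{statement}"               # "EXPLAIN ANALYZE SELECT pg_sleep(10)" -- would run the query
    "EXPLAIN (FORMAT TEXT) #{statement}" # rejected by the server; nothing executes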
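The new create_hypothetical_indexes_helper leans on Ruby's Array#permutation, so at n = 2 it proposes both column orders and relies on index_set to skip candidates that already exist as real indexes. A self-contained illustration with made-up columns:

    cols = [{column: "user_id"}, {column: "created_at"}]
    index_set = [["users", ["user_id"]]] # pretend this index already exists

    [1, 2].each do |n|
      cols.permutation(n) do |col_set|
        names = col_set.map { |c| c[:column] }
        next if index_set.include?(["users", names])
        puts "candidate: users (#{names.join(", ")})"
      end
    end
    # candidate: users (created_at)
    # candidate: users (user_id, created_at)
    # candidate: users (created_at, user_id)

The "TODO for multicolumn indexes, use ordering" note hints at the cost of this approach: permutations grow factorially, and column order matters for btree indexes, which is presumably why n stops at 2.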
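The selection bar in determine_indexes is easiest to see with numbers. Each calculate_plan round appends to query.plans, so (reading the diff) index 0 is the baseline with no hypothetical indexes, 1 is after single-column candidates, and 2 is after multicolumn candidates, with query.costs derived from those plans. With made-up costs:

    initial_cost = 1000.0               # query.costs[0], the baseline
    new_cost, new_cost2 = 400.0, 120.0  # query.costs[1..2]

    cost_savings = new_cost < initial_cost * 0.5                  # 400 < 500 => true
    # multicolumn only wins if the single-column cost is still
    # non-trivial and the multicolumn plan halves it again
    cost_savings2 = new_cost > 100 && new_cost2 < new_cost * 0.5  # 120 < 200 => true

    new_cost = cost_savings2 ? new_cost2 : new_cost               # 120.0 is reported

So a single-column index needs a 50% saving over the baseline, and a multicolumn index additionally needs a 50% saving over the best single-column plan.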
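Finally, the select_all to execute rename keeps the defensive trick the comment calls out: PQexecParams accepts at most one SQL command, so stacked statements fail instead of running. A quick demonstration, reusing the assumed dexter_test database:

    require "pg"

    conn = PG.connect(dbname: "dexter_test")

    conn.exec_params("SELECT 1", []).to_a # fine: a single command
    begin
      conn.exec_params("SELECT 1; DROP TABLE users", [])
    rescue PG::Error => e
      puts "rejected: #{e.message}" # cannot insert multiple commands into a prepared statement
    end

safe_statement's semicolon stripping is a second layer on top of this, since plan and the @log_explain path interpolate the logged statement directly into an EXPLAIN string.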