indexer.rb in pgdexter-0.3.3

- old
+ new

@@ -10,10 +10,11 @@
       @log_sql = options[:log_sql]
       @log_explain = options[:log_explain]
       @min_time = options[:min_time] || 0
       @min_calls = options[:min_calls] || 0
       @analyze = options[:analyze]
+      @min_cost_savings_pct = options[:min_cost_savings_pct].to_i
       @options = options
 
       create_extension unless extension_exists?
       execute("SET lock_timeout = '5s'")
     end
@@ -26,24 +27,41 @@
 
     def process_queries(queries)
       # reset hypothetical indexes
       reset_hypothetical_indexes
 
-      tables = Set.new(database_tables)
+      tables = Set.new(database_tables + materialized_views)
 
       # map tables without schema to schema
       no_schema_tables = {}
       search_path_index = Hash[search_path.map.with_index.to_a]
       tables.group_by { |t| t.split(".")[-1] }.each do |group, t2|
         no_schema_tables[group] = t2.sort_by { |t| [search_path_index[t.split(".")[0]] || 1000000, t] }[0]
       end
 
+      # add tables from views
+      view_tables = database_view_tables
+      view_tables.each do |v, vt|
+        view_tables[v] = vt.map { |t| no_schema_tables[t] || t }
+      end
+
+      # fully resolve tables
+      # make sure no views in result
+      view_tables.each do |v, vt|
+        view_tables[v] = vt.flat_map { |t| view_tables[t] || [t] }.uniq
+      end
+
       # filter queries from other databases and system tables
       queries.each do |query|
         # add schema to table if needed
         query.tables = query.tables.map { |t| no_schema_tables[t] || t }
 
+        # substitute view tables
+        new_tables = query.tables.flat_map { |t| view_tables[t] || [t] }.uniq
+        query.tables_from_views = new_tables - query.tables
+        query.tables = new_tables
+
         # check for missing tables
         query.missing_tables = !query.tables.all? { |t| tables.include?(t) }
       end
 
       # set tables
@@ -164,10 +182,11 @@
       calculate_plan(queries)
       explainable_queries = queries.select { |q| q.explainable? && q.high_cost? }
 
       # filter tables for performance
       tables = Set.new(explainable_queries.flat_map(&:tables))
+      tables_from_views = Set.new(explainable_queries.flat_map(&:tables_from_views))
 
       if tables.any?
         # since every set of multi-column indexes are expensive
         # try to parse out columns
         possible_columns = Set.new
@@ -180,11 +199,12 @@
             end
           end
         end
 
         # create hypothetical indexes
-        columns_by_table = columns(tables).select { |c| possible_columns.include?(c[:column]) }.group_by { |c| c[:table] }
+        # use all columns in tables from views
+        columns_by_table = columns(tables).select { |c| possible_columns.include?(c[:column]) || tables_from_views.include?(c[:table]) }.group_by { |c| c[:table] }
 
         # create single column indexes
         create_hypothetical_indexes_helper(columns_by_table, 1, candidates)
 
         # get next round of costs
@@ -263,33 +283,80 @@
             index_set << [index["table"], index["columns"].first(2)]
           end
         end
       end
 
+      savings_ratio = (1 - @min_cost_savings_pct / 100.0)
+
       queries.each do |query|
         if query.explainable? && query.high_cost?
           new_cost, new_cost2 = query.costs[1..2]
 
-          cost_savings = new_cost < query.initial_cost * 0.5
+          cost_savings = new_cost < query.initial_cost * savings_ratio
 
           # set high bar for multicolumn indexes
-          cost_savings2 = new_cost > 100 && new_cost2 < new_cost * 0.5
+          cost_savings2 = new_cost > 100 && new_cost2 < new_cost * savings_ratio
 
           key = cost_savings2 ? 2 : 1
           query_indexes = hypo_indexes_from_plan(index_name_to_columns, query.plans[key], index_set)
 
           # likely a bad suggestion, so try single column
           if cost_savings2 && query_indexes.size > 1
             query_indexes = hypo_indexes_from_plan(index_name_to_columns, query.plans[1], index_set)
             cost_savings2 = false
           end
 
-          # TODO if multiple indexes are found (for either single or multicolumn)
+          suggest_index = cost_savings || cost_savings2
+
+          cost_savings3 = false
+          new_cost3 = nil
+
+          # if multiple indexes are found (for either single or multicolumn)
           # determine the impact of each individually
-          # for now, be conservative and don't suggest if more than one index
-          suggest_index = (cost_savings || cost_savings2) && query_indexes.size == 1
+          # there may be a better single index that we're not considering
+          # that didn't get picked up by pass1 or pass2
+          # TODO clean this up
+          # TODO suggest more than one index from this if savings are there
+          if suggest_index && query_indexes.size > 1
+            winning_index = nil
+            winning_cost = nil
+            winning_plan = nil
 
+            query_indexes.each do |query_index|
+              reset_hypothetical_indexes
+              create_hypothetical_index(query_index[:table], query_index[:columns].map { |v| {column: v} })
+              plan3 = plan(query.statement)
+              cost3 = plan3["Total Cost"]
+
+              if !winning_cost || cost3 < winning_cost
+                winning_cost = cost3
+                winning_index = query_index
+                winning_plan = plan3
+              end
+            end
+
+            query.plans << winning_plan
+
+            # duplicated from above
+            # TODO DRY
+            use_winning =
+              if cost_savings2
+                new_cost > 100 && winning_cost < new_cost * savings_ratio
+              else
+                winning_cost < query.initial_cost * savings_ratio
+              end
+
+            if use_winning
+              query_indexes = [winning_index]
+              cost_savings3 = true
+              new_cost3 = winning_cost
+              query.pass3_indexes = query_indexes
+            else
+              suggest_index = false
+            end
+          end
+
           if suggest_index
             query_indexes.each do |index|
               new_indexes[index] ||= index.dup
               (new_indexes[index][:queries] ||= []) << query
             end
@@ -297,11 +364,11 @@
 
           query.indexes = query_indexes
           query.suggest_index = suggest_index
           query.new_cost =
             if suggest_index
-              cost_savings2 ? new_cost2 : new_cost
+              cost_savings3 ? new_cost3 : (cost_savings2 ? new_cost2 : new_cost)
             else
               query.initial_cost
             end
 
           # TODO optimize
@@ -366,13 +433,16 @@
           elsif query.explainable?
             query_indexes = query.indexes || []
             log "Start: #{query.costs[0]}"
             log "Pass1: #{query.costs[1]} : #{log_indexes(query.pass1_indexes || [])}"
             log "Pass2: #{query.costs[2]} : #{log_indexes(query.pass2_indexes || [])}"
+            if query.costs[3]
+              log "Pass3: #{query.costs[3]} : #{log_indexes(query.pass3_indexes || [])}"
+            end
             log "Final: #{query.new_cost} : #{log_indexes(query.suggest_index ? query_indexes : [])}"
             if query_indexes.size == 1 && !query.suggest_index
-              log "Need 50% cost savings to suggest index"
+              log "Need #{@min_cost_savings_pct}% cost savings to suggest index"
             end
           else
             log "Could not run explain"
           end
           log
@@ -447,15 +517,19 @@
     # TODO for multicolumn indexes, use ordering
     def create_hypothetical_indexes_helper(columns_by_table, n, candidates)
       columns_by_table.each do |table, cols|
         # no reason to use btree index for json columns
         cols.reject { |c| ["json", "jsonb"].include?(c[:type]) }.permutation(n) do |col_set|
-          candidates[col_set] = execute("SELECT * FROM hypopg_create_index('CREATE INDEX ON #{quote_ident(table)} (#{col_set.map { |c| quote_ident(c[:column])  }.join(", ")})')").first["indexname"]
+          candidates[col_set] = create_hypothetical_index(table, col_set)
         end
       end
     end
 
+    def create_hypothetical_index(table, col_set)
+      execute("SELECT * FROM hypopg_create_index('CREATE INDEX ON #{quote_ident(table)} (#{col_set.map { |c| quote_ident(c[:column])  }.join(", ")})')").first["indexname"]
+    end
+
     def database_tables
       result = execute <<-SQL
         SELECT
           table_schema || '.' || table_name AS table_name
         FROM
@@ -464,10 +538,47 @@
           table_catalog = current_database()
       SQL
       result.map { |r| r["table_name"] }
     end
 
+    def materialized_views
+      if server_version_num >= 90300
+        result = execute <<-SQL
+          SELECT
+            schemaname || '.' || matviewname AS table_name
+          FROM
+            pg_matviews
+        SQL
+        result.map { |r| r["table_name"] }
+      else
+        []
+      end
+    end
+
+    def server_version_num
+      execute("SHOW server_version_num").first["server_version_num"].to_i
+    end
+
+    def database_view_tables
+      result = execute <<-SQL
+        SELECT
+          schemaname || '.' || viewname AS table_name,
+          definition
+        FROM
+          pg_views
+        WHERE
+          schemaname NOT IN ('information_schema', 'pg_catalog')
+      SQL
+
+      view_tables = {}
+      result.each do |row|
+        view_tables[row["table_name"]] = PgQuery.parse(row["definition"]).tables
+      end
+
+      view_tables
+    end
+
     def stat_statements
       result = execute <<-SQL
         SELECT
           DISTINCT query
         FROM
@@ -513,16 +624,18 @@
     end
 
     def columns(tables)
       columns = execute <<-SQL
         SELECT
-          table_schema || '.' || table_name AS table_name,
-          column_name,
-          data_type
-        FROM
-          information_schema.columns
-        WHERE
-          table_schema || '.' || table_name IN (#{tables.map { |t| quote(t) }.join(", ")})
+          s.nspname || '.' || t.relname AS table_name,
+          a.attname AS column_name,
+          pg_catalog.format_type(a.atttypid, a.atttypmod) AS data_type
+        FROM pg_attribute a
+          JOIN pg_class t on a.attrelid = t.oid
+          JOIN pg_namespace s on t.relnamespace = s.oid
+        WHERE a.attnum > 0
+          AND NOT a.attisdropped
+          AND s.nspname || '.' || t.relname IN (#{tables.map { |t| quote(t) }.join(", ")})
         ORDER BY
           1, 2
       SQL
 
       columns.map { |v| {table: v["table_name"], column: v["column_name"], type: v["data_type"]} }