//! Support for egraphs represented in the DataFlowGraph.

use crate::alias_analysis::{AliasAnalysis, LastStores};
use crate::ctxhash::{CtxEq, CtxHash, CtxHashMap};
use crate::cursor::{Cursor, CursorPosition, FuncCursor};
use crate::dominator_tree::DominatorTree;
use crate::egraph::domtree::DomTreeWithChildren;
use crate::egraph::elaborate::Elaborator;
use crate::fx::FxHashSet;
use crate::inst_predicates::is_pure_for_egraph;
use crate::ir::{
    DataFlowGraph, Function, Inst, InstructionData, Type, Value, ValueDef, ValueListPool,
};
use crate::loop_analysis::LoopAnalysis;
use crate::opts::generated_code::ContextIter;
use crate::opts::IsleContext;
use crate::trace;
use crate::unionfind::UnionFind;
use cranelift_entity::packed_option::ReservedValue;
use cranelift_entity::SecondaryMap;
use std::hash::Hasher;

mod cost;
mod domtree;
mod elaborate;

/// Pass over a Function that does the whole aegraph thing.
///
/// - Removes non-skeleton nodes from the Layout.
/// - Performs a GVN-and-rule-application pass over all Values
///   reachable from the skeleton, potentially creating new Union
///   nodes (i.e., an aegraph) so that some values have multiple
///   representations.
/// - Does "extraction" on the aegraph: selects the best value out of
///   the tree-of-Union nodes for each used value.
/// - Does "scoped elaboration" on the aegraph: chooses one or more
///   locations for pure nodes to become instructions again in the
///   layout, as forced by the skeleton.
///
/// At the beginning and end of this pass, the CLIF should be in a
/// state that passes the verifier and, additionally, has no Union
/// nodes. During the pass, Union nodes may exist, and instructions in
/// the layout may refer to results of instructions that are not
/// placed in the layout.
pub struct EgraphPass<'a> {
    /// The function we're operating on.
    func: &'a mut Function,
    /// Dominator tree, used for elaboration pass.
    domtree: &'a DominatorTree,
    /// Alias analysis, used during optimization.
    alias_analysis: &'a mut AliasAnalysis<'a>,
    /// "Domtree with children": like `domtree`, but with an explicit
    /// list of children, rather than just parent pointers.
    domtree_children: DomTreeWithChildren,
    /// Loop analysis results, used for built-in LICM during
    /// elaboration.
    loop_analysis: &'a LoopAnalysis,
    /// Which canonical Values do we want to rematerialize in each
    /// block where they're used?
    ///
    /// (A canonical Value is the *oldest* Value in an eclass,
    /// i.e. tree of union value-nodes).
    remat_values: FxHashSet<Value>,
    /// Stats collected while we run this pass.
    pub(crate) stats: Stats,
    /// Union-find that maps all members of a Union tree (eclass) back
    /// to the *oldest* (lowest-numbered) `Value`.
    eclasses: UnionFind<Value>,
}

/// Context passed through node insertion and optimization.
pub(crate) struct OptimizeCtx<'opt, 'analysis>
where
    'analysis: 'opt,
{
    // Borrowed from EgraphPass:
    pub(crate) func: &'opt mut Function,
    pub(crate) value_to_opt_value: &'opt mut SecondaryMap<Value, Value>,
    pub(crate) gvn_map: &'opt mut CtxHashMap<(Type, InstructionData), Value>,
    pub(crate) eclasses: &'opt mut UnionFind<Value>,
    pub(crate) remat_values: &'opt mut FxHashSet<Value>,
    pub(crate) stats: &'opt mut Stats,
    pub(crate) alias_analysis: &'opt mut AliasAnalysis<'analysis>,
    pub(crate) alias_analysis_state: &'opt mut LastStores,
    // Held locally during optimization of one node (recursively):
    pub(crate) rewrite_depth: usize,
    pub(crate) subsume_values: FxHashSet<Value>,
}
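// Illustrative sketch (not part of this pass): the "canonical" member of an
// eclass is its oldest, i.e. lowest-numbered, `Value`, and the union-find
// above is what maintains that mapping. A minimal standalone union-find with
// the same "keep the older index as the representative" policy, written over
// plain `u32` indices rather than this crate's `UnionFind<Value>`
// (hypothetical helper, shown only to make the invariant concrete):
//
//     struct MiniUnionFind {
//         parent: Vec<u32>,
//     }
//
//     impl MiniUnionFind {
//         fn add(&mut self, x: u32) {
//             debug_assert_eq!(x as usize, self.parent.len());
//             self.parent.push(x); // every value starts as its own singleton eclass
//         }
//         fn find(&self, mut x: u32) -> u32 {
//             while self.parent[x as usize] != x {
//                 x = self.parent[x as usize];
//             }
//             x
//         }
//         fn union(&mut self, a: u32, b: u32) {
//             let (a, b) = (self.find(a), self.find(b));
//             // Keep the lower (older) index as the canonical representative.
//             let (old, new) = if a < b { (a, b) } else { (b, a) };
//             self.parent[new as usize] = old;
//         }
//     }
//
// The real `UnionFind` operates on `Value`s; only the "oldest member is
// canonical" property matters for the GVN and rematerialization logic below.
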
/// For passing to `insert_pure_enode`. Sometimes the enode already
/// exists as an Inst (from the original CLIF), and sometimes we're in
/// the middle of creating it and want to avoid inserting it if
/// possible until we know we need it.
pub(crate) enum NewOrExistingInst {
    New(InstructionData, Type),
    Existing(Inst),
}

impl NewOrExistingInst {
    fn get_inst_key<'a>(&'a self, dfg: &'a DataFlowGraph) -> (Type, InstructionData) {
        match self {
            NewOrExistingInst::New(data, ty) => (*ty, *data),
            NewOrExistingInst::Existing(inst) => {
                let ty = dfg.ctrl_typevar(*inst);
                (ty, dfg.insts[*inst].clone())
            }
        }
    }
}

impl<'opt, 'analysis> OptimizeCtx<'opt, 'analysis>
where
    'analysis: 'opt,
{
    /// Optimization of a single instruction.
    ///
    /// This does a few things:
    /// - Looks up the instruction in the GVN deduplication map. If we
    ///   already have the same instruction somewhere else, with the
    ///   same args, then we can alias the original instruction's
    ///   results and omit this instruction entirely.
    ///   - Note that we do this canonicalization based on the
    ///     instruction with its arguments as *canonical* eclass IDs,
    ///     that is, the oldest (smallest index) `Value` reachable in
    ///     the tree-of-unions (whole eclass). This ensures that we
    ///     properly canonicalize newer nodes that use newer "versions"
    ///     of a value that are still equal to the older versions.
    /// - If the instruction is "new" (not deduplicated), then apply
    ///   optimization rules:
    ///   - All of the mid-end rules written in ISLE.
    ///   - Store-to-load forwarding.
    /// - Update the value-to-opt-value map, and update the eclass
    ///   union-find, if we rewrote the value to different form(s).
    pub(crate) fn insert_pure_enode(&mut self, inst: NewOrExistingInst) -> Value {
        // Create the external context for looking up and updating the
        // GVN map. This is necessary so that instructions themselves
        // do not have to carry all the references or data for a full
        // `Eq` or `Hash` impl.
        let gvn_context = GVNContext {
            union_find: self.eclasses,
            value_lists: &self.func.dfg.value_lists,
        };

        self.stats.pure_inst += 1;
        if let NewOrExistingInst::New(..) = inst {
            self.stats.new_inst += 1;
        }

        // Does this instruction already exist? If so, add entries to
        // the value-map to rewrite uses of its results to the results
        // of the original (existing) instruction. If not, optimize
        // the new instruction.
        if let Some(&orig_result) = self
            .gvn_map
            .get(&inst.get_inst_key(&self.func.dfg), &gvn_context)
        {
            self.stats.pure_inst_deduped += 1;
            if let NewOrExistingInst::Existing(inst) = inst {
                debug_assert_eq!(self.func.dfg.inst_results(inst).len(), 1);
                let result = self.func.dfg.first_result(inst);
                self.value_to_opt_value[result] = orig_result;
                self.eclasses.union(result, orig_result);
                self.stats.union += 1;
                result
            } else {
                orig_result
            }
        } else {
            // Now actually insert the InstructionData and attach
            // result value (exactly one).
            let (inst, result, ty) = match inst {
                NewOrExistingInst::New(data, typevar) => {
                    let inst = self.func.dfg.make_inst(data);
                    // TODO: reuse return value?
                    self.func.dfg.make_inst_results(inst, typevar);
                    let result = self.func.dfg.first_result(inst);
                    // Add to eclass unionfind.
                    self.eclasses.add(result);
                    // New inst. We need to do the analysis of its result.
                    (inst, result, typevar)
                }
                NewOrExistingInst::Existing(inst) => {
                    let result = self.func.dfg.first_result(inst);
                    let ty = self.func.dfg.ctrl_typevar(inst);
                    (inst, result, ty)
                }
            };

            let opt_value = self.optimize_pure_enode(inst);

            let gvn_context = GVNContext {
                union_find: self.eclasses,
                value_lists: &self.func.dfg.value_lists,
            };
            self.gvn_map.insert(
                (ty, self.func.dfg.insts[inst].clone()),
                opt_value,
                &gvn_context,
            );
            self.value_to_opt_value[result] = opt_value;
            opt_value
        }
    }

    /// Optimizes an enode by applying any matching mid-end rewrite
    /// rules (or store-to-load forwarding, which is a special case),
    /// unioning together all possible optimized (or rewritten) forms
    /// of this expression into an eclass and returning the `Value`
    /// that represents that eclass.
    fn optimize_pure_enode(&mut self, inst: Inst) -> Value {
        // A pure node always has exactly one result.
        let orig_value = self.func.dfg.first_result(inst);

        let mut isle_ctx = IsleContext { ctx: self };

        // Limit rewrite depth. When we apply optimization rules, they
        // may create new nodes (values) and those are, recursively,
        // optimized eagerly as soon as they are created. So we may
        // have more than one ISLE invocation on the stack. (This is
        // necessary so that as the toplevel builds the
        // right-hand-side expression bottom-up, it uses the "latest"
        // optimized values for all the constituent parts.) To avoid
        // infinite or problematic recursion, we bound the rewrite
        // depth to a small constant here.
        const REWRITE_LIMIT: usize = 5;
        if isle_ctx.ctx.rewrite_depth > REWRITE_LIMIT {
            isle_ctx.ctx.stats.rewrite_depth_limit += 1;
            return orig_value;
        }
        isle_ctx.ctx.rewrite_depth += 1;

        // Invoke the ISLE toplevel constructor, getting all new
        // values produced as equivalents to this value.
        trace!("Calling into ISLE with original value {}", orig_value);
        isle_ctx.ctx.stats.rewrite_rule_invoked += 1;
        let mut optimized_values =
            crate::opts::generated_code::constructor_simplify(&mut isle_ctx, orig_value);

        // Create a union of all new values with the original (or
        // maybe just one new value marked as "subsuming" the
        // original, if present.)
        let mut union_value = orig_value;
        while let Some(optimized_value) = optimized_values.next(&mut isle_ctx) {
            trace!(
                "Returned from ISLE for {}, got {:?}",
                orig_value,
                optimized_value
            );
            if optimized_value == orig_value {
                trace!(" -> same as orig value; skipping");
                continue;
            }
            if isle_ctx.ctx.subsume_values.contains(&optimized_value) {
                // Merge in the unionfind so canonicalization
                // still works, but take *only* the subsuming
                // value, and break now.
                isle_ctx.ctx.eclasses.union(optimized_value, union_value);
                union_value = optimized_value;
                break;
            }

            let old_union_value = union_value;
            union_value = isle_ctx
                .ctx
                .func
                .dfg
                .union(old_union_value, optimized_value);
            isle_ctx.ctx.stats.union += 1;
            trace!(" -> union: now {}", union_value);
            isle_ctx.ctx.eclasses.add(union_value);
            isle_ctx
                .ctx
                .eclasses
                .union(old_union_value, optimized_value);
            isle_ctx.ctx.eclasses.union(old_union_value, union_value);
        }

        isle_ctx.ctx.rewrite_depth -= 1;

        union_value
    }
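    // Conceptual illustration only: the "store-to-load forwarding" special
    // case mentioned above is not implemented here; `optimize_skeleton_inst`
    // below hands loads and stores to the alias analysis (see
    // `alias_analysis.rs`). The idea, in pseudo-CLIF with hypothetical values
    // and offsets, and glossing over the analysis's precision about what may
    // clobber memory:
    //
    //     store v2, v0+16        ; remembered as the last store to this location
    //     v3 = iadd v1, v1       ; pure; cannot write memory
    //     v4 = load.i64 v0+16    ; `process_inst` returns `Some(v2)`, so uses
    //                            ; of v4 are rewritten to v2 and the load is
    //                            ; dropped from the skeleton.
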
    /// Optimize a "skeleton" instruction, possibly removing
    /// it. Returns `true` if the instruction should be removed from
    /// the layout.
    fn optimize_skeleton_inst(&mut self, inst: Inst) -> bool {
        self.stats.skeleton_inst += 1;

        // Not pure, but may still be a load or store:
        // process it to see if we can optimize it.
        if let Some(new_result) =
            self.alias_analysis
                .process_inst(self.func, self.alias_analysis_state, inst)
        {
            self.stats.alias_analysis_removed += 1;
            let result = self.func.dfg.first_result(inst);
            self.value_to_opt_value[result] = new_result;
            true
        } else {
            // Set all results to identity-map to themselves
            // in the value-to-opt-value map.
            for &result in self.func.dfg.inst_results(inst) {
                self.value_to_opt_value[result] = result;
                self.eclasses.add(result);
            }
            false
        }
    }
}

impl<'a> EgraphPass<'a> {
    /// Create a new EgraphPass.
    pub fn new(
        func: &'a mut Function,
        domtree: &'a DominatorTree,
        loop_analysis: &'a LoopAnalysis,
        alias_analysis: &'a mut AliasAnalysis<'a>,
    ) -> Self {
        let num_values = func.dfg.num_values();
        let domtree_children = DomTreeWithChildren::new(func, domtree);
        Self {
            func,
            domtree,
            domtree_children,
            loop_analysis,
            alias_analysis,
            stats: Stats::default(),
            eclasses: UnionFind::with_capacity(num_values),
            remat_values: FxHashSet::default(),
        }
    }

    /// Run the process.
    pub fn run(&mut self) {
        self.remove_pure_and_optimize();
        trace!("egraph built:\n{}\n", self.func.display());
        if cfg!(feature = "trace-log") {
            for (value, def) in self.func.dfg.values_and_defs() {
                trace!(" -> {} = {:?}", value, def);
                match def {
                    ValueDef::Result(i, 0) => {
                        trace!(" -> {} = {:?}", i, self.func.dfg.insts[i]);
                    }
                    _ => {}
                }
            }
        }
        trace!("stats: {:?}", self.stats);
        self.elaborate();
    }

    /// Remove pure nodes from the `Layout` of the function, ensuring
    /// that only the "side-effect skeleton" remains, and also
    /// optimize the pure nodes. This is the first step of
    /// egraph-based processing and turns the pure CFG-based CLIF into
    /// a CFG skeleton with a sea of (optimized) nodes tying it
    /// together.
    ///
    /// As we walk through the code, we eagerly apply optimization
    /// rules; at any given point we have a "latest version" of an
    /// eclass of possible representations for a `Value` in the
    /// original program, which is itself a `Value` at the root of a
    /// union-tree. We keep a map from the original values to these
    /// optimized values. When we encounter any instruction (pure or
    /// side-effecting skeleton) we rewrite its arguments to capture
    /// the "latest" optimized forms of these values. (We need to do
    /// this as part of this pass, and not later using a finished map,
    /// because the eclass can continue to be updated and we need to
    /// only refer to its subset that exists at this stage, to
    /// maintain acyclicity.)
    fn remove_pure_and_optimize(&mut self) {
        let mut cursor = FuncCursor::new(self.func);
        let mut value_to_opt_value: SecondaryMap<Value, Value> =
            SecondaryMap::with_default(Value::reserved_value());
        let mut gvn_map: CtxHashMap<(Type, InstructionData), Value> =
            CtxHashMap::with_capacity(cursor.func.dfg.num_values());

        // In domtree preorder, visit blocks. (TODO: factor out an
        // iterator from this and elaborator.)
        let root = self.domtree_children.root();
        let mut block_stack = vec![root];
        while let Some(block) = block_stack.pop() {
            // We popped this block; push children
            // immediately, then process this block.
            block_stack.extend(self.domtree_children.children(block));

            trace!("Processing block {}", block);
            cursor.set_position(CursorPosition::Before(block));

            let mut alias_analysis_state = self.alias_analysis.block_starting_state(block);

            for &param in cursor.func.dfg.block_params(block) {
                trace!("creating initial singleton eclass for blockparam {}", param);
                self.eclasses.add(param);
                value_to_opt_value[param] = param;
            }

            while let Some(inst) = cursor.next_inst() {
                trace!("Processing inst {}", inst);

                // While we're passing over all insts, create initial
                // singleton eclasses for all result and blockparam
                // values. Also do initial analysis of all inst
                // results.
                for &result in cursor.func.dfg.inst_results(inst) {
                    trace!("creating initial singleton eclass for {}", result);
                    self.eclasses.add(result);
                }

                // Rewrite args of *all* instructions using the
                // value-to-opt-value map.
                cursor.func.dfg.resolve_aliases_in_arguments(inst);
                for arg in cursor.func.dfg.inst_args_mut(inst) {
                    let new_value = value_to_opt_value[*arg];
                    trace!("rewriting arg {} of inst {} to {}", arg, inst, new_value);
                    debug_assert_ne!(new_value, Value::reserved_value());
                    *arg = new_value;
                }

                // Build a context for optimization, with borrows of
                // state. We can't invoke a method on `self` because
                // we've borrowed `self.func` mutably (as
                // `cursor.func`) so we pull apart the pieces instead
                // here.
                let mut ctx = OptimizeCtx {
                    func: cursor.func,
                    value_to_opt_value: &mut value_to_opt_value,
                    gvn_map: &mut gvn_map,
                    eclasses: &mut self.eclasses,
                    rewrite_depth: 0,
                    subsume_values: FxHashSet::default(),
                    remat_values: &mut self.remat_values,
                    stats: &mut self.stats,
                    alias_analysis: self.alias_analysis,
                    alias_analysis_state: &mut alias_analysis_state,
                };

                if is_pure_for_egraph(ctx.func, inst) {
                    // Insert into GVN map and optimize any new nodes
                    // inserted (recursively performing this work for
                    // any nodes the optimization rules produce).
                    let inst = NewOrExistingInst::Existing(inst);
                    ctx.insert_pure_enode(inst);
                    // We've now rewritten all uses, or will when we
                    // see them, and the instruction exists as a pure
                    // enode in the eclass, so we can remove it.
                    cursor.remove_inst_and_step_back();
                } else {
                    if ctx.optimize_skeleton_inst(inst) {
                        cursor.remove_inst_and_step_back();
                    }
                }
            }
        }
    }
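    // The elaboration pass described below leans on a *scoped* map: one map
    // scope per dominator-tree level, pushed when the traversal descends to a
    // child and popped when it returns. A minimal standalone sketch of that
    // data structure (hypothetical `MiniScopedMap`, not the type the
    // elaborator actually uses):
    //
    //     use std::collections::HashMap;
    //     use std::hash::Hash;
    //
    //     struct MiniScopedMap<K, V> {
    //         scopes: Vec<HashMap<K, V>>,
    //     }
    //
    //     impl<K: Eq + Hash, V> MiniScopedMap<K, V> {
    //         fn push_scope(&mut self) {
    //             self.scopes.push(HashMap::new());
    //         }
    //         fn pop_scope(&mut self) {
    //             self.scopes.pop();
    //         }
    //         fn insert(&mut self, k: K, v: V) {
    //             self.scopes.last_mut().unwrap().insert(k, v);
    //         }
    //         fn get(&self, k: &K) -> Option<&V> {
    //             // Innermost scope wins; outer (dominating) scopes remain visible.
    //             self.scopes.iter().rev().find_map(|s| s.get(k))
    //         }
    //     }
    //
    // A value elaborated into some block is therefore visible for the rest of
    // that block and in every dominated block, and is forgotten once the
    // traversal leaves the dominating scope; that is why the memoization
    // described below subsumes GVN.
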
    /// Scoped elaboration: compute a final ordering of op computation
    /// for each block and update the given Func body. After this
    /// runs, the function body is back in the state where every
    /// Inst with a used result is placed in the layout (possibly
    /// duplicated, if our code-motion logic decides this is the best
    /// option).
    ///
    /// This works in concert with the domtree. We do a preorder
    /// traversal of the domtree, tracking a scoped map from Id to
    /// (new) Value. The map's scopes correspond to levels in the
    /// domtree.
    ///
    /// At each block, we iterate forward over the side-effecting
    /// eclasses, and recursively generate their arg eclasses, then
    /// emit the ops themselves.
    ///
    /// To use an eclass in a given block, we first look it up in the
    /// scoped map, and get the Value if already present. If not, we
    /// need to generate it. We emit the extracted enode for this
    /// eclass after recursively generating its args. Eclasses are
    /// thus computed "as late as possible", but then memoized into
    /// the Id-to-Value map and available to all dominated blocks and
    /// for the rest of this block. (This subsumes GVN.)
    fn elaborate(&mut self) {
        let mut elaborator = Elaborator::new(
            self.func,
            self.domtree,
            &self.domtree_children,
            self.loop_analysis,
            &mut self.remat_values,
            &mut self.eclasses,
            &mut self.stats,
        );
        elaborator.elaborate();

        self.check_post_egraph();
    }

    #[cfg(debug_assertions)]
    fn check_post_egraph(&self) {
        // Verify that no union nodes are reachable from inst args,
        // and that all inst args' defining instructions are in the
        // layout.
        for block in self.func.layout.blocks() {
            for inst in self.func.layout.block_insts(block) {
                for &arg in self.func.dfg.inst_args(inst) {
                    match self.func.dfg.value_def(arg) {
                        ValueDef::Result(i, _) => {
                            debug_assert!(self.func.layout.inst_block(i).is_some());
                        }
                        ValueDef::Union(..) => {
                            panic!("egraph union node {} still reachable at {}!", arg, inst);
                        }
                        _ => {}
                    }
                }
            }
        }
    }

    #[cfg(not(debug_assertions))]
    fn check_post_egraph(&self) {}
}

/// Implementation of external-context equality and hashing on
/// InstructionData. This allows us to deduplicate instructions given
/// some context that lets us see its value lists and the mapping from
/// any value to "canonical value" (in an eclass).
struct GVNContext<'a> {
    value_lists: &'a ValueListPool,
    union_find: &'a UnionFind<Value>,
}

impl<'a> CtxEq<(Type, InstructionData), (Type, InstructionData)> for GVNContext<'a> {
    fn ctx_eq(
        &self,
        (a_ty, a_inst): &(Type, InstructionData),
        (b_ty, b_inst): &(Type, InstructionData),
    ) -> bool {
        a_ty == b_ty
            && a_inst.eq(b_inst, self.value_lists, |value| {
                self.union_find.find(value)
            })
    }
}

impl<'a> CtxHash<(Type, InstructionData)> for GVNContext<'a> {
    fn ctx_hash<H: Hasher>(&self, state: &mut H, (ty, inst): &(Type, InstructionData)) {
        std::hash::Hash::hash(&ty, state);
        inst.hash(state, self.value_lists, |value| self.union_find.find(value));
    }
}

/// Statistics collected during egraph-based processing.
#[derive(Clone, Debug, Default)]
pub(crate) struct Stats {
    pub(crate) pure_inst: u64,
    pub(crate) pure_inst_deduped: u64,
    pub(crate) skeleton_inst: u64,
    pub(crate) alias_analysis_removed: u64,
    pub(crate) new_inst: u64,
    pub(crate) union: u64,
    pub(crate) subsume: u64,
    pub(crate) remat: u64,
    pub(crate) rewrite_rule_invoked: u64,
    pub(crate) rewrite_depth_limit: u64,
    pub(crate) elaborate_visit_node: u64,
    pub(crate) elaborate_memoize_hit: u64,
    pub(crate) elaborate_memoize_miss: u64,
    pub(crate) elaborate_memoize_miss_remat: u64,
    pub(crate) elaborate_licm_hoist: u64,
    pub(crate) elaborate_func: u64,
    pub(crate) elaborate_func_pre_insts: u64,
    pub(crate) elaborate_func_post_insts: u64,
}
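
// Illustrative sketch only (hypothetical names, not part of this module's
// API): a self-contained demonstration of the GVN idea used by
// `insert_pure_enode` and `GVNContext` above, namely that instructions are
// hashed and compared with their arguments mapped to the *canonical* (oldest)
// member of their eclass, so that later "versions" of a value still dedupe
// against instructions that used the older version.
#[cfg(test)]
mod gvn_canonicalization_sketch {
    use std::collections::HashMap;

    /// Hypothetical instruction key: an opcode name plus argument values that
    /// have already been rewritten to their canonical eclass representatives.
    #[derive(PartialEq, Eq, Hash)]
    struct MiniKey {
        opcode: &'static str,
        args: [u32; 2],
    }

    #[test]
    fn dedupes_across_eclass_members() {
        // Stand-in for the union-find: value 7 has been unioned into the
        // eclass whose oldest member is value 3, so both canonicalize to 3.
        let canon = |v: u32| if v == 7 { 3 } else { v };

        let mut gvn_map: HashMap<MiniKey, u32> = HashMap::new();

        // First occurrence: an `iadd v3, v4` whose (single) result is value 10.
        let key1 = MiniKey {
            opcode: "iadd",
            args: [canon(3), canon(4)],
        };
        gvn_map.insert(key1, 10);

        // Later occurrence: `iadd v7, v4`. Because v7 canonicalizes to v3, its
        // key collides with the first instruction's, so we reuse value 10
        // rather than materializing a duplicate computation.
        let key2 = MiniKey {
            opcode: "iadd",
            args: [canon(7), canon(4)],
        };
        assert_eq!(gvn_map.get(&key2), Some(&10));
    }
}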