WIP: Better dominators

Keno · Keno · commit addfc97a6156 · 2018-09-11T19:21:11.000-04:00
This commit replaces the naive algorithm for replacing dominator trees by a faster implementation based on the Semi-NCA algorithm (reference in the code comments). LLVM recently switched to this algorithm and found it to be faster in practice than SLT (which it used before). It is also slightly easier to implement. More importantly though, it should easily extend to dynamic dominators. I'm hoping this will fix the performance problems with constructing dominators noted in #25927 as well as providing the basis for a dynamic dominator implementation to fix #29107.
diff --git a/base/compiler/ssair/domtree.jl b/base/compiler/ssair/domtree.jl
@@ -54,17 +54,18 @@ function iterate(doms::DominatedBlocks, state::Nothing=nothing)
     return (bb, nothing)
 end
 
-# Construct Dom Tree
-# Simple algorithm - TODO: Switch to the fast version (e.g. https://tanujkhattar.wordpress.com/2016/01/11/dominator-tree-of-a-directed-graph/)
-function construct_domtree(cfg::CFG)
+function naive_idoms(cfg::CFG)
     nblocks = length(cfg.blocks)
     dom_all = BitSet(1:nblocks)
     dominators = BitSet[n == 1 ? BitSet(1) : copy(dom_all) for n = 1:nblocks]
     changed = true
     while changed
         changed = false
         for n = 2:nblocks
-            isempty(cfg.blocks[n].preds) && continue
+            if isempty(cfg.blocks[n].preds)
+                isempty(dominators[n]) || (dominators[n] = BitSet())
+                continue
+            end
             firstp, rest = Iterators.peel(Iterators.filter(p->p != 0, cfg.blocks[n].preds))
             new_doms = copy(dominators[firstp])
             for p in rest
@@ -91,7 +92,16 @@ function construct_domtree(cfg::CFG)
             idoms[i] = dom
         end
     end
+    idoms
+end
+
+# Construct Dom Tree
+function construct_domtree(cfg::CFG)
+    idoms = SNCA(cfg)
+    nidoms = naive_idoms(cfg)
+    @assert idoms == nidoms
     # Compute children
+    nblocks = length(cfg.blocks)
     domtree = DomTreeNode[DomTreeNode() for _ = 1:nblocks]
     for (idx, idom) in Iterators.enumerate(idoms)
         (idx == 1 || idom == 0) && continue
@@ -101,3 +111,165 @@ function construct_domtree(cfg::CFG)
     update_level!(domtree, 1, 1)
     DomTree(idoms, domtree)
 end
+
+#================================ [SNCA] ======================================#
+#
+#   This section implements the Semi-NCA (SNCA) dominator tree construction from
+#   described in Georgiadis' PhD thesis [LG05], which itself is a simplification
+#   of the Simple Lenguare-Tarjan (SLT) algorithm [LG79]. This algorithm matches
+#   the algorithm choice in LLVM and seems to be a sweet spot in implementation
+#   simplicity and efficiency.
+#
+#   [LG05]  Linear-Time Algorithms for Dominators and Related Problems
+#           Loukas Georgiadis, Princeton University, November 2005, pp. 21-23:
+#           ftp://ftp.cs.princeton.edu/reports/2005/737.pdf
+#
+#   [LT79]  A fast algorithm for finding dominators in a flowgraph
+#           Thomas Lengauer, Robert Endre Tarjan, July 1979, ACM TOPLAS 1-1
+#           http://www.dtic.mil/dtic/tr/fulltext/u2/a054144.pdf
+#
+begin
+    # We could make these real structs, but probably not worth the extra
+    # overhead. Still, give them names for documentary purposes.
+    const BBNumber = UInt
+    const DFSNumber = UInt
+
+    """
+    Keeps the per-BB state of the Semi NCA algorithm. In the original
+    formulation, there are three separate length `n` arrays, `label`, `semi` and
+    `ancestor`. Instead, for efficiency, we use one array in a array-of-structs
+    style setup.
+    """
+    struct Node
+        semi::DFSNumber
+        label::DFSNumber
+    end
+
+    struct DFSTree
+        # Maps DFS number to BB number
+        numbering::Vector{BBNumber}
+        # Maps BB number to DFS number
+        reverse::Vector{DFSNumber}
+        # Records parent relationships in the DFS tree (DFS number -> DFS number)
+        # Storing it this way saves a few lookups in the snca_compress! algorithm
+        parents::Vector{DFSNumber}
+    end
+    length(D::DFSTree) = length(D.numbering)
+    preorder(D::DFSTree) = OneTo(length(D))
+    _drop(xs::AbstractUnitRange, n::Integer) = (first(xs)+n):last(xs)
+
+    function DFSTree(nblocks::Int)
+        DFSTree(
+            Vector{BBNumber}(undef, nblocks),
+            zeros(DFSNumber, nblocks),
+            Vector{DFSNumber}(undef, nblocks))
+    end
+
+    function DFS(cfg::CFG, current_node::BBNumber)::DFSTree
+        dfs = DFSTree(length(cfg.blocks))
+        # TODO: We could reuse the storage in DFSTree for our worklist. We're
+        # guaranteed for the worklist to be smaller than the remaining space in
+        # DFSTree
+        worklist = Tuple{DFSNumber, BBNumber}[(0, current_node)]
+        dfs_num = 1
+        parent = 0
+        while !isempty(worklist)
+            (parent, current_node) = pop!(worklist)
+            dfs.reverse[current_node] = dfs_num
+            dfs.numbering[dfs_num] = current_node
+            dfs.parents[dfs_num] = parent
+            for succ in cfg.blocks[current_node].succs
+                dfs.reverse[succ] != 0 && continue
+                # Mark things that are currently in the worklist
+                dfs.reverse[succ] = 1
+                push!(worklist, (dfs_num, succ))
+            end
+            dfs_num += 1
+        end
+        # If all blocks are reachable, this is a no-op, otherwise,
+        # we shrink these arrays.
+        resize!(dfs.numbering, dfs_num - 1)
+        resize!(dfs.parents, dfs_num - 1)
+        dfs
+    end
+
+    """
+    Matches the snca_compress algorithm in Figure 2.8 of [LG05], with the
+    modification suggested in the paper to use `last_linked` to determine
+    whether an ancestor has been processed rather than storing `0` in the
+    ancestor array.
+    """
+    function snca_compress!(state::Vector{Node}, ancestors::Vector{DFSNumber},
+                            v::DFSNumber, last_linked::DFSNumber)
+        u = ancestors[v]
+        @assert u < v
+        if u >= last_linked
+            snca_compress!(state, ancestors, u, last_linked)
+            if state[u].label < state[v].label
+                state[v] = Node(state[v].semi, state[u].label)
+            end
+            ancestors[v] = ancestors[u]
+        end
+        nothing
+    end
+
+    """
+    The main Semi-NCA algrithm. Matches Figure 2.8 in [LG05].
+    Note that the pseudocode in [LG05] is not entirely accurate.
+    The best way to understand what's happening is to read [LT79], then the
+    description of SLT in in [LG05] (warning: inconsistent notation), then
+    the description of Semi-NCA.
+    """
+    function SNCA(cfg::CFG)
+        D = DFS(cfg, BBNumber(1))
+        # `label` is initialized to the identity mapping (though
+        # the paper doesn't make that clear). The rational for this is Lemma
+        # 2.4 in [LG05] (i.e. Theorem 4 in ). Note however, that we don't
+        # ever look at `semi` until it is fully initialized, so we could leave
+        # it unitialized here if we wanted to.
+        state = Node[ Node(typemax(DFSNumber), w) for w in preorder(D) ]
+        # Initialize idoms to parents. Note that while idoms are eventually
+        # BB indexed, we keep it DFS indexed until a final post-processing
+        # pass to avoid extra memory references during the O(N^2) phase below.
+        idoms_dfs = copy(D.parents)
+        # We abuse the parents array as the ancestors array.
+        # Semi-NCA does not look at the parents array at all.
+        # SLT would, but never simultaneously, so we could still
+        # do this.
+        ancestors = D.parents
+        for w ∈ reverse(_drop(preorder(D), 1))
+            # LLVM initializes this to the parent, the paper initializes this to
+            # `w`, but it doesn't really matter (the parent is a predecessor,
+            # so at worst we'll discover it below). Save a memory reference here.
+            semi_w = typemax(DFSNumber)
+            for v ∈ cfg.blocks[D.numbering[w]].preds
+                # For the purpose of the domtree, ignore virtual predecessors
+                # into catch blocks.
+                v == 0 && continue
+                vdfs = D.reverse[v]
+                # Ignore unreachable predecessors
+                vdfs == 0 && continue
+                last_linked = DFSNumber(w + 1)
+                # N.B.: This conditional is missing from the psuedocode
+                # in figure 2.8 of [LG05]. It corresponds to the
+                # `ancestor[v] != 0` check in the `eval` implementation in
+                # figure 2.6
+                if vdfs >= last_linked
+                    snca_compress!(state, ancestors, vdfs, last_linked)
+                end
+                semi_w = min(semi_w, state[vdfs].label)
+            end
+            state[w] = Node(semi_w, semi_w)
+        end
+        for v ∈ _drop(preorder(D), 1)
+            idom = idoms_dfs[v]
+            vsemi = state[v].semi
+            while idom > vsemi
+                idom = idoms_dfs[idom]
+            end
+            idoms_dfs[v] = idom
+        end
+        idoms_bb = Int[ (i == 1 || D.reverse[i] == 0) ? 0 : D.numbering[idoms_dfs[D.reverse[i]]] for i = 1:length(cfg.blocks) ]
+        idoms_bb
+    end
+end