From 12a948cc47bf3e7b91967e0b75a328823d156afd Mon Sep 17 00:00:00 2001
From: Keno Fischer <keno@alumni.harvard.edu>
Date: Tue, 11 Sep 2018 19:21:11 -0400
Subject: [PATCH] Better dominators

This commit replaces the naive algorithm for constructing dominator
trees with a faster implementation based on the Semi-NCA algorithm
(reference in the code comments). LLVM recently switched to this
algorithm and found it to be faster in practice than SLT (which
it used before). It is also slightly easier to implement. More
importantly though, it should easily extend to dynamic dominators.

This fixes the performance problems in dominator construction noted
in #25927 and should provide a basis for a dynamic dominator
implementation to fix #29107.
---
 base/compiler/ssair/domtree.jl | 184 ++++++++++++++++++++++++++++++++-
 1 file changed, 179 insertions(+), 5 deletions(-)

diff --git a/base/compiler/ssair/domtree.jl b/base/compiler/ssair/domtree.jl
index 73769e69ae953..27c9af8b44fd1 100644
--- a/base/compiler/ssair/domtree.jl
+++ b/base/compiler/ssair/domtree.jl
@@ -54,17 +54,18 @@ function iterate(doms::DominatedBlocks, state::Nothing=nothing)
     return (bb, nothing)
 end
 
-# Construct Dom Tree
-# Simple algorithm - TODO: Switch to the fast version (e.g. https://tanujkhattar.wordpress.com/2016/01/11/dominator-tree-of-a-directed-graph/)
-function construct_domtree(cfg::CFG)
+function naive_idoms(cfg::CFG)
     nblocks = length(cfg.blocks)
-    dom_all = BitSet(1:nblocks)
+    # The extra +1 helps us detect unreachable blocks below
+    dom_all = BitSet(1:nblocks+1)
     dominators = BitSet[n == 1 ? BitSet(1) : copy(dom_all) for n = 1:nblocks]
     changed = true
     while changed
         changed = false
         for n = 2:nblocks
-            isempty(cfg.blocks[n].preds) && continue
+            if isempty(cfg.blocks[n].preds)
+                continue
+            end
             firstp, rest = Iterators.peel(Iterators.filter(p->p != 0, cfg.blocks[n].preds))
             new_doms = copy(dominators[firstp])
             for p in rest
@@ -78,6 +79,10 @@ function construct_domtree(cfg::CFG)
     # Compute idoms
     idoms = fill(0, nblocks)
     for i = 2:nblocks
+        if dominators[i] == dom_all
+            idoms[i] = 0
+            continue
+        end
         doms = collect(dominators[i])
         for dom in doms
             i == dom && continue
@@ -91,7 +96,14 @@ function construct_domtree(cfg::CFG)
             idoms[i] = dom
         end
     end
+    idoms
+end
+
+# Construct Dom Tree
+function construct_domtree(cfg::CFG)
+    idoms = SNCA(cfg)
     # Compute children
+    nblocks = length(cfg.blocks)
     domtree = DomTreeNode[DomTreeNode() for _ = 1:nblocks]
     for (idx, idom) in Iterators.enumerate(idoms)
         (idx == 1 || idom == 0) && continue
@@ -101,3 +113,165 @@ function construct_domtree(cfg::CFG)
     update_level!(domtree, 1, 1)
     DomTree(idoms, domtree)
 end
+
+#================================ [SNCA] ======================================#
+#
+#   This section implements the Semi-NCA (SNCA) dominator tree construction
+#   described in Georgiadis' PhD thesis [LG05], which itself is a simplification
+#   of the Simple Lengauer-Tarjan (SLT) algorithm [LT79]. This algorithm matches
+#   the algorithm choice in LLVM and seems to be a sweet spot in implementation
+#   simplicity and efficiency.
+#
+#   [LG05]  Linear-Time Algorithms for Dominators and Related Problems
+#           Loukas Georgiadis, Princeton University, November 2005, pp. 21-23:
+#           ftp://ftp.cs.princeton.edu/reports/2005/737.pdf
+#
+#   [LT79]  A fast algorithm for finding dominators in a flowgraph
+#           Thomas Lengauer, Robert Endre Tarjan, July 1979, ACM TOPLAS 1-1
+#           http://www.dtic.mil/dtic/tr/fulltext/u2/a054144.pdf
+#
+begin
+    # We could make these real structs, but probably not worth the extra
+    # overhead. Still, give them names for documentary purposes.
+    const BBNumber = UInt   # index of a basic block in `cfg.blocks`
+    const DFSNumber = UInt  # position of a block in the DFS numbering
+
+    """
+    Keeps the per-BB state of the Semi-NCA algorithm. In the original
+    formulation, there are three separate length `n` arrays, `label`, `semi` and
+    `ancestor`. Instead, for efficiency, `semi` and `label` share one array in an
+    array-of-structs style setup (`ancestor` is still kept as a separate array).
+    """
+    struct Node
+        semi::DFSNumber   # the `semi` entry of [LG05] for this node (a DFS number)
+        label::DFSNumber  # the `label` entry: smallest `semi` found so far along the compressed ancestor path
+    end
+
+    struct DFSTree
+        # Maps DFS number to BB number
+        numbering::Vector{BBNumber}
+        # Maps BB number to DFS number (0 for blocks the DFS never reached)
+        reverse::Vector{DFSNumber}
+        # Parent of each node in the DFS tree (DFS number -> DFS number; root -> 0).
+        # Storing it this way saves a few lookups in the snca_compress! algorithm
+        parents::Vector{DFSNumber}
+    end
+    length(D::DFSTree) = length(D.numbering)  # number of entries in the DFS numbering
+    preorder(D::DFSTree) = OneTo(length(D))  # all DFS numbers, in increasing order
+    _drop(xs::AbstractUnitRange, n::Integer) = (first(xs)+n):last(xs)  # `xs` without its first `n` elements
+
+    function DFSTree(nblocks::Int)  # allocate storage for a DFS over `nblocks` blocks
+        DFSTree(
+            Vector{BBNumber}(undef, nblocks),   # numbering: uninitialized, written by DFS
+            zeros(DFSNumber, nblocks),          # reverse: 0 means "no DFS number assigned"
+            Vector{DFSNumber}(undef, nblocks))  # parents: uninitialized, written by DFS
+    end
+
+    function DFS(cfg::CFG, current_node::BBNumber)::DFSTree
+        dfs = DFSTree(length(cfg.blocks))
+        # Number the blocks reachable from `current_node` in DFS preorder.
+        # The worklist holds (parent DFS number, block) pairs; a block may be
+        # pushed several times and is numbered the first time it is popped.
+        worklist = Tuple{DFSNumber, BBNumber}[(0, current_node)]
+        dfs_num = 1
+        parent = 0
+        while !isempty(worklist)
+            (parent, current_node) = pop!(worklist)
+            # Skip stale entries: marking on push would not give a true DFS tree
+            dfs.reverse[current_node] != 0 && continue
+            dfs.reverse[current_node] = dfs_num
+            dfs.numbering[dfs_num] = current_node
+            dfs.parents[dfs_num] = parent
+            for succ in cfg.blocks[current_node].succs
+                dfs.reverse[succ] != 0 && continue
+                push!(worklist, (dfs_num, succ))
+            end
+            dfs_num += 1
+        end
+        # If all blocks are reachable, this is a no-op, otherwise,
+        # we shrink these arrays.
+        resize!(dfs.numbering, dfs_num - 1)
+        resize!(dfs.parents, dfs_num - 1)
+        dfs
+    end
+
+    """
+    Matches the snca_compress algorithm in Figure 2.8 of [LG05], with the
+    modification suggested in the paper to use `last_linked` to determine
+    whether an ancestor has been processed rather than storing `0` in the
+    ancestor array.
+    """
+    function snca_compress!(state::Vector{Node}, ancestors::Vector{DFSNumber},
+                            v::DFSNumber, last_linked::DFSNumber)
+        u = ancestors[v]
+        @assert u < v  # ancestors precede their descendants in the DFS numbering
+        if u >= last_linked  # only compress through ancestors that were already processed
+            snca_compress!(state, ancestors, u, last_linked)  # compress u's own path first
+            if state[u].label < state[v].label
+                state[v] = Node(state[v].semi, state[u].label)  # inherit the smaller label from u
+            end
+            ancestors[v] = ancestors[u]  # skip over u on later walks
+        end
+        nothing
+    end
+
+    """
+    The main Semi-NCA algorithm. Matches Figure 2.8 in [LG05].
+    Note that the pseudocode in [LG05] is not entirely accurate.
+    The best way to understand what's happening is to read [LT79], then the
+    description of SLT in [LG05] (warning: inconsistent notation), then
+    the description of Semi-NCA.
+    """
+    function SNCA(cfg::CFG)
+        D = DFS(cfg, BBNumber(1))
+        # `label` is initialized to the identity mapping (though
+        # the paper doesn't make that clear). The rationale for this is Lemma
+        # 2.4 in [LG05] (i.e. Theorem 4, presumably of [LT79]). Note however, that
+        # we don't ever look at `semi` until it is fully initialized, so we could
+        # leave it uninitialized here if we wanted to.
+        state = Node[ Node(typemax(DFSNumber), w) for w in preorder(D) ]
+        # Initialize idoms to parents. Note that while idoms are eventually
+        # BB indexed, we keep it DFS indexed until a final post-processing
+        # pass to avoid extra memory references during the O(N^2) phase below.
+        idoms_dfs = copy(D.parents)
+        # We abuse the parents array as the ancestors array.
+        # Semi-NCA does not look at the parents array at all.
+        # SLT would, but never simultaneously, so we could still
+        # do this.
+        ancestors = D.parents
+        for w ∈ reverse(_drop(preorder(D), 1))
+            # LLVM initializes this to the parent, the paper initializes this to
+            # `w`, but it doesn't really matter (the parent is a predecessor,
+            # so at worst we'll discover it below). Save a memory reference here.
+            semi_w = typemax(DFSNumber)
+            for v ∈ cfg.blocks[D.numbering[w]].preds
+                # For the purpose of the domtree, ignore virtual predecessors
+                # into catch blocks.
+                v == 0 && continue
+                vdfs = D.reverse[v]
+                # Ignore unreachable predecessors
+                vdfs == 0 && continue
+                last_linked = DFSNumber(w + 1)
+                # N.B.: This conditional is missing from the pseudocode
+                # in figure 2.8 of [LG05]. It corresponds to the
+                # `ancestor[v] != 0` check in the `eval` implementation in
+                # figure 2.6
+                if vdfs >= last_linked
+                    snca_compress!(state, ancestors, vdfs, last_linked)
+                end
+                semi_w = min(semi_w, state[vdfs].label)
+            end
+            state[w] = Node(semi_w, semi_w)
+        end
+        for v ∈ _drop(preorder(D), 1)
+            idom = idoms_dfs[v]
+            vsemi = state[v].semi
+            while idom > vsemi  # climb the idom chain until we are at or above semi(v)
+                idom = idoms_dfs[idom]
+            end
+            idoms_dfs[v] = idom
+        end
+        idoms_bb = Int[ (i == 1 || D.reverse[i] == 0) ? 0 : D.numbering[idoms_dfs[D.reverse[i]]] for i = 1:length(cfg.blocks) ]
+        idoms_bb
+    end
+end