Skip to content

Commit be2f319

Browse files
committed
cmd/compile: greedy basic block layout
Implement Pettis & Hansen's greedy algorithm, i.e. the bottom-up variant. Fixes #66420
1 parent de5b418 commit be2f319

File tree

3 files changed

+231
-9
lines changed

3 files changed

+231
-9
lines changed

src/cmd/compile/internal/ssa/compile.go

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -508,8 +508,7 @@ var passes = [...]pass{
508508
{name: "late nilcheck", fn: nilcheckelim2},
509509
{name: "flagalloc", fn: flagalloc, required: true}, // allocate flags register
510510
{name: "regalloc", fn: regalloc, required: true}, // allocate int & float registers + stack slots
511-
{name: "loop rotate", fn: loopRotate},
512-
{name: "trim", fn: trim}, // remove empty blocks
511+
{name: "trim", fn: trim}, // remove empty blocks
513512
}
514513

515514
// Double-check phase ordering constraints.
@@ -577,8 +576,6 @@ var passOrder = [...]constraint{
577576
{"schedule", "flagalloc"},
578577
// regalloc needs flags to be allocated first.
579578
{"flagalloc", "regalloc"},
580-
// loopRotate will confuse regalloc.
581-
{"regalloc", "loop rotate"},
582579
// trim needs regalloc to be done first.
583580
{"regalloc", "trim"},
584581
// memcombine works better if fuse happens first, to help merge stores.

src/cmd/compile/internal/ssa/flagalloc.go

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -53,11 +53,12 @@ func flagalloc(f *Func) {
5353
}
5454
}
5555
}
56+
visitOrder := layoutOrder(f)
5657

5758
// For blocks which have a flags control value, that's the only value
5859
// we can leave in the flags register at the end of the block. (There
5960
// is no place to put a flag regeneration instruction.)
60-
for _, b := range f.Blocks {
61+
for _, b := range visitOrder {
6162
if b.Kind == BlockDefer {
6263
// Defer blocks internally use/clobber the flags value.
6364
end[b.ID] = nil
@@ -109,7 +110,7 @@ func flagalloc(f *Func) {
109110
// Add flag spill and recomputation where they are needed.
110111
var remove []*Value // values that should be checked for possible removal
111112
var oldSched []*Value
112-
for _, b := range f.Blocks {
113+
for _, b := range visitOrder {
113114
oldSched = append(oldSched[:0], b.Values...)
114115
b.Values = b.Values[:0]
115116
// The current live flag value (the pre-flagalloc copy).
@@ -188,7 +189,7 @@ func flagalloc(f *Func) {
188189
}
189190

190191
// Save live flag state for later.
191-
for _, b := range f.Blocks {
192+
for _, b := range visitOrder {
192193
b.FlagsLiveAtEnd = end[b.ID] != nil
193194
}
194195

@@ -223,7 +224,7 @@ func flagalloc(f *Func) {
223224
}
224225

225226
// Process affected blocks, preserving value order.
226-
for _, b := range f.Blocks {
227+
for _, b := range visitOrder {
227228
if !removeBlocks.contains(b.ID) {
228229
continue
229230
}

src/cmd/compile/internal/ssa/layout.go

Lines changed: 225 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,16 @@
44

55
package ssa
66

7+
import (
8+
"fmt"
9+
"sort"
10+
)
11+
712
// layout orders basic blocks in f with the goal of minimizing control flow instructions.
813
// After this phase returns, the order of f.Blocks matters and is the order
914
// in which those blocks will appear in the assembly output.
1015
func layout(f *Func) {
11-
f.Blocks = layoutOrder(f)
16+
f.Blocks = greedyBlockOrder(f)
1217
}
1318

1419
// Register allocation may use a different order which has constraints
@@ -183,3 +188,222 @@ blockloop:
183188
return order
184189
//f.Blocks = order
185190
}
191+
192+
// ----------------------------------------------------------------------------
193+
// Greedy Basic Block Layout
194+
//
195+
// This is an adaptation of Pettis & Hansen's greedy algorithm for laying out
196+
// basic blocks. See Profile Guided Code Positioning by Pettis & Hansen. The idea
197+
// is to arrange hot blocks near each other. Initially every block belongs to
198+
// its own chain; then, starting from the hottest edge, we repeatedly merge two
199+
// chains iff the edge dest is the first block of the dest chain and the edge src
200+
// is the last block of the src chain. Once all edges are processed, the chains
201+
// are sorted by hotness and merge count to generate the final block order.
202+
203+
// chain is a linear sequence of blocks
204+
type chain struct {
205+
id int
206+
blocks []*Block
207+
priority int // merge count
208+
}
209+
210+
func (t *chain) first() *Block {
211+
return t.blocks[0]
212+
}
213+
214+
func (t *chain) last() *Block {
215+
return t.blocks[len(t.blocks)-1]
216+
}
217+
218+
// edge simply represents a CFG edge
219+
type edge struct {
220+
src *Block
221+
dst *Block
222+
weight int // frequency
223+
}
224+
225+
// Edge weights assigned by greedyBlockOrder when it builds the edge list.
const (
	// WeightTaken marks an edge that is treated as taken: the only successor,
	// the likely arm of a predicted branch, or any successor when no
	// prediction is available.
	WeightTaken = 100

	// WeightNotTaken marks the unlikely arm of a statically predicted
	// two-way branch.
	WeightNotTaken = 0
)
229+
230+
func (e *edge) String() string {
231+
return fmt.Sprintf("%v->%v(%d)", e.src, e.dst, e.weight)
232+
}
233+
234+
type chainGraph struct {
235+
chainId int
236+
chains []*chain
237+
edges []*edge
238+
b2chain map[*Block]*chain
239+
}
240+
241+
func (g *chainGraph) newChain(block *Block) *chain {
242+
tr := &chain{g.chainId, []*Block{block}, 0 /*priority*/}
243+
g.b2chain[block] = tr
244+
g.chains = append(g.chains, tr)
245+
g.chainId++
246+
return tr
247+
}
248+
249+
func (g *chainGraph) getChain(b *Block) *chain {
250+
return g.b2chain[b]
251+
}
252+
253+
func (g *chainGraph) mergeChain(to, from *chain) {
254+
for _, block := range from.blocks {
255+
g.b2chain[block] = to
256+
}
257+
to.blocks = append(to.blocks, from.blocks...)
258+
to.priority++ // increment
259+
g.chains[from.id] = nil
260+
}
261+
262+
func (g *chainGraph) print() {
263+
fmt.Printf("== Edges:\n")
264+
for _, edge := range g.edges {
265+
fmt.Printf("%v\n", edge)
266+
}
267+
fmt.Printf("== Chains:\n")
268+
for _, ch := range g.chains {
269+
if ch == nil {
270+
continue
271+
}
272+
fmt.Printf("id:%d priority:%d blocks:%v\n", ch.id, ch.priority, ch.blocks)
273+
}
274+
}
275+
276+
func greedyBlockOrder(fn *Func) []*Block {
277+
graph := &chainGraph{0, []*chain{}, []*edge{}, make(map[*Block]*chain)}
278+
279+
// Initially every block is in its own chain
280+
for _, block := range fn.Blocks {
281+
graph.newChain(block)
282+
283+
if len(block.Succs) == 1 {
284+
graph.edges = append(graph.edges, &edge{block, block.Succs[0].b, WeightTaken})
285+
} else if len(block.Succs) == 2 && block.Likely != BranchUnknown {
286+
// Static branch prediction is available
287+
taken := 0
288+
if block.Likely == BranchUnlikely {
289+
taken = 1
290+
}
291+
e1 := &edge{block, block.Succs[taken].b, WeightTaken}
292+
e2 := &edge{block, block.Succs[1-taken].b, WeightNotTaken}
293+
graph.edges = append(graph.edges, e1, e2)
294+
} else {
295+
// Block predication is unknown or there are more than 2 successors
296+
for _, succ := range block.Succs {
297+
e1 := &edge{block, succ.b, WeightTaken}
298+
graph.edges = append(graph.edges, e1)
299+
}
300+
}
301+
}
302+
303+
// Sort edges by weight and move slow path to end
304+
j := len(graph.edges) - 1
305+
for i, edge := range graph.edges {
306+
if edge.weight == 0 {
307+
if edge.dst.Kind == BlockExit && i < j {
308+
graph.edges[j], graph.edges[i] = graph.edges[i], graph.edges[j]
309+
j--
310+
}
311+
}
312+
}
313+
sort.SliceStable(graph.edges, func(i, j int) bool {
314+
e1, e2 := graph.edges[i], graph.edges[j]
315+
// If the weights are the same, then keep the original order, this
316+
// ensures that adjacent edges are accessed sequentially, which has
317+
// a noticeable impact on performance
318+
return e1.weight >= e2.weight
319+
})
320+
321+
// Merge proper chains until no more chains can be merged
322+
for _, edge := range graph.edges {
323+
src := graph.getChain(edge.src)
324+
dst := graph.getChain(edge.dst)
325+
if src == dst {
326+
// Loop detected, "rotate" the loop from [..,header,body,latch] to
327+
// [..,body,latch,header]
328+
for idx, block := range src.blocks {
329+
if block == edge.dst && block.Kind != BlockPlain /*already rotated?*/ {
330+
c := append(src.blocks[0:idx], src.blocks[idx+1:]...)
331+
c = append(c, block)
332+
src.blocks = c
333+
break
334+
}
335+
}
336+
continue
337+
}
338+
if edge.dst == dst.first() && edge.src == src.last() {
339+
graph.mergeChain(src, dst)
340+
}
341+
}
342+
for i := 0; i < len(graph.chains); i++ {
343+
// Remove nil chains because they are merged
344+
if graph.chains[i] == nil {
345+
graph.chains = append(graph.chains[:i], graph.chains[i+1:]...)
346+
i--
347+
} else if graph.chains[i].first() == fn.Entry {
348+
// Entry chain must be present at beginning
349+
graph.chains[0], graph.chains[i] = graph.chains[i], graph.chains[0]
350+
}
351+
}
352+
353+
// Reorder chains based by hottness and priority
354+
before := make(map[*chain][]*chain)
355+
for _, edge := range graph.edges {
356+
// Compute the "before" precedence relation between chain, specifically,
357+
// the chain that is taken is arranged before the chain that is not taken.
358+
// This is because hardware prediction thought forward branch is less
359+
// frequently taken, while backedge is more frequently taken.
360+
if edge.weight == WeightNotTaken {
361+
src := graph.getChain(edge.src)
362+
dst := graph.getChain(edge.dst)
363+
before[src] = append(before[src], dst)
364+
}
365+
}
366+
// assert(graph.chains[0].first() == fn.Entry, "entry chain must be first")
367+
const idxSkipEntry = 1 // Entry chain is always first
368+
sort.SliceStable(graph.chains[idxSkipEntry:], func(i, j int) bool {
369+
c1, c2 := graph.chains[i+idxSkipEntry], graph.chains[j+idxSkipEntry]
370+
// Respect precedence relation
371+
for _, b := range before[c1] {
372+
if b == c2 {
373+
return true
374+
}
375+
}
376+
// Higher merge count is considered
377+
if c1.priority != c2.priority {
378+
return c1.priority > c2.priority
379+
}
380+
// Non-terminated chain is considered
381+
if s1, s2 := len(c1.last().Succs), len(c2.last().Succs); s1 != s2 {
382+
return s1 > s2
383+
}
384+
// Keep original order if we can't decide
385+
return true
386+
})
387+
388+
// Generate final block order
389+
blockOrder := make([]*Block, 0)
390+
for _, chain := range graph.chains {
391+
blockOrder = append(blockOrder, chain.blocks...)
392+
}
393+
fn.laidout = true
394+
395+
if fn.pass.debug > 2 {
396+
fmt.Printf("Block ordering(%v):\n", fn.Name)
397+
graph.print()
398+
}
399+
if len(blockOrder) != len(fn.Blocks) {
400+
graph.print()
401+
fn.Fatalf("miss blocks in final order")
402+
}
403+
if entryChain := graph.getChain(fn.Entry); entryChain != graph.chains[0] ||
404+
entryChain.first() != fn.Entry {
405+
graph.print()
406+
fn.Fatalf("entry block is not first block")
407+
}
408+
return blockOrder
409+
}

0 commit comments

Comments
 (0)