cmd/compile: tight code optimization opportunities

The generated x86 code can be improved in some fairly simple ways - hoisting computed constants out of loop bodies, and avoiding unnecessary register moves - that have a significant performance impact on tight loops. In the following example those improvements produce a 35% speedup.

Here is an alternate, DFA-based implementation of `utf8.Valid` that I have been playing with:

```
func Valid(x []byte) bool {
	state := uint64(1 * 6)
	for _, b := range x {
		state = dfa[b] >> (state & 63)
	}
	return (state & 63) == 1*6
}

const (
	s00 = 0 | (1*6)<<(1*6)
	sC0 = 0
	sC2 = 0 | (2*6)<<(1*6)
	sE0 = 0 | (3*6)<<(1*6)
	sE1 = 0 | (4*6)<<(1*6)
	sED = 0 | (5*6)<<(1*6)
	sEE = sE1
	sF0 = 0 | (6*6)<<(1*6)
	sF1 = 0 | (7*6)<<(1*6)
	sF4 = 0 | (8*6)<<(1*6)
	sF5 = 0

	s80 = 0 | (1*6)<<(2*6) | (2*6)<<(4*6) | (4*6)<<(5*6) | (4*6)<<(7*6) | (4*6)<<(8*6)
	s90 = 0 | (1*6)<<(2*6) | (2*6)<<(4*6) | (4*6)<<(5*6) | (4*6)<<(6*6) | (4*6)<<(7*6)
	sA0 = 0 | (1*6)<<(2*6) | (2*6)<<(3*6) | (2*6)<<(4*6) | (4*6)<<(6*6) | (4*6)<<(7*6)
)

var dfa = [256]uint64{
	s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
	s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
	s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
	s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
	s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
	s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
	s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
	s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00, s00,
	s80, s80, s80, s80, s80, s80, s80, s80, s80, s80, s80, s80, s80, s80, s80, s80,
	s90, s90, s90, s90, s90, s90, s90, s90, s90, s90, s90, s90, s90, s90, s90, s90,
	sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0,
	sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0, sA0,
	sC0, sC0, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2,
	sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2, sC2,
	sE0, sE1, sE1, sE1, sE1, sE1, sE1, sE1, sE1, sE1, sE1, sE1, sE1, sED, sEE, sEE,
	sF0, sF1, sF1, sF1, sF4, sF5, sF5, sF5, sF5, sF5, sF5, sF5, sF5, sF5, sF5, sF5,
}
```

There are no big benchmarks of Valid in the package, but here are some that could be added:

```
var valid1k = bytes.Repeat([]byte("0123456789日本語日本語日本語日abcdefghijklmnopqrstuvwx"), 16)
var valid1M = bytes.Repeat(valid1k, 1024)

func BenchmarkValid1K(b *testing.B) {
	b.SetBytes(int64(len(valid1k)))
	for i := 0; i < b.N; i++ {
		Valid(valid1k)
	}
}

func BenchmarkValid1M(b *testing.B) {
	b.SetBytes(int64(len(valid1M)))
	for i := 0; i < b.N; i++ {
		Valid(valid1M)
	}
}
```

The old Valid implementation runs at around 1450 MB/s.
The implementation above runs at around 1600 MB/s.
Better but not what I had hoped.
It compiles as follows:

```
% go test -c && go tool objdump -s 'utf8.Valid$' utf8.test
TEXT unicode/utf8.Valid(SB) /Users/rsc/go/src/unicode/utf8/utf8.go
  utf8.go:647		0x10789c0		4889442408		MOVQ AX, 0x8(SP)
  utf8.go:649		0x10789c5		31c9			XORL CX, CX
  utf8.go:649		0x10789c7		ba06000000		MOVL $0x6, DX
  utf8.go:649		0x10789cc		eb1f			JMP 0x10789ed
  utf8.go:649		0x10789ce		0fb63408		MOVZX 0(AX)(CX*1), SI
  utf8.go:649		0x10789d2		488d7901		LEAQ 0x1(CX), DI
  utf8.go:650		0x10789d6		4c8d0543ef1600		LEAQ unicode/utf8.dfa(SB), R8
  utf8.go:650		0x10789dd		498b34f0		MOVQ 0(R8)(SI*8), SI
  utf8.go:650		0x10789e1		4889d1			MOVQ DX, CX
  utf8.go:650		0x10789e4		48d3ee			SHRQ CL, SI
  utf8.go:649		0x10789e7		4889f9			MOVQ DI, CX
  utf8.go:652		0x10789ea		4889f2			MOVQ SI, DX
  utf8.go:649		0x10789ed		4839cb			CMPQ CX, BX
  utf8.go:649		0x10789f0		7fdc			JG 0x10789ce
  utf8.go:652		0x10789f2		4883e23f		ANDQ $0x3f, DX
  utf8.go:652		0x10789f6		4883fa06		CMPQ $0x6, DX
  utf8.go:652		0x10789fa		0f94c0			SETE AL
  utf8.go:652		0x10789fd		c3			RET
  :-1			0x10789fe		cc			INT $0x3
  :-1			0x10789ff		cc			INT $0x3
%
```

Translating this to proper non-regabi assembly I get:

```
#include "textflag.h"

TEXT ·Valid(SB),NOSPLIT,$0
	MOVQ	x_base+0(FP),AX // p = &x[0]
	MOVQ	x_len+8(FP),BX // n = len(x)
	XORL	CX, CX // i = 0
	MOVL	$6, DX
	JMP loop

body:
	MOVBLZX	(AX)(CX*1), SI // b = p[i]
	LEAQ	1(CX), DI // j = i+1
	LEAQ	·dfa(SB), R8
	MOVQ	(R8)(SI*8), SI // t = dfa[b]
	MOVQ	DX, CX // CX = state
	SHRQ	CX, SI // t >>= state&63
	MOVQ	DI, CX // i = j
	MOVQ	SI, DX // state = t

loop:
	CMPQ	CX, BX
	JLT	body

	ANDL	$0x3f, DX
	CMPL	DX, $6
	SETEQ	AL
	MOVB	AX, ret+24(FP)
	RET
```

This runs also at about 1600 MB/s.

First optimization: the `LEAQ ·dfa(SB), R8` should be hoisted out of the loop body.
(I tried to do this in the Go version with `dfa := &dfa` but it got constant propagated away!)

That change brings it up to 1750 MB/s.

Second optimization: use DI for `i` instead of CX, to avoid the pressure on CX.
This lets the `LEAQ 1(CX), DI` and the later `MOVQ DI, CX` collapse to just `LEAQ 1(DI), DI`.

That change brings it up to 1900 MB/s.

The body is now:

```
body:
	MOVBLZX	(AX)(DI*1), SI // b = p[i]
	LEAQ	1(DI), DI // i++
	MOVQ	(R8)(SI*8), SI // t = dfa[b]
	MOVQ	DX, CX // CX = state
	SHRQ	CX, SI // t >>= state&63
	MOVQ	SI, DX // state = t
```

Third optimization: since `DX` is moving into `CX`, do that one instruction earlier, allowing the use of `SI` to be optimized into `DX` to eliminate the final `MOVQ`:

```
body:
	MOVBLZX	(AX)(DI*1), SI // b = p[i]
	LEAQ	1(DI), DI // i++
	MOVQ	DX, CX // CX = state
	MOVQ	(R8)(SI*8), DX // state = dfa[b]
	SHRQ	CX, DX // state >>= CX&63
```

I think this ends up being just "compute the shift amount before the shifted value".
That change brings it up to 2150 MB/s.

This is still a direct translation of the Go code: there are no tricks the compiler couldn't do.
For this particular loop, the optimizations make the code run 35% faster.

Final assembly:

```
#include "textflag.h"

TEXT ·Valid(SB),NOSPLIT,$0
	MOVQ	x_base+0(FP),AX // p = &x[0]
	MOVQ	x_len+8(FP),BX // n = len(x)
	XORL	DI, DI // i = 0
	MOVL	$6, DX
	LEAQ	·dfa(SB), R8
	JMP loop

body:
	MOVBLZX	(AX)(DI*1), SI // b = p[i]
	LEAQ	1(DI), DI // i++
	MOVQ	DX, CX // CX = state
	MOVQ	(R8)(SI*8), DX // t = dfa[b]
	SHRQ	CX, DX // t >>= state&63

loop:
	CMPQ	DI, BX
	JLT	body

	ANDL	$0x3f, DX
	CMPL	DX, $6
	SETEQ	AL
	MOVB	AX, ret+24(FP)
	RET
```


Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

cmd/compile: tight code optimization opportunities #47120

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

cmd/compile: tight code optimization opportunities #47120

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions