Skip to content

Commit 10f7574

Browse files
committed
cmd/compile/internal/ssa: generate bswap on AMD64
Generate bswap+load/store for reading/writing big endian data.
Helps encoding/binary.

name                     old time/op   new time/op   delta
ReadSlice1000Int32s-8    5.06µs ± 8%   4.58µs ± 8%    -9.50%  (p=0.000 n=10+10)
ReadStruct-8             1.07µs ± 0%   1.05µs ± 0%    -1.51%  (p=0.000 n=9+10)
ReadInts-8                367ns ± 0%    363ns ± 0%    -1.15%  (p=0.000 n=8+9)
WriteInts-8               475ns ± 1%    469ns ± 0%    -1.45%  (p=0.000 n=10+10)
WriteSlice1000Int32s-8   5.03µs ± 3%   4.50µs ± 3%   -10.45%  (p=0.000 n=9+9)
PutUvarint32-8           17.2ns ± 0%   17.2ns ± 0%      ~     (all samples are equal)
PutUvarint64-8           46.7ns ± 0%   46.7ns ± 0%      ~     (p=0.509 n=10+10)

name                     old speed      new speed      delta
ReadSlice1000Int32s-8     791MB/s ± 8%   875MB/s ± 8%  +10.53%  (p=0.000 n=10+10)
ReadStruct-8             70.0MB/s ± 0%  71.1MB/s ± 0%   +1.54%  (p=0.000 n=9+10)
ReadInts-8               81.6MB/s ± 0%  82.6MB/s ± 0%   +1.21%  (p=0.000 n=9+9)
WriteInts-8              63.0MB/s ± 1%  63.9MB/s ± 0%   +1.45%  (p=0.000 n=10+10)
WriteSlice1000Int32s-8    796MB/s ± 4%   888MB/s ± 3%  +11.65%  (p=0.000 n=9+9)
PutUvarint32-8            233MB/s ± 0%   233MB/s ± 0%     ~     (p=0.089 n=10+10)
PutUvarint64-8            171MB/s ± 0%   171MB/s ± 0%     ~     (p=0.137 n=10+9)

Change-Id: Ia2dbdef92198eaa7e2af5443a8ed586d4b401ffb
Reviewed-on: https://go-review.googlesource.com/32222
Run-TryBot: Ilya Tocar <[email protected]>
TryBot-Result: Gobot Gobot <[email protected]>
Reviewed-by: Keith Randall <[email protected]>
1 parent ca5cea9 commit 10f7574

File tree

3 files changed

+1232
-62
lines changed

3 files changed

+1232
-62
lines changed

src/cmd/compile/internal/gc/asm_test.go

Lines changed: 32 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -157,6 +157,38 @@ func f(b []byte, i int) uint32 {
157157
`,
158158
[]string{"\tMOVL\t\\(.*\\)\\(.*\\*1\\),"},
159159
},
160+
{"amd64", "linux", `
161+
import "encoding/binary"
162+
func f(b []byte) uint64 {
163+
return binary.BigEndian.Uint64(b)
164+
}
165+
`,
166+
[]string{"\tBSWAPQ\t"},
167+
},
168+
{"amd64", "linux", `
169+
import "encoding/binary"
170+
func f(b []byte, i int) uint64 {
171+
return binary.BigEndian.Uint64(b[i:])
172+
}
173+
`,
174+
[]string{"\tBSWAPQ\t"},
175+
},
176+
{"amd64", "linux", `
177+
import "encoding/binary"
178+
func f(b []byte) uint32 {
179+
return binary.BigEndian.Uint32(b)
180+
}
181+
`,
182+
[]string{"\tBSWAPL\t"},
183+
},
184+
{"amd64", "linux", `
185+
import "encoding/binary"
186+
func f(b []byte, i int) uint32 {
187+
return binary.BigEndian.Uint32(b[i:])
188+
}
189+
`,
190+
[]string{"\tBSWAPL\t"},
191+
},
160192
{"386", "linux", `
161193
import "encoding/binary"
162194
func f(b []byte) uint32 {

src/cmd/compile/internal/ssa/gen/AMD64.rules

Lines changed: 198 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1507,6 +1507,204 @@
15071507
&& clobber(o5)
15081508
-> @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (MOVQloadidx1 <v.Type> [i] {s} p idx mem)
15091509

1510+
// Combine byte loads + shifts into larger (unaligned) loads + bswap
1511+
// Big-endian 32-bit load from a fixed address.
// Matches an OR tree of four byte loads from the same {s} p mem at offsets
// i, i-1, i-2 and i-3, shifted left by 0, 8, 16 and 24 bits respectively, so
// the byte at the lowest address (i-3) lands in the most significant position.
// Every load (x*), shift (s*) and inner OR (o*) must have exactly one use, and
// mergePoint(b,x0,x1,x2,x3) must be non-nil; the matched values are then passed
// to clobber. The replacement is a single MOVLload at offset i-3 wrapped in
// BSWAPL.
// NOTE(review): mergePoint and clobber are defined outside this view --
// presumably mergePoint selects the block that receives the combined load and
// clobber invalidates the values being replaced; confirm in the rewrite helpers.
(ORL o1:(ORL o0:(ORL
1512+
x0:(MOVBload [i] {s} p mem)
1513+
s0:(SHLLconst [8] x1:(MOVBload [i-1] {s} p mem)))
1514+
s1:(SHLLconst [16] x2:(MOVBload [i-2] {s} p mem)))
1515+
s2:(SHLLconst [24] x3:(MOVBload [i-3] {s} p mem)))
1516+
&& x0.Uses == 1
1517+
&& x1.Uses == 1
1518+
&& x2.Uses == 1
1519+
&& x3.Uses == 1
1520+
&& s0.Uses == 1
1521+
&& s1.Uses == 1
1522+
&& s2.Uses == 1
1523+
&& o0.Uses == 1
1524+
&& o1.Uses == 1
1525+
&& mergePoint(b,x0,x1,x2,x3) != nil
1526+
&& clobber(x0)
1527+
&& clobber(x1)
1528+
&& clobber(x2)
1529+
&& clobber(x3)
1530+
&& clobber(s0)
1531+
&& clobber(s1)
1532+
&& clobber(s2)
1533+
&& clobber(o0)
1534+
&& clobber(o1)
1535+
-> @mergePoint(b,x0,x1,x2,x3) (BSWAPL <v.Type> (MOVLload [i-3] {s} p mem))
1536+
1537+
// Big-endian 32-bit load from an indexed address.
// Same shape as the non-indexed ORL rule, but every byte load is a
// MOVBloadidx1 sharing the same {s} p idx mem: offsets i, i-1, i-2, i-3 are
// shifted left by 0, 8, 16, 24 bits, so the byte at the lowest offset (i-3)
// becomes the most significant one. All loads, shifts and inner ORs must be
// single-use and mergePoint(b,x0,x1,x2,x3) must be non-nil before the matched
// values are passed to clobber. The replacement is BSWAPL of one MOVLloadidx1
// at offset i-3; here the inner load is given an explicit <v.Type>.
// NOTE(review): mergePoint and clobber are defined outside this view; their
// exact contracts should be confirmed in the rewrite helpers.
(ORL o1:(ORL o0:(ORL
1538+
x0:(MOVBloadidx1 [i] {s} p idx mem)
1539+
s0:(SHLLconst [8] x1:(MOVBloadidx1 [i-1] {s} p idx mem)))
1540+
s1:(SHLLconst [16] x2:(MOVBloadidx1 [i-2] {s} p idx mem)))
1541+
s2:(SHLLconst [24] x3:(MOVBloadidx1 [i-3] {s} p idx mem)))
1542+
&& x0.Uses == 1
1543+
&& x1.Uses == 1
1544+
&& x2.Uses == 1
1545+
&& x3.Uses == 1
1546+
&& s0.Uses == 1
1547+
&& s1.Uses == 1
1548+
&& s2.Uses == 1
1549+
&& o0.Uses == 1
1550+
&& o1.Uses == 1
1551+
&& mergePoint(b,x0,x1,x2,x3) != nil
1552+
&& clobber(x0)
1553+
&& clobber(x1)
1554+
&& clobber(x2)
1555+
&& clobber(x3)
1556+
&& clobber(s0)
1557+
&& clobber(s1)
1558+
&& clobber(s2)
1559+
&& clobber(o0)
1560+
&& clobber(o1)
1561+
-> @mergePoint(b,x0,x1,x2,x3) (BSWAPL <v.Type> (MOVLloadidx1 <v.Type> [i-3] {s} p idx mem))
1562+
1563+
// Big-endian 64-bit load from a fixed address.
// Matches an OR tree of eight byte loads from the same {s} p mem at offsets
// i down to i-7, shifted left by 0, 8, ..., 56 bits respectively, so the byte
// at the lowest address (i-7) lands in the most significant position.
// Every load (x0..x7), shift (s0..s6) and inner OR (o0..o5) must have exactly
// one use, and mergePoint over all eight loads must be non-nil; the matched
// values are then passed to clobber. The replacement is a single MOVQload at
// offset i-7 wrapped in BSWAPQ.
// NOTE(review): mergePoint and clobber are defined outside this view --
// presumably mergePoint selects the block that receives the combined load and
// clobber invalidates the values being replaced; confirm in the rewrite helpers.
(ORQ o5:(ORQ o4:(ORQ o3:(ORQ o2:(ORQ o1:(ORQ o0:(ORQ
1564+
x0:(MOVBload [i] {s} p mem)
1565+
s0:(SHLQconst [8] x1:(MOVBload [i-1] {s} p mem)))
1566+
s1:(SHLQconst [16] x2:(MOVBload [i-2] {s} p mem)))
1567+
s2:(SHLQconst [24] x3:(MOVBload [i-3] {s} p mem)))
1568+
s3:(SHLQconst [32] x4:(MOVBload [i-4] {s} p mem)))
1569+
s4:(SHLQconst [40] x5:(MOVBload [i-5] {s} p mem)))
1570+
s5:(SHLQconst [48] x6:(MOVBload [i-6] {s} p mem)))
1571+
s6:(SHLQconst [56] x7:(MOVBload [i-7] {s} p mem)))
1572+
&& x0.Uses == 1
1573+
&& x1.Uses == 1
1574+
&& x2.Uses == 1
1575+
&& x3.Uses == 1
1576+
&& x4.Uses == 1
1577+
&& x5.Uses == 1
1578+
&& x6.Uses == 1
1579+
&& x7.Uses == 1
1580+
&& s0.Uses == 1
1581+
&& s1.Uses == 1
1582+
&& s2.Uses == 1
1583+
&& s3.Uses == 1
1584+
&& s4.Uses == 1
1585+
&& s5.Uses == 1
1586+
&& s6.Uses == 1
1587+
&& o0.Uses == 1
1588+
&& o1.Uses == 1
1589+
&& o2.Uses == 1
1590+
&& o3.Uses == 1
1591+
&& o4.Uses == 1
1592+
&& o5.Uses == 1
1593+
&& mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) != nil
1594+
&& clobber(x0)
1595+
&& clobber(x1)
1596+
&& clobber(x2)
1597+
&& clobber(x3)
1598+
&& clobber(x4)
1599+
&& clobber(x5)
1600+
&& clobber(x6)
1601+
&& clobber(x7)
1602+
&& clobber(s0)
1603+
&& clobber(s1)
1604+
&& clobber(s2)
1605+
&& clobber(s3)
1606+
&& clobber(s4)
1607+
&& clobber(s5)
1608+
&& clobber(s6)
1609+
&& clobber(o0)
1610+
&& clobber(o1)
1611+
&& clobber(o2)
1612+
&& clobber(o3)
1613+
&& clobber(o4)
1614+
&& clobber(o5)
1615+
-> @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (BSWAPQ <v.Type> (MOVQload [i-7] {s} p mem))
1616+
1617+
// Big-endian 64-bit load from an indexed address.
// Same shape as the non-indexed ORQ rule, but every byte load is a
// MOVBloadidx1 sharing the same {s} p idx mem: offsets i down to i-7 are
// shifted left by 0, 8, ..., 56 bits, so the byte at the lowest offset (i-7)
// becomes the most significant one. All loads (x0..x7), shifts (s0..s6) and
// inner ORs (o0..o5) must be single-use and mergePoint over the eight loads
// must be non-nil before the matched values are passed to clobber. The
// replacement is BSWAPQ of one MOVQloadidx1 at offset i-7; here the inner load
// is given an explicit <v.Type>.
// NOTE(review): mergePoint and clobber are defined outside this view; their
// exact contracts should be confirmed in the rewrite helpers.
(ORQ o5:(ORQ o4:(ORQ o3:(ORQ o2:(ORQ o1:(ORQ o0:(ORQ
1618+
x0:(MOVBloadidx1 [i] {s} p idx mem)
1619+
s0:(SHLQconst [8] x1:(MOVBloadidx1 [i-1] {s} p idx mem)))
1620+
s1:(SHLQconst [16] x2:(MOVBloadidx1 [i-2] {s} p idx mem)))
1621+
s2:(SHLQconst [24] x3:(MOVBloadidx1 [i-3] {s} p idx mem)))
1622+
s3:(SHLQconst [32] x4:(MOVBloadidx1 [i-4] {s} p idx mem)))
1623+
s4:(SHLQconst [40] x5:(MOVBloadidx1 [i-5] {s} p idx mem)))
1624+
s5:(SHLQconst [48] x6:(MOVBloadidx1 [i-6] {s} p idx mem)))
1625+
s6:(SHLQconst [56] x7:(MOVBloadidx1 [i-7] {s} p idx mem)))
1626+
&& x0.Uses == 1
1627+
&& x1.Uses == 1
1628+
&& x2.Uses == 1
1629+
&& x3.Uses == 1
1630+
&& x4.Uses == 1
1631+
&& x5.Uses == 1
1632+
&& x6.Uses == 1
1633+
&& x7.Uses == 1
1634+
&& s0.Uses == 1
1635+
&& s1.Uses == 1
1636+
&& s2.Uses == 1
1637+
&& s3.Uses == 1
1638+
&& s4.Uses == 1
1639+
&& s5.Uses == 1
1640+
&& s6.Uses == 1
1641+
&& o0.Uses == 1
1642+
&& o1.Uses == 1
1643+
&& o2.Uses == 1
1644+
&& o3.Uses == 1
1645+
&& o4.Uses == 1
1646+
&& o5.Uses == 1
1647+
&& mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) != nil
1648+
&& clobber(x0)
1649+
&& clobber(x1)
1650+
&& clobber(x2)
1651+
&& clobber(x3)
1652+
&& clobber(x4)
1653+
&& clobber(x5)
1654+
&& clobber(x6)
1655+
&& clobber(x7)
1656+
&& clobber(s0)
1657+
&& clobber(s1)
1658+
&& clobber(s2)
1659+
&& clobber(s3)
1660+
&& clobber(s4)
1661+
&& clobber(s5)
1662+
&& clobber(s6)
1663+
&& clobber(o0)
1664+
&& clobber(o1)
1665+
&& clobber(o2)
1666+
&& clobber(o3)
1667+
&& clobber(o4)
1668+
&& clobber(o5)
1669+
-> @mergePoint(b,x0,x1,x2,x3,x4,x5,x6,x7) (BSWAPQ <v.Type> (MOVQloadidx1 <v.Type> [i-7] {s} p idx mem))
1670+
1671+
// Combine stores + shifts into bswap and larger (unaligned) stores
1672+
// Big-endian 32-bit store.
// Matches a chain of four byte stores to the same {s} p, linked through their
// memory arguments: w at offset i, w>>8 at i-1, w>>16 at i-2 and w>>24 at i-3,
// so the most significant byte of w goes to the lowest address (i-3).
// The three inner stores (x2, x1, x0) must each have exactly one use and are
// then passed to clobber; the outermost store is the value being rewritten.
// The replacement is one MOVLstore at offset i-3 of BSWAPL(w), using the
// memory state that fed the innermost store.
// NOTE(review): clobber is defined outside this view -- presumably it
// invalidates the replaced stores; confirm in the rewrite helpers.
(MOVBstore [i] {s} p w
1673+
x2:(MOVBstore [i-1] {s} p (SHRLconst [8] w)
1674+
x1:(MOVBstore [i-2] {s} p (SHRLconst [16] w)
1675+
x0:(MOVBstore [i-3] {s} p (SHRLconst [24] w) mem))))
1676+
&& x0.Uses == 1
1677+
&& x1.Uses == 1
1678+
&& x2.Uses == 1
1679+
&& clobber(x0)
1680+
&& clobber(x1)
1681+
&& clobber(x2)
1682+
-> (MOVLstore [i-3] {s} p (BSWAPL <w.Type> w) mem)
1683+
1684+
// Big-endian 64-bit store.
// Matches a chain of eight byte stores to the same {s} p, linked through their
// memory arguments: w at offset i, then w>>8, w>>16, ..., w>>56 at offsets
// i-1 down to i-7, so the most significant byte of w goes to the lowest
// address (i-7).
// The seven inner stores (x6 down to x0) must each have exactly one use and
// are then passed to clobber; the outermost store is the value being rewritten.
// The replacement is one MOVQstore at offset i-7 of BSWAPQ(w), using the
// memory state that fed the innermost store.
// NOTE(review): clobber is defined outside this view -- presumably it
// invalidates the replaced stores; confirm in the rewrite helpers.
(MOVBstore [i] {s} p w
1685+
x6:(MOVBstore [i-1] {s} p (SHRQconst [8] w)
1686+
x5:(MOVBstore [i-2] {s} p (SHRQconst [16] w)
1687+
x4:(MOVBstore [i-3] {s} p (SHRQconst [24] w)
1688+
x3:(MOVBstore [i-4] {s} p (SHRQconst [32] w)
1689+
x2:(MOVBstore [i-5] {s} p (SHRQconst [40] w)
1690+
x1:(MOVBstore [i-6] {s} p (SHRQconst [48] w)
1691+
x0:(MOVBstore [i-7] {s} p (SHRQconst [56] w) mem))))))))
1692+
&& x0.Uses == 1
1693+
&& x1.Uses == 1
1694+
&& x2.Uses == 1
1695+
&& x3.Uses == 1
1696+
&& x4.Uses == 1
1697+
&& x5.Uses == 1
1698+
&& x6.Uses == 1
1699+
&& clobber(x0)
1700+
&& clobber(x1)
1701+
&& clobber(x2)
1702+
&& clobber(x3)
1703+
&& clobber(x4)
1704+
&& clobber(x5)
1705+
&& clobber(x6)
1706+
-> (MOVQstore [i-7] {s} p (BSWAPQ <w.Type> w) mem)
1707+
15101708
// Combine constant stores into larger (unaligned) stores.
15111709
(MOVBstoreconst [c] {s} p x:(MOVBstoreconst [a] {s} p mem))
15121710
&& x.Uses == 1

0 commit comments

Comments
 (0)