Skip to content

Commit ff02c85

Browse files
committed
crypto/sha256: Use SHA extensions if available
This adds a new optimized SHA256 implementation that is used when the Intel SHA extensions are available. It is based on the reference documentation at https://software.intel.com/content/www/us/en/develop/articles/intel-sha-extensions.html

Benchmarks show close to a 4x performance improvement on an AMD Ryzen 5 3600, especially on larger inputs. Even on the smallest input it is at least 2x faster.

name                        old time/op  new time/op  delta
Sha/SHA256____16_bytes-12   248ns ± 3%   117ns ± 3%   -52.84%  (p=0.000 n=20+19)
Sha/SHA256____64_bytes-12   384ns ± 2%   153ns ± 3%   -60.10%  (p=0.000 n=20+17)
Sha/SHA256___256_bytes-12   786ns ± 1%   249ns ± 3%   -68.29%  (p=0.000 n=19+19)
Sha/SHA256____1k_bytes-12   2.36µs ± 1%  0.64µs ± 3%  -72.93%  (p=0.000 n=19+20)
Sha/SHA256____8k_bytes-12   17.0µs ± 2%  4.2µs ± 1%   -75.16%  (p=0.000 n=20+20)
Sha/SHA256__256k_bytes-12   537µs ± 1%   131µs ± 1%   -75.60%  (p=0.000 n=20+20)
Sha/SHA256_1024k_bytes-12   2.15ms ± 1%  0.52ms ± 1%  -75.60%  (p=0.000 n=20+20)
1 parent da1069e commit ff02c85

File tree

2 files changed

+284
-2
lines changed

2 files changed

+284
-2
lines changed

src/crypto/sha256/sha256block_amd64.go

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,16 @@ func blockAVX2(dig *digest, p []byte)
1212
//go:noescape
1313
func blockAMD64(dig *digest, p []byte)
1414

15-
var useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI1 && cpu.X86.HasBMI2
15+
//go:noescape
16+
func blockSHA(dig *digest, p []byte)
17+
18+
var useAVX2 = cpu.X86.HasAVX2 && cpu.X86.HasBMI2
19+
var useSHA = cpu.X86.HasSHA
1620

1721
func block(dig *digest, p []byte) {
18-
if useAVX2 {
22+
if useSHA {
23+
blockSHA(dig, p)
24+
} else if useAVX2 {
1925
blockAVX2(dig, p)
2026
} else {
2127
blockAMD64(dig, p)

src/crypto/sha256/sha256block_amd64.s

Lines changed: 276 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1026,3 +1026,279 @@ DATA K256<>+0x1f8(SB)/4, $0xbef9a3f7
10261026
DATA K256<>+0x1fc(SB)/4, $0xc67178f2
10271027

10281028
GLOBL K256<>(SB), (NOPTR + RODATA), $512
1029+
1030+
// SHA256 implementation using the SHA extension. Implemented using the
1031+
// reference implementation found at:
1032+
// https://software.intel.com/content/www/us/en/develop/articles/intel-sha-extensions.html
1033+
1034+
// Set up register aliases to make the algorithm easier to follow.
// All aliases below are used only by blockSHA.
1035+
// MSG carries the four message words (plus round constants) consumed by
// SHA256RNDS2. The hardware instruction takes this operand implicitly from
// XMM0, which is why MSG is X0.
#define MSG X0
1036+
// STATE0/STATE1 hold the eight working variables, kept as ABEF and CDGH
// inside the block loop.
#define STATE0 X1
1037+
#define STATE1 X2
1038+
// MSGTMP0..MSGTMP3 hold the rolling window of message-schedule words.
#define MSGTMP0 X3
1039+
#define MSGTMP1 X4
1040+
#define MSGTMP2 X5
1041+
#define MSGTMP3 X6
1042+
// MSGTMP4 is scratch for the PALIGNR step of the message schedule.
#define MSGTMP4 X7
1043+
// SHUF_MASK is loaded from flip_mask and applied with PSHUFB to each
// 16-byte chunk of input.
#define SHUF_MASK X8
1044+
// ABEF_SAVE/CDGH_SAVE keep the state as it was at the start of a block so
// it can be added back after round 63.
#define ABEF_SAVE X9
1045+
#define CDGH_SAVE X10
1046+
// SHA256CONSTANTS is the base address of the K256 round-constant table.
#define SHA256CONSTANTS BX
1047+
1048+
// func blockSHA(dig *digest, p []byte)
//
// Folds every complete 64-byte block of p into the eight 32-bit hash words
// read from (and written back to) the first 32 bytes at dig, using the
// SHA256RNDS2, SHA256MSG1 and SHA256MSG2 instructions. Trailing bytes
// beyond the last multiple of 64 are ignored. If p holds no complete block
// the digest is left untouched.
// NOTE(review): assumes the hash words sit at offset 0 of digest - confirm
// against the Go struct definition.
//
// Frame: $0-32 (no locals; 8-byte dig pointer + 24-byte slice header).
// Registers: CX = dig, SI = current block, DI = end of the last complete
// block, DX = byte count rounded down to a multiple of 64, BX = K256 base,
// X0-X10 as named by the aliases above.
TEXT ·blockSHA(SB), NOSPLIT, $0-32
1048+
1049+
MOVQ dig+0(FP), CX
1050+
MOVQ p_base+8(FP), SI
1051+
MOVQ p_len+16(FP), DX
1052+
// Clear the low 6 bits of the length: only whole 64-byte blocks are hashed.
SHRQ $6, DX
1053+
SHLQ $6, DX
1054+
// DI = one past the last byte that will be processed.
LEAQ (SI)(DX*1), DI
1055+
1056+
MOVOU (CX), STATE0
1057+
MOVOU 16(CX), STATE1
1058+
1059+
// Byte shuffle for correct 32 bit dword order.
1060+
// The algorithm assumes 8 32 bit working variables called
1061+
// A through H. CDGH need to be stored in one XMM register &
1062+
// ABEF in another one. So the desired end state here is ABEF
1063+
// (each a 32 bit dword in that order from high to low) in
1064+
// STATE0 & CDGH in STATE1.
1065+
//
1066+
// We start off with DCBA in STATE0 & HGFE in STATE1
1067+
//
1068+
// First, shuffle DCBA -> CDAB
1069+
PSHUFD $0xb1, STATE0, STATE0
1070+
// Shuffle HGFE -> EFGH
1071+
PSHUFD $0x1b, STATE1, STATE1
1072+
// Copy EFGH into temporary register since blending would
1073+
// otherwise lose data.
1074+
MOVO STATE1, MSGTMP0
1075+
// Blend CDAB into EFGH to result in CDGH
// (mask 0xf0 takes the upper 64 bits from STATE0, the lower from STATE1).
1076+
PBLENDW $0xf0, STATE0, STATE1
1077+
// Shift AB (8 bytes) from CDAB into EFGH to result in ABEF.
1078+
PALIGNR $8, MSGTMP0, STATE0
1079+
1080+
// NOTE(review): MOVO is an aligned load, so this relies on flip_mask being
// 16-byte aligned; flip_mask is defined outside this view - confirm.
MOVO flip_mask<>(SB), SHUF_MASK
1081+
1082+
// Reuses the existing constant table, but it means that each
1083+
// time SHA256CONSTANTS is used the offset is doubled since
1084+
// K256 contains duplicate entries for the AVX2 path.
// That is why the round groups below index it with 0*16, 2*16, ... 30*16.
1085+
LEAQ K256<>(SB), SHA256CONSTANTS
1086+
1087+
// Skip if there is no complete 64-byte block to process (SI == DI).
1088+
CMPQ SI, DI
1089+
JEQ end
1090+
1091+
loop:
1092+
1093+
// Save working variables.
1094+
MOVO STATE0, ABEF_SAVE
1095+
MOVO STATE1, CDGH_SAVE
1096+
1097+
// Rounds 0-3.
// Pattern for every group of four rounds: MSG = W[t..t+3] + K[t..t+3];
// the first SHA256RNDS2 consumes the low two dwords of MSG, PSHUFD $0x0e
// moves the high two dwords down, and the second SHA256RNDS2 (with the
// state registers swapped) consumes those.
// PSHUFB with SHUF_MASK presumably byte-swaps each big-endian message
// word; the mask contents are not visible here.
1098+
MOVOU (SI), MSG
1099+
PSHUFB SHUF_MASK, MSG
1100+
MOVO MSG, MSGTMP0
1101+
PADDD 0*16(SHA256CONSTANTS), MSG
1102+
SHA256RNDS2 MSG, STATE0, STATE1
1103+
PSHUFD $0x0e, MSG, MSG
1104+
SHA256RNDS2 MSG, STATE1, STATE0
1105+
1106+
// Rounds 4-7.
1107+
MOVOU 16(SI), MSG
1108+
PSHUFB SHUF_MASK, MSG
1109+
MOVO MSG, MSGTMP1
1110+
PADDD 2*16(SHA256CONSTANTS), MSG
1111+
SHA256RNDS2 MSG, STATE0, STATE1
1112+
PSHUFD $0x0e, MSG, MSG
1113+
SHA256RNDS2 MSG, STATE1, STATE0
1114+
// Start the message schedule for W[16..19]; the result accumulates in MSGTMP0.
SHA256MSG1 MSGTMP1, MSGTMP0
1115+
1116+
// Rounds 8-11.
1117+
MOVOU 32(SI), MSG
1118+
PSHUFB SHUF_MASK, MSG
1119+
MOVO MSG, MSGTMP2
1120+
PADDD 4*16(SHA256CONSTANTS), MSG
1121+
SHA256RNDS2 MSG, STATE0, STATE1
1122+
PSHUFD $0x0e, MSG, MSG
1123+
SHA256RNDS2 MSG, STATE1, STATE0
1124+
SHA256MSG1 MSGTMP2, MSGTMP1
1125+
1126+
// Rounds 12-15.
// From here on the message schedule is interleaved with the rounds:
// PALIGNR $4 builds the dwords straddling the two newest word groups
// in MSGTMP4, PADDD adds them to the partial sum from SHA256MSG1, and
// SHA256MSG2 finishes the next four schedule words.
1127+
MOVOU 48(SI), MSG
1128+
PSHUFB SHUF_MASK, MSG
1129+
MOVO MSG, MSGTMP3
1130+
PADDD 6*16(SHA256CONSTANTS), MSG
1131+
SHA256RNDS2 MSG, STATE0, STATE1
1132+
MOVO MSGTMP3, MSGTMP4
1133+
PALIGNR $4, MSGTMP2, MSGTMP4
1134+
PADDD MSGTMP4, MSGTMP0
1135+
SHA256MSG2 MSGTMP3, MSGTMP0
1136+
PSHUFD $0x0e, MSG, MSG
1137+
SHA256RNDS2 MSG, STATE1, STATE0
1138+
SHA256MSG1 MSGTMP3, MSGTMP2
1139+
1140+
// Rounds 16-19.
1141+
MOVO MSGTMP0, MSG
1142+
PADDD 8*16(SHA256CONSTANTS), MSG
1143+
SHA256RNDS2 MSG, STATE0, STATE1
1144+
MOVO MSGTMP0, MSGTMP4
1145+
PALIGNR $4, MSGTMP3, MSGTMP4
1146+
PADDD MSGTMP4, MSGTMP1
1147+
SHA256MSG2 MSGTMP0, MSGTMP1
1148+
PSHUFD $0x0e, MSG, MSG
1149+
SHA256RNDS2 MSG, STATE1, STATE0
1150+
SHA256MSG1 MSGTMP0, MSGTMP3
1151+
1152+
// Rounds 20-23.
1153+
MOVO MSGTMP1, MSG
1154+
PADDD 10*16(SHA256CONSTANTS), MSG
1155+
SHA256RNDS2 MSG, STATE0, STATE1
1156+
MOVO MSGTMP1, MSGTMP4
1157+
PALIGNR $4, MSGTMP0, MSGTMP4
1158+
PADDD MSGTMP4, MSGTMP2
1159+
SHA256MSG2 MSGTMP1, MSGTMP2
1160+
PSHUFD $0x0e, MSG, MSG
1161+
SHA256RNDS2 MSG, STATE1, STATE0
1162+
SHA256MSG1 MSGTMP1, MSGTMP0
1163+
1164+
// Rounds 24-27.
1165+
MOVO MSGTMP2, MSG
1166+
PADDD 12*16(SHA256CONSTANTS), MSG
1167+
SHA256RNDS2 MSG, STATE0, STATE1
1168+
MOVO MSGTMP2, MSGTMP4
1169+
PALIGNR $4, MSGTMP1, MSGTMP4
1170+
PADDD MSGTMP4, MSGTMP3
1171+
SHA256MSG2 MSGTMP2, MSGTMP3
1172+
PSHUFD $0x0e, MSG, MSG
1173+
SHA256RNDS2 MSG, STATE1, STATE0
1174+
SHA256MSG1 MSGTMP2, MSGTMP1
1175+
1176+
// Rounds 28-31.
1177+
MOVO MSGTMP3, MSG
1178+
PADDD 14*16(SHA256CONSTANTS), MSG
1179+
SHA256RNDS2 MSG, STATE0, STATE1
1180+
MOVO MSGTMP3, MSGTMP4
1181+
PALIGNR $4, MSGTMP2, MSGTMP4
1182+
PADDD MSGTMP4, MSGTMP0
1183+
SHA256MSG2 MSGTMP3, MSGTMP0
1184+
PSHUFD $0x0e, MSG, MSG
1185+
SHA256RNDS2 MSG, STATE1, STATE0
1186+
SHA256MSG1 MSGTMP3, MSGTMP2
1187+
1188+
// Rounds 32-35.
1189+
MOVO MSGTMP0, MSG
1190+
PADDD 16*16(SHA256CONSTANTS), MSG
1191+
SHA256RNDS2 MSG, STATE0, STATE1
1192+
MOVO MSGTMP0, MSGTMP4
1193+
PALIGNR $4, MSGTMP3, MSGTMP4
1194+
PADDD MSGTMP4, MSGTMP1
1195+
SHA256MSG2 MSGTMP0, MSGTMP1
1196+
PSHUFD $0x0e, MSG, MSG
1197+
SHA256RNDS2 MSG, STATE1, STATE0
1198+
SHA256MSG1 MSGTMP0, MSGTMP3
1199+
1200+
// Rounds 36-39.
1201+
MOVO MSGTMP1, MSG
1202+
PADDD 18*16(SHA256CONSTANTS), MSG
1203+
SHA256RNDS2 MSG, STATE0, STATE1
1204+
MOVO MSGTMP1, MSGTMP4
1205+
PALIGNR $4, MSGTMP0, MSGTMP4
1206+
PADDD MSGTMP4, MSGTMP2
1207+
SHA256MSG2 MSGTMP1, MSGTMP2
1208+
PSHUFD $0x0e, MSG, MSG
1209+
SHA256RNDS2 MSG, STATE1, STATE0
1210+
SHA256MSG1 MSGTMP1, MSGTMP0
1211+
1212+
// Rounds 40-43.
1213+
MOVO MSGTMP2, MSG
1214+
PADDD 20*16(SHA256CONSTANTS), MSG
1215+
SHA256RNDS2 MSG, STATE0, STATE1
1216+
MOVO MSGTMP2, MSGTMP4
1217+
PALIGNR $4, MSGTMP1, MSGTMP4
1218+
PADDD MSGTMP4, MSGTMP3
1219+
SHA256MSG2 MSGTMP2, MSGTMP3
1220+
PSHUFD $0x0e, MSG, MSG
1221+
SHA256RNDS2 MSG, STATE1, STATE0
1222+
SHA256MSG1 MSGTMP2, MSGTMP1
1223+
1224+
// Rounds 44-47.
1225+
MOVO MSGTMP3, MSG
1226+
PADDD 22*16(SHA256CONSTANTS), MSG
1227+
SHA256RNDS2 MSG, STATE0, STATE1
1228+
MOVO MSGTMP3, MSGTMP4
1229+
PALIGNR $4, MSGTMP2, MSGTMP4
1230+
PADDD MSGTMP4, MSGTMP0
1231+
SHA256MSG2 MSGTMP3, MSGTMP0
1232+
PSHUFD $0x0e, MSG, MSG
1233+
SHA256RNDS2 MSG, STATE1, STATE0
1234+
SHA256MSG1 MSGTMP3, MSGTMP2
1235+
1236+
// Rounds 48-51.
1237+
MOVO MSGTMP0, MSG
1238+
PADDD 24*16(SHA256CONSTANTS), MSG
1239+
SHA256RNDS2 MSG, STATE0, STATE1
1240+
MOVO MSGTMP0, MSGTMP4
1241+
PALIGNR $4, MSGTMP3, MSGTMP4
1242+
PADDD MSGTMP4, MSGTMP1
1243+
SHA256MSG2 MSGTMP0, MSGTMP1
1244+
PSHUFD $0x0e, MSG, MSG
1245+
SHA256RNDS2 MSG, STATE1, STATE0
1246+
SHA256MSG1 MSGTMP0, MSGTMP3
1247+
1248+
// Rounds 52-55.
// No SHA256MSG1 from here on: the last one (rounds 48-51) already began
// the words for rounds 60-63, and no schedule words exist past round 63.
1249+
MOVO MSGTMP1, MSG
1250+
PADDD 26*16(SHA256CONSTANTS), MSG
1251+
SHA256RNDS2 MSG, STATE0, STATE1
1252+
MOVO MSGTMP1, MSGTMP4
1253+
PALIGNR $4, MSGTMP0, MSGTMP4
1254+
PADDD MSGTMP4, MSGTMP2
1255+
SHA256MSG2 MSGTMP1, MSGTMP2
1256+
PSHUFD $0x0e, MSG, MSG
1257+
SHA256RNDS2 MSG, STATE1, STATE0
1258+
1259+
// Rounds 56-59.
1260+
MOVO MSGTMP2, MSG
1261+
PADDD 28*16(SHA256CONSTANTS), MSG
1262+
SHA256RNDS2 MSG, STATE0, STATE1
1263+
MOVO MSGTMP2, MSGTMP4
1264+
PALIGNR $4, MSGTMP1, MSGTMP4
1265+
PADDD MSGTMP4, MSGTMP3
1266+
SHA256MSG2 MSGTMP2, MSGTMP3
1267+
PSHUFD $0x0e, MSG, MSG
1268+
SHA256RNDS2 MSG, STATE1, STATE0
1269+
1270+
// Rounds 60-63.
1271+
MOVO MSGTMP3, MSG
1272+
PADDD 30*16(SHA256CONSTANTS), MSG
1273+
SHA256RNDS2 MSG, STATE0, STATE1
1274+
PSHUFD $0x0e, MSG, MSG
1275+
SHA256RNDS2 MSG, STATE1, STATE0
1276+
1277+
// Mix in previously saved values.
1278+
PADDD ABEF_SAVE, STATE0
1279+
PADDD CDGH_SAVE, STATE1
1280+
1281+
// Check if we need to process another block.
// Unsigned compare: keep looping while SI is still below the end pointer.
1282+
ADDQ $64, SI
1283+
CMPQ SI, DI
1284+
JB loop
1285+
1286+
// Write hash values back in the correct order. This is the
1287+
// inverse of what was done in the setup.
1288+
// Shuffle ABEF -> FEBA
1289+
PSHUFD $0x1b, STATE0, STATE0
1290+
// Shuffle CDGH -> DCHG
1291+
PSHUFD $0xb1, STATE1, STATE1
1292+
// Keep a copy of FEBA; the blend below overwrites its upper half.
MOVO STATE0, MSGTMP0
1293+
1294+
// Blend DCHG & FEBA to result in DCBA
1295+
PBLENDW $0xf0, STATE1, STATE0
1296+
// Shift FE (8 bytes) from FEBA into DCHG to result in HGFE
1297+
PALIGNR $8, MSGTMP0, STATE1
1298+
1299+
// Update digest.
1300+
MOVOU STATE0, (CX)
1301+
MOVOU STATE1, 16(CX)
1302+
1303+
end:
1304+
RET

0 commit comments

Comments
 (0)