@@ -1026,3 +1026,279 @@ DATA K256<>+0x1f8(SB)/4, $0xbef9a3f7
1026
1026
DATA K256<>+0x1fc (SB)/4 , $0xc67178f2
1027
1027
1028
1028
GLOBL K256<>(SB), (NOPTR + RODATA), $512
1029
+
1030
// SHA-256 block function built on the x86 SHA extensions (SHA256RNDS2,
// SHA256MSG1, SHA256MSG2). It follows the reference implementation at:
// https://software.intel.com/content/www/us/en/develop/articles/intel-sha-extensions.html
1033
+
1034
// Register aliases, so the SHA routine below can be read in the vocabulary
// of the reference algorithm instead of raw register numbers.

// Round input: message words plus round constants. SHA256RNDS2 takes its
// round input implicitly from XMM0, so this alias has to stay on X0.
#define MSG             X0

// Working variables, packed as described in the routine's setup comments:
// STATE0 carries A,B,E,F and STATE1 carries C,D,G,H.
#define STATE0          X1
#define STATE1          X2

// MSGTMP0-MSGTMP3 hold the rolling window of message-schedule words;
// MSGTMP4 is the scratch register used by the PALIGNR step of the schedule.
#define MSGTMP0         X3
#define MSGTMP1         X4
#define MSGTMP2         X5
#define MSGTMP3         X6
#define MSGTMP4         X7

// Byte-shuffle control loaded from flip_mask<> and applied to each
// 16-byte chunk of input.
#define SHUF_MASK       X8

// Copies of STATE0/STATE1 taken at the top of each block and added back
// once the 64 rounds are done.
#define ABEF_SAVE       X9
#define CDGH_SAVE       X10

// Base address of the K256<> round-constant table.
#define SHA256CONSTANTS BX
1047
+
1048
// Building blocks for the round groups of blockSHA. Each macro expands to
// exactly the instruction sequence it replaces, in the same order.

// SHA_NI_LOAD(boff, wdst): read the 16 input bytes at boff(SI), reorder them
// with SHUF_MASK and keep a copy of the result in wdst. MSG holds the same
// value afterwards.
// NOTE(review): SHUF_MASK comes from flip_mask<>, which is defined elsewhere;
// presumably it byte-swaps every 32-bit word -- confirm against the table.
#define SHA_NI_LOAD(boff, wdst) \
	MOVOU  boff(SI), MSG;   \
	PSHUFB SHUF_MASK, MSG;  \
	MOVO   MSG, wdst

// SHA_NI_RNDS_LO(krow): add constant row krow to MSG and run two rounds.
// Rows sit 32 bytes apart because K256<> stores each 16-byte row twice for
// the AVX2 path, so row i is at offset i*32.
#define SHA_NI_RNDS_LO(krow) \
	PADDD       (krow*32)(SHA256CONSTANTS), MSG; \
	SHA256RNDS2 MSG, STATE0, STATE1

// SHA_NI_RNDS_HI: bring the upper two dwords of MSG down to the lower half
// and run the next two rounds with the state registers swapped.
#define SHA_NI_RNDS_HI \
	PSHUFD      $0x0e, MSG, MSG; \
	SHA256RNDS2 MSG, STATE1, STATE0

// SHA_NI_SCHED(wprev, wcur, wnext): second half of the message schedule for
// wnext, using MSGTMP4 as scratch.
#define SHA_NI_SCHED(wprev, wcur, wnext) \
	MOVO       wcur, MSGTMP4;      \
	PALIGNR    $4, wprev, MSGTMP4; \
	PADDD      MSGTMP4, wnext;     \
	SHA256MSG2 wcur, wnext

// SHA_NI_QUAD(krow, wprev, wcur, wnext): one full middle group of four
// rounds. Consumes wcur, finishes the schedule for wnext, and starts the
// schedule for wprev (SHA256MSG1) so it can be reused three groups later.
#define SHA_NI_QUAD(krow, wprev, wcur, wnext) \
	MOVO       wcur, MSG;            \
	SHA_NI_RNDS_LO(krow);            \
	SHA_NI_SCHED(wprev, wcur, wnext); \
	SHA_NI_RNDS_HI;                  \
	SHA256MSG1 wcur, wprev

// blockSHA folds every complete 64-byte block of the input into the SHA-256
// state, using the SHA256RNDS2 / SHA256MSG1 / SHA256MSG2 instructions.
//
// Frame layout (32 bytes of arguments, no locals, NOSPLIT):
//   dig+0(FP)     address of the 32-byte state; read on entry, rewritten on exit
//   p_base+8(FP)  address of the first input byte
//   p_len+16(FP)  input length in bytes; rounded down to a multiple of 64
//
// NOTE(review): the Go declaration is presumably
//   func blockSHA(dig *digest, p []byte)
// -- confirm against the stub file.
//
// If the input holds no complete block the routine returns without storing
// to dig. Registers written: CX, DX, SI, DI, plus BX and X0-X10 through the
// register aliases.
TEXT ·blockSHA(SB), NOSPLIT, $0-32
	// CX = state, SI = read cursor, DI = first byte past the last whole block.
	MOVQ dig+0(FP), CX
	MOVQ p_base+8(FP), SI
	MOVQ p_len+16(FP), DX
	SHRQ $6, DX
	SHLQ $6, DX
	LEAQ (SI)(DX*1), DI

	MOVOU (CX), STATE0
	MOVOU 16(CX), STATE1

	// Repack the state for SHA256RNDS2. Naming the eight 32-bit working
	// variables A..H and writing each register from its high dword to its
	// low dword, memory gives us STATE0 = DCBA and STATE1 = HGFE, while the
	// instruction wants STATE0 = ABEF and STATE1 = CDGH.
	PSHUFD  $0xb1, STATE0, STATE0 // DCBA -> CDAB
	PSHUFD  $0x1b, STATE1, STATE1 // HGFE -> EFGH
	MOVO    STATE1, MSGTMP0       // keep EFGH; the blend below overwrites it
	PBLENDW $0xf0, STATE0, STATE1 // upper half from CDAB, lower from EFGH -> CDGH
	PALIGNR $8, MSGTMP0, STATE0   // (CDAB:EFGH) shifted right by 8 bytes -> ABEF

	MOVO flip_mask<>(SB), SHUF_MASK

	// The constant table is shared with the other code paths.
	LEAQ K256<>(SB), SHA256CONSTANTS

	// Nothing to do when the input holds no complete block.
	CMPQ SI, DI
	JEQ  end

loop:
	// Remember the incoming state for the feed-forward addition.
	MOVO STATE0, ABEF_SAVE
	MOVO STATE1, CDGH_SAVE

	// Rounds 0-3.
	SHA_NI_LOAD(0, MSGTMP0)
	SHA_NI_RNDS_LO(0)
	SHA_NI_RNDS_HI

	// Rounds 4-7.
	SHA_NI_LOAD(16, MSGTMP1)
	SHA_NI_RNDS_LO(1)
	SHA_NI_RNDS_HI
	SHA256MSG1 MSGTMP1, MSGTMP0

	// Rounds 8-11.
	SHA_NI_LOAD(32, MSGTMP2)
	SHA_NI_RNDS_LO(2)
	SHA_NI_RNDS_HI
	SHA256MSG1 MSGTMP2, MSGTMP1

	// Rounds 12-15: last chunk taken from the input, first scheduled chunk
	// produced.
	SHA_NI_LOAD(48, MSGTMP3)
	SHA_NI_RNDS_LO(3)
	SHA_NI_SCHED(MSGTMP2, MSGTMP3, MSGTMP0)
	SHA_NI_RNDS_HI
	SHA256MSG1 MSGTMP3, MSGTMP2

	// Rounds 16-51: nine identical groups, rotating through MSGTMP0-3.
	SHA_NI_QUAD(4, MSGTMP3, MSGTMP0, MSGTMP1)  // rounds 16-19
	SHA_NI_QUAD(5, MSGTMP0, MSGTMP1, MSGTMP2)  // rounds 20-23
	SHA_NI_QUAD(6, MSGTMP1, MSGTMP2, MSGTMP3)  // rounds 24-27
	SHA_NI_QUAD(7, MSGTMP2, MSGTMP3, MSGTMP0)  // rounds 28-31
	SHA_NI_QUAD(8, MSGTMP3, MSGTMP0, MSGTMP1)  // rounds 32-35
	SHA_NI_QUAD(9, MSGTMP0, MSGTMP1, MSGTMP2)  // rounds 36-39
	SHA_NI_QUAD(10, MSGTMP1, MSGTMP2, MSGTMP3) // rounds 40-43
	SHA_NI_QUAD(11, MSGTMP2, MSGTMP3, MSGTMP0) // rounds 44-47
	SHA_NI_QUAD(12, MSGTMP3, MSGTMP0, MSGTMP1) // rounds 48-51

	// Rounds 52-55: no SHA256MSG1 from here on, nothing further to start.
	MOVO MSGTMP1, MSG
	SHA_NI_RNDS_LO(13)
	SHA_NI_SCHED(MSGTMP0, MSGTMP1, MSGTMP2)
	SHA_NI_RNDS_HI

	// Rounds 56-59.
	MOVO MSGTMP2, MSG
	SHA_NI_RNDS_LO(14)
	SHA_NI_SCHED(MSGTMP1, MSGTMP2, MSGTMP3)
	SHA_NI_RNDS_HI

	// Rounds 60-63: schedule is complete, only the rounds remain.
	MOVO MSGTMP3, MSG
	SHA_NI_RNDS_LO(15)
	SHA_NI_RNDS_HI

	// Feed-forward: add the state this block started from.
	PADDD ABEF_SAVE, STATE0
	PADDD CDGH_SAVE, STATE1

	// Advance one block; keep going while the cursor is below the end
	// (unsigned compare).
	ADDQ $64, SI
	CMPQ SI, DI
	JB   loop

	// Undo the setup shuffle so the state returns to memory as DCBA / HGFE.
	PSHUFD  $0x1b, STATE0, STATE0 // ABEF -> FEBA
	PSHUFD  $0xb1, STATE1, STATE1 // CDGH -> DCHG
	MOVO    STATE0, MSGTMP0       // keep FEBA; the blend below overwrites it
	PBLENDW $0xf0, STATE1, STATE0 // upper half from DCHG, lower from FEBA -> DCBA
	PALIGNR $8, MSGTMP0, STATE1   // (DCHG:FEBA) shifted right by 8 bytes -> HGFE

	// Store the updated state.
	MOVOU STATE0, (CX)
	MOVOU STATE1, 16(CX)

end:
	RET
0 commit comments