Skip to content

Commit 8e3549f

Browse files
Xubo Zhang, Greg B Tucker, and Pengfei Li
authored and
Vladimir Kozlov
committed
8266332: Adler32 intrinsic for x86 64-bit platforms
Co-authored-by: Xubo Zhang <[email protected]>
Co-authored-by: Greg B Tucker <[email protected]>
Co-authored-by: Pengfei Li <[email protected]>
Reviewed-by: sviswanathan, jbhateja, kvn, neliasso
1 parent b961f25 commit 8e3549f

13 files changed

+394
-6
lines changed

src/hotspot/cpu/x86/assembler_x86.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8030,6 +8030,18 @@ void Assembler::vbroadcastsd(XMMRegister dst, Address src, int vector_len) {
80308030
emit_operand(dst, src);
80318031
}
80328032

8033+
// VBROADCASTF128 ymm, m128: replicate a 128-bit memory operand into both
// 128-bit lanes of a 256-bit register (VEX.256.66.0F38.W0 1A /r).
// Only the memory-source form exists, and only at 256-bit vector length.
void Assembler::vbroadcastf128(XMMRegister dst, Address src, int vector_len) {
  assert(VM_Version::supports_avx(), "");
  assert(vector_len == AVX_256bit, "");  // instruction is defined only for 256-bit
  assert(dst != xnoreg, "sanity");
  InstructionMark im(this);
  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
  // Memory operand is four dwords (tuple T4) for EVEX displacement scaling.
  attributes.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
  // swap src<->dst for encoding
  vex_prefix(src, 0, dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
  emit_int8(0x1A);
  emit_operand(dst, src);
}
80338045

80348046
// gpr source broadcast forms
80358047

src/hotspot/cpu/x86/assembler_x86.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2442,11 +2442,12 @@ class Assembler : public AbstractAssembler {
24422442
void evbroadcasti64x2(XMMRegister dst, XMMRegister src, int vector_len);
24432443
void evbroadcasti64x2(XMMRegister dst, Address src, int vector_len);
24442444

2445-
// scalar single/double precision replicate
2445+
// scalar single/double precision and 128-bit replicate
24462446
void vbroadcastss(XMMRegister dst, XMMRegister src, int vector_len);
24472447
void vbroadcastss(XMMRegister dst, Address src, int vector_len);
24482448
void vbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len);
24492449
void vbroadcastsd(XMMRegister dst, Address src, int vector_len);
2450+
void vbroadcastf128(XMMRegister dst, Address src, int vector_len);
24502451

24512452
// gpr sourced byte/word/dword/qword replicate
24522453
void evpbroadcastb(XMMRegister dst, Register src, int vector_len);

src/hotspot/cpu/x86/macroAssembler_x86.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3231,6 +3231,16 @@ void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int
32313231
Assembler::vpmullw(dst, nds, src, vector_len);
32323232
}
32333233

3234+
// VPMULLD with a constant (AddressLiteral) operand.  When the literal is not
// RIP-reachable from the current code position, its address is materialized
// into scratch_reg first and an indirect memory operand is used instead.
void MacroAssembler::vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
  assert((UseAVX > 0), "AVX support is needed");
  if (!reachable(src)) {
    // Literal out of RIP-relative range: go through the scratch register.
    lea(scratch_reg, src);
    Assembler::vpmulld(dst, nds, Address(scratch_reg, 0), vector_len);
  } else {
    Assembler::vpmulld(dst, nds, as_Address(src), vector_len);
  }
}
3243+
32343244
void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
32353245
assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
32363246
Assembler::vpsubb(dst, nds, src, vector_len);

src/hotspot/cpu/x86/macroAssembler_x86.hpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1307,6 +1307,13 @@ class MacroAssembler: public Assembler {
13071307

13081308
void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
13091309
void vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
1310+
void vpmulld(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
1311+
Assembler::vpmulld(dst, nds, src, vector_len);
1312+
};
1313+
void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
1314+
Assembler::vpmulld(dst, nds, src, vector_len);
1315+
}
1316+
void vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg);
13101317

13111318
void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
13121319
void vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
@@ -1764,6 +1771,7 @@ class MacroAssembler: public Assembler {
17641771
void kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register key, Register pos,
17651772
Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
17661773
Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup);
1774+
void updateBytesAdler32(Register adler32, Register buf, Register length, XMMRegister shuf0, XMMRegister shuf1, ExternalAddress scale);
17671775
#endif // _LP64
17681776

17691777
// CRC32C code for java.util.zip.CRC32C::updateBytes() intrinsic
Lines changed: 211 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,211 @@
1+
/*
2+
* Copyright (c) 2021, Intel Corporation.
3+
*
4+
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
5+
*
6+
* This code is free software; you can redistribute it and/or modify it
7+
* under the terms of the GNU General Public License version 2 only, as
8+
* published by the Free Software Foundation.
9+
*
10+
* This code is distributed in the hope that it will be useful, but WITHOUT
11+
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12+
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
13+
* version 2 for more details (a copy is included in the LICENSE file that
14+
* accompanied this code).
15+
*
16+
* You should have received a copy of the GNU General Public License version
17+
* 2 along with this work; if not, write to the Free Software Foundation,
18+
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
19+
*
20+
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
21+
* or visit www.oracle.com if you need additional information or have any
22+
* questions.
23+
*
24+
*/
25+
26+
#include "precompiled.hpp"
27+
#include "asm/assembler.hpp"
28+
#include "asm/assembler.inline.hpp"
29+
#include "runtime/stubRoutines.hpp"
30+
#include "macroAssembler_x86.hpp"
31+
32+
#ifdef _LP64
// Intrinsic body for java.util.zip.Adler32.updateBytes().
//
// Inputs:
//   init_d    - packed incoming checksum: b in the high 16 bits, a in the
//               low 16 bits (unpacked immediately below)
//   data      - byte* buffer
//   size      - int length
//   yshuf0/1  - preloaded pshufb patterns that zero-extend bytes 0..7 /
//               8..15 of a 16-byte chunk into eight dword lanes
//   ascaletab - address of the dword table {0..7} used to weight the
//               per-lane 'a' sums by lane position during reduction
//
// Output: rax = (b << 16) | a, both reduced mod BASE.
//
// Strategy: the vector loop consumes CHUNKSIZE (16) bytes per iteration,
// accumulating dword partial sums in ya ('a' terms) and yb ('b' terms).
// LIMIT (5552) bounds how many bytes may be accumulated before the 32-bit
// sums must be reduced mod BASE (65521) — NOTE(review): matches zlib's NMAX;
// confirm against the overflow bound if changing CHUNKSIZE.
// Clobbers rax/rcx/rdx and xmm0-xmm5; r12/r13/r14 are saved and restored.
void MacroAssembler::updateBytesAdler32(Register init_d, Register data, Register size, XMMRegister yshuf0, XMMRegister yshuf1, ExternalAddress ascaletab)
{
  const int LIMIT = 5552;
  const int BASE = 65521;
  const int CHUNKSIZE = 16;
  const int CHUNKSIZE_M1 = CHUNKSIZE - 1;

  // Scalar scratch registers.
  const Register s = r11;    // bytes handled by the current outer pass
  const Register a_d = r12;  //r12d - scalar 'a' accumulator
  const Register b_d = r8;   //r8d  - scalar 'b' accumulator
  const Register end = r13;  // end-of-pass / end-of-input pointer

  // Vector registers.  The x*/y* names alias the same physical registers
  // for their 128-bit vs 256-bit uses; ydata/ytmp* also alias ysa/ydata0/1,
  // which is safe because their live ranges do not overlap.
  const XMMRegister ya = xmm0;
  const XMMRegister yb = xmm1;
  const XMMRegister ydata0 = xmm2;
  const XMMRegister ydata1 = xmm3;
  const XMMRegister ysa = xmm4;
  const XMMRegister ydata = ysa;
  const XMMRegister ytmp0 = ydata0;
  const XMMRegister ytmp1 = ydata1;
  const XMMRegister ytmp2 = xmm5;
  const XMMRegister xa = xmm0;
  const XMMRegister xb = xmm1;
  const XMMRegister xtmp0 = xmm2;
  const XMMRegister xtmp1 = xmm3;
  const XMMRegister xsa = xmm4;
  const XMMRegister xtmp2 = xmm5;
  assert_different_registers(init_d, data, size, s, a_d, b_d, end, rax);

  Label SLOOP1, SLOOP1A, SKIP_LOOP_1A, FINISH, LT64, DO_FINAL, FINAL_LOOP, ZERO_SIZE, END;

  push(r12);
  push(r13);
  push(r14);
  // Unpack incoming checksum: b = adler >> 16, a = adler & 0xFFFF.
  movl(b_d, init_d); //adler
  shrl(b_d, 16);
  andl(init_d, 0xFFFF);
  cmpl(size, 32);
  jcc(Assembler::below, LT64);  // small input: scalar byte loop only
  movdl(xa, init_d); //vmovd - 32bit
  vpxor(yb, yb, yb, Assembler::AVX_256bit);

  // Outer loop: at most LIMIT bytes per pass, then reduce mod BASE.
  bind(SLOOP1);
  movl(s, LIMIT);
  cmpl(s, size);
  cmovl(Assembler::above, s, size); // s = min(size, LIMIT)
  lea(end, Address(s, data, Address::times_1, -CHUNKSIZE_M1));
  cmpptr(data, end);
  jcc(Assembler::aboveEqual, SKIP_LOOP_1A);

  // Inner loop: one 16-byte chunk per iteration.
  align(32);
  bind(SLOOP1A);
  vbroadcastf128(ydata, Address(data, 0), Assembler::AVX_256bit);
  addptr(data, CHUNKSIZE);
  vpshufb(ydata0, ydata, yshuf0, Assembler::AVX_256bit);  // bytes 0..7 -> dwords
  vpaddd(ya, ya, ydata0, Assembler::AVX_256bit);
  vpaddd(yb, yb, ya, Assembler::AVX_256bit);              // b accumulates running a
  vpshufb(ydata1, ydata, yshuf1, Assembler::AVX_256bit);  // bytes 8..15 -> dwords
  vpaddd(ya, ya, ydata1, Assembler::AVX_256bit);
  vpaddd(yb, yb, ya, Assembler::AVX_256bit);
  cmpptr(data, end);
  jcc(Assembler::below, SLOOP1A);

  bind(SKIP_LOOP_1A);
  addptr(end, CHUNKSIZE_M1);
  testl(s, CHUNKSIZE_M1);
  jcc(Assembler::notEqual, DO_FINAL);  // non-multiple of 16: finish bytewise

  // either we're done, or we just did LIMIT
  subl(size, s);

  // reduce
  vpslld(yb, yb, 3, Assembler::AVX_256bit); //b is scaled by 8
  vpmulld(ysa, ya, ascaletab, Assembler::AVX_256bit, r14);  // lane-position correction

  // compute horizontal sums of ya, yb, ysa
  vextracti128(xtmp0, ya, 1);
  vextracti128(xtmp1, yb, 1);
  vextracti128(xtmp2, ysa, 1);
  vpaddd(xa, xa, xtmp0, Assembler::AVX_128bit);
  vpaddd(xb, xb, xtmp1, Assembler::AVX_128bit);
  vpaddd(xsa, xsa, xtmp2, Assembler::AVX_128bit);
  vphaddd(xa, xa, xa, Assembler::AVX_128bit);
  vphaddd(xb, xb, xb, Assembler::AVX_128bit);
  vphaddd(xsa, xsa, xsa, Assembler::AVX_128bit);
  vphaddd(xa, xa, xa, Assembler::AVX_128bit);
  vphaddd(xb, xb, xb, Assembler::AVX_128bit);
  vphaddd(xsa, xsa, xsa, Assembler::AVX_128bit);

  // a = sum mod BASE
  movdl(rax, xa);
  xorl(rdx, rdx);
  movl(rcx, BASE);
  divl(rcx); // divide edx:eax by ecx, quot->eax, rem->edx
  movl(a_d, rdx);

  // b = (sum(yb) - sum(ysa) + b) mod BASE
  vpsubd(xb, xb, xsa, Assembler::AVX_128bit);
  movdl(rax, xb);
  addl(rax, b_d);
  xorl(rdx, rdx);
  movl(rcx, BASE);
  divl(rcx); // divide edx:eax by ecx, quot->eax, rem->edx
  movl(b_d, rdx);

  testl(size, size);
  jcc(Assembler::zero, FINISH);

  // continue loop
  movdl(xa, a_d);
  vpxor(yb, yb, yb, Assembler::AVX_256bit);
  jmp(SLOOP1);

  bind(FINISH);
  // Pack result: rax = (b << 16) | a.
  movl(rax, b_d);
  shll(rax, 16);
  orl(rax, a_d);
  jmp(END);

  // size < 32: skip vectorization entirely.
  bind(LT64);
  movl(a_d, init_d);
  lea(end, Address(data, size, Address::times_1));
  testl(size, size);
  jcc(Assembler::notZero, FINAL_LOOP);
  jmp(ZERO_SIZE);

  // handle remaining 1...15 bytes
  bind(DO_FINAL);
  // reduce (same reduction as above, but fold into scalar a_d/b_d first)
  vpslld(yb, yb, 3, Assembler::AVX_256bit); //b is scaled by 8
  vpmulld(ysa, ya, ascaletab, Assembler::AVX_256bit, r14); //scaled a

  vextracti128(xtmp0, ya, 1);
  vextracti128(xtmp1, yb, 1);
  vextracti128(xtmp2, ysa, 1);
  vpaddd(xa, xa, xtmp0, Assembler::AVX_128bit);
  vpaddd(xb, xb, xtmp1, Assembler::AVX_128bit);
  vpaddd(xsa, xsa, xtmp2, Assembler::AVX_128bit);
  vphaddd(xa, xa, xa, Assembler::AVX_128bit);
  vphaddd(xb, xb, xb, Assembler::AVX_128bit);
  vphaddd(xsa, xsa, xsa, Assembler::AVX_128bit);
  vphaddd(xa, xa, xa, Assembler::AVX_128bit);
  vphaddd(xb, xb, xb, Assembler::AVX_128bit);
  vphaddd(xsa, xsa, xsa, Assembler::AVX_128bit);
  vpsubd(xb, xb, xsa, Assembler::AVX_128bit);

  movdl(a_d, xa);
  movdl(rax, xb);
  addl(b_d, rax);

  // Scalar tail loop: classic a += byte; b += a.
  align(32);
  bind(FINAL_LOOP);
  movzbl(rax, Address(data, 0)); //movzx eax, byte[data]
  addl(a_d, rax);
  addptr(data, 1);
  addl(b_d, a_d);
  cmpptr(data, end);
  jcc(Assembler::below, FINAL_LOOP);

  bind(ZERO_SIZE);

  // Final reductions mod BASE and packing of the result into rax.
  movl(rax, a_d);
  xorl(rdx, rdx);
  movl(rcx, BASE);
  divl(rcx); // div ecx -- divide edx:eax by ecx, quot->eax, rem->edx
  movl(a_d, rdx);

  movl(rax, b_d);
  xorl(rdx, rdx);
  movl(rcx, BASE);
  divl(rcx); // divide edx:eax by ecx, quot->eax, rem->edx
  shll(rdx, 16);
  orl(rdx, a_d);
  movl(rax, rdx);

  bind(END);
  pop(r14);
  pop(r13);
  pop(r12);
}
#endif

src/hotspot/cpu/x86/stubGenerator_x86_64.cpp

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5790,6 +5790,47 @@ address generate_avx_ghash_processBlocks() {
57905790
return start;
57915791
}
57925792

5793+
5794+
/***
 * Stub for the java.util.zip.Adler32.updateBytes() intrinsic.
 *
 * Inputs (standard C calling convention):
 *   c_rarg0 - int adler    (packed checksum: b<<16 | a)
 *   c_rarg1 - byte* buff
 *   c_rarg2 - int len
 *
 * Output:
 *   rax - int adler result
 */
address generate_updateBytesAdler32() {
  assert(UseAdler32Intrinsics, "need AVX2");

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");

  address start = __ pc();

  const Register data = r9;   // note: r9 doubles as scratch below, before 'data' is set
  const Register size = r10;

  // Shuffle patterns consumed by updateBytesAdler32 to widen bytes to dwords.
  const XMMRegister yshuf0 = xmm6;
  const XMMRegister yshuf1 = xmm7;
  assert_different_registers(c_rarg0, c_rarg1, c_rarg2, data, size);

  BLOCK_COMMENT("Entry:");
  __ enter(); // required for proper stackwalking of RuntimeStub frame

  // Load the shuffle tables; r9 is still free as a scratch register here
  // because 'data' is only assigned afterwards.
  __ vmovdqu(yshuf0, ExternalAddress((address) StubRoutines::x86::_adler32_shuf0_table), r9);
  __ vmovdqu(yshuf1, ExternalAddress((address) StubRoutines::x86::_adler32_shuf1_table), r9);
  __ movptr(data, c_rarg1); //data
  __ movl(size, c_rarg2); //length
  __ updateBytesAdler32(c_rarg0, data, size, yshuf0, yshuf1, ExternalAddress((address) StubRoutines::x86::_adler32_ascale_table));
  __ leave();
  __ ret(0);
  return start;
}
5833+
57935834
/**
57945835
* Arguments:
57955836
*
@@ -6754,6 +6795,11 @@ address generate_avx_ghash_processBlocks() {
67546795
StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
67556796
StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
67566797
}
6798+
6799+
if (UseAdler32Intrinsics) {
6800+
StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
6801+
}
6802+
67576803
if (UseLibmIntrinsic && InlineIntrinsics) {
67586804
if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
67596805
vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||

src/hotspot/cpu/x86/stubRoutines_x86.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,25 @@ juint StubRoutines::x86::_shuf_table_crc32_avx512[] =
224224
0x83828100UL, 0x87868584UL, 0x8b8a8988UL, 0x8f8e8d8cUL,
225225
0x03020100UL, 0x07060504UL, 0x0b0a0908UL, 0x000e0d0cUL
226226
};
227+
228+
// Dword lane weights 0..7, used by updateBytesAdler32 to correct the vector
// 'a' sums for lane position when reducing the 'b' accumulator.
juint StubRoutines::x86::_adler32_ascale_table[] =
{
    0x00000000UL, 0x00000001UL, 0x00000002UL, 0x00000003UL,
    0x00000004UL, 0x00000005UL, 0x00000006UL, 0x00000007UL
};

// pshufb control: zero-extend input bytes 0..7 into eight dword lanes
// (a 0xFF selector byte produces zero in that destination byte).
juint StubRoutines::x86::_adler32_shuf0_table[] =
{
    0xFFFFFF00UL, 0xFFFFFF01UL, 0xFFFFFF02UL, 0xFFFFFF03UL,
    0xFFFFFF04UL, 0xFFFFFF05UL, 0xFFFFFF06UL, 0xFFFFFF07UL
};

// pshufb control: zero-extend input bytes 8..15 into eight dword lanes.
// Fixed: 0xFFFFFF09 and 0xFFFFFF0D were missing the UL suffix carried by
// every other entry in these tables (same value, consistent type now).
juint StubRoutines::x86::_adler32_shuf1_table[] =
{
    0xFFFFFF08UL, 0xFFFFFF09UL, 0xFFFFFF0AUL, 0xFFFFFF0BUL,
    0xFFFFFF0CUL, 0xFFFFFF0DUL, 0xFFFFFF0EUL, 0xFFFFFF0FUL
};
245+
227246
#endif // _LP64
228247

229248
#define D 32

src/hotspot/cpu/x86/stubRoutines_x86.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,9 @@ class x86 {
119119
static juint _crc_by128_masks_avx512[];
120120
static juint _crc_table_avx512[];
121121
static juint _shuf_table_crc32_avx512[];
122+
// Adler32 intrinsic tables (definitions in stubRoutines_x86.cpp).
static juint _adler32_shuf0_table[];   // pshufb pattern: bytes 0..7 -> dwords
static juint _adler32_shuf1_table[];   // pshufb pattern: bytes 8..15 -> dwords
static juint _adler32_ascale_table[];  // dword lane weights 0..7
122125
#endif // _LP64
123126
// table for CRC32C
124127
static juint* _crc32c_table;

0 commit comments

Comments
 (0)