Skip to content
This repository was archived by the owner on Mar 1, 2024. It is now read-only.

VPCLMULQDQ version for crc_fold_copy #28

Open
wants to merge 41 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
4603884
For x86, add CPUID check.
jtkukunas Sep 16, 2013
4fa77e8
zutil.h: add zlikely/zunlikely macros
jtkukunas Jun 21, 2018
63f46f5
zutil.h: add zalign macro
jtkukunas Jun 21, 2018
c4b9af6
enable 16-bit longest_match for x86
jtkukunas Nov 6, 2017
aa5fd5c
Add preprocessor define to tune Adler32 loop unrolling.
jtkukunas Jul 17, 2013
d4726de
Add preprocessor define to tune crc32 unrolling.
jtkukunas Jul 17, 2013
c0d4a12
Adds SSE2 optimized slide_hash
jtkukunas Jun 21, 2018
43a5e76
adds SSE4.2 optimized hash function
jtkukunas Jun 21, 2018
fd93c50
add PCLMULQDQ optimized CRC folding
jtkukunas Jul 11, 2013
8419145
crc_folding: Fix potential out-of-bounds access
NicolasT Mar 3, 2016
9a06dac
crc_folding: use temp buffer for partial stores
jtkukunas Dec 8, 2016
3b11552
deflate: add new deflate_quick strategy for level 1
jtkukunas Jun 21, 2018
2073486
deflate.c fix for window bits > 13
Apr 17, 2016
d6a0d5f
deflate: add new deflate_medium strategy
jtkukunas Dec 8, 2016
afa49d8
deflate: avoid use of uninitialized variable
nathankidd May 22, 2014
99a86da
check whether match or orig go too far back
jtkukunas May 27, 2014
3e77468
Include wmmintrin.h in configure test and crc_folding.c aid clang com…
mp15 Jun 10, 2014
62b7616
Reorganize inflate window layout
jtkukunas Dec 1, 2016
f9716e7
in the case of overlap copy mempcy dist bytes then byte by byte for a…
Nov 21, 2013
c960eeb
Add Intel's optimized RTE memcpy
Dec 8, 2016
58b51f6
integrate Intel's RTE memcpy with zmemcpy
Dec 8, 2016
641f59e
infcover: remove OoM test for SetDictionary since we don't lazy alloc
jtkukunas Dec 5, 2016
4fcda80
update gitignore
jtkukunas Oct 5, 2018
3c6b4f7
reorganize longest_match
jtkukunas Oct 5, 2018
8311c71
add zalways_inline for msvc and gcc
jtkukunas Oct 10, 2018
cb99979
force inline std2_longest_match
jtkukunas Oct 10, 2018
4b0ef4e
inflate: fix MSVC compiler warnings
vkvenkat Nov 9, 2018
3eecf51
inflate: handle windowBits == 16
jtkukunas Feb 1, 2019
3dd73ae
deflate_medium: add dist -1 to hash even for long matches
jtkukunas Apr 9, 2019
a43a247
deflate_medium: avoid emitting a suboptimal literal in the restart case
jtkukunas Apr 9, 2019
b593167
x86: add avx2 check
guowangy Aug 21, 2020
e9858b2
Add AVX2 optimized slide_hash
guowangy Aug 21, 2020
c27041b
test: add perf_test for slide_hash
guowangy Aug 27, 2020
efe1fa6
x86: check CPU VPCLMULQDQ feature
guowangy Sep 2, 2020
40d3dd9
crc_folding: add VPCLMULQDQ version of crc_fold_copy
guowangy Sep 17, 2020
3f546af
test: perf_test for crc_fold_copy
guowangy Sep 17, 2020
4714688
test: fix wrong size in perf_test
guowangy Sep 17, 2020
5a22f13
crc_folding: fold with 4 xmm_crc to get initial zmm_crc0
guowangy Sep 17, 2020
a1428e7
crc_folding: use uniform zmm_crc0 init and remove workaround
guowangy Sep 18, 2020
e2202e5
crc_folding: seperate avx512 from original fodling code to avoid comp…
guowangy Sep 21, 2020
ee007cd
crc_folding: name & indent fix
guowangy Sep 23, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,14 @@
/zlib.pc
/configure.log

*.obj
*.lib
*.pdb
*.exp
*.dll
*.exe
*.res

*.swp

.DS_Store
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ set(ZLIB_PRIVATE_HDRS
inflate.h
inftrees.h
trees.h
x86.h
zutil.h
)
set(ZLIB_SRCS
Expand All @@ -120,6 +121,7 @@ set(ZLIB_SRCS
inffast.c
trees.c
uncompr.c
x86.c
zutil.c
)

Expand Down
94 changes: 77 additions & 17 deletions Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,11 @@ SRCDIR=
ZINC=
ZINCOUT=-I.

OBJZ = adler32.o crc32.o deflate.o infback.o inffast.o inflate.o inftrees.o trees.o zutil.o
OBJZ = adler32.o crc32.o crc_folding.o crc_folding_vpclmulqdq.o deflate_quick.o deflate_medium.o deflate.o infback.o inffast.o inflate.o inftrees.o match.o slide_sse.o slide_avx2.o trees.o x86.o zutil.o
OBJG = compress.o uncompr.o gzclose.o gzlib.o gzread.o gzwrite.o
OBJC = $(OBJZ) $(OBJG)

PIC_OBJZ = adler32.lo crc32.lo deflate.lo infback.lo inffast.lo inflate.lo inftrees.lo trees.lo zutil.lo
PIC_OBJZ = adler32.lo crc32.lo crc_folding.lo crc_folding_vpclmulqdq.lo deflate_quick.lo deflate_medium.lo deflate.lo infback.lo inffast.lo inflate.lo inftrees.lo match.lo slide_sse.lo slide_avx2.lo trees.lo x86.lo zutil.lo
PIC_OBJG = compress.lo uncompr.lo gzclose.lo gzlib.lo gzread.lo gzwrite.lo
PIC_OBJC = $(PIC_OBJZ) $(PIC_OBJG)

Expand All @@ -73,11 +73,14 @@ OBJS = $(OBJC) $(OBJA)

PIC_OBJS = $(PIC_OBJC) $(PIC_OBJA)

# extra flags for crc-folding.c
VPCLMULQDQ_CRC_FLAGS = $(if $(findstring USE_VPCLMULQDQ_CRC,$(CFLAGS)), -mavx512f -mvpclmulqdq, )

all: static shared

static: example$(EXE) minigzip$(EXE)

shared: examplesh$(EXE) minigzipsh$(EXE)
shared: examplesh$(EXE) minigzipsh$(EXE) perf_testsh$(EXE)

all64: example64$(EXE) minigzip64$(EXE)

Expand All @@ -100,7 +103,7 @@ testshared: shared
DYLD_LIBRARY_PATH=`pwd`:$(DYLD_LIBRARY_PATH) ; export DYLD_LIBRARY_PATH; \
SHLIB_PATH=`pwd`:$(SHLIB_PATH) ; export SHLIB_PATH; \
TMPSH=tmpsh_$$; \
if echo hello world | ./minigzipsh | ./minigzipsh -d && ./examplesh $$TMPSH; then \
if echo hello world | ./minigzipsh | ./minigzipsh -d && ./examplesh $$TMPSH && ./perf_testsh; then \
echo ' *** zlib shared test OK ***'; \
else \
echo ' *** zlib shared test FAILED ***'; false; \
Expand Down Expand Up @@ -131,18 +134,6 @@ libz.a: $(OBJS)
$(AR) $(ARFLAGS) $@ $(OBJS)
-@ ($(RANLIB) $@ || true) >/dev/null 2>&1

match.o: match.S
$(CPP) match.S > _match.s
$(CC) -c _match.s
mv _match.o match.o
rm -f _match.s

match.lo: match.S
$(CPP) match.S > _match.s
$(CC) -c -fPIC _match.s
mv _match.o match.lo
rm -f _match.s

example.o: $(SRCDIR)test/example.c $(SRCDIR)zlib.h zconf.h
$(CC) $(CFLAGS) $(ZINCOUT) -c -o $@ $(SRCDIR)test/example.c

Expand All @@ -155,6 +146,8 @@ example64.o: $(SRCDIR)test/example.c $(SRCDIR)zlib.h zconf.h
minigzip64.o: $(SRCDIR)test/minigzip.c $(SRCDIR)zlib.h zconf.h
$(CC) $(CFLAGS) $(ZINCOUT) -D_FILE_OFFSET_BITS=64 -c -o $@ $(SRCDIR)test/minigzip.c

perf_test.o: $(SRCDIR)test/perf_test.c $(SRCDIR)zlib.h zconf.h deflate.h
$(CC) $(CFLAGS) $(ZINCOUT) -c -o $@ $(SRCDIR)test/perf_test.c

adler32.o: $(SRCDIR)adler32.c
$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)adler32.c
Expand Down Expand Up @@ -201,6 +194,29 @@ gzread.o: $(SRCDIR)gzread.c
gzwrite.o: $(SRCDIR)gzwrite.c
$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)gzwrite.c

x86.o: $(SRCDIR)x86.c
$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)x86.c

slide_sse.o: $(SRCDIR)slide_sse.c
$(CC) $(CFLAGS) $(ZINC) -msse2 -c -o $@ $(SRCDIR)slide_sse.c

slide_avx2.o: $(SRCDIR)slide_avx2.c
$(CC) $(CFLAGS) $(ZINC) -mavx2 -c -o $@ $(SRCDIR)slide_avx2.c

crc_folding.o: $(SRCDIR)crc_folding.c
$(CC) $(CFLAGS) $(ZINC) -mpclmul -msse4 -c -o $@ $(SRCDIR)crc_folding.c

crc_folding_vpclmulqdq.o: $(SRCDIR)crc_folding_vpclmulqdq.c
$(CC) $(CFLAGS) $(ZINC) $(VPCLMULQDQ_CRC_FLAGS) -c -o $@ $(SRCDIR)crc_folding_vpclmulqdq.c

deflate_quick.o: $(SRCDIR)deflate_quick.c
$(CC) $(CFLAGS) $(ZINC) -msse4 -c -o $@ $(SRCDIR)deflate_quick.c

deflate_medium.o: $(SRCDIR)deflate_medium.c
$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)deflate_medium.c

match.o: $(SRCDIR)match.c
$(CC) $(CFLAGS) $(ZINC) -c -o $@ $(SRCDIR)match.c

adler32.lo: $(SRCDIR)adler32.c
-@mkdir objs 2>/dev/null || test -d objs
Expand Down Expand Up @@ -277,6 +293,45 @@ gzwrite.lo: $(SRCDIR)gzwrite.c
$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/gzwrite.o $(SRCDIR)gzwrite.c
-@mv objs/gzwrite.o $@

x86.lo: $(SRCDIR)x86.c
-@mkdir objs 2>/dev/null || test -d objs
$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/x86.o $(SRCDIR)x86.c
-@mv objs/x86.o $@

slide_sse.lo: $(SRCDIR)slide_sse.c
-@mkdir objs 2>/dev/null || test -d objs
$(CC) $(SFLAGS) $(ZINC) -msse2 -DPIC -c -o objs/slide_sse.o $(SRCDIR)slide_sse.c
-@mv objs/slide_sse.o $@

slide_avx2.lo: $(SRCDIR)slide_avx2.c
-@mkdir objs 2>/dev/null || test -d objs
$(CC) $(SFLAGS) $(ZINC) -mavx2 -DPIC -c -o objs/slide_avx2.o $(SRCDIR)slide_avx2.c
-@mv objs/slide_avx2.o $@

crc_folding.lo: $(SRCDIR)crc_folding.c
-@mkdir objs 2>/dev/null || test -d objs
$(CC) $(SFLAGS) $(ZINC) -mpclmul -msse4 -DPIC -c -o objs/crc_folding.o $(SRCDIR)crc_folding.c
-@mv objs/crc_folding.o $@

crc_folding_vpclmulqdq.lo: $(SRCDIR)crc_folding_vpclmulqdq.c
-@mkdir objs 2>/dev/null || test -d objs
$(CC) $(SFLAGS) $(ZINC) -DPIC $(VPCLMULQDQ_CRC_FLAGS) -c -o objs/crc_folding_vpclmulqdq.o $(SRCDIR)crc_folding_vpclmulqdq.c
-@mv objs/crc_folding_vpclmulqdq.o $@

deflate_quick.lo: $(SRCDIR)deflate_quick.c
-@mkdir objs 2>/dev/null || test -d objs
$(CC) $(SFLAGS) $(ZINC) -msse4 -DPIC -c -o objs/deflate_quick.o $(SRCDIR)deflate_quick.c
-@mv objs/deflate_quick.o $@

deflate_medium.lo: $(SRCDIR)deflate_medium.c
-@mkdir objs 2>/dev/null || test -d objs
$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/deflate_medium.o $(SRCDIR)deflate_medium.c
-@mv objs/deflate_medium.o $@

match.lo: $(SRCDIR)match.c
-@mkdir objs 2>/dev/null || test -d objs
$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/match.o $(SRCDIR)match.c
-@mv objs/match.o $@

placebo $(SHAREDLIBV): $(PIC_OBJS) libz.a
$(LDSHARED) $(SFLAGS) -o $@ $(PIC_OBJS) $(LDSHAREDLIBC) $(LDFLAGS)
Expand All @@ -303,6 +358,9 @@ example64$(EXE): example64.o $(STATICLIB)
minigzip64$(EXE): minigzip64.o $(STATICLIB)
$(CC) $(CFLAGS) -o $@ minigzip64.o $(TEST_LDFLAGS)

perf_testsh$(EXE): perf_test.o $(SHAREDLIBV)
$(CC) $(CFLAGS) -o $@ perf_test.o crc_folding.o -L. $(SHAREDLIBV)

install-libs: $(LIBS)
-@if [ ! -d $(DESTDIR)$(exec_prefix) ]; then mkdir -p $(DESTDIR)$(exec_prefix); fi
-@if [ ! -d $(DESTDIR)$(libdir) ]; then mkdir -p $(DESTDIR)$(libdir); fi
Expand Down Expand Up @@ -367,7 +425,7 @@ mostlyclean: clean
clean:
rm -f *.o *.lo *~ \
example$(EXE) minigzip$(EXE) examplesh$(EXE) minigzipsh$(EXE) \
example64$(EXE) minigzip64$(EXE) \
example64$(EXE) minigzip64$(EXE) perf_testsh$(EXE)\
infcover \
libz.* foo.gz so_locations \
_match.s maketree contrib/infback9/*.o
Expand Down Expand Up @@ -398,6 +456,7 @@ infback.o inflate.o: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)inftrees.
inffast.o: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)inftrees.h $(SRCDIR)inflate.h $(SRCDIR)inffast.h
inftrees.o: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)inftrees.h
trees.o: $(SRCDIR)deflate.h $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)trees.h
x86.o: $(SRCDIR)x86.h

adler32.lo zutil.lo: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h
gzclose.lo gzlib.lo gzread.lo gzwrite.lo: $(SRCDIR)zlib.h zconf.h $(SRCDIR)gzguts.h
Expand All @@ -408,3 +467,4 @@ infback.lo inflate.lo: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)inftree
inffast.lo: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)inftrees.h $(SRCDIR)inflate.h $(SRCDIR)inffast.h
inftrees.lo: $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)inftrees.h
trees.lo: $(SRCDIR)deflate.h $(SRCDIR)zutil.h $(SRCDIR)zlib.h zconf.h $(SRCDIR)trees.h
x86.lo: $(SRCDIR)x86.h
16 changes: 16 additions & 0 deletions adler32.c
Original file line number Diff line number Diff line change
Expand Up @@ -102,21 +102,37 @@ uLong ZEXPORT adler32_z(adler, buf, len)
/* do length NMAX blocks -- requires just one modulo operation */
while (len >= NMAX) {
len -= NMAX;
#ifndef ADLER32_UNROLL_LESS
n = NMAX / 16; /* NMAX is divisible by 16 */
#else
n = NMAX / 8; /* NMAX is divisible by 8 */
#endif
do {
#ifndef ADLER32_UNROLL_LESS
DO16(buf); /* 16 sums unrolled */
buf += 16;
#else
DO8(buf,0); /* 8 sums unrolled */
buf += 8;
#endif
} while (--n);
MOD(adler);
MOD(sum2);
}

/* do remaining bytes (less than NMAX, still just one modulo) */
if (len) { /* avoid modulos if none remaining */
#ifndef ADLER32_UNROLL_LESS
while (len >= 16) {
len -= 16;
DO16(buf);
buf += 16;
#else
while (len >= 8) {
len -= 8;
DO8(buf, 0);
buf += 8;
#endif
}
while (len--) {
adler += *buf++;
Expand Down
18 changes: 18 additions & 0 deletions configure
Original file line number Diff line number Diff line change
Expand Up @@ -824,6 +824,24 @@ EOF
else
echo "Checking for attribute(visibility) support... No." | tee -a configure.log
fi
# check if compiler suppoprt VPCLMULQDQ
echo >> configure.log
cat > $test.c <<EOF
#include <immintrin.h>
int main()
{
__m512i a, b;
_mm512_clmulepi64_epi128(a, b, 0x00);
return 0;
}
EOF
if tryboth $CC -c $CFLAGS -mavx512f -mvpclmulqdq -Wno-uninitialized $test.c; then
CFLAGS="$CFLAGS -DUSE_VPCLMULQDQ_CRC"
SFLAGS="$SFLAGS -DUSE_VPCLMULQDQ_CRC"
echo "Checking for VPCLMULDQD support... Yes." | tee -a configure.log
else
echo "Checking for VPCLMULDQD support... No." | tee -a configure.log
fi
fi

# show the results in the log
Expand Down
15 changes: 15 additions & 0 deletions crc32.c
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ const z_crc_t FAR * ZEXPORT get_crc_table()
/* ========================================================================= */
#define DO1 crc = crc_table[0][((int)crc ^ (*buf++)) & 0xff] ^ (crc >> 8)
#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1
#define DO4 DO1; DO1; DO1; DO1

/* ========================================================================= */
unsigned long ZEXPORT crc32_z(crc, buf, len)
Expand All @@ -223,10 +224,19 @@ unsigned long ZEXPORT crc32_z(crc, buf, len)
}
#endif /* BYFOUR */
crc = crc ^ 0xffffffffUL;

#ifdef CRC32_UNROLL_LESS
while (len >= 4) {
DO4;
len -= 4;
}
#else
while (len >= 8) {
DO8;
len -= 8;
}
#endif

if (len) do {
DO1;
} while (--len);
Expand Down Expand Up @@ -279,10 +289,14 @@ local unsigned long crc32_little(crc, buf, len)
}

buf4 = (const z_crc_t FAR *)(const void FAR *)buf;

#ifndef CRC32_UNROLL_LESS
while (len >= 32) {
DOLIT32;
len -= 32;
}
#endif

while (len >= 4) {
DOLIT4;
len -= 4;
Expand Down Expand Up @@ -440,3 +454,4 @@ uLong ZEXPORT crc32_combine64(crc1, crc2, len2)
{
return crc32_combine_(crc1, crc2, len2);
}

Loading