[RISCV] Add initial support of memcmp expansion #107548

Conversation

wangpc-pp
Contributor

@wangpc-pp wangpc-pp commented Sep 6, 2024

Two passes depend on the implementation of
TargetTransformInfo::enableMemCmpExpansion: MergeICmps and
ExpandMemCmp.

This PR adds an initial implementation of enableMemCmpExpansion
so that we can get some basic benefit from these two passes.

We currently don't enable the expansion when there is no unaligned
access support, because there are some issues with unaligned loads
and stores in the ExpandMemCmp pass. We should fix those issues and
enable the expansion later.

The vector case hasn't been tested because we don't currently
generate inlined vector instructions for memcmp.
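
For readers less familiar with these passes, here is a minimal sketch of the transformation ExpandMemCmp performs, assuming RV64 with unaligned scalar access; the function names are illustrative and the exact IR LLVM emits may differ:

declare i32 @bcmp(ptr, ptr, i64)

; Hypothetical input: an equality-only bcmp of 8 bytes.
define i1 @bcmp_eq_8(ptr %s1, ptr %s2) {
entry:
  %cmp = call i32 @bcmp(ptr %s1, ptr %s2, i64 8)
  %ret = icmp eq i32 %cmp, 0
  ret i1 %ret
}

; Roughly what ExpandMemCmp turns it into: one wide (possibly unaligned)
; load per operand, an xor to collect the differing bits, and a compare
; against zero.
define i1 @bcmp_eq_8_expanded(ptr %s1, ptr %s2) {
entry:
  %a = load i64, ptr %s1, align 1
  %b = load i64, ptr %s2, align 1
  %diff = xor i64 %a, %b
  %ret = icmp eq i64 %diff, 0
  ret i1 %ret
}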

@llvmbot
Member

llvmbot commented Sep 6, 2024

@llvm/pr-subscribers-backend-risc-v

Author: Pengcheng Wang (wangpc-pp)

Changes

Two passes depend on the implementation of
TargetTransformInfo::enableMemCmpExpansion: MergeICmps and
ExpandMemCmp.

This PR adds an initial implementation of enableMemCmpExpansion
so that we can get some basic benefit from these two passes.

The vector case hasn't been tested because we don't currently
generate inlined vector instructions for memcmp.


Patch is 42.75 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/107548.diff

3 Files Affected:

  • (modified) llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp (+15)
  • (modified) llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h (+3)
  • (added) llvm/test/CodeGen/RISCV/memcmp.ll (+932)
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index e809e15eacf696..ad532aadc83266 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -2113,3 +2113,18 @@ bool RISCVTTIImpl::shouldConsiderAddressTypePromotion(
   }
   return Considerable;
 }
+
+RISCVTTIImpl::TTI::MemCmpExpansionOptions
+RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
+  TTI::MemCmpExpansionOptions Options;
+  // FIXME: Vectors haven't been tested.
+  Options.AllowOverlappingLoads =
+      (ST->enableUnalignedScalarMem() || ST->enableUnalignedVectorMem());
+  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
+  Options.NumLoadsPerBlock = Options.MaxNumLoads;
+  if (ST->is64Bit())
+    Options.LoadSizes.push_back(8);
+  llvm::append_range(Options.LoadSizes, ArrayRef({4, 2, 1}));
+  Options.AllowedTailExpansions = {3, 5, 6};
+  return Options;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 763b89bfec0a66..ee9bed09df97f3 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -404,6 +404,9 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
   shouldConsiderAddressTypePromotion(const Instruction &I,
                                      bool &AllowPromotionWithoutCommonHeader);
   std::optional<unsigned> getMinPageSize() const { return 4096; }
+
+  TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
+                                                    bool IsZeroCmp) const;
 };
 
 } // end namespace llvm
diff --git a/llvm/test/CodeGen/RISCV/memcmp.ll b/llvm/test/CodeGen/RISCV/memcmp.ll
new file mode 100644
index 00000000000000..652cd02e2c750a
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/memcmp.ll
@@ -0,0 +1,932 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -O2 | FileCheck %s --check-prefix=CHECK-ALIGNED-RV32
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -O2 | FileCheck %s --check-prefix=CHECK-ALIGNED-RV64
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+unaligned-scalar-mem -O2 \
+; RUN:   | FileCheck %s --check-prefix=CHECK-UNALIGNED-RV32
+; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+unaligned-scalar-mem -O2 \
+; RUN:   | FileCheck %s --check-prefix=CHECK-UNALIGNED-RV64
+
+declare i32 @bcmp(i8*, i8*, iXLen) nounwind readonly
+declare i32 @memcmp(i8*, i8*, iXLen) nounwind readonly
+
+define i1 @bcmp_size_15(i8* %s1, i8* %s2) {
+; CHECK-ALIGNED-RV32-LABEL: bcmp_size_15:
+; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-NEXT:    lbu a2, 1(a0)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a3, 0(a0)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a4, 2(a0)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a5, 3(a0)
+; CHECK-ALIGNED-RV32-NEXT:    slli a2, a2, 8
+; CHECK-ALIGNED-RV32-NEXT:    or a2, a2, a3
+; CHECK-ALIGNED-RV32-NEXT:    slli a4, a4, 16
+; CHECK-ALIGNED-RV32-NEXT:    slli a5, a5, 24
+; CHECK-ALIGNED-RV32-NEXT:    or a4, a5, a4
+; CHECK-ALIGNED-RV32-NEXT:    or a2, a4, a2
+; CHECK-ALIGNED-RV32-NEXT:    lbu a3, 1(a1)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a4, 0(a1)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a5, 2(a1)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a6, 3(a1)
+; CHECK-ALIGNED-RV32-NEXT:    slli a3, a3, 8
+; CHECK-ALIGNED-RV32-NEXT:    or a3, a3, a4
+; CHECK-ALIGNED-RV32-NEXT:    slli a5, a5, 16
+; CHECK-ALIGNED-RV32-NEXT:    slli a6, a6, 24
+; CHECK-ALIGNED-RV32-NEXT:    or a4, a6, a5
+; CHECK-ALIGNED-RV32-NEXT:    or a3, a4, a3
+; CHECK-ALIGNED-RV32-NEXT:    xor a2, a2, a3
+; CHECK-ALIGNED-RV32-NEXT:    lbu a3, 5(a0)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a4, 4(a0)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a5, 6(a0)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a6, 7(a0)
+; CHECK-ALIGNED-RV32-NEXT:    slli a3, a3, 8
+; CHECK-ALIGNED-RV32-NEXT:    or a3, a3, a4
+; CHECK-ALIGNED-RV32-NEXT:    slli a5, a5, 16
+; CHECK-ALIGNED-RV32-NEXT:    slli a6, a6, 24
+; CHECK-ALIGNED-RV32-NEXT:    or a4, a6, a5
+; CHECK-ALIGNED-RV32-NEXT:    or a3, a4, a3
+; CHECK-ALIGNED-RV32-NEXT:    lbu a4, 5(a1)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a5, 4(a1)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a6, 6(a1)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a7, 7(a1)
+; CHECK-ALIGNED-RV32-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV32-NEXT:    or a4, a4, a5
+; CHECK-ALIGNED-RV32-NEXT:    slli a6, a6, 16
+; CHECK-ALIGNED-RV32-NEXT:    slli a7, a7, 24
+; CHECK-ALIGNED-RV32-NEXT:    or a5, a7, a6
+; CHECK-ALIGNED-RV32-NEXT:    or a4, a5, a4
+; CHECK-ALIGNED-RV32-NEXT:    xor a3, a3, a4
+; CHECK-ALIGNED-RV32-NEXT:    lbu a4, 9(a0)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a5, 8(a0)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a6, 10(a0)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a7, 11(a0)
+; CHECK-ALIGNED-RV32-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV32-NEXT:    or a4, a4, a5
+; CHECK-ALIGNED-RV32-NEXT:    slli a6, a6, 16
+; CHECK-ALIGNED-RV32-NEXT:    slli a7, a7, 24
+; CHECK-ALIGNED-RV32-NEXT:    or a5, a7, a6
+; CHECK-ALIGNED-RV32-NEXT:    or a4, a5, a4
+; CHECK-ALIGNED-RV32-NEXT:    lbu a5, 9(a1)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a6, 8(a1)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a7, 10(a1)
+; CHECK-ALIGNED-RV32-NEXT:    lbu t0, 11(a1)
+; CHECK-ALIGNED-RV32-NEXT:    slli a5, a5, 8
+; CHECK-ALIGNED-RV32-NEXT:    or a5, a5, a6
+; CHECK-ALIGNED-RV32-NEXT:    slli a7, a7, 16
+; CHECK-ALIGNED-RV32-NEXT:    slli t0, t0, 24
+; CHECK-ALIGNED-RV32-NEXT:    or a6, t0, a7
+; CHECK-ALIGNED-RV32-NEXT:    lbu a7, 13(a0)
+; CHECK-ALIGNED-RV32-NEXT:    lbu t0, 12(a0)
+; CHECK-ALIGNED-RV32-NEXT:    or a5, a6, a5
+; CHECK-ALIGNED-RV32-NEXT:    xor a4, a4, a5
+; CHECK-ALIGNED-RV32-NEXT:    slli a7, a7, 8
+; CHECK-ALIGNED-RV32-NEXT:    or a5, a7, t0
+; CHECK-ALIGNED-RV32-NEXT:    lbu a6, 13(a1)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a7, 12(a1)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a0, 14(a0)
+; CHECK-ALIGNED-RV32-NEXT:    lbu a1, 14(a1)
+; CHECK-ALIGNED-RV32-NEXT:    slli a6, a6, 8
+; CHECK-ALIGNED-RV32-NEXT:    or a6, a6, a7
+; CHECK-ALIGNED-RV32-NEXT:    xor a5, a5, a6
+; CHECK-ALIGNED-RV32-NEXT:    xor a0, a0, a1
+; CHECK-ALIGNED-RV32-NEXT:    or a2, a2, a3
+; CHECK-ALIGNED-RV32-NEXT:    or a4, a4, a5
+; CHECK-ALIGNED-RV32-NEXT:    or a2, a2, a4
+; CHECK-ALIGNED-RV32-NEXT:    or a0, a2, a0
+; CHECK-ALIGNED-RV32-NEXT:    seqz a0, a0
+; CHECK-ALIGNED-RV32-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-LABEL: bcmp_size_15:
+; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-NEXT:    lbu a2, 1(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a3, 0(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a4, 2(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a5, 3(a0)
+; CHECK-ALIGNED-RV64-NEXT:    slli a2, a2, 8
+; CHECK-ALIGNED-RV64-NEXT:    or a2, a2, a3
+; CHECK-ALIGNED-RV64-NEXT:    slli a4, a4, 16
+; CHECK-ALIGNED-RV64-NEXT:    slli a5, a5, 24
+; CHECK-ALIGNED-RV64-NEXT:    or a4, a5, a4
+; CHECK-ALIGNED-RV64-NEXT:    or a2, a4, a2
+; CHECK-ALIGNED-RV64-NEXT:    lbu a3, 5(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a4, 4(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a5, 6(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a6, 7(a0)
+; CHECK-ALIGNED-RV64-NEXT:    slli a3, a3, 8
+; CHECK-ALIGNED-RV64-NEXT:    or a3, a3, a4
+; CHECK-ALIGNED-RV64-NEXT:    slli a5, a5, 16
+; CHECK-ALIGNED-RV64-NEXT:    slli a6, a6, 24
+; CHECK-ALIGNED-RV64-NEXT:    or a4, a6, a5
+; CHECK-ALIGNED-RV64-NEXT:    or a3, a4, a3
+; CHECK-ALIGNED-RV64-NEXT:    slli a3, a3, 32
+; CHECK-ALIGNED-RV64-NEXT:    or a2, a3, a2
+; CHECK-ALIGNED-RV64-NEXT:    lbu a3, 1(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a4, 0(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a5, 2(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a6, 3(a1)
+; CHECK-ALIGNED-RV64-NEXT:    slli a3, a3, 8
+; CHECK-ALIGNED-RV64-NEXT:    or a3, a3, a4
+; CHECK-ALIGNED-RV64-NEXT:    slli a5, a5, 16
+; CHECK-ALIGNED-RV64-NEXT:    slli a6, a6, 24
+; CHECK-ALIGNED-RV64-NEXT:    or a4, a6, a5
+; CHECK-ALIGNED-RV64-NEXT:    or a3, a4, a3
+; CHECK-ALIGNED-RV64-NEXT:    lbu a4, 5(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a5, 4(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a6, 6(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a7, 7(a1)
+; CHECK-ALIGNED-RV64-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV64-NEXT:    or a4, a4, a5
+; CHECK-ALIGNED-RV64-NEXT:    slli a6, a6, 16
+; CHECK-ALIGNED-RV64-NEXT:    slli a7, a7, 24
+; CHECK-ALIGNED-RV64-NEXT:    or a5, a7, a6
+; CHECK-ALIGNED-RV64-NEXT:    or a4, a5, a4
+; CHECK-ALIGNED-RV64-NEXT:    slli a4, a4, 32
+; CHECK-ALIGNED-RV64-NEXT:    or a3, a4, a3
+; CHECK-ALIGNED-RV64-NEXT:    xor a2, a2, a3
+; CHECK-ALIGNED-RV64-NEXT:    lbu a3, 9(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a4, 8(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a5, 10(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a6, 11(a0)
+; CHECK-ALIGNED-RV64-NEXT:    slli a3, a3, 8
+; CHECK-ALIGNED-RV64-NEXT:    or a3, a3, a4
+; CHECK-ALIGNED-RV64-NEXT:    slli a5, a5, 16
+; CHECK-ALIGNED-RV64-NEXT:    slli a6, a6, 24
+; CHECK-ALIGNED-RV64-NEXT:    or a4, a6, a5
+; CHECK-ALIGNED-RV64-NEXT:    or a3, a4, a3
+; CHECK-ALIGNED-RV64-NEXT:    lbu a4, 9(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a5, 8(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a6, 10(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a7, 11(a1)
+; CHECK-ALIGNED-RV64-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV64-NEXT:    or a4, a4, a5
+; CHECK-ALIGNED-RV64-NEXT:    slli a6, a6, 16
+; CHECK-ALIGNED-RV64-NEXT:    slli a7, a7, 24
+; CHECK-ALIGNED-RV64-NEXT:    or a5, a7, a6
+; CHECK-ALIGNED-RV64-NEXT:    lbu a6, 13(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a7, 12(a0)
+; CHECK-ALIGNED-RV64-NEXT:    or a4, a5, a4
+; CHECK-ALIGNED-RV64-NEXT:    xor a3, a3, a4
+; CHECK-ALIGNED-RV64-NEXT:    slli a6, a6, 8
+; CHECK-ALIGNED-RV64-NEXT:    or a4, a6, a7
+; CHECK-ALIGNED-RV64-NEXT:    lbu a5, 13(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a6, 12(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a0, 14(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a1, 14(a1)
+; CHECK-ALIGNED-RV64-NEXT:    slli a5, a5, 8
+; CHECK-ALIGNED-RV64-NEXT:    or a5, a5, a6
+; CHECK-ALIGNED-RV64-NEXT:    xor a4, a4, a5
+; CHECK-ALIGNED-RV64-NEXT:    xor a0, a0, a1
+; CHECK-ALIGNED-RV64-NEXT:    or a0, a4, a0
+; CHECK-ALIGNED-RV64-NEXT:    or a0, a3, a0
+; CHECK-ALIGNED-RV64-NEXT:    or a0, a2, a0
+; CHECK-ALIGNED-RV64-NEXT:    seqz a0, a0
+; CHECK-ALIGNED-RV64-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV32-LABEL: bcmp_size_15:
+; CHECK-UNALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV32-NEXT:    lw a2, 0(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a3, 0(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a4, 4(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a5, 4(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a6, 8(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a7, 8(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a0, 11(a0)
+; CHECK-UNALIGNED-RV32-NEXT:    lw a1, 11(a1)
+; CHECK-UNALIGNED-RV32-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV32-NEXT:    xor a4, a4, a5
+; CHECK-UNALIGNED-RV32-NEXT:    xor a3, a6, a7
+; CHECK-UNALIGNED-RV32-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV32-NEXT:    or a2, a2, a4
+; CHECK-UNALIGNED-RV32-NEXT:    or a0, a3, a0
+; CHECK-UNALIGNED-RV32-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV32-NEXT:    seqz a0, a0
+; CHECK-UNALIGNED-RV32-NEXT:    ret
+;
+; CHECK-UNALIGNED-RV64-LABEL: bcmp_size_15:
+; CHECK-UNALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-UNALIGNED-RV64-NEXT:    ld a2, 0(a0)
+; CHECK-UNALIGNED-RV64-NEXT:    ld a3, 0(a1)
+; CHECK-UNALIGNED-RV64-NEXT:    ld a0, 7(a0)
+; CHECK-UNALIGNED-RV64-NEXT:    ld a1, 7(a1)
+; CHECK-UNALIGNED-RV64-NEXT:    xor a2, a2, a3
+; CHECK-UNALIGNED-RV64-NEXT:    xor a0, a0, a1
+; CHECK-UNALIGNED-RV64-NEXT:    or a0, a2, a0
+; CHECK-UNALIGNED-RV64-NEXT:    seqz a0, a0
+; CHECK-UNALIGNED-RV64-NEXT:    ret
+entry:
+  %bcmp = call i32 @bcmp(i8* %s1, i8* %s2, iXLen 15)
+  %ret = icmp eq i32 %bcmp, 0
+  ret i1 %ret
+}
+
+define i1 @bcmp_size_31(i8* %s1, i8* %s2) {
+; CHECK-ALIGNED-RV32-LABEL: bcmp_size_31:
+; CHECK-ALIGNED-RV32:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, -16
+; CHECK-ALIGNED-RV32-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-ALIGNED-RV32-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; CHECK-ALIGNED-RV32-NEXT:    .cfi_offset ra, -4
+; CHECK-ALIGNED-RV32-NEXT:    li a2, 31
+; CHECK-ALIGNED-RV32-NEXT:    call bcmp
+; CHECK-ALIGNED-RV32-NEXT:    seqz a0, a0
+; CHECK-ALIGNED-RV32-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; CHECK-ALIGNED-RV32-NEXT:    addi sp, sp, 16
+; CHECK-ALIGNED-RV32-NEXT:    ret
+;
+; CHECK-ALIGNED-RV64-LABEL: bcmp_size_31:
+; CHECK-ALIGNED-RV64:       # %bb.0: # %entry
+; CHECK-ALIGNED-RV64-NEXT:    lbu a2, 1(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a3, 0(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a4, 2(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a5, 3(a0)
+; CHECK-ALIGNED-RV64-NEXT:    slli a2, a2, 8
+; CHECK-ALIGNED-RV64-NEXT:    or a2, a2, a3
+; CHECK-ALIGNED-RV64-NEXT:    slli a4, a4, 16
+; CHECK-ALIGNED-RV64-NEXT:    slli a5, a5, 24
+; CHECK-ALIGNED-RV64-NEXT:    or a4, a5, a4
+; CHECK-ALIGNED-RV64-NEXT:    or a2, a4, a2
+; CHECK-ALIGNED-RV64-NEXT:    lbu a3, 5(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a4, 4(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a5, 6(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a6, 7(a0)
+; CHECK-ALIGNED-RV64-NEXT:    slli a3, a3, 8
+; CHECK-ALIGNED-RV64-NEXT:    or a3, a3, a4
+; CHECK-ALIGNED-RV64-NEXT:    slli a5, a5, 16
+; CHECK-ALIGNED-RV64-NEXT:    slli a6, a6, 24
+; CHECK-ALIGNED-RV64-NEXT:    or a4, a6, a5
+; CHECK-ALIGNED-RV64-NEXT:    or a3, a4, a3
+; CHECK-ALIGNED-RV64-NEXT:    slli a3, a3, 32
+; CHECK-ALIGNED-RV64-NEXT:    or a2, a3, a2
+; CHECK-ALIGNED-RV64-NEXT:    lbu a3, 1(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a4, 0(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a5, 2(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a6, 3(a1)
+; CHECK-ALIGNED-RV64-NEXT:    slli a3, a3, 8
+; CHECK-ALIGNED-RV64-NEXT:    or a3, a3, a4
+; CHECK-ALIGNED-RV64-NEXT:    slli a5, a5, 16
+; CHECK-ALIGNED-RV64-NEXT:    slli a6, a6, 24
+; CHECK-ALIGNED-RV64-NEXT:    or a4, a6, a5
+; CHECK-ALIGNED-RV64-NEXT:    or a3, a4, a3
+; CHECK-ALIGNED-RV64-NEXT:    lbu a4, 5(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a5, 4(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a6, 6(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a7, 7(a1)
+; CHECK-ALIGNED-RV64-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV64-NEXT:    or a4, a4, a5
+; CHECK-ALIGNED-RV64-NEXT:    slli a6, a6, 16
+; CHECK-ALIGNED-RV64-NEXT:    slli a7, a7, 24
+; CHECK-ALIGNED-RV64-NEXT:    or a5, a7, a6
+; CHECK-ALIGNED-RV64-NEXT:    or a4, a5, a4
+; CHECK-ALIGNED-RV64-NEXT:    slli a4, a4, 32
+; CHECK-ALIGNED-RV64-NEXT:    or a3, a4, a3
+; CHECK-ALIGNED-RV64-NEXT:    xor a2, a2, a3
+; CHECK-ALIGNED-RV64-NEXT:    lbu a3, 9(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a4, 8(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a5, 10(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a6, 11(a0)
+; CHECK-ALIGNED-RV64-NEXT:    slli a3, a3, 8
+; CHECK-ALIGNED-RV64-NEXT:    or a3, a3, a4
+; CHECK-ALIGNED-RV64-NEXT:    slli a5, a5, 16
+; CHECK-ALIGNED-RV64-NEXT:    slli a6, a6, 24
+; CHECK-ALIGNED-RV64-NEXT:    or a4, a6, a5
+; CHECK-ALIGNED-RV64-NEXT:    or a3, a4, a3
+; CHECK-ALIGNED-RV64-NEXT:    lbu a4, 13(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a5, 12(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a6, 14(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a7, 15(a0)
+; CHECK-ALIGNED-RV64-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV64-NEXT:    or a4, a4, a5
+; CHECK-ALIGNED-RV64-NEXT:    slli a6, a6, 16
+; CHECK-ALIGNED-RV64-NEXT:    slli a7, a7, 24
+; CHECK-ALIGNED-RV64-NEXT:    or a5, a7, a6
+; CHECK-ALIGNED-RV64-NEXT:    or a4, a5, a4
+; CHECK-ALIGNED-RV64-NEXT:    slli a4, a4, 32
+; CHECK-ALIGNED-RV64-NEXT:    or a3, a4, a3
+; CHECK-ALIGNED-RV64-NEXT:    lbu a4, 9(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a5, 8(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a6, 10(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a7, 11(a1)
+; CHECK-ALIGNED-RV64-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV64-NEXT:    or a4, a4, a5
+; CHECK-ALIGNED-RV64-NEXT:    slli a6, a6, 16
+; CHECK-ALIGNED-RV64-NEXT:    slli a7, a7, 24
+; CHECK-ALIGNED-RV64-NEXT:    or a5, a7, a6
+; CHECK-ALIGNED-RV64-NEXT:    or a4, a5, a4
+; CHECK-ALIGNED-RV64-NEXT:    lbu a5, 13(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a6, 12(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a7, 14(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu t0, 15(a1)
+; CHECK-ALIGNED-RV64-NEXT:    slli a5, a5, 8
+; CHECK-ALIGNED-RV64-NEXT:    or a5, a5, a6
+; CHECK-ALIGNED-RV64-NEXT:    slli a7, a7, 16
+; CHECK-ALIGNED-RV64-NEXT:    slli t0, t0, 24
+; CHECK-ALIGNED-RV64-NEXT:    or a6, t0, a7
+; CHECK-ALIGNED-RV64-NEXT:    or a5, a6, a5
+; CHECK-ALIGNED-RV64-NEXT:    slli a5, a5, 32
+; CHECK-ALIGNED-RV64-NEXT:    or a4, a5, a4
+; CHECK-ALIGNED-RV64-NEXT:    xor a3, a3, a4
+; CHECK-ALIGNED-RV64-NEXT:    lbu a4, 17(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a5, 16(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a6, 18(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a7, 19(a0)
+; CHECK-ALIGNED-RV64-NEXT:    slli a4, a4, 8
+; CHECK-ALIGNED-RV64-NEXT:    or a4, a4, a5
+; CHECK-ALIGNED-RV64-NEXT:    slli a6, a6, 16
+; CHECK-ALIGNED-RV64-NEXT:    slli a7, a7, 24
+; CHECK-ALIGNED-RV64-NEXT:    or a5, a7, a6
+; CHECK-ALIGNED-RV64-NEXT:    or a4, a5, a4
+; CHECK-ALIGNED-RV64-NEXT:    lbu a5, 21(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a6, 20(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a7, 22(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu t0, 23(a0)
+; CHECK-ALIGNED-RV64-NEXT:    slli a5, a5, 8
+; CHECK-ALIGNED-RV64-NEXT:    or a5, a5, a6
+; CHECK-ALIGNED-RV64-NEXT:    slli a7, a7, 16
+; CHECK-ALIGNED-RV64-NEXT:    slli t0, t0, 24
+; CHECK-ALIGNED-RV64-NEXT:    or a6, t0, a7
+; CHECK-ALIGNED-RV64-NEXT:    or a5, a6, a5
+; CHECK-ALIGNED-RV64-NEXT:    slli a5, a5, 32
+; CHECK-ALIGNED-RV64-NEXT:    or a4, a5, a4
+; CHECK-ALIGNED-RV64-NEXT:    lbu a5, 17(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a6, 16(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a7, 18(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu t0, 19(a1)
+; CHECK-ALIGNED-RV64-NEXT:    slli a5, a5, 8
+; CHECK-ALIGNED-RV64-NEXT:    or a5, a5, a6
+; CHECK-ALIGNED-RV64-NEXT:    slli a7, a7, 16
+; CHECK-ALIGNED-RV64-NEXT:    slli t0, t0, 24
+; CHECK-ALIGNED-RV64-NEXT:    or a6, t0, a7
+; CHECK-ALIGNED-RV64-NEXT:    or a5, a6, a5
+; CHECK-ALIGNED-RV64-NEXT:    lbu a6, 21(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a7, 20(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu t0, 22(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu t1, 23(a1)
+; CHECK-ALIGNED-RV64-NEXT:    slli a6, a6, 8
+; CHECK-ALIGNED-RV64-NEXT:    or a6, a6, a7
+; CHECK-ALIGNED-RV64-NEXT:    slli t0, t0, 16
+; CHECK-ALIGNED-RV64-NEXT:    slli t1, t1, 24
+; CHECK-ALIGNED-RV64-NEXT:    or a7, t1, t0
+; CHECK-ALIGNED-RV64-NEXT:    or a6, a7, a6
+; CHECK-ALIGNED-RV64-NEXT:    slli a6, a6, 32
+; CHECK-ALIGNED-RV64-NEXT:    or a5, a6, a5
+; CHECK-ALIGNED-RV64-NEXT:    xor a4, a4, a5
+; CHECK-ALIGNED-RV64-NEXT:    lbu a5, 25(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a6, 24(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a7, 26(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu t0, 27(a0)
+; CHECK-ALIGNED-RV64-NEXT:    slli a5, a5, 8
+; CHECK-ALIGNED-RV64-NEXT:    or a5, a5, a6
+; CHECK-ALIGNED-RV64-NEXT:    slli a7, a7, 16
+; CHECK-ALIGNED-RV64-NEXT:    slli t0, t0, 24
+; CHECK-ALIGNED-RV64-NEXT:    or a6, t0, a7
+; CHECK-ALIGNED-RV64-NEXT:    or a5, a6, a5
+; CHECK-ALIGNED-RV64-NEXT:    lbu a6, 25(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a7, 24(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu t0, 26(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu t1, 27(a1)
+; CHECK-ALIGNED-RV64-NEXT:    slli a6, a6, 8
+; CHECK-ALIGNED-RV64-NEXT:    or a6, a6, a7
+; CHECK-ALIGNED-RV64-NEXT:    slli t0, t0, 16
+; CHECK-ALIGNED-RV64-NEXT:    slli t1, t1, 24
+; CHECK-ALIGNED-RV64-NEXT:    or a7, t1, t0
+; CHECK-ALIGNED-RV64-NEXT:    lbu t0, 29(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu t1, 28(a0)
+; CHECK-ALIGNED-RV64-NEXT:    or a6, a7, a6
+; CHECK-ALIGNED-RV64-NEXT:    xor a5, a5, a6
+; CHECK-ALIGNED-RV64-NEXT:    slli t0, t0, 8
+; CHECK-ALIGNED-RV64-NEXT:    or a6, t0, t1
+; CHECK-ALIGNED-RV64-NEXT:    lbu a7, 29(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu t0, 28(a1)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a0, 30(a0)
+; CHECK-ALIGNED-RV64-NEXT:    lbu a1, 30(a1)
+; CHECK-ALIGNED-RV64-NEXT:    slli a7, a7, 8
+; CHECK-ALIGNED-RV64-NEXT:    or a7, a7, t0
+; CHECK-ALIGNED-RV64-NEXT:    xor a6, a6, a7
+; CHECK-ALIGNED-RV64-NEXT:    xor a0, a0, a1
+; CHECK-ALIGNED-RV64-NEXT:    or a2, a2, a3
+; CHECK-ALIGNED-RV64-NEXT:    or a4, a4, a5
+; CHECK-ALIGNED-RV64-NEXT:    or a0, a6, a0
+; CHECK-ALIGNED-RV64-NEXT:    or a2, a2, a4
+; CHECK-ALIGNED-RV64-NEXT:    ...
[truncated]
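
As an aside on the options in the patch above: AllowOverlappingLoads is what lets a 15-byte bcmp be covered by two 8-byte loads per operand whose ranges overlap by one byte, as the CHECK-UNALIGNED-RV64 output shows; as I read it, AllowedTailExpansions = {3, 5, 6} similarly permits odd-sized remainders of those byte counts to be handled with a pair of narrower loads. A hedged IR sketch of the overlapping-load shape, assuming RV64 with unaligned scalar access (offsets and names are illustrative, not the pass's exact output):

define i1 @bcmp_size_15_overlapping(ptr %s1, ptr %s2) {
entry:
  ; Bytes [0, 8) of each buffer.
  %a.lo = load i64, ptr %s1, align 1
  %b.lo = load i64, ptr %s2, align 1
  ; Bytes [7, 15): the second load overlaps the first by one byte,
  ; so no separate 4/2/1-byte tail loads are needed.
  %s1.hi = getelementptr i8, ptr %s1, i64 7
  %s2.hi = getelementptr i8, ptr %s2, i64 7
  %a.hi = load i64, ptr %s1.hi, align 1
  %b.hi = load i64, ptr %s2.hi, align 1
  %d.lo = xor i64 %a.lo, %b.lo
  %d.hi = xor i64 %a.hi, %b.hi
  %d = or i64 %d.lo, %d.hi
  %ret = icmp eq i64 %d, 0
  ret i1 %ret
}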

@lukel97
Contributor

lukel97 commented Sep 6, 2024

Looks like some SPEC benchmarks use memcmp; I'm benchmarking them with this patch now. I'll update you when it finishes :)

TTI::MemCmpExpansionOptions Options;
// FIXME: Vector haven't been tested.
Options.AllowOverlappingLoads =
(ST->enableUnalignedScalarMem() || ST->enableUnalignedVectorMem());
Collaborator

Unless I'm missing something, there is no check for sufficient alignment on any load generated by ExpandMemCmp; as a result, we can only legally provide a non-byte size if unaligned access is fully supported.

(This is really a bug in ExpandMemCmp - it should be asking whether the load is properly aligned.)

@wangpc-pp wangpc-pp changed the base branch from main to users/wangpc-pp/spr/main.riscv-add-initial-support-of-memcmp-expansion September 9, 2024 12:11
@asb
Contributor

asb commented Sep 10, 2024

This is perhaps more of a comment for #107824 than for this one, but I think we'd benefit from some level of test coverage for OptForSize to demonstrate that we stick with the libcall in cases where expanding increases code size.

@wangpc-pp
Contributor Author

This is perhaps more of a comment for #107824 than for this one, but I think we'd benefit from some level of test coverage for OptForSize to demonstrate that we stick with the libcall in cases where expanding increases code size.

Thanks! Done!

@asb
Contributor

asb commented Sep 10, 2024

This is perhaps more of a comment for #107824 than for this one, but I think we'd benefit from some level of test coverage for OptForSize to demonstrate that we stick with the libcall in cases where expanding increases code size.

Thanks! Done!

Cheers! It strikes me that we should probably be less aggressive about expanding memcmp, at least for Os without unaligned access support. I'm not quite sure where the cutoff should be - Os of course has some leeway for increasing code size if we think it will benefit performance, and I don't think we need to spend too much time trying to make a "perfect" decision when there's no single right answer. But e.g. bcmp_size_8 for Os without misaligned access support seems like more code-size blowup than I'd expect at that optimisation level. I'd welcome views on what a sensible threshold would be.

@lukel97
Contributor

lukel97 commented Sep 11, 2024

I collected stats on the number of memcmps that were inlined; it looks like we're able to expand a good chunk of them:

Program                                       expand-memcmp.NumMemCmpCalls              expand-memcmp.NumMemCmpInlined             
                                              lhs                          rhs    diff  lhs                            rhs    diff 
FP2017rate/510.parest_r/510.parest_r          410.00                       468.00 14.1%                                104.00  inf%
INT2017speed/602.gcc_s/602.gcc_s               83.00                        92.00 10.8%                                 36.00  inf%
INT2017rate/502.gcc_r/502.gcc_r                83.00                        92.00 10.8%                                 36.00  inf%
INT2017spe...00.perlbench_s/600.perlbench_s   207.00                       220.00  6.3%                                120.00  inf%
INT2017rat...00.perlbench_r/500.perlbench_r   207.00                       220.00  6.3%                                120.00  inf%
INT2017spe...ed/620.omnetpp_s/620.omnetpp_s   304.00                       306.00  0.7%                                 13.00  inf%
INT2017rate/520.omnetpp_r/520.omnetpp_r       304.00                       306.00  0.7%                                 13.00  inf%
FP2017rate/508.namd_r/508.namd_r               13.00                        13.00  0.0%                                 13.00  inf%
INT2017rate/541.leela_r/541.leela_r            40.00                        40.00  0.0%                                  3.00  inf%
INT2017speed/641.leela_s/641.leela_s           40.00                        40.00  0.0%                                  3.00  inf%
INT2017speed/625.x264_s/625.x264_s              8.00                         8.00  0.0%                                  6.00  inf%
INT2017spe...23.xalancbmk_s/623.xalancbmk_s     8.00                         8.00  0.0%                                  6.00  inf%
INT2017rate/557.xz_r/557.xz_r                   6.00                         6.00  0.0%                                  4.00  inf%
INT2017rat...23.xalancbmk_r/523.xalancbmk_r     8.00                         8.00  0.0%                                  6.00  inf%
INT2017rate/525.x264_r/525.x264_r               8.00                         8.00  0.0%                                  6.00  inf%
FP2017speed/644.nab_s/644.nab_s                77.00                        77.00  0.0%                                 71.00  inf%
FP2017speed/638.imagick_s/638.imagick_s         3.00                         3.00  0.0%                                            
FP2017rate/544.nab_r/544.nab_r                 77.00                        77.00  0.0%                                 71.00  inf%
FP2017rate/538.imagick_r/538.imagick_r          3.00                         3.00  0.0%                                            
FP2017rate/526.blender_r/526.blender_r         41.00                        41.00  0.0%                                 27.00  inf%
FP2017rate/511.povray_r/511.povray_r            5.00                         5.00  0.0%                                  5.00  inf%
INT2017speed/657.xz_s/657.xz_s                  6.00                         6.00  0.0%                                  4.00  inf%

There's a small difference in the number of original memcmp calls; there are some merge commits in this branch which might have changed the codegen slightly in the meantime.

I'm working on getting some runtime numbers now, sorry for the delay

@wangpc-pp
Contributor Author

I'm working on getting some runtime numbers now, sorry for the delay.

No need to say sorry, I really appreciate your help! 😃

@lukel97
Contributor

lukel97 commented Sep 12, 2024

The run just finished, I'm seeing a 0.75% improvement on 500.perlbench_r on the BPI F3 (-O3 -mcpu=spacemit-x60), no regressions or improvements on the other benchmarks as far as I can see. Seems to check out with the number of memcmps inlined reported for perlbench!

@wangpc-pp
Contributor Author

The run just finished, I'm seeing a 0.75% improvement on 500.perlbench_r on the BPI F3 (-O3 -mcpu=spacemit-x60), no regressions or improvements on the other benchmarks as far as I can see. Seems to check out with the number of memcmps inlined reported for perlbench!

Thanks a lot! The result is in line with my expectations.
Is it OK to merge? The next step will be tuning for vectors.

; CHECK-ALIGNED-RV32-NEXT: call bcmp
; CHECK-ALIGNED-RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; CHECK-ALIGNED-RV32-NEXT: addi sp, sp, 16
; CHECK-ALIGNED-RV32-NEXT: lbu a2, 1(a0)
Collaborator

Would it be cheaper to Xor all the bytes individually and then Or the xor results together?

Contributor Author

It costs the same number of instructions?

Collaborator

I was proposing

lbu a2, 1(a0)
lbu a0, 0(a0)
lbu a3, 1(a1)
lbu a1, 0(a1)
xor a2, a2, a3
xor a0, a0, a1
or  a2, a2, a0
snez a2, a2

that looks like fewer instructions.

Contributor Author

Great! This is an optimization we can do in ExpandMemCmp; I will take a look.

Contributor Author

This seems to be more complicated than what I was thinking. For this case, we generate i16 loads/compares, and then we expand them since unaligned loads are illegal. We may want to expand memcmp directly to a byte loop when there is no unaligned access support.
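
For reference, a hedged IR sketch of what the byte-wise expansion proposed above would look like at the IR level, shown for a 2-byte equality check (this is not what ExpandMemCmp emits today; names are illustrative):

define i1 @bcmp_ne_2_bytewise(ptr %s1, ptr %s2) {
entry:
  ; Each load stays at its natural i8 width, so nothing has to be
  ; re-expanded later on targets without unaligned access.
  %a0 = load i8, ptr %s1, align 1
  %b0 = load i8, ptr %s2, align 1
  %s1.1 = getelementptr i8, ptr %s1, i64 1
  %s2.1 = getelementptr i8, ptr %s2, i64 1
  %a1 = load i8, ptr %s1.1, align 1
  %b1 = load i8, ptr %s2.1, align 1
  ; XOR each byte pair, OR the results, then a single test,
  ; mirroring the xor/xor/or/snez sequence above.
  %d0 = xor i8 %a0, %b0
  %d1 = xor i8 %a1, %b1
  %d = or i8 %d0, %d1
  %ne = icmp ne i8 %d, 0
  ret i1 %ne
}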

; CHECK-UNALIGNED-RV32-NEXT: call memcmp
; CHECK-UNALIGNED-RV32-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
; CHECK-UNALIGNED-RV32-NEXT: addi sp, sp, 16
; CHECK-UNALIGNED-RV32-NEXT: lw a0, 0(a0)
Collaborator

@topperc topperc Sep 13, 2024

What is this code doing? It seems way more complicated than a 4 byte memcmp should be. In fact it's longer than the ALIGNED version.

Collaborator

I guess this is an expanded byteswap?

Collaborator

Add a RUN line with Zbb/Zbkb?

Maybe we should restrict MemCmp expansion to only when Zbb/Zbkb are enabled?

Contributor Author

@wangpc-pp wangpc-pp Oct 10, 2024

Done.
It seems we can benefit from not only rev8 instructions but also pack instructions (for forming large integers).
But I don't think we should disable the expansion when Zbb/Zbkb don't exist. If these extensions are not supported and we don't expand memcmp, we would still execute these instructions to do the same operations in glibc, right?

Collaborator

The library call will do a byte loop without Zbb I think? I don't think it will do the byte swap with shift/and/or.

So the question is, is doing a wide load and an emulated byteswap cheaper than the byte loop?

Contributor Author

Good question; I don't know the answer, but I think we can be aggressive here and enable it for all configurations. If we get evidence that shows inefficiencies, we can come back and disable it. WDYT?

Collaborator

Can we test the non-Zbb config on qemu and get the instruction count?

Contributor Author

Here is the code of memcmp copied from glibc: https://godbolt.org/z/4KxPTE6q1
There are many cases (which means branches) in this general implementation; at the very least we can benefit from unrolling and the removal of branches.
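
To make the Zbb/Zbkb trade-off above concrete: for a 3-way memcmp (as opposed to an equality-only bcmp), the expansion has to compare the loaded words as big-endian values, so on little-endian RISC-V each wide load is followed by a byte swap, which is a single rev8 with Zbb/Zbkb but a shift/and/or sequence without. A hedged IR sketch of that shape for a 4-byte memcmp (illustrative, not the pass's exact output):

declare i32 @llvm.bswap.i32(i32)

define i32 @memcmp_size_4_sketch(ptr %s1, ptr %s2) {
entry:
  %a = load i32, ptr %s1, align 1
  %b = load i32, ptr %s2, align 1
  ; Byte-swap so the byte at the lowest address becomes the most
  ; significant, matching memcmp's lexicographic ordering.
  %a.be = call i32 @llvm.bswap.i32(i32 %a)
  %b.be = call i32 @llvm.bswap.i32(i32 %b)
  %lt = icmp ult i32 %a.be, %b.be
  %gt = icmp ugt i32 %a.be, %b.be
  %lt.ext = zext i1 %lt to i32
  %gt.ext = zext i1 %gt to i32
  ; Negative if s1 < s2, positive if s1 > s2, zero if equal.
  %res = sub i32 %gt.ext, %lt.ext
  ret i32 %res
}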

@topperc
Collaborator

topperc commented Sep 13, 2024

The run just finished, I'm seeing a 0.75% improvement on 500.perlbench_r on the BPI F3 (-O3 -mcpu=spacemit-x60), no regressions or improvements on the other benchmarks as far as I can see. Seems to check out with the number of memcmps inlined reported for perlbench!

Does spacemit-x60 support unaligned scalar memory and was your test with or without that enabled?

@wangpc-pp
Contributor Author

The run just finished, I'm seeing a 0.75% improvement on 500.perlbench_r on the BPI F3 (-O3 -mcpu=spacemit-x60), no regressions or improvements on the other benchmarks as far as I can see. Seems to check out with the number of memcmps inlined reported for perlbench!

Does spacemit-x60 support unaligned scalar memory and was your test with or without that enabled?

It supports unaligned scalar accesses but not unaligned vector accesses, and it seems we don't add these features to -mcpu=spacemit-x60, so I think @lukel97 ran SPEC without unaligned scalar support.

@topperc
Collaborator

topperc commented Oct 9, 2024

Ping.

And some updates on vector support: currently, ExpandMemCmp will only generate integer types (i128, i256 and so on). So, if we simply add 128, 256, vlen to LoadSizes, the LLVM IR will use i128/i256/... and then they are expanded to legal scalar types as we don't mark i128/i256/... as legal when RVV exists.

There are two ways to fix this:

  1. Make ExpandMemCmp generate vector types/operations.
  2. Make i128/i256/... legal on RISC-V.

I think the first one is the right approach but I need some agreements on this.

I think X86 only supports IsZeroCmp for vector and has a pre-type legalization DAG combine to recognize the wide integer type. See X86ISelLowering combineVectorSizedSetCCEquality.

General memcmp is complicated with vectors. You need to find the first element where the mismatch occurred and then compare only that element to determine whether it is larger or smaller. I don't know if you can write that in target-independent IR without it being really nasty, and it wouldn't generate efficient code.
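
For illustration, the equality-only (IsZeroCmp) vector shape under discussion could in principle be expressed directly in IR with vector loads and an OR reduction; whether ExpandMemCmp should emit this itself or the backend should instead match a wide integer compare later (as X86 does) is exactly the open design question. A hedged sketch for a 16-byte compare, with all names illustrative:

declare i8 @llvm.vector.reduce.or.v16i8(<16 x i8>)

define i1 @bcmp_eq_16_vector_sketch(ptr %s1, ptr %s2) {
entry:
  %a = load <16 x i8>, ptr %s1, align 1
  %b = load <16 x i8>, ptr %s2, align 1
  %d = xor <16 x i8> %a, %b
  ; Any nonzero lane means the buffers differ.
  %r = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %d)
  %eq = icmp eq i8 %r, 0
  ret i1 %eq
}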

@wangpc-pp
Contributor Author

Ping.
And some updates on vector support: currently, ExpandMemCmp will only generate integer types (i128, i256 and so on). So, if we simply add 128, 256, vlen to LoadSizes, the LLVM IR will use i128/i256/... and then they are expanded to legal scalar types as we don't mark i128/i256/... as legal when RVV exists.
There are two ways to fix this:

  1. Make ExpandMemCmp generate vector types/operations.
  2. Make i128/i256/... legal on RISC-V.

I think the first one is the right approach but I need some agreements on this.

I think X86 only supports IsZeroCmp for vector and has a pre-type legalization DAG combine to recognize the wide integer type. See X86ISelLowering combineVectorSizedSetCCEquality.

Thanks! I didn't notice that; I will try to implement this similarly for RISC-V.

General memcmp is complicated with vector. You need to find the first element where the mismatch occurred and then compare only that element to find whether it is larger or smaller. I don't know if you can write that in target independent IR without it being really nasty and it wouldn't generate efficient code.

Yes, I have noticed that. I was thinking that if we generate vector code earlier, we may get more room for optimization. I will still try this approach, but I won't hold out much hope.

@wangpc-pp
Contributor Author

Ping, any comments on the current scalar part? I'm working on vector expansion and will post it in a few days.

define i1 @bcmp_lt_zero(ptr %s1, ptr %s2) nounwind optsize {
; CHECK-LABEL: bcmp_lt_zero:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: li a0, 0
Collaborator

What happened here?

Contributor Author

It expands to:

entry:
  %0 = load i32, ptr %s1, align 1
  %1 = load i32, ptr %s2, align 1
  %2 = icmp ne i32 %0, %1
  %3 = zext i1 %2 to i32
  %ret = icmp slt i32 %3, 0
  ret i1 %ret

And it is optimized to:

entry:
  %0 = load i32, ptr %s1, align 1
  %1 = load i32, ptr %s2, align 1
  %2 = icmp ne i32 %0, %1
  %3 = zext i1 %2 to i32
  ret i1 false

This is how bcmp is expanded. (The zext of an i1 can never be negative, so the slt comparison against zero folds to false and the loads become dead.)

Collaborator

I guess LLVM is assuming that the bcmp result should only be checked for eq/ne 0 and not gt/lt 0?

Contributor Author

I don't know if this is an assumption about bcmp or just the behavior of ExpandMemCmp. I will keep these tests.

; CHECK-ALIGNED-RV64-NEXT: lbu a2, 0(a0)
; CHECK-ALIGNED-RV64-NEXT: lbu a3, 1(a0)
; CHECK-ALIGNED-RV64-NEXT: lbu a4, 2(a0)
; CHECK-ALIGNED-RV64-NEXT: lb a0, 3(a0)
Collaborator

Why didn't this use an lbu to avoid the andi later?

Contributor Author

It seems to be related to https://reviews.llvm.org/D130397. Maybe we should revert this change or add a pattern to catch this.

@preames
Collaborator

preames commented Nov 4, 2024

At a macro level, it looks like ExpandMemCmp is making some problematic choices around unaligned loads and stores. As I commented before, ExpandMemCmp appears to be blindly emitting unaligned accesses (counted as one against the budget) without accounting for the fact that such loads are going to be scalarized again (i.e. resulting in N x loads, where N is the type size). I think we need to fix this. In particular, the discussion around Zbb and Zbkb in this review seems to mostly come from cases where unaligned loads/stores are being expanded implicitly.

I don't believe this change should move forward until the underlying issue in ExpandMemCmp has been addressed.

@topperc
Collaborator

topperc commented Nov 4, 2024

At a macro level, it looks like ExpandMemCmp is making some problematic choices around unaligned loads and stores. As I commented before, ExpandMemCmp appears to be blindly emitting unaligned accesses (counted as one against the budget) without accounting for the fact that such loads are going to be scalarized again (i.e. resulting in N x loads, where N is the type size). I think we need to fix this. In particular, the discussion around Zbb and Zbkb in this review seems to mostly come from cases where unaligned loads/stores are being expanded implicitly.

I don't believe this change should move forward until the underlying issue in ExpandMemCmp has been addressed.

Can we break the enabling down into more manageable pieces? I think enableUnalignedScalarMem() && (Subtarget->hasStdExtZbb() || Subtarget->hasStdExtZbkb() || IsZeroCmp) might be a good starting point.

@preames
Collaborator

preames commented Nov 4, 2024

Can we break the enabling down into more manageable pieces? I think enableUnalignedScalarMem() && (Subtarget->hasStdExtZbb() || Subtarget->hasStdExtZbkb() || IsZeroCmp) might be a good starting point.

I'd be fine with this type of approach.

@wangpc-pp
Contributor Author

Can we break the enabling down into more manageable pieces? I think enableUnalignedScalarMem() && (Subtarget->hasStdExtZbb() || Subtarget->hasStdExtZbkb() || IsZeroCmp) might be a good starting point.

I'd be fine with this type of approach.

Thanks! I have applied this suggestion. As for the ExpandMemCmp issue, I will try to fix it later.

@topperc
Collaborator

topperc commented Nov 5, 2024

Can we break the enabling down into more manageable pieces? I think enableUnalignedScalarMem() && (Subtarget->hasStdExtZbb() || Subtarget->hasStdExtZbkb() || IsZeroCmp) might be a good starting point.

I'd be fine with this type of approach.

Thanks! I have applied this suggestion. As for the ExpandMemCmp issue, I will try to fix it later.

I meant we should start with only doing inline memcmp expansion under those conditions. Not just the overlapping loads part. The miscounting of loads that get split needs to be fixed before we can enable expansion without enableUnalignedScalarMem().

@wangpc-pp
Contributor Author

Can we break the enabling down into more manageable pieces? I think enableUnalignedScalarMem() && (Subtarget->hasStdExtZbb() || Subtarget->hasStdExtZbkb() || IsZeroCmp) might be a good starting point.

I'd be fine with this type of approach.

Thanks! I have applied this suggestion. As for the ExpandMemCmp issue, I will try to fix it later.

I meant we should start with only doing inline memcmp expansion under those conditions. Not just the overlapping loads part. The miscounting of loads that get split needs to be fixed before we can enable expansion without enableUnalignedScalarMem().

Oops, sorry I misunderstood it. Done.

Collaborator

@topperc topperc left a comment

LGTM, but wait for another review.

Collaborator

@preames preames left a comment

LGTM

@wangpc-pp wangpc-pp changed the base branch from users/wangpc-pp/spr/main.riscv-add-initial-support-of-memcmp-expansion to main November 6, 2024 07:44
@wangpc-pp wangpc-pp merged commit 7a5b040 into main Nov 6, 2024
6 of 10 checks passed
@wangpc-pp wangpc-pp deleted the users/wangpc-pp/spr/riscv-add-initial-support-of-memcmp-expansion branch November 6, 2024 07:44
lukel97 added a commit that referenced this pull request Nov 6, 2024
I can't find any official documentation on this, but from other
discussions[^1] and my own testing the spacemit-x60 seems to support
unaligned scalar loads and stores.

They seem to be performant, and just from a quick test we get a 2.45%
speedup on 500.perlbench_r on the Banana Pi F3[^2].

This would allow it to take advantage of #107548.

[^1]:
#110454 (comment)
[^2]: https://lnt.lukelau.me/db_default/v4/nts/32