Skip to content

Commit c055e3e

Browse files
ardbiesheuvel authored and herbertx committed
crypto: xor - use ktime for template benchmarking
Currently, we use the jiffies counter as a time source, by staring at it until a HZ period elapses, and then staring at it again and performing as many XOR operations as we can at the same time until another HZ period elapses, so that we can calculate the throughput. This takes longer than necessary, and depends on HZ, which is undesirable, since HZ is system dependent.

Let's use the ktime interface instead, and use it to time a fixed number of XOR operations, which can be done much faster, and makes the time spent depend on the performance level of the system itself, which is much more reasonable. To ensure that we have the resolution we need even on systems with 32 kHz time sources, while not spending too much time in the benchmark on a slow CPU, let's switch to 3 attempts of 800 repetitions each: that way, we will only misidentify algorithms that perform within 10% of each other as the fastest if they are faster than 10 GB/s to begin with, which is not expected to occur on systems with such coarse clocks.

On ThunderX2, I get the following results:

Before:

  [72625.956765] xor: measuring software checksum speed
  [72625.993104]    8regs     : 10169.000 MB/sec
  [72626.033099]    32regs    : 12050.000 MB/sec
  [72626.073095]    arm64_neon: 11100.000 MB/sec
  [72626.073097] xor: using function: 32regs (12050.000 MB/sec)

After:

  [72599.650216] xor: measuring software checksum speed
  [72599.651188]    8regs           : 10491 MB/sec
  [72599.652006]    32regs          : 12345 MB/sec
  [72599.652871]    arm64_neon      : 11402 MB/sec
  [72599.652873] xor: using function: 32regs (12345 MB/sec)

Link: https://lore.kernel.org/linux-crypto/[email protected]/
Signed-off-by: Ard Biesheuvel <[email protected]>
Reviewed-by: Douglas Anderson <[email protected]>
Signed-off-by: Herbert Xu <[email protected]>
1 parent 524ccdb commit c055e3e

File tree

1 file changed

+16
-22
lines changed

1 file changed

+16
-22
lines changed

crypto/xor.c

Lines changed: 16 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -76,49 +76,43 @@ static int __init register_xor_blocks(void)
7676
}
7777
#endif
7878

79-
#define BENCH_SIZE (PAGE_SIZE)
79+
#define BENCH_SIZE 4096
80+
#define REPS 800U
8081

8182
static void __init
8283
do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2)
8384
{
8485
int speed;
85-
unsigned long now, j;
86-
int i, count, max;
86+
int i, j, count;
87+
ktime_t min, start, diff;
8788

8889
tmpl->next = template_list;
8990
template_list = tmpl;
9091

9192
preempt_disable();
9293

93-
/*
94-
* Count the number of XORs done during a whole jiffy, and use
95-
* this to calculate the speed of checksumming. We use a 2-page
96-
* allocation to have guaranteed color L1-cache layout.
97-
*/
98-
max = 0;
99-
for (i = 0; i < 5; i++) {
100-
j = jiffies;
101-
count = 0;
102-
while ((now = jiffies) == j)
103-
cpu_relax();
104-
while (time_before(jiffies, now + 1)) {
94+
min = (ktime_t)S64_MAX;
95+
for (i = 0; i < 3; i++) {
96+
start = ktime_get();
97+
for (j = 0; j < REPS; j++) {
10598
mb(); /* prevent loop optimzation */
10699
tmpl->do_2(BENCH_SIZE, b1, b2);
107100
mb();
108101
count++;
109102
mb();
110103
}
111-
if (count > max)
112-
max = count;
104+
diff = ktime_sub(ktime_get(), start);
105+
if (diff < min)
106+
min = diff;
113107
}
114108

115109
preempt_enable();
116110

117-
speed = max * (HZ * BENCH_SIZE / 1024);
111+
// bytes/ns == GB/s, multiply by 1000 to get MB/s [not MiB/s]
112+
speed = (1000 * REPS * BENCH_SIZE) / (unsigned int)ktime_to_ns(min);
118113
tmpl->speed = speed;
119114

120-
printk(KERN_INFO " %-10s: %5d.%03d MB/sec\n", tmpl->name,
121-
speed / 1000, speed % 1000);
115+
pr_info(" %-16s: %5d MB/sec\n", tmpl->name, speed);
122116
}
123117

124118
static int __init
@@ -158,8 +152,8 @@ calibrate_xor_blocks(void)
158152
if (f->speed > fastest->speed)
159153
fastest = f;
160154

161-
printk(KERN_INFO "xor: using function: %s (%d.%03d MB/sec)\n",
162-
fastest->name, fastest->speed / 1000, fastest->speed % 1000);
155+
pr_info("xor: using function: %s (%d MB/sec)\n",
156+
fastest->name, fastest->speed);
163157

164158
#undef xor_speed
165159

0 commit comments

Comments
 (0)