Closed
Description
Consider the following benchmark:
#![feature(asm)]
extern crate test;
use test::{Bencher};
// Builds the input buffer used by the benchmarks in this file: a vector of
// 1024 * 65 bytes, every one of them 0. Because no byte equals 1, a search
// for the value 1 has to walk the entire buffer.
// NOTE(review): `#[inline(never)]` presumably keeps the optimizer from seeing
// the buffer contents at the call site — confirm intent.
#[inline(never)]
fn gen() -> Vec<u8> {
Vec::from_elem(1024 * 65, 0)
}
// Benchmark: look for the first byte equal to 1 using the iterator's
// `position` adaptor. This is the variant the report identifies as slow.
#[bench]
fn position(b: &mut Bencher) {
let v = gen();
b.iter(|| {
// The result is passed to `black_box` so the optimizer cannot discard the search.
test::black_box(v.as_slice().iter().position(|&c| c == 1));
});
}
// Benchmark: hand-written search that walks the slice iterator while keeping
// a separate index counter `i`, stopping at the first byte equal to 1.
// `res` stays `None` when no such byte is found.
#[bench]
fn iter(b: &mut Bencher) {
let v = gen();
b.iter(|| {
let mut res = None;
let mut i = 0u;
// Note: the loop variable `b` shadows the `Bencher` parameter inside the loop body.
for &b in v.as_slice().iter() {
if b == 1 {
res = Some(i);
break;
}
i += 1;
}
// The result is passed to `black_box` so the optimizer cannot discard the search.
test::black_box(res);
});
}
// Benchmark: same search as the manual-counter version, but the index comes
// from the `enumerate` adaptor instead of a hand-maintained counter.
// `res` stays `None` when no byte equals 1.
#[bench]
fn enumerate(b: &mut Bencher) {
let v = gen();
b.iter(|| {
let mut res = None;
// Note: the loop variable `b` shadows the `Bencher` parameter inside the loop body.
for (i, &b) in v.as_slice().iter().enumerate() {
if b == 1 {
res = Some(i);
break;
}
}
// The result is passed to `black_box` so the optimizer cannot discard the search.
test::black_box(res);
});
}
// Benchmark: search by indexing the vector with every `i` in `0..v.len()`,
// stopping at the first element equal to 1. `res` stays `None` when no
// element matches.
#[bench]
fn _range(b: &mut Bencher) {
let v = gen();
b.iter(|| {
let mut res = None;
for i in range(0, v.len()) {
if v[i] == 1 {
res = Some(i);
break;
}
}
// The result is passed to `black_box` so the optimizer cannot discard the search.
test::black_box(res);
});
}
// Benchmark: hand-written x86 (AT&T syntax) byte scan for the first byte equal
// to 1, used as the baseline the iterator-based variants are compared against.
//
// Changes relative to the originally posted version:
//   * The loop uses numeric local labels (`2:` / `3:`) instead of the global
//     names `AGAIN` / `EXIT`, which clash as duplicate symbols if the asm
//     block is ever emitted more than once (e.g. after inlining).
//   * `cc` and `memory` are declared as clobbers, because the block modifies
//     the flags and reads memory the compiler is not otherwise told about.
//   * The "not found" branch now passes `None::<uint>`, matching the
//     `Option<uint>` passed by the "found" branch.
#[bench]
fn assembly(b: &mut Bencher) {
    let v = gen();
    b.iter(|| {
        // SAFETY: `end` is one past the last element of `v`. The asm compares
        // the cursor against `end` before every load, so it only dereferences
        // addresses inside `v`'s buffer. The initial `dec` is undone by the
        // first `inc` before any comparison or load happens.
        unsafe {
            let mut start = v.as_ptr();
            let end = start.offset(v.len() as int);
            asm!("
                dec $0
                .align 16, 0x90
            2:
                inc $0
                cmp $0, $1
                je 3f
                cmpb $$1, ($0)
                jne 2b
            3:
                " : "+r"(start) : "r"(end) : "cc", "memory");
            // After the loop `start` either points at the matching byte or
            // equals `end` when nothing matched.
            if start < end {
                test::black_box(Some(start as uint - v.as_ptr() as uint));
            } else {
                test::black_box(None::<uint>);
            }
        }
    });
}
Which produces the following output:
test _range ... bench: 65200 ns/iter (+/- 1033)
test assembly ... bench: 60802 ns/iter (+/- 248)
test enumerate ... bench: 64441 ns/iter (+/- 566)
test iter ... bench: 91170 ns/iter (+/- 465)
test position ... bench: 91112 ns/iter (+/- 384)
`position` is the correct abstraction for this, but its code is 50% slower than the naive assembly implementation and 40% slower than `enumerate`.
Metadata
Metadata
Assignees
Labels
No labels
Activity
jfager commented on Oct 20, 2014
I'm getting different results:
$ rustc --opt-level=3 --test slow_position.rs
$ ./slow_position --bench
running 4 tests
test _range ... bench: 35569 ns/iter (+/- 8183)
test enumerate ... bench: 35699 ns/iter (+/- 1615)
test iter ... bench: 35929 ns/iter (+/- 4164)
test position ... bench: 31404 ns/iter (+/- 10487)
test result: ok. 0 passed; 0 failed; 0 ignored; 4 measured
(the assembly version segfaulted on me).
mahkoh commented on Oct 20, 2014
wow. What platform?
jfager commented on Oct 20, 2014
On a macbook. Here's another run without a bunch of stuff in the background screwing up the variance:
$ rustc --opt-level=3 --test slow_position.rs
$ ./slow_position --bench
running 4 tests
test _range ... bench: 33728 ns/iter (+/- 2331)
test enumerate ... bench: 33677 ns/iter (+/- 2851)
test iter ... bench: 33699 ns/iter (+/- 3995)
test position ... bench: 28884 ns/iter (+/- 978)
arielb1 commented on Oct 20, 2014
The IL for `iter`'s inner loop is (de-gensymmed):
[code listing not preserved in this capture]
Which looks sane, except for the spurious null check (which remains in the assembly).
Benchmarks on my machine:
arielb1 commented on Oct 20, 2014
The spurious null check is the issue – writing a `nop` over it in the object code gives 51365±385 ns/iter on my laptop.
mahkoh commented on Jan 12, 2015
New code:
New results:
Unacceptable performance.
Eliminate excessive null-checks from slice iterators
rollup merge of rust-lang#21886: dotdash/fast_slice_iter
Auto merge of rust-lang#18193 - Wilfred:startup_error, r=lnicola