Skip to content

Iterator methods produce slow code #18193

Closed
@mahkoh

Description

@mahkoh
Contributor

Consider the following benchmark:

#![feature(asm)]

extern crate test;

use test::{Bencher};

#[inline(never)]
fn gen() -> Vec<u8> {
    Vec::from_elem(1024 * 65, 0)
}

#[bench]
fn position(b: &mut Bencher) {
    let v = gen();
    b.iter(|| {
        test::black_box(v.as_slice().iter().position(|&c| c == 1));
    });
}

#[bench]
fn iter(b: &mut Bencher) {
    let v = gen();
    b.iter(|| {
        let mut res = None;
        let mut i = 0u;
        for &b in v.as_slice().iter() {
            if b == 1 {
                res = Some(i);
                break;
            }
            i += 1;
        }
        test::black_box(res);
    });
}

#[bench]
fn enumerate(b: &mut Bencher) {
    let v = gen();
    b.iter(|| {
        let mut res = None;
        for (i, &b) in v.as_slice().iter().enumerate() {
            if b == 1 {
                res = Some(i);
                break;
            }
        }
        test::black_box(res);
    });
}

#[bench]
fn _range(b: &mut Bencher) {
    let v = gen();
    b.iter(|| {
        let mut res = None;
        for i in range(0, v.len()) {
            if v[i] == 1 {
                res = Some(i);
                break;
            }
        }
        test::black_box(res);
    });
}

#[bench]
fn assembly(b: &mut Bencher) {
    let v = gen();
    b.iter(|| {
        unsafe {
            let mut start = v.as_ptr();
            let end = start.offset(v.len() as int);
            asm!("
                dec $0
                .align 16, 0x90
            AGAIN:
                inc $0
                cmp $0, $1
                je EXIT
                cmpb $$1, ($0)
                jne AGAIN
            EXIT:
                " : "+r"(start) : "r"(end));
            if start < end {
                test::black_box(Some(start as uint - v.as_ptr() as uint));
            } else {
                test::black_box(None::<u8>);
            }
        }
    });
}

Which produces the following output:

test _range ... bench: 65200 ns/iter (+/- 1033)
test assembly ... bench: 60802 ns/iter (+/- 248)
test enumerate ... bench: 64441 ns/iter (+/- 566)
test iter ... bench: 91170 ns/iter (+/- 465)
test position ... bench: 91112 ns/iter (+/- 384)

position is the correct abstraction for this but its code is 50% slower than the naive assembly implementation and 40% slower than enumerate.

Activity

jfager

jfager commented on Oct 20, 2014

@jfager
Contributor

I'm getting different results:

$ rustc --opt-level=3 --test slow_position.rs
$ ./slow_position --bench

running 4 tests
test _range ... bench: 35569 ns/iter (+/- 8183)
test enumerate ... bench: 35699 ns/iter (+/- 1615)
test iter ... bench: 35929 ns/iter (+/- 4164)
test position ... bench: 31404 ns/iter (+/- 10487)

test result: ok. 0 passed; 0 failed; 0 ignored; 4 measured

(the assembly version segfaulted on me).

mahkoh

mahkoh commented on Oct 20, 2014

@mahkoh
ContributorAuthor

test position ... bench: 31404 ns/iter (+/- 10487)

wow. What platform?

jfager

jfager commented on Oct 20, 2014

@jfager
Contributor

On a macbook. Here's another run without a bunch of stuff in the background screwing up the variance:

$ rustc --opt-level=3 --test slow_position.rs
$ ./slow_position --bench

running 4 tests
test _range ... bench: 33728 ns/iter (+/- 2331)
test enumerate ... bench: 33677 ns/iter (+/- 2851)
test iter ... bench: 33699 ns/iter (+/- 3995)
test position ... bench: 28884 ns/iter (+/- 978)

arielb1

arielb1 commented on Oct 20, 2014

@arielb1
Contributor

The IL for iter-s inner loop is (de-gensymmed)

.loop.preheader: ; preds = %.noexc15
  br label %.loop

for_loopback.i: ; preds = %for_body.i14
  %16 = icmp eq i8* %18, %14
  br i1 %16, label %.noexc6.loopexit, label %.exit

.loop: ; preds = %.loop.preheader, %for_loopback.i
  %i = phi i64 [ %21, %for_loopback.i ], [ 0, %.loop.preheader ]
  %ix = phi i8* [ %18, %for_loopback.i ], [ %.pre, %.loop.preheader ]
  %17 = icmp eq i8* %ix, null ; WTF? spurious null check
  br i1 %17, label %.noexc6.loopexit, label %for_body.i14

for_body.i14: ; preds = .loop
  %18 = getelementptr inbounds i8* %ix, i64 1
  %19 = load i8* %ix, align 1
  %20 = icmp eq i8 %19, 1
  %21 = add i64 %i, 1
  br i1 %20, label %.noexc6.loopexit, label %for_loopback.i

Which looks sane, except for the spurious null check (which remains in the assembly).

Benchmarks on my machine:

$ grep model\ name /proc/cpuinfo | uniq
model name  : Intel(R) Core(TM) i5-3317U CPU @ 1.70GHz
$ rustc -O --test test.rs
$ ./test --bench

running 5 tests
test _range    ... bench:     51362 ns/iter (+/- 4319)
test assembly  ... bench:     51578 ns/iter (+/- 2938)
test enumerate ... bench:     51465 ns/iter (+/- 850)
test iter      ... bench:     77122 ns/iter (+/- 886)
test position  ... bench:     77147 ns/iter (+/- 1315)

test result: ok. 0 passed; 0 failed; 0 ignored; 5 measured
arielb1

arielb1 commented on Oct 20, 2014

@arielb1
Contributor

The spurious null check is the issue – writing a nop over it in the object code gives 51365±385 ns/iter on my laptop.

mahkoh

mahkoh commented on Jan 12, 2015

@mahkoh
ContributorAuthor

New code:

#![feature(asm)]
#![allow(unstable)]

extern crate test;

use test::{Bencher};

#[inline(never)]
fn gen() -> Vec<u8> {
    (0..1024*65).map(|_| 0).collect()
}

#[bench]
fn position(b: &mut Bencher) {
    let v = gen();
    b.iter(|| {
        test::black_box(v.as_slice().iter().position(|&c| c == 1));
    });
}

#[bench]
fn iter(b: &mut Bencher) {
    let v = gen();
    b.iter(|| {
        let mut res = None;
        let mut i = 0us;
        for &b in v.as_slice().iter() {
            if b == 1 {
                res = Some(i);
                break;
            }
            i += 1;
        }
        test::black_box(res);
    });
}

#[bench]
fn enumerate(b: &mut Bencher) {
    let v = gen();
    b.iter(|| {
        let mut res = None;
        for (i, &b) in v.as_slice().iter().enumerate() {
            if b == 1 {
                res = Some(i);
                break;
            }
        }
        test::black_box(res);
    });
}

#[bench]
fn _range(b: &mut Bencher) {
    let v = gen();
    b.iter(|| {
        let mut res = None;
        for i in (0..v.len()) {
            if v[i] == 1 {
                res = Some(i);
                break;
            }
        }
        test::black_box(res);
    });
}

#[bench]
fn assembly(b: &mut Bencher) {
    let v = gen();
    b.iter(|| {
        unsafe {
            let mut start = v.as_ptr();
            let end = start.offset(v.len() as isize);
            asm!("
                dec $0
                .align 16, 0x90
            AGAIN:
                inc $0
                cmp $0, $1
                je EXIT
                cmpb $$1, ($0)
                jne AGAIN
            EXIT:
                " : "+r"(start) : "r"(end));
            if start < end {
                test::black_box(Some(start as usize - v.as_ptr() as usize));
            } else {
                test::black_box(None::<u8>);
            }
        }
    });
}

New results:

test _range    ... bench:     95373 ns/iter (+/- 734)
test assembly  ... bench:     60773 ns/iter (+/- 396)
test enumerate ... bench:     95355 ns/iter (+/- 361)
test iter      ... bench:     91127 ns/iter (+/- 455)
test position  ... bench:     91081 ns/iter (+/- 543)

Unacceptable performance.

added a commit that references this issue on Feb 19, 2015

rollup merge of rust-lang#21886: dotdash/fast_slice_iter

added a commit that references this issue on Oct 8, 2024

Auto merge of rust-lang#18193 - Wilfred:startup_error, r=lnicola

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

      Development

      Participants

      @jfager@steveklabnik@arielb1@mahkoh

      Issue actions

        Iterator methods produce slow code · Issue #18193 · rust-lang/rust