Literal vec notation is less efficient than vec::from_elem #4403

bstrie · 2013-01-09T18:43:04Z

I'm updating a benchmark and noting performance differences. Here's one version (perlin-mut.rs):

struct Vec2 {
    x: f32,
    y: f32,
}

fn lerp(a: f32, b: f32, v: f32) -> f32 {
    a * (1f32 - v) + b * v
}

fn smooth(v: f32) -> f32 {
    v * v * (3f32 - 2f32 * v)
}

fn random_gradient(r: rand::Rng) -> Vec2 {
    let v = r.gen_float() * float::consts::pi * 2.0;
    Vec2{
        x: float::cos(v) as f32,
        y: float::sin(v) as f32,
    }
}

fn gradient(orig: Vec2, grad: Vec2, p: Vec2) -> f32 {
    let sp = Vec2{x: p.x - orig.x, y: p.y - orig.y};
    grad.x * sp.x + grad.y + sp.y
}

struct Noise2DContext {
    rgradients: ~[Vec2],
    permutations: ~[int],
}

fn Noise2DContext() -> ~Noise2DContext {
    let r = rand::Rng();
    let rgradients = do vec::from_fn(256) |_i| { random_gradient(r) };
    let mut permutations = do vec::from_fn(256) |i| { i as int };
    r.shuffle_mut(permutations);

    ~Noise2DContext{
        rgradients: move rgradients,
        permutations: move permutations,
    }
}

impl Noise2DContext {
    fn get_gradient(x: int, y: int) -> Vec2 {
        let idx = self.permutations[x & 255] + self.permutations[y & 255];
        self.rgradients[idx & 255]
    }

    fn get_gradients(gradients: &[mut Vec2 * 4], origins: &[mut Vec2 * 4], x: f32, y: f32) {
        let x0f = float::floor(x as libc::c_double) as f32;
        let y0f = float::floor(y as libc::c_double) as f32;
        let x0 = x0f as int;
        let y0 = y0f as int;
        let x1 = x0 + 1;
        let y1 = y0 + 1;

        gradients[0] = self.get_gradient(x0, y0);
        gradients[1] = self.get_gradient(x1, y0);
        gradients[2] = self.get_gradient(x0, y1);
        gradients[3] = self.get_gradient(x1, y1);

        origins[0] = Vec2{x: x0f + 0f32, y: y0f + 0f32};
        origins[1] = Vec2{x: x0f + 1f32, y: y0f + 0f32};
        origins[2] = Vec2{x: x0f + 0f32, y: y0f + 1f32};
        origins[3] = Vec2{x: x0f + 1f32, y: y0f + 1f32};
    }

    fn get(x: f32, y: f32) -> f32 {
        let p = Vec2{x: x, y: y};
        let gradients: [mut Vec2 * 4] = [mut
            Vec2{x:0f32, y:0f32},
            Vec2{x:0f32, y:0f32},
            Vec2{x:0f32, y:0f32},
            Vec2{x:0f32, y:0f32},
        ];
        let origins: [mut Vec2 * 4] = [mut
            Vec2{x:0f32, y:0f32},
            Vec2{x:0f32, y:0f32},
            Vec2{x:0f32, y:0f32},
            Vec2{x:0f32, y:0f32},
        ];
        self.get_gradients(&gradients, &origins, x, y);
        let v0 = gradient(origins[0], gradients[0], p);
        let v1 = gradient(origins[1], gradients[1], p);
        let v2 = gradient(origins[2], gradients[2], p);
        let v3 = gradient(origins[3], gradients[3], p);
        let fx = smooth(x - origins[0].x);
        let vx0 = lerp(v0, v1, fx);
        let vx1 = lerp(v2, v3, fx);
        let fy = smooth(y - origins[0].y);
        lerp(vx0, vx1, fy)
    }
}

fn main() {
    let symbols = [" ", "░", "▒", "▓", "█", "█"];
    let mut pixels = vec::from_elem(256*256, 0f32);
    let n2d = Noise2DContext();
    for int::range(0, 100) |_i| {
        for int::range(0, 256) |y| {
            for int::range(0, 256) |x| {
                let v = n2d.get(
                    x as f32 * 0.1f32,
                    y as f32 * 0.1f32
                ) * 0.5f32 + 0.5f32;
                pixels[y*256+x] = v;
            };
        };
    };

    for int::range(0, 256) |y| {
        for int::range(0, 256) |x| {
            io::print(symbols[pixels[y*256+x] / 0.2f32 as int]);
        }
        io::println("");
    }
}

Here's a version that's had the following line altered: let mut pixels = vec::from_elem(256*256, 0f32); becomes let mut pixels = ~[0f32, ..65536];. For reproducibility, here's the whole file (perlin-mut-literal-tilde.rs):

struct Vec2 {
    x: f32,
    y: f32,
}

fn lerp(a: f32, b: f32, v: f32) -> f32 {
    a * (1f32 - v) + b * v
}

fn smooth(v: f32) -> f32 {
    v * v * (3f32 - 2f32 * v)
}

fn random_gradient(r: rand::Rng) -> Vec2 {
    let v = r.gen_float() * float::consts::pi * 2.0;
    Vec2{
        x: float::cos(v) as f32,
        y: float::sin(v) as f32,
    }
}

fn gradient(orig: Vec2, grad: Vec2, p: Vec2) -> f32 {
    let sp = Vec2{x: p.x - orig.x, y: p.y - orig.y};
    grad.x * sp.x + grad.y + sp.y
}

struct Noise2DContext {
    rgradients: ~[Vec2],
    permutations: ~[int],
}

fn Noise2DContext() -> ~Noise2DContext {
    let r = rand::Rng();
    let rgradients = do vec::from_fn(256) |_i| { random_gradient(r) };
    let mut permutations = do vec::from_fn(256) |i| { i as int };
    r.shuffle_mut(permutations);

    ~Noise2DContext{
        rgradients: move rgradients,
        permutations: move permutations,
    }
}

impl Noise2DContext {
    fn get_gradient(x: int, y: int) -> Vec2 {
        let idx = self.permutations[x & 255] + self.permutations[y & 255];
        self.rgradients[idx & 255]
    }

    fn get_gradients(gradients: &[mut Vec2 * 4], origins: &[mut Vec2 * 4], x: f32, y: f32) {
        let x0f = float::floor(x as libc::c_double) as f32;
        let y0f = float::floor(y as libc::c_double) as f32;
        let x0 = x0f as int;
        let y0 = y0f as int;
        let x1 = x0 + 1;
        let y1 = y0 + 1;

        gradients[0] = self.get_gradient(x0, y0);
        gradients[1] = self.get_gradient(x1, y0);
        gradients[2] = self.get_gradient(x0, y1);
        gradients[3] = self.get_gradient(x1, y1);

        origins[0] = Vec2{x: x0f + 0f32, y: y0f + 0f32};
        origins[1] = Vec2{x: x0f + 1f32, y: y0f + 0f32};
        origins[2] = Vec2{x: x0f + 0f32, y: y0f + 1f32};
        origins[3] = Vec2{x: x0f + 1f32, y: y0f + 1f32};
    }

    fn get(x: f32, y: f32) -> f32 {
        let p = Vec2{x: x, y: y};
        let gradients: [mut Vec2 * 4] = [mut
            Vec2{x:0f32, y:0f32},
            Vec2{x:0f32, y:0f32},
            Vec2{x:0f32, y:0f32},
            Vec2{x:0f32, y:0f32},
        ];
        let origins: [mut Vec2 * 4] = [mut
            Vec2{x:0f32, y:0f32},
            Vec2{x:0f32, y:0f32},
            Vec2{x:0f32, y:0f32},
            Vec2{x:0f32, y:0f32},
        ];
        self.get_gradients(&gradients, &origins, x, y);
        let v0 = gradient(origins[0], gradients[0], p);
        let v1 = gradient(origins[1], gradients[1], p);
        let v2 = gradient(origins[2], gradients[2], p);
        let v3 = gradient(origins[3], gradients[3], p);
        let fx = smooth(x - origins[0].x);
        let vx0 = lerp(v0, v1, fx);
        let vx1 = lerp(v2, v3, fx);
        let fy = smooth(y - origins[0].y);
        lerp(vx0, vx1, fy)
    }
}

fn main() {
    let symbols = [" ", "░", "▒", "▓", "█", "█"];
    //let mut pixels = vec::from_elem(256*256, 0f32);
    let mut pixels = ~[0f32, ..65536];
    let n2d = Noise2DContext();
    for int::range(0, 100) |_i| {
        for int::range(0, 256) |y| {
            for int::range(0, 256) |x| {
                let v = n2d.get(
                    x as f32 * 0.1f32,
                    y as f32 * 0.1f32
                ) * 0.5f32 + 0.5f32;
                pixels[y*256+x] = v;
            };
        };
    };

    for int::range(0, 256) |y| {
        for int::range(0, 256) |x| {
            io::print(symbols[pixels[y*256+x] / 0.2f32 as int]);
        }
        io::println("");
    }
}

Both programs were compiled with rustc --opt-level=3 and profiled as follows:

$ (perf stat -r 10 perlin-mut) 2> mut3.txt
$ (perf stat -r 10 perlin-mut-literal-tilde) 2> mut-literal-tilde2.txt

mut3.txt:

 Performance counter stats for 'perlin-mut' (10 runs):

    1346.376055  task-clock-msecs         #      0.982 CPUs    ( +-   0.136% )
             74  context-switches         #      0.000 M/sec   ( +-   4.013% )
              1  CPU-migrations           #      0.000 M/sec   ( +-   9.091% )
            738  page-faults              #      0.001 M/sec   ( +-   0.047% )
   166000485913  cycles                   # 123294.295 M/sec   ( +-   3.857% )
   166000485913  instructions             #      1.000 IPC     ( +-   3.857% )
   166000485913  branches                 # 123294.295 M/sec   ( +-   3.857% )
   166000485913  branch-misses            #    100.000 %       ( +-   3.857% )
   166000485913  cache-references         # 123294.295 M/sec   ( +-   3.857% )
   166000485913  cache-misses             # 123294.295 M/sec   ( +-   3.857% )

    1.371346350  seconds time elapsed   ( +-   0.181% )

mut-literal-tilde2.txt:

 Performance counter stats for 'perlin-mut-literal-tilde' (10 runs):

    1376.394325  task-clock-msecs         #      0.986 CPUs    ( +-   0.145% )
            111  context-switches         #      0.000 M/sec   ( +-   8.048% )
              1  CPU-migrations           #      0.000 M/sec   ( +-   9.091% )
            738  page-faults              #      0.001 M/sec   ( +-   0.036% )
   238800181546  cycles                   # 173496.924 M/sec   ( +-   7.465% )
   238800181546  instructions             #      1.000 IPC     ( +-   7.465% )
   238800181546  branches                 # 173496.924 M/sec   ( +-   7.465% )
   238800181546  branch-misses            #    100.000 %       ( +-   7.465% )
   238800181546  cache-references         # 173496.924 M/sec   ( +-   7.465% )
   238800181546  cache-misses             # 173496.924 M/sec   ( +-   7.465% )

    1.395384346  seconds time elapsed   ( +-   0.311% )

It's not a large difference, but intuitively I feel that the literal version ought to actually be much faster than the builder function; @graydon feels the same:

 <@graydon> the literal should be much faster. if we're doing our work properly
            it should be malloc + memcpy from const. which should then hit the
            SSE/AVX paths and go very fast indeed.

I'm also worried that any slight disadvantage to using literals will result in horrifying best-practices such as "use vec::from_elem instead of the literal vector syntax".

The text was updated successfully, but these errors were encountered:

dotdash · 2013-05-19T19:52:40Z

Seems like this was fixed.

 Performance counter stats for './perlin-mut' (10 runs):

        307,636406 task-clock                #    0,695 CPUs utilized            ( +-  1,69% )
            45.213 context-switches          #    0,147 M/sec                    ( +-  8,48% )
                12 cpu-migrations            #    0,039 K/sec                    ( +- 14,63% )
               611 page-faults               #    0,002 M/sec                    ( +-  0,07% )
     1.112.870.695 cycles                    #    3,617 GHz                      ( +-  1,46% )
       409.082.095 stalled-cycles-frontend   #   36,76% frontend cycles idle     ( +-  2,41% )
   <not supported> stalled-cycles-backend  
     2.303.660.516 instructions              #    2,07  insns per cycle        
                                             #    0,18  stalled cycles per insn  ( +-  0,52% )
       327.773.749 branches                  # 1065,458 M/sec                    ( +-  0,79% )
           851.556 branch-misses             #    0,26% of all branches          ( +-  6,84% )

       0,442829468 seconds time elapsed                                          ( +-  3,41% )

 Performance counter stats for './perlin-mut-vec' (10 runs):

        306,350806 task-clock                #    0,691 CPUs utilized            ( +-  1,64% )
            44.305 context-switches          #    0,145 M/sec                    ( +-  7,05% )
                21 cpu-migrations            #    0,070 K/sec                    ( +-  5,51% )
               605 page-faults               #    0,002 M/sec                    ( +-  1,04% )
     1.112.483.266 cycles                    #    3,631 GHz                      ( +-  1,39% )
       416.287.586 stalled-cycles-frontend   #   37,42% frontend cycles idle     ( +-  2,46% )
   <not supported> stalled-cycles-backend  
     2.287.418.002 instructions              #    2,06  insns per cycle        
                                             #    0,18  stalled cycles per insn  ( +-  0,42% )
       327.114.446 branches                  # 1067,777 M/sec                    ( +-  0,64% )
           845.542 branch-misses             #    0,26% of all branches          ( +-  6,47% )

       0,443025636 seconds time elapsed                                          ( +-  3,16% )

The generated assembly just calls memset.

luqmana · 2013-05-19T20:15:26Z

Oops, seemed to have forgotten to close this.
Fixed by #5524.

luqmana closed this as completed May 19, 2013

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Literal vec notation is less efficient than vec::from_elem #4403

Literal vec notation is less efficient than vec::from_elem #4403

bstrie commented Jan 9, 2013

dotdash commented May 19, 2013

luqmana commented May 19, 2013

Literal vec notation is less efficient than vec::from_elem #4403

Literal vec notation is less efficient than vec::from_elem #4403

Comments

bstrie commented Jan 9, 2013

dotdash commented May 19, 2013

luqmana commented May 19, 2013