Skip to content

Literal vec notation is less efficient than vec::from_elem #4403

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
bstrie opened this issue Jan 9, 2013 · 2 comments
Closed

Literal vec notation is less efficient than vec::from_elem #4403

bstrie opened this issue Jan 9, 2013 · 2 comments
Labels
A-codegen Area: Code generation I-slow Issue: Problems and improvements with respect to performance of generated code.

Comments

@bstrie
Copy link
Contributor

bstrie commented Jan 9, 2013

I'm updating a benchmark and noting performance differences. Here's one version (perlin-mut.rs):

struct Vec2 {
    x: f32,
    y: f32,
}

fn lerp(a: f32, b: f32, v: f32) -> f32 {
    a * (1f32 - v) + b * v
}

fn smooth(v: f32) -> f32 {
    v * v * (3f32 - 2f32 * v)
}

fn random_gradient(r: rand::Rng) -> Vec2 {
    let v = r.gen_float() * float::consts::pi * 2.0;
    Vec2{
        x: float::cos(v) as f32,
        y: float::sin(v) as f32,
    }
}

fn gradient(orig: Vec2, grad: Vec2, p: Vec2) -> f32 {
    let sp = Vec2{x: p.x - orig.x, y: p.y - orig.y};
    grad.x * sp.x + grad.y + sp.y
}

struct Noise2DContext {
    rgradients: ~[Vec2],
    permutations: ~[int],
}

fn Noise2DContext() -> ~Noise2DContext {
    let r = rand::Rng();
    let rgradients = do vec::from_fn(256) |_i| { random_gradient(r) };
    let mut permutations = do vec::from_fn(256) |i| { i as int };
    r.shuffle_mut(permutations);

    ~Noise2DContext{
        rgradients: move rgradients,
        permutations: move permutations,
    }
}

impl Noise2DContext {
    fn get_gradient(x: int, y: int) -> Vec2 {
        let idx = self.permutations[x & 255] + self.permutations[y & 255];
        self.rgradients[idx & 255]
    }

    fn get_gradients(gradients: &[mut Vec2 * 4], origins: &[mut Vec2 * 4], x: f32, y: f32) {
        let x0f = float::floor(x as libc::c_double) as f32;
        let y0f = float::floor(y as libc::c_double) as f32;
        let x0 = x0f as int;
        let y0 = y0f as int;
        let x1 = x0 + 1;
        let y1 = y0 + 1;

        gradients[0] = self.get_gradient(x0, y0);
        gradients[1] = self.get_gradient(x1, y0);
        gradients[2] = self.get_gradient(x0, y1);
        gradients[3] = self.get_gradient(x1, y1);

        origins[0] = Vec2{x: x0f + 0f32, y: y0f + 0f32};
        origins[1] = Vec2{x: x0f + 1f32, y: y0f + 0f32};
        origins[2] = Vec2{x: x0f + 0f32, y: y0f + 1f32};
        origins[3] = Vec2{x: x0f + 1f32, y: y0f + 1f32};
    }

    fn get(x: f32, y: f32) -> f32 {
        let p = Vec2{x: x, y: y};
        let gradients: [mut Vec2 * 4] = [mut
            Vec2{x:0f32, y:0f32},
            Vec2{x:0f32, y:0f32},
            Vec2{x:0f32, y:0f32},
            Vec2{x:0f32, y:0f32},
        ];
        let origins: [mut Vec2 * 4] = [mut
            Vec2{x:0f32, y:0f32},
            Vec2{x:0f32, y:0f32},
            Vec2{x:0f32, y:0f32},
            Vec2{x:0f32, y:0f32},
        ];
        self.get_gradients(&gradients, &origins, x, y);
        let v0 = gradient(origins[0], gradients[0], p);
        let v1 = gradient(origins[1], gradients[1], p);
        let v2 = gradient(origins[2], gradients[2], p);
        let v3 = gradient(origins[3], gradients[3], p);
        let fx = smooth(x - origins[0].x);
        let vx0 = lerp(v0, v1, fx);
        let vx1 = lerp(v2, v3, fx);
        let fy = smooth(y - origins[0].y);
        lerp(vx0, vx1, fy)
    }
}

fn main() {
    let symbols = [" ", "░", "▒", "▓", "█", "█"];
    let mut pixels = vec::from_elem(256*256, 0f32);
    let n2d = Noise2DContext();
    for int::range(0, 100) |_i| {
        for int::range(0, 256) |y| {
            for int::range(0, 256) |x| {
                let v = n2d.get(
                    x as f32 * 0.1f32,
                    y as f32 * 0.1f32
                ) * 0.5f32 + 0.5f32;
                pixels[y*256+x] = v;
            };
        };
    };

    for int::range(0, 256) |y| {
        for int::range(0, 256) |x| {
            io::print(symbols[pixels[y*256+x] / 0.2f32 as int]);
        }
        io::println("");
    }
}

Here's a version that's had the following line altered: let mut pixels = vec::from_elem(256*256, 0f32); becomes let mut pixels = ~[0f32, ..65536];. For reproducibility, here's the whole file (perlin-mut-literal-tilde.rs):

struct Vec2 {
    x: f32,
    y: f32,
}

fn lerp(a: f32, b: f32, v: f32) -> f32 {
    a * (1f32 - v) + b * v
}

fn smooth(v: f32) -> f32 {
    v * v * (3f32 - 2f32 * v)
}

fn random_gradient(r: rand::Rng) -> Vec2 {
    let v = r.gen_float() * float::consts::pi * 2.0;
    Vec2{
        x: float::cos(v) as f32,
        y: float::sin(v) as f32,
    }
}

fn gradient(orig: Vec2, grad: Vec2, p: Vec2) -> f32 {
    let sp = Vec2{x: p.x - orig.x, y: p.y - orig.y};
    grad.x * sp.x + grad.y + sp.y
}

struct Noise2DContext {
    rgradients: ~[Vec2],
    permutations: ~[int],
}

fn Noise2DContext() -> ~Noise2DContext {
    let r = rand::Rng();
    let rgradients = do vec::from_fn(256) |_i| { random_gradient(r) };
    let mut permutations = do vec::from_fn(256) |i| { i as int };
    r.shuffle_mut(permutations);

    ~Noise2DContext{
        rgradients: move rgradients,
        permutations: move permutations,
    }
}

impl Noise2DContext {
    fn get_gradient(x: int, y: int) -> Vec2 {
        let idx = self.permutations[x & 255] + self.permutations[y & 255];
        self.rgradients[idx & 255]
    }

    fn get_gradients(gradients: &[mut Vec2 * 4], origins: &[mut Vec2 * 4], x: f32, y: f32) {
        let x0f = float::floor(x as libc::c_double) as f32;
        let y0f = float::floor(y as libc::c_double) as f32;
        let x0 = x0f as int;
        let y0 = y0f as int;
        let x1 = x0 + 1;
        let y1 = y0 + 1;

        gradients[0] = self.get_gradient(x0, y0);
        gradients[1] = self.get_gradient(x1, y0);
        gradients[2] = self.get_gradient(x0, y1);
        gradients[3] = self.get_gradient(x1, y1);

        origins[0] = Vec2{x: x0f + 0f32, y: y0f + 0f32};
        origins[1] = Vec2{x: x0f + 1f32, y: y0f + 0f32};
        origins[2] = Vec2{x: x0f + 0f32, y: y0f + 1f32};
        origins[3] = Vec2{x: x0f + 1f32, y: y0f + 1f32};
    }

    fn get(x: f32, y: f32) -> f32 {
        let p = Vec2{x: x, y: y};
        let gradients: [mut Vec2 * 4] = [mut
            Vec2{x:0f32, y:0f32},
            Vec2{x:0f32, y:0f32},
            Vec2{x:0f32, y:0f32},
            Vec2{x:0f32, y:0f32},
        ];
        let origins: [mut Vec2 * 4] = [mut
            Vec2{x:0f32, y:0f32},
            Vec2{x:0f32, y:0f32},
            Vec2{x:0f32, y:0f32},
            Vec2{x:0f32, y:0f32},
        ];
        self.get_gradients(&gradients, &origins, x, y);
        let v0 = gradient(origins[0], gradients[0], p);
        let v1 = gradient(origins[1], gradients[1], p);
        let v2 = gradient(origins[2], gradients[2], p);
        let v3 = gradient(origins[3], gradients[3], p);
        let fx = smooth(x - origins[0].x);
        let vx0 = lerp(v0, v1, fx);
        let vx1 = lerp(v2, v3, fx);
        let fy = smooth(y - origins[0].y);
        lerp(vx0, vx1, fy)
    }
}

fn main() {
    let symbols = [" ", "░", "▒", "▓", "█", "█"];
    //let mut pixels = vec::from_elem(256*256, 0f32);
    let mut pixels = ~[0f32, ..65536];
    let n2d = Noise2DContext();
    for int::range(0, 100) |_i| {
        for int::range(0, 256) |y| {
            for int::range(0, 256) |x| {
                let v = n2d.get(
                    x as f32 * 0.1f32,
                    y as f32 * 0.1f32
                ) * 0.5f32 + 0.5f32;
                pixels[y*256+x] = v;
            };
        };
    };

    for int::range(0, 256) |y| {
        for int::range(0, 256) |x| {
            io::print(symbols[pixels[y*256+x] / 0.2f32 as int]);
        }
        io::println("");
    }
}

Both programs were compiled with rustc --opt-level=3 and profiled as follows:

$ (perf stat -r 10 perlin-mut) 2> mut3.txt
$ (perf stat -r 10 perlin-mut-literal-tilde) 2> mut-literal-tilde2.txt

mut3.txt:

 Performance counter stats for 'perlin-mut' (10 runs):

    1346.376055  task-clock-msecs         #      0.982 CPUs    ( +-   0.136% )
             74  context-switches         #      0.000 M/sec   ( +-   4.013% )
              1  CPU-migrations           #      0.000 M/sec   ( +-   9.091% )
            738  page-faults              #      0.001 M/sec   ( +-   0.047% )
   166000485913  cycles                   # 123294.295 M/sec   ( +-   3.857% )
   166000485913  instructions             #      1.000 IPC     ( +-   3.857% )
   166000485913  branches                 # 123294.295 M/sec   ( +-   3.857% )
   166000485913  branch-misses            #    100.000 %       ( +-   3.857% )
   166000485913  cache-references         # 123294.295 M/sec   ( +-   3.857% )
   166000485913  cache-misses             # 123294.295 M/sec   ( +-   3.857% )

    1.371346350  seconds time elapsed   ( +-   0.181% )

mut-literal-tilde2.txt:

 Performance counter stats for 'perlin-mut-literal-tilde' (10 runs):

    1376.394325  task-clock-msecs         #      0.986 CPUs    ( +-   0.145% )
            111  context-switches         #      0.000 M/sec   ( +-   8.048% )
              1  CPU-migrations           #      0.000 M/sec   ( +-   9.091% )
            738  page-faults              #      0.001 M/sec   ( +-   0.036% )
   238800181546  cycles                   # 173496.924 M/sec   ( +-   7.465% )
   238800181546  instructions             #      1.000 IPC     ( +-   7.465% )
   238800181546  branches                 # 173496.924 M/sec   ( +-   7.465% )
   238800181546  branch-misses            #    100.000 %       ( +-   7.465% )
   238800181546  cache-references         # 173496.924 M/sec   ( +-   7.465% )
   238800181546  cache-misses             # 173496.924 M/sec   ( +-   7.465% )

    1.395384346  seconds time elapsed   ( +-   0.311% )

It's not a large difference, but intuitively I feel that the literal version ought to actually be much faster than the builder function; @graydon feels the same:

 <@graydon> the literal should be much faster. if we're doing our work properly
            it should be malloc + memcpy from const. which should then hit the
            SSE/AVX paths and go very fast indeed.

I'm also worried that any slight disadvantage to using literals will result in horrifying best-practices such as "use vec::from_elem instead of the literal vector syntax".

@dotdash
Copy link
Contributor

dotdash commented May 19, 2013

Seems like this was fixed.

 Performance counter stats for './perlin-mut' (10 runs):

        307,636406 task-clock                #    0,695 CPUs utilized            ( +-  1,69% )
            45.213 context-switches          #    0,147 M/sec                    ( +-  8,48% )
                12 cpu-migrations            #    0,039 K/sec                    ( +- 14,63% )
               611 page-faults               #    0,002 M/sec                    ( +-  0,07% )
     1.112.870.695 cycles                    #    3,617 GHz                      ( +-  1,46% )
       409.082.095 stalled-cycles-frontend   #   36,76% frontend cycles idle     ( +-  2,41% )
   <not supported> stalled-cycles-backend  
     2.303.660.516 instructions              #    2,07  insns per cycle        
                                             #    0,18  stalled cycles per insn  ( +-  0,52% )
       327.773.749 branches                  # 1065,458 M/sec                    ( +-  0,79% )
           851.556 branch-misses             #    0,26% of all branches          ( +-  6,84% )

       0,442829468 seconds time elapsed                                          ( +-  3,41% )
 Performance counter stats for './perlin-mut-vec' (10 runs):

        306,350806 task-clock                #    0,691 CPUs utilized            ( +-  1,64% )
            44.305 context-switches          #    0,145 M/sec                    ( +-  7,05% )
                21 cpu-migrations            #    0,070 K/sec                    ( +-  5,51% )
               605 page-faults               #    0,002 M/sec                    ( +-  1,04% )
     1.112.483.266 cycles                    #    3,631 GHz                      ( +-  1,39% )
       416.287.586 stalled-cycles-frontend   #   37,42% frontend cycles idle     ( +-  2,46% )
   <not supported> stalled-cycles-backend  
     2.287.418.002 instructions              #    2,06  insns per cycle        
                                             #    0,18  stalled cycles per insn  ( +-  0,42% )
       327.114.446 branches                  # 1067,777 M/sec                    ( +-  0,64% )
           845.542 branch-misses             #    0,26% of all branches          ( +-  6,47% )

       0,443025636 seconds time elapsed                                          ( +-  3,16% )

The generated assembly just calls memset.

@luqmana
Copy link
Member

luqmana commented May 19, 2013

Oops, seemed to have forgotten to close this.
Fixed by #5524.

@luqmana luqmana closed this as completed May 19, 2013
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
A-codegen Area: Code generation I-slow Issue: Problems and improvements with respect to performance of generated code.
Projects
None yet
Development

No branches or pull requests

3 participants