|
| 1 | +// So far in Ziglings, we've seen how for loops can be used to |
| 2 | +// repeat calculations across an array in several ways. |
| 3 | +// |
| 4 | +// For loops are generally great for this kind of task, but |
| 5 | +// sometimes they don't fully utilize the capabilities of the |
| 6 | +// CPU. |
| 7 | +// |
| 8 | +// Most modern CPUs can execute instructions in which SEVERAL |
| 9 | +// calculations are performed WITHIN registers at the SAME TIME. |
| 10 | +// These are known as "single instruction, multiple data" (SIMD) |
| 11 | +// instructions. SIMD instructions can make code significantly |
| 12 | +// more performant. |
| 13 | +// |
| 14 | +// To see why, imagine we have a program in which we take the |
| 15 | +// square root of four (changing) f32 floats. |
| 16 | +// |
| 17 | +// A simple compiler would take the program and produce machine code |
| 18 | +// which calculates each square root sequentially. Most registers on |
| 19 | +// modern CPUs have 64 bits, so we could imagine that each float moves |
| 20 | +// into a 64-bit register, and the following happens four times: |
| 21 | +// |
| 22 | +//            32 bits   32 bits |
| 23 | +//          +-------------------+ |
| 24 | +// register |    0    |    x    | |
| 25 | +//          +-------------------+ |
| 26 | +// |
| 27 | +//                    | |
| 28 | +//           [SQRT instruction] |
| 29 | +//                    V |
| 30 | +// |
| 31 | +//          +-------------------+ |
| 32 | +//          |    0    | sqrt(x) | |
| 33 | +//          +-------------------+ |
| 34 | +// |
| 35 | +// Notice that half of the register contains blank data to which |
| 36 | +// nothing happened. What a waste! What if we were able to use |
| 37 | +// that space instead? This is the idea at the core of SIMD. |
| 38 | +// |
| 39 | +// Most modern CPUs contain specialized registers with at least 128 bits |
| 40 | +// for performing SIMD instructions. On a machine with 128-bit SIMD |
| 41 | +// registers, a smart compiler would probably NOT issue four sqrt |
| 42 | +// instructions as above, but instead pack the floats into a single |
| 43 | +// 128-bit register, then execute a single "packed" sqrt |
| 44 | +// instruction to do ALL the square root calculations at once. |
| 45 | +// |
| 46 | +// For example: |
| 47 | +// |
| 48 | +// |
| 49 | +//            32 bits   32 bits   32 bits   32 bits |
| 50 | +//          +---------------------------------------+ |
| 51 | +// register |   4.0   |   9.0   |  25.0   |  49.0   | |
| 52 | +//          +---------------------------------------+ |
| 53 | +// |
| 54 | +//                              | |
| 55 | +//                   [SIMD SQRT instruction] |
| 56 | +//                              V |
| 57 | +// |
| 58 | +//          +---------------------------------------+ |
| 59 | +// register |   2.0   |   3.0   |   5.0   |   7.0   | |
| 60 | +//          +---------------------------------------+ |
| 61 | +// |
| 62 | +// Pretty cool, right? |
| 63 | +// |
| 64 | +// Code with SIMD instructions is usually more performant than code |
| 65 | +// without SIMD instructions. Zig cares a lot about performance, |
| 66 | +// so it has built-in support for SIMD! It has a data structure that |
| 67 | +// directly supports SIMD instructions: |
| 68 | +// |
| 69 | +// +-----------+ |
| 70 | +// | Vectors | |
| 71 | +// +-----------+ |
| 72 | +// |
| 73 | +// Operations performed on vectors in Zig will be done in parallel using |
| 74 | +// SIMD instructions, whenever possible. |
| 75 | +// |
| 76 | +// Defining vectors in Zig is straightforward. No library import is needed. |
const v1: @Vector(3, i32) = .{ 1, 10, 100 };
const v2: @Vector(3, f32) = .{ 2.0, 3.0, 5.0 };

// The operators available on the element type also work on whole vectors,
// one element at a time.
const v3 = v1 + v1; // { 2, 20, 200 }
const v4 = v2 * v2; // { 4.0, 9.0, 25.0 }

// Builtins that accept the element type usually accept vectors as well.
const v5: @Vector(3, f32) = @floatFromInt(v3); // { 2.0, 20.0, 200.0 }
const v6 = v4 - v5; // { 2.0, -11.0, -175.0 }
const v7 = @abs(v6); // { 2.0, 11.0, 175.0 }

// @splat builds a vector whose elements are all the same value, and
// @reduce folds a vector down to a single scalar.
const v8: @Vector(4, u8) = @splat(2); // { 2, 2, 2, 2 }
const v8_sum = @reduce(.Add, v8); // 8
const v8_min = @reduce(.Min, v8); // 2

// A fixed-length array can be assigned directly to a vector of the same
// length and element type (and the other way around).
const single_digit_primes: [4]i8 = .{ 2, 3, 5, 7 };
const prime_vector: @Vector(4, i8) = single_digit_primes;
| 97 | + |
| 98 | +// Now let's use vectors to simplify and optimize some code! |
| 99 | +// |
| 100 | +// Ewa is writing a program in which they frequently want to compare |
| 101 | +// two lists of four f32s. Ewa expects the lists to be similar, and |
| 102 | +// wants to determine the largest pairwise difference between the lists. |
| 103 | +// |
| 104 | +// Ewa wrote the following function to figure this out. |
| 105 | + |
/// Scalar reference implementation: walks both lists position by position
/// and returns the largest absolute difference found between paired
/// elements. The result starts at 0 and only grows.
fn calcMaxPairwiseDiffOld(list1: [4]f32, list2: [4]f32) f32 {
    var largest: f32 = 0;
    for (0..list1.len) |i| {
        const gap = @abs(list1[i] - list2[i]);
        // Keep the running maximum; only a strictly larger gap replaces it.
        largest = if (gap > largest) gap else largest;
    }
    return largest;
}
| 116 | + |
| 117 | +// Ewa heard about vectors in Zig, and started writing a new vector |
| 118 | +// version of the function, but has got stuck! |
| 119 | +// |
| 120 | +// Help Ewa finish the vector version! The examples above should help. |
| 121 | + |
/// Vector of four f32 values; a `[4]f32` array can be passed wherever a
/// `Vec4` is expected.
const Vec4 = @Vector(4, f32);

/// Vector version of the pairwise comparison: returns the largest absolute
/// element-wise difference between `a` and `b`.
fn calcMaxPairwiseDiffNew(a: Vec4, b: Vec4) f32 {
    // Subtract element by element, then take the magnitude of every
    // element in a single vector operation.
    const abs_diff_vec = @abs(a - b);
    // Collapse the four differences into the single largest one.
    const max_diff = @reduce(.Max, abs_diff_vec);
    return max_diff;
}
| 128 | + |
| 129 | +// Quite the simplification! We could even write the function in one line |
| 130 | +// and it would still be readable. |
| 131 | +// |
| 132 | +// Since the entire function is now expressed in terms of vector operations, |
| 133 | +// the Zig compiler will easily be able to compile it down to machine code |
| 134 | +// which utilizes the all-powerful SIMD instructions and does a lot of the |
| 135 | +// computation in parallel. |
| 136 | + |
| 137 | +const std = @import("std"); |
| 138 | +const print = std.debug.print; |
| 139 | + |
/// Runs the loop-based and the vector-based implementation on the same two
/// sample lists and prints each result for comparison.
pub fn main() void {
    const first = [4]f32{ 3.141, 2.718, 0.577, 1.000 };
    const second = [4]f32{ 3.154, 2.707, 0.591, 0.993 };

    // The same arrays are handed to both functions; the vector version
    // receives them as Vec4 values.
    print("Max difference (old fn): {d: >5.3}\n", .{calcMaxPairwiseDiffOld(first, second)});
    print("Max difference (new fn): {d: >5.3}\n", .{calcMaxPairwiseDiffNew(first, second)});
}
0 commit comments