diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..08f8cf1 --- /dev/null +++ b/.clang-format @@ -0,0 +1,5 @@ +--- +Language: Cpp +BasedOnStyle: WebKit +... + diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..467d949 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +**/accel +**/vitis_hls.log +zynq/build \ No newline at end of file diff --git a/README.md b/README.md index f2b0c93..bb36914 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ You should submit the following files: * Your optimized implementation source code, gzipped as `solution.tar.gz` which includes: * Training scripts/programs for reproducibility (if you re-trained or changed the classifier) * Your hardware design source (if you changed the hardware source, as a .cpp and .h file) - * The `classifier.bit` bitstream and the iPython notebook `classifier.ipynb` to run design on the PYNQ board. + * The `classifier.bit` bitstream and the jupyter notebook `classifier.ipynb` to run design on the PYNQ board. ### Story @@ -47,8 +47,8 @@ This high-level language is very similar to C/C++, but incorporates compiler pra In this lab, we’ve trained a naive [linear classifier](https://en.wikipedia.org/wiki/Linear_classifier) to perform hand digit recognition on scaled-down 16x16 images of hand-written digits. This equates to performing matrix multiplication over **I** and **W** where **I** is an (BxF) input matrix and **W** is an (FxC) weight matrix, and where B, F and C denote batch size, input features size and category count respectively. You can find the implementation of the linear classifier under `python/mnist.py`. To execute this Python program, you will need: -* Python 2.7 and pip -* The following packages: numpy, scipy, sklearn, Pillow +* Python 3.8 and pip +* The following packages: matplotlib, numpy, scikit-image, scikit-learn ### Hardware Kit Overview @@ -65,47 +65,49 @@ In addition, you will need an Ethernet port on your machine to communicate with If you don’t have a 64-bit Linux OS installed on your machine, we recommend [VirtualBox](https://www.virtualbox.org/wiki/VirtualBox) (free) or dual booting your machine. -Make sure to allocate at least 32GB (or 64GB preferably) of disk drive space for your VM’s main partition. In addition, compilation jobs can be resource-intensive, so allocating 4-8GB of DRAM for your VM would be wise. We’ve tested the tools under Ubuntu 16.04.2 but any of the following OSes or newer should work: -* Red Hat Enterprise Linux 6.6 64-bit -* CentOS Linux 6.7 -* SUSE Enterprise Linux 11.4 64-bit -* Ubuntu Linux 16.04.1 LTS 64-bit +Make sure to allocate at least 80GB (or 128GB preferably) of disk drive space for your VM’s main partition. In addition, compilation jobs can be resource-intensive, so allocating 4 CPU cores and 6-8 GB of DRAM for your VM would be wise. We’ve tested the tools under Ubuntu 20.04.4 LTS. -### Vivado HL WebPACK 2017.1 +### Vivado Design Suite 2020.2 -You’ll need to install Xilinx’ FPGA compilation toolchain, [Vivado HL WebPACK 2017.1](https://www.xilinx.com/products/design-tools/vivado.html), which a license-free version of the Vivado HLx toolchain. +You’ll need to install Xilinx’ FPGA compilation toolchain, Vivado Design Suite 2020.2. Xilinx provides a free edition and that's enough for this project. -1. Go to the [download webpage](https://www.xilinx.com/support/download.html), and download the Linux Self Extracting Web Installer for Vivado HL 2017.1 WebPACK and Editions. +1. 
Go to the [download webpage](https://www.xilinx.com/support/download/index.html/content/xilinx/en/downloadNav/vivado-design-tools/archive.html), select version 2020.2 and download the **Xilinx Unified Installer 2020.2: Linux Self Extracting Web Installer**. 2. You’ll have to sign in with a Xilinx account. This requires a Xilinx account creation that will take 2 minutes. -3. Pass the Name and Address Verification by clicking “Next”, and you will get the opportunity to download a binary file, called `Xilinx_Vivado_SDK_2017.1_0415_1_Lin64.bin`. -4. Now that the file is downloaded, go to your Downloads directory, and change the file permissions so it can be executed: `chmod u+x Xilinx_Vivado_SDK_2017.1_0415_1_Lin64.bin` -5. Now you can execute the binary: `./Xilinx_Vivado_SDK_2017.1_0415_1_Lin64.bin` -6. A Vivado 2017.1 Installer program GUI will launch. - * Click “Next” on the **Welcome** screen. - * Enter your Xilinx User Credentials under “User Authentication” and select the “Download and Install Now” before clicking “Next” on the **Select Install Type** screen. - * Accept all terms before clicking on “Next” on the **Accept License Agreements** screen. - * Select the “Vivado HL WebPACK” before clicking on “Next” on the **Select Edition to Install** screen. - * Under the **Vivado HL WebPACK** screen, before hitting “Next", check the following options (the rest should be unchecked): - * Design Tools -> Vivado Design Suite -> Vivado - * Design Tools -> Vivado Design Suite -> Vivado High Level Synthesis - * Devices -> Production Services -> SoCs -> Zynq-7000 Series - * Your total download size should be about 3GB and the amount of Disk Space Required 13GB. - * Set the installation directory before clicking “Next” on the **Select Destination Directory** screen. It might highlight some paths as red - that’s because the installer doesn’t have the permission to write to that directory. In that case select a path that doesn’t require special write permissions (e.g. in your home directory). - * Hit “Install” under the **Installation Summary** screen. - * An **Installation Progress Window** will pop-up to track progress of the download and the installation. +3. Pass the Name and Address Verification by clicking “Next”, and you will get the opportunity to download a binary file, called `Xilinx_Unified_2020.2_1118_1232_Lin64.bin`. +4. Now that the file is downloaded, go to your Downloads directory, and change the file permissions so it can be executed: `chmod u+x Xilinx_Unified_2020.2_1118_1232_Lin64.bin` +5. Now you can execute the binary: `./Xilinx_Unified_2020.2_1118_1232_Lin64.bin` +6. A Xilinx Unified 2020.2 Installer program GUI will launch. + * On the _A Newer Version Is Available_ prompt, click “Continue” to continue installing version 2020.2. + * Click “Next” on the _Welcome_ screen. + * Enter your Xilinx User Credentials under “User Authentication” and select the “Download and Install Now” before clicking “Next” on the _Select Install Type_ screen. + * Select “Vivado” and click “Next”. + ![](image/install-01-product.png) + * Select “Vivado HL WebPACK” and click “Next”. + ![](image/install-02-edition.png) + * Under the _Vivado Design Suite_ screen, before hitting “Next", check the following options (the rest can be unchecked): + * Design Tools -> Vivado Design Suite + * Devices -> Devices for Custom Platforms -> SoCs -> Zynq-7000 + ![](image/install-03-component.png) + * Your total download size should be about 13.86 GB and the amount of Disk Space Required 77.67 GB. 
+    * Accept all terms before clicking on “Next” on the _Accept License Agreements_ screen.
+    * Set the installation directory before clicking “Next” on the _Select Destination Directory_ screen. It might highlight some paths in red - that’s because the installer doesn’t have permission to write to those directories. In that case select a path that doesn’t require special write permissions (e.g. in your home directory).
+    * Hit “Install” under the _Installation Summary_ screen.
+    * An _Installation Progress Window_ will pop up to track the progress of the download and the installation.
    * This process will take about 20-30 minutes depending on your connection speed.
-    * A pop-up window will inform you that the installation completed successfully. Click "OK".
-    * Finally the **Vivado License Manager** will launch. Select "Get Free ISE WebPACK, ISE/Vivado IP or PetaLinux License" and click "Connect Now" to complete the license registration process.
-7. The last step is to update your `~/.bashrc` with the following line:
-```bash
-# Xilinx Vivado 2017.1 environment
-source /Vivado/2017.1/settings64.sh
-```
+    * A pop-up window will inform you that the installation completed successfully. Click “OK”.
+7. Next, install the remaining dependencies:
+    ```bash
+    sudo apt install git libtinfo5 build-essential gcc-multilib
+    ```
+8. Finally, set the environment variables needed to launch Vitis HLS and Vivado by executing the command below. You may also append it to your `~/.bashrc` so that the variables are loaded automatically whenever you log in.
+    ```bash
+    source /Xilinx/Vitis_HLS/2020.2/settings64.sh
+    ```

### PYNQ board

The PYNQ board website, complete with documentation and forums, is available [here](http://www.pynq.io/).

-Follow the **Getting Started** tutorial to get your Pynq board set up (please read the notes below first). Note that if you have any issues with the board (booting problems, Python errors, etc.), consider flashing your SD card with [version 1.4 of the Pynq runtime](https://github.com/Xilinx/PYNQ/releases/tag/v1.4).
+Follow the **Getting Started** tutorial to get your Pynq board set up (please read the notes below first). Note that if you have any issues with the board (booting problems, Python errors, etc.), consider flashing your SD card with [version 2.7.0 of the Pynq runtime](https://github.com/Xilinx/PYNQ/releases/tag/v2.7.0).

**SD card flashing notes**
* We recommend using [Etcher](https://etcher.io/) for one-step SD-card flashing. You can download the image for the SD card on the PYNQ board [website](http://www.pynq.io/).

@@ -122,18 +124,20 @@ Follow the **Getting Started** tutorial to get your Pynq board set up (please re
**Connecting to Jupyter notes:**
* It seems like you won’t be able to connect to the board successfully using either Firefox or Safari. We recommend using [Chrome](https://www.google.com/chrome/) instead.
+* If your board is configured correctly, you will be presented with a login screen. The username is `xilinx` and the password is also `xilinx`.

-Try one of the iPython notebook examples available out-of-the-box on your PYNQ board to make sure that it works as intended!
+Try one of the Jupyter notebook examples available out-of-the-box on your PYNQ board to make sure that it works as intended!

# Part 1: Matrix Multiplication Pipeline Optimization in HLS (40 marks)

-This first part will cover fundamentals of high level synthesis.
+This first part will cover the fundamentals of high-level synthesis.
### Recommended Reading -The [Vivado HLS User Guide](https://www.xilinx.com/support/documentation/sw_manuals/xilinx2017_1/ug902-vivado-high-level-synthesis.pdf) provides plenty of valuable reference information. We *strongly* recommend reading the **Understanding High-Level Synthesis** pages 5-12 as an introduction. +The [Vitis High-Level Synthesis User Guide (UG1399) +](https://docs.xilinx.com/r/2020.2-English/ug1399-vitis-hls/Introduction-to-Vitis-HLS) provides plenty of valuable reference information. We *strongly* recommend reading the **Introduction to Vitis HLS** as an introduction. -As an optional exercise, we recommend going through the [Vivado HLS Tutorial](https://www.xilinx.com/support/documentation/sw_manuals/xilinx2017_1/ug871-vivado-high-level-synthesis-tutorial.pdf) Chapters 6 to familiarize yourself with design analysis and design optimization techniques (about 30mins). + ## A. Understanding the baseline matrix multiply (background) @@ -164,59 +168,66 @@ The `mmult_hw()` function contains certain high-level HLS-specific pragmas defin * *AXI Lite* is a simple protocol that provides point-to-point bi-directional communication between the ARM processor and the HLS accelerator to control the HLS accelerator tasks. * *AXI Stream* provides a unidirectional throughput-oriented communication channel between the ARM processor's memory system and the hardware accelerator and vice-versa. -In order to interface with the input and output AXI streams, we've provided two function helpers: `pop_stream()` and `push_stream()`. The code below shows how these two helper functions are used to stream data into and out of the user-instantiated hardware buffers. +In order to interface with the input and output AXI streams, we've provided two function helpers: `pop_stream()` and `push_stream()`. The code below shows how these two helper functions are used to stream data into and out of the user-instantiated hardware buffers. 
```c++ // Input stream and output stream sizes -#define IS_SIZE CLASSES+CLASSES*FEAT+BATCHES*FEAT -#define OS_SIZE BATCHES*CLASSES +#define IS_SIZE CLASSES + CLASSES* FEAT + BATCHES* FEAT +#define OS_SIZE BATCHES* CLASSES // AXI Stream interface -void mmult_hw (AXI_VAL in_stream[IS_SIZE], AXI_VAL out_stream[OS_SIZE]) { - +void mmult_hw(hls::stream& in_stream, hls::stream& out_stream) +{ + // Hardware buffers float offset_buf[CLASSES]; float weight_buf[CLASSES][FEAT]; float in_buf[BATCHES][FEAT]; float out_buf[BATCHES][CLASSES]; - + // Input and output AXI stream indices - int is_idx = 0; int os_idx = 0; - // Stream data into offset_buf - LOAD_OFF: for (int i = 0; i < CLASSES; i++) { +// Stream data into offset_buf +LOAD_OFF: + for (int i = 0; i < CLASSES; i++) { // Pop data from stream - offset_buf[i] = pop_stream(in_stream[is_idx++]); + offset_buf[i] = pop_stream(in_stream); } - - // Stream data into weight_buf - LOAD_W_1: for (int i = 0; i < CLASSES; i++) { - LOAD_W_2: for (int j = 0; j < FEAT; j++) { + +// Stream data into weight_buf +LOAD_W_1: + for (int i = 0; i < CLASSES; i++) { + LOAD_W_2: + for (int j = 0; j < FEAT; j++) { // Pop data from stream - weight_buf[i][j] = pop_stream(in_stream[is_idx++]); + weight_buf[i][j] = pop_stream(in_stream); } } - - // Stream data into in_buf - LOAD_I_1: for (int i = 0; i < BATCH; i++) { - LOAD_I_2: for (int j = 0; j < FEAT; j++) { + +// Stream data into in_buf +LOAD_I_1: + for (int i = 0; i < BATCH; i++) { + LOAD_I_2: + for (int j = 0; j < FEAT; j++) { // Pop data from stream - in_buf[i][j] = pop_stream(in_stream[is_idx++]); - } + in_buf[i][j] = pop_stream(in_stream); + } } - - // Do Matrix Multiplication - ... - - - // Stream data out of out_buf - STORE_O_1: for (int i = 0; i < BATCH; i++) { - STORE_O_2: for (int j = 0; j < CLASSES; j++) { + +// Do Matrix Multiplication +... + +// Stream data out of out_buf +STORE_O_1: + for (int i = 0; i < BATCH; i++) { + STORE_O_2: + for (int j = 0; j < CLASSES; j++) { // Push output element into AXI stream // push_stream's second argument should be set to True when sending the last packet out - out_stream[os_idx] = push_stream(out_buf[i][j], os_idx++ == (BATCH*CLASSES-1)); - } + out_stream = push_stream(out_buf[i][j], os_idx == (BATCH * CLASSES - 1)); + os_idx++; + } } } ``` @@ -226,25 +237,28 @@ The code above assumes that the size of each AXI Stream data packet matches the In order to handle data type conversions you can use a `union`. The code below shows how to perform type conversion quickly. ```c++ -union -{ - axi_T packet; - struct {float in_0; float in_1;} val; -} converter; + union { + axi_T packet; + struct { + float in_0; + float in_1; + } val; + } converter; ... // Stream data into w_buf -LOAD_W_1: for (int i = 0; i < CLASSES; i++) { +LOAD_W_1: + for (int i = 0; i < CLASSES; i++) { // Increment by 2 (ratio between AXI bus width and float width) - LOAD_W_2: for (int j = 0; j < FEAT; j+=2) { - // Pop data from stream - int k = i*FEAT+j; - converter.packet = pop_stream(in_stream[k]); - w_buf[i][j+0] = converter.val.in_0; - w_buf[i][j+1] = converter.val.in_1; + LOAD_W_2: + for (int j = 0; j < FEAT; j += 2) { + // Pop data from stream + converter.packet = pop_stream(in_stream); + w_buf[i][j + 0] = converter.val.in_0; + w_buf[i][j + 1] = converter.val.in_1; + } } -} ``` ### HLS Compilation and Design Analysis @@ -254,7 +268,7 @@ Now the code in `mmult_float.cpp` should look a lot more familiar! 
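As an aside, the `// Do Matrix Multiplication` section elided in the walkthrough above is simply the triple loop from `mmult_float.cpp` (the full listing appears later in this diff). A sketch of that compute section, using `BATCH` for the batch dimension that the walkthrough's buffer declarations call `BATCHES`:

```c++
// Sketch of the elided compute section, mirroring the L1/L2/L3 loops
// found in mmult_float.cpp later in this diff.
L1: for (int i = 0; i < BATCH; i++) {        // iterate over batch elements
    L2: for (int j = 0; j < CLASSES; j++) {  // iterate over output classes
        float tmp = offset_buf[j];           // seed the sum with the class offset
        L3: for (int k = 0; k < FEAT; k++) { // dot product over input features
            tmp += in_buf[i][k] * weight_buf[j][k];
        }
        out_buf[i][j] = tmp;
    }
}
```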
Execute the HLS compilation (takes about 15-30s for the base design): ``` cd hls/mmult_float/ -vivado_hls -f hls.tcl +vitis_hls -f hls.tcl ``` The `hls.tcl` contains a sequence of commands to compile your design via HLS. You don't need to modify this file. It will run basic correctness checks in simulation, which are specified in the `mmult_test.cpp` test-bench. If the simulation fails, your hardware design won't be synthesized. @@ -265,62 +279,62 @@ The `BRAM_18K` are on-chip SRAM memories, the `DSP48E` are hard fused multiply a It is clear that the design has quite a small footprint: less than 5% utilization overall (bottlenecked by the `BRAM_18K`). ``` -+-----------------+---------+-------+--------+-------+ -| Name | BRAM_18K| DSP48E| FF | LUT | -+-----------------+---------+-------+--------+-------+ -|DSP | -| -| -| -| -|Expression | -| -| 0| 308| -|FIFO | -| -| -| -| -|Instance | 0| 5| 384| 751| -|Memory | 16| -| 0| 0| -|Multiplexer | -| -| -| 381| -|Register | -| -| 714| -| -+-----------------+---------+-------+--------+-------+ -|Total | 16| 5| 1098| 1440| -+-----------------+---------+-------+--------+-------+ -|Available | 280| 220| 106400| 53200| -+-----------------+---------+-------+--------+-------+ -|Utilization (%) | 5| 2| 1| 2| -+-----------------+---------+-------+--------+-------+ ++-----------------+---------+-----+--------+-------+-----+ +| Name | BRAM_18K| DSP | FF | LUT | URAM| ++-----------------+---------+-----+--------+-------+-----+ +|DSP | -| -| -| -| -| +|Expression | -| -| 0| 503| -| +|FIFO | -| -| -| -| -| +|Instance | 0| 5| 384| 751| -| +|Memory | 14| -| 64| 5| -| +|Multiplexer | -| -| -| 376| -| +|Register | -| -| 369| -| -| ++-----------------+---------+-----+--------+-------+-----+ +|Total | 14| 5| 817| 1635| 0| ++-----------------+---------+-----+--------+-------+-----+ +|Available | 280| 220| 106400| 53200| 0| ++-----------------+---------+-----+--------+-------+-----+ +|Utilization (%) | 5| 2| ~0| 3| 0| ++-----------------+---------+-----+--------+-------+-----+ ``` You will also find a performance estimate in the report: ``` -+--------+--------+--------+--------+---------+ -| Latency | Interval | Pipeline| -| min | max | min | max | Type | -+--------+--------+--------+--------+---------+ -| 209851| 209851| 209852| 209852| none | -+--------+--------+--------+--------+---------+ ++---------+---------+----------+----------+--------+--------+---------+ +| Latency (cycles) | Latency (absolute) | Interval | Pipeline| +| min | max | min | max | min | max | Type | ++---------+---------+----------+----------+--------+--------+---------+ +| 228022| 228022| 2.280 ms| 2.280 ms| 228023| 228023| none| ++---------+---------+----------+----------+--------+--------+---------+ ``` -The Latency indicates how many FPGA cycles it takes to perform one matrix multiplication on the FPGA (i.e. a batch of inference tasks). The Initiation Interval describes how many cycles you'd have to wait until you can process the next batch of input data. These two metrics are identical because the entire algorithm is too large to be pipelined. +The Latency indicates how many FPGA cycles it takes to perform one matrix multiplication on the FPGA (i.e. a batch of inference tasks). The Initiation Interval describes how many cycles you'd have to wait until you can process the next batch of input data. These two metrics are identical because the entire algorithm is too large to be pipelined. -Since the FPGA design is clocked at 100MHz, it takes 2.099ms to perform a single inference. 
This is very slow and the ARM CPU clocked at 667MHz with a dedicated FPU would have no problem beating this naive implementation. +Since the FPGA design is clocked at 100MHz, it takes 2.280ms to perform a single inference. This is very slow and the ARM CPU clocked at 667MHz with a dedicated FPU would have no problem beating this naive implementation. Let's take a deeper look at the loop analysis report (you now understand why we labeled the loops in the first place) to identify optimization opportunities. ``` -+--------------+--------+--------+----------+-----------+-----------+------+----------+ -| | Latency | Iteration| Initiation Interval | Trip | | -| Loop Name | min | max | Latency | achieved | target | Count| Pipelined| -+--------------+--------+--------+----------+-----------+-----------+------+----------+ -|- LOAD_OFF_1 | 10| 10| 2| -| -| 5| no | -|- LOAD_W_1 | 2580| 2580| 258| -| -| 10| no | -| + LOAD_W_2 | 256| 256| 2| -| -| 128| no | -|- LOAD_I_1 | 2064| 2064| 258| -| -| 8| no | -| + LOAD_I_2 | 256| 256| 2| -| -| 128| no | -|- L1 | 205056| 205056| 25632| -| -| 8| no | -| + L2 | 25630| 25630| 2563| -| -| 10| no | -| ++ L3 | 2560| 2560| 10| -| -| 256| no | -|- STORE_O_1 | 136| 136| 17| -| -| 8| no | -| + STORE_O_2 | 15| 15| 3| -| -| 5| no | -+--------------+--------+--------+----------+-----------+-----------+------+----------+ ++--------------+---------+---------+----------+-----------+-----------+------+----------+ +| | Latency (cycles) | Iteration| Initiation Interval | Trip | | +| Loop Name | min | max | Latency | achieved | target | Count| Pipelined| ++--------------+---------+---------+----------+-----------+-----------+------+----------+ +|- LOAD_OFF_1 | 5| 5| 1| -| -| 5| no| +|- LOAD_W_1 | 1300| 1300| 130| -| -| 10| no| +| + LOAD_W_2 | 128| 128| 1| -| -| 128| no| +|- LOAD_I_1 | 1040| 1040| 130| -| -| 8| no| +| + LOAD_I_2 | 128| 128| 1| -| -| 128| no| +|- L1 | 225536| 225536| 28192| -| -| 8| no| +| + L2 | 28190| 28190| 2819| -| -| 10| no| +| ++ L3 | 2816| 2816| 11| -| -| 256| no| +|- STORE_O_1 | 136| 136| 17| -| -| 8| no| +| + STORE_O_2 | 15| 15| 3| -| -| 5| no| ++--------------+---------+---------+----------+-----------+-----------+------+----------+ ``` -This report tells us that every step of our matrix multiply hardware is executing sequentially (as if we executed our entire matrix multiply on a single-cycle CPU). You'll observe that the L3 loop (inner dot product loop) takes 2560 cycles to perform a dot product of two vectors with 256 elements, meaning that it takes 10 cycles to multiply and add two elements! +This report tells us that every step of our matrix multiply hardware is executing sequentially (as if we executed our entire matrix multiply on a single-cycle CPU). You'll observe that the L3 loop (inner dot product loop) takes 2816 cycles to perform a dot product of two vectors with 256 elements, meaning that it takes more than 11 cycles to multiply and add two elements! This is because a floating point multiplication on the FPGA takes 4 cycles, and an addition 5 cycles. Let's look at how we can improve this design with pipelining and batching! @@ -331,7 +345,7 @@ The base design has pretty underwhelming performance. But from analyzing the des * We are clearly under-utilizing our resources, so we have room to utilize more memory and logic resources to improve overall throughput. * We are also not taking advantage of pipeline parallelism in any part of the loop, as indicated by the rather high inference latency of our matrix multiply. 
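As a preview of the directive syntax (choosing where to place it is the exercise in the problem statement below), a `PIPELINE` pragma goes just inside the loop it targets. A minimal sketch on the inner dot-product loop:

```c++
// Minimal placement sketch (not a complete solution): this asks HLS to
// start a new k iteration every cycle (II=1). Note that the loop-carried
// dependence on tmp through the 5-cycle floating point adder may force a
// larger achieved II, which the loop report will show.
L3: for (int k = 0; k < FEAT; k++) {
#pragma HLS PIPELINE II=1
    tmp += in_buf[i][k] * weight_buf[j][k];
}
```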
-We will exercise some common pipeline optimization techniques described in the **Optimizing For Throughput** chapter of the [Vivado HLS User Guide](https://www.xilinx.com/support/documentation/sw_manuals/xilinx2017_1/ug902-vivado-high-level-synthesis.pdf), pages 188-206.
+We will exercise some common pipeline optimization techniques described in the **Optimizing For Throughput** chapter of the [Vitis HLS Methodology Guide](https://docs.xilinx.com/r/2020.2-English/ug1399-vitis-hls/Optimizing-for-Throughput).

### Problem Statement

@@ -340,7 +354,7 @@ Carefully insert `#pragma HLS PIPELINE II=1` directives in your code to tell HLS
Report (1) the design latency in cycles, (2) the overall device utilization (as Total per Resource), (3) the number of floating point adders and multipliers (you can find this information under the Instance section of the synthesis report) and (4) the Initiation Interval of the loops you pipelined.

**Hints**:
-* Pragmas should be inserted after the target loop header that you wish to unroll. You can always use the Vivado GUI after compilation with the following command: `vivado_hls -p accel/` to correctly insert pragmas.
+* Pragmas should be inserted after the header of the target loop that you wish to pipeline. You can always use the Vivado GUI after compilation with the following command: `vitis_hls -p accel/` to correctly insert pragmas.
* Chapter 7 of the [Vivado HLS Tutorial](https://www.xilinx.com/support/documentation/sw_manuals/xilinx2017_1/ug871-vivado-high-level-synthesis-tutorial.pdf) should provide enough guidance on how to do this effectively.
* Start from the inner-most loop(s) before moving to an outer loop. Starting at an outer loop will flatten it entirely and your resource usage and compilation time will explode.

@@ -348,13 +362,13 @@ ## C. Increasing Pipeline Parallelism by Repartitioning Memories (8 marks)

-If you examine the log from HLS in `vivado_hls.log` you will find the following warning message:
+If you examine the log from HLS in `vitis_hls.log` you will find the following warning message:
```
WARNING: [SCHED 204-69] Unable to schedule 'load' operation ('in_buf_load_2', ./mmult_accel.cpp:79) on array 'in_buf', ./mmult_accel.cpp:32 due to limited memory ports.
```

The pipelined design suffers from a resource contention problem. While HLS tries to allocate more adders and multipliers to expose more parallelism in the design, it can only leverage as much parallelism as the FPGA memories allow.
-By default, FPGA-based SRAM memories are dual-ported. However you can tell the compiler to distribute your buffer across multiplier SRAM memories to offer more ports and therefore expose more parallelism.
+By default, FPGA-based SRAM memories are dual-ported. However, you can tell the compiler to distribute your buffer across multiple SRAM memories to offer more ports and therefore expose more parallelism (a placement sketch for this directive follows the build notes below).

### Problem Statement

@@ -436,17 +450,19 @@ make -j1
And voilà! This process will take a little while (30 mins) depending on utilization. Regarding utilization, to avoid *no-place* errors, we recommend compiling your design where no resource exceeds 60%. This will ensure that your place and route tools have breathing space. In addition, it will make compilation time go a little faster.
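The *array partitioning factor* mentioned in the recommendation below refers to section C's repartitioning directive. A minimal sketch, assuming the classic `ARRAY_PARTITION` pragma form accepted by Vitis HLS 2020.2, with buffer names as in the float design:

```c++
// Cyclically split the FEAT dimension (dim=2) of each buffer across 4
// physical memories, quadrupling the available read ports so that more
// multiply-adds can be scheduled in parallel.
float weight_buf[CLASSES][FEAT];
#pragma HLS ARRAY_PARTITION variable=weight_buf cyclic factor=4 dim=2
float in_buf[BATCH][FEAT];
#pragma HLS ARRAY_PARTITION variable=in_buf cyclic factor=4 dim=2
```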
**Consequently we recommend scaling the design down a little, by limiting the array partitioning factor to `4`, and keeping the tile size and batch size at `128` and `2048` respectively.**

+**Note**: If you fail to generate the IP due to a _bad lexical cast_ or _invalid argument_ error, follow this [article](https://support.xilinx.com/s/article/76960?language=en_US) and apply the patch given there.
+
When your design is done compiling, it is ready to be tested on the PYNQ! Let's go ahead and transfer (1) the hardware design overlay files, (2) the test program, (3) the trained linear model, and (4) the MNIST validation data and labels to the PYNQ board.

```bash
-scp build/export/classifier.bit xilinx@192.168.2.99:/home/xilinx/pynq/bitstream/.
-scp tcl/classifier.tcl xilinx@192.168.2.99:/home/xilinx/pynq/bitstream/.
+scp `find -name "system_wrapper.bit"` xilinx@192.168.2.99:~/classifier.bit
+scp `find -name "system.hwh"` xilinx@192.168.2.99:~/classifier.hwh
scp jupyter/classifier_1.ipynb xilinx@192.168.2.99:/home/xilinx/jupyter_notebooks/.
scp python/*.npy xilinx@192.168.2.99:/home/xilinx/jupyter_notebooks/.
```

Now log onto your PYNQ board in Chrome by entering the following address: http://192.168.2.99:9090/. Make sure you've properly powered on the board, and connected the board via Ethernet to your host machine. In addition, ensure that you've properly configured your machine's network settings as indicated in the PYNQ getting started guide.

-Use the `xilinx` credentials (pwd: `xilinx`) to log into the iPython notebook server. If the file transfer completed with success, you should see your `classifier_1.ipynb` notebook in the jupyter directory tree.
+Use the `xilinx` credentials (pwd: `xilinx`) to log into the Jupyter notebook server. If the file transfer completed successfully, you should see your `classifier_1.ipynb` notebook in the Jupyter directory tree.

Click on it to launch the program! You can simply execute the entire program by clicking on **Kernel -> Restart & Run All**.

@@ -454,7 +470,7 @@
Report (1) the measured speedup and (2) the measured classification accuracy.

-**What to expect**: You should measure a roughly 6.7x speedup over the numpy implementation. While this speedup is not mind-blowing, it is encouraging for the following reasons:
+**What to expect**: You should measure a roughly 5x speedup over the numpy implementation. While this speedup is not mind-blowing, it is encouraging for the following reasons:
* We are targeting floating point computation, which low-power FPGAs are notoriously bad at. We have room to apply more aggressive data type quantization techniques in order to push more throughput out of our FPGA (covered next).
* We are compiling at a low frequency (100MHz), which is about 1/6.6th of what the CPU is running at. With more frequency optimizations, it's not impossible to clock the FPGA at 250MHz (however this won't be covered during this lab).
* Lastly, we are utilizing less than ~50% of the FPGA resources and only 1/4 of the available memory throughput on the FPGA, so we could potentially improve throughput significantly.
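Before moving on to the fixed-point part below, here is the quantization idea in miniature. This is a hypothetical helper for illustration only; the lab's actual conversion lives in `python/mnist.py` and is controlled by its `SCALE` factor:

```c++
#include <ap_int.h>

typedef ap_int<8> w_T; // 8-bit weight type, as in Part 3's mmult.h

// Hypothetical helper: quantize a float weight to int8 with a fixed scale.
// Values outside the representable range are saturated.
w_T quantize(float w, float scale)
{
    float scaled = w * scale;               // e.g. scale = 64.0f
    if (scaled > 127.0f) scaled = 127.0f;   // saturate high
    if (scaled < -128.0f) scaled = -128.0f; // saturate low
    return (w_T)(int)scaled;                // truncate toward zero
}
```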
@@ -477,23 +493,23 @@ set src_dir "../hls/mmult_fixed" To test your design on the board, you'll need to transfer your new fixed-point classifier overlay to the PYNQ file system with the following commands: ```bash -scp build/export/classifier.bit xilinx@192.168.2.99:/home/xilinx/pynq/bitstream/classifier_fixed.bit -scp tcl/classifier.tcl xilinx@192.168.2.99:/home/xilinx/pynq/bitstream/classifier_fixed.tcl +scp `find -name "*.bit"` xilinx@192.168.2.99:~/classifier.bit +scp `find -name "*.hwh"` xilinx@192.168.2.99:~/classifier.hwh scp jupyter/classifier_2.ipynb xilinx@192.168.2.99:/home/xilinx/jupyter_notebooks/. scp python/*.npy xilinx@192.168.2.99:/home/xilinx/jupyter_notebooks/. ``` -Finally, once you've logged onto the iPython notebook server on the PYNQ, open the `classifier_2.ipynb` notebook, and execute your test program by clicking on **Kernel -> Restart & Run All**. +Finally, once you've logged onto the jupyter notebook server on the PYNQ, open the `classifier_2.ipynb` notebook, and execute your test program by clicking on **Kernel -> Restart & Run All**. ### Problem Statement Implement a fixed-point linear classifier, by completing the `TODOs` in `hls/mmult_fixed/mmult_fixed.cpp` and `hls/mmult_fixed/mmult_fixed.h`. The batch size has been set to `8192` and should not be altered. Your design should pass the tests in `hls/mmult_fixed/mmult_test.cpp`. -You will also have to perform floating-point to fixed-point conversion of your learned weight and offset coefficients. In the training file under `python/mnist.py` modify the `SCALE` factor to achieve less than 20% validation error on fixed point inference. Executing `python mnist.py` from the `python/` directory will produce updated `.npy` files that will then be copied onto the Zynq when you re-run the `scp` commands. +You will also have to perform floating-point to fixed-point conversion of your learned weight and offset coefficients. In the training file under `python/mnist.py` modify the `SCALE` factor to achieve less than 20% validation error on fixed point inference. Executing `python3 mnist.py` from the `python/` directory will produce updated `.npy` files that will then be copied onto the Zynq when you re-run the `scp` commands. Report the following: * (1) the fixed-point validation accuracy reported by `mnist.py` after you've tweaked the `SCALE` factor. -* (2) the design latency in cycles +* (2) the design latency in cycles * (3) the overall device utilization (as Total per Resource). * (4) your measured system speedup over the fixed-point CPU implementation * (5) your measured classification accuracy on the 8k MNIST test sample @@ -507,7 +523,7 @@ Also report the following: * Again you will have to tweak your memory partitioning, tiling factor to optimize your kernel latency/throughput. * Make sure that you close your AXI stream properly when you push data to it (see `push_stream()` calls in `mmult_float.cpp`). The `bool last` argument should be set to `true` on the last stream packet or the DMA drivers will hang when you try to test your design. * You'll notice that in `mmult.h` we are using `ap_int` which are arbitrary precision integers, which is HLS' solution to providing integers of arbitrary width (so not just 1, 8, 16, 32, 64). Unfortunately the `union` type conversion trick does not work on `ap_ints`. Instead you'll need to do a bit of bit manipulation on the AXI raw values before converting to `ap_int`. 
The `mmult_test.cpp` file should provide a good reference on how data is packed and unpacked before being pushed or popped from AXI channels. -* By default HLS will implement 8-bit multipliers on hard `BRAM_18K` blocks. But the Zynq FPGA only contains 220 multipliers. If you want to allocate more multipliers, you can use the following directive which will tell HLS to synthesize multipliers using LUTs instead: `#pragma HLS RESOURCE variable=mult core=Mul_LUT`. + **What to expect**: In terms of latency as HLS reports, this design should achieve 400-500x improvement in batch-normalized latency over the first naive floating point design. On the board, you should see a roughly 10x improvement over Part 1's FPGA over CPU speedup. diff --git a/image/install-01-product.png b/image/install-01-product.png new file mode 100644 index 0000000..8480b6d Binary files /dev/null and b/image/install-01-product.png differ diff --git a/image/install-02-edition.png b/image/install-02-edition.png new file mode 100644 index 0000000..c852dd1 Binary files /dev/null and b/image/install-02-edition.png differ diff --git a/image/install-03-component.png b/image/install-03-component.png new file mode 100644 index 0000000..5caf65a Binary files /dev/null and b/image/install-03-component.png differ diff --git a/zynq/Makefile b/zynq/Makefile index 4181d7a..390be1d 100644 --- a/zynq/Makefile +++ b/zynq/Makefile @@ -3,7 +3,7 @@ BUILD_DIR = build SCRIPT_DIR = tcl # Executables -VIVADO_HLS = vivado_hls +VIVADO_HLS = vitis_hls VIVADO = vivado .PHONY: all setup ip bit clean diff --git a/zynq/hls/mmult_fixed/hls.tcl b/zynq/hls/mmult_fixed/hls.tcl index 5f278d5..be5b832 100755 --- a/zynq/hls/mmult_fixed/hls.tcl +++ b/zynq/hls/mmult_fixed/hls.tcl @@ -2,12 +2,16 @@ set src_dir "." open_project accel set_top mmult_hw -add_files $src_dir/mmult_fixed.cpp +add_files $src_dir/mmult_float.cpp add_files -tb $src_dir/mmult_test.cpp -open_solution "solution0" + +open_solution "solution0" -flow_target vivado set_part {xc7z020clg484-1} create_clock -period 10 -name default +set_clock_uncertainty 12.5% + +config_compile -pipeline_loops 0 csim_design -clean csynth_design close_project -exit \ No newline at end of file +exit diff --git a/zynq/hls/mmult_fixed/mmult.h b/zynq/hls/mmult_fixed/mmult.h index d0d2dd1..a07955d 100755 --- a/zynq/hls/mmult_fixed/mmult.h +++ b/zynq/hls/mmult_fixed/mmult.h @@ -1,6 +1,7 @@ -#include #include +#include +#include // Type definition of matrix elements typedef ap_int<8> w_T; @@ -15,14 +16,14 @@ typedef unsigned out_bit_T; typedef unsigned long long axi_T; // Datatype widths in bits -#define W_WIDTH (sizeof(w_T)*8) -#define IN_WIDTH (sizeof(in_T)*8) -#define OUT_WIDTH (sizeof(out_T)*8) +#define W_WIDTH (sizeof(w_T) * 8) +#define IN_WIDTH (sizeof(in_T) * 8) +#define OUT_WIDTH (sizeof(out_T) * 8) // Data type ratio between data type and axi width -#define W_WIDTH_RATIO (8*sizeof(axi_T)/W_WIDTH) -#define IN_WIDTH_RATIO (8*sizeof(axi_T)/IN_WIDTH) -#define OUT_WIDTH_RATIO (8*sizeof(axi_T)/OUT_WIDTH) +#define W_WIDTH_RATIO (8 * sizeof(axi_T) / W_WIDTH) +#define IN_WIDTH_RATIO (8 * sizeof(axi_T) / IN_WIDTH) +#define OUT_WIDTH_RATIO (8 * sizeof(axi_T) / OUT_WIDTH) // Matrix dimensions specifications #define BATCH 8192 @@ -34,21 +35,21 @@ typedef unsigned long long axi_T; // #define TILING // Input/Output Stream Size -#define IS_SIZE ((CLASSES+OUT_WIDTH_RATIO-1)/OUT_WIDTH_RATIO+CLASSES*FEAT/W_WIDTH_RATIO+BATCH*FEAT/IN_WIDTH_RATIO) -#define OS_SIZE (BATCH*((CLASSES+OUT_WIDTH_RATIO-1)/OUT_WIDTH_RATIO)) +#define IS_SIZE 
((CLASSES + OUT_WIDTH_RATIO - 1) / OUT_WIDTH_RATIO + CLASSES * FEAT / W_WIDTH_RATIO + BATCH * FEAT / IN_WIDTH_RATIO) +#define OS_SIZE (BATCH * ((CLASSES + OUT_WIDTH_RATIO - 1) / OUT_WIDTH_RATIO)) // AXI settings (leave it fixed) -#define AXI_DATA (sizeof(axi_T)*8) +#define AXI_DATA (sizeof(axi_T) * 8) #define AXI_U 4 #define AXI_TI 5 #define AXI_TD 5 // AXI interface -typedef ap_axiu AXI_VAL; +typedef ap_axiu AXI_VAL; // Matrix Multiply prototype -void mmult_hw (AXI_VAL in_stream[IS_SIZE],AXI_VAL out_stream[OS_SIZE]); +void mmult_hw(hls::stream& in_stream, hls::stream& out_stream); // AXI stream push and pop -axi_T pop_stream(AXI_VAL const &e); -AXI_VAL push_stream(axi_T const &v, bool last); +axi_T pop_stream(hls::stream& in_stream); +AXI_VAL push_stream(axi_T const& v, bool last); diff --git a/zynq/hls/mmult_fixed/mmult_fixed.cpp b/zynq/hls/mmult_fixed/mmult_fixed.cpp index f92bbc5..a4ef606 100755 --- a/zynq/hls/mmult_fixed/mmult_fixed.cpp +++ b/zynq/hls/mmult_fixed/mmult_fixed.cpp @@ -5,92 +5,101 @@ // -------------------------------------------------------------------- // function to be accelerated in HW wrapped with AXI4-Stream interface -void mmult_hw (AXI_VAL in_stream[IS_SIZE], AXI_VAL out_stream[OS_SIZE]) +void mmult_hw(hls::stream& in_stream, hls::stream& out_stream) { #pragma HLS INTERFACE s_axilite port=return bundle=CONTROL_BUS #pragma HLS INTERFACE axis port=in_stream #pragma HLS INTERFACE axis port=out_stream - // Assertions (to avoid out of array bound writes) - assert(BATCH%TILING==0); - assert(FEAT%W_WIDTH_RATIO==0); - assert(FEAT%IN_WIDTH_RATIO==0); - assert((BATCH*CLASSES)%OUT_WIDTH_RATIO==0); - - // Hardware memory buffers - out_T offset_buf[CLASSES]; - w_T weight_buf[CLASSES][FEAT]; - in_T in_buf[TILING][FEAT]; - out_T out_buf[TILING][CLASSES]; - - // Input and output AXI stream indices - int is_idx = 0; - int os_idx = 0; - - // Stream in offset vector - // CSE548 TODO - - // Stream in weight matrix - // CSE548 TODO - - // Iterate over tiles - LT: for (int t = 0; t < BATCH; t+=TILING) { - - // Stream in input tile - // CSE548 TODO - - // Perform matrix multiplication - L1: for (int i = 0; i < TILING; i++) { - // Iterate over output classes - L2: for (int j = 0; j < CLASSES; j++) { - // Perform the dot product - out_T tmp = offset_buf[j]; - L3: for(int k = 0; k < FEAT; k++) { - out_T mult = in_buf[i][k] * weight_buf[j][k]; - tmp += mult; - } - out_buf[i][j] = tmp; - } - } - - // Stream out output matrix - // CSE548 TODO - } + // Assertions (to avoid out of array bound writes) + assert(BATCH % TILING == 0); + assert(FEAT % W_WIDTH_RATIO == 0); + assert(FEAT % IN_WIDTH_RATIO == 0); + assert((BATCH * CLASSES) % OUT_WIDTH_RATIO == 0); + + // Hardware memory buffers + out_T offset_buf[CLASSES]; + w_T weight_buf[CLASSES][FEAT]; + in_T in_buf[TILING][FEAT]; + out_T out_buf[TILING][CLASSES]; + +#pragma HLS BIND_STORAGE variable=offset_buf type=RAM_T2P +#pragma HLS BIND_STORAGE variable=weight_buf type=RAM_T2P +#pragma HLS BIND_STORAGE variable=in_buf type=RAM_T2P +#pragma HLS BIND_STORAGE variable=out_buf type=RAM_T2P + + // Input and output AXI stream indices + int is_idx = 0; + int os_idx = 0; + +// Stream in offset vector +// CSE548 TODO + +// Stream in weight matrix +// CSE548 TODO + +// Iterate over tiles +LT: + for (int t = 0; t < BATCH; t += TILING) { + + // Stream in input tile + // CSE548 TODO + + // Perform matrix multiplication + L1: + for (int i = 0; i < TILING; i++) { + // Iterate over output classes + L2: + for (int j = 0; j < CLASSES; j++) { + // Perform the dot 
product + out_T tmp = offset_buf[j]; + L3: + for (int k = 0; k < FEAT; k++) { + out_T mult = in_buf[i][k] * weight_buf[j][k]; + tmp += mult; + } + out_buf[i][j] = tmp; + } + } + + // Stream out output matrix + // CSE548 TODO + } } - // -------------------------------------------------------- // functions to insert and extract elements from an axi stream // includes conversion to correct data type -axi_T pop_stream(AXI_VAL const &e) +axi_T pop_stream(hls::stream& in_stream) { #pragma HLS INLINE + AXI_VAL e; + in_stream.read(e); - axi_T ret = e.data; + axi_T ret = e.data; - volatile ap_uint strb = e.strb; - volatile ap_uint keep = e.keep; - volatile ap_uint user = e.user; - volatile ap_uint<1> last = e.last; - volatile ap_uint id = e.id; - volatile ap_uint dest = e.dest; + volatile ap_uint strb = e.strb; + volatile ap_uint keep = e.keep; + volatile ap_uint user = e.user; + volatile ap_uint<1> last = e.last; + volatile ap_uint id = e.id; + volatile ap_uint dest = e.dest; - return ret; + return ret; } -AXI_VAL push_stream(axi_T const &v, bool last = false) +AXI_VAL push_stream(axi_T const& v, bool last = false) { #pragma HLS INLINE - AXI_VAL e; + AXI_VAL e; - e.data = v; - e.strb = (1< #include #include -#include #include "mmult.h" -void matrix_multiply_ref(out_T offsets[CLASSES], w_T weights[CLASSES][FEAT], in_T in[BATCH][FEAT], out_T out[BATCH][CLASSES]) +void matrix_multiply_ref(out_T offsets[CLASSES], w_T weights[CLASSES][FEAT], in_T in[BATCH][FEAT], out_T out[BATCH][CLASSES]) { - // matrix multiplication of a A*B matrix - for (int i = 0; i < BATCH; ++i) { - for (int j = 0; j < CLASSES; ++j) { - out_T sum = offsets[j]; - for (int k = 0; k < FEAT; ++k) { - sum += in[i][k] * weights[j][k]; - } - out[i][j] = sum; - } - } - return; + // matrix multiplication of a A*B matrix + for (int i = 0; i < BATCH; ++i) { + for (int j = 0; j < CLASSES; ++j) { + out_T sum = offsets[j]; + for (int k = 0; k < FEAT; ++k) { + sum += in[i][k] * weights[j][k]; + } + out[i][j] = sum; + } + } + return; } - int main(void) { - int i,j,err; - - out_T offsets[CLASSES]; - w_T weights[CLASSES][FEAT]; - in_T inputs[BATCH][FEAT]; - out_T output_sw[BATCH][CLASSES]; - out_T output_hw[BATCH][CLASSES]; - - /** Matrix Initiation */ - for(i = 0; i>(w*OUT_WIDTH)); - output_hw[i][j+w] = *((out_T*) &bits) & ((1ULL<>((j%OUT_WIDTH_RATIO)*OUT_WIDTH)); - output_hw[i][j] = *((out_T*) &bits) & ((1ULL< in_stream; + hls::stream out_stream; + + // Input and output stream indices + int is_idx = 0; + int os_idx = 0; + + // stream in the offset vector + for (int i = 0; i < CLASSES - OUT_WIDTH_RATIO; i += OUT_WIDTH_RATIO) { + axi_T packet = 0; + PACK_OFF: + for (int w = 0; w < OUT_WIDTH_RATIO; w++) { + out_bit_T bits = *((out_bit_T*)&offsets[i + w]); + packet |= (bits & ((1ULL << OUT_WIDTH) - 1)) << (w * OUT_WIDTH); + }; + is_idx++; + in_stream.write(push_stream(packet, 0)); + } + // pad the last packet in case things don't align + axi_T packet = 0; +FINISH_OFF: + for (int i = CLASSES - OUT_WIDTH_RATIO; i < CLASSES; i++) { + out_bit_T bits = *((out_bit_T*)&offsets[i]); + packet |= (bits & ((1ULL << OUT_WIDTH) - 1)) << ((i % OUT_WIDTH_RATIO) * OUT_WIDTH); + } + is_idx++; + in_stream.write(push_stream(packet, 0)); + + // stream in the weigth matrix + for (int i = 0; i < CLASSES; i++) { + for (int j = 0; j < FEAT; j += W_WIDTH_RATIO) { + axi_T packet = 0; + PACK_W: + for (int w = 0; w < W_WIDTH_RATIO; w++) { + w_bit_T bits = *((w_bit_T*)&weights[i][j + w]); + packet |= (bits & ((1ULL << W_WIDTH) - 1)) << (w * W_WIDTH); + }; + is_idx++; + 
in_stream.write(push_stream(packet, 0)); + } + } + + // stream in the input matrix + for (int i = 0; i < BATCH; i++) { + for (int j = 0; j < FEAT; j += IN_WIDTH_RATIO) { + axi_T packet = 0; + PACK_IN: + for (int w = 0; w < IN_WIDTH_RATIO; w++) { + in_bit_T bits = *((in_bit_T*)&inputs[i][j + w]); + packet |= (bits & ((1ULL << IN_WIDTH) - 1)) << (w * IN_WIDTH); + }; + is_idx++; + in_stream.write(push_stream(packet, is_idx == (IS_SIZE))); + } + } + + // call the DUT + mmult_hw(in_stream, out_stream); + + // extract the output matrix from the out stream + for (int i = 0; i < BATCH; i++) { + for (int j = 0; j < CLASSES - OUT_WIDTH_RATIO; j += OUT_WIDTH_RATIO) { + os_idx++; + axi_T packet = pop_stream(out_stream); + UNPACK_OUT: + for (int w = 0; w < OUT_WIDTH_RATIO; w++) { + out_bit_T bits = (packet >> (w * OUT_WIDTH)); + output_hw[i][j + w] = *((out_T*)&bits) & ((1ULL << OUT_WIDTH) - 1); + } + } + // Pop last AXI data packet + os_idx++; + axi_T packet = pop_stream(out_stream); + FINISH_OUT: + for (int j = CLASSES - OUT_WIDTH_RATIO; j < CLASSES; j++) { + out_bit_T bits = (packet >> ((j % OUT_WIDTH_RATIO) * OUT_WIDTH)); + output_hw[i][j] = *((out_T*)&bits) & ((1ULL << OUT_WIDTH) - 1); + } + } + + /* reference Matrix Multiplication */ + matrix_multiply_ref(offsets, weights, inputs, output_sw); + + /** Matrix comparison */ + err = 0; + for (i = 0; i < BATCH; i++) { + for (j = 0; j < CLASSES; j++) { + if (output_sw[i][j] != output_hw[i][j]) { + err++; + std::cout << i << "," << j << ": expected " << output_sw[i][j] << " but got " << output_hw[i][j] << std::endl; + } + } + } + + if (err == 0) + printf("Matrices identical ... Test successful!\r\n"); + else + printf("Test failed!\r\n"); + + return err; } diff --git a/zynq/hls/mmult_float/hls.tcl b/zynq/hls/mmult_float/hls.tcl index cd345e8..be5b832 100755 --- a/zynq/hls/mmult_float/hls.tcl +++ b/zynq/hls/mmult_float/hls.tcl @@ -4,10 +4,14 @@ open_project accel set_top mmult_hw add_files $src_dir/mmult_float.cpp add_files -tb $src_dir/mmult_test.cpp -open_solution "solution0" + +open_solution "solution0" -flow_target vivado set_part {xc7z020clg484-1} create_clock -period 10 -name default +set_clock_uncertainty 12.5% + +config_compile -pipeline_loops 0 csim_design -clean csynth_design close_project -exit \ No newline at end of file +exit diff --git a/zynq/hls/mmult_float/mmult.h b/zynq/hls/mmult_float/mmult.h index c3114b5..f3672a4 100755 --- a/zynq/hls/mmult_float/mmult.h +++ b/zynq/hls/mmult_float/mmult.h @@ -1,6 +1,7 @@ -#include #include +#include +#include typedef unsigned long long axi_T; typedef float T; @@ -11,23 +12,23 @@ typedef float T; #define CLASSES 10 // Input/Output Stream Size -#define IS_SIZE (BATCH*FEAT/WIDTH_RATIO+(FEAT+1)*CLASSES/WIDTH_RATIO) -#define OS_SIZE (BATCH*CLASSES/WIDTH_RATIO) +#define IS_SIZE (BATCH * FEAT / WIDTH_RATIO + (FEAT + 1) * CLASSES / WIDTH_RATIO) +#define OS_SIZE (BATCH * CLASSES / WIDTH_RATIO) // AXI settings -#define AXI_DATA (sizeof(axi_T)*8) +#define AXI_DATA (sizeof(axi_T) * 8) #define AXI_U 4 #define AXI_TI 5 #define AXI_TD 5 // Data type ratio between data type and axi width -#define WIDTH_RATIO (sizeof(axi_T)/sizeof(T)) +#define WIDTH_RATIO (sizeof(axi_T) / sizeof(T)) -typedef ap_axiu AXI_VAL; +typedef ap_axiu AXI_VAL; // Matrix Multiply prototype -void mmult_hw (AXI_VAL in_stream[IS_SIZE],AXI_VAL out_stream[OS_SIZE]); +void mmult_hw(hls::stream& in_stream, hls::stream& out_stream); // AXI stream push and pop -axi_T pop_stream(AXI_VAL const &e); -AXI_VAL push_stream(axi_T const &v, bool last); +axi_T 
pop_stream(hls::stream& in_stream); +AXI_VAL push_stream(axi_T const& v, bool last); diff --git a/zynq/hls/mmult_float/mmult_float.cpp b/zynq/hls/mmult_float/mmult_float.cpp index 1cf9e8f..10c0393 100755 --- a/zynq/hls/mmult_float/mmult_float.cpp +++ b/zynq/hls/mmult_float/mmult_float.cpp @@ -5,120 +5,140 @@ // -------------------------------------------------------------------- // function to be accelerated in HW wrapped with AXI4-Stream interface -void mmult_hw (AXI_VAL in_stream[IS_SIZE], AXI_VAL out_stream[OS_SIZE]) +void mmult_hw(hls::stream& in_stream, hls::stream& out_stream) { #pragma HLS INTERFACE s_axilite port=return bundle=CONTROL_BUS #pragma HLS INTERFACE axis port=in_stream #pragma HLS INTERFACE axis port=out_stream - // Assertions (to avoid out of array bound writes) - assert(CLASSES%WIDTH_RATIO==0); - assert(FEAT%WIDTH_RATIO==0); - assert(FEAT%WIDTH_RATIO==0); - assert((BATCH*CLASSES)%WIDTH_RATIO==0); - - // Union used for type conversion - union - { - axi_T packet; - struct {T f0; T f1;} val; - } converter; - - // Hardware buffers - T offset_buf[CLASSES]; - T weight_buf[CLASSES][FEAT]; - T in_buf[BATCH][FEAT]; - T out_buf[BATCH][CLASSES]; - - // Input and output AXI stream indices - int is_idx = 0; - int os_idx = 0; - - // Stream in offset vector - LOAD_OFF_1: for (int i = 0; i < CLASSES; i+=WIDTH_RATIO) { - converter.packet = pop_stream(in_stream[is_idx++]); - offset_buf[i+0] = converter.val.f0; - offset_buf[i+1] = converter.val.f1; - } - - // Stream in weight matrix - LOAD_W_1: for (int i = 0; i < CLASSES; i++) { - LOAD_W_2: for (int j = 0; j < FEAT; j+=WIDTH_RATIO) { - // Pop AXI data packet - converter.packet = pop_stream(in_stream[is_idx++]); - weight_buf[i][j+0] = converter.val.f0; - weight_buf[i][j+1] = converter.val.f1; - } - } - - - // Stream in input matrix - LOAD_I_1: for (int i = 0; i < BATCH; i++) { - LOAD_I_2: for (int j = 0; j < FEAT; j+=WIDTH_RATIO) { - // Pop AXI data packet - converter.packet = pop_stream(in_stream[is_idx++]); - in_buf[i][j+0] = converter.val.f0; - in_buf[i][j+1] = converter.val.f1; - } - } - - // Iterate over batch elements - L1: for (int i = 0; i < BATCH; i++) { - // Iterate over output classes - L2: for (int j = 0; j < CLASSES; j++) { - // Perform the dot product - T tmp = offset_buf[j]; - L3: for(int k = 0; k < FEAT; k++) { - tmp += in_buf[i][k] * weight_buf[j][k]; - } - out_buf[i][j] = tmp; - } - } - - // Stream out output matrix - STORE_O_1: for (int i = 0; i < BATCH; i++) { - STORE_O_2: for (int j = 0; j < CLASSES; j+=WIDTH_RATIO) { - // Push output element into AXI stream - converter.val.f0 = out_buf[i][j+0]; - converter.val.f1 = out_buf[i][j+1]; - out_stream[os_idx++] = push_stream(converter.packet, os_idx == (OS_SIZE)); - } - } + // Assertions (to avoid out of array bound writes) + assert(CLASSES % WIDTH_RATIO == 0); + assert(FEAT % WIDTH_RATIO == 0); + assert(FEAT % WIDTH_RATIO == 0); + assert((BATCH * CLASSES) % WIDTH_RATIO == 0); + + // Union used for type conversion + union { + axi_T packet; + struct { + T f0; + T f1; + } val; + } converter; + + // Hardware buffers + T offset_buf[CLASSES]; + T weight_buf[CLASSES][FEAT]; + T in_buf[BATCH][FEAT]; + T out_buf[BATCH][CLASSES]; + +#pragma HLS BIND_STORAGE variable=offset_buf type=RAM_T2P +#pragma HLS BIND_STORAGE variable=weight_buf type=RAM_T2P +#pragma HLS BIND_STORAGE variable=in_buf type=RAM_T2P +#pragma HLS BIND_STORAGE variable=out_buf type=RAM_T2P + + // Input and output AXI stream indices + int is_idx = 0; + int os_idx = 0; + +// Stream in offset vector +LOAD_OFF_1: + for 
(int i = 0; i < CLASSES; i += WIDTH_RATIO) { + is_idx++; + converter.packet = pop_stream(in_stream); + offset_buf[i + 0] = converter.val.f0; + offset_buf[i + 1] = converter.val.f1; + } + +// Stream in weight matrix +LOAD_W_1: + for (int i = 0; i < CLASSES; i++) { + LOAD_W_2: + for (int j = 0; j < FEAT; j += WIDTH_RATIO) { + // Pop AXI data packet + is_idx++; + converter.packet = pop_stream(in_stream); + weight_buf[i][j + 0] = converter.val.f0; + weight_buf[i][j + 1] = converter.val.f1; + } + } + +// Stream in input matrix +LOAD_I_1: + for (int i = 0; i < BATCH; i++) { + LOAD_I_2: + for (int j = 0; j < FEAT; j += WIDTH_RATIO) { + // Pop AXI data packet + is_idx++; + converter.packet = pop_stream(in_stream); + in_buf[i][j + 0] = converter.val.f0; + in_buf[i][j + 1] = converter.val.f1; + } + } + +// Iterate over batch elements +L1: + for (int i = 0; i < BATCH; i++) { + // Iterate over output classes + L2: + for (int j = 0; j < CLASSES; j++) { + // Perform the dot product + T tmp = offset_buf[j]; + L3: + for (int k = 0; k < FEAT; k++) { + tmp += in_buf[i][k] * weight_buf[j][k]; + } + out_buf[i][j] = tmp; + } + } + +// Stream out output matrix +STORE_O_1: + for (int i = 0; i < BATCH; i++) { + STORE_O_2: + for (int j = 0; j < CLASSES; j += WIDTH_RATIO) { + // Push output element into AXI stream + converter.val.f0 = out_buf[i][j + 0]; + converter.val.f1 = out_buf[i][j + 1]; + os_idx++; + out_stream.write(push_stream(converter.packet, os_idx == (OS_SIZE))); + } + } } - // -------------------------------------------------------- // functions to insert and extract elements from an axi stream // includes conversion to correct data type -axi_T pop_stream(AXI_VAL const &e) +axi_T pop_stream(hls::stream& in_stream) { #pragma HLS INLINE + AXI_VAL e; + in_stream.read(e); - axi_T ret = e.data; + axi_T ret = e.data; - volatile ap_uint strb = e.strb; - volatile ap_uint keep = e.keep; - volatile ap_uint user = e.user; - volatile ap_uint<1> last = e.last; - volatile ap_uint id = e.id; - volatile ap_uint dest = e.dest; + volatile ap_uint strb = e.strb; + volatile ap_uint keep = e.keep; + volatile ap_uint user = e.user; + volatile ap_uint<1> last = e.last; + volatile ap_uint id = e.id; + volatile ap_uint dest = e.dest; - return ret; + return ret; } -AXI_VAL push_stream(axi_T const &v, bool last = false) +AXI_VAL push_stream(axi_T const& v, bool last = false) { #pragma HLS INLINE - AXI_VAL e; + AXI_VAL e; - e.data = v; - e.strb = (1< #include #include -#include #include "mmult.h" -void matrix_multiply_ref(T offsets[CLASSES], T weights[CLASSES][FEAT], T in[BATCH][FEAT], T out[BATCH][CLASSES]) +void matrix_multiply_ref(T offsets[CLASSES], T weights[CLASSES][FEAT], T in[BATCH][FEAT], T out[BATCH][CLASSES]) { - // matrix multiplication of a A*B matrix - for (int i = 0; i < BATCH; ++i) { - for (int j = 0; j < CLASSES; ++j) { - T sum = offsets[j]; - for (int k = 0; k < FEAT; ++k) { - sum += in[i][k] * weights[j][k]; - } - out[i][j] = sum; - } - } - return; + // matrix multiplication of a A*B matrix + for (int i = 0; i < BATCH; ++i) { + for (int j = 0; j < CLASSES; ++j) { + T sum = offsets[j]; + for (int k = 0; k < FEAT; ++k) { + sum += in[i][k] * weights[j][k]; + } + out[i][j] = sum; + } + } + return; } - int main(void) { - int i,j,err; - - union - { - axi_T packet; - struct {T f0; T f1;} val; - } converter; - - T offsets[CLASSES]; - T weights[CLASSES][FEAT]; - T inputs[BATCH][FEAT]; - T matMult_sw[BATCH][CLASSES]; - T matMult_hw[BATCH][CLASSES]; - - /** Matrix Initiation */ - for(i = 0; i in_stream; + hls::stream 
out_stream; + + // input and output stream indices + int is_idx = 0; + int os_idx = 0; + + // stream in the offset vector + for (int i = 0; i < CLASSES; i += WIDTH_RATIO) { + converter.val.f0 = offsets[i + 0]; + converter.val.f1 = offsets[i + 1]; + in_stream.write(push_stream(converter.packet, 0)); + is_idx++; + } + + // stream in the weigth matrix + for (int i = 0; i < CLASSES; i++) { + for (int j = 0; j < FEAT; j += WIDTH_RATIO) { + converter.val.f0 = weights[i][j + 0]; + converter.val.f1 = weights[i][j + 1]; + in_stream.write(push_stream(converter.packet, 0)); + is_idx++; + } + } + + // stream in the input matrix + for (int i = 0; i < BATCH; i++) { + for (int j = 0; j < FEAT; j += WIDTH_RATIO) { + converter.val.f0 = inputs[i][j + 0]; + converter.val.f1 = inputs[i][j + 1]; + in_stream.write(push_stream(converter.packet, is_idx == (IS_SIZE))); + is_idx++; + } + } + + // call the DUT + mmult_hw(in_stream, out_stream); + + // extract the output matrix from the out stream + for (int i = 0; i < BATCH; i++) { + for (int j = 0; j < CLASSES; j += WIDTH_RATIO) { + os_idx++; + converter.packet = pop_stream(out_stream); + matMult_hw[i][j + 0] = converter.val.f0; + matMult_hw[i][j + 1] = converter.val.f1; + } + } + + /* reference Matrix Multiplication */ + matrix_multiply_ref(offsets, weights, inputs, matMult_sw); + + /** Matrix comparison */ + err = 0; + for (i = 0; i < BATCH; i++) { + for (j = 0; j < CLASSES; j++) { + if (matMult_sw[i][j] != matMult_hw[i][j]) { + err++; + std::cout << i << "," << j << ": expected " << matMult_sw[i][j] << " but got " << matMult_hw[i][j] << std::endl; + } + } + } + + if (err == 0) + printf("Matrices identical ... Test successful!\r\n"); + else + printf("Test failed!\r\n"); + + return err; } diff --git a/zynq/jupyter/classifier_1.ipynb b/zynq/jupyter/classifier_1.ipynb index ba75455..6d43658 100644 --- a/zynq/jupyter/classifier_1.ipynb +++ b/zynq/jupyter/classifier_1.ipynb @@ -3,94 +3,53 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true, - "deletable": true, - "editable": true - }, + "metadata": {}, "outputs": [], "source": [ + "from time import time\n", + "\n", "import cffi\n", "import numpy as np\n", - "from pynq import MMIO\n", - "from pynq import Overlay\n", - "from pynq import PL\n", - "from pynq.drivers import DMA\n", - "from time import sleep, time\n", + "from pynq import MMIO, Overlay, allocate\n", + "\n", + "ffi = cffi.FFI()\n", "\n", "# Classifier Dimensions\n", "BATCH = 2048\n", "FEAT = 256\n", - "CLASSES = 10\n", - "\n", - "# Addresses\n", - "ACCEL_CTRL = 0x43C00000\n", - "AXI_DMA_0 = 0x40400000\n", - "AXI_DMA_1 = 0x40410000\n", - "AXI_TIMER = 0x42800000\n", + "CLASSES = 10" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, + "outputs": [], + "source": [ + "# Prepare the custome overlay and DMA\n", + "ol = Overlay(\"/home/xilinx/classifier.bit\")\n", + "ol.download()\n", "\n", - "# C FFI\n", - "ffi = cffi.FFI()\n", + "dma_mm2s = ol.axi_dma_0\n", + "dma_s2mm = ol.axi_dma_1\n", + "mm2s_buffer = allocate(shape=(CLASSES + CLASSES * FEAT + BATCH * FEAT,), dtype=np.float32)\n", + "s2mm_buffer = allocate(shape=(BATCH * CLASSES,), dtype=np.float32)\n", "\n", - "# DMA Configs\n", - "DMAConfig1 = {\n", - " 'DeviceId' : 0,\n", - " 'BaseAddr' : ffi.cast(\"uint32_t *\",AXI_DMA_0),\n", - " 'HasStsCntrlStrm' : 0,\n", - " 'HasMm2S' : 1,\n", - " 'HasMm2SDRE' : 1,\n", - " 'Mm2SDataWidth' : 64,\n", - " 'HasS2Mm' : 0,\n", - " 'HasS2MmDRE' : 
0,\n", - " 'S2MmDataWidth' : 32,\n", - " 'HasSg' : 0,\n", - " 'Mm2sNumChannels' : 1,\n", - " 'S2MmNumChannels' : 1,\n", - " 'Mm2SBurstSize' : 256,\n", - " 'S2MmBurstSize' : 16,\n", - " 'MicroDmaMode' : 0,\n", - " 'AddrWidth' : 32\n", - "}\n", - "DMAConfig2 = {\n", - " 'DeviceId' : 1,\n", - " 'BaseAddr' : ffi.cast(\"uint32_t *\",AXI_DMA_1),\n", - " 'HasStsCntrlStrm' : 0,\n", - " 'HasMm2S' : 0,\n", - " 'HasMm2SDRE' : 0,\n", - " 'Mm2SDataWidth' : 32,\n", - " 'HasS2Mm' : 1,\n", - " 'HasS2MmDRE' : 1,\n", - " 'S2MmDataWidth' : 64,\n", - " 'HasSg' : 0,\n", - " 'Mm2sNumChannels' : 1,\n", - " 'S2MmNumChannels' : 1,\n", - " 'Mm2SBurstSize' : 16,\n", - " 'S2MmBurstSize' : 256,\n", - " 'MicroDmaMode' : 0,\n", - " 'AddrWidth' : 32\n", - "}\n", - "\n", - "# Download the custom overlay\n", - "ol = Overlay(\"classifier.bit\")\n", - "ol.download()\n", + "# Accelerator Base Address\n", + "ACCEL_CTRL = 0x43C00000\n", "\n", "# Initialize HLS IP\n", - "mmult_ip = MMIO(ACCEL_CTRL,0x10000)\n", + "mmult_ip = MMIO(ACCEL_CTRL, 0x10000)\n", + "\n", "# Start the accelerator\n", - "ctrl=mmult_ip.read(0x00)&0x08\n", - "mmult_ip.write(0x00, (ctrl|0x81))\n", - "ctrl=mmult_ip.read(0x00)\n", - "hex(ctrl)\n", - "\n", - "# Initialize DMA1 (mem to FPGA)\n", - "dma1 = DMA(AXI_DMA_0, direction=0, attr_dict=DMAConfig1)\n", - "dma1.create_buf((CLASSES+CLASSES*FEAT+BATCH*FEAT)*4, cacheable=0)\n", - "\n", - "# Initialize DMA2 (FPGA to mem)\n", - "dma2 = DMA(AXI_DMA_1, direction=1, attr_dict=DMAConfig2)\n", - "dma2.create_buf(BATCH*CLASSES*4, cacheable=0)\n", - "# Start DMA transfer from FPGA to memory\n", - "dma2.transfer(BATCH*CLASSES*4, direction=1)" + "ctrl = mmult_ip.read(0x00) & 0x08\n", + "mmult_ip.write(0x00, (ctrl | 0x81))\n", + "ctrl = mmult_ip.read(0x00)\n", + "hex(ctrl)" ] }, { @@ -102,10 +61,10 @@ "outputs": [], "source": [ "# Initialize offsets, weights and inputs\n", - "o = np.load('model_offsets.npy').astype(np.float32)\n", - "w = np.load('model_weights.npy').astype(np.float32)\n", - "i = np.load('test_data.npy').astype(np.float32)\n", - "l = np.load('test_labels.npy').astype(np.float32)\n", + "o = np.load(\"model_offsets.npy\").astype(np.float32)\n", + "w = np.load(\"model_weights.npy\").astype(np.float32)\n", + "i = np.load(\"test_data.npy\").astype(np.float32)\n", + "l = np.load(\"test_labels.npy\").astype(np.float32)\n", "\n", "# Sample BATCHSIZE test samples from the MNIST test dataset\n", "np.random.seed(0xCAFEBEEF)\n", @@ -117,26 +76,25 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "collapsed": true - }, + "metadata": {}, "outputs": [], "source": [ "# Move offset, weight and input data to DMA buffer\n", - "ffi.memmove(dma1.get_buf(), ffi.cast(\"uint32_t *\", o.ctypes.data), CLASSES*4)\n", - "ffi.memmove(dma1.get_buf()+CLASSES, ffi.cast(\"uint32_t *\", w.ctypes.data), CLASSES*FEAT*4)\n", - "ffi.memmove(dma1.get_buf()+CLASSES+CLASSES*FEAT, ffi.cast(\"uint32_t *\", i.ctypes.data), BATCH*FEAT*4)\n", + "ffi.memmove(mm2s_buffer[:], ffi.cast(\"uint32_t *\", o.ctypes.data), CLASSES * 4)\n", + "ffi.memmove(mm2s_buffer[CLASSES:], ffi.cast(\"uint32_t *\", w.ctypes.data), CLASSES * FEAT * 4)\n", + "ffi.memmove(mm2s_buffer[CLASSES + CLASSES * FEAT :], ffi.cast(\"uint32_t *\", i.ctypes.data), BATCH * FEAT * 4)\n", + "mm2s_buffer.flush()\n", "\n", "# Perform FPGA offloading\n", "start_t = time()\n", - "dma1.transfer((CLASSES+CLASSES*FEAT+BATCH*FEAT)*4, direction=0)\n", - "dma2.wait()\n", - "fpga_time = time()-start_t\n", + "dma_mm2s.sendchannel.transfer(mm2s_buffer)\n", + 
"dma_s2mm.recvchannel.transfer(s2mm_buffer)\n", + "dma_mm2s.sendchannel.wait()\n", + "dma_s2mm.recvchannel.wait()\n", + "fpga_time = time() - start_t\n", "\n", "# Dump FPGA result to a numpy array\n", - "c = np.frombuffer(ffi.buffer(\n", - " dma2.get_buf(),BATCH*CLASSES*4),\n", - " dtype=np.float32).reshape(BATCH,CLASSES)" + "c = np.array(s2mm_buffer).reshape(BATCH, CLASSES)\n" ] }, { @@ -148,14 +106,14 @@ "outputs": [], "source": [ "# Prepare input and weight matrices for matrix multiplication on CPU\n", - "ones = np.ones(BATCH).reshape((BATCH,1))\n", + "ones = np.ones(BATCH).reshape((BATCH, 1))\n", "i_p = np.append(ones, i, axis=1)\n", - "w_p = np.append(o.reshape(CLASSES,1), w, axis=1)\n", + "w_p = np.append(o.reshape(CLASSES, 1), w, axis=1)\n", "\n", "# Compute CPU result\n", "start_t = time()\n", - "c_ref = np.dot(i_p,w_p.T)\n", - "cpu_time = time()-start_t" + "c_ref = np.dot(i_p, w_p.T)\n", + "cpu_time = time() - start_t" ] }, { @@ -167,24 +125,20 @@ "outputs": [], "source": [ "# Evaluate validation accuracy\n", - "cpu_errors = 0\n", - "fpga_errors = 0\n", - "for idx in range(BATCH):\n", - " fpga_label = np.argmax(c[idx])\n", - " cpu_label = np.argmax(c_ref[idx])\n", - " actual_label = np.argmax(l[idx])\n", - " if (fpga_label!=actual_label):\n", - " fpga_errors += 1.\n", - " if (cpu_label!=actual_label):\n", - " cpu_errors += 1.\n", + "actual_label = l.argmax(axis=1)\n", + "fpga_label = c.argmax(axis=1)\n", + "cpu_label = c_ref.argmax(axis=1)\n", + "\n", + "fpga_errors = np.sum(fpga_label != actual_label)\n", + "cpu_errors = np.sum(cpu_label != actual_label)\n", "\n", "# Report results\n", - "print(\"FPGA accuracy: {0:.2f}% validation error\".format(fpga_errors/BATCH*100))\n", - "print(\"CPU accuracy: {0:.2f}% validation error\".format(cpu_errors/BATCH*100))\n", - "if (cpu_time < fpga_time):\n", - " print(\"FPGA has a {0:.2f}x slowdown\".format(fpga_time/cpu_time))\n", + "print(\"FPGA accuracy: {0:.2f}% validation error\".format(fpga_errors / BATCH * 100))\n", + "print(\"CPU accuracy: {0:.2f}% validation error\".format(cpu_errors / BATCH * 100))\n", + "if cpu_time < fpga_time:\n", + " print(\"FPGA has a {0:.2f}x slowdown\".format(fpga_time / cpu_time))\n", "else:\n", - " print(\"FPGA has a {0:.2f}x speedup\".format(cpu_time/fpga_time))" + " print(\"FPGA has a {0:.2f}x speedup\".format(cpu_time / fpga_time))\n" ] }, { @@ -199,28 +153,21 @@ "def show(image):\n", " from matplotlib import pyplot\n", " import matplotlib as mpl\n", + "\n", " fig = pyplot.figure()\n", - " ax = fig.add_subplot(1,1,1)\n", + " ax = fig.add_subplot(1, 1, 1)\n", " imgplot = ax.imshow(image, cmap=mpl.cm.Greys)\n", - " imgplot.set_interpolation('nearest')\n", - " ax.xaxis.set_ticks_position('top')\n", - " ax.yaxis.set_ticks_position('left')\n", + " imgplot.set_interpolation(\"nearest\")\n", + " ax.xaxis.set_ticks_position(\"top\")\n", + " ax.yaxis.set_ticks_position(\"left\")\n", " pyplot.show()\n", "\n", + "\n", "# Inspect one of the hand digits classified by the FPGA\n", "idx = 1\n", - "show(i[idx].reshape(16,16))\n", + "show(i[idx].reshape(16, 16))\n", "print(\"Classified as {} by the FPGA\".format(np.argmax(c[idx])))" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/zynq/jupyter/classifier_2.ipynb b/zynq/jupyter/classifier_2.ipynb index 05389b5..f8e2f87 100644 --- a/zynq/jupyter/classifier_2.ipynb +++ b/zynq/jupyter/classifier_2.ipynb @@ -3,94 +3,49 @@ { "cell_type": "code", "execution_count": 
-  "metadata": {
-   "collapsed": false,
-   "deletable": true,
-   "editable": true
-  },
+  "metadata": {},
  "outputs": [],
  "source": [
+    "from time import time\n",
+    "\n",
    "import cffi\n",
    "import numpy as np\n",
-    "from pynq import MMIO\n",
-    "from pynq import Overlay\n",
-    "from pynq import PL\n",
-    "from pynq.drivers import DMA\n",
-    "from time import sleep, time\n",
+    "from pynq import MMIO, Overlay, allocate\n",
+    "\n",
+    "ffi = cffi.FFI()\n",
    "\n",
    "# Classifier Dimensions\n",
    "BATCH = 8192\n",
    "FEAT = 256\n",
-    "CLASSES = 10\n",
-    "\n",
-    "# Addresses\n",
-    "ACCEL_CTRL = 0x43C00000\n",
-    "AXI_DMA_0 = 0x40400000\n",
-    "AXI_DMA_1 = 0x40410000\n",
-    "AXI_TIMER = 0x42800000\n",
-    "\n",
-    "# C FFI\n",
-    "ffi = cffi.FFI()\n",
+    "CLASSES = 10"
   ]
  },
 {
  "cell_type": "code",
  "execution_count": null,
  "metadata": {},
  "outputs": [],
  "source": [
+    "# Prepare the custom overlay and DMA\n",
+    "ol = Overlay(\"/home/xilinx/classifier.bit\")\n",
+    "ol.download()\n",
    "\n",
-    "# DMA Configs\n",
-    "DMAConfig1 = {\n",
-    "    'DeviceId' : 0,\n",
-    "    'BaseAddr' : ffi.cast(\"uint32_t *\",AXI_DMA_0),\n",
-    "    'HasStsCntrlStrm' : 0,\n",
-    "    'HasMm2S' : 1,\n",
-    "    'HasMm2SDRE' : 1,\n",
-    "    'Mm2SDataWidth' : 64,\n",
-    "    'HasS2Mm' : 0,\n",
-    "    'HasS2MmDRE' : 0,\n",
-    "    'S2MmDataWidth' : 32,\n",
-    "    'HasSg' : 0,\n",
-    "    'Mm2sNumChannels' : 1,\n",
-    "    'S2MmNumChannels' : 1,\n",
-    "    'Mm2SBurstSize' : 256,\n",
-    "    'S2MmBurstSize' : 16,\n",
-    "    'MicroDmaMode' : 0,\n",
-    "    'AddrWidth' : 32\n",
-    "}\n",
-    "DMAConfig2 = {\n",
-    "    'DeviceId' : 1,\n",
-    "    'BaseAddr' : ffi.cast(\"uint32_t *\",AXI_DMA_1),\n",
-    "    'HasStsCntrlStrm' : 0,\n",
-    "    'HasMm2S' : 0,\n",
-    "    'HasMm2SDRE' : 0,\n",
-    "    'Mm2SDataWidth' : 32,\n",
-    "    'HasS2Mm' : 1,\n",
-    "    'HasS2MmDRE' : 1,\n",
-    "    'S2MmDataWidth' : 64,\n",
-    "    'HasSg' : 0,\n",
-    "    'Mm2sNumChannels' : 1,\n",
-    "    'S2MmNumChannels' : 1,\n",
-    "    'Mm2SBurstSize' : 16,\n",
-    "    'S2MmBurstSize' : 256,\n",
-    "    'MicroDmaMode' : 0,\n",
-    "    'AddrWidth' : 32\n",
-    "}\n",
+    "dma_mm2s = ol.axi_dma_0\n",
+    "dma_s2mm = ol.axi_dma_1\n",
+    "mm2s_buffer = allocate(shape=(CLASSES * 4 + CLASSES * FEAT + BATCH * FEAT,), dtype=np.uint8)\n",
+    "s2mm_buffer = allocate(shape=(BATCH * CLASSES,), dtype=np.int32)\n",
    "\n",
-    "# Download the custom overlay\n",
-    "ol = Overlay(\"classifier_fixed.bit\")\n",
-    "ol.download()\n",
+    "# Accelerator Base Address\n",
+    "ACCEL_CTRL = 0x43C00000\n",
    "\n",
    "# Initialize HLS IP\n",
-    "mmult_ip = MMIO(ACCEL_CTRL,0x10000)\n",
-    "# Start the accelerator\n",
-    "ctrl=mmult_ip.read(0x00)&0x08\n",
-    "mmult_ip.write(0x00, (ctrl|0x81))\n",
-    "ctrl=mmult_ip.read(0x00)\n",
-    "hex(ctrl)\n",
+    "mmult_ip = MMIO(ACCEL_CTRL, 0x10000)\n",
    "\n",
-    "# Initialize DMA1 (mem to FPGA)\n",
-    "dma1 = DMA(AXI_DMA_0, direction=0, attr_dict=DMAConfig1)\n",
-    "dma1.create_buf((CLASSES*4+CLASSES*FEAT+BATCH*FEAT), cacheable=0)\n",
-    "\n",
-    "# Initialize DMA2 (FPGA to mem)\n",
-    "dma2 = DMA(AXI_DMA_1, direction=1, attr_dict=DMAConfig2)\n",
-    "dma2.create_buf(BATCH*CLASSES*4, cacheable=0)\n",
-    "# Start DMA transfer from FPGA to memory\n",
-    "dma2.transfer(BATCH*CLASSES*4, direction=1)"
+    "# Start the accelerator\n",
+    "ctrl = mmult_ip.read(0x00) & 0x08\n",
+    "mmult_ip.write(0x00, (ctrl | 0x81))\n",
+    "ctrl = mmult_ip.read(0x00)\n",
+    "hex(ctrl)\n"
   ]
  },
 {
@@ -104,10 +59,10 @@
   "outputs": [],
   "source": [
    "# Initialize offsets, weights and inputs\n",
-    "o = np.load('model_offsets_fixed.npy').astype(np.int32)\n",
-    "w = np.load('model_weights_fixed.npy').astype(np.int8)\n",
-    "i = np.load('test_data.npy').astype(np.uint8)[0:BATCH]\n",
-    "l = np.load('test_labels.npy').astype(np.int32)[0:BATCH]"
+    "o = np.load(\"model_offsets_fixed.npy\").astype(np.int32)\n",
+    "w = np.load(\"model_weights_fixed.npy\").astype(np.int8)\n",
+    "i = np.load(\"test_data.npy\").astype(np.uint8)[0:BATCH]\n",
+    "l = np.load(\"test_labels.npy\").astype(np.int32)[0:BATCH]"
   ]
  },
 {
@@ -121,20 +76,21 @@
   "outputs": [],
   "source": [
    "# Move offset, weight and input data to DMA buffer\n",
-    "ffi.memmove(dma1.get_buf(), ffi.cast(\"uint32_t *\", o.ctypes.data), CLASSES*4)\n",
-    "ffi.memmove(dma1.get_buf()+CLASSES, ffi.cast(\"uint32_t *\", w.ctypes.data), CLASSES*FEAT)\n",
-    "ffi.memmove(dma1.get_buf()+CLASSES+(CLASSES*FEAT)//4, ffi.cast(\"uint32_t *\", i.ctypes.data), BATCH*FEAT)\n",
+    "ffi.memmove(mm2s_buffer[:], ffi.cast(\"uint8_t *\", o.ctypes.data), CLASSES * 4)\n",
+    "ffi.memmove(mm2s_buffer[CLASSES * 4 :], ffi.cast(\"uint8_t *\", w.ctypes.data), CLASSES * FEAT)\n",
+    "ffi.memmove(mm2s_buffer[CLASSES * 4 + CLASSES * FEAT :], ffi.cast(\"uint8_t *\", i.ctypes.data), BATCH * FEAT)\n",
+    "mm2s_buffer.flush()\n",
    "\n",
    "# Perform FPGA offloading\n",
    "start_t = time()\n",
-    "dma1.transfer(CLASSES*4+CLASSES*FEAT+BATCH*FEAT, direction=0)\n",
-    "dma2.wait()\n",
-    "fpga_time = time()-start_t\n",
+    "dma_mm2s.sendchannel.transfer(mm2s_buffer)\n",
+    "dma_s2mm.recvchannel.transfer(s2mm_buffer)\n",
+    "dma_mm2s.sendchannel.wait()\n",
+    "dma_s2mm.recvchannel.wait()\n",
+    "fpga_time = time() - start_t\n",
    "\n",
    "# Dump FPGA result to a numpy array\n",
-    "c = np.frombuffer(ffi.buffer(\n",
-    "    dma2.get_buf(),BATCH*CLASSES*4),\n",
-    "    dtype=np.int32).reshape(BATCH,CLASSES)"
+    "c = np.array(s2mm_buffer).reshape(BATCH, CLASSES)\n"
   ]
  },
 {
@@ -148,14 +104,14 @@
   "outputs": [],
   "source": [
    "# Prepare input and weight matrices for matrix multiplication on CPU\n",
-    "ones = np.ones(BATCH).reshape((BATCH,1))\n",
+    "ones = np.ones(BATCH).reshape((BATCH, 1))\n",
    "i_p = np.append(ones, i, axis=1)\n",
-    "w_p = np.append(o.reshape(CLASSES,1), w, axis=1)\n",
+    "w_p = np.append(o.reshape(CLASSES, 1), w, axis=1)\n",
    "\n",
    "# Compute CPU result\n",
    "start_t = time()\n",
-    "c_ref = np.dot(i_p,w_p.T)\n",
-    "cpu_time = time()-start_t"
+    "c_ref = np.dot(i_p, w_p.T)\n",
+    "cpu_time = time() - start_t"
   ]
  },
 {
@@ -169,24 +125,20 @@
   "outputs": [],
   "source": [
    "# Evaluate validation accuracy\n",
-    "cpu_errors = 0\n",
-    "fpga_errors = 0\n",
-    "for idx in range(BATCH):\n",
-    "    fpga_label = np.argmax(c[idx])\n",
-    "    cpu_label = np.argmax(c_ref[idx])\n",
-    "    actual_label = np.argmax(l[idx])\n",
-    "    if (fpga_label!=actual_label):\n",
-    "        fpga_errors += 1.\n",
-    "    if (cpu_label!=actual_label):\n",
-    "        cpu_errors += 1.\n",
+    "actual_label = l.argmax(axis=1)\n",
+    "fpga_label = c.argmax(axis=1)\n",
+    "cpu_label = c_ref.argmax(axis=1)\n",
+    "\n",
+    "fpga_errors = np.sum(fpga_label != actual_label)\n",
+    "cpu_errors = np.sum(cpu_label != actual_label)\n",
    "\n",
    "# Report results\n",
-    "print(\"FPGA accuracy: {0:.2f}% validation error\".format(fpga_errors/BATCH*100))\n",
-    "print(\"CPU accuracy: {0:.2f}% validation error\".format(cpu_errors/BATCH*100))\n",
-    "if (cpu_time < fpga_time):\n",
-    "    print(\"FPGA has a {0:.2f}x slowdown\".format(fpga_time/cpu_time))\n",
+    "print(\"FPGA accuracy: {0:.2f}% validation error\".format(fpga_errors / BATCH * 100))\n",
+    "print(\"CPU accuracy: {0:.2f}% validation error\".format(cpu_errors / BATCH * 100))\n",
+    "if cpu_time < fpga_time:\n",
+    "    print(\"FPGA has a {0:.2f}x slowdown\".format(fpga_time / cpu_time))\n",
"else:\n", - " print(\"FPGA has a {0:.2f}x speedup\".format(cpu_time/fpga_time))" + " print(\"FPGA has a {0:.2f}x speedup\".format(cpu_time / fpga_time))\n" ] }, { @@ -203,17 +155,19 @@ "def show(image):\n", " from matplotlib import pyplot\n", " import matplotlib as mpl\n", + "\n", " fig = pyplot.figure()\n", - " ax = fig.add_subplot(1,1,1)\n", + " ax = fig.add_subplot(1, 1, 1)\n", " imgplot = ax.imshow(image, cmap=mpl.cm.Greys)\n", - " imgplot.set_interpolation('nearest')\n", - " ax.xaxis.set_ticks_position('top')\n", - " ax.yaxis.set_ticks_position('left')\n", + " imgplot.set_interpolation(\"nearest\")\n", + " ax.xaxis.set_ticks_position(\"top\")\n", + " ax.yaxis.set_ticks_position(\"left\")\n", " pyplot.show()\n", "\n", + "\n", "# Inspect one of the hand digits classified by the FPGA\n", "idx = 1\n", - "show(i[idx].reshape(16,16))\n", + "show(i[idx].reshape(16, 16))\n", "print(\"Classified as {} by the FPGA\".format(np.argmax(c[idx])))" ] } diff --git a/zynq/python/.gitignore b/zynq/python/.gitignore new file mode 100644 index 0000000..6bd7070 --- /dev/null +++ b/zynq/python/.gitignore @@ -0,0 +1,2 @@ +.env +mnist \ No newline at end of file diff --git a/zynq/python/mnist.py b/zynq/python/mnist.py index c01f7fa..bba3592 100644 --- a/zynq/python/mnist.py +++ b/zynq/python/mnist.py @@ -1,51 +1,54 @@ -import os import argparse import struct -import random +from pathlib import Path + import numpy as np +from skimage.transform import resize from sklearn import linear_model -from scipy.misc import imresize # File names -TRAIN_DAT = 'train-images-idx3-ubyte' -TRAIN_LAB = 'train-labels-idx1-ubyte' -TEST_DAT = 't10k-images-idx3-ubyte' -TEST_LAB = 't10k-labels-idx1-ubyte' +TRAIN_DAT = "train-images-idx3-ubyte" +TRAIN_LAB = "train-labels-idx1-ubyte" +TEST_DAT = "t10k-images-idx3-ubyte" +TEST_LAB = "t10k-labels-idx1-ubyte" + def show(image): """ Render a given numpy.uint8 2D array of pixel data. """ - from matplotlib import pyplot import matplotlib as mpl + from matplotlib import pyplot + fig = pyplot.figure() - ax = fig.add_subplot(1,1,1) + ax = fig.add_subplot(1, 1, 1) imgplot = ax.imshow(image, cmap=mpl.cm.Greys) - imgplot.set_interpolation('nearest') - ax.xaxis.set_ticks_position('top') - ax.yaxis.set_ticks_position('left') + imgplot.set_interpolation("nearest") + ax.xaxis.set_ticks_position("top") + ax.yaxis.set_ticks_position("left") pyplot.show() + def download(args): """ Downloads the MNIST dataset into the specified dir. 
 
 def getIterator(args, mode):
     """
@@ -53,23 +56,23 @@
     source: https://gist.github.com/akesling/5358964
     """
 
-    fname_img = os.path.join(args.data_dir, TEST_DAT if mode=='test' else TRAIN_DAT)
-    fname_lbl = os.path.join(args.data_dir, TEST_LAB if mode=='test' else TRAIN_LAB)
+    fname_img = Path(args.data_dir) / (TEST_DAT if mode == "test" else TRAIN_DAT)
+    fname_lbl = Path(args.data_dir) / (TEST_LAB if mode == "test" else TRAIN_LAB)
 
     # Access label and data from bit files
-    with open(fname_lbl, 'rb') as flbl:
-        magic, num = struct.unpack(">II", flbl.read(8))
+    with open(fname_lbl, "rb") as flbl:
+        _, _ = struct.unpack(">II", flbl.read(8))
         lbl = np.fromfile(flbl, dtype=np.int8)
-    with open(fname_img, 'rb') as fimg:
-        magic, num, rows, cols = struct.unpack(">IIII", fimg.read(16))
-        img = np.fromfile(fimg, dtype=np.uint8).reshape(len(lbl), rows, cols)
-
-    # Format tuple: (label, data)
-    get_img = lambda idx: (lbl[idx], img[idx])
-    # Create an iterator which returns each image in turn
-    for i in xrange(len(lbl)):
-        yield get_img(i)
+    with open(fname_img, "rb") as fimg:
+        _, num, rows, cols = struct.unpack(">IIII", fimg.read(16))
+        img = np.fromfile(fimg, dtype=np.uint8).reshape(num, rows, cols)
+
+    assert num == len(lbl)
+
+    for pair in zip(lbl, img):
+        yield pair
+
 
 def getDataSet(args, mode):
     """
@@ -78,23 +81,17 @@
 
     # Download MNIST dataset if it hasn't been already downloaded
     download(args)
-    # Process the raw data
-    mnistData = getIterator(args, mode)
 
     # Data and labels
     data = []
     labels = []
-    # Tracks positive and negative samples
-    (pos, neg) = (0, 0)
 
     # Iterate until we have enough samples
-    for t in mnistData:
-        lab = t[0]
-        img = t[1]
+    for lab, img in getIterator(args, mode):
         # Resize the image
-        img = imresize(img, (args.dim, args.dim), interp='bilinear')
+        img = resize(img, (args.dim, args.dim), preserve_range=True)
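+        # Unlike scipy's imresize (which returned uint8), skimage's resize
+        # returns float64; preserve_range=True keeps the original 0-255
+        # pixel scale instead of rescaling to [0, 1].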
         # Reshape
-        datum = np.divide(img.reshape((args.dim*args.dim,)), 1)
+        datum = np.divide(img.reshape((args.dim * args.dim,)), 1)
         # Prepare the labels (one-hot encoded)
         label = np.zeros(10)
         label[lab] = 1.0
@@ -103,7 +100,7 @@
             # Display the image
             show(img)
             # Print label
-            print 'Label: {}'.format(lab)
+            print("Label: {}".format(lab))
 
         data.append(datum)
         labels.append(label)
@@ -112,24 +109,21 @@
 
 def parse_args():
-    parser = argparse.ArgumentParser(description='produce synthesis constraints from mnist training data')
-    parser.add_argument('--data-dir', type=str, default='mnist/',
-                        help='the input data directory')
-    parser.add_argument('--num-examples', type=int, default=8,
-                        help='the number of training examples')
-    parser.add_argument('--dim', type=int, default=16,
-                        help='height and width of mnist dataset to resize to')
-    parser.add_argument('--debug', action='store_true',
-                        help='debug mode')
+    parser = argparse.ArgumentParser(description="produce synthesis constraints from mnist training data")
+    parser.add_argument("--data-dir", type=str, default="mnist/", help="the input data directory")
+    parser.add_argument("--num-examples", type=int, default=8, help="the number of training examples")
+    parser.add_argument("--dim", type=int, default=16, help="height and width of mnist dataset to resize to")
+    parser.add_argument("--debug", action="store_true", help="debug mode")
     return parser.parse_args()
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     args = parse_args()
 
     # Extract the training dataset
-    train_data, train_labels = getDataSet(args, 'train')
+    train_data, train_labels = getDataSet(args, "train")
     # Extract the training dataset
-    test_data, test_labels = getDataSet(args, 'test')
+    test_data, test_labels = getDataSet(args, "test")
 
     # Linear regression
     reg = linear_model.Ridge()
@@ -139,45 +133,36 @@ def parse_args():
     float_labels = reg.predict(test_data)
 
     # Fixed point computation
-    # CSE 548: Todo: tweak the SCALE to get less than 20% classification error
+    # CSE 548: TODO: tweak the SCALE to get less than 20% classification error
     SCALE = 0 # CSE 548 - Change me
     offset = reg.intercept_
     weight = reg.coef_
-    offset = np.clip(offset*SCALE, -128, 127)
+    offset = np.clip(offset * SCALE, -128, 127)
     offset = offset.astype(np.int32)
-    weight = np.clip(weight*SCALE, -128, 127)
+    weight = np.clip(weight * SCALE, -128, 127)
     weight = weight.astype(np.int8)
 
     # Perform fixed-point classification
-    ones = np.ones(len(test_data)).reshape((len(test_data),1))
+    ones = np.ones(len(test_data)).reshape((len(test_data), 1))
     i_p = np.append(ones, test_data, axis=1)
-    w_p = np.append(offset.reshape(10,1), weight, axis=1)
+    w_p = np.append(offset.reshape(10, 1), weight, axis=1)
     fixed_labels = np.dot(i_p, w_p.T)
 
     # Measure Validation Errors
-    float_errors = 0
-    for idx, label in enumerate(test_labels):
-        guess_label = np.argmax(float_labels[idx])
-        actual_label = np.argmax(label)
-        if (guess_label!=actual_label):
-            float_errors += 1.
-    fixed_errors = 0
-    for idx, label in enumerate(test_labels):
-        guess_label = np.argmax(fixed_labels[idx])
-        actual_label = np.argmax(label)
-        if (guess_label!=actual_label):
-            fixed_errors += 1.
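+    # Vectorized error counting: argmax over axis 1 gives each row's
+    # predicted class, and comparing it with the one-hot labels' argmax
+    # counts mismatches without the per-sample Python loops.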
+    actual_label = test_labels.argmax(axis=1)
+    float_errors = (actual_label != float_labels.argmax(axis=1)).sum()
+    fixed_errors = (actual_label != fixed_labels.argmax(axis=1)).sum()
 
     # Produce stats
-    print 'Min/Max of coefficient values [{}, {}]'.format(reg.coef_.min(), reg.coef_.max())
-    print 'Min/Max of intersect values [{}, {}]'.format(reg.intercept_.min(),reg.intercept_.max())
-    print 'Misclassifications (float) = {0:.2f}%'.format(float_errors/len(test_labels)*100)
-    print 'Misclassifications (fixed) = {0:.2f}%'.format(fixed_errors/len(test_labels)*100)
+    print("Min/Max of coefficient values [{}, {}]".format(reg.coef_.min(), reg.coef_.max()))
+    print("Min/Max of intersect values [{}, {}]".format(reg.intercept_.min(), reg.intercept_.max()))
+    print("Misclassifications (float) = {0:.2f}%".format(float_errors / len(test_labels) * 100))
+    print("Misclassifications (fixed) = {0:.2f}%".format(fixed_errors / len(test_labels) * 100))
 
     # Dump the model and test data
-    np.save('test_data', test_data)
-    np.save('test_labels', test_labels)
-    np.save('model_weights', reg.coef_)
-    np.save('model_offsets', reg.intercept_)
-    np.save('model_weights_fixed', weight)
-    np.save('model_offsets_fixed', offset)
+    np.save("test_data", test_data)
+    np.save("test_labels", test_labels)
+    np.save("model_weights", reg.coef_)
+    np.save("model_offsets", reg.intercept_)
+    np.save("model_weights_fixed", weight)
+    np.save("model_offsets_fixed", offset)
diff --git a/zynq/python/requirements.txt b/zynq/python/requirements.txt
new file mode 100644
index 0000000..7a3ad5a
--- /dev/null
+++ b/zynq/python/requirements.txt
@@ -0,0 +1,4 @@
+matplotlib==3.5.2
+numpy==1.22.3
+scikit-image==0.19.2
+scikit-learn==1.1.0
diff --git a/zynq/tcl/classifier.tcl b/zynq/tcl/classifier.tcl
index 38fc119..59a9ffc 100644
--- a/zynq/tcl/classifier.tcl
+++ b/zynq/tcl/classifier.tcl
@@ -9,7 +9,7 @@
 ################################################################
 
 # Check if script is running in correct Vivado version.
-set scripts_vivado_version 2017.1
+set scripts_vivado_version 2020.2
 set current_vivado_version [version -short]
 
 if { [string first $scripts_vivado_version $current_vivado_version] == -1 } {
@@ -89,7 +89,7 @@ if { ${design_name} eq "" } {
    set errMsg "Design <$design_name> already exists in your project, please set the variable to another value."
    set nRet 1
 } elseif { [get_files -quiet ${design_name}.bd] ne "" } {
-   # USE CASES: 
+   # USE CASES:
    #  6) Current opened design, has components, but diff names, design_name exists in project.
    #  7) No opened design, design_name exists in project.
 
@@ -1588,22 +1588,17 @@ create_root_design ""
 # tmoreau89 BEGIN
 ################################################################
 
+validate_bd_design
+
 # Create top-level wrapper file
 make_wrapper -files [get_files $proj_path/$proj_name.srcs/sources_1/bd/$proj_name/$proj_name.bd] -top
-add_files -norecurse $proj_path/$proj_name.srcs/sources_1/bd/$proj_name/hdl/${proj_name}_wrapper.v
-update_compile_order -fileset sources_1
-update_compile_order -fileset sim_1
+add_files -norecurse $proj_path/$proj_name.gen/sources_1/bd/$proj_name/hdl/${proj_name}_wrapper.v
 
 # Run bistream generation on 4 threads
 launch_runs impl_1 -to_step write_bitstream -jobs 4
-wait_on_run impl_1 
+wait_on_run impl_1
 
 puts "Implementation done!"
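+
+# The bitstream is left in $proj_path/$proj_name.runs/impl_1/${proj_name}_wrapper.bit;
+# copy it to the PYNQ board as classifier.bit.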
-# Export hardware description file and bitstream files to export/ dir
-file mkdir $proj_path/export
-file copy -force $proj_path/$proj_name.runs/impl_1/${proj_name}_wrapper.sysdef $proj_path/export/classifier.hdf
-file copy -force $proj_path/$proj_name.runs/impl_1/${proj_name}_wrapper.bit $proj_path/export/classifier.bit
-
 exit
 
 ################################################################
diff --git a/zynq/tcl/hls.tcl b/zynq/tcl/hls.tcl
index aa9a5f0..075e6fa 100755
--- a/zynq/tcl/hls.tcl
+++ b/zynq/tcl/hls.tcl
@@ -8,6 +8,11 @@ add_files -tb $src_dir/mmult_test.cpp
 open_solution "solution0"
 set_part {xc7z020clg484-1}
 create_clock -period 10 -name default
+set_clock_uncertainty 12.5%
+
+config_compile -pipeline_loops 0
+config_export -vivado_phys_opt place
+config_export -vivado_optimization_level 2
 csim_design -clean
 csynth_design
 export_design -evaluate verilog -format ip_catalog
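+
+# Directive notes (Vitis HLS 2020.2): set_clock_uncertainty 12.5% reserves
+# timing margin at synthesis; config_compile -pipeline_loops 0 disables the
+# automatic pipelining of small loops so only explicitly pragma'd loops are
+# pipelined; the two config_export options enable Vivado physical
+# optimization and optimization level 2 when export_design evaluates the RTL.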