diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml
index 57e3d54952..91433f3093 100644
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -4,5 +4,5 @@ contact_links:
url: https://finn.readthedocs.io/en/latest/getting_started.html
about: Documentation about how to get up and running with FINN.
- name: Ask for help and get in touch with the community
- url: https://gitter.im/xilinx-finn/community
- about: Check out our gitter channel, if you have a question about FINN or a general problem that is likely not a bug.
+ url: https://github.com/Xilinx/finn/discussions
+ about: Check out our GitHub Discussions, if you have a question about FINN or a general problem that is likely not a bug.
diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml
index 00c25a4a31..f9a251a8c7 100644
--- a/.github/workflows/docker-image.yml
+++ b/.github/workflows/docker-image.yml
@@ -1,8 +1,6 @@
name: DockerImage
on:
- pull_request:
- branches: [ dev ]
push:
branches: [ dev ]
diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml
index 5f03379bbc..011ccebadc 100644
--- a/.github/workflows/pre-commit.yml
+++ b/.github/workflows/pre-commit.yml
@@ -18,7 +18,7 @@ jobs:
- name: Setup Python
uses: actions/setup-python@v4
with:
- python-version: '3.8'
+ python-version: '3.10'
- name: Run Lint
uses: pre-commit/action@v3.0.0
diff --git a/.isort.cfg b/.isort.cfg
index 6cfe1c8919..5378b88fad 100644
--- a/.isort.cfg
+++ b/.isort.cfg
@@ -2,7 +2,7 @@
line_length=88
indent=' '
skip=.tox,.venv,build,dist
-known_standard_library=setuptools,pkg_resources
+known_standard_library=setuptools
known_test=pytest
known_first_party=finn
sections=FUTURE,STDLIB,TEST,THIRDPARTY,FIRSTPARTY,LOCALFOLDER
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 126a4ac4b2..72a9688505 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -29,11 +29,11 @@
exclude: '^docs/conf.py'
default_language_version:
- python: python3.8
+ python: python3.10
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
- rev: v4.2.0
+ rev: v4.4.0
hooks:
- id: trailing-whitespace
exclude: '\.dat$'
@@ -56,15 +56,16 @@ repos:
- id: isort
- repo: https://github.com/psf/black
- rev: 22.3.0
+ rev: 23.3.0
hooks:
- id: black
language_version: python3
+ args: [--line-length=100]
- repo: https://github.com/PyCQA/flake8
- rev: 3.9.2
+ rev: 6.0.0
hooks:
- id: flake8
# black-compatible flake-8 config
- args: ['--max-line-length=88', # black default
+ args: ['--max-line-length=100', # keep in sync with black's --line-length=100
'--extend-ignore=E203'] # E203 is not PEP8 compliant
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index 478957be11..575a60c69d 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -1,4 +1,5 @@
-# Copyright (c) 2021, Xilinx
+# Copyright (c) 2021-2022, Xilinx
+# Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -31,13 +32,15 @@
version: 2
+# Set the version of Python and other tools you might need
+build:
+ os: ubuntu-22.04
+ tools:
+ python: "3.8"
+
sphinx:
configuration: docs/finn/conf.py
python:
- version: 3.8
install:
- - method: pip
- path: .
- extra_requirements:
- - docs
+ - requirements: docs/requirements.txt
diff --git a/AUTHORS.rst b/AUTHORS.rst
index 861b81924b..5a11497fc8 100644
--- a/AUTHORS.rst
+++ b/AUTHORS.rst
@@ -28,3 +28,9 @@ Contributors
* Matthias Gehre (@mgehre-amd)
* Hugo Le Blevec (@hleblevec)
* Patrick Geel (@patrickgeel)
+* John Monks (@jmonks-amd)
+* Tim Paine (@timkpaine)
+* Linus Jungemann (@LinusJungemann)
+* Shashwat Khandelwal (@shashwat1198)
+* Ian Colbert (@i-colbert)
+* Rachit Garg (@rstar900)
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
deleted file mode 100644
index 226e6f5931..0000000000
--- a/CHANGELOG.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-=========
-Changelog
-=========
-
-Version 0.1
-===========
-
-- Feature A added
-- FIX: nasty bug #1729 fixed
-- add your changes here!
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index d376a1b42b..5e34624790 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -29,6 +29,60 @@ Please follow the steps below and be sure that your contribution complies with o
1. The main branch should always be treated as stable and clean. Only hot fixes are allowed to be pull-requested. The hot fix is supposed to be very important such that without this fix, a lot of things will break.
2. For new features, smaller bug fixes, doc updates, and many other fixes, users should pull request against the development branch.
-3. We will review your contribution and, if any additional fixes or modifications are
+3. Sign Your Work
+
+Please use the *Signed-off-by* line at the end of your patch which indicates that you accept the Developer Certificate of Origin (DCO) defined by https://developercertificate.org/ reproduced below:
+
+```
+ Developer Certificate of Origin
+ Version 1.1
+
+ Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
+ 1 Letterman Drive
+ Suite D4700
+ San Francisco, CA, 94129
+
+ Everyone is permitted to copy and distribute verbatim copies of this
+ license document, but changing it is not allowed.
+
+
+ Developer's Certificate of Origin 1.1
+
+ By making a contribution to this project, I certify that:
+
+ (a) The contribution was created in whole or in part by me and I
+ have the right to submit it under the open source license
+ indicated in the file; or
+
+ (b) The contribution is based upon previous work that, to the best
+ of my knowledge, is covered under an appropriate open source
+ license and I have the right under that license to submit that
+ work with modifications, whether created in whole or in part
+ by me, under the same open source license (unless I am
+ permitted to submit under a different license), as indicated
+ in the file; or
+
+ (c) The contribution was provided directly to me by some other
+ person who certified (a), (b) or (c) and I have not modified
+ it.
+
+ (d) I understand and agree that this project and the contribution
+ are public and that a record of the contribution (including all
+ personal information I submit with it, including my sign-off) is
+ maintained indefinitely and may be redistributed consistent with
+ this project or the open source license(s) involved.
+```
+
+You can enable Signed-off-by automatically by adding the `-s` flag to the `git commit` command.
+
+Here is an example Signed-off-by line which indicates that the contributor accepts DCO:
+
+```
+ This is my commit message
+
+ Signed-off-by: Jane Doe <jane.doe@example.com>
+```
+
+4. We will review your contribution and, if any additional fixes or modifications are
necessary, may provide feedback to guide you. When accepted, your pull request will
be merged to the repository. If you have more questions please contact us.
diff --git a/LICENSE.txt b/LICENSE.txt
index 278564a5a4..cec78d6043 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -1,4 +1,5 @@
-Copyright (c) 2020, Xilinx
+Copyright (C) 2020-2022, Xilinx, Inc.
+Copyright (C) 2022-2024, Advanced Micro Devices, Inc.
All rights reserved.
Redistribution and use in source and binary forms, with or without
diff --git a/README.md b/README.md
index 2e1faf8f0c..0856701908 100644
--- a/README.md
+++ b/README.md
@@ -2,13 +2,12 @@
-
+
[](https://github.com/Xilinx/finn/discussions)
[](http://finn.readthedocs.io/)
-FINN is an experimental framework from Xilinx Research Labs to explore deep neural network
-inference on FPGAs.
+FINN is an experimental framework from the Integrated Communications and AI Lab of AMD Research & Advanced Development to explore deep neural network inference on FPGAs.
It specifically targets quantized neural
networks, with emphasis on
generating dataflow-style architectures customized for each network.
@@ -28,7 +27,7 @@ Please see the [Getting Started](https://finn.readthedocs.io/en/latest/getting_s
## Documentation
-You can view the documentation on [readthedocs](https://finn.readthedocs.io) or build them locally using `python setup.py doc` from inside the Docker container. Additionally, there is a series of [Jupyter notebook tutorials](https://github.com/Xilinx/finn/tree/main/notebooks), which we recommend running from inside Docker for a better experience.
+You can view the documentation on [readthedocs](https://finn.readthedocs.io). Additionally, there is a series of [Jupyter notebook tutorials](https://github.com/Xilinx/finn/tree/main/notebooks), which we recommend running from inside Docker for a better experience.
## Community
diff --git a/docker/Dockerfile.finn b/docker/Dockerfile.finn
index dbafba2476..2ceb1f4195 100644
--- a/docker/Dockerfile.finn
+++ b/docker/Dockerfile.finn
@@ -1,4 +1,5 @@
-# Copyright (c) 2021, Xilinx
+# Copyright (C) 2021-2022, Xilinx, Inc.
+# Copyright (C) 2022-2024, Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -26,10 +27,10 @@
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-FROM pytorch/pytorch:1.7.1-cuda11.0-cudnn8-runtime
-LABEL maintainer="Yaman Umuroglu "
+FROM ubuntu:jammy-20230126
+LABEL maintainer="Jakoba Petri-Koenig , Yaman Umuroglu "
-ARG XRT_DEB_VERSION="xrt_202210.2.13.466_18.04-amd64-xrt"
+ARG XRT_DEB_VERSION="xrt_202220.2.14.354_22.04-amd64-xrt"
WORKDIR /workspace
@@ -57,12 +58,17 @@ RUN apt-get update && \
unzip \
zip \
locales \
- lsb-core
+ lsb-core \
+ python3 \
+ python-is-python3 \
+ python3-pip \
+ python3-setuptools-scm \
+ python3-venv
RUN echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config
RUN locale-gen "en_US.UTF-8"
# install Verilator from source to get the right version
-RUN apt-get install -y git perl python3 make autoconf g++ flex bison ccache libgoogle-perftools-dev numactl perl-doc libfl2 libfl-dev zlibc zlib1g zlib1g-dev
+RUN apt-get install -y git perl make autoconf g++ flex bison ccache libgoogle-perftools-dev numactl perl-doc libfl2 libfl-dev zlib1g zlib1g-dev
RUN git clone https://github.com/verilator/verilator
RUN cd verilator && \
git checkout v4.224 && \
@@ -81,22 +87,31 @@ RUN rm /tmp/$XRT_DEB_VERSION.deb
COPY requirements.txt .
RUN pip install -r requirements.txt
RUN rm requirements.txt
+
+# install PyTorch
+RUN pip install torch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116
+
# extra Python package dependencies (for testing and interaction)
-RUN pip install pygments==2.4.1
-RUN pip install ipykernel==5.5.5
+RUN pip install pygments==2.14.0
+RUN pip install ipykernel==6.21.2
RUN pip install jupyter==1.0.0 --ignore-installed
RUN pip install markupsafe==2.0.1
-RUN pip install matplotlib==3.3.1 --ignore-installed
+RUN pip install matplotlib==3.7.0 --ignore-installed
RUN pip install pytest-dependency==0.5.1
-RUN pip install sphinx==5.0.2
-RUN pip install sphinx_rtd_theme==0.5.0
-RUN pip install pytest-xdist[setproctitle]==2.4.0
-RUN pip install pytest-parallel==0.1.0
+RUN pip install pytest-xdist[setproctitle]==3.2.0
+RUN pip install pytest-parallel==0.1.1
RUN pip install "netron>=5.0.0"
-RUN pip install pandas==1.1.5
-RUN pip install scikit-learn==0.24.1
-RUN pip install tqdm==4.31.1
+RUN pip install pandas==1.5.3
+RUN pip install scikit-learn==1.2.1
+RUN pip install tqdm==4.64.1
RUN pip install -e git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading
+# these versions of pytest and associated plugins allow for stable collection of
+# test reports and code coverage reports in HTML
+RUN pip install pytest==6.2.5
+RUN pip install pytest-metadata==1.7.0
+RUN pip install pytest-html==3.0.0
+RUN pip install pytest-html-merger==0.0.8
+RUN pip install pytest-cov==4.1.0
# extra dependencies from other FINN deps
# installed in Docker image to make entrypoint script go faster
diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh
index b5c702111a..61c8f78665 100644
--- a/docker/finn_entrypoint.sh
+++ b/docker/finn_entrypoint.sh
@@ -54,8 +54,11 @@ recho () {
echo -e "${RED}ERROR: $1${NC}"
}
-# qonnx
+# qonnx (using workaround for https://github.com/pypa/pip/issues/7953)
+# to be fixed in future Ubuntu versions (https://bugs.launchpad.net/ubuntu/+source/setuptools/+bug/1994016)
+mv ${FINN_ROOT}/deps/qonnx/pyproject.toml ${FINN_ROOT}/deps/qonnx/pyproject.tmp
pip install --user -e ${FINN_ROOT}/deps/qonnx
+mv ${FINN_ROOT}/deps/qonnx/pyproject.tmp ${FINN_ROOT}/deps/qonnx/pyproject.toml
# finn-experimental
pip install --user -e ${FINN_ROOT}/deps/finn-experimental
# brevitas
@@ -109,10 +112,31 @@ if [ -f "$HLS_PATH/settings64.sh" ];then
else
yecho "Unable to find $HLS_PATH/settings64.sh"
yecho "Functionality dependent on Vitis HLS will not be available."
- yecho "Please note that FINN needs at least version 2020.2 for Vitis HLS support."
+ yecho "Please note that FINN needs at least version 2020.2 for Vitis HLS support. Our recommendation is to use version 2022.2"
yecho "If you need Vitis HLS, ensure HLS_PATH is set correctly and mounted into the Docker container."
fi
+if [ -d "$FINN_ROOT/.Xilinx" ]; then # copy optional Tcl init scripts from $FINN_ROOT/.Xilinx into $HOME/.Xilinx
+ mkdir -p "$HOME/.Xilinx" # -p: succeed even if the directory already exists
+ if [ -f "$FINN_ROOT/.Xilinx/HLS_init.tcl" ]; then
+ cp "$FINN_ROOT/.Xilinx/HLS_init.tcl" "$HOME/.Xilinx/"
+ gecho "Found HLS_init.tcl and copied to $HOME/.Xilinx/HLS_init.tcl"
+ else
+ yecho "Unable to find $FINN_ROOT/.Xilinx/HLS_init.tcl"
+ fi
+
+ if [ -f "$FINN_ROOT/.Xilinx/Vivado/Vivado_init.tcl" ]; then
+ mkdir -p "$HOME/.Xilinx/Vivado/" # -p: succeed even if the directory already exists
+ cp "$FINN_ROOT/.Xilinx/Vivado/Vivado_init.tcl" "$HOME/.Xilinx/Vivado/"
+ gecho "Found Vivado_init.tcl and copied to $HOME/.Xilinx/Vivado/Vivado_init.tcl"
+ else
+ yecho "Unable to find $FINN_ROOT/.Xilinx/Vivado/Vivado_init.tcl"
+ fi
+else
+ echo "If you need to enable a beta device, ensure .Xilinx/HLS_init.tcl and/or .Xilinx/Vivado/Vivado_init.tcl are set correctly and mounted"
+ echo "See https://docs.xilinx.com/r/en-US/ug835-vivado-tcl-commands/Tcl-Initialization-Scripts"
+fi
+
export PATH=$PATH:$HOME/.local/bin
# execute the provided command(s) as root
exec "$@"
diff --git a/docker/jenkins/Jenkinsfile b/docker/jenkins/Jenkinsfile
index e3e5b5f7f9..6d51fffd64 100644
--- a/docker/jenkins/Jenkinsfile
+++ b/docker/jenkins/Jenkinsfile
@@ -1,46 +1,355 @@
-node {
- def app
- stage('Clone repository') {
- /* Let's make sure we have the repository cloned to our workspace */
- checkout scm
- }
- withEnv([
- "FINN_XILINX_PATH=/proj/xbuilds/SWIP/2022.1_0420_0327/installs/lin64",
- "FINN_XILINX_VERSION=2022.1",
- "FINN_DOCKER_TAG=xilinx/finn:jenkins",
- "FINN_HOST_BUILD_DIR=/scratch/users/finn_ci",
- "PLATFORM_REPO_PATHS=/opt/xilinx/platforms"
- ]){
- parallel firstBranch: {
- stage('Brevitas export') {
- dir("${env.WORKSPACE}") {
- sh("bash run-docker.sh python setup.py test --addopts -mbrevitas_export")
- }
- }
- }, secondBranch: {
- stage('Streamlining transformations') {
- dir("${env.WORKSPACE}") {
- sh("bash run-docker.sh python setup.py test --addopts -mstreamline")
- }
- }
- }, thirdBranch: {
- stage('Util functions') {
- dir("${env.WORKSPACE}") {
- sh("bash run-docker.sh python setup.py test --addopts -mutil")
- }
- }
- }, fourthBranch: {
- stage('General transformations') {
- dir("${env.WORKSPACE}") {
- sh("bash run-docker.sh python setup.py test --addopts -mtransform")
- }
- }
- }, fifthBranch: {
- stage('Fpgadataflow transformations and simulations') {
- dir("${env.WORKSPACE}") {
- sh("bash run-docker.sh python setup.py test --addopts -mfpgadataflow")
- }
+pipeline {
+ agent none
+ parameters {
+ booleanParam(name: 'fpgadataflow', defaultValue: false, description: 'Run fpgadataflow tests')
+ booleanParam(name: 'sanity', defaultValue: true, description: 'Run sanity hardware and unit tests')
+ booleanParam(name: 'end2end', defaultValue: false, description: 'Run end2end tests')
+ }
+ stages {
+ stage('Run Tests') {
+ parallel {
+ stage('Sanity - Build Hardware') {
+ when {
+ expression { return params['sanity'] }
+ }
+ agent {
+ label 'finn-build'
+ }
+ environment {
+ TEST_NAME = "bnn_build_sanity"
+ FINN_HOST_BUILD_DIR = "${env.FINN_HOST_BUILD_DIR}/${env.TEST_NAME}"
+ }
+ steps {
+ catchError(stageResult: 'FAILURE') {
+ script {
+ // Creates dir in finn clone to store build files for stashing
+ sh "mkdir -p ${env.TEST_NAME}"
+ cleanPreviousBuildFiles(env.FINN_HOST_BUILD_DIR)
+
+ // Pass in the marker to run with pytest and the XML test results filename
+ runDockerPytestWithMarker("sanity_bnn", "${env.TEST_NAME}", '')
+
+ // Find the board's build files (bitstreams/xclbins) and zip for use on the boards themselves
+ findCopyZip("Pynq-Z1", env.FINN_HOST_BUILD_DIR, env.TEST_NAME)
+ findCopyZip("ZCU104", env.FINN_HOST_BUILD_DIR, env.TEST_NAME)
+ findCopyZip("KV260_SOM", env.FINN_HOST_BUILD_DIR, env.TEST_NAME)
+ findCopyZip("U250", env.FINN_HOST_BUILD_DIR, env.TEST_NAME)
+
+ // Stash the test results file(s)
+ stash name: "${env.TEST_NAME}", includes: "${env.TEST_NAME}.xml,${env.TEST_NAME}.html"
+
+ // Use an env variable to help collect test results later in pipeline
+ env.BNN_BUILD_SANITY = "SUCCESS"
+ }
+ }
+ }
+ }
+ stage('Sanity - Unit Tests') {
+ when {
+ expression { params['sanity'] }
+ }
+ agent {
+ label 'finn-build'
+ }
+ environment {
+ TEST_NAME = "sanity_ut"
+ FINN_HOST_BUILD_DIR = "${env.FINN_HOST_BUILD_DIR}/${env.TEST_NAME}"
+ }
+ steps {
+ catchError(stageResult: 'FAILURE') {
+ script {
+ cleanPreviousBuildFiles(env.FINN_HOST_BUILD_DIR)
+
+ // Multiple markers with pytest needs its own script
+ createMultiMarkerScript("util or brevitas_export or streamline or transform or notebooks", "${env.TEST_NAME}", "--cov --cov-report=html:coverage_sanity_ut")
+ sh './run-docker.sh ./run-tests.sh'
+
+ // Stash the test results file(s)
+ stash name: env.TEST_NAME, includes: "${env.TEST_NAME}.xml,${env.TEST_NAME}.html"
+
+ // Use an env variable to help collect test results later in pipeline
+ env.SANITY_UT = "SUCCESS"
+
+ // Archive coverage report if successful
+ archiveSuccessfulStage(env.SANITY_UT, "coverage_sanity_ut")
+ }
+ }
+ }
+ }
+ stage('fpgadataflow Tests') {
+ when {
+ expression { params['fpgadataflow'] }
+ }
+ agent {
+ label 'finn-build'
+ }
+ environment {
+ TEST_NAME = "fpgadataflow"
+ FINN_HOST_BUILD_DIR = "${env.FINN_HOST_BUILD_DIR}/${env.TEST_NAME}"
+ }
+ steps {
+ catchError(stageResult: 'FAILURE') {
+ script {
+ cleanPreviousBuildFiles(env.FINN_HOST_BUILD_DIR)
+
+ // Pass in the marker to run with pytest and the XML test results filename
+ runDockerPytestWithMarker("fpgadataflow", "${env.TEST_NAME}", "--cov --cov-report=html:coverage_fpgadataflow")
+
+ // Stash the test results file(s)
+ stash name: env.TEST_NAME, includes: "${env.TEST_NAME}.xml,${env.TEST_NAME}.html"
+
+ // Use an env variable to help collect test results later in pipeline
+ env.FPGADATAFLOW_RESULT = "SUCCESS"
+
+ // Archive coverage report if successful
+ archiveSuccessfulStage(env.FPGADATAFLOW_RESULT, "coverage_fpgadataflow")
+ }
+ }
+ }
+ }
+ stage('End2end') {
+ when {
+ expression { params['end2end'] }
+ }
+ agent {
+ label 'finn-build'
+ }
+ environment {
+ TEST_NAME = "end2end"
+ FINN_HOST_BUILD_DIR = "${env.FINN_HOST_BUILD_DIR}/${env.TEST_NAME}"
+ }
+ steps {
+ catchError(stageResult: 'FAILURE') {
+ script {
+ // Delete any build files from a previous build
+ cleanPreviousBuildFiles(env.FINN_HOST_BUILD_DIR)
+
+ // Pass in the marker to run with pytest and the XML test results filename
+ runDockerPytestWithMarker(env.TEST_NAME, "${env.TEST_NAME}", '')
+
+ // Stash the test results file(s)
+ stash name: env.TEST_NAME, includes: "${env.TEST_NAME}.xml,${env.TEST_NAME}.html"
+
+ // Use an env variable to help collect test results later in pipeline
+ env.END2END_RESULT = "SUCCESS"
+ }
+ }
+ }
+ }
+ stage('BNN end2end - U250') {
+ when {
+ expression { return params['end2end'] }
+ }
+ agent {
+ label 'finn-build'
+ }
+ environment {
+ BOARD = "U250"
+ TEST_NAME = "bnn_build_full"
+ FINN_HOST_BUILD_DIR = "${env.FINN_HOST_BUILD_DIR}/${env.TEST_NAME}_${env.BOARD}"
+ }
+ steps {
+ catchError(stageResult: 'FAILURE') {
+ script {
+ // Creates dir in finn clone to store build files for stashing
+ sh "mkdir -p ${env.TEST_NAME}"
+ cleanPreviousBuildFiles(env.FINN_HOST_BUILD_DIR)
+
+ // Pass in the marker to run with pytest and the XML test results filename
+ runDockerPytestWithMarker("bnn_u250", "${env.TEST_NAME}_${env.BOARD}", '')
+ findCopyZip(env.BOARD, env.FINN_HOST_BUILD_DIR, env.TEST_NAME)
+
+ // Stash the test results file(s)
+ stash name: "${env.TEST_NAME}_${env.BOARD}", includes: "${env.TEST_NAME}_${env.BOARD}.xml,${env.TEST_NAME}_${env.BOARD}.html"
+
+ // Use an env variable to help collect test results later in pipeline
+ env.BNN_BUILD_U250 = "SUCCESS"
+ }
+ }
+ }
+ }
+ stage('BNN end2end - Pynq-Z1') {
+ when {
+ expression { return params['end2end'] }
+ }
+ agent {
+ label 'finn-build'
+ }
+ environment {
+ BOARD = "Pynq-Z1"
+ TEST_NAME = "bnn_build_full"
+ FINN_HOST_BUILD_DIR = "${env.FINN_HOST_BUILD_DIR}/${env.TEST_NAME}_${env.BOARD}"
+ }
+ steps {
+ catchError(stageResult: 'FAILURE') {
+ script {
+ // Creates dir in finn clone to store build files for stashing
+ sh "mkdir -p ${env.TEST_NAME}"
+ cleanPreviousBuildFiles(env.FINN_HOST_BUILD_DIR)
+
+ // Pass in the marker to run with pytest and the XML test results filename
+ runDockerPytestWithMarker("bnn_pynq", "${env.TEST_NAME}_${env.BOARD}", '')
+ findCopyZip(env.BOARD, env.FINN_HOST_BUILD_DIR, env.TEST_NAME)
+
+ // Stash the test results file(s)
+ stash name: "${env.TEST_NAME}_PynqZ1", includes: "${env.TEST_NAME}_${env.BOARD}.xml,${env.TEST_NAME}_${env.BOARD}.html"
+
+ // Use an env variable to help collect test results later in pipeline
+ env.BNN_BUILD_PYNQZ1 = "SUCCESS"
+ }
+ }
+ }
+ }
+ stage('BNN end2end - ZCU104') {
+ when {
+ expression { return params['end2end'] }
+ }
+ agent {
+ label 'finn-build'
+ }
+ environment {
+ BOARD = "ZCU104"
+ TEST_NAME = "bnn_build_full"
+ FINN_HOST_BUILD_DIR = "${env.FINN_HOST_BUILD_DIR}/${env.TEST_NAME}_${env.BOARD}"
+ }
+ steps {
+ catchError(stageResult: 'FAILURE') {
+ script {
+ // Creates dir in finn clone to store build files for stashing
+ sh "mkdir -p ${env.TEST_NAME}"
+ cleanPreviousBuildFiles(env.FINN_HOST_BUILD_DIR)
+
+ // Pass in the marker to run with pytest and the XML test results filename
+ runDockerPytestWithMarker("bnn_zcu104", "${env.TEST_NAME}_${env.BOARD}", '')
+ findCopyZip(env.BOARD, env.FINN_HOST_BUILD_DIR, env.TEST_NAME)
+
+ // Stash the test results file(s)
+ stash name: "${env.TEST_NAME}_${env.BOARD}", includes: "${env.TEST_NAME}_${env.BOARD}.xml,${env.TEST_NAME}_${env.BOARD}.html"
+
+ // Use an env variable to help collect test results later in pipeline
+ env.BNN_BUILD_ZCU104 = "SUCCESS"
+ }
+ }
+ }
+ }
+ stage('BNN end2end - KV260_SOM') {
+ when {
+ expression { return params['end2end'] }
+ }
+ agent {
+ label 'finn-build'
+ }
+ environment {
+ BOARD = "KV260_SOM"
+ TEST_NAME = "bnn_build_full"
+ FINN_HOST_BUILD_DIR = "${env.FINN_HOST_BUILD_DIR}/${env.TEST_NAME}_${env.BOARD}"
+ }
+ steps {
+ catchError(stageResult: 'FAILURE') {
+ script {
+ // Creates dir in finn clone to store build files for stashing
+ sh "mkdir -p ${env.TEST_NAME}"
+ cleanPreviousBuildFiles(env.FINN_HOST_BUILD_DIR)
+
+ // Pass in the marker to run with pytest and the XML test results filename
+ runDockerPytestWithMarker("bnn_kv260", "${env.TEST_NAME}_${env.BOARD}", '')
+ findCopyZip(env.BOARD, env.FINN_HOST_BUILD_DIR, env.TEST_NAME)
+
+ // Stash the test results file(s)
+ stash name: "${env.TEST_NAME}_${env.BOARD}", includes: "${env.TEST_NAME}_${env.BOARD}.xml,${env.TEST_NAME}_${env.BOARD}.html"
+
+ // Use an env variable to help collect test results later in pipeline
+ env.BNN_BUILD_KV260_SOM = "SUCCESS"
+ }
}
+ }
}
+ }
}
+ stage('Check Stage Results') {
+ agent {
+ label 'finn-build'
+ }
+ steps {
+ script {
+ sh 'mkdir -p reports'
+ cleanPreviousBuildFiles('reports')
+ dir('reports') {
+ // Only unstash for stages that ran
+ unstashSuccessfulStage(env.SANITY_UT, "sanity_ut")
+ unstashSuccessfulStage(env.FPGADATAFLOW_RESULT, "fpgadataflow")
+ unstashSuccessfulStage(env.BNN_BUILD_SANITY, "bnn_build_sanity")
+ unstashSuccessfulStage(env.END2END_RESULT, "end2end")
+ unstashSuccessfulStage(env.BNN_BUILD_U250, "bnn_build_full_U250")
+ unstashSuccessfulStage(env.BNN_BUILD_PYNQZ1, "bnn_build_full_PynqZ1")
+ unstashSuccessfulStage(env.BNN_BUILD_ZCU104, "bnn_build_full_ZCU104")
+ unstashSuccessfulStage(env.BNN_BUILD_KV260_SOM, "bnn_build_full_KV260_SOM")
+ }
+
+ // Combine individual HTML files to one single report
+ sh './run-docker.sh pytest_html_merger -i reports/ -o reports/test_report_final.html'
+
+ // Archive the XML & HTML test results
+ archiveArtifacts artifacts: "reports/*.xml"
+ archiveArtifacts artifacts: "reports/*.html"
+
+ // Plot what XML files were created during the test run
+ junit 'reports/*.xml'
+ }
+ }
+ }
+ }
+}
+
+void cleanPreviousBuildFiles(String buildDir) {
+ // Delete any build files from a previous build; a null or empty buildDir is a no-op.
+ // Previous build folders affect findCopyZip() and can cause the stage to fail
+ if (buildDir) {
+ sh "rm -rf '${buildDir}'"
+ }
+}
+
+void createMultiMarkerScript(String markers, String testResultsFilename, String additionalOptions) {
+ // Writes run-tests.sh, a wrapper that runs pytest with a multi-marker expression. Passing multiple markers when running ./run-docker.sh does not work with bash,
+ // so the script keeps the single quotes that surround the markers. '>' truncates, so a run-tests.sh left over in the workspace is replaced instead of appended to.
+ sh """echo "#!/bin/bash
+python -m pytest -m \'${markers}\' --junitxml=${testResultsFilename}.xml --html=${testResultsFilename}.html --self-contained-html ${additionalOptions}" > run-tests.sh
+ """
+
+ // Give permissions to script
+ sh 'chmod 777 run-tests.sh'
+}
+
+void runDockerPytestWithMarker(String marker, String testResultsFilename, String additionalOptions) {
+ sh """./run-docker.sh python -m pytest -m ${marker} --junitxml=${testResultsFilename}.xml --html=${testResultsFilename}.html --self-contained-html ${additionalOptions}"""
+}
+
+def findBoardBuildFiles(String searchDir, String dirToFind) {
+ def result = sh(script: "find $searchDir -type d -name \"$dirToFind*\"", returnStdout: true).trim()
+ if (result.empty) {
+ error "Directory containing '$dirToFind' not found."
+ }
+ return result
+}
+
+void findCopyZip(String board, String findDir, String copyDir) {
+ def buildDir = findBoardBuildFiles(findDir, "hw_deployment_${board}")
+ sh "cp -r ${buildDir}/${board} ${copyDir}/"
+ dir(copyDir) {
+ sh "zip -r ${board}.zip ${board}/"
+ sh "mkdir -p ${env.ARTIFACT_DIR}/${copyDir}/"
+ sh "cp ${board}.zip ${env.ARTIFACT_DIR}/${copyDir}/"
+ }
+}
+
+void unstashSuccessfulStage(String stageEnvVariableSet, String stashName) {
+ if (stageEnvVariableSet) {
+ unstash stashName
+ }
+}
+
+void archiveSuccessfulStage(String stageEnvVariableSet, String folder) {
+ if (stageEnvVariableSet) {
+ archiveArtifacts artifacts: "${folder}/**/*"
+ }
}
diff --git a/docker/jenkins/Jenkinsfile_CI b/docker/jenkins/Jenkinsfile_CI
new file mode 100644
index 0000000000..5e7d5f1475
--- /dev/null
+++ b/docker/jenkins/Jenkinsfile_CI
@@ -0,0 +1,46 @@
+node('finn-build || built-in') {
+ def app
+ stage('Clone repository') {
+ /* Let's make sure we have the repository cloned to our workspace */
+ checkout scm
+ }
+ withEnv([
+ "FINN_XILINX_PATH=/proj/xbuilds/SWIP/2022.2_1014_8888/installs/lin64",
+ "FINN_XILINX_VERSION=2022.2",
+ "FINN_DOCKER_TAG=xilinx/finn:jenkins",
+ "FINN_HOST_BUILD_DIR=/scratch/users/finn_ci",
+ "PLATFORM_REPO_PATHS=/opt/xilinx/platforms"
+ ]){
+ parallel firstBranch: {
+ stage('Brevitas export') {
+ dir("${env.WORKSPACE}") {
+ sh("bash run-docker.sh pytest -mbrevitas_export")
+ }
+ }
+ }, secondBranch: {
+ stage('Streamlining transformations') {
+ dir("${env.WORKSPACE}") {
+ sh("bash run-docker.sh pytest -mstreamline")
+ }
+ }
+ }, thirdBranch: {
+ stage('Util functions') {
+ dir("${env.WORKSPACE}") {
+ sh("bash run-docker.sh pytest -mutil")
+ }
+ }
+ }, fourthBranch: {
+ stage('General transformations') {
+ dir("${env.WORKSPACE}") {
+ sh("bash run-docker.sh pytest -mtransform")
+ }
+ }
+ }, fifthBranch: {
+ stage('Fpgadataflow transformations and simulations') {
+ dir("${env.WORKSPACE}") {
+ sh("bash run-docker.sh pytest -mfpgadataflow")
+ }
+ }
+ }
+ }
+}
diff --git a/docker/jenkins/Jenkinsfile_HW b/docker/jenkins/Jenkinsfile_HW
new file mode 100644
index 0000000000..bd438d888e
--- /dev/null
+++ b/docker/jenkins/Jenkinsfile_HW
@@ -0,0 +1,481 @@
+pipeline {
+ agent none
+ stages {
+ stage('Get node status') {
+ options { skipDefaultCheckout() }
+ agent {
+ label 'finn-build'
+ }
+ steps {
+ script {
+ // Check which boards are online before running HW tests
+ env.ALVEO_HOST_ONLINE = isNodeOnline('finn-u250')
+ env.PYNQ_ONLINE = isNodeOnline('finn-pynq')
+ env.ZCU104_ONLINE = isNodeOnline('finn-zcu104')
+ env.KV260_ONLINE = isNodeOnline('finn-kv260')
+ }
+ }
+ }
+ stage('Reboot Zynq platforms') {
+ parallel {
+ stage('Pynq-Z1') {
+ options { skipDefaultCheckout() }
+ when {
+ // beforeAgent set to 'true' to prevent an offline agent hanging the stage
+ beforeAgent true
+ expression { return (env.PYNQ_ONLINE == 'true') }
+ }
+ agent {
+ label 'finn-pynq'
+ }
+ environment {
+ BOARD = 'Pynq-Z1'
+ USER_CREDENTIALS = credentials('pynq-z1-credentials')
+ }
+ steps {
+ catchError(stageResult: 'FAILURE') {
+ restartZynqPlatform()
+ }
+ }
+ }
+ stage('ZCU104') {
+ options { skipDefaultCheckout() }
+ when {
+ // beforeAgent set to 'true' to prevent an offline agent hanging the stage
+ beforeAgent true
+ expression { return (env.ZCU104_ONLINE == 'true') }
+ }
+ agent {
+ label 'finn-zcu104'
+ }
+ environment {
+ BOARD = 'ZCU104'
+ USER_CREDENTIALS = credentials('pynq-z1-credentials')
+ }
+ steps {
+ catchError(stageResult: 'FAILURE') {
+ restartZynqPlatform()
+ }
+ }
+ }
+ stage('Kria KV260_SOM') {
+ options { skipDefaultCheckout() }
+ when {
+ // beforeAgent set to 'true' to prevent an offline agent hanging the stage
+ beforeAgent true
+ expression { return (env.KV260_ONLINE == 'true') }
+ }
+ agent {
+ label 'finn-kv260'
+ }
+ environment {
+ BOARD = 'KV260_SOM'
+ USER_CREDENTIALS = credentials('user-ubuntu-credentials')
+ }
+ steps {
+ catchError(stageResult: 'FAILURE') {
+ restartZynqPlatform()
+ }
+ }
+ }
+ }
+ }
+ stage('Wait for Nodes to reboot') {
+ options { skipDefaultCheckout() }
+ agent {
+ label 'finn-build'
+ }
+ steps {
+ sleep(time: "${env.REBOOT_SLEEP}", unit: 'MINUTES')
+ }
+ }
+ stage('Collect build information for HW testing') {
+ options { skipDefaultCheckout() }
+ agent {
+ label 'finn-build'
+ }
+ steps {
+ script {
+ // Check which boards are online before running HW tests
+ env.ALVEO_HOST_ONLINE = isNodeOnline('finn-u250')
+ env.PYNQ_ONLINE = isNodeOnline('finn-pynq')
+ env.ZCU104_ONLINE = isNodeOnline('finn-zcu104')
+ env.KV260_ONLINE = isNodeOnline('finn-kv260')
+
+ // Stash the HW test scripts to be used on worker nodes
+ dir('docker/jenkins') {
+ stash name: 'bnn_test_files', includes: 'test_bnn_hw_pytest.py'
+ }
+
+ // Collect build artifacts from network and stash for use on worker nodes
+ dir("${env.ARTIFACT_DIR}"){
+ stashBuildArtifacts('bnn_build_sanity')
+ stashBuildArtifacts('bnn_build_full')
+ }
+ }
+ }
+ }
+ stage('Sanity - Run Hardware Tests') {
+ parallel {
+ stage('BNN Sanity - U250') {
+ when {
+ // beforeAgent set to 'true' to prevent an offline agent hanging the stage
+ beforeAgent true
+ expression { return (env.ALVEO_HOST_ONLINE == 'true') }
+ }
+ agent {
+ label 'finn-u250'
+ }
+ environment {
+ BOARD = 'U250'
+ }
+ steps {
+ catchError(stageResult: 'FAILURE') {
+ runTest("bnn_build_sanity", "${env.BOARD}", "${env.BOARD}")
+ }
+ }
+ post {
+ always {
+ stashResults("bnn_build_sanity", "${env.BOARD}")
+ cleanUpWorkspaceOwnership()
+ }
+ }
+ }
+ stage('BNN Sanity - Pynq-Z1') {
+ when {
+ // beforeAgent set to 'true' to prevent an offline agent hanging the stage
+ beforeAgent true
+ expression { return (env.PYNQ_ONLINE == 'true') }
+ }
+ agent {
+ label 'finn-pynq'
+ }
+ environment {
+ BOARD = 'Pynq-Z1'
+ USER_CREDENTIALS = credentials('pynq-z1-credentials')
+ }
+ steps {
+ catchError(stageResult: 'FAILURE') {
+ runTest("bnn_build_sanity", "${env.BOARD}", "Pynq")
+ }
+ }
+ post {
+ always {
+ stashResults("bnn_build_sanity", "${env.BOARD}")
+ cleanUpWorkspaceOwnership()
+ }
+ }
+ }
+ stage('BNN Sanity - ZCU104') {
+ when {
+ // beforeAgent set to 'true' to prevent an offline agent hanging the stage
+ beforeAgent true
+ expression { return (env.ZCU104_ONLINE == 'true') }
+ }
+ agent {
+ label 'finn-zcu104'
+ }
+ environment {
+ BOARD = 'ZCU104'
+ USER_CREDENTIALS = credentials('pynq-z1-credentials')
+ }
+ steps {
+ catchError(stageResult: 'FAILURE') {
+ runTest("bnn_build_sanity", "${env.BOARD}", "${env.BOARD}")
+ }
+ }
+ post {
+ always {
+ stashResults("bnn_build_sanity", "${env.BOARD}")
+ cleanUpWorkspaceOwnership()
+ }
+ }
+ }
+ stage('BNN Sanity - KV260_SOM') {
+ when {
+ // beforeAgent set to 'true' to prevent an offline agent hanging the stage
+ beforeAgent true
+ expression { return (env.KV260_ONLINE == 'true') }
+ }
+ agent {
+ label 'finn-kv260'
+ }
+ environment {
+ BOARD = 'KV260_SOM'
+ USER_CREDENTIALS = credentials('user-ubuntu-credentials')
+ }
+ steps {
+ catchError(stageResult: 'FAILURE') {
+ runTest("bnn_build_sanity", "${env.BOARD}", "${env.BOARD}")
+ }
+ }
+ post {
+ always {
+ stashResults("bnn_build_sanity", "${env.BOARD}")
+ cleanUpWorkspaceOwnership()
+ }
+ }
+ }
+ }
+ }
+ stage('End2end - Run Hardware Tests') {
+ parallel {
+ stage('BNN end2end - U250') {
+ when {
+ // beforeAgent set to 'true' to prevent an offline agent hanging the stage
+ beforeAgent true
+ expression { return (env.ALVEO_HOST_ONLINE == 'true') }
+ }
+ agent {
+ label 'finn-u250'
+ }
+ environment {
+ BOARD = 'U250'
+ }
+ steps {
+ catchError(stageResult: 'FAILURE') {
+ runTest("bnn_build_full", "${env.BOARD}", "${env.BOARD}")
+ }
+ }
+ post {
+ always {
+ stashResults("bnn_build_full", "${env.BOARD}")
+ cleanUpWorkspaceOwnership()
+ }
+ }
+ }
+ stage('BNN end2end - Pynq-Z1') {
+ when {
+ // beforeAgent set to 'true' to prevent an offline agent hanging the stage
+ beforeAgent true
+ expression { return (env.PYNQ_ONLINE == 'true') }
+ }
+ agent {
+ label 'finn-pynq'
+ }
+ environment {
+ BOARD = 'Pynq-Z1'
+ USER_CREDENTIALS = credentials('pynq-z1-credentials')
+ }
+ steps {
+ catchError(stageResult: 'FAILURE') {
+ runTest("bnn_build_full", "${env.BOARD}", "Pynq")
+ }
+ }
+ post {
+ always {
+ stashResults("bnn_build_full", "${env.BOARD}")
+ cleanUpWorkspaceOwnership()
+ }
+ }
+ }
+ stage('BNN end2end - ZCU104') {
+ when {
+ // beforeAgent set to 'true' to prevent an offline agent hanging the stage
+ beforeAgent true
+ expression { return (env.ZCU104_ONLINE == 'true') }
+ }
+ agent {
+ label 'finn-zcu104'
+ }
+ environment {
+ BOARD = 'ZCU104'
+ USER_CREDENTIALS = credentials('pynq-z1-credentials')
+ }
+ steps {
+ catchError(stageResult: 'FAILURE') {
+ runTest("bnn_build_full", "${env.BOARD}", "${env.BOARD}")
+ }
+ }
+ post {
+ always {
+ stashResults("bnn_build_full", "${env.BOARD}")
+ cleanUpWorkspaceOwnership()
+ }
+ }
+ }
+ stage('BNN end2end - KV260_SOM') {
+ when {
+ // beforeAgent set to 'true' to prevent an offline agent hanging the stage
+ beforeAgent true
+ expression { return (env.KV260_ONLINE == 'true') }
+ }
+ agent {
+ label 'finn-kv260'
+ }
+ environment {
+ BOARD = 'KV260_SOM'
+ USER_CREDENTIALS = credentials('user-ubuntu-credentials')
+ }
+ steps {
+ catchError(stageResult: 'FAILURE') {
+ runTest("bnn_build_full", "${env.BOARD}", "${env.BOARD}")
+ }
+ }
+ post {
+ always {
+ stashResults("bnn_build_full", "${env.BOARD}")
+ cleanUpWorkspaceOwnership()
+ }
+ }
+ }
+ }
+ }
+ stage('Check Stage Results') {
+ agent {
+ label 'finn-build'
+ }
+ steps {
+ script {
+ sh 'mkdir -p reports'
+ cleanPreviousBuildFiles('reports')
+ dir('reports') {
+ // Only unstash for stages that ran
+ unstashSuccessfulStage(env.ALVEO_HOST_ONLINE, "xml_bnn_build_sanity_U250")
+ unstashSuccessfulStage(env.PYNQ_ONLINE, "xml_bnn_build_sanity_Pynq-Z1")
+ unstashSuccessfulStage(env.ZCU104_ONLINE, "xml_bnn_build_sanity_ZCU104")
+ unstashSuccessfulStage(env.KV260_ONLINE, "xml_bnn_build_sanity_KV260_SOM")
+ unstashSuccessfulStage(env.ALVEO_HOST_ONLINE, "xml_bnn_build_full_U250")
+ unstashSuccessfulStage(env.PYNQ_ONLINE, "xml_bnn_build_full_Pynq-Z1")
+ unstashSuccessfulStage(env.ZCU104_ONLINE, "xml_bnn_build_full_ZCU104")
+ unstashSuccessfulStage(env.KV260_ONLINE, "xml_bnn_build_full_KV260_SOM")
+ }
+
+ // Combine individual HTML files to one single report
+ sh './run-docker.sh pytest_html_merger -i reports/ -o reports/test_report_hw_final.html'
+
+ // Archive the XML & HTML test results
+ archiveArtifacts artifacts: "reports/*.xml"
+ archiveArtifacts artifacts: "reports/*.html"
+
+ // Plot what XML files were created during the test run
+ junit 'reports/*.xml'
+ }
+ }
+ }
+ }
+}
+
+// Delete any build files left over from a previous build.
+// Previous build folders affect findCopyZip() and can cause the stage to fail.
+//   buildDir - path or shell glob to remove; an empty string is a no-op.
+// When USER_CREDENTIALS is bound, the removal runs through sudo and a trailing '*'
+// is appended to buildDir; otherwise a plain 'rm -rf buildDir' is issued.
+void cleanPreviousBuildFiles(String buildDir) {
+  if (!buildDir.empty) {
+    if (env.USER_CREDENTIALS) {
+      // '\$' leaves the password for the shell to expand from its environment, so
+      // the secret is never interpolated into the command text by Groovy (same
+      // approach as the single-quoted sudo commands in the other helpers).
+      // Only buildDir is interpolated here.
+      sh "echo \${USER_CREDENTIALS_PSW} | sudo -S rm -rf ${buildDir}*"
+    } else {
+      sh "rm -rf ${buildDir}"
+    }
+  }
+}
+
+// Write run-tests.sh into the current directory: a bash script that sources the
+// environment for the board and then runs pytest for the given marker.
+//   board               - 'U250' selects the XRT setup plus the VENV_ACTIVATE script;
+//                         any other value selects the pynq_venv / xrt_setup profiles
+//   marker              - value passed to pytest's '-m' option
+//   testResultsFilename - basename (no extension) of the JUnit XML and HTML reports
+void createTestScript(String board, String marker, String testResultsFilename) {
+  // The script is written with '>' rather than '>>' so that a run-tests.sh left
+  // behind by an earlier run is replaced instead of getting a second copy of the
+  // commands appended (which would run the tests twice).
+  if (board == "U250") {
+    sh """echo "#!/bin/bash
+. /opt/xilinx/xrt/setup.sh
+. ${VENV_ACTIVATE}
+python -m pytest -m ${marker} --junitxml=${testResultsFilename}.xml --html=${testResultsFilename}.html --self-contained-html" > run-tests.sh
+    """
+  } else {
+    sh """echo "#!/bin/bash
+. /etc/profile.d/pynq_venv.sh
+. /etc/profile.d/xrt_setup.sh
+python -m pytest -m ${marker} --junitxml=${testResultsFilename}.xml --html=${testResultsFilename}.html --self-contained-html" > run-tests.sh
+    """
+  }
+
+  // Give permissions to script
+  sh 'chmod 777 run-tests.sh'
+}
+
+// Report whether at least one agent carrying the label labelName is online.
+// Echoes a message for every labelled agent that is offline, and a different
+// message when the label lookup yields nothing. Returns a boolean.
+def isNodeOnline(String labelName) {
+  Label wantedLabel = Jenkins.instance.getLabel(labelName)
+  if (!wantedLabel) {
+    echo """Node with label ${labelName} not found"""
+    return false
+  }
+
+  def anyOnline = false
+  // Walk all nodes without an early exit so each offline agent is still reported
+  for (agent in Jenkins.instance.getNodes()) {
+    if (!agent.getAssignedLabels().contains(wantedLabel)) {
+      continue
+    }
+    def machine = agent.toComputer()
+    if (machine && machine.isOnline()) {
+      anyOnline = true
+    } else {
+      echo """Agent ${agent.displayName} is offline"""
+    }
+  }
+
+  return anyOnline
+}
+
+// Unstash stashName only when the board's online flag says its test stages ran.
+//   stageEnvVariableSet - value of one of the env.*_ONLINE variables; env values
+//                         are Strings, i.e. 'true' or 'false' (null if never set)
+//   stashName           - name of the stash to restore into the current directory
+void unstashSuccessfulStage(String stageEnvVariableSet, String stashName) {
+  // Compare with 'true' explicitly, exactly like the stages' when-expressions do.
+  // The String 'false' is non-empty and therefore truthy in Groovy, so a bare
+  // truthiness check would also try to unstash results for offline boards whose
+  // stages were skipped and never created the stash.
+  if (stageEnvVariableSet == 'true') {
+    unstash stashName
+  }
+}
+
+// Stash every entry found directly inside testDir under its own stash name.
+// The stash is called "<testDir>_<name without extension>_zip", which is the name
+// runTest() later unstashes as "<testType>_<board>_zip".
+//   testDir - directory (relative to the current one) holding the build archives
+void stashBuildArtifacts(String testDir) {
+  dir("$testDir") {
+    findFiles().each { artifact ->
+      def fileName = artifact.toString()
+      def extIndex = fileName.lastIndexOf(".")
+      if (extIndex <= 0) {
+        // No extension (or nothing in front of it): no board name can be derived,
+        // and substring(0, -1) would throw and abort the whole stage.
+        echo "Skipping ${fileName}: cannot derive a board name"
+        return  // inside each{} this just moves on to the next entry
+      }
+      def boardName = fileName.substring(0, extIndex)
+      stash name: "${testDir}_${boardName}_zip", includes: "${artifact}"
+    }
+  }
+}
+
+// Unpack the stashed build for one board and run its hardware tests.
+//   testType - build flavour; names the working directory and the stash prefix
+//   board    - board name; names the zip, the unpacked directory and the reports
+//   marker   - pytest marker forwarded to createTestScript()
+void runTest(String testType, String board, String marker) {
+  sh "mkdir -p ${testType}"
+  dir("$testType") {
+    // Start from a clean slate: drop whatever an earlier run left for this board
+    cleanPreviousBuildFiles("${board}*")
+
+    // Fetch and unpack the build archive for this board
+    unstash name: "${testType}_${board}_zip"
+    sh "unzip -o ${board}.zip"
+
+    dir("$board") {
+      // Bring in the pytest file that drives the hardware tests
+      unstash name: 'bnn_test_files'
+
+      // Generate run-tests.sh for this board/marker
+      createTestScript(board, marker, "${testType}_hw_${board}")
+
+      // With credentials bound the script is run through sudo (needed for zynq
+      // platforms); otherwise it is executed directly. Both commands are
+      // single-quoted, so the shell - not Groovy - expands the password.
+      def launchCommand = env.USER_CREDENTIALS
+        ? 'echo ${USER_CREDENTIALS_PSW} | sudo -S ./run-tests.sh'
+        : './run-tests.sh'
+      sh launchCommand
+    }
+  }
+}
+
+// Stash the XML and HTML test reports of one board as "xml_<testType>_<board>".
+// A failing stash step is caught and only logged, so the caller does not fail on it.
+void stashResults (String testType, String board) {
+  def reportBase = "${testType}_hw_${board}"
+  dir("${testType}/${board}") {
+    // Collect the result files on the worker node by stashing them
+    try {
+      stash name: "xml_${testType}_${board}", includes: "${reportBase}.xml,${reportBase}.html"
+    } catch (stashError) {
+      echo "No results to stash"
+    }
+  }
+}
+
+// Hand ownership of the workspace back to the current user via sudo chown.
+// Does nothing on agents where USER_CREDENTIALS is not bound.
+void cleanUpWorkspaceOwnership () {
+  if (!env.USER_CREDENTIALS) {
+    return
+  }
+  // Single quotes: the shell, not Groovy, expands the password and ${WORKSPACE}
+  sh 'echo ${USER_CREDENTIALS_PSW} | sudo -S chown -R $(id -u):$(id -g) ${WORKSPACE}'
+}
+
+// Schedule a reboot of the agent one minute from now ('shutdown -r +1') via sudo.
+// Does nothing on agents where USER_CREDENTIALS is not bound.
+void restartZynqPlatform () {
+  if (!env.USER_CREDENTIALS) {
+    return
+  }
+  // Single quotes: the shell, not Groovy, expands the password
+  sh 'echo ${USER_CREDENTIALS_PSW} | sudo -S shutdown -r +1'
+}
diff --git a/docker/jenkins/test_bnn_hw_pytest.py b/docker/jenkins/test_bnn_hw_pytest.py
new file mode 100755
index 0000000000..dc350d8504
--- /dev/null
+++ b/docker/jenkins/test_bnn_hw_pytest.py
@@ -0,0 +1,213 @@
+import pytest
+
+import itertools
+import logging
+import numpy as np
+import os
+import subprocess
+from scipy.stats import linregress
+
+# no __init__ constructors allowed in Pytest - so use global variables instead
+base_dir_global = os.getcwd()
+default_test_run_timeout = 30 # seconds
+output_execute_results_file = "output.npy"
+execute_results_reference_file = "output_reference.npy"
+output_throughput_results_file = "nw_metrics.txt"
+throughput_results_formatted_file = "throughput_metrics_formatted.txt"
+logger = logging.getLogger(__name__)
+
+
+def remove_cache_dirs(dir_list):
+    """Return a new list without pytest/Python cache directory entries.
+
+    An entry is dropped when it contains ".pytest_cache" or "__pycache__". The
+    order of the remaining entries is kept and ``dir_list`` itself is not modified.
+    """
+    cache_markers = (".pytest_cache", "__pycache__")
+    return [entry for entry in dir_list if not any(tag in entry for tag in cache_markers)]
+
+
+def delete_file(file_path):
+    """Remove ``file_path`` if it exists and log what happened.
+
+    A missing file is only reported at info level. An exception raised while
+    removing the file is logged as an error and not re-raised.
+    """
+    if not os.path.exists(file_path):
+        logger.info(f"File '{file_path}' does not exist. Continuing with the script.")
+        return
+
+    try:
+        os.remove(file_path)
+        logger.info(f"File '{file_path}' deleted successfully.")
+    except Exception as e:
+        logger.error(f"An error occurred while deleting the file: {e}")
+
+
+def get_platform(board_str):
+    """Map a board string to the driver platform name.
+
+    Returns "alveo" when ``board_str`` contains "U250", otherwise "zynq-iodma".
+    """
+    if "U250" in board_str:
+        return "alveo"
+    return "zynq-iodma"
+
+
+def get_full_parameterized_test_list(marker, test_dir_list, batch_size_list, platform_list):
+    """Build one ``(test_id, params)`` pair per combination of the three lists.
+
+    ``test_id`` is "<marker>_<test_dir>_batchSize-<batch_size>_platform-<platform>"
+    and ``params`` is a dict with the keys "test_dir", "batch_size" and "platform"
+    (in that order). Combinations follow ``itertools.product`` order.
+    """
+    test_cases = []
+    combinations = itertools.product(test_dir_list, batch_size_list, platform_list)
+    for test_dir, batch_size, platform in combinations:
+        test_id = f"{marker}_{test_dir}_batchSize-{batch_size}_platform-{platform}"
+        params = {
+            "test_dir": test_dir,
+            "batch_size": batch_size,
+            "platform": platform,
+        }
+        test_cases.append((test_id, params))
+    return test_cases
+
+
+def pytest_generate_tests(metafunc):
+    """Parametrize the tests with one case per build directory and board marker.
+
+    Every sub-directory of the current working directory (cache directories
+    excluded) is treated as a test directory. For each board marker given on the
+    command line via ``-m`` a scenario with batch size 1 and the platform matching
+    that marker is generated. Nothing is parametrized when no board marker is used.
+    """
+    # Fixed argument order; matches the key order of the scenario dicts built by
+    # get_full_parameterized_test_list(). Defining it up front (instead of inside
+    # the loop below) avoids an UnboundLocalError when every scenario is skipped.
+    argnames = ["test_dir", "batch_size", "platform"]
+    idlist = []
+    argvalues = []
+    scenarios = []
+
+    # Separate the full list of markers used on command line.
+    # This allows a user to select multiple markers
+    all_markers_used = metafunc.config.getoption("-m").split(" ")
+    current_dir = os.getcwd()
+    test_dirs = [
+        name for name in os.listdir(current_dir) if os.path.isdir(os.path.join(current_dir, name))
+    ]
+    test_dirs = remove_cache_dirs(test_dirs)
+
+    for marker in all_markers_used:
+        if "Pynq" in marker or "U250" in marker or "ZCU104" in marker or "KV260_SOM" in marker:
+            platform = get_platform(marker)
+            scenarios.extend(
+                get_full_parameterized_test_list(
+                    marker, test_dir_list=test_dirs, batch_size_list=[1], platform_list=[platform]
+                )
+            )
+
+    if scenarios:
+        for test_id, params in scenarios:
+            # There is a known Pynq/XRT issue with larger sets of weights on Alveo.
+            # Accesses to address spaces over 16KB do NOT work as intended.
+            # Disabling Alveo lfc HW test until resolved.
+            if test_id == "U250_bnn_w1_a1_lfc_batchSize-1_platform-alveo":
+                continue
+            idlist.append(test_id)
+            argvalues.append([params[name] for name in argnames])
+        # If every scenario was skipped above, argvalues is empty and pytest applies
+        # its empty-parameter-set handling instead of this hook crashing.
+        metafunc.parametrize(argnames, argvalues, ids=idlist, scope="class")
+
+
+@pytest.mark.Pynq
+@pytest.mark.U250
+@pytest.mark.ZCU104
+@pytest.mark.KV260_SOM
+class TestBnn:
+    """Hardware tests that run ``driver.py`` inside each build directory.
+
+    ``test_dir``, ``batch_size`` and ``platform`` are supplied per test case by
+    ``pytest_generate_tests``.
+    """
+
+    def _run_driver(self, exec_mode, batch_size, platform):
+        """Run ``driver.py`` from the current directory in the given ``exec_mode``.
+
+        The bitfile is "a.xclbin" for the "alveo" platform and "resizer.bit"
+        otherwise. Returns the ``subprocess.CompletedProcess`` with captured text
+        output; ``subprocess.TimeoutExpired`` propagates if the run exceeds
+        ``default_test_run_timeout`` seconds.
+        """
+        bitfile = "a.xclbin" if platform == "alveo" else "resizer.bit"
+        return subprocess.run(
+            [
+                "python",
+                "driver.py",
+                f"--exec_mode={exec_mode}",
+                f"--batchsize={batch_size}",
+                f"--bitfile={bitfile}",
+                "--inputfile=input.npy",
+                "--outputfile=output.npy",
+                f"--platform={platform}",
+            ],
+            capture_output=True,
+            text=True,
+            timeout=default_test_run_timeout,
+        )
+
+    def test_type_execute(self, test_dir, batch_size, platform):
+        """Run the driver in execute mode and compare its output to the reference."""
+        # Enter into test directory and clean any files from a potential previous run
+        os.chdir(os.path.join(base_dir_global, test_dir))
+        delete_file(output_execute_results_file)
+
+        # Run test option: execute
+        result = self._run_driver("execute", batch_size, platform)
+        # The output is captured, so surface it in the failure message; otherwise
+        # the reason for a non-zero exit code would be lost.
+        assert (
+            result.returncode == 0
+        ), f"driver.py failed\nstdout:\n{result.stdout}\nstderr:\n{result.stderr}"
+
+        # Load the output and reference arrays
+        output_array = np.load(output_execute_results_file)
+        reference_array = np.load(execute_results_reference_file)
+
+        # Compare the arrays
+        try:
+            assert np.isclose(output_array, reference_array).all()
+        except AssertionError as e:
+            logger.error("AssertionError occurred: %s", e, exc_info=True)
+            raise
+
+    def test_type_throughput(self, test_dir, batch_size, platform):
+        """Run the driver's throughput test and write a formatted metrics report."""
+        os.chdir(os.path.join(base_dir_global, test_dir))
+        delete_file(output_throughput_results_file)
+
+        # Run test option: throughput
+        result = self._run_driver("throughput_test", batch_size, platform)
+        assert (
+            result.returncode == 0
+        ), f"driver.py failed\nstdout:\n{result.stdout}\nstderr:\n{result.stderr}"
+
+        # Check if nw_metrics.txt now exists after test run
+        assert os.path.exists(output_throughput_results_file)
+
+        with open(output_throughput_results_file, "r") as file:
+            # NOTE(review): eval() executes whatever the metrics file contains. If
+            # the driver only ever writes Python literals, ast.literal_eval would be
+            # a safer drop-in - confirm the file format before switching.
+            res = eval(file.read())
+
+        # try a range of batch sizes, some may fail due to insufficient DMA
+        # buffers
+        # NOTE(review): the driver is run once above, so every batch size below is
+        # assigned the same `res`; the regression therefore sees identical runtimes.
+        bsize_range_in = [8**i for i in range(5)]
+        bsize_range = []
+        ret = dict()
+        for bsize in bsize_range_in:
+            if res is not None:
+                ret[bsize] = res
+                bsize_range.append(bsize)
+            else:
+                # assume we reached largest possible N
+                break
+
+        y = [ret[key]["runtime[ms]"] for key in bsize_range]
+        lrret = linregress(bsize_range, y)
+        ret_str = ""
+        ret_str += "\n" + "%s Throughput Test Results" % test_dir
+        ret_str += "\n" + "-----------------------------"
+        ret_str += "\n" + "From linear regression:"
+        ret_str += "\n" + "Invocation overhead: %f ms" % lrret.intercept
+        ret_str += "\n" + "Time per sample: %f ms" % lrret.slope
+        ret_str += "\n" + "Raw data:"
+
+        ret_str += "\n" + "{:<8} {:<16} {:<16} {:<16} {:<16} {:<16}".format(
+            "N", "runtime[ms]", "fclk[mhz]", "fps", "DRAM rd[MB/s]", "DRAM wr[MB/s]"
+        )
+        for k in bsize_range:
+            v = ret[k]
+            ret_str += "\n" + "{:<8} {:<16} {:<16} {:<16} {:<16} {:<16}".format(
+                k,
+                np.round(v["runtime[ms]"], 4),
+                v["fclk[mhz]"],
+                np.round(v["throughput[images/s]"], 2),
+                np.round(v["DRAM_in_bandwidth[MB/s]"], 2),
+                np.round(v["DRAM_out_bandwidth[MB/s]"], 2),
+            )
+        ret_str += "\n" + "-----------------------------"
+        # largest_bsize = bsize_range[-1]
+
+        # Dump the metrics to a text file
+        with open(throughput_results_formatted_file, "w") as f:
+            f.write(ret_str)
+        assert os.path.exists(throughput_results_formatted_file)
diff --git a/docker/quicktest.sh b/docker/quicktest.sh
index b4ad37232f..3684e3a0d4 100755
--- a/docker/quicktest.sh
+++ b/docker/quicktest.sh
@@ -6,16 +6,16 @@ cd $FINN_ROOT
# check if command line argument is empty or not present
if [ -z $1 ]; then
echo "Running quicktest: not (vivado or slow or board) with pytest-xdist"
- python setup.py test --addopts "-m 'not (vivado or slow or vitis or board)' --dist=loadfile -n $PYTEST_PARALLEL"
+ pytest -m 'not (vivado or slow or vitis or board or notebooks or bnn_pynq)' --dist=loadfile -n $PYTEST_PARALLEL
elif [ $1 = "main" ]; then
echo "Running main test suite: not (rtlsim or end2end) with pytest-xdist"
- python setup.py test --addopts "-k 'not (rtlsim or end2end)' --dist=loadfile -n $PYTEST_PARALLEL"
+ pytest -k 'not (rtlsim or end2end)' --dist=loadfile -n $PYTEST_PARALLEL
elif [ $1 = "rtlsim" ]; then
echo "Running rtlsim test suite with pytest-parallel"
- python setup.py test --addopts "-k rtlsim --workers $PYTEST_PARALLEL"
+ pytest -k rtlsim --workers $PYTEST_PARALLEL
elif [ $1 = "end2end" ]; then
echo "Running end2end test suite with no parallelism"
- python setup.py test --addopts "-k end2end"
+ pytest -k end2end
elif [ $1 = "full" ]; then
echo "Running full test suite, each step with appropriate parallelism"
$0 main;
diff --git a/docs/finn/brevitas_export.rst b/docs/finn/brevitas_export.rst
index 950b601f98..0a1c788324 100644
--- a/docs/finn/brevitas_export.rst
+++ b/docs/finn/brevitas_export.rst
@@ -8,11 +8,11 @@ Brevitas Export
:scale: 70%
:align: center
-FINN expects an ONNX model as input. This can be a model trained with `Brevitas `_. Brevitas is a PyTorch library for quantization-aware training and the FINN Docker image comes with several `example Brevitas networks `_. Brevitas provides an export of a quantized network in ONNX representation in several flavors.
-Two of the Brevitas-exported ONNX variants can be ingested by FINN:
-
- * FINN-ONNX: Quantized weights exported as tensors with additional attributes to mark low-precision datatypes. Quantized activations exported as MultiThreshold nodes.
- * QONNX: All quantization is represented using Quant, BinaryQuant or Trunc nodes. QONNX must be converted into FINN-ONNX by :py:mod:`finn.transformation.qonnx.convert_qonnx_to_finn`
+FINN expects an ONNX model as input. This can be a model trained with `Brevitas `_. Brevitas is a PyTorch library for quantization-aware training and the FINN Docker image comes with several `example Brevitas networks `_.
+Brevitas provides an export of a quantized network in QONNX representation, which is the format that can be ingested by FINN.
+In a QONNX graph, all quantization is represented using Quant, BinaryQuant or Trunc nodes.
+QONNX must be converted into FINN-ONNX by :py:mod:`finn.transformation.qonnx.convert_qonnx_to_finn`. FINN-ONNX is the intermediate representation (IR) FINN uses internally.
+In this IR, quantized weights are indicated through tensors with additional attributes to mark low-precision datatypes and quantized activations are expressed as MultiThreshold nodes.
To work with either type of ONNX model, it is loaded into a :ref:`modelwrapper` provided by FINN.
diff --git a/docs/finn/command_line.rst b/docs/finn/command_line.rst
index 8c37479a28..110a522847 100644
--- a/docs/finn/command_line.rst
+++ b/docs/finn/command_line.rst
@@ -20,7 +20,7 @@ two command line entry points for productivity and ease-of-use:
Jupyter notebook as a starting point, visualizing the model at intermediate
steps and adding calls to new transformations as needed.
Once you have a working flow, you can implement a command line entry for this
- by using the "advanced mode" described here.
+ by using the "advanced mode".
Simple dataflow build mode
@@ -28,7 +28,7 @@ Simple dataflow build mode
This mode is intended for simpler networks whose topologies resemble the
FINN end-to-end examples.
-It runs a fixed build flow spanning tidy-up, streamlining, HLS conversion
+It runs a fixed build flow spanning tidy-up, streamlining, HW conversion
and hardware synthesis.
It can be configured to produce different outputs, including stitched IP for
integration in Vivado IPI as well as bitfiles.
@@ -43,7 +43,9 @@ To use it, first create a folder with the necessary configuration and model file
3. Create a JSON file with the build configuration. It must be named ``dataflow_build_dir/dataflow_build_config.json``.
Read more about the build configuration options on :py:mod:`finn.builder.build_dataflow_config.DataflowBuildConfig`.
You can find an example .json file under ``src/finn/qnn-data/build_dataflow/dataflow_build_config.json``
-4. (Optional) create a JSON file with the folding configuration. It must be named ``dataflow_build_dir/folding_config.json``.
+4. (Optional) create a JSON file with the specialize layers configuration. It must be named ``dataflow_build_dir/specialize_layers_config.json``.
+ You can find an example .json file under ``src/finn/qnn-data/build_dataflow/specialize_layers_config.json``.
+5. (Optional) create a JSON file with the folding configuration. It must be named ``dataflow_build_dir/folding_config.json``.
You can find an example .json file under ``src/finn/qnn-data/build_dataflow/folding_config.json``.
Instead of specifying the folding configuration, you can use the `target_fps` option in the build configuration
to control the degree of parallelization for your network.
@@ -59,25 +61,28 @@ as it goes through numerous steps:
.. code-block:: none
- Building dataflow accelerator from /home/maltanar/sandbox/build_dataflow/model.onnx
+ Building dataflow accelerator from build_dataflow/model.onnx
Outputs will be generated at output_tfc_w1a1_Pynq-Z1
Build log is at output_tfc_w1a1_Pynq-Z1/build_dataflow.log
- Running step: step_tidy_up [1/16]
- Running step: step_streamline [2/16]
- Running step: step_convert_to_hls [3/16]
- Running step: step_create_dataflow_partition [4/16]
- Running step: step_target_fps_parallelization [5/16]
- Running step: step_apply_folding_config [6/16]
- Running step: step_generate_estimate_reports [7/16]
- Running step: step_hls_codegen [8/16]
- Running step: step_hls_ipgen [9/16]
- Running step: step_set_fifo_depths [10/16]
- Running step: step_create_stitched_ip [11/16]
- Running step: step_measure_rtlsim_performance [12/16]
- Running step: step_make_pynq_driver [13/16]
- Running step: step_out_of_context_synthesis [14/16]
- Running step: step_synthesize_bitfile [15/16]
- Running step: step_deployment_package [16/16]
+ Running step: step_qonnx_to_finn [1/19]
+ Running step: step_tidy_up [2/19]
+ Running step: step_streamline [3/19]
+ Running step: step_convert_to_hw [4/19]
+ Running step: step_create_dataflow_partition [5/19]
+ Running step: step_specialize_layers [6/19]
+ Running step: step_target_fps_parallelization [7/19]
+ Running step: step_apply_folding_config [8/19]
+ Running step: step_minimize_bit_width [9/19]
+ Running step: step_generate_estimate_reports [10/19]
+ Running step: step_hw_codegen [11/19]
+ Running step: step_hw_ipgen [12/19]
+ Running step: step_set_fifo_depths [13/19]
+ Running step: step_create_stitched_ip [14/19]
+ Running step: step_measure_rtlsim_performance [15/19]
+ Running step: step_out_of_context_synthesis [16/19]
+ Running step: step_synthesize_bitfile [17/19]
+ Running step: step_make_pynq_driver [18/19]
+ Running step: step_deployment_package [19/19]
You can read a brief description of what each step does on
@@ -99,6 +104,7 @@ The following outputs will be generated regardless of which particular outputs a
* ``build_dataflow.log`` is the build logfile that will contain any warnings/errors
* ``time_per_step.json`` will report the time (in seconds) each build step took
* ``final_hw_config.json`` will contain the final (after parallelization, FIFO sizing etc) hardware configuration for the build
+* ``template_specialize_layers_config.json`` is an example json file that can be used to set the specialize layers config
* ``intermediate_models/`` will contain the ONNX file(s) produced after each build step
@@ -206,3 +212,5 @@ You can launch the desired custom build flow using:
This will mount the specified folder into the FINN Docker container and launch
the build flow. If ```` is not specified it will default to ``build``
and thus execute ``build.py``. If it is specified, it will be ``.py``.
+
+If you would like to learn more about advanced builder settings, please have a look at `our tutorial about this topic `_.
diff --git a/docs/finn/conf.py b/docs/finn/conf.py
index 47ba99fb5f..a4416706c2 100644
--- a/docs/finn/conf.py
+++ b/docs/finn/conf.py
@@ -19,7 +19,7 @@
# -- Project information -----------------------------------------------------
project = "FINN"
-copyright = "2020, Xilinx"
+copyright = "2020-2022, Xilinx, 2022-2024, AMD"
author = "Y. Umuroglu and J. Petri-Koenig"
diff --git a/docs/finn/developers.rst b/docs/finn/developers.rst
index f9252f764c..2a5e26959b 100644
--- a/docs/finn/developers.rst
+++ b/docs/finn/developers.rst
@@ -2,15 +2,13 @@
Developer documentation
***********************
-.. note:: **This page is under construction.**
-
This page is intended to serve as a starting point for new FINN developers.
Power users may also find this information useful.
Prerequisites
================
-Before starting to do development on FINN it's a good idea to start
+Before starting to do development on FINN it is a good idea to start
with understanding the basics as a user. Going through all of the
:ref:`tutorials` is strongly recommended if you haven't already done so.
Additionally, please review the documentation available on :ref:`internals`.
@@ -61,7 +59,7 @@ further detailed below:
Docker images
===============
-If you want to add new dependencies (packages, repos) to FINN it's
+If you want to add new dependencies (packages, repos) to FINN it is
important to understand how we handle this in Docker.
The finn.dev image is built and launched as follows:
@@ -70,7 +68,7 @@ The finn.dev image is built and launched as follows:
2. run-docker.sh launches the build of the Docker image with `docker build` (unless ``FINN_DOCKER_PREBUILT=1``). Docker image is built from docker/Dockerfile.finn using the following steps:
- * Base: PyTorch dev image
+ * Base: Ubuntu 22.04 LTS image
* Set up apt dependencies: apt-get install a few packages for verilator and
* Set up pip dependencies: Python packages FINN depends on are listed in requirements.txt, which is copied into the container and pip-installed. Some additional packages (such as Jupyter and Netron) are also installed.
* Install XRT deps, if needed: For Vitis builds we need to install the extra dependencies for XRT. This is only triggered if the image is built with the INSTALL_XRT_DEPS=1 argument.
@@ -84,9 +82,9 @@ The finn.dev image is built and launched as follows:
4. Entrypoint script (docker/finn_entrypoint.sh) upon launching container performs the following:
- * Source Vivado settings64.sh from specified path to make vivado and vivado_hls available.
- * Download PYNQ board files into the finn root directory, unless they already exist.
- * Source Vitits settings64.sh if Vitis is mounted.
+ * Source Vivado settings64.sh from specified path to make vivado and vitis_hls available.
+ * Download board files into the finn root directory, unless they already exist or ``FINN_SKIP_BOARD_FILES=1``.
+ * Source Vitis settings64.sh if Vitis is mounted.
5. Depending on the arguments to run-docker.sh a different application is launched. run-docker.sh notebook launches a Jupyter server for the tutorials, whereas run-docker.sh build_custom and run-docker.sh build_dataflow trigger a dataflow build (see documentation). Running without arguments yields an interactive shell. See run-docker.sh for other options.
@@ -106,7 +104,7 @@ Linting
We use a pre-commit hook to auto-format Python code and check for issues.
See https://pre-commit.com/ for installation. Once you have pre-commit, you can install
the hooks into your local clone of the FINN repo.
-It's recommended to do this **on the host** and not inside the Docker container:
+It is recommended to do this **on the host** and not inside the Docker container:
::
@@ -119,7 +117,7 @@ you may have to fix it manually, then run `git commit` once again.
The checks are configured in .pre-commit-config.yaml under the repo root.
Testing
-=======
+========
Tests are vital to keep FINN running. All the FINN tests can be found at https://github.com/Xilinx/finn/tree/main/tests.
These tests can be roughly grouped into three categories:
@@ -132,7 +130,7 @@ These tests can be roughly grouped into three categories:
Additionally, qonnx, brevitas and finn-hlslib also include their own test suites.
The full FINN compiler test suite
-(which will take several hours to run and require a PYNQ board) can be executed
+(which will take several hours to run) can be executed
by:
::
@@ -146,7 +144,7 @@ requiring Vivado or as slow-running tests:
bash ./run-docker.sh quicktest
-When developing a new feature it's useful to be able to run just a single test,
+When developing a new feature it is useful to be able to run just a single test,
or a group of tests that e.g. share the same prefix.
You can do this inside the Docker container
from the FINN root directory as follows:
@@ -159,8 +157,8 @@ from the FINN root directory as follows:
If you want to run tests in parallel (e.g. to take advantage of a multi-core CPU)
you can use:
-* pytest-parallel for any rtlsim tests, e.g. `python setup.py test --addopts "-k rtlsim --workers auto"`
-* pytest-xdist for anything else, make sure to add `--dist=loadfile` if you have tests in the same file that have dependencies on each other e.g. `python setup.py test --addopts "-k mytest -n auto --dist=loadfile"`
+* pytest-parallel for any rtlsim tests, e.g. `pytest -k rtlsim --workers auto`
+* pytest-xdist for anything else, make sure to add `--dist=loadfile` if you have tests in the same file that have dependencies on each other e.g. `pytest -k mytest -n auto --dist=loadfile`
Finally, the full test suite with appropriate parallelization can be run inside the container by:
@@ -178,16 +176,9 @@ FINN provides two types of documentation:
* manually written documentation, like this page
* autogenerated API docs from Sphinx
-Everything is built using Sphinx, which is installed into the finn.dev
-Docker image. You can build the documentation locally by running the following
-inside the container:
-
-::
-
- python setup.py docs
+Everything is built using Sphinx.
-You can view the generated documentation on build/html/index.html.
-The documentation is also built online by readthedocs:
+The documentation is built online by readthedocs:
* finn.readthedocs.io contains the docs from the master branch
* finn-dev.readthedocs.io contains the docs from the dev branch
diff --git a/docs/finn/end_to_end_flow.rst b/docs/finn/end_to_end_flow.rst
index 0a022067c3..8fafde5a5e 100644
--- a/docs/finn/end_to_end_flow.rst
+++ b/docs/finn/end_to_end_flow.rst
@@ -2,7 +2,11 @@
End-to-End Flow
***************
-The following image shows an example end-to-end flow in FINN, starting from a trained PyTorch/Brevitas network and going all the way to a running FPGA accelerator.
+The following image shows an example end-to-end flow in FINN for a PYNQ board.
+Please note that you can build an IP block for your neural network **for every Xilinx-AMD FPGA**, but we only provide automatic system integration for a limited number of boards.
+However, you can use Vivado to integrate an IP block generated by FINN into your own design.
+
+The example flow in this image starts from a trained PyTorch/Brevitas network and goes all the way to a running FPGA accelerator.
As you can see in the picture, FINN has a high modularity and has the property that the flow can be stopped at any point and the intermediate result can be used for further processing or other purposes. This enables a wide range of users to benefit from FINN, even if they do not use the whole flow.
.. image:: ../../notebooks/end2end_example/bnn-pynq/finn-design-flow-example.svg
diff --git a/docs/finn/faq.rst b/docs/finn/faq.rst
index ef4457f53a..70c2f24ed2 100644
--- a/docs/finn/faq.rst
+++ b/docs/finn/faq.rst
@@ -7,16 +7,6 @@ Frequently Asked Questions
Can't find the answer to your question here? Check `FINN GitHub Discussions <https://github.com/Xilinx/finn/discussions>`_.
-Can I install FINN out of the Docker container?
- We do not support out of the Docker implementations at the moment. This is due
- to the high complexity of the FINN project dependencies.
-
-Since FINN uses ONNX, can I compile any model from the ONNX Model Zoo to an FPGA accelerator?
- The short answer is no. FINN uses ONNX in a specific (non-standard) way, including custom layer
- types and quantization annotations. Networks must be first quantized using Brevitas and exported
- to FINN-ONNX to be converted to FPGA accelerators.
-
-
Can I install FINN out of the Docker container?
We do not support out of the Docker implementations at the moment. This is due
to the high complexity of the FINN project dependencies.
@@ -52,7 +42,6 @@ What operating systems are supported by FINN?
FINN should work fine under any Linux-based OS capable of running Vivado/Vitis, as long
as you install Docker (``docker-ce``) on your machine.
-
I am getting DocNav and Model_Composer errors when launching the Docker image.
We do not mount those particular directories into the Docker container because they are not
used. The errors are Vivado related but you can safely ignore them.
@@ -74,16 +63,8 @@ How can I target an arbitrary Xilinx FPGA without PYNQ support?
Why do FINN-generated architectures need FIFOs between layers?
See https://github.com/Xilinx/finn/discussions/383
-How do I tell FINN to utilize DSPs instead of LUTs for MAC operations in particular layers?
- This is done with the ``resType="dsp"`` attribute on ``MatrixVectorActivation`` and ``Vector_Vector_Activate`` instances.
- When using the ``build_dataflow`` system, this can be specified at a per layer basis by specifying it as part of one or more layers’
- folding config (:py:mod:`finn.builder.build_dataflow_config.DataflowBuildConfig.folding_config_file`).
- This is a good idea for layers with more weight/input act bits and high PE*SIMD.
- See the `MobileNet-v1 build config for ZCU104 in finn-examples `_ for reference.
-
-
How do I tell FINN to utilize a particular type of memory resource in particular layers?
- This is done with the ``ram_style`` attribute. Check the particular ``HLSCustomOp`` attribute definition to see
+ This is done with the ``ram_style`` attribute. Check the particular ``HWCustomOp`` attribute definition to see
which modes are supported (`example for MatrixVectorActivation `_).
When using the ``build_dataflow`` system, this can be specified at a per layer basis by specifying it as part of one or more layers’
folding config (:py:mod:`finn.builder.build_dataflow_config.DataflowBuildConfig.folding_config_file`).
diff --git a/docs/finn/getting_started.rst b/docs/finn/getting_started.rst
index 9b3111b70e..217f982702 100644
--- a/docs/finn/getting_started.rst
+++ b/docs/finn/getting_started.rst
@@ -8,7 +8,7 @@ Quickstart
==========
1. Install Docker to run `without root `_
-2. Set up ``FINN_XILINX_PATH`` and ``FINN_XILINX_VERSION`` environment variables pointing respectively to the Xilinx tools installation directory and version (e.g. ``FINN_XILINX_PATH=/opt/Xilinx`` and ``FINN_XILINX_VERSION=2022.1``)
+2. Set up ``FINN_XILINX_PATH`` and ``FINN_XILINX_VERSION`` environment variables pointing respectively to the Xilinx tools installation directory and version (e.g. ``FINN_XILINX_PATH=/opt/Xilinx`` and ``FINN_XILINX_VERSION=2022.2``)
3. Clone the FINN compiler from the repo: ``git clone https://github.com/Xilinx/finn/`` and go into the directory where it is cloned
4. Execute ``./run-docker.sh quicktest`` to verify your installation.
5. Optionally, follow the instructions on :ref:`PYNQ board first-time setup` or :ref:`Alveo first-time setup` for board setup.
@@ -28,8 +28,8 @@ to train *customized* networks and create highly-efficient FPGA implementations
In general, the approach for using the FINN framework is as follows:
1. Train your own quantized neural network (QNN) in `Brevitas `_. We have some `guidelines `_ on quantization-aware training (QAT).
-2. Export to FINN-ONNX by following `this tutorial `_ .
-3. Use FINN's ``build_dataflow`` system on the exported model by following this `tutorial `_
+2. Export to QONNX and convert to FINN-ONNX by following `this tutorial `_ .
+3. Use FINN's ``build_dataflow`` system on the exported model by following this `tutorial `_ or for advanced settings have a look at this `tutorial `_ .
4. Adjust your QNN topology, quantization settings and ``build_dataflow`` configuration to get the desired results.
Please note that the framework is still under development, and how well this works will depend on how similar your custom network is to the examples we provide.
@@ -49,17 +49,16 @@ Running FINN in Docker
======================
FINN runs inside a Docker container, it comes with a script to easily build and launch the container. If you are not familiar with Docker, there are many excellent `online resources `_ to get started.
You may want to review the :ref:`General FINN Docker tips` and :ref:`Environment variables` as well.
-If you want to use prebuilt images, read :ref:`Using a prebuilt image`.
The above mentioned script to build and launch the FINN docker container is called `run-docker.sh `_ . It can be launched in the following modes:
Launch interactive shell
************************
-Simply running sh run-docker.sh without any additional arguments will create a Docker container with all dependencies and give you a terminal with you can use for development for experimentation:
+Simply running bash run-docker.sh without any additional arguments will create a Docker container with all dependencies and give you a terminal which you can use for development and experimentation:
::
- bash ./run_docker.sh
+ bash ./run-docker.sh
Launch a Build with ``build_dataflow``
@@ -70,8 +69,8 @@ or a user-defined flow from the command line as follows:
::
- bash ./run_docker.sh build_dataflow
- bash ./run_docker.sh build_custom
+ bash ./run-docker.sh build_dataflow
+ bash ./run-docker.sh build_custom
Launch Jupyter notebooks
@@ -93,11 +92,12 @@ This will launch the `Jupyter notebook `_ server inside a
Environment variables
**********************
-Prior to running the `run-docker.sh` script, there are several environment variables you can set to configure certain aspects of FINN.
-These are summarized below:
+Prior to running the ``run-docker.sh`` script, there are several environment variables you can set to configure certain aspects of FINN.
+For a complete list, please have a look in the `run-docker.sh `_ file.
+The most relevant are summarized below:
* (required) ``FINN_XILINX_PATH`` points to your Xilinx tools installation on the host (e.g. ``/opt/Xilinx``)
-* (required) ``FINN_XILINX_VERSION`` sets the Xilinx tools version to be used (e.g. ``2022.1``)
+* (required) ``FINN_XILINX_VERSION`` sets the Xilinx tools version to be used (e.g. ``2022.2``)
* (required for Alveo) ``PLATFORM_REPO_PATHS`` points to the Vitis platform files (DSA).
* (required for Alveo) ``XRT_DEB_VERSION`` specifies the .deb to be installed for XRT inside the container (see default value in ``run-docker.sh``).
* (optional) ``NUM_DEFAULT_WORKERS`` (default 4) specifies the degree of parallelization for the transformations that can be run in parallel, potentially reducing build time
@@ -107,18 +107,14 @@ These are summarized below:
* (optional) ``LOCALHOST_URL`` (default localhost) sets the base URL for accessing e.g. Netron from inside the container. Useful when running FINN remotely.
* (optional) ``NETRON_PORT`` (default 8081) changes the port for Netron inside Docker
* (optional) ``PYNQ_BOARD`` or ``ALVEO_BOARD`` specifies the type of PYNQ/Alveo board used (see "supported hardware" below) for the test suite
-* (optional) ``PYNQ_IP`` and ``PYNQ_PORT`` (or ``ALVEO_IP`` and ``ALVEO_PORT``) specify ip address and port number to access the PYNQ board / Alveo target
-* (optional) ``PYNQ_USERNAME`` and ``PYNQ_PASSWORD`` (or ``ALVEO_USERNAME`` and ``ALVEO_PASSWORD``) specify the PYNQ board / Alveo host access credentials for the test suite. For PYNQ, password is always needed to run as sudo. For Alveo, you can leave the password empty and place your ssh private key in the ``finn/ssh_keys`` folder to use keypair authentication.
-* (optional) ``PYNQ_TARGET_DIR`` (or ``ALVEO_TARGET_DIR``) specifies the target dir on the PYNQ board / Alveo host for the test suite
* (optional) ``IMAGENET_VAL_PATH`` specifies the path to the ImageNet validation directory for tests.
-* (optional) ``FINN_DOCKER_PREBUILT`` (default 0) if set to 1 then skip Docker image building and use the image tagged with ``FINN_DOCKER_TAG``.
* (optional) ``FINN_DOCKER_TAG`` (autogenerated) specifies the Docker image tag to use.
* (optional) ``FINN_DOCKER_RUN_AS_ROOT`` (default 0) if set to 1 then run Docker container as root, default is the current user.
-* (optional) ``FINN_DOCKER_GPU`` (autodetected) if not 0 then expose all Nvidia GPUs or those selected by ``NVIDIA_VISIBLE_DEVICES`` to Docker container for accelerated DNN training. Requires `Nvidia Container Toolkit `_
* (optional) ``FINN_DOCKER_EXTRA`` (default "") pass extra arguments to the ``docker run`` command when executing ``./run-docker.sh``
* (optional) ``FINN_SKIP_DEP_REPOS`` (default "0") skips the download of FINN dependency repos (uses the ones already downloaded under deps/).
* (optional) ``NVIDIA_VISIBLE_DEVICES`` (default "") specifies specific Nvidia GPUs to use in Docker container. Possible values are a comma-separated list of GPU UUID(s) or index(es) e.g. ``0,1,2``, ``all``, ``none``, or void/empty/unset.
* (optional) ``DOCKER_BUILDKIT`` (default "1") enables `Docker BuildKit `_ for faster Docker image rebuilding (recommended).
+* (optional) ``FINN_SINGULARITY`` (default "") points to a pre-built Singularity image to use instead of the Docker image. Singularity support is experimental and intended only for systems where Docker is unavailable. Does not support GPUs.
General FINN Docker tips
************************
@@ -127,23 +123,11 @@ General FINN Docker tips
* If you want a new terminal on an already-running container, you can do this with ``docker exec -it <name_of_container> bash``.
* The container is spawned with the `--rm` option, so make sure that any important files you created inside the container are either in the finn compiler folder (which is mounted from the host computer) or otherwise backed up.
-Using a prebuilt image
-**********************
-
-By default the ``run-docker.sh`` script tries to re-build the Docker image with each run. After the first run this should go quite fast thanks to Docker caching.
-If you are having trouble building the Docker image or need offline access, you can use prebuilt images by following these steps:
-
-1. Pull a prebuilt Docker image with ``docker pull maltanar/finn:`` where ```` can be ``dev_latest`` or ``main_latest``
-2. Set the ``FINN_DOCKER_TAG`` to the name of the image you just pulled e.g. ``FINN_DOCKER_TAG=maltanar/finn:dev_latest``
-3. Set ``FINN_DOCKER_PREBUILT=1``
-4. You can now launch the Docker image in all modes without re-building or any internet access.
-
-
Supported FPGA Hardware
=======================
-**Shell-integrated accelerator + driver:** For quick deployment, we target boards supported by `PYNQ `_ . For these platforms, we can build a full bitfile including DMAs to move data into and out of the FINN-generated accelerator, as well as a Python driver to launch the accelerator. We support the Pynq-Z1, Pynq-Z2, Ultra96, ZCU102 and ZCU104 boards, as well as Alveo cards.
+**Vivado IPI support for any Xilinx FPGA:** FINN generates a Vivado IP Integrator (IPI) design from the neural network with AXI stream (FIFO) in-out interfaces, which can be integrated onto any Xilinx-AMD FPGA as part of a larger system. It’s up to you to take the FINN-generated accelerator (what we call “stitched IP” in the tutorials), wire it up to your FPGA design and send/receive neural network data to/from the accelerator.
-**Vivado IPI support for any Xilinx FPGA:** FINN generates a Vivado IP Integrator (IPI) design from the neural network with AXI stream (FIFO) in-out interfaces, which can be integrated onto any Xilinx FPGA as part of a larger system. It's up to you to take the FINN-generated accelerator (what we call "stitched IP" in the tutorials), wire it up to your FPGA design and send/receive neural network data to/from the accelerator.
+**Shell-integrated accelerator + driver:** For quick deployment, we target boards supported by `PYNQ `_ . For these platforms, we can build a full bitfile including DMAs to move data into and out of the FINN-generated accelerator, as well as a Python driver to launch the accelerator. We support the Pynq-Z1, Pynq-Z2, Kria SOM, Ultra96, ZCU102 and ZCU104 boards, as well as Alveo cards.
PYNQ board first-time setup
****************************
@@ -179,7 +163,7 @@ On the target side:
On the host side:
-1. Install Vitis 2022.1 and set up the ``VITIS_PATH`` environment variable to point to your installation.
+1. Install Vitis 2022.2 and set up the ``VITIS_PATH`` environment variable to point to your installation.
2. Install Xilinx XRT. Ensure that the ``XRT_DEB_VERSION`` environment variable reflects which version of XRT you have installed.
3. Install the Vitis platform files for Alveo and set up the ``PLATFORM_REPO_PATHS`` environment variable to point to your installation. *This must be the same path as the target's platform files (target step 2)*
4. Set up the ``ALVEO_*`` environment variables accordingly for your target, see description of environment variables above.
@@ -203,7 +187,7 @@ System Requirements
* Ubuntu 18.04 with ``bash`` installed
* Docker `without root `_
-* A working Vitis/Vivado 2022.1 installation
+* A working Vitis/Vivado 2022.2 installation
* ``FINN_XILINX_PATH`` and ``FINN_XILINX_VERSION`` environment variables correctly set, see `Quickstart`_
* *(optional)* `Vivado/Vitis license`_ if targeting non-WebPack FPGA parts.
* *(optional)* A PYNQ board with a network connection, see `PYNQ board first-time setup`_
diff --git a/docs/finn/hw_build.rst b/docs/finn/hw_build.rst
index a5c486935d..39c39eb7df 100644
--- a/docs/finn/hw_build.rst
+++ b/docs/finn/hw_build.rst
@@ -8,7 +8,7 @@ Hardware Build and Deployment
:scale: 70%
:align: center
-A model where all layers have been converted to HLS layers can be processed by
+A model where all layers have been converted to either HLS or RTL layers can be processed by
FINN to build a bitfile and driver targeting a Zynq or Alveo system or to generate a Vivado IP Integrator (IPI)
design with AXI stream (FIFO) in-out interfaces, which can be integrated onto any Xilinx FPGA as part of a larger system.
@@ -69,9 +69,11 @@ FINN will descend into each partition and insert FIFO nodes between streaming no
where FIFO depths dictated by the node attributes, using the :py:mod:`finn.transformation.fpgadataflow.insert_fifo.InsertFIFO`
transformation.
Afterwards, IP blocks will be created for each partition, which in turn contain the
-IP blocks for each layer stitched together. The layer-level IP blocks
-are generated by Vivado HLS, using a sequence of :py:mod:`finn.transformation.fpgadataflow.prepare_ip.PrepareIP`
+IP blocks for HLS layers and RTL modules for RTL layers stitched together. The layer-level IP blocks for HLS layers
+are generated by Vitis HLS, using a sequence of :py:mod:`finn.transformation.fpgadataflow.prepare_ip.PrepareIP`
and :py:mod:`finn.transformation.fpgadataflow.hlssynth_ip.HLSSynthIP` transformations.
+For RTL layers, calling :py:mod:`finn.transformation.fpgadataflow.prepare_ip.PrepareIP` will fill out the RTL wrapper files and store all files belonging to the RTL module in a folder.
+
The top-level IP blocks are generated in Vivado IPI, using the :py:mod:`finn.transformation.fpgadataflow.create_stitched_ip.CreateStitchedIP` transformation.
Vivado/Vitis Project Generation and Synthesis
@@ -85,8 +87,4 @@ transformation for Zynq, and the `VitisLink` transformation for Alveo.
Deployment
==========
-
-Deployment and Remote Execution
--------------------------------
-
-The bitfile and the driver file(s) are copied to the PYNQ board and can be executed there. For more information see the description in the `end2end_example `_ Jupyter notebooks.
+The bitfile and the driver file(s) can be copied to the PYNQ board and executed there. For more information see the description in the `end2end_example `_ Jupyter notebooks.
diff --git a/docs/finn/img/finn-hw-build.png b/docs/finn/img/finn-hw-build.png
index f3a591fa8f..412317b8d1 100644
Binary files a/docs/finn/img/finn-hw-build.png and b/docs/finn/img/finn-hw-build.png differ
diff --git a/docs/finn/img/finn-stack.png b/docs/finn/img/finn-stack.png
index e34b1ecb45..c2b49de57e 100644
Binary files a/docs/finn/img/finn-stack.png and b/docs/finn/img/finn-stack.png differ
diff --git a/docs/finn/img/mem_mode.png b/docs/finn/img/mem_mode.png
index 27783c5f32..451561c54b 100755
Binary files a/docs/finn/img/mem_mode.png and b/docs/finn/img/mem_mode.png differ
diff --git a/docs/finn/img/nw-prep.png b/docs/finn/img/nw-prep.png
index bed56ebc6d..28a7c9d3ff 100755
Binary files a/docs/finn/img/nw-prep.png and b/docs/finn/img/nw-prep.png differ
diff --git a/docs/finn/img/repo-structure.png b/docs/finn/img/repo-structure.png
index 704e5e5bda..05db9d201c 100644
Binary files a/docs/finn/img/repo-structure.png and b/docs/finn/img/repo-structure.png differ
diff --git a/docs/finn/img/rtl_swg_impl_styles.png b/docs/finn/img/rtl_swg_impl_styles.png
new file mode 100644
index 0000000000..265ff9b915
Binary files /dev/null and b/docs/finn/img/rtl_swg_impl_styles.png differ
diff --git a/docs/finn/index.rst b/docs/finn/index.rst
index c13bf81cec..ab9cc96fb1 100644
--- a/docs/finn/index.rst
+++ b/docs/finn/index.rst
@@ -5,21 +5,21 @@ FINN
Welcome to the FINN Read the Docs website!
What is FINN?
-=============
+==============
.. image:: img/finn-stack.png
- :scale: 40%
+ :scale: 15%
:align: center
'FINN' is colloquially used to refer to two separate but highly related things:
-* The FINN **project**, which is an experimental framework from Xilinx Research Labs
- to explore deep neural network inference on FPGAs. It specifically targets
- quantized neural networks (QNNs), with emphasis on generating dataflow-style
+* The FINN **project**, which is an experimental framework from AMD Research and
+ Advanced Development (RAD) to explore deep neural network inference on FPGAs.
+ It specifically targets quantized neural networks (QNNs), with emphasis on generating dataflow-style
architectures customized for each network.
The key components are illustrated in the figure above;
including tools for training
quantized neural networks (Brevitas), the FINN compiler, and the finn-hlslib
- Vivado HLS library of FPGA components for QNNs.
+ Vitis HLS library of FPGA components for QNNs.
Read more on the `FINN project homepage `_.
* The FINN **compiler**, which this Read the Docs website is the documentation for.
diff --git a/docs/finn/internals.rst b/docs/finn/internals.rst
index add70d649c..0fd6c42350 100644
--- a/docs/finn/internals.rst
+++ b/docs/finn/internals.rst
@@ -27,8 +27,6 @@ Custom Operations/Nodes
FINN uses many custom operations (op_type in ONNX NodeProto) that are not defined in the ONNX operator schema. These custom nodes are marked with domain="finn.*" or domain="qonnx.*" in the protobuf to identify them as such. These nodes can represent specific operations that we need for low-bit networks, or operations that are specific to a particular hardware backend. To get more familiar with custom operations and how they are created, please take a look in the Jupyter notebook about CustomOps (see chapter :ref:`tutorials` for details) or directly in the module :py:mod:`finn.custom_op`.
-.. note:: See the description of `this PR `_ for more on how the operator wrapper library is organized.
-
Custom ONNX Execution Flow
==========================
@@ -137,7 +135,7 @@ ModelWrapper contains more useful functions, if you are interested please have a
Analysis Pass
=============
-An analysis pass traverses the graph structure and produces information about certain properties. It gets the model in the ModelWrapper as input and returns a dictionary of the properties the analysis extracts. If you are interested in how to write an analysis pass for FINN, please take a look at the Jupyter notebook about how to write an analysis pass, see chapter :ref:`tutorials` for details. For more information about existing analysis passes in FINN, see module :py:mod:`finn.analysis` .
+An analysis pass traverses the graph structure and produces information about certain properties. It gets the model in the ModelWrapper as input and returns a dictionary of the properties the analysis extracts. If you are interested in how to write an analysis pass for FINN, please take a look at the Jupyter notebook about how to write an analysis pass, see chapter :ref:`tutorials` for details. For more information about existing analysis passes in FINN, see module :py:mod:`finn.analysis`.
.. _transformation_pass:
@@ -148,26 +146,26 @@ A transformation passes changes (transforms) the given model, it gets the model
.. _mem_mode:
-MatrixVectorActivation *mem_mode*
-==================================
+HLS variant of MatrixVectorActivation: *mem_mode*
+=================================================
FINN supports three types of the so-called *mem_mode* attribute for the node MatrixVectorActivation. This mode controls how the weight values are accessed during the execution. That means the mode setting has direct influence on the resulting circuit. Currently three settings for the *mem_mode* are supported in FINN:
-* "const"
+* "internal_embedded" (former "const" mode)
-* "decoupled"
+* "internal_decoupled" (former "decoupled" mode)
* "external"
-The following picture shows the idea behind the "const" and "decoupled" mode.
+The following picture shows the idea behind the "internal_embedded" and "internal_decoupled" modes.
.. image:: img/mem_mode.png
:scale: 55%
:align: center
-Const mode
-----------
-In *const* mode the weights are "baked in" into the Matrix-Vector-Activate-Unit (MVAU), which means they are part of the HLS code. During the IP block generation the weight values are integrated as *params.h* file in the HLS code and synthesized together with it. For the *const* mode IP block generation the `Matrix_Vector_Activate_Batch function `_ from the finn-hls library is used, which implements a standard MVAU. The resulting IP block has an input and an output stream, as shown in the above picture on the left. FIFOs in the form of verilog components are connected to these.
+Internal_embedded mode
+------------------------
+In *internal_embedded* mode the weights are "baked into" the Matrix-Vector-Activate-Unit (MVAU), which means they are part of the HLS code. During the IP block generation the weight values are integrated as a *params.h* file in the HLS code and synthesized together with it. For the *internal_embedded* mode IP block generation the `Matrix_Vector_Activate_Batch function `_ from the finn-hls library is used, which implements a standard MVAU. The resulting IP block has an input and an output stream, as shown in the above picture on the left. FIFOs in the form of verilog components are connected to these.
Advantages:
@@ -175,17 +173,15 @@ Advantages:
* easier to debug layer in cppsim since no additional components
-* well-tested and mature components
-
Disadvantages:
* can lead to very long HLS synthesis times for certain weight array shapes
* less control over the weight memory FPGA primitives, Vivado HLS doesn't always make the best resource allocation decisions
-Decoupled mode
---------------
-In *decoupled* mode a different variant of the MVAU with three ports is used. Besides the input and output streams, which are fed into the circuit via Verilog FIFOs, there is another input, which is used to stream the weights. For this the `streaming MVAU `_ from the finn-hls library is used. To make the streaming possible a Verilog weight streamer component accesses the weight memory and sends the values via another FIFO to the MVAU. This component can be found in the `finn-rtllib `_ under the name *memstream.v*. For the IP block generation this component, the IP block resulting from the synthesis of the HLS code of the streaming MVAU and a FIFO for the weight stream are combined in a verilog wrapper. The weight values are saved in .dat files and stored in the weight memory from which the weight streamer reads. The resulting verilog component, which is named after the name of the node and has the suffix "_memstream.v", exposes only two ports to the outside, the data input and output. It therefore behaves externally in the same way as the MVAU in *const* mode.
+Internal_decoupled mode
+------------------------
+In *internal_decoupled* mode a different variant of the MVAU with three ports is used. Besides the input and output streams, which are fed into the circuit via Verilog FIFOs, there is another input, which is used to stream the weights. For this the `streaming MVAU `_ from the finn-hls library is used. To make the streaming possible a Verilog weight streamer component accesses the weight memory and sends the values via another FIFO to the MVAU. This component can be found in the `finn-rtllib `_ under the name *memstream.v*. For the IP block generation this component, the IP block resulting from the synthesis of the HLS code of the streaming MVAU and a FIFO for the weight stream are combined. The weight values are saved in .dat files and stored in the weight memory from which the weight streamer reads. The resulting verilog component, which is named after the name of the node and has the suffix "_memstream.v", exposes only two ports to the outside, the data input and output. It therefore behaves externally in the same way as the MVAU in *internal_embedded* mode.
Advantages:
@@ -197,11 +193,149 @@ Advantages:
Disadvantages:
-* somewhat less well-tested compared to the const mode
-
-* higher resource footprint due to additional weight streamer and weight FIFO
+* slightly higher resource footprint due to additional weight streamer and weight FIFO
How to set *mem_mode*
---------------------
-When the nodes in the network are converted to HLS layers, the *mem_mode* can be passed. More detailed information about the transformations that prepare the network and the transformation that performs the conversion to HLS layers can be found in chapter :ref:`nw_prep`. The *mem_mode* is passed as argument. Note that if no argument is passed, the default is *const*.
+When the nodes in the network are specialized to HLS layers, the *mem_mode* can be passed. More detailed information about the transformations that prepare the network and the transformation that performs the specialization to HLS layers can be found in chapter :ref:`nw_prep`. The *mem_mode* is set in the node attributes of the nodes and can be passed as part of the folding configuration. The default is *internal_decoupled*.
+
+
+.. _folding_factors:
+
+Constraints to folding factors per layer
+=========================================
+
+.. list-table:: Folding factor constraints
+
+ * - **Layers**
+ - **Parameters**
+ - **Constraints**
+ * - Addstreams
+ - PE
+ - inp_channels % PE == 0
+ * - ChannelwiseOp
+ - PE
+ - channels % PE == 0
+ * - ConvolutionInputGenerator
+ - SIMD
+ - inp_channels % SIMD == 0
+ * - Downsampler
+ - SIMD
+ - inp_channels % SIMD == 0
+ * - DuplicateStreams
+ - PE
+ - channels % PE == 0
+ * - StreamingEltwise
+ - PE
+ - inp_channels % PE == 0
+ * - FMPadding
+ - SIMD
+ - inp_channels % SIMD == 0
+ * - FMPadding_Pixel
+ - SIMD
+ - inp_channels % SIMD == 0
+ * - Globalaccpool
+ - PE
+ - channels % PE == 0
+ * - Labelselect
+ - PE
+ - num_labels % PE == 0
+ * - MatrixVectorActivation
+ - PE & SIMD
+ - MH % PE == 0 & MW % SIMD == 0
+ * - Pool
+ - PE
+ - inp_channels % PE == 0
+ * - Thresholding
+ - PE
+ - MH % PE == 0
+ * - VectorVectorActivation
+ - PE & SIMD
+ - k_h * k_w % SIMD == 0 & channels % PE == 0
+
+
+RTL ConvolutionInputGenerator
+=============================
+
+FINN implements convolution operations by pairing a ConvolutionInputGenerator (or "sliding window generator (SWG)") with an MVAU or VVAU (for depthwise convolution).
+This RTL version is an alternative to the original `HLS implementation `_ and aims to improve on it in the following ways:
+
+* Support a wider range of hyperparameters without the fragmentation into 16+ separate HLS functions
+
+* Support additional degrees of parallelism (i.e., across the output window or multiple input samples) that are difficult to implement in HLS
+
+* Support additional features, such as dynamic feature map sizing
+
+* Improve resource efficiency
+
+
+The component is implemented by generating (System-)Verilog code for each individual instance, realized via the template + replacement dictionary mechanism found in other FINN components.
+
+Implementation styles
+---------------------
+Depending on the amount of parallelism requested, one of two implementation styles is selected. The following table defines folding parameters (marked in bold text) and supported configurations.
+
+.. list-table:: Parallelism configurations
+
+ * - **SIMD**
+ - **parallel_window**
+ - **M**
+ - MMV_in
+ - MMV_out
+ - Style
+ - Notes
+ * - < C
+ - 0
+ - 1
+ - 1
+ - 1
+ - default
+ - depthwise-aware
+ * - C
+ - 0
+ - 1
+ - 1
+ - 1
+ - default
+ - depthwise-agnostic
+ * - < C
+ - 1
+ - 1
+ - 1
+ - K
+ - parallel
+ - depthwise only
+ * - C
+ - 1
+ - 1
+ - 1
+ - K
+ - parallel
+ - depthwise-agnostic
+ * - C
+ - 1
+ - M
+ - M
+ - M*K
+ - parallel
+ - Currently unsupported
+
+(With C = #Channels, MMV_in = input samples (or "pixels") per cycle, MMV_out = output samples (or "pixels") per cycle, K = kernel_width * kernel_height.)
+
+The following diagram shows the operating principle of both styles, the "parallel" variant is pictured for a 2x2 kernel without dilation.
+
+.. image:: img/rtl_swg_impl_styles.png
+ :align: center
+
+The main difference lies in the buffer structure. If the output width is equal to the input width ("default mode"), an addressable circular buffer is used, which can be implemented either in LUTRAM, BRAM, or URAM resources. If parallel access to multiple window elements is required ("parallel mode"), the SWG generates a fixed structure of registers and line buffers to avoid memory port limitations and exploding multiplexing logic, while still featuring LUT-saving BRAM/URAM implementation for the line buffers.
+
+The "default" style also supports a dynamic mode, which provides an interface to change feature map dimensions, stride, or dilation at run-time. See `this pull request `_ description for more information.
+
+Folding
+-------
+The RTL SWG is supported by the basic automatic folding algorithm in FINN (:py:mod:`finn.transformation.fpgadataflow.set_folding.SetFolding`). Consider the following implications:
+
+**MVAU:** Although it is recommended to unfold SIMD first, SIMD and PE can be set independently. Full (and balanced) parallelism is achieved by using the SWG in parallel window mode and setting MVAU SIMD and PE to their maximum values (SIMD = MW = C_in * K, PE = MH = C_out).
+
+**VVAU:** The VVAU component supports SIMD unfolding (up to SIMD = K) independently from PE unfolding (up to PE = C), but can't accept a datawidth-converted input from a fully-parallel SWG in case PE is not fully unfolded due to the depthwise data layout. Therefore, it is required to set SIMD of the SWG = PE of the VVAU when window-parallelism is enabled. In this scenario, VVAU SIMD < K is supported via an automatically inserted DWC.
diff --git a/docs/finn/nw_prep.rst b/docs/finn/nw_prep.rst
index 6fea992cf7..5b1d59b99d 100644
--- a/docs/finn/nw_prep.rst
+++ b/docs/finn/nw_prep.rst
@@ -32,19 +32,28 @@ The idea behind streamlining is to eliminate floating point operations in a mode
After this transformation the ONNX model is streamlined and contains now custom nodes in addition to the standard nodes. At this point we can use the :ref:`verification` to simulate the model using Python and in the next step some of the nodes can be converted into HLS layers that correspond to finn_hlslib functions.
-Convert to HLS Layers
+Convert to HW Layers
=====================
-In this step standard or custom layers are converted to HLS layers. HLS layers are layers that directly correspond to a finn-hlslib function call. For example pairs of binary XNORPopcountMatMul and MultiThreshold layers are converted to MatrixVectorActivation layers. The result is a model consisting of a mixture of HLS and non-HLS layers. For more details, see :py:mod:`finn.transformation.fpgadataflow.convert_to_hls_layers`. The MatrixVectorActivation layer can be implemented in three different modes, *const*, *decoupled* (see chapter :ref:`mem_mode`) and *external*.
+In this step standard or custom layers are converted to HW layers. HW layers are abstract (placeholder) layers that can be implemented either in HLS or as an RTL module using FINN. They do not directly correspond to an HLS or Verilog implementation, but they will be converted into one or the other later in the flow.
+
+The result is a model consisting of a mixture of HW and non-HW layers. For more details, see :py:mod:`finn.transformation.fpgadataflow.convert_to_hw_layers`.
Dataflow Partitioning
=====================
-In the next step the graph is split and the part consisting of HLS layers is further processed in the FINN flow. The parent graph containing the non-HLS layers remains. The PE and SIMD are set to 1 by default, so the result is a network of only HLS layers with maximum folding. The model can be verified using the *cppsim* simulation. It is a simulation using C++ and is described in more detail in chapter :ref:`verification`.
+In the next step the graph is split and the part consisting of HW layers is further processed in the FINN flow. The parent graph containing the non-HW layers remains.
+
+Specialize Layers
+=====================
+
+At this point the network has been converted to HW abstraction layers and the non-HW layers have been split off, so processing continues with the HW part of the model. HW abstraction layers are abstract (placeholder) layers that can be either implemented in HLS or as an RTL module using FINN. In the next flow step, we convert each of these layers to either an HLS or RTL variant by calling the SpecializeLayers transformation (see :py:mod:`finn.transformation.fpgadataflow.specialize_layers`). It is possible to tell the FINN flow which implementation style {"hls", "rtl"} is preferred; depending on the layer type, this preference is either honored or replaced by a reasonable default.
Folding
=========
+The PE and SIMD are set to 1 by default, so the result is a network of only HLS/RTL layers with maximum folding. The HLS layers of the model can be verified using the *cppsim* simulation. It is a simulation using C++ and is described in more detail in chapter :ref:`verification`.
+
To adjust the folding, the values for PE and SIMD can be increased to achieve also an increase in the performance. The result can be verified using the same simulation flow as for the network with maximum folding (*cppsim* using C++), for details please have a look at chapter :ref:`verification`.
-The result is a network of HLS layers with desired folding and it can be passed to :ref:`hw_build`.
+The result is a network of HLS/RTL layers with desired folding and it can be passed to :ref:`hw_build`.
diff --git a/docs/finn/source_code/finn.analysis.rst b/docs/finn/source_code/finn.analysis.rst
index f2321dbee7..d97c04eb62 100644
--- a/docs/finn/source_code/finn.analysis.rst
+++ b/docs/finn/source_code/finn.analysis.rst
@@ -31,6 +31,14 @@ qonnx.analysis.inference\_cost
:undoc-members:
:show-inheritance:
+qonnx.analysis.tensor\_stats
+-----------------------------
+
+.. automodule:: qonnx.analysis.tensor_stats
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
qonnx.analysis.topology
-----------------------------
diff --git a/docs/finn/source_code/finn.core.rst b/docs/finn/source_code/finn.core.rst
index afa1ecffa0..28cb47eaf7 100644
--- a/docs/finn/source_code/finn.core.rst
+++ b/docs/finn/source_code/finn.core.rst
@@ -54,14 +54,6 @@ finn.core.onnx\_exec
:undoc-members:
:show-inheritance:
-finn.core.remote\_exec
------------------------------
-
-.. automodule:: finn.core.remote_exec
- :members:
- :undoc-members:
- :show-inheritance:
-
finn.core.rtlsim\_exec
-----------------------------
diff --git a/docs/finn/source_code/finn.custom_op.fpgadataflow.hls.rst b/docs/finn/source_code/finn.custom_op.fpgadataflow.hls.rst
new file mode 100644
index 0000000000..5a4fff6052
--- /dev/null
+++ b/docs/finn/source_code/finn.custom_op.fpgadataflow.hls.rst
@@ -0,0 +1,184 @@
+*****************************
+Custom Op - fpgadataflow.hls
+*****************************
+
+HLS Custom Op Nodes
+===================
+
+finn.custom\_op.fpgadataflow.addstreams\_hls
+---------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.hls.addstreams_hls
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+finn.custom\_op.fpgadataflow.channelwise\_op\_hls
+-----------------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.hls.channelwise_op_hls
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+finn.custom\_op.fpgadataflow.checksum_hls
+------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.hls.checksum_hls
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+finn.custom\_op.fpgadataflow.concat_hls
+-----------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.hls.concat_hls
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
+finn.custom\_op.fpgadataflow.convolutioninputgenerator_hls
+-----------------------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.hls.convolutioninputgenerator_hls
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+finn.custom\_op.fpgadataflow.downsampler_hls
+---------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.hls.downsampler_hls
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+finn.custom\_op.fpgadataflow.duplicatestreams\_hls
+-------------------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.hls.duplicatestreams_hls
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+finn.custom\_op.fpgadataflow.fmpadding\_hls
+-----------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.hls.fmpadding_hls
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+finn.custom\_op.fpgadataflow.fmpadding\_pixel\_hls
+---------------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.hls.fmpadding_pixel_hls
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+finn.custom\_op.fpgadataflow.globalaccpool\_hls
+---------------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.hls.globalaccpool_hls
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+finn.custom\_op.fpgadataflow.iodma\_hls
+----------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.hls.iodma_hls
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+finn.custom\_op.fpgadataflow.labelselect\_hls
+-----------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.hls.labelselect_hls
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+finn.custom\_op.fpgadataflow.lookup\_hls
+------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.hls.lookup_hls
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+finn.custom\_op.fpgadataflow.matrixvectoractivation_hls
+--------------------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.hls.matrixvectoractivation_hls
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
+finn.custom\_op.fpgadataflow.pool\_hls
+-----------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.hls.pool_hls
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+finn.custom\_op.fpgadataflow.streamingdatawidthconverter\_hls
+----------------------------------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.hls.streamingdatawidthconverter_hls
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+finn.custom\_op.fpgadataflow.streamingeltwise\_hls
+----------------------------------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.hls.streamingeltwise_hls
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+finn.custom\_op.fpgadataflow.streamingmaxpool\_hls
+-----------------------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.hls.streamingmaxpool_hls
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+finn.custom\_op.fpgadataflow.thresholding\_hls
+-------------------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.hls.thresholding_hls
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+finn.custom\_op.fpgadataflow.tlastmarker\_hls
+-----------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.hls.tlastmarker_hls
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+finn.custom\_op.fpgadataflow.upsampler\_hls
+---------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.hls.upsampler_hls
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+finn.custom\_op.fpgadataflow.vectorvectoractivation\_hls
+---------------------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.hls.vectorvectoractivation_hls
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst
index fdcf44c6d9..25aafc324e 100644
--- a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst
+++ b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst
@@ -2,71 +2,71 @@
Custom Op - fpgadataflow
************************
-HLS Custom Op Nodes
-===================
+Submodules
+==========
-Base Class
-----------
+.. toctree::
+ :maxdepth: 2
-.. automodule:: finn.custom_op.fpgadataflow.hlscustomop
- :members:
- :undoc-members:
- :show-inheritance:
+ finn.custom_op.fpgadataflow.hls
+ finn.custom_op.fpgadataflow.rtl
-finn.custom\_op.fpgadataflow.addstreams\_batch
------------------------------------------------
-.. automodule:: finn.custom_op.fpgadataflow.addstreams_batch
+HW Custom Op Nodes
+===================
+
+Base Class - HWCustomOp
+------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.hwcustomop
:members:
:undoc-members:
:show-inheritance:
-finn.custom\_op.fpgadataflow.channelwise\_op\_batch
------------------------------------------------------
+HLSBackend
+-----------
-.. automodule:: finn.custom_op.fpgadataflow.channelwise_op_batch
+.. automodule:: finn.custom_op.fpgadataflow.hlsbackend
:members:
:undoc-members:
:show-inheritance:
-finn.custom\_op.fpgadataflow.checksum
---------------------------------------
+RTLBackend
+-----------
-.. automodule:: finn.custom_op.fpgadataflow.checksum
+.. automodule:: finn.custom_op.fpgadataflow.rtlbackend
:members:
:undoc-members:
:show-inheritance:
-finn.custom\_op.fpgadataflow.concat
--------------------------------------
+finn.custom\_op.fpgadataflow.addstreams
+----------------------------------------
-.. automodule:: finn.custom_op.fpgadataflow.concat
+.. automodule:: finn.custom_op.fpgadataflow.addstreams
:members:
:undoc-members:
:show-inheritance:
+finn.custom\_op.fpgadataflow.channelwise\_op
+---------------------------------------------
-finn.custom\_op.fpgadataflow.convolutioninputgenerator
---------------------------------------------------------
-
-.. automodule:: finn.custom_op.fpgadataflow.convolutioninputgenerator
+.. automodule:: finn.custom_op.fpgadataflow.channelwise_op
:members:
:undoc-members:
:show-inheritance:
-finn.custom\_op.fpgadataflow.convolutioninputgenerator1d
--------------------------------------------------------------
+finn.custom\_op.fpgadataflow.concat
+-------------------------------------
-.. automodule:: finn.custom_op.fpgadataflow.convolutioninputgenerator1d
+.. automodule:: finn.custom_op.fpgadataflow.concat
:members:
:undoc-members:
:show-inheritance:
+finn.custom\_op.fpgadataflow.convolutioninputgenerator
+--------------------------------------------------------
-finn.custom\_op.fpgadataflow.convolutioninputgenerator\_rtl
-------------------------------------------------------------
-
-.. automodule:: finn.custom_op.fpgadataflow.convolutioninputgenerator_rtl
+.. automodule:: finn.custom_op.fpgadataflow.convolutioninputgenerator
:members:
:undoc-members:
:show-inheritance:
@@ -79,52 +79,42 @@ finn.custom\_op.fpgadataflow.downsampler
:undoc-members:
:show-inheritance:
-finn.custom\_op.fpgadataflow.duplicatestreams\_batch
--------------------------------------------------------
+finn.custom\_op.fpgadataflow.duplicatestreams
+----------------------------------------------
-.. automodule:: finn.custom_op.fpgadataflow.duplicatestreams_batch
+.. automodule:: finn.custom_op.fpgadataflow.duplicatestreams
:members:
:undoc-members:
:show-inheritance:
+finn.custom\_op.fpgadataflow.fmpadding
+---------------------------------------
-finn.custom\_op.fpgadataflow.eltwise
--------------------------------------
-
-.. automodule:: finn.custom_op.fpgadataflow.eltwise
+.. automodule:: finn.custom_op.fpgadataflow.fmpadding
:members:
:undoc-members:
:show-inheritance:
-
-finn.custom\_op.fpgadataflow.fmpadding\_batch
+finn.custom\_op.fpgadataflow.fmpadding\_pixel
-----------------------------------------------
-.. automodule:: finn.custom_op.fpgadataflow.fmpadding_batch
- :members:
- :undoc-members:
- :show-inheritance:
-
-finn.custom\_op.fpgadataflow.globalaccpool\_batch
----------------------------------------------------
-
-.. automodule:: finn.custom_op.fpgadataflow.globalaccpool_batch
+.. automodule:: finn.custom_op.fpgadataflow.fmpadding_pixel
:members:
:undoc-members:
:show-inheritance:
-finn.custom\_op.fpgadataflow.iodma
-------------------------------------
+finn.custom\_op.fpgadataflow.globalaccpool
+-------------------------------------------
-.. automodule:: finn.custom_op.fpgadataflow.iodma
+.. automodule:: finn.custom_op.fpgadataflow.globalaccpool
:members:
:undoc-members:
:show-inheritance:
-finn.custom\_op.fpgadataflow.labelselect\_batch
------------------------------------------------
+finn.custom\_op.fpgadataflow.labelselect
+-----------------------------------------
-.. automodule:: finn.custom_op.fpgadataflow.labelselect_batch
+.. automodule:: finn.custom_op.fpgadataflow.labelselect
:members:
:undoc-members:
:show-inheritance:
@@ -138,7 +128,7 @@ finn.custom\_op.fpgadataflow.lookup
:show-inheritance:
finn.custom\_op.fpgadataflow.matrixvectoractivation
------------------------------------------------------------
+-----------------------------------------------------
.. automodule:: finn.custom_op.fpgadataflow.matrixvectoractivation
:members:
@@ -146,10 +136,10 @@ finn.custom\_op.fpgadataflow.matrixvectoractivation
:show-inheritance:
-finn.custom\_op.fpgadataflow.pool\_batch
------------------------------------------------
+finn.custom\_op.fpgadataflow.pool
+----------------------------------
-.. automodule:: finn.custom_op.fpgadataflow.pool_batch
+.. automodule:: finn.custom_op.fpgadataflow.pool
:members:
:undoc-members:
:show-inheritance:
@@ -163,51 +153,50 @@ finn.custom\_op.fpgadataflow.streamingdataflowpartition
:show-inheritance:
-finn.custom\_op.fpgadataflow.streamingdatawidthconverter\_batch
-----------------------------------------------------------------------
+finn.custom\_op.fpgadataflow.streamingdatawidthconverter
+---------------------------------------------------------
-.. automodule:: finn.custom_op.fpgadataflow.streamingdatawidthconverter_batch
+.. automodule:: finn.custom_op.fpgadataflow.streamingdatawidthconverter
:members:
:undoc-members:
:show-inheritance:
-finn.custom\_op.fpgadataflow.streamingfifo
--------------------------------------------------
+finn.custom\_op.fpgadataflow.streamingeltwise
+----------------------------------------------
-.. automodule:: finn.custom_op.fpgadataflow.streamingfifo
+.. automodule:: finn.custom_op.fpgadataflow.streamingeltwise
:members:
:undoc-members:
:show-inheritance:
-finn.custom\_op.fpgadataflow.streamingmaxpool\_batch
------------------------------------------------------------
+finn.custom\_op.fpgadataflow.streamingfifo
+-------------------------------------------
-.. automodule:: finn.custom_op.fpgadataflow.streamingmaxpool_batch
+.. automodule:: finn.custom_op.fpgadataflow.streamingfifo
:members:
:undoc-members:
:show-inheritance:
-finn.custom\_op.fpgadataflow.templates
----------------------------------------------
+finn.custom\_op.fpgadataflow.streamingmaxpool
+----------------------------------------------
-.. automodule:: finn.custom_op.fpgadataflow.templates
+.. automodule:: finn.custom_op.fpgadataflow.streamingmaxpool
:members:
:undoc-members:
:show-inheritance:
-finn.custom\_op.fpgadataflow.thresholding\_batch
--------------------------------------------------------
+finn.custom\_op.fpgadataflow.templates
+----------------------------------------
-.. automodule:: finn.custom_op.fpgadataflow.thresholding_batch
+.. automodule:: finn.custom_op.fpgadataflow.templates
:members:
:undoc-members:
:show-inheritance:
+finn.custom\_op.fpgadataflow.thresholding
+------------------------------------------
-finn.custom\_op.fpgadataflow.tlastmarker
------------------------------------------------
-
-.. automodule:: finn.custom_op.fpgadataflow.tlastmarker
+.. automodule:: finn.custom_op.fpgadataflow.thresholding
:members:
:undoc-members:
:show-inheritance:
diff --git a/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst b/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst
new file mode 100644
index 0000000000..346eddb073
--- /dev/null
+++ b/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst
@@ -0,0 +1,62 @@
+*****************************
+Custom Op - fpgadataflow.rtl
+*****************************
+
+RTL Custom Op Nodes
+===================
+
+finn.custom\_op.fpgadataflow.convolutioninputgenerator\_rtl
+------------------------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.rtl.convolutioninputgenerator_rtl
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+finn.custom\_op.fpgadataflow.fmpadding\_rtl
+---------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.rtl.fmpadding_rtl
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+finn.custom\_op.fpgadataflow.matrixvectoractivation\_rtl
+---------------------------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.rtl.matrixvectoractivation_rtl
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+finn.custom\_op.fpgadataflow.streamingdatawidthconverter\_rtl
+---------------------------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.rtl.streamingdatawidthconverter_rtl
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+finn.custom\_op.fpgadataflow.streamingfifo\_rtl
+-------------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.rtl.streamingfifo_rtl
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+finn.custom\_op.fpgadataflow.thresholding\_rtl
+-------------------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.rtl.thresholding_rtl
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+finn.custom\_op.fpgadataflow.vectorvectoractivation\_rtl
+---------------------------------------------------------------
+
+.. automodule:: finn.custom_op.fpgadataflow.rtl.vectorvectoractivation_rtl
+ :members:
+ :undoc-members:
+ :show-inheritance:
diff --git a/docs/finn/source_code/finn.transformation.fpgadataflow.rst b/docs/finn/source_code/finn.transformation.fpgadataflow.rst
index 9f8ec07930..f56b5fcf01 100644
--- a/docs/finn/source_code/finn.transformation.fpgadataflow.rst
+++ b/docs/finn/source_code/finn.transformation.fpgadataflow.rst
@@ -38,10 +38,10 @@ finn.transformation.fpgadataflow.compile\_cppsim
:undoc-members:
:show-inheritance:
-finn.transformation.fpgadataflow.convert\_to\_hls\_layers
-----------------------------------------------------------------
+finn.transformation.fpgadataflow.convert\_to\_hw\_layers
+----------------------------------------------------------
-.. automodule:: finn.transformation.fpgadataflow.convert_to_hls_layers
+.. automodule:: finn.transformation.fpgadataflow.convert_to_hw_layers
:members:
:undoc-members:
:show-inheritance:
@@ -79,22 +79,29 @@ finn.transformation.fpgadataflow.externalize\_params
:show-inheritance:
finn.transformation.fpgadataflow.floorplan
-----------------------------------------------------
+-----------------------------------------------
.. automodule:: finn.transformation.fpgadataflow.floorplan
:members:
:undoc-members:
:show-inheritance:
-
finn.transformation.fpgadataflow.hlssynth\_ip
-----------------------------------------------------
+-----------------------------------------------
.. automodule:: finn.transformation.fpgadataflow.hlssynth_ip
:members:
:undoc-members:
:show-inheritance:
+finn.transformation.fpgadataflow.infer\_pixel\_padding\_deconv
+----------------------------------------------------------------
+
+.. automodule:: finn.transformation.fpgadataflow.infer_pixel_padding_deconv
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
finn.transformation.fpgadataflow.insert\_dwc
---------------------------------------------------
@@ -139,14 +146,6 @@ finn.transformation.fpgadataflow.insert\_tlastmarker
:undoc-members:
:show-inheritance:
-finn.transformation.fpgadataflow.make\_deployment
---------------------------------------------------------
-
-.. automodule:: finn.transformation.fpgadataflow.make_deployment
- :members:
- :undoc-members:
- :show-inheritance:
-
finn.transformation.fpgadataflow.make\_pynq\_driver
----------------------------------------------------------
@@ -173,6 +172,15 @@ finn.transformation.fpgadataflow.minimize\_accumulator\_width
:show-inheritance:
+finn.transformation.fpgadataflow.minimize\_weight\_bit\_width
+--------------------------------------------------------------
+
+.. automodule:: finn.transformation.fpgadataflow.minimize_weight_bit_width
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+
finn.transformation.fpgadataflow.prepare\_cppsim
-------------------------------------------------------
@@ -229,16 +237,24 @@ finn.transformation.fpgadataflow.set\_folding
:undoc-members:
:show-inheritance:
-finn.transformation.fpgadataflow.synth\_ooc
+finn.transformation.fpgadataflow.specialize\_layers
-------------------------------------------------------
+.. automodule:: finn.transformation.fpgadataflow.specialize_layers
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+finn.transformation.fpgadataflow.synth\_ooc
+---------------------------------------------
+
.. automodule:: finn.transformation.fpgadataflow.synth_ooc
:members:
:undoc-members:
:show-inheritance:
finn.transformation.fpgadataflow.template\_driver
--------------------------------------------------
+---------------------------------------------------
.. automodule:: finn.transformation.fpgadataflow.template_driver
:members:
@@ -246,7 +262,7 @@ finn.transformation.fpgadataflow.template\_driver
:show-inheritance:
finn.transformation.fpgadataflow.templates
--------------------------------------------------
+-----------------------------------------------
.. automodule:: finn.transformation.fpgadataflow.templates
:members:
diff --git a/docs/finn/source_code/finn.transformation.rst b/docs/finn/source_code/finn.transformation.rst
index f42b595a50..8dc7e1afc2 100644
--- a/docs/finn/source_code/finn.transformation.rst
+++ b/docs/finn/source_code/finn.transformation.rst
@@ -15,7 +15,7 @@ Submodules
finn.transformation.streamline
Transformation Passes
-=====================
+======================
Base Class
----------
@@ -49,6 +49,14 @@ qonnx.transformation.change\_3d\_tensors\_to\_4d
:undoc-members:
:show-inheritance:
+qonnx.transformation.change\_batchsize
+----------------------------------------
+
+.. automodule:: qonnx.transformation.change_batchsize
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
qonnx.transformation.change\_datalayout
--------------------------------------------
@@ -83,6 +91,14 @@ qonnx.transformation.double\_to\_single\_float
:undoc-members:
:show-inheritance:
+qonnx.transformation.expose\_intermediate
+------------------------------------------
+
+.. automodule:: qonnx.transformation.expose_intermediate
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
qonnx.transformation.extend\_partition
------------------------------------------
@@ -99,9 +115,16 @@ qonnx.transformation.extract\_conv\_bias
:undoc-members:
:show-inheritance:
+qonnx.transformation.extract\_quant\_scale\_zeropt
+----------------------------------------------------
+
+.. automodule:: qonnx.transformation.extract_quant_scale_zeropt
+ :members:
+ :undoc-members:
+ :show-inheritance:
qonnx.transformation.fold\_constants
-------------------------------------------
+--------------------------------------
.. automodule:: qonnx.transformation.fold_constants
:members:
@@ -117,7 +140,7 @@ qonnx.transformation.gemm\_to\_matmul
:show-inheritance:
qonnx.transformation.general
-----------------------------------
+------------------------------
.. automodule:: qonnx.transformation.general
:members:
@@ -165,7 +188,7 @@ qonnx.transformation.lower\_convs\_to\_matmul
:show-inheritance:
qonnx.transformation.make\_input\_chanlast
-------------------------------------------
+---------------------------------------------
.. automodule:: qonnx.transformation.make_input_chanlast
:members:
@@ -180,6 +203,29 @@ qonnx.transformation.merge\_onnx\_models
:undoc-members:
:show-inheritance:
+qonnx.transformation.pruning
+------------------------------
+
+.. automodule:: qonnx.transformation.pruning
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+qonnx.transformation.qcdq\_to\_qonnx
+----------------------------------------
+
+.. automodule:: qonnx.transformation.qcdq_to_qonnx
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+qonnx.transformation.qonnx\_to\_qcdq
+-------------------------------------
+
+.. automodule:: qonnx.transformation.qonnx_to_qcdq
+ :members:
+ :undoc-members:
+ :show-inheritance:
qonnx.transformation.quant\_constant\_folding
----------------------------------------------
@@ -189,6 +235,13 @@ qonnx.transformation.quant\_constant\_folding
:undoc-members:
:show-inheritance:
+qonnx.transformation.quantize\_graph
+-------------------------------------
+
+.. automodule:: qonnx.transformation.quantize_graph
+ :members:
+ :undoc-members:
+ :show-inheritance:
qonnx.transformation.rebalance\_conv
----------------------------------------
@@ -199,13 +252,28 @@ qonnx.transformation.rebalance\_conv
:show-inheritance:
qonnx.transformation.remove
--------------------------------------
+----------------------------
.. automodule:: qonnx.transformation.remove
:members:
:undoc-members:
:show-inheritance:
+qonnx.transformation.resize\_conv\_to\_deconv
+-----------------------------------------------
+
+.. automodule:: qonnx.transformation.resize_conv_to_deconv
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+qonnx.transformation.subpixel\_to\_deconv
+-----------------------------------------------
+
+.. automodule:: qonnx.transformation.subpixel_to_deconv
+ :members:
+ :undoc-members:
+ :show-inheritance:
finn.transformation.move\_reshape
----------------------------------------
diff --git a/docs/finn/source_code/finn.util.rst b/docs/finn/source_code/finn.util.rst
index 7ba3b252ab..2ec1502441 100644
--- a/docs/finn/source_code/finn.util.rst
+++ b/docs/finn/source_code/finn.util.rst
@@ -31,8 +31,16 @@ qonnx.util.config
:undoc-members:
:show-inheritance:
+qonnx.util.convert
+--------------------
+
+.. automodule:: qonnx.util.convert
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
qonnx.util.exec\_qonnx
-----------------------
+------------------------
.. automodule:: qonnx.util.exec_qonnx
:members:
@@ -55,6 +63,37 @@ qonnx.util.onnx
:undoc-members:
:show-inheritance:
+qonnx.util.prune\_channels
+---------------------------
+
+.. automodule:: qonnx.util.prune_channels
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+qonnx.util.random\_reseed
+--------------------------
+
+.. automodule:: qonnx.util.random_reseed
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+qonnx.util.range\_analysis
+---------------------------
+
+.. automodule:: qonnx.util.range_analysis
+ :members:
+ :undoc-members:
+ :show-inheritance:
+
+qonnx.util.test
+--------------------
+
+.. automodule:: qonnx.util.test
+ :members:
+ :undoc-members:
+ :show-inheritance:
qonnx.util.to\_channels\_last
------------------------------
@@ -81,8 +120,6 @@ finn.util.create
:undoc-members:
:show-inheritance:
-
-
finn.util.data\_packing
------------------------------
@@ -99,14 +136,6 @@ finn.util.fpgadataflow
:undoc-members:
:show-inheritance:
-finn.util.gdrive
------------------------------
-
-.. automodule:: finn.util.gdrive
- :members:
- :undoc-members:
- :show-inheritance:
-
finn.util.hls
---------------
diff --git a/docs/finn/tutorials.rst b/docs/finn/tutorials.rst
index 7ac54501cf..39d25c2634 100644
--- a/docs/finn/tutorials.rst
+++ b/docs/finn/tutorials.rst
@@ -16,7 +16,7 @@ The notebooks in this folder should give a basic insight into FINN, how to get s
* This notebook can help you to learn how to create and manipulate a simple ONNX model, also by using FINN
-* 1_brevitas_network_import
+* 1_brevitas_network_import_via_QONNX
* This notebook shows how to import a Brevitas network and prepare it for the FINN flow.
@@ -47,6 +47,15 @@ The notebooks in this folder are more developer oriented. They should help you t
* Explains the basics of FINN custom ops and how to define a new one.
+* 3_folding
+
+ * Describes the use of FINN parallelization parameters (PE & SIMD), also called folding factors, to efficiently optimize models so as to extract the maximum performance out of them.
+
+* 4_advanced_builder_settings
+
+ * Provides a more detailed look into the FINN builder tool and explores different options to customize your FINN design.
+
+
FINN Example FPGA Flow Using MNIST Numerals
============================================
diff --git a/docs/finn/verification.rst b/docs/finn/verification.rst
index e1a9ac4b31..578c941c36 100644
--- a/docs/finn/verification.rst
+++ b/docs/finn/verification.rst
@@ -4,18 +4,18 @@
Functional Verification
***********************
-.. image:: ../../notebooks/end2end_example/bnn-pynq/verification.png
- :scale: 70%
+.. image:: ../../notebooks/end2end_example/bnn-pynq/verification.svg
+ :scale: 40%
:align: center
This part of the flow is covered by the Jupyter notebook about the verification of a simple fully-connected network, which you can find in the `end2end notebook folder `_.
-When the network is transformed it is important to verify the functionality to make sure the transformation did not change the behaviour of the model. There are multiple ways of verification that can be applied in different stages of the network inside FINN. All can be accessed using the execution function in module :py:mod:`finn.core.onnx_exec`. The execution happens in most cases node by node, which supports networks that have a mixture of standard ONNX nodes, custom nodes and HLS custom nodes. A single node can be executed using one or more of the following methods:
+When the network is transformed it is important to verify the functionality to make sure the transformation did not change the behaviour of the model. There are multiple ways of verification that can be applied in different stages of the network inside FINN. All can be accessed using the execution function in module :py:mod:`finn.core.onnx_exec`. The execution happens in most cases node by node, which supports networks that have a mixture of standard ONNX nodes, custom nodes and HLS/RTL custom nodes. A single node can be executed using one or more of the following methods:
Simulation using Python
=======================
-This simulation can be used right after the :ref:`brevitas_export` or when the network does not contain any HLS custom nodes, so right after the streamlining transformations and before the nodes are converted into HLS layers.
+This simulation can be used right after the :ref:`brevitas_export` or when the network does not contain any HLS/RTL custom nodes yet, so right after the streamlining transformations and before the nodes are specialized into HLS/RTL layers.
Simulation using C++
====================
@@ -26,7 +26,7 @@ This simulation can be used for a model containing several HLS custom operations
Emulation using PyVerilator
===========================
-The emulation using PyVerilator can be used when IP blocks were generated, either node by node or of a whole (IP-stitched) design. For that purpose PyVerilator gets the generated verilog files.
+The emulation using PyVerilator can be used when IP blocks/RTL modules were generated, either node by node or of a whole (IP-stitched) design. For that purpose PyVerilator gets the generated verilog files.
For debugging purposes, it's possible to generate .vcd trace files that show the value of external & internal signals as the emuation is running. To enable this:
- for node-by-node rtlsim, set the `rtlsim_trace` attribute of each node of interest to either a file name for the vcd or `default` to use the node name as the filename.
diff --git a/docs/img/finn-examples-header.png b/docs/img/finn-examples-header.png
deleted file mode 100644
index 50f8fa7761..0000000000
Binary files a/docs/img/finn-examples-header.png and /dev/null differ
diff --git a/docs/img/imagenet.jpg b/docs/img/imagenet.jpg
deleted file mode 100644
index 5cdd5aa303..0000000000
Binary files a/docs/img/imagenet.jpg and /dev/null differ
diff --git a/docs/requirements.txt b/docs/requirements.txt
new file mode 100644
index 0000000000..3a3730d2b9
--- /dev/null
+++ b/docs/requirements.txt
@@ -0,0 +1,16 @@
+brevitas@git+https://github.com/Xilinx/brevitas@master#egg=brevitas_examples
+dataclasses-json==0.5.7
+docutils==0.19
+gspread==3.6.0
+importlib_resources
+IPython
+matplotlib
+netron
+pytest
+pyverilator@git+https://github.com/maltanar/pyverilator@master#egg=pyverilator
+qonnx@git+https://github.com/fastmachinelearning/qonnx@main#egg=qonnx
+sphinx_rtd_theme==2.0.0
+torch
+torchvision
+tqdm
+vcdvcd
diff --git a/fetch-repos.sh b/fetch-repos.sh
index 5b060f5bc8..073c052d67 100755
--- a/fetch-repos.sh
+++ b/fetch-repos.sh
@@ -27,17 +27,18 @@
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-QONNX_COMMIT="dd35a8ff49d7225a07ffceeebe25a6361df48349"
-FINN_EXP_COMMIT="9cbd2787b5160e2b44e0e8164a0df1457dbd5366"
-BREVITAS_COMMIT="a5b71d6de1389d3e7db898fef72e014842670f03"
-PYVERILATOR_COMMIT="766e457465f5c0dd315490d7b9cc5d74f9a76f4f"
+QONNX_COMMIT="fd61cfeebbdaba351abf7e9d54cd785d7776fa4f"
+FINN_EXP_COMMIT="de99347e936d51715f5356a1b6c64e37b91c23c2"
+BREVITAS_COMMIT="84f42259ec869eb151af4cb8a8b23ad925f493db"
+PYVERILATOR_COMMIT="ce0a08c20cb8c1d1e84181d6f392390f846adbd1"
CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4"
-HLSLIB_COMMIT="d27f6b6c5d8f1bb208db395659389603f63ad4be"
-OMX_COMMIT="d1065a788219ca0eb54d5e57600b1f9d7f67d4cc"
+HLSLIB_COMMIT="16e5847a5e3ef76cffe84c8fad2f010d593457d3"
+OMX_COMMIT="0b59762f9e4c4f7e5aa535ee9bc29f292434ca7a"
AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b"
XIL_BDF_COMMIT="8cf4bb674a919ac34e3d99d8d71a9e60af93d14e"
+RFSOC4x2_BDF_COMMIT="13fb6f6c02c7dfd7e4b336b18b959ad5115db696"
KV260_BDF_COMMIT="98e0d3efc901f0b974006bc4370c2a7ad8856c79"
-EXP_BOARD_FILES_MD5="30eecc497c31050bd46d10ea20eba232"
+EXP_BOARD_FILES_MD5="226ca927a16ea4ce579f1332675e9e9a"
QONNX_URL="https://github.com/fastmachinelearning/qonnx.git"
FINN_EXP_URL="https://github.com/Xilinx/finn-experimental.git"
@@ -48,6 +49,7 @@ HLSLIB_URL="https://github.com/Xilinx/finn-hlslib.git"
OMX_URL="https://github.com/maltanar/oh-my-xilinx.git"
AVNET_BDF_URL="https://github.com/Avnet/bdf.git"
XIL_BDF_URL="https://github.com/Xilinx/XilinxBoardStore.git"
+RFSOC4x2_BDF_URL="https://github.com/RealDigitalOrg/RFSoC4x2-BSP.git"
KV260_BDF_URL="https://github.com/Xilinx/XilinxBoardStore.git"
QONNX_DIR="qonnx"
@@ -59,6 +61,7 @@ HLSLIB_DIR="finn-hlslib"
OMX_DIR="oh-my-xilinx"
AVNET_BDF_DIR="avnet-bdf"
XIL_BDF_DIR="xil-bdf"
+RFSOC4x2_BDF_DIR="rfsoc4x2-bdf"
KV260_SOM_BDF_DIR="kv260-som-bdf"
# absolute path to this script, e.g. /home/user/bin/foo.sh
@@ -107,6 +110,7 @@ fetch_board_files() {
unzip -q pynq-z2.zip
cp -r $SCRIPTPATH/deps/$AVNET_BDF_DIR/* $SCRIPTPATH/deps/board_files/
cp -r $SCRIPTPATH/deps/$XIL_BDF_DIR/boards/Xilinx/rfsoc2x2 $SCRIPTPATH/deps/board_files/;
+ cp -r $SCRIPTPATH/deps/$RFSOC4x2_BDF_DIR/board_files/rfsoc4x2 $SCRIPTPATH/deps/board_files/;
cp -r $SCRIPTPATH/deps/$KV260_SOM_BDF_DIR/boards/Xilinx/kv260_som $SCRIPTPATH/deps/board_files/;
cd $OLD_PWD
}
@@ -120,19 +124,26 @@ fetch_repo $HLSLIB_URL $HLSLIB_COMMIT $HLSLIB_DIR
fetch_repo $OMX_URL $OMX_COMMIT $OMX_DIR
fetch_repo $AVNET_BDF_URL $AVNET_BDF_COMMIT $AVNET_BDF_DIR
fetch_repo $XIL_BDF_URL $XIL_BDF_COMMIT $XIL_BDF_DIR
+fetch_repo $RFSOC4x2_BDF_URL $RFSOC4x2_BDF_COMMIT $RFSOC4x2_BDF_DIR
fetch_repo $KV260_BDF_URL $KV260_BDF_COMMIT $KV260_SOM_BDF_DIR
-# download extra Pynq board files and extract if needed
-if [ ! -d "$SCRIPTPATH/deps/board_files" ]; then
- fetch_board_files
+# Can skip downloading of board files entirely if desired
+if [ "$FINN_SKIP_BOARD_FILES" = "1" ]; then
+ echo "Skipping download and verification of board files"
else
- cd $SCRIPTPATH
- BOARD_FILES_MD5=$(find deps/board_files/ -type f -exec md5sum {} \; | sort -k 2 | md5sum | cut -d' ' -f 1)
- if [ "$BOARD_FILES_MD5" = "$EXP_BOARD_FILES_MD5" ]; then
- echo "Verified board files folder content md5: $BOARD_FILES_MD5"
- else
- echo "Board files folder content mismatch, removing and re-downloading"
- rm -rf deps/board_files/
+ # download extra board files and extract if needed
+ if [ ! -d "$SCRIPTPATH/deps/board_files" ]; then
fetch_board_files
+ else
+ cd $SCRIPTPATH
+ BOARD_FILES_MD5=$(find deps/board_files/ -type f -exec md5sum {} \; | sort -k 2 | md5sum | cut -d' ' -f 1)
+ if [ "$BOARD_FILES_MD5" = "$EXP_BOARD_FILES_MD5" ]; then
+ echo "Verified board files folder content md5: $BOARD_FILES_MD5"
+ else
+ echo "Board files folder md5: expected $EXP_BOARD_FILES_MD5 found $BOARD_FILES_MD5"
+ echo "Board files folder content mismatch, removing and re-downloading"
+ rm -rf deps/board_files/
+ fetch_board_files
+ fi
fi
fi
diff --git a/finn-rtllib/axi_info/component.xml b/finn-rtllib/axi_info/component.xml
index d22637534f..c7632e2915 100644
--- a/finn-rtllib/axi_info/component.xml
+++ b/finn-rtllib/axi_info/component.xml
@@ -197,6 +197,10 @@
ASSOCIATED_BUSIFs_axi
+
+ FREQ_TOLERANCE_HZ
+ -1
+
@@ -228,7 +232,7 @@
viewChecksum
- 7d682dfc
+ c9da9874
@@ -244,7 +248,7 @@
viewChecksum
- 7d682dfc
+ c9da9874
@@ -258,7 +262,7 @@
viewChecksum
- e11f9727
+ 1e654f67
@@ -607,7 +611,7 @@
hdl/axi_info_top.svsystemVerilogSource
- CHECKSUM_ec9ff0da
+ CHECKSUM_db6ccc10
@@ -692,17 +696,22 @@
axi_info_top_v1_0package_project
- 5
- 2022-05-30T14:16:13Z
+ 6
+ 2023-05-24T06:36:33Z
- 2022.1
-
+ 2022.2
+
-
+
+
+
+
+
+
diff --git a/finn-rtllib/axi_info/hdl/axi_info_top.sv b/finn-rtllib/axi_info/hdl/axi_info_top.sv
index ab2cfc8bed..74aebe3ec7 100644
--- a/finn-rtllib/axi_info/hdl/axi_info_top.sv
+++ b/finn-rtllib/axi_info/hdl/axi_info_top.sv
@@ -38,7 +38,10 @@ module axi_info_top #(
bit [31:0] CHECKSUM_COUNT
)(
//- Global Control ------------------
+ (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axi, ASSOCIATED_RESET ap_rst_n" *)
+ (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
input logic ap_clk,
+ (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
input logic ap_rst_n,
//- AXI Lite ------------------------
diff --git a/finn-rtllib/dwc/hdl/dwc.sv b/finn-rtllib/dwc/hdl/dwc.sv
new file mode 100644
index 0000000000..13b0cb34c4
--- /dev/null
+++ b/finn-rtllib/dwc/hdl/dwc.sv
@@ -0,0 +1,158 @@
+/******************************************************************************
+ * Copyright (C) 2023, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief Stream Data Width Converter.
+ * @author Thomas B. Preußer
+ *****************************************************************************/
+module dwc #(
+ int unsigned IBITS,
+ int unsigned OBITS
+)(
+ //- Global Control ------------------
+ input logic clk,
+ input logic rst,
+
+ //- AXI Stream - Input --------------
+ output logic irdy,
+ input logic ivld,
+ input logic [IBITS-1:0] idat,
+
+ //- AXI Stream - Output -------------
+ input logic ordy,
+ output logic ovld,
+ output logic [OBITS-1:0] odat
+);
+
+ if(IBITS == OBITS) begin : genNoop
+ assign irdy = ordy;
+ assign ovld = ivld;
+ assign odat = idat;
+ end : genNoop
+ else if(IBITS < OBITS) begin : genUp
+
+ // Sanity Checking: integer upscaling
+ initial begin
+ if(OBITS % IBITS) begin
+ $error("Output width %0d is not a multiple of input width %0d.", OBITS, IBITS);
+ $finish;
+ end
+ end
+
+ // Parallelizing Shift Register A and Sidestep Buffer B on Input Path
+ localparam int unsigned K = OBITS / IBITS;
+ typedef logic [IBITS-1:0] dat_t;
+ dat_t [K-1:0] ADat = 'x;
+ logic [$clog2(K):0] ACnt = K-1; // (empty) K-1, ..., 0, -1 (full/valid)
+ dat_t BDat = 'x;
+ logic BRdy = 1;
+ always_ff @(posedge clk) begin
+ if(rst) begin
+ ADat <= 'x;
+ ACnt <= K-1;
+ BDat <= 'x;
+ BRdy <= 1;
+ end
+ else begin
+ automatic type(ACnt) acnt = (ovld && ordy)? K-1 : ACnt;
+ automatic logic rdy = !ovld || ordy;
+ if((ivld || !BRdy) && rdy) begin
+ ADat <= { BRdy? idat : BDat, ADat[K-1:1] };
+ acnt--;
+ end
+ ACnt <= acnt;
+
+ if(BRdy) BDat <= idat;
+ BRdy <= rdy || (BRdy && !ivld);
+ end
+ end
+
+ // Output Assignments
+ assign irdy = BRdy;
+ assign ovld = ACnt[$left(ACnt)];
+ assign odat = ADat;
+
+ end : genUp
+ else begin : genDown
+
+ // Sanity Checking: integer downscaling
+ initial begin
+ if(IBITS % OBITS) begin
+ $error("Input width %0d is not a multiple of output width %0d.", IBITS, OBITS);
+ $finish;
+ end
+ end
+
+ // Serializing Shift Register A and Sidestep Buffer B on Output Path
+ localparam int unsigned K = IBITS / OBITS;
+ typedef logic [OBITS-1:0] dat_t;
+ dat_t [ K-1:0] ADat = 'x;
+ logic [$clog2(K):0] ACnt = 1; // (full) -K+1, ..., -1, 0, 1 (empty/not valid)
+ dat_t BDat = 'x;
+ logic BRdy = 1;
+ dat_t CDat = 'x;
+ logic CVld = 0;
+ always_ff @(posedge clk) begin // State update of the serializer: shift register A, sidestep buffer B, output register C
+   if(rst) begin
+     ADat <= 'x;
+     ACnt <= 1; // 1 is the empty / not-valid marker of the counter
+     BDat <= 'x;
+     BRdy <= 1;
+     CDat <= 'x;
+     CVld <= 0; // nothing is offered on the output after reset
+   end
+   else begin
+     automatic type(ACnt) acnt = ACnt;
+     automatic logic ainc = 0;
+     if(irdy) begin // input side is ready: (re)load A from idat
+       ADat <= idat;
+       acnt = ivld? -K+1 : 1; // -K+1 marks A as full, 1 marks it as empty
+     end
+     else if(BRdy) begin // otherwise shift A down by one OBITS-wide element
+       ADat <= { {OBITS{1'bx}}, ADat[K-1:1] };
+       ainc = BRdy;
+     end
+     ACnt <= acnt + ainc;
+
+     if(BRdy) BDat <= ADat[0]; // B follows the head element of A while BRdy is set
+     BRdy <= !CVld || ordy || (BRdy && !ACnt[$left(ACnt)] && ACnt[0]);
+
+     if(!CVld || ordy) CDat <= BRdy? ADat[0] : BDat; // C loads ADat[0] when BRdy is set, BDat otherwise
+     CVld <= (CVld && !ordy) || !BRdy || ACnt[$left(ACnt)] || !ACnt[0];
+   end
+ end
+
+ // Output Assignments
+ assign irdy = BRdy && !ACnt[$left(ACnt)];
+ assign ovld = CVld;
+ assign odat = CDat;
+
+ end : genDown
+
+endmodule : dwc
diff --git a/finn-rtllib/dwc/hdl/dwc_axi.sv b/finn-rtllib/dwc/hdl/dwc_axi.sv
new file mode 100644
index 0000000000..dfe02fcb48
--- /dev/null
+++ b/finn-rtllib/dwc/hdl/dwc_axi.sv
@@ -0,0 +1,65 @@
+/******************************************************************************
+ * Copyright (C) 2023, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief AXI Stream Adapter for Data Width Converter.
+ * @author Thomas B. Preußer
+ *****************************************************************************/
+module dwc_axi #( // AXI-Stream wrapper that instantiates the dwc core behind byte-padded data ports
+ int unsigned IBITS, // input width handed to the core; low bits of s_axis_tdata
+ int unsigned OBITS, // output width handed to the core; low bits of m_axis_tdata
+ // Port widths below are IBITS/OBITS rounded up to the next multiple of 8
+ localparam int unsigned AXI_IBITS = (IBITS+7)/8 * 8,
+ localparam int unsigned AXI_OBITS = (OBITS+7)/8 * 8
+)(
+ //- Global Control ------------------
+ input logic ap_clk,
+ input logic ap_rst_n, // inverted below to drive the core's rst input
+
+ //- AXI Stream - Input --------------
+ output logic s_axis_tready,
+ input logic s_axis_tvalid,
+ input logic [AXI_IBITS-1:0] s_axis_tdata,
+
+ //- AXI Stream - Output -------------
+ input logic m_axis_tready,
+ output logic m_axis_tvalid,
+ output logic [AXI_OBITS-1:0] m_axis_tdata
+);
+ // Core converter: only the low IBITS / OBITS bits of the data ports are connected
+ dwc #(.IBITS(IBITS), .OBITS(OBITS)) core (
+ .clk(ap_clk), .rst(!ap_rst_n),
+ .irdy(s_axis_tready), .ivld(s_axis_tvalid), .idat(s_axis_tdata[IBITS-1:0]),
+ .ordy(m_axis_tready), .ovld(m_axis_tvalid), .odat(m_axis_tdata[OBITS-1:0])
+ );
+ if(OBITS < AXI_OBITS) begin // padding bits exist above the core output: tie them to zero
+ assign m_axis_tdata[AXI_OBITS-1:OBITS] = '0;
+ end
+
+endmodule : dwc_axi
diff --git a/finn-rtllib/dwc/hdl/dwc_template.v b/finn-rtllib/dwc/hdl/dwc_template.v
new file mode 100644
index 0000000000..01a0254040
--- /dev/null
+++ b/finn-rtllib/dwc/hdl/dwc_template.v
@@ -0,0 +1,71 @@
+/******************************************************************************
+ * Copyright (C) 2023, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+module $TOP_MODULE_NAME$ #(
+ parameter IBITS = $IBITS$,
+ parameter OBITS = $OBITS$,
+
+ parameter AXI_IBITS = (IBITS+7)/8 * 8,
+ parameter AXI_OBITS = (OBITS+7)/8 * 8
+)(
+ //- Global Control ------------------
+ (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
+ (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *)
+ input ap_clk,
+ (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
+ input ap_rst_n,
+
+ //- AXI Stream - Input --------------
+ output in0_V_TREADY,
+ input in0_V_TVALID,
+ input [AXI_IBITS-1:0] in0_V_TDATA,
+
+ //- AXI Stream - Output -------------
+ input out_V_TREADY,
+ output out_V_TVALID,
+ output [AXI_OBITS-1:0] out_V_TDATA
+);
+
+ dwc_axi #(
+ .IBITS(IBITS),
+ .OBITS(OBITS)
+ ) impl (
+ .ap_clk(ap_clk),
+ .ap_rst_n(ap_rst_n),
+ .s_axis_tready(in0_V_TREADY),
+ .s_axis_tvalid(in0_V_TVALID),
+ .s_axis_tdata(in0_V_TDATA),
+ .m_axis_tready(out_V_TREADY),
+ .m_axis_tvalid(out_V_TVALID),
+ .m_axis_tdata(out_V_TDATA)
+ );
+
+endmodule
diff --git a/finn-rtllib/dwc/sim/dwc_axi_tb.sv b/finn-rtllib/dwc/sim/dwc_axi_tb.sv
new file mode 100644
index 0000000000..64435c1900
--- /dev/null
+++ b/finn-rtllib/dwc/sim/dwc_axi_tb.sv
@@ -0,0 +1,195 @@
+/******************************************************************************
+ * Copyright (C) 2023, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief Testbench for AXI Stream Data Width Converter.
+ * @author Thomas B. Preußer
+ *****************************************************************************/
+module dwc_axi_tb;
+
+ localparam int unsigned DBITS = 8;
+ localparam int unsigned K = 3;
+ typedef logic [DBITS-1:0] dat_t;
+
+ // Global Control
+ logic clk = 0;
+ always #5ns clk = !clk;
+ logic rst = 1;
+ initial begin
+ repeat(8) @(posedge clk);
+ rst <= 0;
+ end
+
+ if(1) begin : blkUp
+ localparam int unsigned IBITS = DBITS;
+ localparam int unsigned OBITS = K * DBITS;
+
+ //- AXI Stream - Input --------------
+ uwire s_axis_tready;
+ logic s_axis_tvalid;
+ dat_t s_axis_tdata;
+
+ //- AXI Stream - Output -------------
+ logic m_axis_tready;
+ uwire m_axis_tvalid;
+ dat_t [K-1:0] m_axis_tdata;
+
+ dwc_axi #(.IBITS(IBITS), .OBITS(OBITS)) dut (
+ .ap_clk(clk), .ap_rst_n(!rst),
+ .s_axis_tready, .s_axis_tvalid, .s_axis_tdata,
+ .m_axis_tready, .m_axis_tvalid, .m_axis_tdata
+ );
+
+ // Stimulus: Feed
+ dat_t Q[$];
+ initial begin
+ s_axis_tvalid = 0;
+ s_axis_tdata = 'x;
+ @(posedge clk iff !rst);
+
+ repeat(57600) begin
+ automatic type(s_axis_tdata) dat;
+ std::randomize(dat);
+
+ while($urandom()%7 < 2) @(posedge clk);
+
+ s_axis_tvalid <= 1;
+ s_axis_tdata <= dat;
+ @(posedge clk iff s_axis_tready);
+ Q.push_back(dat);
+
+ s_axis_tvalid <= 0;
+ s_axis_tdata <= 'x;
+ end
+
+ repeat(16) @(posedge clk);
+ $finish;
+ end
+
+ // Output Sink
+ initial begin
+ m_axis_tready = 0;
+ @(posedge clk iff !rst);
+
+ forever begin
+ automatic dat_t [K-1:0] dat;
+
+ while($urandom()%9 < 1) @(posedge clk);
+
+ m_axis_tready <= 1;
+ @(posedge clk iff m_axis_tvalid);
+ assert(Q.size >= K) else begin
+ $error("Spurious output.");
+ $stop;
+ end
+ for(int unsigned i = 0; i < K; i++) dat[i] = Q.pop_front();
+ assert(m_axis_tdata == dat) else begin
+ $error("Output mismatch.");
+ $stop;
+ end
+
+ m_axis_tready <= 0;
+ end
+ end
+ end : blkUp
+
+ if(1) begin : blkDown
+ localparam int unsigned IBITS = K * DBITS;
+ localparam int unsigned OBITS = DBITS;
+
+ //- AXI Stream - Input --------------
+ uwire s_axis_tready;
+ logic s_axis_tvalid;
+ dat_t [K-1:0] s_axis_tdata;
+
+ //- AXI Stream - Output -------------
+ logic m_axis_tready;
+ uwire m_axis_tvalid;
+ dat_t m_axis_tdata;
+
+ dwc_axi #(.IBITS(IBITS), .OBITS(OBITS)) dut (
+ .ap_clk(clk), .ap_rst_n(!rst),
+ .s_axis_tready, .s_axis_tvalid, .s_axis_tdata,
+ .m_axis_tready, .m_axis_tvalid, .m_axis_tdata
+ );
+
+ // Stimulus: Feed
+ dat_t Q[$];
+ initial begin
+ s_axis_tvalid = 0;
+ s_axis_tdata = 'x;
+ @(posedge clk iff !rst);
+
+ repeat(57600) begin
+ automatic dat_t [K-1:0] dat;
+ std::randomize(dat);
+
+ while($urandom()%7 < 2) @(posedge clk);
+
+ s_axis_tvalid <= 1;
+ s_axis_tdata <= dat;
+ @(posedge clk iff s_axis_tready);
+ for(int unsigned i = 0; i < K; i++) Q.push_back(dat[i]);
+
+ s_axis_tvalid <= 0;
+ s_axis_tdata <= 'x;
+ end
+
+ repeat(16) @(posedge clk);
+ $finish;
+ end
+
+ // Output Sink
+ initial begin
+ m_axis_tready = 0;
+ @(posedge clk iff !rst);
+
+ forever begin
+ automatic dat_t dat;
+
+ while($urandom()%9 < 1) @(posedge clk);
+
+ m_axis_tready <= 1;
+ @(posedge clk iff m_axis_tvalid);
+ assert(Q.size) else begin
+ $error("Spurious output.");
+ $stop;
+ end
+ dat = Q.pop_front();
+ assert(m_axis_tdata == dat) else begin
+ $error("Output mismatch: 0x%0x instead of 0x%0x", m_axis_tdata, dat);
+ $stop;
+ end
+
+ m_axis_tready <= 0;
+ end
+ end
+ end : blkDown
+
+endmodule : dwc_axi_tb
diff --git a/finn-rtllib/fifo/hdl/Q_srl.v b/finn-rtllib/fifo/hdl/Q_srl.v
new file mode 100644
index 0000000000..11cef604e0
--- /dev/null
+++ b/finn-rtllib/fifo/hdl/Q_srl.v
@@ -0,0 +1,308 @@
+// original source:
+// https://github.com/nachiket/tdfc/blob/master/verilog/queues/Q_srl_oreg3_prefull_SIMPLE.v
+
+
+// Copyright (c) 1999 The Regents of the University of California
+// Copyright (c) 2010 The Regents of the University of Pennsylvania
+// Copyright (c) 2011 Department of Electrical and Electronic Engineering, Imperial College London
+// Copyright (c) 2020 Xilinx
+//
+// Permission to use, copy, modify, and distribute this software and
+// its documentation for any purpose, without fee, and without a
+// written agreement is hereby granted, provided that the above copyright
+// notice and this paragraph and the following two paragraphs appear in
+// all copies.
+//
+// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR
+// DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING
+// LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION,
+// EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF
+// SUCH DAMAGE.
+//
+// THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES,
+// INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+// AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON
+// AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO
+// PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS.
+//
+
+// Q_srl_oreg3_prefull_SIMPLE.v
+//
+// - In-page queue with parameterizable depth, bit width
+// - Stream I/O is triple (data, valid, back-pressure),
+// with EOS concatenated into the data
+// - Flow control for input & output is combinationally decoupled
+// - 2 <= depth <= 256
+// * (depth >= 2) is required to decouple I/O flow control,
+// where empty => no produce, full => no consume,
+// and depth 1 would ping-pong between the two at half rate
+// * (depth <= 256) can be modified
+// by changing ''synthesis loop_limit X'' below
+// and changing ''addrwidth'' or its log computation
+// - 1 <= width
+// - Queue storage is in SRL16E, up to depth 16 per LUT per bit-slice,
+// plus output register (for fast output)
+// - Queue addressing is done by ''addr'' up-down counter
+// - Queue fullness is checked by comparator (addr==depth-2)
+// - Queue fullness is pre-computed for next cycle
+// - Queue input back-pressure is pre-computed for next cycle
+// - Queue output valid (state!=state_empty) is pre-computed for next cycle
+// (necessary since SRL data output reg requires non-boolean state)
+// - FSM has 3 states (empty, one, more)
+// - When empty, continue to emit most recently emitted value (for debugging)
+//
+// - Queue slots used = / (state==state_empty) ? 0
+// | (state==state_one) ? 1
+// \ (state==state_more) ? addr+2
+// - Queue slots used <= depth
+// - Queue slots remaining = depth - used
+// = / (state==state_empty) ? depth
+// | (state==state_one) ? depth-1
+// \ (state==state_more) ? depth-2-addr
+//
+// - Synplify 7.1 / 8.0
+// - Eylon Caspi, 9/11/03, 8/18/04, 3/29/05
+
+
+`ifdef Q_srl
+`else
+`define Q_srl
+
+
+module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count, maxcount);
+
+ parameter depth = 16; // - greatest #items in queue (2 <= depth <= 256)
+ parameter width = 16; // - width of data (i_d, o_d)
+
+ parameter addrwidth = $clog2(depth);
+
+ input clock;
+ input reset;
+
+ input [width-1:0] i_d; // - input stream data (concat data + eos)
+ input i_v; // - input stream valid
+ output i_r; // - input stream ready
+ wire i_b; // - input stream back-pressure
+
+ output [width-1:0] o_d; // - output stream data (concat data + eos)
+ output o_v; // - output stream valid
+ input o_r; // - output stream ready
+ wire o_b; // - output stream back-pressure
+
+ output [addrwidth:0] count; // - output number of elems in queue
+ output [addrwidth:0] maxcount; // - maximum observed count since reset
+
+ reg [addrwidth:0] maxcount_reg; // - maximum count seen until now
+ reg [addrwidth-1:0] addr, addr_, a_; // - SRL16 address
+ // for data output
+ reg shift_en_; // - SRL16 shift enable
+ reg [width-1:0] srl [depth-2:0]; // - SRL16 memory
+ reg shift_en_o_; // - SRLO shift enable
+ reg [width-1:0] srlo_, srlo // - SRLO output reg
+ /* synthesis syn_allow_retiming=0 */ ;
+
+ parameter state_empty = 2'd0; // - state empty : o_v=0 o_d=UNDEFINED
+ parameter state_one = 2'd1; // - state one : o_v=1 o_d=srlo
+ parameter state_more = 2'd2; // - state more : o_v=1 o_d=srlo
+ // #items in srl = addr+2
+
+ reg [1:0] state, state_; // - state register
+
+ wire addr_full_; // - true iff state_more and addr==depth-2 on NEXT cycle
+ reg addr_full; // - registered copy of addr_full_
+ wire addr_zero_; // - true iff addr==0
+ wire o_v_reg_; // - true iff NOT state_empty on NEXT cycle
+ reg o_v_reg // - true iff NOT state_empty (drives o_v)
+ /* synthesis syn_allow_retiming=0 */ ;
+ wire i_b_reg_; // - true iff full on NEXT cycle (same as addr_full_)
+ reg i_b_reg // - true iff full (drives i_b)
+ /* synthesis syn_allow_retiming=0 */ ;
+
+ assign addr_full_ = (state_==state_more) && (addr_==depth-2);
+ // - queue full
+ assign addr_zero_ = (addr==0); // - queue contains 2 (or 1,0)
+ assign o_v_reg_ = (state_!=state_empty); // - output valid if non-empty
+ assign i_b_reg_ = addr_full_; // - input bp if full
+ assign o_d = srlo; // - output data from queue
+ assign o_v = o_v_reg; // - output valid if non-empty
+ assign i_b = i_b_reg; // - input bp if full
+ assign maxcount = maxcount_reg;
+
+ assign i_r = !i_b;
+ assign o_b = !o_r;
+
+ assign count = (state==state_more ? addr+2 : (state==state_one ? 1 : 0));
+
+ // - ''always'' block with both FFs and SRL16 does not work,
+ // since FFs need reset but SRL16 does not
+
+ always @(posedge clock) begin // - seq always: FFs
+ if (reset) begin
+ state <= state_empty;
+ addr <= 0;
+ addr_full <= 0;
+ o_v_reg <= 0;
+
+ i_b_reg <= 0;
+ maxcount_reg <= 0;
+
+ end
+ else begin
+ state <= state_;
+ addr <= addr_;
+ addr_full <= addr_full_;
+ o_v_reg <= o_v_reg_;
+ i_b_reg <= i_b_reg_;
+ maxcount_reg <= (count > maxcount_reg ? count : maxcount_reg);
+ end
+ end // always @ (posedge clock)
+
+ always @(posedge clock) begin // - seq always: srlo
+ // - infer enabled output reg at end of shift chain
+ // - input first element from i_d, all subsequent elements from SRL16
+ if (reset) begin
+ srlo <= 0;
+ end
+ else begin
+ if (shift_en_o_) begin
+ srlo <= srlo_;
+ end
+ end
+ end // always @ (posedge clock)
+
+ always @(posedge clock) begin // - seq always: srl
+    // - infer enabled SRL16E from shifting srl array
+    // - no reset capability; srl[] contents undefined on reset
+    if (shift_en_) begin
+       // synthesis loop_limit 256
+       for (a_=depth-2; a_>0; a_=a_-1) begin
+          srl[a_] <= srl[a_-1]; // - nonblocking like srl[0] below: each stage takes its neighbour's pre-edge value
+       end
+       srl[0] <= i_d; // - newest element enters the chain at index 0
+    end
+ end // always @ (posedge clock)
+
+ always @* begin // - combi always
+ srlo_ <= 'bx;
+ shift_en_o_ <= 1'bx;
+ shift_en_ <= 1'bx;
+ addr_ <= 'bx;
+ state_ <= 2'bx;
+ case (state)
+
+ state_empty: begin // - (empty, will not produce)
+ if (i_v) begin // - empty & i_v => consume
+ srlo_ <= i_d;
+ shift_en_o_ <= 1;
+ shift_en_ <= 1'bx;
+ addr_ <= 0;
+ state_ <= state_one;
+ end
+ else begin // - empty & !i_v => idle
+ srlo_ <= 'bx;
+ shift_en_o_ <= 0;
+ shift_en_ <= 1'bx;
+ addr_ <= 0;
+ state_ <= state_empty;
+ end
+ end
+
+ state_one: begin // - (contains one)
+ if (i_v && o_b) begin // - one & i_v & o_b => consume
+ srlo_ <= 'bx;
+ shift_en_o_ <= 0;
+ shift_en_ <= 1;
+ addr_ <= 0;
+ state_ <= state_more;
+ end
+ else if (i_v && !o_b) begin // - one & i_v & !o_b => cons+prod
+ srlo_ <= i_d;
+ shift_en_o_ <= 1;
+ shift_en_ <= 1;
+ addr_ <= 0;
+ state_ <= state_one;
+ end
+ else if (!i_v && o_b) begin // - one & !i_v & o_b => idle
+ srlo_ <= 'bx;
+ shift_en_o_ <= 0;
+ shift_en_ <= 1'bx;
+ addr_ <= 0;
+ state_ <= state_one;
+ end
+ else if (!i_v && !o_b) begin // - one & !i_v & !o_b => produce
+ srlo_ <= 'bx;
+ shift_en_o_ <= 0;
+ shift_en_ <= 1'bx;
+ addr_ <= 0;
+ state_ <= state_empty;
+ end
+ end // case: state_one
+
+ state_more: begin // - (contains more than one)
+ if (addr_full || (depth==2)) begin
+ // - (full, will not consume)
+ // - (full here if depth==2)
+ if (o_b) begin // - full & o_b => idle
+ srlo_ <= 'bx;
+ shift_en_o_ <= 0;
+ shift_en_ <= 0;
+ addr_ <= addr;
+ state_ <= state_more;
+ end
+ else begin // - full & !o_b => produce
+ srlo_ <= srl[addr];
+ shift_en_o_ <= 1;
+ shift_en_ <= 0;
+// addr_ <= addr-1;
+// state_ <= state_more;
+ addr_ <= addr_zero_ ? 0 : addr-1;
+ state_ <= addr_zero_ ? state_one : state_more;
+ end
+ end
+ else begin // - (mid: neither empty nor full)
+ if (i_v && o_b) begin // - mid & i_v & o_b => consume
+ srlo_ <= 'bx;
+ shift_en_o_ <= 0;
+ shift_en_ <= 1;
+ addr_ <= addr+1;
+ state_ <= state_more;
+ end
+ else if (i_v && !o_b) begin // - mid & i_v & !o_b => cons+prod
+ srlo_ <= srl[addr];
+ shift_en_o_ <= 1;
+ shift_en_ <= 1;
+ addr_ <= addr;
+ state_ <= state_more;
+ end
+ else if (!i_v && o_b) begin // - mid & !i_v & o_b => idle
+ srlo_ <= 'bx;
+ shift_en_o_ <= 0;
+ shift_en_ <= 0;
+ addr_ <= addr;
+ state_ <= state_more;
+ end
+ else if (!i_v && !o_b) begin // - mid & !i_v & !o_b => produce
+ srlo_ <= srl[addr];
+ shift_en_o_ <= 1;
+ shift_en_ <= 0;
+ addr_ <= addr_zero_ ? 0 : addr-1;
+ state_ <= addr_zero_ ? state_one : state_more;
+ end
+ end // else: !if(addr_full)
+ end // case: state_more
+
+ default: begin
+ srlo_ <= 'bx;
+ shift_en_o_ <= 1'bx;
+ shift_en_ <= 1'bx;
+ addr_ <= 'bx;
+ state_ <= 2'bx;
+ end // case: default
+
+ endcase // case(state)
+ end // always @ *
+
+endmodule // Q_srl
+
+
+`endif // `ifdef Q_srl
diff --git a/finn-rtllib/fifo/hdl/fifo_template.v b/finn-rtllib/fifo/hdl/fifo_template.v
new file mode 100644
index 0000000000..3f14ae991f
--- /dev/null
+++ b/finn-rtllib/fifo/hdl/fifo_template.v
@@ -0,0 +1,72 @@
+/******************************************************************************
+ * Copyright (C) 2024, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+module $TOP_MODULE_NAME$( // AXI-Stream FIFO shell; all ports are wired straight to the Q_srl instance below
+//- Global Control ------------------
+(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V, ASSOCIATED_RESET = ap_rst_n" *)
+(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
+input ap_clk,
+(* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
+input ap_rst_n,
+
+output $COUNT_RANGE$ count, // driven by the Q_srl .count port
+output $COUNT_RANGE$ maxcount, // driven by the Q_srl .maxcount port
+
+//- AXI Stream - Input --------------
+output in0_V_TREADY,
+input in0_V_TVALID,
+input $IN_RANGE$ in0_V_TDATA,
+
+//- AXI Stream - Output --------------
+input out_V_TREADY,
+output out_V_TVALID,
+output $OUT_RANGE$ out_V_TDATA
+);
+
+Q_srl #(
+.depth($DEPTH$),
+.width($WIDTH$)
+)
+impl
+(
+ .clock(ap_clk),
+ .reset(!ap_rst_n), // ap_rst_n is declared ACTIVE_LOW above; Q_srl receives the inverted level
+ .count(count),
+ .maxcount(maxcount),
+ .i_d(in0_V_TDATA),
+ .i_v(in0_V_TVALID),
+ .i_r(in0_V_TREADY),
+ .o_d(out_V_TDATA),
+ .o_v(out_V_TVALID),
+ .o_r(out_V_TREADY)
+);
+
+endmodule
diff --git a/finn-rtllib/fmpadding/hdl/fmpadding_template.v b/finn-rtllib/fmpadding/hdl/fmpadding_template.v
index 0b0f40f86a..2347d9b394 100644
--- a/finn-rtllib/fmpadding/hdl/fmpadding_template.v
+++ b/finn-rtllib/fmpadding/hdl/fmpadding_template.v
@@ -31,10 +31,11 @@
module $TOP_MODULE_NAME$(
//- Global Control ------------------
-(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite" *)
-input ap_clk,
-(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite" *)
-input ap_rst_n,
+(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite, ASSOCIATED_RESET = ap_rst_n" *)
+(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
+input ap_clk,
+(* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
+input ap_rst_n,
//- AXI Lite ------------------------
// Writing
@@ -86,7 +87,7 @@ fmpadding_axi #(
.INIT_YOFF($INIT_YOFF$),
.INIT_YEND($INIT_YEND$)
)
-$TOP_MODULE_NAME$_impl
+impl
(
.ap_clk(ap_clk),
.ap_rst_n(ap_rst_n),
diff --git a/finn-rtllib/memstream/component.xml b/finn-rtllib/memstream/component.xml
index 63a8540a76..722da1d803 100644
--- a/finn-rtllib/memstream/component.xml
+++ b/finn-rtllib/memstream/component.xml
@@ -1,7 +1,7 @@
- xilinx.com
- user
+ amd.com
+ finnmemstream1.0
@@ -37,201 +37,6 @@
-
- m_axis_1
-
-
-
-
-
-
- TDATA
-
-
- m_axis_1_tdata
-
-
-
-
- TVALID
-
-
- m_axis_1_tvalid
-
-
-
-
- TREADY
-
-
- m_axis_1_tready
-
-
-
-
-
-
- true
-
-
-
-
-
- m_axis_2
-
-
-
-
-
-
- TDATA
-
-
- m_axis_2_tdata
-
-
-
-
- TVALID
-
-
- m_axis_2_tvalid
-
-
-
-
- TREADY
-
-
- m_axis_2_tready
-
-
-
-
-
-
- true
-
-
-
-
-
- m_axis_3
-
-
-
-
-
-
- TDATA
-
-
- m_axis_3_tdata
-
-
-
-
- TVALID
-
-
- m_axis_3_tvalid
-
-
-
-
- TREADY
-
-
- m_axis_3_tready
-
-
-
-
-
-
- true
-
-
-
-
-
- m_axis_4
-
-
-
-
-
-
- TDATA
-
-
- m_axis_4_tdata
-
-
-
-
- TVALID
-
-
- m_axis_4_tvalid
-
-
-
-
- TREADY
-
-
- m_axis_4_tready
-
-
-
-
-
-
- true
-
-
-
-
-
- m_axis_5
-
-
-
-
-
-
- TDATA
-
-
- m_axis_5_tdata
-
-
-
-
- TVALID
-
-
- m_axis_5_tvalid
-
-
-
-
- TREADY
-
-
- m_axis_5_tready
-
-
-
-
-
-
- true
-
-
-
- s_axilite
@@ -393,16 +198,9 @@
-
-
-
- true
-
-
-
- aresetn
+ ap_rst_n
@@ -412,19 +210,19 @@
RST
- aresetn
+ ap_rst_nPOLARITY
- ACTIVE_LOW
+ ACTIVE_LOW
- aclk
+ ap_clk
@@ -434,18 +232,22 @@
CLK
- aclk
+ ap_clk
+
+ ASSOCIATED_RESET
+ ap_rst_n
+ ASSOCIATED_BUSIF
- m_axis_0:m_axis_1:m_axis_2:m_axis_3:m_axis_4:m_axis_5:s_axilite
+ m_axis_0:s_axilite
- ASSOCIATED_RESET
- aresetn
+ FREQ_TOLERANCE_HZ
+ -1
@@ -453,11 +255,13 @@
interface_aximm
+ interface_aximmreg0
- 0
- 65536
- 32
+ reg0
+ 0x0
+ 4096
+ 32register
@@ -468,15 +272,15 @@
xilinx_anylanguagesynthesisSynthesis:vivado.xilinx.com:synthesis
- Verilog
- memstream
+ SystemVerilog
+ memstream_axi_wrapperxilinx_anylanguagesynthesis_view_filesetviewChecksum
- 1fc5a310
+ 04464096
@@ -484,15 +288,27 @@
xilinx_anylanguagebehavioralsimulationSimulation:vivado.xilinx.com:simulation
- Verilog
- memstream
+ SystemVerilog
+ memstream_axi_wrapperxilinx_anylanguagebehavioralsimulation_view_filesetviewChecksum
- d02d9990
+ 9e058959
+
+
+
+
+ xilinx_implementation
+ Implementation
+ :vivado.xilinx.com:implementation
+ memstream_axi_wrapper
+
+
+ viewChecksum
+ cd434062
@@ -506,7 +322,7 @@
viewChecksum
- f960907f
+ 6c92393d
@@ -520,14 +336,14 @@
viewChecksum
- d2aad2c5
+ 923e7b90
- aclk
+ ap_clkin
@@ -540,7 +356,7 @@
- aresetn
+ ap_rst_nin
@@ -582,11 +398,11 @@
- awaddr
+ awprotin
- 15
+ 20
@@ -602,11 +418,11 @@
- awprot
+ awaddrin
- 2
+ 100
@@ -766,11 +582,11 @@
- araddr
+ arprotin
- 15
+ 20
@@ -786,11 +602,11 @@
- arprot
+ araddrin
- 2
+ 100
@@ -868,29 +684,6 @@
-
- m_axis_0_afull
-
- in
-
-
- std_logic
- xilinx_anylanguagesynthesis
- xilinx_anylanguagebehavioralsimulation
-
-
-
- 0
-
-
-
-
-
- true
-
-
-
- m_axis_0_tready
@@ -925,352 +718,7 @@
out
- 31
- 0
-
-
-
- std_logic_vector
- xilinx_anylanguagesynthesis
- xilinx_anylanguagebehavioralsimulation
-
-
-
-
-
- m_axis_1_afull
-
- in
-
-
- std_logic
- xilinx_anylanguagesynthesis
- xilinx_anylanguagebehavioralsimulation
-
-
-
- 0
-
-
-
-
-
- true
-
-
-
-
-
- m_axis_1_tready
-
- in
-
-
- std_logic
- xilinx_anylanguagesynthesis
- xilinx_anylanguagebehavioralsimulation
-
-
-
- 1
-
-
-
-
- m_axis_1_tvalid
-
- out
-
-
- std_logic
- xilinx_anylanguagesynthesis
- xilinx_anylanguagebehavioralsimulation
-
-
-
-
-
- m_axis_1_tdata
-
- out
-
- 31
- 0
-
-
-
- std_logic_vector
- xilinx_anylanguagesynthesis
- xilinx_anylanguagebehavioralsimulation
-
-
-
-
-
- m_axis_2_afull
-
- in
-
-
- std_logic
- xilinx_anylanguagesynthesis
- xilinx_anylanguagebehavioralsimulation
-
-
-
- 0
-
-
-
-
-
- true
-
-
-
-
-
- m_axis_2_tready
-
- in
-
-
- std_logic
- xilinx_anylanguagesynthesis
- xilinx_anylanguagebehavioralsimulation
-
-
-
- 1
-
-
-
-
- m_axis_2_tvalid
-
- out
-
-
- std_logic
- xilinx_anylanguagesynthesis
- xilinx_anylanguagebehavioralsimulation
-
-
-
-
-
- m_axis_2_tdata
-
- out
-
- 31
- 0
-
-
-
- std_logic_vector
- xilinx_anylanguagesynthesis
- xilinx_anylanguagebehavioralsimulation
-
-
-
-
-
- m_axis_3_afull
-
- in
-
-
- std_logic
- xilinx_anylanguagesynthesis
- xilinx_anylanguagebehavioralsimulation
-
-
-
- 0
-
-
-
-
-
- true
-
-
-
-
-
- m_axis_3_tready
-
- in
-
-
- std_logic
- xilinx_anylanguagesynthesis
- xilinx_anylanguagebehavioralsimulation
-
-
-
- 1
-
-
-
-
- m_axis_3_tvalid
-
- out
-
-
- std_logic
- xilinx_anylanguagesynthesis
- xilinx_anylanguagebehavioralsimulation
-
-
-
-
-
- m_axis_3_tdata
-
- out
-
- 31
- 0
-
-
-
- std_logic_vector
- xilinx_anylanguagesynthesis
- xilinx_anylanguagebehavioralsimulation
-
-
-
-
-
- m_axis_4_afull
-
- in
-
-
- std_logic
- xilinx_anylanguagesynthesis
- xilinx_anylanguagebehavioralsimulation
-
-
-
- 0
-
-
-
-
-
- true
-
-
-
-
-
- m_axis_4_tready
-
- in
-
-
- std_logic
- xilinx_anylanguagesynthesis
- xilinx_anylanguagebehavioralsimulation
-
-
-
- 1
-
-
-
-
- m_axis_4_tvalid
-
- out
-
-
- std_logic
- xilinx_anylanguagesynthesis
- xilinx_anylanguagebehavioralsimulation
-
-
-
-
-
- m_axis_4_tdata
-
- out
-
- 31
- 0
-
-
-
- std_logic_vector
- xilinx_anylanguagesynthesis
- xilinx_anylanguagebehavioralsimulation
-
-
-
-
-
- m_axis_5_afull
-
- in
-
-
- std_logic
- xilinx_anylanguagesynthesis
- xilinx_anylanguagebehavioralsimulation
-
-
-
- 0
-
-
-
-
-
- true
-
-
-
-
-
- m_axis_5_tready
-
- in
-
-
- std_logic
- xilinx_anylanguagesynthesis
- xilinx_anylanguagebehavioralsimulation
-
-
-
- 1
-
-
-
-
- m_axis_5_tvalid
-
- out
-
-
- std_logic
- xilinx_anylanguagesynthesis
- xilinx_anylanguagebehavioralsimulation
-
-
-
-
-
- m_axis_5_tdata
-
- out
-
- 31
+ 310
@@ -1285,129 +733,29 @@
- CONFIG_EN
- Config En
- true
+ DEPTH
+ Depth
+ 512
- NSTREAMS
- Nstreams
- 6
-
-
- MEM_DEPTH
- Mem Depth
- 13824
-
-
- MEM_WIDTH
- Mem Width
- 32
+ WIDTH
+ Width
+ 32
- MEM_INIT
- Mem Init
- ./
+ INIT_FILE
+ Init File
+ RAM_STYLERam Styleauto
-
- STRM0_WIDTH
- Strm0 Width
- 32
-
-
- STRM1_WIDTH
- Strm1 Width
- 32
-
-
- STRM2_WIDTH
- Strm2 Width
- 32
-
-
- STRM3_WIDTH
- Strm3 Width
- 32
-
-
- STRM4_WIDTH
- Strm4 Width
- 32
-
-
- STRM5_WIDTH
- Strm5 Width
- 32
-
-
- STRM0_DEPTH
- Strm0 Depth
- 2304
-
-
- STRM1_DEPTH
- Strm1 Depth
- 2304
-
-
- STRM2_DEPTH
- Strm2 Depth
- 2304
-
-
- STRM3_DEPTH
- Strm3 Depth
- 2304
-
-
- STRM4_DEPTH
- Strm4 Depth
- 2304
-
-
- STRM5_DEPTH
- Strm5 Depth
- 2304
-
-
- STRM0_OFFSET
- Strm0 Offset
- 0
-
-
- STRM1_OFFSET
- Strm1 Offset
- 2304
-
-
- STRM2_OFFSET
- Strm2 Offset
- 4608
-
-
- STRM3_OFFSET
- Strm3 Offset
- 6912
-
-
- STRM4_OFFSET
- Strm4 Offset
- 9216
-
-
- STRM5_OFFSET
- Strm5 Offset
- 11520
- AXILITE_ADDR_WIDTHAxilite Addr Width
- 16
+ 11
@@ -1417,13 +765,6 @@
ACTIVE_HIGHACTIVE_LOW
-
- choice_list_e2bd1cd0
- auto
- distributed
- block
- ultra
-
@@ -1433,71 +774,41 @@
verilogSource
- hdl/memstream.v
- verilogSource
+ hdl/memstream.sv
+ systemVerilogSource
- hdl/memstream_multiblock.v
- verilogSource
+ hdl/memstream_axi.sv
+ systemVerilogSource
- hdl/memstream_singleblock.v
+ hdl/memstream_axi_wrapper.vverilogSource
-
-
- hdl/mux.v
- verilogSource
-
-
- hdl/ramb18_sdp.v
- verilogSource
-
-
- hdl/ramb18_wf_dualport.v
- verilogSource
- CHECKSUM_9425c051
+ CHECKSUM_7caabca7xilinx_anylanguagebehavioralsimulation_view_fileset
- hdl/memstream.v
- verilogSource
+ hdl/memstream.sv
+ systemVerilogSourceUSED_IN_ipstaticxil_defaultlib
- hdl/axilite_if.v
- verilogSource
+ hdl/memstream_axi.sv
+ systemVerilogSourceUSED_IN_ipstaticxil_defaultlib
- hdl/memstream_singleblock.v
- verilogSource
- USED_IN_ipstatic
- xil_defaultlib
-
-
- hdl/mux.v
- verilogSource
- USED_IN_ipstatic
- xil_defaultlib
-
-
- hdl/ramb18_wf_dualport.v
- verilogSource
- USED_IN_ipstatic
- xil_defaultlib
-
-
- hdl/memstream_multiblock.v
+ hdl/axilite_if.vverilogSourceUSED_IN_ipstaticxil_defaultlib
- hdl/ramb18_sdp.v
+ hdl/memstream_axi_wrapper.vverilogSourceUSED_IN_ipstaticxil_defaultlib
@@ -1508,7 +819,7 @@
xgui/memstream_v1_0.tcltclSource
- CHECKSUM_f960907f
+ CHECKSUM_32cad48dXGUI_VERSION_2
@@ -1520,132 +831,32 @@
- memstream_v1_0
+ memstream
- CONFIG_EN
- Config En
- true
-
-
- NSTREAMS
- Nstreams
- 6
-
-
- MEM_DEPTH
- Mem Depth
- 13824
+ DEPTH
+ Depth
+ 512
- MEM_WIDTH
- Mem Width
- 32
+ WIDTH
+ Width
+ 32
- MEM_INIT
- Mem Init
- ./
+ INIT_FILE
+ Init File
+ RAM_STYLERam Style
- auto
-
-
- STRM0_WIDTH
- Strm0 Width
- 32
-
-
- STRM1_WIDTH
- Strm1 Width
- 32
-
-
- STRM2_WIDTH
- Strm2 Width
- 32
-
-
- STRM3_WIDTH
- Strm3 Width
- 32
-
-
- STRM4_WIDTH
- Strm4 Width
- 32
-
-
- STRM5_WIDTH
- Strm5 Width
- 32
-
-
- STRM0_DEPTH
- Strm0 Depth
- 2304
-
-
- STRM1_DEPTH
- Strm1 Depth
- 2304
-
-
- STRM2_DEPTH
- Strm2 Depth
- 2304
-
-
- STRM3_DEPTH
- Strm3 Depth
- 2304
-
-
- STRM4_DEPTH
- Strm4 Depth
- 2304
-
-
- STRM5_DEPTH
- Strm5 Depth
- 2304
-
-
- STRM0_OFFSET
- Strm0 Offset
- 0
-
-
- STRM1_OFFSET
- Strm1 Offset
- 2304
-
-
- STRM2_OFFSET
- Strm2 Offset
- 4608
-
-
- STRM3_OFFSET
- Strm3 Offset
- 6912
-
-
- STRM4_OFFSET
- Strm4 Offset
- 9216
-
-
- STRM5_OFFSET
- Strm5 Offset
- 11520
+ autoAXILITE_ADDR_WIDTHAxilite Addr Width
- 16
+ 11
@@ -1656,52 +867,40 @@
Component_Name
- memstream_v1_0
+ memstream_axi_wrapper_v1_0
-
- aartix7
- akintex7
- artix7
- artix7l
- azynq
- kintex7
- kintex7l
- kintexu
- kintexuplus
- qkintex7
- qkintex7l
- qvirtex7
- qzynq
- qzynqplus
- versal
- versalprime
- virtex7
- virtexu
- virtexuplus
- virtexuplusHBM
- virtexupluse58g
- zynq
- zynquplus
- /UserIP
- memstream_v1_0
+ memstream
+ level_1package_project
+ AMD5
- 2020-10-09T15:31:57Z
+
+ user.org:user:memstream_axi_wrapper:1.0
+
+ 2023-05-24T06:34:57Z
+
+
+
- 2020.1
-
-
-
-
-
-
+ 2022.2
+
+
+
+
+
+
+
+
+
+
+
diff --git a/finn-rtllib/memstream/gui/memstream_v1_0.gtcl b/finn-rtllib/memstream/gui/memstream_v1_0.gtcl
index a68b85e1f5..271f9df453 100644
--- a/finn-rtllib/memstream/gui/memstream_v1_0.gtcl
+++ b/finn-rtllib/memstream/gui/memstream_v1_0.gtcl
@@ -1,2 +1,2 @@
# This file is automatically written. Do not modify.
-proc gen_USERPARAMETER_AXILITE_ADDR_WIDTH_VALUE {MEM_DEPTH MEM_WIDTH } {expr 2+ceil(log($MEM_DEPTH*pow(2,ceil(log(($MEM_WIDTH+31)/32)/log(2))))/log(2))}
+proc gen_USERPARAMETER_AXILITE_ADDR_WIDTH_VALUE {DEPTH WIDTH } {expr 2 + ceil(log($DEPTH*pow(2, ceil(log(($WIDTH+31)/32)/log(2))))/log(2))}
diff --git a/finn-rtllib/memstream/hdl/memstream.sv b/finn-rtllib/memstream/hdl/memstream.sv
new file mode 100644
index 0000000000..9cbef493a3
--- /dev/null
+++ b/finn-rtllib/memstream/hdl/memstream.sv
@@ -0,0 +1,176 @@
+/**
+ * Copyright (c) 2023, Xilinx
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * * Neither the name of FINN nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @author Thomas B. Preußer
+ */
+
+module memstream #( // Memory that streams its contents cyclically and offers a config write / read-back port
+ int unsigned DEPTH, // number of words in Mem
+ int unsigned WIDTH, // bits per word (width of data_t)
+
+ parameter INIT_FILE = "", // $readmemh image for Mem; "" skips initialization
+ parameter RAM_STYLE = "auto" // value of the RAM_STYLE attribute placed on Mem
+)(
+ input logic clk,
+ input logic rst,
+
+ // Configuration and readback interface - compatible with ap_memory
+ input logic config_ce,
+ input logic config_we,
+ input logic [31 :0] config_address, // assigned to the addr_t-wide Ptr[1].val
+ input logic [WIDTH-1:0] config_d0,
+
+ output logic config_rack, // = Rb2: config_q0 carries read-back data
+ output logic [WIDTH-1:0] config_q0,
+
+ // Continuous output stream
+ input logic ordy,
+ output logic ovld, // = Rs2: odat carries stream data
+ output logic [WIDTH-1:0] odat
+);
+
+ typedef logic [$clog2(DEPTH)-1:0] addr_t;
+ typedef logic [WIDTH -1:0] data_t;
+
+ uwire en; // Pipeline enable
+ uwire rollback; // Rollback stream reads if backpressure would block read back
+
+ // Counter with pre-computed last indication for val == DEPTH-1
+ typedef struct {
+ addr_t val;
+ logic lst;
+ } ptr_t;
+
+ // Counter history to facilitate pipeline rollback
+ ptr_t Ptr[3] = '{
+ 0: '{ val: 0, lst: DEPTH<2 },
+ default: '{ default: 'x }
+ };
+
+ //-----------------------------------------------------------------------
+ // Stage #1: Address & Op
+ logic Wr1 = 0; // Write
+ logic Rb1 = 0; // Read back
+ logic Rs1 = 0; // Read stream
+ data_t Data1 = 'x;
+ if(1) begin : blkStage1
+ // Increment for wrapping DEPTH-1 back to zero
+ localparam int unsigned WRAP_INC = 2**$bits(addr_t) - DEPTH + 1;
+
+ uwire ptr_t ptr_eff = rollback? Ptr[2] : Ptr[0]; // on rollback restart from Ptr[2], else continue from Ptr[0]
+ uwire ptr_t ptr_nxt;
+ assign ptr_nxt.val = ptr_eff.val + (config_ce? 0 : !ptr_eff.lst? 1 : WRAP_INC); // no advance while config_ce is high
+ assign ptr_nxt.lst =
+ DEPTH < 2? 1 :
+ config_ce? ptr_eff.lst :
+ ptr_eff.lst? 0 :
+ /* else */ ptr_eff.val == DEPTH-2;
+
+ always_ff @(posedge clk) begin
+ if(rst) Ptr[0] <= '{ val: 0, lst: DEPTH<2 };
+ else if(en) Ptr[0] <= ptr_nxt;
+ end
+
+ // Issue next Memory Operation
+ always_ff @(posedge clk) begin
+ if(rst) begin
+ Wr1 <= 0;
+ Rb1 <= 0;
+ Rs1 <= 0;
+ Ptr[1] <= '{ default : 'x };
+ Data1 <= 'x;
+ end
+ else if(en) begin
+ Wr1 <= 0;
+ Rb1 <= 0;
+ Rs1 <= 0;
+ if(config_ce) begin // a config access takes the slot instead of a stream read
+ if(config_we) Wr1 <= 1;
+ else Rb1 <= 1;
+ Ptr[1] <= '{ val: config_address, lst: 'x };
+ Data1 <= config_d0;
+ end
+ else begin
+ Rs1 <= 1;
+ Ptr[1] <= ptr_eff;
+ Data1 <= 'x;
+ end
+ end
+ end
+ end : blkStage1
+
+ //-----------------------------------------------------------------------
+ // Stage #2: Memory Access
+ logic Rb2 = 0;
+ logic Rs2 = 0;
+ data_t Data2 = 'x;
+ if(1) begin : blkStage2
+ (* RAM_STYLE = RAM_STYLE *)
+ data_t Mem[DEPTH];
+
+ // Optional Memory Initialization
+ if(INIT_FILE != "") initial $readmemh(INIT_FILE, Mem);
+
+ // Execute Memory Operation
+ uwire addr_t addr = Ptr[1].val;
+ always_ff @(posedge clk) begin
+ if(en) begin
+ if(Wr1) Mem[addr] <= Data1;
+ Data2 <= Mem[addr];
+ end
+ end
+
+ // Copy Output Designation
+ always_ff @(posedge clk) begin
+ if(rst) begin
+ Rb2 <= 0;
+ Rs2 <= 0;
+ Ptr[2] <= '{ default: 'x };
+ end
+ else if(en) begin
+ Rb2 <= Rb1;
+ Rs2 <= Rs1 && !rollback; // a stream read in stage 1 is not marked valid when rolling back
+ Ptr[2] <= Ptr[1];
+ end
+ end
+ end : blkStage2
+
+ //-----------------------------------------------------------------------
+ // Output Interfaces
+ assign config_rack = Rb2;
+ assign config_q0 = Data2;
+
+ assign ovld = Rs2;
+ assign odat = Data2;
+
+ uwire backpressure = Rs2 && !ordy; // stream word offered but not accepted
+ assign rollback = backpressure && (Rb1 || config_ce);
+ assign en = !backpressure || Rb1 || config_ce; // stall on backpressure unless a read back or config access is pending
+
+endmodule : memstream
diff --git a/finn-rtllib/memstream/hdl/memstream.v b/finn-rtllib/memstream/hdl/memstream.v
deleted file mode 100644
index 2cd955f8d1..0000000000
--- a/finn-rtllib/memstream/hdl/memstream.v
+++ /dev/null
@@ -1,327 +0,0 @@
-/*
- Copyright (c) 2020, Xilinx
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
- list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
-
- * Neither the name of FINN nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-module memstream
-#(
-//parameters to enable/disable axi-mm, set number of streams, set readmemh for memory, set per-stream offsets in memory, set per-stream widths
- parameter CONFIG_EN = 1,
- parameter NSTREAMS = 6,//1 up to 6
-
- parameter MEM_DEPTH = 13824,
- parameter MEM_WIDTH = 32,
- parameter MEM_INIT = "./",
- parameter RAM_STYLE = "auto",
-
- //widths per stream
- parameter STRM0_WIDTH = 32,
- parameter STRM1_WIDTH = 32,
- parameter STRM2_WIDTH = 32,
- parameter STRM3_WIDTH = 32,
- parameter STRM4_WIDTH = 32,
- parameter STRM5_WIDTH = 32,
-
- //depths per stream
- parameter STRM0_DEPTH = 2304,
- parameter STRM1_DEPTH = 2304,
- parameter STRM2_DEPTH = 2304,
- parameter STRM3_DEPTH = 2304,
- parameter STRM4_DEPTH = 2304,
- parameter STRM5_DEPTH = 2304,
-
- //offsets for each stream
- parameter STRM0_OFFSET = 0,
- parameter STRM1_OFFSET = 2304,
- parameter STRM2_OFFSET = 4608,
- parameter STRM3_OFFSET = 6912,
- parameter STRM4_OFFSET = 9216,
- parameter STRM5_OFFSET = 11520,
-
- parameter AXILITE_ADDR_WIDTH = 2+$clog2(MEM_DEPTH*(1<<$clog2((MEM_WIDTH+31)/32)))
-)
-
-(
- input aclk,
- input aresetn,
-
- output awready,
- input awvalid,
- input [AXILITE_ADDR_WIDTH-1:0] awaddr,
- input [2:0] awprot,
- //write data
- output wready,
- input wvalid,
- input [31:0] wdata,
- input [3:0] wstrb,
- //burst response
- input bready,
- output bvalid,
- output [1:0] bresp,
-
- //Read channels
- //read address
- output arready,
- input arvalid,
- input [AXILITE_ADDR_WIDTH-1:0] araddr,
- input [2:0] arprot,
- //read data
- input rready,
- output rvalid,
- output [1:0] rresp,
- output [31:0] rdata,
-
- //multiple output AXI Streams, TDATA width rounded to multiple of 8 bits
- input m_axis_0_afull,
- input m_axis_0_tready,
- output m_axis_0_tvalid,
- output [((STRM0_WIDTH+7)/8)*8-1:0] m_axis_0_tdata,
-
- input m_axis_1_afull,
- input m_axis_1_tready,
- output m_axis_1_tvalid,
- output [((STRM1_WIDTH+7)/8)*8-1:0] m_axis_1_tdata,
-
- input m_axis_2_afull,
- input m_axis_2_tready,
- output m_axis_2_tvalid,
- output [((STRM2_WIDTH+7)/8)*8-1:0] m_axis_2_tdata,
-
- input m_axis_3_afull,
- input m_axis_3_tready,
- output m_axis_3_tvalid,
- output [((STRM3_WIDTH+7)/8)*8-1:0] m_axis_3_tdata,
-
- input m_axis_4_afull,
- input m_axis_4_tready,
- output m_axis_4_tvalid,
- output [((STRM4_WIDTH+7)/8)*8-1:0] m_axis_4_tdata,
-
- input m_axis_5_afull,
- input m_axis_5_tready,
- output m_axis_5_tvalid,
- output [((STRM5_WIDTH+7)/8)*8-1:0] m_axis_5_tdata
-
-
-);
-
-wire [31:0] config_address;
-wire config_ce;
-wire config_we;
-wire config_rack;
-wire [MEM_WIDTH-1:0] config_d0;
-wire [MEM_WIDTH-1:0] config_q0;
-
-generate
-if(NSTREAMS <= 2) begin: singleblock
-
-
-memstream_singleblock
-#(
- .CONFIG_EN(CONFIG_EN),
- .NSTREAMS(NSTREAMS),
- .MEM_DEPTH(MEM_DEPTH),
- .MEM_WIDTH(MEM_WIDTH),
- .MEM_INIT(MEM_INIT),
- .RAM_STYLE(RAM_STYLE),
-
- //widths per stream
- .STRM0_WIDTH(STRM0_WIDTH),
- .STRM1_WIDTH(STRM1_WIDTH),
-
- //depths per stream
- .STRM0_DEPTH(STRM0_DEPTH),
- .STRM1_DEPTH(STRM1_DEPTH),
-
- //offsets for each stream
- .STRM0_OFFSET(STRM0_OFFSET),
- .STRM1_OFFSET(STRM1_OFFSET)
-)
-mem
-(
- .aclk(aclk),
- .aresetn(aresetn),
-
- .config_address(config_address),
- .config_ce(config_ce),
- .config_we(config_we),
- .config_d0(config_d0),
- .config_q0(config_q0),
- .config_rack(config_rack),
-
- .m_axis_0_tready(m_axis_0_tready),
- .m_axis_0_tvalid(m_axis_0_tvalid),
- .m_axis_0_tdata(m_axis_0_tdata),
-
- .m_axis_1_tready(m_axis_1_tready),
- .m_axis_1_tvalid(m_axis_1_tvalid),
- .m_axis_1_tdata(m_axis_1_tdata)
-);
-
-assign m_axis_2_tvalid = 0;
-assign m_axis_2_tdata = 0;
-assign m_axis_3_tvalid = 0;
-assign m_axis_3_tdata = 0;
-assign m_axis_4_tvalid = 0;
-assign m_axis_4_tdata = 0;
-assign m_axis_5_tvalid = 0;
-assign m_axis_5_tdata = 0;
-
-end else begin: multiblock
-
-
-memstream_multiblock
-#(
- .CONFIG_EN(CONFIG_EN),
- .NSTREAMS(NSTREAMS),
- .MEM_DEPTH(MEM_DEPTH),
- .MEM_WIDTH(MEM_WIDTH),
- .MEM_INIT(MEM_INIT),
- .RAM_STYLE(RAM_STYLE),
-
- //widths per stream
- .STRM0_WIDTH(STRM0_WIDTH),
- .STRM1_WIDTH(STRM1_WIDTH),
- .STRM2_WIDTH(STRM2_WIDTH),
- .STRM3_WIDTH(STRM3_WIDTH),
- .STRM4_WIDTH(STRM4_WIDTH),
- .STRM5_WIDTH(STRM5_WIDTH),
-
- //depths per stream
- .STRM0_DEPTH(STRM0_DEPTH),
- .STRM1_DEPTH(STRM1_DEPTH),
- .STRM2_DEPTH(STRM2_DEPTH),
- .STRM3_DEPTH(STRM3_DEPTH),
- .STRM4_DEPTH(STRM4_DEPTH),
- .STRM5_DEPTH(STRM5_DEPTH),
-
- //offsets for each stream
- .STRM0_OFFSET(STRM0_OFFSET),
- .STRM1_OFFSET(STRM1_OFFSET),
- .STRM2_OFFSET(STRM2_OFFSET),
- .STRM3_OFFSET(STRM3_OFFSET),
- .STRM4_OFFSET(STRM4_OFFSET),
- .STRM5_OFFSET(STRM5_OFFSET)
-)
-mem
-(
- .aclk(aclk),
- .aresetn(aresetn),
-
- .config_address(config_address),
- .config_ce(config_ce),
- .config_we(config_we),
- .config_d0(config_d0),
- .config_q0(config_q0),
-
- .m_axis_0_afull(m_axis_0_afull),
- .m_axis_0_tready(m_axis_0_tready),
- .m_axis_0_tvalid(m_axis_0_tvalid),
- .m_axis_0_tdata(m_axis_0_tdata),
-
- .m_axis_1_afull(m_axis_1_afull),
- .m_axis_1_tready(m_axis_1_tready),
- .m_axis_1_tvalid(m_axis_1_tvalid),
- .m_axis_1_tdata(m_axis_1_tdata),
-
- .m_axis_2_afull(m_axis_2_afull),
- .m_axis_2_tready(m_axis_2_tready),
- .m_axis_2_tvalid(m_axis_2_tvalid),
- .m_axis_2_tdata(m_axis_2_tdata),
-
- .m_axis_3_afull(m_axis_3_afull),
- .m_axis_3_tready(m_axis_3_tready),
- .m_axis_3_tvalid(m_axis_3_tvalid),
- .m_axis_3_tdata(m_axis_3_tdata),
-
- .m_axis_4_afull(m_axis_4_afull),
- .m_axis_4_tready(m_axis_4_tready),
- .m_axis_4_tvalid(m_axis_4_tvalid),
- .m_axis_4_tdata(m_axis_4_tdata),
-
- .m_axis_5_afull(m_axis_5_afull),
- .m_axis_5_tready(m_axis_5_tready),
- .m_axis_5_tvalid(m_axis_5_tvalid),
- .m_axis_5_tdata(m_axis_5_tdata)
-
-);
-
-
-end
-endgenerate
-
-axi4lite_if
-#(
- .ADDR_WIDTH(AXILITE_ADDR_WIDTH),
- .DATA_WIDTH(32),
- .IP_DATA_WIDTH(MEM_WIDTH)
-)
-config_if
-(
- //system signals
- .aclk(aclk),
- .aresetn(aresetn),
-
- //Write channels
- //write address
- .awready(awready),
- .awvalid(awvalid),
- .awaddr(awaddr),
- .awprot(awprot),
- //write data
- .wready(wready),
- .wvalid(wvalid),
- .wdata(wdata),
- .wstrb(wstrb),
- //burst response
- .bready(bready),
- .bvalid(bvalid),
- .bresp(bresp),
-
- //Read channels
- //read address
- .arready(arready),
- .arvalid(arvalid),
- .araddr(araddr),
- .arprot(arprot),
- //read data
- .rready(rready),
- .rvalid(rvalid),
- .rresp(rresp),
- .rdata(rdata),
-
- //IP-side interface
- .ip_en(config_ce),
- .ip_wen(config_we),
- .ip_addr(config_address),
- .ip_wdata(config_d0),
- .ip_rack(config_rack),
- .ip_rdata(config_q0)
-);
-
-endmodule
diff --git a/finn-rtllib/memstream/hdl/memstream_axi.sv b/finn-rtllib/memstream/hdl/memstream_axi.sv
new file mode 100644
index 0000000000..136bcb1d7e
--- /dev/null
+++ b/finn-rtllib/memstream/hdl/memstream_axi.sv
@@ -0,0 +1,136 @@
+/**
+ * Copyright (c) 2023, Xilinx
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * * Neither the name of FINN nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @author Thomas B. Preußer
+ */
+
+module memstream_axi #( // memstream backend behind an axi4lite_if adapter, with a byte-padded output stream
+ int unsigned DEPTH,
+ int unsigned WIDTH,
+
+ parameter INIT_FILE = "",
+ parameter RAM_STYLE = "auto",
+ // Address bits for DEPTH entries, each a power-of-two count of 32-bit words, plus 2 low bits (presumably the byte offset - confirm)
+ localparam int unsigned AXILITE_ADDR_WIDTH = $clog2(DEPTH * (2**$clog2((WIDTH+31)/32))) + 2
+)(
+ // Global Control
+ input logic clk,
+ input logic rst,
+
+ // AXI-lite Write
+ output logic awready,
+ input logic awvalid,
+ input logic [2:0] awprot,
+ input logic [AXILITE_ADDR_WIDTH-1:0] awaddr,
+
+ output logic wready,
+ input logic wvalid,
+ input logic [31:0] wdata,
+ input logic [ 3:0] wstrb,
+
+ input logic bready,
+ output logic bvalid,
+ output logic [1:0] bresp,
+
+ // AXI-lite Read
+ output logic arready,
+ input logic arvalid,
+ input logic [2:0] arprot,
+ input logic [AXILITE_ADDR_WIDTH-1:0] araddr,
+
+ input logic rready,
+ output logic rvalid,
+ output logic [ 1:0] rresp,
+ output logic [31:0] rdata,
+
+ // Continuous output stream
+ input logic m_axis_0_tready,
+ output logic m_axis_0_tvalid,
+ output logic [((WIDTH+7)/8)*8-1:0] m_axis_0_tdata // WIDTH rounded up to a multiple of 8 bits
+);
+
+ //-----------------------------------------------------------------------
+ // AXI-lite to ap_memory Adapter
+ uwire [31:0] config_address;
+ uwire config_ce;
+ uwire config_we;
+ uwire config_rack;
+ uwire [WIDTH-1:0] config_d0;
+ uwire [WIDTH-1:0] config_q0;
+ axi4lite_if #(
+ .ADDR_WIDTH(AXILITE_ADDR_WIDTH),
+ .DATA_WIDTH(32),
+ .IP_DATA_WIDTH(WIDTH)
+ ) config_if (
+ .aclk(clk), .aresetn(!rst), // the adapter's aresetn port is driven with the inverted rst
+
+ // Write Channels
+ .awready, .awvalid, .awaddr, .awprot,
+ .wready, .wvalid, .wdata, .wstrb,
+ .bready, .bvalid, .bresp,
+
+ // Read Channels
+ .arready, .arvalid, .araddr, .arprot,
+ .rready, .rvalid, .rresp, .rdata,
+
+ // IP-side Interface
+ .ip_en(config_ce),
+ .ip_wen(config_we),
+ .ip_addr(config_address),
+ .ip_wdata(config_d0),
+ .ip_rack(config_rack),
+ .ip_rdata(config_q0)
+ );
+
+ //-----------------------------------------------------------------------
+ // Streaming Memory Backend
+ memstream #(
+ .DEPTH(DEPTH),
+ .WIDTH(WIDTH),
+ .INIT_FILE(INIT_FILE),
+ .RAM_STYLE(RAM_STYLE)
+ ) mem (
+ .clk, .rst,
+
+ .config_address,
+ .config_ce,
+ .config_we,
+ .config_d0,
+ .config_q0,
+ .config_rack,
+
+ .ordy(m_axis_0_tready),
+ .ovld(m_axis_0_tvalid),
+ .odat(m_axis_0_tdata[WIDTH-1:0])
+ );
+ if($bits(m_axis_0_tdata) > WIDTH) begin // padding bits above WIDTH exist only when WIDTH is not a multiple of 8
+ assign m_axis_0_tdata[$left(m_axis_0_tdata):WIDTH] = '0;
+ end
+
+endmodule : memstream_axi
diff --git a/finn-rtllib/memstream/hdl/memstream_axi_wrapper.v b/finn-rtllib/memstream/hdl/memstream_axi_wrapper.v
new file mode 100644
index 0000000000..13f5c82d6e
--- /dev/null
+++ b/finn-rtllib/memstream/hdl/memstream_axi_wrapper.v
@@ -0,0 +1,123 @@
+/**
+ * Copyright (c) 2023, Xilinx
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * * Neither the name of FINN nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @author Thomas B. Preußer
+ */
+
+module memstream_axi_wrapper #(  // Wrapper that instantiates memstream_axi as `core` and connects every port by name
+ parameter DEPTH = 512,  // forwarded to the core as .DEPTH; also enters the default AXI-lite address width
+ parameter WIDTH = 32,  // forwarded to the core as .WIDTH; also sizes m_axis_0_tdata
+
+ parameter INIT_FILE = "",  // forwarded as .INIT_FILE unless replaced by INIT_FILTERED below
+ parameter RAM_STYLE = "auto",  // forwarded to the core as .RAM_STYLE
+ // Default below: clog2(DEPTH * [ceil(WIDTH/32) rounded up to a power of two]) + 2; the +2 presumably covers the byte offset within a 32-bit word - TODO confirm
+ parameter AXILITE_ADDR_WIDTH = $clog2(DEPTH * (2**$clog2((WIDTH+31)/32))) + 2
+)(
+ // Global Control: clock and active-low reset (the reset is inverted before it reaches the core)
+ (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF m_axis_0, ASSOCIATED_RESET ap_rst_n" *)
+ (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
+ input ap_clk,
+ (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
+ input ap_rst_n,
+
+ // AXI-lite Write: address (aw*), data (w*) and response (b*) channels
+ output awready,
+ input awvalid,
+ input [2:0] awprot,
+ input [AXILITE_ADDR_WIDTH-1:0] awaddr,
+
+ output wready,
+ input wvalid,
+ input [31:0] wdata,
+ input [ 3:0] wstrb,
+
+ input bready,
+ output bvalid,
+ output [1:0] bresp,
+
+ // AXI-lite Read: address (ar*) and data/response (r*) channels
+ output arready,
+ input arvalid,
+ input [2:0] arprot,
+ input [AXILITE_ADDR_WIDTH-1:0] araddr,
+
+ input rready,
+ output rvalid,
+ output [ 1:0] rresp,
+ output [31:0] rdata,
+
+ // Continuous output stream; tdata is WIDTH rounded up to a whole number of bytes
+ input m_axis_0_tready,
+ output m_axis_0_tvalid,
+ output [((WIDTH+7)/8)*8-1:0] m_axis_0_tdata
+);
+ // When SYNTHESIS is defined and RAM_STYLE is "ultra", the core gets an empty init file name; in every other case INIT_FILE is passed on as is.
+ localparam INIT_FILTERED =  // presumably because UltraRAM contents cannot be preloaded in synthesis - TODO confirm
+`ifdef SYNTHESIS
+ RAM_STYLE == "ultra"? "" :
+`endif
+ INIT_FILE;
+
+ memstream_axi #(  // core implementation being wrapped
+ .DEPTH(DEPTH), .WIDTH(WIDTH),
+ .INIT_FILE(INIT_FILTERED),
+ .RAM_STYLE(RAM_STYLE)  // NOTE(review): AXILITE_ADDR_WIDTH is not forwarded; overriding it here only resizes this wrapper's awaddr/araddr - confirm the core derives the same width
+ ) core (
+ .clk(ap_clk), .rst(!ap_rst_n),  // ap_rst_n is active-low, so the core's rst is driven with its inverse
+
+ // AXI-lite Write (connected one-to-one to the wrapper ports)
+ .awready(awready),
+ .awvalid(awvalid),
+ .awprot(awprot),
+ .awaddr(awaddr),
+ .wready(wready),
+ .wvalid(wvalid),
+ .wdata(wdata),
+ .wstrb(wstrb),
+ .bready(bready),
+ .bvalid(bvalid),
+ .bresp(bresp),
+
+ // AXI-lite Read (connected one-to-one to the wrapper ports)
+ .arready(arready),
+ .arvalid(arvalid),
+ .arprot(arprot),
+ .araddr(araddr),
+ .rready(rready),
+ .rvalid(rvalid),
+ .rresp(rresp),
+ .rdata(rdata),
+
+ // Continuous output stream (connected one-to-one to the wrapper ports)
+ .m_axis_0_tready(m_axis_0_tready),
+ .m_axis_0_tvalid(m_axis_0_tvalid),
+ .m_axis_0_tdata(m_axis_0_tdata)
+ );
+
+endmodule : memstream_axi_wrapper
diff --git a/finn-rtllib/memstream/hdl/memstream_multiblock.v b/finn-rtllib/memstream/hdl/memstream_multiblock.v
deleted file mode 100644
index 4e6167132d..0000000000
--- a/finn-rtllib/memstream/hdl/memstream_multiblock.v
+++ /dev/null
@@ -1,474 +0,0 @@
-/*
- Copyright (c) 2020, Xilinx
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
- list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
-
- * Neither the name of FINN nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-module memstream_multiblock
-#(
-//parameters to enable/disable axi-mm, set number of streams, set readmemh for memory, set per-stream offsets in memory, set per-stream widths
- parameter CONFIG_EN = 1,
- parameter NSTREAMS = 6,//1 up to 6
-
- parameter MEM_DEPTH = 13824,
- parameter MEM_WIDTH = 32,
- parameter MEM_INIT = "./",
- parameter RAM_STYLE = "auto",
-
- //widths per stream
- parameter STRM0_WIDTH = 32,
- parameter STRM1_WIDTH = 32,
- parameter STRM2_WIDTH = 32,
- parameter STRM3_WIDTH = 32,
- parameter STRM4_WIDTH = 32,
- parameter STRM5_WIDTH = 32,
-
- //depths per stream
- parameter STRM0_DEPTH = 2304,
- parameter STRM1_DEPTH = 2304,
- parameter STRM2_DEPTH = 2304,
- parameter STRM3_DEPTH = 2304,
- parameter STRM4_DEPTH = 2304,
- parameter STRM5_DEPTH = 2304,
-
- //offsets for each stream
- parameter STRM0_OFFSET = 0,
- parameter STRM1_OFFSET = 2304,
- parameter STRM2_OFFSET = 4608,
- parameter STRM3_OFFSET = 6912,
- parameter STRM4_OFFSET = 9216,
- parameter STRM5_OFFSET = 11520
-)
-
-(
- input aclk,
- input aresetn,
-
- //optional configuration interface compatible with ap_memory
- input [31:0] config_address,
- input config_ce,
- input config_we,
- input [31:0] config_d0,
- output [31:0] config_q0,
- output config_rack,
-
- //multiple output AXI Streams, TDATA width rounded to multiple of 8 bits
- input m_axis_0_afull,
- input m_axis_0_tready,
- output m_axis_0_tvalid,
- output [((STRM0_WIDTH+7)/8)*8-1:0] m_axis_0_tdata,
-
- input m_axis_1_afull,
- input m_axis_1_tready,
- output m_axis_1_tvalid,
- output [((STRM1_WIDTH+7)/8)*8-1:0] m_axis_1_tdata,
-
- input m_axis_2_afull,
- input m_axis_2_tready,
- output m_axis_2_tvalid,
- output [((STRM2_WIDTH+7)/8)*8-1:0] m_axis_2_tdata,
-
- input m_axis_3_afull,
- input m_axis_3_tready,
- output m_axis_3_tvalid,
- output [((STRM3_WIDTH+7)/8)*8-1:0] m_axis_3_tdata,
-
- input m_axis_4_afull,
- input m_axis_4_tready,
- output m_axis_4_tvalid,
- output [((STRM4_WIDTH+7)/8)*8-1:0] m_axis_4_tdata,
-
- input m_axis_5_afull,
- input m_axis_5_tready,
- output m_axis_5_tvalid,
- output [((STRM5_WIDTH+7)/8)*8-1:0] m_axis_5_tdata
-
-
-);
-
-//calculate number of RAMB18 blocks we need depth-wise
-localparam NMEMBLOCKS = (MEM_DEPTH+1023) / 1024; //ceil(MEM_DEPTH/1024)
-
-//calculate width of address for each block
-localparam BLOCKADRWIDTH = NMEMBLOCKS > 1 ? 10 : $clog2(MEM_DEPTH);
-
-//determine whether a stream needs to multiplex between memory blocks
-localparam STRM0_MUX = ((STRM0_OFFSET/1024) != ((STRM0_OFFSET+STRM0_DEPTH)/1024));
-localparam STRM1_MUX = ((STRM1_OFFSET/1024) != ((STRM1_OFFSET+STRM1_DEPTH)/1024));
-localparam STRM2_MUX = ((STRM2_OFFSET/1024) != ((STRM2_OFFSET+STRM2_DEPTH)/1024));
-localparam STRM3_MUX = ((STRM3_OFFSET/1024) != ((STRM3_OFFSET+STRM3_DEPTH)/1024));
-localparam STRM4_MUX = ((STRM4_OFFSET/1024) != ((STRM4_OFFSET+STRM4_DEPTH)/1024));
-localparam STRM5_MUX = ((STRM5_OFFSET/1024) != ((STRM5_OFFSET+STRM5_DEPTH)/1024));
-
-//determine what the base block of each stream is
-localparam STRM0_BLOCK = (STRM0_OFFSET/1024);
-localparam STRM1_BLOCK = (STRM1_OFFSET/1024);
-localparam STRM2_BLOCK = (STRM2_OFFSET/1024);
-localparam STRM3_BLOCK = (STRM3_OFFSET/1024);
-localparam STRM4_BLOCK = (STRM4_OFFSET/1024);
-localparam STRM5_BLOCK = (STRM5_OFFSET/1024);
-
-//determine what the end block of each stream is
-localparam STRM0_END_BLOCK = ((STRM0_OFFSET+STRM0_DEPTH-1)/1024);
-localparam STRM1_END_BLOCK = ((STRM1_OFFSET+STRM1_DEPTH-1)/1024);
-localparam STRM2_END_BLOCK = ((STRM2_OFFSET+STRM2_DEPTH-1)/1024);
-localparam STRM3_END_BLOCK = ((STRM3_OFFSET+STRM3_DEPTH-1)/1024);
-localparam STRM4_END_BLOCK = ((STRM4_OFFSET+STRM4_DEPTH-1)/1024);
-localparam STRM5_END_BLOCK = ((STRM5_OFFSET+STRM5_DEPTH-1)/1024);
-
-//determine the number of blocks spanned by each stream
-localparam STRM0_NBLOCKS = STRM0_END_BLOCK - STRM0_BLOCK + 1;
-localparam STRM1_NBLOCKS = STRM1_END_BLOCK - STRM1_BLOCK + 1;
-localparam STRM2_NBLOCKS = STRM2_END_BLOCK - STRM2_BLOCK + 1;
-localparam STRM3_NBLOCKS = STRM3_END_BLOCK - STRM3_BLOCK + 1;
-localparam STRM4_NBLOCKS = STRM4_END_BLOCK - STRM4_BLOCK + 1;
-localparam STRM5_NBLOCKS = STRM5_END_BLOCK - STRM5_BLOCK + 1;
-
-//TODO: check that memory width is equal to the widest stream
-//TODO: check that the stream depths and offsets make sense, and that the memory depth is sufficient (or calculate depth here?)
-initial begin
- if((NSTREAMS < 1) | (NSTREAMS > 6)) begin
- $display("Invalid setting for NSTREAMS, please set in range [1,6]");
- $finish();
- end
-end
-
-//invert reset
-wire rst;
-assign rst = ~aresetn;
-
-//WARNING: pipeline depth is larger than the number of streams per port so we have in-flight writes that may see not-ready when they get executed
-//solution: use prog-full to make sure we have an equal number of free slots in the stream to the read pipeline depth
-
-reg [$clog2(MEM_DEPTH)-1:0] strm0_addr = STRM0_OFFSET;
-reg [$clog2(MEM_DEPTH)-1:0] strm1_addr = STRM1_OFFSET;
-reg [$clog2(MEM_DEPTH)-1:0] strm2_addr = STRM2_OFFSET;
-reg [$clog2(MEM_DEPTH)-1:0] strm3_addr = STRM3_OFFSET;
-reg [$clog2(MEM_DEPTH)-1:0] strm4_addr = STRM4_OFFSET;
-reg [$clog2(MEM_DEPTH)-1:0] strm5_addr = STRM5_OFFSET;
-
-reg strm0_incr_en;
-reg strm1_incr_en;
-reg strm2_incr_en;
-reg strm3_incr_en;
-reg strm4_incr_en;
-reg strm5_incr_en;
-
-wire strm0_rst;
-wire strm1_rst;
-wire strm2_rst;
-wire strm3_rst;
-wire strm4_rst;
-wire strm5_rst;
-
-reg strm0_ready;
-reg strm1_ready;
-reg strm2_ready;
-reg strm3_ready;
-reg strm4_ready;
-reg strm5_ready;
-
-//arbiter: work on one stream at a time
-//multiplex each port between (up to) half of the streams
-reg [1:0] current_stream_porta = 0;
-reg [1:0] current_stream_portb = 0;
-
-always @(posedge aclk) begin
- if(rst)
- current_stream_porta <= 0;
- else case(current_stream_porta)
- 0: current_stream_porta <= strm2_ready ? 1 : strm4_ready ? 2 : 0;
- 1: current_stream_porta <= strm4_ready ? 2 : strm0_ready ? 0 : 1;
- 2: current_stream_porta <= strm0_ready ? 0 : strm2_ready ? 1 : 2;
- endcase
- if(rst)
- current_stream_portb <= 0;
- else case(current_stream_portb)
- 0: current_stream_portb <= strm3_ready ? 1 : strm5_ready ? 2 : 0;
- 1: current_stream_portb <= strm5_ready ? 2 : strm1_ready ? 0 : 1;
- 2: current_stream_portb <= strm1_ready ? 0 : strm3_ready ? 1 : 2;
- endcase
-end
-
-always @(posedge aclk) begin
- if(rst) begin
- strm0_incr_en <= 0;
- strm1_incr_en <= 0;
- strm2_incr_en <= 0;
- strm3_incr_en <= 0;
- strm4_incr_en <= 0;
- strm5_incr_en <= 0;
- end else begin
- strm0_incr_en <= (current_stream_porta == 0) & strm0_ready;
- strm1_incr_en <= (current_stream_portb == 0) & strm1_ready;
- strm2_incr_en <= (current_stream_porta == 1) & strm2_ready;
- strm3_incr_en <= (current_stream_portb == 1) & strm3_ready;
- strm4_incr_en <= (current_stream_porta == 2) & strm4_ready;
- strm5_incr_en <= (current_stream_portb == 2) & strm5_ready;
- end
-end
-
-assign strm0_rst = strm0_incr_en & (strm0_addr == (STRM0_OFFSET + STRM0_DEPTH-1));
-assign strm1_rst = strm1_incr_en & (strm1_addr == (STRM1_OFFSET + STRM1_DEPTH-1));
-assign strm2_rst = strm2_incr_en & (strm2_addr == (STRM2_OFFSET + STRM2_DEPTH-1));
-assign strm3_rst = strm3_incr_en & (strm3_addr == (STRM3_OFFSET + STRM3_DEPTH-1));
-assign strm4_rst = strm4_incr_en & (strm4_addr == (STRM4_OFFSET + STRM4_DEPTH-1));
-assign strm5_rst = strm5_incr_en & (strm5_addr == (STRM5_OFFSET + STRM5_DEPTH-1));
-
-always @(posedge aclk) begin
- strm0_ready <= ~m_axis_0_afull;
- strm1_ready <= ~m_axis_1_afull & (NSTREAMS >= 2);
- strm2_ready <= ~m_axis_2_afull & (NSTREAMS >= 3);
- strm3_ready <= ~m_axis_3_afull & (NSTREAMS >= 4);
- strm4_ready <= ~m_axis_4_afull & (NSTREAMS >= 5);
- strm5_ready <= ~m_axis_5_afull & (NSTREAMS >= 6);
-end
-
-//one address counter per stream; more LUTs but keeps routing short and local
-always @(posedge aclk) begin
- if(strm0_rst | rst)
- strm0_addr <= STRM0_OFFSET;
- else if(strm0_incr_en)
- strm0_addr <= strm0_addr + 1;
- if(strm1_rst | rst)
- strm1_addr <= STRM1_OFFSET;
- else if(strm1_incr_en)
- strm1_addr <= strm1_addr + 1;
- if(strm2_rst | rst)
- strm2_addr <= STRM2_OFFSET;
- else if(strm2_incr_en)
- strm2_addr <= strm2_addr + 1;
- if(strm3_rst | rst)
- strm3_addr <= STRM3_OFFSET;
- else if(strm3_incr_en)
- strm3_addr <= strm3_addr + 1;
- if(strm4_rst | rst)
- strm4_addr <= STRM4_OFFSET;
- else if(strm4_incr_en)
- strm4_addr <= strm4_addr + 1;
- if(strm5_rst | rst)
- strm5_addr <= STRM5_OFFSET;
- else if(strm5_incr_en)
- strm5_addr <= strm5_addr + 1;
-end
-
-reg [$clog2(MEM_DEPTH)-1:0] addra;
-wire [MEM_WIDTH*NMEMBLOCKS-1:0] rdqa;
-
-reg [$clog2(MEM_DEPTH)-1:0] addrb;
-wire [MEM_WIDTH*NMEMBLOCKS-1:0] rdqb;
-
-wire [NMEMBLOCKS-1:0] we;
-
-reg [1:0] addr_select_porta;
-reg [1:0] addr_select_portb;
-
-//multiplex addresses of various streams into address ports of memory
-always @(posedge aclk) begin
- addr_select_porta <= current_stream_porta;
- case(addr_select_porta)
- 0: addra <= strm0_addr;
- 1: addra <= strm2_addr;
- 2: addra <= strm4_addr;
- endcase
- addr_select_portb <= current_stream_portb;
- case(addr_select_portb)
- 0: addrb <= strm1_addr;
- 1: addrb <= strm3_addr;
- 2: addrb <= strm5_addr;
- endcase
-end
-
-genvar g;
-generate for(g=0; g 1) begin: multiblock
-
-wire [MEM_WIDTH-1:0] rdqmux[5:0];
-
-reg [$clog2(MEM_DEPTH)-BLOCKADRWIDTH-1:0] rdblocka[2:0];
-reg [$clog2(MEM_DEPTH)-BLOCKADRWIDTH-1:0] rdblockb[2:0];
-
-always @(posedge aclk) begin
- rdblocka[0] <= addra[$clog2(MEM_DEPTH)-1:BLOCKADRWIDTH];
- rdblockb[0] <= addrb[$clog2(MEM_DEPTH)-1:BLOCKADRWIDTH];
- for(i=0; i<2; i=i+1) begin
- rdblocka[i+1] <= rdblocka[i];
- rdblockb[i+1] <= rdblockb[i];
- end
-end
-
-if(NSTREAMS >= 1) begin: en_strm0
- if(STRM0_MUX == 1) begin: mux0
- mux #(STRM0_NBLOCKS, MEM_WIDTH) m(rdqa[(STRM0_BLOCK+STRM0_NBLOCKS)*MEM_WIDTH-1:STRM0_BLOCK*MEM_WIDTH],rdqmux[0],rdblocka[1] - STRM0_BLOCK);
- end else begin: nomux0
- assign rdqmux[0] = rdqa[(STRM0_BLOCK+1)*MEM_WIDTH-1:STRM0_BLOCK*MEM_WIDTH];
- end
- assign m_axis_0_tdata = rdqmux[0][STRM0_WIDTH-1:0];
-end
-
-if(NSTREAMS >= 2) begin: en_strm1
- if(STRM1_MUX == 1) begin: mux1
- mux #(STRM1_NBLOCKS, MEM_WIDTH) m(rdqb[(STRM1_BLOCK+STRM1_NBLOCKS)*MEM_WIDTH-1:STRM1_BLOCK*MEM_WIDTH],rdqmux[1],rdblockb[1] - STRM1_BLOCK);
- end else begin: nomux1
- assign rdqmux[1] = rdqb[(STRM1_BLOCK+1)*MEM_WIDTH-1:STRM1_BLOCK*MEM_WIDTH];
- end
- assign m_axis_1_tdata = rdqmux[1][STRM1_WIDTH-1:0];
-end
-
-if(NSTREAMS >= 3) begin: en_strm2
- if(STRM2_MUX == 1) begin: mux2
- mux #(STRM2_NBLOCKS, MEM_WIDTH) m(rdqa[(STRM2_BLOCK+STRM2_NBLOCKS)*MEM_WIDTH-1:STRM2_BLOCK*MEM_WIDTH],rdqmux[2],rdblocka[1] - STRM2_BLOCK);
- end else begin: nomux2
- assign rdqmux[2] = rdqa[(STRM2_BLOCK+1)*MEM_WIDTH-1:STRM2_BLOCK*MEM_WIDTH];
- end
- assign m_axis_2_tdata = rdqmux[2][STRM2_WIDTH-1:0];
-end
-
-if(NSTREAMS >= 4) begin: en_strm3
- if(STRM3_MUX == 1) begin: mux3
- mux #(STRM3_NBLOCKS, MEM_WIDTH) m(rdqb[(STRM3_BLOCK+STRM3_NBLOCKS)*MEM_WIDTH-1:STRM3_BLOCK*MEM_WIDTH],rdqmux[3],rdblockb[1] - STRM3_BLOCK);
- end else begin: nomux3
- assign rdqmux[3] = rdqb[(STRM3_BLOCK+1)*MEM_WIDTH-1:STRM3_BLOCK*MEM_WIDTH];
- end
- assign m_axis_3_tdata = rdqmux[3][STRM3_WIDTH-1:0];
-end
-
-if(NSTREAMS >= 5) begin: en_strm4
- if(STRM4_MUX == 1) begin: mux4
- mux #(STRM4_NBLOCKS, MEM_WIDTH) m(rdqa[(STRM4_BLOCK+STRM4_NBLOCKS)*MEM_WIDTH-1:STRM4_BLOCK*MEM_WIDTH],rdqmux[4],rdblocka[1] - STRM4_BLOCK);
- end else begin: nomux4
- assign rdqmux[4] = rdqa[(STRM4_BLOCK+1)*MEM_WIDTH-1:STRM4_BLOCK*MEM_WIDTH];
- end
- assign m_axis_4_tdata = rdqmux[4][STRM4_WIDTH-1:0];
-end
-
-if(NSTREAMS >= 6) begin: en_strm5
- if(STRM5_MUX == 1) begin: mux5
- mux #(STRM5_NBLOCKS, MEM_WIDTH) m(rdqb[(STRM5_BLOCK+STRM5_NBLOCKS)*MEM_WIDTH-1:STRM5_BLOCK*MEM_WIDTH],rdqmux[5],rdblockb[1] - STRM5_BLOCK);
- end else begin: nomux5
- assign rdqmux[5] = rdqb[(STRM5_BLOCK+1)*MEM_WIDTH-1:STRM5_BLOCK*MEM_WIDTH];
- end
- assign m_axis_5_tdata = rdqmux[5][STRM5_WIDTH-1:0];
-end
-
-end else begin: singleblock
-
-if(NSTREAMS >= 1) begin: en_strm0_direct
- assign m_axis_0_tdata = rdqa[STRM0_WIDTH-1:0];
-end
-if(NSTREAMS >= 2) begin: en_strm1_direct
- assign m_axis_1_tdata = rdqb[STRM1_WIDTH-1:0];
-end
-if(NSTREAMS >= 3) begin: en_strm2_direct
- assign m_axis_2_tdata = rdqa[STRM2_WIDTH-1:0];
-end
-if(NSTREAMS >= 4) begin: en_strm3_direct
- assign m_axis_3_tdata = rdqb[STRM3_WIDTH-1:0];
-end
-if(NSTREAMS >= 5) begin: en_strm4_direct
- assign m_axis_4_tdata = rdqa[STRM4_WIDTH-1:0];
-end
-if(NSTREAMS >= 6) begin: en_strm5_direct
- assign m_axis_5_tdata = rdqb[STRM5_WIDTH-1:0];
-end
-
-end
-endgenerate
-
-//output to AXI Streams
-reg tvalid_pipe0[2:0];
-reg tvalid_pipe1[2:0];
-reg tvalid_pipe2[2:0];
-reg tvalid_pipe3[2:0];
-reg tvalid_pipe4[2:0];
-reg tvalid_pipe5[2:0];
-
-assign m_axis_0_tvalid = tvalid_pipe0[2];
-assign m_axis_1_tvalid = tvalid_pipe1[2];
-assign m_axis_2_tvalid = tvalid_pipe2[2];
-assign m_axis_3_tvalid = tvalid_pipe3[2];
-assign m_axis_4_tvalid = tvalid_pipe4[2];
-assign m_axis_5_tvalid = tvalid_pipe5[2];
-
-
-always @(posedge aclk) begin
- tvalid_pipe0[0] <= strm0_incr_en;
- tvalid_pipe1[0] <= strm1_incr_en;
- tvalid_pipe2[0] <= strm2_incr_en;
- tvalid_pipe3[0] <= strm3_incr_en;
- tvalid_pipe4[0] <= strm4_incr_en;
- tvalid_pipe5[0] <= strm5_incr_en;
- for(i=0; i<2; i=i+1) begin: srl
- tvalid_pipe0[i+1] <= tvalid_pipe0[i];
- tvalid_pipe1[i+1] <= tvalid_pipe1[i];
- tvalid_pipe2[i+1] <= tvalid_pipe2[i];
- tvalid_pipe3[i+1] <= tvalid_pipe3[i];
- tvalid_pipe4[i+1] <= tvalid_pipe4[i];
- tvalid_pipe5[i+1] <= tvalid_pipe5[i];
- end
-end
-
-//dummy read, for now
-assign config_q0 = 0;
-assign config_rack = config_ce & ~config_we;
-
-endmodule
diff --git a/finn-rtllib/memstream/hdl/memstream_singleblock.v b/finn-rtllib/memstream/hdl/memstream_singleblock.v
deleted file mode 100644
index c9b8770aaa..0000000000
--- a/finn-rtllib/memstream/hdl/memstream_singleblock.v
+++ /dev/null
@@ -1,246 +0,0 @@
-/*
- Copyright (c) 2020, Xilinx
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
- list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
-
- * Neither the name of FINN nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-/*
- Implements a lightweight streamer for up to 2 streams in a single block of memory
-*/
-
-module memstream_singleblock
-#(
- parameter CONFIG_EN = 1,
- parameter NSTREAMS = 2,//1 up to 2
-
- parameter MEM_DEPTH = 512,
- parameter MEM_WIDTH = 32,
- parameter MEM_INIT = "./",
- parameter RAM_STYLE = "auto",
-
- //widths per stream
- parameter STRM0_WIDTH = 32,
- parameter STRM1_WIDTH = 32,
-
- //depths per stream
- parameter STRM0_DEPTH = 256,
- parameter STRM1_DEPTH = 256,
-
- //offsets for each stream
- parameter STRM0_OFFSET = 0,
- parameter STRM1_OFFSET = 256
-)
-
-(
- input aclk,
- input aresetn,
-
- //optional configuration interface compatible with ap_memory
- input [31:0] config_address,
- input config_ce,
- input config_we,
- input [MEM_WIDTH-1:0] config_d0,
- output [MEM_WIDTH-1:0] config_q0,
- output config_rack,
-
- //multiple output AXI Streams, TDATA width rounded to multiple of 8 bits
- input m_axis_0_tready,
- output m_axis_0_tvalid,
- output [((STRM0_WIDTH+7)/8)*8-1:0] m_axis_0_tdata,
-
- input m_axis_1_tready,
- output m_axis_1_tvalid,
- output [((STRM1_WIDTH+7)/8)*8-1:0] m_axis_1_tdata
-
-);
-
-
-//TODO: check that memory width is equal to the widest stream
-//TODO: check that the stream depths and offsets make sense, and that the memory depth is sufficient (or calculate depth here?)
-initial begin
- if((NSTREAMS < 1) | (NSTREAMS > 2)) begin
- $display("Invalid setting for NSTREAMS, please set in range [1,2]");
- $finish();
- end
-end
-
-//invert reset
-wire rst;
-assign rst = ~aresetn;
-
-wire strm0_incr_en;
-wire strm1_incr_en;
-
-assign strm0_incr_en = m_axis_0_tready | ~m_axis_0_tvalid;
-assign strm1_incr_en = m_axis_1_tready | ~m_axis_1_tvalid;
-
-reg rack_shift[1:0];
-
-generate
-if(MEM_DEPTH > 1) begin: use_ram
-
-//calculate width of memory address, with a minimum of 1 bit
-localparam BLOCKADRWIDTH = $clog2(MEM_DEPTH);
-
-reg [BLOCKADRWIDTH-1:0] strm0_addr = STRM0_OFFSET;
-wire strm0_rst;
-assign strm0_rst = strm0_incr_en & (strm0_addr == (STRM0_OFFSET + STRM0_DEPTH-1));
-
-//one address counter per stream; more LUTs but keeps routing short and local
-always @(posedge aclk) begin
- if(strm0_rst | rst)
- strm0_addr <= STRM0_OFFSET;
- else if(strm0_incr_en)
- strm0_addr <= strm0_addr + 1;
-end
-
-if(NSTREAMS == 1) begin: sdp
-
-ramb18_sdp
-#(
- .ID(0),
- .DWIDTH(MEM_WIDTH),
- .AWIDTH(BLOCKADRWIDTH),
- .DEPTH(MEM_DEPTH),
- .MEM_INIT(MEM_INIT),
- .RAM_STYLE(RAM_STYLE)
-)
-ram
-(
- .clk(aclk),
-
- .ena(config_ce),
- .wea(config_we),
- .addra(config_address[BLOCKADRWIDTH-1:0]),
- .wdataa(config_d0),
-
- .enb(strm0_incr_en | config_ce),
- .enqb(strm0_incr_en | rack_shift[0]),
- .addrb(config_ce ? config_address[BLOCKADRWIDTH-1:0] : strm0_addr),
- .rdqb(m_axis_0_tdata)
-);
-
-
-end else begin: tdp
-
-reg [BLOCKADRWIDTH-1:0] strm1_addr = STRM1_OFFSET;
-wire strm1_rst;
-assign strm1_rst = strm1_incr_en & (strm1_addr == (STRM1_OFFSET + STRM1_DEPTH-1));
-
-always @(posedge aclk) begin
- if(strm1_rst | rst)
- strm1_addr <= STRM1_OFFSET;
- else if(strm1_incr_en)
- strm1_addr <= strm1_addr + 1;
-end
-
-ramb18_wf_dualport
-#(
- .ID(0),
- .DWIDTH(MEM_WIDTH),
- .AWIDTH(BLOCKADRWIDTH),
- .DEPTH(MEM_DEPTH),
- .MEM_INIT(MEM_INIT),
- .RAM_STYLE(RAM_STYLE)
-)
-ram
-(
- .clk(aclk),
-
- .wea(config_we),
- .ena(strm0_incr_en | config_ce),
- .enqa(strm0_incr_en | config_ce_r),
- .addra(config_we ? config_address[BLOCKADRWIDTH-1:0] : strm0_addr),
- .wdataa(config_d0),
- .rdqa(m_axis_0_tdata),
-
- .web(1'b0),
- .enb(strm1_incr_en),
- .enqb(strm1_incr_en),
- .addrb(strm1_addr),
- .wdatab('d0),
- .rdqb(m_axis_1_tdata)
-);
-
-end
-
-end else begin: bypass
-
-reg [MEM_WIDTH-1:0] singleval[0:0];
-initial begin
- `ifdef SYNTHESIS
- $readmemh({MEM_INIT,"memblock_synth_0.dat"}, singleval, 0, 0);
- `else
- $readmemh({MEM_INIT,"memblock_sim_0.dat"}, singleval, 0, 0);
- `endif
-end
-
-always @(posedge aclk)
- if(config_ce & config_we)
- singleval[0] <= config_d0;
-
-assign m_axis_0_tdata = singleval[0];
-assign m_axis_1_tdata = singleval[0];
-
-end
-endgenerate
-
-//signal valid after 2 tready cycles after initialization
-//then stay valid
-reg [1:0] tvalid_pipe0 = 2'd0;
-reg [1:0] tvalid_pipe1 = 2'd0;
-
-assign m_axis_0_tvalid = tvalid_pipe0[1];
-assign m_axis_1_tvalid = tvalid_pipe1[1];
-
-always @(posedge aclk) begin
- if(rst) begin
- tvalid_pipe0 <= 0;
- end else if(strm0_incr_en) begin
- tvalid_pipe0[0] <= 1;
- tvalid_pipe0[1] <= tvalid_pipe0[0];
- end
-end
-
-always @(posedge aclk) begin
- if(rst) begin
- tvalid_pipe1 <= 0;
- end else if(strm1_incr_en) begin
- tvalid_pipe1[0] <= 1;
- tvalid_pipe1[1] <= tvalid_pipe1[0];
- end
-end
-
-always @(posedge aclk) begin
- rack_shift[0] <= config_ce & ~config_we;
- rack_shift[1] <= rack_shift[0];
-end
-
-assign config_rack = rack_shift[1];
-assign config_q0 = m_axis_0_tdata;
-
-endmodule
diff --git a/finn-rtllib/memstream/hdl/mux.v b/finn-rtllib/memstream/hdl/mux.v
deleted file mode 100644
index f7087f9735..0000000000
--- a/finn-rtllib/memstream/hdl/mux.v
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- Copyright (c) 2020, Xilinx
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
- list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
-
- * Neither the name of FINN nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-module mux
-#(
- parameter NINPUTS = 1,
- parameter WIDTH = 16
-)
-(
- input [NINPUTS*WIDTH-1:0] in,
- output [WIDTH-1:0] out,
- input [$clog2(NINPUTS)-1:0] sel
-);
-
-assign out = in >> (sel*WIDTH);
-
-endmodule
diff --git a/finn-rtllib/memstream/hdl/ramb18_sdp.v b/finn-rtllib/memstream/hdl/ramb18_sdp.v
deleted file mode 100644
index 8d2fbf9a98..0000000000
--- a/finn-rtllib/memstream/hdl/ramb18_sdp.v
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- Copyright (c) 2020, Xilinx
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
- list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
-
- * Neither the name of FINN nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-module ramb18_sdp
-#(
- parameter ID = 0,
- parameter DWIDTH = 18,
- parameter AWIDTH = 10,
- parameter DEPTH = 2**AWIDTH,
- parameter MEM_INIT = "",
- parameter RAM_STYLE = "auto"
-)
-(
- input clk,
-
- input ena,
- input wea,
- input [AWIDTH-1:0] addra,
- input [DWIDTH-1:0] wdataa,
-
- input enb,
- input enqb,
- input [AWIDTH-1:0] addrb,
- output reg [DWIDTH-1:0] rdqb
-);
-
-(* ram_style = RAM_STYLE *) reg [DWIDTH-1:0] mem[0:DEPTH-1];
-reg [DWIDTH-1:0] rdatab;
-
-`ifdef SYNTHESIS
-reg [7:0] idx = ID;
-`else
-reg [15:0] idx;
-`endif
-
-//initialize memory
-initial begin
- //note the hacky way of adding a filename memblock_ID.dat to the path provided in MEM_INIT
- //ID can go up to 99
- if (ID < 0 && ID > 99) begin
- $display("ID out of range [0-99]");
- $finish();
- end
- //MEM_INIT path must be terminated by /
- `ifdef SYNTHESIS
- if (ID < 10)
- $readmemh({MEM_INIT,"memblock_synth_",idx+8'd48,".dat"}, mem, 0, DEPTH-1);
- else
- $readmemh({MEM_INIT,"memblock_synth_",(idx/10)+8'd48,(idx%10)+8'd48,".dat"}, mem, 0, DEPTH-1);
- `else
- $sformat(idx,"%0d",ID);
- if (ID < 10)
- $readmemh({MEM_INIT,"memblock_sim_",idx[7:0],".dat"}, mem, 0, DEPTH-1);
- else
- $readmemh({MEM_INIT,"memblock_sim_",idx,".dat"}, mem, 0, DEPTH-1);
- `endif
-end
-
-//memory ports, with output pipeline register
-always @(posedge clk) begin
- if(wea)
- mem[addra] <= wdataa;
- if(enb)
- rdatab <= mem[addrb];
- if(enqb)
- rdqb <= rdatab;
-end
-
-endmodule
diff --git a/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v b/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v
deleted file mode 100644
index c7850106ae..0000000000
--- a/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- Copyright (c) 2020, Xilinx
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
- list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
-
- * Neither the name of FINN nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-module ramb18_wf_dualport
-#(
- parameter ID = 0,
- parameter DWIDTH = 18,
- parameter AWIDTH = 10,
- parameter DEPTH = 2**AWIDTH,
- parameter MEM_INIT = "",
- parameter RAM_STYLE = "auto"
-)
-(
- input clk,
-
- input wea,
- input ena,
- input enqa,
- input [AWIDTH-1:0] addra,
- input [DWIDTH-1:0] wdataa,
- output reg [DWIDTH-1:0] rdqa,
-
- input web,
- input enb,
- input enqb,
- input [AWIDTH-1:0] addrb,
- input [DWIDTH-1:0] wdatab,
- output reg [DWIDTH-1:0] rdqb
-);
-
-(* ram_style = RAM_STYLE *) reg [DWIDTH-1:0] mem[0:DEPTH-1];
-reg [DWIDTH-1:0] rdataa;
-reg [DWIDTH-1:0] rdatab;
-
-`ifdef SYNTHESIS
-reg [7:0] idx = ID;
-`else
-reg [15:0] idx;
-`endif
-
-//initialize memory
-initial begin
- //note the hacky way of adding a filename memblock_ID.dat to the path provided in MEM_INIT
- //ID can go up to 99
- if (ID < 0 && ID > 99) begin
- $display("ID out of range [0-99]");
- $finish();
- end
- //MEM_INIT path must be terminated by /
- `ifdef SYNTHESIS
- if (ID < 10)
- $readmemh({MEM_INIT,"memblock_",idx+8'd48,".dat"}, mem, 0, DEPTH-1);
- else
- $readmemh({MEM_INIT,"memblock_",(idx/10)+8'd48,(idx%10)+8'd48,".dat"}, mem, 0, DEPTH-1);
- `else
- $sformat(idx,"%0d",ID);
- if (ID < 10)
- $readmemh({MEM_INIT,"memblock_",idx[7:0],".dat"}, mem, 0, DEPTH-1);
- else
- $readmemh({MEM_INIT,"memblock_",idx,".dat"}, mem, 0, DEPTH-1);
- `endif
-end
-
-//memory ports, with output pipeline register
-always @(posedge clk) begin
- if(ena) begin
- if(wea)
- mem[addra] <= wdataa;
- rdataa <= mem[addra];
- end
- if(enqa)
- rdqa <= rdataa;
-end
-always @(posedge clk) begin
- if(enb) begin
- if(web)
- mem[addrb] <= wdatab;
- rdatab <= mem[addrb];
- end
- if(enqb)
- rdqb <= rdatab;
-end
-
-endmodule
diff --git a/finn-rtllib/memstream/sim/golden.dat b/finn-rtllib/memstream/sim/golden.dat
deleted file mode 100644
index 1466271bca..0000000000
--- a/finn-rtllib/memstream/sim/golden.dat
+++ /dev/null
@@ -1,9216 +0,0 @@
-AFB2B66A
-BB100CFF
-1ED93E9B
-1B8E800D
-DA9E0150
-38B1C916
-93BC4E64
-860F8373
-B31D708B
-C2934023
-739C9593
-4C898A3D
-CCC8F4C5
-8FA275E6
-47732CC7
-6857ABF0
-31671013
-6BC4AA43
-73D4F790
-2C6158B6
-FDC3B5D
-6DC755F2
-E0E7E8C9
-7862E17
-3D4FFE1E
-9AFFF447
-C862FD7D
-A4C4D89A
-D7D6EF51
-10E5A31D
-79DA9C63
-A83060A8
-EA988813
-6B411BCF
-85544B5A
-5AC91DE6
-586E6779
-8FE8161B
-4C57CC92
-74C918A6
-36B20D44
-5CB62FC0
-62FDB2E1
-4B1CB514
-526B7CEC
-B3FA61D0
-C95DDBE
-CC2BA600
-2466CD1D
-3354A056
-CCED3EAC
-6FFA09EE
-F9648FAF
-18CB5358
-EA506270
-66F385A6
-5B0246E5
-26218A76
-BC7CECFD
-5969F6FF
-3DAF5901
-C53D05BD
-1EDA2D76
-5C0C0010
-7A6C0C8C
-BF99E997
-C964C884
-4DE417F4
-8637312
-133B8C3A
-D637DB88
-297288F6
-CF1D00B3
-426BD0F3
-4D258120
-8F7EC898
-E15482D9
-DFDFC442
-16A5C4AE
-7A6A14DF
-5E9C2807
-31BD3EA2
-BD6DCDBC
-E47CD35E
-FA4FE42
-CCDE0036
-345EBCB7
-64686255
-AE1D77EB
-D2B42B84
-CD5E5824
-8DABAB1F
-4E07FFCA
-7F3B4C13
-1A62C962
-CE08835F
-E8E05318
-DC25C7BF
-132E4308
-5D0122D6
-B7451ACE
-829D2507
-19329C7F
-39FCA8F0
-DCD1A574
-17E2EEE
-B2B6583A
-2181E65
-7013A2A7
-46535CDE
-C85BF5D3
-2FD5EFC2
-E05C5D2E
-244F0F96
-F01D711F
-F1CBB67E
-6DAE6666
-84AD6F4A
-B95BC84E
-9DD54B95
-5A7CA1B
-7B1447F4
-44A8EDA7
-20929E9
-40E62E02
-3D03CC3E
-81EEF8C4
-1E686D13
-17C13B3D
-A14967BE
-D8693E0E
-15A7FDD1
-19F51C6D
-249D0C21
-51424939
-BA05F551
-C614827A
-32841A0D
-2F8B041
-11A2806
-DBF24199
-F246D9EB
-52FFB23D
-F3061A47
-B6D51EF3
-2DE434C3
-E1D3F874
-85270B0A
-CC405B14
-DD3E9F23
-A0352F98
-67EE5731
-96892C65
-6D67A443
-16354414
-17959F75
-A554F236
-C585076
-2B665011
-7D503509
-77A4530
-6A13C8DC
-31996F5
-916AD400
-E761D000
-D23CFD32
-CF3A5154
-C575A1CB
-B91ACDBF
-BEE7F338
-44C26212
-8124CD5B
-245F7451
-DD6D18BA
-6B838EC6
-5247AB98
-2F41FDAA
-A780BD3B
-1FD2F95
-6CDA39C
-C31FA5A0
-AB56A5E1
-87F50441
-47093971
-BEBD81EC
-2A7F6977
-8C83BD29
-FB067DAC
-5FEBDCDC
-8FB43F72
-EE45FC6D
-4088691C
-34F235D0
-43AB8E4D
-67FA8BB5
-FC2D2C02
-DA77044C
-22E6FC7
-6B6039A9
-BA6E3C45
-46DEC612
-8E7E0FF7
-438DE467
-F4525025
-7937973A
-9ABE4BEF
-8F8DF841
-F74C5087
-7EDE1CA4
-FF3C7F98
-A025FE0B
-59E5EDF6
-6DD27411
-65C080E6
-C86D872D
-628B6B26
-B9316D56
-E09EFA8B
-A8CD3F21
-C0CD8745
-F4D62BA7
-D4D7FB99
-E9174232
-7F068FC4
-767480FC
-275BBBF7
-3470FF88
-E632ACD1
-85677507
-AE0E2C69
-E2C74DA9
-C307B72B
-5FB5A769
-99C18162
-FAFB7660
-6E984733
-E17FD97B
-EC5E6CA7
-3D659815
-30826B60
-300BE8E8
-86D0B096
-856F2CB0
-2A61ADE4
-24EEB996
-2FCB729B
-8190CE0D
-E64F7E6A
-4D0D42F
-CE29765B
-C77DE893
-9264C299
-A200E419
-868B5EC6
-8452AC39
-59F7BDED
-422E75B2
-74E6329A
-38F053E8
-16F8BD5A
-363A2E43
-8018AB7B
-44AE4CF5
-C8F7B14B
-52658A45
-7B46C7D8
-CD319C38
-19AC8957
-5F42CFAA
-5DB4DBF7
-DF66DDBA
-4FBCB611
-266DFB86
-4F0EE64C
-1765E724
-E30C89CA
-4705FCE8
-BB7636B3
-789EFEFC
-AAC0F37F
-424B1661
-234F05AB
-1BC0ADF8
-7F9EC67E
-500448E5
-BF4D1C45
-C5B64E3B
-914F44FE
-EB17F041
-1752165C
-F5B72E31
-6D68C060
-4EF27C55
-8CEDFDC5
-E3996A56
-25C5C632
-430D930F
-EE04DE4D
-576E4921
-E13A2A6E
-CFE21675
-B1067912
-4C888068
-3C3A1A6D
-FCE12E0
-FAD6AD8B
-F7DE2E0F
-E8DC0DE7
-CC8721DF
-34411355
-2C664D07
-ED034324
-F57FDA56
-8C70BCDF
-3A6FF2C8
-C6440537
-8113D976
-A40176A1
-46D1D0D9
-877A407C
-3FBCD395
-3E74C1D8
-72E22A13
-BA46116D
-CFB14406
-21400896
-7AD34367
-2905F60C
-C1F9C16F
-2E0E5FCF
-2EEB00A0
-9C2D94A9
-8DE1CF01
-5912596C
-CF2CA22A
-774E7D4F
-805657AE
-1BA223EF
-236FD53F
-C1ABFD4A
-6B8DD778
-6A6E40D2
-70CF4F79
-950E8D35
-5E4F9545
-86AA4166
-28D056E9
-9C550D75
-CB435A3
-B875667E
-F54E6E97
-BB7ACD6B
-F11637E9
-C220E1FA
-C7CAD54B
-32853439
-65BA20C9
-1838F8C0
-C3CCE57D
-7D2B69F9
-137AD6E9
-6C041B9
-296497AA
-98C5E853
-D37AB835
-376764A9
-2F714011
-D24BE867
-B2BA4E
-9EA785F9
-726FCED6
-6B4C6950
-44C6D5C0
-85DEA727
-733F5A86
-41785CFF
-BB395E8A
-100F8117
-276A08D3
-9268A16E
-FBF63C19
-AA497F25
-E92E1DC3
-185B4692
-FE6377D6
-C50771B
-D98BCD04
-50FC7D74
-BE5BC294
-2C9C4482
-12FBF6CD
-D1E04AE4
-5C9679EE
-889D2695
-3699F061
-933D06A9
-930DC367
-496D7A37
-C4161D19
-3E08728B
-66388C70
-B2363734
-5D12926F
-39B4AEF8
-1948B925
-321E08BC
-27559FC2
-A543B709
-4D28BC0
-46C64305
-F7B7D459
-97C4966B
-A027A9C8
-43CABFA9
-F7C3643D
-1128AB2A
-AA4A1419
-AC6F2B46
-8F6FEFEF
-34284D4D
-D951EB81
-77AC6B7C
-70F6E0B2
-FD7BE3CE
-77BE497E
-4883FBD6
-FCAB08D4
-9BC032A4
-67DA8A5C
-82037EC1
-E3EC6CC9
-481B7623
-DA1F3873
-CE9E8787
-785CD654
-1661CF27
-42BD0C3C
-990F261A
-49F18930
-FA336094
-FFD6FC06
-B71077A6
-204B911E
-BA1586D6
-8A2F6DBC
-36B184AD
-76017CAB
-DA7E891E
-88A51A1A
-97AC49CB
-2482BE28
-CE6BD009
-C7776DE0
-4E960944
-64081AF2
-56512D55
-D6D1C640
-EE78145B
-54CC5EE0
-BE5D3E1F
-8FC8816C
-1D6AC407
-5D98F8F1
-18FECC5C
-F3DE9A29
-93A19068
-AB623B35
-43FF1A02
-AA26434C
-B071FDD5
-45AB6A2E
-C1275AA7
-EADA5CDA
-E427C95E
-AE6E5B77
-89F3CA30
-9648C00A
-330A03A7
-20DB35D6
-AA9946BF
-A0E3050E
-DEBB5819
-5047E2E
-9C8FBEB9
-6B70D173
-8A99428D
-230C88FE
-3B26DBD4
-8DBED704
-EFF1C946
-C2381970
-71087497
-2268599D
-FCE50AAE
-460A49E5
-EC65BC4C
-5A83C23C
-DD44120F
-D6E81BEB
-D10235B7
-9362A387
-B3C9220C
-46F21F0
-3D04FBC0
-63A2B38D
-8F7DEF26
-F326457D
-21933DC1
-775197FB
-8D6C7C5F
-B2D7D570
-147F9FF7
-78666356
-BAB7D249
-69B45EC6
-F56634ED
-34738794
-26DF0163
-188DA00
-D2035A36
-FFBB8062
-62852DCF
-55FC882A
-849388E6
-43BE6E2C
-D53EA2A2
-A228BC21
-9112A960
-5FCDE2F1
-79F42B27
-8AE37179
-1D722815
-5AE6DD26
-A8531C6F
-EF386673
-AC761B14
-23C6BC3A
-488D93B
-AE6B0D63
-A4F1CEAC
-43F80A43
-D9681EF6
-BA959674
-CCB852B8
-D9F4D79E
-6403622F
-75FAECC6
-7F43813F
-51FC7BE6
-896A3A28
-CAF31C60
-76000EE7
-C1135AAB
-6E83B2E6
-2AED1966
-C4F88A86
-21219EA
-8AF14AD6
-14014BA2
-BC0BE2D5
-78757CE8
-C09D83DC
-6B2021FE
-D5AD900
-3685A49F
-FD8B4BA0
-7B005539
-2F0C36EF
-B41DBA0D
-1DCF61B0
-CB3DA1A6
-24C0ADAA
-BED01B2B
-59C8C334
-11CCA76C
-6F962508
-ABE672A6
-3C281A24
-A6C3DC39
-A72517B1
-FBA81175
-9906CEE4
-E8177FE1
-338D0184
-CC6650DF
-840D8CA0
-4C55C42B
-6B40F9CC
-57B7E7B7
-B7C42442
-4500E9B
-8C788183
-9B8F5FCE
-49D0AEE1
-426B2271
-EC25BCE3
-7D63A976
-2EFFF592
-32A9E43C
-AF5AFA52
-3ABE1133
-35B75ED7
-8F4271A9
-725A6EF
-7ED7EB40
-37BD3B
-7A0A5AF2
-F6492D7D
-C2856688
-9595C241
-C07F646A
-7D394FDC
-7A991B05
-2CE3AF30
-9929E6E6
-4AE66BD4
-F0F3D1A3
-F76F72E9
-6C2051E2
-72431DE4
-B1796A93
-E04FD748
-D19522B1
-71396A78
-4202F058
-4F2CEB1E
-A186853F
-8B4474AA
-C679B644
-98E10D42
-E7CEB08C
-733CA225
-3478B95C
-A706A842
-9510B8EB
-F47E426E
-9A0A17EE
-2DA8832B
-E73536CC
-E6CA4B40
-11A2708F
-753AC1E1
-8C304DED
-5FC83F07
-4F9A04C9
-E0737708
-9091DFDD
-8E1B322
-2552D768
-7C894296
-EABDC081
-E3B2A37
-DEC7EC87
-37FFB6DC
-2B2A0CD6
-7E797B13
-64ABD0C5
-1FF12252
-F81AFB24
-C16F1ABC
-F0B5AAFC
-F80281BA
-E51C04D
-EEF8BD3E
-450A49DB
-AC985D7B
-CBD4D077
-CAA6370A
-FDA6530C
-20B71F06
-ED5A891E
-BA51A622
-E9F8E132
-63C23719
-2F59EE96
-14D77539
-1A98FC31
-12FCC937
-F39AD8FB
-3750DBA9
-564E45B
-F74C47FD
-1010AD3A
-8BE0AED3
-28B27F7B
-D5E8EEFA
-DC0EFEFB
-959F5394
-A10ECCB8
-5C366706
-3B82A5EE
-74E377DD
-9881CEF3
-D1A4BD88
-69106661
-B209B42
-B56EE86B
-63F37839
-C5AB7736
-4AD627C4
-8A4C7E1C
-F7CC6334
-3D6CAEC4
-A86A18D5
-8FD910B1
-972371C8
-A423E9B6
-CE8C76C7
-DF930841
-C9D4A7B0
-18521955
-F6F167FC
-889F1625
-432C606A
-CA5EB4D0
-AFE77C91
-EAF55F16
-6F9A9777
-33726C1D
-DC7B1D64
-8031DC00
-CF13144F
-84BF2AB
-45F5FD45
-6AF06D8C
-C50FBE6C
-11B8A4A2
-16B780E1
-98033979
-8EFAAEC0
-DD984A5A
-D6A80AFC
-15C793A3
-EF458063
-B784551F
-552CC380
-D1E05EBA
-4A795261
-F2B25418
-66066848
-D935B481
-136D2C8F
-7A25AEFB
-7000439A
-E147CC62
-68976C6E
-69447DAB
-C72506F3
-C6E3FE3B
-4FB0FD96
-DB465740
-A254195C
-B11EA223
-FC3C44B5
-A9A86F1C
-8EED03E3
-24CFF3A
-A1B488CE
-FD75D002
-9FEF0461
-75DC6637
-B3D38CD2
-57C8F65D
-C62026D0
-D6320A18
-5E961798
-80FE0097
-6DA57E68
-D1E8A3C7
-96D49CFC
-A8D2DFBC
-520D2C1
-151C3F1D
-8180DCC7
-4461E43E
-C895BF5C
-18EE374
-33EA06D4
-75B9D006
-23B934C1
-C2E89F39
-444BCB75
-78077AA5
-ECA64716
-3C1E3FFD
-F7DB9CEE
-6EC313DD
-9CABEC47
-675FA281
-16B8304D
-3E38FEC
-A9663BDE
-8EF647F2
-B646C61C
-2228E400
-2B411566
-7A72EB44
-88BD9AE9
-4EF4EBA3
-BCC822D9
-4668160D
-695667C1
-CE51A675
-40DE9687
-877561EF
-416F5AE6
-EF9304FE
-34C1C9D3
-5B63E1BB
-C50E9899
-1831810D
-25DE2CC1
-10539A77
-EE51D9B2
-462E5A70
-B0F8C3B7
-CA16E410
-1796F2E5
-573F6B28
-E157A965
-2640969A
-153B4909
-7FC1290F
-ABCAC2F
-2A42D17
-BFFA3865
-7B12D8B9
-9321F9EF
-E560B7A9
-36E18DD2
-57710FF9
-FAE1F933
-F717FEF8
-E86BAF7E
-D0CE3E89
-C8755650
-704BB6ED
-6309F650
-E21DDB4F
-7CBF531C
-7E0AFB8E
-D6A1128B
-60F16A1B
-534186AF
-72971F2E
-428A867C
-F571D32C
-CD522E7B
-13F6443
-38CDC9EC
-D01C51E6
-2E575D3F
-7E86B596
-C1460B28
-1403B019
-76D89A66
-4F2D9465
-9B87B1
-172A00A4
-4669559C
-105C8A19
-3CD2DD63
-EF054D76
-8B9AB48
-64136500
-71C56349
-B7AEEDF5
-4145D7AC
-D6A3E4C7
-2F9E0DF4
-31E418C8
-D2C839DE
-63E919D9
-2F4D0353
-8812C572
-B88E671F
-54D2BBE0
-E166998
-B7487741
-64312607
-5ADF6F3E
-31A86BF1
-D8A96C85
-22AA3021
-AD4719B5
-49EB0670
-93B76AAF
-B109648
-FBC7346C
-2530A7B5
-C8525175
-15EC0A76
-315FACCE
-D8C21A6F
-9EDEF96D
-6495575D
-722A0577
-51EDE2ED
-8109F168
-6CBA0929
-1ED88DCD
-D79A67E2
-CE62A29C
-6FE2A87F
-D1E6E3B9
-601988A0
-6A045849
-A7E30F35
-E0EE4424
-AA89C628
-33D7A7A3
-FCD27B7A
-80CAF9A4
-2E7F1302
-69F19C
-80DBDC64
-392FBDC
-E5981A33
-B4AF4210
-1DBFDB9F
-31E5DF02
-5C571556
-EE256151
-9F573818
-200D540B
-87743240
-1335188F
-5A1E9D1F
-FA267CB
-688D2302
-80D32C1
-195719E
-EF151174
-772EEC93
-DD2E2E4E
-D8EA362D
-3B24FC06
-FFFCF7FC
-C571F2F4
-A8DAC7D
-3BA7880C
-16FC184D
-7DBC453C
-8F355780
-65C7ED3D
-2202E50E
-9EC765A9
-9D8F8CDA
-CFA71D0B
-7A463A33
-AA94D750
-359750D8
-B9A4BEFD
-B153CD8C
-93AFB5F4
-2676E0A0
-78C0805
-347133
-3B229F4D
-4486A7BE
-F3A0FAF3
-D29E9349
-A62C0FB4
-574D3763
-BCDAEE6E
-BA27D40D
-896903EB
-8AE6171C
-A911D78E
-970FB490
-33B8A631
-893F7E3B
-700EDF9D
-EA7AC6E6
-6041F473
-FC6702EE
-F225A258
-96A21B4
-CCA94D4D
-FA6D00B7
-35580441
-F5E42BA
-EE9AB535
-50874EBA
-4454B2B
-30653468
-9ABFE240
-29A13784
-EBF5F88F
-B1769BB8
-EF22637D
-A2FEEE4E
-4B39E8F8
-38AD4316
-A3FCB454
-7D6F402
-18CEA9F0
-956B2CCE
-6559ADC4
-F00F696E
-C878E2A3
-3AB31BE4
-FF2E6E3A
-3767BE32
-37CFBCBC
-C307A74B
-ED6A132B
-8D5A1B70
-774C41D1
-A45F1CA9
-3FCF576A
-C1BBAB8C
-5B11B23A
-620B6C8E
-A6F5CB83
-450BFF8B
-FBB9620D
-BD936B56
-2FBF9A89
-2E000CD5
-E508C955
-2FB99422
-5043B664
-1C43CF3B
-2D7E713F
-FAD8A72B
-7CF2FA33
-8FDD90A6
-8B5CDCDE
-6CBF908F
-740425F6
-D142F4B9
-2B30DF9D
-3808D354
-508C4729
-E6FB0279
-FA0F9DF5
-2FFA33E1
-8A93B18
-FE7C0855
-E69193B1
-AA7E4DA
-DCDD121D
-4E7CD1
-14C03D9
-ACB60232
-818C10F0
-D8CAA46E
-2CBC53B4
-46F82991
-9B24E92B
-E1DBF265
-C6649C
-87D0CA2F
-C24A605
-AEB470E
-8DC36FE7
-2D6B856E
-9B459A3A
-5C204000
-C7CC0BA9
-E637D8C4
-1F8C7240
-41788DF4
-27B94DFA
-BBA5B2CD
-51E1AB57
-FB14B16B
-B6821713
-F955BAB9
-44FEBDEF
-A484D04E
-FCC08A15
-A117E11E
-CAE09305
-789A734A
-338EAB60
-183825B
-61931C6E
-ECBBBA86
-1AC53895
-BCEFB579
-CC68D938
-217A4ED1
-3CC6F2DE
-12E55EF5
-FAE1CE98
-CF89DDCE
-8FEFFF33
-8C27552E
-6D63AA8F
-B094E27C
-4E7632FE
-5D9DDBD8
-8E2766E6
-2EF9333E
-98B9A7D4
-20D98AB
-C12C8047
-5995F2BB
-BB30E14
-C769CC0E
-632D8C76
-B7FBE051
-3170D046
-D595ACCF
-190326FC
-D1D03166
-DA4420CD
-81FA57FA
-D8615FD4
-33AEF793
-E2B32AB3
-E2B2D613
-5A37DB74
-EBF473BC
-62C5F8CF
-624D5D2D
-9A9006D4
-8515BED2
-7DD650C8
-D0BABA59
-1E635B2C
-690CBFF7
-E4028EC4
-E4E5B3C2
-57607B0E
-D4087B2
-3C06022A
-813133A2
-B206699
-3827A132
-985BF479
-6C11EA62
-F58DA68F
-818CD2B6
-F204828B
-64A0D011
-A6F07C40
-6816D54D
-8B00F959
-3B6A1891
-EF20520A
-B5B90BD0
-D70B3B4
-7B165E3F
-FBE60B95
-50656296
-6250C189
-B50E29BC
-7BBB35AE
-124AD7B3
-BAD38F67
-A0CA136
-FB03F6CB
-B88FB36D
-9025524E
-4EB80454
-D07FEA2B
-D9385E1F
-B1EDF69A
-11D2AE5C
-9EEC00C3
-55916263
-AAD5CF88
-2740548B
-662FB2DE
-173DFA86
-8D734BE9
-D4A27E13
-E92A39A2
-A58A3F4A
-A71CE9AC
-B43ED5F
-1600E2AD
-265C4182
-4EA4F91
-1E3A0BD5
-62650FD0
-BC6E23A1
-3BF3E963
-5F6AFA4A
-6BA2B659
-5C00047A
-E8F81B0A
-C30BF4A0
-DFF059E0
-4E3F93FE
-D688F348
-3220541C
-F8A72F57
-6D78CAE6
-AF13AA11
-BDB3229D
-936DA76F
-749DB9C1
-EBF347A6
-BBFA776B
-6472B218
-6144ECA8
-E66CD255
-274BC846
-64C0C67A
-95748CF2
-25DE3E48
-29A685B3
-CC8C7B15
-F18FA7CF
-5F2D1C01
-6DFEC90F
-CF834DDD
-A72D9439
-BC6D83C3
-9F888C34
-385D225F
-168886B3
-98EF8EB2
-BD8ADDD1
-80DA0EE2
-F4196AC8
-6F020F21
-61136480
-4DA28475
-86A506E0
-1A75F4D7
-222C4645
-8C4486EE
-98560E3C
-944205C9
-D5E0BB3C
-C9667421
-2932030
-BFE65EB0
-FB463370
-9FE77763
-DE8ED32D
-FC9BDBEE
-FD77E3F
-288C605F
-7475F3D
-C3F75513
-C5AF2C40
-40FB62E2
-2C7C83E9
-A8A7E6CC
-512E4560
-950C9D
-EC507007
-65B7CEC6
-4A91094F
-3BDA586B
-7029FB6E
-739B556A
-678652AD
-7B940AD3
-4A8728BC
-76841FC0
-F53DEB4C
-1B13B0F8
-80A5CFA8
-69C8B602
-6F984889
-14A53B17
-409BF6B7
-46D597EE
-3502ED7D
-315B1DE7
-E785791
-21871730
-78BE7E05
-D1536BC0
-F9708FE6
-EE4E143D
-4E498B00
-A2113F88
-630DFE4E
-3FA3D4B
-F88D623D
-3ADB0736
-BF25AD18
-CB89D619
-1D41D458
-EEFA6367
-7671EBAB
-B98E8CFB
-238D9F19
-C5155B
-223C16B
-E484FED9
-DD6A6680
-5192089B
-CFF24757
-F2CD17B3
-CC3C7B1C
-581E6ED2
-C2D7E5D2
-E9789543
-424EF913
-E6B10C7F
-706C0B16
-6EC36BE6
-54C41CF4
-CD1EAD0D
-17460ECA
-452A78CC
-D680E5A2
-57AA8EB1
-252EB084
-9DBB8E55
-BF759D75
-6E5E9F27
-30EBEFCA
-C4514A4F
-FE76382B
-99A07A25
-F9017D0B
-452226BA
-3DD6111B
-967464D
-C0BAF41B
-C4D39425
-767A57E4
-7183FC19
-844A33A5
-54F13F7
-C5854DAD
-BE406FE9
-14340FCF
-F665DC28
-701D2EA1
-A7B6AC6C
-AC3167EF
-C3CE6810
-C6844D77
-64887D7E
-4EFF4E1C
-8508CD3
-45CD4361
-3FAB9023
-9121F935
-46C5C6BE
-272C83A9
-24762973
-EB858013
-FF2D23BA
-6F5C8026
-A045E967
-7B844395
-2611E8E4
-8AF4659
-89FB4D33
-D9F50DF4
-CA6BD0F6
-A47A1386
-F78D3515
-2E73ABAE
-36C0297B
-DCF0FD32
-3930C7E1
-246799B2
-BF8BEEAF
-7AD6D40C
-7BDCB9B9
-7829D32C
-EC826EC9
-ECE1D576
-4E3D613B
-DCB44DB2
-67EA1BF2
-D1DE75BF
-4609E175
-423132A3
-D33DD5F6
-D74829AF
-FE0FB1F4
-C32939D9
-4FB97597
-1441DE62
-649D26B5
-4835C073
-1F67EAE0
-E28AE826
-DB808A84
-58FD0074
-1424245
-6BD9E7E1
-26476595
-E8C08661
-F1F0D3D5
-577263A7
-CB86C426
-EA57839B
-C8B37BC9
-FBD2B525
-D033D0BC
-A3A0474F
-22EDE40F
-CCD58291
-CB64AA7D
-3176C162
-78DE2512
-ADD0A1B3
-EB41F141
-A7B5DAB1
-C68652ED
-1F8E90D
-31578AF4
-CFA12A8A
-E20A88F2
-74AA9676
-3B353B5E
-1956E731
-AA8B10C0
-63369269
-C833A9E5
-9425A8E4
-89DB1783
-1BE23F63
-D84221B9
-F8D9FE9B
-EA1FD309
-E16516F3
-8F0EA801
-F5256123
-F21B02D8
-F3335520
-F7729F5D
-B7F2AF17
-6B97F182
-806347D9
-962A011D
-A5427014
-B7358896
-E9D6A1C6
-2E3DBDE7
-94B06EA1
-4B3D9107
-26F1956B
-1726E033
-6660681C
-39E4E3D5
-E8CD4742
-78D71E0E
-15733521
-89D0606F
-D449755F
-A2753DF9
-AC7ED71
-7803B9A9
-87CCA2B4
-23003317
-2A91CE6
-C37B28F5
-CD9A436B
-893C12E2
-C1FB04FB
-3D8230BC
-737002C2
-15314ACB
-F4D74B95
-6C8BCBFC
-292459A8
-1692BDFF
-DC68FEB8
-48DEF854
-4BAE6B50
-8B850B23
-AEDD7125
-5B740DA0
-AA83A652
-474C59D4
-A4B2D4D3
-451C3B83
-D93BD101
-BF10B243
-8AB74771
-68C5891
-C8EE35CC
-D22DC638
-5C7FA2D3
-54A2001A
-747538DC
-AC75ECD3
-F1BBFFB4
-844C0E4B
-D7D25E9E
-460EC0ED
-688BA8D7
-CA6E35E7
-9396DBBA
-3E9C3E0C
-5D29B720
-3E5BB85D
-F1CFA9A
-8EF00E21
-28669B1B
-98BE145D
-2696E360
-F91E3763
-B0E3F6FE
-45699C1
-F5945549
-2CB64CA4
-F3508C44
-653BABD0
-773F51CB
-9D228D81
-E4FAB747
-1DC767E3
-89A77290
-8E2A722
-45D00328
-42E979FA
-C19D28EB
-C6645B54
-5AD41E9A
-93587C5A
-719944B2
-B10FF0A7
-A57FE070
-78C8DFAE
-138BFBAF
-1126A4D8
-C9DB256B
-EE01D5FF
-A8EB81AB
-80AB24B4
-95B129FD
-802078
-A6F71D37
-334BFF82
-32678187
-4AA896B0
-149226EB
-5B8C446
-D1799EBD
-74EA35A0
-FA9B52C8
-FAC6A436
-9E543685
-C1184EE
-2D8CF846
-C2AFF300
-18EED386
-80C04036
-77FA6FF7
-5D1512F0
-D2C0C9B7
-22DBA873
-62468BB9
-42C90933
-F7EA7A3C
-69449140
-7DD1B0F0
-52AAADFF
-2F8B7479
-70B719F9
-CD8E1081
-4B46932
-DB933B74
-1E7A04BF
-75DC735A
-C3925701
-7EC84718
-DFEE049D
-E8B3328A
-3A9936EE
-F2E22D2A
-1F2B5894
-DB44DCE5
-4F1DD5B4
-B66F3E9F
-943480BE
-ABA71BB2
-E4F15D5B
-4C9D7A9C
-B751518B
-24C9762E
-F9DA3386
-D13AB9B6
-5CFC891C
-CBEDF3E9
-395421ED
-5A3570B8
-1641D0A0
-AF9A9981
-A07CC659
-4BA92C0
-D94C7431
-AA749489
-372456FB
-690097AE
-B5EF28F3
-1F8F313B
-6C45ECE2
-24F4CAD9
-40C5200C
-920AFACD
-A2E0DD6A
-CEC81C6C
-DED2D22F
-4AEA1A34
-7504D5DA
-1F8E8F02
-72100835
-BB4AE282
-A0154848
-EF3ECE2D
-6DA87A1A
-46D17BF
-DAE80D31
-FA8CA757
-8F75F943
-AFFB5EDD
-F1A09255
-A80EDAB5
-5AC04A14
-B51A2E1E
-FD9C51F4
-F99A5A90
-3EA5F0D
-C4D40DFC
-C0280AF9
-CEC83127
-FA1A5F6B
-D603510E
-3663D878
-A79682FB
-B7313271
-7E37A2C7
-A1CB289D
-C51B6F15
-EC66F0DA
-80D5C268
-F3A52A28
-E056F895
-4A0A2418
-66E47974
-8E8CA911
-FD7E6D05
-70960317
-5D378166
-3A2D634
-CA6510C4
-93BBB6AB
-4FE2CF83
-2273B7D4
-E372BB74
-8AD6B40E
-496AA885
-11F4186
-8DEDF498
-5435E535
-5145EF8D
-44AB3DF
-7B449D2C
-3489063E
-F0A61E35
-A2F75775
-F691A0D2
-9CA997F2
-D64FFFB7
-DA79CC6A
-2DEA4171
-D2E4D598
-C641D01
-79699CD2
-49FF5A89
-C967A1C4
-F4C7FF25
-9CD04F9A
-374C3740
-7B6376BD
-ECC505A1
-E76F3618
-42C0B205
-B28C63BC
-2BA4280E
-7278103B
-83B861F6
-F862D563
-433B3F81
-358E4226
-2E9334B5
-2E9B7324
-23BF3CB0
-1E44A323
-BAA2480D
-3B8483BD
-419659C5
-91A9B2C2
-82574F8
-28A32CD0
-3534C89B
-759FD52E
-B260329C
-82112334
-2D5B7F7B
-816C0227
-ED5FAD1D
-7BDFA5AE
-B5C8006C
-BD9691EA
-36C28C33
-B8702558
-EB3E656A
-D752A865
-FA94FF5E
-AE5D43C3
-747587AD
-6E5E5C96
-39312BCE
-B13B468A
-81543486
-1B57D2B3
-4D3D70A7
-2D4ECFBA
-640E83F8
-4FD1588B
-4EA4599A
-E231E4F0
-A2D4437B
-47D88CE6
-D048C6D1
-4CA7F923
-E9E435A8
-E93D6805
-C032C4A6
-E15934E3
-CB728ED0
-E7D65CEA
-8E5D2F8B
-1676D174
-B42D23CC
-A1462E09
-CA718E2A
-F5BA8F57
-EFA467ED
-6DA31185
-895FB4A2
-649A7D89
-3B71CFA2
-C67F9D02
-DFBDDF09
-AAB8BDDB
-870C617A
-220F7717
-795DE75E
-5C787D87
-BB94CBBC
-99928778
-9D5C4DAB
-4EEC433E
-F4C08960
-F71FE87B
-BF78D7C6
-671FB341
-4EAD6A0E
-534B1D46
-1B4DE7CF
-A7B45E06
-97F43041
-4B77382C
-61EBC96C
-336A9206
-E2A6FD02
-72E6EE51
-26144F77
-DD22DF66
-CBAFB596
-B9CE864D
-CEBC372F
-907981E8
-A9FA3C97
-6B1704B8
-B1160637
-FE603AC4
-274C6ED5
-6C317434
-77A16703
-2489D28D
-2DBFB899
-4A3D882B
-E81AF570
-1B8F583E
-F1CFA601
-C7B776D2
-A26651A3
-303D5E43
-CD80678
-7E9DCEBA
-E0F128C5
-4B1807BB
-25B10534
-4117D98B
-95079C39
-58C7BCE2
-AE0AF4E3
-331A0152
-DB3D821C
-F4F11B78
-E2F55DDF
-15BF23DA
-15E7695F
-1F40D321
-128A49CA
-2D25CD8F
-AE762164
-7EC8AC49
-1D9A1899
-97B6BAF0
-D7E07736
-A2566738
-A903EE89
-67CD354E
-89C1C57A
-97B3EF5C
-240FC35D
-52CE3A2C
-15E8D7D2
-6A8A9E32
-4254550D
-A345B8F1
-464C5420
-FD2E1DB2
-C629DA54
-81D24EFE
-421E30F4
-E4008742
-62839D68
-AD78257A
-23DBB6EE
-49DAE0F2
-B1B07AAD
-EC7791BA
-3B4D3E2F
-C241836D
-C836E98A
-EE9D6DA5
-33B5A570
-81D50D38
-6EE68232
-76677B3C
-AF355302
-D2415D7
-1510CCAA
-A6627F82
-A5A96453
-CD0B833E
-5CF4C1E1
-C14866A
-AFB8FE0E
-B7D08BAC
-4CBFF97E
-F0191C3D
-4E2A3EC
-E76E048
-FF368683
-F4DF51
-8D0F29CD
-91E431F5
-B6808051
-927E3404
-6ADBDD1
-5852A1E9
-394DFE4
-8990BE64
-A69026EF
-3656791E
-63C5AC11
-B9E88670
-9326F9CC
-414EFA53
-B5028CB5
-22181175
-3B1A49C1
-22FEDBAC
-A39731D2
-9C7E2E87
-E931F133
-D9AFCE3F
-C2CC527A
-A85B19BB
-C66CB9EC
-93558B54
-F5197362
-7EA88969
-B380F206
-56AC8890
-56D0C8A6
-B39C42A6
-7B966768
-1B6E37E5
-43429273
-668BAF0B
-327CE28C
-CEA34DC6
-EA727DD9
-2C1AE3E4
-802A7A51
-A1934827
-1A18C4BF
-AEB9CA99
-D572EF76
-18DFC210
-11A4385C
-671ED0D6
-D1E5D02E
-9EE0AE12
-DF1EC812
-51BFF4B5
-CE089E79
-CE4BADF4
-75879327
-C98B6178
-D7B1E852
-95D6767
-1283D091
-20F90A2C
-9020BD75
-504D84DD
-D8982F3B
-E41E0CF4
-55F4FE2E
-2097DB6F
-4B8B7790
-F3A1E487
-F4C274C1
-3452A00A
-15587F21
-687D0671
-7EB3715
-945B9A90
-8C83F0D1
-8934F9BC
-38A50D8A
-7EF49EB5
-A45D34E3
-6C014201
-D4D19185
-821E216B
-569485E9
-6DCC7357
-7711858C
-852AA907
-591CCDF4
-775E7DDB
-9463CA74
-DFF1EFEC
-1F60E4B
-2628AEE4
-EC89EF52
-49D232FB
-E8BD7DD1
-EED418A8
-C35E3A33
-5C739CE7
-979E4B23
-B386E4FC
-62F98F10
-2FEF090
-599508E2
-F3F9F428
-17A18287
-639B700A
-AA9AA4A6
-B1AFC9E7
-FB6E8D34
-44F6A6D9
-EEFB7788
-9D616EA3
-78F3BDCF
-A5E71361
-1D25ED7E
-9059ACA7
-89118CEB
-BDE78C2E
-55B9E0E4
-FB6B9A
-2DBAC44
-85C0DEFA
-1E222914
-2413FBCA
-C8569486
-E757EC3C
-5ED9DB70
-3EA2086B
-F4A4057D
-E29E1B00
-C271490A
-525A60E4
-9A286CE0
-61A42BC0
-D3F6ABE4
-9F31FB75
-335ADC59
-9EA61808
-232ACBB1
-270C7B13
-6EA6535D
-F1D1B1A0
-AE9088BE
-D9E4FD87
-3C8C0972
-5EAA57A
-26997EF4
-3B02B885
-A4722715
-434BE51C
-495165DA
-BC9FC978
-18D8C1E
-328203FD
-12643D32
-65EFAAAF
-71297EEC
-EF8496AC
-E5B7BF16
-2B2C5A0A
-86B713DD
-101E03D1
-14F4FB7E
-34EBDF2E
-2A9F4CF5
-7143B386
-448716E5
-C61C8469
-5F9F797D
-6A89B910
-548E4139
-C48968FC
-11F52973
-E18DC2B5
-7EEDA069
-2EE38156
-B8F99E97
-E066E1BB
-ACC5C04E
-6E645848
-98CA4890
-78191984
-84EC83C1
-C58D9987
-3AA63D1C
-E17CA75A
-CF8B5E23
-155BC19C
-5809C3C5
-E2A7DAE3
-D55C1B6A
-585BF6D2
-5D192255
-310467FC
-ECA8FE97
-4ACDBA8C
-E6319F8B
-FD4F3E85
-47FF7B0
-B6FA3B69
-D75D49C2
-B831D3F4
-1D6282B8
-E335FE0A
-C955B98D
-87968F47
-B9600C1
-805AB6DD
-2677ED62
-86AA7680
-836DD1B4
-82C073FF
-F2664656
-DBE8C3BB
-E4DA24B2
-AE14BE60
-1CF178AA
-F2C661B
-9ED5C4B4
-3B67F448
-426F85E0
-40195BA0
-66BDEE57
-3A128638
-A48D546B
-7DC7834
-C7706566
-1E23F578
-CF55EC28
-F46031E2
-CFDD3546
-6CD58E9C
-C40E02C2
-19558D54
-46E056B2
-C1581093
-20C057BD
-34695F72
-1C4B7B13
-2FD3155E
-152F2F86
-189E2F15
-31991472
-1B85405D
-D1F72A1F
-8AA93824
-CE409894
-9F6D30AD
-E72C6DE5
-A31CC799
-694EB42E
-C2D96633
-7F4776D2
-509C0781
-6A84F278
-E11739F5
-CC5EFAC4
-DDD81D37
-6960145A
-E40C5DEC
-70C068DF
-1E6CC338
-592EDE93
-A19B8534
-DA27B1C9
-608D85FD
-63AAE798
-509A13B
-BAF29F05
-69342538
-5A2FD47D
-5FA22C82
-AC7E3397
-4E546537
-4611C427
-DA39FAAC
-445F1CE8
-5BC83B69
-64AB6C7D
-F2B4EFB5
-DC0016AF
-987EDDC1
-3354C952
-A5B9ECBD
-E5B77548
-997279F9
-7C460F6
-82A1099
-B7CF0472
-ABC3726D
-DD4155C0
-319B8C50
-CAE7E88C
-910F1C5E
-B1367D8E
-56B78305
-8F4CB7A1
-8765A3AA
-89624EB6
-22DE29BD
-A12D4C67
-6BC56ADC
-B587BB0F
-3806EC0
-3C269C48
-9EA289A3
-B5EB4FDF
-1ADB0729
-A991429C
-CE574FF8
-CF071DB5
-CE0D372F
-3D99AE5C
-D6D56E7C
-3A493434
-86AC7C63
-FAF8B585
-B9F1994
-89CB3A3D
-7C8974F7
-2169640E
-D74D62DA
-8F0D850D
-3B9D0225
-4E2CBB6A
-BCA7006
-9DCE6E7B
-3695D660
-EB344960
-F3D223F5
-6B8CA588
-45744961
-2F493968
-E9CBD376
-9B0FDE95
-F17603FE
-B0825FF2
-5B1CCD35
-6F98639D
-5CBBFA88
-890B3C42
-2DD4CA67
-DC9513B5
-A7B91C22
-83A897B6
-399ACDEC
-AD11B2EF
-11D76C5E
-E170FB03
-9326B999
-87845BB9
-CA14B73D
-943FE9FF
-341ADB81
-D800A2CD
-A7265DEE
-1E7F3F7D
-8AC49BD1
-CCE49B1F
-58764B66
-D57DF0D7
-229BE279
-42DB683C
-D8530314
-F1FE931
-DE1A4EEB
-DF35B43B
-3E90F80
-B3934E4A
-FD658EFA
-E6CF1CFA
-472B47E9
-20F155AD
-77571441
-9FE03233
-8BC0043E
-80E9B238
-D325F7D2
-F0333147
-FC86E62F
-A5451DCE
-D9374B52
-674D4083
-9952E9AC
-B529BFF5
-B7E072D6
-5BCD2886
-8381AC4
-5CD6C7FF
-F24E3549
-9EBB5EB9
-23F47A79
-49D578D0
-6CA5874A
-2F3C83E6
-D975C720
-FB484F11
-3BCFB5C0
-3A66DB47
-B3BB4F33
-D5136C2
-D4AB89C5
-8A782859
-C8FE9ADA
-B5D57BA5
-9C8D2781
-7D0919B5
-D362A6D6
-1006FFAA
-3BB31D71
-7709BEE4
-8A348C59
-44A704D7
-96F2AFF3
-592DF706
-F3247289
-3E9BC2A8
-570D8349
-2F615AFC
-B3802616
-B54191C6
-DD155718
-455945B6
-C74C7DF8
-232005C5
-6185D2D2
-8FACE1C
-73D27EB
-770D2680
-DB913D28
-90FC0FA5
-9DE358EA
-2BD3287A
-D5C8095A
-DE541F30
-D10F0F61
-4657627D
-739F2E93
-F9F7B479
-DFC6490
-3D554A13
-D3C6C2EE
-80145765
-D601408B
-52EFFD8
-A44B597A
-9E65E39
-2A5CB536
-A0420638
-EA752AFA
-A7DE4743
-18480882
-A559B83D
-2DC4B6C
-8F33055B
-7C4E3B8D
-52C7F9F7
-9FFA0A63
-A0413C90
-ECA35002
-AB4A7AD9
-A829613
-71904BCD
-9560A35E
-118EC2D1
-CA730775
-A631E447
-F526588
-C415CDC9
-DE509745
-C2C64E6B
-4A3350CF
-CB04DB23
-8D3BA4E2
-3FC18EC6
-C8CFB2C4
-C2B600BF
-FE36BBA5
-EB4B302E
-F2BD24D2
-A820E2B0
-DDE54189
-744E33AA
-9E63B141
-21C2E601
-2C12D5AF
-85AAD794
-EE1F97C2
-9096006
-14132FBE
-FDDA365D
-E3623A52
-9F52F94C
-18F84D8D
-F866F6EB
-9759E208
-38195047
-E31F1936
-9D7E9182
-CEC2787B
-975EB96B
-12F202B
-CA36D8E3
-A694168A
-F033E484
-DAEA79C6
-C465D02A
-154EBBA3
-FFE408B5
-977F7FD7
-59992C2
-72DAEF3B
-47AD9078
-11CEA76E
-3B88B352
-BA2FF2D9
-2A7F4E47
-DD6B398A
-164FCDDE
-CB7284FE
-9FCF9606
-34406791
-104CC89C
-A2F32BB7
-213E9CB0
-1E1E0B37
-7226FA86
-20502886
-4C1C9E90
-2D4D0ADC
-D843214D
-57730409
-614341B4
-ECF30446
-330F5216
-5FBA2C4F
-B4102EF6
-D6129240
-7D5DFBEA
-EB01FCDB
-7CA7342
-46DFED3F
-5BE1B2D8
-2F40EF9D
-59622E77
-A6AEA365
-78133A87
-7FEF9106
-3956BCC5
-8C6509F9
-79525FD
-D3A518F9
-A76193BA
-3F552EED
-F974C309
-12A5B04E
-A71DD6D4
-D9FE2B7D
-95F822BA
-EDBE32B0
-92BFA916
-79899BA5
-3FBDC933
-BC0E7C30
-6D7FEA47
-1F1954E
-4F2F17AC
-F6EA71E3
-B8E34FFE
-3BCD8BD6
-695B7934
-D4CE8358
-26B0699
-784EC0DD
-625BC98B
-8861D087
-44DF0DE
-35B7517A
-A8FA9A12
-244B927
-AF7A58C
-BE48CF00
-95C13C21
-9D8DBCFD
-AE8B4798
-ED04535D
-47A2219C
-C8B87734
-8355D2A5
-B4127CD6
-DDA3394A
-36846F2C
-F38282D0
-177D3FF5
-EE8924CA
-5E6CB3D2
-1F6C2C7F
-3EACD843
-51A77194
-51D89AA4
-DCC17C24
-DB5043E9
-25D52B74
-1C7176E2
-1F483DAF
-24B587EA
-6188E94F
-C886E2F7
-7B24254F
-A761DFA7
-357C70B5
-6BC46A7
-31B8CF7C
-BACB7205
-6C1B0387
-50685794
-7726ACF
-64C49E4D
-7AF06B7F
-D1F2AD02
-E4F5BB37
-2A8A4925
-4245E047
-B7CD8000
-6C72A8DD
-19590349
-7F7EDB49
-5DAF5458
-5EEBC5E9
-6E84757D
-AD3868FA
-F85A2B5D
-A8569A1
-88F1F6BE
-AF363178
-D9A61BFD
-A2959EC8
-C1343E46
-B34A697B
-22530AC3
-70213F56
-1DDEECA5
-4DF030F3
-78A4B8E6
-F93B20A6
-27AB7A7B
-F43A2969
-AEB9E421
-75A8F820
-52CD9316
-CA166F29
-C28D14E7
-51E4C76A
-50249FCB
-3EDA432D
-C6C3EEB3
-6CFF2A56
-5B50A9CE
-D2CEB19B
-2F16746B
-1C19CB24
-9CD2076
-3F804860
-FE59323F
-62F1F95
-2CF56FAE
-E1A3437E
-973F442F
-DB62AE6C
-C0AA4F87
-67224779
-A28378EA
-6C5BE4D5
-97F75FF8
-49922E2
-19ECBBCB
-C89000E7
-436496D2
-29C94230
-21A4D75
-3DF46E1A
-A6D150BF
-4EDE1CCF
-37A996E3
-B0F73D3C
-33E41F15
-14076103
-7BC6082F
-E98E377E
-1E787464
-16AB93F5
-B8E3ECD1
-4A944320
-41E77D61
-8B669E91
-20F1F65
-F4D26572
-81D9D4AD
-99843F88
-7066E60C
-4D6B9549
-C79BBF94
-F53252E4
-EDB94B9F
-EA504F01
-9BE5AD3C
-98F301D4
-C1C0ED35
-3F2734C7
-76351C26
-AEC02AAC
-B9D4A014
-A01F14A1
-2DD27A90
-27C43590
-5A06F84E
-64CC23AC
-76387C33
-A07A8306
-3BC362BF
-5ED88200
-CA6DC828
-4DBF3E47
-F633C85E
-96F44176
-76B2A46B
-CF414D71
-AD77A07A
-9A1F71BC
-FDEE86EE
-7A8AC33B
-AD3C257D
-BEFBD214
-5B562E2C
-3527654F
-FAFCD066
-575BF8E0
-BC2A071A
-C903C2CF
-EB1AB30
-7B8C7CA1
-5ED6E493
-E1C822C6
-368B9DDE
-91122C29
-5B1358F8
-6DCADBBF
-ED845AC
-61E42CB5
-732B420B
-39154876
-C10442B5
-E1CC1A11
-875215B9
-AE9E4FEC
-B2435F4C
-DBC844A
-10FDB0DA
-F85D3FC4
-608B78A1
-DAE2B7B2
-DCD08039
-CC0962E7
-10602FA7
-62522FE1
-D3AFCD9D
-2882BAA3
-70C31CD3
-A69E9A2A
-975BB834
-2A35C91F
-5FB2644F
-69B2BF1
-9C365DDE
-E4199E06
-ACCF8904
-DE105FEB
-9C07AC45
-F75CF55
-EF6E3E9C
-1FB088A2
-9A93BA86
-4E91C403
-E07827D7
-5F7593
-FC778EF4
-5B831E07
-354A60B2
-8D39DB34
-5C3C16CF
-38489DCA
-D83EBDED
-F9E5BE76
-D2C7FCF3
-E868A2FA
-D29E98A9
-5AFBCA1A
-D01628BF
-B2334643
-4EC99A5C
-189E9585
-CC2B18FB
-C692AC25
-A7F6B978
-C1530E03
-AC815E6
-6304151C
-52EB83ED
-C4921682
-96441A15
-56338D69
-5C82292
-FCA308FD
-978D2310
-192DB3D1
-CA6B9EAA
-7AD9F05D
-E7C35D2B
-AB5505FB
-3DD6013C
-532AAD00
-87EA4F8B
-1AC88F4A
-4BFC2053
-65356D9B
-B03A54FF
-6F585110
-2C75F6A4
-CFDC2733
-3E7BD30C
-2DE068DD
-F318385E
-26CEC150
-532C4D5B
-B264C41E
-46229E71
-39E85376
-A074FDB6
-461E84CD
-BADDA454
-77D4AD4E
-479457C8
-F0E4F65E
-DBA7730A
-24D4FEE1
-9442683
-7725F0EA
-F8647367
-5F4D5208
-6DC11B5C
-4E65BE22
-EC0713FD
-1D54F605
-4B0F99DD
-E585AB57
-E14C5EA4
-B7909465
-12ABA66C
-EEF519D
-62F4CFD1
-48DEF31F
-16B38659
-5528B313
-5C031870
-87ED6DE1
-55ACABF2
-FACEBE99
-3007B9E5
-F5C0C90F
-E97F9A15
-951AE375
-67E41B2C
-CF7F6BC3
-C7836B7F
-88B077DB
-DA60BEA0
-1FD6BE04
-95A08F39
-B7EA73B3
-10F6685D
-A9C04118
-EAC17020
-CEEDC89
-7EFB007C
-8D900B82
-4C2BCF1C
-9B9BDFC5
-28846A96
-139B4D19
-32E0786A
-72F19BF4
-66D61EB0
-609F7568
-3A785E09
-B6F2294F
-96E73FE3
-99A0812E
-1BBAE42
-9DF477DD
-111FF2F7
-8A882B32
-2542FA4E
-7BEAFF22
-405268CA
-2427EDE6
-7D9F0726
-7EF6ABC7
-7F8DD904
-C3F2F4AB
-213FB22D
-62AD3732
-955CA4C7
-9E83055D
-BE9C70CD
-C0E6DDF0
-892D1B64
-56F3A648
-43547D3E
-35EB967E
-EBC18CA5
-D4DAC35A
-9DDB564B
-6DFD4F07
-CB02555B
-425A1595
-B978D512
-B3D78E9F
-A3EA970F
-8E27124E
-6A57B7D
-26D405F2
-C8A1CED7
-7A6338C
-A497AA49
-95602B8B
-C6F1583D
-CF5B6A58
-81F2D693
-A34B3C07
-B7180B4C
-46C6E5CC
-8C3736E9
-980482E6
-8A34B532
-B698520A
-20E9DDDC
-A5D8B27
-6A0B3989
-10071434
-C82002AE
-8A343B26
-2FD61FC8
-C1257546
-FF154858
-1AFEAE33
-C2B1532D
-D979A2DC
-93F9FD3F
-769B0DDF
-4132C851
-A372D4CC
-6A5532FB
-E8F203C1
-A421B3A0
-B50F5C9F
-AE5B067F
-8CE6F896
-8BFFEABA
-B0CCFB51
-D455681E
-FDEEE781
-A4873A97
-E3FAC8DA
-5039A29
-C703A1CF
-E4E29AEE
-39C0B0DB
-DE5756E
-303C7D43
-586246C
-41ADBF9B
-D1CD7207
-3BC8FD94
-7E50A650
-390914DC
-ABD6170
-ECFBE529
-3D51360
-569802B4
-25F255D
-1523D176
-9F98AEF0
-9DB1B681
-DAE01D8
-46D4F7B7
-47DD8DB6
-23BDB9D8
-90C47F30
-998BF564
-5D60F7E4
-309B5851
-9D246C3
-C1895130
-1F918DFB
-6F303265
-71E0D0A7
-77F2FF64
-589BBF0D
-A25C4510
-9F05AB6E
-4990B583
-D335BD7
-6CBC0400
-D7894817
-36176CCF
-1C6A98BE
-53EE793B
-4003C3B3
-9E46BEB5
-57647A51
-D5599FED
-38156D3F
-B1F425B1
-7AD6402D
-74B619BE
-A11B18AA
-9C4211AF
-DB076668
-7A94C4DD
-6833F9A5
-A088A4AE
-6A70BAFA
-BC6740FF
-B7F6508A
-F3BAF225
-29BF8108
-7F074F1C
-18B3D5C1
-8A948077
-BE0483D3
-46B195FE
-D7AF0FD0
-C31414F4
-B5BD4871
-CFAC4C37
-57D2D42C
-10A73F90
-407A80A8
-21C50A11
-22E165A0
-8361F9A8
-EDEA52BD
-28F3650D
-CAD63254
-9AB9033E
-82BA1020
-E6E6A470
-9C829847
-BC3AB877
-A91A7C99
-1ABAB07E
-583AD9D7
-9AFA901C
-9AE116AB
-27B4F5A6
-877D0225
-92DEB3AB
-BAA1506D
-EB04B325
-C275FBF2
-2331B6DD
-74F623AE
-933EC4BD
-9470C6AF
-6C0828EF
-AAC0532D
-318961A
-29C176E6
-4011BAB1
-895DF78F
-410AD703
-F363E54D
-B4913DBE
-6B5047EE
-E7099A72
-E2961301
-E587CAE2
-1449E31A
-EB048AC6
-D21BCEF
-EACEF00E
-EF09B5C6
-2C050BB2
-D660ACA0
-361BA74E
-26D1A92E
-10F1FD22
-DAD028BE
-5DDB96F4
-A1C8F873
-66F44797
-DD6019B
-618F707A
-4E4525A0
-551B89EA
-6A93FE33
-8219D90A
-5E3E3FA6
-C9C25F24
-D4593D42
-CB12B9FF
-B09814CE
-DAF289CF
-C59234E7
-6C96C435
-1E7337A5
-FE315E60
-451A4E00
-CC3E2B8
-EB1AABDF
-B2D1AD85
-2A12A008
-B525A4EA
-ABE700A4
-80603A44
-3E2E49F6
-48630509
-9673204F
-7B0DEAD3
-B0B2B6D2
-68C0453E
-BA31833B
-4BD68812
-C64D0638
-A8987E25
-48850A6D
-9B337E66
-1D99461A
-D47AE0D1
-2E3023F7
-29CD452B
-A211306A
-15CD90B9
-D5D57C24
-727FA881
-51316FCD
-BF62F735
-9E67B311
-51A2B90F
-CF7C9936
-A537087E
-3EB2EE91
-8F4D2C93
-F83E1906
-826C14F4
-6CBE676
-ED2DF931
-38270781
-4C567B1E
-96BD9972
-E089656B
-7DD03E9
-534E777F
-695B12CF
-338EDC74
-D5E3DFDD
-13937C2C
-A386AB68
-CADAD94A
-B624A652
-9E4D0656
-3BDD26F4
-8B9D1ADD
-180D5005
-E8744FCF
-6CA71503
-20697624
-49269DB9
-B27B12B1
-AC181CE2
-9289684A
-E5D3A21F
-6A79B5AE
-EE6DD5DE
-355DA7A4
-C5B13162
-5FFA0324
-602F32A7
-85BA4032
-DCBEE18A
-D76BFC80
-4B72BA0
-4101BC2D
-A3CB1CE3
-4C6262A3
-59198E3D
-AAD7C84F
-4DFE129E
-E8153DB5
-66EA03BA
-D3247EB4
-750DAFC0
-68FB3A27
-67005B98
-C2255031
-1D9106CC
-7FD4C833
-491CF81A
-28D5F0BD
-E2275FB1
-762FF58D
-D9D940D7
-C6B5CBDC
-810E0D6B
-DAFD7E89
-15C3544B
-D7B6A237
-3DA125A3
-3272795
-A7BCF9DD
-4FE52CD5
-3FB69C23
-4F106EA9
-3632D2EE
-9DA08D3C
-5282D2C7
-9575F24E
-D390A80B
-2897EB0A
-A4B9FBE0
-DA3FD83B
-EAA2A95A
-73FC7AEE
-CCDBF4F9
-3EA97EA4
-A8AD7E75
-C533A490
-3FCE73
-D451BBF2
-6A71BE12
-76E1EC5A
-1845E1F8
-CD2B7C0F
-4D92E7BD
-81B44E4B
-65E1B458
-6B69FD73
-86CE76BD
-88B1CA29
-EA1F0D7F
-43D393F9
-C85E394
-B5C665F0
-AE373F77
-46196293
-E6057838
-7C63A634
-C3F66075
-1F15C3E1
-ED457843
-83F9BA3C
-D8B8A399
-852DA2FC
-3B81F785
-DFA3848
-877B985B
-1C82BEF1
-6482EA27
-A4F94E9D
-9FB72748
-47CF963D
-C514BF88
-4D4B79D
-232D2991
-3DEB3B5C
-49784213
-9D79AAEC
-EB89F7E9
-B9F9993
-71528CF1
-E1390DCC
-F4655453
-97847A30
-3C30D55E
-72649CB1
-F0647A6
-C6C8AC04
-FB48D1A
-39EA9573
-70C70D43
-3F6BAD93
-342ACF49
-F37B506D
-EE64D0B3
-4DC05CFD
-79E116BD
-5458D922
-3957971C
-970D89F1
-9AF398C7
-A9A651DF
-D3A64902
-27339129
-2FCC3329
-B1C70D5C
-3FCCAD9E
-C10A34
-80B546E
-7EC04275
-512434B7
-526742B7
-E96DE8A8
-27CE6F9D
-FD566C7B
-8DB1FE12
-93F810FE
-C660877D
-348D5704
-BB3F2FD7
-9F859C53
-907BB57E
-318DA95D
-BF1CF416
-3E8BF68B
-BB8CE4F6
-A9954212
-D1A396D6
-C33F5A44
-2DC0A59D
-5B66EF45
-1CB288E0
-D6874F40
-E275F00B
-E6B62E72
-6BB1EE97
-389CF9D6
-8C093ED1
-D4CB36E1
-12F4840B
-F18A2F83
-782EB525
-12BFBACE
-78F772C4
-91988F79
-55BE57F8
-6605D204
-5A7471F4
-355005FE
-267A8C9
-CAB49590
-9479E9EA
-BEE93B2A
-34E95C45
-61788682
-6B99ED61
-33D4D3D8
-DD149E5D
-D3BED775
-287B4087
-A2552A0E
-477D609D
-96765321
-2696E220
-3B6E26E8
-5CFFD0A4
-FDBF561C
-4C41A4FC
-B0637D44
-85DF60F0
-539171DD
-9A1D1F12
-72ADB48A
-D8C0C9CB
-E4FE15BC
-24EB5C50
-E1A9B3DC
-360563C8
-F20C02CA
-E9FBE774
-B2FEE97A
-EF34194C
-6DA8A0E1
-ED9FFA1
-4EB5D717
-47D296E0
-FA147414
-C1F868CB
-761182D1
-6B9F8311
-7A99903C
-95449FC9
-A349B21D
-F2AA6E8E
-CBD733B
-1EAA2224
-C7CC9CD1
-DF3D1C7F
-81343E5
-30682CA5
-65C5BDFE
-811D5CC5
-8D2DEF35
-D8B4F4DD
-9E121109
-FCA97592
-99E76951
-7CFB5D
-8489CBDE
-D7A8D721
-ADD1A5B5
-4A96DA59
-CE6C2C78
-17593D2D
-F94AF7BA
-6CE767D0
-DBCEDF25
-43629583
-CDB11A86
-BB630047
-8A579D2A
-FC17AF19
-ED54597D
-9BCAA00
-B7865C74
-BADFD092
-9AB0AF05
-AE371DB7
-EC0EE641
-A9781E96
-D1B8A429
-FE9A2043
-BA4C2CC0
-F243E36
-78A88066
-70925DF6
-97A35A05
-F18822EB
-212A79D
-666D7F82
-4558A3AC
-FCF953EF
-F8C6DD4A
-C535BE4F
-973A007C
-4DB7E662
-C8995287
-B3527C60
-FA4F7A3A
-D417AA12
-D861531D
-11A81498
-5072EC65
-5886C667
-7EF848B3
-CA4ED80C
-3DAEA7BC
-34EC1028
-349C86EB
-6423A583
-22A163C
-339CC766
-E93138FD
-7A79EA77
-E480913
-1220E06B
-65ED8DDB
-ADF487D5
-82CAE485
-A88E6546
-3A7F5961
-4672ECFA
-425EB8F
-AA3C4450
-44CA10FA
-B1EAA942
-9EC93584
-E417CBF4
-B5F4C488
-EAB1DE5C
-10446170
-C5F9C89A
-391EF7F7
-10C62C73
-817FC74C
-DA1A9F17
-FA38D673
-D2026552
-D7CD67A8
-4E0E21A6
-56812AAA
-1D7294ED
-575452A3
-90581C22
-82E00D73
-A8FECF07
-1CB1E500
-7F51D70F
-F840E8D4
-DD73E72F
-8DED415A
-3F029F0D
-C9CC871A
-3388492A
-AA1DEF8D
-F2E93846
-F9CC596
-48221BB4
-6F7B2734
-F5A1010C
-C0FB41C5
-8693416B
-C8EAD749
-21ED8A7A
-9FF52520
-613635AF
-92C5E0FF
-435C33AD
-2550A70F
-B17B7FE9
-9CC5F28E
-690D4EB3
-5C5DCAC4
-25E14191
-B03B4C07
-50DCF2C0
-499BCF9A
-5CCD6CF1
-ECBB2C48
-A2990792
-2105FDBF
-3D62BECB
-493AA5F0
-2CF5BAD2
-DFF53D23
-50D77C82
-35CDBF8D
-E3BD4C29
-6A2FC510
-A9B2D0FD
-404B053E
-BF548C52
-E52081D2
-AD550AB1
-D4316A79
-776E6C42
-203A4395
-54DAB8DE
-EB67FB95
-46E34074
-21679614
-C395F6BF
-6D513D56
-93DDFEE7
-7D2866A
-2283CD12
-12789536
-5C1F1037
-4170B23
-8BB451B5
-A9915ACA
-784C0FE1
-50A95654
-CB574A
-8A1690D5
-D9753D9A
-3084718F
-8E429880
-D1B7693E
-A7613422
-C1707E97
-D658E57C
-1C2A8F42
-21BE34EE
-E545D5C3
-23DF7522
-B7AD16A3
-C6E7279A
-2AD251D
-FF0BA8C9
-E586EA40
-D86C394D
-1A0D6737
-5AE27469
-8A0F53FE
-1A0DC5E9
-8A56C2C4
-AD3214FD
-DD999E92
-E53F55E7
-5AB39BDD
-119C7046
-19B8238
-E21A4F81
-5DE3F0F9
-BFB5E145
-5020F616
-C2794F78
-9B7D9F3A
-8FBBF3F1
-1D9C111C
-49FEEDAE
-1C83E386
-BB5B0273
-C290FD8
-52C788BC
-86C12DD3
-6608E8F1
-313C6430
-142570B6
-F75B9552
-C8F1E8B8
-F3E5AAB1
-9E4D9E8A
-7E48E48F
-2182FBF
-F21DC3
-BD6E45C0
-8DC88EA2
-D5B67DA1
-C592692A
-979B0A6B
-783D09B0
-C2231CCF
-5CBB3057
-4C10986F
-3F738112
-BED7BBF2
-A2577A6D
-13128005
-3C71262B
-BC8E920B
-40C44CC9
-C6C4B496
-5AA9CBD6
-C7A9741
-2A8EDC58
-D2253A26
-F343439A
-13F71CF9
-A4BB5CE3
-FB52ADA9
-1AF0749E
-ADABA787
-C22B2194
-C5132023
-846C2188
-33A64D52
-E5CE9022
-CAA4C044
-E7032B82
-30251130
-22463302
-954AA98D
-52D6F132
-11E0FDD7
-D62BAE17
-9844BF8B
-68ECD60A
-E637BA92
-1D7BA1A7
-F091F891
-CC96CCF3
-E2C50AF4
-149FAA77
-F16F7294
-27212569
-B96E1119
-E7806734
-15A5818F
-4E05DAF0
-F022D5A0
-303D930
-B92CF71
-377DE596
-8835F16D
-2D0B6E77
-2A89FF6F
-9EA75369
-FCDF31A7
-8F674B8
-34D270E7
-BFE6FD70
-F165A645
-675B8D2D
-318F8DAB
-9F52E28A
-A464F277
-B998CE45
-9E932DF9
-2918A97F
-EA5C5130
-952FECC3
-7DCBA50B
-DEE7C01D
-96B96F4F
-1C6106A0
-85A1AC4E
-D62EECAE
-6387F846
-271EB1BB
-E1A2582
-D1E03035
-9EC6EA57
-300E10D3
-CB91419
-52652E8
-8291BE30
-E1D52680
-5044FC2D
-35E58D3F
-C6A01A83
-814DA7BE
-97A50A83
-DB801411
-D4C43BF3
-BC3D29C
-E4A072E8
-6F51D4C3
-21A5886A
-F744A91A
-5E12BC21
-F86FDFF8
-C320E6BC
-3DEC9656
-F89A6364
-F668339E
-44999436
-F40A8A0F
-71837448
-B09D47B3
-2D2CAB19
-3FF04F12
-D8E5CC71
-33F39593
-160D74D7
-FB841949
-95F0E78B
-B9A6102A
-A4D3C679
-4774D90A
-AC55693
-8F3CF617
-5BDA2B57
-A548BA77
-B1158C29
-FE9A4D00
-B52446D2
-E6DA1712
-3EFF4A4A
-41EF9936
-D65FB56B
-E3AED57C
-BFF89053
-192E499D
-DD703817
-C2B8C9A2
-65A8417
-670D3446
-2E936BCB
-8A14CEFA
-CF71A41D
-842BD0E9
-628148DC
-9733E864
-1C57CF93
-1A0CA311
-A1E13B05
-2C8F3844
-66C2361E
-8981A417
-A4668A3C
-271048C3
-6DD908BE
-1A933D24
-BD0A78F8
-57C44DC3
-1EE04ABC
-32275D51
-B25BCCC5
-509C83A2
-E5E1B85F
-D45DFB17
-EF39D3BA
-4F4F32D2
-8F1E52D
-62A47A4F
-7E4010A6
-189250D7
-CF3B51EF
-5E9BE373
-E9719F77
-B2741A6D
-CF19D7BA
-993284DD
-A1839978
-AC00E790
-ACD3A888
-1E74292
-6306A56B
-F9EC26A3
-9FC5BC2
-2D6F22F
-8CAAA98F
-CD2135D6
-D2F5CD5A
-CFCC3D48
-6AF7A18F
-5A3EA067
-8DE9498F
-A279E5FE
-8C1D89E2
-5D15FE82
-AB291798
-40421279
-E101CFFC
-D2D0D57B
-5C977DF4
-68D4EF4D
-22C36080
-81526010
-E5A41122
-160C517E
-8BDCEC09
-5F12637A
-F3714AF4
-D21C140F
-B1EFABEE
-E49A3E48
-E67BFC93
-C4BE9508
-21854565
-60757AA0
-FB5C43BB
-150F6634
-115BE267
-3BE8F3E5
-EBF986EE
-BA18FFF7
-82B52CF4
-50546F93
-118CCB96
-AA6603F1
-F434B7D1
-FC356F35
-C996ABD3
-CC8CF7C9
-4C2935D2
-2DC9EB76
-ECA4D776
-5D2D35A8
-7C747824
-ECAA990E
-A6078345
-CF589355
-7E9AEC63
-859E12C
-C2F31842
-6563A3BC
-D43FE9EF
-39D1717
-AB887505
-1AADAED9
-3D07A0C
-7D2B456F
-53C1B39B
-DF349267
-FD9CC686
-5C1CB396
-89DD96DC
-A0D8DA69
-F2A68012
-7F40A406
-1DBF2E24
-B31EAEB0
-5D5073EA
-19C16D03
-10E50F00
-47D3D228
-A3C0E13B
-5E801D5E
-C58677AC
-F6E9095C
-E2C0938C
-14CB070F
-11B98703
-9FBA36D6
-5ADB369F
-681BC767
-BEAE4008
-5A0AE129
-ACAD1673
-F9992AFA
-2CA14EAA
-F77F77B6
-2705BD3F
-F9C3E6D6
-D3ED854E
-4A5FB85D
-54187218
-B9B8C83D
-EBD38F57
-C0D17CF6
-8B464900
-3F8D26CA
-C0FADB4A
-7F79A367
-123EEC9B
-99B683A9
-157062A4
-91DE43EF
-65733625
-56DC9E5F
-2C88A8E2
-83AE236C
-DDBF0A9C
-18873E45
-5040B3D7
-29927CA4
-B5A18202
-93CC4EA3
-5DC2F698
-A97A1713
-A104C149
-B9C5588A
-AF182A52
-CFEC25AE
-CB1C0A91
-143A132A
-27C4A3B9
-D73DB7B0
-53AF7F76
-9A614866
-82A54DBB
-D77A5A23
-AE3FA285
-8C2EEA1B
-DD21D577
-186EBEF7
-DBACB855
-18E30376
-144A1FCD
-773561F9
-F18F3C71
-4A13E021
-8738BA8E
-1A9FF053
-56A546BF
-860C6457
-9E5F2177
-B3CD57D8
-7A2CAF5E
-F8D57DC7
-941CACB
-E70A729F
-7EDB09B5
-E972B09
-ADB7C542
-3832A659
-AF33DD9
-152082D4
-9A2A3452
-70B5EDBB
-C6549E13
-D621FFE8
-15152F3A
-7781B485
-67B0DEA1
-C787B62B
-75B9A705
-C2A30FD7
-41CF8EA
-3D2B2148
-CA0445C0
-802799F6
-FCBCCE57
-F539ADB0
-54952BE5
-B343804A
-25752CC0
-3F276012
-7228715B
-7F61944C
-DCB8676E
-132DC654
-CBA2782E
-33016B92
-30F194E
-F2D953D8
-15A92EA
-495D2D8B
-4366F311
-8F8DC099
-C4B2611B
-D90839F0
-CEDA9833
-5CA78F56
-5D5F4751
-7F37FE54
-5B8F6537
-6B89CDD1
-6728B0EF
-D2BED44C
-60293190
-F41CF0F0
-8BF08F76
-861F32B8
-2053AB98
-315DF7D5
-58BAE934
-F38B7C9A
-653396B3
-E2152002
-A4E66BCB
-C1E3F151
-AE7AF50A
-545F0684
-643CF8AE
-BBC4B464
-7B8F849C
-334A660
-3FFF02AA
-7EFF666D
-F80965DF
-42D34429
-B8037A02
-36CA2FBE
-539208E3
-D03932C7
-5C619FA4
-FC641E3E
-D01051F3
-51DF9226
-116CF628
-8055029F
-4A9130C9
-5A2701CF
-89251BD3
-52D99785
-B2C16C02
-83581080
-57D8A09C
-6D551FEA
-EE6334BF
-7D8061F0
-8556CEF4
-D9418360
-82DE39D1
-AA9CAE96
-8D3C1056
-8C67B490
-C7BA78F
-D46697F3
-879107FB
-88F4FC5A
-E7B0C68A
-3BD94FEA
-648EAA00
-22724D11
-B6F00ECF
-488584F7
-A104F52
-FEE79F3B
-689DBC3C
-2DFDA897
-411EFFAC
-546F5C25
-45562F46
-C17613D7
-40CD3300
-9908DC56
-5AE62418
-4A3C1C82
-A28631C4
-4AA65060
-5614DE71
-6512AAA2
-5AE841E7
-B04094A1
-AA8F8123
-593A95CB
-21919833
-DFFAC729
-106727F1
-273A2977
-85E6CD4A
-E9751C6F
-DC308E67
-40F7722C
-1D8986DC
-489D6002
-7A869A39
-6E02A88F
-A04E30C2
-B98C740D
-3672EB58
-9702EBCB
-2CD4FB56
-A0CB2C94
-47299608
-6BB5451D
-36EB4DEF
-763593B9
-40029F5
-9392B153
-777DA521
-3125CFB6
-E60A4DE6
-98B9CB40
-819091F6
-83D23CD3
-ECE09D62
-22EE60D5
-29A3F86D
-797C0E72
-1EC708F
-76F78D62
-E527F0A5
-F11AD3D0
-BBF11E9D
-5E944B45
-D090FFCF
-4B8F7B5C
-96ABDB47
-2F5379A2
-38FD509C
-F49D4D2E
-F5538B3E
-BAD3E277
-E9C9831A
-22D3C209
-CEE03CFC
-EB55F3D7
-C61B5224
-6C4E6ACA
-A63B52BD
-695DBE54
-3C68D8AE
-847F8449
-72B426E6
-95642CE7
-B021A768
-AB094E2E
-90D8A573
-D3BFF1FB
-460DD461
-EF32D23C
-868AEBDA
-6BEC2EC0
-34D18392
-6C9D6621
-6CE02624
-75E6AE8F
-B5BE7494
-A033B3BE
-EED6D471
-99D40A8A
-BC742254
-530DDD69
-77698872
-E89F0ACA
-39716DFA
-C811D562
-FA7770AC
-1F68B8E
-7D325ECE
-8CD870A9
-DE561FD2
-8D49A512
-979F1346
-CBC53E73
-E779994F
-354561F2
-ECDDE60B
-52EE9980
-46AC0C6F
-555C8C8E
-D382E1DE
-2A9A602B
-4F18FA80
-96068D7F
-D1E5CBFA
-957912AF
-DC0A3107
-77CFB940
-E7161980
-EB44FE07
-C1597F4E
-FFE737C9
-ECBD5506
-AF75488F
-6D0BB14E
-9ED0A181
-8EF54B6D
-4E69EFD
-9337A7B7
-A880D3A7
-97A5D09D
-FD9F77A
-7CECCBB1
-2869D0F4
-F1806C1
-F9FEB241
-7D368AA7
-FF972C5E
-FEA0C745
-CC1413
-DD4CEA96
-FC8C6CEF
-75727E51
-5A17C784
-422EDDB7
-6505031A
-5662B865
-D7848124
-A93A9AC
-D874DF58
-FEFDE7F8
-5B3E37E8
-5CDC346E
-CAAFB037
-BF2135D8
-C6977D49
-8D61C84A
-C6B1C620
-30AF013B
-B98B3270
-CBBE51A9
-43E26F1
-99534D9A
-11DEC7C2
-F3952B8C
-52900E87
-80D2B350
-838A2A8C
-F8BFC35A
-AF0466F9
-CCFC01C9
-C4A559B8
-5FED8BFA
-ECB87D1F
-7BF187
-4662AA70
-1274E59B
-41188FCB
-A769BABA
-38F43333
-D4645494
-3E464034
-6F3BBB27
-8149A2D5
-D3D96C7F
-C04CB115
-DE3B6C40
-B94FC85F
-E0E6291E
-3E22885A
-30D35E07
-81014DDD
-A40ED586
-A713CBC9
-7E0CC084
-439FE695
-F4094931
-C293453E
-741A83B0
-D9C2E5F3
-4E623673
-309436D5
-807620F7
-7DE3993B
-8F31B5E7
-F12F65FD
-66763A72
-D3606695
-ED7794EC
-8BD7EF5B
-5B3449BB
-D9B93EBC
-5CF89E53
-103CE7A
-A1ADA14F
-BD020E01
-F737C35B
-8695E1B
-2AAC416C
-43B6BBD5
-31036C5F
-E5A61222
-F3E01282
-9A93EECB
-BA874043
-1D010D4C
-3F45AF54
-662F04F8
-279C9BE3
-217787A0
-1D399000
-6669B218
-A8F4D699
-181ED599
-A584DCDF
-97A49036
-C5D4A8F7
-3C7351B3
-E4A7A0A2
-9A13953B
-A9649AB5
-E9B91DF8
-CA6E2F04
-F0B63E4F
-C0F55BF2
-38EBAE63
-8D8A619A
-1A798058
-E5C218FF
-8B67C799
-A81704DD
-2562EF33
-74B37ACB
-B2C84D35
-2E0EC87
-5CAC361D
-7FA10429
-DDC1672C
-3574275D
-A831D84E
-65339BB4
-4B936FAF
-8348EDC1
-B1802336
-601EDB14
-BB5E4EC
-48CE4DD2
-4CC93BBC
-E77987CA
-6348CFF9
-90830A68
-1BF0414
-C2BC8AF9
-3EDED4A4
-66B38B85
-CD6A6E08
-92B71F79
-6BB2BA9D
-B4EAF374
-5B723892
-C350B751
-D7A56661
-576B1A79
-C66D8E1D
-442DA54F
-ED0C819A
-809EBE76
-413B884A
-817EF987
-D76CDB84
-90F40F80
-2BEB3E69
-C2782488
-F07FF38C
-93AD0DA3
-C3E8DFD3
-5B804608
-9CEFF79A
-BC524335
-495E18F4
-7FEB37D1
-A8F15A96
-3AE50033
-9DC5D0BC
-D4A241D8
-8F3CC38A
-4573A224
-5A3DA58B
-B446C862
-69EFCA93
-83B911B
-CD50A370
-2E05D74A
-407D2B79
-AD108E34
-95EA144B
-EA3DE818
-7AF026A3
-21366692
-4D5B7972
-C7D14546
-B6EF2543
-48E7457F
-6947E018
-F6B2DD01
-9FF698B9
-EA11BADF
-741FB523
-70901C0E
-6A71C468
-8BD95624
-1D98077E
-EF7CE480
-21F44B08
-563A0A30
-D9165A
-7F8E8474
-219FFBE2
-FE1D6D6E
-F7B8D66C
-CA49F15D
-C481484B
-85D5310D
-3FF17830
-8F69C740
-590A3DE5
-867A85CD
-21C9758
-2E625FDE
-7CD5B8DA
-8BF43699
-AA17B723
-C0DBB2D3
-617F6819
-4D6BE357
-A2D89B90
-C4B19255
-748BC770
-4BA5F90C
-2AB43820
-CB75746F
-FE7480E4
-239B7D6
-2567653F
-7BD1399F
-55A842E4
-572D6A8D
-CD1600C
-6C880525
-1C18F7EC
-C9C74D53
-AB3AB21E
-F5EA5F69
-F6F730D5
-FA454FEB
-978E940C
-64D4DE80
-2BB0D31F
-10268273
-D060E295
-85A74B89
-A7A3AE03
-7B8883FC
-D0615497
-9D637210
-105C40E7
-F9FB184B
-B4E67A79
-373530B8
-30E04C2
-47A1D75
-A6A67936
-1B789F9D
-AAC21CCB
-E00A8B8
-517BDE82
-B1004DA3
-3F745A4A
-8FD0E21A
-529E48CB
-BE6AE2A5
-DFD7DE91
-145FF288
-2B1AD7B5
-C2AE7259
-88B84292
-373D8796
-5E4B4FC5
-971622EA
-3C6F40B5
-5FBCF21A
-144B7DE0
-C588DF6D
-804B7F0E
-4B6714FC
-C1C2E61
-1CB08E0B
-6355112C
-1912B0BF
-22263C9C
-954A5DE3
-4520505E
-459D0661
-70FF554F
-F1FED0C0
-D1F602A5
-AE5D07A5
-B86AAF05
-452536BA
-B00C120F
-1431099A
-42F0959A
-FF1EAB1E
-9FD43C93
-5076B428
-ACB3DAA
-5D0BA50
-16E00180
-90E21E72
-D497B8D8
-8414A6CD
-B933AC93
-18B2DC20
-5BCC1468
-101CA9C
-5AF125FB
-E65A4FBE
-A5B927FC
-A8163208
-CBC14C7C
-A00E7C50
-62DDE328
-3704BAEC
-B354A1A8
-1FEFA49E
-BFA928AF
-73EBAEEF
-F21664AB
-B82DC773
-397C3EC7
-6DF7A081
-7B57E52F
-43B47A0D
-4BB8B26E
-748CD62D
-1D057255
-3A01A19E
-ED35DB9E
-B9192006
-9DAAEE03
-6F88BC5B
-41F22AAE
-DAF9FD8B
-8A8D06B2
-99E4A71A
-E0E5802
-AF2050EE
-35D07382
-3CDB4F32
-1587CDF9
-29E0BC17
-F6641B4C
-35557A67
-20B08FD9
-F89BE3B8
-994D534E
-5084DC42
-B49E2B0B
-25AD0456
-B05DABB3
-102657BF
-FA7342E8
-508B7BD7
-FED0EFE6
-5EFAD4C0
-15101C27
-420BBBF4
-1783F9D0
-CA890820
-BD3539D3
-578ED490
-1DA8E967
-134F8B74
-D6C5A224
-8C8B1F06
-8977D881
-541937F5
-9013604E
-4B54F163
-A9030FBF
-A9EF1A9C
-CB29FA97
-94A3F001
-4069BD15
-C0D5E43E
-4E17F81E
-90FFEC8B
-32D0B0C7
-4044EC4C
-7D7935C3
-BCFF474A
-9AD1BF76
-2ED2D299
-263F8852
-4073932E
-BEDCC036
-7A548119
-ADF45572
-7D8C451E
-465569B8
-CA9E87A4
-731803CD
-1DB59C5C
-A90C6543
-A22221B0
-173A0706
-E040DBBC
-941E546B
-5503B9D7
-CC5D8948
-F7FE8FB5
-1AA3AAD0
-20229A2A
-82CC4C33
-746BC086
-E9F90D08
-2B356E1A
-14897456
-D9BC34FB
-9056CB82
-1DD450BD
-BF64BC9A
-166164AD
-94363CB2
-ED715F84
-CF4D9ACB
-BC0EA0A1
-46E9697E
-72428536
-D9569B91
-2B84C8EA
-D4CDE0CD
-E439EA2C
-E19B71D5
-E45E8566
-541A4655
-845B296B
-B2E478AE
-1A35840C
-C94F4E9F
-A7AB9164
-AAF8D027
-82252CBF
-20106216
-ACC1C08E
-57E445D9
-FF68B8B3
-4DAE2000
-B5A7ACEC
-1E9BE78A
-88DC5BAF
-C8A00837
-210B7F85
-E2A072CF
-144DA567
-C6467799
-4BC0A056
-C60819E3
-B2B1ED7C
-C0ADC696
-56F0E8AB
-8D538C1E
-879C3079
-6EE2F434
-7B9CD649
-94A30F21
-7DA211F1
-64035D90
-916A9128
-EC9C52F6
-92991BB2
-53F4309A
-5AA71420
-F9B67D20
-45706BC1
-E71E83B
-B091D34C
-BE56577B
-7D3CE09C
-1A3F1DD2
-F90362F3
-3FD83E38
-E8274EA1
-CDFDF1C2
-62FD4CFB
-C3A1DB75
-15E3C709
-B7F81AF6
-E58D41BC
-5376E522
-698DCBFB
-C76EBF96
-46682F6B
-E5C0AE29
-50259284
-91A4E263
-4B03C104
-4B04D974
-914FF9B5
-783CEFF4
-4B232A85
-303E2F77
-6E902ACB
-8D630D23
-9BE394EC
-461237B1
-22760BF9
-B1F5BDC8
-F8557002
-9CA2BA41
-76418996
-B734B9D6
-C5D4B1EB
-59F49A63
-4F9C6BB0
-219811DD
-CB536800
-BDAC548A
-824F1A42
-5CE7C68B
-AC7A5DE8
-86D89A36
-49E127B3
-EE0E8BFB
-4997152C
-A43493BE
-ED7179
-1049E699
-431EBDAC
-379BEDAE
-FBFB2AF6
-72C255F
-F37B5D5C
-2D15F748
-7759FCC8
-D6730ACA
-52AE1913
-D709F4AA
-581518C7
-BE85DA4D
-1A24C4D7
-50ABC4ED
-7B50804D
-194F2CD7
-A56680A8
-1520F41A
-A614FFCF
-5F66A0AA
-46877891
-4926E937
-74E93C8E
-62515A1D
-8F3F6DF7
-AA4D19C5
-8057E286
-8C90FAB5
-4AD3F2DF
-D953B36F
-37D20E08
-644A2AFC
-5CF19FD
-8C9431A7
-EEDC46C5
-F86BE6DC
-6C12ED6C
-5EDE86A5
-7E59C795
-5EB83E6
-6F36E55D
-D9E35BDF
-CC7E1D72
-21A42C4F
-332994C1
-4E460BAE
-C9A0955F
-C080A0A0
-B2013D50
-E6CB68DE
-E9C759D0
-4A1C7783
-D1028E6C
-CEAC9773
-189398E7
-B57C20FE
-D0D3E05C
-6FEC2AAD
-17643391
-1291E620
-978A16DB
-37BE98F1
-9F773872
-1BEB32F2
-CF3DA84
-3088C11B
-2BEB338A
-1F308D75
-DD542BFE
-C568D953
-BEFE8926
-B9E201D5
-EE6FA353
-826FBE38
-CC867513
-A00D32D6
-CE9B8989
-8D3CA53C
-1718DB6C
-CE2AABE9
-8FF0C7CD
-DBEC0AA6
-E75EC71F
-FF266269
-3D7D0B68
-D606EE1E
-56F86B85
-6B67916A
-B164B35A
-D4E7337D
-D7A68BBA
-A39300CF
-D7C72CA5
-A32F6380
-385F8023
-1FF83E95
-F4E55989
-6BED2F68
-C714269C
-4D2E9366
-8C1A2FE6
-84756541
-6D353F18
-741B7419
-3BE84DCE
-8FFA851F
-FCA5E50F
-519AC53
-2E36273C
-995F9DF1
-A1A165BC
-F5E804CE
-DD395EDB
-7B2D8A34
-FC3F84B1
-19EE5FEA
-EB2CA6C2
-866CE073
-B60059C0
-35395446
-BD2B582E
-C6E73349
-634D409
-B9AAD6A6
-81B516BC
-6933344A
-806F4464
-22AA3AB2
-A6FA442A
-31DB2D66
-F64AFBC0
-480C5B8F
-8CE98937
-F8BF9101
-395669D0
-A560F096
-C8A13D26
-9C62AC71
-C0EA2E1
-BDC5E76D
-51C79BBC
-E84416E5
-30CF1A91
-E87F3E55
-6CA51768
-4D09690F
-D488F996
-ED850E82
-510DA36B
-709F9D1
-A6AAD3D4
-E0C4B7BB
-1A581776
-2F11B35C
-748C7EFD
-A2F0722A
-A8C6D678
-915B88D8
-42E5FD90
-25B58AA4
-8FF166C2
-B5FC3947
-6427FBD0
-E1C01EC7
-91FD1568
-FE570CB2
-BBEE870B
-811FA63F
-BE89954D
-C83ADB4F
-C1B4D237
-65AC0055
-5E2B279A
-3FC59820
-B1634DAF
-AC02E4BB
-B9D8412B
-AB22C318
-9E528E95
-F4220FD4
-D83A7E2F
-7C013BBC
-23849524
-BEED0AF2
-C9AD6213
-4F367F0B
-8FBA0438
-EC5899D7
-A4111441
-2D18DAF5
-E7349E7E
-57AC8D6A
-A27E98E3
-AA1A992A
-5E7E0E0E
-AE4AF437
-20A80262
-AE20A4C
-2CA493A5
-FFC756B3
-68045EAC
-A56BE46A
-7B3EDB89
-BF17C1AB
-445B3851
-FE16BE78
-23D0640A
-694D05D9
-D76F0407
-AAC3808D
-8D2609FF
-BDBECF1E
-D6074958
-7EA401E2
-CAD394F3
-4A67FBFE
-A2A7FBED
-59E0B573
-CEFE2B20
-2BE6EB1
-85FF9E57
-42C7617D
-E9E01845
-43F02D16
-DF309F8A
-880350B7
-65CE706E
-CA6A2B8C
-5C38AA9
-6C60FA8
-42BAB35F
-9453366B
-D5864332
-A25A3164
-F32EDF79
-C757635D
-F6712B29
-4C43A3E0
-80D02D7C
-A9DB16CA
-55270F91
-3FE8F468
-AB0C835E
-DD8A2F64
-D9551C26
-4642684D
-69D1935E
-9A7A2413
-E0BEC20B
-14724D4
-B4A43613
-559418E
-1E4A709B
-A32F1E7E
-EFEFB7A4
-5B26F487
-E6CBF46D
-7139D0C0
-EC214DFF
-7045BA9D
-A9AB902A
-CAE7661B
-3B50F210
-A065F80E
-B353DA84
-E6538D1B
-965D76CE
-E7F01488
-A1E57BCD
-76920B33
-4EC379D2
-43909492
-8F621446
-C9033570
-FEEEB7B8
-E6FFA222
-E8CDDAA2
-3C5C0252
-A63AF91A
-D545D3D7
-28ABECA4
-EA14F18F
-23FF43B0
-F9F0198
-24568599
-71F0C3DD
-63975EB3
-BF3AF93A
-7B95B627
-9B0D74D5
-20967FF3
-A621FE0C
-6CFF968B
-909CF3B8
-79B5DFFF
-FC87A4BC
-5BB19840
-DB7D8F85
-D4641400
-54449140
-CA93FF98
-85668EF3
-C871B119
-58D44D70
-D93434A8
-453FD827
-906A01B7
-FD446B38
-CB63F172
-E4B0DFD8
-D4FE1E63
-C78583A2
-1D7463DC
-7D69FEE0
-93EECB26
-337FCA9A
-5D5D7447
-1ACDDE16
-C4CB8D59
-F178B39F
-292E3426
-7A1A4318
-DCCE0A6D
-EEC1FCB9
-3B264208
-F9D7CB6
-9A23DA53
-58B2B3A4
-654072EB
-6CA920C5
-E145E547
-F5FF4A8E
-AB7C553C
-2A84E62D
-6F6AE7B2
-322DB9DE
-17E670D3
-7BDFB473
-7CD05987
-5B12A205
-5E9FB325
-542A1478
-FF46384C
-69DE91C9
-65B4C13E
-78DA8BBF
-D85BC864
-3882BAC6
-444A8F13
-886DBD37
-2613D1CA
-7CF2397E
-513D4563
-1C57D4F0
-32B75B54
-E18B4953
-B59C2B91
-98F11972
-594CCC07
-39BE7B96
-B14E5D15
-ED093697
-953DA37C
-6FDD4B93
-8D678AE0
-8B149A9C
-B9ED6AC
-E4FE210B
-44EB15E9
-805CE5D6
-62FF689B
-E6C011C6
-42C85768
-EC22FC81
-16858F65
-6A6BC5F1
-E5090FDE
-482D0881
-65EAB7D8
-620494B9
-6160FAE2
-542E102
-81BCAF6F
-C31AABA5
-BEFFEDB4
-A802765
-68A8ED5B
-A47FADCE
-3EC1897A
-4DBCCC04
-83EAFD50
-6B8E05E7
-4FA1891A
-9C2FCD23
-9ED7C877
-15FF9D1F
-67DE6F18
-D2932D4B
-E4B31601
-60B47713
-C1326724
-1F5FD6C9
-2A54C06B
-599854F5
-C2121D8C
-2D0FAD3B
-762DB289
-CCE2E11E
-622AD608
-29836424
-C9F1F838
-4E0F9445
-16C53328
-B9F2FC2E
-28FFB831
-7C216796
-E065DC2C
-561328B
-92EEB73E
-BBC5AE83
-2DE49E4B
-BB32B7FC
-E59D7B63
-B3375867
-5523615E
-5532A7B5
-6890882D
-21F33D70
-EA855CD7
-CBB7B3A1
-DD9C122E
-5CEAC143
-E9E4332A
-6F658BF6
-57E90D54
-715AA7A1
-DE7768FF
-D8A3302B
-1BECD73C
-AD442F70
-EBBCB63
-5D25E0FB
-EF9854C7
-DEBB6E96
-61591E99
-BE06EE6B
-F74EDD0E
-124B1712
-45833671
-1227307A
-546B647C
-9D2398D1
-DDB609E
-EB68EAF7
-F05AFA0B
-A6EABBB9
-60B5FC76
-992D25CF
-A99743C
-5FF72996
-E3D84005
-F47AC3D6
-D92BCBEB
-3AD6BC2D
-399AE49E
-FFD7134A
-80856732
-8C92A116
-D23F2A7F
-1C1FF7CD
-7E97215D
-63CE5EAB
-1E3D6441
-8CC7E1E2
-3144CABE
-1B369565
-E681B9FD
-3F72A224
-3146105D
-68639F13
-61E4A798
-CF28AF43
-F18B6903
-F4D16333
-557BEB41
-F5DEEE8E
-41F036AB
-D0DBBD23
-E8E240CB
-8FE50644
-8EF8CB38
-F8D6EBA6
-580EDAAC
-25F0FEBF
-1E09176D
-CD156787
-8198153A
-3D5D3DE3
-5132C51F
-4B39B7FD
-15BAA338
-AC2E0CAE
-91DC2332
-3632CBA5
-2AD744AC
-EF31B613
-6A9D8019
-17DE8C90
-E5CC66F7
-E81411C2
-C5B6931B
-E8CF72F1
-ABF2E66
-5B7DEA27
-340E7880
-2B4ED84D
-F6E86748
-9C181F92
-55DCA269
-1CEE9C9D
-1DB0A271
-B1BB73B1
-2B802754
-596ED430
-25F4A422
-E186EA6C
-A0793E1F
-B54A8F34
-4EEA557C
-A8085CD6
-276D7E7A
-F711A6D4
-2534D88B
-FA8CEFBD
-A7E9E1C7
-EF6F2E
-4620FD63
-7955C107
-50E0A968
-81DBA8B6
-92E0F3D4
-C78C01F7
-CFE5AB0F
-C290FC3B
-F12CC1D9
-56A9B1DA
-69AC05FF
-964D8EE
-EB198C02
-A3D9435
-30D0BD52
-2A1A5868
-DF336813
-14C97AB3
-BA6717D1
-43FC05DC
-32A6FFBC
-C47276AB
-DECB3B2F
-1511FAA2
-155693C7
-E5BB37E4
-CB20ED97
-FDFD4014
-FFB25A3D
-4F8B2CCE
-8EC8D538
-A60DDEE4
-9E6196D0
-8895A4D
-A2528B98
-D02F59B9
-47662556
-4FAB84CE
-6C7FC2FC
-F351CBF4
-F1917707
-B1F2737C
-B46CC768
-F87757B9
-A24CA3F5
-74EC8337
-C46290C3
-77BBC380
-1B3087DC
-C816F73C
-6E2C562B
-27C3E900
-4FB423EC
-A77B1E37
-51063C80
-432108D2
-11F0367D
-1D08F91D
-D56068FA
-F259DE46
-26CF3619
-6E6AF5EC
-10AFB2EE
-14F925E9
-5382204
-9F482CE6
-90B0897C
-C768AA0B
-654ED88C
-AD60966B
-8EB54FB3
-26275630
-A1C50A7E
-21587F6E
-9496FD06
-4B768A3F
-1798404A
-28C6B4D8
-5B579E3D
-C79ECD09
-EC63FA6A
-162A0135
-7FB7DDB1
-A0167E99
-196F14DB
-CCD227F3
-3FB917CC
-A3D30D38
-71874379
-E9E489BD
-5DA989C2
-4F7C8E1
-F6E0502F
-F8445D16
-25CC5FFA
-FB06FF63
-CFEA3C99
-E41A8123
-6A5A256C
-D7B67156
-50BDCCD2
-8165541
-F067F327
-B1E17258
-6901F3B0
-8B8CA0AC
-CBA88A2D
-4736E05D
-DD5AD020
-35B501DF
-73C67F6F
-F2C513F
-E6CF7C2D
-E6A85B1B
-8AE4F7E6
-1ACA7CFC
-BCFCC182
-2930369B
-642DC973
-990B6772
-681EC185
-164AC235
-9C676AC8
-B200AD7D
-F13B8C8D
-9D22DB12
-CE95663D
-CE956E42
-29485F4F
-BC5D5F8E
-DAB561EF
-C4C15BAA
-77B9192C
-86E8BF86
-5933ECE
-E50B93C6
-F8B0CFB0
-3286711B
-DD558ED9
-DD043899
-4AFAB231
-637BB2D7
-87036D19
-9A30430F
-27798B63
-4D6E407D
-CEE251F5
-ADFFB995
-B5C885B2
-7DF6519C
-6EF51C85
-B95DAF30
-65EA99E7
-772FBB19
-49DBE1EC
-F386A79B
-EECD2F55
-8935CCEC
-BAC4C120
-C71F82EF
-2DF7E67D
-9BA39901
-9614A4E1
-C6304402
-236FC777
-D47A5719
-8098EC85
-799E34F4
-896EBD9
-BAB10372
-32ED359C
-6F9F763B
-9D517447
-22B55AB9
-8E6F4104
-15BEC5D3
-6252E010
-23B5E8E7
-D0B113BA
-965C42E7
-F2A0C19A
-24CB582E
-1F449982
-2E805DF0
-851608AC
-755273C7
-3529A161
-6395258D
-C5BD7D0C
-27BABE75
-E1628E4A
-47E5CD77
-EE797B13
-AB11893E
-2F65151B
-9CE2B20B
-233C28A5
-749A0C91
-846BC1E1
-8C36F8FE
-1489CF6A
-70FB6BE0
-D0A84133
-9734B9B7
-FF166A04
-D118033F
-BDDB2D63
-6F6691F0
-44FB36D0
-EFF2B14E
-AC02C863
-ADFD2972
-905F6E84
-7C0008A8
-4A043A53
-D104FDC0
-1687FF25
-E6CF8FCF
-120143AE
-53F92C72
-19E2E798
-EE8C6B94
-15CEA57D
-C8968EBD
-D50EFBA3
-A8EA5FE1
-E2D073FB
-B4EE195F
-8928A91F
-6B9EB970
-C24B509C
-5D340563
-85FC3F3B
-934FA012
-A2AB8533
-A6BD3187
-105DF0E3
-243ADD05
-49C299EF
-7A42F84C
-C90A1935
-3268B298
-CFA3B2EE
-470C6457
-E579D2C4
-BB10428B
-78D10FE4
-11F21813
-8424CE28
-EA2B114
-8239463D
-9804414B
-44B4FD1D
-82D50F88
-10AED1B6
-E4768ADE
-E7235A66
-C8705714
-936532B0
-15C63108
-92A91B17
-154B2415
-9BF0D15C
-5F451388
-1DC102A8
-96CAFC23
-B076C0DE
-3EBDCC3D
-6B2EE523
-C6777AA9
-F7F48C4A
-B1E8ADBD
-FA30AC90
-5173D22A
-D22827A6
-6504AED6
-3115E6F6
-E8937768
-C5ACC0E9
-366E15FD
-AB81C84C
-C27AFE96
-7361C8B1
-613A0811
-595F48E4
-1619DFA6
-233D2474
-4C174E1C
-E7DCC63F
-308FDED9
-502A0AB0
-C5004E90
-B7FBEFEB
-918A77FF
-F7235A04
-5CCB8B7E
-3BA4B1ED
-32F47DAC
-FF7348B1
-996C8E7
-7203F1B0
-70583A2C
-4D8046A0
-551119AD
-BE5B31AE
-35400CC7
-E8ECD409
-D1C104E0
-1A0858F
-F26946
-458C8B3F
-E8D66E91
-2F3F6384
-B36EC71B
-289CD4C6
-6CA9E35
-B198A8B
-816873F1
-346D66C9
-BD906E97
-802E5969
-261BBBD1
-9D7605C6
-72C2CDE6
-6C8DBDB5
-D7C8DD7C
-F43FB2C8
-A9F384E6
-78FDC918
-6D20841A
-20755F34
-F4C6AF99
-19393B53
-A525AE84
-CE881A38
-3D075300
-9B0E4DCA
-7EB7E7A1
-4C4FD44A
-78483ED6
-32D9D894
-1CCD379A
-EA5FEB4B
-F7E001D
-44FA69A5
-E99F66B6
-9E16CD0B
-CD098C41
-6DAAD279
-5FE50411
-CC855E2
-130C6563
-356CD9A1
-BFB318B8
-2E963C0F
-DC5A046A
-FE16FB
-A599857C
-F72FE561
-2914E4FE
-B247AE8D
-6A6F13C0
-B1052C98
-8086E53A
-845345BA
-D43D5F7A
-82B30F5E
-4206EB1B
-89CCA1AE
-86289F6
-567F22DE
-25624C58
-6A78EC3F
-7EC32D03
-8017213D
-3A141336
-D1CA4E6E
-FA84C2C
-FE670E0
-3238E01
-18DF1794
-A7B900AD
-1FCE47CD
-14EFDCB1
-C21B04A8
-4C3343A2
-E5E611B7
-ADD06EF0
-32C81695
-201A9FEE
-BA8925BB
-5182EEED
-7DA4917E
-CC331235
-C304ABE9
-C2A16075
-937E1C4C
-CCA0184E
-9DB6C45A
-3F2A79C9
-151B469E
-162F22DA
-D955D54E
-E857CC0E
-FFF2005B
-60AD87FD
-85512214
-E0A506A0
-FAF1A145
-9DA17F03
-332D26D1
-9EDF9643
-7BBF2D9D
-3414FEA0
-A8FE5964
-D4841879
-3AE4E5EA
-BC6B6D60
-950F4693
-70FD0254
-177C7A1F
-635FE5B9
-C0C5B6CD
-15D1D22F
-BA495903
-CC100F38
-A5F1E225
-5AB4584F
-AC4731FD
-ABB04167
-A0E153B4
-5982BDA9
-8E2EE3AF
-D635C631
-7C6154A2
-9F0EEFEE
-429B22CA
-B1346D4E
-6B21663D
-6A7EDD8A
-DA34A355
-217132F0
-683BA78
-9CD46320
-A5D3BC4F
-3194AB03
-DD66F958
-E7506C47
-17EE83A2
-4E4D80A0
-EB56662F
-BE889C58
-6F5F6745
-2A05C12F
-13D266A0
-3B2B18C9
-EF435E02
-5604DB7F
-D35888A2
-CCC34421
-55E24355
-7F607F34
-E493720B
-C6A492D7
-7DC6A789
-E01474B2
-97D35C32
-71F32335
-D3083D7
-2327D424
-35EA4BA1
-F5B20C6F
-3ED28FCC
-453A76AE
-192A79A6
-2E64285D
-A9463AEB
-374E22E0
-92A5CF8F
-E707F8E8
-B8E2FF36
-E8E959EC
-91D9796C
-F03960F6
-B62467FA
-8836A487
-6418A93F
-60932160
-3B72687C
-37BBD7CB
-1001C76F
-201999EE
-5955A1CA
-925351D4
-767540E3
-570BBF27
-A073D4D8
-FE96246A
-44784995
-232C0150
-AB7BCE2
-D47BF099
-BFA6A422
-70F4BC01
-C2139449
-F9ACB817
-26657111
-13263449
-7989D26A
-2E972B3D
-2F1C1C6
-930E479
-23243FE7
-BA7DDF9C
-50C8AB43
-952377D6
-4C6C2B3A
-BDAF48F3
-1C0BAE6E
-7F6A8C04
-F529B9FA
-9ECA4162
-342E6562
-9BD5EB52
-A14DB3C9
-14B1DC2
-4E1BB6D1
-9A1158D5
-73F84EC
-685BD9F5
-8CE72161
-5F116605
-BA861D43
-A7150AC2
-391A105B
-C8D798E8
-16633750
-33B29C4C
-54211362
-34C2D5FB
-CA197734
-A635990A
-4E606FD7
-9D56673B
-89976DD5
-5F2D2794
-81E95955
-9377829
-5DED53B7
-FEAD5592
-1CC6419B
-BD3A45C6
-65FACDCA
-7EAD0EF3
-EB856702
-D857FA75
-3B92DC0D
-E66AE58C
-51912618
-C63C75BC
-ED05B556
-17EC2B32
-9F692578
-C706059B
-D88D5576
-C2661C7B
-6D7751C2
-119292CE
-418700CA
-2A2BC3D8
-CA20D341
-8A8F325D
-D4A2DC8D
-959FD62
-67883F8E
-FBD3686B
-6B862363
-F8C13880
-FCACA893
-8215D90C
-67567E2D
-3B501BED
-7AFBFAF4
-2EC3CC34
-B360BFD9
-716C5E9A
-907B1432
-E253CBD1
-4DB52F87
-6A37A21F
-C860A6A2
-72DFE5D2
-84E0705D
-80DDC195
-1ECD4E92
-2D2035A1
-B10A5B53
-C9AA9A79
-E999CC8D
-C8C790EB
-F7629DFA
-93158872
-FAB6E7DF
-58A0A3D
-6104EAC7
-2BACDD14
-A8E3DE88
-AC4E16F4
-F7042189
-5AA6D923
-F491667D
-C769767B
-46EE7E69
-CE4BAE4E
-FA1BE581
-2BF14278
-5356E813
-6225B503
-D33A6F26
-1A629247
-BD844A35
-E33ADFB
-EFE720D6
-3D49752E
-AD542CEB
-EE36C608
-99FD833C
-BA893EF7
-47E4A8A9
-B269C1DC
-CEF39BB2
-91FD5B03
-C02E6C1D
-29A3817F
-70894875
-8C851D1B
-8446E920
-8CBAB8AE
-D9D7B185
-97987DFC
-ADE83493
-4CD1FC4F
-1D82738C
-27665936
-CE3C907
-990136FD
-E1E40CF2
-A3E15CA6
-DB7D4E0F
-D8E87ED
-FC23DA2F
-76A6A0C0
-1C7F403F
-380BCEC9
-C2BDE917
-74145443
-14C0823C
-8D73C415
-BD7B9DB4
-C83449E7
-364D21C7
-7F01C97E
-9ED9F208
-51417FC4
-D557CFF2
-5ED6B81F
-BC0EBF41
-608D56CA
-60AA90AF
-8FC8A8D6
-809BE4D9
-47CD9035
-8CE71201
-B442C067
-A380EF4D
-7B74A914
-513ADF78
-63E5C752
-6D4F2B4B
-82717D99
-EC19F48C
-7D0D1EC5
-944D936F
-358B8D1F
-D3A7E17D
-5E6DFD92
-D6D2B538
-133AC914
-22C4BFCB
-A9F4ABBF
-7DDED93D
-6836C5
-3F10AEBF
-71713080
-A1868A02
-EC341DE1
-33D409F1
-41EA5D35
-47F18F89
-7C062A2E
-1C66DC90
-D5E11362
-FACCDD77
-D96EA1F2
-31676D3
-B00B9D1D
-36F80278
-754F427
-3D8C40A3
-D1FB426C
-ED4869D3
-AD137726
-9704A7D6
-107A0E2D
-AAD92A50
-58019B5B
-F6FD55A
-E876FBF7
-13451AEB
-A530BF41
-11FCB24D
-EF5D7F1B
-BB65E3F3
-DCAF1904
-4262AE51
-8C2318E1
-96E7A13F
-DDA281E3
-7B44E7BF
-8048EB55
-AFC8D749
-D3F7E592
-23FF8DE
-105E2923
-969758CE
-B1BF840D
-D301EDDB
-42A3C6C4
-2C934ECA
-B2FB9ACA
-452302A4
-C96F49CB
-D7342392
-48A6D82C
-6B831657
-1A6989B2
-312D282B
-9AC1D170
-3FB3070C
-D83B178C
-D894496D
-5FFA91E8
-436E970D
-54DC6812
-8CCA890F
-96971388
-9CED7192
-216196F
-BDBF8734
-441B7DC6
-8FCB2D4
-1C3375E3
-19EE1338
-E8BD4F25
-D65CD246
-85157D36
-34A4CE5A
-BFF7BCD5
-41DD5123
-D92D0021
-C0265B3
-652BE05B
-7B31FC27
-E8BBC732
-E5DB7686
-2D1EAFF8
-2283884
-CE0E4257
-1936BB27
-6ED44FBF
-476ED2B
-C249E9F6
-21C0827C
-8DA28ECA
-707E075B
-10EFDAF6
-3DF4B474
-24AC5C3B
-81F8A453
-8E1AF272
-E69E1816
-C40F1B4
-5AF2AD1A
-C1236EE6
-78507240
-588C4851
-385396C3
-BE2210DE
-E8FC3FE2
-B9E7C8F8
-A33939
-B9E8F7DB
-F7DF1BA4
-400E6C2F
-1139C2B3
-8195BA65
-A6052E5F
-29E1F01D
-512ABDD6
-ABE172A9
-350BB8FB
-63D89399
-6C7CDD2F
-F6E20A15
-36947843
-7D26A79A
-133DF31B
-AB375C67
-35D4F0E9
-8060F5A6
-94893A4F
-1B4E1612
-431938A9
-F4F22D48
-E83BC91E
-98D9DF02
-7CBB518A
-947735EF
-16DB6C38
-7BBEB95B
-393A60CF
-6984032C
-F1879BA2
-F014440B
-61CAEF50
-F9BAA90B
-6D9CDB7A
-4A4C3D3F
-DD498DC8
-E27FE395
-AEA01257
-15FEAA99
-61A173A1
-28EFFD56
-A27152DF
-10C613A7
-47AFE324
-5B4D4B5
-AF67027D
-11ADBB9E
-F8B22312
-4A9C0C1D
-E94F39C8
-9AA4F0E2
-4C394A49
-41ABACE1
-6A96270B
-171F3E81
-F29DB470
-A9E7F67E
-6B445012
-B53EFB86
-B0AB92A
-484432B2
-7C789E2
-116B012D
-5A5434DA
-83DD29B0
-418637F4
-C9E1FBB7
-FD84E0E9
-BB44A4ED
-4847C699
-61807BB2
-F558A9F0
-264F9191
-697F6915
-EBC115CC
-A1604C6E
-9CD73651
-50ADAD72
-DE3698D8
-DAD728B2
-58F5527
-C58A4754
-C8CCF740
-A5CD4E0A
-966E50B5
-6DEA9EAF
-66DEDD5B
-CE18EE1B
-E0293294
-3C0C586C
-ED04E099
-A1BB7722
-78AF5367
-3F0FBBB7
-4F623EEA
-E3E1A85A
-3C8EE1B0
-D2851D20
-F07248A0
-713EBA3
-8CCDC87C
-B5ADE0C6
-54DC4354
-F7F43DE5
-AB512848
-69136DAC
-71CEFCD8
-5F264F19
-D39D50DA
-A184BC23
-57F38C31
-34DFEB30
-6B39F755
-60F7B6C8
-EA7FF406
-914CD331
-F4A15FC9
-68DB20A3
-6609D547
-18BD6EF6
-F5DDB763
-9E2C6236
-A9C0CD72
-EE8A864E
-FA9A7891
-DCE7F5DE
-4E5A9B63
-FBC574F8
-13C26C91
-70A2AD7F
-9514018
-7786A6DF
-708A442D
-8AC98261
-57EC9F69
-D8B92F1F
-5525E8BD
-CFB927EB
-47BA617A
-4A71DA0F
-9632F7DD
-4A00D653
-3FC603A6
-A34C3C9F
-EDFCB326
-BA31E996
-4158D5
-888F01B5
-F001473B
-D67ACDF1
-587F7E20
-EC9AFA96
-6942D697
-76FEFEE9
-ED260881
-53D50BC9
-43FAA199
-DA4F8CB2
-D7FE8FC6
-7A659755
-394C88C8
-EFA3AFA
-87710DA8
-DA1FF12A
-C5D4E7F8
-4F0A47D7
-E7C2A799
-EE894D65
-20E4FD0E
-8E51626
-17BB7611
-E48021B1
-4320CA45
-5315D225
-39684701
-3E943281
-B3B7B298
-A63E5C66
-11F2EAE5
-2E339781
-9BE79114
-187467D
-9479787B
-565D0658
-B43DBE73
-67F7EA80
-D1962413
-BF4B89AF
-AC03F363
-1587941F
-B7A14BD6
-AE1A36A4
-BF710690
-8009F7B0
-FB37D608
-58934215
-327E7B3E
-A2BCED7
-57DB9C90
-3E7E56C9
-E554BE2A
-6B6273A0
-766F5A68
-503BD141
-586BF1E1
-AF75978E
-D93FB741
-75268390
-BDEAB299
-9871DD6A
-9C042A7A
-4CED46AC
-706B559E
-9C9CE827
-EFDAEFCB
-A1AA3846
-330AAB65
-602F6FCE
-DF14BBD9
-8BEF0FE8
-CEC4AC8B
-28456573
-95AB0149
-43E11079
-B50D7970
-6F8F89C6
-B96DCC6C
-E114C8BD
-CF3F36AA
-E02901C9
-8B452A2
-8AFEE7A2
-FD7C3D61
-4DA46DA5
-BD5C204A
-83FB677D
-42615EE0
-3783255C
-9FA48033
-270F0FCB
-157E94E0
-CC89D359
-715FCAEC
-32EF8DFD
-829D0BCF
-E4FC364E
-A629CB9D
-7CE1FED6
-D6E9FEEA
-24E55CE7
-8BB2DA23
-2FAEBFC0
-AD6EF205
-96142124
-6891653D
-C5061A39
-9EA7F89C
-D2CA9BBF
-544A569
-E908D41E
-EAA11FBF
-4250EAF7
-6A5E60CF
-5F84A53D
-4324D154
-57320611
-DC3C692F
-24685A97
-40F011E3
-25A224E
-3712F01
-30F1AB94
-45F92B8A
-450F8D4E
-F3EFF92B
-EA54D0BB
-7E10A58D
-D51BDF85
-FA6E7358
-A16E06FB
-CA158DFF
-9AAFDAD5
-AA48F649
-A4A78E50
-F2F73CFA
-519FA6F5
-32933CF5
-9E55F1C2
-806019A2
-E56E0B7E
-5F598AA3
-564C6D40
-757BDE5D
-30757BFF
-B906BD37
-52C6C503
-D2B00C73
-5969C7A1
-84FF193D
-E668D8D1
-71E66078
-A200D7C6
-6585828A
-FF8864E8
-B9EED36
-12C9F3AB
-2F2C4A2D
-2998FE0A
-A1D47491
-59463A75
-1347C537
-77000037
-E6AC6FFE
-C74CADE7
-83B75335
-767A69EF
-4248CAAE
-1DAA4A34
-BBCDEA3E
-CE177B23
-59449B11
-A9DC563D
-85589ACB
-8926A959
-CADAB503
-6A1E5AD1
-E79EAAB5
-9C25D798
-B4750BE3
-249329AF
-724F7831
-F4D2E094
-CD605F43
-CCC933E3
-4231A56
-8D15BB64
-A7B1E394
-FF2B04CB
-7260C6F0
-A483E58C
-35E5FBAC
-A3D734E9
-64BF02D7
-24F8B625
-FBDA78F6
-6FA335D5
-5CAAE8EA
-EBE22B69
-9BE5C3B2
-81028FF8
-E20FD2C2
-CC8506BD
-E079C912
-BDE0AE94
-AA4AD182
-AE682162
-AADAA077
-C757CE81
-E4BBF694
-8ACFF53D
-D1E85D5E
-E29E9979
-9DC46E06
-A8FB412B
-CA71D109
-987A6F6D
-E5A13D87
-BCF3C6D6
-DA5A6320
-E78095AF
-C0C4710D
-7F06A362
-FF3D8A8F
-428A02D8
-2EBFAF55
-D25B93D4
-344E75CC
-ABC855A9
-E3577D95
-843C4274
-F5326A2D
-EC6EB288
-7C4C82E6
-A70953D8
-8D8B314
-8772F0BB
-3BA5025
-1BE5CFF
-9592B505
-B9FE16F1
-EF77DAF1
-4C7B4119
-8B8FEB44
-3542576F
-375EBF3E
-D0927BE5
-2C6A3AAE
-45D18D70
-6126FAB3
-58146389
-FBF50CF3
-3129860E
-4B721C54
-95BCFF3C
-DDF12106
-1E2428D3
-827395A7
-35266B84
-3CC089A3
-B8198C2A
-B8EBD35B
-7EBB213B
-A93DCCAE
-CBB25C42
-2A03D874
-46F6CAA
-82986B02
-47EA89A6
-2C3E7BDC
-852B0630
-A928EB9
-66A2BC66
-BBB43A54
-A6F55CB7
-FE990460
-5FA8BA0E
-1CD34B74
-1C0F2BE4
-FE6C53A3
-C325B6C1
-A980B3D1
-9F031392
-31E17C1B
-38B6D6A3
-E30D49E5
-E83F8C4F
-BCF13E0E
-28124F6E
-57AF5DDB
-691BCC17
-BD071C94
-DF4984C2
-8579EA0F
-92150479
-7BB67579
-58D6EB84
-97754D0C
-F569F71B
-9990D0B5
-56DAB760
-9E988907
-9679988F
-3EC5E4F4
-328D67D9
-317EB4E7
-5E6D7E6A
-BFEE035F
-D12E6060
-4F2A7A2D
-F65F5B73
-54AE1242
-ADAD3A5B
-61A81471
-FB09DC55
-72874DB5
-5302F1D1
-8B5F6A90
-82E98E7F
-E808315D
-DDF5B32F
-C35356A6
-6F1FF7AC
-1549941D
-1460BF8A
-D53684E0
-1A384C42
-D319924E
-B0B1824A
-2772DB36
-BA61B594
-712F9397
-41F5740B
-C00A34B2
-F2FCE526
-4C874DC6
-FD5ED831
-301E874C
-CE244111
-D6AEAE23
-516AF534
-FC101FD2
-EACEA514
-C23A0FCD
-650BA0E6
-5C877E20
-ACB5DAE4
-5E56E78C
-1AE6F2A
-705046AF
-7F53EEE7
-AAB30590
-2A1BD5B6
-300A6D8F
-FECD64C6
-A8FF2EC9
-27B583C1
-29CAE718
-66D59871
-16E8C79F
-14D20B3B
-446862AA
-1C5EBC93
-3831B437
-556E9FE
-B877897C
-D6FE7901
-D19ABB8C
-964EB757
-D1DAC489
-B60AFF4D
-31D01640
-A963359E
-E233B856
-58D923CF
-EF31455B
-EC071BC8
-94F64E2E
-F9384093
-36C8A1F
-AC4A701F
-657CD41F
-731CAD58
-374B9753
-EC20E4D1
-E58959AF
-E83E1021
-B7C14D53
-A651DDBA
-D54BD80B
-7291E323
-31310762
-A54A712F
-482BD448
-1FC7B562
-EA69143D
-4342848D
-C4BB4C5F
-B0B43A48
-962EF559
-5C395F65
-6C40A83D
-AEC344E3
-881E5E3A
-42D50FC5
-144B9CA5
-15DE8B4E
-AB91DED2
-17FCB1B5
-87804536
-102205D0
-E57C9F29
-5D08E2E1
-A4AA0B4D
-4FB1351D
-F3BFE5C6
-5C439E04
-33A0A6AB
-826A9A49
-D165E206
-229A4A83
-4897797B
-396C7F04
-474B2792
-351AD33
-ECCFA3E6
-901B77BB
-42B16DDA
-FB3F707C
-C6816341
-CE19D1AD
-8297E119
-4458AB5
-FD9CA7B6
-250517BA
-2E23BFF5
-F0D1C983
-699A7882
-557EB3B1
-D0D5822D
-D1117539
-F271C507
-9364161D
-6793E35B
-8AF902C6
-DA5443B8
-EE1E1A0
-B941E448
-DE0E773A
-4A41AF87
-D4AA88C2
-80B09F9E
-53F2B381
-1C8EA42E
-3D15C64F
-93FE9251
-B242B629
-F7ED2942
-6AAE674C
-EBF19F56
-E299D4A8
-4F22DB1F
-20998388
-4742F182
-F6626B60
-992FB48A
-26822FD4
-784D31DD
-B84CAF35
-B8163E9E
-2A27EE0C
-FF09CF79
-81C74BBE
-C914DAC2
-E768AAF6
-FFA5171
-CA93E6BF
-E495891A
-482A252B
-18F8FD7D
-DE52E34B
-A4986019
-E363E1CB
-EAF53373
-59FEDE9F
-2FAEAEB6
-DCE56F6D
-F10257B2
-7609DFE6
-4D0D263A
-12696B9B
-A56E0541
-8F12E1B7
-9E8E5761
-98C5816A
-F2F8EFA5
-B91C1CF3
-59A19F9B
-9235B967
-A58D23DB
-71377517
-C50BCDB3
-60D31A7A
-874811FA
-58A69900
-CD8198EE
-E4FA90EE
-51352862
-3654B5D6
-B0442DA9
-5BA67D5E
-A9B84B57
-FF61069A
-21102ABD
-8E6B59D
-1DBF72C0
-9772AC77
-F26B2827
-E985C97D
-CC311683
-E8216C66
-13E346BE
-199D0C57
-578B8B90
-84462520
-7B33C9F9
-E18A5CC0
-8F70C75D
-B9773D99
-8A8BDCAF
-78B8631C
-1AA0C9F2
-76FDD536
-8CECE336
-999E6F4F
-29EB2768
-3417B854
-A56B87D4
-CA2F016B
-69DED6A1
-8AF8128C
-27732A2E
-654939F8
-F0DE0291
-501F84CA
-815055FE
-99B595F6
-627F49E7
-2A7BE8CB
-959032DB
-7FD03C7E
-54ADDCA0
-62EB2DA4
-6E458899
-2FE00E32
-B2E74808
-35803F87
-7369F52B
-1586B4DD
-61B61CC6
-1BDD1B8F
-C6BAFAF5
-C4339DA2
-E1D3A0DC
-8AD49CC3
-673B67FD
-D81B434E
-A41C5AA6
-BED70576
-22877C0D
-71A3DC2A
-FDE1F4AB
-4FA1751E
-DADBAFB0
-1C44975B
-76EE876B
-E3B81546
-86466730
-6A3F403E
-255A72F8
-2D2AAE1D
-77717644
-63E003E8
-40CDF1FA
-FF37E1B5
-F0FC3CCA
-45BE9807
-D8611D58
-D62AB82
-EE875225
-B8149434
-FFD0F0EB
-2F3699E6
-7EBD4BFA
-3E393CC6
-39777EAC
-FE2A33EF
-9AECBEB3
-322B14DC
-DA2EB056
-1C942882
-C42C7C32
-A20E0D02
-E91D2834
-D465D9D1
-FC60192C
-D3B7FCA1
-1E9B03FA
-40323FF4
-DFA3D47B
-2C26930E
-391E6E18
-E340B164
-36FD76AB
-204B0D9D
-5F5027DD
-FB05E9F
-33C3443D
-ABF1832A
-152FEBC6
-FD83B071
-310222F3
-E07F3402
-61818FE6
-6E14F915
-F89FE609
-86FC4F17
-C860D97A
-51B0EF08
-779B9BA3
-6D9C0908
-D14ED3D6
-692E8084
-233DEE29
-B85FF171
-12FAD29A
-D37B7593
-AEDD969F
-8E76CAF6
-A7FDDB58
-B5B7DFEF
-A8881968
-50D65153
-D57A8EEC
-7D144C49
-99B10DC
-5660CCA2
-C02A1001
-7EE499CE
-8C281511
-8B43EDB4
-31E58C4
-E9EAB787
-48BD8C20
-87C33E72
-9FD28F45
-9D8374B3
-3AEBB8FE
-D25F7E5E
-65B705F8
-ACB7BA8A
-C7CE28F4
-1A365014
-12997929
-BAC3250
-3DA4DE9C
-D90B5C3B
-731BC23E
-F952A129
-E5FECF74
-26D6A0
-B61C74A2
-B18937FA
-E034B86
-6B3E73E1
-FC5891FE
-E6F5F72B
-BE380D96
-DB6DA2C1
-8BCAC0F9
-FCE57C36
-10230AAB
-8E0B6278
-962C5A14
-4C257AA0
-95B50454
-478B67C6
-4BB1F24A
-9DE453A7
-241965D7
-DE5E4EEB
-77BCEB46
-A87FC004
-4EF35145
-35910ECD
-8900342B
-C9A653E2
-9AA2501F
-DD4D16E8
-A2340ACF
-F846821
-9A2A16D3
-33BF35C8
-185C4C5E
-9A3A7865
-6CA5232C
-8A93214E
-8F9C13E3
-CF212018
-777D973A
-3531924D
-DAEBD9FA
-4C4BA7D1
-C6DD4E96
-72F0CF35
-AD82F177
-B8486F78
-C89FE003
-991E4764
-F49CB023
-14C3A164
-B6B2733F
-F78D6623
-F1C9D84E
-6CE9487C
-68F59E42
-B13A9862
-A60DF7FC
-5680C3EE
-8DBB03F3
-FE660987
-7F302425
-98915B
-3EFAFEFE
-819E3A26
-CF086D8
-EDDF6ADF
-314D6342
-C7DC4A97
-231D9E12
-C8F0BB37
-E2A20026
-A9539B54
-E2047DA5
-3E5C9D4E
-F91C18A5
-37B1EDB1
-DE88277F
-765DEA9D
-555D803F
-6FAD1516
-41299623
-66D3E9F
-B040E22F
-28C55A65
-F5BBEB1
-8F85CC9
-C1F1FCFB
-E0ACADA
-FD138889
-F4E18B1B
-6EAD0B49
-38441326
-17AEF5F
-5A6EF970
-20ED5B3A
-46A95C2B
-CA7475C8
-8FA66C0
-3F831698
-E2C27DCC
-7AB6C35D
-9D979A50
-27F30FC
-4FA19438
-321E637C
-AD72B955
-C7BE128E
-A428B5EC
-48817E5
-7EBF668C
-8DCEC036
-272C5582
-F8175767
-6ED7A880
-71E2497F
-6EE3595D
-D2579856
-15439021
-87C91FDA
-A5682821
-E3FC8D77
-1545F959
-6341300
-D52520B7
-B0A0FAE6
-6F1C6BFB
-226DE897
-4449D2DD
-7E378981
-55A93F85
-91BFE157
-434EAE2F
-AEC8DFBE
-929F369C
-DF654EA5
-CC2D5431
-152C1E93
-D800D93B
-1969CB8D
-46776BE7
-DF3D435C
-2CD82C1F
-241528BB
-88B41461
-19463B47
-CD61AE6F
-3C5DFE3
-8053B926
-5D0C9D00
-75240C8
-53A9DCF1
-B217E766
-616C0F89
-E73E36F5
-1E3E0BC3
-B6C474CC
-9AFE8273
-AAA496CA
-E9770A12
-9C3E2617
-3CB73C1B
-2065FF5C
-3A2B3E59
-280EF886
-B6A728CC
-DDEE48DC
-BE40F70
-449577CF
-E5D72358
-5648EE48
-F6B9BB34
-F8E354C
-84895AB6
-95DA9283
-882AF6A3
-4FBA089C
-D27070D7
-17784421
-DDEBCE6E
-4E6A43B3
-82AE90D7
-1A524C8F
-D1C0C339
-993FA3FB
-52CCA574
-523FF9E9
-764B2F69
-621F0749
-5C95BE3E
-F2A36CAD
-5C92ADE4
-F4238C46
-BDD0079D
-CAE6D9F9
-5F3D1307
-9345998
-22C3C499
-631B8B0
-A6B9A88B
-471749A7
-6BCD27C8
-5D371C05
-57081397
-F6CEF315
-1BACE19
-B7BF405
-5B6DD011
-BC74DA95
-781349E
-F22A975C
-72A5A101
-27BB6AED
-933B9126
-14FBE3BB
-50D095D9
-1CC937B1
-22CBC28
-1A6135EE
-197E93EE
-26A1CB1B
-79BCF079
-A0134157
-9F232A75
-818BB26B
-B2339659
-911E36A8
-AF2F9282
-347C34E8
-6255FF5B
-1BB79854
-9A16AE8C
-2A3D9B7D
-93795FED
-8284A6D4
-E58090F9
-A36C45A3
-F8065618
-4122FC06
-6F4DC90B
-5336936D
-F4E4BEDF
-7A885091
-E19CB61D
-9D398B7E
-C9C4AF2D
-A1C076FC
-BF60AE9B
-CBF56B80
-11038EE3
-4B78AA1C
-59C72649
-D687CF08
-B182CC2E
-43E4B13A
-83126FE9
-EB042718
-627C8807
-47474E59
-3D317A4
-33919B88
-E00CD1A3
-3CC1F4AF
-2E91597C
-CDDAF2BE
-3D3A18D6
-5BD6E47E
-3D6A5286
-456410A0
-2B51CF4E
-B55046FA
-FA43946F
-F90AC852
-A064AFA3
-F84235C4
-D316F3D2
-1BB0D769
-46905EBA
-255EE03A
-EB4D2C17
-6AFFB5CF
-D755618F
-ABECFB93
-594CBE9A
-362C1B5
-ADFAAF67
-ECF2110C
-E86FA43A
-C789EFB4
-D9FDCC95
-F81FFEBB
-C239F63C
-16BBBF2F
-B1AFC20E
-B00BCEFB
-D6B41A49
-A5856CBF
-E2753B3C
-8C03166E
-537BA621
-B268C813
-C1B8E5B7
-1FCDD47C
-BB257FF0
-37B89618
-6AD0F548
-C5EB6B1
-482EAE33
-1F898EA
-C161076A
-8112502F
-77D0C22B
-B1EF60B9
-D8122593
-D0ED144
-A258567E
-7FCB11B8
-FC01313B
-8A39DE11
-B9612887
-FAF9C5E9
-AFB24528
-C51F261D
-15A83256
-E560FDB
-5749D494
-61C88749
-F7C9978C
-41583770
-73AF53AF
-EDB828F7
-5B9A931F
-B33EEF56
-3ED0DC67
-915BF5B
-CD090180
-3659A346
-E09A572
-B0EB23
-F35F97ED
-8708879A
-E3761150
-FBCA868
-8EE5D700
-67931F7B
-E3819B8F
-FA9DD938
-3C3DD434
-FB62C866
-9D6A734E
-2BE14923
-7ED6D7BE
-423CF38D
-CC4C4156
-898F3254
-405B1D62
-25995FCB
-C062465
-12471B35
-6DB351F2
-5F23ABC5
-49EF7D2C
-91B401B3
-85DE49E0
-81D81230
-9824E09D
-767C5312
-E0744F5
-D99A77B9
-7657BA4F
-46CA1289
-5D2AEFAC
-ECDA74CB
-DBA899D3
-AFC6E7B2
-DA79D8BB
-F6508AA8
-6D0E5BF
-76DD66F3
-DAA00B8F
-C7EB98CF
-65189199
-FC2F2235
-4F19D2CD
-48D4E497
-67A7643D
-777B5F1E
-2F089D44
-4E841850
-2D371993
-B3ADA2E9
-421A44E9
-1D470C4D
-81DA8998
-71D42D8D
-E5F09965
-24BDEA19
-F8FB47FE
-1CA01D53
-52A53F9B
-B13279A7
-840C17AF
-F27507D8
-36AA55D1
-29616808
-E5C25388
-404F7A96
-AF6CAD43
-AA2A8D86
-6D0D5DE5
-B60B5047
-F904AAE0
-9BCCB969
-73FFDDAF
-AEC2E379
-DDC3B6E3
-85273FF
-4F23EA7
-F1048821
-432CA7F7
-FEEFB49D
-2749D00
-F0914942
-878203C4
-AB657B2F
-FF754E6E
-2A1B63BB
-2B094F6C
-8DD98DF4
-7E8810E3
-D17A81B6
-BF297F6D
-FAE3391B
-B28655B9
-2B4507BB
-702B2563
-FFC8858A
-B8DF3A03
-80018970
-4387C2E2
-81246EAC
-1201F4B3
-9AF9F9B6
-29F63494
-98A87F7B
-C637C322
-BCFB7066
-3505C623
-10BE77F4
-BE44797A
-2EF31DB
-C8DB4396
-FA7C2378
-AD3C30C3
-C3AEB714
-58183DA
-5D961567
-1E42A328
-94430ED5
-866A3D67
-84B148EA
-C823439
-80B57816
-D6395105
-B389CD22
-B574BF88
-F12CE1CF
-C5B892E4
-94F6CE69
-9387A05E
-C806C5C5
-B2823B0D
-64F1253B
-DD3B64F8
-4C6980E
-BA9825C0
-573D9CE3
-A78DB442
-FB5510FE
-C45DE1A4
-66DFA70F
-47960901
-68D725DA
-ACAE1E6B
-60F9360
-8C9D39E
-E78D5AE3
-A1A0BB75
-80E4ACAF
-A0FD5042
-5E0CBC82
-C0474CF6
-840ADEA6
-6F972DE8
-5D16E0D1
-86688917
-E08A3150
-BB5FB87
-2EE82F9C
-62867EB6
-B592C066
-64852270
-7A7634F0
-58C6FA6D
-E83506E1
-7DC3ADA6
-E972E4D5
-4877FABF
-CB37BA71
-7BD3131E
-9CA64901
-C072094E
-A28F50EC
-CBBE833A
-225D213F
-D4266D98
-3DA08099
-22481B45
-899C4804
-3A8630B2
-7227F512
-FDA1F80E
-E5515F91
-6EECC93B
-4611F561
-47AD2CF3
-ED2A807A
-D694C082
-6DEB43CE
-9DBD4F70
-8C918F0D
-28C5219F
-EB23A332
-AAAACB21
-9B053C22
-6C5AEEBE
-B1941AF2
-DEFAA083
-255DAF18
-B513F3E8
-CDE47DE0
-43DD2231
-71BA21A
-AB772E2E
-510C581D
-93A91FFB
-ED683872
-E561882C
-C503A74E
-E274473E
-3F7D95C2
-AD48EE4C
-887342AA
-F4D0DC01
-68023FEA
-F996EC8B
-F4E33500
-8191511B
-AFE0184C
-8A6D392B
-EDFEA13A
-AC3E90B2
-94E7E8DF
-76F491E4
-D45224EF
-D32B9CD0
-C7167945
-2D56F7E1
-994E7AAB
-65EDCC15
-AEAF497A
-BA11EA7A
-53D5812F
-DF05201B
-10A9356
-ADAEF92
-508293CC
-B45B1908
-DD8C2367
-A385DBEF
-A77E11BF
-DE9B1792
-A9FFDB94
-AE48AD8B
-E7798E96
-BAAF5B51
-44648397
-80303BBA
-FBE848C0
-74F37EC6
-C9C0EE6E
-1D80DBC0
-6CA37DEC
-995387B6
-BA2D99D0
-D1869967
-39D0BB45
-36E391CD
-12D6AB0F
-4CB16A65
-8BED7413
-99987FE8
-55BD54E3
-5568C11B
-F63606C4
-AC4D0747
-3032CADB
-52407898
-C461B987
-1F3C8122
-C7E1B1FA
-BC1BF34A
-724843D7
-2DAB612E
-F5180E4E
-67FE89A9
-B7641E8E
-185E5197
-5FDD9BA3
-C6AC4D7E
-DB020625
-16ED5F8D
-5A2DB8DB
-58F7DE17
-8231D332
-9977723E
-CFF39DC3
-A8B71C3E
-3335D9BC
-D34AE6FB
-31559150
-E6494443
-D6C0C713
-515C9C4F
-AA09B03F
-EB32806D
-981F48D
-DAB324BE
-33EDC165
-88011009
-F1120840
-48119894
-137409C1
-7F45314A
-DD74A5A7
-C2251ABF
-AA45B420
-4ACBA24E
-D020B449
-50E55E0F
-D78DD382
-F6E82B05
-9957DCE
-1410E573
-CA93CF29
-83DBB1D9
-7AD6D5D4
-7921516F
-8399BEB7
-DF07D89D
-77AB752E
-6D6DBA45
-890771BA
-E87CBF52
-F90A7590
-78967761
-6617D522
-2EEDE919
-F28BA9E9
-E1E3AA90
-2CBEBEF8
-1D8A37FB
-9CE04F02
-680B5A92
-561178BA
-A19545D0
-DBDA24E8
-A7863CD1
-F1B829CD
-2BCBD34A
-B8DFF2A6
-2787D144
-A075B93E
-AA7BC361
-B560CBA7
-F8E79316
-417B968B
-9FF31C37
-F88ADDD1
-99A6E199
-D3D400B5
-79F33397
-4AF6EA07
-93EC79F3
-F7D9C5B8
-81D7EE3C
-2898D7DC
-4B8F67DB
-D52D0F0B
-10766E32
-E228EA2C
-54C96B61
-74A99589
-7E60A886
-8FAF588
-634DD09
-1258CA8E
-13E40785
-20861E8F
-69BF3004
-E91E2BC8
-583A44C3
-36FD8D36
-572B4202
-BE43EB2C
-65F871F3
-723C1C02
-65EBEF48
-8DD407C6
-513D6B1B
-150993D3
-4C771124
-A18E6FE4
-C46071C8
-D824EA73
-7A54B17A
-4AB1E70C
-F7D078B5
-A315F9A4
-9A39A8C8
-CD34D2A6
-8CDEF63D
-B273EFA6
-E15B8FB4
-BA2A092B
-E540DF83
-33A3B82E
-13BB16A4
-4AA79F4
-DCF1D80E
-65B77A7E
-80CB308
-9A407BA2
-D32D62B0
-DB34DA97
-109F323F
-4B07538E
-40AD97F
-A810835D
-6637380B
-1ED7261B
-DA642F4D
-309A47D6
-9009C0E9
-7D9D6E1E
-580CCE0B
-67F92DAA
-1936087F
-342D9739
-A191FAF4
-2EF56C33
-EAB9AD66
-FB6E4FF8
-E58333E1
-E42B465D
-2D61F572
-9FA12447
-848394C4
-599C9E50
-28675899
-8610332C
-968735B8
-ACE06F66
-266C841B
-8512CA53
-A25D3088
-D55264D0
-AC3678A9
-D1DF668E
-5BEBD716
-DE986F08
-17DB60F5
-B88254C7
-BCA0E5B2
-E78B3459
-494B6F35
-5E0408F6
-A8638621
-62C27360
-8D98C864
-37EDB15B
-ADC93344
-4197C21
-FEFE1A30
-ACD03EBB
-A3A230A3
-45741EE4
-DE86AD8D
-CDBB302B
-303A5D5D
-A42863D5
-9019ADA8
-EB8E036C
-A5558A5D
-A4D5AF4B
-F04E0726
-C5AEA4BE
-FCB9BC09
-3FF2E51A
-53E510E9
-86FB3D5B
-3031BBDC
-1294451B
-48879312
-972E95C1
-B8B861CE
-FD180B55
-F2930D40
-31C5CF76
-8C132827
-CD696B0C
-1446B194
-436D712D
-9089677B
-493A420F
-DF82C186
-377516B8
-20ED2C1E
-956EA0C3
-D26B4EEF
-BFE59283
-B4D36719
-67B01DDD
-6F3CA60
-BF6B98D
-1B120FBA
-7CF4D06
-83091BF6
-7D3F5D85
-D3E48FAD
-E3025BBD
-CA30F611
-64D1D991
-6A688C9
-D06F9682
-D346BF
-E4DC58EB
-4C4F7AB5
-9D5CBB9F
-5536C074
-CCD9D1E4
-FADD0C6F
-769C50EF
-A1F0E40D
-72EF3FEF
-C421D7AC
-182D7491
-3FDDA320
-49F136EE
-4EFABBAA
-7228A4DE
-40A616A9
-EA37E4ED
-5DADA164
-2F9C5671
-4D3D4CD3
-3A68B35E
-7A26619D
-11A14309
-D886253C
-8F545687
-3666D9FB
-131A5557
-9644C9A3
-FCC47DF7
-7CCDF226
-9FCBB958
-9DB97B96
-630B5596
-1B592B4C
-2AB5341F
-5817D559
-3C0A5FBE
-F65E3830
-1D38ABAB
-353E9D4
-41647BE0
-63DC6FC7
-CABC6846
-A7B8001D
-2C018A1D
-435D877E
-3E5F838C
-9709BC31
-ACA0EA75
-86A06AB
-DBB06480
-2A09283F
-D3A83953
-90967E13
-D055B4E1
-3365DA22
-E3FFD521
-50205ED7
-E907F5E6
-4D7D054C
-C66CA376
-2A72C5C6
-793120B3
-170AC5FD
-C4CFDAA2
-21A3CE3A
-19F354F0
-FCE7F112
-279C9605
-AA9FBB98
-E269592C
-B8E5DE7F
-AE0A77D5
-45B4CF97
-6E9EE4C1
-C31F7C62
-D9E8C76C
-75925FEC
-EE34024B
-73FEA2CD
-BC601F7D
-75776A1F
-AC2A0090
-AA6E1956
-64C62B96
-D73C3066
-2F9C7E78
-7F1529BF
-5974399A
-79D31554
-2D559A9A
-458A1BE
-A820156A
-26764010
-981D62C3
-A5C8534B
-F8A5FAE0
-69EA2102
-2F62B77
-2AE14076
-88EB9A0A
-36B5EF31
-73E63D55
-D6A15D81
-F5C8A216
-1EEFBC6A
-8F16F5B6
-87064008
-7EEAA78F
-35A4B04C
-AE70F49
-9642CC0B
-3199A9B1
-F0E6FE1C
-F682DFA
-E500C5B1
-AA1132D6
-3B3A2D9F
-86C9A21E
-BE1422DB
-2218AF29
-64512A76
-C4624FF3
-F4E52FE4
-8473989E
-269C4193
-B67528F3
-76FD1A6F
-ACF6869B
-DCEBBBFD
-3ED92226
-3FEA0905
-2C4A131E
-4CC5DF7B
-63E3A62
-988BE035
-BB06A621
-61C2E087
-C2E46B3F
-78010D43
-9EC6DFEB
-3781CAAF
-6D000EA0
-7E952EA8
-2874E849
-FAA54995
-45DB5F56
-8CB1094F
-336FA04C
-8CCD3F1C
-A40704F0
-7AC652EF
-83E998AF
-8167F5FD
-AA7527B6
-543AF979
-F21F16B6
-9A4E00F
-1686D0AC
-FB0EF404
-EBA9E0F4
-1A9BCC03
-F66D4C53
-4328EB30
-DF52A096
-4A61DDDE
-3F19448E
-5F3E0EDC
-C9FEB2B1
-D8EDCB6
-4EAE672C
-47FB8C0A
-B4D64E67
-7F5AA323
-38796C27
-3ED30872
-6241EEE1
-AAFD55B6
-F31CA43A
-54CE5828
-6D9103FC
-665303B
-ACD9B1CC
-4961E187
-EEDB6D29
-544577B0
-9CC76FDC
-718802FC
-2EDC02F0
-6735768
-FC351962
-30F3C426
-7BD3050D
-4C19A7C
-97DC5F3C
-720D7F42
-2F735FAA
-B067A6FB
-4F5EF847
-F500ABE8
-FD9E7B9E
-8C37652E
-B6189BE1
-BAEF411D
-2584FC7F
-FEA99C78
-873C71EE
-51491598
-8BCC9600
-60A2176C
-9D6D9475
-94E1A54E
-78124EEF
-4DDDA3D5
-DE77F79C
-67E3A57B
-1E75B5B5
-290C7ADC
-30FDC46D
-63BDBBD7
-9E61B234
-666593DE
-8C7C1E27
-9C723CAF
-EF1F2DDE
-CA69CD52
-4DE571F3
-A0AD3A46
-902EB90
-D761B7BB
-9F209F04
-15B1B5F
-5C389CFF
-B736B159
-97994EC
-A2DBE074
-353360C5
-19E771B
-94A72285
-2F4706A0
-64CC6476
-627BE8B7
-90FE94EA
-7D02778
-2EEDEFD1
-9A5EF7C
-E7B7B437
-F21A3517
-F33DF1F0
-7A865164
-4BFE70A7
-88A8B45C
-C0D320E2
-E93442D3
-AA086067
-11B873ED
-1BE002FE
-2E799A3
-2AACAAA0
-EB1A91C7
-9FA88D6D
-4D956843
-75FB8348
-1584A0EB
-4C9D1E1A
-413548BF
-FA0CF448
-90D1256
-BEB74BF9
-EE7C6510
-765277BA
-A6081E2D
-E616DE16
-EDFB0495
-12EDC382
-DA64FCA3
-E258DCC3
-92E0B54B
-B41B389A
-D818F160
-F8F1A55D
-17916C31
-DBC21683
-3272DA3
-931C08B3
-9F8EA606
-232CB0D7
-EC870992
-B5F586AB
-3ECEF68A
-BF7BE567
-2C009224
-C2BE6397
-90EE0A64
-FC3E6BC3
-F1190F98
-1D05D7F8
-52AA90F8
-FF7C45B0
-7F5579FE
-6609C7B
-9B56CD69
-4A6830B1
-ECF9E86F
-62331FA4
-294B7FAB
-DC7DFBA7
-4DFA98F8
-CA6447C5
-B0416FDF
-5FAD4523
-BBBEA8BD
-47DA6D1D
-FB598321
-E4A1EBBB
-DD0CD41D
-77FC8F60
-E4D74C7F
-E4B2B064
-52EF568C
-91E87E37
-FAF6069
-6E28131E
-4D39B103
-59A3C4EC
-3AA49C6E
-D90E743
-44FC3B9A
-7D181041
-AD89A0E7
-616A565F
-129B06C1
-907298A
-5E98085E
-9648A06
-4FE2BFCA
-F73FCCCC
-62DC849B
-BB543EC0
-EF301310
-9801EC66
-43557EE0
-2C382E49
-5151FB5C
-3C1DCC5B
-DD1C153B
-77B3F30
-FDE0F3E1
-C967E75E
-D5C68278
-6CC1FA37
-A3FED046
-5DE77F4E
-FB7F40F6
-2C9191BB
-D089B672
-1E9C6BAC
-756468C2
-13352B81
-D2CC73C6
-55B4D4BD
-8D6BD8F4
-65F7C5C0
-34A629D9
-79424449
-1CE03FD7
-451FC3D3
-255B39FA
-F5F01286
-D1623E81
-4B33EB3D
-CB2326EC
-9C1189DE
-1ED995BA
-1298FE00
-A5FDB07F
-D80D48D
-575374E6
-3664F373
-5ED3FE
-2171B235
-413BEA38
-FD67D4A
-34F10135
-F4544A59
-16BA37D6
-649879DE
-EE8D839B
-A545FEF1
-4573F79
-D53FE034
-F4418DBF
-92181012
-FB81741F
-376DF3DE
-19763A21
-47FB6EB7
-7F997F6A
-CB94D301
-36461AC2
-A3C2378C
-2541AE5
-67D92471
-EC619D04
-3BE21ECC
-A441FB3D
-A19F0955
-39492084
-6C680626
-C8D37B17
-68B215A0
-8B3846B1
-9B21F1DE
-8021097
-EBCC81B2
-E9310566
-AD50FB31
-AF65F01B
-739CBC38
-35573201
-F7F58733
-4015ACA
-6AA65104
-33202FD0
-B5B1AE8B
-C1C66F1C
-8BA3BEC9
-E55A2ED0
-49ABBD4B
-42DD0652
-A936340A
-8EE63409
-5C64BE2D
-4D47E9F
-745994DC
-7CCF78A6
-516C7BF5
-395F9C6
-58E11E54
-73EAA341
-E2D4631A
-C3552D0F
-4CF36F47
-3FE7034B
-EEFCB8C6
-8219943B
-E800BB09
-55544B91
-A3292FE8
-89BC5746
-F63B4EE1
-E866DAF9
-E99B2D4B
-BB57E938
-34FB7E1A
-EBB559C1
-24838BA
-48075561
-9E621607
-998E5D98
-DFCF97D6
-2ECF6FC5
-15EE774F
-C3E53B77
-8EF5F879
-763B1F55
-5C90BD9
-267E7FCE
-625E8032
-F12724C8
-635FC29F
-36AF3D44
-B7D2299C
-6E8F0DBE
-A76006D5
-723C72E0
-ECA467C2
-5C7DFAD4
-23AC163E
-F306D785
-67972062
-57D31D2C
-4038D82E
-D21756BD
-257A9123
-BE96CEDC
-917019D1
-362C4F33
-2A305FAF
-D4389CC3
-4C435238
-D68F1F0C
-372B2979
-A7D6B646
-53A2E4C2
-19E556E
-62D716A7
-64918481
-4D3AA8F0
-BA8C6B54
-2468C102
-499AD5B3
-81AE28CD
-42E94077
-C969675A
-341B58FE
-41159415
-ADE3FA94
-FF5F42BA
-379C83ED
-A7E678F
-C2D60CBB
-CC75230C
-A12B9169
-9CF6EE67
-2DD905D3
-EACCF580
-367F9A41
-477BB16D
-8438B576
-756D14EF
-980599BD
-C181C6AD
-99A3EF95
-151D4F12
-CD85DFB7
-695F12C9
-4CF48772
-CB00E50D
-B9E2AF4C
-97EC19E3
-54810B59
-EC4F2D89
-ED77DA60
-19451088
-D5A52E95
-F6FAA3D3
-F2458DDF
-D5AB6D8
-D4042924
-AEBEC90
-505DB6D0
-52505B2A
-ED9CB8B3
-DB06312E
-C508C5AF
-4279ED2F
-5C72A874
-15E22E84
-54E967EE
-80A13FE3
-EE346264
-3569BCA7
-9AA9263B
-2BEC95EA
-966F3368
-B74F6A2B
-25ADEA56
-30A1BCE9
-71EE7AB3
-74807D9C
-E4C0D662
-A62305A1
-6B9FB6F0
-C2CAB758
-E3FA413E
-5266648
-754C0A13
-C4FD0D47
-BEFA676C
-786AFDA7
-297AA674
-F2895DA0
-72A98C20
-A662B307
-54DFB586
-8147050E
-CF7C5819
-760EC4AA
-F011339D
-2D496BE5
-6FD43E03
-1DFD893E
-814ADCDF
-B7C38DCA
-2149763D
-EB58B9BA
-9F1B81B2
-94C15E0C
-5A9923B7
-6C4E0E11
-C63C3D44
-BF9AA840
-1A3E83C5
-B81CEED7
-7E9FD999
-C1A15CFF
-B28F657F
-287D5990
-8DB5B01E
-E241144B
-EB0EA64E
-884A8775
-99F5DBEA
-3DBB21D6
-CC9472CE
-B932014E
-22A35325
-7B22DCF6
-882BB2C3
-B47CDAE
-28767633
-ED17CB12
-6302A17F
-25D91C08
-4D61BFB6
-FA240AD0
-E9DBF560
-F0E9AD0E
-835C152D
-61E5F126
-C176F8FB
-B793DC1C
-622E04B
-D9FB6072
-60124DA7
-8BEA323D
-6C496459
-FBE1E578
-F1C73C9E
-6A7C4C58
-43F1DB50
-E9BF93AC
-B7DC5C72
-2E68083B
-F3DE081F
-AAA39D71
-73406424
-B99D0139
-E4FB0C67
-142AB82D
-3312CC57
-7A3BEDB7
-6B6E42D2
-F8330EA0
-2FE05DA6
-3E6BB118
-3C73E09
-5FDB1471
-6A226A31
-88792727
-78708ED3
-7A095177
-9CCAD23E
-C3B75180
-226F8D4C
-46DD1DBE
-D799BE11
-1F852432
-7361585D
-97380EF8
-4F1A8127
-2EB7A73C
-35B892A7
-933075A1
-2B6D3BEB
-BCDCA6F1
-E9409A22
-3A8E5575
-E37AE0CA
-97C2866C
-BA575BC0
-C16049A3
-79FED5B1
-6356E153
-98789BE6
-47B95292
-FBDEC30C
-2275A4D
-632C436D
-FDCBB3FE
-4E0ACB8D
-36A77186
-593FDA25
-D9B74A5D
-18021557
-3919EF9B
-DDD00927
-B0C6DFEE
-F761C0C7
-886DBB5
-807A21DF
-778F06D1
-27A67D08
-2CBBD43E
-2696EC44
-1F916066
-DE884377
-1472CADD
-F30A91AE
-89C35DEC
-84E5487E
-792613D4
-1E59B1A9
-B18BF896
-8D7034AC
-A144CE10
-F2FFC2AD
-2F5FBA7D
-FFEDDB97
-7C506BFD
-85B811DE
-CC3AD4C0
-B6CC2F1
-BFD63C90
-281E81D7
-89E82B39
-E5371DE9
-5BB68ED3
-3DA62382
-3C8CBB1D
-4BE92297
-878783A4
-F925E76B
-77DE554E
-7EB5914E
-9B3F869E
-F47FA82D
-23E861F2
-19E38BDE
-C26E5CA7
-317C9C64
-B96B12FC
-F6EB43AE
-F979DCAE
-DD5BE081
-5B11401
-3C4A8866
-38C6F309
-2FE6DD71
-84E2BDC8
-2FA36F63
-F0D171C
-8AAD8CA5
-92D5E506
-D4CF4E62
-82DFFC21
-2C686264
-CDDA9A2B
-98CF101
-847DC151
-C0FEC6AC
-A1638360
-DD36C966
-A6A8635A
-F700C63D
-48377DC5
-138CB9D1
-857331B5
-4844609F
-E29224CA
-A5079F42
-3B39EA92
-F020BFFE
-4859CF8E
-7C1B1E1E
-DD95482D
-24C31760
-3555FB83
-B1D20BED
-403E6587
-D04E4309
-74F63A1
-EAFDC6CD
-781795C6
-BA9A1FD1
-60F61FF3
-B93EE92A
-7BCCFCDF
-477FB17A
-B508142D
-D2BC8CD8
-F11D8200
-24A8149A
-8F00F213
-3822F374
-E37B6219
-4727F504
-12CD7551
-5FD2779
-E8EC01F6
-29CE5CE4
-1EDDBCF9
-69AFBC0F
-11B3CB87
-E39AE82B
-E66CDCBF
-6824DB75
-7183BE54
-12A11956
-ADA59196
-437E5E61
-F1A7F4A1
-671FDE0A
-9202817E
-33ABACB2
-B0705AB1
-39952407
-D3672EB1
-A03BD94B
-B46D2252
-1DC47573
-EE4C78D4
-B6E4D8E0
-12C2206A
-5656E1EE
-4D9D4988
-35E36416
-3AC9C8F2
-2161B02C
-1B5A8615
-62587331
-CC4036C
-EACDCEC6
-F40C98DC
-9C8FFDE9
-D87FB3C0
-C55AABE7
-1BE31E0B
-C0796911
-C08C311
-E41B196D
-E4FFB7A3
-2483C766
-FD348C63
-F294631A
-7B74B50A
-D6416CD9
-66559F6C
-A7CE68E0
-ACD88C63
-BB49939B
-7987A018
-E1797428
-CE39ECE8
-D7B3DA7
-8F2A3F0C
-37E3C72E
-21F1A24E
-57AFCEF2
-AB8CF2
-15B5A4E9
-94094315
-29C3AEB6
-A56B4233
-6D57E64E
-3A7399D2
-103AE960
-8B93E67E
-D5193079
-767DA47D
-88AEDE6F
-ABCFBF34
-2650782C
-7A716475
-C86C9BBA
-4423420D
-3AF8FD02
-72E202EE
-5A264F7B
-4E103072
-4DA5A0E0
-59319F97
-B54F9AC
-556DF0B3
-ABAD7DC0
-2A715C13
-9D443D0F
-54BDC92C
-1EC2B967
-80BE3AC2
-FA646E8A
-2EE396F1
-8B0315E8
-9F52B6E
-DAD30422
-2E9B6CDB
-8686D47A
-5D9DB3C7
-717E799B
-20A4D4E5
-C2DC8AE4
-F630FADD
-8C7DF047
-65F4928C
-BE66D11E
-6004484D
-C1B509AB
-FAA4C75F
-B3D272A0
-7FE6F083
-A54B6584
-FC3292F
-4D27DDFC
-A1ABC224
-872FED55
-D235AEC
-27ED8546
-1B170B2A
-CE9E5C0
-2267B02
-285992BD
-F855CC8
-8FFB1F6F
-C7BDDF81
-349B4F5F
-B9B28843
-D5D532A0
-8FD7BE3C
-2DB04DE8
-C7D0C2FD
-B6822987
-1FE0710D
-8EADA490
-A03F99CF
-F3E7F902
-F56CCCA3
-CED5B6BF
-D6B3DC0D
-92AA9FE8
-351208D
-A1C9623B
-5802547D
-3480D77C
-404D4E65
-679025BA
-905FF962
-B7130CA8
-5AFA9CFE
-2A654EFC
-26218A8
-473A88A
-5E3534CC
-771FF1E1
-EADD6296
-DF7157B3
-D48E42E8
-3D6E848B
-29CD6C
-68732656
-A6C6D52A
-B50279FF
-705B645
-6DF7F119
-34152606
-72948D92
-18BEE72
-36BE21E3
-C34FD53A
-9765DFF
-E5C9B4AF
-4604B155
-DEAC2388
-7841FE0C
-2E275885
-3EE65330
-EB66439B
-FF4AB5DE
-67EDA5EA
-BB722F57
-6A645B7
-DE9DD302
-5AC7601D
-371B5D5B
-42BAC84D
-21C7AA9E
-F4ECBE94
-554C8B8A
-B7C8BB88
-4C77DB1D
-D4D8F3AC
-DAB292E5
-85D906E8
-47785703
-9CEE88D4
-7DB86DB7
-694B5A34
-DE77B361
-E8DE3CB9
-315EC35A
-A71943BC
-C297B8CA
-55EA528C
-A11AF15D
-1490835E
-19DA117B
-403B0CC3
-FF7DE389
-ED6C22E8
-6F8A8782
-7BF2BA9B
-6C95F5DF
-F8270769
-AB421268
-F06B05EB
-8FF7DE5F
-F2AB2FCD
-A5EDD602
-31F05712
-3C269177
-67D92F11
-38D8D3C5
-2047013B
-8E8BA724
-EB6A773
-5AF14AD1
-49910D46
-C9D6F784
-B44B09CF
-1AEA48EF
-2F12BD47
-10E3F7C9
-39EA8108
-B88ADC9
-19DAC1B4
-554908DC
-587A0A7E
-109D1E5B
-1920E3CF
-BC49C914
-C1EB74A7
-A5E9A494
-5FA5B8C9
-320673C2
-CE643004
-720E4075
-FDFED2FE
-89C22F8E
-40887408
-3235FF6B
-A906F59D
-F6F98F12
-7122ECA4
-4CDFCB42
-391F2365
-53AE3667
-6CCCE2E2
-44877A8A
-92561CAB
-DA5DE0E7
-73B898D6
-2E37229E
-ABAAED3C
-21087331
-58C85412
-8BB37690
-1256467F
-6EE9FAF7
-DB0895D6
-954EF968
-1C7693BC
-5786650F
-7D441E12
-10AA9174
-492C6A3B
-34374CC9
-98E59E7C
-5B7BD4E0
-D1124C9F
-B5B3362F
-8ECC58C7
-8EB0E23E
-72991400
-13DF853B
-789E8DFE
-D85E60DC
-A168D4D
-C3B6FA3A
-11443EE2
-F63F9FDD
-1A14A7A5
-5EEBFD5
-B24D582D
-AEA8F125
-4AA038EE
-5F6A1A16
-CBADD812
-340605AA
-8BD8F6E9
-B85F3A6A
-A585AE8C
-6D12D2B3
-17C97329
-DBB835B9
-789C3DF4
-E048D462
-BECE080A
-506DE5CA
-63C4FA5C
-7C2D8103
-689A3516
-B218BADF
-8B7F0BDE
-85B17891
-8888A9C6
-3DFC9FA8
-5F2859CD
-FF72AE34
-9EA3FFCA
-CF2194D2
-53B56E7F
-C7009619
-B127FD51
-3A513DF0
-E9147D4B
-2FDF3C37
-22FA1629
-61480015
-57EE267A
-EE04DA43
-EB2D289C
-2C102144
-B012EED
-B1B339C8
-AC1EA89
-3A4420D0
-5623907B
-B0613D35
-A70F1B2C
-589E3EA7
-F998AB7D
-9566E921
-B133DB2D
-A3106F6A
-EFB4518
-6AA3FB8F
-C505C8DF
-65032E33
-6D3942DF
-333553CC
-BF392E2
-6C77F980
-39211AFC
-9E0B71C9
-A3BB7123
-7CE16B9A
-F15BB634
-BD68DE3E
-77BB27AB
-BB72659C
-BFA916CA
-7022CF20
-EA64C93D
-B61C32CC
-20201879
-148DDADC
-58977
-8D5CC2E6
-76E678BD
-5655B362
-587EAB4A
-599E3DCF
-7B470038
-E87E82DB
-9088EC5E
-ED9F9E4C
-3DD98E27
-5AFA5052
-3DF313C4
-BB22A60D
-44D97BDA
-601409F3
-CD1D3CFE
-7EAE52D0
-41ABBAA0
-A1D7C883
-FFE2B4C9
-13717374
-9DD27EC8
-29301EF0
-87953D6C
-9309161C
-C91DFE7C
-DD5EC452
-F6C27DF2
-43B433FD
-6D16B93F
-92F09DBA
-ABB598EF
-B49A721A
-3A03EE56
-3177D3AF
-5D24FD94
-FEF88FB2
-52B3170F
-64264DCC
-18B683B7
-6B21935F
-901A396C
-4601FB55
-51F2547E
-DD37C23B
-35E6B3DF
-31ABC979
-C7223449
-ABCA9CFB
-A8F57AFA
-A097240
-78704130
-7F1D7661
-456C2409
-63E31F62
-FD0D4BB1
-97FCC39
-951A7C93
-893165C9
-E86163CC
-25F5694C
-8890910A
-43F3AE36
-55D414A1
-1ADDD3BA
-C7EDFDDF
-5A8607BA
-219D3208
-27BD79E2
-2E9EA4B8
-5D8F951A
-F9E880D5
-B2C7612A
-862CCCF3
-7EDC71AC
-1B6EA644
-EC3AA9A0
-970224FD
-6C0DD16A
-C589D1B6
-71AC91EE
-C75B0206
-50232786
-316AAD4D
-F4D5A31B
-E30CCF43
-BD72BEAD
-26DE4F8F
-56E97741
-9243E978
-F7E2363D
-BAE2CF31
-6367CFB1
-B72ED4E6
-75216393
-4626E74F
-61194364
-8D6726A8
-458611B8
-1B536E4D
-837AAD1F
-F5A226D8
-8BB37701
-31F19003
-8E48DEEE
-9DA11E9
-3BBB5BB4
-C6F15B5D
-1A53A4EB
-69AADAB
-4FAE6295
-F0943601
-A449516E
-BF7EE395
-176B1370
-F55873EE
-553FEEF0
-9F3AB09
-2539B92E
-F6803BC
-BAA192FB
-DBB0AD5A
-B9C5415
-F92D0588
-88B9E738
-A033C767
-A1CA1EFF
-5AC07200
-AC60C03D
-17FE20F9
-B898B9AC
-51AF425E
-2706FC42
-F2A258E7
-353652D7
-CF3F89EE
-63A13050
-5E6A7997
-153FD92F
-1D0E8614
-6E504447
-5AAEC133
-9B6E5499
-64D5EAE6
-A29CFBAB
-52B44B68
-8DC7C01A
-704EB2F1
-395F1F7
-7D897418
-2FC66846
-ECCE81AE
-21CD8E31
-B2EFA3D4
-16C4CD41
-D6A21ED0
-944897F9
-F495D730
-B4317C3C
-8C074582
-22F6A9D9
-CE4425FB
-FB08BCBA
-DF07A006
-293AD5BA
-BD224A44
-9DA6701B
-DAB46DE4
-9F88773B
-57CC02C7
-7A6B68E4
-55A54D48
-BCFC1C53
-DF64F920
-A9FE6014
-4C64DB55
-5FE9345F
-412A1E48
-45D41945
-23B44D08
-8D5563A2
-26E5E437
-CECDF4D0
-1BE55025
-84329F92
-37C97F8F
-C3CDE976
-580955A
-C79E1131
-C5BC58E7
-7D14509B
-3DE94089
-1B78FE71
-49A0ECD9
-501D09B1
-F30135CD
-B0FA41B4
-33B11313
-32AB01B
-635EBA76
-666D7FE5
-68CCC93
-59B0ADA5
-B305CBAA
-1C553509
-5E564F7C
-F057084C
-52811FC8
-987465B2
-461DA750
-F0C471BB
-3C9D3E64
-73C920AF
-355A26B9
-3A1FDD13
-CEA3F7DD
-66C0687
-1319291
-9045182D
-174C724D
-2A491012
-BA53519F
-A62B41D8
-F6E1559E
-25F93E6F
-2A40C5F4
-C63D1AC2
-82598002
-2B81101A
-63442848
-3788BB2D
-74DDC016
-214CE0F4
-9CBAA8BD
-9288E1AC
-EF76E528
-719E7BAE
-BD579EF6
-4E6B0C62
-6285F757
-9049BDA3
-80BFE3C1
-4344B7A7
-4552F1DD
-DE2C0DAC
-86346BE2
-A0A897E7
-1797D93
-6CF3C7F0
-7592D9E7
-CFB46F1E
-17D6FF93
-87FF1727
-198FC755
-303540EF
-78C07416
-46CB391E
-8D441653
-3724DA3C
-860D4DDF
-A99F046E
-4B167D86
-E2AFCBE9
-6608F2D2
-4E49A130
-3C64B760
-958BCEB3
-8C784B24
-5E07EF07
-7E6CAC6A
-B69765D8
-65897B6D
-60A8FB7D
-6706E0E1
-142E4310
-15C4944C
-F6A075AD
-3CF66DF8
-CE1EFE72
-D6495864
-2BDEFA6B
-9E511045
-F2E2E9A7
-B71B03EB
-15DD8D69
-65E5A555
-52C644AE
-301A8F69
-35075232
-17ADE8C4
-A2C808CC
-F1A4C57B
-D6EE3EF3
-85942F72
-26011F23
-D4211E97
-595E1A12
-6886CE0
-FBD6F396
-D10BD980
-6615476D
-4662EB8F
-F80BE955
-93A6E68E
-4C3D4CAA
-5838D0CB
-756FB6E4
-F0BC8312
-EB89BE83
-D34E119E
-34F860EC
-F371DC73
-BB166E0D
-CE86AF89
-C177E633
-A19C1D9B
-B1DCBF1B
-D7310057
-2452939E
-120A830
-F92A9928
-64877B92
-3D69A585
-178187B6
-146C0495
-9A3D8886
-C79478AD
-9A429976
-29795A97
-32BD0034
-1EE08CD
-8982284A
-ED362AC4
-4A1AC734
-6FD164B3
-422ADEBA
-9374B593
-BBFA8568
-1C0B26A5
-5DF68365
-CFA1D689
-1C9509C2
-1056EAC4
-D492D000
-64076487
-2C1FB65B
-9E1DEBC7
-C5AECD05
-39652664
-57A1B9F9
-3652484
-E8CCF72B
-CB7EC405
-7DA97E78
-7ACE1B2C
-A5DC0B75
-40C14422
-777B17AF
-5AA3FEDF
-319C2B1C
-AB8EEE5F
-159D66E5
-3E479D0
-12AF93DE
-55EA550A
-38853E1F
-FB943864
-781FA52E
-4FB9C9FA
-377D8866
-8411E296
-641D997F
-1933684F
-27A62DEF
-50E15F68
-755BCD7C
-5DF3466F
-494A937C
-8763C6BD
-C04B98E0
-E9E067FF
-444151AB
-C5FC7398
-5EC7D30E
-E0610B7E
-76CEBB5
-B15D9821
-37B2D1E2
-CC1249BF
-3E064388
-246B17B3
-4A342228
-529E849B
-F25F250D
-31F3E925
-D1112DCA
-DA6A8BC9
-2A7789D8
-C0C2C72D
-4BB23226
-68166638
-4EC7519F
-D559B4B7
-8035E823
-DFB06DE0
-2B4B86
-83D6F12F
-84AC7F7B
-7139E98B
-C42D8AE3
-2992AD9C
-E1E24DA1
-838772BD
-CA28D517
-3606947F
-B9FDFA59
-6C4F8489
-76DBFFD4
-3F0BFDF6
-1B04AD1B
-8BA40134
-842A54F6
-621A0DFE
-1F3729FC
-C53AFEFE
-CD5F1E79
-D2C0C70
-30A4FF4F
-D384C76
-D73B9B17
-C74DC3F9
-E5ACD113
-901E6D5D
-D376A71F
-57BA08F9
-17E25669
-F7485021
-BCD1B9C5
-90C1A916
-EEF9DE6E
-6AD37907
-40B05A7B
-4A56C1D
-901093E1
-5424EEE9
-3336300D
-8B1767F3
-707A4B23
-37290194
-13A5E016
-C25902C0
-5C04C3AE
-B7D84F4D
-D57A495F
-EE168042
-1584DB78
-7DBFDBD3
-DBE2218D
-9EED8CD4
-2A562C0F
-C76F7E04
-8FCA82B8
-7211C54F
-8E76E82C
-9BAF59A6
-C1E7B9CE
-28E9E29F
-6746FB40
-7841DDA1
-37D07C7
-88A5CF5
-4B0B8A4E
diff --git a/finn-rtllib/memstream/sim/memstream_tb.sv b/finn-rtllib/memstream/sim/memstream_tb.sv
new file mode 100644
index 0000000000..4b2e850415
--- /dev/null
+++ b/finn-rtllib/memstream/sim/memstream_tb.sv
@@ -0,0 +1,212 @@
+/**
+ * Copyright (c) 2023, Xilinx
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * * Neither the name of FINN nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @author Thomas B. Preußer
+ */
+
+module memstream_tb;  // Self-checking testbench: loads memstream with data == address, then checks streaming and config readback.
+ localparam int unsigned DEPTH = 256;  // number of words written, streamed and read back per pass
+ localparam int unsigned DATA_WIDTH = 32;  // width of config_d0, config_q0 and odat
+
+ // Global Control: free-running clock and the reset driven by the stimulus below
+ logic clk = 0;
+ always #5ns clk = !clk;  // toggles every 5 ns, i.e. a 10 ns clock period
+ logic rst;
+
+ // Configuration Interface (request signals driven here; config_rack and config_q0 come from the DUT)
+ logic [31:0] config_address;
+ logic config_ce;
+ logic config_we;
+ logic [DATA_WIDTH-1:0] config_d0;
+ uwire config_rack;
+ uwire [DATA_WIDTH-1:0] config_q0;
+
+ // Streamed Output (ordy driven here; ovld and odat come from the DUT)
+ logic ordy;
+ uwire ovld;
+ uwire [DATA_WIDTH-1:0] odat;
+
+ initial begin  // single stimulus-and-check thread; ends the simulation with $finish
+ config_address = 'x;  // idle state: address and data left as don't-care, enables low
+ config_ce = 0;
+ config_we = 0;
+ config_d0 = 'x;
+
+ ordy = 0;
+
+ rst = 1;  // hold reset across 16 rising clock edges
+ repeat(16) @(posedge clk);
+ rst <= 0;
+
+ // Write Parameters: one write per clock, storing the value i at address i
+ config_ce <= 1;
+ config_we <= 1;
+ for(int unsigned i = 0; i < DEPTH; i++) begin
+ config_address <= i;
+ config_d0 <= i;
+ @(posedge clk);
+ end
+ config_address <= 'x;
+ config_ce <= 0;
+ config_we <= 0;
+ config_d0 <= 'x;
+
+ rst <= 1;  // one-cycle reset pulse after loading -- NOTE(review): presumably restarts the stream at address 0; confirm against memstream
+ @(posedge clk);
+ rst <= 0;
+
+ // One Round of Stream Read: ordy held high, expect the values 0..DEPTH-1 in order
+ ordy <= 1;
+ for(int unsigned i = 0; i < DEPTH; i++) begin
+ @(posedge clk iff ovld);  // advance to the next rising edge at which ovld is high
+ assert(odat == i) else begin
+ $error("Unexpected output: %0d instead of %0d", odat, i);
+ $stop;  // halt at the first mismatch
+ end
+ end
+ ordy <= 0;
+
+ // Full Parameter Readback: one read per clock over all addresses, checking replies as they arrive
+ if(1) begin  // always-taken block; gives the local queue its own scope
+ automatic logic [DATA_WIDTH-1:0] Q[$] = {};  // expected values of reads issued but not yet answered
+
+ config_ce <= 1;
+ for(int unsigned i = 0; i < DEPTH; i++) begin
+ config_address <= i;
+ @(posedge clk);
+ Q.push_back(i);  // the read of address i is expected to return i
+
+ if(config_rack) begin  // a reply is present: it must match the oldest outstanding read
+ automatic logic [DATA_WIDTH-1:0] exp = Q.pop_front();
+ assert(config_q0 == exp) else begin
+ $error("Readback mismatch: %0d instead of %0d", config_q0, exp);
+ $stop;
+ end
+ end
+ end
+ config_address <= 'x;
+ config_ce <= 0;
+
+ while(Q.size) begin  // drain: wait for the replies to the reads still outstanding
+ automatic logic [DATA_WIDTH-1:0] exp = Q.pop_front();
+
+ @(posedge clk iff config_rack);
+ assert(config_q0 == exp) else begin
+ $error("Readback mismatch: %0d instead of %0d", config_q0, exp);
+ $stop;
+ end
+ end
+ end
+
+ repeat(6) @(posedge clk);  // six idle cycles before streaming again
+
+ // Another Round of Stream Read: after the readback traffic the output is again checked against 0..DEPTH-1
+ ordy <= 1;
+ for(int unsigned i = 0; i < DEPTH; i++) begin
+ @(posedge clk iff ovld);
+ assert(odat == i) else begin
+ $error("Unexpected output: %0d instead of %0d", odat, i);
+ $stop;
+ end
+ end
+ ordy <= 0;
+
+ // A Round of Stream Read with intermittent Read Backs (randomized readiness and random config reads)
+ if(1) begin  // always-taken block; gives the local queue its own scope
+ automatic logic [DATA_WIDTH-1:0] Q[$] = {};  // expected values of reads issued but not yet answered
+
+ for(int unsigned i = 0; i < DEPTH; i++) begin
+ do begin  // one iteration per clock cycle until word i has been taken
+ // Randomly delayed Readiness
+ if($urandom()%5 != 0) ordy <= 1;  // raised in about 4 of 5 cycles; otherwise left unchanged
+
+ // Issue and Check Random Read Backs
+ if($urandom()%9 == 0) begin  // about 1 in 9 cycles issues a read of a random address
+ automatic int unsigned addr = $urandom() % DEPTH;
+ config_ce <= 1;
+ config_address <= addr;
+ Q.push_back(addr);  // memory was loaded with data == address, so addr is the expected reply
+ end
+ @(posedge clk);
+ config_ce <= 0;  // back to idle unless the next iteration issues another read
+ config_address <= 'x;
+
+ if(config_rack) begin  // a reply is present: it must match the oldest outstanding read
+ automatic logic [DATA_WIDTH-1:0] exp = Q.pop_front();
+ assert(config_q0 == exp) else begin
+ $error("Readback mismatch: %0d instead of %0d", config_q0, exp);
+ $stop;
+ end
+ end
+
+ end while(!ovld || !ordy);  // repeat until an edge at which both ovld and ordy were high
+ ordy <= 0;  // drop readiness after each word so it is randomized again
+
+ assert(odat == i) else begin
+ $error("Unexpected output: %0d instead of %0d", odat, i);
+ $stop;
+ end
+ end
+
+ while(Q.size) begin  // drain: wait for the replies to the reads still outstanding
+ automatic logic [DATA_WIDTH-1:0] exp = Q.pop_front();
+
+ @(posedge clk iff config_rack);
+ assert(config_q0 == exp) else begin
+ $error("Readback mismatch: %0d instead of %0d", config_q0, exp);
+ $stop;
+ end
+ end
+ end
+ ordy <= 0;
+
+ repeat(2) @(posedge clk);  // two trailing cycles before reporting completion
+ $display("Test completed.");
+ $finish;
+ end
+
+ memstream #(  // device under test
+ .DEPTH(DEPTH),
+ .WIDTH(DATA_WIDTH)
+ ) dut (
+ .clk, .rst,  // implicit named connections: each port binds to the same-named signal above
+
+ .config_address,
+ .config_ce,
+ .config_we,
+ .config_d0,
+ .config_q0,
+ .config_rack,
+
+ .ordy,
+ .ovld,
+ .odat
+ );
+
+endmodule : memstream_tb
diff --git a/finn-rtllib/memstream/sim/tb_memstream.v b/finn-rtllib/memstream/sim/tb_memstream.v
deleted file mode 100644
index ad3efad5bd..0000000000
--- a/finn-rtllib/memstream/sim/tb_memstream.v
+++ /dev/null
@@ -1,369 +0,0 @@
-/*
- Copyright (c) 2020, Xilinx
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
- list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
-
- * Neither the name of FINN nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-`timescale 1ns/10ps
-
-module tb_memstream;
-
-//parameters to enable/disable axi-mm, set number of streams, set readmemh for memory, set per-stream offsets in memory, set per-stream widths
-parameter CONFIG_EN = 1;
-parameter NSTREAMS = 4;//1 up to 6
-
-parameter MEM_DEPTH = 9216;
-parameter MEM_WIDTH = 32;
-parameter MEM_INIT = "./";
-parameter MEM_CHECK = "golden.dat";
-
-//widths per stream
-parameter STRM0_WIDTH = 32;
-parameter STRM1_WIDTH = 32;
-parameter STRM2_WIDTH = 32;
-parameter STRM3_WIDTH = 32;
-parameter STRM4_WIDTH = 1;
-parameter STRM5_WIDTH = 1;
-
-//depths per stream
-parameter STRM0_DEPTH = 2304;
-parameter STRM1_DEPTH = 2304;
-parameter STRM2_DEPTH = 2304;
-parameter STRM3_DEPTH = 2304;
-parameter STRM4_DEPTH = 1;
-parameter STRM5_DEPTH = 1;
-
-//offsets for each stream
-parameter STRM0_OFFSET = 0;
-parameter STRM1_OFFSET = 2304;
-parameter STRM2_OFFSET = 4608;
-parameter STRM3_OFFSET = 6912;
-parameter STRM4_OFFSET = 0;
-parameter STRM5_OFFSET = 0;
-
-
-reg clk;
-reg rst;
-
-reg [31:0] config_address = 0;
-reg config_ce = 0;
-reg config_we = 0;
-reg [31:0] config_d0 = 0;
-wire [31:0] config_q0;
-
-//multiple wire AXI Streams
-reg m_axis_0_afull;
-reg m_axis_0_tready;
-wire m_axis_0_tvalid;
-wire [STRM0_WIDTH-1:0] m_axis_0_tdata;
-
-reg m_axis_1_afull;
-reg m_axis_1_tready;
-wire m_axis_1_tvalid;
-wire [STRM1_WIDTH-1:0] m_axis_1_tdata;
-
-reg m_axis_2_afull;
-reg m_axis_2_tready;
-wire m_axis_2_tvalid;
-wire [STRM2_WIDTH-1:0] m_axis_2_tdata;
-
-reg m_axis_3_afull;
-reg m_axis_3_tready;
-wire m_axis_3_tvalid;
-wire [STRM3_WIDTH-1:0] m_axis_3_tdata;
-
-reg m_axis_4_afull;
-reg m_axis_4_tready;
-wire m_axis_4_tvalid;
-wire [STRM4_WIDTH-1:0] m_axis_4_tdata;
-
-reg m_axis_5_afull;
-reg m_axis_5_tready;
-wire m_axis_5_tvalid;
-wire [STRM5_WIDTH-1:0] m_axis_5_tdata;
-
-reg [MEM_WIDTH-1:0] golden[MEM_DEPTH-1:0];
-integer ptr0, ptr1, ptr2, ptr3, ptr4, ptr5;
-integer done = 0;
-reg [5:0] rng;
-
-//clock
-initial begin
- clk = 0;
- forever #5 clk = ~clk;
-end
-
-initial begin
- rst = 1;
- config_ce = 0;
- m_axis_0_afull = 0;
- m_axis_1_afull = 0;
- m_axis_2_afull = 0;
- m_axis_3_afull = 0;
- m_axis_4_afull = 0;
- m_axis_5_afull = 0;
- m_axis_0_tready = 1;
- m_axis_1_tready = 1;
- m_axis_2_tready = 1;
- m_axis_3_tready = 1;
- m_axis_4_tready = 1;
- m_axis_5_tready = 1;
- repeat(100) @(negedge clk);
- rst = 0;
- #100
- fork
- begin
- $display("Starting to generate random AFULL");
- while(~done) begin
- rng = $random;
- m_axis_0_afull = rng[0];
- m_axis_1_afull = rng[1];
- m_axis_2_afull = rng[2];
- m_axis_3_afull = rng[3];
- m_axis_4_afull = rng[4];
- m_axis_5_afull = rng[5];
- @(negedge clk);
- end
- end
- join
-end
-
-
-//DUT
-memstream
-#(
- CONFIG_EN,
- NSTREAMS,
- MEM_DEPTH,
- MEM_WIDTH,
- MEM_INIT,
-
- //widths per stream
- STRM0_WIDTH,
- STRM1_WIDTH,
- STRM2_WIDTH,
- STRM3_WIDTH,
- STRM4_WIDTH,
- STRM5_WIDTH,
-
- //depths per stream
- STRM0_DEPTH,
- STRM1_DEPTH,
- STRM2_DEPTH,
- STRM3_DEPTH,
- STRM4_DEPTH,
- STRM5_DEPTH,
-
- //offsets for each stream
- STRM0_OFFSET,
- STRM1_OFFSET,
- STRM2_OFFSET,
- STRM3_OFFSET,
- STRM4_OFFSET,
- STRM5_OFFSET
-)
-dut
-(
- clk,
- ~rst,
-
- //optional AXI-Lite interface
- config_address,
- config_ce,
- config_we,
- config_d0,
- config_q0,
-
- //multiple output AXI Streams
- m_axis_0_afull,
- m_axis_0_tready,
- m_axis_0_tvalid,
- m_axis_0_tdata,
-
- m_axis_1_afull,
- m_axis_1_tready,
- m_axis_1_tvalid,
- m_axis_1_tdata,
-
- m_axis_2_afull,
- m_axis_2_tready,
- m_axis_2_tvalid,
- m_axis_2_tdata,
-
- m_axis_3_afull,
- m_axis_3_tready,
- m_axis_3_tvalid,
- m_axis_3_tdata,
-
- m_axis_4_afull,
- m_axis_4_tready,
- m_axis_4_tvalid,
- m_axis_4_tdata,
-
- m_axis_5_afull,
- m_axis_5_tready,
- m_axis_5_tvalid,
- m_axis_5_tdata
-
-
-);
-
-//stream checkers
-initial begin
- ptr0 = STRM0_OFFSET;
- ptr1 = STRM1_OFFSET;
- ptr2 = STRM2_OFFSET;
- ptr3 = STRM3_OFFSET;
- ptr4 = STRM4_OFFSET;
- ptr5 = STRM5_OFFSET;
- fork
- //check stream 0
- begin
- $display("Starting stream 0 checker");
- while(~done & (NSTREAMS > 0)) begin
- @(negedge clk);
- if(m_axis_0_tvalid) begin
- if(m_axis_0_tdata != golden[ptr0]) begin
- $display("Mismatch on stream 0");
- $stop();
- end
- //increment pointer
- ptr0 = ptr0 + 1;
- //rewind pointer if it's reached end
- if(ptr0 == (STRM0_OFFSET + STRM0_DEPTH))
- ptr0 = STRM0_OFFSET;
- end
- end
- end
- //check stream 1
- begin
- $display("Starting stream 1 checker");
- while(~done & (NSTREAMS > 1)) begin
- @(negedge clk);
- if(m_axis_1_tvalid) begin
- if(m_axis_1_tdata != golden[ptr1]) begin
- $display("Mismatch on stream 1");
- $stop();
- end
- //increment pointer
- ptr1 = ptr1 + 1;
- //rewind pointer if it's reached end
- if(ptr1 == (STRM1_OFFSET + STRM1_DEPTH))
- ptr1 = STRM1_OFFSET;
- end
- end
- end
-
- //check stream 2
- begin
- $display("Starting stream 2 checker");
- while(~done & (NSTREAMS > 2)) begin
- @(negedge clk);
- if(m_axis_2_tvalid) begin
- if(m_axis_2_tdata != golden[ptr2]) begin
- $display("Mismatch on stream 2");
- $stop();
- end
- //increment pointer
- ptr2 = ptr2 + 1;
- //rewind pointer if it's reached end
- if(ptr2 == (STRM2_OFFSET + STRM2_DEPTH))
- ptr2 = STRM2_OFFSET;
- end
- end
- end
- //check stream 3
- begin
- $display("Starting stream 3 checker");
- while(~done & (NSTREAMS > 3)) begin
- @(negedge clk);
- if(m_axis_3_tvalid) begin
- if(m_axis_3_tdata != golden[ptr3]) begin
- $display("Mismatch on stream 3");
- $stop();
- end
- //increment pointer
- ptr3 = ptr3 + 1;
- //rewind pointer if it's reached end
- if(ptr3 == (STRM3_OFFSET + STRM3_DEPTH))
- ptr3 = STRM3_OFFSET;
- end
- end
- end
- //check stream 4
- begin
- $display("Starting stream 4 checker");
- while(~done & (NSTREAMS > 4)) begin
- @(negedge clk);
- if(m_axis_4_tvalid) begin
- if(m_axis_4_tdata != golden[ptr4]) begin
- $display("Mismatch on stream 4");
- $stop();
- end
- //increment pointer
- ptr4 = ptr4 + 1;
- //rewind pointer if it's reached end
- if(ptr4 == (STRM4_OFFSET + STRM4_DEPTH))
- ptr4 = STRM4_OFFSET;
- end
- end
- end
- //check stream 5
- begin
- $display("Starting stream 5 checker");
- while(~done & (NSTREAMS > 5)) begin
- @(negedge clk);
- if(m_axis_5_tvalid) begin
- if(m_axis_5_tdata != golden[ptr5]) begin
- $display("Mismatch on stream 5");
- $stop();
- end
- //increment pointer
- ptr5 = ptr5 + 1;
- //rewind pointer if it's reached end
- if(ptr5 == (STRM5_OFFSET + STRM5_DEPTH))
- ptr5 = STRM5_OFFSET;
- end
- end
- end
- join
-end
-
-initial begin
- done = 0;
- $readmemh(MEM_CHECK,golden);
-// $dumpfile("wave.vcd");
-// $dumpvars(0,tb_memstream);
- @(negedge rst);
- #10000000
- $display("Test done!");
- done = 1;
- #1000
- $finish();
-end
-
-endmodule
diff --git a/finn-rtllib/memstream/sim/tb_memstream_writes.v b/finn-rtllib/memstream/sim/tb_memstream_writes.v
deleted file mode 100644
index c66807454b..0000000000
--- a/finn-rtllib/memstream/sim/tb_memstream_writes.v
+++ /dev/null
@@ -1,486 +0,0 @@
-/*
- Copyright (c) 2020, Xilinx
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
-
- * Redistributions of source code must retain the above copyright notice, this
- list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright notice,
- this list of conditions and the following disclaimer in the documentation
- and/or other materials provided with the distribution.
-
- * Neither the name of FINN nor the names of its
- contributors may be used to endorse or promote products derived from
- this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
- AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
- FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
- SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
- CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-`timescale 1ns/10ps
-
-module tb_memstream_writes;
-
-//parameters to enable/disable axi-mm, set number of streams, set readmemh for memory, set per-stream offsets in memory, set per-stream widths
-parameter CONFIG_EN = 1;
-parameter NSTREAMS = 2;//1 up to 6
-
-parameter MEM_DEPTH = 40;
-parameter MEM_WIDTH = 70;
-
-//widths per stream
-parameter STRM0_WIDTH = 70;
-parameter STRM1_WIDTH = 32;
-parameter STRM2_WIDTH = 32;
-parameter STRM3_WIDTH = 32;
-parameter STRM4_WIDTH = 1;
-parameter STRM5_WIDTH = 1;
-
-//depths per stream
-parameter STRM0_DEPTH = 20;
-parameter STRM1_DEPTH = 20;
-parameter STRM2_DEPTH = 2304;
-parameter STRM3_DEPTH = 2304;
-parameter STRM4_DEPTH = 1;
-parameter STRM5_DEPTH = 1;
-
-//offsets for each stream
-parameter STRM0_OFFSET = 0;
-parameter STRM1_OFFSET = 20;
-parameter STRM2_OFFSET = 4608;
-parameter STRM3_OFFSET = 6912;
-parameter STRM4_OFFSET = 0;
-parameter STRM5_OFFSET = 0;
-
-
-reg clk;
-reg rst;
-
-wire awready;
-reg awvalid;
-reg [31:0] awaddr;
-reg [2:0] awprot;
-//write data
-wire wready;
-reg wvalid;
-reg [31:0] wdata;
-reg [3:0] wstrb;
-//burst response
-reg bready;
-wire bvalid;
-wire [1:0] bresp;
-
-//Read channels
-//read address
-wire arready;
-reg arvalid;
-reg [31:0] araddr;
-reg [2:0] arprot;
-//read data
-reg rready;
-wire rvalid;
-wire [1:0] rresp;
-wire [31:0] rdata;
-
-//multiple wire AXI Streams
-reg m_axis_0_afull;
-reg m_axis_0_tready;
-wire m_axis_0_tvalid;
-wire [STRM0_WIDTH-1:0] m_axis_0_tdata;
-
-reg m_axis_1_afull;
-reg m_axis_1_tready;
-wire m_axis_1_tvalid;
-wire [STRM1_WIDTH-1:0] m_axis_1_tdata;
-
-reg m_axis_2_afull;
-reg m_axis_2_tready;
-wire m_axis_2_tvalid;
-wire [STRM2_WIDTH-1:0] m_axis_2_tdata;
-
-reg m_axis_3_afull;
-reg m_axis_3_tready;
-wire m_axis_3_tvalid;
-wire [STRM3_WIDTH-1:0] m_axis_3_tdata;
-
-reg m_axis_4_afull;
-reg m_axis_4_tready;
-wire m_axis_4_tvalid;
-wire [STRM4_WIDTH-1:0] m_axis_4_tdata;
-
-reg m_axis_5_afull;
-reg m_axis_5_tready;
-wire m_axis_5_tvalid;
-wire [STRM5_WIDTH-1:0] m_axis_5_tdata;
-
-reg [MEM_WIDTH-1:0] golden[MEM_DEPTH-1:0];
-reg [MEM_WIDTH-1:0] gword;
-integer ptr0, ptr1, ptr2, ptr3, ptr4, ptr5;
-integer done = 0;
-integer i, j;
-reg [5:0] rng;
-
-parameter NFOLDS_PER_WORD = (MEM_WIDTH+31)/32;
-
-task axi_write;
- input [MEM_WIDTH-1:0] data;
- input [31:0] adr;
- begin
- for(j=0; j<(1<<$clog2(NFOLDS_PER_WORD)); j=j+1) begin
- @(negedge clk);
- awvalid = 1;
- wvalid = 1;
- wdata = data>>(j*32);
- awaddr = (adr*(1<<$clog2(NFOLDS_PER_WORD))+j)*4;
- fork
- begin
- @(posedge awready);
- @(posedge clk) awvalid = 0;
- end
- begin
- @(posedge wready);
- @(posedge clk) wvalid = 0;
- end
- join
- @(posedge clk);
- end
- end
-endtask
-
-task axi_read;
- input [31:0] adr;
- output [MEM_WIDTH-1:0] data;
- begin
- data = 0;
- for(j=0; j 0)) begin
- @(negedge clk);
- if(m_axis_0_tvalid & m_axis_0_tready) begin
- if(m_axis_0_tdata != golden[ptr0]) begin
- $display("Mismatch on stream 0");
- $stop();
- end
- //increment pointer
- ptr0 = ptr0 + 1;
- //rewind pointer if it's reached end
- if(ptr0 == (STRM0_OFFSET + STRM0_DEPTH))
- ptr0 = STRM0_OFFSET;
- end
- end
- end
- //check stream 1
- begin
- $display("Starting stream 1 checker");
- while(~done & (NSTREAMS > 1)) begin
- @(negedge clk);
- if(m_axis_1_tvalid & m_axis_1_tready) begin
- if(m_axis_1_tdata != golden[ptr1]) begin
- $display("Mismatch on stream 1");
- $stop();
- end
- //increment pointer
- ptr1 = ptr1 + 1;
- //rewind pointer if it's reached end
- if(ptr1 == (STRM1_OFFSET + STRM1_DEPTH))
- ptr1 = STRM1_OFFSET;
- end
- end
- end
- //check stream 2
- begin
- $display("Starting stream 2 checker");
- while(~done & (NSTREAMS > 2)) begin
- @(negedge clk);
- if(m_axis_2_tvalid & m_axis_2_tready) begin
- if(m_axis_2_tdata != golden[ptr2]) begin
- $display("Mismatch on stream 2");
- $stop();
- end
- //increment pointer
- ptr2 = ptr2 + 1;
- //rewind pointer if it's reached end
- if(ptr2 == (STRM2_OFFSET + STRM2_DEPTH))
- ptr2 = STRM2_OFFSET;
- end
- end
- end
- //check stream 3
- begin
- $display("Starting stream 3 checker");
- while(~done & (NSTREAMS > 3)) begin
- @(negedge clk);
- if(m_axis_3_tvalid & m_axis_3_tready) begin
- if(m_axis_3_tdata != golden[ptr3]) begin
- $display("Mismatch on stream 3");
- $stop();
- end
- //increment pointer
- ptr3 = ptr3 + 1;
- //rewind pointer if it's reached end
- if(ptr3 == (STRM3_OFFSET + STRM3_DEPTH))
- ptr3 = STRM3_OFFSET;
- end
- end
- end
- //check stream 4
- begin
- $display("Starting stream 4 checker");
- while(~done & (NSTREAMS > 4)) begin
- @(negedge clk);
- if(m_axis_4_tvalid & m_axis_4_tready) begin
- if(m_axis_4_tdata != golden[ptr4]) begin
- $display("Mismatch on stream 4");
- $stop();
- end
- //increment pointer
- ptr4 = ptr4 + 1;
- //rewind pointer if it's reached end
- if(ptr4 == (STRM4_OFFSET + STRM4_DEPTH))
- ptr4 = STRM4_OFFSET;
- end
- end
- end
- //check stream 5
- begin
- $display("Starting stream 5 checker");
- while(~done & (NSTREAMS > 5)) begin
- @(negedge clk);
- if(m_axis_5_tvalid & m_axis_5_tready) begin
- if(m_axis_5_tdata != golden[ptr5]) begin
- $display("Mismatch on stream 5");
- $stop();
- end
- //increment pointer
- ptr5 = ptr5 + 1;
- //rewind pointer if it's reached end
- if(ptr5 == (STRM5_OFFSET + STRM5_DEPTH))
- ptr5 = STRM5_OFFSET;
- end
- end
- end
- join
-end
-
-initial begin
- done = 0;
- @(negedge rst);
- $dumpfile("wave.vcd");
- $dumpvars(0,tb_memstream_writes);
- #50000
- $display("Test done!");
- done = 1;
- #1000
- $finish();
-end
-
-endmodule
diff --git a/finn-rtllib/memstream/xgui/memstream_v1_0.tcl b/finn-rtllib/memstream/xgui/memstream_v1_0.tcl
index 87565bc561..e802d81c79 100644
--- a/finn-rtllib/memstream/xgui/memstream_v1_0.tcl
+++ b/finn-rtllib/memstream/xgui/memstream_v1_0.tcl
@@ -8,42 +8,21 @@ proc init_gui { IPINST } {
#Adding Page
set Page_0 [ipgui::add_page $IPINST -name "Page 0"]
ipgui::add_param $IPINST -name "AXILITE_ADDR_WIDTH" -parent ${Page_0}
- ipgui::add_param $IPINST -name "CONFIG_EN" -parent ${Page_0}
- ipgui::add_param $IPINST -name "MEM_DEPTH" -parent ${Page_0}
- ipgui::add_param $IPINST -name "MEM_INIT" -parent ${Page_0}
- ipgui::add_param $IPINST -name "MEM_WIDTH" -parent ${Page_0}
- ipgui::add_param $IPINST -name "NSTREAMS" -parent ${Page_0}
- ipgui::add_param $IPINST -name "RAM_STYLE" -parent ${Page_0} -widget comboBox
- ipgui::add_param $IPINST -name "STRM0_DEPTH" -parent ${Page_0}
- ipgui::add_param $IPINST -name "STRM0_OFFSET" -parent ${Page_0}
- ipgui::add_param $IPINST -name "STRM0_WIDTH" -parent ${Page_0}
- ipgui::add_param $IPINST -name "STRM1_DEPTH" -parent ${Page_0}
- ipgui::add_param $IPINST -name "STRM1_OFFSET" -parent ${Page_0}
- ipgui::add_param $IPINST -name "STRM1_WIDTH" -parent ${Page_0}
- ipgui::add_param $IPINST -name "STRM2_DEPTH" -parent ${Page_0}
- ipgui::add_param $IPINST -name "STRM2_OFFSET" -parent ${Page_0}
- ipgui::add_param $IPINST -name "STRM2_WIDTH" -parent ${Page_0}
- ipgui::add_param $IPINST -name "STRM3_DEPTH" -parent ${Page_0}
- ipgui::add_param $IPINST -name "STRM3_OFFSET" -parent ${Page_0}
- ipgui::add_param $IPINST -name "STRM3_WIDTH" -parent ${Page_0}
- ipgui::add_param $IPINST -name "STRM4_DEPTH" -parent ${Page_0}
- ipgui::add_param $IPINST -name "STRM4_OFFSET" -parent ${Page_0}
- ipgui::add_param $IPINST -name "STRM4_WIDTH" -parent ${Page_0}
- ipgui::add_param $IPINST -name "STRM5_DEPTH" -parent ${Page_0}
- ipgui::add_param $IPINST -name "STRM5_OFFSET" -parent ${Page_0}
- ipgui::add_param $IPINST -name "STRM5_WIDTH" -parent ${Page_0}
-
-
+ ipgui::add_param $IPINST -name "DEPTH" -parent ${Page_0}
+ ipgui::add_param $IPINST -name "INIT_FILE" -parent ${Page_0}
+ ipgui::add_param $IPINST -name "RAM_STYLE" -parent ${Page_0}
+ ipgui::add_param $IPINST -name "WIDTH" -parent ${Page_0}
}
-proc update_PARAM_VALUE.AXILITE_ADDR_WIDTH { PARAM_VALUE.AXILITE_ADDR_WIDTH PARAM_VALUE.MEM_DEPTH PARAM_VALUE.MEM_WIDTH } {
+proc update_PARAM_VALUE.AXILITE_ADDR_WIDTH { PARAM_VALUE.AXILITE_ADDR_WIDTH PARAM_VALUE.DEPTH PARAM_VALUE.WIDTH } {
# Procedure called to update AXILITE_ADDR_WIDTH when any of the dependent parameters in the arguments change
+
set AXILITE_ADDR_WIDTH ${PARAM_VALUE.AXILITE_ADDR_WIDTH}
- set MEM_DEPTH ${PARAM_VALUE.MEM_DEPTH}
- set MEM_WIDTH ${PARAM_VALUE.MEM_WIDTH}
- set values(MEM_DEPTH) [get_property value $MEM_DEPTH]
- set values(MEM_WIDTH) [get_property value $MEM_WIDTH]
- set_property value [gen_USERPARAMETER_AXILITE_ADDR_WIDTH_VALUE $values(MEM_DEPTH) $values(MEM_WIDTH)] $AXILITE_ADDR_WIDTH
+ set DEPTH ${PARAM_VALUE.DEPTH}
+ set WIDTH ${PARAM_VALUE.WIDTH}
+ set values(DEPTH) [get_property value $DEPTH]
+ set values(WIDTH) [get_property value $WIDTH]
+ set_property value [gen_USERPARAMETER_AXILITE_ADDR_WIDTH_VALUE $values(DEPTH) $values(WIDTH)] $AXILITE_ADDR_WIDTH
}
proc validate_PARAM_VALUE.AXILITE_ADDR_WIDTH { PARAM_VALUE.AXILITE_ADDR_WIDTH } {
@@ -51,48 +30,21 @@ proc validate_PARAM_VALUE.AXILITE_ADDR_WIDTH { PARAM_VALUE.AXILITE_ADDR_WIDTH }
return true
}
-proc update_PARAM_VALUE.CONFIG_EN { PARAM_VALUE.CONFIG_EN } {
- # Procedure called to update CONFIG_EN when any of the dependent parameters in the arguments change
+proc update_PARAM_VALUE.DEPTH { PARAM_VALUE.DEPTH } {
+ # Procedure called to update DEPTH when any of the dependent parameters in the arguments change
}
-proc validate_PARAM_VALUE.CONFIG_EN { PARAM_VALUE.CONFIG_EN } {
- # Procedure called to validate CONFIG_EN
+proc validate_PARAM_VALUE.DEPTH { PARAM_VALUE.DEPTH } {
+ # Procedure called to validate DEPTH
return true
}
-proc update_PARAM_VALUE.MEM_DEPTH { PARAM_VALUE.MEM_DEPTH } {
- # Procedure called to update MEM_DEPTH when any of the dependent parameters in the arguments change
+proc update_PARAM_VALUE.INIT_FILE { PARAM_VALUE.INIT_FILE } {
+ # Procedure called to update INIT_FILE when any of the dependent parameters in the arguments change
}
-proc validate_PARAM_VALUE.MEM_DEPTH { PARAM_VALUE.MEM_DEPTH } {
- # Procedure called to validate MEM_DEPTH
- return true
-}
-
-proc update_PARAM_VALUE.MEM_INIT { PARAM_VALUE.MEM_INIT } {
- # Procedure called to update MEM_INIT when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.MEM_INIT { PARAM_VALUE.MEM_INIT } {
- # Procedure called to validate MEM_INIT
- return true
-}
-
-proc update_PARAM_VALUE.MEM_WIDTH { PARAM_VALUE.MEM_WIDTH } {
- # Procedure called to update MEM_WIDTH when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.MEM_WIDTH { PARAM_VALUE.MEM_WIDTH } {
- # Procedure called to validate MEM_WIDTH
- return true
-}
-
-proc update_PARAM_VALUE.NSTREAMS { PARAM_VALUE.NSTREAMS } {
- # Procedure called to update NSTREAMS when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.NSTREAMS { PARAM_VALUE.NSTREAMS } {
- # Procedure called to validate NSTREAMS
+proc validate_PARAM_VALUE.INIT_FILE { PARAM_VALUE.INIT_FILE } {
+ # Procedure called to validate INIT_FILE
return true
}
@@ -105,192 +57,29 @@ proc validate_PARAM_VALUE.RAM_STYLE { PARAM_VALUE.RAM_STYLE } {
return true
}
-proc update_PARAM_VALUE.STRM0_DEPTH { PARAM_VALUE.STRM0_DEPTH } {
- # Procedure called to update STRM0_DEPTH when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.STRM0_DEPTH { PARAM_VALUE.STRM0_DEPTH } {
- # Procedure called to validate STRM0_DEPTH
- return true
-}
-
-proc update_PARAM_VALUE.STRM0_OFFSET { PARAM_VALUE.STRM0_OFFSET } {
- # Procedure called to update STRM0_OFFSET when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.STRM0_OFFSET { PARAM_VALUE.STRM0_OFFSET } {
- # Procedure called to validate STRM0_OFFSET
- return true
-}
-
-proc update_PARAM_VALUE.STRM0_WIDTH { PARAM_VALUE.STRM0_WIDTH } {
- # Procedure called to update STRM0_WIDTH when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.STRM0_WIDTH { PARAM_VALUE.STRM0_WIDTH } {
- # Procedure called to validate STRM0_WIDTH
- return true
-}
-
-proc update_PARAM_VALUE.STRM1_DEPTH { PARAM_VALUE.STRM1_DEPTH } {
- # Procedure called to update STRM1_DEPTH when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.STRM1_DEPTH { PARAM_VALUE.STRM1_DEPTH } {
- # Procedure called to validate STRM1_DEPTH
- return true
-}
-
-proc update_PARAM_VALUE.STRM1_OFFSET { PARAM_VALUE.STRM1_OFFSET } {
- # Procedure called to update STRM1_OFFSET when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.STRM1_OFFSET { PARAM_VALUE.STRM1_OFFSET } {
- # Procedure called to validate STRM1_OFFSET
- return true
-}
-
-proc update_PARAM_VALUE.STRM1_WIDTH { PARAM_VALUE.STRM1_WIDTH } {
- # Procedure called to update STRM1_WIDTH when any of the dependent parameters in the arguments change
+proc update_PARAM_VALUE.WIDTH { PARAM_VALUE.WIDTH } {
+ # Procedure called to update WIDTH when any of the dependent parameters in the arguments change
}
-proc validate_PARAM_VALUE.STRM1_WIDTH { PARAM_VALUE.STRM1_WIDTH } {
- # Procedure called to validate STRM1_WIDTH
+proc validate_PARAM_VALUE.WIDTH { PARAM_VALUE.WIDTH } {
+ # Procedure called to validate WIDTH
return true
}
-proc update_PARAM_VALUE.STRM2_DEPTH { PARAM_VALUE.STRM2_DEPTH } {
- # Procedure called to update STRM2_DEPTH when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.STRM2_DEPTH { PARAM_VALUE.STRM2_DEPTH } {
- # Procedure called to validate STRM2_DEPTH
- return true
-}
-
-proc update_PARAM_VALUE.STRM2_OFFSET { PARAM_VALUE.STRM2_OFFSET } {
- # Procedure called to update STRM2_OFFSET when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.STRM2_OFFSET { PARAM_VALUE.STRM2_OFFSET } {
- # Procedure called to validate STRM2_OFFSET
- return true
-}
-
-proc update_PARAM_VALUE.STRM2_WIDTH { PARAM_VALUE.STRM2_WIDTH } {
- # Procedure called to update STRM2_WIDTH when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.STRM2_WIDTH { PARAM_VALUE.STRM2_WIDTH } {
- # Procedure called to validate STRM2_WIDTH
- return true
-}
-
-proc update_PARAM_VALUE.STRM3_DEPTH { PARAM_VALUE.STRM3_DEPTH } {
- # Procedure called to update STRM3_DEPTH when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.STRM3_DEPTH { PARAM_VALUE.STRM3_DEPTH } {
- # Procedure called to validate STRM3_DEPTH
- return true
-}
-
-proc update_PARAM_VALUE.STRM3_OFFSET { PARAM_VALUE.STRM3_OFFSET } {
- # Procedure called to update STRM3_OFFSET when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.STRM3_OFFSET { PARAM_VALUE.STRM3_OFFSET } {
- # Procedure called to validate STRM3_OFFSET
- return true
-}
-
-proc update_PARAM_VALUE.STRM3_WIDTH { PARAM_VALUE.STRM3_WIDTH } {
- # Procedure called to update STRM3_WIDTH when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.STRM3_WIDTH { PARAM_VALUE.STRM3_WIDTH } {
- # Procedure called to validate STRM3_WIDTH
- return true
-}
-
-proc update_PARAM_VALUE.STRM4_DEPTH { PARAM_VALUE.STRM4_DEPTH } {
- # Procedure called to update STRM4_DEPTH when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.STRM4_DEPTH { PARAM_VALUE.STRM4_DEPTH } {
- # Procedure called to validate STRM4_DEPTH
- return true
-}
-
-proc update_PARAM_VALUE.STRM4_OFFSET { PARAM_VALUE.STRM4_OFFSET } {
- # Procedure called to update STRM4_OFFSET when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.STRM4_OFFSET { PARAM_VALUE.STRM4_OFFSET } {
- # Procedure called to validate STRM4_OFFSET
- return true
-}
-proc update_PARAM_VALUE.STRM4_WIDTH { PARAM_VALUE.STRM4_WIDTH } {
- # Procedure called to update STRM4_WIDTH when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.STRM4_WIDTH { PARAM_VALUE.STRM4_WIDTH } {
- # Procedure called to validate STRM4_WIDTH
- return true
-}
-
-proc update_PARAM_VALUE.STRM5_DEPTH { PARAM_VALUE.STRM5_DEPTH } {
- # Procedure called to update STRM5_DEPTH when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.STRM5_DEPTH { PARAM_VALUE.STRM5_DEPTH } {
- # Procedure called to validate STRM5_DEPTH
- return true
-}
-
-proc update_PARAM_VALUE.STRM5_OFFSET { PARAM_VALUE.STRM5_OFFSET } {
- # Procedure called to update STRM5_OFFSET when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.STRM5_OFFSET { PARAM_VALUE.STRM5_OFFSET } {
- # Procedure called to validate STRM5_OFFSET
- return true
-}
-
-proc update_PARAM_VALUE.STRM5_WIDTH { PARAM_VALUE.STRM5_WIDTH } {
- # Procedure called to update STRM5_WIDTH when any of the dependent parameters in the arguments change
-}
-
-proc validate_PARAM_VALUE.STRM5_WIDTH { PARAM_VALUE.STRM5_WIDTH } {
- # Procedure called to validate STRM5_WIDTH
- return true
-}
-
-
-proc update_MODELPARAM_VALUE.CONFIG_EN { MODELPARAM_VALUE.CONFIG_EN PARAM_VALUE.CONFIG_EN } {
+proc update_MODELPARAM_VALUE.DEPTH { MODELPARAM_VALUE.DEPTH PARAM_VALUE.DEPTH } {
# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
- set_property value [get_property value ${PARAM_VALUE.CONFIG_EN}] ${MODELPARAM_VALUE.CONFIG_EN}
+ set_property value [get_property value ${PARAM_VALUE.DEPTH}] ${MODELPARAM_VALUE.DEPTH}
}
-proc update_MODELPARAM_VALUE.NSTREAMS { MODELPARAM_VALUE.NSTREAMS PARAM_VALUE.NSTREAMS } {
+proc update_MODELPARAM_VALUE.WIDTH { MODELPARAM_VALUE.WIDTH PARAM_VALUE.WIDTH } {
# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
- set_property value [get_property value ${PARAM_VALUE.NSTREAMS}] ${MODELPARAM_VALUE.NSTREAMS}
+ set_property value [get_property value ${PARAM_VALUE.WIDTH}] ${MODELPARAM_VALUE.WIDTH}
}
-proc update_MODELPARAM_VALUE.MEM_DEPTH { MODELPARAM_VALUE.MEM_DEPTH PARAM_VALUE.MEM_DEPTH } {
+proc update_MODELPARAM_VALUE.INIT_FILE { MODELPARAM_VALUE.INIT_FILE PARAM_VALUE.INIT_FILE } {
# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
- set_property value [get_property value ${PARAM_VALUE.MEM_DEPTH}] ${MODELPARAM_VALUE.MEM_DEPTH}
-}
-
-proc update_MODELPARAM_VALUE.MEM_WIDTH { MODELPARAM_VALUE.MEM_WIDTH PARAM_VALUE.MEM_WIDTH } {
- # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
- set_property value [get_property value ${PARAM_VALUE.MEM_WIDTH}] ${MODELPARAM_VALUE.MEM_WIDTH}
-}
-
-proc update_MODELPARAM_VALUE.MEM_INIT { MODELPARAM_VALUE.MEM_INIT PARAM_VALUE.MEM_INIT } {
- # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
- set_property value [get_property value ${PARAM_VALUE.MEM_INIT}] ${MODELPARAM_VALUE.MEM_INIT}
+ set_property value [get_property value ${PARAM_VALUE.INIT_FILE}] ${MODELPARAM_VALUE.INIT_FILE}
}
proc update_MODELPARAM_VALUE.RAM_STYLE { MODELPARAM_VALUE.RAM_STYLE PARAM_VALUE.RAM_STYLE } {
@@ -298,96 +87,6 @@ proc update_MODELPARAM_VALUE.RAM_STYLE { MODELPARAM_VALUE.RAM_STYLE PARAM_VALUE.
set_property value [get_property value ${PARAM_VALUE.RAM_STYLE}] ${MODELPARAM_VALUE.RAM_STYLE}
}
-proc update_MODELPARAM_VALUE.STRM0_WIDTH { MODELPARAM_VALUE.STRM0_WIDTH PARAM_VALUE.STRM0_WIDTH } {
- # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
- set_property value [get_property value ${PARAM_VALUE.STRM0_WIDTH}] ${MODELPARAM_VALUE.STRM0_WIDTH}
-}
-
-proc update_MODELPARAM_VALUE.STRM1_WIDTH { MODELPARAM_VALUE.STRM1_WIDTH PARAM_VALUE.STRM1_WIDTH } {
- # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
- set_property value [get_property value ${PARAM_VALUE.STRM1_WIDTH}] ${MODELPARAM_VALUE.STRM1_WIDTH}
-}
-
-proc update_MODELPARAM_VALUE.STRM2_WIDTH { MODELPARAM_VALUE.STRM2_WIDTH PARAM_VALUE.STRM2_WIDTH } {
- # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
- set_property value [get_property value ${PARAM_VALUE.STRM2_WIDTH}] ${MODELPARAM_VALUE.STRM2_WIDTH}
-}
-
-proc update_MODELPARAM_VALUE.STRM3_WIDTH { MODELPARAM_VALUE.STRM3_WIDTH PARAM_VALUE.STRM3_WIDTH } {
- # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
- set_property value [get_property value ${PARAM_VALUE.STRM3_WIDTH}] ${MODELPARAM_VALUE.STRM3_WIDTH}
-}
-
-proc update_MODELPARAM_VALUE.STRM4_WIDTH { MODELPARAM_VALUE.STRM4_WIDTH PARAM_VALUE.STRM4_WIDTH } {
- # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
- set_property value [get_property value ${PARAM_VALUE.STRM4_WIDTH}] ${MODELPARAM_VALUE.STRM4_WIDTH}
-}
-
-proc update_MODELPARAM_VALUE.STRM5_WIDTH { MODELPARAM_VALUE.STRM5_WIDTH PARAM_VALUE.STRM5_WIDTH } {
- # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
- set_property value [get_property value ${PARAM_VALUE.STRM5_WIDTH}] ${MODELPARAM_VALUE.STRM5_WIDTH}
-}
-
-proc update_MODELPARAM_VALUE.STRM0_DEPTH { MODELPARAM_VALUE.STRM0_DEPTH PARAM_VALUE.STRM0_DEPTH } {
- # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
- set_property value [get_property value ${PARAM_VALUE.STRM0_DEPTH}] ${MODELPARAM_VALUE.STRM0_DEPTH}
-}
-
-proc update_MODELPARAM_VALUE.STRM1_DEPTH { MODELPARAM_VALUE.STRM1_DEPTH PARAM_VALUE.STRM1_DEPTH } {
- # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
- set_property value [get_property value ${PARAM_VALUE.STRM1_DEPTH}] ${MODELPARAM_VALUE.STRM1_DEPTH}
-}
-
-proc update_MODELPARAM_VALUE.STRM2_DEPTH { MODELPARAM_VALUE.STRM2_DEPTH PARAM_VALUE.STRM2_DEPTH } {
- # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
- set_property value [get_property value ${PARAM_VALUE.STRM2_DEPTH}] ${MODELPARAM_VALUE.STRM2_DEPTH}
-}
-
-proc update_MODELPARAM_VALUE.STRM3_DEPTH { MODELPARAM_VALUE.STRM3_DEPTH PARAM_VALUE.STRM3_DEPTH } {
- # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
- set_property value [get_property value ${PARAM_VALUE.STRM3_DEPTH}] ${MODELPARAM_VALUE.STRM3_DEPTH}
-}
-
-proc update_MODELPARAM_VALUE.STRM4_DEPTH { MODELPARAM_VALUE.STRM4_DEPTH PARAM_VALUE.STRM4_DEPTH } {
- # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
- set_property value [get_property value ${PARAM_VALUE.STRM4_DEPTH}] ${MODELPARAM_VALUE.STRM4_DEPTH}
-}
-
-proc update_MODELPARAM_VALUE.STRM5_DEPTH { MODELPARAM_VALUE.STRM5_DEPTH PARAM_VALUE.STRM5_DEPTH } {
- # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
- set_property value [get_property value ${PARAM_VALUE.STRM5_DEPTH}] ${MODELPARAM_VALUE.STRM5_DEPTH}
-}
-
-proc update_MODELPARAM_VALUE.STRM0_OFFSET { MODELPARAM_VALUE.STRM0_OFFSET PARAM_VALUE.STRM0_OFFSET } {
- # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
- set_property value [get_property value ${PARAM_VALUE.STRM0_OFFSET}] ${MODELPARAM_VALUE.STRM0_OFFSET}
-}
-
-proc update_MODELPARAM_VALUE.STRM1_OFFSET { MODELPARAM_VALUE.STRM1_OFFSET PARAM_VALUE.STRM1_OFFSET } {
- # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
- set_property value [get_property value ${PARAM_VALUE.STRM1_OFFSET}] ${MODELPARAM_VALUE.STRM1_OFFSET}
-}
-
-proc update_MODELPARAM_VALUE.STRM2_OFFSET { MODELPARAM_VALUE.STRM2_OFFSET PARAM_VALUE.STRM2_OFFSET } {
- # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
- set_property value [get_property value ${PARAM_VALUE.STRM2_OFFSET}] ${MODELPARAM_VALUE.STRM2_OFFSET}
-}
-
-proc update_MODELPARAM_VALUE.STRM3_OFFSET { MODELPARAM_VALUE.STRM3_OFFSET PARAM_VALUE.STRM3_OFFSET } {
- # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
- set_property value [get_property value ${PARAM_VALUE.STRM3_OFFSET}] ${MODELPARAM_VALUE.STRM3_OFFSET}
-}
-
-proc update_MODELPARAM_VALUE.STRM4_OFFSET { MODELPARAM_VALUE.STRM4_OFFSET PARAM_VALUE.STRM4_OFFSET } {
- # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
- set_property value [get_property value ${PARAM_VALUE.STRM4_OFFSET}] ${MODELPARAM_VALUE.STRM4_OFFSET}
-}
-
-proc update_MODELPARAM_VALUE.STRM5_OFFSET { MODELPARAM_VALUE.STRM5_OFFSET PARAM_VALUE.STRM5_OFFSET } {
- # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
- set_property value [get_property value ${PARAM_VALUE.STRM5_OFFSET}] ${MODELPARAM_VALUE.STRM5_OFFSET}
-}
-
proc update_MODELPARAM_VALUE.AXILITE_ADDR_WIDTH { MODELPARAM_VALUE.AXILITE_ADDR_WIDTH PARAM_VALUE.AXILITE_ADDR_WIDTH } {
# Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value
set_property value [get_property value ${PARAM_VALUE.AXILITE_ADDR_WIDTH}] ${MODELPARAM_VALUE.AXILITE_ADDR_WIDTH}
diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv b/finn-rtllib/mvu/mvu_4sx4u.sv
new file mode 100644
index 0000000000..0ac2628ee5
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_4sx4u.sv
@@ -0,0 +1,527 @@
+/******************************************************************************
+ * Copyright (C) 2024, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief Matrix Vector Unit (MVU) core compute kernel utilizing DSP48.
+ *****************************************************************************/
+
+module mvu_4sx4u #(
+ int unsigned PE, // number of output lanes: rows of w and entries of p
+ int unsigned SIMD, // weight/activation pairs consumed per lane and cycle
+ int unsigned ACCU_WIDTH, // bit width of each accumulator output p[pe]
+
+ int unsigned VERSION = 1, // DSP primitive selection: 1 = DSP48E1, 2 = DSP48E2
+ bit SIGNED_ACTIVATIONS = 0, // sign-extend (1) or zero-extend (0) a[s] into the multiplier
+ bit FORCE_BEHAVIORAL = 0 // use the behavioral model instead of device primitives
+)(
+ // Global Control
+ input logic clk,
+ input logic rst,
+ input logic en, // clock enable of the pipeline registers
+
+ // Input
+ input logic last, // flags the final input of an accumulation; resurfaces as vld
+ input logic zero, // ignore current inputs and force this partial product to zero
+ input logic signed [PE-1:0][SIMD-1:0][3:0] w, // signed weights
+ input logic [SIMD-1:0][3:0] a, // unsigned activations (override by SIGNED_ACTIVATIONS)
+
+ // Output
+ output logic vld, // last delayed by the five pipeline stages
+ output logic signed [PE-1:0][ACCU_WIDTH-1:0] p // accumulated result per output lane
+);
+ // for verilator always use behavioral code
+ localparam bit BEHAVIORAL =
+`ifdef VERILATOR
+ 1 ||
+`endif
+ FORCE_BEHAVIORAL;
+
+ typedef int unsigned leave_load_t[2*SIMD-1]; // one entry per adder tree node; node n has children 2n+1 and 2n+2
+ function leave_load_t init_leave_loads(); // number of leaves reachable from each tree node
+ automatic leave_load_t res;
+ for(int i = 2*(SIMD-1); i >= int'(SIMD)-1; i--) res[i] = 1; // leaves
+ for(int i = SIMD-2; i >= 0; i--) res[i] = res[2*i+1] + res[2*i+2]; // inner nodes, children first
+ return res;
+ endfunction : init_leave_loads
+
+ // Pipeline for last indicator flag
+ logic [1:5] L = '0;
+ always_ff @(posedge clk) begin
+ if(rst) L <= '0;
+ else if(en) L <= { last, L[1:4] };
+ end
+ assign vld = L[5];
+
+ // Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism
+ localparam int unsigned D[4:0] = '{ ACCU_WIDTH+22, 22, 15, 8, 0 }; // Lane offsets
+
+ localparam int unsigned PIPE_COUNT = (PE+3)/4; // every pipe serves up to 4 PEs
+ for(genvar c = 0; c < PIPE_COUNT; c++) begin : genPipes
+
+ localparam int unsigned PE_BEG = 4*c;
+ localparam int unsigned PE_END = PE < 4*(c+1)? PE : 4*(c+1);
+ localparam int unsigned PE_REM = 4*(c+1) - PE_END; // count of unused low lanes in a partially filled pipe
+
+ uwire [57:0] p3[SIMD];
+ uwire signed [ 1:0] h3[SIMD][3];
+ for(genvar s = 0; s < SIMD; s++) begin : genSIMD
+
+ // Input Lane Assembly
+ uwire [17:0] bb = { {(14){SIGNED_ACTIVATIONS && a[s][3]}}, a[s] };
+ logic [29:0] aa;
+ logic [26:0] dd;
+ logic [ 1:0] xx[3:1];
+ if(1) begin : blkVectorize
+ uwire [3:0] ww[PE_END - PE_BEG];
+ for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin
+ assign ww[pe] = w[PE_BEG + pe][s];
+ if(pe) begin
+ if(BEHAVIORAL) assign xx[pe + PE_REM] = zero? 0 : ww[pe] * a[s];
+`ifndef VERILATOR
+ else begin
+ LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x (
+ .O6(xx[pe + PE_REM][1]),
+ .O5(xx[pe + PE_REM][0]),
+ .I5(1'b1),
+ .I4(zero),
+ .I3(ww[pe][1]),
+ .I2(a[s][1]),
+ .I1(ww[pe][0]),
+ .I0(a[s][0])
+ );
+ end
+`endif
+ end
+ end
+ always_comb begin
+ dd = '0;
+ aa = '0;
+ for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin
+ dd[D[pe + PE_REM]+:3] = ww[pe];
+ aa[D[pe + PE_REM]+ 3] = ww[pe][3];
+ end
+ end
+ end : blkVectorize
+
+ uwire [47:0] pp;
+
+ // Note: Since the product B * AD is computed,
+ // rst can be only applied to AD and zero only to B
+ // with the same effect as zeroing both.
+ if(BEHAVIORAL) begin : genBehav
+ // Stage #1: Input Refine
+ logic signed [17:0] B1 = 0;
+ always_ff @(posedge clk) begin
+ if(zero) B1 <= 0;
+ else if(en) B1 <= bb;
+ end
+
+ logic signed [26:0] AD1 = 0;
+ always_ff @(posedge clk) begin
+ if(rst) AD1 <= 0;
+ else if(en) AD1 <= dd - aa;
+ end
+
+ // Stage #2: Multiply
+ logic signed [45:0] M2 = 0;
+ always_ff @(posedge clk) begin
+ if(rst) M2 <= 0;
+ else if(en) M2 <= // the zero guard on the next code line is meant for simulation only
+// synthesis translate_off
+ (B1 === '0) || (AD1 === '0)? 0 :
+// synthesis translate_on
+ B1 * AD1;
+ end
+
+ // Stage #3: Accumulate
+ logic signed [47:0] P3 = 0;
+ always_ff @(posedge clk) begin
+ if(rst) P3 <= 0;
+ else if(en) P3 <= M2 + (L[3]? 0 : P3);
+ end
+
+ assign pp = P3;
+ end : genBehav
+`ifndef VERILATOR
+ else begin : genDSP
+ localparam logic [6:0] OPMODE_INVERSION = 7'b010_01_01;
+ uwire [6:0] opmode = { { 1'b0, L[2], 1'b0 }, 4'b00_00 };
+ case(VERSION)
+ 1: DSP48E1 #(
+ // Feature Control Attributes: Data Path Selection
+ .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
+ .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
+ .USE_DPORT("TRUE"), // Select D port usage (TRUE or FALSE)
+ .USE_MULT("MULTIPLY"), // Select multiplier usage ("MULTIPLY", "DYNAMIC", or "NONE")
+ .USE_SIMD("ONE48"), // SIMD selection ("ONE48", "TWO24", "FOUR12")
+
+ // Pattern Detector Attributes: Pattern Detection Configuration
+ .AUTORESET_PATDET("NO_RESET"), // "NO_RESET", "RESET_MATCH", "RESET_NOT_MATCH"
+ .MASK('1), // 48-bit mask value for pattern detect (1=ignore)
+ .PATTERN('0), // 48-bit pattern match for pattern detect
+ .SEL_MASK("MASK"), // "C", "MASK", "ROUNDING_MODE1", "ROUNDING_MODE2"
+ .SEL_PATTERN("PATTERN"), // Select pattern value ("PATTERN" or "C")
+ .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect ("PATDET" or "NO_PATDET")
+
+ // Register Control Attributes: Pipeline Register Configuration
+ .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0, 1 or 2)
+ .ADREG(1), // Number of pipeline stages for pre-adder (0 or 1)
+ .ALUMODEREG(0), // Number of pipeline stages for ALUMODE (0 or 1)
+ .AREG(0), // Number of pipeline stages for A (0, 1 or 2)
+ .BCASCREG(1), // Number of pipeline stages between B/BCIN and BCOUT (0, 1 or 2)
+ .BREG(1), // Number of pipeline stages for B (0, 1 or 2)
+ .CARRYINREG(0), // Number of pipeline stages for CARRYIN (0 or 1)
+ .CARRYINSELREG(0), // Number of pipeline stages for CARRYINSEL (0 or 1)
+ .CREG(0), // Number of pipeline stages for C (0 or 1)
+ .DREG(0), // Number of pipeline stages for D (0 or 1)
+ .INMODEREG(0), // Number of pipeline stages for INMODE (0 or 1)
+ .MREG(1), // Number of multiplier pipeline stages (0 or 1)
+ .OPMODEREG(1), // Number of pipeline stages for OPMODE (0 or 1)
+ .PREG(1) // Number of pipeline stages for P (0 or 1)
+ ) dsp (
+ // Cascade: 30-bit (each) output: Cascade Ports
+ .ACOUT(), // 30-bit output: A port cascade output
+ .BCOUT(), // 18-bit output: B port cascade output
+ .CARRYCASCOUT(), // 1-bit output: Cascade carry output
+ .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade output
+ .PCOUT(), // 48-bit output: Cascade output
+
+ // Control: 1-bit (each) output: Control Inputs/Status Bits
+ .OVERFLOW(), // 1-bit output: Overflow in add/acc output
+ .PATTERNBDETECT(), // 1-bit output: Pattern bar detect output
+ .PATTERNDETECT(), // 1-bit output: Pattern detect output
+ .UNDERFLOW(), // 1-bit output: Underflow in add/acc output
+
+ // Data: 4-bit (each) output: Data Ports
+ .CARRYOUT(), // 4-bit output: Carry output
+ .P(pp), // 48-bit output: Primary data output
+
+ // Cascade: 30-bit (each) input: Cascade Ports
+ .ACIN('x), // 30-bit input: A cascade data input
+ .BCIN('x), // 18-bit input: B cascade input
+ .CARRYCASCIN('x), // 1-bit input: Cascade carry input
+ .MULTSIGNIN('x), // 1-bit input: Multiplier sign input
+ .PCIN('x), // 48-bit input: P cascade input
+
+ // Control: 4-bit (each) input: Control Inputs/Status Bits
+ .CLK(clk), // 1-bit input: Clock input
+ .ALUMODE('0), // 4-bit input: ALU control input
+ .CARRYINSEL('0), // 3-bit input: Carry select input
+ .INMODE(5'b01100), // 5-bit input: INMODE control input
+ .OPMODE(opmode ^ OPMODE_INVERSION), // 7-bit input: Operation mode input
+
+ // Data: 30-bit (each) input: Data Ports
+ .A(aa), // 30-bit input: A data input
+ .B(bb), // 18-bit input: B data input
+ .C('x), // 48-bit input: C data input
+ .CARRYIN('0), // 1-bit input: Carry input signal
+ .D(dd), // 25-bit input: D data input -- NOTE(review): dd is declared 27 bits wide; confirm the top lane fits this narrower port
+
+ // Reset/Clock Enable: 1-bit (each) input: Reset/Clock Enable Inputs
+ .CEA1('0), // 1-bit input: Clock enable input for 1st stage AREG
+ .CEA2('0), // 1-bit input: Clock enable input for 2nd stage AREG
+ .CEAD(en), // 1-bit input: Clock enable input for ADREG
+ .CEALUMODE('0), // 1-bit input: Clock enable input for ALUMODEREG
+ .CEB1('0), // 1-bit input: Clock enable input for 1st stage BREG
+ .CEB2(en), // 1-bit input: Clock enable input for 2nd stage BREG
+ .CEC('0), // 1-bit input: Clock enable input for CREG
+ .CECARRYIN('0), // 1-bit input: Clock enable input for CARRYINREG
+ .CECTRL(en), // 1-bit input: Clock enable input for OPMODEREG and CARRYINSELREG
+ .CED('0), // 1-bit input: Clock enable input for DREG
+ .CEINMODE('0), // 1-bit input: Clock enable input for INMODEREG
+ .CEM(en), // 1-bit input: Clock enable input for MREG
+ .CEP(en), // 1-bit input: Clock enable input for PREG
+ .RSTA('0), // 1-bit input: Reset input for AREG
+ .RSTB( // 1-bit input: Reset for BREG
+// synthesis translate_off
+ rst ||
+// synthesis translate_on
+ zero
+ ),
+ .RSTC('0), // 1-bit input: Reset for CREG
+ .RSTD( // 1-bit input: Reset for DREG and ADREG
+// synthesis translate_off
+ zero ||
+// synthesis translate_on
+ rst
+ ),
+ .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG
+ .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG
+ .RSTCTRL('0), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG
+ .RSTINMODE('0), // 1-bit input: Reset for INMODE register
+ .RSTM(rst), // 1-bit input: Reset for MREG
+ .RSTP(rst) // 1-bit input: Reset for PREG
+ );
+ 2: DSP48E2 #(
+ // Feature Control Attributes: Data Path Selection
+ .AMULTSEL("AD"), // Selects A input to multiplier (A, AD)
+ .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
+ .BMULTSEL("B"), // Selects B input to multiplier (AD, B)
+ .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
+ .PREADDINSEL("A"), // Selects input to pre-adder (A, B)
+ .RND('0), // Rounding Constant
+ .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE)
+ .USE_SIMD("ONE48"), // SIMD selection (FOUR12, ONE48, TWO24)
+ .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE)
+ .XORSIMD("XOR24_48_96"), // Mode of operation for the Wide XOR (XOR12, XOR24_48_96)
+
+ // Pattern Detector Attributes: Pattern Detection Configuration
+ .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH
+ .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET).
+ .MASK('1), // 48-bit mask value for pattern detect (1=ignore)
+ .PATTERN('0), // 48-bit pattern match for pattern detect
+ .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2
+ .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN)
+ .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET)
+
+ // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
+ .IS_ALUMODE_INVERTED('0), // Optional inversion for ALUMODE
+ .IS_CARRYIN_INVERTED('0), // Optional inversion for CARRYIN
+ .IS_CLK_INVERTED('0), // Optional inversion for CLK
+ .IS_INMODE_INVERTED('0), // Optional inversion for INMODE
+ .IS_OPMODE_INVERTED({ 2'b00, OPMODE_INVERSION}), // Optional inversion for OPMODE
+ .IS_RSTALLCARRYIN_INVERTED('0), // Optional inversion for RSTALLCARRYIN
+ .IS_RSTALUMODE_INVERTED('0), // Optional inversion for RSTALUMODE
+ .IS_RSTA_INVERTED('0), // Optional inversion for RSTA
+ .IS_RSTB_INVERTED('0), // Optional inversion for RSTB
+ .IS_RSTCTRL_INVERTED('0), // Optional inversion for RSTCTRL
+ .IS_RSTC_INVERTED('0), // Optional inversion for RSTC
+ .IS_RSTD_INVERTED('0), // Optional inversion for RSTD
+ .IS_RSTINMODE_INVERTED('0), // Optional inversion for RSTINMODE
+ .IS_RSTM_INVERTED('0), // Optional inversion for RSTM
+ .IS_RSTP_INVERTED('0), // Optional inversion for RSTP
+
+ // Register Control Attributes: Pipeline Register Configuration
+ .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0-2)
+ .ADREG(1), // Pipeline stages for pre-adder (0-1)
+ .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1)
+ .AREG(0), // Pipeline stages for A (0-2)
+ .BCASCREG(1), // Number of pipeline stages between B/BCIN and BCOUT (0-2)
+ .BREG(1), // Pipeline stages for B (0-2)
+ .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1)
+ .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1)
+ .CREG(0), // Pipeline stages for C (0-1)
+ .DREG(0), // Pipeline stages for D (0-1)
+ .INMODEREG(0), // Pipeline stages for INMODE (0-1)
+ .MREG(1), // Multiplier pipeline stages (0-1)
+ .OPMODEREG(1), // Pipeline stages for OPMODE (0-1)
+ .PREG(1) // Number of pipeline stages for P (0-1)
+ ) dsp (
+ // Cascade outputs: Cascade Ports
+ .ACOUT(), // 30-bit output: A port cascade
+ .BCOUT(), // 18-bit output: B cascade
+ .CARRYCASCOUT(), // 1-bit output: Cascade carry
+ .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade
+ .PCOUT(), // 48-bit output: Cascade output
+
+ // Control outputs: Control Inputs/Status Bits
+ .OVERFLOW(), // 1-bit output: Overflow in add/acc
+ .PATTERNBDETECT(), // 1-bit output: Pattern bar detect
+ .PATTERNDETECT(), // 1-bit output: Pattern detect
+ .UNDERFLOW(), // 1-bit output: Underflow in add/acc
+
+ // Data outputs: Data Ports
+ .CARRYOUT(), // 4-bit output: Carry
+ .P(pp), // 48-bit output: Primary data
+ .XOROUT(), // 8-bit output: XOR data
+
+ // Cascade inputs: Cascade Ports
+ .ACIN('x), // 30-bit input: A cascade data
+ .BCIN('x), // 18-bit input: B cascade
+ .CARRYCASCIN('x), // 1-bit input: Cascade carry
+ .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade
+ .PCIN('x), // 48-bit input: P cascade
+
+ // Control inputs: Control Inputs/Status Bits
+ .CLK(clk), // 1-bit input: Clock
+ .ALUMODE(4'h0), // 4-bit input: ALU control
+ .CARRYINSEL('0), // 3-bit input: Carry select
+ .INMODE(5'b01100), // 5-bit input: INMODE control
+ .OPMODE({ 2'b00, opmode }), // 9-bit input: Operation mode
+
+ // Data inputs: Data Ports
+ .A(aa), // 30-bit input: A data
+ .B(bb), // 18-bit input: B data
+ .C('x), // 48-bit input: C data
+ .CARRYIN('0), // 1-bit input: Carry-in
+ .D(dd), // 27-bit input: D data
+
+ // Reset/Clock Enable inputs: Reset/Clock Enable Inputs
+ .CEA1('0), // 1-bit input: Clock enable for 1st stage AREG
+ .CEA2('0), // 1-bit input: Clock enable for 2nd stage AREG
+ .CEAD(en), // 1-bit input: Clock enable for ADREG
+ .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE
+ .CEB1('0), // 1-bit input: Clock enable for 1st stage BREG
+ .CEB2(en), // 1-bit input: Clock enable for 2nd stage BREG
+ .CEC('0), // 1-bit input: Clock enable for CREG
+ .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG
+ .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG
+ .CED('0), // 1-bit input: Clock enable for DREG
+ .CEINMODE('0), // 1-bit input: Clock enable for INMODEREG
+ .CEM(en), // 1-bit input: Clock enable for MREG
+ .CEP(en), // 1-bit input: Clock enable for PREG
+ .RSTA('0), // 1-bit input: Reset for AREG
+ .RSTB( // 1-bit input: Reset for BREG
+// synthesis translate_off
+ rst ||
+// synthesis translate_on
+ zero
+ ),
+ .RSTC('0), // 1-bit input: Reset for CREG
+ .RSTD( // 1-bit input: Reset for DREG and ADREG
+// synthesis translate_off
+ zero ||
+// synthesis translate_on
+ rst
+ ),
+ .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG
+ .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG
+ .RSTCTRL('0), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG
+ .RSTINMODE('0), // 1-bit input: Reset for INMODE register
+ .RSTM(rst), // 1-bit input: Reset for MREG
+ .RSTP(rst) // 1-bit input: Reset for PREG
+ );
+ default: initial begin
+ $error("Unknown version DSP48E%0d.", VERSION);
+ $finish;
+ end
+ endcase
+ end : genDSP
+`endif
+
+ // External Canary Pipeline
+ logic [1:0] X1[3:1] = '{ default: 0 };
+ logic [1:0] X2[3:1] = '{ default: 0 };
+ logic [1:0] X3[3:1] = '{ default: 0 };
+ always_ff @(posedge clk) begin
+ if(rst) begin
+ X1 <= '{ default: 0 };
+ X2 <= '{ default: 0 };
+ X3 <= '{ default: 0 };
+ end
+ else if(en) begin
+ X1 <= xx;
+ X2 <= X1;
+ foreach(X3[i]) begin
+ X3[i] <= X2[i] + (L[3]? 2'h0 : pp[D[i]+:2]);
+ end
+ end
+ end
+
+ // Derive actual cross-lane overflows
+ for(genvar i = 0; i < 3; i++) begin
+ assign h3[s][i] = pp[D[i+1]+:2] - X3[i+1];
+ end
+ assign p3[s] = pp;
+
+ end : genSIMD
+
+ // Stage #4: Cross-SIMD Reduction
+
+ // Count leaves reachable from each node
+ localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 1}; // SIMD=1 requires no adder tree: use a constant placeholder load of 1 rather than calling init_leave_loads, which reportedly ends up in an infinite loop for this case
+
+ uwire signed [ACCU_WIDTH -1:0] up4;
+ uwire signed [ACCU_WIDTH -8:0] hi4[3];
+ uwire [$clog2(SIMD)+7:0] lo4[3];
+ for(genvar i = 0; i < 4; i++) begin
+ localparam int unsigned LO_WIDTH = D[i+1] - D[i];
+ localparam int unsigned HI_WIDTH = ACCU_WIDTH - LO_WIDTH;
+
+ // Conclusive high part accumulation
+ if(i >= PE_REM && i < 3) begin : genHi
+ // Adder Tree across all SIMD high contributions, each from [-1:1]
+ uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0] tree;
+ for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s][i];
+ for(genvar n = 0; n < SIMD-1; n++) begin
+ // Sum truncated to actual maximum bit width at this node
+ uwire signed [$clog2(1+LEAVE_LOAD[n]):0] s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]);
+ assign tree[n] = s;
+ end
+
+ // High Sideband Accumulation
+ logic signed [HI_WIDTH-1:0] Hi4 = 0;
+ always_ff @(posedge clk) begin
+ if(rst) Hi4 <= 0;
+ else if(en) Hi4 <= (L[4]? 0 : Hi4) + $signed(tree[0]);
+ end
+ assign hi4[i] = Hi4;
+ end : genHi
+ else if (i < 3) begin : genHiZero
+ assign hi4[i] = '0;
+ end : genHiZero
+
+ // Conclusive low part accumulation
+ if(i >= PE_REM) begin : blkLo
+ // Adder Tree across all SIMD low contributions
+ localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1));
+ uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree;
+ for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH];
+ for(genvar n = 0; n < SIMD-1; n++) begin
+ // Sum truncated to actual maximum bit width at this node
+ localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1));
+ uwire [NODE_WIDTH-1:0] s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]);
+ assign tree[n] = s;
+ end
+
+ logic [ROOT_WIDTH-1:0] Lo4 = 0;
+ always_ff @(posedge clk) begin
+ if(rst) Lo4 <= 0;
+ else if(en) Lo4 <= tree[0];
+ end
+
+ if(i == 3) assign up4 = Lo4;
+ else assign lo4[i] = Lo4;
+ end : blkLo
+ else begin : blkLoZero
+ assign lo4[i] = '0;
+ end : blkLoZero
+
+ end
+
+ // Stage #5: Resolve lane totals
+ logic signed [3:0][ACCU_WIDTH-1:0] Res5 = '{ default: 0 };
+ always_ff @(posedge clk) begin
+ if(rst) Res5 <= '{ default: 0 };
+ else if(en) begin
+ Res5[3] <= up4 - hi4[2];
+ Res5[2] <= $signed({ hi4[2], {(D[3] - D[2]){1'b0}} }) + $signed({ 1'b0, lo4[2] }) - hi4[1];
+ Res5[1] <= $signed({ hi4[1], {(D[2] - D[1]){1'b0}} }) + $signed({ 1'b0, lo4[1] }) - hi4[0];
+ Res5[0] <= $signed({ hi4[0], {(D[1] - D[0]){1'b0}} }) + $signed({ 1'b0, lo4[0] });
+ end
+ end
+
+ // Output
+ for(genvar pe = PE_BEG; pe < PE_END; pe++) begin
+ assign p[pe] = Res5[pe - PE_BEG + PE_REM];
+ end
+
+ end : genPipes
+
+endmodule : mvu_4sx4u
diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
new file mode 100644
index 0000000000..fbf48784f0
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv
@@ -0,0 +1,525 @@
+/******************************************************************************
+ * Copyright (C) 2024, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief Matrix Vector Unit (MVU) core compute kernel utilizing DSP48.
+ *****************************************************************************/
+
+module mvu_8sx8u_dsp48 #(
+ int unsigned PE, // number of processing elements, i.e. weight rows `w[pe]` and result lanes `p[pe]`
+ int unsigned SIMD, // number of weight/activation pairs consumed per PE in each enabled cycle
+ int unsigned ACCU_WIDTH, // bit width of each accumulator result `p[pe]`
+ int unsigned ACTIVATION_WIDTH, // bit width of each activation element `a[s]`
+ int unsigned WEIGHT_WIDTH, // bit width of each weight element `w[pe][s]`
+
+ int unsigned VERSION = 1, // DSP primitive to instantiate: 1 -> DSP48E1, 2 -> DSP48E2
+ bit SIGNED_ACTIVATIONS = 0, // sign-extend rather than zero-extend activations onto the DSP B input
+ bit FORCE_BEHAVIORAL = 0 // use the behavioral model instead of the DSP/LUT primitives
+)(
+ // Global Control
+ input logic clk,
+ input logic rst, // synchronous reset, sampled on the rising clock edge
+ input logic en, // clock enable advancing the pipeline
+
+ // Input
+ input logic last, // marks the final input of an accumulation; re-emerges as `vld` five enabled cycles later
+ input logic zero, // ignore current inputs and force this partial product to zero
+ input logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH -1:0] w, // signed weights
+ input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // unsigned activations (override by SIGNED_ACTIVATIONS)
+
+ // Output
+ output logic vld, // `last` after passing the five-stage flag pipeline
+ output logic signed [PE-1:0][ACCU_WIDTH-1:0] p // per-PE accumulation results
+);
+ // for verilator always use behavioral code
+ localparam bit BEHAVIORAL =
+`ifdef VERILATOR
+ 1 ||
+`endif
+ FORCE_BEHAVIORAL;
+
+ typedef int unsigned leave_load_t[2*SIMD-1]; // one entry per node of a binary tree over SIMD leaves; children of node n are 2*n+1 and 2*n+2
+ function leave_load_t init_leave_loads();
+   automatic leave_load_t load;
+   for(int leaf = int'(SIMD)-1; leaf <= 2*(int'(SIMD)-1); leaf++) load[leaf] = 1; // nodes SIMD-1 .. 2*SIMD-2 are the leaves, each counting itself
+   for(int node = int'(SIMD)-2; node >= 0; node--) load[node] = load[2*node+1] + load[2*node+2]; // inner nodes bottom-up: sum of both children
+   return load;
+ endfunction : init_leave_loads
+
+ // Pipeline for last indicator flag: L[k] is `last` delayed by k enabled cycles
+ logic [1:5] L = '0; // ascending range: L[1] is the newest entry, L[5] the oldest
+ always_ff @(posedge clk) begin
+ if(rst) L <= '0;
+ else if(en) L <= { last, L[1:4] }; // shift: last -> L[1], L[1] -> L[2], ..., L[4] -> L[5]
+ end
+ assign vld = L[5]; // asserted once the flag has passed all five stages
+
+ // Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism
+ localparam int unsigned SINGLE_PROD_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH; // bit width of one weight*activation product
+ localparam int unsigned D[2:0] = '{ ACCU_WIDTH+SINGLE_PROD_WIDTH, SINGLE_PROD_WIDTH, 0 }; // Lane offsets: D[0] = 0, D[1] = SINGLE_PROD_WIDTH, D[2] = end of the upper lane
+
+ localparam int unsigned PIPE_COUNT = (PE+1)/2; // each pipe serves up to two PEs, so ceil(PE/2) pipes
+ for(genvar c = 0; c < PIPE_COUNT; c++) begin : genPipes
+
+ localparam int unsigned PE_BEG = 2*c; // first PE served by this pipe
+ localparam int unsigned PE_END = PE < 2*(c+1)? PE : 2*(c+1); // one past the last PE served, clipped to PE
+ localparam int unsigned PE_REM = 2*(c+1) - PE_END; // unoccupied lanes of this pipe (1 only in the last pipe of an odd PE); occupied lanes are shifted up by this amount
+
+ uwire [57:0] p3[SIMD]; // per-SIMD DSP accumulator output; driven from the 48-bit `pp`, so the upper bits are zero extension
+ uwire signed [ 1:0] h3[SIMD]; // per-SIMD cross-lane overflow derived from the canary pipeline
+ for(genvar s = 0; s < SIMD; s++) begin : genSIMD
+
+ // Input Lane Assembly: packs the weights of this pipe's PEs for SIMD index s into the D and A operands
+ uwire [17:0] bb = { {(18-ACTIVATION_WIDTH){SIGNED_ACTIVATIONS && a[s][ACTIVATION_WIDTH-1]}}, a[s] }; // activation widened to 18 bits; sign-extended only if SIGNED_ACTIVATIONS
+ logic [29:0] aa; // carries only the weight sign bits, each at the top bit position of its lane
+ logic [26:0] dd; // carries the lower WEIGHT_WIDTH-1 bits of each weight at its lane offset
+ logic [ 1:0] xx; // two LSBs of the product of the pipe's second PE (pe != 0), computed outside the DSP
+ if(1) begin : blkVectorize
+ uwire [WEIGHT_WIDTH-1:0] ww[PE_END - PE_BEG]; // weights of this pipe's PEs for SIMD index s, viewed as raw bits
+ for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin
+ assign ww[pe] = w[PE_BEG + pe][s];
+ if(pe) begin // only a second PE in the pipe drives xx; with a single PE xx stays undriven
+ if(BEHAVIORAL) assign xx = zero? 0 : ww[pe] * a[s]; // product truncated to the 2 bits of xx
+`ifndef VERILATOR
+ else begin
+ LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x ( // INIT encodes the same 2-bit product as the behavioral branch above
+ .O6(xx[1]), // (w[1] & a[0]) ^ (w[0] & a[1]), forced to 0 while `zero` is set
+ .O5(xx[0]), // w[0] & a[0], forced to 0 while `zero` is set
+ .I5(1'b1),
+ .I4(zero),
+ .I3(ww[pe][1]),
+ .I2(a[s][1]),
+ .I1(ww[pe][0]),
+ .I0(a[s][0])
+ );
+ end
+`endif
+ end
+ end
+ always_comb begin
+ dd = '0;
+ aa = '0;
+ for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin
+ dd[D[pe + PE_REM] +: WEIGHT_WIDTH-1] = ww[pe]; // lower WEIGHT_WIDTH-1 bits of the weight (assignment truncates the sign bit)
+ aa[D[pe + PE_REM] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1]; // sign bit alone; the behavioral model forms dd - aa from these operands
+ end
+ end
+ end : blkVectorize
+
+ uwire [47:0] pp; // accumulator output of this SIMD slice, driven by either genBehav or genDSP
+
+ // Note: Since the product B * AD is computed,
+ // rst can be only applied to AD and zero only to B
+ // with the same effect as zeroing both.
+ if(BEHAVIORAL) begin : genBehav
+ // Stage #1: Input Refine
+ logic signed [17:0] B1 = 0;
+ always_ff @(posedge clk) begin
+ if(zero) B1 <= 0; // `zero` acts as reset of B and is not qualified by `en`
+ else if(en) B1 <= bb;
+ end
+
+ logic signed [26:0] AD1 = 0;
+ always_ff @(posedge clk) begin
+ if(rst) AD1 <= 0;
+ else if(en) AD1 <= dd - aa; // pre-adder: turns the split sign/magnitude lanes back into signed weights
+ end
+
+ // Stage #2: Multiply
+ logic signed [45:0] M2 = 0;
+ always_ff @(posedge clk) begin
+ if(rst) M2 <= 0;
+ else if(en) M2 <=
+// synthesis translate_off
+ (B1 === '0) || (AD1 === '0)? 0 : // simulation only: a known-zero operand yields 0 even if the other operand is X
+// synthesis translate_on
+ B1 * AD1;
+ end
+
+ // Stage #3: Accumulate
+ logic signed [47:0] P3 = 0;
+ always_ff @(posedge clk) begin
+ if(rst) P3 <= 0;
+ else if(en) P3 <= M2 + (L[3]? 0 : P3); // L[3] restarts the accumulation instead of adding onto the previous total
+ end
+
+ assign pp = P3;
+ end : genBehav
+`ifndef VERILATOR
+ else begin : genDSP
+ localparam logic [6:0] OPMODE_INVERSION = 7'b010_01_01; // constant part of OPMODE, merged in by XOR: at the port for DSP48E1, via IS_OPMODE_INVERTED for DSP48E2
+ uwire [6:0] opmode = { { 1'b0, L[2], 1'b0 }, 4'b00_00 }; // dynamic part: after inversion the upper field is 3'b010 unless L[2] is set, then 3'b000; with OPMODEREG=1 this lines up with the L[3] restart of the behavioral model
+ case(VERSION)
+ 1: DSP48E1 #(
+ // Feature Control Attributes: Data Path Selection
+ .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
+ .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
+ .USE_DPORT("TRUE"), // Select D port usage (TRUE or FALSE)
+ .USE_MULT("MULTIPLY"), // Select multiplier usage ("MULTIPLY", "DYNAMIC", or "NONE")
+ .USE_SIMD("ONE48"), // SIMD selection ("ONE48", "TWO24", "FOUR12")
+
+ // Pattern Detector Attributes: Pattern Detection Configuration
+ .AUTORESET_PATDET("NO_RESET"), // "NO_RESET", "RESET_MATCH", "RESET_NOT_MATCH"
+ .MASK('1), // 48-bit mask value for pattern detect (1=ignore)
+ .PATTERN('0), // 48-bit pattern match for pattern detect
+ .SEL_MASK("MASK"), // "C", "MASK", "ROUNDING_MODE1", "ROUNDING_MODE2"
+ .SEL_PATTERN("PATTERN"), // Select pattern value ("PATTERN" or "C")
+ .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect ("PATDET" or "NO_PATDET")
+
+ // Register Control Attributes: Pipeline Register Configuration
+ .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0, 1 or 2)
+ .ADREG(1), // Number of pipeline stages for pre-adder (0 or 1)
+ .ALUMODEREG(0), // Number of pipeline stages for ALUMODE (0 or 1)
+ .AREG(0), // Number of pipeline stages for A (0, 1 or 2)
+ .BCASCREG(1), // Number of pipeline stages between B/BCIN and BCOUT (0, 1 or 2)
+ .BREG(1), // Number of pipeline stages for B (0, 1 or 2)
+ .CARRYINREG(0), // Number of pipeline stages for CARRYIN (0 or 1)
+ .CARRYINSELREG(0), // Number of pipeline stages for CARRYINSEL (0 or 1)
+ .CREG(0), // Number of pipeline stages for C (0 or 1)
+ .DREG(0), // Number of pipeline stages for D (0 or 1)
+ .INMODEREG(0), // Number of pipeline stages for INMODE (0 or 1)
+ .MREG(1), // Number of multiplier pipeline stages (0 or 1)
+ .OPMODEREG(1), // Number of pipeline stages for OPMODE (0 or 1)
+ .PREG(1) // Number of pipeline stages for P (0 or 1)
+ ) dsp (
+ // Cascade: 30-bit (each) output: Cascade Ports
+ .ACOUT(), // 30-bit output: A port cascade output
+ .BCOUT(), // 18-bit output: B port cascade output
+ .CARRYCASCOUT(), // 1-bit output: Cascade carry output
+ .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade output
+ .PCOUT(), // 48-bit output: Cascade output
+
+ // Control: 1-bit (each) output: Control Inputs/Status Bits
+ .OVERFLOW(), // 1-bit output: Overflow in add/acc output
+ .PATTERNBDETECT(), // 1-bit output: Pattern bar detect output
+ .PATTERNDETECT(), // 1-bit output: Pattern detect output
+ .UNDERFLOW(), // 1-bit output: Underflow in add/acc output
+
+ // Data: 4-bit (each) output: Data Ports
+ .CARRYOUT(), // 4-bit output: Carry output
+ .P(pp), // 48-bit output: Primary data output
+
+ // Cascade: 30-bit (each) input: Cascade Ports
+ .ACIN('x), // 30-bit input: A cascade data input
+ .BCIN('x), // 18-bit input: B cascade input
+ .CARRYCASCIN('x), // 1-bit input: Cascade carry input
+ .MULTSIGNIN('x), // 1-bit input: Multiplier sign input
+ .PCIN('x), // 48-bit input: P cascade input
+
+ // Control: 4-bit (each) input: Control Inputs/Status Bits
+ .CLK(clk), // 1-bit input: Clock input
+ .ALUMODE('0), // 4-bit input: ALU control input
+ .CARRYINSEL('0), // 3-bit input: Carry select input
+ .INMODE(5'b01100), // 5-bit input: INMODE control input (presumably selects D - A for the pre-adder, matching `dd - aa` of the behavioral model - confirm against the DSP48E1 user guide)
+ .OPMODE(opmode ^ OPMODE_INVERSION), // 7-bit input: Operation mode input
+
+ // Data: 30-bit (each) input: Data Ports
+ .A(aa), // 30-bit input: A data input
+ .B(bb), // 18-bit input: B data input
+ .C('x), // 48-bit input: C data input
+ .CARRYIN('0), // 1-bit input: Carry input signal
+ .D(dd), // 25-bit input: D data input (NOTE(review): `dd` is declared 27 bits wide - confirm its two upper bits are never populated for this primitive)
+
+ // Reset/Clock Enable: 1-bit (each) input: Reset/Clock Enable Inputs
+ .CEA1('0), // 1-bit input: Clock enable input for 1st stage AREG
+ .CEA2('0), // 1-bit input: Clock enable input for 2nd stage AREG
+ .CEAD(en), // 1-bit input: Clock enable input for ADREG
+ .CEALUMODE('0), // 1-bit input: Clock enable input for ALUMODERE
+ .CEB1('0), // 1-bit input: Clock enable input for 1st stage BREG
+ .CEB2(en), // 1-bit input: Clock enable input for 2nd stage BREG
+ .CEC('0), // 1-bit input: Clock enable input for CREG
+ .CECARRYIN('0), // 1-bit input: Clock enable input for CARRYINREG
+ .CECTRL(en), // 1-bit input: Clock enable input for OPMODEREG and CARRYINSELREG
+ .CED('0), // 1-bit input: Clock enable input for DREG
+ .CEINMODE('0), // 1-bit input: Clock enable input for INMODEREG
+ .CEM(en), // 1-bit input: Clock enable input for MREG
+ .CEP(en), // 1-bit input: Clock enable input for PREG
+ .RSTA('0), // 1-bit input: Reset input for AREG
+ .RSTB( // 1-bit input: Reset for BREG (`rst` is added for simulation only; hardware relies on `zero` alone)
+// synthesis translate_off
+ rst ||
+// synthesis translate_on
+ zero
+ ),
+ .RSTC('0), // 1-bit input: Reset for CREG
+ .RSTD( // 1-bit input: Reset for DREG and ADREG (`zero` is added for simulation only; hardware relies on `rst` alone)
+// synthesis translate_off
+ zero ||
+// synthesis translate_on
+ rst
+ ),
+ .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG
+ .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG
+ .RSTCTRL('0), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG
+ .RSTINMODE('0), // 1-bit input: Reset for INMODE register
+ .RSTM(rst), // 1-bit input: Reset for MREG
+ .RSTP(rst) // 1-bit input: Reset for PREG
+ );
+ 2: DSP48E2 #(
+ // Feature Control Attributes: Data Path Selection
+ .AMULTSEL("AD"), // Selects A input to multiplier (A, AD)
+ .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
+ .BMULTSEL("B"), // Selects B input to multiplier (AD, B)
+ .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
+ .PREADDINSEL("A"), // Selects input to pre-adder (A, B)
+ .RND('0), // Rounding Constant
+ .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE)
+ .USE_SIMD("ONE48"), // SIMD selection (FOUR12, ONE48, TWO24)
+ .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE)
+ .XORSIMD("XOR24_48_96"), // Mode of operation for the Wide XOR (XOR12, XOR24_48_96)
+
+ // Pattern Detector Attributes: Pattern Detection Configuration
+ .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH
+ .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET).
+ .MASK('1), // 48-bit mask value for pattern detect (1=ignore)
+ .PATTERN('0), // 48-bit pattern match for pattern detect
+ .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2
+ .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN)
+ .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET)
+
+ // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
+ .IS_ALUMODE_INVERTED('0), // Optional inversion for ALUMODE
+ .IS_CARRYIN_INVERTED('0), // Optional inversion for CARRYIN
+ .IS_CLK_INVERTED('0), // Optional inversion for CLK
+ .IS_INMODE_INVERTED('0), // Optional inversion for INMODE
+ .IS_OPMODE_INVERTED({ 2'b00, OPMODE_INVERSION}), // Optional inversion for OPMODE: applies the constant OPMODE part inside the primitive
+ .IS_RSTALLCARRYIN_INVERTED('0), // Optional inversion for RSTALLCARRYIN
+ .IS_RSTALUMODE_INVERTED('0), // Optional inversion for RSTALUMODE
+ .IS_RSTA_INVERTED('0), // Optional inversion for RSTA
+ .IS_RSTB_INVERTED('0), // Optional inversion for RSTB
+ .IS_RSTCTRL_INVERTED('0), // Optional inversion for RSTCTRL
+ .IS_RSTC_INVERTED('0), // Optional inversion for RSTC
+ .IS_RSTD_INVERTED('0), // Optional inversion for RSTD
+ .IS_RSTINMODE_INVERTED('0), // Optional inversion for RSTINMODE
+ .IS_RSTM_INVERTED('0), // Optional inversion for RSTM
+ .IS_RSTP_INVERTED('0), // Optional inversion for RSTP
+
+ // Register Control Attributes: Pipeline Register Configuration
+ .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0-2)
+ .ADREG(1), // Pipeline stages for pre-adder (0-1)
+ .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1)
+ .AREG(0), // Pipeline stages for A (0-2)
+ .BCASCREG(1), // Number of pipeline stages between B/BCIN and BCOUT (0-2)
+ .BREG(1), // Pipeline stages for B (0-2)
+ .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1)
+ .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1)
+ .CREG(0), // Pipeline stages for C (0-1)
+ .DREG(0), // Pipeline stages for D (0-1)
+ .INMODEREG(0), // Pipeline stages for INMODE (0-1)
+ .MREG(1), // Multiplier pipeline stages (0-1)
+ .OPMODEREG(1), // Pipeline stages for OPMODE (0-1)
+ .PREG(1) // Number of pipeline stages for P (0-1)
+ ) dsp (
+ // Cascade outputs: Cascade Ports
+ .ACOUT(), // 30-bit output: A port cascade
+ .BCOUT(), // 18-bit output: B cascade
+ .CARRYCASCOUT(), // 1-bit output: Cascade carry
+ .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade
+ .PCOUT(), // 48-bit output: Cascade output
+
+ // Control outputs: Control Inputs/Status Bits
+ .OVERFLOW(), // 1-bit output: Overflow in add/acc
+ .PATTERNBDETECT(), // 1-bit output: Pattern bar detect
+ .PATTERNDETECT(), // 1-bit output: Pattern detect
+ .UNDERFLOW(), // 1-bit output: Underflow in add/acc
+
+ // Data outputs: Data Ports
+ .CARRYOUT(), // 4-bit output: Carry
+ .P(pp), // 48-bit output: Primary data
+ .XOROUT(), // 8-bit output: XOR data
+
+ // Cascade inputs: Cascade Ports
+ .ACIN('x), // 30-bit input: A cascade data
+ .BCIN('x), // 18-bit input: B cascade
+ .CARRYCASCIN('x), // 1-bit input: Cascade carry
+ .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade
+ .PCIN('x), // 48-bit input: P cascade
+
+ // Control inputs: Control Inputs/Status Bits
+ .CLK(clk), // 1-bit input: Clock
+ .ALUMODE(4'h0), // 4-bit input: ALU control
+ .CARRYINSEL('0), // 3-bit input: Carry select
+ .INMODE(5'b01100), // 5-bit input: INMODE control
+ .OPMODE({ 2'b00, opmode }), // 9-bit input: Operation mode
+
+ // Data inputs: Data Ports
+ .A(aa), // 30-bit input: A data
+ .B(bb), // 18-bit input: B data
+ .C('x), // 48-bit input: C data
+ .CARRYIN('0), // 1-bit input: Carry-in
+ .D(dd), // 27-bit input: D data
+
+ // Reset/Clock Enable inputs: Reset/Clock Enable Inputs
+ .CEA1('0), // 1-bit input: Clock enable for 1st stage AREG
+ .CEA2('0), // 1-bit input: Clock enable for 2nd stage AREG
+ .CEAD(en), // 1-bit input: Clock enable for ADREG
+ .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE
+ .CEB1('0), // 1-bit input: Clock enable for 1st stage BREG
+ .CEB2(en), // 1-bit input: Clock enable for 2nd stage BREG
+ .CEC('0), // 1-bit input: Clock enable for CREG
+ .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG
+ .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG
+ .CED('0), // 1-bit input: Clock enable for DREG
+ .CEINMODE('0), // 1-bit input: Clock enable for INMODEREG
+ .CEM(en), // 1-bit input: Clock enable for MREG
+ .CEP(en), // 1-bit input: Clock enable for PREG
+ .RSTA('0), // 1-bit input: Reset for AREG
+ .RSTB( // 1-bit input: Reset for BREG (`rst` is added for simulation only; hardware relies on `zero` alone)
+// synthesis translate_off
+ rst ||
+// synthesis translate_on
+ zero
+ ),
+ .RSTC('0), // 1-bit input: Reset for CREG
+ .RSTD( // 1-bit input: Reset for DREG and ADREG (`zero` is added for simulation only; hardware relies on `rst` alone)
+// synthesis translate_off
+ zero ||
+// synthesis translate_on
+ rst
+ ),
+ .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG
+ .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG
+ .RSTCTRL('0), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG
+ .RSTINMODE('0), // 1-bit input: Reset for INMODE register
+ .RSTM(rst), // 1-bit input: Reset for MREG
+ .RSTP(rst) // 1-bit input: Reset for PREG
+ );
+ default: initial begin // any VERSION other than 1 or 2: report it and stop at time zero
+ $error("Unknown version DSP48E%0d.", VERSION);
+ $finish;
+ end
+ endcase
+ end : genDSP
+`endif
+
+ // External Canary Pipeline: tracks, outside the DSP, the two bits expected at offset D[1] of the accumulator
+ logic [1:0] X1 = '{ default: 0 };
+ logic [1:0] X2 = '{ default: 0 };
+ logic [1:0] X3 = '{ default: 0 };
+ always_ff @(posedge clk) begin
+ if(rst) begin
+ X1 <= '{ default: 0 };
+ X2 <= '{ default: 0 };
+ X3 <= '{ default: 0 };
+ end
+ else if(en) begin
+ X1 <= xx; // delays the externally computed 2-bit product by two stages ...
+ X2 <= X1;
+ X3 <= X2 + (L[3]? 2'h0 : pp[D[1]+:2]); // ... then adds it to the current accumulator bits, or to 0 when L[3] restarts the accumulation
+ end
+ end
+
+ // Derive actual cross-lane overflows: observed bits minus the X3 prediction (presumably the carry/borrow leaked in from the lane below - confirm)
+ assign h3[s] = pp[D[1]+:2] - X3;
+
+ assign p3[s] = pp; // 48-bit pp zero-extended into the 58-bit p3 entry
+
+ end : genSIMD
+
+ // Stage #4: Cross-SIMD Reduction
+
+ // Count leaves reachable from each node
+ localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 0}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop
+
+ uwire signed [ACCU_WIDTH -1:0] up4; // registered SIMD sum of the upper-lane fields
+ uwire signed [ACCU_WIDTH -SINGLE_PROD_WIDTH:0] hi4; // accumulated cross-lane overflow; one bit wider than Hi4, which it sign-extends
+ uwire [$clog2(SIMD)+SINGLE_PROD_WIDTH-1:0] lo4; // registered SIMD sum of the lower-lane fields
+
+ // Conclusive high part accumulation (only when both lanes are occupied)
+ if(PE_REM == 0) begin : genHi
+ localparam int unsigned HI_WIDTH = ACCU_WIDTH - D[1];
+ // Adder Tree across all SIMD high contributions, each from [-1:1]
+ uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0] tree; // children of node n sit at 2*n+1 and 2*n+2; leaves occupy SIMD-1 .. 2*SIMD-2
+ for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s];
+ for(genvar n = 0; n < SIMD-1; n++) begin
+ // Sum truncated to actual maximum bit width at this node
+ uwire signed [$clog2(1+LEAVE_LOAD[n]):0] s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]); // element selects of a packed array are unsigned, hence the $signed casts
+ assign tree[n] = s; // signed `s` is sign-extended to the full tree entry width
+ end
+
+ // High Sideband Accumulation
+ logic signed [HI_WIDTH-1:0] Hi4 = 0;
+ always_ff @(posedge clk) begin
+ if(rst) Hi4 <= 0;
+ else if(en) Hi4 <= (L[4]? 0 : Hi4) + $signed(tree[0]); // L[4] restarts the sideband total
+ end
+ assign hi4 = Hi4;
+ end : genHi
+ else begin : genHiZero
+ assign hi4 = '0; // lower lane unoccupied: no overflow correction is applied
+ end : genHiZero
+
+ for(genvar i = 0; i < 2; i++) begin // i = 0: lane at offset D[0], i = 1: lane at offset D[1]
+ localparam int unsigned LO_WIDTH = D[i+1] - D[i]; // field width of lane i within the DSP result
+ // Conclusive low part accumulation
+ if(i >= PE_REM) begin : blkLo // lanes below PE_REM are unoccupied
+ // Adder Tree across all SIMD low contributions
+ localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); // NOTE(review): looks like 32-bit constant arithmetic, so 2**LO_WIDTH could wrap for lane 1 (LO_WIDTH == ACCU_WIDTH) when ACCU_WIDTH approaches 32 - confirm the supported range
+ uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; // children of node n sit at 2*n+1 and 2*n+2; leaves occupy SIMD-1 .. 2*SIMD-2
+ for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH];
+ for(genvar n = 0; n < SIMD-1; n++) begin
+ // Sum truncated to actual maximum bit width at this node
+ localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1));
+ uwire [NODE_WIDTH-1:0] s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]);
+ assign tree[n] = s; // unsigned `s` is zero-extended to the full tree entry width
+ end
+
+ logic [ROOT_WIDTH-1:0] Lo4 = 0; // registers the tree root, matching the register stage of Hi4
+ always_ff @(posedge clk) begin
+ if(rst) Lo4 <= 0;
+ else if(en) Lo4 <= tree[0];
+ end
+
+ if(i == 1) assign up4 = Lo4; // upper lane feeds up4 ...
+ else assign lo4 = Lo4; // ... lower lane feeds lo4
+ end : blkLo
+ else begin : blkLoZero
+ assign lo4 = '0; // reached only for i = 0 with PE_REM = 1: the lower lane is unused
+ end : blkLoZero
+
+ end
+
+ // Stage #5: Resolve lane totals by moving the accumulated cross-lane overflow from the upper to the lower result
+ logic signed [1:0][ACCU_WIDTH-1:0] Res5 = '{ default: 0 };
+ always_ff @(posedge clk) begin
+ if(rst) Res5 <= '{ default: 0 };
+ else if(en) begin
+ Res5[1] <= up4 - hi4; // upper lane: subtract the overflow that was accumulated on top of it
+ Res5[0] <= $signed({ hi4, {(D[1] - D[0]){1'b0}} }) + $signed({ 1'b0, lo4 }); // lower lane: overflow shifted up by the lane width plus the zero-extended unsigned low sum
+ end
+ end
+
+ // Output
+ for(genvar pe = PE_BEG; pe < PE_END; pe++) begin
+ assign p[pe] = Res5[pe - PE_BEG + PE_REM]; // with PE_REM = 1 the single PE of the pipe reads the upper lane result
+ end
+
+ end : genPipes
+
+endmodule : mvu_8sx8u_dsp48
diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv
new file mode 100644
index 0000000000..3bbc7051b9
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv
@@ -0,0 +1,430 @@
+/******************************************************************************
+ * Copyright (C) 2024, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief Matrix/Vector Vector Unit (MVU/VVU) core compute kernel utilizing DSP58.
+ *****************************************************************************/
+
+module mvu_vvu_8sx9_dsp58 #(
+ bit IS_MVU,
+ int unsigned PE,
+ int unsigned SIMD,
+ int unsigned ACTIVATION_WIDTH,
+ int unsigned WEIGHT_WIDTH,
+ int unsigned ACCU_WIDTH,
+ bit SIGNED_ACTIVATIONS = 0,
+ int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment)
+ bit FORCE_BEHAVIORAL = 0,
+
+ localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD,
+ localparam int unsigned WEIGHT_ELEMENTS = PE*SIMD
+ )
+ (
+ // Global Control
+ input logic clk,
+ input logic rst,
+ input logic en,
+
+ // Input
+ input logic last,
+ input logic zero, // ignore current inputs and force this partial product to zero
+ input logic [WEIGHT_ELEMENTS-1:0][WEIGHT_WIDTH-1:0] w, // weights
+ input logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // activations
+
+ // Output
+ output logic vld,
+ output logic [PE-1:0][ACCU_WIDTH-1:0] p
+ );
+ // for verilator always use behavioral code
+ localparam bit BEHAVIORAL =
+`ifdef VERILATOR
+ 1 ||
+`endif
+ FORCE_BEHAVIORAL;
+
+//-------------------- Declare global signals --------------------\\
+ localparam int unsigned CHAINLEN = (SIMD+2)/3;
+ localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length
+ localparam int unsigned PE_ACTIVATION = IS_MVU ? 1 : PE;
+ uwire [26:0] a_in_i [PE_ACTIVATION * CHAINLEN];
+ uwire [23:0] b_in_i [PE][CHAINLEN];
+ uwire [PE-1:0][CHAINLEN-1:0][57:0] pcout; // Array with packed dimension > 256 (with a loop-carried dependency) cannot be handled out-of-the-box with PyVerilator
+
+//-------------------- Shift register for opmode select signal --------------------\\
+ localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register))
+ logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg).
+ // Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric)
+
+ always_ff @(posedge clk) begin
+ if(rst) L <= '{default: 0};
+ else if(en) begin
+ L[1+MAX_PIPELINE_STAGES] <= last;
+ L[0:MAX_PIPELINE_STAGES] <= L[1:1+MAX_PIPELINE_STAGES];
+ end
+ end
+ assign vld = L[0];
+
+//-------------------- Shift register for ZERO flag --------------------\\
+ logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric)
+
+ if (MAX_PIPELINE_STAGES > 1) begin : genZreg
+ always_ff @(posedge clk) begin
+ if (rst) Z <= '{default: 0};
+ else if(en) begin
+ Z[0] <= zero;
+ if (MAX_PIPELINE_STAGES > 2) Z[1:MAX_PIPELINE_STAGES-2] <= Z[0:MAX_PIPELINE_STAGES-3];
+ end
+ end
+ end;
+
+//-------------------- Buffer for input activations --------------------\\
+ localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH;
+ for (genvar k=0; k1 ? TOTAL_PREGS-1 : 0;
+ localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? SIMD - 3*i : 3;
+
+ if (EXTERNAL_PREGS > 0) begin : genExternalPregAct
+ logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0};
+ always_ff @(posedge clk) begin
+ if (rst) A <= '{default: 0};
+ else if(en) begin
+ A[EXTERNAL_PREGS-1] <=
+// synthesis translate_off
+ zero ? '1 :
+// synthesis translate_on
+ a[SIMD*k + 3*i +: LANES_OCCUPIED];
+ if (EXTERNAL_PREGS > 1) A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1];
+ end
+ end
+ for (genvar j=0; j1 ? TOTAL_PREGS-1 : 0;
+ localparam int LANES_OCCUPIED = j == CHAINLEN-1 ? SIMD - 3*j : 3;
+
+ if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight
+ logic [0:PE-1][0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][WEIGHT_WIDTH-1:0] B = '{ default : 0};
+ always_ff @(posedge clk) begin
+ if (rst) B <= '{default: 0};
+ else if (en) begin
+ B[i][EXTERNAL_PREGS-1] <=
+// synthesis translate_off
+ zero ? '1 :
+// synthesis translate_on
+ //w[i][3*j +: LANES_OCCUPIED];
+ w[SIMD*i+3*j +: LANES_OCCUPIED];
+ if (EXTERNAL_PREGS > 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1];
+ end
+ end
+ for (genvar k = 0 ; k < LANES_OCCUPIED ; k++) begin : genBin
+ assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? B[i][0][k] : { {PAD_BITS_WEIGHT{B[i][0][k][WEIGHT_WIDTH-1]}}, B[i][0][k] };
+ end : genBin
+ for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero
+ assign b_in_i[i][j][8*k +: 8] = 8'b0;
+ end : genBinZero
+ end : genExternalPregWeight
+ else begin : genInpDSPWeight
+ for (genvar k = 0; k < LANES_OCCUPIED; k++) begin : genBin
+ assign b_in_i[i][j][8*k +: 8] =
+// synthesis translate_off
+ zero ? '1 :
+// synthesis translate_on
+ PAD_BITS_WEIGHT == 0 ? w[SIMD*i+3*j+k] : { {PAD_BITS_WEIGHT{w[SIMD*i+3*j+k][WEIGHT_WIDTH-1]}}, w[SIMD*i+3*j+k] };
+ end : genBin
+ for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero
+ assign b_in_i[i][j][8*k +: 8] = 8'b0;
+ end : genBinZero
+ end : genInpDSPWeight
+ end : genWeightSIMD
+ end : genWeightPE
+
+//-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\
+ for (genvar i=0; i0 ? 2 : 1; // 1 : 0
+ localparam bit PREG = (j+1)%SEGLEN==0 || j == CHAINLEN-1;
+ localparam bit FIRST = j == 0;
+ localparam bit LAST = j == CHAINLEN-1;
+ uwire [57:0] pp;
+
+ if (LAST) begin : genPOUT
+ assign p[i] = pp[ACCU_WIDTH-1:0];
+ end
+
+ // Note: Since the product B * AD is computed,
+ // rst can be only applied to AD and zero only to B
+ // with the same effect as zeroing both.
+ if(BEHAVIORAL) begin : genBehav
+ // Stage #1: Input A/B
+ logic signed [33:0] Areg [INTERNAL_PREGS];
+ always_ff @(posedge clk) begin
+ if (rst) Areg <= '{ default : 0};
+ else if (en) begin
+ Areg[0] <= { 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] };
+ if (INTERNAL_PREGS == 2) Areg[1] <= Areg[0];
+ end
+ end
+ logic signed [23:0] Breg [INTERNAL_PREGS];
+ always_ff @(posedge clk) begin
+ if (rst) Breg <= '{ default : 0};
+ else if (en) begin
+ Breg[0] <= b_in_i[i][j];
+ if (INTERNAL_PREGS == 2) Breg[1] <= Breg[0];
+ end
+ end
+
+ // Stage #2: Multiply-Accumulate
+ logic signed [57:0] Mreg;
+ logic InmodeZero = 0;
+ always_ff @(posedge clk) begin
+ if (rst) InmodeZero <= 0;
+ else if (en) InmodeZero <= ( TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero );
+ end
+ always_ff @(posedge clk) begin
+ if (rst) Mreg <= 0;
+ else if (en) begin
+ automatic logic signed [57:0] m = 0;
+ for (int k = 0; k < 3; k++) begin
+ m = m + (InmodeZero ? 0 : $signed(Areg[INTERNAL_PREGS-1][9*k +: 9]) * $signed(Breg[INTERNAL_PREGS-1][8*k +: 8]));
+ end
+ Mreg <= m;
+ end
+ end
+
+ // Stage #3: Accumulate
+ logic signed [57:0] Preg;
+ logic Opmode = 0;
+ if (FIRST && !LAST) begin : genFirst
+ if (PREG) begin : genPregBehav
+ always_ff @(posedge clk) begin
+ if (rst) Preg <= 0;
+ else if (en) Preg <= Mreg;
+ end
+ end
+ else assign Preg = Mreg;
+ end
+ else if (FIRST && LAST) begin : genSingle
+ always_ff @(posedge clk) begin
+ if (rst) Opmode <= 0;
+ else if (en) Opmode <= L[1];
+ end
+ always_ff @(posedge clk) begin
+ if (rst) Preg <= 0;
+ else if (en) Preg <= (Opmode ? 0 : Preg) + Mreg;
+ end
+ end
+ else if (!FIRST && LAST) begin : genLast
+ always_ff @(posedge clk) begin
+ if (rst) Opmode <= 0;
+ else if (en) Opmode <= L[1];
+ end
+ always_ff @(posedge clk) begin
+ if (rst) Preg <= 0;
+ else if (en) Preg <= (Opmode ? 0 : Preg) + Mreg + pcout[i][j-1];
+ end
+ end
+ else begin : genMid
+ if (PREG) begin : genPregBehav
+ always_ff @(posedge clk) begin
+ if (rst) Preg <= 0;
+ else if (en) Preg <= Mreg + pcout[i][j-1];
+ end
+ end
+ else assign Preg = Mreg + pcout[i][j-1];
+ end
+ assign pp = Preg;
+ assign pcout[i][j] = Preg;
+ end : genBehav
+`ifndef VERILATOR
+ else begin: genDSP
+ DSP58 #(
+ // Feature Control Attributes: Data Path Selection
+ .AMULTSEL("A"), // Selects A input to multiplier (A, AD)
+ .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port)
+ .BMULTSEL("B"), // Selects B input to multiplier (AD, B)
+ .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port)
+ .DSP_MODE("INT8"), // Configures DSP to a particular mode of operation. Set to INT24 for
+ // legacy mode.
+ .PREADDINSEL("A"), // Selects input to pre-adder (A, B)
+ .RND(58'h000000000000000), // Rounding Constant
+ .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE)
+ .USE_SIMD("ONE58"), // SIMD selection (FOUR12, ONE58, TWO24)
+ .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE)
+ .XORSIMD("XOR24_34_58_116"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116)
+ // Pattern Detector Attributes: Pattern Detection Configuration
+ .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH
+ .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET).
+ .MASK(58'h0ffffffffffffff), // 58-bit mask value for pattern detect (1=ignore)
+ .PATTERN(58'h000000000000000), // 58-bit pattern match for pattern detect
+ .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2
+ .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN)
+ .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET)
+ // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins
+ .IS_ALUMODE_INVERTED(4'b0000), // Optional inversion for ALUMODE
+ .IS_CARRYIN_INVERTED(1'b0), // Optional inversion for CARRYIN
+ .IS_CLK_INVERTED(1'b0), // Optional inversion for CLK
+ .IS_INMODE_INVERTED(5'b00000), // Optional inversion for INMODE
+ .IS_NEGATE_INVERTED(3'b000), // Optional inversion for NEGATE
+ .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0
+ FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 0 : PCIN
+ 2'b01, // Y : M
+ 2'b01 // X: M
+ }), // Optional inversion for OPMODE
+ .IS_RSTALLCARRYIN_INVERTED(1'b0), // Optional inversion for RSTALLCARRYIN
+ .IS_RSTALUMODE_INVERTED(1'b0), // Optional inversion for RSTALUMODE
+ .IS_RSTA_INVERTED(1'b0), // Optional inversion for RSTA
+ .IS_RSTB_INVERTED(1'b0), // Optional inversion for RSTB
+ .IS_RSTCTRL_INVERTED(1'b0), // Optional inversion for STCONJUGATE_A
+ .IS_RSTC_INVERTED(1'b0), // Optional inversion for RSTC
+ .IS_RSTD_INVERTED(1'b0), // Optional inversion for RSTD
+ .IS_RSTINMODE_INVERTED(1'b0), // Optional inversion for RSTINMODE
+ .IS_RSTM_INVERTED(1'b0), // Optional inversion for RSTM
+ .IS_RSTP_INVERTED(1'b0), // Optional inversion for RSTP
+ // Register Control Attributes: Pipeline Register Configuration
+ .ACASCREG(INTERNAL_PREGS), // Number of pipeline stages between A/ACIN and ACOUT (0-2)
+ .ADREG(0), // Pipeline stages for pre-adder (0-1)
+ .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1)
+ .AREG(INTERNAL_PREGS), // Pipeline stages for A (0-2)
+ .BCASCREG(INTERNAL_PREGS), // Number of pipeline stages between B/BCIN and BCOUT (0-2)
+ .BREG(INTERNAL_PREGS), // Pipeline stages for B (0-2)
+ .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1)
+ .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1)
+ .CREG(0), // Pipeline stages for C (0-1)
+ .DREG(0), // Pipeline stages for D (0-1)
+ .INMODEREG(1), // Pipeline stages for INMODE (0-1)
+ .MREG(1), // Multiplier pipeline stages (0-1)
+ .OPMODEREG(1), // Pipeline stages for OPMODE (0-1)
+ .PREG(PREG), // Number of pipeline stages for P (0-1)
+ .RESET_MODE("SYNC") // Selection of synchronous or asynchronous reset. (ASYNC, SYNC).
+ )
+ DSP58_inst (
+ // Cascade outputs: Cascade Ports
+ .ACOUT(), // 34-bit output: A port cascade
+ .BCOUT(), // 24-bit output: B cascade
+ .CARRYCASCOUT(), // 1-bit output: Cascade carry
+ .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade
+ .PCOUT(pcout[i][j]), // 58-bit output: Cascade output
+ // Control outputs: Control Inputs/Status Bits
+ .OVERFLOW(), // 1-bit output: Overflow in add/acc
+ .PATTERNBDETECT(), // 1-bit output: Pattern bar detect
+ .PATTERNDETECT(), // 1-bit output: Pattern detect
+ .UNDERFLOW(), // 1-bit output: Underflow in add/acc
+ // Data outputs: Data Ports
+ .CARRYOUT(), // 4-bit output: Carry
+ .P(pp), // 58-bit output: Primary data
+ .XOROUT(), // 8-bit output: XOR data
+ // Cascade inputs: Cascade Ports
+ .ACIN('x), // 34-bit input: A cascade data
+ .BCIN('x), // 24-bit input: B cascade
+ .CARRYCASCIN('x), // 1-bit input: Cascade carry
+ .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade
+ .PCIN(FIRST ? 'x : pcout[i][j-1]), // 58-bit input: P cascade
+ // Control inputs: Control Inputs/Status Bits
+ .ALUMODE(4'h0), // 4-bit input: ALU control
+ .CARRYINSEL('0), // 3-bit input: Carry select
+ .CLK(clk), // 1-bit input: Clock
+ .INMODE({
+ INTERNAL_PREGS==2 ? 1'b0 : 1'b1,
+ 2'b00,
+ TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero,
+ INTERNAL_PREGS==2 ? 1'b0 : 1'b1
+ }), // 5-bit input: INMODE control
+ .NEGATE('0), // 3-bit input: Negates the input of the multiplier
+ .OPMODE({
+ LAST ? {1'b0, L[1]} : 2'b00,
+ 7'b000_0000
+ }), // 9-bit input: Operation mode
+ // Data inputs: Data Ports
+ .A({ 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] }), // 34-bit input: A data
+ .B(b_in_i[i][j]), // 24-bit input: B data
+ .C('x), // 58-bit input: C data
+ .CARRYIN('0), // 1-bit input: Carry-in
+ .D('x), // 27-bit input: D data
+ // Reset/Clock Enable inputs: Reset/Clock Enable Inputs
+ .ASYNC_RST('0), // 1-bit input: Asynchronous reset for all registers.
+ .CEA1(en), // 1-bit input: Clock enable for 1st stage AREG
+ .CEA2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage AREG
+ .CEAD('0), // 1-bit input: Clock enable for ADREG
+ .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE
+ .CEB1(en), // 1-bit input: Clock enable for 1st stage BREG
+ .CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG
+ .CEC('0), // 1-bit input: Clock enable for CREG
+ .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG
+ .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG
+ .CED('0), // 1-bit input: Clock enable for DREG
+ .CEINMODE(en), // 1-bit input: Clock enable for INMODEREG
+ .CEM(en), // 1-bit input: Clock enable for MREG
+ .CEP(PREG && en), // 1-bit input: Clock enable for PREG
+ .RSTA(rst), // 1-bit input: Reset for AREG
+ .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG
+ .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG
+ .RSTB(rst), // 1-bit input: Reset for BREG
+ .RSTC('0), // 1-bit input: Reset for CREG
+ .RSTCTRL(rst), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG
+ .RSTD('0), // 1-bit input: Reset for DREG and ADREG
+ .RSTINMODE(rst), // 1-bit input: Reset for INMODE register
+ .RSTM(rst), // 1-bit input: Reset for MREG
+ .RSTP(PREG && rst) // 1-bit input: Reset for PREG
+ );
+ end : genDSP
+`endif
+ end : genDSPChain
+ end : genDSPPE
+
+endmodule : mvu_vvu_8sx9_dsp58
diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv
new file mode 100644
index 0000000000..6498530113
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_vvu_axi.sv
@@ -0,0 +1,367 @@
+/******************************************************************************
+ * Copyright (C) 2024, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief Matrix Vector Unit (MVU) & Vector Vector Unit (VVU) AXI-Stream interface wrapper.
+ * @details
+ * The following compute cores are supported:
+ * - 4-bit MVU on DSP48 achieving 4 MACs/DSP,
+ * - (4,8]-bit MVU on DSP48 achieving 2 MACs/DSP,
+ * - [4,9]-bit MVU and VVU on DSP58 achieving 3 MACs/DSP,
+ * Folding hints:
+ * - PE scaling should divide MH.
+ * - SIMD scaling should divide MW.
+ * - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to
+ * impact critical paths more than PE scaling. PE scaling implies a
+ * bigger fanout on the input activations.
+ * - Full unfolding along MH (PE=MH) results in no replay buffer instantiated
+ *****************************************************************************/
+
+module mvu_vvu_axi #( // Stream wrapper: activation replay buffer -> selected compute core -> output register slice
+ bit IS_MVU, // 1: one activation vector shared by all PEs (MVU); 0: one activation vector per PE (VVU)
+ parameter COMPUTE_CORE, // core selector string, matched by the case statement inside blkDsp
+ int unsigned MW, // matrix width; must be a multiple of SIMD
+ int unsigned MH, // matrix height; must be a multiple of PE
+ int unsigned PE, // number of processing elements (output lanes)
+ int unsigned SIMD, // number of input lanes per PE
+ int unsigned SEGMENTLEN = 0, // only forwarded to the mvu_vvu_8sx9_dsp58 core
+
+ int unsigned ACTIVATION_WIDTH,
+ int unsigned WEIGHT_WIDTH,
+ int unsigned ACCU_WIDTH,
+ bit SIGNED_ACTIVATIONS = 0,
+
+ bit PUMPED_COMPUTE = 0, // 1: clock the core from ap_clk2x, splitting each slow cycle's SIMD lanes over two fast cycles
+ bit FORCE_BEHAVIORAL = 0, // forwarded to the compute core
+ bit M_REG_LUT = 1, // not referenced anywhere inside this module
+
+ // Safely deducible parameters
+ localparam int unsigned WEIGHT_STREAM_WIDTH = PE * SIMD * WEIGHT_WIDTH,
+ localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (WEIGHT_STREAM_WIDTH + 7)/8 * 8, // rounded up to a whole number of bytes
+ localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH,
+ localparam int unsigned INPUT_STREAM_WIDTH_BA = (INPUT_STREAM_WIDTH + 7)/8 * 8, // rounded up to a whole number of bytes
+ localparam int unsigned OUTPUT_STREAM_WIDTH = PE*ACCU_WIDTH,
+ localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_STREAM_WIDTH + 7)/8 * 8, // rounded up to a whole number of bytes
+ localparam bit SIMD_UNEVEN = SIMD % 2 // odd SIMD needs one padding lane when PUMPED_COMPUTE is set
+)(
+ // Global Control
+ input logic ap_clk,
+ input logic ap_clk2x, // synchronous, double-speed clock; only used for PUMPED_COMPUTE
+ input logic ap_rst_n, // active-low reset; inverted into rst below
+
+ // Weight Stream
+ input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata,
+ input logic s_axis_weights_tvalid,
+ output logic s_axis_weights_tready,
+
+ // Input Stream
+ input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata,
+ input logic s_axis_input_tvalid,
+ output logic s_axis_input_tready,
+
+ // Output Stream
+ output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata,
+ output logic m_axis_output_tvalid,
+ input logic m_axis_output_tready
+);
+
+//-------------------- Parameter sanity checks --------------------\\
+ initial begin
+ if (MW % SIMD != 0) begin
+ $error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD);
+ $finish;
+ end
+ if (MH % PE != 0) begin
+ $error("Matrix height (%0d) is not a multiple of PE (%0d).", MH, PE);
+ $finish;
+ end
+ if (WEIGHT_WIDTH > 8) begin
+ $error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH);
+ $finish;
+ end
+ if (ACTIVATION_WIDTH > 8) begin
+ if (!(SIGNED_ACTIVATIONS == 1 && ACTIVATION_WIDTH == 9 && COMPUTE_CORE == "mvu_vvu_8sx9_dsp58")) begin // the only accepted width above 8: signed 9-bit on the DSP58 core
+ $error("Activation width of %0d-bits exceeds maximum of 8-bits (9-bits only for signed activations on DSP58)", ACTIVATION_WIDTH);
+ $finish;
+ end
+ end
+ if (COMPUTE_CORE == "mvu_vvu_8sx9_dsp58") begin
+ if (SEGMENTLEN == 0) begin
+ $warning("Segment length of %0d defaults to chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
+ end
+ if (SEGMENTLEN > (SIMD+2)/3) begin
+ $error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3);
+ $finish;
+ end
+ end
+ if (!IS_MVU) begin
+ if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58" && COMPUTE_CORE != "mvu_vvu_lut") begin
+ $error("VVU only supported on DSP58 or LUT-based implementation");
+ $finish;
+ end
+ end
+ end
+
+ uwire clk = ap_clk;
+ uwire clk2x = ap_clk2x;
+ uwire rst = !ap_rst_n; // internal reset is active-high
+
+ //- Replay to Accommodate Neuron Fold -----------------------------------
+ typedef logic [(IS_MVU? 1:PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0] mvu_flatin_t;
+ uwire mvu_flatin_t amvau; // replay buffer output data
+ uwire alast; // replay buffer olast flag
+ uwire afin; // replay buffer ofin flag; not read elsewhere in this module
+ uwire avld;
+ uwire ardy;
+
+ localparam int unsigned SF = MW/SIMD; // input words per matrix row (length of one replayed sequence)
+ localparam int unsigned NF = MH/PE; // replay count used for MVU; VVU uses a replay count of 1
+ replay_buffer #(.LEN(SF), .REP(IS_MVU ? NF : 1), .W($bits(mvu_flatin_t))) activation_replay (
+ .clk, .rst,
+ .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvu_flatin_t'(s_axis_input_tdata)), // cast drops the byte-alignment padding bits
+ .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin)
+ );
+
+ //- Unflatten inputs into structured matrices ---------------------------
+ localparam int unsigned ACT_PE = IS_MVU? 1 : PE;
+ typedef logic [PE -1:0][SIMD-1:0][WEIGHT_WIDTH -1:0] mvu_w_t;
+ typedef logic [ACT_PE-1:0][SIMD-1:0][ACTIVATION_WIDTH-1:0] mvu_a_t;
+
+ uwire mvu_w_t mvu_w = s_axis_weights_tdata;
+
+ //- Conditional Activations Layout Adjustment for VVU
+ uwire mvu_a_t amvau_i;
+ if (IS_MVU || (PE == 1)) begin : genMVUInput
+ assign amvau_i = amvau; // layout already matches; plain pass-through
+ end : genMVUInput
+ else begin : genVVUInput
+ // The input stream will have the channels interleaved for VVU when PE>1
+ // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..]
+ // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like:
+ // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to
+ // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i, P_1), ..., (S_i, P_i)
+ for(genvar pe = 0; pe < ACT_PE; pe++) begin
+ for(genvar simd = 0; simd < SIMD; simd++) begin
+ assign amvau_i[pe][simd] = amvau[simd*ACT_PE+pe];
+ end
+ end
+ end : genVVUInput
+
+ //- Flow Control Bracket around Compute Core ----------------------------
+ uwire en; // driven from A.rdy of the output register slice at the end of the module
+ uwire istb = avld && s_axis_weights_tvalid; // both operands available in this cycle
+ assign ardy = en && s_axis_weights_tvalid; // activations are only consumed together with weights
+ assign s_axis_weights_tready = en && avld; // weights are only consumed together with activations
+
+ //- Conditionally Pumped DSP Compute ------------------------------------
+ typedef logic [PE-1:0][ACCU_WIDTH-1:0] dsp_p_t;
+ uwire ovld;
+ uwire dsp_p_t odat;
+ if(1) begin : blkDsp
+ localparam int unsigned EFFECTIVE_SIMD = SIMD_UNEVEN && PUMPED_COMPUTE ? SIMD+1 : SIMD; // pad odd SIMD to an even lane count when pumping
+ localparam int unsigned DSP_SIMD = EFFECTIVE_SIMD/(PUMPED_COMPUTE+1); // lanes handled by the core per core clock cycle
+ typedef logic [PE -1:0][DSP_SIMD-1:0][WEIGHT_WIDTH -1:0] dsp_w_t;
+ typedef logic [ACT_PE-1:0][DSP_SIMD-1:0][ACTIVATION_WIDTH-1:0] dsp_a_t;
+
+ uwire dsp_clk;
+ uwire dsp_en;
+
+ uwire dsp_last;
+ uwire dsp_zero;
+ uwire dsp_w_t dsp_w;
+ uwire dsp_a_t dsp_a;
+
+ uwire dsp_vld;
+ uwire dsp_p_t dsp_p;
+
+ if(!PUMPED_COMPUTE) begin : genUnpumpedCompute
+ assign dsp_clk = clk;
+ assign dsp_en = en;
+
+ assign dsp_last = alast && avld;
+ assign dsp_zero = !istb; // feed a zero cycle whenever either operand is missing
+ assign dsp_w = mvu_w;
+ assign dsp_a = amvau_i;
+
+ assign ovld = dsp_vld;
+ assign odat = dsp_p;
+ end : genUnpumpedCompute
+ else begin : genPumpedCompute
+ assign dsp_clk = clk2x;
+
+ // Identify second fast cycle just before active slow clock edge
+ logic Active = 0;
+ if(1) begin : blkActive
+ uwire clk_lut[2]; // Put some LUT delay on the input from the fast clock net
+ (* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut0(.O(clk_lut[0]), .I0(clk));
+ (* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut1(.O(clk_lut[1]), .I0(clk_lut[0]));
+ always_ff @(posedge clk2x) Active <= clk_lut[1]; // sample the delayed slow clock level with the fast clock
+ end : blkActive
+
+ // The input for a slow cycle is split across two fast cycles along the SIMD dimension.
+ // - Both fast cycles are controlled by the same enable state.
+ // - A zero cycle is duplicated across both fast cycles.
+ // - The last flag must be restricted to the second fast cycle.
+
+ dsp_w_t W = 'x;
+ for(genvar pe = 0; pe < PE; pe++) begin : genPERegW
+
+ uwire [2*DSP_SIMD-1:0][WEIGHT_WIDTH-1:0] w; // weights of this PE, zero-padded to 2*DSP_SIMD lanes
+ for(genvar i = 0; i < SIMD; i++) assign w[i] = mvu_w[pe][i];
+ for(genvar i = SIMD; i < 2*DSP_SIMD; i++) assign w[i] = 0;
+
+ always_ff @(posedge clk2x) begin
+ if(rst) W[pe] <= 'x;
+ else if(en) W[pe] <= w[(Active? DSP_SIMD : 0) +: DSP_SIMD]; // upper half when Active, lower half otherwise
+ end
+
+ end : genPERegW
+
+ dsp_a_t A = 'x;
+ for(genvar pe = 0; pe < ACT_PE; pe++) begin : genPERegA
+
+ uwire [2*DSP_SIMD-1:0][ACTIVATION_WIDTH-1:0] a; // activations of this PE, zero-padded to 2*DSP_SIMD lanes
+ for(genvar i = 0; i < SIMD; i++) assign a[i] = amvau_i[pe][i];
+ for(genvar i = SIMD; i < 2*DSP_SIMD; i++) assign a[i] = 0;
+
+ always_ff @(posedge clk2x) begin
+ if(rst) A[pe] <= 'x;
+ else if(en) A[pe] <= a[(Active? DSP_SIMD : 0) +: DSP_SIMD]; // same half selection as for the weights
+ end
+
+ end : genPERegA
+
+ logic Zero = 1;
+ logic Last = 0;
+ always_ff @(posedge clk2x) begin
+ if(rst) begin
+ Zero <= 1;
+ Last <= 0;
+ end
+ else if(en) begin
+ Zero <= !istb;
+ Last <= alast && avld && Active; // last flag only passes in the fast cycle where Active is set
+ end
+ end
+
+ assign dsp_en = en;
+ assign dsp_last = Last;
+ assign dsp_zero = Zero;
+ assign dsp_w = W;
+ assign dsp_a = A;
+
+ // Since no two consecutive last cycles will ever be asserted on the input,
+ // valid outputs will also always be spaced by, at least, one other cycle.
+ // We can always hold a captured output for two cycles to allow the slow
+ // clock to pick it up.
+ logic Vld = 0;
+ dsp_p_t P = 'x;
+ always_ff @(posedge clk2x) begin
+ if(rst) begin
+ Vld <= 0;
+ P <= 'x;
+ end
+ else if(en) begin
+ if(dsp_vld) P <= dsp_p;
+ Vld <= dsp_vld || (Vld && !Active); // keep Vld set until a fast cycle with Active set has passed
+ end
+ end
+ assign ovld = Vld;
+ assign odat = P;
+
+ end : genPumpedCompute
+
+ case(COMPUTE_CORE) // generate-time selection of exactly one compute core instance
+ "mvu_vvu_8sx9_dsp58":
+ mvu_vvu_8sx9_dsp58 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(DSP_SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
+ .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN),
+ .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
+ .clk(dsp_clk), .rst, .en(dsp_en),
+ .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
+ .vld(dsp_vld), .p(dsp_p)
+ );
+ "mvu_4sx4u":
+ mvu_4sx4u #(.PE(PE), .SIMD(DSP_SIMD), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
+ .clk(dsp_clk), .rst, .en(dsp_en),
+ .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
+ .vld(dsp_vld), .p(dsp_p)
+ );
+ "mvu_8sx8u_dsp48":
+ mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(DSP_SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH),
+ .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core (
+ .clk(dsp_clk), .rst, .en(dsp_en),
+ .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a),
+ .vld(dsp_vld), .p(dsp_p)
+ );
+ default: initial begin
+ $error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE);
+ $finish;
+ end
+ endcase
+
+ end : blkDsp
+
+//-------------------- Output register slice --------------------\\
+ // Make `en` computation independent from external inputs.
+ // Drive all outputs from registers.
+ struct packed {
+ logic rdy;
+ logic [PE-1:0][ACCU_WIDTH-1:0] dat;
+ } A = '{ rdy: 1, default: 'x }; // side-step register used when encountering backpressure
+ struct packed {
+ logic vld;
+ logic [PE-1:0][ACCU_WIDTH-1:0] dat;
+ } B = '{ vld: 0, default: 'x }; // ultimate output register
+
+ assign en = A.rdy; // the compute pipeline only advances while the side-step register is free
+ uwire b_load = !B.vld || m_axis_output_tready; // B may load when it is empty or being read this cycle
+
+ always_ff @(posedge clk) begin
+ if(rst) begin
+ A <= '{ rdy: 1, default: 'x };
+ B <= '{ vld: 0, default: 'x };
+ end
+ else begin
+ if(A.rdy) A.dat <= odat;
+ A.rdy <= (A.rdy && !ovld) || b_load; // A becomes occupied when a valid result arrives while B cannot load
+
+ if(b_load) begin
+ B <= '{
+ vld: ovld || !A.rdy,
+ dat: A.rdy? odat : A.dat // a result parked in A takes precedence over the core output
+ };
+ end
+ end
+ end
+ assign m_axis_output_tvalid = B.vld;
+ // Why would we need a sign extension here potentially creating a higher signal load into the next FIFO?
+ // These extra bits should never be used. Why not 'x them out?
+ assign m_axis_output_tdata = { {(OUTPUT_STREAM_WIDTH_BA-OUTPUT_STREAM_WIDTH){B.dat[PE-1][ACCU_WIDTH-1]}}, B.dat};
+
+endmodule : mvu_vvu_axi
diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
new file mode 100644
index 0000000000..50c15c1b02
--- /dev/null
+++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v
@@ -0,0 +1,97 @@
+/******************************************************************************
+ * Copyright (C) 2024, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief Verilog AXI-Stream wrapper for MVU & VVU.
+ *****************************************************************************/
+
+module $MODULE_NAME_AXI_WRAPPER$ #( // Plain-Verilog shim: renames the stream ports and instantiates mvu_vvu_axi
+ parameter IS_MVU = $IS_MVU$,
+ parameter COMPUTE_CORE = "$COMPUTE_CORE$",
+ parameter PUMPED_COMPUTE = 0, // leave at 0: the instance below ties ap_clk2x to 1'b0
+ parameter MW = $MW$,
+ parameter MH = $MH$,
+ parameter PE = $PE$,
+ parameter SIMD = $SIMD$,
+ parameter ACTIVATION_WIDTH = $ACTIVATION_WIDTH$,
+ parameter WEIGHT_WIDTH = $WEIGHT_WIDTH$,
+ parameter ACCU_WIDTH = $ACCU_WIDTH$,
+ parameter SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$,
+ parameter SEGMENTLEN = $SEGMENTLEN$,
+ parameter FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$,
+
+ // Safely deducible parameters
+ parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, // weight stream width rounded up to whole bytes
+ parameter INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, // input stream width rounded up to whole bytes
+ parameter OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 // output stream width rounded up to whole bytes
+)(
+ // Global Control
+ (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *)
+ (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
+ input ap_clk,
+ // (* X_INTERFACE_PARAMETER = "ASSOCIATED_RESET ap_rst_n" *)
+ // (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk2x CLK" *)
+ // input ap_clk2x,
+ (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
+ input ap_rst_n,
+
+ // Weight Stream
+ input [WEIGHT_STREAM_WIDTH_BA-1:0] weights_V_TDATA,
+ input weights_V_TVALID,
+ output weights_V_TREADY,
+ // Input Stream
+ input [INPUT_STREAM_WIDTH_BA-1:0] in0_V_TDATA,
+ input in0_V_TVALID,
+ output in0_V_TREADY,
+ // Output Stream
+ output [OUTPUT_STREAM_WIDTH_BA-1:0] out_V_TDATA,
+ output out_V_TVALID,
+ input out_V_TREADY
+);
+
+mvu_vvu_axi #( // all wrapper parameters except the *_BA widths are passed through unchanged
+ .IS_MVU(IS_MVU), .COMPUTE_CORE(COMPUTE_CORE), .PUMPED_COMPUTE(PUMPED_COMPUTE), .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD),
+ .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH),
+ .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)
+ ) inst (
+ .ap_clk(ap_clk),
+ .ap_clk2x(1'b0), // wired to ground since double-pumped compute not enabled through FINN for now
+ .ap_rst_n(ap_rst_n),
+ .s_axis_weights_tdata(weights_V_TDATA), // weights_V -> s_axis_weights
+ .s_axis_weights_tvalid(weights_V_TVALID),
+ .s_axis_weights_tready(weights_V_TREADY),
+ .s_axis_input_tdata(in0_V_TDATA), // in0_V -> s_axis_input
+ .s_axis_input_tvalid(in0_V_TVALID),
+ .s_axis_input_tready(in0_V_TREADY),
+ .m_axis_output_tdata(out_V_TDATA), // m_axis_output -> out_V
+ .m_axis_output_tvalid(out_V_TVALID),
+ .m_axis_output_tready(out_V_TREADY)
+);
+
+endmodule // $MODULE_NAME_AXI_WRAPPER$
diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv
new file mode 100644
index 0000000000..3e2766f63d
--- /dev/null
+++ b/finn-rtllib/mvu/replay_buffer.sv
@@ -0,0 +1,181 @@
+/******************************************************************************
+ * Copyright (C) 2022-2023, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief Replay buffer for counted sequences on an AXI-Stream-style (valid/ready) stream.
+ * @author Thomas B. Preußer
+ *****************************************************************************/
+
+module replay_buffer #( // Emits every LEN-item input sequence REP times before releasing its storage
+ int unsigned LEN, // Sequence length
+ int unsigned REP, // Sequence replay count
+ int unsigned W // Data width
+)(
+ input logic clk,
+ input logic rst, // synchronous, active-high
+
+ input logic [W-1:0] idat,
+ input logic ivld,
+ output logic irdy,
+
+ output logic [W-1:0] odat,
+ output logic olast, // set with the last item of each emitted sequence
+ output logic ofin, // set with the last item of the final repetition
+ output logic ovld,
+ input logic ordy
+);
+
+ if(LEN == 0) initial begin
+ $error("%m: Illegal zero sequence LEN.");
+ $finish;
+ end
+ if(REP == 0) initial begin
+ $error("%m: Illegal zero REP count.");
+ $finish;
+ end
+
+ // Track position in Sequence
+ uwire last_item; // the item moved by the current shift is the last of its sequence
+ uwire shift; // one item advances this cycle; driven in the REP-dependent branches below
+ if(LEN == 1) assign last_item = 1;
+ else begin
+ typedef logic [$clog2(LEN)-1:0] count_t;
+ count_t Count = 0;
+ logic Last = 0;
+ always_ff @(posedge clk) begin
+ if(rst) begin
+ Count <= 0;
+ Last <= 0;
+ end
+ else if(shift) begin
+ Count <= Count + (Last? 2**$clog2(LEN)-LEN+1 : 1); // on the last item the increment wraps Count (modulo its width) back to 0
+ Last <= (((LEN-2) & ~Count) == 0) && ((LEN&1) || !Last); // Count covers all set bits of LEN-2; for even LEN, LEN-1 matches too, so !Last masks it
+ end
+ end
+ assign last_item = Last;
+ end
+
+ if(REP == 1) begin // no replay needed: combinational pass-through without any storage
+ assign shift = ivld && ordy;
+
+ assign irdy = ordy;
+ assign odat = idat;
+ assign olast = last_item;
+ assign ofin = last_item; // with a single repetition every sequence end is also final
+ assign ovld = ivld;
+ end
+ else begin
+
+ // Track Repetitions
+ uwire last_rep; // the sequence currently being read is in its final repetition
+ if(1) begin : blkRep
+ typedef logic [$clog2(REP)-1:0] rep_t;
+ rep_t RepCnt = 0;
+ logic RepLst = 0;
+ always_ff @(posedge clk) begin
+ if(rst) begin
+ RepCnt <= 0;
+ RepLst <= 0;
+ end
+ else if(last_item && shift) begin // advance once per completed pass over the sequence
+ RepCnt <= RepCnt + (RepLst? 2**$clog2(REP)-REP+1 : 1); // same wrap-to-zero scheme as Count above
+ RepLst <= (((REP-2) & ~RepCnt) == 0) && ((REP&1) || !RepLst); // same last-detection scheme as Last above
+ end
+ end
+ assign last_rep = RepLst;
+ end : blkRep
+
+ localparam int unsigned AWIDTH = LEN < 2? 1 : $clog2(LEN); // address width; Mem holds 2**AWIDTH entries
+ typedef logic [AWIDTH :0] ptr_t; // pointers with additional generational MSB
+ typedef logic [W -1:0] data_t;
+
+ // Output Registers
+ data_t ODat;
+ logic OVld = 0;
+ logic OLst = 'x;
+ logic OFin = 'x;
+ assign odat = ODat;
+ assign olast = OLst;
+ assign ofin = OFin;
+ assign ovld = OVld;
+
+ // Buffer Memory Management
+ data_t Mem[2**AWIDTH];
+ ptr_t WP = 0; // Write Pointer
+ ptr_t RP = 0; // Read Pointer
+ ptr_t FP = 0; // Free Pointer
+
+ // Operational Guards
+ // Occupancy: WP-FP
+ // WP-FP < 2**AWIDTH -> writing allowed
+ // - increments WP
+ // Availability: WP-RP
+ // WP-RP > 0 -> reading allowed
+ // - increments RP, last in sequence rewinds to FP for non-final repetition
+ // - increments FP in last repetition
+ assign irdy = !((WP-FP) >> AWIDTH); // not ready once the occupancy reaches 2**AWIDTH
+
+ uwire wr = irdy && ivld; // input handshake completes
+ uwire rd = !OVld || ordy; // output register is empty or being consumed this cycle
+ always_ff @(posedge clk) begin // memory access and data register; deliberately without reset
+ if(wr) Mem[WP[AWIDTH-1:0]] <= idat;
+ if(rd) ODat <= Mem[RP[AWIDTH-1:0]];
+ end
+
+ uwire vld = (RP != WP); // at least one written item has not been read yet
+ assign shift = rd && vld;
+ always_ff @(posedge clk) begin
+ if(rst) begin
+ WP <= 0;
+ RP <= 0;
+ FP <= 0;
+
+ OVld <= 0;
+ OLst <= 'x;
+ OFin <= 'x;
+ end
+ else begin
+ if(wr) WP <= WP + 1;
+ if(rd) begin
+ if(vld) begin
+ automatic logic rewind = last_item && !last_rep;
+ RP <= RP + (rewind? 2**(AWIDTH+1)-LEN+1 : 1); // rewind steps back LEN-1 entries modulo the pointer width
+ FP <= FP + last_rep; // entries are released one by one while the final repetition is read
+ end
+
+ OVld <= vld;
+ OLst <= last_item;
+ OFin <= last_rep && last_item;
+ end
+ end
+ end
+
+ end
+
+endmodule : replay_buffer
diff --git a/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv b/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv
new file mode 100644
index 0000000000..34b5d8eb53
--- /dev/null
+++ b/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv
@@ -0,0 +1,165 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief Testbench for MVU core compute kernel.
+ *****************************************************************************/
+
+module mvu_8sx9_tb(); // Self-checking testbench: compares DUT output p against GOLDEN_OUTPUT
+
+//-------------------- Simulation parameters --------------------\\
+ // Matrix & parallelism config
+ localparam int unsigned MH = 256;
+ localparam int unsigned PE = 16;
+ localparam int unsigned MW = 600;
+ localparam int unsigned SIMD = 60;
+ localparam int unsigned SEGMENTLEN = 4;
+ // Bit-width config
+ localparam int unsigned ACTIVATION_WIDTH = 8;
+ localparam int unsigned WEIGHT_WIDTH = 4;
+ localparam bit SIGNED_ACTIVATIONS = 1;
+ // Simulation constants
+ localparam int unsigned NF = MH/PE; // number of output folds
+ localparam int unsigned SF = MW/SIMD; // number of input folds
+ localparam int unsigned NUM_OF_DSP = SIMD/3; // not referenced in the visible part of this testbench
+
+ typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
+ typedef activation_t activation_vector_t[SF];
+
+ function activation_vector_t init_ACTIVATIONS; // returns a fully randomized activation vector
+ automatic activation_vector_t res;
+ std::randomize(res);
+ return res;
+ endfunction : init_ACTIVATIONS
+
+ typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
+ typedef weight_t weight_matrix_t[NF][SF];
+
+ function weight_matrix_t init_WEIGHTS; // returns a fully randomized weight matrix
+ automatic weight_matrix_t res;
+ std::randomize(res);
+ return res;
+ endfunction : init_WEIGHTS;
+
+ typedef logic signed [PE-1:0][57:0] output_t; // 58 bits per PE
+ typedef output_t output_vector_t [NF];
+
+ function output_vector_t check_output(activation_vector_t a, weight_matrix_t w);
+ automatic output_vector_t res = '{default: 0};
+ for (int j = 0; j 1) && !rst; // NOTE(review): this line looks truncated (text between 'j' and '1)' missing, presumably lost '<'...'>' content) - restore from the original file
+ end
+
+ // Compare computed output against golden output when vld flag is raised by DUT
+ always_ff @(posedge clk iff (vld && en)) begin
+ foreach(p[i]) begin
+ assert ($signed(p[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+ else begin
+ $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+ $stop; // halt the simulation on the first mismatch
+ end
+ end
+ NF_CNT += 1; // move on to the next golden output fold
+ end
+
+ // Instantiate DUT
+ mvu_8sx9 #( // NOTE(review): no module named mvu_8sx9 is visible nearby; the DSP58 core seen in this change is mvu_vvu_8sx9_dsp58 - confirm the DUT name
+ .PE(PE),
+ .SIMD(SIMD),
+ .WEIGHT_WIDTH(WEIGHT_WIDTH),
+ .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+ .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+ .SEGMENTLEN(SEGMENTLEN)
+ )
+ dut (
+ .clk, .rst, .en, .last, .zero, .a, .w, .vld, .p
+ );
+
+endmodule : mvu_8sx9_tb
diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv
new file mode 100644
index 0000000000..4ed7b4bf5f
--- /dev/null
+++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv
@@ -0,0 +1,229 @@
+/******************************************************************************
+ * Copyright (C) 2024, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief Testbench for MVU AXI wrapper module.
+ *****************************************************************************/
+
+module mvu_axi_tb();
+
+//-------------------- Simulation parameters --------------------\\
+ // Matrix & parallelism config
+ localparam bit IS_MVU = 1;
+ localparam string COMPUTE_CORE = "mvu_4sx4u";
+ localparam int unsigned MW = 120;
+ localparam int unsigned MH = 40;
+ localparam int unsigned SIMD = 20;
+ localparam int unsigned PE = 10;
+ localparam int unsigned SEGMENTLEN = 2.0;
+ localparam bit FORCE_BEHAVIORAL = 1;
+ localparam bit M_REG_LUT = 1;
+ // Bit-width config
+ localparam int unsigned ACTIVATION_WIDTH = 4;
+ localparam int unsigned WEIGHT_WIDTH = 4;
+ localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW);
+ localparam bit SIGNED_ACTIVATIONS = 0;
+ // Simulation constants
+ localparam int unsigned NF = MH/PE;
+ localparam int unsigned SF = MW/SIMD;
+ localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8;
+ localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8;
+ localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH;
+ localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH;
+ localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8;
+
+ // Generate clk and reset signal
+ logic clk = 0;
+ always #5ns clk = !clk;
+
+ logic ap_rst_n = 0;
+ initial begin
+ repeat(16) @(posedge clk);
+ ap_rst_n <= 1;
+ end
+
+ uwire ap_clk = clk;
+
+ // Generate activations
+ typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
+ typedef activation_t activation_vector_t[SF];
+
+ // Build the activation stimulus: every element of the SF-entry vector
+ // (each entry packing SIMD activations) is filled with random bits.
+ function activation_vector_t init_ACTIVATIONS();
+   automatic activation_vector_t rand_acts;
+   // The randomization status is deliberately discarded.
+   void'(std::randomize(rand_acts));
+   return rand_acts;
+ endfunction : init_ACTIVATIONS
+
+ activation_vector_t ACTIVATIONS = init_ACTIVATIONS();
+
+ struct {
+ activation_t dat;
+ logic vld;
+ logic rdy;
+ } activations;
+
+ initial begin
+ activations.vld = 0;
+ activations.dat = 'X;
+ @(posedge clk iff ap_rst_n);
+
+ for (int i=0; i= 0;
+ @(posedge clk);
+ end while (!(activations.vld === 1 && activations.rdy === 1));
+ end
+
+ activations.vld <= 0;
+ activations.dat <= 'x;
+ end
+
+ // Generate weights
+ typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
+ typedef weight_t weight_matrix_t[NF][SF];
+
+ // Build the weight stimulus: every element of the NF x SF matrix
+ // (each entry packing PE x SIMD weights) is filled with random bits.
+ function weight_matrix_t init_WEIGHTS();
+   automatic weight_matrix_t rand_weights;
+   // The randomization status is deliberately discarded.
+   void'(std::randomize(rand_weights));
+   return rand_weights;
+ endfunction : init_WEIGHTS
+
+ weight_matrix_t WEIGHTS = init_WEIGHTS();
+
+ struct {
+ weight_t dat;
+ logic vld;
+ logic rdy;
+ } weights;
+
+ initial begin
+ weights.vld = 0;
+ weights.dat = 'X;
+ @(posedge clk iff ap_rst_n);
+
+ weights.vld <= 1;
+ for (int i=0; i [..][PE][SIMD][..]
+ // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like:
+ // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to
+ // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i, P_1), ..., (S_i, P_i)
+ for (int i = 0; i < NF; i++) begin
+ for (int j = 0; j < SF; j++) begin
+ for (int k = 0; k < PE; k++) begin
+ for (int l = 0; l < SIMD; l++) begin
+ if (SIGNED_ACTIVATIONS)
+ res[i][k] = $signed(res[i][k]) + $signed(a[j][l]) * $signed(w[i][j][k][l]);
+ else
+ res[i][k] = $signed(res[i][k]) + $signed({1'b0, a[j][l]}) * $signed(w[i][j][k][l]);
+ end
+ end
+ end
+ end
+ return res;
+ endfunction : check_output;
+
+ output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
+
+ int unsigned NF_CNT = 0;
+ // Output checker: accepts NF output transfers from the DUT, compares each
+ // against the matching GOLDEN_OUTPUT entry and ends the simulation afterwards.
+ initial begin
+ outputs.rdy = 0;
+ while (NF_CNT < NF) begin
+ // Loop until both rdy & vld are asserted
+ do begin
+ // NOTE(review): $urandom()%7 is never negative, so this comparison is
+ // always true and rdy is driven to 1 on every iteration, i.e. no output
+ // backpressure is generated. Confirm whether '> 0' was intended.
+ outputs.rdy <= $urandom()%7 >= 0;
+ @(posedge clk iff ap_rst_n);
+ end while (!(outputs.rdy === 1 && outputs.vld === 1));
+
+ // Compare produced outputs against golden outputs
+ // (element-wise signed comparison; a mismatch raises $error and stops the run)
+ foreach(outputs.dat[i]) begin
+ assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+ else begin
+ $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+ $stop;
+ end
+ end
+
+ // Move on to the next golden output vector.
+ NF_CNT += 1;
+ end
+
+ // All NF output vectors were checked.
+ $finish;
+ end
+
+ // Device under test. The weight and input payloads are zero-extended by the
+ // *_BA_DELTA padding widths before being handed to the stream data ports.
+ mvu_vvu_axi #(
+   .IS_MVU            (IS_MVU),
+   .COMPUTE_CORE      (COMPUTE_CORE),
+   .MW                (MW),
+   .MH                (MH),
+   .PE                (PE),
+   .SIMD              (SIMD),
+   .ACTIVATION_WIDTH  (ACTIVATION_WIDTH),
+   .WEIGHT_WIDTH      (WEIGHT_WIDTH),
+   .ACCU_WIDTH        (ACCU_WIDTH),
+   .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+   .SEGMENTLEN        (SEGMENTLEN),
+   .FORCE_BEHAVIORAL  (FORCE_BEHAVIORAL),
+   .M_REG_LUT         (M_REG_LUT)
+ ) dut (
+   .ap_clk  (ap_clk),
+   .ap_rst_n(ap_rst_n),
+   // Weight stream
+   .s_axis_weights_tdata ({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }),
+   .s_axis_weights_tvalid(weights.vld),
+   .s_axis_weights_tready(weights.rdy),
+   // Activation stream
+   .s_axis_input_tdata ({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }),
+   .s_axis_input_tvalid(activations.vld),
+   .s_axis_input_tready(activations.rdy),
+   // Result stream
+   .m_axis_output_tdata (outputs.dat),
+   .m_axis_output_tvalid(outputs.vld),
+   .m_axis_output_tready(outputs.rdy)
+ );
+
+endmodule : mvu_axi_tb
diff --git a/finn-rtllib/mvu/tb/mvu_dsp58_tb.sv b/finn-rtllib/mvu/tb/mvu_dsp58_tb.sv
new file mode 100644
index 0000000000..108980c497
--- /dev/null
+++ b/finn-rtllib/mvu/tb/mvu_dsp58_tb.sv
@@ -0,0 +1,142 @@
+module mvu_dsp58_tb;
+
+ localparam int unsigned N = 1000;
+
+ localparam int unsigned MW = 12;
+ localparam int unsigned MH = 4;
+ localparam int unsigned PE = 2;
+ localparam int unsigned SIMD = 6;
+ localparam int unsigned ACTIVATION_WIDTH = 8;
+ localparam int unsigned WEIGHT_WIDTH = 8;
+ localparam int unsigned ACCU_WIDTH = 24;
+
+ //- Global Control ------------------
+ logic clk = 1;
+ logic clk2x = 1;
+ always #5ns clk = !clk;
+ always #2.5ns clk2x = !clk2x;
+
+ logic rst = 1;
+ initial begin
+ repeat(8) @(posedge clk);
+ rst <= 0;
+ end
+
+ //- DUTs ----------------------------
+
+ // Weight Stream
+ logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] s_axis_weights_tdata;
+ logic s_axis_weights_tvalid[2];
+ uwire s_axis_weights_tready[2];
+
+ // Input Stream
+ logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] s_axis_input_tdata;
+ logic s_axis_input_tvalid[2];
+ uwire s_axis_input_tready[2];
+
+ // Output Stream
+ uwire [PE-1:0][ACCU_WIDTH-1:0] m_axis_output_tdata[2];
+ uwire m_axis_output_tvalid[2];
+ logic m_axis_output_tready[2];
+
+ // Two instances of the same core that differ only in PUMPED_COMPUTE (0 and 1).
+ // Both receive the identical weight and input data buses but have their own
+ // valid/ready handshake signals and output bus, all selected by index i.
+ for(genvar i = 0; i < 2; i++) begin : genDUTs
+ mvu_vvu_axi #(
+ .IS_MVU(1),
+ .COMPUTE_CORE("mvu_vvu_8sx9_dsp58"),
+ .MW(MW), .MH(MH),
+ .PE(PE), .SIMD(SIMD),
+ .ACTIVATION_WIDTH(ACTIVATION_WIDTH),
+ .WEIGHT_WIDTH(WEIGHT_WIDTH),
+ .ACCU_WIDTH(ACCU_WIDTH),
+ .PUMPED_COMPUTE(i)
+ ) dut (
+ .ap_clk(clk), .ap_clk2x(clk2x), .ap_rst_n(!rst),
+ .s_axis_weights_tdata, .s_axis_weights_tvalid(s_axis_weights_tvalid[i]), .s_axis_weights_tready(s_axis_weights_tready[i]),
+ .s_axis_input_tdata, .s_axis_input_tvalid (s_axis_input_tvalid [i]), .s_axis_input_tready (s_axis_input_tready [i]),
+ .m_axis_output_tdata(m_axis_output_tdata[i]), .m_axis_output_tvalid (m_axis_output_tvalid [i]), .m_axis_output_tready (m_axis_output_tready [i])
+ );
+ end : genDUTs
+
+
+ //- Stimuli -------------------------
+
+ // Weight Feed
+ // Sends N * (MH/PE) * (MW/SIMD) random weight words. Each word is offered to
+ // both DUTs at once and is only replaced after both have accepted it.
+ initial begin
+ s_axis_weights_tvalid = '{ default: 0 };
+ s_axis_weights_tdata = 'x;
+ @(posedge clk iff !rst);
+
+ repeat(N * (MH/PE)*(MW/SIMD)) begin
+ automatic type(s_axis_weights_tdata) weights;
+ std::randomize(weights);
+ s_axis_weights_tdata <= weights;
+ s_axis_weights_tvalid <= '{ default: 1 };
+ // One branch per DUT: drop its valid once its ready was seen at a clock
+ // edge. The join waits for both branches to complete.
+ fork
+ begin
+ @(posedge clk iff s_axis_weights_tready[0]);
+ s_axis_weights_tvalid[0] <= 0;
+ end
+ begin
+ @(posedge clk iff s_axis_weights_tready[1]);
+ s_axis_weights_tvalid[1] <= 0;
+ end
+ join
+ end
+ end
+
+ // Input Feed
+ // Sends N * (MW/SIMD) random input words. Each word is offered to both DUTs
+ // at once and is only replaced after both have accepted it.
+ initial begin
+ s_axis_input_tvalid = '{ default: 0 };
+ s_axis_input_tdata = 'x;
+ @(posedge clk iff !rst);
+
+ repeat(N * (MW/SIMD)) begin
+ automatic type(s_axis_input_tdata) in;
+ std::randomize(in);
+ s_axis_input_tdata <= in;
+ s_axis_input_tvalid <= '{ default: 1 };
+ // One branch per DUT: drop its valid once its ready was seen at a clock
+ // edge. The join waits for both branches to complete.
+ fork
+ begin
+ @(posedge clk iff s_axis_input_tready[0]);
+ s_axis_input_tvalid[0] <= 0;
+ end
+ begin
+ @(posedge clk iff s_axis_input_tready[1]);
+ s_axis_input_tvalid[1] <= 0;
+ end
+ join
+ end
+ end
+
+ // Output Capture and Comparison
+ // Collects N * (MH/PE) output words from each DUT and checks that both DUTs
+ // produce identical results; the run ends with $finish after the last word.
+ initial begin
+ m_axis_output_tready = '{ default: 0 };
+ @(posedge clk iff !rst);
+
+ repeat(N * (MH/PE)) begin
+ automatic type(m_axis_output_tdata) res;
+ m_axis_output_tready <= '{ default: 1 };
+ // One branch per DUT: capture its output word at the clock edge where its
+ // valid is seen and withdraw ready. The join waits for both captures.
+ fork
+ begin
+ @(posedge clk iff m_axis_output_tvalid[0]);
+ m_axis_output_tready[0] <= 0;
+ res[0] = m_axis_output_tdata[0];
+ end
+ begin
+ @(posedge clk iff m_axis_output_tvalid[1]);
+ m_axis_output_tready[1] <= 0;
+ res[1] = m_axis_output_tdata[1];
+ end
+ join
+ // A difference between the two DUT results stops the simulation.
+ assert(res[0] == res[1]) else begin
+ $error("Output mismatch: %0x <=> %0x", res[0], res[1]);
+ $stop;
+ end
+ while($urandom()%7 < MW/SIMD) @(posedge clk); // Occasional backpressure
+ end
+
+ $display("Test completed.");
+ $finish;
+ end
+
+endmodule : mvu_dsp58_tb
diff --git a/finn-rtllib/mvu/tb/replay_buffer_tb.sv b/finn-rtllib/mvu/tb/replay_buffer_tb.sv
new file mode 100644
index 0000000000..5581354e0e
--- /dev/null
+++ b/finn-rtllib/mvu/tb/replay_buffer_tb.sv
@@ -0,0 +1,130 @@
+/******************************************************************************
+ * Copyright (C) 2023, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief Testbench for replay_buffer module.
+ * @author Thomas B. Preußer
+ *****************************************************************************/
+
+module replay_buffer_tb;
+
+ // Global Control: free-running clock with a 10ns period; reset is tied inactive.
+ logic clk = 0;
+ always #5ns clk = !clk;
+ uwire rst = 0;
+
+ // DUT Geometries: every (REP, LEN) combination drawn from DIMS gets its own DUT.
+ localparam int unsigned DIMS[3] = '{ 7, 8, 10 };
+ localparam int unsigned W = 8;
+ typedef logic [W-1:0] data_t;
+
+ // One completion flag per DUT instance. The instances form a
+ // $size(DIMS) x $size(DIMS) grid and report on bit $size(DIMS)*r + l, so the
+ // flag vector needs $size(DIMS)^2 bits. (It used to be sized 2**$size(DIMS),
+ // i.e. 8 bits for 9 instances, which left the last instance without a flag
+ // and let the test finish before that instance had been fully checked.)
+ localparam int unsigned NUM_DUTS = $size(DIMS) * $size(DIMS);
+ bit [NUM_DUTS-1:0] done = 0;
+
+ // End the simulation once every DUT instance has reported completion.
+ always_comb begin
+   if(&done) begin
+     $display("Test completed.");
+     $finish;
+   end
+ end
+
+ // Parallel DUT Instantiations
+ for(genvar r = 0; r < $size(DIMS); r++) begin
+   for(genvar l = 0; l < $size(DIMS); l++) begin
+     localparam int unsigned REP = DIMS[r]; // number of replays of each sequence
+     localparam int unsigned LEN = DIMS[l]; // length of each sequence
+
+     // Input stream into the DUT
+     data_t idat;
+     logic ivld;
+     uwire irdy;
+
+     // Output stream from the DUT
+     uwire data_t odat;
+     uwire olast;
+     uwire ofin;
+     uwire ovld;
+     logic ordy;
+
+     replay_buffer #(.LEN(LEN), .REP(REP), .W(W)) dut (
+       .clk, .rst,
+       .idat, .ivld, .irdy,
+       .odat, .olast, .ofin, .ovld, .ordy
+     );
+
+     // Input Feed: 0, 1, ..., 10*LEN-1
+     initial begin
+       idat = 'x;
+       ivld = 0;
+       @(posedge clk iff !rst);
+
+       for(int unsigned i = 0; i < 10*LEN; i++) begin
+         idat <= i;
+         ivld <= 1;
+         @(posedge clk iff irdy);
+         idat <= 'x;
+         ivld <= 0;
+         // Random idle gap between consecutive input words
+         while($urandom()%(REP-1) != 0) @(posedge clk);
+       end
+     end
+
+     // Output Check: each of the 10 sequences of LEN words must be seen REP
+     // times in a row, with olast on the final word of every pass and ofin on
+     // the final word of the last pass only.
+     initial begin
+       automatic int unsigned base = 0; // first expected value of the current sequence
+
+       ordy = 0;
+       @(posedge clk iff !rst);
+
+       for(int unsigned k = 0; k < 10; k++) begin
+         for(int unsigned j = 0; j < REP; j++) begin
+           for(int unsigned i = 0; i < LEN; i++) begin
+             ordy <= 1;
+             @(posedge clk iff ovld);
+             assert(odat == base+i) else begin
+               $error("#%0d.%0d: Data mismatch: %0d instead of %0d.", r, l, odat, base+i);
+               $stop;
+             end
+             assert(olast == (i == LEN-1)) else begin
+               $error("#%0d.%0d: Last mismatch.", r, l);
+               $stop;
+             end
+             assert(ofin == ((i == LEN-1) && (j == REP-1))) else begin
+               $error("#%0d.%0d: Fin mismatch.", r, l);
+               $stop;
+             end
+
+             // Occasionally hold ready low to apply backpressure
+             ordy <= 0;
+             while($urandom()%13 == 0) @(posedge clk);
+           end
+         end
+         base += LEN;
+       end
+
+       // Report completion of this DUT instance
+       done[$size(DIMS)*r + l] <= 1;
+     end
+   end
+ end
+
+endmodule : replay_buffer_tb
diff --git a/finn-rtllib/mvu/tb/vvu_axi_tb.sv b/finn-rtllib/mvu/tb/vvu_axi_tb.sv
new file mode 100644
index 0000000000..853dcc6e17
--- /dev/null
+++ b/finn-rtllib/mvu/tb/vvu_axi_tb.sv
@@ -0,0 +1,227 @@
+/******************************************************************************
+ * Copyright (C) 2024, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief Testbench for VVU AXI wrapper module.
+ *****************************************************************************/
+
+module vvu_axi_tb();
+
+//-------------------- Simulation parameters --------------------\\
+ // Matrix & parallelism config
+ localparam bit IS_MVU = 0;
+ localparam string COMPUTE_CORE = "mvu_vvu_8sx9_dsp58";
+ localparam int unsigned MW = 25; // Kernel*Kernel
+ localparam int unsigned MH = 4; // Channels
+ localparam int unsigned SIMD = 1; // MW%SIMD == 0
+ localparam int unsigned PE = 1; // MH%PE == 0
+ localparam int unsigned SEGMENTLEN = 1.0;
+ localparam bit FORCE_BEHAVIORAL = 1;
+ localparam bit M_REG_LUT = 1;
+ // Bit-width config
+ localparam int unsigned ACTIVATION_WIDTH = 4;
+ localparam int unsigned WEIGHT_WIDTH = 4;
+ localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW);
+ localparam bit SIGNED_ACTIVATIONS = 1;
+ // Simulation constants
+ localparam int unsigned NF = MH/PE;
+ localparam int unsigned SF = MW/SIMD;
+ localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8;
+ localparam int unsigned ACTIVATION_WIDTH_BA = (PE*SIMD*ACTIVATION_WIDTH+7)/8*8;
+ localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH;
+ localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - PE*SIMD*ACTIVATION_WIDTH;
+ localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8;
+
+ // Generate clk and reset signal
+ logic clk = 0;
+ always #5ns clk = !clk;
+
+ logic ap_rst_n = 0;
+ initial begin
+ repeat(16) @(posedge clk);
+ ap_rst_n <= 1;
+ end
+
+ uwire ap_clk = clk;
+
+ // Generate activations
+ typedef logic [PE*SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t;
+ typedef activation_t activation_vector_t[NF*SF];
+
+ // Build the activation stimulus: every element of the NF*SF-entry vector
+ // (each entry packing PE*SIMD activations) is filled with random bits.
+ function activation_vector_t init_ACTIVATIONS();
+   automatic activation_vector_t rand_acts;
+   // The randomization status is deliberately discarded.
+   void'(std::randomize(rand_acts));
+   return rand_acts;
+ endfunction : init_ACTIVATIONS
+
+ activation_vector_t ACTIVATIONS = init_ACTIVATIONS();
+
+ struct {
+ activation_t dat;
+ logic vld;
+ logic rdy;
+ } activations;
+
+ initial begin
+ activations.vld = 0;
+ activations.dat = 'X;
+ @(posedge clk iff ap_rst_n);
+
+ for (int i=0; i= 0;
+ @(posedge clk);
+ end while (!(activations.vld === 1 && activations.rdy === 1));
+ end
+
+ activations.vld <= 0;
+ activations.dat <= 'x;
+ end
+
+ // Generate weights
+ typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t;
+ typedef weight_t weight_matrix_t[NF][SF];
+
+ // Build the weight stimulus: every element of the NF x SF matrix
+ // (each entry packing PE x SIMD weights) is filled with random bits.
+ function weight_matrix_t init_WEIGHTS();
+   automatic weight_matrix_t rand_weights;
+   // The randomization status is deliberately discarded.
+   void'(std::randomize(rand_weights));
+   return rand_weights;
+ endfunction : init_WEIGHTS
+
+ weight_matrix_t WEIGHTS = init_WEIGHTS();
+
+ struct {
+ weight_t dat;
+ logic vld;
+ logic rdy;
+ } weights;
+
+ initial begin
+ weights.vld = 0;
+ weights.dat = 'X;
+ @(posedge clk iff ap_rst_n);
+
+ weights.vld <= 1;
+ for (int i=0; i [..][PE][SIMD][..]
+ // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like:
+ // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to
+ // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i, P_1), ..., (S_i, P_i)
+ for (int i = 0; i < NF; i++) begin
+ for (int j = 0; j < SF; j++) begin
+ for (int k = 0; k < PE; k++) begin
+ for (int l = 0; l < SIMD; l++) begin
+ if (SIGNED_ACTIVATIONS)
+ res[i][k] = $signed(res[i][k]) + $signed(a[i*SF+j][k + l*PE]) * $signed(w[i][j][k][l]);
+ else
+ res[i][k] = $signed(res[i][k]) + $signed({1'b0, a[i*SF+j][k + l*PE]}) * $signed(w[i][j][k][l]);
+ end
+ end
+ end
+ end
+ return res;
+ endfunction : check_output;
+
+ output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS);
+
+ int unsigned NF_CNT = 0;
+ // Output checker: accepts NF output transfers from the DUT, compares each
+ // against the matching GOLDEN_OUTPUT entry and ends the simulation afterwards.
+ initial begin
+ outputs.rdy = 0;
+ while (NF_CNT < NF) begin
+ // Loop until both rdy & vld are asserted
+ do begin
+ // NOTE(review): $urandom()%7 is never negative, so this comparison is
+ // always true and rdy is driven to 1 on every iteration, i.e. no output
+ // backpressure is generated. Confirm whether '> 0' was intended.
+ outputs.rdy <= $urandom()%7 >= 0;
+ @(posedge clk iff ap_rst_n);
+ end while (!(outputs.rdy === 1 && outputs.vld === 1));
+
+ // Compare produced outputs against golden outputs
+ // (element-wise signed comparison; a mismatch raises $error and stops the run)
+ foreach(outputs.dat[i]) begin
+ assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+ else begin
+ $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i]));
+ $stop;
+ end
+ end
+
+ // Move on to the next golden output vector.
+ NF_CNT += 1;
+ end
+
+ // All NF output vectors were checked.
+ $finish;
+ end
+
+ // Device under test. The weight and input payloads are zero-extended by the
+ // *_BA_DELTA padding widths before being handed to the stream data ports.
+ mvu_vvu_axi #(
+   .IS_MVU            (IS_MVU),
+   .COMPUTE_CORE      (COMPUTE_CORE),
+   .MW                (MW),
+   .MH                (MH),
+   .PE                (PE),
+   .SIMD              (SIMD),
+   .ACTIVATION_WIDTH  (ACTIVATION_WIDTH),
+   .WEIGHT_WIDTH      (WEIGHT_WIDTH),
+   .ACCU_WIDTH        (ACCU_WIDTH),
+   .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS),
+   .SEGMENTLEN        (SEGMENTLEN),
+   .FORCE_BEHAVIORAL  (FORCE_BEHAVIORAL),
+   .M_REG_LUT         (M_REG_LUT)
+ ) dut (
+   .ap_clk  (ap_clk),
+   .ap_rst_n(ap_rst_n),
+   // Weight stream
+   .s_axis_weights_tdata ({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }),
+   .s_axis_weights_tvalid(weights.vld),
+   .s_axis_weights_tready(weights.rdy),
+   // Activation stream
+   .s_axis_input_tdata ({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }),
+   .s_axis_input_tvalid(activations.vld),
+   .s_axis_input_tready(activations.rdy),
+   // Result stream
+   .m_axis_output_tdata (outputs.dat),
+   .m_axis_output_tvalid(outputs.vld),
+   .m_axis_output_tready(outputs.rdy)
+ );
+
+endmodule : vvu_axi_tb
diff --git a/finn-rtllib/swg/swg_common.sv b/finn-rtllib/swg/swg_common.sv
new file mode 100644
index 0000000000..c1d388550a
--- /dev/null
+++ b/finn-rtllib/swg/swg_common.sv
@@ -0,0 +1,248 @@
+/******************************************************************************
+ * Copyright (C) 2022-2023, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+
+// loop controller used for both, "default" and "parallel", implementation styles
+//
+// Steps through five nested loops (SIMD innermost, then KW, KH, W, H) using
+// one down-counter per loop. For the current step it outputs the address
+// increment selected by the state (addr_incr) and a tail increment selected by
+// the counter values (tail_incr). State and counters only change on clock
+// edges where 'advance' is set; 'rst_n' is a synchronous, active-low reset.
+module swg_controller
+import swg::*; #(
+ int unsigned LOOP_H_ITERATIONS,
+ int unsigned LOOP_W_ITERATIONS,
+ int unsigned LOOP_KH_ITERATIONS,
+ int unsigned LOOP_KW_ITERATIONS,
+ int unsigned LOOP_SIMD_ITERATIONS,
+
+ int unsigned INCR_BITWIDTH,
+
+ bit IS_DEPTHWISE,
+
+ int HEAD_INCR_SIMD,
+ int HEAD_INCR_KW,
+ int HEAD_INCR_KH,
+ int HEAD_INCR_W,
+ int HEAD_INCR_H,
+ int TAIL_INCR_W,
+ int TAIL_INCR_H,
+ int TAIL_INCR_LAST,
+
+ state_e INNERMOST_STATE
+)(
+ input logic clk,
+ input logic rst_n,
+
+ input logic advance,
+ output logic [INCR_BITWIDTH-1:0] addr_incr,
+ output logic [INCR_BITWIDTH-1:0] tail_incr
+);
+
+ // state and counters
+ // The state starts in (and is reset to) INNERMOST_STATE. Each counter is
+ // signed, starts at its LOOP_*_ITERATIONS parameter and counts down; a
+ // negative value marks the corresponding loop as exhausted.
+ state_e State = INNERMOST_STATE;
+ state_e state_next;
+
+ logic signed [$clog2(LOOP_H_ITERATIONS +2)+1-1:0] Counter_loop_h = LOOP_H_ITERATIONS;
+ logic signed [$clog2(LOOP_W_ITERATIONS +2)+1-1:0] Counter_loop_w = LOOP_W_ITERATIONS;
+ logic signed [$clog2(LOOP_KH_ITERATIONS +2)+1-1:0] Counter_loop_kh = LOOP_KH_ITERATIONS;
+ logic signed [$clog2(LOOP_KW_ITERATIONS +2)+1-1:0] Counter_loop_kw = LOOP_KW_ITERATIONS;
+ logic signed [$clog2(LOOP_SIMD_ITERATIONS+2)+1-1:0] Counter_loop_simd = LOOP_SIMD_ITERATIONS;
+
+ // combinational logic for addr_incr generation
+ // Pure function of the current state: zero in STATE_START, otherwise the
+ // HEAD_INCR_* parameter that belongs to the active loop level.
+ always_comb begin : blkHead
+ unique case (State)
+ STATE_START : addr_incr = 0;
+ STATE_LOOP_SIMD : addr_incr = HEAD_INCR_SIMD;
+ STATE_LOOP_KW : addr_incr = HEAD_INCR_KW;
+ STATE_LOOP_KH : addr_incr = HEAD_INCR_KH;
+ STATE_LOOP_W : addr_incr = HEAD_INCR_W;
+ STATE_LOOP_H : addr_incr = HEAD_INCR_H;
+ endcase
+ end
+
+ // combinational logic for tail_incr generation
+ // Priority selection: 1 while the KH loop is still running in depthwise
+ // mode, else TAIL_INCR_W while the W loop is running, else TAIL_INCR_H
+ // while the H loop is running, else TAIL_INCR_LAST.
+ uwire tail_incr_inner_condition = IS_DEPTHWISE? (Counter_loop_kh >= 0) : 0;
+ assign tail_incr =
+ tail_incr_inner_condition? 1 :
+ Counter_loop_w >= 0? TAIL_INCR_W :
+ Counter_loop_h >= 0? TAIL_INCR_H :
+ /* else */ TAIL_INCR_LAST;
+
+ // combinational next state logic
+ // Any state other than INNERMOST_STATE returns to INNERMOST_STATE. From
+ // INNERMOST_STATE the state is kept until the SIMD counter has gone
+ // negative; then the next state is the first loop level, checked in the
+ // order KW, KH, W, H, whose counter is still non-negative, or STATE_START
+ // if all of them are exhausted.
+ always_comb begin : blkState
+ state_next = State;
+ if(State != INNERMOST_STATE) state_next = INNERMOST_STATE;
+ else begin
+ if(Counter_loop_simd < 0) begin
+ state_next =
+ (Counter_loop_kw >= 0)? STATE_LOOP_KW :
+ (Counter_loop_kh >= 0)? STATE_LOOP_KH :
+ (Counter_loop_w >= 0)? STATE_LOOP_W :
+ (Counter_loop_h >= 0)? STATE_LOOP_H :
+ /* else */ STATE_START;
+ end
+ end
+ end : blkState
+
+ // sequential logic
+ // Reset restores the initial state and counter values. Otherwise, on
+ // 'advance', the state register takes state_next and, only while in
+ // INNERMOST_STATE, the counters are updated as a cascade: the innermost
+ // counter that is still non-negative is decremented, and every exhausted
+ // counter below it is reloaded with its iteration count.
+ always_ff @ (posedge clk) begin
+ if(!rst_n) begin
+ State <= INNERMOST_STATE;
+ Counter_loop_h <= LOOP_H_ITERATIONS;
+ Counter_loop_w <= LOOP_W_ITERATIONS;
+ Counter_loop_kh <= LOOP_KH_ITERATIONS;
+ Counter_loop_kw <= LOOP_KW_ITERATIONS;
+ Counter_loop_simd <= LOOP_SIMD_ITERATIONS;
+ end
+ else if(advance) begin
+ State <= state_next;
+ if (State == INNERMOST_STATE) begin
+ if(Counter_loop_simd >= 0) Counter_loop_simd <= Counter_loop_simd-1;
+ else begin
+ Counter_loop_simd <= LOOP_SIMD_ITERATIONS;
+ if(Counter_loop_kw >= 0) Counter_loop_kw <= Counter_loop_kw-1;
+ else begin
+ Counter_loop_kw <= LOOP_KW_ITERATIONS;
+ if(Counter_loop_kh >= 0) Counter_loop_kh <= Counter_loop_kh-1;
+ else begin
+ Counter_loop_kh <= LOOP_KH_ITERATIONS;
+ if(Counter_loop_w >= 0) Counter_loop_w <= Counter_loop_w-1;
+ else begin
+ Counter_loop_w <= LOOP_W_ITERATIONS;
+ if(Counter_loop_h >= 0) Counter_loop_h <= Counter_loop_h-1;
+ else Counter_loop_h <= LOOP_H_ITERATIONS;
+ end
+ end
+ end
+ end
+ end
+ end
+ end
+
+endmodule : swg_controller
+
+// buffer used in "default" implementation style
+//
+// Memory of DEPTH words, WIDTH bits each, with one write port and one
+// registered read port on the same clock. Both ports take absolute addresses
+// and have independent enables; the read register holds its value while
+// read_enable is low.
+module swg_cyclic_buffer_addressable #(
+ int unsigned WIDTH,
+ int unsigned DEPTH,
+ parameter RAM_STYLE = "auto"
+)(
+ input logic clk,
+
+ input logic write_enable,
+ input logic [$clog2(DEPTH)-1:0] write_addr,
+ input logic [WIDTH-1:0] data_in,
+
+ input logic read_enable,
+ input logic [$clog2(DEPTH)-1:0] read_addr, // absolute (!) read address of cyclic buffer
+ output logic [WIDTH-1:0] data_out
+);
+
+ typedef logic [WIDTH-1:0] word_t;
+
+ // Storage array; the implementation style is steered by RAM_STYLE.
+ (*ram_style=RAM_STYLE*) word_t Ram[DEPTH];
+
+ // Read register driving the output
+ word_t ReadReg = 'x;
+ assign data_out = ReadReg;
+
+ // Both accesses use non-blocking assignments, so a read of the address
+ // that is written in the same cycle still returns the previous contents.
+ always_ff @(posedge clk) begin
+   if (write_enable) begin
+     Ram[write_addr] <= data_in;
+   end
+   if (read_enable) begin
+     ReadReg <= Ram[read_addr];
+   end
+ end
+
+endmodule : swg_cyclic_buffer_addressable
+
+// buffer used in "parallel" implementation style
+module swg_reg_buffer
+#(
+ int unsigned WIDTH = 1,
+ int unsigned DEPTH = 1
+)
+(
+ input logic clk,
+ input logic shift_enable,
+ input logic [WIDTH-1:0] shift_in,
+ output logic [WIDTH-1:0] shift_out,
+ output logic [WIDTH*DEPTH-1:0] data_out
+);
+
+logic [WIDTH-1:0] Data [DEPTH-1:0];
+
+assign shift_out = Data[DEPTH-1];
+
+for (genvar e=0; e 1) Data[DEPTH-1:1] <= Data[DEPTH-2:0];
+ Data[0] <= shift_in;
+ end
+end
+endmodule : swg_reg_buffer
+
+// buffer used in "parallel" implementation style
+//
+// RAM-based shift buffer. On every clock edge with shift_enable set, shift_in
+// is stored at the write pointer, the word at the read pointer is loaded into
+// the output register, and both pointers advance, wrapping after DEPTH-1.
+// The read pointer runs one position ahead of the write pointer, so (for
+// DEPTH > 1) a word shows up on shift_out DEPTH enabled shifts after it was
+// written. rst_n is a synchronous, active-low reset of the two pointers.
+module swg_ram_buffer
+#(
+ int unsigned WIDTH,
+ int unsigned DEPTH,
+ parameter RAM_STYLE = "auto"
+)
+(
+ input logic clk,
+ input logic rst_n,
+ input logic shift_enable,
+ input logic [WIDTH-1:0] shift_in,
+ output logic [WIDTH-1:0] shift_out
+);
+
+ // Output register fed from the RAM read port
+ logic [WIDTH-1:0] Out_reg;
+ assign shift_out = Out_reg;
+
+ // Write and read pointers. Their initial values match the reset values
+ // below; Addr_r used to be initialized to 0, which made the buffer one
+ // shift longer when it was operated without a preceding reset.
+ logic [$clog2(DEPTH)-1:0] Addr_w = 0;
+ logic [$clog2(DEPTH)-1:0] Addr_r = 1;
+
+ (*ram_style=RAM_STYLE*) logic [WIDTH-1:0] Ram [DEPTH-1:0];
+
+ always_ff @(posedge clk) begin
+   if (rst_n == 1'b0) begin
+     Addr_w <= 0;
+     Addr_r <= 1;
+   end else begin
+     if (shift_enable) begin
+       Ram[Addr_w] <= shift_in;
+       Out_reg <= Ram[Addr_r];
+
+       // Advance the write pointer with wrap-around
+       if (Addr_w == DEPTH-1)
+         Addr_w <= 0;
+       else
+         Addr_w <= Addr_w + 1;
+
+       // Advance the read pointer with wrap-around
+       if (Addr_r == DEPTH-1)
+         Addr_r <= 0;
+       else
+         Addr_r <= Addr_r + 1;
+     end
+   end
+ end
+endmodule : swg_ram_buffer
diff --git a/finn-rtllib/swg/swg_pkg.sv b/finn-rtllib/swg/swg_pkg.sv
new file mode 100644
index 0000000000..1200310aca
--- /dev/null
+++ b/finn-rtllib/swg/swg_pkg.sv
@@ -0,0 +1,41 @@
+/******************************************************************************
+ * Copyright (C) 2023, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+package swg;
+ // States of the SWG loop controller. The encodings are spelled out
+ // explicitly and equal the values an unannotated enum would be assigned.
+ typedef enum logic [2:0] {
+   STATE_START     = 3'd0,
+   STATE_LOOP_SIMD = 3'd1,
+   STATE_LOOP_KW   = 3'd2,
+   STATE_LOOP_KH   = 3'd3,
+   STATE_LOOP_W    = 3'd4,
+   STATE_LOOP_H    = 3'd5
+ } state_e;
+endpackage : swg
diff --git a/finn-rtllib/swg/swg_template_axilite.v b/finn-rtllib/swg/swg_template_axilite.v
index 9479c7f80d..1f39e4440e 100644
--- a/finn-rtllib/swg/swg_template_axilite.v
+++ b/finn-rtllib/swg/swg_template_axilite.v
@@ -1,8 +1,35 @@
+/******************************************************************************
+ * Copyright (C) 2022-2023, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
-`timescale 1 ns / 1 ps
-
-module $TOP_MODULE_NAME$_axilite #
-(
+module $TOP_MODULE_NAME$_axilite #(
// Users to add parameters here
// User parameters ends
@@ -12,8 +39,7 @@ module $TOP_MODULE_NAME$_axilite #
parameter integer C_S_AXI_DATA_WIDTH = 32,
// Width of S_AXI address bus
parameter integer C_S_AXI_ADDR_WIDTH = 6
-)
-(
+)(
// Users to add ports here
output wire [C_S_AXI_DATA_WIDTH-1:0] cfg_reg0,
output wire [C_S_AXI_DATA_WIDTH-1:0] cfg_reg1,
diff --git a/finn-rtllib/swg/swg_template_default.sv b/finn-rtllib/swg/swg_template_default.sv
index 06e65e9111..78a8d0a3b9 100644
--- a/finn-rtllib/swg/swg_template_default.sv
+++ b/finn-rtllib/swg/swg_template_default.sv
@@ -28,141 +28,6 @@
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-module $TOP_MODULE_NAME$_controller #(
- int unsigned LOOP_H_ITERATIONS = $LOOP_H_ITERATIONS$,
- int unsigned LOOP_W_ITERATIONS = $LOOP_W_ITERATIONS$,
- int unsigned LOOP_KH_ITERATIONS = $LOOP_KH_ITERATIONS$,
- int unsigned LOOP_KW_ITERATIONS = $LOOP_KW_ITERATIONS$,
- int unsigned LOOP_SIMD_ITERATIONS = $LOOP_SIMD_ITERATIONS$,
-
- int unsigned INCR_BITWIDTH = $INCR_BITWIDTH$,
-
- bit IS_DEPTHWISE = $IS_DEPTHWISE$
-)(
- input logic clk,
- input logic rst_n,
-
- input logic advance,
- output logic [INCR_BITWIDTH-1:0] addr_incr,
- output logic [INCR_BITWIDTH-1:0] tail_incr
-);
-
- // state and counters
- typedef enum logic [2:0] {
- STATE_START,
- STATE_LOOP_SIMD,
- STATE_LOOP_KW,
- STATE_LOOP_KH,
- STATE_LOOP_W,
- STATE_LOOP_H
- } state_e;
- state_e State = $INNERMOST_STATE$;
- state_e state_next;
-
- logic signed [$clog2(LOOP_H_ITERATIONS +2)+1-1:0] Counter_loop_h = LOOP_H_ITERATIONS;
- logic signed [$clog2(LOOP_W_ITERATIONS +2)+1-1:0] Counter_loop_w = LOOP_W_ITERATIONS;
- logic signed [$clog2(LOOP_KH_ITERATIONS +2)+1-1:0] Counter_loop_kh = LOOP_KH_ITERATIONS;
- logic signed [$clog2(LOOP_KW_ITERATIONS +2)+1-1:0] Counter_loop_kw = LOOP_KW_ITERATIONS;
- logic signed [$clog2(LOOP_SIMD_ITERATIONS+2)+1-1:0] Counter_loop_simd = LOOP_SIMD_ITERATIONS;
-
- // combinational logic for addr_incr generation
- always_comb begin : blkHead
- unique case (State)
- 0 : addr_incr = 0;
- 1 : addr_incr = $HEAD_INCR_SIMD$;
- 2 : addr_incr = $HEAD_INCR_KW$;
- 3 : addr_incr = $HEAD_INCR_KH$;
- 4 : addr_incr = $HEAD_INCR_W$;
- 5 : addr_incr = $HEAD_INCR_H$;
- endcase
- end
-
- // combinational logic for tail_incr generation
- uwire tail_incr_inner_condition = IS_DEPTHWISE? (Counter_loop_kh >= 0) : 0;
- assign tail_incr =
- tail_incr_inner_condition? 1 :
- Counter_loop_w >= 0? $TAIL_INCR_W$ :
- Counter_loop_h >= 0? $TAIL_INCR_H$ :
- /* else */ $TAIL_INCR_LAST$;
-
- // combinational next state logic
- always_comb begin : blkState
- state_next = State;
- if(State != $INNERMOST_STATE$) state_next = $INNERMOST_STATE$;
- else begin
- if(Counter_loop_simd < 0) begin
- state_next =
- (Counter_loop_kw >= 0)? STATE_LOOP_KW :
- (Counter_loop_kh >= 0)? STATE_LOOP_KH :
- (Counter_loop_w >= 0)? STATE_LOOP_W :
- (Counter_loop_h >= 0)? STATE_LOOP_H :
- /* else */ STATE_START;
- end
- end
- end : blkState
-
- // sequential logic
- always_ff @ (posedge clk) begin
- if(!rst_n) begin
- State <= $INNERMOST_STATE$;
- Counter_loop_h <= LOOP_H_ITERATIONS;
- Counter_loop_w <= LOOP_W_ITERATIONS;
- Counter_loop_kh <= LOOP_KH_ITERATIONS;
- Counter_loop_kw <= LOOP_KW_ITERATIONS;
- Counter_loop_simd <= LOOP_SIMD_ITERATIONS;
- end
- else if(advance) begin
- State <= state_next;
- if (State == $INNERMOST_STATE$) begin
- if(Counter_loop_simd >= 0) Counter_loop_simd <= Counter_loop_simd-1;
- else begin
- Counter_loop_simd <= LOOP_SIMD_ITERATIONS;
- if(Counter_loop_kw >= 0) Counter_loop_kw <= Counter_loop_kw-1;
- else begin
- Counter_loop_kw <= LOOP_KW_ITERATIONS;
- if(Counter_loop_kh >= 0) Counter_loop_kh <= Counter_loop_kh-1;
- else begin
- Counter_loop_kh <= LOOP_KH_ITERATIONS;
- if(Counter_loop_w >= 0) Counter_loop_w <= Counter_loop_w-1;
- else begin
- Counter_loop_w <= LOOP_W_ITERATIONS;
- if(Counter_loop_h >= 0) Counter_loop_h <= Counter_loop_h-1;
- else Counter_loop_h <= LOOP_H_ITERATIONS;
- end
- end
- end
- end
- end
- end
- end
-
-endmodule : $TOP_MODULE_NAME$_controller
-
-module $TOP_MODULE_NAME$_cyclic_buffer_addressable #(
- int unsigned WIDTH,
- int unsigned DEPTH
-)(
- input logic clk,
-
- input logic write_enable,
- input logic [$clog2(DEPTH)-1:0] write_addr,
- input logic [WIDTH-1:0] data_in,
-
- input logic read_enable,
- input logic [$clog2(DEPTH)-1:0] read_addr, // absolute (!) read address of cyclic buffer
- output logic [WIDTH-1:0] data_out
-);
-
- $RAM_STYLE$ logic [WIDTH-1:0] Ram[DEPTH];
- logic [WIDTH-1:0] Out = 'x;
- always_ff @(posedge clk) begin
- if (read_enable) Out <= Ram[read_addr];
- if (write_enable) Ram[write_addr] <= data_in;
- end
- assign data_out = Out;
-
-endmodule : $TOP_MODULE_NAME$_cyclic_buffer_addressable
-
module $TOP_MODULE_NAME$_impl #(
int BIT_WIDTH,
int SIMD,
@@ -197,9 +62,10 @@ module $TOP_MODULE_NAME$_impl #(
uwire window_buffer_read_enable;
uwire [$clog2(BUF_ELEM_TOTAL)-1:0] window_buffer_write_addr;
uwire [$clog2(BUF_ELEM_TOTAL)-1:0] window_buffer_read_addr;
- $TOP_MODULE_NAME$_cyclic_buffer_addressable #(
+ swg_cyclic_buffer_addressable #(
.WIDTH(BUF_IN_WIDTH),
- .DEPTH(BUF_ELEM_TOTAL)
+ .DEPTH(BUF_ELEM_TOTAL),
+ .RAM_STYLE($RAM_STYLE$)
) window_buffer_inst (
.clk(ap_clk),
@@ -216,7 +82,25 @@ module $TOP_MODULE_NAME$_impl #(
uwire advance_controller;
uwire signed [INCR_BITWIDTH-1:0] addr_incr;
uwire [INCR_BITWIDTH-1:0] tail_incr;
- $TOP_MODULE_NAME$_controller controller_inst (
+ swg_controller #(
+ .LOOP_H_ITERATIONS($LOOP_H_ITERATIONS$),
+ .LOOP_W_ITERATIONS($LOOP_W_ITERATIONS$),
+ .LOOP_KH_ITERATIONS($LOOP_KH_ITERATIONS$),
+ .LOOP_KW_ITERATIONS($LOOP_KW_ITERATIONS$),
+ .LOOP_SIMD_ITERATIONS($LOOP_SIMD_ITERATIONS$),
+ .HEAD_INCR_SIMD($HEAD_INCR_SIMD$),
+ .HEAD_INCR_KW($HEAD_INCR_KW$),
+ .HEAD_INCR_KH($HEAD_INCR_KH$),
+ .HEAD_INCR_W($HEAD_INCR_W$),
+ .HEAD_INCR_H($HEAD_INCR_H$),
+ .TAIL_INCR_W($TAIL_INCR_W$),
+ .TAIL_INCR_H($TAIL_INCR_H$),
+ .TAIL_INCR_LAST($TAIL_INCR_LAST$),
+ .INCR_BITWIDTH($INCR_BITWIDTH$),
+ .IS_DEPTHWISE($IS_DEPTHWISE$),
+ .INNERMOST_STATE(swg::$INNERMOST_STATE$)
+ )
+ controller_inst (
.clk(ap_clk),
.rst_n(ap_rst_n),
.advance(advance_controller),
diff --git a/finn-rtllib/swg/swg_template_default_dynamic.sv b/finn-rtllib/swg/swg_template_default_dynamic.sv
index eb53978b58..5a6fdda170 100644
--- a/finn-rtllib/swg/swg_template_default_dynamic.sv
+++ b/finn-rtllib/swg/swg_template_default_dynamic.sv
@@ -1,3 +1,34 @@
+/******************************************************************************
+ * Copyright (C) 2022-2023, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
module $TOP_MODULE_NAME$_controller #(
int unsigned CNTR_BITWIDTH,
int unsigned INCR_BITWIDTH,
@@ -27,6 +58,8 @@ module $TOP_MODULE_NAME$_controller #(
input logic [INCR_BITWIDTH-1:0] cfg_incr_tail_last
);
+ import swg::*;
+
// (dynamic) configuration registers
logic [CNTR_BITWIDTH-1:0] Cfg_cntr_simd = $LOOP_SIMD_ITERATIONS$;
logic [CNTR_BITWIDTH-1:0] Cfg_cntr_kw = $LOOP_KW_ITERATIONS$;
@@ -62,14 +95,6 @@ module $TOP_MODULE_NAME$_controller #(
end
// state and counters
- typedef enum logic [2:0] {
- STATE_START,
- STATE_LOOP_SIMD,
- STATE_LOOP_KW,
- STATE_LOOP_KH,
- STATE_LOOP_W,
- STATE_LOOP_H
- } state_e;
state_e State = $INNERMOST_STATE$;
state_e state_next;
@@ -152,31 +177,6 @@ module $TOP_MODULE_NAME$_controller #(
endmodule : $TOP_MODULE_NAME$_controller
-module $TOP_MODULE_NAME$_cyclic_buffer_addressable #(
- int unsigned WIDTH,
- int unsigned DEPTH
-)(
- input logic clk,
-
- input logic write_enable,
- input logic [$clog2(DEPTH)-1:0] write_addr,
- input logic [WIDTH-1:0] data_in,
-
- input logic read_enable,
- input logic [$clog2(DEPTH)-1:0] read_addr, // absolute (!) read address of cyclic buffer
- output logic [WIDTH-1:0] data_out
-);
-
- $RAM_STYLE$ logic [WIDTH-1:0] Ram[DEPTH];
- logic [WIDTH-1:0] Out = 'x;
- always_ff @(posedge clk) begin
- if (read_enable) Out <= Ram[read_addr];
- if (write_enable) Ram[write_addr] <= data_in;
- end
- assign data_out = Out;
-
-endmodule : $TOP_MODULE_NAME$_cyclic_buffer_addressable
-
module $TOP_MODULE_NAME$_impl #(
int BIT_WIDTH,
int SIMD,
@@ -242,9 +242,10 @@ module $TOP_MODULE_NAME$_impl #(
uwire window_buffer_read_enable;
uwire [$clog2(BUF_ELEM_TOTAL)-1:0] window_buffer_write_addr;
uwire [$clog2(BUF_ELEM_TOTAL)-1:0] window_buffer_read_addr;
- $TOP_MODULE_NAME$_cyclic_buffer_addressable #(
+ swg_cyclic_buffer_addressable #(
.WIDTH(BUF_IN_WIDTH),
- .DEPTH(BUF_ELEM_TOTAL)
+ .DEPTH(BUF_ELEM_TOTAL),
+ .RAM_STYLE($RAM_STYLE$)
) window_buffer_inst (
.clk(ap_clk),
diff --git a/finn-rtllib/swg/swg_template_parallel.sv b/finn-rtllib/swg/swg_template_parallel.sv
new file mode 100644
index 0000000000..b92f27b2ca
--- /dev/null
+++ b/finn-rtllib/swg/swg_template_parallel.sv
@@ -0,0 +1,216 @@
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+module $TOP_MODULE_NAME$_wb	// Window buffer template: the body below consists of code-generation placeholders that are filled in per instance.
+#(	// all parameters default to 1; the instantiating _impl module overrides every one of them
+ int unsigned IN_WIDTH = 1, // bit-width*C*MMV_in
+ int unsigned OUT_ELEM_WIDTH = 1, // bit-width*C
+ int unsigned OUT_WIDTH = 1, // bit-width*C*MMV_out
+ int unsigned BUFFER_ELEM_TOTAL = 1	// set from BUF_ELEM_TOTAL by _impl; presumably the total element capacity - TODO confirm against the generator
+)
+(
+ input logic clk,
+ input logic rst_n,	// driven from ap_rst_n by _impl; use inside the generated body is not visible here
+ input logic shift_enable,	// driven by the "advance" signal of _impl
+ input logic [IN_WIDTH-1:0] data_in,	// IN_WIDTH-bit input word (the input stream data in _impl)
+ output logic [OUT_WIDTH-1:0] data_out	// OUT_WIDTH-bit output word (the output stream data in _impl)
+);
+
+$GENERATE_REG_FIFOS$
+
+$GENERATE_BRAM_FIFOS$
+
+// fixed interconnect between linear buffers
+$GENERATE_BUFFER_CONNECTION$
+
+// fixed REG FIFO -> output mapping
+$GENERATE_OUTPUT_MAPPING$
+
+endmodule : $TOP_MODULE_NAME$_wb
+
+module $TOP_MODULE_NAME$_impl #(  // Parallel SWG core: shifts the input stream through the window buffer and presents BIT_WIDTH*SIMD*MMV_OUT bits per output transfer.
+    int unsigned BIT_WIDTH,  // together with SIMD and MMV_IN/MMV_OUT determines the stream widths (see port declarations)
+    int unsigned SIMD,
+    int unsigned MMV_IN,
+    int unsigned MMV_OUT,
+    int unsigned LAST_READ_ELEM = $LAST_READ_ELEM$,      // value of Newest_buffered_elem at which reading is complete
+    int unsigned FIRST_WRITE_ELEM = $FIRST_WRITE_ELEM$,  // reset/restart value of Current_elem
+    int unsigned LAST_WRITE_ELEM = $LAST_WRITE_ELEM$,    // value of Current_elem at which the last write happens
+    int unsigned BUF_ELEM_TOTAL = $BUF_ELEM_TOTAL$,      // forwarded to the window buffer as BUFFER_ELEM_TOTAL
+    int unsigned INCR_BITWIDTH = $INCR_BITWIDTH$         // width of the controller increment outputs
+)(
+    input   logic  ap_clk,
+    input   logic  ap_rst_n,  // active-low synchronous reset
+
+    input   logic  in0_V_V_TVALID,
+    output  logic  in0_V_V_TREADY,
+    input   logic [BIT_WIDTH * SIMD * MMV_IN-1:0]  in0_V_V_TDATA,
+
+    output  logic  out_V_V_TVALID,
+    input   logic  out_V_V_TREADY,
+    output  logic [BIT_WIDTH * SIMD * MMV_OUT-1:0]  out_V_V_TDATA
+);
+    // derived constants: widths of the buffer input, of a single output element and of the full output word
+    localparam int unsigned  BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN;
+    localparam int unsigned  BUF_OUT_ELEM_WIDTH = BIT_WIDTH * SIMD;
+    localparam int unsigned  BUF_OUT_WIDTH = BIT_WIDTH * SIMD * MMV_OUT;
+
+    // main buffer instantiation (generated per instance from the _wb template)
+    uwire [BUF_IN_WIDTH -1:0]  window_buffer_in;
+    uwire [BUF_OUT_WIDTH-1:0]  window_buffer_out;
+    uwire  window_buffer_shift_enable;
+    $TOP_MODULE_NAME$_wb
+    #(
+        .IN_WIDTH(BUF_IN_WIDTH),
+        .OUT_ELEM_WIDTH(BUF_OUT_ELEM_WIDTH),
+        .OUT_WIDTH(BUF_OUT_WIDTH),
+        .BUFFER_ELEM_TOTAL(BUF_ELEM_TOTAL)
+    )
+    window_buffer_inst
+    (
+        .clk(ap_clk),
+        .rst_n(ap_rst_n),
+        .data_in(window_buffer_in),
+        .shift_enable(window_buffer_shift_enable),
+        .data_out(window_buffer_out)
+    );
+
+    // controller instantiation: supplies the increment applied to Current_elem after each accepted write
+    uwire  advance_controller;
+    uwire signed [INCR_BITWIDTH-1:0]  addr_incr;
+    uwire        [INCR_BITWIDTH-1:0]  tail_incr;  // connected for interface completeness; not consumed in this module
+    swg_controller #(
+        .LOOP_H_ITERATIONS($LOOP_H_ITERATIONS$),
+        .LOOP_W_ITERATIONS($LOOP_W_ITERATIONS$),
+        .LOOP_KH_ITERATIONS($LOOP_KH_ITERATIONS$),
+        .LOOP_KW_ITERATIONS($LOOP_KW_ITERATIONS$),
+        .LOOP_SIMD_ITERATIONS($LOOP_SIMD_ITERATIONS$),
+        .HEAD_INCR_SIMD($HEAD_INCR_SIMD$),
+        .HEAD_INCR_KW($HEAD_INCR_KW$),
+        .HEAD_INCR_KH($HEAD_INCR_KH$),
+        .HEAD_INCR_W($HEAD_INCR_W$),
+        .HEAD_INCR_H($HEAD_INCR_H$),
+        .TAIL_INCR_W($TAIL_INCR_W$),
+        .TAIL_INCR_H($TAIL_INCR_H$),
+        .TAIL_INCR_LAST($TAIL_INCR_LAST$),
+        .INCR_BITWIDTH(INCR_BITWIDTH),  // use the module parameter so the controller ports always match the addr_incr/tail_incr wires
+        .IS_DEPTHWISE($IS_DEPTHWISE$),
+        .INNERMOST_STATE(swg::$INNERMOST_STATE$)
+    )
+    controller_inst (
+        .clk(ap_clk),
+        .rst_n(ap_rst_n),
+        .advance(advance_controller),
+        .addr_incr(addr_incr),
+        .tail_incr(tail_incr)
+    );
+
+    // counters/address registers: Newest_buffered_elem starts at -1 (nothing buffered), Current_elem at FIRST_WRITE_ELEM
+    logic signed [$clog2(LAST_READ_ELEM+1)+1-1:0]  Newest_buffered_elem = -1;
+    logic        [$clog2(LAST_READ_ELEM+1)+1-1:0]  Current_elem = FIRST_WRITE_ELEM;
+
+    // control registers/signals (W = output write, R = input read)
+    logic  Writing_done = 0;  // set after the write at LAST_WRITE_ELEM until the next feature map starts
+    logic  Write_done   = 0;  // output already handed over in a cycle in which the buffer could not advance
+    uwire  write_cmd     = !($signed(Current_elem) > Newest_buffered_elem) && !Writing_done;  // current element is already buffered
+    uwire  write_ok      = write_cmd && (out_V_V_TREADY || Write_done);
+    uwire  write_blocked = write_cmd && !out_V_V_TREADY && !Write_done;
+
+    uwire  reading_done = Newest_buffered_elem == LAST_READ_ELEM;
+    uwire  read_cmd     = !reading_done && (Writing_done || Newest_buffered_elem <= $signed(Current_elem));
+    uwire  read_ok      = read_cmd && in0_V_V_TVALID && !write_blocked;
+
+    // advance on a successful read, on a write-only cycle once the write succeeded, or when neither a read nor a write is pending
+    uwire  advance = read_ok || (!read_cmd && write_ok) || (!read_cmd && !write_cmd);
+
+    // assign buffer control: the buffer shifts on every advance, the controller steps on every accepted write
+    assign  window_buffer_shift_enable = advance;
+    assign  advance_controller         = write_ok;
+
+    // assign I/O ports (both handshake outputs are forced low while reset is asserted)
+    assign  window_buffer_in = in0_V_V_TDATA;
+    assign  out_V_V_TDATA    = window_buffer_out;
+    assign  in0_V_V_TREADY   = ap_rst_n && read_ok; //only asserted if data is available and we can store it (allowed)
+    assign  out_V_V_TVALID   = ap_rst_n && write_cmd && !Write_done; //only asserted if we have data available and it has not been read yet (don't wait for READY from sink)
+
+    // write done logic: remembers an accepted write while the buffer is stalled so the same word is not offered twice
+    always_ff @(posedge ap_clk) begin
+        if(!ap_rst_n) begin
+            Write_done <= 1'b0;
+        end
+        else begin
+            if (advance) begin
+                Write_done <= 1'b0; //reset flag
+            end else if (write_ok) //successful W in this cycle, but R still outstanding
+                Write_done <= 1'b1; //write can happen even if read is blocked, but only for the current cycle!
+        end
+    end
+
+    // main process for advancing counters; later non-blocking assignments in this block override earlier ones, so statement order matters
+    always_ff @(posedge ap_clk) begin
+        if(!ap_rst_n) begin
+            Newest_buffered_elem <= -1;
+            Current_elem <= FIRST_WRITE_ELEM;
+            Writing_done <= 0;
+        end
+        else begin
+            if (read_ok) begin
+                Newest_buffered_elem <= Newest_buffered_elem+1;
+
+                // check if this is the last read cycle (reading_done will be true afterwards)
+                if ((Newest_buffered_elem == LAST_READ_ELEM-1) && Writing_done) begin
+                    // start processing of next FM if writing is done already (possible due to unused input elements at the tail end)
+                    // todo: allow for read overlapping between feature maps (i.e., reading first elements from next FM while still writing last window of current FM)
+                    Newest_buffered_elem <= -1;
+                    Current_elem <= FIRST_WRITE_ELEM;
+                    Writing_done <= 0;
+                end
+            end
+
+            if (write_ok) begin
+                // check if this is the last write cycle (Writing_done will be true afterwards)
+                if (Current_elem == LAST_WRITE_ELEM) begin
+                    Writing_done <= 1;
+
+                    if (reading_done || (read_ok && (Newest_buffered_elem == LAST_READ_ELEM - 1))) begin
+                        // start processing of next FM if reading is done already, or completes in the same cycle
+                        Newest_buffered_elem <= -1;
+                        Current_elem <= FIRST_WRITE_ELEM;
+                        Writing_done <= 0;
+                    end
+                end
+                else
+                    Current_elem <= $signed(Current_elem) + addr_incr;
+            end
+        end
+    end
+
+endmodule : $TOP_MODULE_NAME$_impl
diff --git a/finn-rtllib/swg/swg_template_wrapper.v b/finn-rtllib/swg/swg_template_wrapper.v
index 0cc3579a25..22dc6bd8cd 100644
--- a/finn-rtllib/swg/swg_template_wrapper.v
+++ b/finn-rtllib/swg/swg_template_wrapper.v
@@ -28,19 +28,19 @@
* OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-`timescale 1 ns / 1 ps
module $TOP_MODULE_NAME$ (
-(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *)
-input ap_clk,
-(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *)
-input ap_rst_n,
-input [BUF_IN_WIDTH-1:0] in0_V_TDATA,
-input in0_V_TVALID,
-output in0_V_TREADY,
-output [BUF_OUT_WIDTH-1:0] out_V_TDATA,
-output out_V_TVALID,
-input out_V_TREADY
+ (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *)
+ (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
+ input ap_clk,
+ (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
+ input ap_rst_n,
+ input [IN_WIDTH_PADDED-1:0] in0_V_TDATA,
+ input in0_V_TVALID,
+ output in0_V_TREADY,
+ output [OUT_WIDTH_PADDED-1:0] out_V_TDATA,
+ output out_V_TVALID,
+ input out_V_TREADY
);
// top-level parameters (set via code-generation)
@@ -48,28 +48,27 @@ parameter BIT_WIDTH = $BIT_WIDTH$;
parameter SIMD = $SIMD$;
parameter MMV_IN = $MMV_IN$;
parameter MMV_OUT = $MMV_OUT$;
+parameter IN_WIDTH_PADDED = $IN_WIDTH_PADDED$;
+parameter OUT_WIDTH_PADDED = $OUT_WIDTH_PADDED$;
// derived constants
parameter BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN;
parameter BUF_OUT_WIDTH = BIT_WIDTH * SIMD * MMV_OUT;
-$TOP_MODULE_NAME$_impl
-#(
- .BIT_WIDTH(BIT_WIDTH),
- .SIMD(SIMD),
- .MMV_IN(MMV_IN),
- .MMV_OUT(MMV_OUT)
-)
-impl
-(
- .ap_clk(ap_clk),
- .ap_rst_n(ap_rst_n),
- .in0_V_V_TDATA(in0_V_TDATA),
- .in0_V_V_TVALID(in0_V_TVALID),
- .in0_V_V_TREADY(in0_V_TREADY),
- .out_V_V_TDATA(out_V_TDATA),
- .out_V_V_TVALID(out_V_TVALID),
- .out_V_V_TREADY(out_V_TREADY)
+$TOP_MODULE_NAME$_impl #(
+ .BIT_WIDTH(BIT_WIDTH),
+ .SIMD(SIMD),
+ .MMV_IN(MMV_IN),
+ .MMV_OUT(MMV_OUT)
+) impl (
+ .ap_clk(ap_clk),
+ .ap_rst_n(ap_rst_n),
+ .in0_V_V_TDATA(in0_V_TDATA[BUF_IN_WIDTH-1:0]),
+ .in0_V_V_TVALID(in0_V_TVALID),
+ .in0_V_V_TREADY(in0_V_TREADY),
+ .out_V_V_TDATA(out_V_TDATA[BUF_OUT_WIDTH-1:0]),
+ .out_V_V_TVALID(out_V_TVALID),
+ .out_V_V_TREADY(out_V_TREADY)
);
-endmodule //TOP_MODULE_NAME
+endmodule : $TOP_MODULE_NAME$
diff --git a/finn-rtllib/swg/swg_template_wrapper_dynamic.v b/finn-rtllib/swg/swg_template_wrapper_dynamic.v
index ca870ace11..158f3132e3 100644
--- a/finn-rtllib/swg/swg_template_wrapper_dynamic.v
+++ b/finn-rtllib/swg/swg_template_wrapper_dynamic.v
@@ -1,4 +1,33 @@
-`timescale 1 ns / 1 ps
+/******************************************************************************
+ * Copyright (C) 2022, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
module $TOP_MODULE_NAME$ #(
// top-level parameters (set via code-generation)
@@ -6,6 +35,8 @@ module $TOP_MODULE_NAME$ #(
parameter SIMD = $SIMD$,
parameter MMV_IN = $MMV_IN$,
parameter MMV_OUT = $MMV_OUT$,
+ parameter IN_WIDTH_PADDED = $IN_WIDTH_PADDED$,
+ parameter OUT_WIDTH_PADDED = $OUT_WIDTH_PADDED$,
parameter CNTR_BITWIDTH = $CNTR_BITWIDTH$,
parameter INCR_BITWIDTH = $INCR_BITWIDTH$,
@@ -18,14 +49,15 @@ module $TOP_MODULE_NAME$ #(
parameter integer C_s_axilite_ADDR_WIDTH = 6
)
(
- (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite" *)
+ (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite, ASSOCIATED_RESET ap_rst_n" *)
+ (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
input ap_clk,
- (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite" *)
+ (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
input ap_rst_n,
- input [BUF_IN_WIDTH-1:0] in0_V_TDATA,
+ input [IN_WIDTH_PADDED-1:0] in0_V_TDATA,
input in0_V_TVALID,
output in0_V_TREADY,
- output [BUF_OUT_WIDTH-1:0] out_V_TDATA,
+ output [OUT_WIDTH_PADDED-1:0] out_V_TDATA,
output out_V_TVALID,
input out_V_TREADY,
@@ -113,23 +145,20 @@ $TOP_MODULE_NAME$_axilite # (
.cfg_reg15(cfg_last_write)
);
-$TOP_MODULE_NAME$_impl
-#(
+$TOP_MODULE_NAME$_impl #(
.BIT_WIDTH(BIT_WIDTH),
.SIMD(SIMD),
.MMV_IN(MMV_IN),
.MMV_OUT(MMV_OUT),
.CNTR_BITWIDTH(CNTR_BITWIDTH),
.INCR_BITWIDTH(INCR_BITWIDTH)
-)
-impl
-(
+) impl (
.ap_clk(ap_clk),
.ap_rst_n(ap_rst_n),
- .in0_V_V_TDATA(in0_V_TDATA),
+ .in0_V_V_TDATA(in0_V_TDATA[BUF_IN_WIDTH-1:0]),
.in0_V_V_TVALID(in0_V_TVALID),
.in0_V_V_TREADY(in0_V_TREADY),
- .out_V_V_TDATA(out_V_TDATA),
+ .out_V_V_TDATA(out_V_TDATA[BUF_OUT_WIDTH-1:0]),
.out_V_V_TVALID(out_V_TVALID),
.out_V_V_TREADY(out_V_TREADY),
@@ -151,4 +180,4 @@ impl
.cfg_last_write(cfg_last_write)
);
-endmodule //TOP_MODULE_NAME
+endmodule : $TOP_MODULE_NAME$
diff --git a/finn-rtllib/thresholding/hdl/axilite_if.v b/finn-rtllib/thresholding/hdl/axilite_if.v
new file mode 100644
index 0000000000..2aeff770d2
--- /dev/null
+++ b/finn-rtllib/thresholding/hdl/axilite_if.v
@@ -0,0 +1,211 @@
+/******************************************************************************
+ * Copyright (C) 2024, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+
+module axi4lite_if
+#(
+ parameter ADDR_WIDTH = 32,
+ parameter DATA_WIDTH = 32,//AXI4 spec requires this to be strictly 32 or 64
+ parameter IP_DATA_WIDTH = 64//can be any power-of-2 multiple of DATA_WIDTH
+)
+(
+//system signals
+input aclk,
+input aresetn,//active low, asynchronous assertion and synchronous deassertion
+
+//Write channels
+//write address
+output reg awready,
+input awvalid,
+input [ADDR_WIDTH-1:0] awaddr,
+input [2:0] awprot,
+//write data
+output reg wready,
+input wvalid,
+input [DATA_WIDTH-1:0] wdata,
+input [(DATA_WIDTH/8)-1:0] wstrb,
+//burst response
+input bready,
+output reg bvalid,
+output reg [1:0] bresp,//NOTE: 00 = OKAY, 10 = SLVERR (write error)
+
+//Read channels
+//read address
+output reg arready,
+input arvalid,
+input [ADDR_WIDTH-1:0] araddr,
+input [2:0] arprot,
+//read data
+input rready,
+output reg rvalid,
+output reg [1:0] rresp,//NOTE: 00 = OKAY, 10 = SLVERR (read error)
+output reg [DATA_WIDTH-1:0] rdata,
+
+//IP-side interface
+output reg ip_en,
+output reg ip_wen,
+output reg [ADDR_WIDTH-1:0] ip_addr,
+output [IP_DATA_WIDTH-1:0] ip_wdata,
+input ip_rack,
+input [IP_DATA_WIDTH-1:0] ip_rdata
+);
+
+localparam RESP_OKAY = 2'b00;
+localparam RESP_SLVERR = 2'b10;
+//get ceil(log2(ceil(IP_DATA_WIDTH/DATA_WIDTH)))
+localparam NFOLDS_LOG = $clog2((IP_DATA_WIDTH + DATA_WIDTH - 1) / DATA_WIDTH);
+
+reg internal_ren;
+reg internal_wen;
+reg internal_wack;
+reg [ADDR_WIDTH-1:0] internal_raddr;
+reg [ADDR_WIDTH-1:0] internal_waddr;
+reg [DATA_WIDTH-1:0] internal_wdata;
+wire [DATA_WIDTH-1:0] internal_rdata;
+reg internal_error = 0;
+
+//start-of-simulation sanity check: abort unless DATA_WIDTH is 32 or 64
+initial begin
+ if((DATA_WIDTH != 32) && (DATA_WIDTH != 64)) begin //logical AND of the two comparisons
+ $display("AXI4Lite DATA_WIDTH must be 32 or 64");
+ $finish;
+ end
+end
+
+//transaction state machine
+localparam STATE_IDLE = 0,
+ STATE_READ = 1,
+ STATE_WRITE = 2;
+
+reg [1:0] state;
+
+always @(posedge aclk or negedge aresetn) //reset is asynchronous (negedge aresetn is in the sensitivity list)
+ if(~aresetn)
+ state <= STATE_IDLE;
+ else case(state)
+ STATE_IDLE:
+ if(awvalid & wvalid) //a write starts only when address and data are valid together; checked before reads
+ state <= STATE_WRITE;
+ else if(arvalid) //a read starts only if no complete write request is present
+ state <= STATE_READ;
+ STATE_READ:
+ if(rvalid & rready) //read response handshake completes the transaction
+ state <= STATE_IDLE;
+ STATE_WRITE:
+ if(bvalid & bready) //write response handshake completes the transaction
+ state <= STATE_IDLE;
+ default: state <= STATE_IDLE; //state is 2 bits wide; the unused encoding 3 falls back to idle
+ endcase
+
+//write-related internal signals
+always @(*) begin
+ internal_waddr = awaddr >> $clog2(DATA_WIDTH/8);
+ internal_wdata = wdata;
+ internal_wen = (state == STATE_IDLE) & awvalid & wvalid;
+end
+
+always @(posedge aclk) begin
+ awready <= internal_wen;
+ wready <= internal_wen;
+end
+
+//read-related internal signals
+always @(*) begin
+ internal_raddr = araddr >> $clog2(DATA_WIDTH/8);
+ internal_ren = (state == STATE_IDLE) & ~internal_wen & arvalid;
+end
+
+always @(posedge aclk)
+ arready <= internal_ren;
+
+wire write_to_last_fold;
+
+always @(posedge aclk) begin
+ ip_wen <= write_to_last_fold;
+ ip_en <= internal_ren | write_to_last_fold;
+ if(internal_ren | write_to_last_fold)
+ ip_addr <= internal_ren ? (internal_raddr >> NFOLDS_LOG) : (internal_waddr >> NFOLDS_LOG);
+ internal_wack <= internal_wen;
+end
+
+genvar i;
+reg [(1<> (internal_rfold*DATA_WIDTH);
+ always @(posedge aclk)
+ if(internal_ren)
+ internal_rfold <= internal_raddr[NFOLDS_LOG-1:0];
+ for(i=0; i<(1<
+ *
+ * @description
+ * Produces the N-bit count of those among 2^N-1 thresholds that are not
+ * larger than the corresponding input:
+ * y = Σ(T_i <= x)
+ * The result is computed by binary search. The runtime-configurable
+ * thresholds must be written in ascending order:
+ * i < j => T_i < T_j
+ * The design supports channel folding allowing each input to be processed
+ * with respect to a selectable set of thresholds. The corresponding
+ * threshold configuration relies on a channel address prefix. Inputs are
+ * accompanied by a channel selector.
+ *
+ * Parameter Layout as seen on AXI-Lite (row by row):
+ * | Base \ Offs | 0 1 2 ... 2^N-2 2^N-1
+ * ---------+--------------------------------+------------------------------------
+ * Chnl #0 | 0 | T_0 T_1 T_2 ... T_{2^N-2} 'x
+ * Chnl #1 | 2^N | T_0 T_1 T_2 ... T_{2^N-2} 'x
+ * Chnl #c | ((c/PE)*2^$clog2(PE) + c%PE)*2^N | T_0 T_1 T_2 ... T_{2^N-2} 'x
+ *
+ *****************************************************************************/
+module thresholding #(
+ int unsigned N, // output precision
+ int unsigned K, // input/threshold precision
+ int unsigned C, // number of channels
+ int unsigned PE, // parallel processing elements
+
+ bit SIGNED = 1, // signed inputs
+ bit FPARG = 0, // floating-point inputs: [sign] | exponent | mantissa
+ int BIAS = 0, // offsetting the output [0, 2^N-1] -> [BIAS, 2^N-1 + BIAS]
+
+ // Initial Thresholds
+ parameter THRESHOLDS_PATH = "",
+ bit USE_CONFIG = 1,
+
+ // Force Use of On-Chip Memory Blocks
+ int unsigned DEPTH_TRIGGER_URAM = 0, // if non-zero, local mems of this depth or more go into URAM (prio)
+ int unsigned DEPTH_TRIGGER_BRAM = 0, // if non-zero, local mems of this depth or more go into BRAM
+ bit DEEP_PIPELINE = 0,
+
+ localparam int unsigned CF = C/PE, // Channel fold
+ localparam int unsigned O_BITS = BIAS >= 0?
+ /* unsigned */ $clog2(2**N+BIAS) :
+ /* signed */ 1+$clog2(-BIAS >= 2**(N-1)? -BIAS : 2**N+BIAS)
+)(
+ // Global Control
+ input logic clk,
+ input logic rst,
+
+ // Threshold Configuration
+ input logic cfg_en,
+ input logic cfg_we,
+ input logic [$clog2(CF)+$clog2(PE)+N-1:0] cfg_a,
+ input logic [K-1:0] cfg_d,
+ output logic cfg_rack,
+ output logic [K-1:0] cfg_q,
+
+ // Input Stream
+ output logic irdy,
+ input logic ivld,
+ input logic [PE-1:0][K-1:0] idat,
+
+ // Output Stream
+ input logic ordy,
+ output logic ovld,
+ output logic [PE-1:0][O_BITS-1:0] odat
+);
+
+ // Parameter Constraints Checking: the channel count must divide evenly across the PEs
+ initial begin
+ if(CF*PE != C) begin // CF = C/PE is an integer division, so a mismatch means a non-zero remainder
+ $error("Channel count C=%0d is not a multiple of parallelism PE=%0d.", C, PE);
+ $finish;
+ end
+ end
+
+ // Operations within Pipeline
+ typedef enum logic [1:0] {
+ NOP = 2'b00, // No operation
+ TH = 2'b01, // Thresholding
+ WR = 2'b11, // Write (initialization)
+ RB = 2'b10, // Readback (validation)
+ CFG = 2'b1x // Config op (pointer-preserving)
+ } op_e;
+
+ // Pipeline Link Type
+ typedef logic [$clog2(CF)+N-1:0] ptr_t;
+ typedef logic [K -1:0] val_t;
+ typedef struct packed {
+ op_e op;
+ ptr_t ptr; // WR/RB: address; TH: result
+ val_t val; // WR/RB: threshold value; TH: input value
+ } pipe_t;
+
+ //-----------------------------------------------------------------------
+ // Pipeline Feed
+ // - configuration always takes precedence
+ // - number of pending thresholding ops capped to MAX_PENDING = (DEEP_PIPELINE+1)*N + 3
+ // across pipeline and output FIFO: pipe:(DEEP_PIPELINE+1)*N + A:1 + B:1 + 1
+ localparam int unsigned MAX_PENDING = (DEEP_PIPELINE+1)*N + 3;
+ pipe_t pipe[PE][N+1];
+ if(1) begin : blkFeed
+
+ // Thresholding Input Guard ensuring Output FIFO is never overrun
+ logic signed [$clog2(MAX_PENDING):0] GuardSem = MAX_PENDING-1; // MAX_PENDING-1, ..., 0, -1
+ uwire th_full = GuardSem[$left(GuardSem)];
+ always_ff @(posedge clk) begin
+ if(rst) GuardSem <= MAX_PENDING-1;
+ else begin
+ automatic logic dec = !(USE_CONFIG && cfg_en) && !th_full && ivld;
+ automatic logic inc = ovld && ordy;
+ GuardSem <= GuardSem + (inc == dec? 0 : inc? 1 : -1);
+ end
+ end
+
+ // PE Configuration Address Decoding
+ logic cfg_sel[PE];
+ logic cfg_oob;
+ logic [N-1:0] cfg_ofs;
+ if(PE == 1) begin
+ assign cfg_sel[0] = 1;
+ assign cfg_oob = 0;
+ assign cfg_ofs = cfg_a[0+:N];
+ end
+ else begin
+ uwire [$clog2(PE)-1:0] cfg_pe = cfg_a[N+:$clog2(PE)];
+ always_comb begin
+ foreach(cfg_sel[pe]) begin
+ cfg_sel[pe] = USE_CONFIG && cfg_en && (cfg_pe == pe);
+ end
+ cfg_oob = (cfg_pe >= PE);
+ cfg_ofs = cfg_a[0+:N];
+ if(cfg_oob && !cfg_we) begin
+ // Map readbacks from padded rows (non-existent PEs) to padded highest threshold index of first PE
+ cfg_sel[0] = 1;
+ cfg_ofs = '1;
+ end
+ end
+ end
+
+ uwire ptr_t iptr;
+ assign iptr[0+:N] = cfg_ofs;
+ if(CF > 1) begin
+ // Channel Fold Rotation
+ logic [$clog2(CF)-1:0] CnlCnt = 0;
+ logic CnlLst = 0;
+ always_ff @(posedge clk) begin
+ if(rst) begin
+ CnlCnt <= 0;
+ CnlLst <= 0;
+ end
+ else if(!(USE_CONFIG && cfg_en) && !th_full && ivld) begin
+ CnlCnt <= CnlCnt + (CnlLst? 1-CF : 1);
+ CnlLst <= CnlCnt == CF-2;
+ end
+ end
+
+ assign iptr[N+:$clog2(CF)] = USE_CONFIG && cfg_en? cfg_a[N+$clog2(PE)+:$clog2(CF)] : CnlCnt;
+ end
+
+ for(genvar pe = 0; pe < PE; pe++) begin
+ assign pipe[pe][0] = '{
+ op: USE_CONFIG && cfg_en?
+ (!cfg_sel[pe]? NOP : cfg_we? WR : RB) :
+ (ivld && !th_full? TH : NOP),
+ ptr: iptr,
+ val: !(USE_CONFIG && cfg_en)? idat[pe] : cfg_we? cfg_d : 0
+ };
+ end
+
+ assign irdy = !(USE_CONFIG && cfg_en) && !th_full;
+ end : blkFeed
+
+ //-----------------------------------------------------------------------
+ // Free-Running Thresholding Pipeline
+ for(genvar stage = 0; stage < N; stage++) begin : genStages
+
+ localparam int unsigned SN = N-1-stage;
+ for(genvar pe = 0; pe < PE; pe++) begin : genPE
+ uwire pipe_t p = pipe[pe][stage];
+ uwire cs = (p.ptr[SN:0] == 2**SN-1);
+
+ // Threshold Memory
+ val_t Thresh; // Read-out register
+ if(1) begin : blkThresh
+ localparam int unsigned DEPTH = CF * 2**stage;
+ localparam RAM_STYLE =
+ DEPTH_TRIGGER_URAM && (DEPTH >= DEPTH_TRIGGER_URAM)? "ultra" :
+ DEPTH_TRIGGER_BRAM && (DEPTH >= DEPTH_TRIGGER_BRAM)? "block" :
+ // If BRAM trigger defined, force distributed memory below if Vivado may be tempted to use BRAM nonetheless.
+ DEPTH_TRIGGER_BRAM && (DEPTH >= 64)? "distributed" : "auto";
+
+ (* RAM_STYLE = RAM_STYLE *)
+ val_t Threshs[DEPTH];
+ if(THRESHOLDS_PATH != "") begin
+ initial $readmemh($sformatf("%sthreshs_%0d_%0d.dat", THRESHOLDS_PATH, pe, stage), Threshs);
+ end
+
+ if(USE_CONFIG) begin : genThreshMem
+ uwire we = (p.op ==? WR) && cs;
+ if((CF == 1) && (stage == 0)) begin
+ always @(posedge clk) begin
+ if(we) Threshs[0] <= p.val;
+ end
+ end
+ else begin
+ uwire [$clog2(CF)+stage-1:0] addr = p.ptr[$clog2(CF)+N-1:SN+1];
+ always @(posedge clk) begin
+ if(we) Threshs[addr] <= p.val;
+ end
+ end
+ end : genThreshMem
+
+ if((CF == 1) && (stage == 0)) begin
+ assign Thresh = Threshs[0];
+ end
+ else begin
+ uwire [$clog2(CF)+stage-1:0] addr = p.ptr[$clog2(CF)+N-1:SN+1];
+ always_ff @(posedge clk) begin
+ Thresh <= Threshs[addr];
+ end
+ end
+
+ end : blkThresh
+
+ // Pipeline State
+ pipe_t P = '{ op: NOP, default: 'x };
+ logic Reval = 0;
+ always_ff @(posedge clk) begin
+ if(rst) begin
+ P <= '{ op: NOP, default: 'x };
+ Reval <= 0;
+ end
+ else begin
+ P <= p;
+ Reval <= (p.op ==? RB) && cs;
+ end
+ end
+
+ logic cmp;
+ if(!SIGNED) assign cmp = $unsigned(Thresh) <= $unsigned(P.val);
+ else if(!FPARG) assign cmp = $signed(Thresh) <= $signed(P.val);
+ else begin : blkSignedFloat
+ uwire mag_eq = Thresh[K-2:0] == P.val[K-2:0];
+ uwire mag_le = Thresh[K-2:0] <= P.val[K-2:0];
+ always_comb begin
+ unique case({Thresh[K-1], P.val[K-1]})
+ 2'b00: cmp = mag_le;
+ 2'b01: cmp = 0;
+ 2'b10: cmp = 1;
+ 2'b11: cmp = !mag_le || mag_eq;
+ default: cmp = 'x;
+ endcase
+ end
+ end : blkSignedFloat
+
+ // Pipeline State Update
+ pipe_t pp;
+ always_comb begin
+ pp = P;
+ if(P.op !=? CFG) pp.ptr[SN] = cmp;
+ if(Reval) pp.val = Thresh;
+ end
+
+ // Pipeline State Forward (potentially additional register)
+ pipe_t pf;
+ if(!DEEP_PIPELINE) assign pf = pp;
+ else begin
+ pipe_t Pf = '{ op: NOP, default: 'x };
+ always_ff @(posedge clk) begin
+ if(rst) Pf <= '{ op: NOP, default: 'x };
+ else Pf <= pp;
+ end
+ assign pf = Pf;
+ end
+
+ assign pipe[pe][stage+1] = pf;
+
+ end : genPE
+ end : genStages
+
+ //-----------------------------------------------------------------------
+ // Configuration Readback
+ always_comb begin // OR-reduce the final pipeline stage of all PEs onto the single readback port
+ cfg_rack = 0;
+ cfg_q = 0;
+ foreach(pipe[pe]) begin
+ automatic pipe_t p = pipe[pe][N]; // output of the last stage (index N) of this PE
+ cfg_rack |= p.op ==? RB; // acknowledge as soon as any PE delivers a readback op
+ cfg_q |= p.val; // OR-ed regardless of p.op - NOTE(review): presumably only meaningful while cfg_rack is set
+ end
+ end
+
+ //-----------------------------------------------------------------------
+ // Stream Output through FIFO
+ // - Depth of N + Output Reg to allow pipe to drain entirely under backpressure
+ // - Typically mapped to an SRL shift register
+ if(1) begin : blkStreamOutput
+ localparam int unsigned A_DEPTH = MAX_PENDING - 1;
+ logic [PE-1 : 0][N-1 : 0] ADat[A_DEPTH];
+ logic signed [$clog2(A_DEPTH):0] APtr = '1; // -1, 0, 1, ..., A_DEPTH-1
+ uwire avld = !APtr[$left(APtr)];
+
+ logic [PE-1:0][N-1:0] BDat = 'x;
+ logic BVld = 0;
+
+ uwire aload = pipe[0][N].op ==? TH;
+ uwire bload = !BVld || ordy;
+
+ always_ff @(posedge clk) begin
+ if(aload) begin
+ assert(APtr < $signed(A_DEPTH-1)) else begin
+ $error("Overrun after failing stream guard.");
+ $stop;
+ end
+ foreach(pipe[pe]) ADat[0][pe] <= pipe[pe][N].ptr;
+ for(int unsigned i = 1; i < A_DEPTH; i++) ADat[i] <= ADat[i-1];
+ end
+ end
+ always_ff @(posedge clk) begin
+ if(rst) APtr <= '1;
+ else APtr <= APtr + (aload == (avld && bload)? 0 : aload? 1 : -1);
+ end
+ always_ff @(posedge clk) begin
+ if(rst) begin
+ BDat <= 'x;
+ BVld <= 0;
+ end
+ else if(bload) begin
+ BDat <= ADat[APtr];
+ BVld <= avld;
+ end
+ end
+
+ assign ovld = BVld;
+ for(genvar pe = 0; pe < PE; pe++) begin
+ assign odat[pe] = BDat[pe] + BIAS;
+ end
+ end : blkStreamOutput
+
+endmodule : thresholding
diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
new file mode 100644
index 0000000000..5c7182b214
--- /dev/null
+++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv
@@ -0,0 +1,164 @@
+/******************************************************************************
+ * Copyright (C) 2024, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief All-AXI interface adapter for thresholding module.
+ * @author Thomas B. Preußer
+ *
+ * @description
+ * This AXI adapter fits the core thresholding functionality:
+ * - with AXI stream data interfaces with flow control
+ * - with implicit round-robin channel rotation as used by FINN, and
+ * - performs aligned byte address to parameter word address translation.
+ *****************************************************************************/
+
+module thresholding_axi #(
+ int unsigned N, // output precision
+ int unsigned K, // input/threshold precision
+ int unsigned C = 1, // Channels
+ int unsigned PE = 1, // Processing Parallelism, requires C = k*PE
+
+ bit SIGNED = 1, // signed inputs
+ bit FPARG = 0, // floating-point inputs: [sign] | exponent | mantissa
+ int BIAS = 0, // offsetting the output [0, 2^N-1] -> [BIAS, 2^N-1 + BIAS]
+
+ // Initial Thresholds
+ parameter THRESHOLDS_PATH = "",
+
+ bit USE_AXILITE, // Implement AXI-Lite for threshold read/write
+
+ // Force Use of On-Chip Memory Blocks
+ int unsigned DEPTH_TRIGGER_URAM = 0, // if non-zero, local mems of this depth or more go into URAM (prio)
+ int unsigned DEPTH_TRIGGER_BRAM = 0, // if non-zero, local mems of this depth or more go into BRAM
+ bit DEEP_PIPELINE = 0,
+
+ localparam int unsigned CF = C/PE, // Channel Fold
+ localparam int unsigned ADDR_BITS = $clog2(CF) + $clog2(PE) + N + 2,
+ localparam int unsigned O_BITS = BIAS >= 0?
+ /* unsigned */ $clog2(2**N+BIAS) :
+ /* signed */ 1+$clog2(-BIAS >= 2**(N-1)? -BIAS : 2**N+BIAS)
+)(
+ //- Global Control ------------------
+ input logic ap_clk,
+ input logic ap_rst_n,
+
+ //- AXI Lite ------------------------
+ // Writing
+ input logic s_axilite_AWVALID,
+ output logic s_axilite_AWREADY,
+ input logic [ADDR_BITS-1:0] s_axilite_AWADDR, // lowest 2 bits (byte selectors) are ignored
+
+ input logic s_axilite_WVALID,
+ output logic s_axilite_WREADY,
+ input logic [31:0] s_axilite_WDATA,
+ input logic [ 3:0] s_axilite_WSTRB,
+
+ output logic s_axilite_BVALID,
+ input logic s_axilite_BREADY,
+ output logic [1:0] s_axilite_BRESP,
+
+ // Reading
+ input logic s_axilite_ARVALID,
+ output logic s_axilite_ARREADY,
+ input logic [ADDR_BITS-1:0] s_axilite_ARADDR,
+
+ output logic s_axilite_RVALID,
+ input logic s_axilite_RREADY,
+ output logic [31:0] s_axilite_RDATA,
+ output logic [ 1:0] s_axilite_RRESP,
+
+ //- AXI Stream - Input --------------
+ output logic s_axis_tready,
+ input logic s_axis_tvalid,
+ input logic [((PE*K+7)/8)*8-1:0] s_axis_tdata,
+
+ //- AXI Stream - Output -------------
+ input logic m_axis_tready,
+ output logic m_axis_tvalid,
+ output logic [((PE*O_BITS+7)/8)*8-1:0] m_axis_tdata
+);
+
+ //-----------------------------------------------------------------------
+ // AXI-lite Configuration Interface
+ uwire cfg_en;
+ uwire cfg_we;
+ uwire [ADDR_BITS-3:0] cfg_a;
+ uwire [K -1:0] cfg_d;
+ uwire cfg_rack;
+ uwire [K -1:0] cfg_q;
+
+ if(USE_AXILITE) begin
+ uwire [ADDR_BITS-1:0] cfg_a0; // full-width address as produced by the AXI-Lite adapter
+ axi4lite_if #(.ADDR_WIDTH(ADDR_BITS), .DATA_WIDTH(32), .IP_DATA_WIDTH(K)) axi (
+ .aclk(ap_clk), .aresetn(ap_rst_n),
+
+ .awready(s_axilite_AWREADY), .awvalid(s_axilite_AWVALID), .awaddr(s_axilite_AWADDR), .awprot('x),
+ .wready(s_axilite_WREADY), .wvalid(s_axilite_WVALID), .wdata(s_axilite_WDATA), .wstrb(s_axilite_WSTRB),
+ .bready(s_axilite_BREADY), .bvalid(s_axilite_BVALID), .bresp(s_axilite_BRESP),
+
+ .arready(s_axilite_ARREADY), .arvalid(s_axilite_ARVALID), .araddr(s_axilite_ARADDR), .arprot('x),
+ .rready(s_axilite_RREADY), .rvalid(s_axilite_RVALID), .rresp(s_axilite_RRESP), .rdata(s_axilite_RDATA),
+
+ .ip_en(cfg_en), .ip_wen(cfg_we), .ip_addr(cfg_a0), .ip_wdata(cfg_d),
+ .ip_rack(cfg_rack), .ip_rdata(cfg_q)
+ );
+ assign cfg_a = cfg_a0[ADDR_BITS-3:0]; // drop the two top bits, which the check below expects to be zero
+ always_ff @(posedge ap_clk) begin
+ assert(!ap_rst_n || !cfg_en || (cfg_a0[ADDR_BITS-2+:2] === 2'h0)) else begin // literal width matches the 2-bit slice
+ $error("%m: Spurious high address bits.");
+ $stop;
+ end
+ end
+ end
+ else begin // no AXI-Lite: config port held disabled - NOTE(review): s_axilite_* outputs are not driven in this branch
+ assign cfg_en = 0;
+ assign cfg_we = 'x;
+ assign cfg_a = 'x;
+ assign cfg_d = 'x;
+ end
+
+ //-----------------------------------------------------------------------
+ // Kernel Implementation
+ thresholding #(
+ .N(N), .K(K), .C(C), .PE(PE),
+ .SIGNED(SIGNED), .FPARG(FPARG), .BIAS(BIAS),
+ .THRESHOLDS_PATH(THRESHOLDS_PATH), .USE_CONFIG(USE_AXILITE),
+ .DEPTH_TRIGGER_URAM(DEPTH_TRIGGER_URAM), .DEPTH_TRIGGER_BRAM(DEPTH_TRIGGER_BRAM),
+ .DEEP_PIPELINE(DEEP_PIPELINE)
+ ) impl (
+ .clk(ap_clk), .rst(!ap_rst_n),
+
+ .cfg_en, .cfg_we, .cfg_a, .cfg_d,
+ .cfg_rack, .cfg_q,
+
+ .irdy(s_axis_tready), .ivld(s_axis_tvalid), .idat(s_axis_tdata),
+ .ordy(m_axis_tready), .ovld(m_axis_tvalid), .odat(m_axis_tdata)
+ );
+
+endmodule : thresholding_axi
diff --git a/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v
new file mode 100644
index 0000000000..f35db156f6
--- /dev/null
+++ b/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v
@@ -0,0 +1,121 @@
+/******************************************************************************
+ * Copyright (C) 2024, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @author Thomas B. Preußer
+ * @brief Verilog wrapper for IP packaging.
+ */
+
+module $MODULE_NAME_AXI_WRAPPER$ #(
+ parameter N = $N$, // output precision
+ parameter K = $M$, // input/threshold precision
+ parameter C = $C$, // Channels
+ parameter PE = $PE$, // Processing Parallelism, requires C = k*PE
+
+ parameter SIGNED = $SIGNED$, // signed inputs
+ parameter FPARG = 0, // floating-point inputs: [sign] | exponent | mantissa
+ parameter BIAS = $BIAS$, // offsetting the output [0, 2^N-1] -> [BIAS, 2^N-1 + BIAS]
+
+ parameter THRESHOLDS_PATH = $THRESHOLDS_PATH$, // Directory with initial threshold data
+ parameter USE_AXILITE = $USE_AXILITE$, // Implement AXI-Lite for threshold read/write
+
+ // Force Use of On-Chip Memory Blocks
+ parameter DEPTH_TRIGGER_URAM = $DEPTH_TRIGGER_URAM$, // if non-zero, local mems of this depth or more go into URAM (prio)
+ parameter DEPTH_TRIGGER_BRAM = $DEPTH_TRIGGER_BRAM$, // if non-zero, local mems of this depth or more go into BRAM
+ parameter DEEP_PIPELINE = $DEEP_PIPELINE$, // [bit] extra pipeline stages for easier timing closure
+
+ parameter O_BITS = $O_BITS$
+)(
+ // Global Control
+ (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axilite:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *)
+ (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *)
+ input ap_clk,
+ (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *)
+ input ap_rst_n,
+
+ //- AXI Lite ------------------------
+ // Writing
+ input s_axilite_AWVALID,
+ output s_axilite_AWREADY,
+ input [$clog2(C/PE) + $clog2(PE) + N + 1:0] s_axilite_AWADDR, // lowest 2 bits (byte selectors) are ignored
+
+ input s_axilite_WVALID,
+ output s_axilite_WREADY,
+ input [31:0] s_axilite_WDATA,
+ input [ 3:0] s_axilite_WSTRB,
+
+ output s_axilite_BVALID,
+ input s_axilite_BREADY,
+ output [1:0] s_axilite_BRESP,
+
+ // Reading
+ input s_axilite_ARVALID,
+ output s_axilite_ARREADY,
+ input [$clog2(C/PE) + $clog2(PE) + N + 1:0] s_axilite_ARADDR,
+
+ output s_axilite_RVALID,
+ input s_axilite_RREADY,
+ output [31:0] s_axilite_RDATA,
+ output [ 1:0] s_axilite_RRESP,
+
+ //- AXI Stream - Input --------------
+ output in0_V_TREADY,
+ input in0_V_TVALID,
+ input [((PE*K+7)/8)*8-1:0] in0_V_TDATA,
+
+ //- AXI Stream - Output -------------
+ input out_V_TREADY,
+ output out_V_TVALID,
+ output [((PE*O_BITS+7)/8)*8-1:0] out_V_TDATA
+);
+
+ thresholding_axi #(
+ .N(N), .K(K), .C(C), .PE(PE),
+ .SIGNED(SIGNED),
+ .FPARG(FPARG),
+ .BIAS(BIAS),
+ .THRESHOLDS_PATH(THRESHOLDS_PATH),
+ .USE_AXILITE(USE_AXILITE),
+ .DEPTH_TRIGGER_URAM(DEPTH_TRIGGER_URAM),
+ .DEPTH_TRIGGER_BRAM(DEPTH_TRIGGER_BRAM),
+ .DEEP_PIPELINE(DEEP_PIPELINE)
+ ) core (
+ .ap_clk(ap_clk), .ap_rst_n(ap_rst_n),
+
+ .s_axilite_AWVALID(s_axilite_AWVALID), .s_axilite_AWREADY(s_axilite_AWREADY), .s_axilite_AWADDR(s_axilite_AWADDR),
+ .s_axilite_WVALID(s_axilite_WVALID), .s_axilite_WREADY(s_axilite_WREADY), .s_axilite_WDATA(s_axilite_WDATA), .s_axilite_WSTRB(s_axilite_WSTRB),
+ .s_axilite_BVALID(s_axilite_BVALID), .s_axilite_BREADY(s_axilite_BREADY), .s_axilite_BRESP(s_axilite_BRESP),
+
+ .s_axilite_ARVALID(s_axilite_ARVALID), .s_axilite_ARREADY(s_axilite_ARREADY), .s_axilite_ARADDR(s_axilite_ARADDR),
+ .s_axilite_RVALID(s_axilite_RVALID), .s_axilite_RREADY(s_axilite_RREADY), .s_axilite_RDATA(s_axilite_RDATA), .s_axilite_RRESP(s_axilite_RRESP),
+ .s_axis_tready(in0_V_TREADY), .s_axis_tvalid(in0_V_TVALID), .s_axis_tdata(in0_V_TDATA),
+ .m_axis_tready(out_V_TREADY), .m_axis_tvalid(out_V_TVALID), .m_axis_tdata(out_V_TDATA)
+ );
+
+endmodule // $MODULE_NAME_AXI_WRAPPER$
diff --git a/finn-rtllib/thresholding/sim/thresh_gen.sv b/finn-rtllib/thresholding/sim/thresh_gen.sv
new file mode 100644
index 0000000000..ae30503f8f
--- /dev/null
+++ b/finn-rtllib/thresholding/sim/thresh_gen.sv
@@ -0,0 +1,75 @@
+/******************************************************************************
+ * Copyright (C) 2024, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *****************************************************************************/
+module thresh_gen; // Simulation-only generator: writes one threshold memory image per (PE, stage) pair
+ localparam int unsigned K = 9; // bit width of each threshold value (see thresh_t)
+ localparam int unsigned N = 4; // 2**N-1 thresholds per channel, N stage files per PE
+ localparam int unsigned C = 6; // number of channels (rows of THRESHOLDS)
+
+ typedef logic [K-1:0] thresh_t;
+ localparam thresh_t THRESHOLDS[C][2**N-1] = '{
+ '{ 'h00, 'h01, 'h02, 'h03, 'h04, 'h05, 'h06, 'h07, 'h08, 'h09, 'h0a, 'h0b, 'h0c, 'h0d, 'h0e },
+ '{ 'h10, 'h11, 'h12, 'h13, 'h14, 'h15, 'h16, 'h17, 'h18, 'h19, 'h1a, 'h1b, 'h1c, 'h1d, 'h1e },
+ '{ 'h20, 'h21, 'h22, 'h23, 'h24, 'h25, 'h26, 'h27, 'h28, 'h29, 'h2a, 'h2b, 'h2c, 'h2d, 'h2e },
+ '{ 'h30, 'h31, 'h32, 'h33, 'h34, 'h35, 'h36, 'h37, 'h38, 'h39, 'h3a, 'h3b, 'h3c, 'h3d, 'h3e },
+ '{ 'h40, 'h41, 'h42, 'h43, 'h44, 'h45, 'h46, 'h47, 'h48, 'h49, 'h4a, 'h4b, 'h4c, 'h4d, 'h4e },
+ '{ 'h50, 'h51, 'h52, 'h53, 'h54, 'h55, 'h56, 'h57, 'h58, 'h59, 'h5a, 'h5b, 'h5c, 'h5d, 'h5e }
+ };
+ localparam THRESHOLDS_PATH = "./"; // prefix prepended to every generated file name
+
+ localparam int unsigned PE = 2;
+ localparam int unsigned CF = C/PE; // channels handled per PE (integer division)
+
+ for(genvar stage = 0; stage < N; stage++) begin
+ localparam int unsigned SN = N-1-stage; // stage 0 maps to SN = N-1, the last stage to SN = 0
+ for(genvar pe = 0; pe < PE; pe++) begin
+ initial begin
+ automatic string file = $sformatf("%sthreshs_%0d_%0d.dat", THRESHOLDS_PATH, pe, stage);
+
+ automatic thresh_t threshs[CF * 2**stage]; // 2**stage entries for each of the CF channels of this PE
+ for(int unsigned c = 0; c < CF; c++) begin
+ for(int unsigned i = 0; i < 2**stage; i++) begin
+ threshs[(c << stage) + i] = THRESHOLDS[c*PE + pe][(i<<(N-stage)) + 2**SN-1]; // channel c*PE+pe; every 2**(N-stage)-th threshold starting at index 2**SN-1
+ end
+ end
+
+ $writememh(file, threshs);
+ end
+ end
+ end
+
+ // Quit after running all initializers
+ initial begin
+ #1ns; // the generator initial blocks above contain no delays, so they have run before this resumes
+ $display("Generation done.");
+ $finish;
+ end
+
+endmodule : thresh_gen
diff --git a/finn-rtllib/thresholding/sim/thresholding_axi_tb.sv b/finn-rtllib/thresholding/sim/thresholding_axi_tb.sv
new file mode 100644
index 0000000000..429fb7776f
--- /dev/null
+++ b/finn-rtllib/thresholding/sim/thresholding_axi_tb.sv
@@ -0,0 +1,314 @@
+/******************************************************************************
+ * Copyright (C) 2024, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief Testbench for thresholding_axi.
+ * @author Monica Chiosa
+ *
+ */
+
+module thresholding_axi_tb #(
+ int unsigned N = 4, // output precision
+ int unsigned C = 6, // number of channels
+ int unsigned PE = 2,
+ real M0 = 7.3, // slope of the uniform thresholding line
+ real B0 = 3.1, // offset of the uniform thresholding line
+ bit THROTTLED = 1,
+
+ localparam int unsigned CF = C/PE, // Channel Fold
+ localparam int unsigned ADDR_BITS = $clog2(CF) + $clog2(PE) + N + 2
+);
+
+ //-----------------------------------------------------------------------
+ // Design Geometry
+
+ // For each channel in [0,C):
+ // M_channel = M0 + CX*channel
+ // B_channel = B0 + CX*channel
+ // Input/threshold precision computed according to the maximum possible value
+ localparam real CX = 1.375;
+ localparam int unsigned K = $clog2((2**N-1)*(M0+C*CX) + (B0+C*CX)); // unused sign + magnitude
+ localparam int unsigned C_BITS = C < 2? 1 : $clog2(C);
+
+ localparam int unsigned MST_STRM_WROUNDS = 503;
+
+ typedef int unsigned threshs_t[C][2**N-1];
+ function threshs_t init_thresholds(); // Returns the table res[c][i] = ceil((M0 + c*CX)*i + (B0 + c*CX)) for every channel c and threshold index i.
+ automatic threshs_t res;
+ for(int unsigned c = 0; c < C; c++) begin
+ automatic real m = M0 + c*CX; // slope of channel c
+ automatic real b = B0 + c*CX; // offset of channel c
+ for(int unsigned i = 0; i < 2**N-1; i++) begin // explicit index loop: foreach(res[c][i]) is non-standard and may re-declare c, iterating over all channels
+ res[c][i] = int'($ceil(m*i + b));
+ end
+ end
+ return res;
+ endfunction : init_thresholds
+ localparam threshs_t THRESHS = init_thresholds();
+
+ //-----------------------------------------------------------------------
+ // Clock and Reset Control
+ logic clk = 0;
+ always #5ns clk = !clk;
+ logic rst = 1;
+ initial begin
+ #10ns;
+ @(posedge clk);
+ rst <= 0;
+ end
+
+ //-----------------------------------------------------------------------
+ // DUT
+ logic s_axilite_AWVALID;
+ uwire s_axilite_AWREADY;
+ logic [ADDR_BITS-1:0] s_axilite_AWADDR; // lowest 2 bits (byte selectors) are ignored
+ logic s_axilite_WVALID;
+ uwire s_axilite_WREADY;
+ logic [ 31:0] s_axilite_WDATA;
+ uwire s_axilite_BVALID;
+ logic s_axilite_BREADY;
+ uwire [ 1:0] s_axilite_BRESP;
+ logic s_axilite_ARVALID;
+ uwire s_axilite_ARREADY;
+ logic [ADDR_BITS-1:0] s_axilite_ARADDR;
+ uwire s_axilite_RVALID;
+ uwire s_axilite_RREADY = 1;
+ uwire [ 31:0] s_axilite_RDATA;
+ uwire [ 1:0] s_axilite_RRESP;
+
+ uwire irdy;
+ logic ivld;
+ logic [PE-1:0][K-1:0] idat;
+
+ logic ordy = 0;
+ uwire ovld;
+ uwire [PE-1:0][N-1:0] odat;
+
+ thresholding_axi #(.N(N), .K(K), .C(C), .PE(PE), .SIGNED(0), .USE_AXILITE(1)) dut (
+ .ap_clk(clk), .ap_rst_n(!rst),
+
+ // Configuration
+ .s_axilite_AWVALID, .s_axilite_AWREADY, .s_axilite_AWADDR,
+ .s_axilite_WVALID, .s_axilite_WREADY, .s_axilite_WDATA, .s_axilite_WSTRB('1),
+ .s_axilite_BVALID, .s_axilite_BREADY, .s_axilite_BRESP,
+ .s_axilite_ARVALID, .s_axilite_ARREADY, .s_axilite_ARADDR,
+ .s_axilite_RVALID, .s_axilite_RREADY, .s_axilite_RDATA, .s_axilite_RRESP,
+
+ // Stream Processing
+ .s_axis_tready(irdy), .s_axis_tvalid(ivld), .s_axis_tdata(idat),
+ .m_axis_tready(ordy), .m_axis_tvalid(ovld), .m_axis_tdata(odat)
+ );
+
+ //-----------------------------------------------------------------------
+ // Input Stimuli
+ typedef logic [PE-1:0][K-1:0] input_t;
+ typedef logic [$clog2(CF)+$clog2(PE)+N-1:0] addr_t;
+ input_t QW[$]; // Input Feed Tracing
+ addr_t QC[$];
+
+ int unsigned error_cnt = 0;
+ bit done = 0;
+ initial begin
+ // Report testbench details
+ $display("Testbench - tresholding K=%0d -> N=%0d", K, N);
+ for(int unsigned c = 0; c < C; c++) begin
+ $write("Channel #%0d: Thresholds = {", c);
+ for(int unsigned i = 0; i < 2**N-1; i++) $write(" %0d", THRESHS[c][i]);
+ $display(" }");
+ end
+
+ // Config
+ s_axilite_AWVALID = 0;
+ s_axilite_AWADDR = 'x;
+ s_axilite_WVALID = 0;
+ s_axilite_WDATA = 'x;
+ s_axilite_BREADY = 0;
+ s_axilite_ARVALID = 0;
+ s_axilite_ARADDR = 'x;
+
+ // Stream Input
+ ivld = 0;
+ idat = 'x;
+
+ @(posedge clk iff !rst);
+
+ // Threshold Configuration
+ for(int unsigned c = 0; c < C; c+=PE) begin
+ automatic addr_t addr = 0;
+ if(CF > 1) addr[N+$clog2(PE)+:$clog2(CF)] = c/PE;
+ for(int unsigned pe = 0; pe < PE; pe++) begin
+ if(PE > 1) addr[N+:$clog2(PE)] = pe;
+ for(int unsigned t = 0; t < 2**N-1; t++) begin
+ addr[0+:N] = t;
+ fork
+ begin
+ s_axilite_AWVALID <= 1;
+ s_axilite_AWADDR <= { addr, 2'b00 };
+ @(posedge clk iff s_axilite_AWREADY);
+ s_axilite_AWVALID <= 0;
+ s_axilite_AWADDR <= 'x;
+ end
+ begin
+ s_axilite_WVALID <= 1;
+ s_axilite_WDATA <= THRESHS[c+pe][t];
+ @(posedge clk iff s_axilite_WREADY);
+ s_axilite_WVALID <= 0;
+ s_axilite_WDATA <= 'x;
+ end
+ begin
+ s_axilite_BREADY <= 1;
+ @(posedge clk iff s_axilite_BVALID);
+ assert(s_axilite_BRESP == '0) else begin
+ $error("Error on parameter write.");
+ $stop;
+ end
+ s_axilite_BREADY <= 0;
+ end
+ join
+ end
+ end
+ end
+
+ fork
+ // Intermittent configuration readback
+ while(!done) begin // issues AXI-Lite reads at random intervals until the stream stimulus has finished
+ if(($urandom()%37) != 0) begin // idle in 36 of 37 iterations
+ s_axilite_ARVALID <= 0;
+ s_axilite_ARADDR <= 'x;
+ @(posedge clk);
+ end
+ else begin
+ automatic addr_t addr = $urandom()%(2**N-1); // any of the 2**N-1 configured thresholds (was %(N-1): indices 0..N-2 only, modulo by zero for N=1)
+ if(PE > 1) addr[N+:$clog2(PE)] = $urandom()%PE;
+ if(CF > 1) addr[N+$clog2(PE)+:$clog2(CF)] = $urandom()%CF;
+
+ s_axilite_ARVALID <= 1;
+ s_axilite_ARADDR <= { addr, 2'b00 }; // word address: two byte-select bits appended
+ @(posedge clk iff s_axilite_ARREADY);
+
+ QC.push_back(addr); // remembered so the readback checker can compute the expected data
+ end
+ end
+
+ // AXI4Stream MST Writes input values
+ repeat(MST_STRM_WROUNDS) begin
+ automatic input_t dat;
+
+ while(THROTTLED && ($urandom()%7 == 0)) @(posedge clk);
+
+ std::randomize(dat);
+ ivld <= 1;
+ idat <= dat;
+ @(posedge clk iff irdy);
+ ivld <= 0;
+ idat <= 'x;
+ QW.push_back(dat);
+ end
+ join_any
+ done <= 1;
+ repeat(N+6) @(posedge clk);
+
+ assert(QW.size() == 0) else begin
+ $error("Missing %0d outputs.", QW.size());
+ $stop;
+ end
+ assert(QC.size() == 0) else begin
+ $error("Missing %0d readback replies.", QC.size());
+ $stop;
+ end
+
+ $display("Test completed: %0d errors in %0d tests.", error_cnt, MST_STRM_WROUNDS);
+ $display("=========================================");
+ $finish;
+ end
+
+ // Output Checker -------------------------------------------------------
+
+ // Configuration Readback
+ always_ff @(posedge clk iff s_axilite_RVALID) begin // checks every read reply against the thresholds written during configuration
+ assert(s_axilite_RRESP == '0) else begin // any non-zero read response stops the simulation
+ $error("Read back error.");
+ $stop;
+ end
+ assert(QC.size()) begin // a reply is only legal if a readback address is still queued
+ automatic addr_t addr = QC.pop_front();
+ automatic int unsigned cnl = // channel = fold index * PE + pe index, both decoded from addr
+ (CF == 1? 0 : addr[N+$clog2(PE)+:$clog2(CF)] * PE) +
+ (PE == 1? 0 : addr[N+:$clog2(PE)]);
+ automatic logic [K-1:0] exp = THRESHS[cnl][addr[0+:N]]; // low N address bits select the threshold
+ assert(s_axilite_RDATA == exp) else begin
+ $error("Readback mismatch on #%0d.%0d: %0d instead of %0d", cnl, addr[0+:N], s_axilite_RDATA, exp);
+ $stop;
+ end
+ end
+ else begin
+ $error("Spurious readback output.");
+ $stop;
+ end
+ end
+
+ // Stream Output
+ int unsigned OCnl = 0; // first channel of the next expected output beat
+ always @(posedge clk) begin
+ if(rst) begin
+ OCnl <= 0;
+ ordy <= 1'b0;
+ end
+ else begin
+ if(!ordy || ovld) ordy <= ($urandom()%5 != 0) || !THROTTLED; // re-draw readiness; always ready when THROTTLED is 0
+
+ if(ordy && ovld) begin // an output beat is transferred in this cycle
+ assert(QW.size()) begin // every output must match a traced input
+ automatic input_t x = QW.pop_front();
+ // Check each PE lane against the thresholds of its channel.
+ for(int unsigned pe = 0; pe < PE; pe++) begin
+ automatic int unsigned cnl = OCnl + pe;
+
+ $display("Mapped CNL=%0d DAT=%3d -> #%2d", cnl, x[pe], odat[pe]);
+ assert(
+ ((odat[pe] == 0) || (THRESHS[cnl][odat[pe]-1] <= x[pe])) && // threshold just below the output index is not above the input
+ ((odat[pe] == 2**N-1) || (x[pe] < THRESHS[cnl][odat[pe]])) // threshold at the output index is above the input
+ ) else begin
+ $error("Output error on presumed input CNL=%0d DAT=0x%0x -> #%0d", cnl, x[pe], odat[pe]);
+ error_cnt++;
+ $stop;
+ end
+ end
+ end
+ else begin
+ $error("Spurious output.");
+ $stop;
+ end
+
+ OCnl <= (OCnl + PE)%C; // advance by one beat of PE channels, wrapping after C
+ end
+ end
+ end
+
+endmodule: thresholding_axi_tb
diff --git a/finn-rtllib/thresholding/sim/thresholding_tb.sv b/finn-rtllib/thresholding/sim/thresholding_tb.sv
new file mode 100644
index 0000000000..1564f28f0d
--- /dev/null
+++ b/finn-rtllib/thresholding/sim/thresholding_tb.sv
@@ -0,0 +1,274 @@
+/******************************************************************************
+ * Copyright (C) 2024, Advanced Micro Devices, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
+ * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * @brief Testbench for thresholding.
+ * @author Monica Chiosa
+ *
+ */
+
+module thresholding_tb #(
+ int unsigned K = 10, // input precision
+ int unsigned N = 4, // output precision
+ int unsigned C = 6, // number of channels
+ int unsigned PE = 2,
+
+ localparam int unsigned CF = C/PE // Channel Fold
+);
+ localparam bit DEEP_PIPELINE = 1;
+
+ localparam int unsigned MST_STRM_WROUNDS = 507;
+ localparam bit THROTTLED = 1;
+
+ //-----------------------------------------------------------------------
+ // Clock and Reset Control
+ logic clk = 0;
+ always #5ns clk = !clk;
+ logic rst = 1;
+ initial begin
+ #10ns;
+ @(posedge clk);
+ rst <= 0;
+ end
+
+ //-----------------------------------------------------------------------
+ // Parallel Instances differing in Data Type
+ typedef logic [K -1:0] val_t;
+ typedef val_t threshs_t[C][2**N-1];
+ typedef val_t [PE-1:0] input_t;
+ typedef logic [$clog2(CF)+$clog2(PE)+N-1:0] addr_t;
+ logic [0:2] term = '0;
+ always_comb begin
+ if(&term) $finish;
+ end
+ for(genvar i = 0; i < 3; i++) begin : genTypes
+ localparam bit SIGNED = i>0;
+ localparam bit FPARG = i>1;
+
+ //- DUT -------------------------
+ logic cfg_en;
+ logic cfg_we;
+ logic [$clog2(C)+N-1:0] cfg_a;
+ logic [K-1:0] cfg_d;
+ uwire cfg_rack;
+ uwire [K-1:0] cfg_q;
+
+ uwire irdy;
+ logic ivld;
+ logic [PE-1:0][K-1:0] idat;
+
+ logic ordy = 0;
+ uwire ovld;
+ uwire [PE-1:0][N-1:0] odat;
+
+ thresholding #(.N(N), .K(K), .C(C), .PE(PE), .SIGNED(SIGNED), .FPARG(FPARG), .USE_CONFIG(1), .DEEP_PIPELINE(DEEP_PIPELINE)) dut (
+ .clk, .rst,
+
+ // Configuration
+ .cfg_en, .cfg_we, .cfg_a, .cfg_d,
+ .cfg_rack, .cfg_q,
+
+ // Stream Processing
+ .irdy, .ivld, .idat,
+ .ordy, .ovld, .odat
+ );
+
+ //- Stimulus Driver -------------
+ threshs_t THRESHS;
+ function val_t sigord(input val_t x); // Maps x to the key used to sort the thresholds and to compare inputs against them.
+ automatic val_t res = x; // SIGNED=0: the key is x itself
+ if(SIGNED) begin
+ if(FPARG && x[K-1]) res[K-2:0] = ~x[K-2:0]; // FPARG with top bit set: invert all lower bits - NOTE(review): presumably reverses the order of negative float encodings, confirm
+ res[K-1] = !x[K-1]; // SIGNED: flip the top bit
+ end
+ return res;
+ endfunction : sigord
+
+ input_t QW[$]; // Input tracing
+ addr_t QC[$]; // Readback tracking
+ int unsigned error_cnt = 0;
+ bit done = 0;
+ initial begin
+
+ // Generate thresholds
+ std::randomize(THRESHS);
+ foreach(THRESHS[c]) begin
+ val_t row[2**N-1] = THRESHS[c];
+ row.sort with (sigord(item));
+ THRESHS[c] = row;
+ end
+
+ // Report test case details
+ $display("[%0d] Thresholding %s%s%0d -> uint%0d", i, SIGNED? "s" : "u", FPARG? "fp" : "int", K, N);
+ for(int unsigned c = 0; c < C; c++) begin
+ $write("[%0d] Channel #%0d: Thresholds = {", i, c);
+ for(int unsigned i = 0; i < 2**N-1; i++) $write(" %0X", THRESHS[c][i]);
+ $display(" }");
+ end
+
+ // Config
+ cfg_en = 0;
+ cfg_we = 'x;
+ cfg_a = 'x;
+ cfg_d = 'x;
+
+ // Stream Input
+ ivld = 0;
+ idat = 'x;
+
+ @(posedge clk iff !rst);
+
+ // Threshold Configuration
+ cfg_en <= 1;
+ cfg_we <= 1;
+ for(int unsigned c = 0; c < C; c+=PE) begin // one write per clock: fold index, then pe, then threshold index
+ if(CF > 1) cfg_a[N+$clog2(PE)+:$clog2(CF)] <= c/PE;
+ for(int unsigned pe = 0; pe < PE; pe++) begin
+ if(PE > 1) cfg_a[N+:$clog2(PE)] <= pe; // nonblocking like the other cfg_a fields: a blocking write here could race with logic sampling cfg_a on the clock edge just passed
+ for(int unsigned t = 0; t < 2**N-1; t++) begin
+ cfg_a[0+:N] <= t;
+ cfg_d <= THRESHS[c+pe][t];
+ @(posedge clk);
+ end
+ end
+ end
+ cfg_d <= 'x;
+
+ fork
+ // Intermittent configuration readback
+ while(!done) begin // issues configuration reads at random intervals until the stream stimulus has finished
+ cfg_en <= 0; // idle cycle between requests
+ cfg_we <= 'x;
+ cfg_a <= 'x;
+ @(posedge clk);
+ if(($urandom()%41) == 0) begin // read request in 1 of 41 iterations
+ automatic addr_t addr = $urandom()%(2**N-1); // any of the 2**N-1 configured thresholds (was %(N-1): indices 0..N-2 only, modulo by zero for N=1)
+ if(PE > 1) addr[N+:$clog2(PE)] = $urandom()%PE;
+ if(CF > 1) addr[N+$clog2(PE)+:$clog2(CF)] = $urandom()%CF;
+
+ cfg_en <= 1;
+ cfg_we <= 0; // read access
+ cfg_a <= addr;
+ @(posedge clk);
+ QC.push_back(addr); // remembered so the readback checker can compute the expected data
+ end
+ end
+
+ // AXI4Stream MST Writes input values
+ repeat(MST_STRM_WROUNDS) begin
+ automatic input_t dat;
+
+ while(THROTTLED && ($urandom()%7 == 0)) @(posedge clk);
+
+ std::randomize(dat);
+ ivld <= 1;
+ idat <= dat;
+ @(posedge clk iff irdy);
+ ivld <= 0;
+ idat <= 'x;
+ QW.push_back(dat);
+ end
+ join_any
+ done <= 1;
+ repeat((DEEP_PIPELINE+1)*N+8) @(posedge clk);
+
+ assert(QW.size() == 0) else begin
+ $error("[%0d] Missing %0d outputs.", i, QW.size());
+ $stop;
+ end
+ assert(QC.size() == 0) else begin
+ $error("[%0d] Missing %0d readback replies.", i, QC.size());
+ $stop;
+ end
+
+ $display("[%0d] Test completed: %0d errors in %0d tests.", i, error_cnt, MST_STRM_WROUNDS);
+ $display("=============================================");
+ term[i] <= 1;
+ end
+
+ //- Readback Checker --------------
+ always_ff @(posedge clk iff cfg_rack) begin // checks every read reply against the thresholds written during configuration
+ assert(QC.size()) begin // a reply is only legal if a readback address is still queued
+ automatic addr_t addr = QC.pop_front();
+ automatic int unsigned cnl = // channel = fold index * PE + pe index, both decoded from addr
+ (CF == 1? 0 : addr[N+$clog2(PE)+:$clog2(CF)] * PE) +
+ (PE == 1? 0 : addr[N+:$clog2(PE)]);
+ automatic logic [K-1:0] exp = THRESHS[cnl][addr[0+:N]]; // low N address bits select the threshold
+ assert(cfg_q == exp) else begin
+ $error("[%0d] Readback mismatch on #%0d.%0d: %0d instead of %0d", i, cnl, addr[0+:N], cfg_q, exp);
+ $stop;
+ end
+ end
+ else begin
+ $error("[%0d] Spurious readback output.", i);
+ $stop;
+ end
+ end
+
+ // Output Checker
+ int unsigned OCnl = 0; // first channel of the next expected output beat
+ always @(posedge clk) begin
+ if(rst) begin
+ OCnl <= 0;
+ ordy <= 1'b0;
+ end
+ else begin
+ if(!ordy || ovld) ordy <= ($urandom()%5 != 0) || !THROTTLED; // re-draw readiness; always ready when THROTTLED is 0
+
+ if(ordy && ovld) begin // an output beat is transferred in this cycle
+ assert(QW.size()) begin // every output must match a traced input
+ automatic input_t x = QW.pop_front();
+ // Check each PE lane against the thresholds of its channel, compared via sigord() keys.
+ for(int unsigned pe = 0; pe < PE; pe++) begin
+ automatic int unsigned cnl = OCnl + pe;
+
+ $display("[%0d] Mapped CNL=%0d DAT=%3x -> #%2d", i, cnl, x[pe], odat[pe]);
+ assert(
+ ((odat[pe] == 0) || (sigord(THRESHS[cnl][odat[pe]-1]) <= sigord(x[pe]))) && // threshold just below the output index is not above the input
+ ((odat[pe] == 2**N-1) || (sigord(x[pe]) < sigord(THRESHS[cnl][odat[pe]]))) // threshold at the output index is above the input
+ ) else begin
+ $error("[%0d] Output error on presumed input CNL=%0d DAT=0x%0x -> #%0d", i, cnl, x[pe], odat[pe]);
+ error_cnt++;
+ $stop;
+ end
+ end
+ end
+ else begin
+ $error("[%0d] Spurious output.", i);
+ $stop;
+ end
+
+ OCnl <= (OCnl + PE)%C; // advance by one beat of PE channels, wrapping after C
+ end
+ end
+ end
+
+ end : genTypes
+
+endmodule: thresholding_tb
diff --git a/notebooks/advanced/0_custom_analysis_pass.ipynb b/notebooks/advanced/0_custom_analysis_pass.ipynb
index f8444520c3..5ed48ca6d8 100644
--- a/notebooks/advanced/0_custom_analysis_pass.ipynb
+++ b/notebooks/advanced/0_custom_analysis_pass.ipynb
@@ -52,7 +52,9 @@
"metadata": {},
"outputs": [],
"source": [
- "showInNetron(\"../LFCW1A1.onnx\")"
+ "import os\n",
+ "notebook_dir = os.environ['FINN_ROOT'] + \"/notebooks\"\n",
+ "showInNetron(notebook_dir + \"/LFCW1A1.onnx\")"
]
},
{
@@ -69,7 +71,7 @@
"outputs": [],
"source": [
"from qonnx.core.modelwrapper import ModelWrapper\n",
- "model = ModelWrapper('../LFCW1A1.onnx')"
+ "model = ModelWrapper(notebook_dir + \"/LFCW1A1.onnx\")"
]
},
{
@@ -151,9 +153,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.5"
+ "version": "3.10.12"
}
},
"nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
}
diff --git a/notebooks/advanced/1_custom_transformation_pass.ipynb b/notebooks/advanced/1_custom_transformation_pass.ipynb
index 391e852a71..91dd925b25 100644
--- a/notebooks/advanced/1_custom_transformation_pass.ipynb
+++ b/notebooks/advanced/1_custom_transformation_pass.ipynb
@@ -110,8 +110,11 @@
"metadata": {},
"outputs": [],
"source": [
+ "import os\n",
+ "notebook_dir = os.environ['FINN_ROOT'] + \"/notebooks\"\n",
+ "\n",
"import onnx\n",
- "onnx_model = onnx.load('../LFCW1A1.onnx')\n",
+ "onnx_model = onnx.load(notebook_dir + \"/LFCW1A1.onnx\")\n",
"from qonnx.core.modelwrapper import ModelWrapper\n",
"onnx_model = ModelWrapper(onnx_model)"
]
@@ -122,7 +125,7 @@
"metadata": {},
"outputs": [],
"source": [
- "showInNetron('../LFCW1A1.onnx')"
+ "showInNetron(notebook_dir + \"/LFCW1A1.onnx\")"
]
},
{
@@ -209,7 +212,7 @@
"\n",
"To control the degree of parallelization the argument `num_workers` can be specified. When the Docker container is started, the env variable `NUM_DEFAULT_WORKERS` is set to 4 by default, this can be increased or decreased depending on the system. You can also set the number of workers manually to a specific value when calling a transformation that allows parallelization. If the value is set to 0, all available CPU cores are used.\n",
"\n",
- "In the following we want to take a closer look at the implementation using the compile transformation as example."
+ "In the following we want to take a closer look at the implementation using the compile transformation that is used for cpp simulation as an example."
]
},
{
@@ -227,7 +230,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "The class is derived from the NodeLocalTransformation class and performs the compilation at every node that is fpgadataflow node."
+ "The class is derived from the NodeLocalTransformation class and performs the compilation at every node that is an hls node."
]
}
],
@@ -247,9 +250,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.5"
+ "version": "3.10.12"
}
},
"nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
}
diff --git a/notebooks/advanced/2_custom_op.ipynb b/notebooks/advanced/2_custom_op.ipynb
index 636da64dd5..bdd2976412 100644
--- a/notebooks/advanced/2_custom_op.ipynb
+++ b/notebooks/advanced/2_custom_op.ipynb
@@ -672,7 +672,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.5"
+ "version": "3.10.12"
}
},
"nbformat": 4,
diff --git a/notebooks/advanced/3_folding.ipynb b/notebooks/advanced/3_folding.ipynb
new file mode 100644
index 0000000000..8c7b97d6c6
--- /dev/null
+++ b/notebooks/advanced/3_folding.ipynb
@@ -0,0 +1,668 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# FINN - Folding\n",
+ "--------------------------------------\n",
+ "**Note: We will utilize one of the intermediate models generated in the process of the cybersecurity end2end example**\n",
+ "\n",
+ "There is a local copy of `step_specialize_layers.onnx` in this directory, which was renamed to `cybsec_PE_SIMD.onnx` to be able to go through this tutorial without prerequisites. But you can also generate it yourself with the [third cybersecurity Jupyter notebook](../end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb). After the execution of the estimates-only build flow, it can be found in `../end2end_example/cybersecurity/output_estimates_only/intermediate_models/step_specialize_layers.onnx`. \n",
+ "\n",
+ "This notebook describes the use of FINN parallelization parameters (PE & SIMD), also called folding factors, to efficiently optimize models so as to extract the maximum performance out of them. \n",
+ "\n",
+ "Please be aware that the folding factors can not be selected arbitrarily, each layer has constraints on which values the parallelization parameters can be set to, for more information see here: https://finn-dev.readthedocs.io/en/latest/internals.html#constraints-to-folding-factors-per-layer\n",
+ "\n",
+ "We'll use the utility function `showInNetron()` to visualize and interact with our network in the Jupyter Notebook and `showSrc()` to show source code of FINN library calls."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from finn.util.visualization import showInNetron, showSrc"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Note: The build_flow in the cybsec_mlp notebook comprises a transformation step `step_target_fps_parallelization` that automatically sets custom parallelization parameters needed to achieve a given `target_fps` by invoking the [`SetFolding` transformation](https://github.com/Xilinx/finn/blob/main/src/finn/transformation/fpgadataflow/set_folding.py#L46).\n",
+ "\n",
+ "More details of the above step can be found [here](https://github.com/Xilinx/finn/blob/main/src/finn/builder/build_dataflow_steps.py#L394)."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This notebook shows the manual version of this step and explains how these attributes can improve performance and what are their effects on resource utilization for developers who need to maximize the performance of their network. \n",
+ "\n",
+ "For that we will use the `cybsec_PE_SIMD.onnx` file as a starting point. This intermediate model from the cybersecurity example is the model representation after the high-level ONNX layers are converted to HW layers and then specialized to either HLS or RTL variants. In this example, all nodes were converted to HLS variants. This means that each node in the graph now corresponds to an HLS C++ function call and the parallelization parameters can be set using the node attributes.\n",
+ "\n",
+ "We will take this model to show how to set the folding factors manually and analyze the estimated execution clock cycles and the resource utilization of each layer in the network."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### FINN-style Dataflow Architectures \n",
+ "\n",
+ "We start with a quick recap of FINN-style dataflow architectures. The key idea in such architectures is to parallelize across layers as well as within layers by dedicating a proportionate amount of compute resources to each layer, as illustrated in the figure below.\n",
+ "\n",
+ "\n",
+ "\n",
+ "In practice, the layers are instantiated by function calls to optimized Vitis HLS building blocks from the [finn-hlslib](https://github.com/Xilinx/finn-hlslib) library or by RTL modules from the [finn-rtllib](https://github.com/Xilinx/finn/tree/main/finn-rtllib).\n",
+ "\n",
+ "Since each layer will be instantiated, we can flexibly set the parallelization of each layer and thus control resources and throughput of our network, as visualized in the image below:\n",
+ "\n",
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Part-1 : Loading the ONNX model.\n",
+ "\n",
+ "As discussed above, the network needs to go through a few preparation steps before it can be fed into our estimation functions.\n",
+ "\n",
+ "The `.onnx` file loaded here is taken from the cybersecurity end2end example notebook. \n",
+ "We pick the onnx file `cybsec_PE_SIMD.onnx` to which the necessary transformations have been applied for this notebook. This means the network layers have been mapped to the necessary FINN-HW blocks, in this case the HLS variants of MatrixVectorActivation, the `MVAU_hls` units. \n",
+ "\n",
+ "To interact with the `.onnx` file we use `ModelWrapper()`. This wrapper simplifies the access to different model attributes and allows us to apply custom transformations on the model.\n",
+ "\n",
+ "In the below cell, we load our onnx file and view the cybersecurity MLP network in Netron. Additionally, we call the transformation `GiveUniqueNodeNames` as a preparation."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "from qonnx.core.modelwrapper import ModelWrapper\n",
+ "from qonnx.transformation.general import GiveUniqueNodeNames\n",
+ "\n",
+ "model = ModelWrapper(os.environ[\"FINN_ROOT\"] + \"/notebooks/advanced/cybsec_PE_SIMD.onnx\")\n",
+ "model = model.transform(GiveUniqueNodeNames())\n",
+ "model_path = os.environ[\"FINN_ROOT\"] + \"/notebooks/advanced/cybsec_PE_SIMD_named_nodes.onnx\"\n",
+ "model.save(model_path)\n",
+ "\n",
+ "showInNetron(model_path)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Part 2 : Parallelization Parameters: PE & SIMD"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The computational parallelism can be varied by setting the folding factors, also called parallelization parameters, **PE** and **SIMD** of each layer. These parallelization attributes are subject to certain constraints and should be selected accordingly.\n",
+ "\n",
+ "To see more details about how this is implemented in the HLS variant of the MatrixVectorActivation layer (`MVAU_hls`), please have a look at [this documentation](https://github.com/Xilinx/finn/blob/github-pages/docs/finn-sheduling-and-folding.pptx). A schematic of the folding in an MVAU for a fully-connected layer is shown below:\n",
+ "\n",
+ ""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In the case of the MVAU, `PE` & `SIMD` are subject to the following constraints: \n",
+ "\n",
+ "If `MW` is the number of input features and `MH` the number of output features:\n",
+ "\n",
+ " MW % SIMD == 0\n",
+ " MH % PE == 0\n",
+ " \n",
+ "Total folding in the case of the MVAU is defined as:\n",
+ "\n",
+ " Total folding = (MH/PE) x (MW/SIMD)\n",
+ "\n",
+ "In a streaming dataflow architecture, as used in FINN designs, the throughput is determined by the slowest layer. So, the goal of adjusting these parameters is to get an almost balanced pipeline, i.e. to equalize the throughput rate of the layers in the generated dataflow architecture.\n",
+ "\n",
+ "The FINN compiler provides analysis passes to facilitate the exploration of the folding factors of each layer. In this notebook we will show how to use these functions and explore how the parallelization parameters affect the clock cycles and the resource utilization of the generated dataflow architecture.\n",
+ "\n",
+ "We start with a naive case where `PE` & `SIMD` values across all layers are 1, this is the starting point of our exploration and is also the state the network is in after the conversion to HLS layers. If you take a look at the model using Netron and click on one of the MVAU layers, you can see that `PE` and `SIMD` are both set to 1 by default."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "showInNetron(model_path)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We import the analysis passes `exp_cycles_per_layer()` and `res_estimation()` to estimate the number of clock cycles and resource utilization of each network layer."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer\n",
+ "from finn.analysis.fpgadataflow.res_estimation import res_estimation"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Analysis passes in FINN return information about the model in form of a dictionary, you can learn more about analysis passes in general in this Jupyter notebook: [0_custom_analysis_pass.ipynb](0_custom_analysis_pass.ipynb).\n",
+ "\n",
+ "We start by calling the analysis pass `exp_cycles_per_layer()`, which returns a dictionary with the layer names as keys and the expected cycles as values. Afterwards, we plot the result in a block diagram."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "cycles_dict = model.analysis(exp_cycles_per_layer)\n",
+ "cycles_dict"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import matplotlib.pyplot as plt\n",
+ "\n",
+ "fig = plt.figure(figsize = (10, 5))\n",
+ "plt.bar(cycles_dict.keys(), cycles_dict.values(), color ='blue', width = 0.3)\n",
+ "plt.xlabel(\"Network layers\")\n",
+ "plt.ylabel(\"Number of clock cycles\")\n",
+ "plt.title(\"Clock cycles per layer PE=SIMD=1\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We observe that the bottleneck in the execution of the model on hardware would come from the execution of the first layer, which takes an estimated 38400 clock cycles to execute one set of its inputs.\n",
+ "\n",
+ "No matter how quickly the other layers execute, the throughput will be defined by the first layer's execution latency.\n",
+ "\n",
+ "Let's have a look now at the estimated resources per layer by calling another analysis pass.\n",
+ "The keys are again the layer names, but the values are now a dictionary with the resource estimates per layer."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "res_dict = model.analysis(res_estimation)\n",
+ "res_dict"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Next to the absolute numbers of LUTs, BRAM, URAM and DSPs, the analysis pass also provides information about the efficiency of the memory usage. If the memory type is not utilized, the efficiency is by default 1. You can see that above for the `URAM_efficiency`. In all other cases the efficiency indicates the actual parameter storage needed divided by the allocated BRAM/URAM storage. So, this means in our example MVAU_hls_0 uses 5 BRAMs, which are 83% utilized. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "After we extract that information from the model, we plot the number of LUTs. In this notebook we concentrate on the influence on the LUT usage, but by manipulating the code below, you can also extract information about memory and DSP usage."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Extracting LUTs from res_dict\n",
+ "LUTs = [res_dict[key][\"LUT\"] for key in res_dict.keys()] \n",
+ "\n",
+ "#Plotting the bar graph of each network layer with their corresponding LUT resource utilization\n",
+ "fig = plt.figure(figsize = (10, 5))\n",
+ "plt.bar(res_dict.keys(), LUTs, color ='green', width = 0.3)\n",
+ "plt.xlabel(\"Network layers\")\n",
+ "plt.ylabel(\"Number of LUTs\")\n",
+ "plt.title(\"No. of LUTs per layer PE=SIMD=1\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Since we identified above that the first layer takes the highest number of cycles to complete the execution, we will now try to adjust the folding parameters to reduce its latency at the expense of an increase in resource utilization."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Modify Parameters\n",
+ "\n",
+ "We now modify the parallelization parameters of the first network layer to reduce its latency.\n",
+ "We only extract the first `MVAU_hls` block from the model and set the parallelization parameters manually.\n",
+ "\n",
+ "In the first step, we left the `PE` & `SIMD` values for all the layers on default (=1) to establish a baseline and measure the estimated clock cycles and resource utilization for each of the individual layers.\n",
+ "\n",
+ "To set `PE` & `SIMD`, we will utilize functionality from the FINN compiler. Each layer type has a Python wrapper which can be instantiated using the `getCustomOp()` function. The wrapper offers several helper functions like `get_nodeattr()` and `set_nodeattr()` to access and set the attributes of a node."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from qonnx.custom_op.registry import getCustomOp\n",
+ "\n",
+ "list_of_mvaus = model.get_nodes_by_op_type(\"MVAU_hls\")\n",
+ "mvau0 = list_of_mvaus[0]\n",
+ "\n",
+ "mvau0_inst = getCustomOp(mvau0)\n",
+ "\n",
+ "# Get the node attributes to check the current setting\n",
+ "print(\"The parallelization parameters of %s were: \" % mvau0.name)\n",
+ "print(\"PE: \" + str(mvau0_inst.get_nodeattr(\"PE\")))\n",
+ "print(\"SIMD: \" + str(mvau0_inst.get_nodeattr(\"SIMD\")))\n",
+ "\n",
+ "# Set the new node attributes\n",
+ "mvau0_inst.set_nodeattr(\"PE\", 2)\n",
+ "mvau0_inst.set_nodeattr(\"SIMD\", 5)\n",
+ "\n",
+ "# Get the node attributes to check the updated setting\n",
+ "print(\"The parallelization parameters of %s are updated to: \" % mvau0.name)\n",
+ "print(\"PE: \" + str(mvau0_inst.get_nodeattr(\"PE\")))\n",
+ "print(\"SIMD: \" + str(mvau0_inst.get_nodeattr(\"SIMD\")))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We save the model and view it. On expanding the first `MVAU_hls` we can see the updated `PE` & `SIMD` parameters for that layer."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.save(\"cybsec_PE_SIMD_modified.onnx\")\n",
+ "showInNetron(\"cybsec_PE_SIMD_modified.onnx\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "From the above total folding formula, we have reduced the total folding of our layer from `600 x 64` to `120 x 32`, resulting in an estimated `10x` decrease in the execution latency of our layer. \n",
+ "This can be observed in the new estimated clock cycles."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "cycles_dict_updated = model.analysis(exp_cycles_per_layer)\n",
+ "cycles_dict_updated"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "fig = plt.figure(figsize = (10, 5))\n",
+ "plt.bar(cycles_dict_updated.keys(), cycles_dict_updated.values(), color ='blue', width = 0.3)\n",
+ "plt.xlabel(\"Network layers\")\n",
+ "plt.ylabel(\"Number of clock cycles\")\n",
+ "plt.title(\"Clock cycles per layer with updated folding factors\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "This has of course consequences for the resource usage of the network."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "res_dict_updated = model.analysis(res_estimation)\n",
+ "res_dict_updated"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Extracting LUTs from res_dict\n",
+ "LUTs_updated = [res_dict_updated[key][\"LUT\"] for key in res_dict_updated.keys()] \n",
+ "\n",
+ "#Plotting the bar graph of each network layer with their corresponding LUT resource utilization\n",
+ "fig = plt.figure(figsize = (10, 5))\n",
+ "plt.bar(res_dict_updated.keys(), LUTs_updated, color ='green', width = 0.3)\n",
+ "plt.xlabel(\"Network Layers\")\n",
+ "plt.ylabel(\"LUT Utilisation\")\n",
+ "plt.title(\"No. of LUTs per layer with updated folding factors\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "From these numbers, we see that the first layer has been removed as the bottleneck and that the entire network can now perform one inference in ~4096 clock cycles (when the pipeline is full) as compared to the earlier configuration where it took ~38400 execution cycles.\n",
+ "\n",
+ "This decrease in execution latency of the network though comes at a cost of a 45% increase in LUT resource utilization for the first layer of the network."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Important Note : StreamingDataWidthConverters"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Besides resources and performance, folding factors (or parallelization parameters) also influence other properties of the generated design. Since a layer can generate results in parallel, the data that gets fed into the layer needs to be packed in a specific format to provide the correct data at the correct time for the internal parallelism. Likewise, the data that comes out of a layer will be in a specific format depending on the internal parallelism."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To analyze the influence of the folding factors on the data streams between layers, we first will import the original model (with `PE=SIMD=1`) and then we will import the updated model, so that we can compare the two of them."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "dir_path = os.environ[\"FINN_ROOT\"] + \"/notebooks/advanced/\" \n",
+ "model_orig = ModelWrapper(dir_path + \"cybsec_PE_SIMD_named_nodes.onnx\")\n",
+ "model_updated = ModelWrapper(\"cybsec_PE_SIMD_modified.onnx\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In the next step we extract the information from all layers. For MVAUs the input shape is (1, MW/SIMD, SIMD) and the output shape is (1, MH/PE, PE)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Original model\n",
+ "list_of_mvaus = model_orig.get_nodes_by_op_type(\"MVAU_hls\")\n",
+ "print(\"In the original model (pe=simd=1): \")\n",
+ "for mvau in list_of_mvaus:\n",
+ " mvau_inst = getCustomOp(mvau)\n",
+ " print(\"Layer: \" + mvau.name)\n",
+ " print(\"Input shape: \" + str(mvau_inst.get_folded_input_shape()))\n",
+ " print(\"Output shape: \" + str(mvau_inst.get_folded_output_shape()))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Updated model\n",
+ "list_of_mvaus = model_updated.get_nodes_by_op_type(\"MVAU_hls\")\n",
+ "print(\"In the updated model: \")\n",
+ "for mvau in list_of_mvaus:\n",
+ " mvau_inst = getCustomOp(mvau)\n",
+ " print(\"Layer: \" + mvau.name)\n",
+ " print(\"Input shape: \" + str(mvau_inst.get_folded_input_shape()))\n",
+ " print(\"Output shape: \" + str(mvau_inst.get_folded_output_shape()))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We can see that the input and output shapes of MVAU_hls_0 have changed after we changed the folding factors. These changes have a direct influence on the in/out stream width. We can have a closer look at the formula to calculate the stream width of an MVAU."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "showSrc(mvau_inst.get_instream_width)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "showSrc(mvau_inst.get_outstream_width)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The input stream width can be calculated by multiplying the input bit width with SIMD and the output stream width can be calculated by multiplying the output bit width with PE."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "To connect two layers with each other for the final design, the input stream width of a node needs to match the output stream width of the preceding node. If that is not the case FINN inserts DataWidthConverters (DWCs) to resolve this mismatch. Let's have a look at the input/output stream width of the layers before updating the parallelization parameters."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Original model\n",
+ "list_of_mvaus = model_orig.get_nodes_by_op_type(\"MVAU_hls\")\n",
+ "print(\"In the original model (pe=simd=1): \")\n",
+ "for mvau in list_of_mvaus:\n",
+ " mvau_inst = getCustomOp(mvau)\n",
+ " print(\"Layer: \" + mvau.name)\n",
+ " print(\"Input stream width: \" + str(mvau_inst.get_instream_width()))\n",
+ " print(\"Output stream width: \" + str(mvau_inst.get_outstream_width()))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In the original model the output stream width of one layer matches the input stream width of the following layer. So there would be no DWC required when generating the final design."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "For the updated model, the situation is different. Let's have a look at how the stream widths have changed."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Updated model\n",
+ "list_of_mvaus = model_updated.get_nodes_by_op_type(\"MVAU_hls\")\n",
+ "print(\"In the updated model: \")\n",
+ "for mvau in list_of_mvaus:\n",
+ " mvau_inst = getCustomOp(mvau)\n",
+ " print(\"Layer: \" + mvau.name)\n",
+ " print(\"Input stream width: \" + str(mvau_inst.get_instream_width()))\n",
+ " print(\"Output stream width: \" + str(mvau_inst.get_outstream_width()))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As we can see, the output stream width of MVAU_hls_0 has now changed to `4`, while the input stream width of MVAU_hls_1 stayed `2`. So, the FINN compiler would insert a DWC between these nodes. We can manually invoke this behavior by first calling the transformation `InsertDWC` and then converting the resulting DWCs into an HLS or RTL variant by calling `SpecializeLayers`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from finn.transformation.fpgadataflow.insert_dwc import InsertDWC\n",
+ "from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers\n",
+ "\n",
+ "model_updated = model_updated.transform(InsertDWC())\n",
+ "model_updated = model_updated.transform(SpecializeLayers())\n",
+ "model_updated = model_updated.transform(GiveUniqueNodeNames())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model_updated.save(\"cybsec_DWC.onnx\")\n",
+ "showInNetron(\"cybsec_DWC.onnx\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We can observe in the model that a DWC was inserted between the first two layers.\n",
+ "Since the DWC will also be a hardware block in our final FINN design, it has a latency and resources associated with it. Let's have a final look at our resource estimates."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model_dwc = ModelWrapper(\"cybsec_DWC.onnx\")\n",
+ "res_dict_dwc = model_dwc.analysis(res_estimation)\n",
+ "res_dict_dwc"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Since we have now one additional layer, we manipulate the data to shorten the layer names in the plot."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "layers = res_dict_dwc.keys()\n",
+ "# replace names of layers with abbreviations\n",
+ "layers = [n.replace(\"StreamingDataWidthConverter_Batch\", \"DWC\") for n in layers]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Extracting LUTs from res_dict\n",
+ "LUTs_dwc = [res_dict_dwc[key][\"LUT\"] for key in res_dict_dwc.keys()] \n",
+ "\n",
+ "#Plotting the bar graph of each network layer with their corresponding LUT resource utilization\n",
+ "fig = plt.figure(figsize = (10, 5))\n",
+ "plt.bar(layers, LUTs_dwc, color ='red', width = 0.3)\n",
+ "plt.xlabel(\"Network Layers\")\n",
+ "plt.ylabel(\"LUT Utilisation\")\n",
+ "plt.title(\"Estimated LUT values used for each network layer\")\n",
+ "plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In the case of our example network, the `StreamingDataWidthConverter_Batch` layer does not consume a large number of LUT resources as shown in the graph. This might be different for larger models and if there are a higher number of DWCs inserted. Please be aware of this when setting the folding factors for your network."
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/notebooks/advanced/4_advanced_builder_settings.ipynb b/notebooks/advanced/4_advanced_builder_settings.ipynb
new file mode 100644
index 0000000000..5139377342
--- /dev/null
+++ b/notebooks/advanced/4_advanced_builder_settings.ipynb
@@ -0,0 +1,1844 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "8fcff912",
+ "metadata": {},
+ "source": [
+ "# Advanced Builder settings\n",
+ "\n",
+ "\n",
+ "\n",
+ "In this notebook, we'll use the FINN compiler to generate an FPGA accelerator with a streaming dataflow architecture from a small convolutional network trained on CIFAR-10. The key idea in streaming dataflow architectures is to parallelize across layers as well as within layers by dedicating a proportionate amount of compute resources to each layer, illustrated on the figure to the left. You can read more about the general concept in the [FINN](https://arxiv.org/pdf/1612.07119) and [FINN-R](https://dl.acm.org/doi/pdf/10.1145/3242897) papers. This is done by mapping each layer to a Vitis HLS or RTL description, parallelizing each layer's implementation to the appropriate degree and using on-chip FIFOs to link up the layers to create the full accelerator.\n",
+ "These implementations offer a good balance of performance and flexibility, but building them by hand is difficult and time-consuming. This is where the FINN compiler comes in: it can build streaming dataflow accelerators from an ONNX description to match the desired throughput."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a830e730",
+ "metadata": {},
+ "source": [
+ "In this tutorial, we will have a more detailed look into the FINN builder tool and explore different options to customize your FINN design. We assume that you have already completed the [Cybersecurity notebooks](../end2end_example/cybersecurity) and that you have a basic understanding of how the FINN compiler works and how to use the FINN builder tool."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5ec9a0db",
+ "metadata": {},
+ "source": [
+ "## Outline\n",
+ "---------------\n",
+ "\n",
+ "1. [Introduction to the CNV-w2a2 network](#intro_cnv)\n",
+ "2. [Recap default builder flow](#recap_builder)\n",
+ "3. [Build steps](#build_step)\n",
+ " 1. [How to create a custom build step](#custom_step)\n",
+ "4. [Specialize layers configuration json](#specialize_layers)\n",
+ "5. [Folding configuration json](#folding_config)\n",
+ "6. [Additional builder arguments](#builder_arg)\n",
+ " 1. [Verification steps](#verify)\n",
+ " 2. [Other builder arguments](#other_args)\n",
+ " 3. [Examples for additional builder arguments & bitfile generation](#example_args)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5dbed63f",
+ "metadata": {},
+ "source": [
+ "## Introduction to the CNV-w2a2 network <a id=\"intro_cnv\"></a>\n",
+ "\n",
+ "The particular quantized neural network (QNN) we will be targeting in this notebook is referred to as CNV-w2a2 and it classifies 32x32 RGB images into one of ten CIFAR-10 classes. All weights and activations in this network are quantized to two bit, with the exception of the input (which is RGB with 8 bits per channel) and the final output (which is 32-bit numbers). It is similar to the convolutional neural network used in the [cnv_end2end_example](../end2end_example/bnn-pynq/cnv_end2end_example.ipynb) Jupyter notebook.\n",
+ "\n",
+ "\n",
+ "You'll have a chance to interactively examine the layers that make up the network in Netron. We start by setting the build directory to the directory this notebook is in and importing helper functions to use in the notebook to examine ONNX graphs and source code."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ce459f3c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from finn.util.visualization import showInNetron, showSrc\n",
+ "import os\n",
+ " \n",
+ "build_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7fc6444c",
+ "metadata": {},
+ "source": [
+ "In the next step, we will export the trained network directly from Brevitas to the QONNX format. QONNX is the intermediate representation (IR) that is used as the frontend to the FINN compiler. Please note that the internal representation of the network is still the FINN-ONNX format. [QONNX and FINN-ONNX](https://finn.readthedocs.io/en/latest/internals.html#intermediate-representation-qonnx-and-finn-onnx) are extensions to the ONNX format to represent quantization, especially below 8 bit, in ONNX graphs. The main difference is that quantization in QONNX graphs is represented using dedicated quantization nodes ([more about QONNX](https://github.com/fastmachinelearning/qonnx)) while the quantization in FINN-ONNX is an annotation attached to the tensors."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fe262964",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import torch\n",
+ "from finn.util.test import get_test_model_trained\n",
+ "from brevitas.export import export_qonnx\n",
+ "from qonnx.util.cleanup import cleanup as qonnx_cleanup\n",
+ "\n",
+ "cnv = get_test_model_trained(\"CNV\", 2, 2)\n",
+ "export_onnx_path = build_dir + \"/end2end_cnv_w2a2_export.onnx\"\n",
+ "export_qonnx(cnv, torch.randn(1, 3, 32, 32), export_onnx_path)\n",
+ "qonnx_cleanup(export_onnx_path, out_file=export_onnx_path)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d24b632f",
+ "metadata": {},
+ "source": [
+ "After the export, we call a clean up function on the model. This makes sure that, for example, all shapes in the network are inferred, constant folding is applied and all tensors and nodes have unique names. In the next step, we can visualize the graph using Netron. When scrolling through the graph, you can see the Quant nodes that indicate the quantization in the network. In the [first step](https://github.com/Xilinx/finn/blob/main/src/finn/builder/build_dataflow_steps.py#L260) of the FINN builder flow, the network gets converted from the QONNX format to the FINN-ONNX format. That means these Quant nodes will not be present in the graph anymore and instead the quantization will be attached as an annotation to the tensors."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "87f59da6",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "showInNetron(build_dir+\"/end2end_cnv_w2a2_export.onnx\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c764ed76",
+ "metadata": {},
+ "source": [
+ "## Quick recap: how to set up the default builder flow for resource estimations <a id=\"recap_builder\"></a>"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a26e5418",
+ "metadata": {},
+ "source": [
+ "As a quick recap, let's set up the builder like we have done in the cybersecurity example to get the resource estimates for our example network."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9007705a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "## Quick recap on how to setup the default builder flow for resource estimations\n",
+ "\n",
+ "import finn.builder.build_dataflow as build\n",
+ "import finn.builder.build_dataflow_config as build_cfg\n",
+ "import os\n",
+ "import shutil\n",
+ "\n",
+ "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"\n",
+ "model_file = model_dir + \"/end2end_cnv_w2a2_export.onnx\"\n",
+ "\n",
+ "estimates_output_dir = build_dir + \"/output_estimates_only\"\n",
+ "\n",
+ "#Delete previous run results if exist\n",
+ "if os.path.exists(estimates_output_dir):\n",
+ " shutil.rmtree(estimates_output_dir)\n",
+ " print(\"Previous run results deleted!\")\n",
+ "\n",
+ "\n",
+ "cfg_estimates = build.DataflowBuildConfig(\n",
+ " output_dir = estimates_output_dir,\n",
+ " mvau_wwidth_max = 80,\n",
+ " target_fps = 10000,\n",
+ " synth_clk_period_ns = 10.0,\n",
+ " fpga_part = \"xc7z020clg400-1\",\n",
+ " steps = build_cfg.estimate_only_dataflow_steps,\n",
+ " generate_outputs=[\n",
+ " build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n",
+ " ]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "02e4c0f0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "build.build_dataflow_cfg(model_file, cfg_estimates);"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4fa0b9f5",
+ "metadata": {},
+ "source": [
+ "The output directory was created and we can extract information about our model and also how it was processed in the FINN compiler from the generated files. Let's focus on the intermediate models for now. You can find them in the output directory in the folder \"intermediate_models\"."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "05a941ef",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!ls -t -r {build_dir}/output_estimates_only/intermediate_models"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d746eff3",
+ "metadata": {},
+ "source": [
+ "After each FINN builder step, the graph is saved as an .onnx file. In the cell above we sort the intermediate models by modification time, oldest first (`ls -t -r`), to visualize the builder flow. As you can see after the conversion to the FINN-ONNX format (`step_qonnx_to_finn`), the graph is prepared by tidy up and streamlining (`step_tidy_up` and `step_streamline`) and then the high level nodes are converted to HW abstraction layers (`step_convert_to_hw`). Then there is a partition created from all layers that were converted to HW layers (`step_create_dataflow_partition`), then we convert each of the HW abstraction layers into an HLS or RTL variant (`step_specialize_layers`). Afterwards optimizations are applied (`step_target_fps_parallelization`, `step_apply_folding_config` and `step_minimize_bit_width`). In the final step of this example we generate resource and performance reports for the network (`step_generate_estimate_reports`). Use the code below to investigate the network after each step."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "72de8d4c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model_to_investigate = \"step_qonnx_to_finn.onnx\"\n",
+ "showInNetron(build_dir+\"/output_estimates_only/intermediate_models/\"+model_to_investigate)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bccebd0d",
+ "metadata": {},
+ "source": [
+ "The analysis of these .onnx files can help us identify points in the flow at which we might need to intervene and provide the compiler with additional information. When investigating the network after the conversion to HW layers, we can see that there are layers that were not converted. We can see this by clicking on the different nodes. HW layers have the module `finn.custom_op.fpgadataflow`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6d86463a",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "showInNetron(build_dir+\"/output_estimates_only/intermediate_models/step_convert_to_hw.onnx\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2719cc09",
+ "metadata": {},
+ "source": [
+ "As you can see in the graph, the first two nodes (a MultiThreshold and Transpose node) and the last two nodes (a Mul and Add node) are not converted into HW layers. FINN currently only converts integer-only operations into HW layers; this means a node is only converted when its input, output & weights are quantized to integers."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ff7fa549",
+ "metadata": {},
+ "source": [
+ "<div class=\"alert alert-block alert-info\">\n",
+ "<b>Important notice:</b> We are working on supporting additional data types and this limitation might disappear in the near future.\n",
+ "</div>"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6e6d942e",
+ "metadata": {},
+ "source": [
+ "When we click on the `global_in` in the graph, we can see that the quantization annotation does not contain a data type. If no data type is set and it cannot be derived from the preceding node, the FINN compiler automatically assumes that the data type is floating point. This is why the first node does not get converted into an HW layer: the input is assumed to be floating point."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8b8994e6",
+ "metadata": {},
+ "source": [
+ "The solution to the problem depends on the actual data input.\n",
+ "1. The data set is quantized and `global_in` is an integer: We set the data type of the tensor `global_in` before passing the model to the FINN compiler using [helper functions of ModelWrapper](https://finn.readthedocs.io/en/latest/internals.html#helper-functions-for-tensors).\n",
+ "2. The data set is not quantized: we can either execute the first layer in software (e.g. as part of the Python driver) or we can add a preprocessing step into the graph."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7504dce7",
+ "metadata": {},
+ "source": [
+ "Even though in the example of the CNVw2a2, the inputs are 32x32 RGB images, so the input values are 8 bit (UINT8) \"quantized\", the input to the exported model is floating point. For training in Brevitas, these values were normalized between 0 and 1.0 and so the exported model expects floating point values as input. \n",
+ "This means we are in scenario 2. In the next section we will develop a custom step for the FINN builder flow to add preprocessing to our network.\n",
+ "\n",
+ "But before we move to the next section, let's take a look at the last two nodes in the graph that were not converted to HW layers."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f9c2696b",
+ "metadata": {},
+ "source": [
+ "We have two nodes at the end of the graph that we were not able to convert: a floating point scalar multiplication and addition. These operations are \"left-over\" from streamlining and cannot be merged into a succeeding thresholding operation. \n",
+ "\n",
+ "Our example is a network for image classification, so the output is a vector of 10 values that give a prediction score for each of the classes in the CIFAR-10 data set. If we are only interested in the Top-1 result of the classification, we can add a post-processing step which inserts a TopK node in the graph. \n",
+ "\n",
+ "Since the last two layers are scalar operations, they have the same influence on all prediction scores in the output vector and we can safely merge them into the TopK node. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4fc8fbf5",
+ "metadata": {},
+ "source": [
+ "These pre-processing and post-processing steps are network dependent and we will need to write **custom steps** that can then be executed using the FINN builder tool.\n",
+ "\n",
+ "In the next section we will first look at what a standard build step inside FINN looks like and then we will write our own custom steps for pre- and post-processing and add them to the builder configuration."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7e561a91",
+ "metadata": {},
+ "source": [
+ "## Build steps <a id=\"build_step\"></a>"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fb18b21d",
+ "metadata": {},
+ "source": [
+ "The following steps are executed when using the `estimates_only`-flow."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f3fe1186",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(\"\\n\".join(build_cfg.estimate_only_dataflow_steps))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "dd3ef987",
+ "metadata": {},
+ "source": [
+ "You can have a closer look at each step by either using the `showSrc()` function or by accessing the doc string."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "313fac18",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import finn.builder.build_dataflow_steps as build_dataflow_steps\n",
+ "print(build_dataflow_steps.step_tidy_up.__doc__)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "029da0da",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import finn.builder.build_dataflow_steps as build_dataflow_steps\n",
+ "showSrc(build_dataflow_steps.step_tidy_up)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2809f6a7",
+ "metadata": {},
+ "source": [
+ "Each step gets the model (`model: ModelWrapper`) and the build configuration (`cfg: DataflowBuildConfig`) as input arguments. Then a certain sequence of transformations is applied to the model. In some of the steps, verification can be run to ensure that the applied transformations have not changed the behaviour of the network. In the end the modified model is returned."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e9c2c97f",
+ "metadata": {},
+ "source": [
+ "### How to create a custom build step <a id=\"custom_step\"></a>"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "537a44e7",
+ "metadata": {},
+ "source": [
+ "When writing our own custom steps, we use the same pattern. See below the code for the pre-processing for the example network."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b9d43cc8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from finn.util.pytorch import ToTensor\n",
+ "from qonnx.transformation.merge_onnx_models import MergeONNXModels\n",
+ "from qonnx.core.modelwrapper import ModelWrapper\n",
+ "from qonnx.core.datatype import DataType\n",
+ "\n",
+ "def custom_step_add_pre_proc(model: ModelWrapper, cfg: build.DataflowBuildConfig):\n",
+ " ishape = model.get_tensor_shape(model.graph.input[0].name)\n",
+ " # preprocessing: torchvision's ToTensor divides uint8 inputs by 255\n",
+ " preproc = ToTensor()\n",
+ " export_qonnx(preproc, torch.randn(ishape), \"preproc.onnx\", opset_version=11)\n",
+ " preproc_model = ModelWrapper(\"preproc.onnx\")\n",
+ " # set input finn datatype to UINT8\n",
+ " preproc_model.set_tensor_datatype(preproc_model.graph.input[0].name, DataType[\"UINT8\"])\n",
+ " # merge pre-processing onnx model with cnv model (passed as input argument)\n",
+ " model = model.transform(MergeONNXModels(preproc_model))\n",
+ " return model\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7a6798aa",
+ "metadata": {},
+ "source": [
+ "In the next step we can modify the builder configuration to execute a custom sequence of builder steps, including the newly implemented pre-processing custom step.\n",
+ "\n",
+ "For that we create a list `build_steps` which contains, in addition to the standard steps from the `estimate_only` flow, the new custom step to add the pre-processing. This list then gets passed in the build configuration."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "6f00b465",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "## Builder flow with custom step for pre-processing\n",
+ "\n",
+ "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"\n",
+ "model_file = model_dir + \"/end2end_cnv_w2a2_export.onnx\"\n",
+ "\n",
+ "output_dir = build_dir + \"/output_pre_proc\"\n",
+ "\n",
+ "#Delete previous run results if exist\n",
+ "if os.path.exists(output_dir):\n",
+ " shutil.rmtree(output_dir)\n",
+ " print(\"Previous run results deleted!\")\n",
+ "\n",
+ "build_steps = [\n",
+ " custom_step_add_pre_proc,\n",
+ " \"step_qonnx_to_finn\",\n",
+ " \"step_tidy_up\",\n",
+ " \"step_streamline\",\n",
+ " \"step_convert_to_hw\",\n",
+ " \"step_create_dataflow_partition\",\n",
+ " \"step_specialize_layers\",\n",
+ " \"step_target_fps_parallelization\",\n",
+ " \"step_apply_folding_config\",\n",
+ " \"step_minimize_bit_width\",\n",
+ " \"step_generate_estimate_reports\",\n",
+ "]\n",
+ "\n",
+ "cfg_estimates = build.DataflowBuildConfig(\n",
+ " output_dir = output_dir,\n",
+ " mvau_wwidth_max = 80,\n",
+ " target_fps = 10000,\n",
+ " synth_clk_period_ns = 10.0,\n",
+ " fpga_part = \"xc7z020clg400-1\",\n",
+ " steps = build_steps,\n",
+ " generate_outputs=[\n",
+ " build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n",
+ " ]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d3a2bcea",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "build.build_dataflow_cfg(model_file, cfg_estimates)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "51b7dbd5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!ls -t -r {build_dir}/output_pre_proc/intermediate_models"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4690049f",
+ "metadata": {},
+ "source": [
+ "An intermediate .onnx file was automatically created after the execution of the custom step. Let's have a look at the graph."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "87e5651e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "showInNetron(build_dir+\"/output_pre_proc/intermediate_models/custom_step_add_pre_proc.onnx\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "90c6bef9",
+ "metadata": {},
+ "source": [
+ "The graph is in QONNX format and a division by 255 is inserted in the beginning. We can now use the CIFAR-10 images directly as input to the graph and the new `global_in` tensor is UINT8.\n",
+ "\n",
+ "You can already have a look at how the intermediate models have changed by modifying the code in the cell above. Before we go into more detail, we will add another custom step to insert the post-processing. In this case this means the insertion of a TopK node."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8c6f1bd0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from qonnx.transformation.insert_topk import InsertTopK\n",
+ "\n",
+ "def custom_step_add_post_proc(model: ModelWrapper, cfg: build.DataflowBuildConfig):\n",
+ " model = model.transform(InsertTopK(k=1))\n",
+ " return model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "57adbb44",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "## Builder flow with custom step for pre-processing and post-processing\n",
+ "\n",
+ "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"\n",
+ "model_file = model_dir + \"/end2end_cnv_w2a2_export.onnx\"\n",
+ "\n",
+ "output_dir = build_dir + \"/output_pre_and_post_proc\"\n",
+ "\n",
+ "#Delete previous run results if exist\n",
+ "if os.path.exists(output_dir):\n",
+ " shutil.rmtree(output_dir)\n",
+ " print(\"Previous run results deleted!\")\n",
+ "\n",
+ "build_steps = [\n",
+ " custom_step_add_pre_proc,\n",
+ " custom_step_add_post_proc,\n",
+ " \"step_qonnx_to_finn\",\n",
+ " \"step_tidy_up\",\n",
+ " \"step_streamline\",\n",
+ " \"step_convert_to_hw\",\n",
+ " \"step_create_dataflow_partition\",\n",
+ " \"step_specialize_layers\",\n",
+ " \"step_target_fps_parallelization\",\n",
+ " \"step_apply_folding_config\",\n",
+ " \"step_minimize_bit_width\",\n",
+ " \"step_generate_estimate_reports\",\n",
+ "]\n",
+ "\n",
+ "cfg_estimates = build.DataflowBuildConfig(\n",
+ " output_dir = output_dir,\n",
+ " mvau_wwidth_max = 80,\n",
+ " target_fps = 10000,\n",
+ " synth_clk_period_ns = 10.0,\n",
+ " fpga_part = \"xc7z020clg400-1\",\n",
+ " steps = build_steps,\n",
+ " generate_outputs=[\n",
+ " build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n",
+ " ]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b0598b81",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "build.build_dataflow_cfg(model_file, cfg_estimates);"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "95230896",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!ls -t -r {build_dir}/output_pre_and_post_proc/intermediate_models"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3a0263b1",
+ "metadata": {},
+ "source": [
+ "You can use the code in the cell below to investigate the generated intermediate models. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "44127417",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model_to_investigate = \"custom_step_add_post_proc.onnx\"\n",
+ "showInNetron(build_dir+\"/output_pre_and_post_proc/intermediate_models/\"+model_to_investigate)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5cc97505",
+ "metadata": {},
+ "source": [
+ "Let's have a look at the model after the conversion to hw, to verify that now all layers are correctly converted."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "63131e3e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "showInNetron(build_dir+\"/output_pre_and_post_proc/intermediate_models/step_convert_to_hw.onnx\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8fd0af6b",
+ "metadata": {},
+ "source": [
+ "The model now contains a `Thresholding` layer at the beginning and a `LabelSelect` layer at the end. Please note that there is still a `Transpose` node as the first layer of the graph, but we can solve this by converting the input data to the NHWC format before streaming it into the FINN accelerator."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "a6edf5c4-9213-45cd-834f-615c12685d9e",
+ "metadata": {},
+ "source": [
+ "## Specialize layers configuration json "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4ae83d6e-c704-4c7f-a922-a4b470c0a55f",
+ "metadata": {},
+ "source": [
+ "The FINN compiler was developed with the assumption that the hardware blocks corresponding to the neural network layers are developed based on HLS. Although we do not want to abolish this HLS implementation at this time, it has become apparent over the years that for certain modules it makes sense to implement them in RTL. This allows us greater control over the resulting hardware and we can make optimal use of FPGA resources.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ed72aabf-0517-422f-a686-6c70e7492114",
+ "metadata": {},
+ "source": [
+ "So, with the growth of more and more RTL variants of common FINN hardware building blocks, we introduced an additional builder step called `step_specialize_layers`. In this step HW nodes get specialized to either an HLS or RTL variant of the node. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "82a2bc39-8a37-49aa-a79d-2818e66ebd11",
+ "metadata": {},
+ "source": [
+ "They get converted either based on pre-determined rules or the user provides a configuration file which contains the desired setting. If the user preference cannot be fulfilled, a warning will be printed and the implementation style will be set to a default. "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bc90b589-7a92-4996-9704-02736ac4e60e",
+ "metadata": {},
+ "source": [
+ "The builder flow step before `step_create_dataflow_partition` generates a template json file to set the preferred implementation style per layer. We can copy it from one of the previous runs to this folder and manipulate it to pass it to a new build."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ddb88eb1-3f11-4343-ae7c-3e5e8cbc34dc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "\n",
+ "with open(build_dir+\"/output_pre_and_post_proc/template_specialize_layers_config.json\", 'r') as json_file:\n",
+ " specialize_layers_config = json.load(json_file)\n",
+ "\n",
+ "print(json.dumps(specialize_layers_config, indent=1))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "158d7d8c-a072-4a50-9714-43ebaefa53d1",
+ "metadata": {},
+ "source": [
+ "As you can see, each node is listed in the .json file and an empty string for the node attribute `preferred_impl_style` is instantiated by default. We can now use this .json and set the `preferred_impl_style` to pass to a new builder flow."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3f464d35-6774-4751-80b4-b6230e501539",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open(build_dir+\"/output_pre_and_post_proc/template_specialize_layers_config.json\", 'r') as json_file:\n",
+ " specialize_layers_config = json.load(json_file)\n",
+ "\n",
+ "# Set all preferred_impl_style to all HLS\n",
+ "for key in specialize_layers_config:\n",
+ " if \"preferred_impl_style\" in specialize_layers_config[key]:\n",
+ " specialize_layers_config[key][\"preferred_impl_style\"] = \"hls\" \n",
+ "# Save as .json \n",
+ "with open(\"specialize_layers_all_hls.json\", \"w\") as jsonFile:\n",
+ " json.dump(specialize_layers_config, jsonFile)\n",
+ " \n",
+ "# Set SWG to RTL variant\n",
+ "for key in specialize_layers_config:\n",
+ " if \"preferred_impl_style\" in specialize_layers_config[key]:\n",
+ " if key.startswith(\"ConvolutionInputGenerator\"):\n",
+ " specialize_layers_config[key][\"preferred_impl_style\"] = \"rtl\"\n",
+ " else:\n",
+ " specialize_layers_config[key][\"preferred_impl_style\"] = \"hls\" \n",
+ "# Save as .json \n",
+ "with open(\"specialize_layers_swg_rtl.json\", \"w\") as jsonFile:\n",
+ " json.dump(specialize_layers_config, jsonFile)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "52592ea6-cd12-46b9-af91-5960b4749e7e",
+ "metadata": {},
+ "source": [
+ "We created two `specialize_layers_config_files`:\n",
+ "* One which sets all layers to `\"hls\"`\n",
+ "* One that sets `preferred_impl_style` for the ConvolutionInputGenerator to `\"rtl\"`"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "701905d8-c5cc-4cc0-b872-156c5b9d0432",
+ "metadata": {},
+ "source": [
+ "In the following we will set up two build flows and run them to the estimate reports step. Afterwards we will investigate the intermediate .onnx files and compare the two runs."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "22ff1a91-7ef7-44cb-86d3-60b9af7a8c5e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "## Build flow with custom specialize layers configuration\n",
+ "## specialize_layers_config_file = \"specialize_layers_all_hls.json\"\n",
+ "\n",
+ "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"\n",
+ "model_file = model_dir + \"/end2end_cnv_w2a2_export.onnx\"\n",
+ "\n",
+ "output_dir = build_dir + \"/output_all_hls\"\n",
+ "\n",
+ "#Delete previous run results if exist\n",
+ "if os.path.exists(output_dir):\n",
+ " shutil.rmtree(output_dir)\n",
+ " print(\"Previous run results deleted!\")\n",
+ "\n",
+ "build_steps = [\n",
+ " custom_step_add_pre_proc,\n",
+ " custom_step_add_post_proc,\n",
+ " \"step_qonnx_to_finn\",\n",
+ " \"step_tidy_up\",\n",
+ " \"step_streamline\",\n",
+ " \"step_convert_to_hw\",\n",
+ " \"step_create_dataflow_partition\",\n",
+ " \"step_specialize_layers\",\n",
+ " \"step_apply_folding_config\",\n",
+ " \"step_minimize_bit_width\",\n",
+ " \"step_generate_estimate_reports\",\n",
+ "]\n",
+ "\n",
+ "cfg_estimates = build.DataflowBuildConfig(\n",
+ " output_dir = output_dir,\n",
+ " mvau_wwidth_max = 80,\n",
+ " synth_clk_period_ns = 10.0,\n",
+ " fpga_part = \"xc7z020clg400-1\",\n",
+ " steps = build_steps,\n",
+ " specialize_layers_config_file = \"specialize_layers_all_hls.json\",\n",
+ " generate_outputs=[\n",
+ " build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n",
+ " ]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c9df41ff-ef6a-4d0e-ab36-241bb11ed241",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "build.build_dataflow_cfg(model_file, cfg_estimates);"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ff617f21-6001-4bb7-9cf7-2cc2acd3fbec",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "## Build flow with custom specialize layers configuration\n",
+ "## specialize_layers_config_file = \"specialize_layers_swg_rtl.json\"\n",
+ "\n",
+ "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"\n",
+ "model_file = model_dir + \"/end2end_cnv_w2a2_export.onnx\"\n",
+ "\n",
+ "output_dir = build_dir + \"/output_swg_rtl\"\n",
+ "\n",
+ "#Delete previous run results if exist\n",
+ "if os.path.exists(output_dir):\n",
+ " shutil.rmtree(output_dir)\n",
+ " print(\"Previous run results deleted!\")\n",
+ "\n",
+ "build_steps = [\n",
+ " custom_step_add_pre_proc,\n",
+ " custom_step_add_post_proc,\n",
+ " \"step_qonnx_to_finn\",\n",
+ " \"step_tidy_up\",\n",
+ " \"step_streamline\",\n",
+ " \"step_convert_to_hw\",\n",
+ " \"step_create_dataflow_partition\",\n",
+ " \"step_specialize_layers\",\n",
+ " \"step_apply_folding_config\",\n",
+ " \"step_minimize_bit_width\",\n",
+ " \"step_generate_estimate_reports\",\n",
+ "]\n",
+ "\n",
+ "cfg_estimates = build.DataflowBuildConfig(\n",
+ " output_dir = output_dir,\n",
+ " mvau_wwidth_max = 80,\n",
+ " synth_clk_period_ns = 10.0,\n",
+ " fpga_part = \"xc7z020clg400-1\",\n",
+ " steps = build_steps,\n",
+ " specialize_layers_config_file = \"specialize_layers_swg_rtl.json\",\n",
+ " generate_outputs=[\n",
+ " build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n",
+ " ]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8f48ba95-f7b5-455b-8041-25b7341ad115",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "build.build_dataflow_cfg(model_file, cfg_estimates);"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bed4bedd-397d-4bd1-8531-c6ceac306715",
+ "metadata": {},
+ "source": [
+ "First we are looking into the intermediate model after `step_create_dataflow_partition` and then after `step_specialize_layers`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4e64db23-98cb-494b-851f-3cc2c3847451",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "showInNetron(build_dir+\"/output_all_hls/intermediate_models/step_create_dataflow_partition.onnx\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3e1a6351-367f-47a6-b802-a2613ea455a1",
+ "metadata": {},
+ "source": [
+ "Let's have a look first at the model which we specialize to \"all HLS\"."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f85d6c42-153d-4a40-b3cc-a4c8c89fe636",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "showInNetron(build_dir+\"/output_all_hls/intermediate_models/step_specialize_layers.onnx\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e1520920-b7de-42a5-9ec8-e8503992fbd1",
+ "metadata": {},
+ "source": [
+ "As you can see, each op type now has a suffix indicating that it is an HLS variant of the node. Additionally, when you click on one of the nodes in the Netron visualization, you can see that the module is set to `finn.custom_op.fpgadataflow.hls`.\n",
+ "\n",
+ "Let's now have a look at the model in which we specialized the ConvolutionInputGenerator to `\"rtl\"`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9f1f26a0-3a62-4920-bf40-5b1b798fa02e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "showInNetron(build_dir+\"/output_swg_rtl/intermediate_models/step_specialize_layers.onnx\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3f9c4de4-61ef-4698-ab23-87bf5953c5ae",
+ "metadata": {},
+ "source": [
+ "You can use the cells above to try out different settings and pass them to the builder flow. Please note that not all layers have HLS and RTL variants, so it might be that the setting you define in `specialize_layers_config.json` gets ignored and a sensible default is set instead. The FINN compiler will display a warning in this case."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "5ffbadd1",
+ "metadata": {},
+ "source": [
+ "## Folding configuration json "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c164040f",
+ "metadata": {},
+ "source": [
+ "The FINN compiler allows the user to implement a network in a streaming dataflow architecture; this means every layer is implemented individually and the data is streamed through the accelerator. We can customize each layer for specific performance and resource requirements by adjusting the parallelism and resource type of each layer. In the FINN context we refer to this customization of parallelism in each layer as folding. To learn more details about the influence of folding factors/parallelism in FINN, please have a look at our [folding tutorial](./3_folding.ipynb).\n",
+ "\n",
+ "In this section, we will look into the interface over which we can influence the customization of each layer using the FINN builder tool: A json file containing the folding configuration."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1299b86d",
+ "metadata": {},
+ "source": [
+ "Depending on the invoked step, the FINN compiler can produce or consume a .json file containing the folding configuration for each layer. In the cell below, we will have a look at the automatically generated .json file, which is produced by `step_target_fps_parallelization`. We then use this as a starting point to manipulate the folding configuration and feed it back into the builder tool."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f75f5634",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import json\n",
+ "\n",
+ "with open(build_dir+\"/output_pre_and_post_proc/auto_folding_config.json\", 'r') as json_file:\n",
+ " folding_config = json.load(json_file)\n",
+ "\n",
+ "print(json.dumps(folding_config, indent=1))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8de787a7",
+ "metadata": {},
+ "source": [
+ "As you can see from the printed cell above, the keys in the .json file are the node names of the layers in our network. For each of the layers, some node attributes are listed:\n",
+ "* `PE` and `SIMD` are the folding parameters that determine the parallelism of each layer, depending on the layer they can be set to different values, for details refer to [this table](https://finn-dev.readthedocs.io/en/latest/internals.html#constraints-to-folding-factors-per-layer).\n",
+ "* `mem_mode`: determines if the parameter memory will be implemented as part of the HLS/RTL code (`const`) or instantiated separately and connected with the layer over a memory streamer unit (`decoupled`). You can find more details in this part of the documentation: https://finn-dev.readthedocs.io/en/latest/internals.html#matrixvectoractivation-mem-mode . It is also possible to set the mem_mode to `external`, which allows for the implementation of external weights.\n",
+ "* `ram_style`: when selecting `decoupled` mode, the FINN compiler allows us to choose which memory resource will be used for the layer. The argument `ram_style` is set to the selected memory type:\n",
+ " * `auto`: Vivado will make the decision if the implementation is using LUTRAM or BRAM\n",
+ " * `distributed`: LUTRAM will be used\n",
+ " * `block`: BRAM will be used\n",
+ " * `ultra`: URAM will be used, if available on the selected board\n",
+ "\n",
+ "* `resType`: This is a node attribute for the MVAU layer and can be set to `lut` or `dsp`. Please note that selecting `dsp` will not enable the optimized RTL variant of the MVAU but rather generate HLS code utilizing DSPs, this is not optimal yet but can give an additional parameter for design space exploration.\n",
+ "* `runtime_writeable_weights`: FINN offers the option to implement the weights as \"runtime writable\", this means you can write the weight values from the driver via an axilite interface."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "fd1519fe",
+ "metadata": {},
+ "source": [
+ "In the following part of the tutorial, we will use the auto-generated json file as a starting point to create two new json files which explore the `ram_style` attribute. We will use one of the generated reports from the FINN builder to see the impact of these changes.\n",
+ "For that, we will extract the total resources from the *estimate_layer_resources.json* report in the following cell."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f7f42774",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open(build_dir+\"/output_pre_and_post_proc/report/estimate_layer_resources.json\", 'r') as json_file:\n",
+ " json_object = json.load(json_file)\n",
+ "\n",
+ "print(json.dumps(json_object[\"total\"], indent=1))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0be3b0e1",
+ "metadata": {},
+ "source": [
+ "The FINN compiler estimates the network to use ~500 BRAM blocks and ~100k LUTs."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d4d177dc",
+ "metadata": {},
+ "source": [
+ "We will use the `auto_folding_config.json` and create two folding configurations from that file:\n",
+ "* All `ram_style` attributes set to `distributed`\n",
+ "* All `ram_style` attributes set to `block`"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "112af6fd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open(build_dir+\"/output_pre_and_post_proc/auto_folding_config.json\", 'r') as json_file:\n",
+ " folding_config = json.load(json_file)\n",
+ "\n",
+ "# Set all ram_style to LUT RAM\n",
+ "for key in folding_config:\n",
+ " if \"ram_style\" in folding_config[key]:\n",
+ " folding_config[key][\"ram_style\"] = \"distributed\" \n",
+ "# Save as .json \n",
+ "with open(\"folding_config_all_lutram.json\", \"w\") as jsonFile:\n",
+ " json.dump(folding_config, jsonFile)\n",
+ " \n",
+ "# Set all ram_style to BRAM\n",
+ "for key in folding_config:\n",
+ " if \"ram_style\" in folding_config[key]:\n",
+ " folding_config[key][\"ram_style\"] = \"block\" \n",
+ "# Save as .json \n",
+ "with open(\"folding_config_all_bram.json\", \"w\") as jsonFile:\n",
+ " json.dump(folding_config, jsonFile)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0e64a499",
+ "metadata": {},
+ "source": [
+ "After generating these files, we will invoke the builder flow. To enable the FINN builder to take the generated folding configuration as input, we will need to set the additional builder argument `folding_config_file` and we will change the `build_steps` to not run `step_target_fps_parallelization`. The build step does not necessarily need to be excluded, but since we pass a separate folding configuration, the output from that step would be overwritten anyway, so we skip it for a faster execution."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cdd9f706",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "## Build flow with custom folding configuration\n",
+ "## folding_config_file = \"folding_config_all_lutram.json\"\n",
+ "\n",
+ "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"\n",
+ "model_file = model_dir + \"/end2end_cnv_w2a2_export.onnx\"\n",
+ "\n",
+ "output_dir = build_dir + \"/output_all_lutram\"\n",
+ "\n",
+ "#Delete previous run results if exist\n",
+ "if os.path.exists(output_dir):\n",
+ " shutil.rmtree(output_dir)\n",
+ " print(\"Previous run results deleted!\")\n",
+ "\n",
+ "build_steps = [\n",
+ " custom_step_add_pre_proc,\n",
+ " custom_step_add_post_proc,\n",
+ " \"step_qonnx_to_finn\",\n",
+ " \"step_tidy_up\",\n",
+ " \"step_streamline\",\n",
+ " \"step_convert_to_hw\",\n",
+ " \"step_create_dataflow_partition\",\n",
+ " \"step_specialize_layers\",\n",
+ " \"step_apply_folding_config\",\n",
+ " \"step_minimize_bit_width\",\n",
+ " \"step_generate_estimate_reports\",\n",
+ "]\n",
+ "\n",
+ "cfg_estimates = build.DataflowBuildConfig(\n",
+ " output_dir = output_dir,\n",
+ " mvau_wwidth_max = 80,\n",
+ " synth_clk_period_ns = 10.0,\n",
+ " fpga_part = \"xc7z020clg400-1\",\n",
+ " steps = build_steps,\n",
+ " folding_config_file = \"folding_config_all_lutram.json\",\n",
+ " generate_outputs=[\n",
+ " build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n",
+ " ]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "99b647c0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "build.build_dataflow_cfg(model_file, cfg_estimates);"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e705767d",
+ "metadata": {},
+ "source": [
+ "We can now have a look at the produced model. When clicking on the individual nodes, you can see that all layers have the node attribute `ram_style` set to `distributed`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cc680178",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "showInNetron(build_dir+\"/output_all_lutram/intermediate_models/step_generate_estimate_reports.onnx\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "695ecfb1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open(build_dir+\"/output_all_lutram/report/estimate_layer_resources.json\", 'r') as json_file:\n",
+ " json_object = json.load(json_file)\n",
+ "\n",
+ "print(json.dumps(json_object[\"total\"], indent=1))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "55208c70",
+ "metadata": {},
+ "source": [
+ "The estimation report shows that BRAM utilization is down to zero and the LUT count went up to around 150k."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "11b8430a",
+ "metadata": {},
+ "source": [
+ "Let's do the same with the folding configuration which sets all memory resources to use BRAM."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "59e8aaaa",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "## Build flow with custom folding configuration\n",
+ "## folding_config_file = \"folding_config_all_bram.json\"\n",
+ "\n",
+ "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"\n",
+ "model_file = model_dir + \"/end2end_cnv_w2a2_export.onnx\"\n",
+ "\n",
+ "output_dir = build_dir + \"/output_all_bram\"\n",
+ "\n",
+ "#Delete previous run results if exist\n",
+ "if os.path.exists(output_dir):\n",
+ " shutil.rmtree(output_dir)\n",
+ " print(\"Previous run results deleted!\")\n",
+ "\n",
+ "build_steps = [\n",
+ " custom_step_add_pre_proc,\n",
+ " custom_step_add_post_proc,\n",
+ " \"step_qonnx_to_finn\",\n",
+ " \"step_tidy_up\",\n",
+ " \"step_streamline\",\n",
+ " \"step_convert_to_hw\",\n",
+ " \"step_create_dataflow_partition\",\n",
+ " \"step_specialize_layers\",\n",
+ " \"step_apply_folding_config\",\n",
+ " \"step_minimize_bit_width\",\n",
+ " \"step_generate_estimate_reports\",\n",
+ "]\n",
+ "\n",
+ "cfg_estimates = build.DataflowBuildConfig(\n",
+ " output_dir = output_dir,\n",
+ " mvau_wwidth_max = 80,\n",
+ " synth_clk_period_ns = 10.0,\n",
+ " fpga_part = \"xc7z020clg400-1\",\n",
+ " steps = build_steps,\n",
+ " folding_config_file = \"folding_config_all_bram.json\",\n",
+ " generate_outputs=[\n",
+ " build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n",
+ " ]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2cdc1aa0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "build.build_dataflow_cfg(model_file, cfg_estimates);"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cd0388fd",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "showInNetron(build_dir+\"/output_all_bram/intermediate_models/step_generate_estimate_reports.onnx\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e60a3efb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "with open(build_dir+\"/output_all_bram/report/estimate_layer_resources.json\", 'r') as json_file:\n",
+ " json_object = json.load(json_file)\n",
+ "\n",
+ "print(json.dumps(json_object[\"total\"], indent=1))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "97f87780",
+ "metadata": {},
+ "source": [
+ "The initial implementation already had a high utilization of BRAM, but the estimates now went up to ~500 BRAMs while the LUT count went down to ~99k."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e65a8ded",
+ "metadata": {},
+ "source": [
+ "You can use this example as a starting point to manipulate the folding configuration yourself. Instead of using the above code, you can also manually open one of the example .json files and set the values differently. Please be aware that the node attributes cannot be set to arbitrary values. Especially the folding factors need to fulfil [certain constraints](https://finn-dev.readthedocs.io/en/latest/internals.html#constraints-to-folding-factors-per-layer). The other settings for node attributes can be best looked up in the individual custom operator classes: [e.g. for MVAU](https://github.com/Xilinx/finn/blob/dev/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py#L64)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4a675834",
+ "metadata": {},
+ "source": [
+ "## Additional builder arguments "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "f7012b9a",
+ "metadata": {},
+ "source": [
+ "In this section, we will take a peek at additional builder arguments the FINN compiler exposes. We will not be able to cover all of them, but you will be able to have a look at a list, and we encourage you to take your time to look into the different options there are to customize the FINN builder configuration."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "467d8829",
+ "metadata": {},
+ "source": [
+ "We start by enabling the verification flow in the builder. The FINN compiler applies multiple transformations to the model before it gets turned into hardware, so we need to make sure that the functional behavior of the network does not change."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e0c167f4",
+ "metadata": {},
+ "source": [
+ "### Verification steps "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "308d52ba",
+ "metadata": {},
+ "source": [
+ "Earlier in the tutorial, we had a look at how build steps are written. When investigating the `step_tidy_up`, we can see that before the changed model is returned a verification step can be run. In the case of `step_tidy_up` it is the step `\"initial python\"` that can be initiated by setting `VerificationStepType.TIDY_UP_PYTHON`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4fe7318e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import finn.builder.build_dataflow_steps as build_dataflow_steps\n",
+ "showSrc(build_dataflow_steps.step_tidy_up)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "2bbb84fb",
+ "metadata": {},
+ "source": [
+ "Some of the default build steps have automatic verification enabled, when the corresponding verification step is set."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ce1aa025",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "showSrc(build_cfg.VerificationStepType)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "da1a2b88",
+ "metadata": {},
+ "source": [
+ "In the cells below, we will use an example input from the CIFAR-10 data set and use the forward pass in Brevitas to generate a reference output. We save the input as `input.npy` and the reference output as `expected_output.npy`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e157d03c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Get golden io pair from Brevitas and save as .npy files\n",
+ "from finn.util.test import get_trained_network_and_ishape, get_example_input, get_topk\n",
+ "import numpy as np\n",
+ "\n",
+ "\n",
+ "(brevitas_model, ishape) = get_trained_network_and_ishape(\"cnv\", 2, 2)\n",
+ "input_tensor_npy = get_example_input(\"cnv\")\n",
+ "input_tensor_torch = torch.from_numpy(input_tensor_npy).float()\n",
+ "input_tensor_torch = ToTensor().forward(input_tensor_torch).detach()\n",
+ "output_tensor_npy = brevitas_model.forward(input_tensor_torch).detach().numpy()\n",
+ "output_tensor_npy = get_topk(output_tensor_npy, k=1)\n",
+ "\n",
+ "np.save(\"input.npy\", input_tensor_npy)\n",
+ "np.save(\"expected_output.npy\", output_tensor_npy)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "d03450e7",
+ "metadata": {},
+ "source": [
+ "In the next step we set up the builder flow again, this time we will set the build argument `verify_steps` and pass a list of verification steps."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5cd3032b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "## Build flow with additional builder arguments enabled\n",
+ "## verification steps\n",
+ "\n",
+ "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"\n",
+ "model_file = model_dir + \"/end2end_cnv_w2a2_export.onnx\"\n",
+ "\n",
+ "output_dir = build_dir + \"/output_with_verification\"\n",
+ "\n",
+ "#Delete previous run results if exist\n",
+ "if os.path.exists(output_dir):\n",
+ " shutil.rmtree(output_dir)\n",
+ " print(\"Previous run results deleted!\")\n",
+ "\n",
+ "build_steps = [\n",
+ " custom_step_add_pre_proc,\n",
+ " custom_step_add_post_proc,\n",
+ " \"step_qonnx_to_finn\",\n",
+ " \"step_tidy_up\",\n",
+ " \"step_streamline\",\n",
+ " \"step_convert_to_hw\",\n",
+ " \"step_create_dataflow_partition\",\n",
+ " \"step_specialize_layers\",\n",
+ " \"step_target_fps_parallelization\",\n",
+ " \"step_apply_folding_config\",\n",
+ " \"step_minimize_bit_width\",\n",
+ " \"step_generate_estimate_reports\",\n",
+ "]\n",
+ "\n",
+ "cfg_estimates = build.DataflowBuildConfig(\n",
+ " output_dir = output_dir,\n",
+ " mvau_wwidth_max = 80,\n",
+ " target_fps = 10000,\n",
+ " synth_clk_period_ns = 10.0,\n",
+ " fpga_part = \"xc7z020clg400-1\",\n",
+ " steps = build_steps,\n",
+ " generate_outputs=[\n",
+ " build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n",
+ " ],\n",
+ " verify_steps=[\n",
+ " build_cfg.VerificationStepType.QONNX_TO_FINN_PYTHON,\n",
+ " build_cfg.VerificationStepType.TIDY_UP_PYTHON,\n",
+ " build_cfg.VerificationStepType.STREAMLINED_PYTHON,\n",
+ " ]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "1d05b985",
+ "metadata": {},
+ "source": [
+ "When executing the code below, the verification will be invoked in the background. After the execution we can check if the verification was successful by investigating the output directory."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a3a46e76",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "build.build_dataflow_cfg(model_file, cfg_estimates);"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ca1d571d",
+ "metadata": {},
+ "source": [
+ "The output directory now has an additional directory called `verification_output`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ca74d537",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!ls {build_dir}/output_with_verification"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "908ecda4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "!ls {build_dir}/output_with_verification/verification_output"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "bcbc6f49",
+ "metadata": {},
+ "source": [
+ "The directory contains three .npy files. These files are the saved output files from the different verification steps. The suffix indicates if the array matches with the expected output. In our case, the suffix is `_SUCCESS` for all verification steps. Since the outputs are saved as .npy, we can open and investigate the files simply in Python."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "7a1b6ca9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "verify_initial_python = np.load(build_dir + \"/output_with_verification/verification_output/verify_initial_python_0_SUCCESS.npy\")\n",
+ "print(\"The output of the verification step after the step_tidy_up is: \" + str(verify_initial_python))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "6558e19e",
+ "metadata": {},
+ "source": [
+ "If the generated output does not match the expected output, these files can be used for debugging."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "4609f94d",
+ "metadata": {},
+ "source": [
+ "### Other builder arguments "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "37b6853d",
+ "metadata": {},
+ "source": [
+ "Next to the enablement of the verification flows, the FINN builder has numerous additional builder arguments to further customize your network. \n",
+ "Let's have a look at the options for the arguments. We want to only filter out the FINN specific arguments."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e9f6aa29",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Filter out methods\n",
+ "builder_args = [m for m in dir(build_cfg.DataflowBuildConfig) if not m.startswith('_')]\n",
+ "print(\"\\n\".join(builder_args))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b12ab370",
+ "metadata": {},
+ "source": [
+ "There are attributes that come from the dataclasses-json class: `to_dict`, `to_json`, `schema`, `from_json`, `from_dict`. This class is used for the implementation of the FINN builder. In this tutorial, we are mainly interested in the FINN specific arguments. \n",
+ "\n",
+ "Some of these arguments we have seen already in the Cybersecurity notebook and in this notebook, e.g. `target_fps`, `fpga_part` and `folding_config_file`. In the code of the FINN builder, the function of each builder argument is documented; you can have a look [here](https://github.com/Xilinx/finn/blob/dev/src/finn/builder/build_dataflow_config.py#L155) and scroll through the available builder arguments."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "9aba0493",
+ "metadata": {},
+ "source": [
+ "So far, in this notebook, we only looked at configurations up to the generation of estimate reports, a lot of these builder arguments actually become relevant at a later stage in the FINN flow.\n",
+ "\n",
+ "Let's have a look at the default build dataflow steps for the complete FINN flow."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ec39b9f2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(\"\\n\".join(build_cfg.default_build_dataflow_steps))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b9bc5715",
+ "metadata": {},
+ "source": [
+ "You can see that after the generation of the estimate reports, the code generation and the IP generation are invoked (`step_hw_codegen` and `step_hw_ipgen`). The FIFO depths are determined and the FIFOs are inserted in the network (`step_set_fifo_depths`); we can then create an IP design of our whole network by stitching the IPs from each layer together (`step_create_stitched_ip`). At this point we have an implementation of the neural network that we can integrate within a bigger FPGA design; we can run performance measurements using simulation (`step_measure_rtlsim_performance`) and out-of-context synthesis (`step_out_of_context_synthesis`) for it.\n",
+ "The FINN builder also provides automatic system integration for Zynq and Alveo devices, this can be invoked by running `step_synthesize_bitfile`, `step_make_pynq_driver` and `step_deployment_package`."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "76df000f",
+ "metadata": {},
+ "source": [
+ "You can have a closer look at each step by either using the `showSrc()` function or by accessing the doc string."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "caf49f03",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import finn.builder.build_dataflow_steps as build_dataflow_steps\n",
+ "print(build_dataflow_steps.step_hw_codegen.__doc__)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c84a9fbc",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "showSrc(build_dataflow_steps.step_hw_codegen)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "c249f141",
+ "metadata": {},
+ "source": [
+ "This concludes the advanced builder settings tutorial. Below you can find code that can help you investigate more of the builder arguments and invoke the whole flow to generate a bitfile."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3b98eb65",
+ "metadata": {},
+ "source": [
+ "### Example for additional builder arguments & bitfile generation "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "0dbdab42",
+ "metadata": {},
+ "source": [
+ "#### Standalone Thresholds"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "e21ff36f",
+ "metadata": {},
+ "source": [
+ "In FINN, convolutions are expressed with three components:\n",
+ "* An Im2Col operation\n",
+ "* A matrix multiplication\n",
+ "* A MultiThreshold operation\n",
+ "\n",
+ "When converting these nodes into HW layers, by default the MatMul and the MultiThreshold get converted into **one** component called Matrix-Vector-Activation Unit (MVAU). But the FINN compiler allows us to implement the activation separately. This gives an additional possibility for customization because we can adjust the folding parameters of the standalone threshold unit independently. \n",
+ "\n",
+ "If you would like to enable this feature, you can set the build argument `standalone_thresholds` to `True`. In the code below this feature is enabled and you can have a look at the generated .onnx file. Please note that you need to uncomment the code first."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "2619ebde",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "## Build flow with additional builder arguments enabled\n",
+ "## standalone_thresholds = True\n",
+ "\n",
+ "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"\n",
+ "model_file = model_dir + \"/end2end_cnv_w2a2_export.onnx\"\n",
+ "\n",
+ "output_dir = build_dir + \"/output_standalone_thresholds\"\n",
+ "\n",
+ "#Delete previous run results if exist\n",
+ "if os.path.exists(output_dir):\n",
+ " shutil.rmtree(output_dir)\n",
+ " print(\"Previous run results deleted!\")\n",
+ "\n",
+ "build_steps = [\n",
+ " custom_step_add_pre_proc,\n",
+ " custom_step_add_post_proc,\n",
+ " \"step_qonnx_to_finn\",\n",
+ " \"step_tidy_up\",\n",
+ " \"step_streamline\",\n",
+ " \"step_convert_to_hw\",\n",
+ " \"step_create_dataflow_partition\",\n",
+ " \"step_specialize_layers\",\n",
+ " \"step_target_fps_parallelization\",\n",
+ " \"step_apply_folding_config\",\n",
+ " \"step_minimize_bit_width\",\n",
+ " \"step_generate_estimate_reports\",\n",
+ "]\n",
+ "\n",
+ "cfg_estimates = build.DataflowBuildConfig(\n",
+ " output_dir = output_dir,\n",
+ " mvau_wwidth_max = 80,\n",
+ " target_fps = 10000,\n",
+ " synth_clk_period_ns = 10.0,\n",
+ " fpga_part = \"xc7z020clg400-1\",\n",
+ " standalone_thresholds = True,\n",
+ " steps = build_steps,\n",
+ " generate_outputs=[\n",
+ " build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n",
+ " ],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "b2e9bc42",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#%%time\n",
+ "#build.build_dataflow_cfg(model_file, cfg_estimates);"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "32ae296e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#showInNetron(build_dir+\"/output_standalone_thresholds/intermediate_models/step_generate_estimate_reports.onnx\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "601eb5f8",
+ "metadata": {},
+ "source": [
+ "#### Run the whole flow"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "42aa929b",
+ "metadata": {},
+ "source": [
+ "The code below can be used to invoke the full builder flow and obtain more output products, be aware that this runs synthesis and bitfile generation and it might take over an hour. Please note that you need to uncomment the code first."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "ffa2a352",
+ "metadata": {},
+ "source": [
+ "For an optimized design, we saved a local copy of the folding configuration for cnv-w2a2 on the Pynq-Z1 board from [finn-examples](https://github.com/Xilinx/finn-examples) in this folder and will pass it to the build flow. Please also note below that we now pass the board as argument to the builder (`board = \"Pynq-Z1\"`) instead of just the FPGA part. This time we will select all possible outputs to generate. Please be aware that running the full build might take a few hours."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "8d1b041f-027c-444e-81ac-98ce9b6d1b51",
+ "metadata": {},
+ "source": [
+ "Note that we set one additional argument: `default_swg_exception = True`. This is done because this example is customized to fit on the Pynq-Z1 board, to optimize the resources we remove FIFOs between SWGs and MVAUs manually to avoid unnecessary buffering."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4efd46f4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import finn.builder.build_dataflow as build\n",
+ "import finn.builder.build_dataflow_config as build_cfg\n",
+ "import os\n",
+ "import shutil\n",
+ "\n",
+ "## Build flow with hardware build\n",
+ "\n",
+ "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"\n",
+ "model_file = model_dir + \"/end2end_cnv_w2a2_export.onnx\"\n",
+ "\n",
+ "output_dir = build_dir + \"/output_bitfile\"\n",
+ "\n",
+ "#Delete previous run results if exist\n",
+ "if os.path.exists(output_dir):\n",
+ " shutil.rmtree(output_dir)\n",
+ " print(\"Previous run results deleted!\")\n",
+ "\n",
+ "build_steps = [\n",
+ " custom_step_add_pre_proc,\n",
+ " custom_step_add_post_proc,\n",
+ " \"step_qonnx_to_finn\",\n",
+ " \"step_tidy_up\",\n",
+ " \"step_streamline\",\n",
+ " \"step_convert_to_hw\",\n",
+ " \"step_create_dataflow_partition\",\n",
+ " \"step_specialize_layers\",\n",
+ " \"step_target_fps_parallelization\",\n",
+ " \"step_apply_folding_config\",\n",
+ " \"step_minimize_bit_width\",\n",
+ " \"step_generate_estimate_reports\",\n",
+ " \"step_hw_codegen\",\n",
+ " \"step_hw_ipgen\",\n",
+ " \"step_set_fifo_depths\",\n",
+ " \"step_create_stitched_ip\",\n",
+ " \"step_measure_rtlsim_performance\",\n",
+ " \"step_out_of_context_synthesis\",\n",
+ " \"step_synthesize_bitfile\",\n",
+ " \"step_make_pynq_driver\",\n",
+ " \"step_deployment_package\",\n",
+ "]\n",
+ "\n",
+ "cfg_build = build.DataflowBuildConfig(\n",
+ " output_dir = output_dir,\n",
+ " mvau_wwidth_max = 80,\n",
+ " synth_clk_period_ns = 10.0,\n",
+ " #specialize_layers_config_file = \"specialize_layers_all_hls.json\",\n",
+ " folding_config_file = \"cnv-w2a2_folding_config.json\",\n",
+ " board = \"Pynq-Z1\",\n",
+ " shell_flow_type = build_cfg.ShellFlowType.VIVADO_ZYNQ,\n",
+ " steps = build_steps,\n",
+ " default_swg_exception = True,\n",
+ " generate_outputs=[\n",
+ " build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n",
+ " build_cfg.DataflowOutputType.STITCHED_IP,\n",
+ " build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE,\n",
+ " build_cfg.DataflowOutputType.OOC_SYNTH,\n",
+ " build_cfg.DataflowOutputType.BITFILE,\n",
+ " build_cfg.DataflowOutputType.PYNQ_DRIVER,\n",
+ " build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE,\n",
+ " ],\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "c7ff6c19",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#%%time\n",
+ "#build.build_dataflow_cfg(model_file, cfg_build);"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/notebooks/advanced/cnv-w2a2_folding_config.json b/notebooks/advanced/cnv-w2a2_folding_config.json
new file mode 100644
index 0000000000..68409ff695
--- /dev/null
+++ b/notebooks/advanced/cnv-w2a2_folding_config.json
@@ -0,0 +1,79 @@
+{
+ "Defaults": {},
+ "Thresholding_hls_0": {
+ "PE": 1,
+ "ram_style": "distributed"
+ },
+ "ConvolutionInputGenerator_rtl_0": {
+ "SIMD": 3,
+ "ram_style": "distributed"
+ },
+ "MVAU_hls_0": {
+ "PE": 8,
+ "SIMD": 3,
+ "ram_style": "auto"
+ },
+ "ConvolutionInputGenerator_rtl_1": {
+ "SIMD": 16,
+ "ram_style": "distributed"
+ },
+ "MVAU_hls_1": {
+ "PE": 16,
+ "SIMD": 16,
+ "ram_style": "auto"
+ },
+ "ConvolutionInputGenerator_rtl_2": {
+ "SIMD": 16,
+ "ram_style": "distributed"
+ },
+ "MVAU_hls_2": {
+ "PE": 8,
+ "SIMD": 16,
+ "ram_style": "auto"
+ },
+ "ConvolutionInputGenerator_rtl_3": {
+ "SIMD": 16,
+ "ram_style": "distributed"
+ },
+ "MVAU_hls_3": {
+ "PE": 8,
+ "SIMD": 16,
+ "ram_style": "block"
+ },
+ "ConvolutionInputGenerator_rtl_4": {
+ "SIMD": 8,
+ "ram_style": "distributed"
+ },
+ "MVAU_hls_4": {
+ "PE": 4,
+ "SIMD": 8,
+ "ram_style": "auto"
+ },
+ "ConvolutionInputGenerator_rtl_5": {
+ "SIMD": 8,
+ "ram_style": "distributed"
+ },
+ "MVAU_hls_5": {
+ "PE": 1,
+ "SIMD": 8,
+ "ram_style": "auto"
+ },
+ "MVAU_hls_6": {
+ "PE": 1,
+ "SIMD": 2,
+ "ram_style": "distributed"
+ },
+ "MVAU_hls_7": {
+ "PE": 2,
+ "SIMD": 2,
+ "ram_style": "block"
+ },
+ "MVAU_hls_8": {
+ "PE": 5,
+ "SIMD": 1,
+ "ram_style": "distributed"
+ },
+ "LabelSelect_hls_0": {
+ "PE": 1
+ }
+}
diff --git a/notebooks/advanced/cybsec_PE_SIMD.onnx b/notebooks/advanced/cybsec_PE_SIMD.onnx
new file mode 100644
index 0000000000..8d42b2e37b
Binary files /dev/null and b/notebooks/advanced/cybsec_PE_SIMD.onnx differ
diff --git a/notebooks/advanced/finn-dataflow.png b/notebooks/advanced/finn-dataflow.png
new file mode 100755
index 0000000000..ebe98d0fbd
Binary files /dev/null and b/notebooks/advanced/finn-dataflow.png differ
diff --git a/notebooks/advanced/finn-folding-mvau.png b/notebooks/advanced/finn-folding-mvau.png
new file mode 100755
index 0000000000..bbba00182c
Binary files /dev/null and b/notebooks/advanced/finn-folding-mvau.png differ
diff --git a/notebooks/advanced/finn-folding.png b/notebooks/advanced/finn-folding.png
new file mode 100755
index 0000000000..019b4aa1e7
Binary files /dev/null and b/notebooks/advanced/finn-folding.png differ
diff --git a/notebooks/basics/0_how_to_work_with_onnx.ipynb b/notebooks/basics/0_how_to_work_with_onnx.ipynb
index 35a83ea97b..f1b3dcf68b 100644
--- a/notebooks/basics/0_how_to_work_with_onnx.ipynb
+++ b/notebooks/basics/0_how_to_work_with_onnx.ipynb
@@ -613,9 +613,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.5"
+ "version": "3.10.12"
}
},
"nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
}
diff --git a/notebooks/basics/1_brevitas_network_import.ipynb b/notebooks/basics/1_brevitas_network_import_via_QONNX.ipynb
similarity index 65%
rename from notebooks/basics/1_brevitas_network_import.ipynb
rename to notebooks/basics/1_brevitas_network_import_via_QONNX.ipynb
index a884e90d75..5c2f10310f 100644
--- a/notebooks/basics/1_brevitas_network_import.ipynb
+++ b/notebooks/basics/1_brevitas_network_import_via_QONNX.ipynb
@@ -4,13 +4,15 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "# Importing Brevitas networks into FINN\n",
+ "# Importing Brevitas networks into FINN with the QONNX interchange format\n",
+ "\n",
+ "**Note: Previously it was possible to directly export the FINN-ONNX interchange format from Brevitas to pass to the FINN compiler. This support is deprecated and FINN now uses the export to the QONNX format as a front end; internally, FINN still uses the FINN-ONNX format.**\n",
"\n",
"In this notebook we'll go through an example of how to import a Brevitas-trained QNN into FINN. The steps will be as follows:\n",
"\n",
"1. Load up the trained PyTorch model\n",
- "2. Call Brevitas FINN-ONNX export and visualize with Netron\n",
- "3. Import into FINN and call cleanup transformations\n",
+ "2. Call Brevitas QONNX export and visualize with Netron\n",
+ "3. Import into FINN and convert QONNX to FINN-ONNX\n",
"\n",
"We'll use the following utility functions to print the source code for function calls (`showSrc()`) and to visualize a network using netron (`showInNetron()`) in the Jupyter notebook:"
]
@@ -120,15 +122,18 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## 2. Call Brevitas FINN-ONNX export and visualize with Netron\n",
+ "## 2. Call Brevitas QONNX export and visualize with Netron\n",
+ "\n",
+ "Brevitas comes with built-in QONNX export functionality. This is similar to the regular ONNX export capabilities of PyTorch, with a few differences:\n",
"\n",
- "Brevitas comes with built-in FINN-ONNX export functionality. This is similar to the regular ONNX export capabilities of PyTorch, with a few differences:\n",
+ "1. Weight and activation quantization is represented as a 'fake-quantization' with Quant and BipolarQuant nodes.\n",
+ "2. Truncation operations as required by average pooling are represented with a Trunc node.\n",
"\n",
- "1. The weight quantization logic is not exported as part of the graph; rather, the quantized weights themselves are exported.\n",
- "2. Special quantization annotations are used to preserve the low-bit quantization information. ONNX (at the time of writing) supports 8-bit quantization as the minimum bitwidth, whereas FINN-ONNX quantization annotations can go down to binary/bipolar quantization.\n",
- "3. Low-bit quantized activation functions are exported as MultiThreshold operators.\n",
+ "One can read more about how QONNX works and why it was developed here: https://xilinx.github.io/finn//2021/11/03/qonnx-and-finn.html\n",
"\n",
- "It's actually quite straightforward to export ONNX from our Brevitas model as follows:"
+ "Additionally, QONNX comes with a set of tools for working with the format. These are maintained together with the Fast Machine Learning collaboration as an open-source project here: https://github.com/fastmachinelearning/qonnx\n",
+ "\n",
+ "It's actually quite straightforward to export QONNX from our Brevitas model as follows:"
]
},
{
@@ -137,10 +142,10 @@
"metadata": {},
"outputs": [],
"source": [
- "import brevitas.onnx as bo\n",
- "export_onnx_path = \"/tmp/LFCW1A1.onnx\"\n",
+ "from brevitas.export import export_qonnx\n",
+ "export_onnx_path = \"/tmp/LFCW1A1_qonnx.onnx\"\n",
"input_shape = (1, 1, 28, 28)\n",
- "bo.export_finn_onnx(lfc, input_shape, export_onnx_path)"
+ "export_qonnx(lfc, torch.randn(input_shape), export_onnx_path);"
]
},
{
@@ -156,23 +161,23 @@
"metadata": {},
"outputs": [],
"source": [
- "showInNetron('/tmp/LFCW1A1.onnx')"
+ "showInNetron(export_onnx_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "When running this notebook in the FINN Docker container, you should be able to see an interactive visualization of the imported network above, and click on individual nodes to inspect their parameters. If you look at any of the MatMul nodes, you should be able to see that the weights are all {-1, +1} values, and the activations are Sign functions."
+ "When running this notebook in the FINN Docker container, you should be able to see an interactive visualization of the imported network above, and click on individual nodes to inspect their parameters. If you look at any of the MatMul nodes, you should be able to see that the weights are all {-1, +1} values."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "## 3. Import into FINN and call cleanup transformations\n",
+ "## 3. Import into FINN and convert QONNX to FINN-ONNX\n",
"\n",
- "We will now import this ONNX model into FINN using the ModelWrapper, and examine some of the graph attributes from Python."
+ "We will first run a cleanup transformation on the exported QONNX model."
]
},
{
@@ -181,16 +186,10 @@
"metadata": {},
"outputs": [],
"source": [
- "from qonnx.core.modelwrapper import ModelWrapper\n",
- "model = ModelWrapper(export_onnx_path)\n",
- "model.graph.node[8]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "The ModelWrapper exposes a range of other useful functions as well. For instance, by convention the second input of the MatMul node will be a pre-initialized weight tensor, which we can view using the following:"
+ "from qonnx.util.cleanup import cleanup\n",
+ "\n",
+ "export_onnx_path_cleaned = \"/tmp/LFCW1A1-qonnx-clean.onnx\"\n",
+ "cleanup(export_onnx_path, out_file=export_onnx_path_cleaned)"
]
},
{
@@ -199,14 +198,14 @@
"metadata": {},
"outputs": [],
"source": [
- "model.get_initializer(model.graph.node[8].input[1])"
+ "showInNetron(export_onnx_path_cleaned)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "We can also examine the quantization annotations and shapes of various tensors using the convenience functions provided by ModelWrapper."
+ "We will now import this QONNX model into FINN using the ModelWrapper. Here we can immediately execute the model to verify correctness."
]
},
{
@@ -215,7 +214,14 @@
"metadata": {},
"outputs": [],
"source": [
- "model.get_tensor_datatype(model.graph.node[8].input[1]).name"
+ "from qonnx.core.modelwrapper import ModelWrapper\n",
+ "import qonnx.core.onnx_exec as oxe\n",
+ "model = ModelWrapper(export_onnx_path_cleaned)\n",
+ "input_dict = {\"global_in\": nph.to_array(input_tensor)}\n",
+ "output_dict = oxe.execute_onnx(model, input_dict)\n",
+ "produced_qonnx = output_dict[list(output_dict.keys())[0]]\n",
+ "\n",
+ "produced_qonnx"
]
},
{
@@ -224,14 +230,14 @@
"metadata": {},
"outputs": [],
"source": [
- "model.get_tensor_shape(model.graph.node[8].input[1])"
+ "np.isclose(produced, produced_qonnx).all()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "If we want to operate further on this model in FINN, it is a good idea to execute certain \"cleanup\" transformations on this graph. Here, we will run shape inference and constant folding on this graph, and visualize the resulting graph in Netron again."
+ "Using the `ConvertQONNXtoFINN` transformation, we can convert the model to FINN's internal FINN-ONNX representation. Notably, all Quant and BipolarQuant nodes will have disappeared, having been converted into MultiThreshold nodes."
]
},
{
@@ -240,12 +246,13 @@
"metadata": {},
"outputs": [],
"source": [
- "from qonnx.transformation.fold_constants import FoldConstants\n",
- "from qonnx.transformation.infer_shapes import InferShapes\n",
- "model = model.transform(InferShapes())\n",
- "model = model.transform(FoldConstants())\n",
- "export_onnx_path_transformed = \"/tmp/LFCW1A1-clean.onnx\"\n",
- "model.save(export_onnx_path_transformed)"
+ "from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN\n",
+ "model = ModelWrapper(export_onnx_path_cleaned)\n",
+ "\n",
+ "model = model.transform(ConvertQONNXtoFINN())\n",
+ "\n",
+ "export_onnx_path_converted = \"/tmp/LFCW1A1-qonnx-converted.onnx\"\n",
+ "model.save(export_onnx_path_converted)"
]
},
{
@@ -254,14 +261,14 @@
"metadata": {},
"outputs": [],
"source": [
- "showInNetron('/tmp/LFCW1A1-clean.onnx')"
+ "showInNetron(export_onnx_path_converted)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "We can see that the resulting graph has become smaller and simpler. Specifically, the input reshaping is now a single Reshape node instead of the Shape -> Gather -> Unsqueeze -> Concat -> Reshape sequence. We can now use the internal ONNX execution capabilities of FINN to ensure that we still get the same output from this model as we did with PyTorch."
+ "And once again we can execute the model with the FINN/QONNX execution engine."
]
},
{
@@ -270,8 +277,8 @@
"metadata": {},
"outputs": [],
"source": [
- "import finn.core.onnx_exec as oxe\n",
- "input_dict = {\"0\": nph.to_array(input_tensor)}\n",
+ "model = ModelWrapper(export_onnx_path_cleaned)\n",
+ "input_dict = {\"global_in\": nph.to_array(input_tensor)}\n",
"output_dict = oxe.execute_onnx(model, input_dict)\n",
"produced_finn = output_dict[list(output_dict.keys())[0]]\n",
"\n",
@@ -284,7 +291,7 @@
"metadata": {},
"outputs": [],
"source": [
- "np.isclose(produced, produced_finn).all()"
+ "np.isclose(produced_qonnx, produced_finn).all()"
]
},
{
@@ -311,9 +318,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.5"
+ "version": "3.10.12"
}
},
"nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
}
diff --git a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb
index 388accad3a..3141d54ddf 100644
--- a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb
+++ b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb
@@ -46,8 +46,8 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "The white fields show the state of the network representation in the respective step. The colored fields represent the transformations that are applied to the network to achieve a certain result. The diagram is divided into 5 sections represented by a different color, each of it includes several flow steps. The flow starts in top left corner with Brevitas export (green section), followed by the preparation of the network (blue section) for the Vitis HLS synthesis and Vivado IPI stitching (orange section), and finally building a PYNQ overlay bitfile and testing it on a PYNQ board (yellow section).\n",
- "There is an additional section for functional verification (red section) on the left side of the diagram, which we will not cover in this notebook. For details please take a look in the verification notebook which you can find [here](tfc_end2end_verification.ipynb)\n",
+ "The white fields show the state of the network representation in the respective step. The colored fields represent the transformations that are applied to the network to achieve a certain result. The diagram is divided into 5 sections, each represented by a different color and each including several flow steps. The flow starts in the top left corner with Brevitas export (green section), followed by the preparation of the network (blue section) to bring the network into a form in which each layer can be represented by either a Vitis HLS function or a Verilog module. The model then gets passed to Vivado IPI stitching (orange section), and finally a PYNQ overlay bitfile is built and can be tested on a PYNQ board (yellow section).\n",
+ "There is an additional section for functional verification (red section) on the right side of the diagram, which we will not cover in this notebook. For details please take a look in the verification notebook which you can find [here](tfc_end2end_verification.ipynb)\n",
"\n",
"\n",
"We will use the helper function `showInNetron` to show the ONNX model at the current transformation step. The Netron displays are interactive, but they only work when running the notebook actively and not on GitHub (i.e. if you are viewing this on GitHub you'll only see blank squares)."
@@ -72,7 +72,7 @@
"source": [
"## 1. Brevitas Export, FINN Import and Tidy-Up\n",
"\n",
- "Similar to what we did in the TFC-w1a1 end-to-end notebook, we will start by exporting the [pretrained CNV-w1a1 network](https://github.com/Xilinx/brevitas/tree/master/src/brevitas_examples/bnn_pynq) to ONNX, importing that into FINN and running the \"tidy-up\" transformations to have a first look at the topology."
+ "Similar to what we did in the TFC-w1a1 end-to-end notebook, we will start by exporting the [pretrained CNV-w1a1 network](https://github.com/Xilinx/brevitas/tree/master/src/brevitas_examples/bnn_pynq) to ONNX, importing that into FINN and running the \"tidy-up\" transformations to have a first look at the topology. The network will be exported in QONNX format and then converted into the FINN-ONNX format to prepare it for the FINN compiler."
]
},
{
@@ -81,17 +81,23 @@
"metadata": {},
"outputs": [],
"source": [
+ "import torch\n",
"import onnx\n",
"from finn.util.test import get_test_model_trained\n",
- "import brevitas.onnx as bo\n",
+ "from brevitas.export import export_qonnx\n",
+ "from qonnx.util.cleanup import cleanup as qonnx_cleanup\n",
"from qonnx.core.modelwrapper import ModelWrapper\n",
+ "from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN\n",
"from qonnx.transformation.infer_shapes import InferShapes\n",
"from qonnx.transformation.fold_constants import FoldConstants\n",
"from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames, RemoveStaticGraphInputs\n",
"\n",
"cnv = get_test_model_trained(\"CNV\", 1, 1)\n",
- "bo.export_finn_onnx(cnv, (1, 3, 32, 32), build_dir + \"/end2end_cnv_w1a1_export.onnx\")\n",
- "model = ModelWrapper(build_dir + \"/end2end_cnv_w1a1_export.onnx\")\n",
+ "export_onnx_path = build_dir + \"/end2end_cnv_w1a1_export.onnx\"\n",
+ "export_qonnx(cnv, torch.randn(1, 3, 32, 32), export_onnx_path)\n",
+ "qonnx_cleanup(export_onnx_path, out_file=export_onnx_path)\n",
+ "model = ModelWrapper(export_onnx_path)\n",
+ "model = model.transform(ConvertQONNXtoFINN())\n",
"model = model.transform(InferShapes())\n",
"model = model.transform(FoldConstants())\n",
"model = model.transform(GiveUniqueNodeNames())\n",
@@ -148,10 +154,12 @@
"# preprocessing: torchvision's ToTensor divides uint8 inputs by 255\n",
"totensor_pyt = ToTensor()\n",
"chkpt_preproc_name = build_dir+\"/end2end_cnv_w1a1_preproc.onnx\"\n",
- "bo.export_finn_onnx(totensor_pyt, ishape, chkpt_preproc_name)\n",
+ "export_qonnx(totensor_pyt, torch.randn(ishape), chkpt_preproc_name)\n",
+ "qonnx_cleanup(chkpt_preproc_name, out_file=chkpt_preproc_name)\n",
+ "pre_model = ModelWrapper(chkpt_preproc_name)\n",
+ "pre_model = pre_model.transform(ConvertQONNXtoFINN())\n",
"\n",
"# join preprocessing and core model\n",
- "pre_model = ModelWrapper(chkpt_preproc_name)\n",
"model = model.transform(MergeONNXModels(pre_model))\n",
"# add input quantization annotation: UINT8 for all BNN-PYNQ models\n",
"global_inp_name = model.graph.input[0].name\n",
@@ -199,7 +207,7 @@
"\n",
"\n",
"\n",
- "Note how the convolution layer looks very similar to the fully connected one in terms of the matrix-vector-threshold unit (MVTU), but now the MVTU is preceded by a sliding window unit that produces the matrix from the input image. All of these building blocks, including the `MaxPool` layer you see in this figure, exist as templated Vitis HLS C++ functions in [finn-hlslib](https://github.com/Xilinx/finn-hlslib).\n",
+ "Note how the convolution layer looks very similar to the fully connected one in terms of the matrix-vector-threshold unit (MVTU), sometimes also called matrix-vector-activation unit (MVAU), but now the MVTU is preceded by a sliding window unit that produces the matrix from the input image. All of these building blocks, including the `MaxPool` layer you see in this figure, exist as templated Vitis HLS C++ functions in [finn-hlslib](https://github.com/Xilinx/finn-hlslib) and/or as RTL modules in [finn-rtllib](https://github.com/Xilinx/finn/tree/main/finn-rtllib).\n",
"\n",
"\n",
"To target this kind of hardware architecture with our network we'll apply a convolution lowering transformation, in addition to streamlining. You may recall the *streamlining transformation* that we applied to the TFC-w1a1 network, which is a series of mathematical simplifications that allow us to get rid of floating point scaling operations by implementing few-bit activations as thresholding operations. \n",
@@ -240,11 +248,11 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "We won't go into too much detail about what happens in each transformation and why they are called in the particular order they are (feel free to visualize the intermediate steps using Netron yourself if you are curious) but here is a brief summmmary:\n",
+ "We won't go into too much detail about what happens in each transformation and why they are called in the particular order they are (feel free to visualize the intermediate steps using Netron yourself if you are curious) but here is a brief summary:\n",
"\n",
"* `Streamline` moves floating point scaling and addition operations closer to the input of the nearest thresholding activation and absorbs them into thresholds\n",
"* `LowerConvsToMatMul` converts ONNX `Conv` nodes into sequences of `Im2Col, MatMul` nodes as discussed above. `Im2Col` is a custom FINN ONNX high-level node type that implements the sliding window operator.\n",
- "* `MakeMaxPoolNHWC` and `AbsorbTransposeIntoMultiThreshold` convert the *data layout* of the network into the NHWC data layout that finn-hlslib primitives use. NCHW means the tensor dimensions are ordered as `(N : batch, H : height, W : width, C : channels)` (assuming 2D images). The ONNX standard ops normally use the NCHW layout, but the ONNX intermediate representation itself does not dictate any data layout.\n",
+ "* `MakeMaxPoolNHWC` and `AbsorbTransposeIntoMultiThreshold` convert the *data layout* of the network into the NHWC data layout that finn-hlslib and finn-rtllib primitives use. NCHW means the tensor dimensions are ordered as `(N : batch, H : height, W : width, C : channels)` (assuming 2D images). The ONNX standard ops normally use the NCHW layout, but the ONNX intermediate representation itself does not dictate any data layout.\n",
"* You may recall `ConvertBipolarMatMulToXnorPopcount` from the TFC-w1a1 example, which is needed to implement bipolar-by-bipolar (w1a1) networks correctly using finn-hlslib.\n",
"\n",
"Let's visualize the streamlined and lowered network with Netron. Observe how all the `Conv` nodes have turned into pairs of `Im2Col, MatMul` nodes, and many nodes including `BatchNorm, Mul, Add` nodes have disappeared and replaced with `MultiThreshold` nodes."
@@ -263,9 +271,9 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "## 3. Partitioning, Conversion to HLS Layers and Folding\n",
+ "## 3. Partitioning, Conversion to HW Layers and Folding\n",
"\n",
- "The next steps will be (again) very similar to what we did for the TFC-w1a1 network. We'll first convert the layers that we can put into the FPGA into their HLS equivalents and separate them out into a *dataflow partition*:\n"
+ "The next steps will be (again) very similar to what we did for the TFC-w1a1 network. We'll first convert the layers that we can put into the FPGA into their HW equivalents, separate them out into a *dataflow partition* and specialize them to HLS variants:\n"
]
},
{
@@ -274,27 +282,25 @@
"metadata": {},
"outputs": [],
"source": [
- "import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls\n",
+ "import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw\n",
"from finn.transformation.fpgadataflow.create_dataflow_partition import (\n",
" CreateDataflowPartition,\n",
")\n",
"from finn.transformation.move_reshape import RemoveCNVtoFCFlatten\n",
+ "from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers\n",
"from qonnx.custom_op.registry import getCustomOp\n",
"from qonnx.transformation.infer_data_layouts import InferDataLayouts\n",
"\n",
- "# choose the memory mode for the MVTU units, decoupled or const\n",
- "mem_mode = \"decoupled\"\n",
- "\n",
"model = ModelWrapper(build_dir + \"/end2end_cnv_w1a1_streamlined.onnx\")\n",
- "model = model.transform(to_hls.InferBinaryMatrixVectorActivation(mem_mode))\n",
- "model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode))\n",
+ "model = model.transform(to_hw.InferBinaryMatrixVectorActivation())\n",
+ "model = model.transform(to_hw.InferQuantizedMatrixVectorActivation())\n",
"# TopK to LabelSelect\n",
- "model = model.transform(to_hls.InferLabelSelectLayer())\n",
+ "model = model.transform(to_hw.InferLabelSelectLayer())\n",
"# input quantization (if any) to standalone thresholding\n",
- "model = model.transform(to_hls.InferThresholdingLayer())\n",
- "model = model.transform(to_hls.InferConvInpGen())\n",
- "model = model.transform(to_hls.InferStreamingMaxPool())\n",
- "# get rid of Reshape(-1, 1) operation between hlslib nodes\n",
+ "model = model.transform(to_hw.InferThresholdingLayer())\n",
+ "model = model.transform(to_hw.InferConvInpGen())\n",
+ "model = model.transform(to_hw.InferStreamingMaxPool())\n",
+ "# get rid of Reshape(-1, 1) operation between hw nodes\n",
"model = model.transform(RemoveCNVtoFCFlatten())\n",
"# get rid of Tranpose -> Tranpose identity seq\n",
"model = model.transform(absorb.AbsorbConsecutiveTransposes())\n",
@@ -306,7 +312,9 @@
"sdp_node = getCustomOp(sdp_node)\n",
"dataflow_model_filename = sdp_node.get_nodeattr(\"model\")\n",
"# save the dataflow partition with a different name for easier access\n",
+ "# and specialize the layers to HLS variants\n",
"dataflow_model = ModelWrapper(dataflow_model_filename)\n",
+ "dataflow_model = dataflow_model.transform(SpecializeLayers())\n",
"dataflow_model.save(build_dir + \"/end2end_cnv_w1a1_dataflow_model.onnx\")"
]
},
@@ -314,7 +322,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Notice the additional `RemoveCNVtoFCFlatten` transformation that was not used for TFC-w1a1. In the last Netron visualization you may have noticed a `Reshape` operation towards the end of the network where the convolutional part of the network ends and the fully-connected layers started. That `Reshape` is essentialy a tensor flattening operation, which we can remove for the purposes of hardware implementation. We can examine the contents of the dataflow partition with Netron, and observe the `ConvolutionInputGenerator`, `MatrixVectorActivation` and `StreamingMaxPool_Batch` nodes that implement the sliding window, matrix multiply and maxpool operations in hlslib. *Note that the MatrixVectorActivation instances following the ConvolutionInputGenerator nodes are really implementing the convolutions, despite the name. The final three MatrixVectorActivation instances implement actual FC layers.*"
+ "Notice the additional `RemoveCNVtoFCFlatten` transformation that was not used for TFC-w1a1. In the last Netron visualization you may have noticed a `Reshape` operation towards the end of the network where the convolutional part of the network ends and the fully-connected layers start. That `Reshape` is essentially a tensor flattening operation, which we can remove for the purposes of hardware implementation. We can examine the contents of the dataflow partition with Netron, and observe the `ConvolutionInputGenerator`, `MatrixVectorActivation` and `StreamingMaxPool_Batch` nodes that implement the sliding window, matrix multiply and maxpool operations. *Note that the MatrixVectorActivation instances following the ConvolutionInputGenerator nodes are really implementing the convolutions, despite the name. The final three MatrixVectorActivation instances implement actual FC layers.*"
]
},
{
@@ -356,7 +364,7 @@
"outputs": [],
"source": [
"model = ModelWrapper(build_dir + \"/end2end_cnv_w1a1_dataflow_model.onnx\")\n",
- "fc_layers = model.get_nodes_by_op_type(\"MatrixVectorActivation\")\n",
+ "fc_layers = model.get_nodes_by_op_type(\"MVAU_hls\")\n",
"# each tuple is (PE, SIMD, in_fifo_depth) for a layer\n",
"folding = [\n",
" (16, 3, [128]),\n",
@@ -376,7 +384,7 @@
" fcl_inst.set_nodeattr(\"inFIFODepths\", ififodepth)\n",
"\n",
"# use same SIMD values for the sliding window operators\n",
- "swg_layers = model.get_nodes_by_op_type(\"ConvolutionInputGenerator\")\n",
+ "swg_layers = model.get_nodes_by_op_type(\"ConvolutionInputGenerator_rtl\")\n",
"for i in range(len(swg_layers)):\n",
" swg_inst = getCustomOp(swg_layers[i])\n",
" simd = folding[i][1]\n",
@@ -390,7 +398,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Below we visualize in Netron to observe the `StreamingDataWidthConverter` and `StreamingFIFO` nodes that have been inserted into graph, as well as the folding factors in the `PE` and `SIMD` attributes of each `MatrixVectorActivation`."
+ "Below we visualize in Netron to observe the folding factors in the `PE` and `SIMD` attributes of each `MVAU_hls`."
]
},
{
@@ -508,12 +516,13 @@
"metadata": {},
"outputs": [],
"source": [
- "import pkg_resources as pk\n",
+ "import importlib_resources\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"\n",
- "fn = pk.resource_filename(\"finn.qnn-data\", \"cifar10/cifar10-test-data-class3.npz\")\n",
- "x = np.load(fn)[\"arr_0\"]\n",
+ "ref = importlib_resources.files(\"finn.qnn-data\") / \"cifar10/cifar10-test-data-class3.npz\"\n",
+ "with importlib_resources.as_file(ref) as fn:\n",
+ " x = np.load(fn)[\"arr_0\"]\n",
"x = x.reshape(3, 32,32).transpose(1, 2, 0)\n",
"plt.imshow(x)"
]
@@ -632,9 +641,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.5"
+ "version": "3.10.12"
}
},
"nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
}
diff --git a/notebooks/end2end_example/bnn-pynq/finn-design-flow-example.svg b/notebooks/end2end_example/bnn-pynq/finn-design-flow-example.svg
index fa36be96c5..561770f2da 100755
--- a/notebooks/end2end_example/bnn-pynq/finn-design-flow-example.svg
+++ b/notebooks/end2end_example/bnn-pynq/finn-design-flow-example.svg
@@ -1 +1 @@
-
+
diff --git a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb
index eec17b2fa7..bbaa74dbff 100644
--- a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb
+++ b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb
@@ -33,7 +33,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "The white fields show the state of the network representation in the respective step. The colored fields represent the transformations that are applied to the network to achieve a certain result. The diagram is divided into 5 sections represented by a different color, each of it includes several flow steps. The flow starts in top left corner with Brevitas export (green section), followed by the preparation of the network (blue section) for the Vitis HLS synthesis and Vivado IPI stitching (orange section), and finally building a PYNQ overlay bitfile and testing it on a PYNQ board (yellow section).\n",
+ "The white fields show the state of the network representation in the respective step. The colored fields represent the transformations that are applied to the network to achieve a certain result. The diagram is divided into 5 sections, each represented by a different color and comprising several flow steps. The flow starts in the top left corner with Brevitas export (green section), followed by the preparation of the network (blue section) to bring the network into a form in which each layer can be represented by either a Vitis HLS function or a Verilog module. The model then gets passed to Vivado IPI stitching (orange section), and finally a PYNQ overlay bitfile is built and can be tested on a PYNQ board (yellow section).\n",
"There is an additional section for functional verification (red section) on the right side of the diagram, which we will not cover in this notebook. For details please take a look in the verification notebook which you can find [here](tfc_end2end_verification.ipynb)\n",
"\n",
"\n",
@@ -81,19 +81,23 @@
"metadata": {},
"outputs": [],
"source": [
+ "import torch\n",
"import onnx\n",
"from finn.util.test import get_test_model_trained\n",
- "import brevitas.onnx as bo\n",
+ "from brevitas.export import export_qonnx\n",
+ "from qonnx.util.cleanup import cleanup as qonnx_cleanup\n",
"\n",
"tfc = get_test_model_trained(\"TFC\", 1, 1)\n",
- "bo.export_finn_onnx(tfc, (1, 1, 28, 28), build_dir+\"/tfc_w1_a1.onnx\"); # semicolon added to suppress log"
+ "export_onnx_path = build_dir+\"/tfc_w1_a1.onnx\"\n",
+ "export_qonnx(tfc, torch.randn(1, 1, 28, 28), build_dir+\"/tfc_w1_a1.onnx\"); # semicolon added to suppress log\n",
+ "qonnx_cleanup(export_onnx_path, out_file=export_onnx_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "The model was now exported, loaded with the pretrained weights and saved under the name \"tfc_w1_a1.onnx\".\n",
+ "The model was now exported in QONNX format, loaded with the pretrained weights and saved under the name \"tfc_w1_a1.onnx\".\n",
"To visualize the exported model, Netron can be used. Netron is a visualizer for neural networks and allows interactive investigation of network properties. For example, you can click on the individual nodes and view the properties."
]
},
@@ -110,7 +114,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "Now that we have the model in .onnx format, we can work with it using FINN. For that, `ModelWrapper` is used. It is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model. 'ModelWrapper' is imported from the [QONNX repo](https://github.com/fastmachinelearning/qonnx), this repository contains several functionality that is used in FINN."
+ "Now that we have the model in .onnx format, we can work with it using FINN. For that, `ModelWrapper` is used. It is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model. `ModelWrapper` is imported from the [QONNX repo](https://github.com/fastmachinelearning/qonnx); this repository contains a lot of functionality that is used in FINN. The model was exported in QONNX format, so to feed it into the FINN flow, our first step is to convert it to the FINN-ONNX format."
]
},
{
@@ -120,7 +124,26 @@
"outputs": [],
"source": [
"from qonnx.core.modelwrapper import ModelWrapper\n",
- "model = ModelWrapper(build_dir+\"/tfc_w1_a1.onnx\")"
+ "from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN\n",
+ "model = ModelWrapper(build_dir+\"/tfc_w1_a1.onnx\")\n",
+ "model = model.transform(ConvertQONNXtoFINN())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "After the conversion we save the model and visualize it using Netron. As you can see, quantization is now expressed differently. Where we had Quant nodes before, there are now MultiThreshold nodes present in the graph."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model.save(build_dir+\"/tfc_w1_a1_finn.onnx\")\n",
+ "showInNetron(build_dir+\"/tfc_w1_a1_finn.onnx\")"
]
},
{
@@ -143,8 +166,9 @@
"* [FINN-style Dataflow Architectures](#dataflow_arch)\n",
"* [Tidy-up transformations](#basic_trafo)\n",
"* [Streamlining](#streamline)\n",
- "* [Conversion to HLS layers](#hls_layers)\n",
+ "* [Conversion to HW layers](#hw_layers)\n",
"* [Creating a Dataflow Partition](#dataflow_partition)\n",
+ "* [Specialize layers](#specialize_layers)\n",
"* [Folding and Datawidth Converter, FIFO and TLastMarker Insertion](#folding)\n",
"\n",
"\n",
@@ -161,7 +185,7 @@
"\n",
"\n",
"\n",
- "In practice, the compute arrays are instantiated by function calls to optimized Vitis HLS building blocks from the [finn-hlslib](https://github.com/Xilinx/finn-hlslib) library. As these function calls can only handle certain patterns/cases, we need to transform the network into an appropriate form so that we can replace network layers with these function calls, which is the goal of the network preparation process."
+ "In practice, the compute arrays are instantiated by function calls to optimized Vitis HLS building blocks from the [finn-hlslib](https://github.com/Xilinx/finn-hlslib) library or by Verilog modules from the [finn-rtllib](https://github.com/Xilinx/finn/tree/main/finn-rtllib). As these function calls/modules can only handle certain patterns/cases, we need to transform the network into an appropriate form so that we can replace network layers with these function calls/modules, which is the goal of the network preparation process."
]
},
{
@@ -248,7 +272,7 @@
"\n",
"In FINN, we can bake some of these pre/postprocessing operatings into the graph, and in some cases these can be highly beneficial for performance by allowing our accelerator to directly consume raw data instead of going through CPU preprocessing. \n",
"\n",
- "We'll demonstrate this for our small image classification network as follows. Brevitas preprocesses BNN-PYNQ network inputs with `torchvision.transforms.ToTensor()` [prior to training](https://github.com/Xilinx/brevitas/blob/master/src/brevitas_examples/bnn_pynq/trainer.py#L86), which converts 8-bit RGB values into floats between 0 and 1 by dividing the input by 255. We can achieve the same effect in FINN by exporting a single-node ONNX graph for division by 255 (which already exists as `finn.util.pytorch.ToTensor` and merging this with our original model. Finally, we're going to mark our input tensor as 8-bit to let FINN know which level of precision to use."
+ "We'll demonstrate this for our small image classification network as follows. Brevitas preprocesses BNN-PYNQ network inputs with `torchvision.transforms.ToTensor()` [prior to training](https://github.com/Xilinx/brevitas/blob/master/src/brevitas_examples/bnn_pynq/trainer.py#L93), which converts 8-bit RGB values into floats between 0 and 1 by dividing the input by 255. We can achieve the same effect in FINN by exporting a single-node ONNX graph for division by 255 (which already exists as `finn.util.pytorch.ToTensor`) and merging this with our original model. Finally, we're going to mark our input tensor as 8-bit to let FINN know which level of precision to use."
]
},
{
@@ -267,10 +291,12 @@
"# preprocessing: torchvision's ToTensor divides uint8 inputs by 255\n",
"totensor_pyt = ToTensor()\n",
"chkpt_preproc_name = build_dir+\"/tfc_w1_a1_preproc.onnx\"\n",
- "bo.export_finn_onnx(totensor_pyt, ishape, chkpt_preproc_name)\n",
+ "export_qonnx(totensor_pyt, torch.randn(ishape), chkpt_preproc_name)\n",
+ "qonnx_cleanup(chkpt_preproc_name, out_file=chkpt_preproc_name)\n",
+ "pre_model = ModelWrapper(chkpt_preproc_name)\n",
+ "pre_model = pre_model.transform(ConvertQONNXtoFINN())\n",
"\n",
"# join preprocessing and core model\n",
- "pre_model = ModelWrapper(chkpt_preproc_name)\n",
"model = model.transform(MergeONNXModels(pre_model))\n",
"# add input quantization annotation: UINT8 for all BNN-PYNQ models\n",
"global_inp_name = model.graph.input[0].name\n",
@@ -399,32 +425,25 @@
"model = model.transform(InferDataLayouts())\n",
"model = model.transform(RemoveUnusedTensors())\n",
"\n",
- "model.save(build_dir+\"/tfc_w1a1_ready_for_hls_conversion.onnx\")\n",
- "showInNetron(build_dir+\"/tfc_w1a1_ready_for_hls_conversion.onnx\")"
+ "model.save(build_dir+\"/tfc_w1a1_ready_for_hw_conversion.onnx\")\n",
+ "showInNetron(build_dir+\"/tfc_w1a1_ready_for_hw_conversion.onnx\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "Observe the pairs of `XnorPopcountmatMul` and `MultiThreshold` layers following each other -- this is the particular pattern that the next step will be looking for in order to convert them to HLS layers."
+ "Observe the pairs of `XnorPopcountMatMul` and `MultiThreshold` layers following each other -- this is the particular pattern that the next step will be looking for in order to convert them to hardware (HW) layers."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "### Conversion to HLS layers \n",
- "Converts the nodes to HLS layers that correspond to the functions in [finn-hls library](https://finn-hlslib.readthedocs.io/en/latest/). In our case this transformation converts pairs of binary XnorPopcountMatMul layers to MatrixVectorActivation layers. Any immediately following MultiThreshold layers will also be absorbed into the MVTU.\n",
+ "### Conversion to HW layers \n",
+ "Converts the nodes to HW layers. These are abstraction layers that do not directly correspond to an HLS or Verilog implementation, but they will be converted into either one later in the flow. In our case this transformation converts pairs of binary XnorPopcountMatMul layers to MVAU layers (matrix vector activation unit). Any immediately following MultiThreshold layers will also be absorbed into the MVAU.\n",
"\n",
- "Below is the code for the transformation and the network is visualized using netron to create the new structure with `MatrixVectorActivation` nodes, which will correspond to a function call from the [finn-hlslib](https://finn-hlslib.readthedocs.io/en/latest/library/matrixvector.html) library."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "**Note:** The transformation `to_hls.InferBinaryMatrixVectorActivation` gets the string \"decoupled\" as argument, this indicates the `mem_mode` for the weights. In FINN there are different options to set the way the weights are stored and accessed. For details please have a look on the [FINN readthedocs website](https://finn.readthedocs.io/) under Internals."
+ "Below is the code for the transformation; the network is then visualized using Netron to show the new structure with `MVAU` nodes."
]
},
{
@@ -433,22 +452,15 @@
"metadata": {},
"outputs": [],
"source": [
- "import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls\n",
- "model = ModelWrapper(build_dir+\"/tfc_w1a1_ready_for_hls_conversion.onnx\")\n",
- "model = model.transform(to_hls.InferBinaryMatrixVectorActivation(\"decoupled\"))\n",
+ "import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw\n",
+ "model = ModelWrapper(build_dir+\"/tfc_w1a1_ready_for_hw_conversion.onnx\")\n",
+ "model = model.transform(to_hw.InferBinaryMatrixVectorActivation())\n",
"# TopK to LabelSelect\n",
- "model = model.transform(to_hls.InferLabelSelectLayer())\n",
+ "model = model.transform(to_hw.InferLabelSelectLayer())\n",
"# input quantization (if any) to standalone thresholding\n",
- "model = model.transform(to_hls.InferThresholdingLayer())\n",
- "model.save(build_dir+\"/tfc_w1_a1_hls_layers.onnx\")\n",
- "showInNetron(build_dir+\"/tfc_w1_a1_hls_layers.onnx\")"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Each MatrixVectorActivation node has two attributes that specify the degree of folding, PE and SIMD. In all nodes the values for these attributes are set as default to 1, which would correspond to a maximum folding (time multiplexing) and thus minimum performance. We will shortly cover how these can be adjusted, but first we want to separate the HLS layers from the non-HLS layers in this network."
+ "model = model.transform(to_hw.InferThresholdingLayer())\n",
+ "model.save(build_dir+\"/tfc_w1_a1_hw_layers.onnx\")\n",
+ "showInNetron(build_dir+\"/tfc_w1_a1_hw_layers.onnx\")"
]
},
{
@@ -457,7 +469,7 @@
"source": [
"### Creating a Dataflow Partition \n",
"\n",
- "In the graph above, you can see that there is a mixture of FINN HLS layers (MatrixVectorActivation and Thresholding_Batch) with one regular ONNX layers (Reshape). To create a bitstream, FINN needs a model with only HLS layers. In order to achieve this, we will use the `CreateDataflowPartition` transformation to create a \"dataflow partition\" in this graph, separating out the HLS layers into another model, and replacing them with a placeholder layer called StreamingDataflowPartition."
+ "In the graph above, you can see that there is a mixture of FINN HW layers (`MVAU` and `Thresholding`) with one regular ONNX layer (Reshape). To create a bitstream, FINN needs a model with only HW layers. In order to achieve this, we will use the `CreateDataflowPartition` transformation to create a \"dataflow partition\" in this graph, separating out the HW layers into another model, and replacing them with a placeholder layer called StreamingDataflowPartition."
]
},
{
@@ -468,7 +480,7 @@
"source": [
"from finn.transformation.fpgadataflow.create_dataflow_partition import CreateDataflowPartition\n",
"\n",
- "model = ModelWrapper(build_dir+\"/tfc_w1_a1_hls_layers.onnx\")\n",
+ "model = ModelWrapper(build_dir+\"/tfc_w1_a1_hw_layers.onnx\")\n",
"parent_model = model.transform(CreateDataflowPartition())\n",
"parent_model.save(build_dir+\"/tfc_w1_a1_dataflow_parent.onnx\")\n",
"showInNetron(build_dir+\"/tfc_w1_a1_dataflow_parent.onnx\")"
@@ -478,7 +490,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "We can see that the `MatrixVectorActivation` instances and the `Thresholding_Batch` in the beginning have all been replaced with a single `StreamingDataflowPartition`, which has an attribute `model` that points to the extracted, HLS dataflow-only graph:"
+ "We can see that the `MVAU` instances and the `Thresholding` in the beginning have all been replaced with a single `StreamingDataflowPartition`, which has an attribute `model` that points to the extracted, HW dataflow-only graph:"
]
},
{
@@ -498,7 +510,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "We can see all the extracted `MatrixVectorActivation` instances and the `Thresholding_Batch` have been moved to the child (dataflow) model. We will load the child model with `ModelWrapper` and continue working on it."
+ "We can see all the extracted `MVAU` instances and the `Thresholding` have been moved to the child (dataflow) model. We will load the child model with `ModelWrapper` and continue working on it."
]
},
{
@@ -510,6 +522,60 @@
"model = ModelWrapper(dataflow_model_filename)"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Specialize layers "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The network is converted to HW abstraction layers and we have excluded the non-HW layers to continue with the processing of the model. HW abstraction layers are abstract (placeholder) layers that can be either implemented in HLS or as an RTL module using FINN. In the next flow step, we convert each of these layers to either an HLS or RTL variant by calling the `SpecializeLayers` transformation. It is possible to let the FINN flow know a preference for the implementation style `{\"hls\", \"rtl\"}` and depending on the layer type this wish will be fulfilled or it will be set to a reasonable default. In the tfc example, we will set all layers to their HLS variants. To showcase how to set the preferred implementation, we will set the node attribute in the `Thresholding` layer to `\"hls\"`, for the `MVAUs` and the `LabelSelect` we will leave this node attribute empty and in this case by default it will be set to HLS."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "thresh_node = model.get_nodes_by_op_type(\"Thresholding\")[0]\n",
+ "thresh_node_inst = getCustomOp(thresh_node)\n",
+ "thresh_node_inst.set_nodeattr(\"preferred_impl_style\", \"hls\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Then we will call `SpecializeLayers` to convert each HW abstraction layer to (in this case) an HLS variant."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers\n",
+ "model = model.transform(SpecializeLayers())\n",
+ "\n",
+ "model.save(build_dir+\"/tfc_w1_a1_specialize_layers.onnx\")\n",
+ "showInNetron(build_dir+\"/tfc_w1_a1_specialize_layers.onnx\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Each node type now has a suffix (`_hls`) and the module\n",
+ "(`finn.custom_op.fpgadataflow.hls`) also indicates that the HLS variant of the layer is selected.\n",
+ "We can now proceed by adjusting the parallelism of each node to customize the performance and resource usage."
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -518,14 +584,17 @@
"\n",
"*Folding* in FINN describes how much a layer is time-multiplexed in terms of execution resources. There are several *folding factors* for each layer, controlled by the PE (parallelization over outputs) and SIMD (parallelization over inputs) parameters as described by the original [FINN paper](https://arxiv.org/pdf/1612.07119). The higher the PE and SIMD values are set, the faster the generated accelerator will run, and the more FPGA resources it will consume. \n",
"\n",
- "Since the folding parameters are node attributes, they can be easily accessed and changed using a helper function of the `ModelWrapper`. But first we take a closer look at one of the nodes that implement a MatrixVectorActivation operation. This is where the Netron visualization helps us, in the above diagram we can see that the model contains four MatrixVectorActivation. So as an example we extract the second node of the graph."
+ "Each MVAU_hls node has two attributes that specify the degree of folding, PE and SIMD. In all nodes the values for these attributes are set to 1 by default, which corresponds to maximum folding (time multiplexing) and thus minimum performance. \n",
+ "\n",
+ "Since the folding parameters are node attributes, they can be easily accessed and changed using a helper function of the `ModelWrapper`. But first we take a closer look at one of the nodes that implement a Matrix-Vector-Activation operation. This is where the Netron visualization helps us, in the above diagram we can see that the model contains four `MVAUs`. So as an example we extract the second node of the graph."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "We can use the higher-level [HLSCustomOp](https://github.com/Xilinx/finn/blob/main/src/finn/custom_op/fpgadataflow/hlscustomop.py) wrappers for this node. These wrappers provide easy access to specific properties of these nodes, such as the folding factors (PE and SIMD). Let's have a look at which node attributes are defined by the CustomOp wrapper, and adjust the SIMD and PE attributes."
+ "We can use the higher-level CustomOp wrappers for this node. These wrappers provide easy access to specific properties of these nodes, such as the folding factors (PE and SIMD). Above, we have already used this abstraction to set the node attribute of the Thresholding HW layer.\n",
+ "Let's have a look at which node attributes are defined by the CustomOp wrapper, and adjust the SIMD and PE attributes."
]
},
{
@@ -556,7 +625,7 @@
"metadata": {},
"outputs": [],
"source": [
- "fc_layers = model.get_nodes_by_op_type(\"MatrixVectorActivation\")\n",
+ "fc_layers = model.get_nodes_by_op_type(\"MVAU_hls\")\n",
"# (PE, SIMD, in_fifo_depth, out_fifo_depth, ramstyle) for each layer\n",
"config = [\n",
" (16, 49, [16], [64], \"block\"),\n",
@@ -573,7 +642,7 @@
" fcl_inst.set_nodeattr(\"ram_style\", ramstyle)\n",
" \n",
"# set parallelism for input quantizer to be same as first layer's SIMD\n",
- "inp_qnt_node = model.get_nodes_by_op_type(\"Thresholding_Batch\")[0]\n",
+ "inp_qnt_node = model.get_nodes_by_op_type(\"Thresholding_hls\")[0]\n",
"inp_qnt = getCustomOp(inp_qnt_node)\n",
"inp_qnt.set_nodeattr(\"PE\", 49)"
]
@@ -650,7 +719,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "In previous versions of FINN, we had to manually go through several steps to generate HLS code, stitch IP, create a PYNQ project and run synthesis. All these steps are now performed by the `ZynqBuild` transform (or the `VitisBuild` transform for Alveo). **As this involves calling HLS synthesis and Vivado synthesis, this transformation will run for some time (up to half an hour depending on your PC).**"
+ "In previous versions of FINN, we had to manually go through several steps to generate HLS/RTL code, stitch IP, create a PYNQ project and run synthesis. All these steps are now performed by the `ZynqBuild` transform (or the `VitisBuild` transform for Alveo). **As this involves calling HLS synthesis and Vivado synthesis, this transformation will run for some time (up to half an hour depending on your PC).**"
]
},
{
@@ -732,7 +801,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "We can see that `StreamingFIFO` and `StreamingDataWidthConverter` instances have been automatically inserted into the graph prior to hardware build. Transformations like `ZynqBuild` use the `metadata_props` of the model to put in additional metadata information relevant to the results of the transformation. Let's examine the metadata for the current graph containing all layers:"
+ "We can see that `StreamingFIFO` and `StreamingDataWidthConverter` instances have been automatically inserted into the graph prior to hardware build. Both layer types are inserted as RTL variants. Transformations like `ZynqBuild` use the `metadata_props` of the model to put in additional metadata information relevant to the results of the transformation. Let's examine the metadata for the current graph containing all layers:"
]
},
{
@@ -1006,9 +1075,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.5"
+ "version": "3.10.12"
}
},
"nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
}
diff --git a/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb
index 6c3b796509..a07a8d2254 100644
--- a/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb
+++ b/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb
@@ -7,16 +7,16 @@
"# FINN - Functional Verification of End-to-End Flow\n",
"-----------------------------------------------------------------\n",
"\n",
- "**Important: This notebook depends on the tfc_end2end_example notebook, because we are using models that are available at intermediate steps in the end-to-end flow. So please make sure the needed .onnx files are generated to run this notebook.**\n",
+ "**Important: This notebook depends on the [tfc_end2end_example](tfc_end2end_example.ipynb) notebook, because we are using models that are available at intermediate steps in the end-to-end flow. So please make sure the needed .onnx files are generated to run this notebook.**\n",
"\n",
- "In this notebook, we will show how to take the intermediate results of the end-to-end tfc example and verify their functionality with different methods. In the following picture you can see the section in the end-to-end flow about the *Simulation & Emulation Flows*. Besides the methods in this notebook, there is another one that is covered in the Jupyter notebook [tfc_end2end_example](tfc_end2end_example.ipynb): remote execution. The remote execution allows functional verification directly on the PYNQ board, for details please have a look at the mentioned Jupyter notebook."
+ "In this notebook, we will show how to take the intermediate results of the end-to-end tfc example and verify their functionality with different methods. In the following picture you can see the section in the end-to-end flow about the *Simulation & Emulation Flows*. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
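- "<img src=\"verification.png\" alt=\"Drawing\" style=\"width: 500px;\"/>"
+ "<img src=\"verification.svg\" alt=\"Drawing\" style=\"width: 500px;\"/>"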
]
},
{
@@ -72,9 +72,9 @@
"source": [
"## Simulation using Python \n",
"\n",
- "If an ONNX model consists of [standard ONNX](https://github.com/onnx/onnx/blob/main/docs/Operators.md) nodes and/or FINN custom operations that do not belong to the fpgadataflow (`backend` $\\neq$ `fpgadataflow`) this model can be checked for functionality using Python.\n",
+ "If an ONNX model consists of [standard ONNX](https://github.com/onnx/onnx/blob/main/docs/Operators.md) nodes and/or FINN custom operations that do not belong to the fpgadataflow (`backend` $\\neq$ `fpgadataflow.hls` and `backend` $\\neq$ `fpgadataflow.rtl`) this model can be checked for functionality using Python.\n",
"\n",
- "To simulate a standard ONNX node [onnxruntime](https://github.com/microsoft/onnxruntime) is used. onnxruntime is an open source tool developed by Microsoft to run standard ONNX nodes. For the FINN custom op nodes execution, functions are defined. The following is an example of the execution function of a XNOR popcount node.\n"
+ "To simulate a standard ONNX node [onnxruntime](https://github.com/microsoft/onnxruntime) is used. onnxruntime is an open source tool developed by Microsoft to run standard ONNX nodes. For the FINN custom op nodes execution, functions are defined. The following is an example of the execution function of an XNOR popcount node.\n"
]
},
{
@@ -95,7 +95,7 @@
"\n",
"This execution function and onnxruntime is used when `execute_onnx` from `onnx_exec` is applied to the model. The model is then simulated node by node and the result is stored in a context dictionary, which contains the values of each tensor at the end of the execution. To get the result, only the output tensor has to be extracted.\n",
"\n",
- "The procedure is shown below. We take the model right before the nodes should be converted into HLS layers and generate an input tensor to pass to the execution function. The input tensor is generated from the Brevitas example inputs."
+ "The procedure is shown below. We take the model right before the nodes should be converted into HW layers and generate an input tensor to pass to the execution function. The input tensor is generated from the Brevitas example inputs."
]
},
{
@@ -108,7 +108,7 @@
"from qonnx.core.modelwrapper import ModelWrapper\n",
"input_dict = {\"global_in\": nph.to_array(input_tensor)}\n",
"\n",
- "model_for_sim = ModelWrapper(build_dir+\"/tfc_w1a1_ready_for_hls_conversion.onnx\")"
+ "model_for_sim = ModelWrapper(build_dir+\"/tfc_w1a1_ready_for_hw_conversion.onnx\")"
]
},
{
@@ -121,12 +121,11 @@
"output_dict = oxe.execute_onnx(model_for_sim, input_dict, return_full_exec_context=False)\n",
"output_pysim = output_dict[list(output_dict.keys())[0]]\n",
"\n",
- "\n",
- "\n",
- "if np.isclose(output_pysim, np.where(output_golden[0]==np.amax(output_golden[0])), atol=1e-3).all():\n",
+ "try:\n",
+ " assert np.isclose(output_pysim, np.where(output_golden[0]==np.amax(output_golden[0])), atol=1e-3).all()\n",
" print(\"Results are the same!\")\n",
- "else:\n",
- " print(\"The results are not the same!\")"
+ "except AssertionError:\n",
+ " assert False, \"The results are not the same!\""
]
},
{
@@ -142,7 +141,16 @@
"source": [
"## Simulation (cppsim) using C++\n",
"\n",
- "When dealing with HLS custom op nodes in FINN the simulation using Python is no longer sufficient. After the nodes have been converted to HLS layers, the simulation using C++ can be used. To do this, the input tensor is stored in a .npy file and C++ code is generated that reads the values from the .npy array, streams them to the corresponding finn-hlslib function and writes the result to a new .npy file. This in turn can be read in Python and processed in the FINN flow. For this example the model after setting the folding factors in the HLS layers is used, please be aware that this is not the full model, but the dataflow partition, so before executing at the end of this section we have to integrate the model back into the parent model."
+ "When dealing with HLS or RTL custom op nodes in FINN the simulation using Python is no longer sufficient. If the nodes are specialized to HLS layers, the simulation using C++ can be used. To do this, the input tensor is stored in a .npy file and C++ code is generated that reads the values from the .npy array, streams them to the corresponding `finn-hlslib` function and writes the result to a new .npy file. This in turn can be read in Python and processed in the FINN flow. For this example, the model after setting the folding factors in the HLS variants of the layers is used. Please be aware that this is not the full model, but the dataflow partition, so before executing at the end of this section we have to integrate the model back into the parent model."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "<div class=\"alert alert-block alert-info\">\n",
+ "<b>Note:</b> HW layers can also be converted to RTL variants; in this case \"cppsim\" is not an option we can execute. If nevertheless \"cppsim\" is selected as execution mode for the layer, the execution defaults to the parent class. Like this, networks with a mix of HLS and RTL layers can be executed using \"cppsim\" for the HLS layers. \n",
+ "</div>"
]
},
{
@@ -159,7 +167,7 @@
"metadata": {},
"source": [
"To generate the code for this simulation and to generate the executable two transformations are used:\n",
- "* `PrepareCppSim` which generates the C++ code for the corresponding hls layer\n",
+ "* `PrepareCppSim` which generates the C++ code for the corresponding HLS layer\n",
"* `CompileCppSim` which compules the C++ code and stores the path to the executable"
]
},
@@ -268,10 +276,11 @@
"output_dict = oxe.execute_onnx(parent_model, input_dict)\n",
"output_cppsim = output_dict[list(output_dict.keys())[0]]\n",
"\n",
- "if np.isclose(output_cppsim, np.where(output_golden[0]==np.amax(output_golden[0])), atol=1e-3).all():\n",
+ "try:\n",
+ " assert np.isclose(output_cppsim, np.where(output_golden[0]==np.amax(output_golden[0])), atol=1e-3).all()\n",
" print(\"Results are the same!\")\n",
- "else:\n",
- " print(\"The results are not the same!\")"
+ "except AssertionError:\n",
+ " assert False, \"The results are not the same!\""
]
},
{
@@ -280,9 +289,9 @@
"source": [
"## Emulation (rtlsim) using PyVerilator\n",
"\n",
- "The emulation using [PyVerilator](https://github.com/maltanar/pyverilator) can be done after IP blocks are generated from the corresponding HLS layers. Pyverilator is a tool which makes it possible to simulate verilog files using verilator via a python interface.\n",
+ "The emulation using [PyVerilator](https://github.com/maltanar/pyverilator) can be done after IP blocks are generated from the corresponding HLS layers or for RTL layers directly using the generated Verilog files. PyVerilator is a tool which makes it possible to simulate Verilog files using Verilator via a Python interface.\n",
"\n",
- "We have two ways to use rtlsim, one is to run the model node-by-node as with the simulation methods, but if the model is in the form of the dataflow partition, the part of the graph that consist of only HLS nodes could also be executed as whole."
+ "We have two ways to use rtlsim: one is to run the model node-by-node as with the simulation methods, but if the model is in the form of the dataflow partition, the part of the graph that consists of only HLS/RTL nodes can also be executed as a whole."
]
},
{
@@ -356,10 +365,11 @@
"output_dict = oxe.execute_onnx(model_for_rtlsim, input_dict)\n",
"output_rtlsim = output_dict[list(output_dict.keys())[0]]\n",
"\n",
- "if np.isclose(output_rtlsim, np.where(output_golden[0]==np.amax(output_golden[0])), atol=1e-3).all():\n",
+ "try:\n",
+ " assert np.isclose(output_rtlsim, np.where(output_golden[0]==np.amax(output_golden[0])), atol=1e-3).all()\n",
" print(\"Results are the same!\")\n",
- "else:\n",
- " print(\"The results are not the same!\")"
+ "except AssertionError:\n",
+ " assert False, \"The results are not the same!\""
]
},
{
@@ -379,18 +389,14 @@
"source": [
"from finn.transformation.fpgadataflow.insert_dwc import InsertDWC\n",
"from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO\n",
+ "from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers\n",
"from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP\n",
"\n",
"child_model = ModelWrapper(build_dir + \"/tfc_w1_a1_dataflow_child.onnx\")\n",
- "child_model = child_model.transform(InsertDWC())\n",
- "\n",
- "# set all impl_styles of the DWCs to hls to enable emulation\n",
- "dwc_nodes = child_model.get_nodes_by_op_type(\"StreamingDataWidthConverter_Batch\")\n",
- "for dwc in dwc_nodes:\n",
- " dwc_inst = getCustomOp(dwc)\n",
- " dwc_inst.set_nodeattr(\"impl_style\", \"hls\")\n",
- " \n",
+ "child_model = child_model.transform(InsertDWC()) \n",
"child_model = child_model.transform(InsertFIFO(create_shallow_fifos=True))\n",
+ "# DWC and FIFOs need to be specialized to either HLS or RTL variants\n",
+ "child_model = child_model.transform(SpecializeLayers())\n",
"child_model.save(build_dir + \"/test.onnx\");\n",
"child_model = child_model.transform(GiveUniqueNodeNames())\n",
"child_model = child_model.transform(PrepareIP(test_fpga_part, target_clk_ns))\n",
@@ -430,10 +436,11 @@
"metadata": {},
"outputs": [],
"source": [
- "if np.isclose(output_rtlsim, np.where(output_golden[0]==np.amax(output_golden[0])), atol=1e-3).all():\n",
+ "try:\n",
+ " assert np.isclose(output_rtlsim, np.where(output_golden[0]==np.amax(output_golden[0])), atol=1e-3).all()\n",
" print(\"Results are the same!\")\n",
- "else:\n",
- " print(\"The results are not the same!\")"
+ "except AssertionError:\n",
+ " assert False, \"The results are not the same!\""
]
}
],
@@ -453,7 +460,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.5"
+ "version": "3.10.12"
}
},
"nbformat": 4,
diff --git a/notebooks/end2end_example/bnn-pynq/verification.png b/notebooks/end2end_example/bnn-pynq/verification.png
deleted file mode 100755
index cb50ba1b67..0000000000
Binary files a/notebooks/end2end_example/bnn-pynq/verification.png and /dev/null differ
diff --git a/notebooks/end2end_example/bnn-pynq/verification.svg b/notebooks/end2end_example/bnn-pynq/verification.svg
new file mode 100755
index 0000000000..9cf8e86088
--- /dev/null
+++ b/notebooks/end2end_example/bnn-pynq/verification.svg
@@ -0,0 +1 @@
+
diff --git a/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb
index 3d77586258..da037050bb 100644
--- a/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb
+++ b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb
@@ -53,7 +53,7 @@
" * [(Option 1) Train the Model from Scratch](#train_scratch)\n",
" * [(Option 2) Load Pre-Trained Parameters](#load_pretrained)\n",
"* [Network Surgery Before Export](#network_surgery)\n",
- "* [Export to FINN-ONNX](#export_finn_onnx)"
+ "* [Export to QONNX and Conversion to FINN-ONNX](#export_qonnx)"
]
},
{
@@ -62,8 +62,11 @@
"metadata": {},
"outputs": [],
"source": [
+ "import os\n",
"import onnx\n",
- "import torch"
+ "import torch\n",
+ "\n",
+ "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/end2end_example/cybersecurity\""
]
},
{
@@ -483,13 +486,14 @@
"metadata": {},
"outputs": [],
"source": [
+ "import os\n",
"import torch\n",
"\n",
"# Make sure the model is on CPU before loading a pretrained state_dict\n",
"model = model.cpu()\n",
"\n",
"# Load pretrained weights\n",
- "trained_state_dict = torch.load(\"state_dict.pth\")[\"models_state_dict\"][0]\n",
+ "trained_state_dict = torch.load(model_dir + \"/state_dict.pth\")[\"models_state_dict\"][0]\n",
"\n",
"model.load_state_dict(trained_state_dict, strict=False)"
]
@@ -663,12 +667,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
- "# Export to FINN-ONNX \n",
+ "# Export to QONNX and Conversion to FINN-ONNX \n",
"\n",
"\n",
"[ONNX](https://onnx.ai/) is an open format built to represent machine learning models, and the FINN compiler expects an ONNX model as input. We'll now export our network into ONNX to be imported and used in FINN for the next notebooks. Note that the particular ONNX representation used for FINN differs from standard ONNX, you can read more about this [here](https://finn.readthedocs.io/en/latest/internals.html#intermediate-representation-finn-onnx).\n",
"\n",
- "You can see below how we export a trained network in Brevitas into a FINN-compatible ONNX representation. Note how we create a `QuantTensor` instance with dummy data to tell Brevitas how our inputs look like, which will be used to set the input quantization annotation on the exported model."
+ "You can see below how we export a trained network in Brevitas into a FINN-compatible ONNX representation (QONNX). QONNX is the format we can export from Brevitas; to feed it into the FINN compiler, we will need to convert it to the FINN-ONNX format, which is the intermediate representation the compiler works on. The conversion to the FINN-ONNX format is a FINN compiler transformation, and to be able to apply it to our model, we will need to wrap it into [ModelWrapper](https://finn.readthedocs.io/en/latest/internals.html#modelwrapper). This is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model. Then we can call the conversion function to obtain the model in FINN-ONNX format."
]
},
{
@@ -677,10 +681,13 @@
"metadata": {},
"outputs": [],
"source": [
- "import brevitas.onnx as bo\n",
- "from brevitas.quant_tensor import QuantTensor\n",
+ "from brevitas.export import export_qonnx\n",
+ "from qonnx.util.cleanup import cleanup as qonnx_cleanup\n",
+ "from qonnx.core.modelwrapper import ModelWrapper\n",
+ "from qonnx.core.datatype import DataType\n",
+ "from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN\n",
"\n",
- "ready_model_filename = \"cybsec-mlp-ready.onnx\"\n",
+ "ready_model_filename = model_dir + \"/cybsec-mlp-ready.onnx\"\n",
"input_shape = (1, 600)\n",
"\n",
"# create a QuantTensor instance to mark input as bipolar during export\n",
@@ -688,18 +695,25 @@
"input_a = 2 * input_a - 1\n",
"scale = 1.0\n",
"input_t = torch.from_numpy(input_a * scale)\n",
- "input_qt = QuantTensor(\n",
- " input_t, scale=torch.tensor(scale), bit_width=torch.tensor(1.0), signed=True\n",
- ")\n",
"\n",
"#Move to CPU before export\n",
"model_for_export.cpu()\n",
"\n",
"# Export to ONNX\n",
- "bo.export_finn_onnx(\n",
- " model_for_export, export_path=ready_model_filename, input_t=input_qt\n",
+ "export_qonnx(\n",
+ " model_for_export, export_path=ready_model_filename, input_t=input_t\n",
")\n",
"\n",
+ "# clean-up\n",
+ "qonnx_cleanup(ready_model_filename, out_file=ready_model_filename)\n",
+ "\n",
+ "# ModelWrapper\n",
+ "model = ModelWrapper(ready_model_filename)\n",
+ "# Setting the input datatype explicitly because it doesn't get derived from the export function\n",
+ "model.set_tensor_datatype(model.graph.input[0].name, DataType[\"BIPOLAR\"])\n",
+ "model = model.transform(ConvertQONNXtoFINN())\n",
+ "model.save(ready_model_filename)\n",
+ "\n",
"print(\"Model saved to %s\" % ready_model_filename)"
]
},
@@ -755,7 +769,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.5"
+ "version": "3.10.12"
}
},
"nbformat": 4,
diff --git a/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb b/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb
index e4848a1f40..33b64e11c0 100644
--- a/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb
+++ b/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb
@@ -62,9 +62,11 @@
"metadata": {},
"outputs": [],
"source": [
+ "import os\n",
"from qonnx.core.modelwrapper import ModelWrapper\n",
"\n",
- "ready_model_filename = \"cybsec-mlp-ready.onnx\"\n",
+ "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/end2end_example/cybersecurity\"\n",
+ "ready_model_filename = model_dir + \"/cybsec-mlp-ready.onnx\"\n",
"model_for_sim = ModelWrapper(ready_model_filename)"
]
},
@@ -151,7 +153,7 @@
"model_for_sim = model_for_sim.transform(InferDataTypes())\n",
"model_for_sim = model_for_sim.transform(RemoveStaticGraphInputs())\n",
"\n",
- "verif_model_filename = \"cybsec-mlp-verification.onnx\"\n",
+ "verif_model_filename = model_dir + \"/cybsec-mlp-verification.onnx\"\n",
"model_for_sim.save(verif_model_filename)"
]
},
@@ -258,7 +260,8 @@
"\n",
"# replace this with your trained network checkpoint if you're not\n",
"# using the pretrained weights\n",
- "trained_state_dict = torch.load(\"state_dict.pth\")[\"models_state_dict\"][0]\n",
+ "trained_state_dict = torch.load(model_dir + \"/state_dict.pth\")[\"models_state_dict\"][0]\n",
+ "\n",
"# Uncomment the following line if you previously chose to train the network yourself\n",
"#trained_state_dict = torch.load(\"state_dict_self-trained.pth\")\n",
"\n",
@@ -365,10 +368,11 @@
"metadata": {},
"outputs": [],
"source": [
- "if ok == n_verification_inputs:\n",
+ "try:\n",
+ " assert ok == n_verification_inputs\n",
" print(\"Verification succeeded. Brevitas and FINN-ONNX execution outputs are identical\")\n",
- "else:\n",
- " print(\"Verification failed. Brevitas and FINN-ONNX execution outputs are NOT identical\")"
+ "except AssertionError:\n",
+ " assert False, \"Verification failed. Brevitas and FINN-ONNX execution outputs are NOT identical\""
]
},
{
@@ -395,7 +399,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.5"
+ "version": "3.10.12"
}
},
"nbformat": 4,
diff --git a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb
index a18cafd604..73cd25cf20 100644
--- a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb
+++ b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb
@@ -115,7 +115,8 @@
"import os\n",
"import shutil\n",
"\n",
- "model_file = \"cybsec-mlp-ready.onnx\"\n",
+ "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/end2end_example/cybersecurity\"\n",
+ "model_file = model_dir + \"/cybsec-mlp-ready.onnx\"\n",
"\n",
"estimates_output_dir = \"output_estimates_only\"\n",
"\n",
@@ -148,6 +149,15 @@
"build.build_dataflow_cfg(model_file, cfg_estimates)"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "assert os.path.exists(estimates_output_dir + \"/report/estimate_network_performance.json\")"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -255,7 +265,7 @@
"\n",
"**Live FINN tutorial:** These next builds will take about 10 minutes to complete since multiple calls to Vivado and a call to RTL simulation are involved. While this is running, you can examine the generated files with noVNC -- it is running on **(your AWS URL):6080/vnc.html**\n",
"\n",
- "* Once the `step_hls_codegen [8/16]` below is completed, you can view the generated HLS code under its own folder for each layer: `/tmp/finn_dev_ubuntu/code_gen_ipgen_MatrixVectorActivation_XXXXXX`\n",
+ "* Once the `step_hls_codegen [8/16]` below is completed, you can view the generated HLS code under its own folder for each layer: `/tmp/finn_dev_ubuntu/code_gen_ipgen_MVAU_hls_XXXXXX`\n",
" \n",
"* Once the `step_create_stitched_ip [11/16]` below is completed, you can view the generated stitched IP in Vivado under `/home/ubuntu/finn/notebooks/end2end_example/cybersecurity/output_ipstitch_ooc_rtlsim/stitched_ip`\n",
" "
@@ -272,7 +282,7 @@
"import os\n",
"import shutil\n",
"\n",
- "model_file = \"cybsec-mlp-ready.onnx\"\n",
+ "model_file = model_dir + \"/cybsec-mlp-ready.onnx\"\n",
"\n",
"rtlsim_output_dir = \"output_ipstitch_ooc_rtlsim\"\n",
"\n",
@@ -305,6 +315,17 @@
"build.build_dataflow_cfg(model_file, cfg_stitched_ip)"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "assert os.path.exists(rtlsim_output_dir + \"/report/ooc_synth_and_timing.json\")\n",
+ "assert os.path.exists(rtlsim_output_dir + \"/report/rtlsim_performance.json\")\n",
+ "assert os.path.exists(rtlsim_output_dir + \"/final_hw_config.json\")"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -412,7 +433,7 @@
"import os\n",
"import shutil\n",
"\n",
- "model_file = \"cybsec-mlp-ready.onnx\"\n",
+ "model_file = model_dir + \"/cybsec-mlp-ready.onnx\"\n",
"\n",
"final_output_dir = \"output_final\"\n",
"\n",
@@ -638,7 +659,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.8.5"
+ "version": "3.10.12"
}
},
"nbformat": 4,
diff --git a/notebooks/end2end_example/cybersecurity/dataloader_quantized.py b/notebooks/end2end_example/cybersecurity/dataloader_quantized.py
index 738811fa72..38505fb6ef 100644
--- a/notebooks/end2end_example/cybersecurity/dataloader_quantized.py
+++ b/notebooks/end2end_example/cybersecurity/dataloader_quantized.py
@@ -48,7 +48,6 @@ def __init__(
onehot=False,
train=True,
):
-
self.dataframe = (
pd.concat([pd.read_csv(file_path_train), pd.read_csv(file_path_test)])
.reset_index()
@@ -77,9 +76,7 @@ def __getitem__(self, index):
data_val = self.data[index][:-1]
return data_val, target
- def dec2bin(
- self, column: pd.Series, number_of_bits: int, left_msb: bool = True
- ) -> pd.Series:
+ def dec2bin(self, column: pd.Series, number_of_bits: int, left_msb: bool = True) -> pd.Series:
"""Convert a decimal pd.Series to binary pd.Series with numbers in their
# base-2 equivalents.
The output is a numpy nd array.
@@ -133,6 +130,7 @@ def integer_encoding(self, df):
def quantize_df(self, df):
"""Quantized the input dataframe. The scaling is done by multiplying
every column by the inverse of the minimum of that column"""
+
# gets the smallest positive number of a vector
def get_min_positive_number(vector):
return vector[vector > 0].min()
@@ -178,24 +176,18 @@ def char_split(s):
column_data = np.clip(
column_data, 0, 4294967295
) # clip due to overflow of uint32 of matlab code
- column_data = self.round_like_matlab_series(
- column_data
- ) # round like matlab
+ column_data = self.round_like_matlab_series(column_data) # round like matlab
column_data = column_data.astype(np.uint32) # cast like matlab
if column == "rate":
column_data.update(pd.Series(dict_correct_rate_values))
python_quantized_df[column] = (
- self.dec2bin(column_data, maxbits, left_msb=False)
- .reshape((-1, 1))
- .flatten()
+ self.dec2bin(column_data, maxbits, left_msb=False).reshape((-1, 1)).flatten()
)
for column in python_quantized_df.columns:
- python_quantized_df[column] = (
- python_quantized_df[column].apply(char_split).values
- )
+ python_quantized_df[column] = python_quantized_df[column].apply(char_split).values
python_quantized_df_separated = pd.DataFrame(
np.column_stack(python_quantized_df.values.T.tolist())
diff --git a/notebooks/end2end_example/cybersecurity/validate-unsw-nb15.py b/notebooks/end2end_example/cybersecurity/validate-unsw-nb15.py
index 0ffb525544..c4570616d2 100644
--- a/notebooks/end2end_example/cybersecurity/validate-unsw-nb15.py
+++ b/notebooks/end2end_example/cybersecurity/validate-unsw-nb15.py
@@ -57,9 +57,7 @@ def make_unsw_nb15_test_batches(bsize, dataset_root):
help='name of bitfile (i.e. "resizer.bit")',
default="../bitfile/finn-accel.bit",
)
- parser.add_argument(
- "--dataset_root", help="dataset root dir for download/reuse", default="."
- )
+ parser.add_argument("--dataset_root", help="dataset root dir for download/reuse", default=".")
# parse arguments
args = parser.parse_args()
bsize = args.batchsize
diff --git a/requirements.txt b/requirements.txt
index 83aad07d72..c2973f9432 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,19 +1,21 @@
bitstring==3.1.7
-clize==4.1.1
+clize==5.0.1
dataclasses-json==0.5.7
-docrep==0.2.7
gspread==3.6.0
-numpy==1.22.0
+importlib-resources==6.1.0
+ipython==8.12.2
+numpy==1.24.1
onnx==1.13.0
onnxoptimizer
-onnxruntime==1.11.1
-pre-commit==2.9.2
+onnxruntime==1.16.1
+pre-commit==3.3.2
protobuf==3.20.3
psutil==5.9.4
-pyscaffold==3.2.1
-scipy==1.5.2
+pyscaffold==4.4
+scipy==1.10.1
setupext-janitor>=1.1.2
-sigtools==2.0.3
-toposort==1.5
+setuptools==68.2.2
+sigtools==4.0.1
+toposort==1.7.0
vcdvcd==1.0.5
wget==3.2
diff --git a/run-docker.sh b/run-docker.sh
index 381be35293..e732492728 100755
--- a/run-docker.sh
+++ b/run-docker.sh
@@ -47,7 +47,7 @@ if [ -z "$FINN_XILINX_PATH" ];then
fi
if [ -z "$FINN_XILINX_VERSION" ];then
- recho "Please set the FINN_XILINX_VERSION to the version of the Xilinx tools to use (e.g. 2020.1)"
+ recho "Please set the FINN_XILINX_VERSION to the version of the Xilinx tools to use (e.g. 2022.2)"
recho "FINN functionality depending on Vivado, Vitis or HLS will not be available."
fi
@@ -86,23 +86,29 @@ SCRIPTPATH=$(dirname "$SCRIPT")
: ${ALVEO_BOARD="U250"}
: ${ALVEO_TARGET_DIR="/tmp"}
: ${PLATFORM_REPO_PATHS="/opt/xilinx/platforms"}
-: ${XRT_DEB_VERSION="xrt_202210.2.13.466_18.04-amd64-xrt"}
+: ${XRT_DEB_VERSION="xrt_202220.2.14.354_22.04-amd64-xrt"}
: ${FINN_HOST_BUILD_DIR="/tmp/$DOCKER_INST_NAME"}
: ${FINN_DOCKER_TAG="xilinx/finn:$(git describe --always --tags --dirty).$XRT_DEB_VERSION"}
: ${FINN_DOCKER_PREBUILT="0"}
: ${FINN_DOCKER_RUN_AS_ROOT="0"}
: ${FINN_DOCKER_GPU="$(docker info | grep nvidia | wc -m)"}
: ${FINN_DOCKER_EXTRA=""}
+: ${FINN_DOCKER_BUILD_EXTRA=""}
: ${FINN_SKIP_DEP_REPOS="0"}
+: ${FINN_SKIP_BOARD_FILES="0"}
: ${OHMYXILINX="${SCRIPTPATH}/deps/oh-my-xilinx"}
: ${NVIDIA_VISIBLE_DEVICES=""}
: ${DOCKER_BUILDKIT="1"}
+: ${FINN_SINGULARITY=""}
DOCKER_INTERACTIVE=""
+# Catch FINN_DOCKER_EXTRA options being passed in without a trailing space
+FINN_DOCKER_EXTRA+=" "
+
if [ "$1" = "test" ]; then
gecho "Running test suite (all tests)"
- DOCKER_CMD="python setup.py test"
+ DOCKER_CMD="pytest"
elif [ "$1" = "quicktest" ]; then
gecho "Running test suite (non-Vivado, non-slow tests)"
DOCKER_CMD="quicktest.sh"
@@ -116,8 +122,10 @@ elif [ "$1" = "notebook" ]; then
DOCKER_CMD="jupyter notebook --allow-root --no-browser --ip=0.0.0.0 --port $JUPYTER_PORT $JUPYTER_PASSWD_ARG notebooks"
FINN_DOCKER_EXTRA+="-e JUPYTER_PORT=$JUPYTER_PORT "
FINN_DOCKER_EXTRA+="-e NETRON_PORT=$NETRON_PORT "
- FINN_DOCKER_EXTRA+="-p $JUPYTER_PORT:$JUPYTER_PORT "
- FINN_DOCKER_EXTRA+="-p $NETRON_PORT:$NETRON_PORT "
+ if [ -z "$FINN_SINGULARITY" ]; then
+ FINN_DOCKER_EXTRA+="-p $JUPYTER_PORT:$JUPYTER_PORT "
+ FINN_DOCKER_EXTRA+="-p $NETRON_PORT:$NETRON_PORT "
+ fi
elif [ "$1" = "build_dataflow" ]; then
BUILD_DATAFLOW_DIR=$(readlink -f "$2")
FINN_DOCKER_EXTRA+="-v $BUILD_DATAFLOW_DIR:$BUILD_DATAFLOW_DIR "
@@ -143,7 +151,7 @@ else
fi
-if [ "$FINN_DOCKER_GPU" != 0 ];then
+if [ "$FINN_DOCKER_GPU" != 0 ] && [ -z "$FINN_SINGULARITY" ];then
gecho "nvidia-docker detected, enabling GPUs"
if [ ! -z "$NVIDIA_VISIBLE_DEVICES" ];then
FINN_DOCKER_EXTRA+="--runtime nvidia -e NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES "
@@ -174,19 +182,18 @@ if [ "$FINN_SKIP_DEP_REPOS" = "0" ]; then
fi
# Build the FINN Docker image
-if [ "$FINN_DOCKER_PREBUILT" = "0" ]; then
+if [ "$FINN_DOCKER_PREBUILT" = "0" ] && [ -z "$FINN_SINGULARITY" ]; then
# Need to ensure this is done within the finn/ root folder:
OLD_PWD=$(pwd)
cd $SCRIPTPATH
- docker build -f docker/Dockerfile.finn --build-arg XRT_DEB_VERSION=$XRT_DEB_VERSION --tag=$FINN_DOCKER_TAG .
+ docker build -f docker/Dockerfile.finn --build-arg XRT_DEB_VERSION=$XRT_DEB_VERSION --tag=$FINN_DOCKER_TAG $FINN_DOCKER_BUILD_EXTRA .
cd $OLD_PWD
fi
# Launch container with current directory mounted
# important to pass the --init flag here for correct Vivado operation, see:
# https://stackoverflow.com/questions/55733058/vivado-synthesis-hangs-in-docker-container-spawned-by-jenkins
-DOCKER_EXEC="docker run -t --rm $DOCKER_INTERACTIVE --tty --init "
-DOCKER_EXEC+="--hostname $DOCKER_INST_NAME "
-DOCKER_EXEC+="-e SHELL=/bin/bash "
+DOCKER_BASE="docker run -t --rm $DOCKER_INTERACTIVE --tty --init --hostname $DOCKER_INST_NAME "
+DOCKER_EXEC="-e SHELL=/bin/bash "
DOCKER_EXEC+="-w $SCRIPTPATH "
DOCKER_EXEC+="-v $SCRIPTPATH:$SCRIPTPATH "
DOCKER_EXEC+="-v $FINN_HOST_BUILD_DIR:$FINN_HOST_BUILD_DIR "
@@ -201,7 +208,10 @@ DOCKER_EXEC+="-e PYNQ_PASSWORD=$PYNQ_PASSWORD "
DOCKER_EXEC+="-e PYNQ_TARGET_DIR=$PYNQ_TARGET_DIR "
DOCKER_EXEC+="-e OHMYXILINX=$OHMYXILINX "
DOCKER_EXEC+="-e NUM_DEFAULT_WORKERS=$NUM_DEFAULT_WORKERS "
-if [ "$FINN_DOCKER_RUN_AS_ROOT" = "0" ];then
+# Workaround for FlexLM issue, see:
+# https://community.flexera.com/t5/InstallAnywhere-Forum/Issues-when-running-Xilinx-tools-or-Other-vendor-tools-in-docker/m-p/245820#M10647
+DOCKER_EXEC+="-e LD_PRELOAD=/lib/x86_64-linux-gnu/libudev.so.1 "
+if [ "$FINN_DOCKER_RUN_AS_ROOT" = "0" ] && [ -z "$FINN_SINGULARITY" ];then
DOCKER_EXEC+="-v /etc/group:/etc/group:ro "
DOCKER_EXEC+="-v /etc/passwd:/etc/passwd:ro "
DOCKER_EXEC+="-v /etc/shadow:/etc/shadow:ro "
@@ -241,6 +251,17 @@ if [ ! -z "$FINN_XILINX_PATH" ];then
fi
fi
DOCKER_EXEC+="$FINN_DOCKER_EXTRA "
-DOCKER_EXEC+="$FINN_DOCKER_TAG $DOCKER_CMD"
-$DOCKER_EXEC
+if [ -z "$FINN_SINGULARITY" ];then
+ CMD_TO_RUN="$DOCKER_BASE $DOCKER_EXEC $FINN_DOCKER_TAG $DOCKER_CMD"
+else
+ SINGULARITY_BASE="singularity exec"
+ # Replace command options for Singularity
+ SINGULARITY_EXEC="${DOCKER_EXEC//"-e "/"--env "}"
+ SINGULARITY_EXEC="${SINGULARITY_EXEC//"-v "/"-B "}"
+ SINGULARITY_EXEC="${SINGULARITY_EXEC//"-w "/"--pwd "}"
+ CMD_TO_RUN="$SINGULARITY_BASE $SINGULARITY_EXEC $FINN_SINGULARITY /usr/local/bin/finn_entrypoint.sh $DOCKER_CMD"
+ gecho "FINN_SINGULARITY is set, launching Singularity container instead of Docker"
+fi
+
+$CMD_TO_RUN
diff --git a/setup.cfg b/setup.cfg
index 1893aa4231..4834011dea 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -34,12 +34,12 @@
name = finn
description = A Framework for Fast, Scalable Quantized Neural Network Inference
author = Yaman Umuroglu
-author-email = yamanu@xilinx.com
+author_email = yamanu@xilinx.com
license = new-bsd
-long-description = file: README.md
-long-description-content-type = text/markdown
+long_description = file: README.md
+long_description_content_type = text/markdown
url = https://xilinx.github.io/finn/
-project-urls =
+project_urls =
Documentation = https://finn.readthedocs.io/
# Change if running only on Windows, Mac or Linux (comma-separated)
platforms = any
@@ -56,8 +56,6 @@ packages = find_namespace:
include_package_data = True
package_dir =
=src
-# DON'T CHANGE THE FOLLOWING LINE! IT WILL BE UPDATED BY PYSCAFFOLD!
-setup_requires = pyscaffold>=3.2a0,<3.3a0
# The usage of test_requires is discouraged, see `Dependency Management` docs
# tests_require = pytest; pytest-cov
# Require a specific Python version, e.g. Python 2.7 or >= 3.4
@@ -81,6 +79,8 @@ docs =
pytest
netron
vcdvcd
+ sphinx==5.0.2
+ sphinx_rtd_theme==0.5.0
torchvision
torch
qonnx@git+https://github.com/fastmachinelearning/qonnx@main#egg=qonnx
@@ -127,6 +127,12 @@ markers =
transform: mark tests that test transformations (before hls layers)
fpgadataflow: mark tests related to hls layers
end2end: mark tests that run the end2end flow
+ notebooks: mark tests that execute all Jupyter notebooks
+ sanity_bnn: mark tests that execute the sanity BNN test
+ bnn_u250: mark tests that execute U250 BNN tests
+ bnn_kv260: mark tests that execute KV260 BNN tests
+ bnn_pynq: mark tests that execute Pynq-Z1 BNN tests
+ bnn_zcu104: mark tests that execute ZCU104 BNN tests
norecursedirs =
dist
build
diff --git a/setup.py b/setup.py
index 8fd781462c..9a06632af1 100644
--- a/setup.py
+++ b/setup.py
@@ -35,17 +35,7 @@
PyScaffold helps you to put up the scaffold of your new Python project.
Learn more under: https://pyscaffold.org/
"""
-from pkg_resources import VersionConflict, require
from setuptools import setup
-import sys
-
-try:
- require("setuptools>=38.3")
-except VersionConflict:
- print("Error: version of setuptools is too old (<38.3)!")
- sys.exit(1)
-
-
if __name__ == "__main__":
- setup(use_pyscaffold=True)
+ setup()
diff --git a/src/finn/analysis/fpgadataflow/dataflow_performance.py b/src/finn/analysis/fpgadataflow/dataflow_performance.py
index 5726702666..a4bf40760e 100644
--- a/src/finn/analysis/fpgadataflow/dataflow_performance.py
+++ b/src/finn/analysis/fpgadataflow/dataflow_performance.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2020, Xilinx
+# Copyright (c) 2020, Xilinx, Inc.
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -28,7 +29,7 @@
from qonnx.custom_op.registry import getCustomOp
-from finn.util.fpgadataflow import is_fpgadataflow_node
+from finn.util.fpgadataflow import is_hls_node, is_rtl_node
def dataflow_performance(model):
@@ -38,7 +39,7 @@ def dataflow_performance(model):
for each node along the critical path.
Preconditions:
- - model consists of fpgadataflow nodes
+ - model consists of HLS/RTL nodes
- model has cycle estimates annotated (see AnnotateCycles transformation)
- nodes have unique names (see GiveUniqueNodeNames)
@@ -52,7 +53,7 @@ def dataflow_performance(model):
max_node_name = ""
for node in model.graph.node:
- if is_fpgadataflow_node(node) is True:
+ if is_hls_node(node) or is_rtl_node(node):
inst = getCustomOp(node)
node_cycles = int(inst.get_nodeattr("cycles_estimate"))
if node_cycles > max_cycles:
@@ -66,9 +67,7 @@ def dataflow_performance(model):
max_pred_latency = 0
else:
# find max of any of predecessors
- pred_latencies = map(
- lambda x: latency_at_node_output[x.name], predecessors
- )
+ pred_latencies = map(lambda x: latency_at_node_output[x.name], predecessors)
max_pred_latency = max(pred_latencies)
latency_at_node_output[node.name] = node_cycles + max_pred_latency
critical_path_cycles = max(latency_at_node_output.values())
diff --git a/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py b/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py
index e1517ec636..50585720fe 100644
--- a/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py
+++ b/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2020, Xilinx
+# Copyright (c) 2020, Xilinx, Inc.
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -28,7 +29,7 @@
import qonnx.custom_op.registry as registry
-from finn.util.fpgadataflow import is_fpgadataflow_node
+from finn.util.fpgadataflow import is_hls_node, is_rtl_node
def exp_cycles_per_layer(model):
@@ -41,7 +42,7 @@ def exp_cycles_per_layer(model):
cycle_dict = {}
for node in model.graph.node:
- if is_fpgadataflow_node(node) is True:
+ if is_hls_node(node) or is_rtl_node(node):
inst = registry.getCustomOp(node)
cycle_dict[node.name] = int(inst.get_exp_cycles())
diff --git a/src/finn/analysis/fpgadataflow/floorplan_params.py b/src/finn/analysis/fpgadataflow/floorplan_params.py
index d57b660bce..be03966fb9 100644
--- a/src/finn/analysis/fpgadataflow/floorplan_params.py
+++ b/src/finn/analysis/fpgadataflow/floorplan_params.py
@@ -1,4 +1,5 @@
# Copyright (c) 2020, Xilinx
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -45,7 +46,7 @@ def floorplan_params(model):
}
}
for node in model.graph.node:
- if is_fpgadataflow_node(node) is True:
+ if is_fpgadataflow_node(node):
node_inst = getCustomOp(node)
node_slr = node_inst.get_nodeattr("slr")
node_pid = node_inst.get_nodeattr("partition_id")
diff --git a/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py b/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py
index 4d921438f6..330494315a 100644
--- a/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py
+++ b/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py
@@ -30,11 +30,12 @@
import warnings
import xml.etree.ElementTree as ET
-from finn.util.fpgadataflow import is_fpgadataflow_node
+from finn.util.fpgadataflow import is_hls_node
def hls_synth_res_estimation(model):
- """Extracts the FPGA resource results from the Vivado HLS synthesis estimates.
+ """Extracts the FPGA resource results from the Vitis HLS synthesis estimates.
+ Note that this analysis pass only works on nodes that have an HLS backend.
Ensure that all nodes have unique names (by calling the GiveUniqueNodeNames
transformation) prior to calling this analysis pass to ensure all nodes are
visible in the results.
@@ -43,7 +44,7 @@ def hls_synth_res_estimation(model):
res_dict = {}
for node in model.graph.node:
- if is_fpgadataflow_node(node) is True:
+ if is_hls_node(node):
# init values to zero
res_dict[node.name] = dict()
res_dict[node.name]["BRAM_18K"] = 0
diff --git a/src/finn/analysis/fpgadataflow/post_synth_res.py b/src/finn/analysis/fpgadataflow/post_synth_res.py
index 8b9c5d2a04..7b65b60fa7 100644
--- a/src/finn/analysis/fpgadataflow/post_synth_res.py
+++ b/src/finn/analysis/fpgadataflow/post_synth_res.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2020, Xilinx
+# Copyright (c) 2020, Xilinx, Inc.
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -31,7 +32,7 @@
from qonnx.core.modelwrapper import ModelWrapper
from qonnx.custom_op.registry import getCustomOp
-from finn.transformation.move_reshape import _is_fpgadataflow_node
+from finn.util.fpgadataflow import is_hls_node, is_rtl_node
def post_synth_res(model, override_synth_report_filename=None):
@@ -85,8 +86,8 @@ def get_instance_stats(inst_name):
row = root.findall(".//*[@contents='%s']/.." % inst_name)
if row != []:
node_dict = {}
- row = row[0].getchildren()
- for (restype, ind) in restype_to_ind.items():
+ row = list(row[0])
+ for restype, ind in restype_to_ind.items():
node_dict[restype] = int(row[ind].attrib["contents"])
return node_dict
else:
@@ -102,7 +103,7 @@ def get_instance_stats(inst_name):
sdp_model = ModelWrapper(getCustomOp(node).get_nodeattr("model"))
sdp_res_dict = post_synth_res(sdp_model, synth_report_filename)
res_dict.update(sdp_res_dict)
- elif _is_fpgadataflow_node(node):
+ elif is_hls_node(node) or is_rtl_node(node):
node_dict = get_instance_stats(node.name)
if node_dict is not None:
res_dict[node.name] = node_dict
diff --git a/src/finn/analysis/fpgadataflow/res_estimation.py b/src/finn/analysis/fpgadataflow/res_estimation.py
index 406496bc0e..a6be1f1f53 100644
--- a/src/finn/analysis/fpgadataflow/res_estimation.py
+++ b/src/finn/analysis/fpgadataflow/res_estimation.py
@@ -28,7 +28,7 @@
import qonnx.custom_op.registry as registry
-from finn.util.fpgadataflow import is_fpgadataflow_node
+from finn.util.fpgadataflow import is_hls_node, is_rtl_node
def res_estimation(model):
@@ -41,7 +41,7 @@ def res_estimation(model):
res_dict = {}
for node in model.graph.node:
- if is_fpgadataflow_node(node) is True:
+ if is_hls_node(node) or is_rtl_node(node):
inst = registry.getCustomOp(node)
res_dict[node.name] = inst.node_res_estimation()
@@ -59,13 +59,10 @@ def res_estimation_complete(model):
res_dict = {}
for node in model.graph.node:
- if is_fpgadataflow_node(node) is True:
- op_type = node.op_type
+ if is_hls_node(node) or is_rtl_node(node):
inst = registry.getCustomOp(node)
- if (
- op_type == "MatrixVectorActivation"
- or op_type == "VectorVectorActivation"
- ):
+ op_type = node.op_type
+ if op_type.startswith("MVAU") or op_type.startswith("VVAU"):
orig_restype = inst.get_nodeattr("resType")
res_dict[node.name] = []
inst.set_nodeattr("resType", "dsp")
@@ -73,7 +70,7 @@ def res_estimation_complete(model):
inst.set_nodeattr("resType", "lut")
res_dict[node.name].append(inst.node_res_estimation())
inst.set_nodeattr("resType", orig_restype)
- elif op_type == "ConvolutionInputGenerator":
+ elif op_type.startswith("ConvolutionInputGenerator"):
orig_ramstyle = inst.get_nodeattr("ram_style")
res_dict[node.name] = []
inst.set_nodeattr("ram_style", "block")
diff --git a/src/finn/builder/build_dataflow.py b/src/finn/builder/build_dataflow.py
index d6864994a7..284cd2baa3 100644
--- a/src/finn/builder/build_dataflow.py
+++ b/src/finn/builder/build_dataflow.py
@@ -91,12 +91,8 @@ def resolve_build_steps(cfg: DataflowBuildConfig, partial: bool = True):
return steps_as_fxns
-def resolve_step_filename(
- step_name: str, cfg: DataflowBuildConfig, step_delta: int = 0
-):
- step_names = list(
- map(lambda x: x.__name__, resolve_build_steps(cfg, partial=False))
- )
+def resolve_step_filename(step_name: str, cfg: DataflowBuildConfig, step_delta: int = 0):
+ step_names = list(map(lambda x: x.__name__, resolve_build_steps(cfg, partial=False)))
assert step_name in step_names, "start_step %s not found" + step_name
step_no = step_names.index(step_name) + step_delta
assert step_no >= 0, "Invalid step+delta combination"
@@ -150,19 +146,13 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig):
for transform_step in build_dataflow_steps:
try:
step_name = transform_step.__name__
- print(
- "Running step: %s [%d/%d]"
- % (step_name, step_num, len(build_dataflow_steps))
- )
+ print("Running step: %s [%d/%d]" % (step_name, step_num, len(build_dataflow_steps)))
# redirect output to logfile
if not cfg.verbose:
sys.stdout = stdout_logger
sys.stderr = stderr_logger
# also log current step name to logfile
- print(
- "Running step: %s [%d/%d]"
- % (step_name, step_num, len(build_dataflow_steps))
- )
+ print("Running step: %s [%d/%d]" % (step_name, step_num, len(build_dataflow_steps)))
# run the step
step_start = time.time()
model = transform_step(model, cfg)
diff --git a/src/finn/builder/build_dataflow_config.py b/src/finn/builder/build_dataflow_config.py
index a38cb6e572..e35c1cd346 100644
--- a/src/finn/builder/build_dataflow_config.py
+++ b/src/finn/builder/build_dataflow_config.py
@@ -1,4 +1,5 @@
# Copyright (c) 2020 Xilinx, Inc.
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -64,15 +65,6 @@ class DataflowOutputType(str, Enum):
DEPLOYMENT_PACKAGE = "deployment_package"
-class ComputeEngineMemMode(str, Enum):
- """Memory mode for generated compute engines. See
- https://finn.readthedocs.io/en/latest/internals.html#matrixvectoractivation-mem-mode
- for more information."""
-
- CONST = "const"
- DECOUPLED = "decoupled"
-
-
class VitisOptStrategyCfg(str, Enum):
"""Vitis optimization strategy with serializable string enum values."""
@@ -115,13 +107,15 @@ class VerificationStepType(str, Enum):
"step_qonnx_to_finn",
"step_tidy_up",
"step_streamline",
- "step_convert_to_hls",
+ "step_convert_to_hw",
"step_create_dataflow_partition",
+ "step_specialize_layers",
"step_target_fps_parallelization",
"step_apply_folding_config",
+ "step_minimize_bit_width",
"step_generate_estimate_reports",
- "step_hls_codegen",
- "step_hls_ipgen",
+ "step_hw_codegen",
+ "step_hw_ipgen",
"step_set_fifo_depths",
"step_create_stitched_ip",
"step_measure_rtlsim_performance",
@@ -136,16 +130,18 @@ class VerificationStepType(str, Enum):
"step_qonnx_to_finn",
"step_tidy_up",
"step_streamline",
- "step_convert_to_hls",
+ "step_convert_to_hw",
"step_create_dataflow_partition",
+ "step_specialize_layers",
"step_target_fps_parallelization",
"step_apply_folding_config",
+ "step_minimize_bit_width",
"step_generate_estimate_reports",
]
-#: List of steps to run for a dataflow build including HLS code generation, but
+#: List of steps to run for a dataflow build including HW code generation, but
#: without any synthesis.
-hls_codegen_dataflow_steps = estimate_only_dataflow_steps + ["step_hls_codegen"]
+hw_codegen_dataflow_steps = estimate_only_dataflow_steps + ["step_hw_codegen"]
@dataclass_json
@@ -168,6 +164,14 @@ class DataflowBuildConfig:
#: DataflowOutputType for available options.
generate_outputs: List[DataflowOutputType]
+ #: (Optional) Path to configuration JSON file in which user can specify
+ #: a preferred implementation style (HLS or RTL) for each node.
+ #: The SpecializeLayers transformation picks up these settings and if possible
+ #: fulfills the desired implementation style for each layer by converting the
+ #: node into its HLS or RTL variant.
+ #: Will be applied with :py:mod:`qonnx.transformation.general.ApplyConfig`
+ specialize_layers_config_file: Optional[str] = None
+
#: (Optional) Path to configuration JSON file. May include parallelization,
#: FIFO sizes, RAM and implementation style attributes and so on.
#: If the parallelization attributes (PE, SIMD) are part of the config,
@@ -228,11 +232,17 @@ class DataflowBuildConfig:
mvau_wwidth_max: Optional[int] = 36
#: (Optional) Whether thresholding layers (which implement quantized
- #: activations in FINN) will be implemented as stand-alone HLS layers,
+ #: activations in FINN) will be implemented as stand-alone HW layers,
#: instead of being part of MatrixVectorActivation layer. This gives larger
#: flexibility, and makes it possible to have runtime-writable thresholds.
standalone_thresholds: Optional[bool] = False
+ #: (Optional) Whether optimizations that minimize the bit width of the
+ #: weights and accumulator will be applied. Because this optimization relies
+ #: on the values of the weights, it will only be applied if runtime-
+ #: writeable weights is not enabled.
+ minimize_bit_width: Optional[bool] = True
+
#: Target board, only needed for generating full bitfiles where the FINN
#: design is integrated into a shell.
#: e.g. "Pynq-Z1" or "U250"
@@ -259,9 +269,7 @@ class DataflowBuildConfig:
#: When `auto_fifo_depths = True`, select which method will be used for
#: setting the FIFO sizes.
- auto_fifo_strategy: Optional[
- AutoFIFOSizingMethod
- ] = AutoFIFOSizingMethod.LARGEFIFO_RTLSIM
+ auto_fifo_strategy: Optional[AutoFIFOSizingMethod] = AutoFIFOSizingMethod.LARGEFIFO_RTLSIM
#: Avoid using C++ rtlsim for auto FIFO sizing and rtlsim throughput test
#: if set to True, always using Python instead
@@ -271,17 +279,14 @@ class DataflowBuildConfig:
#: Only relevant when `auto_fifo_depths = True`
large_fifo_mem_style: Optional[LargeFIFOMemStyle] = LargeFIFOMemStyle.AUTO
- #: Target clock frequency (in nanoseconds) for Vivado HLS synthesis.
+ #: Target clock frequency (in nanoseconds) for Vitis HLS synthesis.
#: e.g. `hls_clk_period_ns=5.0` will target a 200 MHz clock.
#: If not specified it will default to synth_clk_period_ns
hls_clk_period_ns: Optional[float] = None
- #: Which memory mode will be used for compute layers
- default_mem_mode: Optional[ComputeEngineMemMode] = ComputeEngineMemMode.DECOUPLED
-
- #: Force inference of RTL ConvolutionInputGenerator over HLS implementation
- #: If set to False, falls back to the default behavior of InferConvInpGen()
- force_rtl_conv_inp_gen: Optional[bool] = False
+ #: Call CapConvolutionFIFODepths in InsertAndSetFIFODepths transform
+ #: to make convolution FIFOs smaller where appropriate
+ default_swg_exception: Optional[bool] = False
#: Which Vitis platform will be used.
#: Only relevant when `shell_flow_type = ShellFlowType.VITIS_ALVEO`
@@ -341,8 +346,8 @@ class DataflowBuildConfig:
#: Override the number of inputs for rtlsim performance measurement.
rtlsim_batch_size: Optional[int] = 1
- #: If set to True, FIFOs and DWCs with impl_style=vivado will be kept during
- #: rtlsim, otherwise they will be replaced by HLS implementations.
+ #: If set to True, FIFOs with impl_style=vivado will be kept during
+ #: rtlsim, otherwise they will be replaced by RTL implementations.
rtlsim_use_vivado_comps: Optional[bool] = True
def _resolve_hls_clk_period(self):
@@ -358,9 +363,7 @@ def _resolve_driver_platform(self):
elif self.shell_flow_type == ShellFlowType.VITIS_ALVEO:
return "alveo"
else:
- raise Exception(
- "Couldn't resolve driver platform for " + str(self.shell_flow_type)
- )
+ raise Exception("Couldn't resolve driver platform for " + str(self.shell_flow_type))
def _resolve_fpga_part(self):
if self.fpga_part is None:
@@ -402,8 +405,7 @@ def _resolve_vitis_platform(self):
return alveo_default_platform[self.board]
else:
raise Exception(
- "Could not resolve Vitis platform:"
- " need either board or vitis_platform specified"
+ "Could not resolve Vitis platform:" " need either board or vitis_platform specified"
)
def _resolve_verification_steps(self):
@@ -421,8 +423,7 @@ def _resolve_verification_io_pair(self):
)
verify_input_npy = np.load(self.verify_input_npy)
assert os.path.isfile(self.verify_expected_output_npy), (
- "verify_expected_output_npy not found: "
- + self.verify_expected_output_npy
+ "verify_expected_output_npy not found: " + self.verify_expected_output_npy
)
verify_expected_output_npy = np.load(self.verify_expected_output_npy)
return (
diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py
index 2ee898bc7d..443d2df54c 100644
--- a/src/finn/builder/build_dataflow_steps.py
+++ b/src/finn/builder/build_dataflow_steps.py
@@ -1,4 +1,5 @@
# Copyright (c) 2020 Xilinx, Inc.
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -52,7 +53,7 @@
from qonnx.util.config import extract_model_config_to_json
from shutil import copy
-import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls
+import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw
import finn.transformation.streamline.absorb as absorb
from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance
from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer
@@ -89,6 +90,12 @@
from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO
from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver
from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild
+from finn.transformation.fpgadataflow.minimize_accumulator_width import (
+ MinimizeAccumulatorWidth,
+)
+from finn.transformation.fpgadataflow.minimize_weight_bit_width import (
+ MinimizeWeightBitWidth,
+)
from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim
from finn.transformation.fpgadataflow.prepare_ip import PrepareIP
from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim
@@ -102,6 +109,7 @@
SplitLargeFIFOs,
)
from finn.transformation.fpgadataflow.set_folding import SetFolding
+from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers
from finn.transformation.fpgadataflow.synth_ooc import SynthOutOfContext
from finn.transformation.fpgadataflow.vitis_build import VitisBuild
from finn.transformation.move_reshape import RemoveCNVtoFCFlatten
@@ -139,9 +147,7 @@ def verify_step(
in_npy = np.expand_dims(in_npy_all[b], axis=0)
exp_out_npy = np.expand_dims(exp_out_npy_all[b], axis=0)
if need_parent:
- assert (
- cfg.save_intermediate_models
- ), "Enable save_intermediate_models for verification"
+ assert cfg.save_intermediate_models, "Enable save_intermediate_models for verification"
parent_model_fn = intermediate_models_dir + "/dataflow_parent.onnx"
child_model_fn = intermediate_models_dir + "/verify_%s.onnx" % step_name
model.save(child_model_fn)
@@ -155,9 +161,7 @@ def verify_step(
)
print("Attempting to force model shape on verification input")
in_npy = in_npy.reshape(exp_ishape)
- out_dict = execute_parent(
- parent_model_fn, child_model_fn, in_npy, return_full_ctx=True
- )
+ out_dict = execute_parent(parent_model_fn, child_model_fn, in_npy, return_full_ctx=True)
out_npy = out_dict[out_tensor_name]
else:
inp_tensor_name = model.graph.input[0].name
@@ -214,25 +218,15 @@ def verify_step(
def prepare_for_stitched_ip_rtlsim(verify_model, cfg):
if not cfg.rtlsim_use_vivado_comps:
need_restitch = False
- # switch impl_style=vivado components to rtl/hls
+ # switch impl_style=vivado components to rtl
# StreamingFIFO must have impl_style=rtl
- for fifo_layer in verify_model.get_nodes_by_op_type("StreamingFIFO"):
+ for fifo_layer in verify_model.get_nodes_by_op_type("StreamingFIFO_rtl"):
inst = getCustomOp(fifo_layer)
if inst.get_nodeattr("impl_style") != "rtl":
inst.set_nodeattr("impl_style", "rtl")
inst.set_nodeattr("code_gen_dir_ipgen", "")
inst.set_nodeattr("ipgen_path", "")
need_restitch = True
- # StreamingDataWidthConverter must have impl_style=hls
- for dwc_layer in verify_model.get_nodes_by_op_type(
- "StreamingDataWidthConverter_Batch"
- ):
- inst = getCustomOp(dwc_layer)
- if inst.get_nodeattr("impl_style") != "hls":
- inst.set_nodeattr("impl_style", "hls")
- inst.set_nodeattr("code_gen_dir_ipgen", "")
- inst.set_nodeattr("ipgen_path", "")
- need_restitch = True
# if we've made alterations to the model, need to do some re-prep
if need_restitch:
print("Need to regen/re-stitch some IP for STITCHED_IP_RTLSIM")
@@ -336,48 +330,46 @@ def step_streamline(model: ModelWrapper, cfg: DataflowBuildConfig):
return model
-def step_convert_to_hls(model: ModelWrapper, cfg: DataflowBuildConfig):
- """Convert eligible nodes to `HLSCustomOp` subclasses that represent HLS
- layers. Which nodes and particular configurations can be converted to HLS
- is limited, see the source code of the `convert_to_hls` module for more."""
+def step_convert_to_hw(model: ModelWrapper, cfg: DataflowBuildConfig):
+ """Convert eligible nodes to `HWCustomOp` subclasses that represent HW
+ layers. Which nodes and particular configurations can be converted to HW
+ is limited, see the source code of the `convert_to_hw` module for more.
+ In the end an empty JSON file is created which can be used to set user-specific
+ preferred implementation styles for each node."""
- mem_mode = cfg.default_mem_mode.value
if cfg.standalone_thresholds:
# doing this first causes all threshold layers to be standalone
- model = model.transform(to_hls.InferThresholdingLayer())
+ model = model.transform(to_hw.InferThresholdingLayer())
# needed for bipolar MatMul layers
- model = model.transform(to_hls.InferBinaryMatrixVectorActivation(mem_mode))
+ model = model.transform(to_hw.InferBinaryMatrixVectorActivation())
# needed for non-bipolar MatMul layers
- model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode))
+ model = model.transform(to_hw.InferQuantizedMatrixVectorActivation())
# TopK to LabelSelect
- model = model.transform(to_hls.InferLabelSelectLayer())
+ model = model.transform(to_hw.InferLabelSelectLayer())
# input quantization (if any) as standalone threshold
- model = model.transform(to_hls.InferThresholdingLayer())
+ model = model.transform(to_hw.InferThresholdingLayer())
# needed for convolutions -- TODO always exec?
need_conv = len(model.get_nodes_by_op_type("Im2Col")) > 0
if need_conv:
- if cfg.force_rtl_conv_inp_gen:
- model = model.transform(to_hls.InferConvInpGen(use_rtl_variant=True))
- else:
- model = model.transform(to_hls.InferConvInpGen())
- model = model.transform(to_hls.InferStreamingMaxPool())
+ model = model.transform(to_hw.InferConvInpGen())
+ model = model.transform(to_hw.InferStreamingMaxPool())
model = model.transform(RemoveCNVtoFCFlatten())
# get rid of Tranpose -> Tranpose identity seq
model = model.transform(absorb.AbsorbConsecutiveTransposes())
model = model.transform(GiveUniqueNodeNames())
model = model.transform(InferDataLayouts())
+
return model
def step_create_dataflow_partition(model: ModelWrapper, cfg: DataflowBuildConfig):
- """Separate consecutive groups of HLSCustomOp nodes into StreamingDataflowPartition
+ """Separate consecutive groups of HWCustomOp nodes into StreamingDataflowPartition
nodes, which point to a separate ONNX file. Dataflow accelerator synthesis
- can only be performed on those HLSCustomOp sub-graphs."""
+ can only be performed on those HWCustomOp sub-graphs."""
parent_model = model.transform(
CreateDataflowPartition(
- partition_model_dir=cfg.output_dir
- + "/intermediate_models/supported_op_partitions"
+ partition_model_dir=cfg.output_dir + "/intermediate_models/supported_op_partitions"
)
)
sdp_nodes = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")
@@ -388,6 +380,31 @@ def step_create_dataflow_partition(model: ModelWrapper, cfg: DataflowBuildConfig
if cfg.save_intermediate_models:
parent_model.save(cfg.output_dir + "/intermediate_models/dataflow_parent.onnx")
model = ModelWrapper(dataflow_model_filename)
+
+ # create a configuration json file that can be used to set the specialize layer config
+ attrs = [
+ "preferred_impl_style",
+ ]
+ extract_model_config_to_json(
+ model, cfg.output_dir + "/template_specialize_layers_config.json", attrs
+ )
+
+ return model
+
+
+def step_specialize_layers(model: ModelWrapper, cfg: DataflowBuildConfig):
+ """Convert HW nodes to either an HLS or RTL variant of the node. HW nodes
+ get converted either based on pre-determined rules (details can be found
+ in `specialize_layers` source code) or the user provides a configuration file
+ which contains the desired setting. If the user preference cannot be fulfilled,
+ a warning will be printed and the implementation style will be set to a default."""
+
+ if cfg.specialize_layers_config_file is not None:
+ model = model.transform(GiveUniqueNodeNames())
+ model = model.transform(ApplyConfig(cfg.specialize_layers_config_file))
+ model = model.transform(SpecializeLayers(cfg._resolve_fpga_part()))
+ model = model.transform(InferShapes())
+ model = model.transform(InferDataTypes())
return model
@@ -410,14 +427,15 @@ def step_target_fps_parallelization(model: ModelWrapper, cfg: DataflowBuildConfi
hw_attrs = [
"PE",
"SIMD",
+ "parallel_window",
"ram_style",
"resType",
"mem_mode",
"runtime_writeable_weights",
+ "depth_trigger_uram",
+ "depth_trigger_bram",
]
- extract_model_config_to_json(
- model, cfg.output_dir + "/auto_folding_config.json", hw_attrs
- )
+ extract_model_config_to_json(model, cfg.output_dir + "/auto_folding_config.json", hw_attrs)
return model
@@ -452,9 +470,7 @@ def step_generate_estimate_reports(model: ModelWrapper, cfg: DataflowBuildConfig
with open(report_dir + "/estimate_layer_cycles.json", "w") as f:
json.dump(estimate_layer_cycles, f, indent=2)
estimate_layer_resources = model.analysis(res_estimation)
- estimate_layer_resources["total"] = aggregate_dict_keys(
- estimate_layer_resources
- )
+ estimate_layer_resources["total"] = aggregate_dict_keys(estimate_layer_resources)
with open(report_dir + "/estimate_layer_resources.json", "w") as f:
json.dump(estimate_layer_resources, f, indent=2)
estimate_layer_resources_complete = model.analysis(res_estimation_complete)
@@ -468,8 +484,7 @@ def step_generate_estimate_reports(model: ModelWrapper, cfg: DataflowBuildConfig
est_fps = n_clock_cycles_per_sec / estimate_network_performance["max_cycles"]
estimate_network_performance["estimated_throughput_fps"] = est_fps
est_latency_ns = (
- estimate_network_performance["critical_path_cycles"]
- * cfg.synth_clk_period_ns
+ estimate_network_performance["critical_path_cycles"] * cfg.synth_clk_period_ns
)
estimate_network_performance["estimated_latency_ns"] = est_latency_ns
with open(report_dir + "/estimate_network_performance.json", "w") as f:
@@ -477,18 +492,27 @@ def step_generate_estimate_reports(model: ModelWrapper, cfg: DataflowBuildConfig
return model
-def step_hls_codegen(model: ModelWrapper, cfg: DataflowBuildConfig):
- "Generate Vivado HLS code to prepare HLSCustomOp nodes for IP generation."
+def step_minimize_bit_width(model: ModelWrapper, cfg: DataflowBuildConfig):
+ """Tighten the weight and accumulator bit widths for each layer."""
+ if cfg.minimize_bit_width:
+ model = model.transform(MinimizeWeightBitWidth())
+ model = model.transform(MinimizeAccumulatorWidth())
+ # make sure the changed datatypes are propagated through the network
+ model = model.transform(InferDataTypes())
+ return model
- model = model.transform(
- PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())
- )
+
+def step_hw_codegen(model: ModelWrapper, cfg: DataflowBuildConfig):
+ """Generate Vitis HLS code to prepare HLSBackend nodes for IP generation.
+ Also fill in the RTL templates for RTLBackend nodes."""
+
+ model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()))
return model
-def step_hls_ipgen(model: ModelWrapper, cfg: DataflowBuildConfig):
- """Run Vivado HLS synthesis on generated code for HLSCustomOp nodes,
- in order to generate IP blocks."""
+def step_hw_ipgen(model: ModelWrapper, cfg: DataflowBuildConfig):
+ """Run Vitis HLS synthesis on generated code for HLSBackend nodes,
+ in order to generate IP blocks. For RTL nodes this step does not do anything."""
model = model.transform(HLSSynthIP())
model = model.transform(ReplaceVerilogRelPaths())
@@ -516,6 +540,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
if cfg.auto_fifo_depths:
if cfg.auto_fifo_strategy == "characterize":
model = model.transform(InsertDWC())
+ model = model.transform(SpecializeLayers())
model = model.transform(GiveUniqueNodeNames())
model = model.transform(
PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())
@@ -533,6 +558,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
create_shallow_fifos=True,
)
)
+ model = model.transform(SpecializeLayers())
model = model.transform(GiveUniqueNodeNames())
model = model.transform(GiveReadableTensorNames())
elif cfg.auto_fifo_strategy == "largefifo_rtlsim":
@@ -548,6 +574,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
InsertAndSetFIFODepths(
cfg._resolve_fpga_part(),
cfg._resolve_hls_clk_period(),
+ swg_exception=cfg.default_swg_exception,
vivado_ram_style=cfg.large_fifo_mem_style,
force_python_sim=force_python_sim,
)
@@ -563,6 +590,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
# need to make sure all FIFOs are created so that their depth can be
# set by ApplyConfig, so create_shallow_fifos=True
model = model.transform(InsertFIFO(create_shallow_fifos=True))
+ model = model.transform(SpecializeLayers())
model = model.transform(GiveUniqueNodeNames())
model = model.transform(GiveReadableTensorNames())
if cfg.folding_config_file is not None:
@@ -572,6 +600,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
hw_attrs = [
"PE",
"SIMD",
+ "parallel_window",
"ram_style",
"depth",
"impl_style",
@@ -580,10 +609,10 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
"runtime_writeable_weights",
"inFIFODepths",
"outFIFODepths",
+ "depth_trigger_uram",
+ "depth_trigger_bram",
]
- extract_model_config_to_json(
- model, cfg.output_dir + "/final_hw_config.json", hw_attrs
- )
+ extract_model_config_to_json(model, cfg.output_dir + "/final_hw_config.json", hw_attrs)
# perform FIFO splitting and shallow FIFO removal only after the final config
# json file has been written. otherwise, since these transforms may add/remove
@@ -594,9 +623,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig):
# after FIFOs are ready to go, call PrepareIP and HLSSynthIP again
# this will only run for the new nodes (e.g. FIFOs and DWCs)
- model = model.transform(
- PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())
- )
+ model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()))
model = model.transform(HLSSynthIP())
return model
@@ -633,9 +660,7 @@ def step_create_stitched_ip(model: ModelWrapper, cfg: DataflowBuildConfig):
if cfg.verify_save_rtlsim_waveforms:
report_dir = cfg.output_dir + "/report"
os.makedirs(report_dir, exist_ok=True)
- verify_model.set_metadata_prop(
- "rtlsim_trace", "%s/verify_rtlsim.vcd" % (report_dir)
- )
+ verify_model.set_metadata_prop("rtlsim_trace", "%s/verify_rtlsim.vcd" % (report_dir))
verify_step(verify_model, cfg, "stitched_ip_rtlsim", need_parent=True)
os.environ["LIVENESS_THRESHOLD"] = str(prev_liveness)
return model
@@ -656,9 +681,7 @@ def step_measure_rtlsim_performance(model: ModelWrapper, cfg: DataflowBuildConfi
rtlsim_model = deepcopy(model)
rtlsim_model = prepare_for_stitched_ip_rtlsim(rtlsim_model, cfg)
# multi-in/out streams currently not supported in our C++ verilator driver
- model_multi_io = (
- len(rtlsim_model.graph.input) > 1 or len(rtlsim_model.graph.output) > 1
- )
+ model_multi_io = len(rtlsim_model.graph.input) > 1 or len(rtlsim_model.graph.output) > 1
force_python_rtlsim = cfg.force_python_rtlsim or model_multi_io
if model_multi_io:
warnings.warn(
@@ -668,7 +691,6 @@ def step_measure_rtlsim_performance(model: ModelWrapper, cfg: DataflowBuildConfi
rtlsim_bs = int(cfg.rtlsim_batch_size)
orig_rtlsim_trace_depth = get_rtlsim_trace_depth()
if force_python_rtlsim:
- # run with single input to get latency
assert rtlsim_bs > 0, "rtlsim batch size must be >0"
if cfg.verify_save_rtlsim_waveforms:
# set depth to 3 for layer-by-layer visibility
@@ -677,12 +699,12 @@ def step_measure_rtlsim_performance(model: ModelWrapper, cfg: DataflowBuildConfi
"rtlsim_trace",
"%s/rtlsim_perf_batch_%d.vcd" % (report_dir, rtlsim_bs),
)
- rtlsim_model.set_metadata_prop(
- "extra_verilator_args", str(["-CFLAGS", "-O3"])
- )
+ rtlsim_model.set_metadata_prop("extra_verilator_args", str(["-CFLAGS", "-O3"]))
+ # run with single input to get latency
+ rtlsim_latency_dict = throughput_test_rtlsim(rtlsim_model, 1)
+ # run with batch to get stable-state throughput
rtlsim_perf_dict = throughput_test_rtlsim(rtlsim_model, rtlsim_bs)
- rtlsim_latency = rtlsim_perf_dict["cycles"]
- rtlsim_perf_dict["latency_cycles"] = rtlsim_latency
+ rtlsim_perf_dict["latency_cycles"] = rtlsim_latency_dict["cycles"]
else:
rtlsim_perf_dict = verilator_fifosim(model, rtlsim_bs)
# keep keys consistent between the Python and C++-styles
@@ -693,9 +715,22 @@ def step_measure_rtlsim_performance(model: ModelWrapper, cfg: DataflowBuildConfi
rtlsim_perf_dict["runtime[ms]"] = runtime_s * 1000
rtlsim_perf_dict["throughput[images/s]"] = rtlsim_bs / runtime_s
rtlsim_perf_dict["fclk[mhz]"] = fclk_mhz
- for (key, val) in rtlsim_perf_dict.items():
+ for key, val in rtlsim_perf_dict.items():
if "max_count" in key:
del rtlsim_perf_dict[key]
+ # estimate stable-state throughput based on latency+throughput
+ if rtlsim_bs == 1:
+ rtlsim_perf_dict["stable_throughput[images/s]"] = rtlsim_perf_dict[
+ "throughput[images/s]"
+ ]
+ else:
+ total_cycles = rtlsim_perf_dict["cycles"]
+ latency_cycles = rtlsim_perf_dict["latency_cycles"]
+ stablestate_cycles = total_cycles - latency_cycles
+ clk_ns = float(model.get_metadata_prop("clk_ns"))
+ fclk_mhz = 1 / (clk_ns * 0.001)
+ runtime_s = (stablestate_cycles * clk_ns) * (10**-9)
+ rtlsim_perf_dict["stable_throughput[images/s]"] = rtlsim_bs / runtime_s
with open(report_dir + "/rtlsim_performance.json", "w") as f:
json.dump(rtlsim_perf_dict, f, indent=2)
@@ -722,13 +757,9 @@ def step_out_of_context_synthesis(model: ModelWrapper, cfg: DataflowBuildConfig)
"""Run out-of-context synthesis and generate reports.
Depends on the DataflowOutputType.STITCHED_IP output product."""
if DataflowOutputType.OOC_SYNTH in cfg.generate_outputs:
- assert (
- DataflowOutputType.STITCHED_IP in cfg.generate_outputs
- ), "OOC needs stitched IP"
+ assert DataflowOutputType.STITCHED_IP in cfg.generate_outputs, "OOC needs stitched IP"
model = model.transform(
- SynthOutOfContext(
- part=cfg._resolve_fpga_part(), clk_period_ns=cfg.synth_clk_period_ns
- )
+ SynthOutOfContext(part=cfg._resolve_fpga_part(), clk_period_ns=cfg.synth_clk_period_ns)
)
report_dir = cfg.output_dir + "/report"
os.makedirs(report_dir, exist_ok=True)
@@ -819,13 +850,15 @@ def step_deployment_package(model: ModelWrapper, cfg: DataflowBuildConfig):
"step_qonnx_to_finn": step_qonnx_to_finn,
"step_tidy_up": step_tidy_up,
"step_streamline": step_streamline,
- "step_convert_to_hls": step_convert_to_hls,
+ "step_convert_to_hw": step_convert_to_hw,
+ "step_specialize_layers": step_specialize_layers,
"step_create_dataflow_partition": step_create_dataflow_partition,
"step_target_fps_parallelization": step_target_fps_parallelization,
"step_apply_folding_config": step_apply_folding_config,
+ "step_minimize_bit_width": step_minimize_bit_width,
"step_generate_estimate_reports": step_generate_estimate_reports,
- "step_hls_codegen": step_hls_codegen,
- "step_hls_ipgen": step_hls_ipgen,
+ "step_hw_codegen": step_hw_codegen,
+ "step_hw_ipgen": step_hw_ipgen,
"step_set_fifo_depths": step_set_fifo_depths,
"step_create_stitched_ip": step_create_stitched_ip,
"step_measure_rtlsim_performance": step_measure_rtlsim_performance,
diff --git a/src/finn/core/onnx_exec.py b/src/finn/core/onnx_exec.py
index 2695113661..588e97e9e4 100644
--- a/src/finn/core/onnx_exec.py
+++ b/src/finn/core/onnx_exec.py
@@ -31,13 +31,10 @@
import qonnx.analysis.topology as ta
from qonnx.core.onnx_exec import execute_onnx as execute_onnx_base
-from finn.core.remote_exec import remote_exec
from finn.core.rtlsim_exec import rtlsim_exec
-def execute_onnx(
- model, input_dict, return_full_exec_context=False, start_node=None, end_node=None
-):
+def execute_onnx(model, input_dict, return_full_exec_context=False, start_node=None, end_node=None):
"""Executes given ONNX ModelWrapper with given named inputs.
If return_full_exec_context is False, a dict of named outputs is returned
as indicated by the model.graph.output.
@@ -51,13 +48,10 @@ def execute_onnx(
# check if model has an execution mode set
# if None, execute model node using the QONNX-provided execute_onnx impl
- # if set to "remote_pynq" execute model on PYNQ board
# if set to "rtlsim" execute model using pyverilator
model_exec_mode = model.get_metadata_prop("exec_mode")
if (model_exec_mode is None) or (model_exec_mode == ""):
- return execute_onnx_base(
- model, input_dict, return_full_exec_context, start_node, end_node
- )
+ return execute_onnx_base(model, input_dict, return_full_exec_context, start_node, end_node)
if not model.check_all_tensor_shapes_specified():
raise Exception("Found unspecified tensor shapes, try infer_shapes")
@@ -91,22 +85,17 @@ def execute_onnx(
# check if model has an execution mode set
# if None, execute model node by node using execute_node()
- # if set to "remote_pynq" execute model on PYNQ board
# if set to "rtlsim" execute model using pyverilator
model_exec_mode = model.get_metadata_prop("exec_mode")
if (model_exec_mode is None) or (model_exec_mode == ""):
return execute_onnx_base()
- elif model_exec_mode == "remote_pynq":
- # use remote exec metadata built into model to execute on a remote PYNQ
- remote_exec(model, execution_context)
elif model_exec_mode == "rtlsim":
# use stitched IP for rtlsim
rtlsim_exec(model, execution_context)
else:
raise Exception(
- """Metadata property "exec_mode" is set to an unknown value.
- Can be left unset or has to be set to "remote_pynq" for remote execution
- on PYNQ board or "rtlsim" for execution using pyverilator!"""
+ """Metadata property "exec_mode" is set to an unknown value. Can be left
+ unset or has to be set to "rtlsim" for execution using pyverilator!"""
)
if return_full_exec_context:
diff --git a/src/finn/core/remote_exec.py b/src/finn/core/remote_exec.py
deleted file mode 100644
index f487b48f86..0000000000
--- a/src/finn/core/remote_exec.py
+++ /dev/null
@@ -1,119 +0,0 @@
-# Copyright (c) 2020 Xilinx, Inc.
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# * Neither the name of Xilinx nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import numpy as np
-import os
-import subprocess
-import warnings
-
-
-def remote_exec(model, execution_context):
- """Executes the given model remotely on the pynq board. The metadata properties
- related to the pynq board have to be set. The execution context contains the
- input values."""
- # TODO fix for multi input-output
- pynq_ip = model.get_metadata_prop("pynq_ip")
- pynq_port = int(model.get_metadata_prop("pynq_port"))
- pynq_username = model.get_metadata_prop("pynq_username")
- pynq_password = model.get_metadata_prop("pynq_password")
- pynq_target_dir = model.get_metadata_prop("pynq_target_dir")
- deployment_dir = model.get_metadata_prop("pynq_deploy_dir")
- platform = model.get_metadata_prop("platform")
- assert platform in ["alveo", "zynq-iodma"]
- bitfile = model.get_metadata_prop("bitfile")
- bitfile = os.path.basename(bitfile)
- if pynq_password == "":
- if "zynq" in platform:
- raise Exception("PYNQ board remote exec needs password for sudo")
- else:
- local_prefix = "" # assume we are using an ssh key
- warnings.warn("Empty password, make sure you've set up an ssh key")
- else:
- local_prefix = "sshpass -p %s " % pynq_password
-
- if platform == "alveo":
- # Alveo can run without sudo
- remote_prefix = ""
- elif "zynq" in platform:
- # PYNQ Zynq boards need to execute with sudo
- remote_prefix = "echo %s | sudo -S " % pynq_password
-
- inp = execution_context[model.graph.input[0].name]
- # make copy of array before saving it
- inp = inp.copy()
- batchsize = inp.shape[0]
- np.save(os.path.join(deployment_dir, "input.npy"), inp)
- # extracting last folder of absolute path (deployment_dir)
- deployment_folder = os.path.basename(os.path.normpath(deployment_dir))
- # copy input to PYNQ board
- cmd = local_prefix + "scp -P{} -r {}/input.npy {}@{}:{}/{}".format(
- pynq_port,
- deployment_dir,
- pynq_username,
- pynq_ip,
- pynq_target_dir,
- deployment_folder,
- )
- bash_command = ["/bin/bash", "-c", cmd]
- process_scp_in = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
- process_scp_in.communicate()
-
- # use platform attribute for correct remote execution
- if platform == "alveo":
- remote_cmd = "bash -ic 'bash alveo_run.sh execute %d' \"" % batchsize
- else:
- remote_cmd = (
- "python3.6 driver.py --exec_mode=execute --batchsize={} "
- "--bitfile={} --inputfile=input.npy --outputfile=output.npy "
- '--platform={} "'
- ).format(batchsize, bitfile, platform)
- cmd = (
- local_prefix + 'ssh {}@{} -p {} "cd {}/{}; ' + remote_prefix + remote_cmd
- ).format(pynq_username, pynq_ip, pynq_port, pynq_target_dir, deployment_folder)
- bash_command = ["/bin/bash", "-c", cmd]
- process_exec_accel = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
- process_exec_accel.communicate()
- # remove stale output file from local dir, if any
- try:
- os.remove("{}/output.npy".format(deployment_dir))
- except FileNotFoundError:
- pass
- # copy generated output to local
- cmd = local_prefix + "scp -P{} {}@{}:{}/{}/output.npy {}".format(
- pynq_port,
- pynq_username,
- pynq_ip,
- pynq_target_dir,
- deployment_folder,
- deployment_dir,
- )
- bash_command = ["/bin/bash", "-c", cmd]
- process_scp_out = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
- process_scp_out.communicate()
- outp = np.load("{}/output.npy".format(deployment_dir))
- execution_context[model.graph.output[0].name] = outp
diff --git a/src/finn/core/throughput_test.py b/src/finn/core/throughput_test.py
index 3533fd1339..08633be33b 100644
--- a/src/finn/core/throughput_test.py
+++ b/src/finn/core/throughput_test.py
@@ -28,90 +28,11 @@
import numpy as np
import os
-import subprocess
-import warnings
from qonnx.util.basic import gen_finn_dt_tensor
from finn.core.rtlsim_exec import rtlsim_exec
-def throughput_test_remote(model, batchsize=1000, timeout=None):
- """Runs the throughput test for the given model remotely on the pynq board.
- The metadata properties related to the pynq board have to be set.
- Additionally a timeout for the SSH communication can be set.
- Returns a dictionary with results of the throughput test. Returns None
- if the test fails."""
-
- pynq_ip = model.get_metadata_prop("pynq_ip")
- pynq_port = int(model.get_metadata_prop("pynq_port"))
- pynq_username = model.get_metadata_prop("pynq_username")
- pynq_password = model.get_metadata_prop("pynq_password")
- pynq_target_dir = model.get_metadata_prop("pynq_target_dir")
- deployment_dir = model.get_metadata_prop("pynq_deploy_dir")
- # extracting last folder of absolute path (deployment_dir)
- deployment_folder = os.path.basename(os.path.normpath(deployment_dir))
- platform = model.get_metadata_prop("platform")
- assert platform in ["alveo", "zynq-iodma"]
- bitfile = model.get_metadata_prop("bitfile")
- bitfile = os.path.basename(bitfile)
- if pynq_password == "":
- if "zynq" in platform:
- raise Exception("PYNQ board remote exec needs password for sudo")
- else:
- local_prefix = "" # assume we are using an ssh key
- warnings.warn("Empty password, make sure you've set up an ssh key")
- else:
- local_prefix = "sshpass -p %s " % pynq_password
-
- if platform == "alveo":
- # Alveo can run without sudo but needs correct environment
- remote_prefix = "conda activate finn-pynq-alveo; "
- elif "zynq" in platform:
- # PYNQ Zynq boards need to execute with sudo
- remote_prefix = "echo %s | sudo -S " % pynq_password
-
- # use platform attribute for correct remote execution
- if platform == "alveo":
- remote_cmd = "bash -ic 'bash alveo_run.sh throughput_test %d' \"" % batchsize
- else:
- remote_cmd = (
- "python3.6 driver.py --exec_mode=throughput_test --batchsize={} "
- "--bitfile={} --inputfile=input.npy --outputfile=output.npy "
- '--platform={} "'
- ).format(batchsize, bitfile, platform)
- cmd = (
- local_prefix + 'ssh {}@{} -p {} "cd {}/{}; ' + remote_prefix + remote_cmd
- ).format(pynq_username, pynq_ip, pynq_port, pynq_target_dir, deployment_folder)
- bash_command = ["/bin/bash", "-c", cmd]
- process_throughput_test = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
- process_throughput_test.communicate(timeout=timeout)
-
- # remove any pre-existing metrics file
- try:
- os.remove("{}/nw_metrics.txt".format(deployment_dir))
- except FileNotFoundError:
- pass
-
- cmd = local_prefix + "scp -P{} {}@{}:{}/{}/nw_metrics.txt {}".format(
- pynq_port,
- pynq_username,
- pynq_ip,
- pynq_target_dir,
- deployment_folder,
- deployment_dir,
- )
- bash_command = ["/bin/bash", "-c", cmd]
- process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE)
- process_compile.communicate(timeout=timeout)
-
- try:
- with open("{}/nw_metrics.txt".format(deployment_dir), "r") as file:
- res = eval(file.read())
- return res
- except FileNotFoundError:
- return None
-
-
def throughput_test_rtlsim(model, batchsize=100):
"""Runs a throughput test for the given IP-stitched model. When combined
with tracing, useful to determine bottlenecks and required FIFO sizes."""
diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py
index 56d4230a3a..aed2ab7fe1 100644
--- a/src/finn/custom_op/fpgadataflow/__init__.py
+++ b/src/finn/custom_op/fpgadataflow/__init__.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2020, Xilinx
+# Copyright (C) 2020-2022, Xilinx, Inc.
+# Copyright (C) 2023-2024, Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -26,70 +27,57 @@
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-from finn.custom_op.fpgadataflow.addstreams_batch import AddStreams_Batch
-from finn.custom_op.fpgadataflow.channelwise_op_batch import ChannelwiseOp_Batch
-from finn.custom_op.fpgadataflow.checksum import CheckSum
+from finn.custom_op.fpgadataflow.addstreams import AddStreams
+from finn.custom_op.fpgadataflow.channelwise_op import ChannelwiseOp
from finn.custom_op.fpgadataflow.concat import StreamingConcat
from finn.custom_op.fpgadataflow.convolutioninputgenerator import (
ConvolutionInputGenerator,
)
-from finn.custom_op.fpgadataflow.convolutioninputgenerator1d import (
- ConvolutionInputGenerator1D,
-)
-from finn.custom_op.fpgadataflow.convolutioninputgenerator_rtl import (
- ConvolutionInputGenerator_rtl,
-)
from finn.custom_op.fpgadataflow.downsampler import DownSampler
-from finn.custom_op.fpgadataflow.duplicatestreams_batch import DuplicateStreams_Batch
-from finn.custom_op.fpgadataflow.eltwise import StreamingEltwise
-from finn.custom_op.fpgadataflow.fmpadding_batch import FMPadding_Batch
-from finn.custom_op.fpgadataflow.fmpadding_rtl import FMPadding_rtl
-from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch
-from finn.custom_op.fpgadataflow.iodma import IODMA
-from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch
+from finn.custom_op.fpgadataflow.duplicatestreams import DuplicateStreams
+from finn.custom_op.fpgadataflow.fmpadding import FMPadding
+from finn.custom_op.fpgadataflow.fmpadding_pixel import FMPadding_Pixel
+from finn.custom_op.fpgadataflow.globalaccpool import GlobalAccPool
+from finn.custom_op.fpgadataflow.labelselect import LabelSelect
from finn.custom_op.fpgadataflow.lookup import Lookup
-from finn.custom_op.fpgadataflow.matrixvectoractivation import MatrixVectorActivation
-from finn.custom_op.fpgadataflow.pool_batch import Pool_Batch
+from finn.custom_op.fpgadataflow.matrixvectoractivation import MVAU
+from finn.custom_op.fpgadataflow.pool import Pool
from finn.custom_op.fpgadataflow.streamingdataflowpartition import (
StreamingDataflowPartition,
)
-from finn.custom_op.fpgadataflow.streamingdatawidthconverter_batch import (
- StreamingDataWidthConverter_Batch,
+from finn.custom_op.fpgadataflow.streamingdatawidthconverter import (
+ StreamingDataWidthConverter,
)
+from finn.custom_op.fpgadataflow.streamingeltwise import StreamingEltwise
from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO
-from finn.custom_op.fpgadataflow.streamingmaxpool_batch import StreamingMaxPool_Batch
-from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch
-from finn.custom_op.fpgadataflow.tlastmarker import TLastMarker
-from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour_Batch
-from finn.custom_op.fpgadataflow.vectorvectoractivation import VectorVectorActivation
+from finn.custom_op.fpgadataflow.streamingmaxpool import StreamingMaxPool
+from finn.custom_op.fpgadataflow.thresholding import Thresholding
+from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour
+from finn.custom_op.fpgadataflow.vectorvectoractivation import VVAU
custom_op = dict()
# make sure new HLSCustomOp subclasses are imported here so that they get
# registered and plug in correctly into the infrastructure
-custom_op["DownSampler"] = DownSampler
-custom_op["StreamingMaxPool_Batch"] = StreamingMaxPool_Batch
-custom_op["MatrixVectorActivation"] = MatrixVectorActivation
-custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator
-custom_op["ConvolutionInputGenerator1D"] = ConvolutionInputGenerator1D
-custom_op["ConvolutionInputGenerator_rtl"] = ConvolutionInputGenerator_rtl
-custom_op["TLastMarker"] = TLastMarker
-custom_op["StreamingDataWidthConverter_Batch"] = StreamingDataWidthConverter_Batch
+custom_op["MVAU"] = MVAU
custom_op["StreamingFIFO"] = StreamingFIFO
-custom_op["GlobalAccPool_Batch"] = GlobalAccPool_Batch
-custom_op["Pool_Batch"] = Pool_Batch
-custom_op["FMPadding_Batch"] = FMPadding_Batch
-custom_op["Thresholding_Batch"] = Thresholding_Batch
-custom_op["AddStreams_Batch"] = AddStreams_Batch
-custom_op["LabelSelect_Batch"] = LabelSelect_Batch
-custom_op["DuplicateStreams_Batch"] = DuplicateStreams_Batch
-custom_op["VectorVectorActivation"] = VectorVectorActivation
-custom_op["ChannelwiseOp_Batch"] = ChannelwiseOp_Batch
-custom_op["IODMA"] = IODMA
+custom_op["Thresholding"] = Thresholding
+custom_op["VVAU"] = VVAU
custom_op["StreamingDataflowPartition"] = StreamingDataflowPartition
-custom_op["UpsampleNearestNeighbour_Batch"] = UpsampleNearestNeighbour_Batch
+
+custom_op["AddStreams"] = AddStreams
+custom_op["ChannelwiseOp"] = ChannelwiseOp
+custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator
+custom_op["DownSampler"] = DownSampler
+custom_op["DuplicateStreams"] = DuplicateStreams
+custom_op["FMPadding"] = FMPadding
+custom_op["FMPadding_Pixel"] = FMPadding_Pixel
+custom_op["GlobalAccPool"] = GlobalAccPool
+custom_op["LabelSelect"] = LabelSelect
custom_op["Lookup"] = Lookup
+custom_op["Pool"] = Pool
custom_op["StreamingConcat"] = StreamingConcat
-custom_op["CheckSum"] = CheckSum
+custom_op["StreamingDataWidthConverter"] = StreamingDataWidthConverter
custom_op["StreamingEltwise"] = StreamingEltwise
-custom_op["FMPadding_rtl"] = FMPadding_rtl
+custom_op["StreamingMaxPool"] = StreamingMaxPool
+custom_op["UpsampleNearestNeighbour"] = UpsampleNearestNeighbour
diff --git a/src/finn/custom_op/fpgadataflow/addstreams.py b/src/finn/custom_op/fpgadataflow/addstreams.py
new file mode 100644
index 0000000000..ac61786ac1
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/addstreams.py
@@ -0,0 +1,171 @@
+# Copyright (C) 2023, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import warnings
+from qonnx.core.datatype import DataType
+
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
+
+
+class AddStreams(HWCustomOp):
+ """Abstraction layer for HW implementation of AddStreams."""
+
+ def __init__(self, onnx_node, **kwargs):
+ super().__init__(onnx_node, **kwargs)
+
+ def get_nodeattr_types(self):
+ my_attrs = super().get_nodeattr_types()
+ my_attrs.update(
+ {
+ "NumChannels": ("i", True, ""),
+ "PE": ("i", True, ""),
+ # FINN DataTypes for inputs; output datatype inferred from input
+ "inputDataType": ("s", True, ""),
+ # number of input vectors, examples:
+ # [1] is a single vector (like a FC layer with batch=1)
+ # [4] is four vectors (like a FC layer with batch=4)
+ # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
+ "numInputVectors": ("ints", False, [1]),
+ "inFIFODepths": ("ints", False, [2, 2]),
+ }
+ )
+ return my_attrs
+
+ def get_normal_input_shape(self, ind=0):
+ ich = self.get_nodeattr("NumChannels")
+ vecs = list(self.get_nodeattr("numInputVectors"))
+ ishape = tuple(vecs + [ich])
+ return ishape
+
+ def get_folded_input_shape(self, ind=0):
+ ich = self.get_nodeattr("NumChannels")
+ pe = self.get_nodeattr("PE")
+ assert ich % pe == 0, "PE must divide NumChannels"
+ vecs = list(self.get_nodeattr("numInputVectors"))
+ ishape = tuple(vecs + [ich // pe, pe])
+ return ishape
+
+ def get_normal_output_shape(self, ind=0):
+ return self.get_normal_input_shape()
+
+ def get_folded_output_shape(self, ind=0):
+ return self.get_folded_input_shape()
+
+ def make_shape_compatible_op(self, model):
+ exp_ishape = self.get_normal_input_shape()
+ oshape = self.get_normal_output_shape()
+ ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+ assert ishape == exp_ishape, "Unexpected input1 shape."
+ ishape = tuple(model.get_tensor_shape(self.onnx_node.input[1]))
+ assert ishape == exp_ishape, "Unexpected input2 shape."
+ return super().make_const_shape_op(oshape)
+
+ def infer_node_datatype(self, model):
+ node = self.onnx_node
+ idt = model.get_tensor_datatype(node.input[0])
+ if idt != self.get_input_datatype():
+ warn_str = "inputDataType changing for %s: %s -> %s " % (
+ node.name,
+ str(self.get_input_datatype()),
+ str(idt),
+ )
+ warnings.warn(warn_str)
+ self.set_nodeattr("inputDataType", idt.name)
+ # enforce output data type (calculated based on idt)
+ odt = self.get_output_datatype()
+ model.set_tensor_datatype(self.onnx_node.output[0], odt)
+
+ def verify_node(self):
+ pass
+
+ def get_input_datatype(self, ind=0):
+ """Returns FINN DataType of input."""
+ return DataType[self.get_nodeattr("inputDataType")]
+
+ def get_output_datatype(self, ind=0):
+ """Returns FINN DataType of output."""
+ # we need to set output datatype to the next larger int or uint
+ # enhancement: consider specifying w/ explicit outputDataType attribute
+ # to allow overflow and use the same idt if user wants
+ idt = DataType[self.get_nodeattr("inputDataType")]
+ if idt.signed():
+ return DataType.get_smallest_possible(2 * idt.min())
+ else:
+ return DataType.get_smallest_possible(2 * idt.max())
+
+ def get_instream_width(self, ind=0):
+ """Returns input stream width."""
+ ibits = self.get_input_datatype().bitwidth()
+ pe = self.get_nodeattr("PE")
+ in_width = pe * ibits
+ return in_width
+
+ def get_outstream_width(self, ind=0):
+ """Returns output stream width."""
+ obits = self.get_output_datatype().bitwidth()
+ pe = self.get_nodeattr("PE")
+ out_width = pe * obits
+ return out_width
+
+ def get_number_output_values(self):
+ return np.prod(self.get_folded_output_shape()[:-1])
+
+ def get_exp_cycles(self):
+ # Channels/PE * batch size * fmdim * fmdim
+ return np.prod(self.get_folded_output_shape()[:-1])
+
+ def execute_node(self, context, graph):
+ # simulate behavior using Python
+ node = self.onnx_node
+ inp0_values = context[node.input[0]]
+ inp1_values = context[node.input[1]]
+ oshape = context[node.output[0]].shape
+ ishape0 = inp0_values.shape
+ ishape1 = inp1_values.shape
+ assert ishape0 == ishape1, "Shapes of inputs should be the same for Addstreams"
+ result = inp0_values + inp1_values
+ context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape)
+
+ def get_verilog_top_module_intf_names(self):
+ intf_names = super().get_verilog_top_module_intf_names()
+ sname = self.hls_sname()
+ swidth = self.get_instream_width_padded()
+ intf_names["s_axis"] = [(x + "_" + sname, swidth) for x in ["in0", "in1"]]
+ return intf_names
+
+ def derive_characteristic_fxns(self, period):
+ n_inps = np.prod(self.get_folded_input_shape()[:-1])
+ io_dict = {
+ "inputs": {
+ "in0": [0 for i in range(n_inps)],
+ "in1": [0 for i in range(n_inps)],
+ },
+ "outputs": {"out": []},
+ }
+ super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
diff --git a/src/finn/custom_op/fpgadataflow/channelwise_op.py b/src/finn/custom_op/fpgadataflow/channelwise_op.py
new file mode 100644
index 0000000000..9bf4ebdf62
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/channelwise_op.py
@@ -0,0 +1,234 @@
+# Copyright (C) 2023, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import onnxruntime as rt
+import warnings
+from onnx import TensorProto, helper
+from qonnx.core.datatype import DataType
+from qonnx.util.basic import qonnx_make_model
+
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
+
+# ONNX i/o tensor shape assumptions for channelwise ops:
+# input 0 is the input tensor, shape (..., NumChannels)
+# input 1 is the channelwise parameter tensor, shape (NumChannels, params_per_channel)
+# output 0 is the output tensor, shape (..., NumChannels) - same as input
+# the ... here can be any shape (representing groups of vectors)
+
+
+def get_smallest_possible(vals):
+ """Returns smallest (fewest bits) possible DataType that can represent
+ value. Prefers unsigned integers where possible."""
+ vals = np.array(vals, dtype=np.float64)
+ for v in vals:
+ assert int(v) == v, "Error float value"
+
+ for k in DataType.get_accumulator_dt_cands():
+ dt = DataType[k]
+
+ if dt in [DataType["BIPOLAR"], DataType["TERNARY"], DataType["FLOAT32"]]:
+ # not currently supported
+ continue
+
+ if (dt.min() <= vals).all() and (vals <= dt.max()).all():
+ return dt
+
+ warnings.warn(
+ """InferChannelwiseLinearLayer: Output values may not be
+ representable with supported data types.
+ Setting maximum width data type available.
+ This will lead to errors if there are no constrains on the input
+ """
+ )
+
+ if (0 <= vals).all():
+ return DataType["UINT64"]
+ else:
+ return DataType["INT64"]
+
+
+class ChannelwiseOp(HWCustomOp):
+ """Abstraction layer for HW implementation of ChannelwiseOp."""
+
+ def __init__(self, onnx_node, **kwargs):
+ super().__init__(onnx_node, **kwargs)
+
+ def get_nodeattr_types(self):
+ my_attrs = {
+ # channelwise "map" function to apply:
+ # one of cmp_le, cmp_ge, add, mul
+ "Func": ("s", False, "cmp_le", {"cmp_le", "cmp_ge", "add", "mul"}),
+ "PE": ("i", True, 0),
+ "NumChannels": ("i", True, 0),
+ # string defining memory resource type for parameters
+ "ram_style": ("s", False, "distributed", {"distributed", "block"}),
+ # FINN DataTypes for inputs, weights, outputs
+ "inputDataType": ("s", True, ""),
+ "paramDataType": ("s", True, ""),
+ "outputDataType": ("s", True, ""),
+ # number of input vectors, examples:
+ # [1] is a single vector (like a FC layer with batch=1)
+ # [4] is four vectors (like a FC layer with batch=4)
+ # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
+ "numInputVectors": ("ints", False, [1]),
+ }
+ my_attrs.update(super().get_nodeattr_types())
+ return my_attrs
+
+ def calc_tmem(self):
+ """Calculates and returns TMEM, the depth of the memory used
+ to store the channelwise op parameters."""
+ chn = self.get_nodeattr("NumChannels")
+ pe = self.get_nodeattr("PE")
+ return chn // pe
+
+ def make_shape_compatible_op(self, model):
+ oshape = self.get_normal_output_shape()
+ # implement tensor with correct shape
+ return super().make_const_shape_op(oshape)
+
+ def infer_node_datatype(self, model):
+ node = self.onnx_node
+ # check input datatype against property
+ idt = model.get_tensor_datatype(node.input[0])
+
+ exp_idt_name = self.get_nodeattr("inputDataType")
+ if exp_idt_name != idt.name:
+ func = self.get_nodeattr("Func")
+ assert func in ["add", "mul"], "Bad input DataType for ChannelwiseOp layer"
+
+ self.set_nodeattr("inputDataType", idt.name)
+ # update the func in ['add','mul'] cases
+
+ # get parameter ranges
+ param = model.get_initializer(node.input[1])
+ param_min = min(param.flatten())
+ param_max = max(param.flatten())
+
+ # set function and determine output data type
+ if func == "add":
+ out_min = idt.min() + param_min
+ out_max = idt.max() + param_max
+ odt = get_smallest_possible([out_min, out_max])
+ elif func == "mul":
+ possible_limits = []
+ possible_limits += [idt.min() * param_min]
+ possible_limits += [idt.min() * param_max]
+ possible_limits += [idt.max() * param_min]
+ possible_limits += [idt.max() * param_max]
+ odt = get_smallest_possible(possible_limits)
+
+ self.set_nodeattr("outputDataType", odt.name)
+
+ # set output datatype from property
+ odt = self.get_output_datatype()
+ model.set_tensor_datatype(node.output[0], odt)
+
+ def verify_node(self):
+ pass
+
+ def get_input_datatype(self, ind=0):
+ """Returns FINN DataType of input."""
+ return DataType[self.get_nodeattr("inputDataType")]
+
+ def get_output_datatype(self, ind=0):
+ """Returns FINN DataType of output."""
+ return DataType[self.get_nodeattr("outputDataType")]
+
+ def get_instream_width(self, ind=0):
+ i_bits = self.get_input_datatype().bitwidth()
+ return i_bits * self.get_nodeattr("PE")
+
+ def get_outstream_width(self, ind=0):
+ o_bits = self.get_output_datatype().bitwidth()
+ return o_bits * self.get_nodeattr("PE")
+
+ def get_folded_input_shape(self, ind=0):
+ ich = self.get_nodeattr("NumChannels")
+ pe = self.get_nodeattr("PE")
+ fold = ich // pe
+ vecs = list(self.get_nodeattr("numInputVectors"))
+ folded_input_shape = tuple(vecs + [fold, pe])
+ return folded_input_shape
+
+ def get_folded_output_shape(self, ind=0):
+ # same shape as input
+ return self.get_folded_input_shape()
+
+ def get_normal_input_shape(self, ind=0):
+ ich = self.get_nodeattr("NumChannels")
+ vecs = list(self.get_nodeattr("numInputVectors"))
+ normal_input_shape = tuple(vecs + [ich])
+ return normal_input_shape
+
+ def get_normal_output_shape(self, ind=0):
+ # same shape as input
+ return self.get_normal_input_shape()
+
+ def get_number_output_values(self):
+ nf = np.prod(self.get_folded_output_shape()[:-1])
+ return nf
+
+ def get_exp_cycles(self):
+ # Channels/PE * batch size * fmdim * fmdim
+ return np.prod(self.get_folded_output_shape()[:-1])
+
+ def execute_node(self, context, graph):
+ # create a standard onnx node to help calculate the result
+ # depending on Func node attribute either a Mul or an Add node
+ node = self.onnx_node
+ func = self.get_nodeattr("Func")
+ inp_values = context[node.input[0]]
+ param_values = context[node.input[1]]
+ oshape = context[node.output[0]].shape
+ ishape = inp_values.shape
+ pshape = param_values.shape
+ inp = helper.make_tensor_value_info(node.input[0], TensorProto.FLOAT, ishape)
+ param = helper.make_tensor_value_info(node.input[1], TensorProto.FLOAT, pshape)
+ outp = helper.make_tensor_value_info(node.output[0], TensorProto.FLOAT, oshape)
+ node_func = helper.make_node(
+ func.capitalize(),
+ inputs=node.input,
+ outputs=[node.output[0]],
+ )
+ graph_func = helper.make_graph(
+ nodes=[node_func],
+ name="single-add-exec",
+ inputs=[inp, param],
+ outputs=[outp],
+ )
+
+ opset_version = self.onnx_opset_version
+ opset_imports = [helper.make_opsetid("", opset_version)]
+ onnx_kwargs = {"opset_imports": opset_imports}
+ model_func = qonnx_make_model(graph_func, **onnx_kwargs)
+ idict = {node.input[0]: inp_values, node.input[1]: param_values}
+ sess = rt.InferenceSession(model_func.SerializeToString())
+ result = sess.run(None, idict)
+ context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape)
diff --git a/src/finn/custom_op/fpgadataflow/concat.py b/src/finn/custom_op/fpgadataflow/concat.py
index 4437bcd198..210b6b7fdd 100644
--- a/src/finn/custom_op/fpgadataflow/concat.py
+++ b/src/finn/custom_op/fpgadataflow/concat.py
@@ -1,4 +1,5 @@
# Copyright (c) 2021, Xilinx
+# Copyright (C) 2023, Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -27,20 +28,18 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import numpy as np
-import os
from qonnx.core.datatype import DataType
from qonnx.util.basic import roundup_to_integer_multiple
-from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
-from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
-class StreamingConcat(HLSCustomOp):
- """Streaming concatenation node with dynamically generated HLS.
+class StreamingConcat(HWCustomOp):
+ """Abstraction layer for HW implementation of Concat.
Only supports concatenating along the last axis."""
- def __init__(self, onnx_node):
- super().__init__(onnx_node)
+ def __init__(self, onnx_node, **kwargs):
+ super().__init__(onnx_node, **kwargs)
def get_nodeattr_types(self):
my_attrs = {
@@ -127,238 +126,13 @@ def get_number_output_values(self):
def get_exp_cycles(self):
return np.prod(self.get_folded_output_shape()[:-1])
- def generate_params(self, model, path):
- elems_per_stream = self.get_nodeattr("ElemsPerStream")
- inp_streams = []
- commands = []
- idt = self.get_input_datatype()
- total_elems = self.get_total_elems()
- total_bw = idt.bitwidth() * total_elems
- for (i, elems) in enumerate(elems_per_stream):
- bw = idt.bitwidth() * elems
- inp_stream = "hls::stream<ap_uint<%d> > &in%d" % (bw, i)
- inp_streams.append(inp_stream)
- cmd = "in%d.read()" % i
- commands.append(cmd)
- out_stream = "hls::stream<ap_uint<%d> > &out" % (total_bw)
- inp_streams.append(out_stream)
-
- impl_hls_code = []
- impl_hls_code.append("void StreamingConcat(")
- impl_hls_code.append(",".join(inp_streams))
- impl_hls_code.append(", unsigned int numReps) {")
- impl_hls_code.append("for(unsigned int i = 0; i < numReps; i++) {")
- impl_hls_code.append("#pragma HLS PIPELINE II=1")
- impl_hls_code.append("ap_uint<%d> out_elem;" % total_bw)
- # FIXME: the order of streams for concatenation works out differently
- # for cppsim vs rtlsim, addressed via reversing the order of commands
- # for now
- impl_hls_code.append("#ifdef __SYNTHESIS__")
- impl_hls_code.append("out_elem = (" + ",".join(commands[::-1]) + ");")
- impl_hls_code.append("#else")
- impl_hls_code.append("out_elem = (" + ",".join(commands) + ");")
- impl_hls_code.append("#endif")
- impl_hls_code.append("out.write(out_elem);")
- impl_hls_code.append("}")
- impl_hls_code.append("}")
- impl_hls_code = "\n".join(impl_hls_code)
-
- impl_filename = "{}/concat_impl.hpp".format(path)
- f_impl = open(impl_filename, "w")
- f_impl.write(impl_hls_code)
- f_impl.close()
-
def execute_node(self, context, graph):
- mode = self.get_nodeattr("exec_mode")
node = self.onnx_node
- n_inps = len(self.onnx_node.input)
- ishapes = [self.get_normal_input_shape(x) for x in range(n_inps)]
- folded_ishapes = [self.get_folded_input_shape(x) for x in range(n_inps)]
- exp_oshape = self.get_normal_output_shape()
- folded_oshape = self.get_folded_output_shape()
- export_idt = self.get_input_datatype()
-
- if mode == "cppsim":
- code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
- elif mode == "rtlsim":
- code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
- else:
- raise Exception(
- """Invalid value for attribute exec_mode! Is currently set to: {}
- has to be set to one of the following value ("cppsim", "rtlsim")""".format(
- mode
- )
- )
-
- for i in range(n_inps):
- inp = context[node.input[i]]
- assert str(inp.dtype) == "float32", "Input datatype is not float32"
- assert inp.shape == ishapes[i], "Input shape mismatch for " + node.input[i]
- # reshape input into folded form
- inp = inp.reshape(folded_ishapes[i])
- # make copy before saving array
- reshaped_input = inp.copy()
- np.save(os.path.join(code_gen_dir, "input_%d.npy" % i), reshaped_input)
-
- if mode == "cppsim":
- # execute the precompiled model
- super().exec_precompiled_singlenode_model()
- # load output npy file
- super().npy_to_dynamic_output(context)
- assert (
- context[node.output[0]].shape == folded_oshape
- ), "cppsim did not produce expected folded output shape"
- context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape)
- elif mode == "rtlsim":
- sim = self.get_rtlsim()
- io_dict = {"inputs": {}, "outputs": {"out": []}}
- for i in range(n_inps):
- nbits = self.get_instream_width(i)
- rtlsim_inp = npy_to_rtlsim_input(
- "%s/input_%d.npy" % (code_gen_dir, i),
- export_idt,
- nbits,
- reverse_inner=True,
- )
- io_dict["inputs"]["in%d" % i] = rtlsim_inp
- super().reset_rtlsim(sim)
- super().toggle_clk(sim)
-
- self.rtlsim_multi_io(sim, io_dict)
- rtlsim_output = io_dict["outputs"]["out"]
- odt = self.get_output_datatype()
- target_bits = odt.bitwidth()
- packed_bits = self.get_outstream_width()
- out_npy_path = "{}/output.npy".format(code_gen_dir)
- out_shape = self.get_folded_output_shape()
- rtlsim_output_to_npy(
- rtlsim_output,
- out_npy_path,
- odt,
- out_shape,
- packed_bits,
- target_bits,
- reverse_inner=True,
- )
- # load and reshape output
- output = np.load(out_npy_path)
- output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
- context[node.output[0]] = output
- else:
- raise Exception(
- """Invalid value for attribute exec_mode! Is currently set to: {}
- has to be set to one of the following value ("cppsim", "rtlsim")""".format(
- mode
- )
- )
-
- assert (
- context[node.output[0]].shape == exp_oshape
- ), """Output shape doesn't match expected shape."""
-
- def global_includes(self):
- self.code_gen_dict["$GLOBALS$"] = ['#include "concat_impl.hpp"']
-
- def defines(self, var):
- num_reps = self.get_nodeattr("numInputVectors")
- num_reps = np.prod(num_reps)
- self.code_gen_dict["$DEFINES$"] = ["#define NumReps %d" % num_reps]
-
- def read_npy_data(self):
- n_inputs = self.get_n_inputs()
- code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
- npy_type = "float"
- self.code_gen_dict["$READNPYDATA$"] = []
- idt = self.get_input_datatype()
- idt_bw = idt.bitwidth()
- elem_hls_type = idt.get_hls_datatype_str()
- elem_bits = idt_bw
- for i in range(n_inputs):
- packed_bits = self.get_instream_width(i)
- packed_hls_type = "ap_uint<%d>" % packed_bits
- npy_in = "%s/input_%d.npy" % (code_gen_dir, i)
- self.code_gen_dict["$READNPYDATA$"].append(
- 'npy2apintstream<%s, %s, %d, %s>("%s", in%d);'
- % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in, i)
- )
-
- def strm_decl(self):
- self.code_gen_dict["$STREAMDECLARATIONS$"] = []
- n_inputs = self.get_n_inputs()
- for i in range(n_inputs):
- packed_bits = self.get_instream_width(i)
- packed_hls_type = "ap_uint<%d>" % packed_bits
- stream_name = "in%d" % i
- self.code_gen_dict["$STREAMDECLARATIONS$"].append(
- 'hls::stream<%s> %s ("%s");'
- % (packed_hls_type, stream_name, stream_name)
- )
- self.code_gen_dict["$STREAMDECLARATIONS$"].append(
- 'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
- )
-
- def docompute(self):
- self.code_gen_dict["$DOCOMPUTE$"] = []
- n_inputs = self.get_n_inputs()
- in_stream_names = ["in%d" % x for x in range(n_inputs)]
- in_stream_names = ",".join(in_stream_names)
- comp_call = "StreamingConcat(%s, out, NumReps);" % (in_stream_names)
- self.code_gen_dict["$DOCOMPUTE$"] = [comp_call]
-
- def dataoutstrm(self):
- code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
- dtype = self.get_output_datatype()
- elem_bits = dtype.bitwidth()
- packed_bits = self.get_outstream_width()
- packed_hls_type = "ap_uint<%d>" % packed_bits
- elem_hls_type = dtype.get_hls_datatype_str()
- npy_type = "float"
- npy_out = "%s/output.npy" % code_gen_dir
- oshape = self.get_folded_output_shape()
- oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
-
- self.code_gen_dict["$DATAOUTSTREAM$"] = [
- 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");'
- % (
- packed_hls_type,
- elem_hls_type,
- elem_bits,
- npy_type,
- oshape_cpp_str,
- npy_out,
- )
- ]
-
- def save_as_npy(self):
- self.code_gen_dict["$SAVEASCNPY$"] = []
-
- def blackboxfunction(self):
- n_inputs = self.get_n_inputs()
- in_streams = []
- for i in range(n_inputs):
- iwidth = self.get_instream_width(i)
- in_streams.append("hls::stream<ap_uint<%d>> &in%d" % (iwidth, i))
- in_streams = ",".join(in_streams)
- total_width = self.get_input_datatype().bitwidth() * self.get_total_elems()
- out_stream = "hls::stream<ap_uint<%d>> &out" % (total_width)
- blackbox_hls = "void %s(%s, %s)" % (self.onnx_node.name, in_streams, out_stream)
- self.code_gen_dict["$BLACKBOXFUNCTION$"] = [blackbox_hls]
-
- def pragmas(self):
- n_inputs = self.get_n_inputs()
- pragmas = []
- for i in range(n_inputs):
- pragmas.append(
- "#pragma HLS INTERFACE axis port=in%d name=in%d_%s"
- % (i, i, self.hls_sname())
- )
- self.code_gen_dict["$PRAGMAS$"] = pragmas
- self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname()
- )
- self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE ap_ctrl_none port=return"
- )
+ inp_values = []
+ for inp in node.input:
+ inp_values.append(context[inp])
+ result = np.concatenate(inp_values, axis=-1)
+ context[node.output[0]] = result
def get_instream_width_padded(self, ind=0):
in_width = self.get_instream_width(ind)
@@ -370,7 +144,5 @@ def get_verilog_top_module_intf_names(self):
sname = self.hls_sname()
intf_names["s_axis"] = []
for i in range(n_inputs):
- intf_names["s_axis"].append(
- ("in%d_%s" % (i, sname), self.get_instream_width_padded(i))
- )
+ intf_names["s_axis"].append(("in%d_%s" % (i, sname), self.get_instream_width_padded(i)))
return intf_names
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
index 1566445999..96f49069c7 100644
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
+++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, Xilinx
+# Copyright (C) 2023, Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -26,36 +26,27 @@
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-import math
import numpy as np
-import os
+from onnx import TensorProto, helper
from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
from qonnx.custom_op.general.im2col import compute_conv_output_dim
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.util.basic import qonnx_make_model
-from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
-from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
# ONNX i/o tensor shape assumptions for ConvolutionInputGenerator:
# input 0 is the input tensor, shape NHWC = (1, IFMDim, IFMDim, IFMChannels)
# output 0 is the output tensor, shape NHWC:
# = (1, OFMDim, OFMDim, (ConvKernelDim^2)*IFMChannels)
-# note: the actual data layout produced by the hlslib kernels is different
-# for depthwise and non-depthwise ops.
-# * non-depthwise SWG: (1, OFMDim, OFMDim, K, K, IFMChannels/SIMD, SIMD)
-# * depthwise SWG: (1, OFMDim, OFMDim, IFMChannels/SIMD, K, K, SIMD)
-# see test_fpgadataflow_slidingwindow.py for an example of how to transform
-# between the two layouts
+class ConvolutionInputGenerator(HWCustomOp):
+ """Abstraction layer for HW implementation of ConvolutionInputGenerator"""
-class ConvolutionInputGenerator(HLSCustomOp):
- """Class that corresponds to one of the finn-hlslib ConvolutionInputGenerator
- (sliding window) function variants. Depending on the combination of
- attributes (e.g. depthwise or not, whether k % stride is 0) a different
- variant will be picked for the actual HLS implementation."""
-
- def __init__(self, onnx_node):
- super().__init__(onnx_node)
+ def __init__(self, onnx_node, **kwargs):
+ super().__init__(onnx_node, **kwargs)
def get_nodeattr_types(self):
my_attrs = {
@@ -82,23 +73,16 @@ def get_nodeattr_types(self):
"distributed",
{"auto", "block", "distributed", "ultra"},
),
+ "parallel_window": ("i", False, 0, {0, 1}),
+ # 1D (True) or 2D (False) spatial data
+ "is1D": ("i", False, 0),
+ # Enable reprogrammable implementation to change FM dimensions,
+ # stride, or dilation during runtime (requires parallel_window = 0)
+ "dynamic_mode": ("i", False, 0, {0, 1}),
}
my_attrs.update(super().get_nodeattr_types())
return my_attrs
- def get_nodeattr(self, name):
- # overriding get_nodeattr to check for square kernel/img.. requirement
- # since this can't be done with the attribute restriction in nodeattr_types
- # TODO non-square can be enabled in theory but needs testing
- ret = super().get_nodeattr(name)
- props_to_check = ["ConvKernelDim", "IFMDim", "OFMDim", "Stride", "Dilation"]
- if name in props_to_check:
- is_square = ret[0] == ret[1]
- assert is_square, "Only square %s supported" % name
- if name == "Dilation":
- assert ret[0] == ret[1] == 1, "Only dilation=1 supported"
- return ret
-
def get_normal_input_shape(self, ind=0):
ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
ifm_ch = self.get_nodeattr("IFMChannels")
@@ -137,8 +121,12 @@ def get_folded_output_shape(self, ind=0):
ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h)
ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w)
assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
- wf = int((k_h * k_w * ifm_ch) // simd)
- folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd)
+ if self.use_parallel_window_output():
+ wf = int((ifm_ch) // simd)
+ folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, k_h * k_w * simd)
+ else:
+ wf = int((k_h * k_w * ifm_ch) // simd)
+ folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd)
return folded_oshape
def make_shape_compatible_op(self, model):
@@ -177,322 +165,93 @@ def get_instream_width(self, ind=0):
return in_width
def get_outstream_width(self, ind=0):
- """Returns stream width, input and output stream width are equal for
- the sliding window function, so the function to determine the input
- stream width can be reused."""
- return self.get_instream_width()
+ if self.use_parallel_window_output():
+ # feed all window pixels in parallel
+ k_h, k_w = self.get_nodeattr("ConvKernelDim")
+ return self.get_instream_width() * k_h * k_w
+ else:
+ # if parallel variant not in use: same width for output and input stream
+ return self.get_instream_width()
def get_number_output_values(self):
folded_oshape = self.get_folded_output_shape()
num_output_elems = np.prod(folded_oshape[:-1])
return num_output_elems
- def get_exp_cycles(self):
- simd = self.get_nodeattr("SIMD")
+ def get_1d_conv_attrs_normalized(self):
+ # support both (1, D) and (D, 1) cases transparently:
+ # For the kernel, presenting the input data of size D as
+ # [H, W] = [Y, X] = [1, D] or [D, 1]
+ # effectively gives the same result.
+ # For consistency and ease of programming, this function
+ # returns the attributes of the layer as follows:
+ # [H, W] = [Y, X] = [1, D] or [D, 1] are always mapped to [1, D].
+ # The dummy ('1') dimension is the Y-dimension.
ifm_ch = self.get_nodeattr("IFMChannels")
- k_h, k_w = self.get_nodeattr("ConvKernelDim")
- ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
- ofm_dim_h, ofm_dim_w = self.get_nodeattr("OFMDim")
- stride_h, stride_w = self.get_nodeattr("Stride")
- dilation_h, dilation_w = self.get_nodeattr("Dilation")
-
- # since mmv != 1 is not supported yet, we set mmv for now to 1
- mmv = 1
- # see https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h
- cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv
- cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd)
- max_cycles = max(cycles_write_block, cycles_read_block)
- exp_cycles = (
- ifm_dim_w * k_h * dilation_h * (ifm_ch / simd) + ofm_dim_h * max_cycles
- )
+ k = self.get_nodeattr("ConvKernelDim")
+ ifm_dim = self.get_nodeattr("IFMDim")
+ ofm_dim = self.get_nodeattr("OFMDim")
+ stride = self.get_nodeattr("Stride")
+ dilation = self.get_nodeattr("Dilation")
+
+ # see defines() for an explanation
+ if ifm_dim[1] == 1:
+ ifm_dim = ifm_dim[::-1]
+ ofm_dim = ofm_dim[::-1]
+ k = k[::-1]
+ stride = stride[::-1]
+ dilation = dilation[::-1]
+
+ return (ifm_ch, ifm_dim, ofm_dim, k, stride, dilation)
- return int(exp_cycles)
+ def get_exp_cycles(self):
+ return 0
def bram_estimation(self):
- # NOTE: only tested with a square convolution
- simd = self.get_nodeattr("SIMD")
- ifm_ch = self.get_nodeattr("IFMChannels")
- ifm_dim = self.get_nodeattr("IFMDim")[0]
- k = self.get_nodeattr("ConvKernelDim")[0]
- stride = self.get_nodeattr("Stride")[0]
- ram_style = self.get_nodeattr("ram_style")
- if ram_style == "block" or ram_style == "auto":
- ram_depth = ifm_dim * ifm_ch / simd
- if ram_depth <= 512:
- ram_width = 36
- elif ram_depth <= 1024:
- ram_width = 18
- elif ram_depth <= 2048:
- ram_width = 9
- elif ram_depth <= 4096:
- ram_width = 4
- elif ram_depth <= 8192:
- ram_width = 2
- else:
- ram_width = 1
- return int(
- (k + stride)
- * (
- math.ceil(simd * self.get_input_datatype().bitwidth() / ram_width)
- * math.ceil(ifm_dim * ifm_ch / simd / ram_depth)
- )
- )
- else:
- return 0
+ return 0
def lut_estimation(self):
- # NOTE: only tested with a square convolution
- simd = self.get_nodeattr("SIMD")
- ifm_ch = self.get_nodeattr("IFMChannels")
- ifm_dim = self.get_nodeattr("IFMDim")[0]
- k = self.get_nodeattr("ConvKernelDim")[0]
- stride = self.get_nodeattr("Stride")[0]
- ram_style = self.get_nodeattr("ram_style")
- if ram_style == "distributed":
- ram_luts = int(
- (k + stride)
- * (
- simd
- * self.get_input_datatype().bitwidth()
- * math.ceil(ifm_dim * ifm_ch / simd / 64)
- )
- )
- else:
- ram_luts = 0
- return 300 + ram_luts
+ return 0
def uram_estimation(self):
- # NOTE: only tested with a square convolution
- simd = self.get_nodeattr("SIMD")
- ifm_ch = self.get_nodeattr("IFMChannels")
- ifm_dim = self.get_nodeattr("IFMDim")[0]
- k = self.get_nodeattr("ConvKernelDim")[0]
- stride = self.get_nodeattr("Stride")[0]
- ram_style = self.get_nodeattr("ram_style")
- if ram_style == "ultra":
- return int(
- (k + stride)
- * (
- math.ceil(simd * self.get_input_datatype().bitwidth() / 64)
- * math.ceil(ifm_dim * ifm_ch / simd / 4096)
- )
- )
- else:
- return 0
+ return 0
def execute_node(self, context, graph):
- mode = self.get_nodeattr("exec_mode")
+ # using Im2Col node to calculate output
node = self.onnx_node
- exp_ishape = self.get_normal_input_shape()
- exp_oshape = self.get_normal_output_shape()
- folded_ishape = self.get_folded_input_shape()
-
- # TODO ensure codegen dir exists
- if mode == "cppsim":
- code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
- elif mode == "rtlsim":
- code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
- else:
- raise Exception(
- """Invalid value for attribute exec_mode! Is currently set to: {}
- has to be set to one of the following value ("cppsim", "rtlsim")""".format(
- mode
- )
- )
-
- inp = context[node.input[0]]
- assert str(inp.dtype) == "float32", "Input datatype is not float32"
- assert (
- inp.shape == exp_ishape
- ), """Input shape doesn't
- match expected shape (1, ifm_dim_h, ifm_dim_w, ifm_ch)."""
- if self.get_input_datatype() == DataType["BIPOLAR"]:
- # store bipolar activations as binary
- inp = (inp + 1) / 2
- export_idt = DataType["BINARY"]
- else:
- export_idt = self.get_input_datatype()
- # reshape input into folded form
- inp = inp.reshape(folded_ishape)
- # make copy before saving array
- reshaped_input = inp.copy()
- np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
-
- if mode == "cppsim":
- # execute the precompiled model
- super().exec_precompiled_singlenode_model()
- # load output npy file
- super().npy_to_dynamic_output(context)
- assert (
- context[node.output[0]].shape == exp_oshape
- ), "cppsim \
- did not produce expected output shape"
- elif mode == "rtlsim":
- sim = self.get_rtlsim()
- nbits = self.get_instream_width()
- rtlsim_inp = npy_to_rtlsim_input(
- "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
- )
- super().reset_rtlsim(sim)
- super().toggle_clk(sim)
- rtlsim_output = self.rtlsim(sim, rtlsim_inp)
- odt = export_idt
- target_bits = odt.bitwidth()
- packed_bits = self.get_outstream_width()
- out_npy_path = "{}/output.npy".format(code_gen_dir)
- out_shape = self.get_folded_output_shape()
- rtlsim_output_to_npy(
- rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
- )
- # load and reshape output
- output = np.load(out_npy_path)
- output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
- context[node.output[0]] = output
- else:
- raise Exception(
- """Invalid value for attribute exec_mode! Is currently set to: {}
- has to be set to one of the following value ("cppsim", "rtlsim")""".format(
- mode
- )
- )
- # binary -> bipolar if needed
- if self.get_output_datatype() == DataType["BIPOLAR"]:
- out = context[node.output[0]]
- out = 2 * out - 1
- context[node.output[0]] = out
- assert (
- context[node.output[0]].shape == exp_oshape
- ), """Output
- shape doesn't match expected shape (1, ofm_dim_h, ofm_dim_w, k_h*k_w*ifm_ch)."""
-
- def global_includes(self):
- self.code_gen_dict["$GLOBALS$"] = ['#include "slidingwindow.h"']
-
- def defines(self, var):
- numReps = 1
- ifm_dim = self.get_nodeattr("IFMDim")[0]
+ ifm_dim = self.get_nodeattr("IFMDim")
+ k = self.get_nodeattr("ConvKernelDim")
+ s = self.get_nodeattr("Stride")
+ d = self.get_nodeattr("Dilation")
ifm_ch = self.get_nodeattr("IFMChannels")
- ofm_dim = self.get_nodeattr("OFMDim")[0]
- k = self.get_nodeattr("ConvKernelDim")[0]
- stride = self.get_nodeattr("Stride")[0]
- simd = self.get_nodeattr("SIMD")
- ifm_precision = self.get_input_datatype().bitwidth()
-
- self.code_gen_dict["$DEFINES$"] = [
- """#define ConvKernelDim1 {}\n #define IFMChannels1 {}\n
- #define Input_precision1 {}\n #define IFMDim1 {}\n
- #define OFMDim1 {}\n #define SIMD1 {}\n
- #define Stride1 {}\n #define numReps {}""".format(
- k, ifm_ch, ifm_precision, ifm_dim, ofm_dim, simd, stride, numReps
- )
- ]
-
- def read_npy_data(self):
- code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
- dtype = self.get_input_datatype()
- if dtype == DataType["BIPOLAR"]:
- # use binary for bipolar storage
- dtype = DataType["BINARY"]
- elem_bits = dtype.bitwidth()
- packed_bits = self.get_instream_width()
- packed_hls_type = "ap_uint<%d>" % packed_bits
- elem_hls_type = dtype.get_hls_datatype_str()
- npy_type = "float"
- npy_in = "%s/input_0.npy" % code_gen_dir
- self.code_gen_dict["$READNPYDATA$"] = []
- self.code_gen_dict["$READNPYDATA$"].append(
- 'npy2apintstream<%s, %s, %d, %s>("%s", in0);'
- % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+ inp_values = context[node.input[0]]
+ oshape = context[node.output[0]].shape
+ ishape = inp_values.shape
+ inp = helper.make_tensor_value_info(node.input[0], TensorProto.FLOAT, ishape)
+ outp = helper.make_tensor_value_info(node.output[0], TensorProto.FLOAT, oshape)
+ im2col_node = helper.make_node(
+ "Im2Col",
+ [node.input[0]],
+ [node.output[0]],
+ domain="qonnx.custom_op.general",
+ stride=[s[0], s[1]],
+ kernel_size=[k[0], k[1]],
+ dilations=[d[0], d[1]],
+ input_shape="(1,{},{},{})".format(ifm_dim[0], ifm_dim[1], ifm_ch),
)
-
- def strm_decl(self):
- self.code_gen_dict["$STREAMDECLARATIONS$"] = []
- self.code_gen_dict["$STREAMDECLARATIONS$"].append(
- 'hls::stream> in0 ("in0");'.format(self.get_instream_width())
- )
- self.code_gen_dict["$STREAMDECLARATIONS$"].append(
- 'hls::stream> out ("out");'.format(self.get_outstream_width())
+ graph_im2col = helper.make_graph(
+ nodes=[im2col_node],
+ name="single-im2col-exec",
+ inputs=[inp],
+ outputs=[outp],
)
- def docompute(self):
- node = self.onnx_node
- ram_style = self.get_nodeattr("ram_style")
- map_to_hls_ram_style = {
- "auto": "ap_resource_dflt()",
- "block": "ap_resource_bram()",
- "distributed": "ap_resource_lutram()",
- "ultra": "ap_resource_uram()",
- }
- hls_ram_style = map_to_hls_ram_style[ram_style]
- hls_call = node.op_type
-
- # check which ConvolutionInputGenerator is needed
- k = self.get_nodeattr("ConvKernelDim")[0]
- stride = self.get_nodeattr("Stride")[0]
-
- if k % stride != 0:
- hls_call += "_kernel_stride"
-
- if self.get_nodeattr("depthwise") == 1:
- self.code_gen_dict["$DOCOMPUTE$"] = [
- """{}_dws (in0, out, numReps, {});""".format(
- hls_call, hls_ram_style
- )
- ]
- else:
- self.code_gen_dict["$DOCOMPUTE$"] = [
- """{} (in0, out, numReps, {});""".format(
- hls_call, hls_ram_style
- )
- ]
-
- def dataoutstrm(self):
- code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
- dtype = self.get_output_datatype()
- if dtype == DataType["BIPOLAR"]:
- # use binary for bipolar storage
- dtype = DataType["BINARY"]
- elem_bits = dtype.bitwidth()
- packed_bits = self.get_outstream_width()
- packed_hls_type = "ap_uint<%d>" % packed_bits
- elem_hls_type = dtype.get_hls_datatype_str()
- npy_type = "float"
- npy_out = "%s/output.npy" % code_gen_dir
- oshape = self.get_folded_output_shape()
- oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
-
- self.code_gen_dict["$DATAOUTSTREAM$"] = [
- 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");'
- % (
- packed_hls_type,
- elem_hls_type,
- elem_bits,
- npy_type,
- oshape_cpp_str,
- npy_out,
- )
- ]
-
- def save_as_npy(self):
- self.code_gen_dict["$SAVEASCNPY$"] = []
-
- def blackboxfunction(self):
- self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
- """void {}(hls::stream> &in0,
- hls::stream> &out)""".format(
- self.onnx_node.name
- )
- ]
-
- def pragmas(self):
- self.code_gen_dict["$PRAGMAS$"] = [
- "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname()
- ]
- self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname()
- )
- self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE ap_ctrl_none port=return"
- )
+ opset_version = self.onnx_opset_version
+ opset_imports = [helper.make_opsetid("", opset_version)]
+ onnx_kwargs = {"opset_imports": opset_imports}
+ model_im2col = ModelWrapper(qonnx_make_model(graph_im2col, **onnx_kwargs))
+ model_im2col.set_tensor_datatype(node.input[0], self.get_input_datatype())
+ # use execution function from Im2Col node
+ # this automatically updates the execution context
+ inst = getCustomOp(im2col_node)
+ inst.execute_node(context, model_im2col.graph)
diff --git a/src/finn/custom_op/fpgadataflow/downsampler.py b/src/finn/custom_op/fpgadataflow/downsampler.py
index b7efaff440..4f919d1b50 100644
--- a/src/finn/custom_op/fpgadataflow/downsampler.py
+++ b/src/finn/custom_op/fpgadataflow/downsampler.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, Xilinx
+# Copyright (C) 2023, Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -27,20 +27,22 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
import numpy as np
-import os
import warnings
+from onnx import TensorProto, helper
from qonnx.core.datatype import DataType
+from qonnx.core.modelwrapper import ModelWrapper
+from qonnx.custom_op.registry import getCustomOp
+from qonnx.util.basic import qonnx_make_model
-from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
-from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
-class DownSampler(HLSCustomOp):
- """Corresponds to finn-hlslib ConvolutionInputGenerator_*_kernel1 function.
+class DownSampler(HWCustomOp):
+ """Abstraction layer for HW implementation of DownSampling
Basically performs a down sampling of the image removing rows and columns."""
- def __init__(self, onnx_node):
- super().__init__(onnx_node)
+ def __init__(self, onnx_node, **kwargs):
+ super().__init__(onnx_node, **kwargs)
def get_nodeattr_types(self):
my_attrs = {
@@ -174,180 +176,54 @@ def get_number_output_values(self):
folded_oshape = self.get_folded_output_shape()
return np.prod(folded_oshape[:-1])
- def global_includes(self):
- self.code_gen_dict["$GLOBALS$"] = ['#include "slidingwindow.h"']
-
- def defines(self, var):
- self.code_gen_dict["$DEFINES$"] = []
-
- ifm_ch = self.get_nodeattr("NumChannels")
- self.code_gen_dict["$DEFINES$"] += ["#define IFMChannels {}".format(ifm_ch)]
-
- ibits = self.get_input_datatype().bitwidth()
- self.code_gen_dict["$DEFINES$"] += ["#define Input_precision {}".format(ibits)]
-
- idim = self.get_nodeattr("ImgDim")
- self.code_gen_dict["$DEFINES$"] += ["#define IFMDim {}".format(idim)]
-
- simd = self.get_nodeattr("SIMD")
- self.code_gen_dict["$DEFINES$"] += ["#define SIMD {}".format(simd)]
-
- stride = self.get_nodeattr("Stride")
- self.code_gen_dict["$DEFINES$"] += ["#define Stride {}".format(stride)]
-
- batch_size = self.get_nodeattr("numInputVectors")
- self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(batch_size)]
-
- def read_npy_data(self):
- code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
- dtype = self.get_input_datatype()
- if dtype == DataType["BIPOLAR"]:
- # use binary for bipolar storage
- dtype = DataType["BINARY"]
- elem_bits = dtype.bitwidth()
- packed_bits = self.get_instream_width()
- packed_hls_type = "ap_uint<%d>" % packed_bits
- elem_hls_type = dtype.get_hls_datatype_str()
- npy_type = "float"
- npy_in = "%s/input_0.npy" % code_gen_dir
- self.code_gen_dict["$READNPYDATA$"] = []
- self.code_gen_dict["$READNPYDATA$"].append(
- 'npy2apintstream<%s, %s, %d, %s>("%s", in0);'
- % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
- )
-
- def strm_decl(self):
- self.code_gen_dict["$STREAMDECLARATIONS$"] = []
- self.code_gen_dict["$STREAMDECLARATIONS$"].append(
- 'hls::stream> in0 ("in0");'.format(self.get_instream_width())
- )
- self.code_gen_dict["$STREAMDECLARATIONS$"].append(
- 'hls::stream> out ("out");'.format(self.get_outstream_width())
- )
-
- def docompute(self):
- dim_var = "1D" if (self.get_nodeattr("is1D") == 1) else "2D"
- self.code_gen_dict["$DOCOMPUTE$"] = [
- f"""ConvolutionInputGenerator_{dim_var}_kernel1 (in0, out, numReps);"""
- ]
-
- def dataoutstrm(self):
- code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
- dtype = self.get_output_datatype()
- if dtype == DataType["BIPOLAR"]:
- # use binary for bipolar storage
- dtype = DataType["BINARY"]
- elem_bits = dtype.bitwidth()
- packed_bits = self.get_outstream_width()
- packed_hls_type = "ap_uint<%d>" % packed_bits
- elem_hls_type = dtype.get_hls_datatype_str()
- npy_type = "float"
- npy_out = "%s/output.npy" % code_gen_dir
- oshape = self.get_folded_output_shape()
- oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
-
- self.code_gen_dict["$DATAOUTSTREAM$"] = [
- 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");'
- % (
- packed_hls_type,
- elem_hls_type,
- elem_bits,
- npy_type,
- oshape_cpp_str,
- npy_out,
- )
- ]
-
- def save_as_npy(self):
- self.code_gen_dict["$SAVEASCNPY$"] = []
-
- def blackboxfunction(self):
- packed_bits = self.get_instream_width()
- packed_hls_type = "ap_uint<%d>" % packed_bits
- self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
- "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)"
- % (self.onnx_node.name, packed_hls_type, packed_hls_type)
- ]
-
- def pragmas(self):
- self.code_gen_dict["$PRAGMAS$"] = [
- "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname()
- ]
- self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname()
- )
- self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE ap_ctrl_none port=return"
- )
-
def execute_node(self, context, graph):
- mode = self.get_nodeattr("exec_mode")
+ # using Im2Col node to calculate output
node = self.onnx_node
- exp_ishape = self.get_normal_input_shape()
- exp_oshape = self.get_normal_output_shape()
- folded_ishape = self.get_folded_input_shape()
-
- if mode == "cppsim":
- code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
- elif mode == "rtlsim":
- code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+ ifm_dim = self.get_nodeattr("ImgDim")
+ stride = self.get_nodeattr("Stride")
+ ifm_ch = self.get_nodeattr("NumChannels")
+ # check if 1D or 2D case
+ if self.get_nodeattr("is1D"):
+ if self.get_nodeattr("is1D_unitx"):
+ ifm_dim_w = 1
+ sw = 1
+ ifm_dim_h = ifm_dim
+ sh = stride
+ else:
+ ifm_dim_h = 1
+ sh = 1
+ ifm_dim_w = ifm_dim
+ sw = stride
else:
- raise Exception(
- """Invalid value for attribute exec_mode! Is currently set to: {}
- has to be set to one of the following value ("cppsim", "rtlsim")""".format(
- mode
- )
- )
-
- inp = context[node.input[0]]
- assert str(inp.dtype) == "float32", "Input datatype is not float32"
- assert (
- inp.shape == exp_ishape
- ), """Input shape doesn't
- match expected shape (numInputVectors, ImgDim, ImgDim, NumChannels)."""
- export_idt = self.get_input_datatype()
-
- reshaped_input = inp.reshape(folded_ishape)
- np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
+ ifm_dim_h = ifm_dim_w = ifm_dim
+ sh = sw = stride
+ inp_values = context[node.input[0]]
+ oshape = context[node.output[0]].shape
+ ishape = inp_values.shape
+ inp = helper.make_tensor_value_info(node.input[0], TensorProto.FLOAT, ishape)
+ outp = helper.make_tensor_value_info(node.output[0], TensorProto.FLOAT, oshape)
+ im2col_node = helper.make_node(
+ "Im2Col",
+ [node.input[0]],
+ [node.output[0]],
+ domain="qonnx.custom_op.general",
+ stride=[sh, sw],
+ kernel_size=[1, 1],
+ input_shape="(1,{},{},{})".format(ifm_dim_h, ifm_dim_w, ifm_ch),
+ )
+ graph_im2col = helper.make_graph(
+ nodes=[im2col_node],
+ name="single-im2col-exec",
+ inputs=[inp],
+ outputs=[outp],
+ )
- if mode == "cppsim":
- # execute the precompiled model
- super().exec_precompiled_singlenode_model()
- # load output npy file
- super().npy_to_dynamic_output(context)
- assert (
- context[node.output[0]].shape == exp_oshape
- ), "cppsim did not produce expected output shape"
- elif mode == "rtlsim":
- sim = self.get_rtlsim()
- nbits = self.get_instream_width()
- rtlsim_inp = npy_to_rtlsim_input(
- "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
- )
- super().reset_rtlsim(sim)
- super().toggle_clk(sim)
- rtlsim_output = self.rtlsim(sim, rtlsim_inp)
- odt = export_idt
- target_bits = odt.bitwidth()
- packed_bits = self.get_outstream_width()
- out_npy_path = "{}/output.npy".format(code_gen_dir)
- out_shape = self.get_folded_output_shape()
- rtlsim_output_to_npy(
- rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
- )
- # load and reshape output
- output = np.load(out_npy_path)
- output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
- context[node.output[0]] = output
- else:
- raise Exception(
- """Invalid value for attribute exec_mode! Is currently set to: {}
- has to be set to one of the following value ("cppsim", "rtlsim")""".format(
- mode
- )
- )
- assert (
- context[node.output[0]].shape == exp_oshape
- ), """Output shape doesn't match expected shape
- (1, OutputDim, OutputDim, NumChannels)."""
+ opset_version = self.onnx_opset_version
+ opset_imports = [helper.make_opsetid("", opset_version)]
+ onnx_kwargs = {"opset_imports": opset_imports}
+ model_im2col = ModelWrapper(qonnx_make_model(graph_im2col, **onnx_kwargs))
+ model_im2col.set_tensor_datatype(node.input[0], self.get_input_datatype())
+ # use execution function from Im2Col node
+ # this automatically updates the execution context
+ inst = getCustomOp(im2col_node)
+ inst.execute_node(context, model_im2col.graph)
diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams.py b/src/finn/custom_op/fpgadataflow/duplicatestreams.py
new file mode 100644
index 0000000000..8943ffc9e3
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/duplicatestreams.py
@@ -0,0 +1,177 @@
+# Copyright (C) 2023, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import warnings
+from qonnx.core.datatype import DataType
+
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
+
+
+class DuplicateStreams(HWCustomOp):
+ """Abstraction layer for HW implementation of DuplicateStreams"""
+
+ def __init__(self, onnx_node, **kwargs):
+ super().__init__(onnx_node, **kwargs)
+
+ def get_nodeattr_types(self):
+ my_attrs = {
+ "NumChannels": ("i", True, 0),
+ "PE": ("i", True, 0),
+ # how many duplicated output streams to create
+ "NumOutputStreams": ("i", True, 0),
+ # FINN DataTypes for input
+ "inputDataType": ("s", True, ""),
+ # number of input vectors, examples:
+ # [1] is a single vector (like a FC layer with batch=1)
+ # [4] is four vectors (like a FC layer with batch=4)
+ # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
+ "numInputVectors": ("ints", False, [1]),
+ }
+ my_attrs.update(super().get_nodeattr_types())
+ return my_attrs
+
+ def get_num_output_streams(self):
+ return self.get_nodeattr("NumOutputStreams")
+
+ def get_normal_input_shape(self, ind=0):
+ ch = self.get_nodeattr("NumChannels")
+ vecs = list(self.get_nodeattr("numInputVectors"))
+ ishape = tuple(vecs + [ch])
+ return ishape
+
+ def get_folded_input_shape(self, ind=0):
+ ch = self.get_nodeattr("NumChannels")
+ pe = self.get_nodeattr("PE")
+ vecs = list(self.get_nodeattr("numInputVectors"))
+ assert ch % pe == 0, "PE must divide NumChannels"
+ folds = int(ch / pe)
+ folded_ishape = tuple(vecs + [folds, pe])
+ return folded_ishape
+
+ def get_normal_output_shape(self, ind=0):
+ # every output stream has the same shape as the input, so the
+ # result does not depend on the stream index
+ return self.get_normal_input_shape()
+
+ def get_folded_output_shape(self, ind=0):
+ # every output stream has the same folded shape as the input, so the
+ # result does not depend on the stream index
+ return self.get_folded_input_shape()
+
+ def make_shape_compatible_op(self, model):
+ exp_ishape = self.get_normal_input_shape()
+ ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+ assert ishape == exp_ishape, "Unexpected input shape."
+ num_out = self.get_num_output_streams()
+ assert len(self.onnx_node.output) == num_out, "Unexpected number of outputs"
+
+ oshape = self.get_normal_output_shape()
+ ret = super().make_const_shape_op(oshape)
+ ret.output[:] = self.onnx_node.output
+ return ret
+
+ def infer_node_datatype(self, model):
+ node = self.onnx_node
+ idt = model.get_tensor_datatype(node.input[0])
+ if idt != self.get_input_datatype():
+ warn_str = "inputDataType changing for %s: %s -> %s " % (
+ node.name,
+ str(self.get_input_datatype()),
+ str(idt),
+ )
+ warnings.warn(warn_str)
+ self.set_nodeattr("inputDataType", idt.name)
+ odt = self.get_output_datatype()
+ for my_out in self.onnx_node.output:
+ model.set_tensor_datatype(my_out, odt)
+
+ def verify_node(self):
+ pass
+
+ def get_input_datatype(self, ind=0):
+ """Returns FINN DataType of input."""
+ return DataType[self.get_nodeattr("inputDataType")]
+
+ def get_output_datatype(self, ind=0):
+ """Returns FINN DataType of output."""
+ return DataType[self.get_nodeattr("inputDataType")]
+
+ def get_instream_width(self, ind=0):
+ """Returns input stream width."""
+ ibits = self.get_input_datatype().bitwidth()
+ pe = self.get_nodeattr("PE")
+ in_width = pe * ibits
+ return in_width
+
+ def get_outstream_width(self, ind=0):
+ """Returns output stream width."""
+ obits = self.get_output_datatype().bitwidth()
+ pe = self.get_nodeattr("PE")
+ out_width = pe * obits
+ return out_width
+
+ def get_number_output_values(self):
+ return self.get_num_output_streams() * np.prod(self.get_folded_output_shape()[1:-1])
+
+ def get_exp_cycles(self):
+ # Channels/PE * batch size * fmdim * fmdim
+ return np.prod(self.get_folded_output_shape()[:-1])
+
+ def execute_node(self, context, graph):
+ # pass the input through to every output to make
+ # the abstraction layer executable
+ node = self.onnx_node
+ inp = context[node.input[0]]
+ exp_shape = self.get_normal_input_shape()
+
+ output = inp
+ output = np.asarray([output], dtype=np.float32).reshape(*exp_shape)
+ for outp in node.output:
+ context[outp] = output
+
+ def get_verilog_top_module_intf_names(self):
+ intf_names = super().get_verilog_top_module_intf_names()
+ n_outputs = self.get_num_output_streams()
+ sname = self.hls_sname()
+ intf_names["m_axis"] = []
+ for i in range(n_outputs):
+ intf_names["m_axis"].append(
+ ("out%d_%s" % (i, sname), self.get_outstream_width_padded())
+ )
+ return intf_names
+
+ def derive_characteristic_fxns(self, period):
+ n_inps = np.prod(self.get_folded_input_shape()[:-1])
+ io_dict = {
+ "inputs": {
+ "in0": [0 for i in range(n_inps)],
+ },
+ "outputs": {"out0": [], "out1": []},
+ }
+ super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
diff --git a/src/finn/custom_op/fpgadataflow/fmpadding.py b/src/finn/custom_op/fpgadataflow/fmpadding.py
new file mode 100644
index 0000000000..5767028ea7
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/fmpadding.py
@@ -0,0 +1,172 @@
+# Copyright (C) 2023, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import warnings
+from qonnx.core.datatype import DataType
+
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
+
+
+class FMPadding(HWCustomOp):
+ """Abstraction layer for HW implementation of FMPadding.
+ Pads input image by given amount."""
+
+ def __init__(self, onnx_node, **kwargs):
+ super().__init__(onnx_node, **kwargs)
+
+ def get_nodeattr_types(self):
+ my_attrs = {
+ # spatial size of input images
+ "ImgDim": ("ints", True, []), # [H, W] = [Y, X]
+ # total padding (per dimension) to apply
+ "Padding": (
+ "ints",
+ True,
+ [1, 1, 1, 1],
+ ), # [H_begin, W_begin, H_end, W_end] = [Y_begin, X_begin, Y_end, X_end]
+ # number of channels in input image
+ "NumChannels": ("i", True, 0),
+ # SIMD Input parallelism
+ "SIMD": ("i", False, 1),
+ # FINN input datatype
+ "inputDataType": ("s", True, ""),
+ # shape describing input vecs per execution
+ "numInputVectors": ("i", False, 1),
+ }
+ my_attrs.update(super().get_nodeattr_types())
+ return my_attrs
+
+ def get_padded_odim(self):
+ "Return the padded spatial size of the output."
+ idim_h, idim_w = self.get_nodeattr("ImgDim")
+ pad = self.get_nodeattr("Padding")
+ pad_h = pad[0] + pad[2]
+ pad_w = pad[1] + pad[3]
+ odim_h = idim_h + pad_h
+ odim_w = idim_w + pad_w
+ return [odim_h, odim_w]
+
+ def get_exp_cycles(self):
+ odim_h, odim_w = self.get_padded_odim()
+ channels = self.get_nodeattr("NumChannels")
+ simd = self.get_nodeattr("SIMD")
+ batch_size = self.get_nodeattr("numInputVectors")
+ exp_cycles = (channels / simd) * batch_size * odim_h * odim_w
+ return int(exp_cycles)
+
+ def get_normal_input_shape(self, ind=0):
+ idim_h, idim_w = self.get_nodeattr("ImgDim")
+ num_ch = self.get_nodeattr("NumChannels")
+ ishape = (1, idim_h, idim_w, num_ch)
+ return ishape
+
+ def get_normal_output_shape(self, ind=0):
+ odim_h, odim_w = self.get_padded_odim()
+ num_ch = self.get_nodeattr("NumChannels")
+
+ oshape = (1, odim_h, odim_w, num_ch)
+ return oshape
+
+ def get_folded_input_shape(self, ind=0):
+ normal_ishape = list(self.get_normal_input_shape())
+ ifm_ch = self.get_nodeattr("NumChannels")
+ simd = self.get_nodeattr("SIMD")
+ assert ifm_ch % simd == 0, "SIMD must divide input channels"
+ fold = int(normal_ishape[-1] / simd)
+ folded_ishape = normal_ishape[:-1] + [fold, simd]
+ return tuple(folded_ishape)
+
+ def get_folded_output_shape(self, ind=0):
+ normal_oshape = list(self.get_normal_output_shape())
+ ifm_ch = self.get_nodeattr("NumChannels")
+ simd = self.get_nodeattr("SIMD")
+ assert ifm_ch % simd == 0, "SIMD must divide input channels"
+ fold = int(normal_oshape[-1] / simd)
+ folded_oshape = normal_oshape[:-1] + [fold, simd]
+ return tuple(folded_oshape)
+
+ def make_shape_compatible_op(self, model):
+ exp_ishape = self.get_normal_input_shape()
+ oshape = self.get_normal_output_shape()
+ ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+ assert ishape == exp_ishape, "Unexpect input shape for FMPadding."
+ return super().make_const_shape_op(oshape)
+
+ def infer_node_datatype(self, model):
+ node = self.onnx_node
+ idt = model.get_tensor_datatype(node.input[0])
+ if idt != self.get_input_datatype():
+ warn_str = "inputDataType changing for %s: %s -> %s " % (
+ node.name,
+ str(self.get_input_datatype()),
+ str(idt),
+ )
+ warnings.warn(warn_str)
+ self.set_nodeattr("inputDataType", idt.name)
+ model.set_tensor_datatype(node.output[0], idt)
+
+ def verify_node(self):
+ pass
+
+ def get_input_datatype(self, ind=0):
+ """Returns FINN DataType of input."""
+ ret = DataType[self.get_nodeattr("inputDataType")]
+ # the hlslib op always pads with zeros, so ensure that the DataType
+ # is able to represent zeros
+ assert ret.allowed(0), "FMPadding_Batch DataType must support zero"
+ return ret
+
+ def get_output_datatype(self, ind=0):
+ """Returns FINN DataType of output. (Same as input datatype)"""
+ return self.get_input_datatype()
+
+ def get_instream_width(self, ind=0):
+ ibits = self.get_input_datatype().bitwidth()
+ simd = self.get_nodeattr("SIMD")
+ return ibits * simd
+
+ def get_outstream_width(self, ind=0):
+ obits = self.get_output_datatype().bitwidth()
+ simd = self.get_nodeattr("SIMD")
+ return obits * simd
+
+ def get_number_output_values(self):
+ folded_oshape = self.get_folded_output_shape()
+ return np.prod(folded_oshape[:-1])
+
+ def execute_node(self, context, graph):
+ # simulate behavior with Python functionality
+ node = self.onnx_node
+ pad = self.get_nodeattr("Padding")
+ inp_values = context[node.input[0]]
+ oshape = context[node.output[0]].shape
+ result = np.pad(
+ inp_values, ((0, 0), (pad[0], pad[2]), (pad[1], pad[3]), (0, 0)), "constant"
+ )
+ context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape)
diff --git a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
deleted file mode 100644
index dfc55d283f..0000000000
--- a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py
+++ /dev/null
@@ -1,391 +0,0 @@
-# Copyright (c) 2020, Xilinx
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# * Neither the name of FINN nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import numpy as np
-import os
-import warnings
-from qonnx.core.datatype import DataType
-
-from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
-from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
-
-
-class FMPadding_Batch(HLSCustomOp):
- """Corresponds to finn-hlslib FMPadding_Batch function.
- Pads input image by given amount."""
-
- def __init__(self, onnx_node):
- super().__init__(onnx_node)
-
- def get_nodeattr_types(self):
- my_attrs = {
- # spatial size of input images
- "ImgDim": ("ints", True, []), # [H, W] = [Y, X]
- # total padding (per dimension) to apply
- "Padding": (
- "ints",
- True,
- [1, 1, 1, 1],
- ), # [H_begin, W_begin, H_end, W_end] = [Y_begin, X_begin, Y_end, X_end]
- # number of channels in input image
- "NumChannels": ("i", True, 0),
- # SIMD Input parallelism
- "SIMD": ("i", False, 1),
- # FINN input datatype
- "inputDataType": ("s", True, ""),
- # shape describing input vecs per execution
- "numInputVectors": ("i", False, 1),
- }
- my_attrs.update(super().get_nodeattr_types())
- return my_attrs
-
- def get_padded_odim(self):
- "Return the padded spatial size of the output."
- idim_h, idim_w = self.get_nodeattr("ImgDim")
- pad = self.get_nodeattr("Padding")
- pad_h = pad[0] + pad[2]
- pad_w = pad[1] + pad[3]
- odim_h = idim_h + pad_h
- odim_w = idim_w + pad_w
- return [odim_h, odim_w]
-
- def get_exp_cycles(self):
- odim_h, odim_w = self.get_padded_odim()
- channels = self.get_nodeattr("NumChannels")
- simd = self.get_nodeattr("SIMD")
- batch_size = self.get_nodeattr("numInputVectors")
- exp_cycles = (channels / simd) * batch_size * odim_h * odim_w
- return int(exp_cycles)
-
- def get_normal_input_shape(self, ind=0):
- idim_h, idim_w = self.get_nodeattr("ImgDim")
- num_ch = self.get_nodeattr("NumChannels")
- ishape = (1, idim_h, idim_w, num_ch)
- return ishape
-
- def get_normal_output_shape(self, ind=0):
- odim_h, odim_w = self.get_padded_odim()
- num_ch = self.get_nodeattr("NumChannels")
-
- oshape = (1, odim_h, odim_w, num_ch)
- return oshape
-
- def get_folded_input_shape(self, ind=0):
- normal_ishape = list(self.get_normal_input_shape())
- ifm_ch = self.get_nodeattr("NumChannels")
- simd = self.get_nodeattr("SIMD")
- assert ifm_ch % simd == 0, "SIMD must divide input channels"
- fold = int(normal_ishape[-1] / simd)
- folded_ishape = normal_ishape[:-1] + [fold, simd]
- return tuple(folded_ishape)
-
- def get_folded_output_shape(self, ind=0):
- normal_oshape = list(self.get_normal_output_shape())
- ifm_ch = self.get_nodeattr("NumChannels")
- simd = self.get_nodeattr("SIMD")
- assert ifm_ch % simd == 0, "SIMD must divide input channels"
- fold = int(normal_oshape[-1] / simd)
- folded_oshape = normal_oshape[:-1] + [fold, simd]
- return tuple(folded_oshape)
-
- def make_shape_compatible_op(self, model):
- exp_ishape = self.get_normal_input_shape()
- oshape = self.get_normal_output_shape()
- ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
- assert ishape == exp_ishape, "Unexpect input shape for SameResize."
- return super().make_const_shape_op(oshape)
-
- def infer_node_datatype(self, model):
- node = self.onnx_node
- idt = model.get_tensor_datatype(node.input[0])
- if idt != self.get_input_datatype():
- warn_str = "inputDataType changing for %s: %s -> %s " % (
- node.name,
- str(self.get_input_datatype()),
- str(idt),
- )
- warnings.warn(warn_str)
- self.set_nodeattr("inputDataType", idt.name)
- model.set_tensor_datatype(node.output[0], idt)
-
- def verify_node(self):
- pass
-
- def get_input_datatype(self, ind=0):
- """Returns FINN DataType of input."""
- ret = DataType[self.get_nodeattr("inputDataType")]
- # the hlslib op always pads with zeros, so ensure that the DataType
- # is able to represent zeros
- assert ret.allowed(0), "FMPadding_Batch DataType must support zero"
- return ret
-
- def get_output_datatype(self, ind=0):
- """Returns FINN DataType of output. (Same as input datatype)"""
- return self.get_input_datatype()
-
- def get_instream_width(self, ind=0):
- ibits = self.get_input_datatype().bitwidth()
- simd = self.get_nodeattr("SIMD")
- return ibits * simd
-
- def get_outstream_width(self, ind=0):
- obits = self.get_output_datatype().bitwidth()
- simd = self.get_nodeattr("SIMD")
- return obits * simd
-
- def get_number_output_values(self):
- folded_oshape = self.get_folded_output_shape()
- return np.prod(folded_oshape[:-1])
-
- def global_includes(self):
- self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"']
-
- def defines(self, var):
- idim_h, idim_w = self.get_nodeattr("ImgDim")
- odim_h, odim_w = self.get_padded_odim()
- pad = self.get_nodeattr("Padding")
- pad_h = pad[0] + pad[2]
- pad_w = pad[1] + pad[3]
- is_square_img = idim_h == idim_w
- is_square_pad = pad_h == pad_w
-
- if is_square_img and is_square_pad:
- self.code_gen_dict["$DEFINES$"] = [
- """#define ImgDim1 {}\n#define OutputDim1 {}\n
- #define PaddingBefore1 {}\n#define PaddingBehind1 {}\n
- #define NumChannels1 {}\n#define SIMD1 {}\n
- #define numReps {}\n""".format(
- idim_h,
- odim_h,
- pad[0],
- pad[2],
- self.get_nodeattr("NumChannels"),
- self.get_nodeattr("SIMD"),
- self.get_nodeattr("numInputVectors"),
- )
- ]
- else:
- self.code_gen_dict["$DEFINES$"] = [
- """
- #define OutputDim1_x {}\n
- #define OutputDim1_y {}\n
- #define PaddingLeft1 {}\n
- #define PaddingRight1 {}\n
- #define PaddingTop1 {}\n
- #define PaddingBottom1 {}\n
- #define NumChannels1 {}\n
- #define SIMD1 {}\n
- #define numReps {}\n
- """.format(
- odim_w,
- odim_h,
- pad[1],
- pad[3],
- pad[0],
- pad[2],
- self.get_nodeattr("NumChannels"),
- self.get_nodeattr("SIMD"),
- self.get_nodeattr("numInputVectors"),
- )
- ]
-
- def read_npy_data(self):
- code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
- dtype = self.get_input_datatype()
- if dtype == DataType["BIPOLAR"]:
- # use binary for bipolar storage
- dtype = DataType["BINARY"]
- elem_bits = dtype.bitwidth()
- packed_bits = self.get_instream_width()
- packed_hls_type = "ap_uint<%d>" % packed_bits
- elem_hls_type = dtype.get_hls_datatype_str()
- npy_type = "float"
- npy_in = "%s/input_0.npy" % code_gen_dir
- self.code_gen_dict["$READNPYDATA$"] = []
- self.code_gen_dict["$READNPYDATA$"].append(
- 'npy2apintstream<%s, %s, %d, %s>("%s", in0);'
- % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
- )
-
- def strm_decl(self):
- self.code_gen_dict["$STREAMDECLARATIONS$"] = []
- self.code_gen_dict["$STREAMDECLARATIONS$"].append(
- 'hls::stream> in0 ("in0");'.format(self.get_instream_width())
- )
- self.code_gen_dict["$STREAMDECLARATIONS$"].append(
- 'hls::stream> out ("out");'.format(self.get_outstream_width())
- )
-
- def docompute(self):
- in_t = self.get_input_datatype().get_hls_datatype_str()
- node = self.onnx_node
-
- idim_h, idim_w = self.get_nodeattr("ImgDim")
- pad = self.get_nodeattr("Padding")
- pad_h = pad[0] + pad[2]
- pad_w = pad[1] + pad[3]
- is_square_img = idim_h == idim_w
- is_square_pad = pad_h == pad_w
-
- if is_square_img and is_square_pad:
- hls_call = node.op_type
- self.code_gen_dict["$DOCOMPUTE$"] = [
- """{} (in0, out, numReps);""".format(
- hls_call, in_t
- )
- ]
- else:
- hls_call = "FMPadding_nonsquare_Batch"
- self.code_gen_dict["$DOCOMPUTE$"] = [
- """{} (in0, out, numReps);""".format(
- hls_call, in_t
- )
- ]
-
- def dataoutstrm(self):
- code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
- dtype = self.get_output_datatype()
- if dtype == DataType["BIPOLAR"]:
- # use binary for bipolar storage
- dtype = DataType["BINARY"]
- elem_bits = dtype.bitwidth()
- packed_bits = self.get_outstream_width()
- packed_hls_type = "ap_uint<%d>" % packed_bits
- elem_hls_type = dtype.get_hls_datatype_str()
- npy_type = "float"
- npy_out = "%s/output.npy" % code_gen_dir
- oshape = self.get_folded_output_shape()
- oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
-
- self.code_gen_dict["$DATAOUTSTREAM$"] = [
- 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");'
- % (
- packed_hls_type,
- elem_hls_type,
- elem_bits,
- npy_type,
- oshape_cpp_str,
- npy_out,
- )
- ]
-
- def save_as_npy(self):
- self.code_gen_dict["$SAVEASCNPY$"] = []
-
- def blackboxfunction(self):
- packed_bits = self.get_instream_width()
- packed_hls_type = "ap_uint<%d>" % packed_bits
- self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
- "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)"
- % (self.onnx_node.name, packed_hls_type, packed_hls_type)
- ]
-
- def pragmas(self):
- self.code_gen_dict["$PRAGMAS$"] = [
- "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname()
- ]
- self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname()
- )
- self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE ap_ctrl_none port=return"
- )
-
- def execute_node(self, context, graph):
- mode = self.get_nodeattr("exec_mode")
- node = self.onnx_node
- exp_ishape = self.get_normal_input_shape()
- exp_oshape = self.get_normal_output_shape()
- folded_ishape = self.get_folded_input_shape()
-
- if mode == "cppsim":
- code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
- elif mode == "rtlsim":
- code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
- else:
- raise Exception(
- """Invalid value for attribute exec_mode! Is currently set to: {}
- has to be set to one of the following value ("cppsim", "rtlsim")""".format(
- mode
- )
- )
-
- inp = context[node.input[0]]
- assert str(inp.dtype) == "float32", "Input datatype is not float32"
- assert (
- inp.shape == exp_ishape
- ), """Input shape doesn't
- match expected shape (1, ImgDim_h, ImgDim_w, NumChannels)."""
- export_idt = self.get_input_datatype()
-
- reshaped_input = inp.reshape(folded_ishape)
- np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
-
- if mode == "cppsim":
- # execute the precompiled model
- super().exec_precompiled_singlenode_model()
- # load output npy file
- super().npy_to_dynamic_output(context)
- assert (
- context[node.output[0]].shape == exp_oshape
- ), "cppsim did not produce expected output shape"
- elif mode == "rtlsim":
- sim = self.get_rtlsim()
- nbits = self.get_instream_width()
- rtlsim_inp = npy_to_rtlsim_input(
- "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
- )
- super().reset_rtlsim(sim)
- super().toggle_clk(sim)
- rtlsim_output = self.rtlsim(sim, rtlsim_inp)
- odt = export_idt
- target_bits = odt.bitwidth()
- packed_bits = self.get_outstream_width()
- out_npy_path = "{}/output.npy".format(code_gen_dir)
- out_shape = self.get_folded_output_shape()
- rtlsim_output_to_npy(
- rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
- )
- # load and reshape output
- output = np.load(out_npy_path)
- output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
- context[node.output[0]] = output
- else:
- raise Exception(
- """Invalid value for attribute exec_mode! Is currently set to: {}
- has to be set to one of the following value ("cppsim", "rtlsim")""".format(
- mode
- )
- )
- assert (
- context[node.output[0]].shape == exp_oshape
- ), """Output shape doesn't match expected shape
- (1, OutputDim_H, OutputDim_W, NumChannels)."""
diff --git a/src/finn/custom_op/fpgadataflow/fmpadding_pixel.py b/src/finn/custom_op/fpgadataflow/fmpadding_pixel.py
new file mode 100644
index 0000000000..b1f9900070
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/fmpadding_pixel.py
@@ -0,0 +1,175 @@
+# Copyright (c) 2024, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# * Neither the name of Xilinx nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import numpy as np
+import warnings
+from qonnx.core.datatype import DataType
+
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
+
+
+class FMPadding_Pixel(HWCustomOp):
+    """Custom op that spreads the pixels of an input image apart by a stride.
+
+    Input pixel (h, w) ends up at output position (h * stride_h, w * stride_w);
+    every other output position is zero. Tensors are handled in
+    (1, height, width, channels) layout."""
+
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
+
+    def get_nodeattr_types(self):
+        """Return the attribute specification of this op merged with the
+        attributes declared by the parent class (parent entries take
+        precedence on name clashes)."""
+        my_attrs = {
+            # spatial size of input images
+            "ImgDim": ("ints", True, []),
+            # stride to apply, can be non-square
+            "Stride": ("ints", True, []),
+            # number of channels in input image
+            "NumChannels": ("i", True, 0),
+            # SIMD Input parallelism
+            "SIMD": ("i", False, 1),
+            # FINN input datatype
+            "inputDataType": ("s", True, ""),
+            # shape describing input vecs per execution
+            "numInputVectors": ("i", False, 1),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_padded_odim(self):
+        """Return the padded spatial size of the output as [height, width].
+
+        Per dimension: odim = idim + (idim - 1) * (stride - 1)."""
+        idim_h, idim_w = self.get_nodeattr("ImgDim")
+        stride_h, stride_w = self.get_nodeattr("Stride")
+        odim_h = idim_h + (idim_h - 1) * (stride_h - 1)
+        odim_w = idim_w + (idim_w - 1) * (stride_w - 1)
+        return [odim_h, odim_w]
+
+    def get_exp_cycles(self):
+        """Return the expected cycle count:
+        (NumChannels / SIMD) * numInputVectors * output height * output width,
+        truncated to an int."""
+        odim_h, odim_w = self.get_padded_odim()
+        channels = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        batch_size = self.get_nodeattr("numInputVectors")
+        exp_cycles = (channels / simd) * batch_size * odim_h * odim_w
+        return int(exp_cycles)
+
+    def get_normal_input_shape(self, ind=0):
+        """Return the unfolded input shape (1, ImgDim_h, ImgDim_w, NumChannels)."""
+        idim_h, idim_w = self.get_nodeattr("ImgDim")
+        num_ch = self.get_nodeattr("NumChannels")
+        ishape = (1, idim_h, idim_w, num_ch)
+        return ishape
+
+    def get_normal_output_shape(self, ind=0):
+        """Return the unfolded output shape (1, odim_h, odim_w, NumChannels)."""
+        odim_h, odim_w = self.get_padded_odim()
+        num_ch = self.get_nodeattr("NumChannels")
+        oshape = (1, odim_h, odim_w, num_ch)
+        return oshape
+
+    def get_folded_input_shape(self, ind=0):
+        """Return the input shape with the channel dimension split into
+        (NumChannels / SIMD, SIMD). Asserts that SIMD divides NumChannels."""
+        normal_ishape = list(self.get_normal_input_shape())
+        ifm_ch = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        assert ifm_ch % simd == 0, "SIMD must divide input channels"
+        fold = int(normal_ishape[-1] / simd)
+        folded_ishape = normal_ishape[:-1] + [fold, simd]
+        return tuple(folded_ishape)
+
+    def get_folded_output_shape(self, ind=0):
+        """Return the output shape with the channel dimension split into
+        (NumChannels / SIMD, SIMD). Asserts that SIMD divides NumChannels."""
+        normal_oshape = list(self.get_normal_output_shape())
+        ifm_ch = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        assert ifm_ch % simd == 0, "SIMD must divide input channels"
+        fold = int(normal_oshape[-1] / simd)
+        folded_oshape = normal_oshape[:-1] + [fold, simd]
+        return tuple(folded_oshape)
+
+    def make_shape_compatible_op(self, model):
+        """Check the model's input tensor shape against the expected input
+        shape and return the parent's constant-shape op for the output shape."""
+        exp_ishape = self.get_normal_input_shape()
+        oshape = self.get_normal_output_shape()
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert ishape == exp_ishape, "Unexpected input shape for FMPadding_Pixel."
+        return super().make_const_shape_op(oshape)
+
+    def infer_node_datatype(self, model):
+        """Take over the datatype the model holds for the first input tensor.
+
+        Warns when it differs from the current inputDataType attribute, then
+        stores it in that attribute and annotates the first output tensor
+        with it."""
+        node = self.onnx_node
+        idt = model.get_tensor_datatype(node.input[0])
+        if idt != self.get_input_datatype():
+            warn_str = "inputDataType changing for %s: %s -> %s " % (
+                node.name,
+                str(self.get_input_datatype()),
+                str(idt),
+            )
+            warnings.warn(warn_str)
+        self.set_nodeattr("inputDataType", idt.name)
+        model.set_tensor_datatype(node.output[0], idt)
+
+    def verify_node(self):
+        """No node-specific verification is performed."""
+        pass
+
+    def get_input_datatype(self, ind=0):
+        """Returns FINN DataType of input."""
+        ret = DataType[self.get_nodeattr("inputDataType")]
+        # the hlslib op always pads with zeros, so ensure that the DataType
+        # is able to represent zeros
+        assert ret.allowed(0), "FMPadding_Pixel DataType must support zero"
+        return ret
+
+    def get_output_datatype(self, ind=0):
+        """Returns FINN DataType of output. (Same as input datatype)"""
+        return self.get_input_datatype()
+
+    def get_instream_width(self, ind=0):
+        """Returns input stream width (input bitwidth * SIMD)."""
+        ibits = self.get_input_datatype().bitwidth()
+        simd = self.get_nodeattr("SIMD")
+        return ibits * simd
+
+    def get_outstream_width(self, ind=0):
+        """Returns output stream width (output bitwidth * SIMD)."""
+        obits = self.get_output_datatype().bitwidth()
+        simd = self.get_nodeattr("SIMD")
+        return obits * simd
+
+    def get_number_output_values(self):
+        """Returns the product of all folded output dimensions but the last."""
+        folded_oshape = self.get_folded_output_shape()
+        return np.prod(folded_oshape[:-1])
+
+    def execute_node(self, context, graph):
+        """Simulate the op's behavior with NumPy.
+
+        Builds a zero tensor whose spatial size follows from the input shape
+        and the Stride attribute, copies input pixel (h, w) to position
+        (h * s_h, w * s_w) and stores the float32 result under the first
+        output name, reshaped to the shape of the tensor already held there.
+        The graph argument is not used."""
+        node = self.onnx_node
+        s_h, s_w = self.get_nodeattr("Stride")
+        inp_values = context[node.input[0]]
+        batch, idim_h, idim_w, num_ch = inp_values.shape
+        result = np.zeros(
+            (
+                batch,
+                idim_h + (idim_h - 1) * (s_h - 1),
+                idim_w + (idim_w - 1) * (s_w - 1),
+                num_ch,
+            )
+        )
+        # a single strided assignment places every input pixel; this selects
+        # exactly idim_h x idim_w positions per image
+        # assumes strides >= 1 (NumPy rejects a slice step of zero)
+        result[:, ::s_h, ::s_w, :] = inp_values
+        oshape = context[node.output[0]].shape
+        context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape)
diff --git a/src/finn/custom_op/fpgadataflow/globalaccpool.py b/src/finn/custom_op/fpgadataflow/globalaccpool.py
new file mode 100644
index 0000000000..4008cdc7c9
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/globalaccpool.py
@@ -0,0 +1,160 @@
+# Copyright (C) 2023, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import warnings
+from qonnx.core.datatype import DataType
+
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
+
+
+class GlobalAccPool(HWCustomOp):
+    """Abstraction layer for HW implementation of GlobalAccPool"""
+
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
+
+    def get_nodeattr_types(self):
+        """Return the attribute specification of this op merged with the
+        attributes declared by the parent class (parent entries take
+        precedence on name clashes)."""
+        my_attrs = {
+            "NumChannels": ("i", True, 0),
+            "PE": ("i", True, 0),
+            # FINN DataTypes for input
+            "inputDataType": ("s", True, ""),
+            # number of input vectors, examples:
+            # [1] is a single vector (like a FC layer with batch=1)
+            # [4] is four vectors (like a FC layer with batch=4)
+            # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
+            "numInputVectors": ("ints", False, [1]),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_normal_input_shape(self, ind=0):
+        """Return the unfolded input shape: numInputVectors + [NumChannels]."""
+        ch = self.get_nodeattr("NumChannels")
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        ishape = tuple(vecs + [ch])
+        return ishape
+
+    def get_folded_input_shape(self, ind=0):
+        """Return the input shape with the channel dimension split into
+        (NumChannels / PE, PE). Asserts that PE divides NumChannels."""
+        ch = self.get_nodeattr("NumChannels")
+        pe = self.get_nodeattr("PE")
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        assert ch % pe == 0, "PE must divide NumChannels"
+        folds = int(ch / pe)
+        folded_ishape = tuple(vecs + [folds, pe])
+        return folded_ishape
+
+    def get_normal_output_shape(self, ind=0):
+        """Return the unfolded output shape.
+
+        A one-entry numInputVectors keeps the input shape; a three-entry one
+        yields (numInputVectors[0], 1, 1, NumChannels).
+
+        Raises ValueError for any other length of numInputVectors."""
+        ch = self.get_nodeattr("NumChannels")
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        if len(vecs) == 1:
+            return tuple(vecs + [ch])
+        if len(vecs) == 3:
+            return tuple([vecs[0]] + [1, 1, ch])
+        # fail with a clear message instead of an unbound local variable
+        raise ValueError(
+            "numInputVectors must have 1 or 3 entries, got %d" % len(vecs)
+        )
+
+    def get_folded_output_shape(self, ind=0):
+        """Return the output shape with the channel dimension split into
+        (NumChannels / PE, PE). Asserts that PE divides NumChannels."""
+        ch = self.get_nodeattr("NumChannels")
+        pe = self.get_nodeattr("PE")
+        unfolded_shape = list(self.get_normal_output_shape())
+        assert ch % pe == 0, "PE must divide NumChannels"
+        folds = int(ch / pe)
+        oshape = tuple(unfolded_shape[:-1] + [folds, pe])
+        return oshape
+
+    def make_shape_compatible_op(self, model):
+        """Check the model's input tensor shape against the expected input
+        shape and return the parent's constant-shape op for the output shape."""
+        exp_ishape = self.get_normal_input_shape()
+        oshape = self.get_normal_output_shape()
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert ishape == exp_ishape, "Unexpected input shape."
+        return super().make_const_shape_op(oshape)
+
+    def infer_node_datatype(self, model):
+        """Take over the datatype the model holds for the first input tensor.
+
+        Warns when it differs from the current inputDataType attribute, stores
+        it in that attribute and annotates the first output tensor with the
+        datatype returned by get_output_datatype()."""
+        node = self.onnx_node
+        idt = model.get_tensor_datatype(node.input[0])
+        if idt != self.get_input_datatype():
+            warn_str = "inputDataType changing for %s: %s -> %s " % (
+                node.name,
+                str(self.get_input_datatype()),
+                str(idt),
+            )
+            warnings.warn(warn_str)
+        self.set_nodeattr("inputDataType", idt.name)
+        odt = self.get_output_datatype()
+        model.set_tensor_datatype(self.onnx_node.output[0], odt)
+
+    def verify_node(self):
+        """No node-specific verification is performed."""
+        pass
+
+    def get_input_datatype(self, ind=0):
+        """Returns FINN DataType of input."""
+        return DataType[self.get_nodeattr("inputDataType")]
+
+    def get_output_datatype(self, ind=0):
+        """Returns FINN DataType of output.
+
+        The type is the smallest one able to hold the input datatype's extreme
+        value (min for signed, max for unsigned inputs) multiplied by the
+        number of accumulated pixels."""
+        # determine data type from image size and input type
+        idt = DataType[self.get_nodeattr("inputDataType")]
+        vecs = list(self.get_nodeattr("numInputVectors"))
+        if len(vecs) >= 2:
+            npixels = vecs[-1] * vecs[-2]
+        else:
+            # no spatial dimensions to index (e.g. the default [1]);
+            # get_normal_output_shape keeps such shapes unchanged, so a single
+            # value per channel is assumed here - TODO confirm
+            npixels = 1
+        if idt.signed():
+            extreme_value = npixels * idt.min()
+        else:
+            extreme_value = npixels * idt.max()
+        return DataType.get_smallest_possible(extreme_value)
+
+    def get_instream_width(self, ind=0):
+        """Returns input stream width."""
+        ibits = self.get_input_datatype().bitwidth()
+        pe = self.get_nodeattr("PE")
+        in_width = pe * ibits
+        return in_width
+
+    def get_outstream_width(self, ind=0):
+        """Returns output stream width."""
+        obits = self.get_output_datatype().bitwidth()
+        pe = self.get_nodeattr("PE")
+        out_width = pe * obits
+        return out_width
+
+    def get_number_output_values(self):
+        """Returns the product of the folded output dimensions, excluding the
+        first and the last one."""
+        return np.prod(self.get_folded_output_shape()[1:-1])
+
+    def get_exp_cycles(self):
+        """Returns the expected cycle count: product of all folded input
+        dimensions except the last, plus NumChannels / PE."""
+        # Channels/PE * batch size * idim * idim + Channels/PE
+        ch = self.get_nodeattr("NumChannels")
+        pe = self.get_nodeattr("PE")
+        folds = int(ch / pe)
+        return int(np.prod(self.get_folded_input_shape()[:-1]) + folds)
+
+    def execute_node(self, context, graph):
+        """Simulate the op's behavior with NumPy.
+
+        Sums the first input tensor over axes 1 and 2 and stores the float32
+        result under the first output name, reshaped to the shape of the
+        tensor already held there. The graph argument is not used."""
+        # simulate behavior with Python functionality
+        node = self.onnx_node
+        inp_values = context[node.input[0]]
+        oshape = context[node.output[0]].shape
+        result = np.apply_over_axes(np.sum, inp_values, [1, 2])
+        context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape)
diff --git a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py
deleted file mode 100644
index e7fa5bc004..0000000000
--- a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py
+++ /dev/null
@@ -1,340 +0,0 @@
-# Copyright (c) 2020, Xilinx
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-# list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# * Neither the name of FINN nor the names of its
-# contributors may be used to endorse or promote products derived from
-# this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-import numpy as np
-import os
-import warnings
-from qonnx.core.datatype import DataType
-
-from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
-from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
-
-
-class GlobalAccPool_Batch(HLSCustomOp):
- """Class that corresponds to finn-hlslib AccPool_Batch function."""
-
- def __init__(self, onnx_node):
- super().__init__(onnx_node)
-
- def get_nodeattr_types(self):
- my_attrs = {
- "NumChannels": ("i", True, 0),
- "PE": ("i", True, 0),
- # FINN DataTypes for input
- "inputDataType": ("s", True, ""),
- # number of input vectors, examples:
- # [1] is a single vector (like a FC layer with batch=1)
- # [4] is four vectors (like a FC layer with batch=4)
- # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
- "numInputVectors": ("ints", False, [1]),
- }
- my_attrs.update(super().get_nodeattr_types())
- return my_attrs
-
- def get_normal_input_shape(self, ind=0):
- ch = self.get_nodeattr("NumChannels")
- vecs = list(self.get_nodeattr("numInputVectors"))
- ishape = tuple(vecs + [ch])
- return ishape
-
- def get_folded_input_shape(self, ind=0):
- ch = self.get_nodeattr("NumChannels")
- pe = self.get_nodeattr("PE")
- vecs = list(self.get_nodeattr("numInputVectors"))
- assert ch % pe == 0, "PE must divide NumChannels"
- folds = int(ch / pe)
- folded_ishape = tuple(vecs + [folds, pe])
- return folded_ishape
-
- def get_normal_output_shape(self, ind=0):
- ch = self.get_nodeattr("NumChannels")
- vecs = list(self.get_nodeattr("numInputVectors"))
- if len(vecs) == 1:
- oshape = tuple(vecs + [ch])
- elif len(vecs) == 3:
- oshape = tuple([vecs[0]] + [1, 1, ch])
- return oshape
-
- def get_folded_output_shape(self, ind=0):
- ch = self.get_nodeattr("NumChannels")
- pe = self.get_nodeattr("PE")
- unfolded_shape = list(self.get_normal_output_shape())
- assert ch % pe == 0, "PE must divide NumChannels"
- folds = int(ch / pe)
- oshape = tuple(unfolded_shape[:-1] + [folds, pe])
- return oshape
-
- def make_shape_compatible_op(self, model):
- exp_ishape = self.get_normal_input_shape()
- oshape = self.get_normal_output_shape()
- ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
- assert ishape == exp_ishape, "Unexpected input shape."
- return super().make_const_shape_op(oshape)
-
- def infer_node_datatype(self, model):
- node = self.onnx_node
- idt = model.get_tensor_datatype(node.input[0])
- if idt != self.get_input_datatype():
- warn_str = "inputDataType changing for %s: %s -> %s " % (
- node.name,
- str(self.get_input_datatype()),
- str(idt),
- )
- warnings.warn(warn_str)
- self.set_nodeattr("inputDataType", idt.name)
- odt = self.get_output_datatype()
- model.set_tensor_datatype(self.onnx_node.output[0], odt)
-
- def verify_node(self):
- info_messages = []
- # verify that "backend" is set to "fpgadataflow"
- backend_value = self.get_nodeattr("backend")
- if backend_value == "fpgadataflow":
- info_messages.append("Attribute backend is set correctly")
- else:
- info_messages.append('Attribute backend should be set to "fpgadataflow"')
-
- # verify that all necessary attributes exist
- try:
- self.get_nodeattr("code_gen_dir_cppsim")
- self.get_nodeattr("executable_path")
- self.get_nodeattr("NumChannels")
- self.get_nodeattr("PE")
- self.get_nodeattr("inputDataType")
- info_messages.append("All necessary attributes exist")
- except Exception:
- info_messages.append(
- """The required GlobalAccPool_Batch attributes do not exist."""
- )
-
- # verify that input data is 2D
- if len(self.get_nodeattr("numInputVectors")) != 3:
- info_messages.append("""GlobalAccPool_Batch requires 2D data input.""")
- raise Exception
-
- return info_messages
-
- def get_input_datatype(self, ind=0):
- """Returns FINN DataType of input."""
- return DataType[self.get_nodeattr("inputDataType")]
-
- def get_output_datatype(self, ind=0):
- """Returns FINN DataType of output."""
- # determine data type from image size and input type
- idt = DataType[self.get_nodeattr("inputDataType")]
- vecs = list(self.get_nodeattr("numInputVectors"))
- npixels = vecs[-1] * vecs[-2]
- if idt.signed():
- extreme_value = npixels * idt.min()
- else:
- extreme_value = npixels * idt.max()
- return DataType.get_smallest_possible(extreme_value)
-
- def get_instream_width(self, ind=0):
- """Returns input stream width."""
- ibits = self.get_input_datatype().bitwidth()
- pe = self.get_nodeattr("PE")
- in_width = pe * ibits
- return in_width
-
- def get_outstream_width(self, ind=0):
- """Returns output stream width."""
- obits = self.get_output_datatype().bitwidth()
- pe = self.get_nodeattr("PE")
- out_width = pe * obits
- return out_width
-
- def get_number_output_values(self):
- return np.prod(self.get_folded_output_shape()[1:-1])
-
- def get_exp_cycles(self):
- # Channels/PE * batch size * idim * idim + Channels/PE
- ch = self.get_nodeattr("NumChannels")
- pe = self.get_nodeattr("PE")
- folds = int(ch / pe)
- return int(np.prod(self.get_folded_input_shape()[:-1]) + folds)
-
- def execute_node(self, context, graph):
- mode = self.get_nodeattr("exec_mode")
- node = self.onnx_node
- exp_ishape = self.get_normal_input_shape()
- exp_oshape = self.get_normal_output_shape()
- folded_ishape = self.get_folded_input_shape()
-
- if mode == "cppsim":
- code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
- elif mode == "rtlsim":
- code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
- else:
- raise Exception(
- """Invalid value for attribute exec_mode! Is currently set to: {}
- has to be set to one of the following value ("cppsim", "rtlsim")""".format(
- mode
- )
- )
-
- inp = context[node.input[0]]
- assert str(inp.dtype) == "float32", "Input datatype is not float32"
- assert inp.shape == exp_ishape, """Input shape doesn't match expected shape ."""
- export_idt = self.get_input_datatype()
- # reshape input into folded form
- inp = inp.reshape(folded_ishape)
- # make copy before saving array
- reshaped_input = inp.copy()
- np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
-
- if mode == "cppsim":
- # execute the precompiled model
- super().exec_precompiled_singlenode_model()
- # load output npy file
- super().npy_to_dynamic_output(context)
- assert (
- context[node.output[0]].shape == exp_oshape
- ), "cppsim \
- did not produce expected output shape"
- elif mode == "rtlsim":
- sim = self.get_rtlsim()
- nbits = self.get_instream_width()
- rtlsim_inp = npy_to_rtlsim_input(
- "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
- )
- super().reset_rtlsim(sim)
- super().toggle_clk(sim)
- rtlsim_output = self.rtlsim(sim, rtlsim_inp)
- odt = self.get_output_datatype()
- target_bits = odt.bitwidth()
- packed_bits = self.get_outstream_width()
- out_npy_path = "{}/output.npy".format(code_gen_dir)
- out_shape = self.get_folded_output_shape()
- rtlsim_output_to_npy(
- rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
- )
- # load and reshape output
- output = np.load(out_npy_path)
- output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
- context[node.output[0]] = output
- else:
- raise Exception(
- """Invalid value for attribute exec_mode! Is currently set to: {}
- has to be set to one of the following value ("cppsim", "rtlsim")""".format(
- mode
- )
- )
-
- assert (
- context[node.output[0]].shape == exp_oshape
- ), """Output shape doesn't match expected shape."""
-
- def global_includes(self):
- self.code_gen_dict["$GLOBALS$"] = ['#include "maxpool.h"']
-
- def defines(self, var):
- self.code_gen_dict["$DEFINES$"] = []
-
- def read_npy_data(self):
- code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
- dtype = self.get_input_datatype()
- elem_bits = dtype.bitwidth()
- packed_bits = self.get_instream_width()
- packed_hls_type = "ap_uint<%d>" % packed_bits
- elem_hls_type = dtype.get_hls_datatype_str()
- npy_type = "float"
- npy_in = "%s/input_0.npy" % code_gen_dir
- self.code_gen_dict["$READNPYDATA$"] = []
- self.code_gen_dict["$READNPYDATA$"].append(
- 'npy2apintstream<%s, %s, %d, %s>("%s", in0);'
- % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
- )
-
- def strm_decl(self):
- self.code_gen_dict["$STREAMDECLARATIONS$"] = []
- self.code_gen_dict["$STREAMDECLARATIONS$"].append(
- 'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
- )
- self.code_gen_dict["$STREAMDECLARATIONS$"].append(
- 'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
- )
-
- def docompute(self):
- self.code_gen_dict["$DOCOMPUTE$"] = [
- """AccPool_Batch<{}, {}, {}, {}, {}> (in0, out, 1);""".format(
- self.get_normal_input_shape()[1],
- self.get_nodeattr("NumChannels"),
- self.get_input_datatype().get_hls_datatype_str(),
- self.get_nodeattr("PE"),
- self.get_output_datatype().get_hls_datatype_str(),
- )
- ]
-
- def dataoutstrm(self):
- code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
- dtype = self.get_output_datatype()
- elem_bits = dtype.bitwidth()
- packed_bits = self.get_outstream_width()
- packed_hls_type = "ap_uint<%d>" % packed_bits
- elem_hls_type = dtype.get_hls_datatype_str()
- npy_type = "float"
- npy_out = "%s/output.npy" % code_gen_dir
- oshape = self.get_folded_output_shape()
- oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
-
- self.code_gen_dict["$DATAOUTSTREAM$"] = [
- 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");'
- % (
- packed_hls_type,
- elem_hls_type,
- elem_bits,
- npy_type,
- oshape_cpp_str,
- npy_out,
- )
- ]
-
- def save_as_npy(self):
- self.code_gen_dict["$SAVEASCNPY$"] = []
-
- def blackboxfunction(self):
- self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
- """void {}(hls::stream<ap_uint<{}>> &in0,
- hls::stream<ap_uint<{}>> &out)""".format(
- self.onnx_node.name,
- self.get_instream_width(),
- self.get_outstream_width(),
- )
- ]
-
- def pragmas(self):
- self.code_gen_dict["$PRAGMAS$"] = [
- "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname()
- ]
- self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname()
- )
- self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE ap_ctrl_none port=return"
- )
diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py
new file mode 100644
index 0000000000..405c47a08d
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py
@@ -0,0 +1,81 @@
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+from finn.custom_op.fpgadataflow.hls.addstreams_hls import AddStreams_hls
+from finn.custom_op.fpgadataflow.hls.channelwise_op_hls import ChannelwiseOp_hls
+from finn.custom_op.fpgadataflow.hls.checksum_hls import CheckSum_hls
+from finn.custom_op.fpgadataflow.hls.concat_hls import StreamingConcat_hls
+from finn.custom_op.fpgadataflow.hls.convolutioninputgenerator_hls import (
+ ConvolutionInputGenerator_hls,
+)
+from finn.custom_op.fpgadataflow.hls.downsampler_hls import DownSampler_hls
+from finn.custom_op.fpgadataflow.hls.duplicatestreams_hls import DuplicateStreams_hls
+from finn.custom_op.fpgadataflow.hls.fmpadding_hls import FMPadding_hls
+from finn.custom_op.fpgadataflow.hls.fmpadding_pixel_hls import FMPadding_Pixel_hls
+from finn.custom_op.fpgadataflow.hls.globalaccpool_hls import GlobalAccPool_hls
+from finn.custom_op.fpgadataflow.hls.iodma_hls import IODMA_hls
+from finn.custom_op.fpgadataflow.hls.labelselect_hls import LabelSelect_hls
+from finn.custom_op.fpgadataflow.hls.lookup_hls import Lookup_hls
+from finn.custom_op.fpgadataflow.hls.matrixvectoractivation_hls import MVAU_hls
+from finn.custom_op.fpgadataflow.hls.pool_hls import Pool_hls
+from finn.custom_op.fpgadataflow.hls.streamingdatawidthconverter_hls import (
+ StreamingDataWidthConverter_hls,
+)
+from finn.custom_op.fpgadataflow.hls.streamingeltwise_hls import StreamingEltwise_hls
+from finn.custom_op.fpgadataflow.hls.streamingmaxpool_hls import StreamingMaxPool_hls
+from finn.custom_op.fpgadataflow.hls.thresholding_hls import Thresholding_hls
+from finn.custom_op.fpgadataflow.hls.tlastmarker_hls import TLastMarker_hls
+from finn.custom_op.fpgadataflow.hls.upsampler_hls import UpsampleNearestNeighbour_hls
+from finn.custom_op.fpgadataflow.hls.vectorvectoractivation_hls import VVAU_hls
+
+custom_op = dict()
+
+# make sure new HLSCustomOp subclasses are imported here so that they get
+# registered and plug in correctly into the infrastructure
+custom_op["AddStreams_hls"] = AddStreams_hls
+custom_op["ChannelwiseOp_hls"] = ChannelwiseOp_hls
+custom_op["CheckSum_hls"] = CheckSum_hls
+custom_op["ConvolutionInputGenerator_hls"] = ConvolutionInputGenerator_hls
+custom_op["DownSampler_hls"] = DownSampler_hls
+custom_op["DuplicateStreams_hls"] = DuplicateStreams_hls
+custom_op["FMPadding_hls"] = FMPadding_hls
+custom_op["FMPadding_Pixel_hls"] = FMPadding_Pixel_hls
+custom_op["GlobalAccPool_hls"] = GlobalAccPool_hls
+custom_op["IODMA_hls"] = IODMA_hls
+custom_op["LabelSelect_hls"] = LabelSelect_hls
+custom_op["Lookup_hls"] = Lookup_hls
+custom_op["Pool_hls"] = Pool_hls
+custom_op["StreamingConcat_hls"] = StreamingConcat_hls
+custom_op["StreamingEltwise_hls"] = StreamingEltwise_hls
+custom_op["StreamingDataWidthConverter_hls"] = StreamingDataWidthConverter_hls
+custom_op["StreamingMaxPool_hls"] = StreamingMaxPool_hls
+custom_op["Thresholding_hls"] = Thresholding_hls
+custom_op["TLastMarker_hls"] = TLastMarker_hls
+custom_op["UpsampleNearestNeighbour_hls"] = UpsampleNearestNeighbour_hls
+custom_op["MVAU_hls"] = MVAU_hls
+custom_op["VVAU_hls"] = VVAU_hls
diff --git a/src/finn/custom_op/fpgadataflow/addstreams_batch.py b/src/finn/custom_op/fpgadataflow/hls/addstreams_hls.py
similarity index 53%
rename from src/finn/custom_op/fpgadataflow/addstreams_batch.py
rename to src/finn/custom_op/fpgadataflow/hls/addstreams_hls.py
index cd0af6b3ab..a3f0e043f8 100644
--- a/src/finn/custom_op/fpgadataflow/addstreams_batch.py
+++ b/src/finn/custom_op/fpgadataflow/hls/addstreams_hls.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, Xilinx
+# Copyright (C) 2023, Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -28,81 +28,24 @@
import numpy as np
import os
-import warnings
-from qonnx.core.datatype import DataType
-from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.custom_op.fpgadataflow.addstreams import AddStreams
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
-class AddStreams_Batch(HLSCustomOp):
+class AddStreams_hls(AddStreams, HLSBackend):
"""Class that corresponds to finn-hlslib AddStreams_Batch function."""
- def __init__(self, onnx_node):
- super().__init__(onnx_node)
+ def __init__(self, onnx_node, **kwargs):
+ super().__init__(onnx_node, **kwargs)
def get_nodeattr_types(self):
- my_attrs = super().get_nodeattr_types()
- my_attrs.update(
- {
- "NumChannels": ("i", True, ""),
- "PE": ("i", True, ""),
- # FINN DataTypes for inputs; output datatype inferred from input
- "inputDataType": ("s", True, ""),
- # number of input vectors, examples:
- # [1] is a single vector (like a FC layer with batch=1)
- # [4] is four vectors (like a FC layer with batch=4)
- # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
- "numInputVectors": ("ints", False, [1]),
- "inFIFODepths": ("ints", False, [2, 2]),
- }
- )
+ my_attrs = {}
+ my_attrs.update(AddStreams.get_nodeattr_types(self))
+ my_attrs.update(HLSBackend.get_nodeattr_types(self))
return my_attrs
- def get_normal_input_shape(self, ind=0):
- ich = self.get_nodeattr("NumChannels")
- vecs = list(self.get_nodeattr("numInputVectors"))
- ishape = tuple(vecs + [ich])
- return ishape
-
- def get_folded_input_shape(self, ind=0):
- ich = self.get_nodeattr("NumChannels")
- pe = self.get_nodeattr("PE")
- assert ich % pe == 0, "PE must divide NumChannels"
- vecs = list(self.get_nodeattr("numInputVectors"))
- ishape = tuple(vecs + [ich // pe, pe])
- return ishape
-
- def get_normal_output_shape(self, ind=0):
- return self.get_normal_input_shape()
-
- def get_folded_output_shape(self, ind=0):
- return self.get_folded_input_shape()
-
- def make_shape_compatible_op(self, model):
- exp_ishape = self.get_normal_input_shape()
- oshape = self.get_normal_output_shape()
- ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
- assert ishape == exp_ishape, "Unexpected input1 shape."
- ishape = tuple(model.get_tensor_shape(self.onnx_node.input[1]))
- assert ishape == exp_ishape, "Unexpected input2 shape."
- return super().make_const_shape_op(oshape)
-
- def infer_node_datatype(self, model):
- node = self.onnx_node
- idt = model.get_tensor_datatype(node.input[0])
- if idt != self.get_input_datatype():
- warn_str = "inputDataType changing for %s: %s -> %s " % (
- node.name,
- str(self.get_input_datatype()),
- str(idt),
- )
- warnings.warn(warn_str)
- self.set_nodeattr("inputDataType", idt.name)
- # enforce output data type (calculated based on idt)
- odt = self.get_output_datatype()
- model.set_tensor_datatype(self.onnx_node.output[0], odt)
-
def verify_node(self):
info_messages = []
# verify that "backend" is set to "fpgadataflow"
@@ -121,48 +64,10 @@ def verify_node(self):
self.get_nodeattr("inputDataType")
info_messages.append("All necessary attributes exist")
except Exception:
- info_messages.append(
- """The required LabelSelect_Batch attributes do not exist."""
- )
+ info_messages.append("""The required LabelSelect_Batch attributes do not exist.""")
return info_messages
- def get_input_datatype(self, ind=0):
- """Returns FINN DataType of input."""
- return DataType[self.get_nodeattr("inputDataType")]
-
- def get_output_datatype(self, ind=0):
- """Returns FINN DataType of output."""
- # we need to set output datatype to the next larger int or uint
- # enhancement: consider specifying w/ explicit outputDataType attribute
- # to allow overflow and use the same idt if user wants
- idt = DataType[self.get_nodeattr("inputDataType")]
- if idt.signed():
- return DataType.get_smallest_possible(2 * idt.min())
- else:
- return DataType.get_smallest_possible(2 * idt.max())
-
- def get_instream_width(self, ind=0):
- """Returns input stream width."""
- ibits = self.get_input_datatype().bitwidth()
- pe = self.get_nodeattr("PE")
- in_width = pe * ibits
- return in_width
-
- def get_outstream_width(self, ind=0):
- """Returns output stream width."""
- obits = self.get_output_datatype().bitwidth()
- pe = self.get_nodeattr("PE")
- out_width = pe * obits
- return out_width
-
- def get_number_output_values(self):
- return np.prod(self.get_folded_output_shape()[:-1])
-
- def get_exp_cycles(self):
- # Channels/PE * batch size * fmdim * fmdim
- return np.prod(self.get_folded_output_shape()[:-1])
-
def execute_node(self, context, graph):
mode = self.get_nodeattr("exec_mode")
node = self.onnx_node
@@ -184,9 +89,7 @@ def execute_node(self, context, graph):
inp = context[node.input[0]]
assert str(inp.dtype) == "float32", "Input datatype is not float32"
- assert (
- inp.shape == exp_ishape
- ), """Input0 shape doesn't match expected shape ."""
+ assert inp.shape == exp_ishape, """Input0 shape doesn't match expected shape ."""
export_idt = self.get_input_datatype()
# reshape input into folded form
inp = inp.reshape(folded_ishape)
@@ -197,9 +100,7 @@ def execute_node(self, context, graph):
# exact same thing for input1
inp = context[node.input[1]]
assert str(inp.dtype) == "float32", "Input datatype is not float32"
- assert (
- inp.shape == exp_ishape
- ), """Input1 shape doesn't match expected shape ."""
+ assert inp.shape == exp_ishape, """Input1 shape doesn't match expected shape ."""
export_idt = self.get_input_datatype()
# reshape input into folded form
inp = inp.reshape(folded_ishape)
@@ -268,106 +169,85 @@ def read_npy_data(self):
self.code_gen_dict["$READNPYDATA$"] = []
npy_in = "%s/input_0.npy" % code_gen_dir
self.code_gen_dict["$READNPYDATA$"].append(
- 'npy2apintstream<%s, %s, %d, %s>("%s", in0);'
- % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+ 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);'
+ % (
+ packed_hls_type,
+ elem_hls_type,
+ elem_bits,
+ npy_type,
+ npy_in,
+ self.hls_sname(),
+ )
)
npy_in = "%s/input_1.npy" % code_gen_dir
self.code_gen_dict["$READNPYDATA$"].append(
- 'npy2apintstream<%s, %s, %d, %s>("%s", in1);'
- % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+ 'npy2apintstream<%s, %s, %d, %s>("%s", in1_%s);'
+ % (
+ packed_hls_type,
+ elem_hls_type,
+ elem_bits,
+ npy_type,
+ npy_in,
+ self.hls_sname(),
+ )
)
def strm_decl(self):
self.code_gen_dict["$STREAMDECLARATIONS$"] = []
self.code_gen_dict["$STREAMDECLARATIONS$"].append(
- 'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
+ 'hls::stream<ap_uint<{}>> in0_{} ("in0_{}");'.format(
+ self.get_instream_width(), self.hls_sname(), self.hls_sname()
+ )
)
self.code_gen_dict["$STREAMDECLARATIONS$"].append(
- 'hls::stream<ap_uint<{}>> in1 ("in1");'.format(self.get_instream_width())
+ 'hls::stream<ap_uint<{}>> in1_{} ("in1_{}");'.format(
+ self.get_instream_width(), self.hls_sname(), self.hls_sname()
+ )
)
self.code_gen_dict["$STREAMDECLARATIONS$"].append(
- 'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
+ 'hls::stream<ap_uint<{}>> out_{} ("out_{}");'.format(
+ self.get_outstream_width(), self.hls_sname(), self.hls_sname()
+ )
)
def docompute(self):
- node = self.onnx_node
+ hls_call = "AddStreams_Batch"
self.code_gen_dict["$DOCOMPUTE$"] = [
- """{}<{}, {}, {}, {}, {}> (in0, in1, out, 1);""".format(
- node.op_type,
+ """{}<{}, {}, {}, {}, {}> (in0_{}, in1_{}, out_{}, 1);""".format(
+ hls_call,
self.get_nodeattr("PE"),
self.get_input_datatype().get_hls_datatype_str(),
self.get_input_datatype().get_hls_datatype_str(),
self.get_output_datatype().get_hls_datatype_str(),
self.get_number_output_values(),
+ self.hls_sname(),
+ self.hls_sname(),
+ self.hls_sname(),
)
]
- def dataoutstrm(self):
- code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
- dtype = self.get_output_datatype()
- elem_bits = dtype.bitwidth()
- packed_bits = self.get_outstream_width()
- packed_hls_type = "ap_uint<%d>" % packed_bits
- elem_hls_type = dtype.get_hls_datatype_str()
- npy_type = "float"
- npy_out = "%s/output.npy" % code_gen_dir
- oshape = self.get_folded_output_shape()
- oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
-
- self.code_gen_dict["$DATAOUTSTREAM$"] = [
- 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");'
- % (
- packed_hls_type,
- elem_hls_type,
- elem_bits,
- npy_type,
- oshape_cpp_str,
- npy_out,
- )
- ]
-
- def save_as_npy(self):
- self.code_gen_dict["$SAVEASCNPY$"] = []
-
def blackboxfunction(self):
self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
- """void {}(hls::stream<ap_uint<{}>> &in0, hls::stream<ap_uint<{}>> &in1,
- hls::stream<ap_uint<{}>> &out)""".format(
+ """void {}(hls::stream<ap_uint<{}>> &in0_{}, hls::stream<ap_uint<{}>> &in1_{},
+ hls::stream<ap_uint<{}>> &out_{})""".format(
self.onnx_node.name,
self.get_nodeattr("PE") * self.get_input_datatype().bitwidth(),
+ self.hls_sname(),
self.get_nodeattr("PE") * self.get_input_datatype().bitwidth(),
+ self.hls_sname(),
self.get_nodeattr("PE") * self.get_output_datatype().bitwidth(),
+ self.hls_sname(),
)
]
def pragmas(self):
self.code_gen_dict["$PRAGMAS$"] = [
- "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname()
+ "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname()
]
self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE axis port=in1 name=in1_" + self.hls_sname()
- )
- self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname()
+ "#pragma HLS INTERFACE axis port=in1_" + self.hls_sname()
)
self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE ap_ctrl_none port=return"
+ "#pragma HLS INTERFACE axis port=out_" + self.hls_sname()
)
-
- def get_verilog_top_module_intf_names(self):
- intf_names = super().get_verilog_top_module_intf_names()
- sname = self.hls_sname()
- swidth = self.get_instream_width_padded()
- intf_names["s_axis"] = [(x + "_" + sname, swidth) for x in ["in0", "in1"]]
- return intf_names
-
- def derive_characteristic_fxns(self, period):
- n_inps = np.prod(self.get_folded_input_shape()[:-1])
- io_dict = {
- "inputs": {
- "in0": [0 for i in range(n_inps)],
- "in1": [0 for i in range(n_inps)],
- },
- "outputs": {"out": []},
- }
- super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
+ self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return")
diff --git a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py b/src/finn/custom_op/fpgadataflow/hls/channelwise_op_hls.py
similarity index 66%
rename from src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
rename to src/finn/custom_op/fpgadataflow/hls/channelwise_op_hls.py
index 46adca680d..14efa113dd 100644
--- a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py
+++ b/src/finn/custom_op/fpgadataflow/hls/channelwise_op_hls.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, Xilinx
+# Copyright (C) 2023, Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -28,19 +28,17 @@
import numpy as np
import os
-import warnings
from math import ceil
from qonnx.core.datatype import DataType
-from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.custom_op.fpgadataflow.channelwise_op import ChannelwiseOp
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
from finn.util.data_packing import (
npy_to_rtlsim_input,
numpy_to_hls_code,
rtlsim_output_to_npy,
)
-from . import templates
-
# ONNX i/o tensor shape assumptions for channelwise ops:
# input 0 is the input tensor, shape (..., NumChannels)
# input 1 is the channelwise parameter tensor, shape (NumChannels, params_per_channel)
@@ -48,118 +46,21 @@
# the ... here can be any shape (representing groups of vectors)
-def get_smallest_possible(vals):
- """Returns smallest (fewest bits) possible DataType that can represent
- value. Prefers unsigned integers where possible."""
- vals = np.array(vals, dtype=np.float64)
- for v in vals:
- assert int(v) == v, "Error float value"
-
- for k in DataType.get_accumulator_dt_cands():
- dt = DataType[k]
-
- if dt in [DataType["BIPOLAR"], DataType["TERNARY"], DataType["FLOAT32"]]:
- # not currently supported
- continue
-
- if (dt.min() <= vals).all() and (vals <= dt.max()).all():
- return dt
-
- warnings.warn(
- """InferChannelwiseLinearLayer: Output values may not be
- representable with supported data types.
- Setting maximum width data type available.
- This will lead to errors if there are no constrains on the input
- """
- )
-
- if (0 <= vals).all():
- return DataType["UINT64"]
- else:
- return DataType["INT64"]
-
-
-class ChannelwiseOp_Batch(HLSCustomOp):
+class ChannelwiseOp_hls(ChannelwiseOp, HLSBackend):
"""Class that corresponds to finn-hls Thresholding_Batch function.
It can implement a variety of channel-wise parametrized operations,
including Add, Mul and multi-thresholding.
"""
- def __init__(self, onnx_node):
- super().__init__(onnx_node)
- self.decoupled_wrapper = templates.decoupled_wrapper
+ def __init__(self, onnx_node, **kwargs):
+ super().__init__(onnx_node, **kwargs)
def get_nodeattr_types(self):
- my_attrs = {
- # channelwise "map" function to apply:
- # one of cmp_le, cmp_ge, add, mul
- "Func": ("s", False, "cmp_le", {"cmp_le", "cmp_ge", "add", "mul"}),
- "PE": ("i", True, 0),
- "NumChannels": ("i", True, 0),
- # string defining memory resource type for parameters
- "ram_style": ("s", False, "distributed", {"distributed", "block"}),
- # FINN DataTypes for inputs, weights, outputs
- "inputDataType": ("s", True, ""),
- "paramDataType": ("s", True, ""),
- "outputDataType": ("s", True, ""),
- # number of input vectors, examples:
- # [1] is a single vector (like a FC layer with batch=1)
- # [4] is four vectors (like a FC layer with batch=4)
- # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
- "numInputVectors": ("ints", False, [1]),
- }
- my_attrs.update(super().get_nodeattr_types())
+ my_attrs = {}
+ my_attrs.update(ChannelwiseOp.get_nodeattr_types(self))
+ my_attrs.update(HLSBackend.get_nodeattr_types(self))
return my_attrs
- def calc_tmem(self):
- """Calculates and returns TMEM, the depth of the memory used
- to store the channelwise op parameters."""
- chn = self.get_nodeattr("NumChannels")
- pe = self.get_nodeattr("PE")
- return chn // pe
-
- def make_shape_compatible_op(self, model):
- oshape = self.get_normal_output_shape()
- # implement tensor with correct shape
- return super().make_const_shape_op(oshape)
-
- def infer_node_datatype(self, model):
- node = self.onnx_node
- # check input datatype against property
- idt = model.get_tensor_datatype(node.input[0])
-
- exp_idt_name = self.get_nodeattr("inputDataType")
- if exp_idt_name != idt.name:
- func = self.get_nodeattr("Func")
- assert func in ["add", "mul"], "Bad input DataType for ChannelwiseOp layer"
-
- self.set_nodeattr("inputDataType", idt.name)
- # update the func in ['add','mul'] cases
-
- # get parameter ranges
- param = model.get_initializer(node.input[1])
- param_min = min(param.flatten())
- param_max = max(param.flatten())
-
- # set function and determine output data type
- if func == "add":
- out_min = idt.min() + param_min
- out_max = idt.max() + param_max
- odt = get_smallest_possible([out_min, out_max])
- elif func == "mul":
- possible_limits = []
- possible_limits += [idt.min() * param_min]
- possible_limits += [idt.min() * param_max]
- possible_limits += [idt.max() * param_min]
- possible_limits += [idt.max() * param_max]
- odt = get_smallest_possible(possible_limits)
-
- self.set_nodeattr("outputDataType", odt.name)
-
- # set output datatype from property
- odt = self.get_output_datatype()
- model.set_tensor_datatype(node.output[0], odt)
-
def verify_node(self):
info_messages = []
# verify that "backend" is set to "fpgadataflow"
@@ -181,9 +82,7 @@ def verify_node(self):
self.get_nodeattr("outputDataType")
info_messages.append("All necessary attributes exist")
except Exception:
- info_messages.append(
- """The required Threshold_Batch attributes do not exist."""
- )
+ info_messages.append("""The required Threshold_Batch attributes do not exist.""")
return info_messages
@@ -218,52 +117,6 @@ def lut_estimation(self):
# total cost
return comparator_cost + lutram_cost
- def get_input_datatype(self, ind=0):
- """Returns FINN DataType of input."""
- return DataType[self.get_nodeattr("inputDataType")]
-
- def get_output_datatype(self, ind=0):
- """Returns FINN DataType of output."""
- return DataType[self.get_nodeattr("outputDataType")]
-
- def get_instream_width(self, ind=0):
- i_bits = self.get_input_datatype().bitwidth()
- return i_bits * self.get_nodeattr("PE")
-
- def get_outstream_width(self, ind=0):
- o_bits = self.get_output_datatype().bitwidth()
- return o_bits * self.get_nodeattr("PE")
-
- def get_folded_input_shape(self, ind=0):
- ich = self.get_nodeattr("NumChannels")
- pe = self.get_nodeattr("PE")
- fold = ich // pe
- vecs = list(self.get_nodeattr("numInputVectors"))
- folded_input_shape = tuple(vecs + [fold, pe])
- return folded_input_shape
-
- def get_folded_output_shape(self, ind=0):
- # same shape as input
- return self.get_folded_input_shape()
-
- def get_normal_input_shape(self, ind=0):
- ich = self.get_nodeattr("NumChannels")
- vecs = list(self.get_nodeattr("numInputVectors"))
- normal_input_shape = tuple(vecs + [ich])
- return normal_input_shape
-
- def get_normal_output_shape(self, ind=0):
- # same shape as input
- return self.get_normal_input_shape()
-
- def get_number_output_values(self):
- nf = np.prod(self.get_folded_output_shape()[:-1])
- return nf
-
- def get_exp_cycles(self):
- # Channels/PE * batch size * fmdim * fmdim
- return np.prod(self.get_folded_output_shape()[:-1])
-
def get_template_param_values(self):
"""Returns the template parameter values according to input, output and weight
data types."""
@@ -303,9 +156,7 @@ def get_hls_compatible_parameter_tensor(self, orig_param_vector):
assert (orig_param_vector.astype(np.int32) == orig_param_vector).all()
ret = orig_param_vector
- assert (
- ret.shape[0] == chn
- ), "Cardinality of parameter vector is not as expected (chn)"
+ assert ret.shape[0] == chn, "Cardinality of parameter vector is not as expected (chn)"
# distribute rows between PEs
ret = ret.reshape(tmem, pe).transpose()
@@ -327,9 +178,7 @@ def generate_params(self, model, path):
parameter_tensor = self.get_hls_compatible_parameter_tensor(parameters)
pdt = DataType[self.get_nodeattr("paramDataType")]
- parameters_hls_code = numpy_to_hls_code(
- parameter_tensor, pdt, "parameters", False, True
- )
+ parameters_hls_code = numpy_to_hls_code(parameter_tensor, pdt, "parameters", False, True)
# get input data type
export_idt = self.get_input_datatype()
if self.get_input_datatype() == DataType["BIPOLAR"]:
@@ -433,9 +282,7 @@ def execute_node(self, context, graph):
elif mode == "rtlsim":
sim = self.get_rtlsim()
nbits = self.get_instream_width()
- inp = npy_to_rtlsim_input(
- "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
- )
+ inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits)
super().reset_rtlsim(sim)
super().toggle_clk(sim)
output = self.rtlsim(sim, inp)
@@ -444,9 +291,7 @@ def execute_node(self, context, graph):
packed_bits = self.get_outstream_width()
out_npy_path = "{}/output.npy".format(code_gen_dir)
out_shape = self.get_folded_output_shape()
- rtlsim_output_to_npy(
- output, out_npy_path, odt, out_shape, packed_bits, target_bits
- )
+ rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits)
# load and reshape output
output = np.load(out_npy_path)
@@ -465,7 +310,6 @@ def global_includes(self):
self.code_gen_dict["$GLOBALS$"] = ['#include "activations.hpp"']
self.code_gen_dict["$GLOBALS$"] += ['#include "params.h"']
- # TODO check and add whatever missing
def defines(self, var):
numInputVectors = list(self.get_nodeattr("numInputVectors"))
numReps = numInputVectors[0]
@@ -489,17 +333,15 @@ def read_npy_data(self):
self.code_gen_dict["$READNPYDATA$"] = []
# note: the innermost dim is reversed for the input
self.code_gen_dict["$READNPYDATA$"].append(
- 'npy2apintstream<%s, %s, %d, %s>("%s", in0, false);'
- % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
- )
-
- def strm_decl(self):
- self.code_gen_dict["$STREAMDECLARATIONS$"] = []
- self.code_gen_dict["$STREAMDECLARATIONS$"].append(
- 'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
- )
- self.code_gen_dict["$STREAMDECLARATIONS$"].append(
- 'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
+ 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);'
+ % (
+ packed_hls_type,
+ elem_hls_type,
+ elem_bits,
+ npy_type,
+ npy_in,
+ self.hls_sname(),
+ )
)
def docompute(self):
@@ -515,10 +357,12 @@ def docompute(self):
raise Exception("""Unexpeted input shape""")
self.code_gen_dict["$DOCOMPUTE$"] = [
"""Thresholding_Batch<{}, NumChannels1, PE1, {}, {}>
- (in0, out, threshs, numReps);""".format(
+ (in0_{}, out_{}, threshs, numReps);""".format(
spatial_dim,
tmpl_args["TSrcI"],
tmpl_args["TDstI"],
+ self.hls_sname(),
+ self.hls_sname(),
)
]
@@ -539,58 +383,46 @@ def dataoutstrm(self):
# note: the innermost dim is not reversed for the output
self.code_gen_dict["$DATAOUTSTREAM$"] = [
- 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s", false);'
+ 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);'
% (
packed_hls_type,
elem_hls_type,
elem_bits,
npy_type,
+ self.hls_sname(),
shape_cpp_str,
npy_out,
)
]
- def save_as_npy(self):
- self.code_gen_dict["$SAVEASCNPY$"] = []
-
def blackboxfunction(self):
self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
- """void {}(hls::stream> &in0,
- hls::stream> &out
+ """void {}(hls::stream> &in0_{},
+ hls::stream> &out_{}
)""".format(
self.onnx_node.name,
self.get_instream_width(),
+ self.hls_sname(),
self.get_outstream_width(),
+ self.hls_sname(),
)
]
def pragmas(self):
self.code_gen_dict["$PRAGMAS$"] = [
- "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname()
+ "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname()
]
self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname()
- )
- self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE ap_ctrl_none port=return"
+ "#pragma HLS INTERFACE axis port=out_" + self.hls_sname()
)
+ self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return")
# the channelwise parameter tensor is acc_type [PE][TMEM][N_PARAMS_PER_CHANNEL]
# partition for parallel access along PE and N_PARAMS_PER_CHANNEL
# dimensions (dims 1 and 3)
self.code_gen_dict["$PRAGMAS$"].append(
- (
- "#pragma HLS ARRAY_PARTITION variable=threshs.parameters "
- "complete dim=1"
- )
+ ("#pragma HLS ARRAY_PARTITION variable=threshs.parameters " "complete dim=1")
)
- # self.code_gen_dict["$PRAGMAS$"].append(
- # (
- # "#pragma HLS ARRAY_PARTITION variable=threshs.parameters "
- # "complete dim=3"
- # )
- # )
-
# set resource type
ram_style = self.get_nodeattr("ram_style")
pe = self.get_nodeattr("PE")
@@ -600,17 +432,11 @@ def pragmas(self):
if pe < ich:
if ram_style == "distributed":
self.code_gen_dict["$PRAGMAS$"].append(
- (
- "#pragma HLS RESOURCE variable=threshs.parameters "
- "core=ROM_2P_LUTRAM"
- )
+ ("#pragma HLS RESOURCE variable=threshs.parameters " "core=ROM_2P_LUTRAM")
)
elif ram_style == "block":
self.code_gen_dict["$PRAGMAS$"].append(
- (
- "#pragma HLS RESOURCE variable=threshs.parameters "
- "core=ROM_2P_BRAM"
- )
+ ("#pragma HLS RESOURCE variable=threshs.parameters " "core=ROM_2P_BRAM")
)
else:
raise Exception(
diff --git a/src/finn/custom_op/fpgadataflow/checksum.py b/src/finn/custom_op/fpgadataflow/hls/checksum_hls.py
similarity index 86%
rename from src/finn/custom_op/fpgadataflow/checksum.py
rename to src/finn/custom_op/fpgadataflow/hls/checksum_hls.py
index c927c07df2..8a72ca3c6c 100644
--- a/src/finn/custom_op/fpgadataflow/checksum.py
+++ b/src/finn/custom_op/fpgadataflow/hls/checksum_hls.py
@@ -1,4 +1,5 @@
# Copyright (c) 2022, Xilinx, Inc.
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -31,15 +32,16 @@
import warnings
from qonnx.core.datatype import DataType
-from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
-class CheckSum(HLSCustomOp):
+class CheckSum_hls(HWCustomOp, HLSBackend):
"""Class that corresponds to custom_hls checksum function."""
- def __init__(self, onnx_node):
- super().__init__(onnx_node)
+ def __init__(self, onnx_node, **kwargs):
+ super().__init__(onnx_node, **kwargs)
def get_nodeattr_types(self):
my_attrs = {
@@ -52,7 +54,8 @@ def get_nodeattr_types(self):
# folded shape of input/output
"folded_shape": ("ints", True, []),
}
- my_attrs.update(super().get_nodeattr_types())
+ my_attrs.update(HWCustomOp.get_nodeattr_types(self))
+ my_attrs.update(HLSBackend.get_nodeattr_types(self))
return my_attrs
def make_shape_compatible_op(self, model):
@@ -183,9 +186,7 @@ def execute_node(self, context, graph):
np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
sim = self.get_rtlsim()
nbits = self.get_instream_width()
- inp = npy_to_rtlsim_input(
- "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
- )
+ inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits)
super().reset_rtlsim(sim)
super().toggle_clk(sim)
io_dict = {
@@ -199,9 +200,7 @@ def execute_node(self, context, graph):
packed_bits = self.get_outstream_width()
out_npy_path = "{}/output.npy".format(code_gen_dir)
out_shape = self.get_folded_output_shape()
- rtlsim_output_to_npy(
- output, out_npy_path, odt, out_shape, packed_bits, target_bits
- )
+ rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits)
# load and reshape output
output = np.load(out_npy_path)
@@ -241,17 +240,28 @@ def read_npy_data(self):
self.code_gen_dict["$READNPYDATA$"] = []
# note: the innermost dim is reversed for the input
self.code_gen_dict["$READNPYDATA$"].append(
- 'npy2apintstream<%s, %s, %d, %s>("%s", in0, false);'
- % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+ 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);'
+ % (
+ packed_hls_type,
+ elem_hls_type,
+ elem_bits,
+ npy_type,
+ npy_in,
+ self.hls_sname(),
+ )
)
def strm_decl(self):
self.code_gen_dict["$STREAMDECLARATIONS$"] = []
self.code_gen_dict["$STREAMDECLARATIONS$"].append(
- 'hls::stream> in0 ("in0");'.format(self.get_instream_width())
+ 'hls::stream> in0_{} ("in0_{}");'.format(
+ self.get_instream_width(), self.hls_sname(), self.hls_sname()
+ )
)
self.code_gen_dict["$STREAMDECLARATIONS$"].append(
- 'hls::stream> out ("out");'.format(self.get_outstream_width())
+ 'hls::stream> out_{} ("out_{}");'.format(
+ self.get_outstream_width(), self.hls_sname(), self.hls_sname()
+ )
)
self.code_gen_dict["$STREAMDECLARATIONS$"].append("ap_uint<32> chk;")
# set drain = false for cppsim
@@ -259,7 +269,8 @@ def strm_decl(self):
def docompute(self):
self.code_gen_dict["$DOCOMPUTE$"] = [
- """checksum(in0, out, chk, drain);"""
+ """checksum(in0_%s, out_%s, chk, drain);"""
+ % (self.hls_sname(), self.hls_sname())
]
def dataoutstrm(self):
@@ -279,38 +290,35 @@ def dataoutstrm(self):
# note: the innermost dim is not reversed for the output
self.code_gen_dict["$DATAOUTSTREAM$"] = [
- 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s", false);'
+ 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);'
% (
packed_hls_type,
elem_hls_type,
elem_bits,
npy_type,
+ self.hls_sname(),
shape_cpp_str,
npy_out,
),
"std::vector checksum(1);",
"checksum[0] = chk;",
- 'cnpy::npy_save("%s/output_checksum.npy",&checksum[0],{1},"w");'
- % code_gen_dir,
+ 'cnpy::npy_save("%s/output_checksum.npy",&checksum[0],{1},"w");' % code_gen_dir,
]
- def save_as_npy(self):
- self.code_gen_dict["$SAVEASCNPY$"] = []
-
def blackboxfunction(self):
self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
- """using T = ap_uint;\n void {}(hls::stream &in0,
- hls::stream &out, ap_uint<32> &chk, ap_uint<1> &drain)""".format(
- self.onnx_node.name
+ """using T = ap_uint;\n void {}(hls::stream &in0_{},
+ hls::stream &out_{}, ap_uint<32> &chk, ap_uint<1> &drain)""".format(
+ self.onnx_node.name, self.hls_sname(), self.hls_sname()
)
]
def pragmas(self):
self.code_gen_dict["$PRAGMAS$"] = [
- "#pragma HLS interface axis port=in0 name=in0_" + self.hls_sname()
+ "#pragma HLS interface axis port=in0_" + self.hls_sname()
]
self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS interface axis port=out name=out_" + self.hls_sname()
+ "#pragma HLS interface axis port=out_" + self.hls_sname()
)
self.code_gen_dict["$PRAGMAS$"].append(
"#pragma HLS interface s_axilite port=chk bundle=checksum"
@@ -318,13 +326,9 @@ def pragmas(self):
self.code_gen_dict["$PRAGMAS$"].append(
"#pragma HLS interface s_axilite port=drain bundle=checksum"
)
- self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS interface ap_ctrl_none port=return"
- )
+ self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS interface ap_ctrl_none port=return")
self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS dataflow")
- self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS dataflow disable_start_propagation"
- )
+ self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS dataflow disable_start_propagation")
def get_verilog_top_module_intf_names(self):
intf_names = super().get_verilog_top_module_intf_names()
diff --git a/src/finn/custom_op/fpgadataflow/hls/concat_hls.py b/src/finn/custom_op/fpgadataflow/hls/concat_hls.py
new file mode 100644
index 0000000000..008fa9cee8
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/hls/concat_hls.py
@@ -0,0 +1,267 @@
+# Copyright (c) 2021, Xilinx
+# Copyright (C) 2023, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import os
+
+from finn.custom_op.fpgadataflow.concat import StreamingConcat
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+
+class StreamingConcat_hls(StreamingConcat, HLSBackend):
+ """Streaming concatenation node with dynamically generated HLS.
+ Only supports concatenating along the last axis."""
+
+ def __init__(self, onnx_node, **kwargs):
+ super().__init__(onnx_node, **kwargs)  # adds no state; construction is delegated via super()
+
+    def get_nodeattr_types(self):
+        """Return StreamingConcat attributes overlaid with HLSBackend ones (later wins)."""
+        merged = dict(StreamingConcat.get_nodeattr_types(self))
+        merged.update(HLSBackend.get_nodeattr_types(self))
+        return merged
+
+    def generate_params(self, model, path):
+        """Write concat_impl.hpp, containing the StreamingConcat HLS function, into path.
+
+        Declares one input stream per ElemsPerStream entry (entry * input bitwidth
+        bits wide) and an output stream of get_total_elems() * input bitwidth bits."""
+        elems_per_stream = self.get_nodeattr("ElemsPerStream")
+        inp_streams = []
+        commands = []
+        idt = self.get_input_datatype()
+        total_bw = idt.bitwidth() * self.get_total_elems()
+        for i, elems in enumerate(elems_per_stream):
+            bw = idt.bitwidth() * elems
+            # one %d per argument: the stream word width first, then the stream index
+            inp_streams.append("hls::stream<ap_uint<%d> > &in%d" % (bw, i))
+            commands.append("in%d.read()" % i)
+        inp_streams.append("hls::stream<ap_uint<%d> > &out" % (total_bw))
+
+        impl_hls_code = []
+        impl_hls_code.append("void StreamingConcat(")
+        impl_hls_code.append(",".join(inp_streams))
+        impl_hls_code.append(", unsigned int numReps) {")
+        impl_hls_code.append("for(unsigned int i = 0; i < numReps; i++) {")
+        impl_hls_code.append("#pragma HLS PIPELINE II=1")
+        impl_hls_code.append("ap_uint<%d> out_elem;" % total_bw)
+        # FIXME: the order of streams for concatenation works out differently
+        # for cppsim vs rtlsim, addressed via reversing the order of commands
+        # for now
+        impl_hls_code.append("#ifdef __SYNTHESIS__")
+        impl_hls_code.append("out_elem = (" + ",".join(commands[::-1]) + ");")
+        impl_hls_code.append("#else")
+        impl_hls_code.append("out_elem = (" + ",".join(commands) + ");")
+        impl_hls_code.append("#endif")
+        impl_hls_code.append("out.write(out_elem);")
+        impl_hls_code.append("}")
+        impl_hls_code.append("}")
+        impl_hls_code = "\n".join(impl_hls_code)
+
+        # the context manager closes the file even when the write raises
+        with open("{}/concat_impl.hpp".format(path), "w") as f_impl:
+            f_impl.write(impl_hls_code)
+
+    def execute_node(self, context, graph):
+        """Run the node in the simulator selected by exec_mode and update context.
+
+        Each input is checked, folded and saved as input_<i>.npy in the code
+        generation directory of that mode; the simulator output is stored in
+        context under the first output name, reshaped to the normal output shape.
+        Raises Exception if exec_mode is neither "cppsim" nor "rtlsim"."""
+        mode = self.get_nodeattr("exec_mode")
+        node = self.onnx_node
+        n_inps = len(self.onnx_node.input)
+        ishapes = [self.get_normal_input_shape(x) for x in range(n_inps)]
+        folded_ishapes = [self.get_folded_input_shape(x) for x in range(n_inps)]
+        exp_oshape = self.get_normal_output_shape()
+        folded_oshape = self.get_folded_output_shape()
+        export_idt = self.get_input_datatype()
+
+        if mode == "cppsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        elif mode == "rtlsim":
+            code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        else:
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+ has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+
+        for i in range(n_inps):
+            inp = context[node.input[i]]
+            assert str(inp.dtype) == "float32", "Input datatype is not float32"
+            assert inp.shape == ishapes[i], "Input shape mismatch for " + node.input[i]
+            # reshape input into folded form
+            inp = inp.reshape(folded_ishapes[i])
+            # make copy before saving array
+            reshaped_input = inp.copy()
+            np.save(os.path.join(code_gen_dir, "input_%d.npy" % i), reshaped_input)
+
+        # exec_mode was validated above, so only these two branches can occur
+        if mode == "cppsim":
+            # execute the precompiled model
+            super().exec_precompiled_singlenode_model()
+            # load output npy file
+            super().npy_to_dynamic_output(context)
+            assert (
+                context[node.output[0]].shape == folded_oshape
+            ), "cppsim did not produce expected folded output shape"
+            context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape)
+        elif mode == "rtlsim":
+            sim = self.get_rtlsim()
+            io_dict = {"inputs": {}, "outputs": {"out": []}}
+            for i in range(n_inps):
+                nbits = self.get_instream_width(i)
+                rtlsim_inp = npy_to_rtlsim_input(
+                    "%s/input_%d.npy" % (code_gen_dir, i),
+                    export_idt,
+                    nbits,
+                    reverse_inner=True,
+                )
+                io_dict["inputs"]["in%d" % i] = rtlsim_inp
+            super().reset_rtlsim(sim)
+            super().toggle_clk(sim)
+
+            self.rtlsim_multi_io(sim, io_dict)
+            rtlsim_output = io_dict["outputs"]["out"]
+            odt = self.get_output_datatype()
+            target_bits = odt.bitwidth()
+            packed_bits = self.get_outstream_width()
+            out_npy_path = "{}/output.npy".format(code_gen_dir)
+            out_shape = self.get_folded_output_shape()
+            rtlsim_output_to_npy(
+                rtlsim_output,
+                out_npy_path,
+                odt,
+                out_shape,
+                packed_bits,
+                target_bits,
+                reverse_inner=True,
+            )
+            # load and reshape output
+            output = np.load(out_npy_path)
+            output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+            context[node.output[0]] = output
+
+        assert (
+            context[node.output[0]].shape == exp_oshape
+        ), """Output shape doesn't match expected shape."""
+
+ def global_includes(self):
+ self.code_gen_dict["$GLOBALS$"] = ['#include "concat_impl.hpp"']  # same file name that generate_params writes
+
+    def defines(self, var):
+        """Set $DEFINES$ to a NumReps define: the product of all numInputVectors entries."""
+        rep_count = np.prod(self.get_nodeattr("numInputVectors"))
+        self.code_gen_dict["$DEFINES$"] = ["#define NumReps %d" % rep_count]
+
+    def read_npy_data(self):
+        """Fill $READNPYDATA$ with one npy2apintstream call per input stream."""
+        n_inputs = self.get_n_inputs()
+        npy_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        read_calls = self.code_gen_dict["$READNPYDATA$"] = []
+        idt = self.get_input_datatype()
+        elem_bits = idt.bitwidth()
+        elem_hls_type = idt.get_hls_datatype_str()
+        # input i is read from input_<i>.npy into the stream named in<i>_<sname>
+        call_fmt = 'npy2apintstream<%s, %s, %d, %s>("%s", in%d_%s);'
+        for i in range(n_inputs):
+            packed_hls_type = "ap_uint<%d>" % self.get_instream_width(i)
+            npy_in = "%s/input_%d.npy" % (npy_dir, i)
+            read_calls.append(
+                call_fmt
+                % (
+                    packed_hls_type,
+                    elem_hls_type,
+                    elem_bits,
+                    "float",
+                    npy_in,
+                    i,
+                    self.hls_sname(),
+                )
+            )
+
+    def strm_decl(self):
+        """Fill $STREAMDECLARATIONS$ with every input stream and the output stream."""
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+        for i in range(self.get_n_inputs()):
+            packed_hls_type = "ap_uint<%d>" % self.get_instream_width(i)
+            stream_name = "in%d_%s" % (i, self.hls_sname())
+            self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+                'hls::stream<%s> %s ("%s");' % (packed_hls_type, stream_name, stream_name)
+            )
+        # one placeholder per argument: stream width first, then the name twice
+        self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+            'hls::stream<ap_uint<{}>> out_{} ("out_{}");'.format(
+                self.get_outstream_width(), self.hls_sname(), self.hls_sname()
+            )
+        )
+
+    def docompute(self):
+        """Set $DOCOMPUTE$ to one StreamingConcat call.
+
+        The arguments are in0_<sname>, in1_<sname>, ... followed by out_<sname>
+        and the NumReps define."""
+        self.code_gen_dict["$DOCOMPUTE$"] = []
+        stream_args = ",".join(
+            "in%d_%s" % (idx, self.hls_sname()) for idx in range(self.get_n_inputs())
+        )
+        self.code_gen_dict["$DOCOMPUTE$"] = [
+            "StreamingConcat(%s, out_%s, NumReps);" % (stream_args, self.hls_sname())
+        ]
+
+    def blackboxfunction(self):
+        """Set $BLACKBOXFUNCTION$ to the signature of the function named after the
+        ONNX node: one ap_uint stream reference per input, then the output stream."""
+        n_inputs = self.get_n_inputs()
+        in_streams = []
+        for i in range(n_inputs):
+            iwidth = self.get_instream_width(i)
+            # width, index and name suffix each need their own placeholder
+            in_streams.append("hls::stream<ap_uint<%d>> &in%d_%s" % (iwidth, i, self.hls_sname()))
+        in_streams = ",".join(in_streams)
+        total_width = self.get_input_datatype().bitwidth() * self.get_total_elems()
+        out_stream = "hls::stream<ap_uint<%d>> &out_%s" % (total_width, self.hls_sname())
+        blackbox_hls = "void %s(%s, %s)" % (self.onnx_node.name, in_streams, out_stream)
+        self.code_gen_dict["$BLACKBOXFUNCTION$"] = [blackbox_hls]
+
+    def pragmas(self):
+        """Set $PRAGMAS$: an axis interface per stream, then ap_ctrl_none on return."""
+        # input ports first, in index order
+        self.code_gen_dict["$PRAGMAS$"] = [
+            "#pragma HLS INTERFACE axis port=in%d_%s" % (idx, self.hls_sname())
+            for idx in range(self.get_n_inputs())
+        ]
+        out_port = "#pragma HLS INTERFACE axis port=out_" + self.hls_sname()
+        self.code_gen_dict["$PRAGMAS$"].append(out_port)
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return")
diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py b/src/finn/custom_op/fpgadataflow/hls/convolutioninputgenerator_hls.py
similarity index 52%
rename from src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py
rename to src/finn/custom_op/fpgadataflow/hls/convolutioninputgenerator_hls.py
index f1c84662cc..4a5c02ee06 100644
--- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py
+++ b/src/finn/custom_op/fpgadataflow/hls/convolutioninputgenerator_hls.py
@@ -1,4 +1,5 @@
# Copyright (c) 2020, Xilinx
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -31,15 +32,13 @@
import os
import warnings
from qonnx.core.datatype import DataType
-from qonnx.custom_op.general.im2col import compute_conv_output_dim
-from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.custom_op.fpgadataflow.convolutioninputgenerator import (
+ ConvolutionInputGenerator,
+)
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
-# This operation should only be used for 1D convolutions. Either the
-# IFMDim_H or IFMDim_W should be '1', which represents the so-called
-# dummy-dimension
-
# ONNX i/o tensor shape assumptions for ConvolutionInputGenerator1D:
# input 0 is the input tensor, shape NHWC = (1, IFMDim_H, IFMDim_W, IFMChannels)
# output 0 is the output tensor, shape NHWC:
@@ -53,185 +52,59 @@
# between the two layouts
-class ConvolutionInputGenerator1D(HLSCustomOp):
+class ConvolutionInputGenerator_hls(ConvolutionInputGenerator, HLSBackend):
"""Class that corresponds to one of the 1D finn-hlslib ConvolutionInputGenerator
(sliding window) function variants. Depending on the combination of
attributes (e.g. depthwise or not, whether dilation is 0) a different
variant will be picked for the actual HLS implementation."""
- def __init__(self, onnx_node):
- super().__init__(onnx_node)
+ def __init__(self, onnx_node, **kwargs):
+ super().__init__(onnx_node, **kwargs)
def get_nodeattr_types(self):
- my_attrs = {
- "ConvKernelDim": ("ints", True, []), # [H, W] = [Y, X]
- "IFMChannels": ("i", True, 0),
- "IFMDim": ("ints", True, []), # [H, W] = [Y, X]
- "OFMDim": ("ints", True, []), # [H, W] = [Y, X]
- "SIMD": ("i", True, 0),
- "Stride": ("ints", True, []), # [H, W] = [Y, X]
- "Dilation": ("ints", True, []), # [H, W] = [Y, X]
- # FINN DataTypes for inputs, weights, outputs
- "inputDataType": ("s", True, ""),
- "outputDataType": ("s", True, ""),
- "depthwise": ("i", False, 0, {0, 1}),
- # FPGA resource type for ConvolutionInputGenerator input buffer
- # auto -- let Vivado HLS decide
- # block -- use BRAM
- # distributed -- use LUTRAM
- # ultra -- use URAM
- "ram_style": (
- "s",
- False,
- "distributed",
- {"auto", "block", "distributed", "ultra"},
- ),
- "parallel_window": ("i", False, 0, {0, 1}),
- }
- my_attrs.update(super().get_nodeattr_types())
+ my_attrs = {}
+ my_attrs.update(ConvolutionInputGenerator.get_nodeattr_types(self))
+ my_attrs.update(HLSBackend.get_nodeattr_types(self))
return my_attrs
- def get_normal_input_shape(self, ind=0):
- ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
- ifm_ch = self.get_nodeattr("IFMChannels")
- ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch)
- return ishape
-
- def get_folded_input_shape(self, ind=0):
- ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
- ifm_ch = self.get_nodeattr("IFMChannels")
- simd = self.get_nodeattr("SIMD")
- assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
- wf = int(ifm_ch / simd)
- folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd)
- return folded_ishape
-
- def get_normal_output_shape(self, ind=0):
- k_h, k_w = self.get_nodeattr("ConvKernelDim")
- ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
- ifm_ch = self.get_nodeattr("IFMChannels")
- stride_h, stride_w = self.get_nodeattr("Stride")
- dilation_h, dilation_w = self.get_nodeattr("Dilation")
- pad = 0
- ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h)
- ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w)
- oshape = (1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch)
- return oshape
-
- def get_folded_output_shape(self, ind=0):
- k_h, k_w = self.get_nodeattr("ConvKernelDim")
- ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
- ifm_ch = self.get_nodeattr("IFMChannels")
- stride_h, stride_w = self.get_nodeattr("Stride")
- dilation_h, dilation_w = self.get_nodeattr("Dilation")
- simd = self.get_nodeattr("SIMD")
- pad = 0
- ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h)
- ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w)
- assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
- if self.use_parallel_window_output():
- wf = int((ifm_ch) // simd)
- folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, k_h * k_w * simd)
- else:
- wf = int((k_h * k_w * ifm_ch) // simd)
- folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd)
- return folded_oshape
-
- def make_shape_compatible_op(self, model):
- exp_ishape = self.get_normal_input_shape()
- oshape = self.get_normal_output_shape()
- ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
- assert ishape == exp_ishape, "Unexpect input shape for ConvInpGen."
- return super().make_const_shape_op(oshape)
-
- def infer_node_datatype(self, model):
- node = self.onnx_node
- # data type stays the same
- dtype = model.get_tensor_datatype(node.input[0])
- model.set_tensor_datatype(node.output[0], dtype)
-
- def verify_node(self):
- pass
-
- def get_input_datatype(self, ind=0):
- """Returns FINN DataType of input."""
- return DataType[self.get_nodeattr("inputDataType")]
-
- def get_output_datatype(self, ind=0):
- """Returns FINN DataType of output."""
- return DataType[self.get_nodeattr("outputDataType")]
-
- def get_instream_width(self, ind=0):
- ibits = self.get_input_datatype().bitwidth()
- simd = self.get_nodeattr("SIMD")
- ifm_ch = self.get_nodeattr("IFMChannels")
- assert ifm_ch % simd == 0, "SIMD must divide IFMChannels"
- in_width = simd * ibits
- return in_width
-
- def get_outstream_width(self, ind=0):
- if self.use_parallel_window_output():
- # feed all window pixels in parallel
- k_h, k_w = self.get_nodeattr("ConvKernelDim")
- return self.get_instream_width() * k_h * k_w
- else:
- # if parallel variant not in use: same width for output and input stream
- return self.get_instream_width()
-
- def get_number_output_values(self):
- folded_oshape = self.get_folded_output_shape()
- num_output_elems = np.prod(folded_oshape[:-1])
- return num_output_elems
-
def get_swu_variant(self):
- # checks which variant of the 1D ConvolutionInputGenerator (SWU) can be used
- # We have 5 variants: ConvolutionInputGenerator_1D_parallel,
+ # checks which variant of the ConvolutionInputGenerator (SWU) can be used
+ # For the 2D case, we have 4 variants:
+ # ConvolutioninputGenerator, ConvolutioninputGenerator_dws,
+ # ConvolutioninputGenerator_kernel_stride, ConvolutioninputGenerator_kernel_stride_dws
+ # For the 1D case, we have 5 variants: ConvolutionInputGenerator_1D_parallel,
# ConvolutionInputGenerator_1D_dws_naive, ConvolutionInputGenerator_1D,
# ConvolutioninputGenerator_1D_dws, ConvolutionInputGenerator_1D_dws_stride
is_dws = self.get_nodeattr("depthwise")
- is_strided = np.prod(self.get_nodeattr("Stride")) > 1
- is_stride_2 = np.prod(self.get_nodeattr("Stride")) == 2
- is_dilated = np.prod(self.get_nodeattr("Dilation")) > 1
- if self.use_parallel_window_output():
- return "ConvolutionInputGenerator_1D_parallel"
- if not is_dws:
- return "ConvolutionInputGenerator_1D"
- if is_dws:
- if (is_strided and not is_stride_2) or (is_dilated):
- return "ConvolutionInputGenerator_1D_dws_naive"
- elif is_stride_2:
- return "ConvolutionInputGenerator_1D_dws_stride"
- else:
- return "ConvolutionInputGenerator_1D_dws"
-
- def get_1d_conv_attrs_normalized(self):
- # support both (1, D) and (D, 1) cases transparently:
- # For the kernel, presenting the input data of size D as
- # [H, W] = [Y, X] = [1, D] or [D, 1]
- # effectively gives the same result.
- # For consistency and ease of programming, this function
- # returns the attributes of the layer as follows:
- # [H, W] = [Y, X] = [1, D] or [D, 1] are always mapped to [1, D].
- # The dummy ('1') dimension is the Y-dimension.
- ifm_ch = self.get_nodeattr("IFMChannels")
- k = self.get_nodeattr("ConvKernelDim")
- ifm_dim = self.get_nodeattr("IFMDim")
- ofm_dim = self.get_nodeattr("OFMDim")
- stride = self.get_nodeattr("Stride")
- dilation = self.get_nodeattr("Dilation")
-
- # see defines() for an explanation
- if ifm_dim[1] == 1:
- ifm_dim = ifm_dim[::-1]
- ofm_dim = ofm_dim[::-1]
- k = k[::-1]
- stride = stride[::-1]
- dilation = dilation[::-1]
-
- return (ifm_ch, ifm_dim, ofm_dim, k, stride, dilation)
+ if self.get_nodeattr("is1D"):
+ is_strided = np.prod(self.get_nodeattr("Stride")) > 1
+ is_stride_2 = np.prod(self.get_nodeattr("Stride")) == 2
+ is_dilated = np.prod(self.get_nodeattr("Dilation")) > 1
+ if self.use_parallel_window_output():
+ return "ConvolutionInputGenerator_1D_parallel"
+ if not is_dws:
+ return "ConvolutionInputGenerator_1D"
+ if is_dws:
+ if (is_strided and not is_stride_2) or (is_dilated):
+ return "ConvolutionInputGenerator_1D_dws_naive"
+ elif is_stride_2:
+ return "ConvolutionInputGenerator_1D_dws_stride"
+ else:
+ return "ConvolutionInputGenerator_1D_dws"
+ else:
+ k = self.get_nodeattr("ConvKernelDim")[0]
+ stride = self.get_nodeattr("Stride")[0]
+ hls_call = "ConvolutionInputGenerator"
+ if k % stride != 0:
+ hls_call += "_kernel_stride"
+ if is_dws:
+ hls_call += "_dws"
+ return hls_call
def use_parallel_window_output(self):
- # Check if simple "ConvolutionInputGenerator_1D_parallel" variant can be used to
+ if not self.get_nodeattr("is1D"):
+ return False
+ # If 1D, check if simple "ConvolutionInputGenerator_1D_parallel" variant can be used to
# feed window in parallel to the following layer, enabling full SIMD unfolding.
stride = self.get_nodeattr("Stride")
dilation = self.get_nodeattr("Dilation")
@@ -245,13 +118,7 @@ def use_parallel_window_output(self):
no_dilation = dilation_h == 1 and dilation_w == 1
supported_ram_style = ram_style in ["auto", "distributed"]
if self.get_nodeattr("parallel_window") == 1:
- if (
- fully_unfolded
- and non_dws
- and no_stride
- and no_dilation
- and supported_ram_style
- ):
+ if fully_unfolded and non_dws and no_stride and no_dilation and supported_ram_style:
return True
else:
warnings.warn(
@@ -267,64 +134,88 @@ def use_parallel_window_output(self):
def get_exp_cycles(self):
simd = self.get_nodeattr("SIMD")
- (
- ifm_ch,
- [ifm_dim_h, ifm_dim_w],
- [ofm_dim_h, ofm_dim_w],
- [k_h, k_w],
- [stride_h, stride_w],
- [dilation_h, dilation_w],
- ) = self.get_1d_conv_attrs_normalized()
-
- # since mmv != 1 is not supported yet, we set mmv for now to 1
- # mmv = 1
- # see https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h
- swu_variant = self.get_swu_variant()
- if swu_variant == "ConvolutionInputGenerator_1D_parallel":
- exp_cycles = k_w + ofm_dim_w
- elif swu_variant == "ConvolutionInputGenerator_1D":
- exp_cycles = 1 + ofm_dim_w * k_w * ifm_ch / simd
- elif swu_variant in [
- "ConvolutionInputGenerator_1D_dws",
- "ConvolutionInputGenerator_1D_dws_stride",
- ]:
- exp_cycles = (
- 1
- + ofm_dim_w * k_w * ifm_ch / simd
- + (ifm_ch / simd) * (k_w - 1)
- - (k_w - 1)
- )
- elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive":
- cycles_read_block = ifm_dim_w * ifm_ch / simd
- cycles_write_block = ofm_dim_w * k_w * ifm_ch / simd
- exp_cycles = cycles_read_block + cycles_write_block
+ # 2D case
+ if not self.get_nodeattr("is1D"):
+ ifm_ch = self.get_nodeattr("IFMChannels")
+ k_h, k_w = self.get_nodeattr("ConvKernelDim")
+ ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim")
+ ofm_dim_h, ofm_dim_w = self.get_nodeattr("OFMDim")
+ stride_h, stride_w = self.get_nodeattr("Stride")
+ dilation_h, dilation_w = self.get_nodeattr("Dilation")
+
+ # since mmv != 1 is not supported yet, we set mmv for now to 1
+ mmv = 1
+ # see https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h
+ cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv
+ cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd)
+ max_cycles = max(cycles_write_block, cycles_read_block)
+ exp_cycles = ifm_dim_w * k_h * dilation_h * (ifm_ch / simd) + ofm_dim_h * max_cycles
+ # 1D case
+ else:
+ (
+ ifm_ch,
+ [ifm_dim_h, ifm_dim_w],
+ [ofm_dim_h, ofm_dim_w],
+ [k_h, k_w],
+ [stride_h, stride_w],
+ [dilation_h, dilation_w],
+ ) = self.get_1d_conv_attrs_normalized()
+
+ swu_variant = self.get_swu_variant()
+ if swu_variant == "ConvolutionInputGenerator_1D_parallel":
+ exp_cycles = k_w + ofm_dim_w
+ elif swu_variant == "ConvolutionInputGenerator_1D":
+ exp_cycles = 1 + ofm_dim_w * k_w * ifm_ch / simd
+ elif swu_variant in [
+ "ConvolutionInputGenerator_1D_dws",
+ "ConvolutionInputGenerator_1D_dws_stride",
+ ]:
+ exp_cycles = (
+ 1 + ofm_dim_w * k_w * ifm_ch / simd + (ifm_ch / simd) * (k_w - 1) - (k_w - 1)
+ )
+ elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive":
+ cycles_read_block = ifm_dim_w * ifm_ch / simd
+ cycles_write_block = ofm_dim_w * k_w * ifm_ch / simd
+ exp_cycles = cycles_read_block + cycles_write_block
return int(exp_cycles)
def bram_estimation(self):
simd = self.get_nodeattr("SIMD")
- (
- ifm_ch,
- [ifm_dim_h, ifm_dim_w],
- [ofm_dim_h, ofm_dim_w],
- [k_h, k_w],
- [stride_h, stride_w],
- [dilation_h, dilation_w],
- ) = self.get_1d_conv_attrs_normalized()
+ is1D = self.get_nodeattr("is1D")
+ if not is1D:
+ ifm_ch = self.get_nodeattr("IFMChannels")
+ ifm_dim = self.get_nodeattr("IFMDim")[0]
+ k = self.get_nodeattr("ConvKernelDim")[0]
+ stride = self.get_nodeattr("Stride")[0]
+ else:
+ (
+ ifm_ch,
+ [ifm_dim_h, ifm_dim_w],
+ [ofm_dim_h, ofm_dim_w],
+ [k_h, k_w],
+ [stride_h, stride_w],
+ [dilation_h, dilation_w],
+ ) = self.get_1d_conv_attrs_normalized()
ram_style = self.get_nodeattr("ram_style")
swu_variant = self.get_swu_variant()
if swu_variant == "ConvolutionInputGenerator_1D_parallel":
return 0
if ram_style == "block" or ram_style == "auto":
- if swu_variant == "ConvolutionInputGenerator_1D":
- ram_depth = (k_w - 1) * ifm_ch / simd
- elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive":
- ram_depth = ifm_dim_w * ifm_ch / simd
- elif swu_variant in [
- "ConvolutionInputGenerator_1D_dws",
- "ConvolutionInputGenerator_1D_dws_stride",
- ]:
- ram_depth = k_w * ifm_ch / simd
+ if not is1D:
+ ram_depth = ifm_dim * ifm_ch / simd
+ else:
+ if swu_variant == "ConvolutionInputGenerator_1D":
+ ram_depth = (k_w - 1) * ifm_ch / simd
+ elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive":
+ ram_depth = ifm_dim_w * ifm_ch / simd
+ elif swu_variant in [
+ "ConvolutionInputGenerator_1D_dws",
+ "ConvolutionInputGenerator_1D_dws_stride",
+ ]:
+ ram_depth = k_w * ifm_ch / simd
+ # after calculate the ram_depth depending on the variant
+ # determine ram_width
if ram_depth <= 512:
ram_width = 36
elif ram_depth <= 1024:
@@ -337,80 +228,108 @@ def bram_estimation(self):
ram_width = 2
else:
ram_width = 1
- width_mul = math.ceil(
- simd * self.get_input_datatype().bitwidth() / ram_width
- )
- depth_mul = math.ceil(ram_depth / 18432)
- return width_mul * depth_mul
+
+ width_mul = math.ceil(simd * self.get_input_datatype().bitwidth() / ram_width)
+ if not is1D:
+ depth_mul = math.ceil(ifm_dim * ifm_ch / simd / ram_depth)
+ return int((k + stride) * width_mul * depth_mul)
+ else:
+ depth_mul = math.ceil(ram_depth / 18432)
+ return int(width_mul * depth_mul)
else:
return 0
def lut_estimation(self):
simd = self.get_nodeattr("SIMD")
- (
- ifm_ch,
- [ifm_dim_h, ifm_dim_w],
- [ofm_dim_h, ofm_dim_w],
- [k_h, k_w],
- [stride_h, stride_w],
- [dilation_h, dilation_w],
- ) = self.get_1d_conv_attrs_normalized()
+ is1D = self.get_nodeattr("is1D")
+ if not is1D:
+ ifm_ch = self.get_nodeattr("IFMChannels")
+ ifm_dim = self.get_nodeattr("IFMDim")[0]
+ k = self.get_nodeattr("ConvKernelDim")[0]
+ stride = self.get_nodeattr("Stride")[0]
+ else:
+ (
+ ifm_ch,
+ [ifm_dim_h, ifm_dim_w],
+ [ofm_dim_h, ofm_dim_w],
+ [k_h, k_w],
+ [stride_h, stride_w],
+ [dilation_h, dilation_w],
+ ) = self.get_1d_conv_attrs_normalized()
ram_style = self.get_nodeattr("ram_style")
swu_variant = self.get_swu_variant()
if swu_variant == "ConvolutionInputGenerator_1D_parallel":
- ram_luts = math.ceil(
- simd * self.get_input_datatype().bitwidth() * (k_w + 1) / 64
- )
- elif ram_style == "distributed":
- if swu_variant == "ConvolutionInputGenerator_1D":
- ram_luts = math.ceil(
- self.get_input_datatype().bitwidth() * (k_w - 1) * ifm_ch / 64
+ ram_luts = math.ceil(simd * self.get_input_datatype().bitwidth() * (k_w + 1) / 64)
+ if ram_style == "distributed":
+ if not is1D:
+ ram_luts = int(
+ (k + stride)
+ * (
+ simd
+ * self.get_input_datatype().bitwidth()
+ * math.ceil(ifm_dim * ifm_ch / simd / 64)
+ )
)
+ if swu_variant == "ConvolutionInputGenerator_1D":
+ ram_luts = math.ceil(self.get_input_datatype().bitwidth() * (k_w - 1) * ifm_ch / 64)
elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive":
- ram_luts = math.ceil(
- self.get_input_datatype().bitwidth() * ifm_dim_w * ifm_ch / 64
- )
+ ram_luts = math.ceil(self.get_input_datatype().bitwidth() * ifm_dim_w * ifm_ch / 64)
elif swu_variant in [
"ConvolutionInputGenerator_1D_dws",
"ConvolutionInputGenerator_1D_dws_stride",
]:
- ram_luts = math.ceil(
- self.get_input_datatype().bitwidth() * k_w * ifm_ch / 64
- )
+ ram_luts = math.ceil(self.get_input_datatype().bitwidth() * k_w * ifm_ch / 64)
else:
ram_luts = 0
return 300 + ram_luts
def uram_estimation(self):
simd = self.get_nodeattr("SIMD")
- (
- ifm_ch,
- [ifm_dim_h, ifm_dim_w],
- [ofm_dim_h, ofm_dim_w],
- [k_h, k_w],
- [stride_h, stride_w],
- [dilation_h, dilation_w],
- ) = self.get_1d_conv_attrs_normalized()
+ is1D = self.get_nodeattr("is1D")
+ if not is1D:
+ ifm_ch = self.get_nodeattr("IFMChannels")
+ ifm_dim = self.get_nodeattr("IFMDim")[0]
+ k = self.get_nodeattr("ConvKernelDim")[0]
+ stride = self.get_nodeattr("Stride")[0]
+ else:
+ (
+ ifm_ch,
+ [ifm_dim_h, ifm_dim_w],
+ [ofm_dim_h, ofm_dim_w],
+ [k_h, k_w],
+ [stride_h, stride_w],
+ [dilation_h, dilation_w],
+ ) = self.get_1d_conv_attrs_normalized()
+
ram_style = self.get_nodeattr("ram_style")
swu_variant = self.get_swu_variant()
if swu_variant == "ConvolutionInputGenerator_1D_parallel":
return 0
- elif ram_style == "ultra":
- if swu_variant == "ConvolutionInputGenerator_1D":
- width_mul = math.ceil(simd * self.get_input_datatype().bitwidth() / 72)
- depth_mul = math.ceil((k_w - 1) * ifm_ch / simd / 4096)
- return width_mul * depth_mul
- elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive":
- width_mul = math.ceil(simd * self.get_input_datatype().bitwidth() / 72)
- depth_mul = math.ceil(ifm_dim_w * ifm_ch / simd / 4096)
- return width_mul * depth_mul
- elif swu_variant in [
- "ConvolutionInputGenerator_1D_dws",
- "ConvolutionInputGenerator_1D_dws_stride",
- ]:
- width_mul = math.ceil(simd * self.get_input_datatype().bitwidth() / 72)
- depth_mul = math.ceil(k_w * ifm_ch / simd / 4096)
- return width_mul * depth_mul
+ if ram_style == "ultra":
+ if not is1D:
+ return int(
+ (k + stride)
+ * (
+ math.ceil(simd * self.get_input_datatype().bitwidth() / 64)
+ * math.ceil(ifm_dim * ifm_ch / simd / 4096)
+ )
+ )
+ else:
+ if swu_variant == "ConvolutionInputGenerator_1D":
+ width_mul = math.ceil(simd * self.get_input_datatype().bitwidth() / 72)
+ depth_mul = math.ceil((k_w - 1) * ifm_ch / simd / 4096)
+ return width_mul * depth_mul
+ elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive":
+ width_mul = math.ceil(simd * self.get_input_datatype().bitwidth() / 72)
+ depth_mul = math.ceil(ifm_dim_w * ifm_ch / simd / 4096)
+ return width_mul * depth_mul
+ elif swu_variant in [
+ "ConvolutionInputGenerator_1D_dws",
+ "ConvolutionInputGenerator_1D_dws_stride",
+ ]:
+ width_mul = math.ceil(simd * self.get_input_datatype().bitwidth() / 72)
+ depth_mul = math.ceil(k_w * ifm_ch / simd / 4096)
+ return width_mul * depth_mul
else:
return 0
@@ -504,18 +423,28 @@ def global_includes(self):
def defines(self, var):
numReps = 1
- (
- ifm_ch,
- [ifm_dim_h, ifm_dim_w],
- [ofm_dim_h, ofm_dim_w],
- [k_h, k_w],
- [stride_h, stride_w],
- [dilation_h, dilation_w],
- ) = self.get_1d_conv_attrs_normalized()
+ is1D = self.get_nodeattr("is1D")
simd = self.get_nodeattr("SIMD")
ifm_precision = self.get_input_datatype().bitwidth()
+ if not is1D:
+ ifm_dim = self.get_nodeattr("IFMDim")[0]
+ ifm_ch = self.get_nodeattr("IFMChannels")
+ ofm_dim = self.get_nodeattr("OFMDim")[0]
+ k = self.get_nodeattr("ConvKernelDim")[0]
+ stride = self.get_nodeattr("Stride")[0]
+ else:
+ (
+ ifm_ch,
+ [ifm_dim_h, ifm_dim_w],
+ [ofm_dim_h, ofm_dim_w],
+ [k_h, k_w],
+ [stride_h, stride_w],
+ [dilation_h, dilation_w],
+ ) = self.get_1d_conv_attrs_normalized()
+
swu_variant = self.get_swu_variant()
+ # check all different 1D scenarios
if swu_variant in [
"ConvolutionInputGenerator_1D_parallel",
"ConvolutionInputGenerator_1D",
@@ -542,7 +471,7 @@ def defines(self, var):
numReps,
)
]
- if swu_variant == "ConvolutionInputGenerator_1D_dws":
+ elif swu_variant == "ConvolutionInputGenerator_1D_dws":
self.code_gen_dict["$DEFINES$"] = [
"""
#define ConvKernelDim1_x {}\n
@@ -562,7 +491,7 @@ def defines(self, var):
numReps,
)
]
- if swu_variant == "ConvolutionInputGenerator_1D_dws_naive":
+ elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive":
self.code_gen_dict["$DEFINES$"] = [
"""
#define ConvKernelDim1_x {}\n
@@ -586,33 +515,16 @@ def defines(self, var):
numReps,
)
]
-
- def read_npy_data(self):
- code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
- dtype = self.get_input_datatype()
- if dtype == DataType["BIPOLAR"]:
- # use binary for bipolar storage
- dtype = DataType["BINARY"]
- elem_bits = dtype.bitwidth()
- packed_bits = self.get_instream_width()
- packed_hls_type = "ap_uint<%d>" % packed_bits
- elem_hls_type = dtype.get_hls_datatype_str()
- npy_type = "float"
- npy_in = "%s/input_0.npy" % code_gen_dir
- self.code_gen_dict["$READNPYDATA$"] = []
- self.code_gen_dict["$READNPYDATA$"].append(
- 'npy2apintstream<%s, %s, %d, %s>("%s", in0);'
- % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
- )
-
- def strm_decl(self):
- self.code_gen_dict["$STREAMDECLARATIONS$"] = []
- self.code_gen_dict["$STREAMDECLARATIONS$"].append(
- 'hls::stream> in0 ("in0");'.format(self.get_instream_width())
- )
- self.code_gen_dict["$STREAMDECLARATIONS$"].append(
- 'hls::stream> out ("out");'.format(self.get_outstream_width())
- )
+ # default to 2D cases
+ else:
+ self.code_gen_dict["$DEFINES$"] = [
+ """#define ConvKernelDim1 {}\n #define IFMChannels1 {}\n
+ #define Input_precision1 {}\n #define IFMDim1 {}\n
+ #define OFMDim1 {}\n #define SIMD1 {}\n
+ #define Stride1 {}\n #define numReps {}""".format(
+ k, ifm_ch, ifm_precision, ifm_dim, ofm_dim, simd, stride, numReps
+ )
+ ]
def docompute(self):
ram_style = self.get_nodeattr("ram_style")
@@ -625,45 +537,52 @@ def docompute(self):
hls_ram_style = map_to_hls_ram_style[ram_style]
swu_variant = self.get_swu_variant()
- # check which ConvolutionInputGenerator is needed
+ # check which 1D ConvolutionInputGenerator is needed
if swu_variant == "ConvolutionInputGenerator_1D_parallel":
self.code_gen_dict["$DOCOMPUTE$"] = [
"""{}
- (in0, out, numReps, {});""".format(
- swu_variant, hls_ram_style
+ (in0_{}, out_{}, numReps, {});""".format(
+ swu_variant, self.hls_sname(), self.hls_sname(), hls_ram_style
)
]
- if swu_variant == "ConvolutionInputGenerator_1D":
+ elif swu_variant == "ConvolutionInputGenerator_1D":
self.code_gen_dict["$DOCOMPUTE$"] = [
"""{}
- (in0, out, numReps, {});""".format(
- swu_variant, hls_ram_style
+ (in0_{}, out_{}, numReps, {});""".format(
+ swu_variant, self.hls_sname(), self.hls_sname(), hls_ram_style
)
]
- if swu_variant == "ConvolutionInputGenerator_1D_dws":
+ elif swu_variant == "ConvolutionInputGenerator_1D_dws":
self.code_gen_dict["$DOCOMPUTE$"] = [
"""{}
- (in0, out, numReps, {});""".format(
- swu_variant, hls_ram_style
+ (in0_{}, out_{}, numReps, {});""".format(
+ swu_variant, self.hls_sname(), self.hls_sname(), hls_ram_style
)
]
- if swu_variant == "ConvolutionInputGenerator_1D_dws_stride":
+ elif swu_variant == "ConvolutionInputGenerator_1D_dws_stride":
self.code_gen_dict["$DOCOMPUTE$"] = [
"""{}
- (in0, out, numReps, {});""".format(
- swu_variant, hls_ram_style
+ (in0_{}, out_{}, numReps, {});""".format(
+ swu_variant, self.hls_sname(), self.hls_sname(), hls_ram_style
)
]
- if swu_variant == "ConvolutionInputGenerator_1D_dws_naive":
+ elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive":
self.code_gen_dict["$DOCOMPUTE$"] = [
"""{}
- (in0, out, numReps, {});""".format(
- swu_variant, hls_ram_style
+ (in0_{}, out_{}, numReps, {});""".format(
+ swu_variant, self.hls_sname(), self.hls_sname(), hls_ram_style
+ )
+ ]
+ else:
+ self.code_gen_dict["$DOCOMPUTE$"] = [
+ """{} (in0_{}, out_{}, numReps, {});""".format(
+ swu_variant, self.hls_sname(), self.hls_sname(), hls_ram_style
)
]
@@ -690,45 +609,32 @@ def dataoutstrm(self):
multi_pixel_out = 1
self.code_gen_dict["$DATAOUTSTREAM$"] = [
- 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s", true, 1, %d);'
+ 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", true, 1, %d);'
% (
packed_hls_type,
elem_hls_type,
elem_bits,
npy_type,
+ self.hls_sname(),
oshape_cpp_str,
npy_out,
multi_pixel_out,
)
]
- def save_as_npy(self):
- self.code_gen_dict["$SAVEASCNPY$"] = []
-
def blackboxfunction(self):
if self.use_parallel_window_output():
self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
- """void {}(hls::stream> &in0,
+ """void {}(hls::stream> &in0_{},
hls::stream>
- &out)""".format(
- self.onnx_node.name
+ &out_{})""".format(
+ self.onnx_node.name, self.hls_sname(), self.hls_sname()
)
]
else:
self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
- """void {}(hls::stream> &in0,
- hls::stream> &out)""".format(
- self.onnx_node.name
+ """void {}(hls::stream> &in0_{},
+ hls::stream> &out_{})""".format(
+ self.onnx_node.name, self.hls_sname(), self.hls_sname()
)
]
-
- def pragmas(self):
- self.code_gen_dict["$PRAGMAS$"] = [
- "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname()
- ]
- self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname()
- )
- self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE ap_ctrl_none port=return"
- )
diff --git a/src/finn/custom_op/fpgadataflow/hls/downsampler_hls.py b/src/finn/custom_op/fpgadataflow/hls/downsampler_hls.py
new file mode 100644
index 0000000000..56f472b9c0
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/hls/downsampler_hls.py
@@ -0,0 +1,165 @@
+# Copyright (C) 2023, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import os
+
+from finn.custom_op.fpgadataflow.downsampler import DownSampler
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+
+class DownSampler_hls(DownSampler, HLSBackend):
+ """Corresponds to finn-hlslib ConvolutionInputGenerator_*_kernel1 function.
+ Basically performs a down sampling of the image removing rows and columns."""
+
+ def __init__(self, onnx_node, **kwargs):
+ super().__init__(onnx_node, **kwargs)
+
+ def get_nodeattr_types(self):
+ my_attrs = {}
+ my_attrs.update(DownSampler.get_nodeattr_types(self))
+ my_attrs.update(HLSBackend.get_nodeattr_types(self))
+ return my_attrs
+
+ def global_includes(self):
+ self.code_gen_dict["$GLOBALS$"] = ['#include "slidingwindow.h"']
+
+ def defines(self, var):
+ self.code_gen_dict["$DEFINES$"] = []
+
+ ifm_ch = self.get_nodeattr("NumChannels")
+ self.code_gen_dict["$DEFINES$"] += ["#define IFMChannels {}".format(ifm_ch)]
+
+ ibits = self.get_input_datatype().bitwidth()
+ self.code_gen_dict["$DEFINES$"] += ["#define Input_precision {}".format(ibits)]
+
+ idim = self.get_nodeattr("ImgDim")
+ self.code_gen_dict["$DEFINES$"] += ["#define IFMDim {}".format(idim)]
+
+ simd = self.get_nodeattr("SIMD")
+ self.code_gen_dict["$DEFINES$"] += ["#define SIMD {}".format(simd)]
+
+ stride = self.get_nodeattr("Stride")
+ self.code_gen_dict["$DEFINES$"] += ["#define Stride {}".format(stride)]
+
+ batch_size = self.get_nodeattr("numInputVectors")
+ self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(batch_size)]
+
+ def docompute(self):
+ dim_var = "1D" if (self.get_nodeattr("is1D") == 1) else "2D"
+ sname = self.hls_sname()
+ self.code_gen_dict["$DOCOMPUTE$"] = [
+ f"""ConvolutionInputGenerator_{dim_var}_kernel1 (in0_{sname}, out_{sname}, numReps);"""
+ ]
+
+ def blackboxfunction(self):
+ packed_bits = self.get_instream_width()
+ packed_hls_type = "ap_uint<%d>" % packed_bits
+ self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+ "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)"
+ % (
+ self.onnx_node.name,
+ packed_hls_type,
+ self.hls_sname(),
+ packed_hls_type,
+ self.hls_sname(),
+ )
+ ]
+
+ def execute_node(self, context, graph):
+ mode = self.get_nodeattr("exec_mode")
+ node = self.onnx_node
+ exp_ishape = self.get_normal_input_shape()
+ exp_oshape = self.get_normal_output_shape()
+ folded_ishape = self.get_folded_input_shape()
+
+ if mode == "cppsim":
+ code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+ elif mode == "rtlsim":
+ code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+ else:
+ raise Exception(
+ """Invalid value for attribute exec_mode! Is currently set to: {}
+ has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+ mode
+ )
+ )
+
+ inp = context[node.input[0]]
+ assert str(inp.dtype) == "float32", "Input datatype is not float32"
+ assert (
+ inp.shape == exp_ishape
+ ), """Input shape doesn't
+ match expected shape (numInputVectors, ImgDim, ImgDim, NumChannels)."""
+ export_idt = self.get_input_datatype()
+
+ reshaped_input = inp.reshape(folded_ishape)
+ np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
+
+ if mode == "cppsim":
+ # execute the precompiled model
+ super().exec_precompiled_singlenode_model()
+ # load output npy file
+ super().npy_to_dynamic_output(context)
+ assert (
+ context[node.output[0]].shape == exp_oshape
+ ), "cppsim did not produce expected output shape"
+ elif mode == "rtlsim":
+ sim = self.get_rtlsim()
+ nbits = self.get_instream_width()
+ rtlsim_inp = npy_to_rtlsim_input(
+ "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+ )
+ super().reset_rtlsim(sim)
+ super().toggle_clk(sim)
+ rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+ odt = export_idt
+ target_bits = odt.bitwidth()
+ packed_bits = self.get_outstream_width()
+ out_npy_path = "{}/output.npy".format(code_gen_dir)
+ out_shape = self.get_folded_output_shape()
+ rtlsim_output_to_npy(
+ rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+ )
+ # load and reshape output
+ output = np.load(out_npy_path)
+ output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+ context[node.output[0]] = output
+ else:
+ raise Exception(
+ """Invalid value for attribute exec_mode! Is currently set to: {}
+ has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+ mode
+ )
+ )
+ assert (
+ context[node.output[0]].shape == exp_oshape
+ ), """Output shape doesn't match expected shape
+ (1, OutputDim, OutputDim, NumChannels)."""
diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py b/src/finn/custom_op/fpgadataflow/hls/duplicatestreams_hls.py
similarity index 60%
rename from src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
rename to src/finn/custom_op/fpgadataflow/hls/duplicatestreams_hls.py
index 93cde15ca7..e19149435e 100644
--- a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py
+++ b/src/finn/custom_op/fpgadataflow/hls/duplicatestreams_hls.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, Xilinx
+# Copyright (C) 2023, Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -28,91 +28,24 @@
import numpy as np
import os
-import warnings
-from qonnx.core.datatype import DataType
-from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.custom_op.fpgadataflow.duplicatestreams import DuplicateStreams
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
-class DuplicateStreams_Batch(HLSCustomOp):
+class DuplicateStreams_hls(DuplicateStreams, HLSBackend):
"""Class that corresponds to finn-hlslib function of the same name."""
- def __init__(self, onnx_node):
- super().__init__(onnx_node)
+ def __init__(self, onnx_node, **kwargs):
+ super().__init__(onnx_node, **kwargs)
def get_nodeattr_types(self):
- my_attrs = {
- "NumChannels": ("i", True, 0),
- "PE": ("i", True, 0),
- # how many duplicated output streams to create
- "NumOutputStreams": ("i", True, 0),
- # FINN DataTypes for input
- "inputDataType": ("s", True, ""),
- # number of input vectors, examples:
- # [1] is a single vector (like a FC layer with batch=1)
- # [4] is four vectors (like a FC layer with batch=4)
- # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
- "numInputVectors": ("ints", False, [1]),
- }
- my_attrs.update(super().get_nodeattr_types())
+ my_attrs = {}
+ my_attrs.update(DuplicateStreams.get_nodeattr_types(self))
+ my_attrs.update(HLSBackend.get_nodeattr_types(self))
return my_attrs
- def get_num_output_streams(self):
- return self.get_nodeattr("NumOutputStreams")
-
- def get_normal_input_shape(self, ind=0):
- ch = self.get_nodeattr("NumChannels")
- vecs = list(self.get_nodeattr("numInputVectors"))
- ishape = tuple(vecs + [ch])
- return ishape
-
- def get_folded_input_shape(self, ind=0):
- ch = self.get_nodeattr("NumChannels")
- pe = self.get_nodeattr("PE")
- vecs = list(self.get_nodeattr("numInputVectors"))
- assert ch % pe == 0, "PE must divide NumChannels"
- folds = int(ch / pe)
- folded_ishape = tuple(vecs + [folds, pe])
- return folded_ishape
-
- def get_normal_output_shape(self, ind=0):
- # since the output shape of both out streams are the same
- # return independently from index
- return self.get_normal_input_shape()
-
- def get_folded_output_shape(self, ind=0):
- # since the output shape of both out streams are the same
- # return independently from index
- return self.get_folded_input_shape()
-
- def make_shape_compatible_op(self, model):
- exp_ishape = self.get_normal_input_shape()
- ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
- assert ishape == exp_ishape, "Unexpected input shape."
- num_out = self.get_num_output_streams()
- assert len(self.onnx_node.output) == num_out, "Unexpected number of outputs"
-
- oshape = self.get_normal_output_shape()
- ret = super().make_const_shape_op(oshape)
- ret.output[:] = self.onnx_node.output
- return ret
-
- def infer_node_datatype(self, model):
- node = self.onnx_node
- idt = model.get_tensor_datatype(node.input[0])
- if idt != self.get_input_datatype():
- warn_str = "inputDataType changing for %s: %s -> %s " % (
- node.name,
- str(self.get_input_datatype()),
- str(idt),
- )
- warnings.warn(warn_str)
- self.set_nodeattr("inputDataType", idt.name)
- odt = self.get_output_datatype()
- for my_out in self.onnx_node.output:
- model.set_tensor_datatype(my_out, odt)
-
def verify_node(self):
info_messages = []
# verify that "backend" is set to "fpgadataflow"
@@ -132,43 +65,10 @@ def verify_node(self):
self.get_nodeattr("inputDataType")
info_messages.append("All necessary attributes exist")
except Exception:
- info_messages.append(
- """The required GlobalAccPool_Batch attributes do not exist."""
- )
+ info_messages.append("""The required GlobalAccPool_Batch attributes do not exist.""")
return info_messages
- def get_input_datatype(self, ind=0):
- """Returns FINN DataType of input."""
- return DataType[self.get_nodeattr("inputDataType")]
-
- def get_output_datatype(self, ind=0):
- """Returns FINN DataType of output."""
- return DataType[self.get_nodeattr("inputDataType")]
-
- def get_instream_width(self, ind=0):
- """Returns input stream width."""
- ibits = self.get_input_datatype().bitwidth()
- pe = self.get_nodeattr("PE")
- in_width = pe * ibits
- return in_width
-
- def get_outstream_width(self, ind=0):
- """Returns output stream width."""
- obits = self.get_output_datatype().bitwidth()
- pe = self.get_nodeattr("PE")
- out_width = pe * obits
- return out_width
-
- def get_number_output_values(self):
- return self.get_num_output_streams() * np.prod(
- self.get_folded_output_shape()[1:-1]
- )
-
- def get_exp_cycles(self):
- # Channels/PE * batch size * fmdim * fmdim
- return np.prod(self.get_folded_output_shape()[:-1])
-
def generate_params(self, model, path):
n_outputs = self.get_num_output_streams()
inp_streams = []
@@ -235,9 +135,7 @@ def execute_node(self, context, graph):
# execute the precompiled model
super().exec_precompiled_singlenode_model()
# load output npy file
- super().npy_to_dynamic_outputs(
- context, ["output%d.npy" % i for i in range(n_outputs)]
- )
+ super().npy_to_dynamic_outputs(context, ["output%d.npy" % i for i in range(n_outputs)])
for i in range(n_outputs):
assert (
context[node.output[i]].shape == exp_oshape
@@ -298,29 +196,16 @@ def global_includes(self):
def defines(self, var):
self.code_gen_dict["$DEFINES$"] = []
- def read_npy_data(self):
- code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
- dtype = self.get_input_datatype()
- elem_bits = dtype.bitwidth()
- packed_bits = self.get_instream_width()
- packed_hls_type = "ap_uint<%d>" % packed_bits
- elem_hls_type = dtype.get_hls_datatype_str()
- npy_type = "float"
- npy_in = "%s/input_0.npy" % code_gen_dir
- self.code_gen_dict["$READNPYDATA$"] = []
- self.code_gen_dict["$READNPYDATA$"].append(
- 'npy2apintstream<%s, %s, %d, %s>("%s", in0);'
- % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
- )
-
def strm_decl(self):
n_outputs = self.get_num_output_streams()
self.code_gen_dict["$STREAMDECLARATIONS$"] = []
self.code_gen_dict["$STREAMDECLARATIONS$"].append(
- 'hls::stream> in0 ("in0");'.format(self.get_instream_width())
+ 'hls::stream> in0_{} ("in0_{}");'.format(
+ self.get_instream_width(), self.hls_sname(), self.hls_sname()
+ )
)
for i in range(n_outputs):
- out_name = "out%d" % i
+ out_name = "out%d_%s" % (i, self.hls_sname())
self.code_gen_dict["$STREAMDECLARATIONS$"].append(
'hls::stream> %s ("%s");'
% (self.get_outstream_width(), out_name, out_name)
@@ -328,8 +213,13 @@ def strm_decl(self):
def docompute(self):
n_outputs = self.get_num_output_streams()
- ostreams = ["out%d" % x for x in range(n_outputs)]
- dc = "DuplicateStreamsCustom(in0, %s);" % (",".join(ostreams))
+ ostreams = []
+ for i in range(n_outputs):
+ ostreams.append("out%d_%s" % (i, self.hls_sname()))
+ dc = "DuplicateStreamsCustom(in0_%s, %s);" % (
+ self.hls_sname(),
+ ",".join(ostreams),
+ )
self.code_gen_dict["$DOCOMPUTE$"] = [dc]
def dataoutstrm(self):
@@ -346,7 +236,7 @@ def dataoutstrm(self):
outstrm_code = []
for i in range(n_outputs):
- out_name = "out%d" % i
+ out_name = "out%d_%s" % (i, self.hls_sname())
npy_out = "%s/output%d.npy" % (code_gen_dir, i)
outstrm_code.append(
'apintstream2npy<%s, %s, %d, %s>(%s, %s, "%s");'
@@ -363,18 +253,19 @@ def dataoutstrm(self):
self.code_gen_dict["$DATAOUTSTREAM$"] = outstrm_code
- def save_as_npy(self):
- self.code_gen_dict["$SAVEASCNPY$"] = []
-
def blackboxfunction(self):
n_outputs = self.get_num_output_streams()
inp_streams = []
o_stream_w = self.get_outstream_width()
i_stream_w = self.get_instream_width()
- in_stream = "hls::stream > &in0" % (i_stream_w)
+ in_stream = "hls::stream > &in0_%s" % (i_stream_w, self.hls_sname())
inp_streams.append(in_stream)
for i in range(n_outputs):
- out_stream = "hls::stream > &out%d" % (o_stream_w, i)
+ out_stream = "hls::stream > &out%d_%s" % (
+ o_stream_w,
+ i,
+ self.hls_sname(),
+ )
inp_streams.append(out_stream)
self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
@@ -387,34 +278,10 @@ def blackboxfunction(self):
def pragmas(self):
n_outputs = self.get_num_output_streams()
self.code_gen_dict["$PRAGMAS$"] = [
- "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname()
+ "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname()
]
for i in range(n_outputs):
self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE axis port=out%d name=out%d_%s"
- % (i, i, self.hls_sname())
- )
- self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE ap_ctrl_none port=return"
- )
-
- def get_verilog_top_module_intf_names(self):
- intf_names = super().get_verilog_top_module_intf_names()
- n_outputs = self.get_num_output_streams()
- sname = self.hls_sname()
- intf_names["m_axis"] = []
- for i in range(n_outputs):
- intf_names["m_axis"].append(
- ("out%d_%s" % (i, sname), self.get_outstream_width_padded())
+ "#pragma HLS INTERFACE axis port=out%d_%s" % (i, self.hls_sname())
)
- return intf_names
-
- def derive_characteristic_fxns(self, period):
- n_inps = np.prod(self.get_folded_input_shape()[:-1])
- io_dict = {
- "inputs": {
- "in0": [0 for i in range(n_inps)],
- },
- "outputs": {"out0": [], "out1": []},
- }
- super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
+ self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return")
diff --git a/src/finn/custom_op/fpgadataflow/hls/fmpadding_hls.py b/src/finn/custom_op/fpgadataflow/hls/fmpadding_hls.py
new file mode 100644
index 0000000000..d57699af05
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/hls/fmpadding_hls.py
@@ -0,0 +1,212 @@
+# Copyright (C) 2023, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import os
+
+from finn.custom_op.fpgadataflow.fmpadding import FMPadding
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+
+class FMPadding_hls(FMPadding, HLSBackend):
+ """Corresponds to finn-hlslib FMPadding_Batch function.
+ Pads input image by given amount."""
+
+ def __init__(self, onnx_node, **kwargs):
+ super().__init__(onnx_node, **kwargs)
+
+ def get_nodeattr_types(self):
+ my_attrs = {}
+ my_attrs.update(FMPadding.get_nodeattr_types(self))
+ my_attrs.update(HLSBackend.get_nodeattr_types(self))
+ return my_attrs
+
+ def global_includes(self):
+ self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"']
+
+ def defines(self, var):
+ idim_h, idim_w = self.get_nodeattr("ImgDim")
+ odim_h, odim_w = self.get_padded_odim()
+ pad = self.get_nodeattr("Padding")
+ pad_h = pad[0] + pad[2]
+ pad_w = pad[1] + pad[3]
+ is_square_img = idim_h == idim_w
+ is_square_pad = pad_h == pad_w
+
+ if is_square_img and is_square_pad:
+ self.code_gen_dict["$DEFINES$"] = [
+ """#define ImgDim1 {}\n#define OutputDim1 {}\n
+ #define PaddingBefore1 {}\n#define PaddingBehind1 {}\n
+ #define NumChannels1 {}\n#define SIMD1 {}\n
+ #define numReps {}\n""".format(
+ idim_h,
+ odim_h,
+ pad[0],
+ pad[2],
+ self.get_nodeattr("NumChannels"),
+ self.get_nodeattr("SIMD"),
+ self.get_nodeattr("numInputVectors"),
+ )
+ ]
+ else:
+ self.code_gen_dict["$DEFINES$"] = [
+ """
+ #define OutputDim1_x {}\n
+ #define OutputDim1_y {}\n
+ #define PaddingLeft1 {}\n
+ #define PaddingRight1 {}\n
+ #define PaddingTop1 {}\n
+ #define PaddingBottom1 {}\n
+ #define NumChannels1 {}\n
+ #define SIMD1 {}\n
+ #define numReps {}\n
+ """.format(
+ odim_w,
+ odim_h,
+ pad[1],
+ pad[3],
+ pad[0],
+ pad[2],
+ self.get_nodeattr("NumChannels"),
+ self.get_nodeattr("SIMD"),
+ self.get_nodeattr("numInputVectors"),
+ )
+ ]
+
+ def docompute(self):
+ in_t = self.get_input_datatype().get_hls_datatype_str()
+ idim_h, idim_w = self.get_nodeattr("ImgDim")
+ pad = self.get_nodeattr("Padding")
+ pad_h = pad[0] + pad[2]
+ pad_w = pad[1] + pad[3]
+ is_square_img = idim_h == idim_w
+ is_square_pad = pad_h == pad_w
+
+ if is_square_img and is_square_pad:
+ hls_call = "FMPadding_Batch"
+ self.code_gen_dict["$DOCOMPUTE$"] = [
+ """{} (in0_{}, out_{}, numReps);""".format(
+ hls_call, in_t, self.hls_sname(), self.hls_sname()
+ )
+ ]
+ else:
+ hls_call = "FMPadding_nonsquare_Batch"
+ self.code_gen_dict["$DOCOMPUTE$"] = [
+ """{} (in0_{}, out_{}, numReps);""".format(
+ hls_call, in_t, self.hls_sname(), self.hls_sname()
+ )
+ ]
+
+ def blackboxfunction(self):
+ packed_bits = self.get_instream_width()
+ packed_hls_type = "ap_uint<%d>" % packed_bits
+ self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+ "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)"
+ % (
+ self.onnx_node.name,
+ packed_hls_type,
+ self.hls_sname(),
+ packed_hls_type,
+ self.hls_sname(),
+ )
+ ]
+
+ def execute_node(self, context, graph):
+ mode = self.get_nodeattr("exec_mode")
+ node = self.onnx_node
+ exp_ishape = self.get_normal_input_shape()
+ exp_oshape = self.get_normal_output_shape()
+ folded_ishape = self.get_folded_input_shape()
+
+ if mode == "cppsim":
+ code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+ elif mode == "rtlsim":
+ code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+ else:
+ raise Exception(
+ """Invalid value for attribute exec_mode! Is currently set to: {}
+ has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+ mode
+ )
+ )
+
+ inp = context[node.input[0]]
+ assert str(inp.dtype) == "float32", "Input datatype is not float32"
+ assert (
+ inp.shape == exp_ishape
+ ), """Input shape doesn't
+ match expected shape (1, ImgDim_h, ImgDim_w, NumChannels)."""
+ export_idt = self.get_input_datatype()
+
+ reshaped_input = inp.reshape(folded_ishape)
+ np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
+
+ if mode == "cppsim":
+ # execute the precompiled model
+ super().exec_precompiled_singlenode_model()
+ # load output npy file
+ super().npy_to_dynamic_output(context)
+ assert (
+ context[node.output[0]].shape == exp_oshape
+ ), "cppsim did not produce expected output shape"
+ elif mode == "rtlsim":
+ sim = self.get_rtlsim()
+ nbits = self.get_instream_width()
+ rtlsim_inp = npy_to_rtlsim_input(
+ "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+ )
+ super().reset_rtlsim(sim)
+ super().toggle_clk(sim)
+ rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+ odt = export_idt
+ target_bits = odt.bitwidth()
+ packed_bits = self.get_outstream_width()
+ out_npy_path = "{}/output.npy".format(code_gen_dir)
+ out_shape = self.get_folded_output_shape()
+ rtlsim_output_to_npy(
+ rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+ )
+ # load and reshape output
+ output = np.load(out_npy_path)
+ output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+ context[node.output[0]] = output
+ else:
+ raise Exception(
+ """Invalid value for attribute exec_mode! Is currently set to: {}
+ has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+ mode
+ )
+ )
+ assert (
+ context[node.output[0]].shape == exp_oshape
+ ), """Output shape doesn't match expected shape
+ (1, OutputDim_H, OutputDim_W, NumChannels)."""
diff --git a/src/finn/custom_op/fpgadataflow/hls/fmpadding_pixel_hls.py b/src/finn/custom_op/fpgadataflow/hls/fmpadding_pixel_hls.py
new file mode 100644
index 0000000000..b7ba301fbc
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/hls/fmpadding_pixel_hls.py
@@ -0,0 +1,167 @@
+# Copyright (c) 2024, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# * Neither the name of Xilinx nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+
+import numpy as np
+import os
+
+from finn.custom_op.fpgadataflow.fmpadding_pixel import FMPadding_Pixel
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+
+class FMPadding_Pixel_hls(FMPadding_Pixel, HLSBackend):
+ def __init__(self, onnx_node, **kwargs):
+ super().__init__(onnx_node, **kwargs)
+
+ def get_nodeattr_types(self):
+ my_attrs = {}
+ my_attrs.update(FMPadding_Pixel.get_nodeattr_types(self))
+ my_attrs.update(HLSBackend.get_nodeattr_types(self))
+ return my_attrs
+
+ def global_includes(self):
+ self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"']
+
+ def defines(self, var):
+ odim_h, odim_w = self.get_padded_odim()
+ stride_h, stride_w = self.get_nodeattr("Stride")
+ self.code_gen_dict["$DEFINES$"] = [
+ """
+ #define OutputDim_x {}\n
+ #define OutputDim_y {}\n
+ #define Stride_x {}\n
+ #define Stride_y {}\n
+ #define NumChannels {}\n
+ #define SIMD {}\n
+ """.format(
+ odim_w,
+ odim_h,
+ stride_w,
+ stride_h,
+ self.get_nodeattr("NumChannels"),
+ self.get_nodeattr("SIMD"),
+ )
+ ]
+
+ def docompute(self):
+ in_t = self.get_input_datatype().get_hls_datatype_str()
+ odim_h, odim_w = self.get_padded_odim()
+ stride_h, stride_w = self.get_nodeattr("Stride")
+ hls_call = "FMPadding_Pixel_Nonsquare"
+ self.code_gen_dict["$DOCOMPUTE$"] = [
+ """{} (in0_{}, out_{});""".format(
+ hls_call, in_t, self.hls_sname(), self.hls_sname()
+ )
+ ]
+
+ def blackboxfunction(self):
+ packed_bits = self.get_instream_width()
+ packed_hls_type = "ap_uint<%d>" % packed_bits
+ self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+ "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)"
+ % (
+ self.onnx_node.name,
+ packed_hls_type,
+ self.hls_sname(),
+ packed_hls_type,
+ self.hls_sname(),
+ )
+ ]
+
+ def execute_node(self, context, graph):
+ mode = self.get_nodeattr("exec_mode")
+ node = self.onnx_node
+ exp_ishape = self.get_normal_input_shape()
+ exp_oshape = self.get_normal_output_shape()
+ folded_ishape = self.get_folded_input_shape()
+
+ if mode == "cppsim":
+ code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+ elif mode == "rtlsim":
+ code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+ else:
+ raise Exception(
+ """Invalid value for attribute exec_mode! Is currently set to: {}
+ has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+ mode
+ )
+ )
+
+ inp = context[node.input[0]]
+ assert str(inp.dtype) == "float32", "Input datatype is not float32"
+ assert (
+ inp.shape == exp_ishape
+ ), """Input shape doesn't
+ match expected shape (1, ImgDim_h, ImgDim_w, NumChannels)."""
+ export_idt = self.get_input_datatype()
+
+ reshaped_input = inp.reshape(folded_ishape)
+ np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
+
+ if mode == "cppsim":
+ # execute the precompiled model
+ super().exec_precompiled_singlenode_model()
+ # load output npy file
+ super().npy_to_dynamic_output(context)
+ assert (
+ context[node.output[0]].shape == exp_oshape
+ ), "cppsim did not produce expected output shape"
+ elif mode == "rtlsim":
+ sim = self.get_rtlsim()
+ nbits = self.get_instream_width()
+ rtlsim_inp = npy_to_rtlsim_input(
+ "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+ )
+ super().reset_rtlsim(sim)
+ super().toggle_clk(sim)
+ rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+ odt = export_idt
+ target_bits = odt.bitwidth()
+ packed_bits = self.get_outstream_width()
+ out_npy_path = "{}/output.npy".format(code_gen_dir)
+ out_shape = self.get_folded_output_shape()
+ rtlsim_output_to_npy(
+ rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+ )
+ # load and reshape output
+ output = np.load(out_npy_path)
+ output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+ context[node.output[0]] = output
+ else:
+ raise Exception(
+ """Invalid value for attribute exec_mode! Is currently set to: {}
+ has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+ mode
+ )
+ )
+ assert (
+ context[node.output[0]].shape == exp_oshape
+ ), """Output shape doesn't match expected shape
+ (1, OutputDim_H, OutputDim_W, NumChannels)."""
diff --git a/src/finn/custom_op/fpgadataflow/hls/globalaccpool_hls.py b/src/finn/custom_op/fpgadataflow/hls/globalaccpool_hls.py
new file mode 100644
index 0000000000..9b2a7b25b0
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/hls/globalaccpool_hls.py
@@ -0,0 +1,176 @@
+# Copyright (C) 2023, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import os
+
+from finn.custom_op.fpgadataflow.globalaccpool import GlobalAccPool
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+
+class GlobalAccPool_hls(GlobalAccPool, HLSBackend):
+ """Class that corresponds to finn-hlslib AccPool_Batch function."""
+
+ def __init__(self, onnx_node, **kwargs):
+ super().__init__(onnx_node, **kwargs)
+
+ def get_nodeattr_types(self):
+ my_attrs = {}
+ my_attrs.update(GlobalAccPool.get_nodeattr_types(self))
+ my_attrs.update(HLSBackend.get_nodeattr_types(self))
+ return my_attrs
+
+ def verify_node(self):
+ info_messages = []
+ # verify that "backend" is set to "fpgadataflow"
+ backend_value = self.get_nodeattr("backend")
+ if backend_value == "fpgadataflow":
+ info_messages.append("Attribute backend is set correctly")
+ else:
+ info_messages.append('Attribute backend should be set to "fpgadataflow"')
+
+ # verify that all necessary attributes exist
+ try:
+ self.get_nodeattr("code_gen_dir_cppsim")
+ self.get_nodeattr("executable_path")
+ self.get_nodeattr("NumChannels")
+ self.get_nodeattr("PE")
+ self.get_nodeattr("inputDataType")
+ info_messages.append("All necessary attributes exist")
+ except Exception:
+ info_messages.append("""The required GlobalAccPool_Batch attributes do not exist.""")
+
+ # verify that input data is 2D
+ if len(self.get_nodeattr("numInputVectors")) != 3:
+ info_messages.append("""GlobalAccPool_Batch requires 2D data input.""")
+ raise Exception
+
+ return info_messages
+
+ def execute_node(self, context, graph):
+ mode = self.get_nodeattr("exec_mode")
+ node = self.onnx_node
+ exp_ishape = self.get_normal_input_shape()
+ exp_oshape = self.get_normal_output_shape()
+ folded_ishape = self.get_folded_input_shape()
+
+ if mode == "cppsim":
+ code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+ elif mode == "rtlsim":
+ code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+ else:
+ raise Exception(
+ """Invalid value for attribute exec_mode! Is currently set to: {}
+ has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+ mode
+ )
+ )
+
+ inp = context[node.input[0]]
+ assert str(inp.dtype) == "float32", "Input datatype is not float32"
+ assert inp.shape == exp_ishape, """Input shape doesn't match expected shape ."""
+ export_idt = self.get_input_datatype()
+ # reshape input into folded form
+ inp = inp.reshape(folded_ishape)
+ # make copy before saving array
+ reshaped_input = inp.copy()
+ np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
+
+ if mode == "cppsim":
+ # execute the precompiled model
+ super().exec_precompiled_singlenode_model()
+ # load output npy file
+ super().npy_to_dynamic_output(context)
+ assert (
+ context[node.output[0]].shape == exp_oshape
+ ), "cppsim \
+ did not produce expected output shape"
+ elif mode == "rtlsim":
+ sim = self.get_rtlsim()
+ nbits = self.get_instream_width()
+ rtlsim_inp = npy_to_rtlsim_input(
+ "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+ )
+ super().reset_rtlsim(sim)
+ super().toggle_clk(sim)
+ rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+ odt = self.get_output_datatype()
+ target_bits = odt.bitwidth()
+ packed_bits = self.get_outstream_width()
+ out_npy_path = "{}/output.npy".format(code_gen_dir)
+ out_shape = self.get_folded_output_shape()
+ rtlsim_output_to_npy(
+ rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+ )
+ # load and reshape output
+ output = np.load(out_npy_path)
+ output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+ context[node.output[0]] = output
+ else:
+ raise Exception(
+ """Invalid value for attribute exec_mode! Is currently set to: {}
+ has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+ mode
+ )
+ )
+
+ assert (
+ context[node.output[0]].shape == exp_oshape
+ ), """Output shape doesn't match expected shape."""
+
+ def global_includes(self):
+ self.code_gen_dict["$GLOBALS$"] = ['#include "maxpool.h"']
+
+ def defines(self, var):
+ self.code_gen_dict["$DEFINES$"] = []
+
+ def docompute(self):
+ self.code_gen_dict["$DOCOMPUTE$"] = [
+ """AccPool_Batch<{}, {}, {}, {}, {}> (in0_{}, out_{}, 1);""".format(
+ self.get_normal_input_shape()[1],
+ self.get_nodeattr("NumChannels"),
+ self.get_input_datatype().get_hls_datatype_str(),
+ self.get_nodeattr("PE"),
+ self.get_output_datatype().get_hls_datatype_str(),
+ self.hls_sname(),
+ self.hls_sname(),
+ )
+ ]
+
+ def blackboxfunction(self):
+ self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+ """void {}(hls::stream> &in0_{},
+ hls::stream> &out_{})""".format(
+ self.onnx_node.name,
+ self.get_instream_width(),
+ self.hls_sname(),
+ self.get_outstream_width(),
+ self.hls_sname(),
+ )
+ ]
diff --git a/src/finn/custom_op/fpgadataflow/iodma.py b/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py
similarity index 81%
rename from src/finn/custom_op/fpgadataflow/iodma.py
rename to src/finn/custom_op/fpgadataflow/hls/iodma_hls.py
index 65683079fc..8d9903f0f5 100644
--- a/src/finn/custom_op/fpgadataflow/iodma.py
+++ b/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2020, Xilinx
+# Copyright (c) 2020-2022, Xilinx, Inc.
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -31,7 +32,8 @@
import warnings
from qonnx.core.datatype import DataType
-from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
# the IODMA inerfaces a memory-mapped AXI interface and an AXI stream
# direction "in": pulls data from AXI-MM to AXI stream
@@ -47,7 +49,7 @@
# Interfaces
# - AXI-MM name specified by intfName unless this is set to "" (empty, the default)
-# in which case output AXI-MM are named "out" and input AXI-MM are named "in0"
+# in which case output AXI-MM are named "out_V" and input AXI-MM are named "in0_V"
# - AXI-MM interface width (in bits) is specified by intfWidth
# - AXI-Stream interface width (in bits) is specified by streamWidth
# - If inftWidth and streamWidth are not equal, the DMA core performs
@@ -72,11 +74,11 @@
# -the folded shape is not defined
-class IODMA(HLSCustomOp):
+class IODMA_hls(HWCustomOp, HLSBackend):
"""Class that corresponds to finn-hlslib DMA function(s)."""
- def __init__(self, onnx_node):
- super().__init__(onnx_node)
+ def __init__(self, onnx_node, **kwargs):
+ super().__init__(onnx_node, **kwargs)
def get_nodeattr_types(self):
my_attrs = {
@@ -97,7 +99,8 @@ def get_nodeattr_types(self):
# name of axi-mm interface
"intfName": ("s", False, ""),
}
- my_attrs.update(super().get_nodeattr_types())
+ my_attrs.update(HWCustomOp.get_nodeattr_types(self))
+ my_attrs.update(HLSBackend.get_nodeattr_types(self))
return my_attrs
def get_normal_input_shape(self, ind=0):
@@ -116,9 +119,7 @@ def get_folded_input_shape(self, ind=0):
shape = list(self.get_normal_input_shape())
itype_bits = self.get_input_datatype().bitwidth()
intfw = self.get_nodeattr("streamWidth")
- assert (
- intfw % itype_bits == 0
- ), "Input stream width must be a multiple of datatype bits"
+ assert intfw % itype_bits == 0, "Input stream width must be a multiple of datatype bits"
elems_per_word = intfw // itype_bits
assert shape[-1] % elems_per_word == 0, "Fold depth must be integer"
fold_depth = shape[-1] // elems_per_word
@@ -133,9 +134,7 @@ def get_folded_output_shape(self, ind=0):
shape = list(self.get_normal_output_shape())
itype_bits = self.get_output_datatype().bitwidth()
intfw = self.get_nodeattr("streamWidth")
- assert (
- intfw % itype_bits == 0
- ), "Input stream width must be a multiple of datatype bits"
+ assert intfw % itype_bits == 0, "Input stream width must be a multiple of datatype bits"
elems_per_word = intfw // itype_bits
assert shape[-1] % elems_per_word == 0, "Fold depth must be integer"
fold_depth = shape[-1] // elems_per_word
@@ -196,9 +195,7 @@ def get_number_output_values(self):
stream_width = self.get_nodeattr("streamWidth")
nelems = np.prod(oshape)
nbits = nelems * itype_bits
- assert (
- nbits % stream_width == 0
- ), "DMA: total transfer size must be word multiple"
+ assert nbits % stream_width == 0, "DMA: total transfer size must be word multiple"
ovalues = nbits // stream_width
return ovalues
@@ -254,15 +251,23 @@ def docompute(self):
# DWCs depend on AXI MM and out interface width
if strmw == intfw:
# case 0: AXI MM width = out width, no DWCs needed
- self.code_gen_dict["$DOCOMPUTE$"] = [dma_inst_template % ("in0", "out")]
+ self.code_gen_dict["$DOCOMPUTE$"] = [
+ dma_inst_template % ("in0_" + self.hls_sname(), "out_" + self.hls_sname())
+ ]
elif (strmw % intfw == 0) or (intfw % strmw == 0):
# case 1: AXI MM width divisible by out width or vice versa
# single DWC + single extra stream needed
self.code_gen_dict["$DOCOMPUTE$"] = [
"hls::stream > dma2dwc;" % intfw,
- dma_inst_template % ("in0", "dma2dwc"),
+ dma_inst_template % ("in0_" + self.hls_sname(), "dma2dwc"),
dwc_inst_template
- % (intfw, strmw, total_bits // intfw, "dma2dwc", "out"),
+ % (
+ intfw,
+ strmw,
+ total_bits // intfw,
+ "dma2dwc",
+ "out_" + self.hls_sname(),
+ ),
]
else:
# case 2: AXI MM width not divisible by out width or vice versa
@@ -271,26 +276,40 @@ def docompute(self):
self.code_gen_dict["$DOCOMPUTE$"] = [
"hls::stream > dma2lcm;" % intfw,
"hls::stream > lcm2out;" % width_lcm,
- dma_inst_template % ("in0", "dma2lcm"),
+ dma_inst_template % ("in0_" + self.hls_sname(), "dma2lcm"),
dwc_inst_template
% (intfw, width_lcm, total_bits // intfw, "dma2lcm", "lcm2out"),
dwc_inst_template
- % (width_lcm, strmw, total_bits // width_lcm, "lcm2out", "out"),
+ % (
+ width_lcm,
+ strmw,
+ total_bits // width_lcm,
+ "lcm2out",
+ "out_" + self.hls_sname(),
+ ),
]
elif direction == "out":
# in0 -> (DWCs) -> IODMA -> AXI MM
# DWCs depend on AXI MM and out interface width
if strmw == intfw:
# case 0: in width = AXI MM width, no DWCs needed
- self.code_gen_dict["$DOCOMPUTE$"] = [dma_inst_template % ("in0", "out")]
+ self.code_gen_dict["$DOCOMPUTE$"] = [
+ dma_inst_template % ("in0_" + self.hls_sname(), "out_" + self.hls_sname())
+ ]
elif (strmw % intfw == 0) or (intfw % strmw == 0):
# case 1: AXI MM width divisible by in width or vice versa
# single DWC + single extra stream needed
self.code_gen_dict["$DOCOMPUTE$"] = [
"hls::stream > dwc2dma;" % intfw,
dwc_inst_template
- % (strmw, intfw, total_bits // strmw, "in0", "dwc2dma"),
- dma_inst_template % ("dwc2dma", "out"),
+ % (
+ strmw,
+ intfw,
+ total_bits // strmw,
+ "in0_" + self.hls_sname(),
+ "dwc2dma",
+ ),
+ dma_inst_template % ("dwc2dma", "out_" + self.hls_sname()),
]
else:
# case 2: AXI MM width not divisible by out width or vice versa
@@ -300,10 +319,16 @@ def docompute(self):
"hls::stream > in2lcm;" % width_lcm,
"hls::stream > lcm2dma;" % intfw,
dwc_inst_template
- % (strmw, width_lcm, total_bits // strmw, "in0", "in2lcm"),
+ % (
+ strmw,
+ width_lcm,
+ total_bits // strmw,
+ "in0_" + self.hls_sname(),
+ "in2lcm",
+ ),
dwc_inst_template
% (width_lcm, intfw, total_bits // width_lcm, "in2lcm", "lcm2dma"),
- dma_inst_template % ("lcm2dma", "out"),
+ dma_inst_template % ("lcm2dma", "out_" + self.hls_sname()),
]
else:
raise Exception("Unknown IODMA direction: %s" % direction)
@@ -316,13 +341,25 @@ def blackboxfunction(self):
direction = self.get_nodeattr("direction")
if direction == "in":
self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
- "void %s(%s *in0, hls::stream<%s > &out, unsigned int numReps)"
- % (self.onnx_node.name, packed_hls_type_in, packed_hls_type_out)
+ "void %s(%s *in0_%s, hls::stream<%s > &out_%s, unsigned int numReps)"
+ % (
+ self.onnx_node.name,
+ packed_hls_type_in,
+ self.hls_sname(),
+ packed_hls_type_out,
+ self.hls_sname(),
+ )
]
elif direction == "out":
self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
- "void %s(hls::stream<%s > &in0, %s *out, unsigned int numReps)"
- % (self.onnx_node.name, packed_hls_type_in, packed_hls_type_out)
+ "void %s(hls::stream<%s > &in0_%s, %s *out_%s, unsigned int numReps)"
+ % (
+ self.onnx_node.name,
+ packed_hls_type_in,
+ self.hls_sname(),
+ packed_hls_type_out,
+ self.hls_sname(),
+ )
]
else:
raise ValueError("Invalid IODMA direction, please set to in or out")
@@ -339,32 +376,32 @@ def pragmas(self):
if direction == "in":
if intfname == "":
self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE m_axi offset=slave port=in0"
+ "#pragma HLS INTERFACE m_axi offset=slave port=in0_" + self.hls_sname()
)
else:
self.code_gen_dict["$PRAGMAS$"].append(
"#pragma HLS INTERFACE m_axi offset=slave port=%s" % (intfname)
)
self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE s_axilite port=in0 bundle=control"
+ "#pragma HLS INTERFACE s_axilite port=in0_%s bundle=control" % (self.hls_sname())
)
self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname()
+ "#pragma HLS INTERFACE axis port=out_" + self.hls_sname()
)
elif direction == "out":
self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname()
+ "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname()
)
if intfname == "":
self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE m_axi offset=slave port=out"
+ "#pragma HLS INTERFACE m_axi offset=slave port=out_" + self.hls_sname()
)
else:
self.code_gen_dict["$PRAGMAS$"].append(
"#pragma HLS INTERFACE m_axi offset=slave port=%s" % (intfname)
)
self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE s_axilite port=out bundle=control"
+ "#pragma HLS INTERFACE s_axilite port=out_%s bundle=control" % (self.hls_sname())
)
else:
raise ValueError("Invalid IODMA direction, please set to in or out")
@@ -373,18 +410,6 @@ def pragmas(self):
def execute_node(self, context, graph):
pass
- def dataoutstrm(self):
- pass
-
- def read_npy_data(self):
- pass
-
- def save_as_npy(self):
- pass
-
- def strm_decl(self):
- pass
-
def get_verilog_top_module_intf_names(self):
intf_names = super().get_verilog_top_module_intf_names()
if self.get_nodeattr("direction") == "out":
diff --git a/src/finn/custom_op/fpgadataflow/labelselect_batch.py b/src/finn/custom_op/fpgadataflow/hls/labelselect_hls.py
similarity index 54%
rename from src/finn/custom_op/fpgadataflow/labelselect_batch.py
rename to src/finn/custom_op/fpgadataflow/hls/labelselect_hls.py
index 03f89bd7ec..1e2c0d034a 100644
--- a/src/finn/custom_op/fpgadataflow/labelselect_batch.py
+++ b/src/finn/custom_op/fpgadataflow/hls/labelselect_hls.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, Xilinx
+# Copyright (C) 2023, Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -28,99 +28,24 @@
import numpy as np
import os
-from onnx import TensorProto, helper
-from qonnx.core.datatype import DataType
-from qonnx.util.basic import roundup_to_integer_multiple
-from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+from finn.custom_op.fpgadataflow.labelselect import LabelSelect
from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
-class LabelSelect_Batch(HLSCustomOp):
+class LabelSelect_hls(LabelSelect, HLSBackend):
"""Class that corresponds to finn-hlslib LabelSelect_Batch function."""
- def __init__(self, onnx_node):
- super().__init__(onnx_node)
- odt_name = self.get_nodeattr("outputDataType")
- if odt_name == "":
- # If not provided compute min size
- labels = self.get_nodeattr("Labels")
- odt = DataType.get_smallest_possible(labels - 1)
- # ensure a datatype divisible by 8-bits in case this is the last node
- bw = roundup_to_integer_multiple(odt.bitwidth(), 8)
- new_odt_name = odt.name.replace(str(odt.bitwidth()), str(bw))
- odt = DataType[new_odt_name]
- odt_name = odt.name
- self.set_nodeattr("outputDataType", odt_name)
+ def __init__(self, onnx_node, **kwargs):
+ super().__init__(onnx_node, **kwargs)
def get_nodeattr_types(self):
- my_attrs = {
- "Labels": ("i", True, 0),
- "PE": ("i", True, 0),
- "K": ("i", True, 0),
- # FINN DataTypes for input
- "inputDataType": ("s", True, ""),
- "outputDataType": ("s", False, ""),
- # number of input vectors, examples:
- # [1] is a single vector (like a FC layer with batch=1)
- # [4] is four vectors (like a FC layer with batch=4)
- # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
- "numInputVectors": ("ints", False, [1]),
- }
- my_attrs.update(super().get_nodeattr_types())
+ my_attrs = {}
+ my_attrs.update(LabelSelect.get_nodeattr_types(self))
+ my_attrs.update(HLSBackend.get_nodeattr_types(self))
return my_attrs
- def get_normal_input_shape(self, ind=0):
- nlabels = self.get_nodeattr("Labels")
- vecs = list(self.get_nodeattr("numInputVectors"))
- ishape = tuple(vecs + [nlabels])
- return ishape
-
- def get_folded_input_shape(self, ind=0):
- nlabels = self.get_nodeattr("Labels")
- pe = self.get_nodeattr("PE")
- vecs = list(self.get_nodeattr("numInputVectors"))
- assert nlabels % pe == 0, "PE must divide Labels"
- folds = int(nlabels / pe)
- folded_ishape = tuple(vecs + [folds, pe])
- return folded_ishape
-
- def get_normal_output_shape(self, ind=0):
- k = self.get_nodeattr("K")
- vecs = list(self.get_nodeattr("numInputVectors"))
- oshape = tuple(vecs + [k])
- return oshape
-
- def get_folded_output_shape(self, ind=0):
- k = self.get_nodeattr("K")
- vecs = list(self.get_nodeattr("numInputVectors"))
- oshape = tuple(vecs + [k, 1])
- return oshape
-
- def make_shape_compatible_op(self, model):
- exp_ishape = self.get_normal_input_shape()
- oshape = self.get_normal_output_shape()
- ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
- assert ishape == exp_ishape, "Unexpected input shape."
- return helper.make_node(
- "RandomNormal",
- inputs=[],
- outputs=[self.onnx_node.output[0]],
- mean=0.0,
- scale=1.0,
- dtype=TensorProto.INT64,
- shape=list(oshape),
- )
-
- def infer_node_datatype(self, model):
- node = self.onnx_node
- # check input datatype against property
- idt = model.get_tensor_datatype(node.input[0])
- self.set_nodeattr("inputDataType", idt.name)
-
- odt = self.get_output_datatype()
- model.set_tensor_datatype(self.onnx_node.output[0], odt)
-
def verify_node(self):
info_messages = []
# verify that "backend" is set to "fpgadataflow"
@@ -141,9 +66,7 @@ def verify_node(self):
self.get_nodeattr("outputDataType")
info_messages.append("All necessary attributes exist")
except Exception:
- info_messages.append(
- """The required LabelSelect_Batch attributes do not exist."""
- )
+ info_messages.append("""The required LabelSelect_Batch attributes do not exist.""")
# verify that input data is 1D
if len(self.get_nodeattr("numInputVectors")) > 1:
@@ -152,30 +75,6 @@ def verify_node(self):
return info_messages
- def get_input_datatype(self, ind=0):
- """Returns FINN DataType of input."""
- ret = DataType[self.get_nodeattr("inputDataType")]
- return ret
-
- def get_output_datatype(self, ind=0):
- """Returns FINN DataType of output."""
- ret = DataType[self.get_nodeattr("outputDataType")]
- return ret
-
- def get_instream_width(self, ind=0):
- """Returns input stream width."""
- ibits = self.get_input_datatype().bitwidth()
- pe = self.get_nodeattr("PE")
- in_width = pe * ibits
- return in_width
-
- def get_outstream_width(self, ind=0):
- """Returns output stream width."""
- return self.get_output_datatype().bitwidth()
-
- def get_number_output_values(self):
- return self.get_nodeattr("K")
-
def execute_node(self, context, graph):
mode = self.get_nodeattr("exec_mode")
node = self.onnx_node
@@ -275,83 +174,39 @@ def read_npy_data(self):
# Also notice that StreamingDataWidthConverter_Batch performs LE packing
self.code_gen_dict["$READNPYDATA$"].append(
- 'npy2apintstream<%s, %s, %d, %s>("%s", in0,false);'
- % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
- )
-
- def strm_decl(self):
- self.code_gen_dict["$STREAMDECLARATIONS$"] = []
- self.code_gen_dict["$STREAMDECLARATIONS$"].append(
- 'hls::stream<ap_uint<{}>> in0 ("in0");'.format(self.get_instream_width())
- )
- self.code_gen_dict["$STREAMDECLARATIONS$"].append(
- 'hls::stream<ap_uint<{}>> out ("out");'.format(self.get_outstream_width())
+ 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);'
+ % (
+ packed_hls_type,
+ elem_hls_type,
+ elem_bits,
+ npy_type,
+ npy_in,
+ self.hls_sname(),
+ )
)
def docompute(self):
- node = self.onnx_node
self.code_gen_dict["$DOCOMPUTE$"] = [
- """{}<{}, {}, {}, {}, {} > (in0, out, 1);""".format(
- node.op_type,
+ """LabelSelect_Batch<{}, {}, {}, {}, {} > (in0_{}, out_{}, 1);""".format(
self.get_nodeattr("Labels"),
self.get_nodeattr("PE"),
self.get_nodeattr("K"),
self.get_input_datatype().get_hls_datatype_str(),
self.get_output_datatype().get_hls_datatype_str(),
+ self.hls_sname(),
+ self.hls_sname(),
)
]
- def dataoutstrm(self):
- code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
- dtype = self.get_output_datatype()
- elem_bits = dtype.bitwidth()
- packed_bits = self.get_outstream_width()
- packed_hls_type = "ap_uint<%d>" % packed_bits
- elem_hls_type = dtype.get_hls_datatype_str()
- npy_type = "float"
- npy_out = "%s/output.npy" % code_gen_dir
- oshape = self.get_folded_output_shape()
- oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
-
- self.code_gen_dict["$DATAOUTSTREAM$"] = [
- 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");'
- % (
- packed_hls_type,
- elem_hls_type,
- elem_bits,
- npy_type,
- oshape_cpp_str,
- npy_out,
- )
- ]
-
- def save_as_npy(self):
- self.code_gen_dict["$SAVEASCNPY$"] = []
-
def blackboxfunction(self):
self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
- """void {}(hls::stream<ap_uint<{}*{}>> &in0,
- hls::stream<ap_uint<{}> > &out)""".format(
+ """void {}(hls::stream<ap_uint<{}*{}>> &in0_{},
+ hls::stream<ap_uint<{}> > &out_{})""".format(
self.onnx_node.name,
self.get_nodeattr("PE"),
self.get_input_datatype().bitwidth(),
+ self.hls_sname(),
self.get_output_datatype().bitwidth(),
+ self.hls_sname(),
)
]
-
- def pragmas(self):
- self.code_gen_dict["$PRAGMAS$"] = [
- "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname()
- ]
- self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname()
- )
- self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE ap_ctrl_none port=return"
- )
-
- def get_exp_cycles(self):
- nlabels = self.get_nodeattr("Labels")
- pe = self.get_nodeattr("PE")
- exp_cycles = nlabels / pe
- return int(exp_cycles)
diff --git a/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py b/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py
new file mode 100644
index 0000000000..ba44deb898
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py
@@ -0,0 +1,337 @@
+# Copyright (C) 2023, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import os
+from math import ceil, log2
+from qonnx.core.datatype import DataType
+
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+from finn.custom_op.fpgadataflow.lookup import Lookup
+from finn.util.data_packing import (
+ npy_to_rtlsim_input,
+ numpy_to_hls_code,
+ pack_innermost_dim_as_hex_string,
+ rtlsim_output_to_npy,
+)
+
+
+class Lookup_hls(Lookup, HLSBackend):
+ "Streaming elementwise HLS lookup, mapping indices to values."
+
+ def __init__(self, onnx_node, **kwargs):
+ super().__init__(onnx_node, **kwargs)
+
+ def get_nodeattr_types(self):
+ my_attrs = {}
+ my_attrs.update(Lookup.get_nodeattr_types(self))
+ my_attrs.update(HLSBackend.get_nodeattr_types(self))
+ return my_attrs
+
+ def global_includes(self):
+ mem_mode = self.get_nodeattr("mem_mode")
+ global_incls = []
+ global_incls.append('#include "lookup.hpp"')
+ if mem_mode == "internal_embedded":
+ global_incls.append('#include "embeddings.hpp"')
+ self.code_gen_dict["$GLOBALS$"] = global_incls
+
+ def defines(self, var):
+ n_inputs = np.prod(self.get_folded_input_shape()[:-1])
+ dtype = self.get_input_datatype()
+ elem_hls_type = dtype.get_hls_datatype_str()
+ emb_type = DataType[self.get_nodeattr("EmbeddingType")]
+ emb_hls_type = emb_type.get_hls_datatype_str()
+ emb_dim = self.get_nodeattr("EmbeddingDim")
+ mem_mode = self.get_nodeattr("mem_mode")
+ my_defines = []
+ my_defines.append("#define NumInputs %d" % n_inputs)
+ if mem_mode == "external":
+ ext_mem_width = self.get_nodeattr("ext_mem_width")
+ ext_mem_emb_size = self.get_folded_output_shape()[-2]
+ ext_mem_emb_align = ceil(log2(ext_mem_emb_size))
+ my_defines.append("#define MemBits %d" % ext_mem_width)
+ my_defines.append("#define EmbeddingSize %d" % ext_mem_emb_size)
+ my_defines.append("#define EmbeddingAlign %d" % ext_mem_emb_align)
+ my_defines.append("#define T_SRC %s" % elem_hls_type)
+ my_defines.append("#define T_DST ap_uint<MemBits>")
+ elif mem_mode == "internal_embedded":
+ my_defines.append("#define NumEmbeddings %d" % self.get_nodeattr("NumEmbeddings"))
+ my_defines.append("#define EmbeddingDim %d" % emb_dim)
+ my_defines.append("#define InputType %s" % elem_hls_type)
+ my_defines.append("#define EmbeddingType %s" % emb_hls_type)
+ self.code_gen_dict["$DEFINES$"] = my_defines
+
+ def read_npy_data(self):
+ code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+ dtype = self.get_input_datatype()
+ if dtype == DataType["BIPOLAR"]:
+ # use binary for bipolar storage
+ dtype = DataType["BINARY"]
+ elem_bits = dtype.bitwidth()
+ packed_bits = self.get_instream_width()
+ packed_hls_type = "ap_uint<%d>" % packed_bits
+ elem_hls_type = dtype.get_hls_datatype_str()
+ npy_type = "int64_t"
+ npy_in = "%s/input_0.npy" % code_gen_dir
+ self.code_gen_dict["$READNPYDATA$"] = []
+ self.code_gen_dict["$READNPYDATA$"].append(
+ 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);'
+ % (
+ packed_hls_type,
+ elem_hls_type,
+ elem_bits,
+ npy_type,
+ npy_in,
+ self.hls_sname(),
+ )
+ )
+
+ def dataoutstrm(self):
+ code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+ dtype = self.get_output_datatype()
+ if dtype == DataType["BIPOLAR"]:
+ # use binary for bipolar storage
+ dtype = DataType["BINARY"]
+ elem_bits = dtype.bitwidth()
+ packed_bits = self.get_outstream_width()
+ packed_hls_type = "ap_uint<%d>" % packed_bits
+ elem_hls_type = dtype.get_hls_datatype_str()
+ npy_type = "float"
+ npy_out = "%s/output.npy" % code_gen_dir
+ oshape = self.get_folded_output_shape()
+ oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
+
+ self.code_gen_dict["$DATAOUTSTREAM$"] = [
+ 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", %s);'
+ % (
+ packed_hls_type,
+ elem_hls_type,
+ elem_bits,
+ npy_type,
+ self.hls_sname(),
+ oshape_cpp_str,
+ npy_out,
+ "false",
+ )
+ ]
+
+ def docompute(self):
+ mem_mode = self.get_nodeattr("mem_mode")
+ if mem_mode == "internal_embedded":
+ self.code_gen_dict["$DOCOMPUTE$"] = [
+ """StreamingLookup(in0_%s, out_%s, embeddings);"""
+ % (self.hls_sname(), self.hls_sname())
+ ]
+ elif mem_mode == "external":
+ self.code_gen_dict["$DOCOMPUTE$"] = [
+ """StreamingLookup_ext(in0_%s, out_%s, mem, size, oob_count,
+ oob_irq);"""
+ % (self.hls_sname(), self.hls_sname())
+ ]
+
+ def blackboxfunction(self):
+ mem_mode = self.get_nodeattr("mem_mode")
+ ibits = self.get_instream_width()
+ packed_input_hls_type = "ap_uint<%d>" % ibits
+ obits = self.get_outstream_width()
+ packed_output_hls_type = "ap_uint<%d>" % obits
+ if mem_mode == "internal_embedded":
+ self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+ "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)"
+ % (
+ self.onnx_node.name,
+ packed_input_hls_type,
+ self.hls_sname(),
+ packed_output_hls_type,
+ self.hls_sname(),
+ )
+ ]
+ elif mem_mode == "external":
+ self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+ "void "
+ + self.onnx_node.name
+ + "(hls::stream<T_SRC> &in0_%s, hls::stream<T_DST> &out_%s, "
+ % (self.hls_sname(), self.hls_sname())
+ + "T_DST const *const mem, unsigned const size, "
+ + "unsigned &oob_count, bool &oob_irq)"
+ ]
+
+ def pragmas(self):
+ mem_mode = self.get_nodeattr("mem_mode")
+ my_pragmas = ["#pragma HLS INTERFACE axis port=in0_" + self.hls_sname()]
+ my_pragmas.append("#pragma HLS INTERFACE axis port=out_" + self.hls_sname())
+ my_pragmas.append("#pragma HLS INTERFACE ap_ctrl_none port=return")
+ if mem_mode == "internal_embedded":
+ my_pragmas.append("#pragma HLS BIND_STORAGE variable=embeddings type=ROM_2P impl=BRAM")
+ elif mem_mode == "external":
+ my_pragmas.append("#pragma HLS INTERFACE m_axi offset=slave port=mem")
+ my_pragmas.append("#pragma HLS INTERFACE s_axilite port=mem bundle=control")
+ my_pragmas.append("#pragma HLS INTERFACE s_axilite port=size bundle=control")
+ my_pragmas.append("#pragma HLS INTERFACE s_axilite port=oob_count bundle=control")
+ my_pragmas.append("#pragma HLS INTERFACE ap_none port=oob_irq")
+ else:
+ raise Exception("Unrecognized mem_mode: " + mem_mode)
+ self.code_gen_dict["$PRAGMAS$"] = my_pragmas
+
+ def generate_params(self, model, path):
+ mem_mode = self.get_nodeattr("mem_mode")
+ embeddings = model.get_initializer(self.onnx_node.input[1])
+ if mem_mode == "internal_embedded":
+ code_gen_dir = path
+ weight_filename = "{}/embeddings.hpp".format(code_gen_dir)
+ edt = DataType[self.get_nodeattr("EmbeddingType")]
+ # obits = self.get_outstream_width()
+ # packed_output_hls_type = "ap_uint<%d>" % obits
+ assert np.vectorize(edt.allowed)(
+ embeddings
+ ).all(), "Embeddings can't be expressed with type %s" % str(edt)
+ # reverse innermost dim in embeddings to remain compatible with
+ # how we normally encode the data in FINN
+ embeddings_rev = np.flip(embeddings, -1)
+ embeddings_hls_code = numpy_to_hls_code(embeddings_rev, edt, "embeddings", True, False)
+ f_thresh = open(weight_filename, "w")
+ f_thresh.write(embeddings_hls_code)
+ f_thresh.close()
+ elif mem_mode == "external":
+ edt = DataType[self.get_nodeattr("EmbeddingType")]
+ ext_mem_width = self.get_nodeattr("ext_mem_width")
+ assert edt.bitwidth() == 8, (
+ "Lookup with mem_mode=external "
+ + "only works with 8-bit embeddings but found "
+ + str(edt)
+ )
+ emb_dim = self.get_nodeattr("EmbeddingDim")
+ # need to zero-pad embeddings in external mode for burst alignment
+ # compute how much padding we need
+ emb_elems_per_ext_mem_width = self.get_folded_output_shape()[-1]
+ ext_mem_emb_size = self.get_folded_output_shape()[-2]
+ ext_mem_emb_align = ceil(log2(ext_mem_emb_size))
+ align_factor = int((ext_mem_width / 8) * 2**ext_mem_emb_align)
+ pad_amount = align_factor - emb_dim
+ embeddings_padded = np.pad(embeddings, [(0, 0), (0, pad_amount)])
+ # reshape for packing the innermost dim
+ embeddings_padded = embeddings_padded.reshape(-1, emb_elems_per_ext_mem_width)
+ weight_filename = "%s/%s.dat" % (path, self.onnx_node.name)
+ ret = pack_innermost_dim_as_hex_string(
+ embeddings_padded, edt, ext_mem_width, True, prefix=""
+ )
+ with open(weight_filename, "w") as f:
+ for current_line in ret:
+ f.write(current_line + "\n")
+ else:
+ raise Exception("Unrecognized mem_mode: " + mem_mode)
+
+ def execute_node(self, context, graph):
+ mode = self.get_nodeattr("exec_mode")
+ node = self.onnx_node
+ exp_ishape = tuple(self.get_normal_input_shape())
+ exp_oshape = tuple(self.get_normal_output_shape())
+ folded_ishape = tuple(self.get_folded_input_shape())
+ folded_oshape = tuple(self.get_folded_output_shape())
+ mem_mode = self.get_nodeattr("mem_mode")
+ assert (
+ mem_mode == "internal_embedded"
+ ), "Only mem_mode=internal_embedded is supported for simulation of Lookup layer"
+
+ if mode == "cppsim":
+ code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+ elif mode == "rtlsim":
+ code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+ else:
+ raise Exception(
+ """Invalid value for attribute exec_mode! Is currently set to: {}
+ has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+ mode
+ )
+ )
+
+ inp = context[node.input[0]]
+ assert inp.dtype == np.int64, "Inputs must be contained in int64 ndarray"
+ assert inp.shape == exp_ishape, """Input shape doesn't match expected shape."""
+ export_idt = self.get_input_datatype()
+ odt = self.get_output_datatype()
+
+ reshaped_input = inp.reshape(folded_ishape)
+ np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
+
+ if mode == "cppsim":
+ # execute the precompiled model
+ super().exec_precompiled_singlenode_model()
+ # load output npy file
+ super().npy_to_dynamic_output(context)
+ assert (
+ context[node.output[0]].shape == folded_oshape
+ ), "cppsim did not produce expected folded output shape"
+ context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape)
+ elif mode == "rtlsim":
+ sim = self.get_rtlsim()
+ nbits = self.get_instream_width()
+ rtlsim_inp = npy_to_rtlsim_input(
+ "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+ )
+ super().reset_rtlsim(sim)
+ super().toggle_clk(sim)
+ rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+ target_bits = odt.bitwidth()
+ packed_bits = self.get_outstream_width()
+ out_npy_path = "{}/output.npy".format(code_gen_dir)
+ out_shape = self.get_folded_output_shape()
+ rtlsim_output_to_npy(
+ rtlsim_output,
+ out_npy_path,
+ odt,
+ out_shape,
+ packed_bits,
+ target_bits,
+ reverse_inner=True,
+ )
+ # load and reshape output
+ output = np.load(out_npy_path)
+ output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+ context[node.output[0]] = output
+ else:
+ raise Exception(
+ """Invalid value for attribute exec_mode! Is currently set to: {}
+ has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+ mode
+ )
+ )
+ assert (
+ context[node.output[0]].shape == exp_oshape
+ ), """Output shape doesn't match expected shape."""
+
+ def get_ap_int_max_w(self):
+ parent_max = super().get_ap_int_max_w()
+ mem_mode = self.get_nodeattr("mem_mode")
+ ext_mem_width = self.get_nodeattr("ext_mem_width")
+ if mem_mode == "external":
+ return max(ext_mem_width, parent_max)
+ else:
+ return parent_max
diff --git a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py
new file mode 100644
index 0000000000..94f8cc0845
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py
@@ -0,0 +1,590 @@
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import math
+import numpy as np
+import os
+from qonnx.core.datatype import DataType
+
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+from finn.custom_op.fpgadataflow.matrixvectoractivation import MVAU
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+# ONNX i/o tensor shape assumptions for MVAU_hls:
+# input 0 is the input tensor, shape (..., i_size) = (..., MW)
+# input 1 is the weight tensor, shape (i_size, o_size) = (MW, MH)
+# (optional) input 2 is the thresholds tensor, shape (o_size, n_thres)
+# output 0 is the output tensor, shape (..., o_size) = (..., MH)
+# the ... here can be any shape (representing groups of vectors)
+
+
+class MVAU_hls(MVAU, HLSBackend):
+ """Corresponds to finn-hlslib Matrix_Vector_Activate_Batch function."""
+
+ def __init__(self, onnx_node, **kwargs):
+ super().__init__(onnx_node, **kwargs)
+
+ def get_nodeattr_types(self):
+ my_attrs = {}
+ my_attrs.update(MVAU.get_nodeattr_types(self))
+ my_attrs.update(HLSBackend.get_nodeattr_types(self))
+ return my_attrs
+
+ def lut_estimation(self):
+ """Calculates resource estimations for LUTs based on:
+ - FINN-R: An End-to-End Deep-Learning Framework for Fast
+ Exploration of Quantized Neural Networks
+ - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien,
+ Y. Umuroglu, M. Leeser and K. Vissers
+ - 12. Sep 2018
+ """
+ # TODO add in/out FIFO contributions
+ P = self.get_nodeattr("PE")
+ Q = self.get_nodeattr("SIMD")
+ MW = self.get_nodeattr("MW")
+ wdt = self.get_weight_datatype()
+ W = wdt.bitwidth()
+ # determine tdt with input and weight data types
+ idt = self.get_input_datatype()
+ A = idt.bitwidth()
+ # parameters from experiments in paper mentioned above
+ c0 = 300
+ c1 = 1.1
+ c2 = 0
+ mmode = self.get_nodeattr("mem_mode")
+ mstyle = self.get_nodeattr("ram_style")
+ if (mmode == "internal_decoupled" and mstyle == "distributed") or (
+ mmode == "internal_embedded" and self.calc_wmem() <= 128
+ ):
+ c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64)
+
+ # multiplication
+ res_type = self.get_nodeattr("resType")
+ if res_type == "dsp":
+ mult_luts = 0
+ else:
+ mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A)
+ # adder tree
+ addertree_luts = (W + A) * (2 * Q - 1)
+ # accumulator
+ acc_datatype = self.get_accumulator_datatype()
+ # if accDataType is not set, then it will default to INT32, which would
+ # be a large overestimate in most (if not all) cases. In this scenario,
+ # we would use the minimum accumulator as determined by the data types
+ # bound, derived in https://arxiv.org/abs/2301.13376
+ alpha = math.log(MW, 2) + W + A - 1 - int(idt.signed())
+ acc_bits = min(
+ acc_datatype.bitwidth(),
+ np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1),
+ )
+ acc_luts = acc_bits
+ # thresholds and threshold comparators
+ thr_luts = 0
+ comp_luts = 0
+ noact = self.get_nodeattr("noActivation")
+ tmem_style = self.get_nodeattr("ram_style_thresholds")
+ if (noact == 0) and (tmem_style == "distributed"):
+ odt = self.get_output_datatype()
+ B = odt.bitwidth()
+ thr_luts = (2**B - 1) * acc_bits * math.ceil(self.calc_tmem() / 64)
+ comp_luts = (2**B - 1) * acc_bits
+
+ return int(
+ c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2
+ )
+
+ def dsp_estimation(self):
+ # multiplication
+ P = self.get_nodeattr("PE")
+ res_type = self.get_nodeattr("resType")
+ Q = self.get_nodeattr("SIMD")
+ wdt = self.get_weight_datatype()
+ W = wdt.bitwidth()
+ idt = self.get_input_datatype()
+ A = idt.bitwidth()
+ if res_type == "dsp":
+ mult_dsp = P * Q * np.ceil((W + A) / 48) # TODO: more accurate modelling
+ else:
+ mult_dsp = 0
+ return int(mult_dsp)
+
+ def get_template_param_values(self):
+ """Returns the template parameter values according to input, output and weight
+ data types."""
+ ret = dict()
+ inp_hls_str = self.get_input_datatype().get_hls_datatype_str()
+ out_hls_str = self.get_output_datatype().get_hls_datatype_str()
+ inp_is_binary = self.get_input_datatype() == DataType["BINARY"]
+ # out_is_binary = self.get_output_datatype() == DataType["BINARY"]
+ wt_is_binary = self.get_weight_datatype() == DataType["BINARY"]
+ bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1
+ if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode):
+ raise Exception("True binary (non-bipolar) inputs not yet supported")
+ inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"]
+ # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"]
+ wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"]
+ # reinterpret inp/wt as bipolar if bin_xnor_mode is set
+ inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode)
+ wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode)
+ # fill in TSrcI and TWeightI
+ # TODO check these with Giulio
+ # TODO handle non-bipolar binary inputs
+ if inp_is_bipolar and wt_is_bipolar:
+ ret["TSrcI"] = "Recast"
+ ret["TWeightI"] = "Identity"
+ elif (not inp_is_bipolar) and wt_is_bipolar:
+ ret["TSrcI"] = "Slice<%s>" % inp_hls_str
+ ret["TWeightI"] = "Recast"
+ elif inp_is_bipolar and (not wt_is_bipolar):
+ ret["TSrcI"] = "Recast"
+ ret["TWeightI"] = "Identity"
+ elif (not inp_is_bipolar) and (not wt_is_bipolar):
+ ret["TSrcI"] = "Slice<%s>" % inp_hls_str
+ ret["TWeightI"] = "Identity"
+
+ # fill in TDstI
+ ret["TDstI"] = "Slice<%s>" % out_hls_str
+
+ return ret
+
+ def global_includes(self):
+ self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"']
+ self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"']
+
+ mem_mode = self.get_nodeattr("mem_mode")
+ if mem_mode not in ["internal_embedded", "internal_decoupled", "external"]:
+ raise Exception(
+ """Please set mem_mode to "internal_embedded", "internal_decoupled", or "external",
+ currently no other parameter value is supported!"""
+ )
+ self.code_gen_dict["$GLOBALS$"] += ['#include "mvau.hpp"']
+ if self.calc_tmem() != 0:
+ # TODO find a better way of checking for no pregenerated thresholds
+ self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"']
+
+ def defines(self, var):
+ # Only ipgen mode: Make sure that SIMD parameter satisfies minimum requirements.
+ if var == "ipgen":
+ SIMD = self.get_nodeattr("SIMD")
+ MW = self.get_nodeattr("MW")
+ condition = SIMD >= (MW / 1024)
+ msg = (
+ f"HLS synthesis of MatrixVectorActivation requires: "
+ f"SIMD >= MW / 1024. This is not fulfilled with: SIMD={SIMD} "
+ f"and MW={MW} for node: {self.onnx_node.name}."
+ )
+ assert condition, msg
+ mem_mode = self.get_nodeattr("mem_mode")
+ numInputVectors = list(self.get_nodeattr("numInputVectors"))
+ numReps = np.prod(numInputVectors)
+ self.code_gen_dict["$DEFINES$"] = [
+ """#define MW1 {}\n #define MH1 {}\n
+ #define SIMD1 {}\n #define PE1 {}\n #define WMEM1 {}\n
+ #define TMEM1 {}\n #define numReps {}""".format(
+ self.get_nodeattr("MW"),
+ self.get_nodeattr("MH"),
+ self.get_nodeattr("SIMD"),
+ self.get_nodeattr("PE"),
+ self.calc_wmem(),
+ self.calc_tmem(),
+ numReps,
+ )
+ ]
+ if mem_mode == "internal_decoupled" or mem_mode == "external":
+ wdt = self.get_weight_datatype()
+ self.code_gen_dict["$DEFINES$"].append("#define WP1 {}\n".format(wdt.bitwidth()))
+
+ def read_npy_data(self):
+ code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+ dtype = self.get_input_datatype()
+ if dtype == DataType["BIPOLAR"]:
+ # use binary for bipolar storage
+ dtype = DataType["BINARY"]
+ elem_bits = dtype.bitwidth()
+ packed_bits = self.get_instream_width()
+ packed_hls_type = "ap_uint<%d>" % packed_bits
+ elem_hls_type = dtype.get_hls_datatype_str()
+ npy_type = "float"
+ npy_in = "%s/input_0.npy" % code_gen_dir
+ self.code_gen_dict["$READNPYDATA$"] = []
+ # note: the innermost dim is reversed for the input
+ self.code_gen_dict["$READNPYDATA$"].append(
+ 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);'
+ % (
+ packed_hls_type,
+ elem_hls_type,
+ elem_bits,
+ npy_type,
+ npy_in,
+ self.hls_sname(),
+ )
+ )
+
+ mem_mode = self.get_nodeattr("mem_mode")
+ if mem_mode == "internal_decoupled" or mem_mode == "external":
+ wdt = self.get_weight_datatype()
+ elem_bits = wdt.bitwidth()
+ packed_bits = self.get_weightstream_width()
+ packed_hls_type = "ap_uint<%d>" % packed_bits
+ elem_hls_type = wdt.get_hls_datatype_str()
+ npy_type = "float"
+ npy_in = "%s/weights.npy" % code_gen_dir
+
+ self.code_gen_dict["$READNPYDATA$"].append(
+ 'npy2apintstream<%s, %s, %d, %s>("%s", weights_%s, false, numReps);'
+ % (
+ packed_hls_type,
+ elem_hls_type,
+ elem_bits,
+ npy_type,
+ npy_in,
+ self.hls_sname(),
+ )
+ )
+
+ def strm_decl(self):
+ mem_mode = self.get_nodeattr("mem_mode")
+ self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+ self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+ 'hls::stream<ap_uint<{}>> in0_{} ("in0_{}");'.format(
+ self.get_instream_width(), self.hls_sname(), self.hls_sname()
+ )
+ )
+ self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+ 'hls::stream<ap_uint<{}>> out_{} ("out_{}");'.format(
+ self.get_outstream_width(), self.hls_sname(), self.hls_sname()
+ )
+ )
+
+ if mem_mode == "internal_decoupled" or mem_mode == "external":
+ self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+ 'hls::stream<ap_uint<{}>> weights_{} ("weights_{}");'.format(
+ self.get_weightstream_width(), self.hls_sname(), self.hls_sname()
+ )
+ )
+
+ def docompute(self):
+ mem_mode = self.get_nodeattr("mem_mode")
+ map_to_hls_mult_style = {
+ "auto": "ap_resource_dflt()",
+ "lut": "ap_resource_lut()",
+ "dsp": "ap_resource_dsp()",
+ }
+ tmpl_args = self.get_template_param_values()
+ if self.calc_tmem() == 0:
+ odtype_hls_str = self.get_output_datatype().get_hls_datatype_str()
+ threshs = "PassThroughActivation<%s>()" % odtype_hls_str
+ else:
+ threshs = "threshs"
+ if mem_mode == "internal_embedded":
+ self.code_gen_dict["$DOCOMPUTE$"] = [
+ """Matrix_Vector_Activate_Batch<MW1, MH1, SIMD1, PE1, 1, {}, {}, {}>
+ (in0_{}, out_{}, weights, {}, numReps, {});""".format(
+ tmpl_args["TSrcI"],
+ tmpl_args["TDstI"],
+ tmpl_args["TWeightI"],
+ self.hls_sname(),
+ self.hls_sname(),
+ threshs,
+ map_to_hls_mult_style[self.get_nodeattr("resType")],
+ )
+ ]
+ elif mem_mode == "internal_decoupled" or mem_mode == "external":
+ wdt = self.get_weight_datatype()
+ if wdt == DataType["BIPOLAR"]:
+ export_wdt = DataType["BINARY"]
+ else:
+ export_wdt = wdt
+ wdtype_hls_str = export_wdt.get_hls_datatype_str()
+ self.code_gen_dict["$DOCOMPUTE$"] = [
+ """Matrix_Vector_Activate_Stream_Batch<MW1, MH1, SIMD1, PE1, {}, {}, {}, {} >
+ (in0_{}, out_{}, weights_{}, {}, numReps, {});""".format(
+ tmpl_args["TSrcI"],
+ tmpl_args["TDstI"],
+ tmpl_args["TWeightI"],
+ wdtype_hls_str,
+ self.hls_sname(),
+ self.hls_sname(),
+ self.hls_sname(),
+ threshs,
+ map_to_hls_mult_style[self.get_nodeattr("resType")],
+ )
+ ]
+
+ else:
+ raise Exception(
+ """Please set mem_mode to "internal_embedded", "internal_decoupled", or "external",
+ currently no other parameter value is supported!"""
+ )
+
+ def dataoutstrm(self):
+ code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+ dtype = self.get_output_datatype()
+ if dtype == DataType["BIPOLAR"]:
+ # use binary for bipolar storage
+ dtype = DataType["BINARY"]
+ elem_bits = dtype.bitwidth()
+ packed_bits = self.get_outstream_width()
+ packed_hls_type = "ap_uint<%d>" % packed_bits
+ elem_hls_type = dtype.get_hls_datatype_str()
+ npy_type = "float"
+ npy_out = "%s/output.npy" % code_gen_dir
+ shape = self.get_folded_output_shape()
+ shape_cpp_str = str(shape).replace("(", "{").replace(")", "}")
+
+ # note: the innermost dim is not reversed for the output
+ self.code_gen_dict["$DATAOUTSTREAM$"] = [
+ 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);'
+ % (
+ packed_hls_type,
+ elem_hls_type,
+ elem_bits,
+ npy_type,
+ self.hls_sname(),
+ shape_cpp_str,
+ npy_out,
+ )
+ ]
+
+ def save_as_npy(self):
+ self.code_gen_dict["$SAVEASCNPY$"] = []
+
+ def blackboxfunction(self):
+ mem_mode = self.get_nodeattr("mem_mode")
+ if mem_mode == "internal_embedded":
+ self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+ """void {}(hls::stream> &in0_{},
+ hls::stream> &out_{}
+ )""".format(
+ self.onnx_node.name,
+ self.get_instream_width(),
+ self.hls_sname(),
+ self.get_outstream_width(),
+ self.hls_sname(),
+ )
+ ]
+ elif mem_mode == "internal_decoupled" or mem_mode == "external":
+ self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+ """void {}(
+ hls::stream> &in0_{},
+ hls::stream> &weights_{},
+ hls::stream> &out_{}
+ )""".format(
+ self.onnx_node.name,
+ self.get_instream_width(),
+ self.hls_sname(),
+ self.get_weightstream_width(),
+ self.hls_sname(),
+ self.get_outstream_width(),
+ self.hls_sname(),
+ )
+ ]
+
+ else:
+ raise Exception(
+ """Please set mem_mode to "internal_embedded" or "internal_decoupled",
+ currently no other parameter value is supported!"""
+ )
+
+ def pragmas(self):
+ mem_mode = self.get_nodeattr("mem_mode")
+ ram_style_thresholds = self.get_nodeattr("ram_style_thresholds")
+ self.code_gen_dict["$PRAGMAS$"] = [
+ "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname()
+ ]
+ self.code_gen_dict["$PRAGMAS$"].append(
+ "#pragma HLS INTERFACE axis port=out_" + self.hls_sname()
+ )
+ self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return")
+
+ if mem_mode == "internal_embedded":
+ self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"')
+ # the weight tensor is ap_uint<simd*prec> [PE][WMEM]
+ # partition for parallel access along the PE dimension (dim 1)
+ self.code_gen_dict["$PRAGMAS$"].append(
+ ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1")
+ )
+ elif mem_mode == "internal_decoupled" or mem_mode == "external":
+ self.code_gen_dict["$PRAGMAS$"].append(
+ "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname()
+ )
+
+ else:
+ raise Exception(
+ """Please set mem_mode to "internal_embedded", "internal_decoupled", or external,
+ currently no other parameter value is supported!"""
+ )
+
+ # the threshold tensor is acc_type [PE][TMEM][N_THRES]
+ # partition for parallel access along PE and N_THRES
+ # dimensions (dims 1 and 3)
+ if self.calc_tmem() != 0:
+ # TODO find a better way of checking for no pregenerated thresholds
+ self.code_gen_dict["$PRAGMAS$"].append(
+ ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=1")
+ )
+ self.code_gen_dict["$PRAGMAS$"].append(
+ ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3")
+ )
+ # add resource pragma for thresholds if set
+ if ram_style_thresholds == "distributed":
+ self.code_gen_dict["$PRAGMAS$"].append(
+ ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_LUTRAM")
+ )
+ elif ram_style_thresholds == "block":
+ self.code_gen_dict["$PRAGMAS$"].append(
+ ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_BRAM")
+ )
+ elif ram_style_thresholds == "auto":
+ # no pragma needed
+ pass
+ else:
+ raise Exception("Unrecognized ram_style_thresholds value:" + ram_style_thresholds)
+
+ def get_ap_int_max_w(self):
+ # base class impl (max of inp/out stream widths)
+ max_of_io = super().get_ap_int_max_w()
+ # internal_decoupled mode weight stream
+ weightstream = self.get_weightstream_width()
+ # single PE weight entry
+ weight_bits = self.get_weight_datatype().bitwidth()
+ simd = self.get_nodeattr("SIMD")
+ single_pe_w = simd * weight_bits
+ return max([weightstream, max_of_io, single_pe_w])
+
+ def execute_node(self, context, graph):
+ mode = self.get_nodeattr("exec_mode")
+ mem_mode = self.get_nodeattr("mem_mode")
+ node = self.onnx_node
+
+ # TODO ensure codegen dir exists
+ if mode == "cppsim":
+ code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+ elif mode == "rtlsim":
+ code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+ else:
+ raise Exception(
+ """Invalid value for attribute exec_mode! Is currently set to: {}
+ has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+ mode
+ )
+ )
+
+ # create an npy file for each input of the node (in_ind is the input index)
+ in_ind = 0
+ for inputs in node.input:
+ # it is assumed that the first input of the node is the data input,
+ # the second input is the weight tensor and
+ # the third input is the threshold tensor
+ if in_ind == 0:
+ assert (
+ str(context[inputs].dtype) == "float32"
+ ), """Input datatype is
+ not float32 as expected."""
+ expected_inp_shape = self.get_folded_input_shape()
+ reshaped_input = context[inputs].reshape(expected_inp_shape)
+ if self.get_input_datatype() == DataType["BIPOLAR"]:
+ # store bipolar activations as binary
+ reshaped_input = (reshaped_input + 1) / 2
+ export_idt = DataType["BINARY"]
+ else:
+ export_idt = self.get_input_datatype()
+ # make copy before saving the array
+ reshaped_input = reshaped_input.copy()
+ np.save(
+ os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)),
+ reshaped_input,
+ )
+ elif in_ind > 2:
+ raise Exception("Unexpected input found for MatrixVectorActivation")
+ in_ind += 1
+
+ if mode == "cppsim":
+ # execute the precompiled model
+ super().exec_precompiled_singlenode_model()
+ # load output npy file
+ super().npy_to_dynamic_output(context)
+ # reinterpret binary output as bipolar where needed
+ if self.get_output_datatype() == DataType["BIPOLAR"]:
+ out = context[node.output[0]]
+ out = 2 * out - 1
+ context[node.output[0]] = out
+ assert (
+ context[node.output[0]].shape == self.get_normal_output_shape()
+ ), "cppsim did not produce expected output shape"
+ elif mode == "rtlsim":
+ sim = self.get_rtlsim()
+ nbits = self.get_instream_width()
+ inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits)
+ self.reset_rtlsim(sim)
+ self.toggle_clk(sim)
+ if mem_mode == "external" or mem_mode == "internal_decoupled":
+ wnbits = self.get_weightstream_width()
+ export_wdt = self.get_weight_datatype()
+ # we have converted bipolar weights to binary for export,
+ # so use it as such for weight generation
+ if self.get_weight_datatype() == DataType["BIPOLAR"]:
+ export_wdt = DataType["BINARY"]
+ wei = npy_to_rtlsim_input("{}/weights.npy".format(code_gen_dir), export_wdt, wnbits)
+ num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
+ io_dict = {
+ "inputs": {"in0": inp, "weights": wei * num_w_reps},
+ "outputs": {"out": []},
+ }
+ self.rtlsim_multi_io(sim, io_dict)
+ output = io_dict["outputs"]["out"]
+ else:
+ output = self.rtlsim(sim, inp)
+ odt = self.get_output_datatype()
+ target_bits = odt.bitwidth()
+ packed_bits = self.get_outstream_width()
+ out_npy_path = "{}/output.npy".format(code_gen_dir)
+ out_shape = self.get_folded_output_shape()
+ rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits)
+
+ # load and reshape output
+ output = np.load(out_npy_path)
+ oshape = self.get_normal_output_shape()
+ output = np.asarray([output], dtype=np.float32).reshape(*oshape)
+ context[node.output[0]] = output
+ else:
+ raise Exception(
+ """Invalid value for attribute exec_mode! Is currently set to: {}
+ has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+ mode
+ )
+ )
+
+ def instantiate_ip(self, cmd):
+ # instantiate the HLS IP
+ vlnv = self.get_nodeattr("ip_vlnv")
+ node_name = self.onnx_node.name
+ if self.get_nodeattr("mem_mode") == "internal_decoupled":
+ cmd.append("create_bd_cell -type ip -vlnv %s /%s/%s" % (vlnv, node_name, node_name))
+ else:
+ cmd.append("create_bd_cell -type ip -vlnv %s %s" % (vlnv, node_name))
diff --git a/src/finn/custom_op/fpgadataflow/pool_batch.py b/src/finn/custom_op/fpgadataflow/hls/pool_hls.py
similarity index 57%
rename from src/finn/custom_op/fpgadataflow/pool_batch.py
rename to src/finn/custom_op/fpgadataflow/hls/pool_hls.py
index 813f13e504..64c6ec33f8 100644
--- a/src/finn/custom_op/fpgadataflow/pool_batch.py
+++ b/src/finn/custom_op/fpgadataflow/hls/pool_hls.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, Xilinx
+# Copyright (C) 2023, Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -30,11 +30,12 @@
import os
from qonnx.core.datatype import DataType
-from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+from finn.custom_op.fpgadataflow.pool import Pool
from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
-class Pool_Batch(HLSCustomOp):
+class Pool_hls(Pool, HLSBackend):
"""Class that corresponds to finn-hlslib Pool_batch function.
Requires ConvolutionInputGenerator(depthwise == 1) to format its input
@@ -54,152 +55,11 @@ class Pool_Batch(HLSCustomOp):
"""
def get_nodeattr_types(self):
- my_attrs = {
- "Channels": ("i", True, 0),
- "PE": ("i", True, 1),
- "KernelSize": ("ints", True, []),
- # Function:
- # - MaxPool
- # - QuantAvgPool
- # TODO add support for AvgPool and AccPool
- "Function": ("s", True, "", {"MaxPool", "QuantAvgPool"}),
- "OutImgDims": ("ints", True, []),
- # FINN DataTypes for inputs/outputs
- "InputDataType": ("s", True, ""),
- "OutputDataType": ("s", True, ""),
- "AccumBits": ("i", False, 0),
- "Size": ("i", False, 1),
- "BatchSize": ("i", False, 1),
- }
-
- my_attrs.update(super().get_nodeattr_types())
+ my_attrs = {}
+ my_attrs.update(Pool.get_nodeattr_types(self))
+ my_attrs.update(HLSBackend.get_nodeattr_types(self))
return my_attrs
- def get_input_datatype(self, ind=0):
- """Returns FINN DataType of input."""
- return DataType[self.get_nodeattr("InputDataType")]
-
- def get_output_datatype(self, ind=0):
- """Returns FINN DataType of output."""
- fxn = self.get_nodeattr("Function")
- odt = DataType[self.get_nodeattr("OutputDataType")]
-
- if fxn == "MaxPool":
- # Same as input
- idt = DataType[self.get_nodeattr("InputDataType")]
- assert odt == idt, "In datatype must be equal to out datatype for Maxpool"
- elif fxn == "QuantAvgPool":
- idt = DataType[self.get_nodeattr("InputDataType")]
- assert (
- idt.signed() == odt.signed()
- ), """QuantAvgPool: Can't mix signed
- and unsigned datatypes"""
- else:
- raise Exception("Pool_Batch doesn't currently support " + fxn)
-
- return odt
-
- def get_normal_input_shape(self, ind=0):
- ifm_ch = self.get_nodeattr("Channels")
- odims = self.get_nodeattr("OutImgDims")
- batch_size = self.get_nodeattr("BatchSize")
- k = self.get_nodeattr("KernelSize")
- k_prod = int(np.prod(k))
- ishape = (batch_size, *odims, k_prod * ifm_ch)
- return ishape
-
- def get_folded_input_shape(self, ind=0):
- normal_ishape = list(self.get_normal_input_shape())
- ifm_ch = self.get_nodeattr("Channels")
- pe = self.get_nodeattr("PE")
- assert ifm_ch % pe == 0, "PE must divide input channels"
- fold = int(normal_ishape[-1] / pe)
- folded_ishape = normal_ishape[:-1] + [fold, pe]
- return tuple(folded_ishape)
-
- def get_normal_output_shape(self, ind=0):
- ofm_ch = self.get_nodeattr("Channels")
- odims = self.get_nodeattr("OutImgDims")
- batch_size = self.get_nodeattr("BatchSize")
- oshape = (batch_size, *odims, ofm_ch)
- return oshape
-
- def get_folded_output_shape(self, ind=0):
- normal_oshape = list(self.get_normal_output_shape())
- ifm_ch = self.get_nodeattr("Channels")
- pe = self.get_nodeattr("PE")
- assert ifm_ch % pe == 0, "PE must divide input channels"
- fold = int(ifm_ch / pe)
- folded_oshape = normal_oshape[:-1] + [fold, pe]
- return tuple(folded_oshape)
-
- def get_number_output_values(self):
- folded_oshape = self.get_folded_output_shape()
- return np.prod(folded_oshape[1:-1])
-
- def get_exp_cycles(self):
- # (Channels * kernel * kernel) / PE * odim * odim * batch_size
- ifm_ch = self.get_nodeattr("Channels")
- pe = self.get_nodeattr("PE")
- k = self.get_nodeattr("KernelSize")
- k_prod = int(np.prod(k))
- odims = self.get_nodeattr("OutImgDims")
- batch_size = self.get_nodeattr("BatchSize")
- exp_cycles = ((ifm_ch * k_prod) / pe) * np.prod(odims) * batch_size
- return int(exp_cycles)
-
- def get_instream_width(self, ind=0):
- dt_bits = self.get_input_datatype().bitwidth()
- pe = self.get_nodeattr("PE")
- in_width = int(dt_bits * pe)
- return in_width
-
- def get_outstream_width(self, ind=0):
- dt_bits = self.get_output_datatype().bitwidth()
- pe = self.get_nodeattr("PE")
- out_width = int(dt_bits * pe)
- return out_width
-
- def make_shape_compatible_op(self, model):
- exp_ishape = self.get_normal_input_shape()
- oshape = self.get_normal_output_shape()
- ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
- assert ishape == exp_ishape, "Unexpected input shape for Pool_Batch."
- return super().make_const_shape_op(oshape)
-
- def infer_node_datatype(self, model):
- node = self.onnx_node
- # data type stays the same
- dtype = self.get_output_datatype()
- model.set_tensor_datatype(node.output[0], dtype)
-
- def verify_node(self):
- info_messages = []
- # verify that "backend" is set to "fpgadataflow"
- backend_value = self.get_nodeattr("backend")
- if backend_value == "fpgadataflow":
- info_messages.append("Attribute backend is set correctly")
- else:
- info_messages.append('Attribute backend should be set to "fpgadataflow"')
-
- # verify the number of inputs
- if len(self.onnx_node.input) == 1:
- info_messages.append("The number of inputs is correct")
- else:
- info_messages.append("""Pool_Batch needs 1 data input""")
-
- # check supported function
- fnx = self.get_nodeattr("Function")
- if fnx in ["MaxPool", "QuantAvgPool"]:
- info_messages.append(
- "Attribute Function contains a supported pool function"
- )
- else:
- info_messages.append(
- "Attribute Function contains an unsupported pool function"
- )
- return info_messages
-
def global_includes(self):
self.code_gen_dict["$GLOBALS$"] = ['#include "activations.hpp"']
self.code_gen_dict["$GLOBALS$"] += ['#include "maxpool.h"']
@@ -239,17 +99,15 @@ def read_npy_data(self):
npy_in = "%s/input_0.npy" % code_gen_dir
self.code_gen_dict["$READNPYDATA$"] = []
self.code_gen_dict["$READNPYDATA$"].append(
- 'npy2apintstream<%s, %s, %d, %s>("%s", in0,false);'
- % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
- )
-
- def strm_decl(self):
- self.code_gen_dict["$STREAMDECLARATIONS$"] = []
- self.code_gen_dict["$STREAMDECLARATIONS$"].append(
- 'hls::stream> in0 ("in0");'.format(self.get_instream_width())
- )
- self.code_gen_dict["$STREAMDECLARATIONS$"].append(
- 'hls::stream> out ("out");'.format(self.get_outstream_width())
+ 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);'
+ % (
+ packed_hls_type,
+ elem_hls_type,
+ elem_bits,
+ npy_type,
+ npy_in,
+ self.hls_sname(),
+ )
)
def docompute(self):
@@ -272,17 +130,15 @@ def docompute(self):
else:
act_hls_dt = "ap_uint<{}>".format(accum_bits)
self.code_gen_dict["$DOCOMPUTE$"] += [
- "QuantAvgPoolFunction<{},{},{}> pool_fxn;".format(
- act_hls_dt, o_hls_dt, size
- )
+ "QuantAvgPoolFunction<{},{},{}> pool_fxn;".format(act_hls_dt, o_hls_dt, size)
]
else:
raise Exception("Pool_Batch doesn't currently support " + fxn)
self.code_gen_dict["$DOCOMPUTE$"] += [
"""Pool_batch, Slice< {} > >
- (in0,out, pool_fxn, OFMDimTotal*numReps);""".format(
- i_hls_dt, o_hls_dt
+ (in0_{}, out_{}, pool_fxn, OFMDimTotal*numReps);""".format(
+ i_hls_dt, o_hls_dt, self.hls_sname(), self.hls_sname()
)
]
@@ -302,20 +158,18 @@ def dataoutstrm(self):
oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
self.code_gen_dict["$DATAOUTSTREAM$"] = [
- 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s",false);'
+ 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);'
% (
packed_hls_type,
elem_hls_type,
elem_bits,
npy_type,
+ self.hls_sname(),
oshape_cpp_str,
npy_out,
)
]
- def save_as_npy(self):
- self.code_gen_dict["$SAVEASCNPY$"] = []
-
def blackboxfunction(self):
packed_ibits = self.get_instream_width()
packed_in_hls_type = "ap_uint<%d>" % packed_ibits
@@ -323,20 +177,15 @@ def blackboxfunction(self):
packed_obits = self.get_outstream_width()
packed_out_hls_type = "ap_uint<%d>" % packed_obits
self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
- "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)"
- % (self.onnx_node.name, packed_in_hls_type, packed_out_hls_type)
- ]
-
- def pragmas(self):
- self.code_gen_dict["$PRAGMAS$"] = [
- "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname()
+ "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)"
+ % (
+ self.onnx_node.name,
+ packed_in_hls_type,
+ self.hls_sname(),
+ packed_out_hls_type,
+ self.hls_sname(),
+ )
]
- self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname()
- )
- self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE ap_ctrl_none port=return"
- )
def execute_node(self, context, graph):
mode = self.get_nodeattr("exec_mode")
diff --git a/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py b/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py
new file mode 100644
index 0000000000..d1f58d3e87
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py
@@ -0,0 +1,215 @@
+# Copyright (C) 2023, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import os
+from qonnx.core.datatype import DataType
+
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+from finn.custom_op.fpgadataflow.streamingdatawidthconverter import (
+ StreamingDataWidthConverter,
+)
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+ # This op does nothing at the ONNX node-by-node level: input and output tensor
+ # shapes are the same. The data width conversion only happens at the rtlsim level.
+
+
+class StreamingDataWidthConverter_hls(StreamingDataWidthConverter, HLSBackend):
+ """Class that corresponds to finn-hlslib StreamingDataWidthConverter_Batch
+ function."""
+
+ def get_nodeattr_types(self):
+ my_attrs = {}
+ my_attrs.update(StreamingDataWidthConverter.get_nodeattr_types(self))
+ my_attrs.update(HLSBackend.get_nodeattr_types(self))
+ return my_attrs
+
+ def global_includes(self):
+ self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"']
+
+ def defines(self, var):
+ numReps = 1
+ numInWords = int(np.prod(self.get_folded_input_shape()[:-1]))
+ inWidth = self.get_nodeattr("inWidth")
+ outWidth = self.get_nodeattr("outWidth")
+ self.code_gen_dict["$DEFINES$"] = [
+ "#define InWidth %d " % inWidth,
+ "#define OutWidth %d " % outWidth,
+ "#define NumInWords %d " % numInWords,
+ "#define numReps %d" % numReps,
+ ]
+ if self.needs_lcm():
+ lcmWidth = self.get_iowidth_lcm()
+ assert numInWords % (lcmWidth / inWidth) == 0, "Error in DWC LCM calculation"
+ numLCMToOut = numInWords // (lcmWidth / inWidth)
+ self.code_gen_dict["$DEFINES$"].append("#define LCMWidth %d" % lcmWidth)
+ self.code_gen_dict["$DEFINES$"].append("#define NumLCMToOut %d" % (numLCMToOut))
+
+ def strm_decl(self):
+ self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+ self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+ 'hls::stream> in0_{} ("in0_{}");'.format(
+ self.get_instream_width(), self.hls_sname(), self.hls_sname()
+ )
+ )
+ if self.needs_lcm():
+ self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+ 'hls::stream> intermediate ("intermediate");'.format(
+ self.get_iowidth_lcm()
+ )
+ )
+ self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+ 'hls::stream> out_{} ("out_{}");'.format(
+ self.get_outstream_width(), self.hls_sname(), self.hls_sname()
+ )
+ )
+
+ def docompute(self):
+ # TODO continue with fxns below, they are copy-pasted
+ op = "StreamingDataWidthConverter_Batch"
+ if self.needs_lcm():
+ self.code_gen_dict["$DOCOMPUTE$"] = [
+ 'hls::stream> intermediate ("intermediate");'.format(
+ self.get_iowidth_lcm()
+ ),
+ "%s(in0_%s, intermediate, numReps);"
+ % (op, self.hls_sname()),
+ "%s(intermediate, out_%s, numReps);"
+ % (op, self.hls_sname()),
+ ]
+ else:
+ self.code_gen_dict["$DOCOMPUTE$"] = [
+ "%s(in0_%s, out_%s, numReps);"
+ % (op, self.hls_sname(), self.hls_sname())
+ ]
+
+ def blackboxfunction(self):
+ in_packed_bits = self.get_instream_width()
+ in_packed_hls_type = "ap_uint<%d>" % in_packed_bits
+ out_packed_bits = self.get_outstream_width()
+ out_packed_hls_type = "ap_uint<%d>" % out_packed_bits
+ self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+ "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)"
+ % (
+ self.onnx_node.name,
+ in_packed_hls_type,
+ self.hls_sname(),
+ out_packed_hls_type,
+ self.hls_sname(),
+ )
+ ]
+
+ def pragmas(self):
+ self.code_gen_dict["$PRAGMAS$"] = [
+ "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname()
+ ]
+ self.code_gen_dict["$PRAGMAS$"].append(
+ "#pragma HLS INTERFACE axis port=out_" + self.hls_sname()
+ )
+ self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return")
+ if self.needs_lcm():
+ self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS DATAFLOW disable_start_propagation")
+
+ def execute_node(self, context, graph):
+ mode = self.get_nodeattr("exec_mode")
+ node = self.onnx_node
+ exp_shape = self.get_normal_input_shape()
+ folded_ishape = self.get_folded_input_shape()
+
+ # TODO ensure codegen dir exists
+ if mode == "cppsim":
+ code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+ elif mode == "rtlsim":
+ code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+ else:
+ raise Exception(
+ """Invalid value for attribute exec_mode! Is currently set to: {}
+ has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+ mode
+ )
+ )
+
+ inp = context[node.input[0]]
+ assert str(inp.dtype) == "float32", "Input datatype is not float32"
+ assert inp.shape == tuple(exp_shape), "Input shape does not match expected shape."
+
+ if self.get_input_datatype() == DataType["BIPOLAR"]:
+ # store bipolar activations as binary
+ inp = (inp + 1) / 2
+ export_idt = DataType["BINARY"]
+ else:
+ export_idt = self.get_input_datatype()
+ # reshape input into folded shape
+ reshaped_input = inp.reshape(folded_ishape)
+ # make copy before saving array
+ reshaped_input = reshaped_input.copy()
+ np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
+
+ if mode == "cppsim":
+ output = inp
+ output = np.asarray([output], dtype=np.float32).reshape(*exp_shape)
+ context[node.output[0]] = output
+
+ elif mode == "rtlsim":
+ sim = self.get_rtlsim()
+ nbits = self.get_instream_width()
+ rtlsim_inp = npy_to_rtlsim_input(
+ "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+ )
+ super().reset_rtlsim(sim)
+ super().toggle_clk(sim)
+ rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+ odt = export_idt
+ target_bits = odt.bitwidth()
+ packed_bits = self.get_outstream_width()
+ out_npy_path = "{}/output.npy".format(code_gen_dir)
+ out_shape = self.get_folded_output_shape()
+ rtlsim_output_to_npy(
+ rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+ )
+ # load and reshape output
+ output = np.load(out_npy_path)
+ output = np.asarray([output], dtype=np.float32).reshape(exp_shape)
+ context[node.output[0]] = output
+ else:
+ raise Exception(
+ """Invalid value for attribute exec_mode! Is currently set to: {}
+ has to be set to "rtlsim" """.format(
+ mode
+ )
+ )
+ # binary -> bipolar if needed
+ if self.get_output_datatype() == DataType["BIPOLAR"]:
+ out = context[node.output[0]]
+ out = 2 * out - 1
+ context[node.output[0]] = out
+ assert context[node.output[0]].shape == tuple(
+ exp_shape
+ ), """Output
+ shape doesn't match expected shape, should be same as input shape"""
diff --git a/src/finn/custom_op/fpgadataflow/eltwise.py b/src/finn/custom_op/fpgadataflow/hls/streamingeltwise_hls.py
similarity index 54%
rename from src/finn/custom_op/fpgadataflow/eltwise.py
rename to src/finn/custom_op/fpgadataflow/hls/streamingeltwise_hls.py
index 68ed6546c7..0d618d832a 100644
--- a/src/finn/custom_op/fpgadataflow/eltwise.py
+++ b/src/finn/custom_op/fpgadataflow/hls/streamingeltwise_hls.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022, Xilinx
+# Copyright (C) 2023, Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -28,112 +28,24 @@
import numpy as np
import os
-import warnings
-from qonnx.core.datatype import DataType
-from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+from finn.custom_op.fpgadataflow.streamingeltwise import StreamingEltwise
from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
-class StreamingEltwise(HLSCustomOp):
+class StreamingEltwise_hls(StreamingEltwise, HLSBackend):
"""Class that corresponds to finn-hlslib StreamingEltwise function."""
- def __init__(self, onnx_node):
- super().__init__(onnx_node)
+ def __init__(self, onnx_node, **kwargs):
+ super().__init__(onnx_node, **kwargs)
def get_nodeattr_types(self):
-
- my_attrs = super().get_nodeattr_types()
- my_attrs.update(
- {
- "NumChannels": ("i", True, ""),
- "PE": ("i", True, ""),
- # FINN DataTypes for inputs; output datatype inferred from input
- "inputDataType0": ("s", True, ""),
- "inputDataType1": ("s", True, ""),
- # type of EltwiseFunction for the operation
- "eltwiseOp": ("s", True, "", ["Add", "Sub", "AbsDiff"]),
- # number of input vectors, examples:
- # [1] is a single vector (like a FC layer with batch=1)
- # [4] is four vectors (like a FC layer with batch=4)
- # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
- "numInputVectors": ("ints", False, [1]),
- "inFIFODepths": ("ints", False, [2, 2]),
- }
- )
+ my_attrs = {}
+ my_attrs.update(StreamingEltwise.get_nodeattr_types(self))
+ my_attrs.update(HLSBackend.get_nodeattr_types(self))
return my_attrs
- def get_eltwise_op_lambda(self):
- eltwise_op = self.get_nodeattr("eltwiseOp")
- idt0 = self.get_input_datatype(0)
- idt1 = self.get_input_datatype(1)
- odt = self.get_output_datatype()
- tin0 = idt0.get_hls_datatype_str()
- tin1 = idt1.get_hls_datatype_str()
- tout = odt.get_hls_datatype_str()
- eltwise_ops = {
- # "Add": "[](auto a, auto b) { return a + b; }",
- # "Sub": "[](auto a, auto b) { return a - b; }",
- # "AbsDiff": "[](auto a, auto b) { return a>b? a-b : b-a; }",
- "Add": f"add<{tin0}, {tin1}, {tout}>()",
- "Sub": f"sub<{tin0}, {tin1}, {tout}>()",
- "AbsDiff": f"absdiff<{tin0}, {tin1}, {tout}>()",
- }
- return eltwise_ops[eltwise_op]
-
- def get_normal_input_shape(self, ind=0):
- ich = self.get_nodeattr("NumChannels")
- vecs = list(self.get_nodeattr("numInputVectors"))
- ishape = tuple(vecs + [ich])
- return ishape
-
- def get_folded_input_shape(self, ind=0):
- ich = self.get_nodeattr("NumChannels")
- pe = self.get_nodeattr("PE")
- assert ich % pe == 0, "PE must divide NumChannels"
- vecs = list(self.get_nodeattr("numInputVectors"))
- ishape = tuple(vecs + [ich // pe, pe])
- return ishape
-
- def get_normal_output_shape(self, ind=0):
- return self.get_normal_input_shape()
-
- def get_folded_output_shape(self, ind=0):
- return self.get_folded_input_shape()
-
- def make_shape_compatible_op(self, model):
- exp_ishape = self.get_normal_input_shape()
- oshape = self.get_normal_output_shape()
- ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
- assert ishape == exp_ishape, "Unexpected input1 shape."
- ishape = tuple(model.get_tensor_shape(self.onnx_node.input[1]))
- assert ishape == exp_ishape, "Unexpected input2 shape."
- return super().make_const_shape_op(oshape)
-
- def infer_node_datatype(self, model):
- node = self.onnx_node
- idt0 = model.get_tensor_datatype(node.input[0])
- if idt0 != self.get_input_datatype(0):
- warn_str = "inputDataType0 changing for %s: %s -> %s " % (
- node.name,
- str(self.get_input_datatype(0)),
- str(idt0),
- )
- warnings.warn(warn_str)
- self.set_nodeattr("inputDataType0", idt0.name)
- idt1 = model.get_tensor_datatype(node.input[1])
- if idt1 != self.get_input_datatype(1):
- warn_str = "inputDataType1 changing for %s: %s -> %s " % (
- node.name,
- str(self.get_input_datatype(1)),
- str(idt1),
- )
- warnings.warn(warn_str)
- self.set_nodeattr("inputDataType1", idt1.name)
- # enforce output data type (calculated based on idt)
- odt = self.get_output_datatype()
- model.set_tensor_datatype(self.onnx_node.output[0], odt)
-
def verify_node(self):
info_messages = []
# verify that "backend" is set to "fpgadataflow"
@@ -154,66 +66,10 @@ def verify_node(self):
self.get_nodeattr("eltwiseOp")
info_messages.append("All necessary attributes exist")
except Exception:
- info_messages.append(
- """The required StreamingEltwise attributes do not exist."""
- )
+ info_messages.append("""The required StreamingEltwise attributes do not exist.""")
return info_messages
- def get_input_datatype(self, ind=0):
- """Returns FINN DataType of input."""
- return DataType[self.get_nodeattr("inputDataType" + str(ind))]
-
- def get_output_datatype(self, ind=0):
- """Returns FINN DataType of output."""
- op = self.get_nodeattr("eltwiseOp")
- idt0 = self.get_input_datatype(0)
- idt1 = self.get_input_datatype(1)
- assert idt0.signed() == idt1.signed(), (
- "%s: Inputs must have same signedness" % self.onnx_node.name
- )
- idt0_min, idt0_max = idt0.min(), idt0.max()
- idt1_min, idt1_max = idt1.min(), idt1.max()
- cands = [
- idt0_min - idt1_min,
- idt0_min - idt1_max,
- idt0_max - idt1_min,
- idt0_max - idt1_max,
- ]
- largest_magnitude = max(map(abs, cands))
- if op == "Add":
- if idt0.signed():
- return DataType.get_smallest_possible(idt0.min() + idt1.min())
- else:
- return DataType.get_smallest_possible(idt0.max() + idt1.max())
- elif op == "Sub":
- return DataType.get_smallest_possible(-largest_magnitude)
- elif op == "AbsDiff":
- return DataType.get_smallest_possible(largest_magnitude)
- else:
- raise Exception("%s: Unknown eltWiseOp = %s" % (self.onnx_node.name, op))
-
- def get_instream_width(self, ind=0):
- """Returns input stream width."""
- ibits = self.get_input_datatype(ind).bitwidth()
- pe = self.get_nodeattr("PE")
- in_width = pe * ibits
- return in_width
-
- def get_outstream_width(self, ind=0):
- """Returns output stream width."""
- obits = self.get_output_datatype().bitwidth()
- pe = self.get_nodeattr("PE")
- out_width = pe * obits
- return out_width
-
- def get_number_output_values(self):
- return np.prod(self.get_folded_output_shape()[:-1])
-
- def get_exp_cycles(self):
- # Channels/PE * batch size * fmdim * fmdim
- return np.prod(self.get_folded_output_shape()[:-1])
-
def execute_node(self, context, graph):
mode = self.get_nodeattr("exec_mode")
node = self.onnx_node
@@ -235,9 +91,7 @@ def execute_node(self, context, graph):
inp = context[node.input[0]]
assert str(inp.dtype) == "float32", "Input datatype is not float32"
- assert (
- inp.shape == exp_ishape
- ), """Input0 shape doesn't match expected shape ."""
+ assert inp.shape == exp_ishape, """Input0 shape doesn't match expected shape ."""
export_idt0 = self.get_input_datatype(0)
# reshape input into folded form
inp = inp.reshape(folded_ishape)
@@ -248,9 +102,7 @@ def execute_node(self, context, graph):
# exact same thing for input1
inp = context[node.input[1]]
assert str(inp.dtype) == "float32", "Input datatype is not float32"
- assert (
- inp.shape == exp_ishape
- ), """Input1 shape doesn't match expected shape ."""
+ assert inp.shape == exp_ishape, """Input1 shape doesn't match expected shape ."""
export_idt1 = self.get_input_datatype(1)
# reshape input into folded form
inp = inp.reshape(folded_ishape)
@@ -354,25 +206,45 @@ def read_npy_data(self):
self.code_gen_dict["$READNPYDATA$"] = []
npy_in = "%s/input_0.npy" % code_gen_dir
self.code_gen_dict["$READNPYDATA$"].append(
- 'npy2apintstream<%s, %s, %d, %s>("%s", in0);'
- % (packed_hls_type_0, elem_hls_type_0, elem_bits_0, npy_type, npy_in)
+ 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);'
+ % (
+ packed_hls_type_0,
+ elem_hls_type_0,
+ elem_bits_0,
+ npy_type,
+ npy_in,
+ self.hls_sname(),
+ )
)
npy_in = "%s/input_1.npy" % code_gen_dir
self.code_gen_dict["$READNPYDATA$"].append(
- 'npy2apintstream<%s, %s, %d, %s>("%s", in1);'
- % (packed_hls_type_1, elem_hls_type_1, elem_bits_1, npy_type, npy_in)
+ 'npy2apintstream<%s, %s, %d, %s>("%s", in1_%s);'
+ % (
+ packed_hls_type_1,
+ elem_hls_type_1,
+ elem_bits_1,
+ npy_type,
+ npy_in,
+ self.hls_sname(),
+ )
)
def strm_decl(self):
self.code_gen_dict["$STREAMDECLARATIONS$"] = []
self.code_gen_dict["$STREAMDECLARATIONS$"].append(
- 'hls::stream> in0 ("in0");'.format(self.get_instream_width(0))
+ 'hls::stream> in0_{} ("in0_{}");'.format(
+ self.get_instream_width(0), self.hls_sname(), self.hls_sname()
+ )
)
self.code_gen_dict["$STREAMDECLARATIONS$"].append(
- 'hls::stream> in1 ("in1");'.format(self.get_instream_width(1))
+ 'hls::stream> in1_{} ("in1_{}");'.format(
+ self.get_instream_width(1), self.hls_sname(), self.hls_sname()
+ )
)
self.code_gen_dict["$STREAMDECLARATIONS$"].append(
- 'hls::stream> out ("out");'.format(self.get_outstream_width())
+ 'hls::stream> out_{} ("out_{}");'.format(
+ self.get_outstream_width(), self.hls_sname(), self.hls_sname()
+ )
)
def docompute(self):
@@ -394,7 +266,7 @@ def docompute(self):
out_hls_type,
)
self.code_gen_dict["$DOCOMPUTE$"] = [
- """{}<{}, {}, {}, {}, {}, {}>(in0, in1, out, {});""".format(
+ """{}<{}, {}, {}, {}, {}, {}>(in0_{}, in1_{}, out_{}, {});""".format(
"StreamingEltwise",
self.get_nodeattr("NumChannels"),
self.get_nodeattr("PE"),
@@ -402,65 +274,35 @@ def docompute(self):
slice_in0,
slice_in1,
slice_out,
+ self.hls_sname(),
+ self.hls_sname(),
+ self.hls_sname(),
eltwise_op_str,
)
]
- def dataoutstrm(self):
- code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
- dtype = self.get_output_datatype()
- elem_bits = dtype.bitwidth()
- packed_bits = self.get_outstream_width()
- packed_hls_type = "ap_uint<%d>" % packed_bits
- elem_hls_type = dtype.get_hls_datatype_str()
- npy_type = "float"
- npy_out = "%s/output.npy" % code_gen_dir
- oshape = self.get_folded_output_shape()
- oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}")
-
- self.code_gen_dict["$DATAOUTSTREAM$"] = [
- 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");'
- % (
- packed_hls_type,
- elem_hls_type,
- elem_bits,
- npy_type,
- oshape_cpp_str,
- npy_out,
- )
- ]
-
- def save_as_npy(self):
- self.code_gen_dict["$SAVEASCNPY$"] = []
-
def blackboxfunction(self):
self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
- """void {}(hls::stream> &in0, hls::stream> &in1,
- hls::stream> &out)""".format(
+ """void {}(hls::stream> &in0_{}, hls::stream> &in1_{},
+ hls::stream> &out_{})""".format(
self.onnx_node.name,
self.get_nodeattr("PE") * self.get_input_datatype(0).bitwidth(),
+ self.hls_sname(),
self.get_nodeattr("PE") * self.get_input_datatype(1).bitwidth(),
+ self.hls_sname(),
self.get_nodeattr("PE") * self.get_output_datatype().bitwidth(),
+ self.hls_sname(),
)
]
def pragmas(self):
self.code_gen_dict["$PRAGMAS$"] = [
- "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname()
+ "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname()
]
self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE axis port=in1 name=in1_" + self.hls_sname()
+ "#pragma HLS INTERFACE axis port=in1_" + self.hls_sname()
)
self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname()
+ "#pragma HLS INTERFACE axis port=out_" + self.hls_sname()
)
- self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE ap_ctrl_none port=return"
- )
-
- def get_verilog_top_module_intf_names(self):
- intf_names = super().get_verilog_top_module_intf_names()
- sname = self.hls_sname()
- swidth = self.get_instream_width_padded()
- intf_names["s_axis"] = [(x + "_" + sname, swidth) for x in ["in0", "in1"]]
- return intf_names
+ self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return")
diff --git a/src/finn/custom_op/fpgadataflow/hls/streamingmaxpool_hls.py b/src/finn/custom_op/fpgadataflow/hls/streamingmaxpool_hls.py
new file mode 100755
index 0000000000..69db7b4606
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/hls/streamingmaxpool_hls.py
@@ -0,0 +1,222 @@
+# Copyright (C) 2023, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import os
+from qonnx.core.datatype import DataType
+from qonnx.custom_op.general.maxpoolnhwc import compute_pool_output_dim
+
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+from finn.custom_op.fpgadataflow.streamingmaxpool import StreamingMaxPool
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+
+class StreamingMaxPool_hls(StreamingMaxPool, HLSBackend):
+ """Class that corresponds to finn-hlslib StreamingMaxPool_batch function."""
+
+ def get_nodeattr_types(self):
+ my_attrs = {}
+ my_attrs.update(StreamingMaxPool.get_nodeattr_types(self))
+ my_attrs.update(HLSBackend.get_nodeattr_types(self))
+ return my_attrs
+
+ def verify_node(self):
+ info_messages = []
+ # verify that "backend" is set to "fpgadataflow"
+ backend_value = self.get_nodeattr("backend")
+ if backend_value == "fpgadataflow":
+ info_messages.append("Attribute backend is set correctly")
+ else:
+ info_messages.append('Attribute backend should be set to "fpgadataflow"')
+
+ # verify the number of inputs
+ if len(self.onnx_node.input) == 1:
+ info_messages.append("The number of inputs is correct")
+ else:
+ info_messages.append("""StreamingMaxPool_Batch needs 1 data input""")
+
+ return info_messages
+
+ def global_includes(self):
+ self.code_gen_dict["$GLOBALS$"] = ['#include "maxpool.h"']
+
+ def defines(self, var):
+ numReps = 1
+ ifm_dim, k, ifm_ch = self.get_1d_attrs_normalized()
+ ceil_mode = self.get_nodeattr("CeilMode")
+ output_size = compute_pool_output_dim(ifm_dim[1], k[1], k[1], 0, ceil_mode)
+
+ if self.is_1d():
+ self.code_gen_dict["$DEFINES$"] = [
+ """#define ImgDim {}\n #define PoolDim {}\n
+ #define NumChannels {}\n #define PE {}\n #define OutputSize {}
+ \n #define numReps {}""".format(
+ ifm_dim[1],
+ k[1],
+ self.get_nodeattr("NumChannels"),
+ self.get_nodeattr("PE"),
+ output_size,
+ numReps,
+ )
+ ]
+ else:
+ self.code_gen_dict["$DEFINES$"] = [
+ """#define ImgDim {}\n #define PoolDim {}\n
+ #define NumChannels {}\n #define numReps {}""".format(
+ ifm_dim[1],
+ k[1],
+ self.get_nodeattr("NumChannels"),
+ numReps,
+ )
+ ]
+
+ def docompute(self):
+ dtype = self.get_input_datatype()
+ if dtype.bitwidth() == 1:
+ if self.is_1d():
+ raise Exception("Binary 1d MaxPool not implemented on HLS backend")
+ else:
+ op = "StreamingMaxPool"
+ self.code_gen_dict["$DOCOMPUTE$"] = [
+ "%s(in0_%s, out_%s);"
+ % (op, self.hls_sname(), self.hls_sname())
+ ]
+ else:
+ dtype = self.get_input_datatype()
+ dtype_hls = dtype.get_hls_datatype_str()
+ minval_str = str(int(dtype.min()))
+ if self.is_1d():
+ op = "StreamingMaxPool_Precision_1d"
+ self.code_gen_dict["$DOCOMPUTE$"] = [
+ """%s(in0_%s, out_%s);"""
+ % (op, dtype_hls, minval_str, self.hls_sname(), self.hls_sname())
+ ]
+ else:
+ op = "StreamingMaxPool_Precision"
+ self.code_gen_dict["$DOCOMPUTE$"] = [
+ "%s(in0_%s, out_%s);"
+ % (op, dtype_hls, minval_str, self.hls_sname(), self.hls_sname())
+ ]
+
+ def blackboxfunction(self):
+ packed_bits = self.get_instream_width()
+ packed_hls_type = "ap_uint<%d>" % packed_bits
+ self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+ "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)"
+ % (
+ self.onnx_node.name,
+ packed_hls_type,
+ self.hls_sname(),
+ packed_hls_type,
+ self.hls_sname(),
+ )
+ ]
+
+ def execute_node(self, context, graph):
+ mode = self.get_nodeattr("exec_mode")
+ node = self.onnx_node
+ exp_ishape = self.get_normal_input_shape()
+ exp_oshape = self.get_normal_output_shape()
+ folded_ishape = self.get_folded_input_shape()
+
+ # TODO ensure codegen dir exists
+ if mode == "cppsim":
+ code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+ elif mode == "rtlsim":
+ code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+ else:
+ raise Exception(
+ """Invalid value for attribute exec_mode! Is currently set to: {}
+ has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+ mode
+ )
+ )
+
+ inp = context[node.input[0]]
+ assert str(inp.dtype) == "float32", "Input datatype is not float32"
+ assert (
+ inp.shape == exp_ishape
+ ), """Input shape doesn't
+ match expected shape (1, ifm_dim, ifm_dim, ifm_ch)."""
+ if self.get_input_datatype() == DataType["BIPOLAR"]:
+ # store bipolar activations as binary
+ inp = (inp + 1) / 2
+ export_idt = DataType["BINARY"]
+ else:
+ export_idt = self.get_input_datatype()
+
+ reshaped_input = inp.reshape(folded_ishape)
+ np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input)
+
+ if mode == "cppsim":
+ # execute the precompiled model
+ super().exec_precompiled_singlenode_model()
+ # load output npy file
+ super().npy_to_dynamic_output(context)
+ assert (
+ context[node.output[0]].shape == exp_oshape
+ ), "cppsim \
+ did not produce expected output shape"
+ elif mode == "rtlsim":
+ sim = self.get_rtlsim()
+ nbits = self.get_instream_width()
+ rtlsim_inp = npy_to_rtlsim_input(
+ "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+ )
+ super().reset_rtlsim(sim)
+ super().toggle_clk(sim)
+ rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+ odt = export_idt
+ target_bits = odt.bitwidth()
+ packed_bits = self.get_outstream_width()
+ out_npy_path = "{}/output.npy".format(code_gen_dir)
+ out_shape = self.get_folded_output_shape()
+ rtlsim_output_to_npy(
+ rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+ )
+ # load and reshape output
+ output = np.load(out_npy_path)
+ output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+ context[node.output[0]] = output
+ else:
+ raise Exception(
+ """Invalid value for attribute exec_mode! Is currently set to: {}
+ has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+ mode
+ )
+ )
+ # binary -> bipolar if needed
+ if self.get_output_datatype() == DataType["BIPOLAR"]:
+ out = context[node.output[0]]
+ out = 2 * out - 1
+ context[node.output[0]] = out
+ assert (
+ context[node.output[0]].shape == exp_oshape
+ ), """Output
+ shape doesn't match expected shape (1, ofm_dim, ofm_dim, ifm_ch)."""
diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py
similarity index 64%
rename from src/finn/custom_op/fpgadataflow/thresholding_batch.py
rename to src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py
index d9745acf63..b753bc7a03 100644
--- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py
+++ b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, Xilinx
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -29,15 +29,12 @@
import numpy as np
import os
import textwrap
-import warnings
from math import ceil, log2
from qonnx.core.datatype import DataType
-from qonnx.util.basic import (
- interleave_matrix_outer_dim_from_partitions,
- roundup_to_integer_multiple,
-)
+from qonnx.util.basic import roundup_to_integer_multiple
-from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+from finn.custom_op.fpgadataflow.thresholding import Thresholding
from finn.util.data_packing import (
npy_to_rtlsim_input,
numpy_to_hls_code,
@@ -45,8 +42,6 @@
rtlsim_output_to_npy,
)
-from . import templates
-
# ONNX i/o tensor shape assumptions for Thresholding:
# input 0 is the input tensor, shape (..., NumChannels)
# input 1 is the threshold tensor, shape (NumChannels, n_thres)
@@ -54,39 +49,26 @@
# the ... here can be any shape (representing groups of vectors)
-class Thresholding_Batch(HLSCustomOp):
+class Thresholding_hls(Thresholding, HLSBackend):
"""Class that corresponds to finn-hls Thresholding_Batch function."""
- def __init__(self, onnx_node):
- super().__init__(onnx_node)
- self.decoupled_wrapper = templates.decoupled_wrapper
+ def __init__(self, onnx_node, **kwargs):
+ super().__init__(onnx_node, **kwargs)
def get_nodeattr_types(self):
my_attrs = {
- # parallelization; channels thresholded per cycle
- "PE": ("i", True, 0),
- # number of channels (each may have different thresholds)
- "NumChannels": ("i", True, 0),
- # number of steps in thresholding function
- "numSteps": ("i", True, 1),
+ # memory mode for the thresholds
+ # internal_embedded -- embedded thresholds
+ # internal_decoupled -- default, streaming thresholds with streamer packaged inside IP
+ "mem_mode": (
+ "s",
+ False,
+ "internal_decoupled",
+ {"internal_embedded", "internal_decoupled"},
+ ),
# string defining memory type
"ram_style": ("s", False, "distributed", {"distributed", "block"}),
- # FINN DataTypes for inputs, outputs
- "inputDataType": ("s", True, ""),
- "weightDataType": ("s", True, ""),
- "outputDataType": ("s", True, ""),
- # number of input vectors, examples:
- # [1] is a single vector (like a FC layer with batch=1)
- # [4] is four vectors (like a FC layer with batch=4)
- # [1, 4, 4] is four * four vectors (like a conv layer with batch=1)
- "numInputVectors": ("ints", False, [1]),
- # initialization value for the thresholding accumulator
- "ActVal": ("i", False, 0),
- # memory mode for the thresholds
- # const -- embedded thresholds, default
- # decoupled -- streaming thresholds with streamer packaged inside IP
- "mem_mode": ("s", False, "const", {"const", "decoupled"}),
- # (mem_mode = decoupled only) whether weights (thresholds) will be
+ # (mem_mode = internal_decoupled only) whether weights (thresholds) will be
# writable through an AXI-lite interface during runtime
# 1 for enabled, 0 for disabled.
# see finn-rtllib/memstream/doc/README for more about the memory
@@ -97,60 +79,10 @@ def get_nodeattr_types(self):
# weight data from the weight FIFOs.
"runtime_writeable_weights": ("i", False, 0, {0, 1}),
}
- my_attrs.update(super().get_nodeattr_types())
+ my_attrs.update(Thresholding.get_nodeattr_types(self))
+ my_attrs.update(HLSBackend.get_nodeattr_types(self))
return my_attrs
- def calc_tmem(self):
- """Calculates and returns TMEM."""
- mh = self.get_nodeattr("NumChannels")
- pe = self.get_nodeattr("PE")
- return mh // pe
-
- def make_shape_compatible_op(self, model):
- oshape = self.get_normal_output_shape()
- return super().make_const_shape_op(oshape)
-
- def infer_node_datatype(self, model):
- node = self.onnx_node
- idt = model.get_tensor_datatype(node.input[0])
- if idt != self.get_input_datatype():
- warn_str = "inputDataType changing for %s: %s -> %s " % (
- node.name,
- str(self.get_input_datatype().name),
- str(idt.name),
- )
- warnings.warn(warn_str)
- self.set_nodeattr("inputDataType", idt.name)
- # set output datatype from property
- odt = self.get_output_datatype()
- model.set_tensor_datatype(node.output[0], odt)
-
- def verify_node(self):
- info_messages = []
- # verify that "backend" is set to "fpgadataflow"
- backend_value = self.get_nodeattr("backend")
- if backend_value == "fpgadataflow":
- info_messages.append("Attribute backend is set correctly")
- else:
- info_messages.append('Attribute backend should be set to "fpgadataflow"')
-
- # verify that all necessary attributes exist
- # TODO collect automatically from get_nodeattr_types
- try:
- self.get_nodeattr("code_gen_dir_cppsim")
- self.get_nodeattr("executable_path")
- self.get_nodeattr("NumChannels")
- self.get_nodeattr("PE")
- self.get_nodeattr("inputDataType")
- self.get_nodeattr("outputDataType")
- info_messages.append("All necessary attributes exist")
- except Exception:
- info_messages.append(
- """The required Threshold_Batch attributes do not exist."""
- )
-
- return info_messages
-
def bram_estimation(self):
"""Calculates BRAM cost if resource set to BRAM"""
style = self.get_nodeattr("ram_style")
@@ -182,53 +114,9 @@ def lut_estimation(self):
# total cost
return comparator_cost + lutram_cost
- def get_input_datatype(self, ind=0):
- """Returns FINN DataType of input."""
- return DataType[self.get_nodeattr("inputDataType")]
-
- def get_output_datatype(self, ind=0):
- """Returns FINN DataType of output."""
- return DataType[self.get_nodeattr("outputDataType")]
-
- def get_weight_datatype(self):
- """Returns FINN DataType of thresholds, here called weights."""
- return DataType[self.get_nodeattr("weightDataType")]
-
- def minimize_accumulator_width(self, model):
- "Minimize threshold width ('accumulator width' here due to convention)"
- thresholds = model.get_initializer(self.onnx_node.input[1])
- threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds)
- min_threshold = thresholds.min()
- max_threshold = thresholds.max()
- min_input = self.get_input_datatype().min()
- max_input = self.get_input_datatype().max()
- # get range required by threshold values
- tdt_min = min(min_input, min_threshold)
- tdt_max = max(max_input, max_threshold)
- if tdt_min < 0:
- if abs(tdt_min) > tdt_max:
- tdt = DataType.get_smallest_possible(tdt_min)
- else:
- tdt = DataType.get_smallest_possible(-tdt_max - 1)
- else:
- tdt = DataType.get_smallest_possible(tdt_max)
- assert np.vectorize(tdt.allowed)(
- threshold_tensor
- ).all(), "Thresholds can't be expressed with type %s" % str(tdt)
- self.set_nodeattr("weightDataType", tdt.name)
- return DataType[self.get_nodeattr("weightDataType")]
-
- def get_instream_width(self, ind=0):
- i_bits = self.get_input_datatype().bitwidth()
- return i_bits * self.get_nodeattr("PE")
-
- def get_outstream_width(self, ind=0):
- o_bits = self.get_output_datatype().bitwidth()
- return o_bits * self.get_nodeattr("PE")
-
def get_weightstream_width(self):
- """Returns weight stream width. Used only in decoupled mode."""
- if self.get_nodeattr("mem_mode") == "decoupled":
+ """Returns weight stream width. Used only in internal_decoupled mode."""
+ if self.get_nodeattr("mem_mode") == "internal_decoupled":
pe = self.get_nodeattr("PE")
wp = self.get_weight_datatype().bitwidth()
n_thres_steps = self.get_nodeattr("numSteps")
@@ -239,44 +127,16 @@ def get_weightstream_width(self):
def get_weightstream_width_padded(self):
"""Returns weight stream width padded to a multiple of 8. This is required
- by the AXI Stream spec. Used in decoupled mode."""
+ by the AXI Stream spec. Used in internal_decoupled mode."""
weight_width = self.get_weightstream_width()
return roundup_to_integer_multiple(weight_width, 8)
def get_ap_int_max_w(self):
- temp_value = super().get_ap_int_max_w()
- weightstream = self.get_weightstream_width()
- return max([weightstream, temp_value])
-
- def get_folded_input_shape(self, ind=0):
- ich = self.get_nodeattr("NumChannels")
- pe = self.get_nodeattr("PE")
- fold = ich // pe
- vecs = list(self.get_nodeattr("numInputVectors"))
- folded_input_shape = tuple(vecs + [fold, pe])
- return folded_input_shape
-
- def get_folded_output_shape(self, ind=0):
- # same shape as input
- return self.get_folded_input_shape()
-
- def get_normal_input_shape(self, ind=0):
- ich = self.get_nodeattr("NumChannels")
- vecs = list(self.get_nodeattr("numInputVectors"))
- normal_input_shape = tuple(vecs + [ich])
- return normal_input_shape
-
- def get_normal_output_shape(self, ind=0):
- # same shape as input
- return self.get_normal_input_shape()
-
- def get_number_output_values(self):
- nf = np.prod(self.get_folded_output_shape()[:-1])
- return nf
-
- def get_exp_cycles(self):
- # Channels/PE * batch size * fmdim * fmdim
- return np.prod(self.get_folded_output_shape()[:-1])
+ ap_int_max_w = HLSBackend.get_ap_int_max_w(self)
+ if self.get_nodeattr("mem_mode") == "internal_decoupled":
+ weightstream = self.get_weightstream_width()
+ ap_int_max_w = max([weightstream, ap_int_max_w])
+ return ap_int_max_w
def get_template_param_values(self):
"""Returns the template parameter values according to input, output and weight
@@ -291,63 +151,6 @@ def get_template_param_values(self):
return ret
- def get_hls_compatible_threshold_tensor(self, orig_thres_matrix):
- """Convert the original numpy weight matrix orig_weight_matrix into
- a form suitable for passing to the hlslib call:
- * ensure MH % PE == 0
- * for unsigned inputs, ensure thresholds are positive
- * interleave rows between PEs
- * reshape into (PE, TMEM, n_thres_steps) and return
- """
- mh = self.get_nodeattr("NumChannels")
- pe = self.get_nodeattr("PE")
- tmem = mh // pe
- assert mh % pe == 0, "Requirement NumChannels divisable by PE is violated."
- assert (
- orig_thres_matrix.ndim == 2
- ), """Threshold matrix dimension is
- not as expected (2)."""
- n_thres_steps = orig_thres_matrix.shape[1]
- assert n_thres_steps == self.get_nodeattr(
- "numSteps"
- ), "Mismatch in threshold steps"
- if not self.get_input_datatype().signed():
- # ensure all thresholds are nonnegative
- assert (orig_thres_matrix >= 0).all()
- # ensure all thresholds are integer
- assert np.equal(
- np.mod(orig_thres_matrix, 1), 0
- ).all(), "Need int threshold tensor"
- ret = orig_thres_matrix
- # workaround for vivado_hls threshold bug
- if ret[0][0] == 0 and n_thres_steps == 1:
- ret = np.copy(ret)
- ret[0][0] = 1
- warnings.warn(
- "Setting 0-valued first threshold to 1 to avoid vivado_hls bug"
- )
- # ensure channels = mh , duplicating if necessary
- if ret.shape[0] == 1:
- ret = np.tile(ret, (mh, 1))
- assert (
- ret.shape[0] == mh
- ), "Channels of threshold matrix are not as expected (mh)"
- # distribute rows between PEs
- ret = interleave_matrix_outer_dim_from_partitions(ret, pe)
- assert (
- ret.shape[0] == pe
- ), """First dimension after distribution of the
- rows between PEs is not as expected (pe)"""
- assert (
- ret.shape[1] == tmem
- ), """Second dimension after distribution of the
- rows between PEs is not as expected (tmem)"""
- assert (
- ret.shape[2] == n_thres_steps
- ), """Third dimension after distribution of the
- rows between PEs is not as expected (n_thres_steps)"""
- return ret.reshape(1, pe, tmem, n_thres_steps)
-
def make_weight_file(self, weights, weight_file_mode, weight_file_name):
"""Produce a file containing given weights (thresholds) in appropriate
format for this layer. This file can be used for either synthesis or
@@ -361,7 +164,7 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name):
* weight_file_name : filename for the weight file to be generated
"""
- threshold_tensor = self.get_hls_compatible_threshold_tensor(weights)
+ threshold_tensor = self.get_hw_compatible_threshold_tensor(weights)
tdt = self.get_weight_datatype()
assert np.vectorize(tdt.allowed)(
threshold_tensor
@@ -455,36 +258,18 @@ def generate_params(self, model, path):
code_gen_dir = path
thresholds = model.get_initializer(self.onnx_node.input[1])
mem_mode = self.get_nodeattr("mem_mode")
- if mem_mode == "const":
+ if mem_mode == "internal_embedded":
# save thresholds in thresh.h
weight_filename = "{}/thresh.h".format(code_gen_dir)
self.make_weight_file(thresholds, "hls_header", weight_filename)
- elif mem_mode == "decoupled":
- # save decoupled weights for cppsim
+ elif mem_mode == "internal_decoupled":
+ # save internal_decoupled weights for cppsim
weight_filename_sim = "{}/thresholds.npy".format(code_gen_dir)
self.make_weight_file(thresholds, "decoupled_npy", weight_filename_sim)
# also save weights as Verilog .dat file
- # note that we provide two different .dat files, one for synth
- # and one for synthesis. this is because URAM-based weights always
- # need zero weights for synthesis, otherwise they get inferred
- # as BRAM
- weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format(code_gen_dir)
- weight_filename_rtl_sim = "{}/memblock_sim_0.dat".format(code_gen_dir)
- # sim weights are always the true weights
- self.make_weight_file(
- thresholds, "decoupled_verilog_dat", weight_filename_rtl_sim
- )
- ram_style = self.get_nodeattr("ram_style")
- if ram_style == "ultra":
- # UltraRAM must have no memory initializer, or only zeroes
- # otherwise BRAM will be inferred instead of URAM
- # as a workaround we provide a zero-weight init here
- synth_thresholds = np.zeros_like(thresholds, dtype=np.float32)
- else:
- synth_thresholds = thresholds
- self.make_weight_file(
- synth_thresholds, "decoupled_verilog_dat", weight_filename_rtl_synth
- )
+ # This file will be ignored when synthesizing UltraScale memory.
+ weight_filename_rtl = "{}/memblock.dat".format(code_gen_dir)
+ self.make_weight_file(thresholds, "decoupled_verilog_dat", weight_filename_rtl)
else:
raise Exception("Unrecognized mem_mode")
@@ -545,18 +330,14 @@ def execute_node(self, context, graph):
out = 2 * out - 1
context[node.output[0]] = out
oshape = self.get_normal_output_shape()
- assert (
- context[node.output[0]].shape == oshape
- ), """Output shape is not as expected"""
+ assert context[node.output[0]].shape == oshape, """Output shape is not as expected"""
elif mode == "rtlsim":
sim = self.get_rtlsim()
nbits = self.get_instream_width()
- inp = npy_to_rtlsim_input(
- "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
- )
+ inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits)
super().reset_rtlsim(sim)
super().toggle_clk(sim)
- if self.get_nodeattr("mem_mode") == "decoupled":
+ if self.get_nodeattr("mem_mode") == "internal_decoupled":
wnbits = self.get_weightstream_width()
export_wdt = self.get_weight_datatype()
wei = npy_to_rtlsim_input(
@@ -569,7 +350,7 @@ def execute_node(self, context, graph):
}
self.rtlsim_multi_io(sim, io_dict)
output = io_dict["outputs"]["out"]
- elif self.get_nodeattr("mem_mode") == "const":
+ elif self.get_nodeattr("mem_mode") == "internal_embedded":
output = self.rtlsim(sim, inp)
else:
raise Exception("Unrecognized mem_mode")
@@ -578,9 +359,7 @@ def execute_node(self, context, graph):
packed_bits = self.get_outstream_width()
out_npy_path = "{}/output.npy".format(code_gen_dir)
out_shape = self.get_folded_output_shape()
- rtlsim_output_to_npy(
- output, out_npy_path, odt, out_shape, packed_bits, target_bits
- )
+ rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits)
# load and reshape output
output = np.load(out_npy_path)
@@ -597,7 +376,7 @@ def execute_node(self, context, graph):
def global_includes(self):
self.code_gen_dict["$GLOBALS$"] = ['#include "activations.hpp"']
- if self.get_nodeattr("mem_mode") == "const":
+ if self.get_nodeattr("mem_mode") == "internal_embedded":
self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"']
# TODO check and add whatever missing
@@ -615,13 +394,12 @@ def defines(self, var):
total_spatial_size,
)
]
- if self.get_nodeattr("mem_mode") == "decoupled":
+ if self.get_nodeattr("mem_mode") == "internal_decoupled":
self.code_gen_dict["$DEFINES$"].append(
"#define ActVal1 %d" % self.get_nodeattr("ActVal")
)
self.code_gen_dict["$DEFINES$"].append(
- "#define ThresType1 %s"
- % self.get_weight_datatype().get_hls_datatype_str()
+ "#define ThresType1 %s" % self.get_weight_datatype().get_hls_datatype_str()
)
self.code_gen_dict["$DEFINES$"].append(
"#define NumSteps1 %d" % self.get_nodeattr("numSteps")
@@ -639,11 +417,18 @@ def read_npy_data(self):
self.code_gen_dict["$READNPYDATA$"] = []
# note: the innermost dim is reversed for the input
self.code_gen_dict["$READNPYDATA$"].append(
- 'npy2apintstream<%s, %s, %d, %s>("%s", in0, false);'
- % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+ 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);'
+ % (
+ packed_hls_type,
+ elem_hls_type,
+ elem_bits,
+ npy_type,
+ npy_in,
+ self.hls_sname(),
+ )
)
mem_mode = self.get_nodeattr("mem_mode")
- if mem_mode == "decoupled":
+ if mem_mode == "internal_decoupled":
tdt = self.get_weight_datatype()
elem_bits = tdt.bitwidth()
packed_bits = self.get_weightstream_width()
@@ -653,49 +438,63 @@ def read_npy_data(self):
npy_in = "%s/thresholds.npy" % code_gen_dir
self.code_gen_dict["$READNPYDATA$"].append(
- 'npy2apintstream<%s, %s, %d, %s>("%s", weights, false, ImgDim1);'
- % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in)
+ 'npy2apintstream<%s, %s, %d, %s>("%s", weights_%s, false, ImgDim1);'
+ % (
+ packed_hls_type,
+ elem_hls_type,
+ elem_bits,
+ npy_type,
+ npy_in,
+ self.hls_sname(),
+ )
)
def strm_decl(self):
self.code_gen_dict["$STREAMDECLARATIONS$"] = []
self.code_gen_dict["$STREAMDECLARATIONS$"].append(
- 'hls::stream> in0 ("in0");'.format(self.get_instream_width())
+ 'hls::stream> in0_{} ("in0_{}");'.format(
+ self.get_instream_width(), self.hls_sname(), self.hls_sname()
+ )
)
self.code_gen_dict["$STREAMDECLARATIONS$"].append(
- 'hls::stream> out ("out");'.format(self.get_outstream_width())
+ 'hls::stream> out_{} ("out_{}");'.format(
+ self.get_outstream_width(), self.hls_sname(), self.hls_sname()
+ )
)
mem_mode = self.get_nodeattr("mem_mode")
- if mem_mode == "decoupled":
+ if mem_mode == "internal_decoupled":
self.code_gen_dict["$STREAMDECLARATIONS$"].append(
- 'hls::stream> weights ("weights");'.format(
- self.get_weightstream_width()
+ 'hls::stream> weights_{} ("weights_{}");'.format(
+ self.get_weightstream_width(), self.hls_sname(), self.hls_sname()
)
)
def docompute(self):
tmpl_args = self.get_template_param_values()
- node = self.onnx_node
mem_mode = self.get_nodeattr("mem_mode")
- if mem_mode == "const":
+ if mem_mode == "internal_embedded":
self.code_gen_dict["$DOCOMPUTE$"] = [
- """{}
- (in0, out, threshs, numReps);""".format(
- node.op_type,
+ """Thresholding_Batch
+ (in0_{}, out_{}, threshs, numReps);""".format(
tmpl_args["TSrcI"],
tmpl_args["TDstI"],
+ self.hls_sname(),
+ self.hls_sname(),
)
]
- elif mem_mode == "decoupled":
+ elif mem_mode == "internal_decoupled":
# note that numReps is set to 1 in the invocation below, since
# - for cppsim the repetition comes from the threshold stream reader+input
# - for synth the unit runs continuously anyway (ap_ctrl_none)
self.code_gen_dict["$DOCOMPUTE$"] = [
"""{}
- (in0, out, weights, numReps);""".format(
+ (in0_{}, out_{}, weights_{}, numReps);""".format(
"Thresholding_Stream_Batch",
tmpl_args["TSrcI"],
tmpl_args["TDstI"],
+ self.hls_sname(),
+ self.hls_sname(),
+ self.hls_sname(),
)
]
else:
@@ -718,41 +517,44 @@ def dataoutstrm(self):
# note: the innermost dim is not reversed for the output
self.code_gen_dict["$DATAOUTSTREAM$"] = [
- 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s", false);'
+ 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);'
% (
packed_hls_type,
elem_hls_type,
elem_bits,
npy_type,
+ self.hls_sname(),
shape_cpp_str,
npy_out,
)
]
- def save_as_npy(self):
- self.code_gen_dict["$SAVEASCNPY$"] = []
-
def blackboxfunction(self):
- if self.get_nodeattr("mem_mode") == "const":
+ if self.get_nodeattr("mem_mode") == "internal_embedded":
self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
- """void {}(hls::stream> &in0,
- hls::stream> &out
+ """void {}(hls::stream> &in0_{},
+ hls::stream> &out_{}
)""".format(
self.onnx_node.name,
self.get_instream_width(),
+ self.hls_sname(),
self.get_outstream_width(),
+ self.hls_sname(),
)
]
- elif self.get_nodeattr("mem_mode") == "decoupled":
+ elif self.get_nodeattr("mem_mode") == "internal_decoupled":
self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
- """void {}(hls::stream> &in0,
- hls::stream> &weights,
- hls::stream> &out
+ """void {}(hls::stream> &in0_{},
+ hls::stream> &weights_{},
+ hls::stream> &out_{}
)""".format(
self.onnx_node.name,
self.get_instream_width(),
+ self.hls_sname(),
self.get_weightstream_width(),
+ self.hls_sname(),
self.get_outstream_width(),
+ self.hls_sname(),
)
]
else:
@@ -760,30 +562,22 @@ def blackboxfunction(self):
def pragmas(self):
self.code_gen_dict["$PRAGMAS$"] = [
- "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname()
+ "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname()
]
self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname()
- )
- self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE ap_ctrl_none port=return"
+ "#pragma HLS INTERFACE axis port=out_" + self.hls_sname()
)
+ self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return")
- if self.get_nodeattr("mem_mode") == "const":
+ if self.get_nodeattr("mem_mode") == "internal_embedded":
# the threshold tensor is acc_type [PE][TMEM][N_THRES]
# partition for parallel access along PE and N_THRES
# dimensions (dims 1 and 3)
self.code_gen_dict["$PRAGMAS$"].append(
- (
- "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds "
- "complete dim=1"
- )
+ ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=1")
)
self.code_gen_dict["$PRAGMAS$"].append(
- (
- "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds "
- "complete dim=3"
- )
+ ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3")
)
# set resource type
ram_style = self.get_nodeattr("ram_style")
@@ -794,17 +588,11 @@ def pragmas(self):
if pe < ich:
if ram_style == "distributed":
self.code_gen_dict["$PRAGMAS$"].append(
- (
- "#pragma HLS RESOURCE variable=threshs.m_thresholds "
- "core=ROM_2P_LUTRAM"
- )
+ ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_LUTRAM")
)
elif ram_style == "block":
self.code_gen_dict["$PRAGMAS$"].append(
- (
- "#pragma HLS RESOURCE variable=threshs.m_thresholds "
- "core=ROM_2P_BRAM"
- )
+ ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_BRAM")
)
else:
raise Exception(
@@ -813,17 +601,16 @@ def pragmas(self):
ram_style
)
)
- elif self.get_nodeattr("mem_mode") == "decoupled":
+ elif self.get_nodeattr("mem_mode") == "internal_decoupled":
self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE axis port=weights name=weights_"
- + self.hls_sname()
+ "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname()
)
def code_generation_ipi(self):
cmd = []
# add streamer if needed
mem_mode = self.get_nodeattr("mem_mode")
- if mem_mode == "decoupled":
+ if mem_mode == "internal_decoupled":
node_name = self.onnx_node.name
runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1
sname = self.hls_sname()
@@ -837,8 +624,7 @@ def code_generation_ipi(self):
cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name))
cmd.append(
"create_bd_intf_pin -mode Master "
- "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s"
- % (node_name, dout_name)
+ "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, dout_name)
)
cmd.append(
"create_bd_intf_pin -mode Slave "
@@ -850,30 +636,23 @@ def code_generation_ipi(self):
% (self.get_nodeattr("ip_vlnv"), node_name, node_name)
)
# instantiate a streamer and connect it to the HLS IP
- strm_vlnv = "xilinx.com:user:memstream:1.0"
+ strm_vlnv = "amd.com:finn:memstream:1.0"
strm_inst = node_name + "_wstrm"
cmd.append(
- "create_bd_cell -type ip -vlnv %s /%s/%s"
- % (strm_vlnv, node_name, strm_inst)
+ "create_bd_cell -type ip -vlnv %s /%s/%s" % (strm_vlnv, node_name, strm_inst)
)
cmd.append(
"set_property -dict [list "
- "CONFIG.NSTREAMS {1} "
- "CONFIG.MEM_DEPTH {%d} "
- "CONFIG.MEM_WIDTH {%d} "
- "CONFIG.MEM_INIT {%s} "
+ "CONFIG.DEPTH {%d} "
+ "CONFIG.WIDTH {%d} "
+ "CONFIG.INIT_FILE {%s} "
"CONFIG.RAM_STYLE {%s} "
- "CONFIG.STRM0_DEPTH {%d} "
- "CONFIG.STRM0_WIDTH {%d} "
- "CONFIG.STRM0_OFFSET {0} "
"] [get_bd_cells /%s/%s]"
% (
self.calc_tmem(),
self.get_weightstream_width_padded(),
- self.get_nodeattr("code_gen_dir_ipgen") + "/",
+ self.get_nodeattr("code_gen_dir_ipgen") + "/memblock.dat",
self.get_nodeattr("ram_style"),
- self.calc_tmem(),
- self.get_weightstream_width_padded(),
node_name,
strm_inst,
)
@@ -884,11 +663,11 @@ def code_generation_ipi(self):
% (node_name, strm_inst, node_name, node_name, sname)
)
cmd.append(
- "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aresetn]"
+ "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_rst_n]"
% (node_name, rst_name, node_name, strm_inst)
)
cmd.append(
- "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aclk]"
+ "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_clk]"
% (node_name, clk_name, node_name, strm_inst)
)
cmd.append(
@@ -914,8 +693,7 @@ def code_generation_ipi(self):
axilite_name = self.get_verilog_top_module_intf_names()["axilite"][0]
cmd.append(
"create_bd_intf_pin -mode Slave "
- "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s"
- % (node_name, axilite_name)
+ "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s" % (node_name, axilite_name)
)
cmd.append(
"connect_bd_intf_net [get_bd_intf_pins %s/%s] "
@@ -925,8 +703,8 @@ def code_generation_ipi(self):
# TODO calculate and pass in segment size here
cmd.append("assign_bd_address")
cmd.append("save_bd_design")
- elif mem_mode == "const":
- # base class impl sufficient for const mode
+ elif mem_mode == "internal_embedded":
+ # base class impl sufficient for internal_embedded mode
return super().code_generation_ipi()
else:
raise Exception("Unrecognized mem_mode for Thresholding_Batch")
@@ -935,7 +713,7 @@ def code_generation_ipi(self):
def get_verilog_top_module_intf_names(self):
intf_names = super().get_verilog_top_module_intf_names()
mem_mode = self.get_nodeattr("mem_mode")
- if mem_mode == "decoupled":
+ if mem_mode == "internal_decoupled":
# only expose axilite interface if attribute is set
runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1
if runtime_writable:
@@ -967,10 +745,8 @@ def derive_characteristic_fxns(self, period):
"outputs": {"out": []},
}
mem_mode = self.get_nodeattr("mem_mode")
- if mem_mode in ["decoupled", "external"]:
+ if mem_mode in ["internal_decoupled", "external"]:
n_weight_inps = self.calc_tmem()
num_w_reps = np.prod(self.get_nodeattr("numInputVectors"))
- io_dict["inputs"]["weights"] = [
- 0 for i in range(num_w_reps * n_weight_inps)
- ]
+ io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)]
super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict)
diff --git a/src/finn/custom_op/fpgadataflow/tlastmarker.py b/src/finn/custom_op/fpgadataflow/hls/tlastmarker_hls.py
similarity index 81%
rename from src/finn/custom_op/fpgadataflow/tlastmarker.py
rename to src/finn/custom_op/fpgadataflow/hls/tlastmarker_hls.py
index 1bd32442a1..2e908016e7 100644
--- a/src/finn/custom_op/fpgadataflow/tlastmarker.py
+++ b/src/finn/custom_op/fpgadataflow/hls/tlastmarker_hls.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2020, Xilinx
+# Copyright (c) 2020-2022, Xilinx, Inc.
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
@@ -26,10 +27,11 @@
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
-class TLastMarker(HLSCustomOp):
+class TLastMarker_hls(HWCustomOp, HLSBackend):
"""Node that adds/removes AXI stream TLAST signals where needed. Its behavior
is transparent in node-by-node execution, only visible in IP-stitched rtlsim or
actual hardware.
@@ -37,8 +39,8 @@ class TLastMarker(HLSCustomOp):
(needed by the FINN PYNQ shell) or at the beginning to remove the end-of-burst
from DMA read."""
- def __init__(self, onnx_node):
- super().__init__(onnx_node)
+ def __init__(self, onnx_node, **kwargs):
+ super().__init__(onnx_node, **kwargs)
def get_nodeattr_types(self):
my_attrs = {
@@ -56,7 +58,8 @@ def get_nodeattr_types(self):
# Vitis docs recommend using qdma_axis for external, ap_axiu for internal
"Protocol": ("s", False, "external", {"external", "internal"}),
}
- my_attrs.update(super().get_nodeattr_types())
+ my_attrs.update(HWCustomOp.get_nodeattr_types(self))
+ my_attrs.update(HLSBackend.get_nodeattr_types(self))
return my_attrs
def execute_node(self, context, graph):
@@ -130,9 +133,9 @@ def docompute(self):
self.code_gen_dict["$DOCOMPUTE$"] = [
"for(unsigned int i=0; i &in0,
- hls::stream &out, unsigned int numIters)"""
- % self.onnx_node.name
+ """void %s(hls::stream &in0_%s,
+ hls::stream &out_%s, unsigned int numIters)"""
+ % (self.onnx_node.name, self.hls_sname(), self.hls_sname())
]
else:
self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
- """void %s(hls::stream &in0, hls::stream &out)"""
- % self.onnx_node.name
+ """void %s(hls::stream &in0_%s,
+ hls::stream &out_%s)"""
+ % (self.onnx_node.name, self.hls_sname(), self.hls_sname())
]
def pragmas(self):
self.code_gen_dict["$PRAGMAS$"] = [
- "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname()
+ "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname()
]
self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname()
+ "#pragma HLS INTERFACE axis port=out_" + self.hls_sname()
)
dyn_iters = self.get_nodeattr("DynIters")
@@ -211,9 +218,7 @@ def pragmas(self):
"#pragma HLS INTERFACE s_axilite port=numIters bundle=control"
)
- self.code_gen_dict["$PRAGMAS$"].append(
- "#pragma HLS INTERFACE ap_ctrl_none port=return"
- )
+ self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return")
def get_number_output_values(self):
return self.get_nodeattr("NumIters")
@@ -239,10 +244,10 @@ def get_outstream_width(self, ind=0):
def strm_decl(self):
self.code_gen_dict["$STREAMDECLARATIONS$"] = []
self.code_gen_dict["$STREAMDECLARATIONS$"].append(
- 'hls::stream in0 ("in0");'
+ 'hls::stream in0_%s ("in0_%s");' % (self.hls_sname(), self.hls_sname())
)
self.code_gen_dict["$STREAMDECLARATIONS$"].append(
- 'hls::stream out ("out");'
+ 'hls::stream out_%s ("out_%s");' % (self.hls_sname(), self.hls_sname())
)
def get_verilog_top_module_intf_names(self):
diff --git a/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py b/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py
new file mode 100644
index 0000000000..05d26eddb2
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py
@@ -0,0 +1,175 @@
+# Copyright (C) 2023, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+
+class UpsampleNearestNeighbour_hls(UpsampleNearestNeighbour, HLSBackend):
+ """
+ Corresponds to finn-hlslib UpsampleNearestNeighbour_Batch function.
+ Upsampling is done with the Nearest Neighbour algorithm.
+ The layer expects square feature maps for the in and output.
+ """
+
+ def __init__(self, onnx_node, **kwargs):
+ super().__init__(onnx_node, **kwargs)
+
+ def get_nodeattr_types(self):
+ my_attrs = {}
+ my_attrs.update(UpsampleNearestNeighbour.get_nodeattr_types(self))
+ my_attrs.update(HLSBackend.get_nodeattr_types(self))
+ return my_attrs
+
+ def verify_node(self):
+ pass
+
+ def global_includes(self):
+ self.code_gen_dict["$GLOBALS$"] = ['#include "upsample.hpp"']
+
+ def defines(self, var):
+ self.code_gen_dict["$DEFINES$"] = []
+
+ ifm_ch = self.get_nodeattr("NumChannels")
+ self.code_gen_dict["$DEFINES$"] += ["#define IFMChannels {}".format(ifm_ch)]
+
+ ibits = self.get_input_datatype().bitwidth()
+ self.code_gen_dict["$DEFINES$"] += ["#define Input_precision {}".format(ibits)]
+
+ idim = self.get_nodeattr("IFMDim")
+ self.code_gen_dict["$DEFINES$"] += ["#define IFMDim {}".format(idim)]
+
+ odim = self.get_nodeattr("OFMDim")
+ self.code_gen_dict["$DEFINES$"] += ["#define OFMDim {}".format(odim)]
+
+ batch_size = self.get_nodeattr("numInputVectors")
+ self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(batch_size)]
+
+ def docompute(self):
+ is_2d = self.get_nodeattr("DimMode") == 0
+ batch = self.get_nodeattr("numInputVectors")
+ if is_2d:
+ self.code_gen_dict["$DOCOMPUTE$"] = [
+ """UpsampleNearestNeighbour_Batch > (in0_%s, out_%s, numReps);"""
+ % (self.hls_sname(), self.hls_sname())
+ ]
+ else:
+ assert batch == 1, "1D upsampler currently needs numReps=1"
+ self.code_gen_dict["$DOCOMPUTE$"] = [
+ """UpsampleNearestNeighbour_1D > (in0_%s, out_%s);"""
+ % (self.hls_sname(), self.hls_sname())
+ ]
+
+ def blackboxfunction(self):
+ packed_bits = self.get_instream_width()
+ packed_hls_type = "ap_uint<%d>" % packed_bits
+ self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+ "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)"
+ % (
+ self.onnx_node.name,
+ packed_hls_type,
+ self.hls_sname(),
+ packed_hls_type,
+ self.hls_sname(),
+ )
+ ]
+
+ def execute_node(self, context, graph):
+ mode = self.get_nodeattr("exec_mode")
+ node = self.onnx_node
+ exp_ishape = self.get_normal_input_shape()
+ exp_oshape = self.get_normal_output_shape()
+ folded_oshape = self.get_folded_output_shape()
+
+ if mode == "cppsim":
+ code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+ elif mode == "rtlsim":
+ code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+ else:
+ raise Exception(
+ """Invalid value for attribute exec_mode! Is currently set to: {}
+ has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+ mode
+ )
+ )
+
+ inp = context[node.input[0]]
+ assert str(inp.dtype) == "float32", "Input datatype is not float32"
+ assert (
+ inp.shape == exp_ishape
+ ), """Input shape doesn't
+ match expected shape (numInputVectors, ImgDim, ImgDim, NumChannels)."""
+ export_idt = self.get_input_datatype()
+ self.dynamic_input_to_npy(context, 1, target_dir=code_gen_dir)
+
+ if mode == "cppsim":
+ # execute the precompiled model
+ super().exec_precompiled_singlenode_model()
+ # load output npy file
+ super().npy_to_dynamic_output(context)
+ assert (
+ context[node.output[0]].shape == folded_oshape
+ ), "cppsim did not produce expected folded output shape"
+ context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape)
+ elif mode == "rtlsim":
+ sim = self.get_rtlsim()
+ nbits = self.get_instream_width()
+ rtlsim_inp = npy_to_rtlsim_input(
+ "{}/input_0.npy".format(code_gen_dir), export_idt, nbits
+ )
+ super().reset_rtlsim(sim)
+ super().toggle_clk(sim)
+ rtlsim_output = self.rtlsim(sim, rtlsim_inp)
+ odt = export_idt
+ target_bits = odt.bitwidth()
+ packed_bits = self.get_outstream_width()
+ out_npy_path = "{}/output.npy".format(code_gen_dir)
+ out_shape = self.get_folded_output_shape()
+ rtlsim_output_to_npy(
+ rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits
+ )
+ # load and reshape output
+ output = np.load(out_npy_path)
+ output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape)
+ context[node.output[0]] = output
+ else:
+ raise Exception(
+ """Invalid value for attribute exec_mode! Is currently set to: {}
+ has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+ mode
+ )
+ )
+ assert (
+ context[node.output[0]].shape == exp_oshape
+ ), """Output shape doesn't match expected shape
+ (1, OutputDim, OutputDim, NumChannels)."""
diff --git a/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py
new file mode 100644
index 0000000000..3e10b640c5
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py
@@ -0,0 +1,541 @@
+# Copyright (C) 2024, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import math
+import numpy as np
+import os
+from qonnx.core.datatype import DataType
+
+from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend
+from finn.custom_op.fpgadataflow.vectorvectoractivation import VVAU
+from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy
+
+
+class VVAU_hls(VVAU, HLSBackend):
+ """Corresponds to finn-hlslib Vector_Vector_Activate_Batch function"""
+
+ def __init__(self, onnx_node, **kwargs):
+ super().__init__(onnx_node, **kwargs)
+
+ def get_nodeattr_types(self):
+ my_attrs = {}
+ my_attrs.update(VVAU.get_nodeattr_types(self))
+ my_attrs.update(HLSBackend.get_nodeattr_types(self))
+ return my_attrs
+
+ def lut_estimation(self):
+ """Calculates resource estimations for LUTs based on:
+ - FINN-R: An End-to-End Deep-Learning Framework for Fast
+ Exploration of Quantized Neural Networks
+ - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien,
+ Y. Umuroglu, M. Leeser and K. Vissers
+ - 12. Sep 2018
+ """
+ # TODO add in/out FIFO contributions
+ P = self.get_nodeattr("PE")
+ Q = self.get_nodeattr("SIMD")
+ wdt = self.get_weight_datatype()
+ W = wdt.bitwidth()
+ # determine tdt with input and weight data types
+ idt = self.get_input_datatype()
+ A = idt.bitwidth()
+ # parameters from experiments in paper mentioned above
+ c0 = 300
+ c1 = 1.1
+ c2 = 0
+ mmode = self.get_nodeattr("mem_mode")
+ mstyle = self.get_nodeattr("ram_style")
+ if (mmode == "internal_decoupled" and mstyle == "distributed") or (
+ mmode == "internal_embedded" and self.calc_wmem() <= 128
+ ):
+ c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64)
+
+ # multiplication
+ res_type = self.get_nodeattr("resType")
+ if res_type == "dsp":
+ mult_luts = 0
+ else:
+ mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A)
+ # adder tree
+ addertree_luts = (W + A) * (2 * Q - 1)
+ # accumulator
+ acc_datatype = self.get_accumulator_datatype()
+ acc_bits = acc_datatype.bitwidth()
+ k_h, k_w = self.get_nodeattr("Kernel")
+ # if accDataType is not set, then it will default to INT32, which would
+ # be a large overestimate in most (if not all) cases. In this scenario,
+ # we would use the minimum accumulator as determined by the data types
+ # bound, derived in https://arxiv.org/abs/2301.13376
+ alpha = math.log(k_h * k_w, 2) + W + A - 1 - int(idt.signed())
+ acc_bits = min(
+ acc_datatype.bitwidth(),
+ np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1),
+ )
+ acc_luts = acc_bits
+ # thresholds and threshold comparators
+ thr_luts = 0
+ comp_luts = 0
+ noact = self.get_nodeattr("noActivation")
+ # TODO - add 'ram_style_threshold' node attribute
+ if noact == 0:
+ odt = self.get_output_datatype()
+ B = odt.bitwidth()
+ thr_luts = (2**B - 1) * acc_bits * self.calc_tmem() / 64
+ comp_luts = (2**B - 1) * acc_bits
+
+ return int(
+ c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2
+ )
+
+ def dsp_estimation(self):
+ # multiplication
+ P = self.get_nodeattr("PE")
+ res_type = self.get_nodeattr("resType")
+ wdt = self.get_weight_datatype()
+ W = wdt.bitwidth()
+ idt = self.get_input_datatype()
+ A = idt.bitwidth()
+ if res_type == "dsp":
+ mult_dsp = P * np.ceil((W + A) / 48) # TODO: more accurate modelling
+ else:
+ mult_dsp = 0
+ return int(mult_dsp)
+
+ def execute_node(self, context, graph):
+ mode = self.get_nodeattr("exec_mode")
+ mem_mode = self.get_nodeattr("mem_mode")
+ node = self.onnx_node
+
+ # TODO ensure codegen dir exists
+ if mode == "cppsim":
+ code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+ elif mode == "rtlsim":
+ code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+ else:
+ raise Exception(
+ """Invalid value for attribute exec_mode! Is currently set to: {}
+ has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+ mode
+ )
+ )
+
+ # create a npy file fore each input of the node (in_ind is input index)
+ in_ind = 0
+ for inputs in node.input:
+ # it is assumed that the first input of the node is the data input
+ # the second input are the weights
+ # the third input are the thresholds
+ if in_ind == 0:
+ assert (
+ str(context[inputs].dtype) == "float32"
+ ), """Input datatype is
+ not float32 as expected."""
+ expected_inp_shape = self.get_folded_input_shape()
+ reshaped_input = context[inputs].reshape(expected_inp_shape)
+ if self.get_input_datatype() == DataType["BIPOLAR"]:
+ # store bipolar activations as binary
+ reshaped_input = (reshaped_input + 1) / 2
+ export_idt = DataType["BINARY"]
+ else:
+ export_idt = self.get_input_datatype()
+ # make copy before saving the array
+ reshaped_input = reshaped_input.copy()
+ np.save(
+ os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)),
+ reshaped_input,
+ )
+ elif in_ind > 2:
+ raise Exception("Unexpected input found for VectorVectorActivation")
+ in_ind += 1
+
+ if mode == "cppsim":
+ # execute the precompiled model
+ super().exec_precompiled_singlenode_model()
+ # load output npy file
+ super().npy_to_dynamic_output(context)
+ # reinterpret binary output as bipolar where needed
+ if self.get_output_datatype() == DataType["BIPOLAR"]:
+ out = context[node.output[0]]
+ out = 2 * out - 1
+ context[node.output[0]] = out
+ assert (
+ context[node.output[0]].shape == self.get_normal_output_shape()
+ ), "cppsim did not produce expected output shape"
+ elif mode == "rtlsim":
+ sim = self.get_rtlsim()
+ nbits = self.get_instream_width()
+ inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits)
+ super().reset_rtlsim(sim)
+ super().toggle_clk(sim)
+
+ if mem_mode == "external" or mem_mode == "internal_decoupled":
+ wnbits = self.get_weightstream_width()
+ export_wdt = self.get_weight_datatype()
+ # we have converted bipolar weights to binary for export,
+ # so use it as such for weight generation
+ if self.get_weight_datatype() == DataType["BIPOLAR"]:
+ export_wdt = DataType["BINARY"]
+ wei = npy_to_rtlsim_input("{}/weights.npy".format(code_gen_dir), export_wdt, wnbits)
+ dim_h, dim_w = self.get_nodeattr("Dim")
+ num_w_reps = dim_h * dim_w
+
+ io_dict = {
+ "inputs": {"in0": inp, "weights": wei * num_w_reps},
+ "outputs": {"out": []},
+ }
+ self.rtlsim_multi_io(sim, io_dict)
+ output = io_dict["outputs"]["out"]
+ else:
+ output = self.rtlsim(sim, inp)
+ odt = self.get_output_datatype()
+ target_bits = odt.bitwidth()
+ packed_bits = self.get_outstream_width()
+ out_npy_path = "{}/output.npy".format(code_gen_dir)
+ out_shape = self.get_folded_output_shape()
+ rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits)
+
+ # load and reshape output
+ output = np.load(out_npy_path)
+ oshape = self.get_normal_output_shape()
+ output = np.asarray([output], dtype=np.float32).reshape(*oshape)
+ context[node.output[0]] = output
+ else:
+ raise Exception(
+ """Invalid value for attribute exec_mode! Is currently set to: {}
+ has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+ mode
+ )
+ )
+
+ def get_template_param_values(self):
+ """Returns the template parameter values according to input, output and weight
+ data types."""
+ ret = dict()
+ inp_hls_str = self.get_input_datatype().get_hls_datatype_str()
+ out_hls_str = self.get_output_datatype().get_hls_datatype_str()
+ inp_is_binary = self.get_input_datatype() == DataType["BINARY"]
+ # out_is_binary = self.get_output_datatype() == DataType["BINARY"]
+ wt_is_binary = self.get_weight_datatype() == DataType["BINARY"]
+ bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1
+ if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode):
+ raise Exception("True binary (non-bipolar) inputs not yet supported")
+ inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"]
+ # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"]
+ wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"]
+ # reinterpret inp/wt as bipolar if bin_xnor_mode is iset
+ inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode)
+ wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode)
+ # fill in TSrcI and TWeightI
+ # TODO check these with Giulio
+ # TODO handle non-bipolar binary inputs
+ if inp_is_bipolar and wt_is_bipolar:
+ ret["TSrcI"] = "Recast"
+ ret["TWeightI"] = "Identity"
+ elif (not inp_is_bipolar) and wt_is_bipolar:
+ ret["TSrcI"] = "Slice<%s>" % inp_hls_str
+ ret["TWeightI"] = "Recast"
+ elif inp_is_bipolar and (not wt_is_bipolar):
+ ret["TSrcI"] = "Recast"
+ ret["TWeightI"] = "Identity"
+ elif (not inp_is_bipolar) and (not wt_is_bipolar):
+ ret["TSrcI"] = "Slice<%s>" % inp_hls_str
+ ret["TWeightI"] = "Identity"
+
+ # fill in TDstI
+ ret["TDstI"] = "Slice<%s>" % out_hls_str
+
+ return ret
+
+ def global_includes(self):
+ self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"']
+ self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"']
+ mem_mode = self.get_nodeattr("mem_mode")
+ if mem_mode not in ["internal_embedded", "internal_decoupled", "external"]:
+ raise Exception(
+ """Please set mem_mode to "internal_embedded", "internal_decoupled", or "external",
+ currently no other parameter value is supported!"""
+ )
+ if self.calc_tmem() != 0:
+ self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"']
+
+ def defines(self, var):
+ dim_h, dim_w = self.get_nodeattr("Dim")
+ numReps = 1 * dim_h * dim_w
+ k_h, k_w = self.get_nodeattr("Kernel")
+ innerProdDim = k_h * k_w
+ mem_mode = self.get_nodeattr("mem_mode")
+
+ self.code_gen_dict["$DEFINES$"] = [
+ """#define Channels1 {}\n #define InnerProdDim {}\n
+ #define SIMD1 {}\n #define PE1 {}\n #define numReps {}""".format(
+ self.get_nodeattr("Channels"),
+ innerProdDim,
+ self.get_nodeattr("SIMD"),
+ self.get_nodeattr("PE"),
+ numReps,
+ )
+ ]
+ if mem_mode == "internal_decoupled" or mem_mode == "external":
+ wdt = self.get_weight_datatype()
+ self.code_gen_dict["$DEFINES$"].append("#define WP1 {}\n".format(wdt.bitwidth()))
+
+    def read_npy_data(self):
+        """Fill ``$READNPYDATA$`` with the C++ statements that load cppsim data.
+
+        One statement reads ``input_0.npy`` into the input stream. When mem_mode
+        is "internal_decoupled" or "external", a second statement reads
+        ``weights.npy`` into the weight stream and passes numReps as extra argument.
+        """
+        sim_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        act_dt = self.get_input_datatype()
+        if act_dt == DataType["BIPOLAR"]:
+            # use binary for bipolar storage
+            act_dt = DataType["BINARY"]
+        # note: the innermost dim is reversed for the input
+        read_stmts = [
+            'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);'
+            % (
+                "ap_uint<%d>" % self.get_instream_width(),
+                act_dt.get_hls_datatype_str(),
+                act_dt.bitwidth(),
+                "float",
+                "%s/input_0.npy" % sim_dir,
+                self.hls_sname(),
+            )
+        ]
+        self.code_gen_dict["$READNPYDATA$"] = read_stmts
+
+        if self.get_nodeattr("mem_mode") in ("internal_decoupled", "external"):
+            weight_dt = self.get_weight_datatype()
+            read_stmts.append(
+                'npy2apintstream<%s, %s, %d, %s>("%s", weights_%s, false, numReps);'
+                % (
+                    "ap_uint<%d>" % self.get_weightstream_width(),
+                    weight_dt.get_hls_datatype_str(),
+                    weight_dt.bitwidth(),
+                    "float",
+                    "%s/weights.npy" % sim_dir,
+                    self.hls_sname(),
+                )
+            )
+
+    def strm_decl(self):
+        """Fill ``$STREAMDECLARATIONS$`` with the hls::stream declarations.
+
+        Declares the input and output streams; a weight stream is added when
+        mem_mode is "internal_decoupled" or "external". Each stream carries
+        ap_uint words of the corresponding stream width.
+        """
+        mem_mode = self.get_nodeattr("mem_mode")
+        sname = self.hls_sname()
+        # The element type ``<ap_uint<{}>>`` was missing from these format strings,
+        # which left one placeholder too few and put the width into the name slot.
+        decl_template = 'hls::stream<ap_uint<{}>> {}_{} ("{}_{}");'
+        declarations = [
+            decl_template.format(self.get_instream_width(), "in0", sname, "in0", sname),
+            decl_template.format(self.get_outstream_width(), "out", sname, "out", sname),
+        ]
+        self.code_gen_dict["$STREAMDECLARATIONS$"] = declarations
+        if mem_mode == "internal_decoupled" or mem_mode == "external":
+            declarations.append(
+                decl_template.format(
+                    self.get_weightstream_width(), "weights", sname, "weights", sname
+                )
+            )
+
+    def docompute(self):
+        """Fill ``$DOCOMPUTE$`` with the finn-hlslib call for this node.
+
+        "internal_embedded" emits Vector_Vector_Activate_Batch reading the
+        embedded ``weights`` object; "internal_decoupled" and "external" emit
+        Vector_Vector_Activate_Stream_Batch reading the weight stream.
+
+        Raises:
+            Exception: if mem_mode has any other value.
+        """
+        mem_mode = self.get_nodeattr("mem_mode")
+        map_to_hls_mult_style = {
+            "auto": "ap_resource_dflt()",
+            "lut": "ap_resource_lut()",
+            "dsp": "ap_resource_dsp()",
+        }
+        tmpl_args = self.get_template_param_values()
+        if self.calc_tmem() == 0:
+            # no threshold memory: pass the accumulator through as the output type
+            odtype_hls_str = self.get_output_datatype().get_hls_datatype_str()
+            threshs = "PassThroughActivation<%s>()" % odtype_hls_str
+        else:
+            threshs = "threshs"
+
+        # NOTE(review): the C++ template argument lists were missing from both
+        # format strings, leaving fewer placeholders than arguments so that the
+        # values landed in the wrong slots. They are restored here following the
+        # finn-hlslib VVAU signatures - confirm against finn-hlslib.
+        if mem_mode == "internal_embedded":
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """Vector_Vector_Activate_Batch<Channels1, InnerProdDim, SIMD1, PE1, 1, {}, {}, {}>
+                (in0_{}, out_{}, weights, {}, numReps, {});""".format(
+                    tmpl_args["TSrcI"],
+                    tmpl_args["TDstI"],
+                    tmpl_args["TWeightI"],
+                    self.hls_sname(),
+                    self.hls_sname(),
+                    threshs,
+                    map_to_hls_mult_style[self.get_nodeattr("resType")],
+                )
+            ]
+        elif mem_mode == "internal_decoupled" or mem_mode == "external":
+            wdt = self.get_weight_datatype()
+            # bipolar weights are exported as binary
+            if wdt == DataType["BIPOLAR"]:
+                export_wdt = DataType["BINARY"]
+            else:
+                export_wdt = wdt
+            wdtype_hls_str = export_wdt.get_hls_datatype_str()
+            self.code_gen_dict["$DOCOMPUTE$"] = [
+                """{}<Channels1, InnerProdDim, SIMD1, PE1, 1, {}, {}, {}, {}>
+                (in0_{}, out_{}, weights_{}, {}, numReps, {});""".format(
+                    "Vector_Vector_Activate_Stream_Batch",
+                    tmpl_args["TSrcI"],
+                    tmpl_args["TDstI"],
+                    tmpl_args["TWeightI"],
+                    wdtype_hls_str,
+                    self.hls_sname(),
+                    self.hls_sname(),
+                    self.hls_sname(),
+                    threshs,
+                    map_to_hls_mult_style[self.get_nodeattr("resType")],
+                )
+            ]
+        else:
+            raise Exception(
+                """Please set mem_mode to "internal_embedded", "internal_decoupled", or "external",
+                currently no other parameter value is supported!"""
+            )
+
+    def dataoutstrm(self):
+        """Fill ``$DATAOUTSTREAM$`` with the statement that writes the output
+        stream to ``output.npy`` in the cppsim code generation directory."""
+        sim_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        out_dt = self.get_output_datatype()
+        if out_dt == DataType["BIPOLAR"]:
+            # use binary for bipolar storage
+            out_dt = DataType["BINARY"]
+        # render the folded output shape with braces instead of parentheses
+        folded_shape = self.get_folded_output_shape()
+        shape_in_braces = str(folded_shape).translate(str.maketrans("()", "{}"))
+
+        # note: the innermost dim is not reversed for the output
+        write_stmt = 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);' % (
+            "ap_uint<%d>" % self.get_outstream_width(),
+            out_dt.get_hls_datatype_str(),
+            out_dt.bitwidth(),
+            "float",
+            self.hls_sname(),
+            shape_in_braces,
+            "%s/output.npy" % sim_dir,
+        )
+        self.code_gen_dict["$DATAOUTSTREAM$"] = [write_stmt]
+
+    def save_as_npy(self):
+        """Set ``$SAVEASCNPY$`` to an empty list, so no code is emitted for it."""
+        self.code_gen_dict["$SAVEASCNPY$"] = list()
+
+    def blackboxfunction(self):
+        """Fill ``$BLACKBOXFUNCTION$`` with the signature of the top-level function.
+
+        The function is named after the ONNX node and takes the input and output
+        streams; with mem_mode "internal_decoupled" or "external" it also takes
+        the weight stream.
+
+        Raises:
+            Exception: if mem_mode has any other value.
+        """
+        mem_mode = self.get_nodeattr("mem_mode")
+        # The stream element type ``<ap_uint<{}>>`` was missing from the format
+        # strings below, leaving fewer placeholders than arguments.
+        if mem_mode == "internal_embedded":
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                """void {}(hls::stream<ap_uint<{}>> &in0_{},
+                hls::stream<ap_uint<{}>> &out_{}
+                )""".format(
+                    self.onnx_node.name,
+                    self.get_instream_width(),
+                    self.hls_sname(),
+                    self.get_outstream_width(),
+                    self.hls_sname(),
+                )
+            ]
+        elif mem_mode == "internal_decoupled" or mem_mode == "external":
+            self.code_gen_dict["$BLACKBOXFUNCTION$"] = [
+                """void {}(
+                    hls::stream<ap_uint<{}>> &in0_{},
+                    hls::stream<ap_uint<{}>> &weights_{},
+                    hls::stream<ap_uint<{}>> &out_{}
+                    )""".format(
+                    self.onnx_node.name,
+                    self.get_instream_width(),
+                    self.hls_sname(),
+                    self.get_weightstream_width(),
+                    self.hls_sname(),
+                    self.get_outstream_width(),
+                    self.hls_sname(),
+                )
+            ]
+        else:
+            # message lists all three accepted values, like the sibling methods
+            raise Exception(
+                """Please set mem_mode to "internal_embedded", "internal_decoupled", or "external",
+                currently no other parameter value is supported!"""
+            )
+
+    def pragmas(self):
+        """Fill ``$PRAGMAS$`` with the HLS interface and array-partition pragmas.
+
+        The input and output streams become axis interfaces and the block has no
+        control interface (ap_ctrl_none). "internal_embedded" includes params.h
+        and partitions the weight array; "internal_decoupled" and "external" add
+        an axis interface for the weight stream. Threshold partition pragmas are
+        added when calc_tmem() is non-zero.
+
+        Raises:
+            Exception: if mem_mode has any other value.
+        """
+        mem_mode = self.get_nodeattr("mem_mode")
+        self.code_gen_dict["$PRAGMAS$"] = [
+            "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname()
+        ]
+        self.code_gen_dict["$PRAGMAS$"].append(
+            "#pragma HLS INTERFACE axis port=out_" + self.hls_sname()
+        )
+        self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return")
+
+        if mem_mode == "internal_embedded":
+            self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"')
+            # the weight tensor is indexed [PE][WMEM];
+            # partition for parallel access along the PE dimension (dim 1)
+            self.code_gen_dict["$PRAGMAS$"].append(
+                ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1")
+            )
+        elif mem_mode == "internal_decoupled" or mem_mode == "external":
+            self.code_gen_dict["$PRAGMAS$"].append(
+                "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname()
+            )
+        else:
+            # "external" is quoted here to match the other accepted values
+            raise Exception(
+                """Please set mem_mode to "internal_embedded", "internal_decoupled", or "external",
+                currently no other parameter value is supported!"""
+            )
+
+        if self.calc_tmem() != 0:
+            # TODO find a better way of checking for no pregenerated thresholds
+            self.code_gen_dict["$PRAGMAS$"].append(
+                ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=1")
+            )
+            self.code_gen_dict["$PRAGMAS$"].append(
+                ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3")
+            )
+
+    def instantiate_ip(self, cmd):
+        """Append the create_bd_cell command that instantiates the HLS IP to ``cmd``.
+
+        With mem_mode "internal_decoupled" the cell is created at
+        ``/<node name>/<node name>``; otherwise at ``<node name>``.
+        """
+        ip_vlnv = self.get_nodeattr("ip_vlnv")
+        name = self.onnx_node.name
+        decoupled = self.get_nodeattr("mem_mode") == "internal_decoupled"
+        cell_path = "/%s/%s" % (name, name) if decoupled else name
+        cmd.append("create_bd_cell -type ip -vlnv %s %s" % (ip_vlnv, cell_path))
diff --git a/src/finn/custom_op/fpgadataflow/hlsbackend.py b/src/finn/custom_op/fpgadataflow/hlsbackend.py
new file mode 100644
index 0000000000..d8210fd684
--- /dev/null
+++ b/src/finn/custom_op/fpgadataflow/hlsbackend.py
@@ -0,0 +1,476 @@
+# Copyright (C) 2023, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# * Neither the name of FINN nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import os
+import subprocess
+from abc import ABC, abstractmethod
+from qonnx.core.datatype import DataType
+
+from finn.custom_op.fpgadataflow import templates
+from finn.util.basic import CppBuilder, get_rtlsim_trace_depth, make_build_dir
+from finn.util.hls import CallHLS
+from finn.util.pyverilator import make_single_source_file
+
+try:
+ from pyverilator import PyVerilator
+except ModuleNotFoundError:
+ PyVerilator = None
+
+
+class HLSBackend(ABC):
+ """HLSBackend class all custom ops that correspond to a finn-hlslib
+ function are using functionality of. Contains different functions every HLS
+ custom node should have. Some as abstract methods, these have to be filled
+ when writing a new HLS custom op node."""
+
+    def get_nodeattr_types(self):
+        """Return the node attributes contributed by the HLS backend.
+
+        All three attributes share the spec ``("s", False, "")`` - presumably
+        (type, required, default); confirm against the qonnx CustomOp convention.
+        """
+        attr_spec = ("s", False, "")
+        attr_names = ("code_gen_dir_cppsim", "executable_path", "res_hls")
+        return {attr_name: attr_spec for attr_name in attr_names}
+
+    def get_all_verilog_paths(self):
+        """Return list of all folders containing Verilog code for this node.
+
+        The default implementation returns only the HLS Verilog output folder
+        below the directory stored in "code_gen_dir_ipgen"; an AssertionError is
+        raised when that attribute is empty.
+        """
+        ipgen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        assert (
+            ipgen_dir != ""
+        ), """Node attribute "code_gen_dir_ipgen" is
+        not set. Please run HLSSynthIP first."""
+        hls_verilog_dir = "{}/project_{}/sol1/impl/verilog/".format(
+            ipgen_dir, self.onnx_node.name
+        )
+        return [hls_verilog_dir]
+
+    def get_all_verilog_filenames(self, abspath=False):
+        """Return list of all Verilog files used for this node.
+
+        Every ``.v`` entry of each folder from get_all_verilog_paths() is listed.
+        With ``abspath`` set, each name is prefixed with its folder and "/".
+        """
+        collected = []
+        for folder in self.get_all_verilog_paths():
+            v_names = (entry for entry in os.listdir(folder) if entry.endswith(".v"))
+            collected.extend(folder + "/" + v_name if abspath else v_name for v_name in v_names)
+        return collected
+
+    def prepare_rtlsim(self):
+        """Build a Verilator emulation library for this node's RTL.
+
+        All Verilog files of the node are merged into one source file, which is
+        compiled with PyVerilator. The path of the resulting library is stored
+        in the "rtlsim_so" attribute and the PyVerilator wrapper is returned.
+
+        Raises:
+            ImportError: if PyVerilator could not be imported.
+        """
+        if PyVerilator is None:
+            raise ImportError("Installation of PyVerilator is required.")
+
+        rtl_sources = self.get_all_verilog_filenames(abspath=True)
+        merged_src_dir = make_build_dir("rtlsim_" + self.onnx_node.name + "_")
+        verilator_build_dir = make_build_dir("pyverilator_" + self.onnx_node.name + "_")
+        top_file = self.get_verilog_top_module_name() + ".v"
+        make_single_source_file(rtl_sources, merged_src_dir + "/" + top_file)
+
+        # build the Verilator emu library
+        build_kwargs = dict(
+            build_dir=verilator_build_dir,
+            verilog_path=[merged_src_dir],
+            trace_depth=get_rtlsim_trace_depth(),
+            top_module_name=self.get_verilog_top_module_name(),
+        )
+        sim = PyVerilator.build(self.get_verilog_top_module_name() + ".v", **build_kwargs)
+        # save generated lib filename in attribute
+        self.set_nodeattr("rtlsim_so", sim.lib._name)
+        return sim
+
+    def code_generation_ipgen(self, model, fpgapart, clk):
+        """Generates c++ code and tcl script for ip generation.
+
+        Writes ``top_<node name>.cpp`` (filled from templates.ipgen_template) and
+        ``hls_syn_<node name>.tcl`` (filled from templates.ipgentcl_template) into
+        the directory stored in the "code_gen_dir_ipgen" attribute. The
+        code_gen_dict is cleared after each of the two files has been written.
+
+        ``fpgapart`` and ``clk`` are substituted for $FPGAPART$ and $CLKPERIOD$.
+        """
+        node = self.onnx_node
+
+        # generate top cpp file for ip generation
+        path = self.get_nodeattr("code_gen_dir_ipgen")
+        self.code_gen_dict["$AP_INT_MAX_W$"] = [str(self.get_ap_int_max_w())]
+        self.generate_params(model, path)
+        self.global_includes()
+        self.defines("ipgen")
+        self.blackboxfunction()
+        self.pragmas()
+        self.docompute()
+
+        template = templates.ipgen_template
+
+        for key in self.code_gen_dict:
+            # transform list into long string separated by '\n'
+            code_gen_line = "\n".join(self.code_gen_dict[key])
+            template = template.replace(key, code_gen_line)
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        # the context manager closes the file even if the write raises
+        with open(os.path.join(code_gen_dir, "top_{}.cpp".format(node.name)), "w") as f:
+            f.write(template)
+        self.code_gen_dict.clear()
+
+        # generate tcl script for ip generation
+        self.code_gen_dict["$PROJECTNAME$"] = ["project_{}".format(node.name)]
+        self.code_gen_dict["$HWSRCDIR$"] = [code_gen_dir]
+        self.code_gen_dict["$FPGAPART$"] = [fpgapart]
+        self.code_gen_dict["$TOPFXN$"] = [node.name]
+        self.code_gen_dict["$CLKPERIOD$"] = [str(clk)]
+        self.code_gen_dict["$DEFAULT_DIRECTIVES$"] = self.ipgen_default_directives()
+        self.code_gen_dict["$EXTRA_DIRECTIVES$"] = self.ipgen_extra_directives()
+
+        template = templates.ipgentcl_template
+
+        for key in self.code_gen_dict:
+            # transform list into long string separated by '\n'
+            code_gen_line = "\n".join(self.code_gen_dict[key])
+            template = template.replace(key, code_gen_line)
+        code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+        with open(os.path.join(code_gen_dir, "hls_syn_{}.tcl".format(node.name)), "w") as f:
+            f.write(template)
+        self.code_gen_dict.clear()
+
+    def ipgen_default_directives(self):
+        """Return the tcl directives used by default for HLS synthesis.
+
+        A new list is built on every call.
+        """
+        return [
+            "set_param hls.enable_hidden_option_error false",
+            "config_compile -disable_unroll_code_size_check -pipeline_style flp",
+            "config_interface -m_axi_addr64",
+            "config_rtl -module_auto_prefix",
+            "config_rtl -deadlock_detection none",
+        ]
+
+    def ipgen_extra_directives(self):
+        """Return a list of extra tcl directives for HLS synthesis.
+
+        This implementation returns an empty list.
+        """
+        return list()
+
+    def ipgen_singlenode_code(self):
+        """Run IP generation for this node through the CallHLS utility.
+
+        Uses the ``hls_syn_<node name>.tcl`` script in "code_gen_dir_ipgen" and
+        stores the results in the "ipgen_path", "ip_path" and "ip_vlnv"
+        attributes. An AssertionError is raised when an expected output folder
+        is missing after the build.
+        """
+        name = self.onnx_node.name
+        ipgen_dir = self.get_nodeattr("code_gen_dir_ipgen")
+
+        hls_call = CallHLS()
+        hls_call.append_tcl(ipgen_dir + "/hls_syn_{}.tcl".format(name))
+        hls_call.set_ipgen_path(ipgen_dir + "/project_{}".format(name))
+        hls_call.build(ipgen_dir)
+
+        project_dir = hls_call.ipgen_path
+        assert os.path.isdir(project_dir), "IPGen failed: %s not found" % (project_dir)
+        self.set_nodeattr("ipgen_path", project_dir)
+
+        packaged_ip_dir = project_dir + "/sol1/impl/ip"
+        assert os.path.isdir(
+            packaged_ip_dir
+        ), "IPGen failed: %s not found. Check log under %s" % (packaged_ip_dir, ipgen_dir)
+        self.set_nodeattr("ip_path", packaged_ip_dir)
+
+        self.set_nodeattr("ip_vlnv", "xilinx.com:hls:%s:1.0" % name)
+
+    def code_generation_cppsim(self, model):
+        """Generates c++ code for simulation (cppsim).
+
+        Fills the code_gen_dict through the node's code generation hooks,
+        substitutes it into templates.docompute_template and writes the result
+        to ``execute_<op type>.cpp`` in the directory stored in the
+        "code_gen_dir_cppsim" attribute. The code_gen_dict is cleared afterwards.
+        """
+        node = self.onnx_node
+        path = self.get_nodeattr("code_gen_dir_cppsim")
+        self.code_gen_dict["$AP_INT_MAX_W$"] = [str(self.get_ap_int_max_w())]
+        self.generate_params(model, path)
+        self.global_includes()
+        self.defines("cppsim")
+        self.read_npy_data()
+        self.strm_decl()
+        self.pragmas()
+        self.docompute()
+        self.dataoutstrm()
+        self.save_as_npy()
+
+        template = templates.docompute_template
+
+        for key in self.code_gen_dict:
+            # transform list into long string separated by '\n'
+            code_gen_line = "\n".join(self.code_gen_dict[key])
+            template = template.replace(key, code_gen_line)
+        code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        # the context manager closes the file even if the write raises
+        with open(os.path.join(code_gen_dir, "execute_{}.cpp".format(node.op_type)), "w") as f:
+            f.write(template)
+        self.code_gen_dict.clear()
+
+    def code_generation_ipi(self):
+        """Return the TCL commands that instantiate this node in Vivado IPI.
+
+        The result is a one-element list with a create_bd_cell command built
+        from the "ip_vlnv" attribute and the node name.
+        """
+        create_cell = "create_bd_cell -type ip -vlnv %s %s" % (
+            self.get_nodeattr("ip_vlnv"),
+            self.onnx_node.name,
+        )
+        return [create_cell]
+
+    def compile_singlenode_code(self):
+        """Compile the generated cppsim sources with CppBuilder from finn.util.basic.
+
+        All ``*.cpp`` files in "code_gen_dir_cppsim" plus cnpy.cpp are built into
+        ``node_model`` in that directory, and the resulting path is stored in the
+        "executable_path" attribute. Reads the HLS_PATH environment variable.
+        """
+        sim_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        builder = CppBuilder()
+        # to enable additional debug features, put "-DDEBUG" first in this tuple
+        compiler_flags = (
+            "-I$FINN_ROOT/src/finn/qnn-data/cpp",
+            "-I$FINN_ROOT/deps/cnpy/",
+            "-I$FINN_ROOT/deps/finn-hlslib",
+            "-I$FINN_ROOT/custom_hls",
+            "-I{}/include".format(os.environ["HLS_PATH"]),
+            "--std=c++14",
+            "-O3",
+        )
+        for flag in compiler_flags:
+            builder.append_includes(flag)
+        for source in (sim_dir + "/*.cpp", "$FINN_ROOT/deps/cnpy/cnpy.cpp"):
+            builder.append_sources(source)
+        builder.append_includes("-lz")
+        builder.set_executable_path(sim_dir + "/node_model")
+        builder.build(sim_dir)
+        self.set_nodeattr("executable_path", builder.executable_path)
+
+    def dynamic_input_to_npy(self, context, count, target_dir=""):
+        """Save the first ``count`` node inputs from ``context`` as .npy files.
+
+        Input ``i`` is reshaped to the folded input shape and written to
+        ``input_<i>.npy`` in ``target_dir``. When ``target_dir`` is empty, the
+        "code_gen_dir_cppsim" attribute is used instead.
+
+        Raises:
+            Exception: if neither ``target_dir`` nor "code_gen_dir_cppsim" is set.
+        """
+        node = self.onnx_node
+        if target_dir == "":
+            sim_dir = self.get_nodeattr("code_gen_dir_cppsim")
+            if sim_dir == "":
+                raise Exception(
+                    """
+    Found no codegen dir for this node, did you run the prepare_cppsim transformation?
+            """
+                )
+            target_dir = sim_dir
+        # one npy file per input, assuming dynamic inputs start from index 0
+        for in_ind in range(count):
+            tensor = context[node.input[in_ind]]
+            # input 0 is queried without an explicit index, later inputs with one
+            ind_args = () if in_ind == 0 else (in_ind,)
+            folded_shape = self.get_folded_input_shape(*ind_args)
+            idt = self.get_input_datatype(*ind_args)
+            folded = tensor.reshape(folded_shape)
+            if idt == DataType["BIPOLAR"]:
+                # store bipolar activations as binary
+                folded = (folded + 1) / 2
+            # make copy before saving the array
+            folded = folded.copy()
+            npy_path = os.path.join(target_dir, "input_{}.npy".format(in_ind))
+            np.save(npy_path, folded)
+
+    def npy_to_dynamic_output(self, context):
+        """Load ``output.npy`` from "code_gen_dir_cppsim" and store it, reshaped
+        to the normal output shape, in ``context`` under the first output name."""
+        sim_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        raw_output = np.load("{}/output.npy".format(sim_dir))
+        first_output = self.onnx_node.output[0]
+        context[first_output] = raw_output.reshape(self.get_normal_output_shape())
+
+    def npy_to_dynamic_outputs(self, context, npy_list):
+        """Load several cppsim output files into ``context``.
+
+        ``npy_list`` names the files to read from "code_gen_dir_cppsim"; entry
+        ``i`` is reshaped to the normal shape of output ``i`` and stored under
+        the name of node output ``i``, so the order must match the node outputs.
+        """
+        node = self.onnx_node
+        sim_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        for out_ind, npy_name in enumerate(npy_list):
+            loaded = np.load("{}/{}".format(sim_dir, npy_name))
+            # output 0 is queried without an explicit index, later outputs with one
+            ind_args = () if out_ind == 0 else (out_ind,)
+            context[node.output[out_ind]] = loaded.reshape(
+                self.get_normal_output_shape(*ind_args)
+            )
+
+    def exec_precompiled_singlenode_model(self):
+        """Run the executable stored in "executable_path" and wait for it to end.
+
+        The process's stdout is captured through a pipe and discarded.
+
+        Raises:
+            Exception: if "executable_path" is empty.
+        """
+        exe = self.get_nodeattr("executable_path")
+        if exe == "":
+            raise Exception(
+                """
+Found no executable for this node, did you run the codegen and
+compilation transformations?
+            """
+            )
+        running = subprocess.Popen(exe, stdout=subprocess.PIPE)
+        running.communicate()
+
+    def hls_sname(self):
+        """Return the suffix Vitis HLS appends to stream signal names.
+
+        Example: the TDATA for a stream called "out" would be out_V_TDATA.
+        """
+        stream_suffix = "V"
+        return stream_suffix
+
+    def execute_node(self, context, graph):
+        """Execute this node according to its "exec_mode" attribute.
+
+        "cppsim" saves the first input to .npy, runs the precompiled executable
+        and loads the output back into ``context``. "rtlsim" does nothing here.
+        ``graph`` is not read.
+
+        Raises:
+            Exception: if exec_mode is neither "cppsim" nor "rtlsim".
+        """
+        mode = self.get_nodeattr("exec_mode")
+        if mode == "rtlsim":
+            # nothing to do in this implementation
+            return
+        if mode != "cppsim":
+            raise Exception(
+                """Invalid value for attribute exec_mode! Is currently set to: {}
+            has to be set to one of the following value ("cppsim", "rtlsim")""".format(
+                    mode
+                )
+            )
+        # save input(s)
+        self.dynamic_input_to_npy(context, 1)
+        # execute the precompiled model
+        self.exec_precompiled_singlenode_model()
+        # load output npy file
+        self.npy_to_dynamic_output(context)
+
+    @abstractmethod
+    def global_includes(self):
+        """Set the global includes of the generated c++ code.
+
+        Abstract hook: every node has to provide its own implementation. It is
+        invoked by both code_generation_ipgen and code_generation_cppsim.
+        """
+
+    @abstractmethod
+    def defines(self, var):
+        """Set the define commands of the generated c++ code.
+
+        Abstract hook: every node has to provide its own implementation.
+
+        var: names the kind of code generation so one implementation can serve
+        several of them; it is called with "ipgen" and with "cppsim". E.g. for
+        "ipgen", MatrixVectorActivation adds additional PRAGMA defines.
+        """
+
+    def read_npy_data(self):
+        """Generate the c++ command that reads the input data from a .npy file.
+
+        Sets ``$READNPYDATA$`` to a single statement that streams
+        ``input_0.npy`` from "code_gen_dir_cppsim" into the input stream. Might
+        need to be overwritten depending on the custom op.
+        """
+        sim_dir = self.get_nodeattr("code_gen_dir_cppsim")
+        in_dt = self.get_input_datatype()
+        if in_dt == DataType["BIPOLAR"]:
+            # use binary for bipolar storage
+            in_dt = DataType["BINARY"]
+        read_stmt = 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' % (
+            "ap_uint<%d>" % self.get_instream_width(),
+            in_dt.get_hls_datatype_str(),
+            in_dt.bitwidth(),
+            "float",
+            "%s/input_0.npy" % sim_dir,
+            self.hls_sname(),
+        )
+        self.code_gen_dict["$READNPYDATA$"] = [read_stmt]
+
+ def strm_decl(self):
+ """Function to generate the commands for the stream declaration in c++,
+ is member function of HLSBackend class but might need to be filled
+ by node."""
+ self.code_gen_dict["$STREAMDECLARATIONS$"] = []
+ self.code_gen_dict["$STREAMDECLARATIONS$"].append(
+ 'hls::stream