diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index 57e3d54952..91433f3093 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -4,5 +4,5 @@ contact_links: url: https://finn.readthedocs.io/en/latest/getting_started.html about: Documentation about how to get up and running with FINN. - name: Ask for help and get in touch with the community - url: https://gitter.im/xilinx-finn/community - about: Check out our gitter channel, if you have a question about FINN or a general problem that is likely not a bug. + url: https://github.com/Xilinx/finn/discussions + about: Check out our GitHub Discussions, if you have a question about FINN or a general problem that is likely not a bug. diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml index 00c25a4a31..f9a251a8c7 100644 --- a/.github/workflows/docker-image.yml +++ b/.github/workflows/docker-image.yml @@ -1,8 +1,6 @@ name: DockerImage on: - pull_request: - branches: [ dev ] push: branches: [ dev ] diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 5f03379bbc..011ccebadc 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -18,7 +18,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v4 with: - python-version: '3.8' + python-version: '3.10' - name: Run Lint uses: pre-commit/action@v3.0.0 diff --git a/.isort.cfg b/.isort.cfg index 6cfe1c8919..5378b88fad 100644 --- a/.isort.cfg +++ b/.isort.cfg @@ -2,7 +2,7 @@ line_length=88 indent=' ' skip=.tox,.venv,build,dist -known_standard_library=setuptools,pkg_resources +known_standard_library=setuptools known_test=pytest known_first_party=finn sections=FUTURE,STDLIB,TEST,THIRDPARTY,FIRSTPARTY,LOCALFOLDER diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 126a4ac4b2..72a9688505 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -29,11 +29,11 @@ exclude: 
'^docs/conf.py' default_language_version: - python: python3.8 + python: python3.10 repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.2.0 + rev: v4.4.0 hooks: - id: trailing-whitespace exclude: '\.dat$' @@ -56,15 +56,16 @@ repos: - id: isort - repo: https://github.com/psf/black - rev: 22.3.0 + rev: 23.3.0 hooks: - id: black language_version: python3 + args: [--line-length=100] - repo: https://github.com/PyCQA/flake8 - rev: 3.9.2 + rev: 6.0.0 hooks: - id: flake8 # black-compatible flake-8 config - args: ['--max-line-length=88', # black default + args: ['--max-line-length=100', # black default '--extend-ignore=E203'] # E203 is not PEP8 compliant diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 478957be11..575a60c69d 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -1,4 +1,5 @@ -# Copyright (c) 2021, Xilinx +# Copyright (c) 2021-2022, Xilinx +# Copyright (C) 2023-2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -31,13 +32,15 @@ version: 2 +# Set the version of Python and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.8" + sphinx: configuration: docs/finn/conf.py python: - version: 3.8 install: - - method: pip - path: . 
- extra_requirements: - - docs + - requirements: docs/requirements.txt diff --git a/AUTHORS.rst b/AUTHORS.rst index 861b81924b..5a11497fc8 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -28,3 +28,9 @@ Contributors * Matthias Gehre (@mgehre-amd) * Hugo Le Blevec (@hleblevec) * Patrick Geel (@patrickgeel) +* John Monks (@jmonks-amd) +* Tim Paine (@timkpaine) +* Linus Jungemann (@LinusJungemann) +* Shashwat Khandelwal (@shashwat1198) +* Ian Colbert (@i-colbert) +* Rachit Garg (@rstar900) diff --git a/CHANGELOG.rst b/CHANGELOG.rst deleted file mode 100644 index 226e6f5931..0000000000 --- a/CHANGELOG.rst +++ /dev/null @@ -1,10 +0,0 @@ -========= -Changelog -========= - -Version 0.1 -=========== - -- Feature A added -- FIX: nasty bug #1729 fixed -- add your changes here! diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index d376a1b42b..5e34624790 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -29,6 +29,60 @@ Please follow the steps below and be sure that your contribution complies with o 1. The main branch should always be treated as stable and clean. Only hot fixes are allowed to be pull-requested. The hot fix is supposed to be very important such that without this fix, a lot of things will break. 2. For new features, smaller bug fixes, doc updates, and many other fixes, users should pull request against the development branch. -3. We will review your contribution and, if any additional fixes or modifications are +3. Sign Your Work + +Please use the *Signed-off-by* line at the end of your patch which indicates that you accept the Developer Certificate of Origin (DCO) defined by https://developercertificate.org/ reproduced below:: + +``` + Developer Certificate of Origin + Version 1.1 + + Copyright (C) 2004, 2006 The Linux Foundation and its contributors. + 1 Letterman Drive + Suite D4700 + San Francisco, CA, 94129 + + Everyone is permitted to copy and distribute verbatim copies of this + license document, but changing it is not allowed. 
+ + + Developer's Certificate of Origin 1.1 + + By making a contribution to this project, I certify that: + + (a) The contribution was created in whole or in part by me and I + have the right to submit it under the open source license + indicated in the file; or + + (b) The contribution is based upon previous work that, to the best + of my knowledge, is covered under an appropriate open source + license and I have the right under that license to submit that + work with modifications, whether created in whole or in part + by me, under the same open source license (unless I am + permitted to submit under a different license), as indicated + in the file; or + + (c) The contribution was provided directly to me by some other + person who certified (a), (b) or (c) and I have not modified + it. + + (d) I understand and agree that this project and the contribution + are public and that a record of the contribution (including all + personal information I submit with it, including my sign-off) is + maintained indefinitely and may be redistributed consistent with + this project or the open source license(s) involved. +``` + +You can enable Signed-off-by automatically by adding the `-s` flag to the `git commit` command. + +Here is an example Signed-off-by line which indicates that the contributor accepts DCO: + +``` + This is my commit message + + Signed-off-by: Jane Doe +``` + +4. We will review your contribution and, if any additional fixes or modifications are necessary, may provide feedback to guide you. When accepted, your pull request will be merged to the repository. If you have more questions please contact us. diff --git a/LICENSE.txt b/LICENSE.txt index 278564a5a4..cec78d6043 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,4 +1,5 @@ -Copyright (c) 2020, Xilinx +Copyright (C) 2020-2022, Xilinx, Inc. +Copyright (C) 2022-2024, Advanced Micro Devices, Inc. All rights reserved. 
Redistribution and use in source and binary forms, with or without diff --git a/README.md b/README.md index 2e1faf8f0c..0856701908 100644 --- a/README.md +++ b/README.md @@ -2,13 +2,12 @@ -drawing +drawing [![GitHub Discussions](https://img.shields.io/badge/discussions-join-green)](https://github.com/Xilinx/finn/discussions) [![ReadTheDocs](https://readthedocs.org/projects/finn/badge/?version=latest&style=plastic)](http://finn.readthedocs.io/) -FINN is an experimental framework from Xilinx Research Labs to explore deep neural network -inference on FPGAs. +FINN is an experimental framework from Integrated Communications and AI Lab of AMD Research & Advanced Development to explore deep neural network inference on FPGAs. It specifically targets quantized neural networks, with emphasis on generating dataflow-style architectures customized for each network. @@ -28,7 +27,7 @@ Please see the [Getting Started](https://finn.readthedocs.io/en/latest/getting_s ## Documentation -You can view the documentation on [readthedocs](https://finn.readthedocs.io) or build them locally using `python setup.py doc` from inside the Docker container. Additionally, there is a series of [Jupyter notebook tutorials](https://github.com/Xilinx/finn/tree/main/notebooks), which we recommend running from inside Docker for a better experience. +You can view the documentation on [readthedocs](https://finn.readthedocs.io). Additionally, there is a series of [Jupyter notebook tutorials](https://github.com/Xilinx/finn/tree/main/notebooks), which we recommend running from inside Docker for a better experience. ## Community diff --git a/docker/Dockerfile.finn b/docker/Dockerfile.finn index dbafba2476..2ceb1f4195 100644 --- a/docker/Dockerfile.finn +++ b/docker/Dockerfile.finn @@ -1,4 +1,5 @@ -# Copyright (c) 2021, Xilinx +# Copyright (C) 2021-2022, Xilinx, Inc. +# Copyright (C) 2022-2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -26,10 +27,10 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -FROM pytorch/pytorch:1.7.1-cuda11.0-cudnn8-runtime -LABEL maintainer="Yaman Umuroglu " +FROM ubuntu:jammy-20230126 +LABEL maintainer="Jakoba Petri-Koenig , Yaman Umuroglu " -ARG XRT_DEB_VERSION="xrt_202210.2.13.466_18.04-amd64-xrt" +ARG XRT_DEB_VERSION="xrt_202220.2.14.354_22.04-amd64-xrt" WORKDIR /workspace @@ -57,12 +58,17 @@ RUN apt-get update && \ unzip \ zip \ locales \ - lsb-core + lsb-core \ + python3 \ + python-is-python3 \ + python3-pip \ + python3-setuptools-scm \ + python3-venv RUN echo "StrictHostKeyChecking no" >> /etc/ssh/ssh_config RUN locale-gen "en_US.UTF-8" # install Verilator from source to get the right version -RUN apt-get install -y git perl python3 make autoconf g++ flex bison ccache libgoogle-perftools-dev numactl perl-doc libfl2 libfl-dev zlibc zlib1g zlib1g-dev +RUN apt-get install -y git perl make autoconf g++ flex bison ccache libgoogle-perftools-dev numactl perl-doc libfl2 libfl-dev zlib1g zlib1g-dev RUN git clone https://github.com/verilator/verilator RUN cd verilator && \ git checkout v4.224 && \ @@ -81,22 +87,31 @@ RUN rm /tmp/$XRT_DEB_VERSION.deb COPY requirements.txt . 
RUN pip install -r requirements.txt RUN rm requirements.txt + +# install PyTorch +RUN pip install torch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 --extra-index-url https://download.pytorch.org/whl/cu116 + # extra Python package dependencies (for testing and interaction) -RUN pip install pygments==2.4.1 -RUN pip install ipykernel==5.5.5 +RUN pip install pygments==2.14.0 +RUN pip install ipykernel==6.21.2 RUN pip install jupyter==1.0.0 --ignore-installed RUN pip install markupsafe==2.0.1 -RUN pip install matplotlib==3.3.1 --ignore-installed +RUN pip install matplotlib==3.7.0 --ignore-installed RUN pip install pytest-dependency==0.5.1 -RUN pip install sphinx==5.0.2 -RUN pip install sphinx_rtd_theme==0.5.0 -RUN pip install pytest-xdist[setproctitle]==2.4.0 -RUN pip install pytest-parallel==0.1.0 +RUN pip install pytest-xdist[setproctitle]==3.2.0 +RUN pip install pytest-parallel==0.1.1 RUN pip install "netron>=5.0.0" -RUN pip install pandas==1.1.5 -RUN pip install scikit-learn==0.24.1 -RUN pip install tqdm==4.31.1 +RUN pip install pandas==1.5.3 +RUN pip install scikit-learn==1.2.1 +RUN pip install tqdm==4.64.1 RUN pip install -e git+https://github.com/fbcotter/dataset_loading.git@0.0.4#egg=dataset_loading +# these versions of pytest and associated plugins allow for stable collection of +# test reports and code coverage reports in HTML +RUN pip install pytest==6.2.5 +RUN pip install pytest-metadata==1.7.0 +RUN pip install pytest-html==3.0.0 +RUN pip install pytest-html-merger==0.0.8 +RUN pip install pytest-cov==4.1.0 # extra dependencies from other FINN deps # installed in Docker image to make entrypoint script go faster diff --git a/docker/finn_entrypoint.sh b/docker/finn_entrypoint.sh index b5c702111a..61c8f78665 100644 --- a/docker/finn_entrypoint.sh +++ b/docker/finn_entrypoint.sh @@ -54,8 +54,11 @@ recho () { echo -e "${RED}ERROR: $1${NC}" } -# qonnx +# qonnx (using workaround for https://github.com/pypa/pip/issues/7953) +# to be fixed in future 
Ubuntu versions (https://bugs.launchpad.net/ubuntu/+source/setuptools/+bug/1994016) +mv ${FINN_ROOT}/deps/qonnx/pyproject.toml ${FINN_ROOT}/deps/qonnx/pyproject.tmp pip install --user -e ${FINN_ROOT}/deps/qonnx +mv ${FINN_ROOT}/deps/qonnx/pyproject.tmp ${FINN_ROOT}/deps/qonnx/pyproject.toml # finn-experimental pip install --user -e ${FINN_ROOT}/deps/finn-experimental # brevitas @@ -109,10 +112,31 @@ if [ -f "$HLS_PATH/settings64.sh" ];then else yecho "Unable to find $HLS_PATH/settings64.sh" yecho "Functionality dependent on Vitis HLS will not be available." - yecho "Please note that FINN needs at least version 2020.2 for Vitis HLS support." + yecho "Please note that FINN needs at least version 2020.2 for Vitis HLS support. Our recommendation is to use version 2022.2" yecho "If you need Vitis HLS, ensure HLS_PATH is set correctly and mounted into the Docker container." fi +if [ -d "$FINN_ROOT/.Xilinx" ]; then + mkdir "$HOME/.Xilinx" + if [ -f "$FINN_ROOT/.Xilinx/HLS_init.tcl" ]; then + cp "$FINN_ROOT/.Xilinx/HLS_init.tcl" "$HOME/.Xilinx/" + gecho "Found HLS_init.tcl and copied to $HOME/.Xilinx/HLS_init.tcl" + else + yecho "Unable to find $FINN_ROOT/.Xilinx/HLS_init.tcl" + fi + + if [ -f "$FINN_ROOT/.Xilinx/Vivado/Vivado_init.tcl" ]; then + mkdir "$HOME/.Xilinx/Vivado/" + cp "$FINN_ROOT/.Xilinx/Vivado/Vivado_init.tcl" "$HOME/.Xilinx/Vivado/" + gecho "Found Vivado_init.tcl and copied to $HOME/.Xilinx/Vivado/Vivado_init.tcl" + else + yecho "Unable to find $FINN_ROOT/.Xilinx/Vivado/Vivado_init.tcl" + fi +else + echo "If you need to enable a beta device, ensure .Xilinx/HLS_init.tcl and/or .Xilinx/Vivado/Vivado_init.tcl are set correctly and mounted" + echo "See https://docs.xilinx.com/r/en-US/ug835-vivado-tcl-commands/Tcl-Initialization-Scripts" +fi + export PATH=$PATH:$HOME/.local/bin # execute the provided command(s) as root exec "$@" diff --git a/docker/jenkins/Jenkinsfile b/docker/jenkins/Jenkinsfile index e3e5b5f7f9..6d51fffd64 100644 --- 
a/docker/jenkins/Jenkinsfile +++ b/docker/jenkins/Jenkinsfile @@ -1,46 +1,355 @@ -node { - def app - stage('Clone repository') { - /* Let's make sure we have the repository cloned to our workspace */ - checkout scm - } - withEnv([ - "FINN_XILINX_PATH=/proj/xbuilds/SWIP/2022.1_0420_0327/installs/lin64", - "FINN_XILINX_VERSION=2022.1", - "FINN_DOCKER_TAG=xilinx/finn:jenkins", - "FINN_HOST_BUILD_DIR=/scratch/users/finn_ci", - "PLATFORM_REPO_PATHS=/opt/xilinx/platforms" - ]){ - parallel firstBranch: { - stage('Brevitas export') { - dir("${env.WORKSPACE}") { - sh("bash run-docker.sh python setup.py test --addopts -mbrevitas_export") - } - } - }, secondBranch: { - stage('Streamlining transformations') { - dir("${env.WORKSPACE}") { - sh("bash run-docker.sh python setup.py test --addopts -mstreamline") - } - } - }, thirdBranch: { - stage('Util functions') { - dir("${env.WORKSPACE}") { - sh("bash run-docker.sh python setup.py test --addopts -mutil") - } - } - }, fourthBranch: { - stage('General transformations') { - dir("${env.WORKSPACE}") { - sh("bash run-docker.sh python setup.py test --addopts -mtransform") - } - } - }, fifthBranch: { - stage('Fpgadataflow transformations and simulations') { - dir("${env.WORKSPACE}") { - sh("bash run-docker.sh python setup.py test --addopts -mfpgadataflow") - } +pipeline { + agent none + parameters { + booleanParam(name: 'fpgadataflow', defaultValue: false, description: 'Run fpgadataflow tests') + booleanParam(name: 'sanity', defaultValue: true, description: 'Run sanity hardware and unit tests') + booleanParam(name: 'end2end', defaultValue: false, description: 'Run end2end tests') + } + stages { + stage('Run Tests') { + parallel { + stage('Sanity - Build Hardware') { + when { + expression { return params['sanity'] } + } + agent { + label 'finn-build' + } + environment { + TEST_NAME = "bnn_build_sanity" + FINN_HOST_BUILD_DIR = "${env.FINN_HOST_BUILD_DIR}/${env.TEST_NAME}" + } + steps { + catchError(stageResult: 'FAILURE') { + script { + 
// Creates dir in finn clone to store build files for stashing + sh "mkdir -p ${env.TEST_NAME}" + cleanPreviousBuildFiles(env.FINN_HOST_BUILD_DIR) + + // Pass in the marker to run with pytest and the XML test results filename + runDockerPytestWithMarker("sanity_bnn", "${env.TEST_NAME}", '') + + // Find the board's build files (bitstreams/xclbins) and zip for use on the boards themselves + findCopyZip("Pynq-Z1", env.FINN_HOST_BUILD_DIR, env.TEST_NAME) + findCopyZip("ZCU104", env.FINN_HOST_BUILD_DIR, env.TEST_NAME) + findCopyZip("KV260_SOM", env.FINN_HOST_BUILD_DIR, env.TEST_NAME) + findCopyZip("U250", env.FINN_HOST_BUILD_DIR, env.TEST_NAME) + + // Stash the test results file(s) + stash name: "${env.TEST_NAME}", includes: "${env.TEST_NAME}.xml,${env.TEST_NAME}.html" + + // Use an env variable to help collect test results later in pipeline + env.BNN_BUILD_SANITY = "SUCCESS" + } + } + } + } + stage('Sanity - Unit Tests') { + when { + expression { params['sanity'] } + } + agent { + label 'finn-build' + } + environment { + TEST_NAME = "sanity_ut" + FINN_HOST_BUILD_DIR = "${env.FINN_HOST_BUILD_DIR}/${env.TEST_NAME}" + } + steps { + catchError(stageResult: 'FAILURE') { + script { + cleanPreviousBuildFiles(env.FINN_HOST_BUILD_DIR) + + // Multiple markers with pytest needs its own script + createMultiMarkerScript("util or brevitas_export or streamline or transform or notebooks", "${env.TEST_NAME}", "--cov --cov-report=html:coverage_sanity_ut") + sh './run-docker.sh ./run-tests.sh' + + // Stash the test results file(s) + stash name: env.TEST_NAME, includes: "${env.TEST_NAME}.xml,${env.TEST_NAME}.html" + + // Use an env variable to help collect test results later in pipeline + env.SANITY_UT = "SUCCESS" + + // Archive coverage report if successful + archiveSuccessfulStage(env.SANITY_UT, "coverage_sanity_ut") + } + } + } + } + stage('fpgadataflow Tests') { + when { + expression { params['fpgadataflow'] } + } + agent { + label 'finn-build' + } + environment { + TEST_NAME = 
"fpgadataflow" + FINN_HOST_BUILD_DIR = "${env.FINN_HOST_BUILD_DIR}/${env.TEST_NAME}" + } + steps { + catchError(stageResult: 'FAILURE') { + script { + cleanPreviousBuildFiles(env.FINN_HOST_BUILD_DIR) + + // Pass in the marker to run with pytest and the XML test results filename + runDockerPytestWithMarker("fpgadataflow", "${env.TEST_NAME}", "--cov --cov-report=html:coverage_fpgadataflow") + + // Stash the test results file(s) + stash name: env.TEST_NAME, includes: "${env.TEST_NAME}.xml,${env.TEST_NAME}.html" + + // Use an env variable to help collect test results later in pipeline + env.FPGADATAFLOW_RESULT = "SUCCESS" + + // Archive coverage report if successful + archiveSuccessfulStage(env.FPGADATAFLOW_RESULT, "coverage_fpgadataflow") + } + } + } + } + stage('End2end') { + when { + expression { params['end2end'] } + } + agent { + label 'finn-build' + } + environment { + TEST_NAME = "end2end" + FINN_HOST_BUILD_DIR = "${env.FINN_HOST_BUILD_DIR}/${env.TEST_NAME}" + } + steps { + catchError(stageResult: 'FAILURE') { + script { + // Delete any build files from a previous build + cleanPreviousBuildFiles(env.FINN_HOST_BUILD_DIR) + + // Pass in the marker to run with pytest and the XML test results filename + runDockerPytestWithMarker(env.TEST_NAME, "${env.TEST_NAME}", '') + + // Stash the test results file(s) + stash name: env.TEST_NAME, includes: "${env.TEST_NAME}.xml,${env.TEST_NAME}.html" + + // Use an env variable to help collect test results later in pipeline + env.END2END_RESULT = "SUCCESS" + } + } + } + } + stage('BNN end2end - U250') { + when { + expression { return params['end2end'] } + } + agent { + label 'finn-build' + } + environment { + BOARD = "U250" + TEST_NAME = "bnn_build_full" + FINN_HOST_BUILD_DIR = "${env.FINN_HOST_BUILD_DIR}/${env.TEST_NAME}_${env.BOARD}" + } + steps { + catchError(stageResult: 'FAILURE') { + script { + // Creates dir in finn clone to store build files for stashing + sh "mkdir -p ${env.TEST_NAME}" + 
cleanPreviousBuildFiles(env.FINN_HOST_BUILD_DIR) + + // Pass in the marker to run with pytest and the XML test results filename + runDockerPytestWithMarker("bnn_u250", "${env.TEST_NAME}_${env.BOARD}", '') + findCopyZip(env.BOARD, env.FINN_HOST_BUILD_DIR, env.TEST_NAME) + + // Stash the test results file(s) + stash name: "${env.TEST_NAME}_${env.BOARD}", includes: "${env.TEST_NAME}_${env.BOARD}.xml,${env.TEST_NAME}_${env.BOARD}.html" + + // Use an env variable to help collect test results later in pipeline + env.BNN_BUILD_U250 = "SUCCESS" + } + } + } + } + stage('BNN end2end - Pynq-Z1') { + when { + expression { return params['end2end'] } + } + agent { + label 'finn-build' + } + environment { + BOARD = "Pynq-Z1" + TEST_NAME = "bnn_build_full" + FINN_HOST_BUILD_DIR = "${env.FINN_HOST_BUILD_DIR}/${env.TEST_NAME}_${env.BOARD}" + } + steps { + catchError(stageResult: 'FAILURE') { + script { + // Creates dir in finn clone to store build files for stashing + sh "mkdir -p ${env.TEST_NAME}" + cleanPreviousBuildFiles(env.FINN_HOST_BUILD_DIR) + + // Pass in the marker to run with pytest and the XML test results filename + runDockerPytestWithMarker("bnn_pynq", "${env.TEST_NAME}_${env.BOARD}", '') + findCopyZip(env.BOARD, env.FINN_HOST_BUILD_DIR, env.TEST_NAME) + + // Stash the test results file(s) + stash name: "${env.TEST_NAME}_PynqZ1", includes: "${env.TEST_NAME}_${env.BOARD}.xml,${env.TEST_NAME}_${env.BOARD}.html" + + // Use an env variable to help collect test results later in pipeline + env.BNN_BUILD_PYNQZ1 = "SUCCESS" + } + } + } + } + stage('BNN end2end - ZCU104') { + when { + expression { return params['end2end'] } + } + agent { + label 'finn-build' + } + environment { + BOARD = "ZCU104" + TEST_NAME = "bnn_build_full" + FINN_HOST_BUILD_DIR = "${env.FINN_HOST_BUILD_DIR}/${env.TEST_NAME}_${env.BOARD}" + } + steps { + catchError(stageResult: 'FAILURE') { + script { + // Creates dir in finn clone to store build files for stashing + sh "mkdir -p ${env.TEST_NAME}" + 
cleanPreviousBuildFiles(env.FINN_HOST_BUILD_DIR) + + // Pass in the marker to run with pytest and the XML test results filename + runDockerPytestWithMarker("bnn_zcu104", "${env.TEST_NAME}_${env.BOARD}", '') + findCopyZip(env.BOARD, env.FINN_HOST_BUILD_DIR, env.TEST_NAME) + + // Stash the test results file(s) + stash name: "${env.TEST_NAME}_${env.BOARD}", includes: "${env.TEST_NAME}_${env.BOARD}.xml,${env.TEST_NAME}_${env.BOARD}.html" + + // Use an env variable to help collect test results later in pipeline + env.BNN_BUILD_ZCU104 = "SUCCESS" + } + } + } + } + stage('BNN end2end - KV260_SOM') { + when { + expression { return params['end2end'] } + } + agent { + label 'finn-build' + } + environment { + BOARD = "KV260_SOM" + TEST_NAME = "bnn_build_full" + FINN_HOST_BUILD_DIR = "${env.FINN_HOST_BUILD_DIR}/${env.TEST_NAME}_${env.BOARD}" + } + steps { + catchError(stageResult: 'FAILURE') { + script { + // Creates dir in finn clone to store build files for stashing + sh "mkdir -p ${env.TEST_NAME}" + cleanPreviousBuildFiles(env.FINN_HOST_BUILD_DIR) + + // Pass in the marker to run with pytest and the XML test results filename + runDockerPytestWithMarker("bnn_kv260", "${env.TEST_NAME}_${env.BOARD}", '') + findCopyZip(env.BOARD, env.FINN_HOST_BUILD_DIR, env.TEST_NAME) + + // Stash the test results file(s) + stash name: "${env.TEST_NAME}_${env.BOARD}", includes: "${env.TEST_NAME}_${env.BOARD}.xml,${env.TEST_NAME}_${env.BOARD}.html" + + // Use an env variable to help collect test results later in pipeline + env.BNN_BUILD_KV260_SOM = "SUCCESS" + } } + } } + } } + stage('Check Stage Results') { + agent { + label 'finn-build' + } + steps { + script { + sh 'mkdir -p reports' + cleanPreviousBuildFiles('reports') + dir('reports') { + // Only unstash for stages that ran + unstashSuccessfulStage(env.SANITY_UT, "sanity_ut") + unstashSuccessfulStage(env.FPGADATAFLOW_RESULT, "fpgadataflow") + unstashSuccessfulStage(env.BNN_BUILD_SANITY, "bnn_build_sanity") + 
unstashSuccessfulStage(env.END2END_RESULT, "end2end") + unstashSuccessfulStage(env.BNN_BUILD_U250, "bnn_build_full_U250") + unstashSuccessfulStage(env.BNN_BUILD_PYNQZ1, "bnn_build_full_PynqZ1") + unstashSuccessfulStage(env.BNN_BUILD_ZCU104, "bnn_build_full_ZCU104") + unstashSuccessfulStage(env.BNN_BUILD_KV260_SOM, "bnn_build_full_KV260_SOM") + } + + // Combine individual HTML files to one single report + sh './run-docker.sh pytest_html_merger -i reports/ -o reports/test_report_final.html' + + // Archive the XML & HTML test results + archiveArtifacts artifacts: "reports/*.xml" + archiveArtifacts artifacts: "reports/*.html" + + // Plot what XML files were created during the test run + junit 'reports/*.xml' + } + } + } + } +} + +void cleanPreviousBuildFiles(String buildDir) { + // Delete any build files from a previous build + // Previous build folders affect findCopyZip() and can cause the stage to fail + if (!buildDir.empty) { + sh "rm -rf ${buildDir}" + } +} + +void createMultiMarkerScript(String markers, String testResultsFilename, String additionalOptions) { + // Passing multiple markers when running ./run-docker.sh does not work with bash. 
+ // Therefore, create a script to maintain the single quotes that surround the markers + sh """echo "#!/bin/bash +python -m pytest -m \'${markers}\' --junitxml=${testResultsFilename}.xml --html=${testResultsFilename}.html --self-contained-html ${additionalOptions}" >> run-tests.sh + """ + + // Give permissions to script + sh 'chmod 777 run-tests.sh' +} + +void runDockerPytestWithMarker(String marker, String testResultsFilename, String additionalOptions) { + sh """./run-docker.sh python -m pytest -m ${marker} --junitxml=${testResultsFilename}.xml --html=${testResultsFilename}.html --self-contained-html ${additionalOptions}""" +} + +def findBoardBuildFiles(String searchDir, String dirToFind) { + def result = sh(script: "find $searchDir -type d -name \"$dirToFind*\"", returnStdout: true).trim() + if (result.empty) { + error "Directory containing '$dirToFind' not found." + } + return result +} + +void findCopyZip(String board, String findDir, String copyDir) { + def buildDir = findBoardBuildFiles(findDir, "hw_deployment_${board}") + sh "cp -r ${buildDir}/${board} ${copyDir}/" + dir(copyDir) { + sh "zip -r ${board}.zip ${board}/" + sh "mkdir -p ${env.ARTIFACT_DIR}/${copyDir}/" + sh "cp ${board}.zip ${env.ARTIFACT_DIR}/${copyDir}/" + } +} + +void unstashSuccessfulStage(String stageEnvVariableSet, String stashName) { + if (stageEnvVariableSet) { + unstash stashName + } +} + +void archiveSuccessfulStage(String stageEnvVariableSet, String folder) { + if (stageEnvVariableSet) { + archiveArtifacts artifacts: "${folder}/**/*" + } } diff --git a/docker/jenkins/Jenkinsfile_CI b/docker/jenkins/Jenkinsfile_CI new file mode 100644 index 0000000000..5e7d5f1475 --- /dev/null +++ b/docker/jenkins/Jenkinsfile_CI @@ -0,0 +1,46 @@ +node('finn-build || built-in') { + def app + stage('Clone repository') { + /* Let's make sure we have the repository cloned to our workspace */ + checkout scm + } + withEnv([ + "FINN_XILINX_PATH=/proj/xbuilds/SWIP/2022.2_1014_8888/installs/lin64", + 
"FINN_XILINX_VERSION=2022.2", + "FINN_DOCKER_TAG=xilinx/finn:jenkins", + "FINN_HOST_BUILD_DIR=/scratch/users/finn_ci", + "PLATFORM_REPO_PATHS=/opt/xilinx/platforms" + ]){ + parallel firstBranch: { + stage('Brevitas export') { + dir("${env.WORKSPACE}") { + sh("bash run-docker.sh pytest -mbrevitas_export") + } + } + }, secondBranch: { + stage('Streamlining transformations') { + dir("${env.WORKSPACE}") { + sh("bash run-docker.sh pytest -mstreamline") + } + } + }, thirdBranch: { + stage('Util functions') { + dir("${env.WORKSPACE}") { + sh("bash run-docker.sh pytest -mutil") + } + } + }, fourthBranch: { + stage('General transformations') { + dir("${env.WORKSPACE}") { + sh("bash run-docker.sh pytest -mtransform") + } + } + }, fifthBranch: { + stage('Fpgadataflow transformations and simulations') { + dir("${env.WORKSPACE}") { + sh("bash run-docker.sh pytest -mfpgadataflow") + } + } + } + } +} diff --git a/docker/jenkins/Jenkinsfile_HW b/docker/jenkins/Jenkinsfile_HW new file mode 100644 index 0000000000..bd438d888e --- /dev/null +++ b/docker/jenkins/Jenkinsfile_HW @@ -0,0 +1,481 @@ +pipeline { + agent none + stages { + stage('Get node status') { + options { skipDefaultCheckout() } + agent { + label 'finn-build' + } + steps { + script { + // Check which boards are online before running HW tests + env.ALVEO_HOST_ONLINE = isNodeOnline('finn-u250') + env.PYNQ_ONLINE = isNodeOnline('finn-pynq') + env.ZCU104_ONLINE = isNodeOnline('finn-zcu104') + env.KV260_ONLINE = isNodeOnline('finn-kv260') + } + } + } + stage('Reboot Zynq platforms') { + parallel { + stage('Pynq-Z1') { + options { skipDefaultCheckout() } + when { + // beforeAgent set to 'true' to prevent an offline agent hanging the stage + beforeAgent true + expression { return (env.PYNQ_ONLINE == 'true') } + } + agent { + label 'finn-pynq' + } + environment { + BOARD = 'Pynq-Z1' + USER_CREDENTIALS = credentials('pynq-z1-credentials') + } + steps { + catchError(stageResult: 'FAILURE') { + restartZynqPlatform() + } + } + } + 
stage('ZCU104') { + options { skipDefaultCheckout() } + when { + // beforeAgent set to 'true' to prevent an offline agent hanging the stage + beforeAgent true + expression { return (env.ZCU104_ONLINE == 'true') } + } + agent { + label 'finn-zcu104' + } + environment { + BOARD = 'ZCU104' + USER_CREDENTIALS = credentials('pynq-z1-credentials') + } + steps { + catchError(stageResult: 'FAILURE') { + restartZynqPlatform() + } + } + } + stage('Kria KV260_SOM') { + options { skipDefaultCheckout() } + when { + // beforeAgent set to 'true' to prevent an offline agent hanging the stage + beforeAgent true + expression { return (env.KV260_ONLINE == 'true') } + } + agent { + label 'finn-kv260' + } + environment { + BOARD = 'KV260_SOM' + USER_CREDENTIALS = credentials('user-ubuntu-credentials') + } + steps { + catchError(stageResult: 'FAILURE') { + restartZynqPlatform() + } + } + } + } + } + stage('Wait for Nodes to reboot') { + options { skipDefaultCheckout() } + agent { + label 'finn-build' + } + steps { + sleep(time: "${env.REBOOT_SLEEP}", unit: 'MINUTES') + } + } + stage('Collect build information for HW testing') { + options { skipDefaultCheckout() } + agent { + label 'finn-build' + } + steps { + script { + // Check which boards are online before running HW tests + env.ALVEO_HOST_ONLINE = isNodeOnline('finn-u250') + env.PYNQ_ONLINE = isNodeOnline('finn-pynq') + env.ZCU104_ONLINE = isNodeOnline('finn-zcu104') + env.KV260_ONLINE = isNodeOnline('finn-kv260') + + // Stash the HW test scripts to be used on worker nodes + dir('docker/jenkins') { + stash name: 'bnn_test_files', includes: 'test_bnn_hw_pytest.py' + } + + // Collect build artifacts from network and stash for use on worker nodes + dir("${env.ARTIFACT_DIR}"){ + stashBuildArtifacts('bnn_build_sanity') + stashBuildArtifacts('bnn_build_full') + } + } + } + } + stage('Sanity - Run Hardware Tests') { + parallel { + stage('BNN Sanity - U250') { + when { + // beforeAgent set to 'true' to prevent an offline agent hanging the 
stage + beforeAgent true + expression { return (env.ALVEO_HOST_ONLINE == 'true') } + } + agent { + label 'finn-u250' + } + environment { + BOARD = 'U250' + } + steps { + catchError(stageResult: 'FAILURE') { + runTest("bnn_build_sanity", "${env.BOARD}", "${env.BOARD}") + } + } + post { + always { + stashResults("bnn_build_sanity", "${env.BOARD}") + cleanUpWorkspaceOwnership() + } + } + } + stage('BNN Sanity - Pynq-Z1') { + when { + // beforeAgent set to 'true' to prevent an offline agent hanging the stage + beforeAgent true + expression { return (env.PYNQ_ONLINE == 'true') } + } + agent { + label 'finn-pynq' + } + environment { + BOARD = 'Pynq-Z1' + USER_CREDENTIALS = credentials('pynq-z1-credentials') + } + steps { + catchError(stageResult: 'FAILURE') { + runTest("bnn_build_sanity", "${env.BOARD}", "Pynq") + } + } + post { + always { + stashResults("bnn_build_sanity", "${env.BOARD}") + cleanUpWorkspaceOwnership() + } + } + } + stage('BNN Sanity - ZCU104') { + when { + // beforeAgent set to 'true' to prevent an offline agent hanging the stage + beforeAgent true + expression { return (env.ZCU104_ONLINE == 'true') } + } + agent { + label 'finn-zcu104' + } + environment { + BOARD = 'ZCU104' + USER_CREDENTIALS = credentials('pynq-z1-credentials') + } + steps { + catchError(stageResult: 'FAILURE') { + runTest("bnn_build_sanity", "${env.BOARD}", "${env.BOARD}") + } + } + post { + always { + stashResults("bnn_build_sanity", "${env.BOARD}") + cleanUpWorkspaceOwnership() + } + } + } + stage('BNN Sanity - KV260_SOM') { + when { + // beforeAgent set to 'true' to prevent an offline agent hanging the stage + beforeAgent true + expression { return (env.KV260_ONLINE == 'true') } + } + agent { + label 'finn-kv260' + } + environment { + BOARD = 'KV260_SOM' + USER_CREDENTIALS = credentials('user-ubuntu-credentials') + } + steps { + catchError(stageResult: 'FAILURE') { + runTest("bnn_build_sanity", "${env.BOARD}", "${env.BOARD}") + } + } + post { + always { + 
stashResults("bnn_build_sanity", "${env.BOARD}") + cleanUpWorkspaceOwnership() + } + } + } + } + } + stage('End2end - Run Hardware Tests') { + parallel { + stage('BNN end2end - U250') { + when { + // beforeAgent set to 'true' to prevent an offline agent hanging the stage + beforeAgent true + expression { return (env.ALVEO_HOST_ONLINE == 'true') } + } + agent { + label 'finn-u250' + } + environment { + BOARD = 'U250' + } + steps { + catchError(stageResult: 'FAILURE') { + runTest("bnn_build_full", "${env.BOARD}", "${env.BOARD}") + } + } + post { + always { + stashResults("bnn_build_full", "${env.BOARD}") + cleanUpWorkspaceOwnership() + } + } + } + stage('BNN end2end - Pynq-Z1') { + when { + // beforeAgent set to 'true' to prevent an offline agent hanging the stage + beforeAgent true + expression { return (env.PYNQ_ONLINE == 'true') } + } + agent { + label 'finn-pynq' + } + environment { + BOARD = 'Pynq-Z1' + USER_CREDENTIALS = credentials('pynq-z1-credentials') + } + steps { + catchError(stageResult: 'FAILURE') { + runTest("bnn_build_full", "${env.BOARD}", "Pynq") + } + } + post { + always { + stashResults("bnn_build_full", "${env.BOARD}") + cleanUpWorkspaceOwnership() + } + } + } + stage('BNN end2end - ZCU104') { + when { + // beforeAgent set to 'true' to prevent an offline agent hanging the stage + beforeAgent true + expression { return (env.ZCU104_ONLINE == 'true') } + } + agent { + label 'finn-zcu104' + } + environment { + BOARD = 'ZCU104' + USER_CREDENTIALS = credentials('pynq-z1-credentials') + } + steps { + catchError(stageResult: 'FAILURE') { + runTest("bnn_build_full", "${env.BOARD}", "${env.BOARD}") + } + } + post { + always { + stashResults("bnn_build_full", "${env.BOARD}") + cleanUpWorkspaceOwnership() + } + } + } + stage('BNN end2end - KV260_SOM') { + when { + // beforeAgent set to 'true' to prevent an offline agent hanging the stage + beforeAgent true + expression { return (env.KV260_ONLINE == 'true') } + } + agent { + label 'finn-kv260' + } + 
environment { + BOARD = 'KV260_SOM' + USER_CREDENTIALS = credentials('user-ubuntu-credentials') + } + steps { + catchError(stageResult: 'FAILURE') { + runTest("bnn_build_full", "${env.BOARD}", "${env.BOARD}") + } + } + post { + always { + stashResults("bnn_build_full", "${env.BOARD}") + cleanUpWorkspaceOwnership() + } + } + } + } + } + stage('Check Stage Results') { + agent { + label 'finn-build' + } + steps { + script { + sh 'mkdir -p reports' + cleanPreviousBuildFiles('reports') + dir('reports') { + // Only unstash for stages that ran + unstashSuccessfulStage(env.ALVEO_HOST_ONLINE, "xml_bnn_build_sanity_U250") + unstashSuccessfulStage(env.PYNQ_ONLINE, "xml_bnn_build_sanity_Pynq-Z1") + unstashSuccessfulStage(env.ZCU104_ONLINE, "xml_bnn_build_sanity_ZCU104") + unstashSuccessfulStage(env.KV260_ONLINE, "xml_bnn_build_sanity_KV260_SOM") + unstashSuccessfulStage(env.ALVEO_HOST_ONLINE, "xml_bnn_build_full_U250") + unstashSuccessfulStage(env.PYNQ_ONLINE, "xml_bnn_build_full_Pynq-Z1") + unstashSuccessfulStage(env.ZCU104_ONLINE, "xml_bnn_build_full_ZCU104") + unstashSuccessfulStage(env.KV260_ONLINE, "xml_bnn_build_full_KV260_SOM") + } + + // Combine individual HTML files to one single report + sh './run-docker.sh pytest_html_merger -i reports/ -o reports/test_report_hw_final.html' + + // Archive the XML & HTML test results + archiveArtifacts artifacts: "reports/*.xml" + archiveArtifacts artifacts: "reports/*.html" + + // Plot what XML files were created during the test run + junit 'reports/*.xml' + } + } + } + } +} + +void cleanPreviousBuildFiles(String buildDir) { + // Delete any build files from a previous build + // Previous build folders affect findCopyZip() and can cause the stage to fail + if (!buildDir.empty) { + if (env.USER_CREDENTIALS) { + sh "echo $USER_CREDENTIALS_PSW | sudo -S rm -rf ${buildDir}*" + } else { + sh "rm -rf ${buildDir}" + } + } +} + +void createTestScript(String board, String marker, String testResultsFilename) { + if(board == "U250") + sh 
"""echo "#!/bin/bash +. /opt/xilinx/xrt/setup.sh +. ${VENV_ACTIVATE} +python -m pytest -m ${marker} --junitxml=${testResultsFilename}.xml --html=${testResultsFilename}.html --self-contained-html" >> run-tests.sh + """ + else + sh """echo "#!/bin/bash +. /etc/profile.d/pynq_venv.sh +. /etc/profile.d/xrt_setup.sh +python -m pytest -m ${marker} --junitxml=${testResultsFilename}.xml --html=${testResultsFilename}.html --self-contained-html" >> run-tests.sh + """ + + // Give permissions to script + sh 'chmod 777 run-tests.sh' +} + +def isNodeOnline(String labelName) { + Label label = Jenkins.instance.getLabel(labelName) + def agentOnline = false + + if (label) { + List nodes = Jenkins.instance.getNodes() + + nodes.each { node -> + if (node.getAssignedLabels().contains(label)) { + def computer = node.toComputer() + if (computer && computer.isOnline()) { + agentOnline = true + } else { + echo """Agent ${node.displayName} is offline""" + } + } + } + } else { + echo """Node with label ${labelName} not found""" + } + + return agentOnline +} + +void unstashSuccessfulStage(String stageEnvVariableSet, String stashName) { + if (stageEnvVariableSet) { + unstash stashName + } +} + +void stashBuildArtifacts(String testDir) { + dir("$testDir") { + def files = findFiles() + files.each { f -> + def file = f.toString() + def extIndex = file.lastIndexOf(".") + def boardName = file.substring(0, extIndex) + stash name: "${testDir}_${boardName}_zip", includes: "${f}" + } + } +} + +void runTest(String testType, String board, String marker) { + sh "mkdir -p ${testType}" + dir("$testType") { + // Clean any files from a previous run + cleanPreviousBuildFiles("${board}*") + + // Get the test files + unstash name: "${testType}_${board}_zip" + sh "unzip -o ${board}.zip" + + dir("$board") { + // Get the scripts necessary for running hw tests + unstash name: 'bnn_test_files' + + // Create test script + createTestScript(board, marker, "${testType}_hw_${board}") + + if (env.USER_CREDENTIALS) { + // 
Execute the script as the root user - needed for zynq platforms + sh 'echo ${USER_CREDENTIALS_PSW} | sudo -S ./run-tests.sh' + } else { + // Execute the script + sh './run-tests.sh' + } + } + } +} + +void stashResults (String testType, String board) { + // Get test result file and delete test files on the board + dir("${testType}/${board}") { + // Collect the results file on the worker node by stashing + try { + stash name: "xml_${testType}_${board}", includes: "${testType}_hw_${board}.xml,${testType}_hw_${board}.html" + } catch (err) { + echo "No results to stash" + } + } +} + +void cleanUpWorkspaceOwnership () { + if (env.USER_CREDENTIALS) { + sh 'echo ${USER_CREDENTIALS_PSW} | sudo -S chown -R $(id -u):$(id -g) ${WORKSPACE}' + } +} + +void restartZynqPlatform () { + if (env.USER_CREDENTIALS) { + sh 'echo ${USER_CREDENTIALS_PSW} | sudo -S shutdown -r +1' + } +} diff --git a/docker/jenkins/test_bnn_hw_pytest.py b/docker/jenkins/test_bnn_hw_pytest.py new file mode 100755 index 0000000000..dc350d8504 --- /dev/null +++ b/docker/jenkins/test_bnn_hw_pytest.py @@ -0,0 +1,213 @@ +import pytest + +import itertools +import logging +import numpy as np +import os +import subprocess +from scipy.stats import linregress + +# no __init__ constructors allowed in Pytest - so use global variables instead +base_dir_global = os.getcwd() +default_test_run_timeout = 30 # seconds +output_execute_results_file = "output.npy" +execute_results_reference_file = "output_reference.npy" +output_throughput_results_file = "nw_metrics.txt" +throughput_results_formatted_file = "throughput_metrics_formatted.txt" +logger = logging.getLogger(__name__) + + +def remove_cache_dirs(dir_list): + tmp_list = list(dir_list) + for i in range(len(tmp_list) - 1, -1, -1): + if ".pytest_cache" in tmp_list[i]: + del tmp_list[i] + elif "__pycache__" in tmp_list[i]: + del tmp_list[i] + return tmp_list + + +def delete_file(file_path): + # Check if the file exists before deleting it + if os.path.exists(file_path): + 
try: + os.remove(file_path) + logger.info(f"File '{file_path}' deleted successfully.") + except Exception as e: + logger.error(f"An error occurred while deleting the file: {e}") + else: + logger.info(f"File '{file_path}' does not exist. Continuing with the script.") + + +def get_platform(board_str): + return "alveo" if "U250" in board_str else "zynq-iodma" + + +def get_full_parameterized_test_list(marker, test_dir_list, batch_size_list, platform_list): + test_cases = [ + ( + f"{marker}_{param1}_batchSize-{param2}_platform-{param3}", + { + "test_dir": param1, + "batch_size": param2, + "platform": param3, + }, + ) + for param1, param2, param3 in itertools.product( + test_dir_list, + batch_size_list, + platform_list, + ) + ] + return test_cases + + +def pytest_generate_tests(metafunc): + idlist = [] + argvalues = [] + scenarios = [] + + # Separate the full list of markers used on command line. + # This allows a user to select multiple markers + all_markers_used = metafunc.config.getoption("-m").split(" ") + current_dir = os.getcwd() + test_dirs = [ + name for name in os.listdir(current_dir) if os.path.isdir(os.path.join(current_dir, name)) + ] + test_dirs = remove_cache_dirs(test_dirs) + + for marker in all_markers_used: + if "Pynq" in marker or "U250" in marker or "ZCU104" in marker or "KV260_SOM" in marker: + platform = get_platform(marker) + scenarios.extend( + get_full_parameterized_test_list( + marker, test_dir_list=test_dirs, batch_size_list=[1], platform_list=[platform] + ) + ) + + if len(scenarios) > 0: + for scenario in scenarios: + # There is a known Pynq/XRT issue with larger sets of weights on Alveo. + # Accesses to address spaces over 16KB do NOT work as intended. + # Disabling Alveo lfc HW test until resolved. 
+ if scenario[0] == "U250_bnn_w1_a1_lfc_batchSize-1_platform-alveo": + continue + idlist.append(scenario[0]) + items = scenario[1].items() + argnames = [x[0] for x in items] + argvalues.append([x[1] for x in items]) + metafunc.parametrize(argnames, argvalues, ids=idlist, scope="class") + + +@pytest.mark.Pynq +@pytest.mark.U250 +@pytest.mark.ZCU104 +@pytest.mark.KV260_SOM +class TestBnn: + def test_type_execute(self, test_dir, batch_size, platform): + # Enter into test directory and clean any files from a potential previous run + os.chdir(os.path.join(base_dir_global, test_dir)) + delete_file(output_execute_results_file) + + # Run test option: execute + bitfile = "a.xclbin" if platform == "alveo" else "resizer.bit" + result = subprocess.run( + [ + "python", + "driver.py", + "--exec_mode=execute", + f"--batchsize={batch_size}", + f"--bitfile={bitfile}", + "--inputfile=input.npy", + "--outputfile=output.npy", + f"--platform={platform}", + ], + capture_output=True, + text=True, + timeout=default_test_run_timeout, + ) + assert result.returncode == 0 + + # Load the output and reference arrays + output_array = np.load(output_execute_results_file) + reference_array = np.load(execute_results_reference_file) + + # Compare the arrays + try: + assert np.isclose(output_array, reference_array).all() + except AssertionError as e: + logger.error("AssertionError occurred: %s", e, exc_info=True) + raise + + def test_type_throughput(self, test_dir, batch_size, platform): + os.chdir(os.path.join(base_dir_global, test_dir)) + delete_file(output_throughput_results_file) + + # Run test option: throughput + bitfile = "a.xclbin" if platform == "alveo" else "resizer.bit" + result = subprocess.run( + [ + "python", + "driver.py", + "--exec_mode=throughput_test", + f"--batchsize={batch_size}", + f"--bitfile={bitfile}", + "--inputfile=input.npy", + "--outputfile=output.npy", + f"--platform={platform}", + ], + capture_output=True, + text=True, + timeout=default_test_run_timeout, + ) + assert 
result.returncode == 0 + + # Check if nw_metrics.txt now exists after test run + assert os.path.exists(output_throughput_results_file) + + with open(output_throughput_results_file, "r") as file: + res = eval(file.read()) + + # try a range of batch sizes, some may fail due to insufficient DMA + # buffers + bsize_range_in = [8**i for i in range(5)] + bsize_range = [] + ret = dict() + for bsize in bsize_range_in: + if res is not None: + ret[bsize] = res + bsize_range.append(bsize) + else: + # assume we reached largest possible N + break + + y = [ret[key]["runtime[ms]"] for key in bsize_range] + lrret = linregress(bsize_range, y) + ret_str = "" + ret_str += "\n" + "%s Throughput Test Results" % test_dir + ret_str += "\n" + "-----------------------------" + ret_str += "\n" + "From linear regression:" + ret_str += "\n" + "Invocation overhead: %f ms" % lrret.intercept + ret_str += "\n" + "Time per sample: %f ms" % lrret.slope + ret_str += "\n" + "Raw data:" + + ret_str += "\n" + "{:<8} {:<16} {:<16} {:<16} {:<16} {:<16}".format( + "N", "runtime[ms]", "fclk[mhz]", "fps", "DRAM rd[MB/s]", "DRAM wr[MB/s]" + ) + for k in bsize_range: + v = ret[k] + ret_str += "\n" + "{:<8} {:<16} {:<16} {:<16} {:<16} {:<16}".format( + k, + np.round(v["runtime[ms]"], 4), + v["fclk[mhz]"], + np.round(v["throughput[images/s]"], 2), + np.round(v["DRAM_in_bandwidth[MB/s]"], 2), + np.round(v["DRAM_out_bandwidth[MB/s]"], 2), + ) + ret_str += "\n" + "-----------------------------" + # largest_bsize = bsize_range[-1] + + # Dump the metrics to a text file + with open(throughput_results_formatted_file, "w") as f: + f.write(ret_str) + assert os.path.exists(throughput_results_formatted_file) diff --git a/docker/quicktest.sh b/docker/quicktest.sh index b4ad37232f..3684e3a0d4 100755 --- a/docker/quicktest.sh +++ b/docker/quicktest.sh @@ -6,16 +6,16 @@ cd $FINN_ROOT # check if command line argument is empty or not present if [ -z $1 ]; then echo "Running quicktest: not (vivado or slow or board) with 
pytest-xdist" - python setup.py test --addopts "-m 'not (vivado or slow or vitis or board)' --dist=loadfile -n $PYTEST_PARALLEL" + pytest -m 'not (vivado or slow or vitis or board or notebooks or bnn_pynq)' --dist=loadfile -n $PYTEST_PARALLEL elif [ $1 = "main" ]; then echo "Running main test suite: not (rtlsim or end2end) with pytest-xdist" - python setup.py test --addopts "-k 'not (rtlsim or end2end)' --dist=loadfile -n $PYTEST_PARALLEL" + pytest -k 'not (rtlsim or end2end)' --dist=loadfile -n $PYTEST_PARALLEL elif [ $1 = "rtlsim" ]; then echo "Running rtlsim test suite with pytest-parallel" - python setup.py test --addopts "-k rtlsim --workers $PYTEST_PARALLEL" + pytest -k rtlsim --workers $PYTEST_PARALLEL elif [ $1 = "end2end" ]; then echo "Running end2end test suite with no parallelism" - python setup.py test --addopts "-k end2end" + pytest -k end2end elif [ $1 = "full" ]; then echo "Running full test suite, each step with appropriate parallelism" $0 main; diff --git a/docs/finn/brevitas_export.rst b/docs/finn/brevitas_export.rst index 950b601f98..0a1c788324 100644 --- a/docs/finn/brevitas_export.rst +++ b/docs/finn/brevitas_export.rst @@ -8,11 +8,11 @@ Brevitas Export :scale: 70% :align: center -FINN expects an ONNX model as input. This can be a model trained with `Brevitas `_. Brevitas is a PyTorch library for quantization-aware training and the FINN Docker image comes with several `example Brevitas networks `_. Brevitas provides an export of a quantized network in ONNX representation in several flavors. -Two of the Brevitas-exported ONNX variants can be ingested by FINN: - - * FINN-ONNX: Quantized weights exported as tensors with additional attributes to mark low-precision datatypes. Quantized activations exported as MultiThreshold nodes. - * QONNX: All quantization is represented using Quant, BinaryQuant or Trunc nodes. QONNX must be converted into FINN-ONNX by :py:mod:`finn.transformation.qonnx.convert_qonnx_to_finn` +FINN expects an ONNX model as input. 
This can be a model trained with `Brevitas `_. Brevitas is a PyTorch library for quantization-aware training and the FINN Docker image comes with several `example Brevitas networks `_. +Brevitas provides an export of a quantized network in QONNX representation, which is the format that can be ingested by FINN. +In a QONNX graph, all quantization is represented using Quant, BinaryQuant or Trunc nodes. +QONNX must be converted into FINN-ONNX by :py:mod:`finn.transformation.qonnx.convert_qonnx_to_finn`. FINN-ONNX is the intermediate representation (IR) FINN uses internally. +In this IR, quantized weights are indicated through tensors with additional attributes to mark low-precision datatypes and quantized activations are expressed as MultiThreshold nodes. To work with either type of ONNX model, it is loaded into a :ref:`modelwrapper` provided by FINN. diff --git a/docs/finn/command_line.rst b/docs/finn/command_line.rst index 8c37479a28..110a522847 100644 --- a/docs/finn/command_line.rst +++ b/docs/finn/command_line.rst @@ -20,7 +20,7 @@ two command line entry points for productivity and ease-of-use: Jupyter notebook as a starting point, visualizing the model at intermediate steps and adding calls to new transformations as needed. Once you have a working flow, you can implement a command line entry for this - by using the "advanced mode" described here. + by using the "advanced mode". Simple dataflow build mode @@ -28,7 +28,7 @@ Simple dataflow build mode This mode is intended for simpler networks whose topologies resemble the FINN end-to-end examples. -It runs a fixed build flow spanning tidy-up, streamlining, HLS conversion +It runs a fixed build flow spanning tidy-up, streamlining, HW conversion and hardware synthesis. It can be configured to produce different outputs, including stitched IP for integration in Vivado IPI as well as bitfiles. @@ -43,7 +43,9 @@ To use it, first create a folder with the necessary configuration and model file 3. 
Create a JSON file with the build configuration. It must be named ``dataflow_build_dir/dataflow_build_config.json``. Read more about the build configuration options on :py:mod:`finn.builder.build_dataflow_config.DataflowBuildConfig`. You can find an example .json file under ``src/finn/qnn-data/build_dataflow/dataflow_build_config.json`` -4. (Optional) create a JSON file with the folding configuration. It must be named ``dataflow_build_dir/folding_config.json``. +4. (Optional) create a JSON file with the specialize layers configuration. It must be named ``dataflow_build_dir/specialize_layers_config.json`` + You can find an example .json file under ``src/finn/qnn-data/build_dataflow/specialize_layers_config.json``. +5. (Optional) create a JSON file with the folding configuration. It must be named ``dataflow_build_dir/folding_config.json``. You can find an example .json file under ``src/finn/qnn-data/build_dataflow/folding_config.json``. Instead of specifying the folding configuration, you can use the `target_fps` option in the build configuration to control the degree of parallelization for your network. @@ -59,25 +61,28 @@ as it goes through numerous steps: .. 
code-block:: none - Building dataflow accelerator from /home/maltanar/sandbox/build_dataflow/model.onnx + Building dataflow accelerator from build_dataflow/model.onnx Outputs will be generated at output_tfc_w1a1_Pynq-Z1 Build log is at output_tfc_w1a1_Pynq-Z1/build_dataflow.log - Running step: step_tidy_up [1/16] - Running step: step_streamline [2/16] - Running step: step_convert_to_hls [3/16] - Running step: step_create_dataflow_partition [4/16] - Running step: step_target_fps_parallelization [5/16] - Running step: step_apply_folding_config [6/16] - Running step: step_generate_estimate_reports [7/16] - Running step: step_hls_codegen [8/16] - Running step: step_hls_ipgen [9/16] - Running step: step_set_fifo_depths [10/16] - Running step: step_create_stitched_ip [11/16] - Running step: step_measure_rtlsim_performance [12/16] - Running step: step_make_pynq_driver [13/16] - Running step: step_out_of_context_synthesis [14/16] - Running step: step_synthesize_bitfile [15/16] - Running step: step_deployment_package [16/16] + Running step: step_qonnx_to_finn [1/19] + Running step: step_tidy_up [2/19] + Running step: step_streamline [3/19] + Running step: step_convert_to_hw [4/19] + Running step: step_create_dataflow_partition [5/19] + Running step: step_specialize_layers [6/19] + Running step: step_target_fps_parallelization [7/19] + Running step: step_apply_folding_config [8/19] + Running step: step_minimize_bit_width [9/19] + Running step: step_generate_estimate_reports [10/19] + Running step: step_hw_codegen [11/19] + Running step: step_hw_ipgen [12/19] + Running step: step_set_fifo_depths [13/19] + Running step: step_create_stitched_ip [14/19] + Running step: step_measure_rtlsim_performance [15/19] + Running step: step_out_of_context_synthesis [16/19] + Running step: step_synthesize_bitfile [17/19] + Running step: step_make_pynq_driver [18/19] + Running step: step_deployment_package [19/19] You can read a brief description of what each step does on @@ -99,6 +104,7 @@ 
The following outputs will be generated regardless of which particular outputs a * ``build_dataflow.log`` is the build logfile that will contain any warnings/errors * ``time_per_step.json`` will report the time (in seconds) each build step took * ``final_hw_config.json`` will contain the final (after parallelization, FIFO sizing etc) hardware configuration for the build +* ``template_specialize_layers_config.json`` is an example json file that can be used to set the specialize layers config * ``intermediate_models/`` will contain the ONNX file(s) produced after each build step @@ -206,3 +212,5 @@ You can launch the desired custom build flow using: This will mount the specified folder into the FINN Docker container and launch the build flow. If ```` is not specified it will default to ``build`` and thus execute ``build.py``. If it is specified, it will be ``.py``. + +If you would like to learn more about advance builder settings, please have a look at `our tutorial about this topic `_. diff --git a/docs/finn/conf.py b/docs/finn/conf.py index 47ba99fb5f..a4416706c2 100644 --- a/docs/finn/conf.py +++ b/docs/finn/conf.py @@ -19,7 +19,7 @@ # -- Project information ----------------------------------------------------- project = "FINN" -copyright = "2020, Xilinx" +copyright = "2020-2022, Xilinx, 2022-2024, AMD" author = "Y. Umuroglu and J. Petri-Koenig" diff --git a/docs/finn/developers.rst b/docs/finn/developers.rst index f9252f764c..2a5e26959b 100644 --- a/docs/finn/developers.rst +++ b/docs/finn/developers.rst @@ -2,15 +2,13 @@ Developer documentation *********************** -.. note:: **This page is under construction.** - This page is intended to serve as a starting point for new FINN developers. Power users may also find this information useful. Prerequisites ================ -Before starting to do development on FINN it's a good idea to start +Before starting to do development on FINN it is a good idea to start with understanding the basics as a user. 
Going through all of the :ref:`tutorials` is strongly recommended if you haven't already done so. Additionally, please review the documentation available on :ref:`internals`. @@ -61,7 +59,7 @@ further detailed below: Docker images =============== -If you want to add new dependencies (packages, repos) to FINN it's +If you want to add new dependencies (packages, repos) to FINN it is important to understand how we handle this in Docker. The finn.dev image is built and launched as follows: @@ -70,7 +68,7 @@ The finn.dev image is built and launched as follows: 2. run-docker.sh launches the build of the Docker image with `docker build` (unless ``FINN_DOCKER_PREBUILT=1``). Docker image is built from docker/Dockerfile.finn using the following steps: - * Base: PyTorch dev image + * Base: Ubuntu 22.04 LTS image * Set up apt dependencies: apt-get install a few packages for verilator and * Set up pip dependencies: Python packages FINN depends on are listed in requirements.txt, which is copied into the container and pip-installed. Some additional packages (such as Jupyter and Netron) are also installed. * Install XRT deps, if needed: For Vitis builds we need to install the extra dependencies for XRT. This is only triggered if the image is built with the INSTALL_XRT_DEPS=1 argument. @@ -84,9 +82,9 @@ The finn.dev image is built and launched as follows: 4. Entrypoint script (docker/finn_entrypoint.sh) upon launching container performs the following: - * Source Vivado settings64.sh from specified path to make vivado and vivado_hls available. - * Download PYNQ board files into the finn root directory, unless they already exist. - * Source Vitits settings64.sh if Vitis is mounted. + * Source Vivado settings64.sh from specified path to make vivado and vitis_hls available. + * Download board files into the finn root directory, unless they already exist or ``FINN_SKIP_BOARD_FILES=1``. + * Source Vitis settings64.sh if Vitis is mounted. 5. 
Depending on the arguments to run-docker.sh a different application is launched. run-docker.sh notebook launches a Jupyter server for the tutorials, whereas run-docker.sh build_custom and run-docker.sh build_dataflow trigger a dataflow build (see documentation). Running without arguments yields an interactive shell. See run-docker.sh for other options. @@ -106,7 +104,7 @@ Linting We use a pre-commit hook to auto-format Python code and check for issues. See https://pre-commit.com/ for installation. Once you have pre-commit, you can install the hooks into your local clone of the FINN repo. -It's recommended to do this **on the host** and not inside the Docker container: +It is recommended to do this **on the host** and not inside the Docker container: :: @@ -119,7 +117,7 @@ you may have to fix it manually, then run `git commit` once again. The checks are configured in .pre-commit-config.yaml under the repo root. Testing -======= +======== Tests are vital to keep FINN running. All the FINN tests can be found at https://github.com/Xilinx/finn/tree/main/tests. These tests can be roughly grouped into three categories: @@ -132,7 +130,7 @@ These tests can be roughly grouped into three categories: Additionally, qonnx, brevitas and finn-hlslib also include their own test suites. The full FINN compiler test suite -(which will take several hours to run and require a PYNQ board) can be executed +(which will take several hours to run) can be executed by: :: @@ -146,7 +144,7 @@ requiring Vivado or as slow-running tests: bash ./run-docker.sh quicktest -When developing a new feature it's useful to be able to run just a single test, +When developing a new feature it is useful to be able to run just a single test, or a group of tests that e.g. share the same prefix. You can do this inside the Docker container from the FINN root directory as follows: @@ -159,8 +157,8 @@ from the FINN root directory as follows: If you want to run tests in parallel (e.g. 
to take advantage of a multi-core CPU) you can use: -* pytest-parallel for any rtlsim tests, e.g. `python setup.py test --addopts "-k rtlsim --workers auto"` -* pytest-xdist for anything else, make sure to add `--dist=loadfile` if you have tests in the same file that have dependencies on each other e.g. `python setup.py test --addopts "-k mytest -n auto --dist=loadfile"` +* pytest-parallel for any rtlsim tests, e.g. `pytest -k rtlsim --workers auto` +* pytest-xdist for anything else, make sure to add `--dist=loadfile` if you have tests in the same file that have dependencies on each other e.g. `pytest -k mytest -n auto --dist=loadfile` Finally, the full test suite with appropriate parallelization can be run inside the container by: @@ -178,16 +176,9 @@ FINN provides two types of documentation: * manually written documentation, like this page * autogenerated API docs from Sphinx -Everything is built using Sphinx, which is installed into the finn.dev -Docker image. You can build the documentation locally by running the following -inside the container: - -:: - - python setup.py docs +Everything is built using Sphinx. -You can view the generated documentation on build/html/index.html. -The documentation is also built online by readthedocs: +The documentation is built online by readthedocs: * finn.readthedocs.io contains the docs from the master branch * finn-dev.readthedocs.io contains the docs from the dev branch diff --git a/docs/finn/end_to_end_flow.rst b/docs/finn/end_to_end_flow.rst index 0a022067c3..8fafde5a5e 100644 --- a/docs/finn/end_to_end_flow.rst +++ b/docs/finn/end_to_end_flow.rst @@ -2,7 +2,11 @@ End-to-End Flow *************** -The following image shows an example end-to-end flow in FINN, starting from a trained PyTorch/Brevitas network and going all the way to a running FPGA accelerator. +The following image shows an example end-to-end flow in FINN for a PYNQ board. 
+Please note that you can build an IP block for your neural network **for every Xilinx-AMD FPGA**, but we only provide automatic system integration for a limited number of boards. +However, you can use Vivado to integrate an IP block generated by FINN into your own design. + +The example flow in this image starts from a trained PyTorch/Brevitas network and goes all the way to a running FPGA accelerator. As you can see in the picture, FINN has a high modularity and has the property that the flow can be stopped at any point and the intermediate result can be used for further processing or other purposes. This enables a wide range of users to benefit from FINN, even if they do not use the whole flow. .. image:: ../../notebooks/end2end_example/bnn-pynq/finn-design-flow-example.svg diff --git a/docs/finn/faq.rst b/docs/finn/faq.rst index ef4457f53a..70c2f24ed2 100644 --- a/docs/finn/faq.rst +++ b/docs/finn/faq.rst @@ -7,16 +7,6 @@ Frequently Asked Questions Can't find the answer to your question here? Check `FINN GitHub Discussions `_. -Can I install FINN out of the Docker container? - We do not support out of the Docker implementations at the moment. This is due - to the high complexity of the FINN project dependencies. - -Since FINN uses ONNX, can I compile any model from the ONNX Model Zoo to an FPGA accelerator? - The short answer is no. FINN uses ONNX in a specific (non-standard) way, including custom layer - types and quantization annotations. Networks must be first quantized using Brevitas and exported - to FINN-ONNX to be converted to FPGA accelerators. - - Can I install FINN out of the Docker container? We do not support out of the Docker implementations at the moment. This is due to the high complexity of the FINN project dependencies. @@ -52,7 +42,6 @@ What operating systems are supported by FINN? FINN should work fine under any Linux-based OS capable of running Vivado/Vitis, as long as you install Docker (``docker-ce``) on your machine. 
- I am getting DocNav and Model_Composer errors when launching the Docker image. We do not mount those particular directories into the Docker container because they are not used. The errors are Vivado related but you can safely ignore them. @@ -74,16 +63,8 @@ How can I target an arbitrary Xilinx FPGA without PYNQ support? Why does FINN-generated architectures need FIFOs between layers? See https://github.com/Xilinx/finn/discussions/383 -How do I tell FINN to utilize DSPs instead of LUTs for MAC operations in particular layers? - This is done with the ``resType="dsp"`` attribute on ``MatrixVectorActivation`` and ``Vector_Vector_Activate`` instances. - When using the ``build_dataflow`` system, this can be specified at a per layer basis by specifying it as part of one or more layers’ - folding config (:py:mod:`finn.builder.build_dataflow_config.DataflowBuildConfig.folding_config_file`). - This is a good idea for layers with more weight/input act bits and high PE*SIMD. - See the `MobileNet-v1 build config for ZCU104 in finn-examples `_ for reference. - - How do I tell FINN to utilize a particular type of memory resource in particular layers? - This is done with the ``ram_style`` attribute. Check the particular ``HLSCustomOp`` attribute definition to see + This is done with the ``ram_style`` attribute. Check the particular ``HWCustomOp`` attribute definition to see which modes are supported (`example for MatrixVectorActivation `_). When using the ``build_dataflow`` system, this can be specified at a per layer basis by specifying it as part of one or more layers’ folding config (:py:mod:`finn.builder.build_dataflow_config.DataflowBuildConfig.folding_config_file`). diff --git a/docs/finn/getting_started.rst b/docs/finn/getting_started.rst index 9b3111b70e..217f982702 100644 --- a/docs/finn/getting_started.rst +++ b/docs/finn/getting_started.rst @@ -8,7 +8,7 @@ Quickstart ========== 1. Install Docker to run `without root `_ -2. 
Set up ``FINN_XILINX_PATH`` and ``FINN_XILINX_VERSION`` environment variables pointing respectively to the Xilinx tools installation directory and version (e.g. ``FINN_XILINX_PATH=/opt/Xilinx`` and ``FINN_XILINX_VERSION=2022.1``) +2. Set up ``FINN_XILINX_PATH`` and ``FINN_XILINX_VERSION`` environment variables pointing respectively to the Xilinx tools installation directory and version (e.g. ``FINN_XILINX_PATH=/opt/Xilinx`` and ``FINN_XILINX_VERSION=2022.2``) 3. Clone the FINN compiler from the repo: ``git clone https://github.com/Xilinx/finn/`` and go into the directory where it is cloned 4. Execute ``./run-docker.sh quicktest`` to verify your installation. 5. Optionally, follow the instructions on :ref:`PYNQ board first-time setup` or :ref:`Alveo first-time setup` for board setup. @@ -28,8 +28,8 @@ to train *customized* networks and create highly-efficient FPGA implementations In general, the approach for using the FINN framework is as follows: 1. Train your own quantized neural network (QNN) in `Brevitas `_. We have some `guidelines `_ on quantization-aware training (QAT). -2. Export to FINN-ONNX by following `this tutorial `_ . -3. Use FINN's ``build_dataflow`` system on the exported model by following this `tutorial `_ +2. Export to QONNX and convert to FINN-ONNX by following `this tutorial `_ . +3. Use FINN's ``build_dataflow`` system on the exported model by following this `tutorial `_ or for advanced settings have a look at this `tutorial `_ . 4. Adjust your QNN topology, quantization settings and ``build_dataflow`` configuration to get the desired results. Please note that the framework is still under development, and how well this works will depend on how similar your custom network is to the examples we provide. @@ -49,17 +49,16 @@ Running FINN in Docker ====================== FINN runs inside a Docker container, it comes with a script to easily build and launch the container. 
+Simply running bash run-docker.sh without any additional arguments will create a Docker container with all dependencies and give you a terminal which you can use for development and experimentation:
``2022.1``) +* (required) ``FINN_XILINX_VERSION`` sets the Xilinx tools version to be used (e.g. ``2022.2``) * (required for Alveo) ``PLATFORM_REPO_PATHS`` points to the Vitis platform files (DSA). * (required for Alveo) ``XRT_DEB_VERSION`` specifies the .deb to be installed for XRT inside the container (see default value in ``run-docker.sh``). * (optional) ``NUM_DEFAULT_WORKERS`` (default 4) specifies the degree of parallelization for the transformations that can be run in parallel, potentially reducing build time @@ -107,18 +107,14 @@ These are summarized below: * (optional) ``LOCALHOST_URL`` (default localhost) sets the base URL for accessing e.g. Netron from inside the container. Useful when running FINN remotely. * (optional) ``NETRON_PORT`` (default 8081) changes the port for Netron inside Docker * (optional) ``PYNQ_BOARD`` or ``ALVEO_BOARD`` specifies the type of PYNQ/Alveo board used (see "supported hardware" below) for the test suite -* (optional) ``PYNQ_IP`` and ``PYNQ_PORT`` (or ``ALVEO_IP`` and ``ALVEO_PORT``) specify ip address and port number to access the PYNQ board / Alveo target -* (optional) ``PYNQ_USERNAME`` and ``PYNQ_PASSWORD`` (or ``ALVEO_USERNAME`` and ``ALVEO_PASSWORD``) specify the PYNQ board / Alveo host access credentials for the test suite. For PYNQ, password is always needed to run as sudo. For Alveo, you can leave the password empty and place your ssh private key in the ``finn/ssh_keys`` folder to use keypair authentication. -* (optional) ``PYNQ_TARGET_DIR`` (or ``ALVEO_TARGET_DIR``) specifies the target dir on the PYNQ board / Alveo host for the test suite * (optional) ``IMAGENET_VAL_PATH`` specifies the path to the ImageNet validation directory for tests. -* (optional) ``FINN_DOCKER_PREBUILT`` (default 0) if set to 1 then skip Docker image building and use the image tagged with ``FINN_DOCKER_TAG``. * (optional) ``FINN_DOCKER_TAG`` (autogenerated) specifies the Docker image tag to use. 
* (optional) ``FINN_SKIP_DEP_REPOS`` (default "0") skips the download of FINN dependency repos (uses the ones already downloaded under deps/).
-If you are having trouble building the Docker image or need offline access, you can use prebuilt images by following these steps: - -1. Pull a prebuilt Docker image with ``docker pull maltanar/finn:`` where ```` can be ``dev_latest`` or ``main_latest`` -2. Set the ``FINN_DOCKER_TAG`` to the name of the image you just pulled e.g. ``FINN_DOCKER_TAG=maltanar/finn:dev_latest`` -3. Set ``FINN_DOCKER_PREBUILT=1`` -4. You can now launch the Docker image in all modes without re-building or any internet access. - - Supported FPGA Hardware ======================= -**Shell-integrated accelerator + driver:** For quick deployment, we target boards supported by `PYNQ `_ . For these platforms, we can build a full bitfile including DMAs to move data into and out of the FINN-generated accelerator, as well as a Python driver to launch the accelerator. We support the Pynq-Z1, Pynq-Z2, Ultra96, ZCU102 and ZCU104 boards, as well as Alveo cards. +**Vivado IPI support for any Xilinx FPGA:** FINN generates a Vivado IP Integrator (IPI) design from the neural network with AXI stream (FIFO) in-out interfaces, which can be integrated onto any Xilinx-AMD FPGA as part of a larger system. It’s up to you to take the FINN-generated accelerator (what we call “stitched IP” in the tutorials), wire it up to your FPGA design and send/receive neural network data to/from the accelerator. -**Vivado IPI support for any Xilinx FPGA:** FINN generates a Vivado IP Integrator (IPI) design from the neural network with AXI stream (FIFO) in-out interfaces, which can be integrated onto any Xilinx FPGA as part of a larger system. It's up to you to take the FINN-generated accelerator (what we call "stitched IP" in the tutorials), wire it up to your FPGA design and send/receive neural network data to/from the accelerator. +**Shell-integrated accelerator + driver:** For quick deployment, we target boards supported by `PYNQ `_ . 
For these platforms, we can build a full bitfile including DMAs to move data into and out of the FINN-generated accelerator, as well as a Python driver to launch the accelerator. We support the Pynq-Z1, Pynq-Z2, Kria SOM, Ultra96, ZCU102 and ZCU104 boards, as well as Alveo cards. PYNQ board first-time setup **************************** @@ -179,7 +163,7 @@ On the target side: On the host side: -1. Install Vitis 2022.1 and set up the ``VITIS_PATH`` environment variable to point to your installation. +1. Install Vitis 2022.2 and set up the ``VITIS_PATH`` environment variable to point to your installation. 2. Install Xilinx XRT. Ensure that the ``XRT_DEB_VERSION`` environment variable reflects which version of XRT you have installed. 3. Install the Vitis platform files for Alveo and set up the ``PLATFORM_REPO_PATHS`` environment variable to point to your installation. *This must be the same path as the target's platform files (target step 2)* 4. Set up the ``ALVEO_*`` environment variables accordingly for your target, see description of environment variables above. @@ -203,7 +187,7 @@ System Requirements * Ubuntu 18.04 with ``bash`` installed * Docker `without root `_ -* A working Vitis/Vivado 2022.1 installation +* A working Vitis/Vivado 2022.2 installation * ``FINN_XILINX_PATH`` and ``FINN_XILINX_VERSION`` environment variables correctly set, see `Quickstart`_ * *(optional)* `Vivado/Vitis license`_ if targeting non-WebPack FPGA parts. 
* *(optional)* A PYNQ board with a network connection, see `PYNQ board first-time setup`_ diff --git a/docs/finn/hw_build.rst b/docs/finn/hw_build.rst index a5c486935d..39c39eb7df 100644 --- a/docs/finn/hw_build.rst +++ b/docs/finn/hw_build.rst @@ -8,7 +8,7 @@ Hardware Build and Deployment :scale: 70% :align: center -A model where all layers have been converted to HLS layers can be processed by +A model where all layers have been converted to either HLS or RTL layers can be processed by FINN to build a bitfile and driver targeting a Zynq or Alveo system or to generate a Vivado IP Integrator (IPI) design with AXI stream (FIFO) in-out interfaces, which can be integrated onto any Xilinx FPGA as part of a larger system. @@ -69,9 +69,11 @@ FINN will descend into each partition and insert FIFO nodes between streaming no where FIFO depths dictated by the node attributes, using the :py:mod:`finn.transformation.fpgadataflow.insert_fifo.InsertFIFO` transformation. Afterwards, IP blocks will be created for each partition, which in turn contain the -IP blocks for each layer stitched together. The layer-level IP blocks -are generated by Vivado HLS, using a sequence of :py:mod:`finn.transformation.fpgadataflow.prepare_ip.PrepareIP` +IP blocks for HLS layers and RTL modules for RTL layers stitched together. The layer-level IP blocks for HLS layers +are generated by Vitis HLS, using a sequence of :py:mod:`finn.transformation.fpgadataflow.prepare_ip.PrepareIP` and :py:mod:`finn.transformation.fpgadataflow.hlssynth_ip.HLSSynthIP` transformations. +For RTL layers calling :py:mod:`finn.transformation.fpgadataflow.prepare_ip.PrepareIP` will fill out the RTL wrapper files and store all files belonging to the RTL module in a folder. + The top-level IP blocks are generated in Vivado IPI, using the :py:mod:`finn.transformation.fpgadataflow.create_stitched_ip.CreateStitchedIP` transformation. 
Vivado/Vitis Project Generation and Synthesis @@ -85,8 +87,4 @@ transformation for Zynq, and the `VitisLink` transformation for Alveo. Deployment ========== - -Deployment and Remote Execution -------------------------------- - -The bitfile and the driver file(s) are copied to the PYNQ board and can be executed there. For more information see the description in the `end2end_example `_ Jupyter notebooks. +The bitfile and the driver file(s) can be copied to the PYNQ board and be executed there. For more information see the description in the `end2end_example `_ Jupyter notebooks. diff --git a/docs/finn/img/finn-hw-build.png b/docs/finn/img/finn-hw-build.png index f3a591fa8f..412317b8d1 100644 Binary files a/docs/finn/img/finn-hw-build.png and b/docs/finn/img/finn-hw-build.png differ diff --git a/docs/finn/img/finn-stack.png b/docs/finn/img/finn-stack.png index e34b1ecb45..c2b49de57e 100644 Binary files a/docs/finn/img/finn-stack.png and b/docs/finn/img/finn-stack.png differ diff --git a/docs/finn/img/mem_mode.png b/docs/finn/img/mem_mode.png index 27783c5f32..451561c54b 100755 Binary files a/docs/finn/img/mem_mode.png and b/docs/finn/img/mem_mode.png differ diff --git a/docs/finn/img/nw-prep.png b/docs/finn/img/nw-prep.png index bed56ebc6d..28a7c9d3ff 100755 Binary files a/docs/finn/img/nw-prep.png and b/docs/finn/img/nw-prep.png differ diff --git a/docs/finn/img/repo-structure.png b/docs/finn/img/repo-structure.png index 704e5e5bda..05db9d201c 100644 Binary files a/docs/finn/img/repo-structure.png and b/docs/finn/img/repo-structure.png differ diff --git a/docs/finn/img/rtl_swg_impl_styles.png b/docs/finn/img/rtl_swg_impl_styles.png new file mode 100644 index 0000000000..265ff9b915 Binary files /dev/null and b/docs/finn/img/rtl_swg_impl_styles.png differ diff --git a/docs/finn/index.rst b/docs/finn/index.rst index c13bf81cec..ab9cc96fb1 100644 --- a/docs/finn/index.rst +++ b/docs/finn/index.rst @@ -5,21 +5,21 @@ FINN Welcome to the FINN Read the Docs website! 
What is FINN? -============= +============== .. image:: img/finn-stack.png - :scale: 40% + :scale: 15% :align: center 'FINN' is colloquially used to refer to two separate but highly related things: -* The FINN **project**, which is an experimental framework from Xilinx Research Labs - to explore deep neural network inference on FPGAs. It specifically targets - quantized neural networks (QNNs), with emphasis on generating dataflow-style +* The FINN **project**, which is an experimental framework from AMD Research and + Advanced Development (RAD) to explore deep neural network inference on FPGAs. + It specifically targets quantized neural networks (QNNs), with emphasis on generating dataflow-style architectures customized for each network. The key components are illustrated in the figure above; including tools for training quantized neural networks (Brevitas), the FINN compiler, and the finn-hlslib - Vivado HLS library of FPGA components for QNNs. + Vitis HLS library of FPGA components for QNNs. Read more on the `FINN project homepage `_. * The FINN **compiler**, which this Read the Docs website is the documentation for. diff --git a/docs/finn/internals.rst b/docs/finn/internals.rst index add70d649c..0fd6c42350 100644 --- a/docs/finn/internals.rst +++ b/docs/finn/internals.rst @@ -27,8 +27,6 @@ Custom Operations/Nodes FINN uses many custom operations (op_type in ONNX NodeProto) that are not defined in the ONNX operator schema. These custom nodes are marked with domain="finn.*" or domain="qonnx.*" in the protobuf to identify them as such. These nodes can represent specific operations that we need for low-bit networks, or operations that are specific to a particular hardware backend. To get more familiar with custom operations and how they are created, please take a look in the Jupyter notebook about CustomOps (see chapter :ref:`tutorials` for details) or directly in the module :py:mod:`finn.custom_op`. -.. 
FINN supports three types of the so-called *mem_mode* attribute for the node MatrixVectorActivation. This mode controls how the weight values are accessed during the execution. That means the mode setting has direct influence on the resulting circuit.
Currently three settings for the *mem_mode* are supported in FINN: -* "const" +* "internal_embedded" (former "const" mode) -* "decoupled" +* "internal_decoupled" (former "decoupled" mode) * "external" -The following picture shows the idea behind the "const" and "decoupled" mode. +The following picture shows the idea behind the "internal_embedded" and "internal_decoupled" mode. .. image:: img/mem_mode.png :scale: 55% :align: center -Const mode ----------- -In *const* mode the weights are "baked in" into the Matrix-Vector-Activate-Unit (MVAU), which means they are part of the HLS code. During the IP block generation the weight values are integrated as *params.h* file in the HLS code and synthesized together with it. For the *const* mode IP block generation the `Matrix_Vector_Activate_Batch function `_ from the finn-hls library is used, which implements a standard MVAU. The resulting IP block has an input and an output stream, as shown in the above picture on the left. FIFOs in the form of verilog components are connected to these. +Internal_embedded mode +------------------------ +In *internal_embedded* mode the weights are "baked in" into the Matrix-Vector-Activate-Unit (MVAU), which means they are part of the HLS code. During the IP block generation the weight values are integrated as *params.h* file in the HLS code and synthesized together with it. For the *internal_embedded* mode IP block generation the `Matrix_Vector_Activate_Batch function `_ from the finn-hls library is used, which implements a standard MVAU. The resulting IP block has an input and an output stream, as shown in the above picture on the left. FIFOs in the form of verilog components are connected to these. 
Advantages: @@ -175,17 +173,15 @@ Advantages: * easier to debug layer in cppsim since no additional components -* well-tested and mature components - Disadvantages: * can lead to very long HLS synthesis times for certain weight array shapes * less control over the weight memory FPGA primitives, Vivado HLS doesn't always make the best resource allocation decisions -Decoupled mode --------------- -In *decoupled* mode a different variant of the MVAU with three ports is used. Besides the input and output streams, which are fed into the circuit via Verilog FIFOs, there is another input, which is used to stream the weights. For this the `streaming MVAU `_ from the finn-hls library is used. To make the streaming possible a Verilog weight streamer component accesses the weight memory and sends the values via another FIFO to the MVAU. This component can be found in the `finn-rtllib `_ under the name *memstream.v*. For the IP block generation this component, the IP block resulting from the synthesis of the HLS code of the streaming MVAU and a FIFO for the weight stream are combined in a verilog wrapper. The weight values are saved in .dat files and stored in the weight memory from which the weight streamer reads. The resulting verilog component, which is named after the name of the node and has the suffix "_memstream.v", exposes only two ports to the outside, the data input and output. It therefore behaves externally in the same way as the MVAU in *const* mode. +Internal_decoupled mode +------------------------ +In *internal_decoupled* mode a different variant of the MVAU with three ports is used. Besides the input and output streams, which are fed into the circuit via Verilog FIFOs, there is another input, which is used to stream the weights. For this the `streaming MVAU `_ from the finn-hls library is used. To make the streaming possible a Verilog weight streamer component accesses the weight memory and sends the values via another FIFO to the MVAU. 
This component can be found in the `finn-rtllib `_ under the name *memstream.v*. For the IP block generation this component, the IP block resulting from the synthesis of the HLS code of the streaming MVAU and a FIFO for the weight stream are combined. The weight values are saved in .dat files and stored in the weight memory from which the weight streamer reads. The resulting verilog component, which is named after the name of the node and has the suffix "_memstream.v", exposes only two ports to the outside, the data input and output. It therefore behaves externally in the same way as the MVAU in *internal_embedded* mode. Advantages: @@ -197,11 +193,149 @@ Advantages: Disadvantages: -* somewhat less well-tested compared to the const mode - -* higher resource footprint due to additional weight streamer and weight FIFO +* slightly higher resource footprint due to additional weight streamer and weight FIFO How to set *mem_mode* --------------------- -When the nodes in the network are converted to HLS layers, the *mem_mode* can be passed. More detailed information about the transformations that prepare the network and the transformation that performs the conversion to HLS layers can be found in chapter :ref:`nw_prep`. The *mem_mode* is passed as argument. Note that if no argument is passed, the default is *const*. +When the nodes in the network are specialized to HLS layers, the *mem_mode* can be passed. More detailed information about the transformations that prepare the network and the transformation that performs the specialization to HLS layers can be found in chapter :ref:`nw_prep`. The *mem_mode* is set in the node attributes of the nodes and can be passed as part of the folding configuration. The default is *internal_decoupled*. + + +.. _folding_factors: + +Constraints to folding factors per layer +========================================= + +.. 
list-table:: Folding factor constraints + + * - **Layers** + - **Parameters** + - **Constraints** + * - Addstreams + - PE + - inp_channels % PE == 0 + * - ChannelwiseOp + - PE + - channels % PE == 0 + * - ConvolutionInputGenerator + - SIMD + - inp_channels % SIMD == 0 + * - Downsampler + - SIMD + - inp_channels % SIMD == 0 + * - DuplicateStreams + - PE + - channels % PE == 0 + * - StreamingEltwise + - PE + - inp_channels % PE == 0 + * - FMPadding + - SIMD + - inp_channels % SIMD == 0 + * - FMPadding_Pixel + - SIMD + - inp_channels % SIMD == 0 + * - Globalaccpool + - PE + - channels % PE == 0 + * - Labelselect + - PE + - num_labels % PE == 0 + * - MatrixVectorActivation + - PE & SIMD + - MH % PE == 0 & MW % SIMD == 0 + * - Pool + - PE + - inp_channels % PE == 0 + * - Thresholding + - PE + - MH % PE == 0 + * - VectorVectorActivation + - PE & SIMD + - k_h * k_w % SIMD == 0 & channels % PE == 0 + + +RTL ConvolutionInputGenerator +============================= + +FINN implements convolution operations by pairing a ConvolutionInputGenerator (or "sliding window generator (SWG)") with an MVAU or VVAU (for depthwise convolution). +This RTL version is an alternative to the original `HLS implementation `_ and aims to improve on it in the following ways: + +* Support a wider range of hyperparameters without the fragmentation into 16+ separate HLS functions + +* Support additional degrees of parallelism (i.e., across the output window or multiple input samples) that are difficult to implement in HLS + +* Support additional features, such as dynamic feature map sizing + +* Improve resource efficiency + + +The component is implemented by generating (System-)Verilog code for each individual instance, realized via the template + replacement dictionary mechanism found in other FINN components. + +Implementation styles +--------------------- +Depending on the amount of parallelism requested, one of two implementation styles is selected. 
The following table defines folding parameters (marked in bold text) and supported configurations. + +.. list-table:: Parallelism configurations + + * - **SIMD** + - **parallel_window** + - **M** + - MMV_in + - MMV_out + - Style + - Notes + * - < C + - 0 + - 1 + - 1 + - 1 + - default + - depthwise-aware + * - C + - 0 + - 1 + - 1 + - 1 + - default + - depthwise-agnostic + * - < C + - 1 + - 1 + - 1 + - K + - parallel + - depthwise only + * - C + - 1 + - 1 + - 1 + - K + - parallel + - depthwise-agnostic + * - C + - 1 + - M + - M + - M*K + - parallel + - Currently unsupported + +(With C = #Channels, MMV_in = input samples (or "pixels") per cycle, MMV_out = output samples (or "pixels") per cycle, K = kernel_width * kernel_height.) + +The following diagram shows the operating principle of both styles, the "parallel" variant is pictured for a 2x2 kernel without dilation. + +.. image:: img/rtl_swg_impl_styles.png + :align: center + +The main difference lies in the buffer structure. If the output width is equal to the input width ("default mode"), an addressable circular buffer is used, which can be implemented either in LUTRAM, BRAM, or URAM resources. If parallel access to multiple window elements is required ("parallel mode"), the SWG generates a fixed structure of registers and line buffers to avoid memory port limitations and exploding multiplexing logic, while still featuring LUT-saving BRAM/URAM implementation for the line buffers. + +The "default" style also supports a dynamic mode, which provides an interface to change feature map dimensions, stride, or dilation at run-time. See `this pull request `_ description for more information. + +Folding +------- +The RTL SWG is supported by the basic automatic folding algorithm in FINN (:py:mod:`finn.transformation.fpgadataflow.set_folding.SetFolding`). Consider the following implications: + +**MVAU:** Although it is recommended to unfold SIMD first, SIMD and PE can be set independently. 
Full (and balanced) parallelism is achieved by using the SWG in parallel window mode and setting MVAU SIMD and PE to their maximum values (SIMD = MW = C_in * K, PE = MH = C_out). + +**VVAU:** The VVAU component supports SIMD unfolding (up to SIMD = K) independently from PE unfolding (up to PE = C), but can't accept a datawidth-converted input from a fully-parallel SWG in case PE is not fully unfolded due to the depthwise data layout. Therefore, it is required to set SIMD of the SWG = PE of the VVAU when window-parallelism is enabled. In this scenario, VVAU SIMD < K is supported via an automatically inserted DWC. diff --git a/docs/finn/nw_prep.rst b/docs/finn/nw_prep.rst index 6fea992cf7..5b1d59b99d 100644 --- a/docs/finn/nw_prep.rst +++ b/docs/finn/nw_prep.rst @@ -32,19 +32,28 @@ The idea behind streamlining is to eliminate floating point operations in a mode After this transformation the ONNX model is streamlined and contains now custom nodes in addition to the standard nodes. At this point we can use the :ref:`verification` to simulate the model using Python and in the next step some of the nodes can be converted into HLS layers that correspond to finn_hlslib functions. -Convert to HLS Layers +Convert to HW Layers ===================== -In this step standard or custom layers are converted to HLS layers. HLS layers are layers that directly correspond to a finn-hlslib function call. For example pairs of binary XNORPopcountMatMul and MultiThreshold layers are converted to MatrixVectorActivation layers. The result is a model consisting of a mixture of HLS and non-HLS layers. For more details, see :py:mod:`finn.transformation.fpgadataflow.convert_to_hls_layers`. The MatrixVectorActivation layer can be implemented in three different modes, *const*, *decoupled* (see chapter :ref:`mem_mode`) and *external*. +In this step standard or custom layers are converted to HW layers. 
HW abstraction layers are abstract (placeholder) layers that can be either implemented in HLS or as an RTL module using FINN. These layers are abstraction layers that do not directly correspond to an HLS or Verilog implementation but they will be converted into either one later in the flow.
It is a simulation using C++ and is described in more detail in chapter :ref:`verification`. + To adjust the folding, the values for PE and SIMD can be increased to achieve also an increase in the performance. The result can be verified using the same simulation flow as for the network with maximum folding (*cppsim* using C++), for details please have a look at chapter :ref:`verification`. -The result is a network of HLS layers with desired folding and it can be passed to :ref:`hw_build`. +The result is a network of HLS/RTL layers with desired folding and it can be passed to :ref:`hw_build`. diff --git a/docs/finn/source_code/finn.analysis.rst b/docs/finn/source_code/finn.analysis.rst index f2321dbee7..d97c04eb62 100644 --- a/docs/finn/source_code/finn.analysis.rst +++ b/docs/finn/source_code/finn.analysis.rst @@ -31,6 +31,14 @@ qonnx.analysis.inference\_cost :undoc-members: :show-inheritance: +qonnx.analysis.tensor\_stats +----------------------------- + +.. automodule:: qonnx.analysis.tensor_stats + :members: + :undoc-members: + :show-inheritance: + qonnx.analysis.topology ----------------------------- diff --git a/docs/finn/source_code/finn.core.rst b/docs/finn/source_code/finn.core.rst index afa1ecffa0..28cb47eaf7 100644 --- a/docs/finn/source_code/finn.core.rst +++ b/docs/finn/source_code/finn.core.rst @@ -54,14 +54,6 @@ finn.core.onnx\_exec :undoc-members: :show-inheritance: -finn.core.remote\_exec ------------------------------ - -.. 
automodule:: finn.core.remote_exec - :members: - :undoc-members: - :show-inheritance: - finn.core.rtlsim\_exec ----------------------------- diff --git a/docs/finn/source_code/finn.custom_op.fpgadataflow.hls.rst b/docs/finn/source_code/finn.custom_op.fpgadataflow.hls.rst new file mode 100644 index 0000000000..5a4fff6052 --- /dev/null +++ b/docs/finn/source_code/finn.custom_op.fpgadataflow.hls.rst @@ -0,0 +1,184 @@ +***************************** +Custom Op - fpgadataflow.hls +***************************** + +HLS Custom Op Nodes +=================== + +finn.custom\_op.fpgadataflow.addstreams\_hls +--------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.addstreams_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.channelwise\_op\_hls +----------------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.channelwise_op_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.checksum_hls +------------------------------------------ + +.. automodule:: finn.custom_op.fpgadataflow.hls.checksum_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.concat_hls +----------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.concat_hls + :members: + :undoc-members: + :show-inheritance: + + +finn.custom\_op.fpgadataflow.convolutioninputgenerator_hls +----------------------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.convolutioninputgenerator_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.downsampler_hls +--------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.downsampler_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.duplicatestreams\_hls +------------------------------------------------------- + +.. 
automodule:: finn.custom_op.fpgadataflow.hls.duplicatestreams_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.fmpadding\_hls +----------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.fmpadding_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.fmpadding\_pixel\_hls +--------------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.fmpadding_pixel_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.globalaccpool\_hls +--------------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.globalaccpool_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.iodma\_hls +---------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.iodma_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.labelselect\_hls +----------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.labelselect_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.lookup\_hls +------------------------------------------ + +.. automodule:: finn.custom_op.fpgadataflow.hls.lookup_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.matrixvectoractivation_hls +-------------------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.matrixvectoractivation_hls + :members: + :undoc-members: + :show-inheritance: + + +finn.custom\_op.fpgadataflow.pool\_hls +----------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.pool_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.streamingdatawidthconverter\_hls +---------------------------------------------------------------------- + +.. 
automodule:: finn.custom_op.fpgadataflow.hls.streamingdatawidthconverter_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.streamingeltwise\_hls +---------------------------------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.streamingeltwise_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.streamingmaxpool\_hls +----------------------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.streamingmaxpool_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.thresholding\_hls +------------------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.thresholding_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.tlastmarker\_hls +----------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.tlastmarker_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.upsampler\_hls +--------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.upsampler_hls + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.vectorvectoractivation\_hls +--------------------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.hls.vectorvectoractivation_hls + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst index fdcf44c6d9..25aafc324e 100644 --- a/docs/finn/source_code/finn.custom_op.fpgadataflow.rst +++ b/docs/finn/source_code/finn.custom_op.fpgadataflow.rst @@ -2,71 +2,71 @@ Custom Op - fpgadataflow ************************ -HLS Custom Op Nodes -=================== +Submodules +========== -Base Class ----------- +.. toctree:: + :maxdepth: 2 -.. 
automodule:: finn.custom_op.fpgadataflow.hlscustomop - :members: - :undoc-members: - :show-inheritance: + finn.custom_op.fpgadataflow.hls + finn.custom_op.fpgadataflow.rtl -finn.custom\_op.fpgadataflow.addstreams\_batch ------------------------------------------------ -.. automodule:: finn.custom_op.fpgadataflow.addstreams_batch +HW Custom Op Nodes +=================== + +Base Class - HWCustomOp +------------------------ + +.. automodule:: finn.custom_op.fpgadataflow.hwcustomop :members: :undoc-members: :show-inheritance: -finn.custom\_op.fpgadataflow.channelwise\_op\_batch ------------------------------------------------------ +HLSBackend +----------- -.. automodule:: finn.custom_op.fpgadataflow.channelwise_op_batch +.. automodule:: finn.custom_op.fpgadataflow.hlsbackend :members: :undoc-members: :show-inheritance: -finn.custom\_op.fpgadataflow.checksum --------------------------------------- +RTLBackend +----------- -.. automodule:: finn.custom_op.fpgadataflow.checksum +.. automodule:: finn.custom_op.fpgadataflow.rtlbackend :members: :undoc-members: :show-inheritance: -finn.custom\_op.fpgadataflow.concat -------------------------------------- +finn.custom\_op.fpgadataflow.addstreams +---------------------------------------- -.. automodule:: finn.custom_op.fpgadataflow.concat +.. automodule:: finn.custom_op.fpgadataflow.addstreams :members: :undoc-members: :show-inheritance: +finn.custom\_op.fpgadataflow.channelwise\_op +--------------------------------------------- -finn.custom\_op.fpgadataflow.convolutioninputgenerator --------------------------------------------------------- - -.. automodule:: finn.custom_op.fpgadataflow.convolutioninputgenerator +.. automodule:: finn.custom_op.fpgadataflow.channelwise_op :members: :undoc-members: :show-inheritance: -finn.custom\_op.fpgadataflow.convolutioninputgenerator1d -------------------------------------------------------------- +finn.custom\_op.fpgadataflow.concat +------------------------------------- -.. 
automodule:: finn.custom_op.fpgadataflow.convolutioninputgenerator1d +.. automodule:: finn.custom_op.fpgadataflow.concat :members: :undoc-members: :show-inheritance: +finn.custom\_op.fpgadataflow.convolutioninputgenerator +-------------------------------------------------------- -finn.custom\_op.fpgadataflow.convolutioninputgenerator\_rtl ------------------------------------------------------------- - -.. automodule:: finn.custom_op.fpgadataflow.convolutioninputgenerator_rtl +.. automodule:: finn.custom_op.fpgadataflow.convolutioninputgenerator :members: :undoc-members: :show-inheritance: @@ -79,52 +79,42 @@ finn.custom\_op.fpgadataflow.downsampler :undoc-members: :show-inheritance: -finn.custom\_op.fpgadataflow.duplicatestreams\_batch -------------------------------------------------------- +finn.custom\_op.fpgadataflow.duplicatestreams +---------------------------------------------- -.. automodule:: finn.custom_op.fpgadataflow.duplicatestreams_batch +.. automodule:: finn.custom_op.fpgadataflow.duplicatestreams :members: :undoc-members: :show-inheritance: +finn.custom\_op.fpgadataflow.fmpadding +--------------------------------------- -finn.custom\_op.fpgadataflow.eltwise -------------------------------------- - -.. automodule:: finn.custom_op.fpgadataflow.eltwise +.. automodule:: finn.custom_op.fpgadataflow.fmpadding :members: :undoc-members: :show-inheritance: - -finn.custom\_op.fpgadataflow.fmpadding\_batch +finn.custom\_op.fpgadataflow.fmpadding\_pixel ----------------------------------------------- -.. automodule:: finn.custom_op.fpgadataflow.fmpadding_batch - :members: - :undoc-members: - :show-inheritance: - -finn.custom\_op.fpgadataflow.globalaccpool\_batch ---------------------------------------------------- - -.. automodule:: finn.custom_op.fpgadataflow.globalaccpool_batch +.. 
automodule:: finn.custom_op.fpgadataflow.fmpadding_pixel :members: :undoc-members: :show-inheritance: -finn.custom\_op.fpgadataflow.iodma ------------------------------------- +finn.custom\_op.fpgadataflow.globalaccpool +------------------------------------------- -.. automodule:: finn.custom_op.fpgadataflow.iodma +.. automodule:: finn.custom_op.fpgadataflow.globalaccpool :members: :undoc-members: :show-inheritance: -finn.custom\_op.fpgadataflow.labelselect\_batch ------------------------------------------------ +finn.custom\_op.fpgadataflow.labelselect +----------------------------------------- -.. automodule:: finn.custom_op.fpgadataflow.labelselect_batch +.. automodule:: finn.custom_op.fpgadataflow.labelselect :members: :undoc-members: :show-inheritance: @@ -138,7 +128,7 @@ finn.custom\_op.fpgadataflow.lookup :show-inheritance: finn.custom\_op.fpgadataflow.matrixvectoractivation ------------------------------------------------------------ +----------------------------------------------------- .. automodule:: finn.custom_op.fpgadataflow.matrixvectoractivation :members: @@ -146,10 +136,10 @@ finn.custom\_op.fpgadataflow.matrixvectoractivation :show-inheritance: -finn.custom\_op.fpgadataflow.pool\_batch ------------------------------------------------ +finn.custom\_op.fpgadataflow.pool +---------------------------------- -.. automodule:: finn.custom_op.fpgadataflow.pool_batch +.. automodule:: finn.custom_op.fpgadataflow.pool :members: :undoc-members: :show-inheritance: @@ -163,51 +153,50 @@ finn.custom\_op.fpgadataflow.streamingdataflowpartition :show-inheritance: -finn.custom\_op.fpgadataflow.streamingdatawidthconverter\_batch ----------------------------------------------------------------------- +finn.custom\_op.fpgadataflow.streamingdatawidthconverter +--------------------------------------------------------- -.. automodule:: finn.custom_op.fpgadataflow.streamingdatawidthconverter_batch +.. 
automodule:: finn.custom_op.fpgadataflow.streamingdatawidthconverter :members: :undoc-members: :show-inheritance: -finn.custom\_op.fpgadataflow.streamingfifo -------------------------------------------------- +finn.custom\_op.fpgadataflow.streamingeltwise +---------------------------------------------- -.. automodule:: finn.custom_op.fpgadataflow.streamingfifo +.. automodule:: finn.custom_op.fpgadataflow.streamingeltwise :members: :undoc-members: :show-inheritance: -finn.custom\_op.fpgadataflow.streamingmaxpool\_batch ------------------------------------------------------------ +finn.custom\_op.fpgadataflow.streamingfifo +------------------------------------------- -.. automodule:: finn.custom_op.fpgadataflow.streamingmaxpool_batch +.. automodule:: finn.custom_op.fpgadataflow.streamingfifo :members: :undoc-members: :show-inheritance: -finn.custom\_op.fpgadataflow.templates ---------------------------------------------- +finn.custom\_op.fpgadataflow.streamingmaxpool +---------------------------------------------- -.. automodule:: finn.custom_op.fpgadataflow.templates +.. automodule:: finn.custom_op.fpgadataflow.streamingmaxpool :members: :undoc-members: :show-inheritance: -finn.custom\_op.fpgadataflow.thresholding\_batch -------------------------------------------------------- +finn.custom\_op.fpgadataflow.templates +---------------------------------------- -.. automodule:: finn.custom_op.fpgadataflow.thresholding_batch +.. automodule:: finn.custom_op.fpgadataflow.templates :members: :undoc-members: :show-inheritance: +finn.custom\_op.fpgadataflow.thresholding +------------------------------------------ -finn.custom\_op.fpgadataflow.tlastmarker ------------------------------------------------ - -.. automodule:: finn.custom_op.fpgadataflow.tlastmarker +.. 
automodule:: finn.custom_op.fpgadataflow.thresholding :members: :undoc-members: :show-inheritance: diff --git a/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst b/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst new file mode 100644 index 0000000000..346eddb073 --- /dev/null +++ b/docs/finn/source_code/finn.custom_op.fpgadataflow.rtl.rst @@ -0,0 +1,62 @@ +***************************** +Custom Op - fpgadataflow.rtl +***************************** + +RTL Custom Op Nodes +=================== + +finn.custom\_op.fpgadataflow.convolutioninputgenerator\_rtl +------------------------------------------------------------ + +.. automodule:: finn.custom_op.fpgadataflow.rtl.convolutioninputgenerator_rtl + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.fmpadding\_rtl +--------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.rtl.fmpadding_rtl + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.matrixvectoractivation\_rtl +--------------------------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.rtl.matrixvectoractivation_rtl + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.streamingdatawidthconverter\_rtl +--------------------------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.rtl.streamingdatawidthconverter_rtl + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.streamingfifo\_rtl +------------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.rtl.streamingfifo_rtl + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.thresholding\_rtl +------------------------------------------------------- + +.. 
automodule:: finn.custom_op.fpgadataflow.rtl.thresholding_rtl + :members: + :undoc-members: + :show-inheritance: + +finn.custom\_op.fpgadataflow.vectorvectoractivation\_rtl +--------------------------------------------------------------- + +.. automodule:: finn.custom_op.fpgadataflow.rtl.vectorvectoractivation_rtl + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/finn/source_code/finn.transformation.fpgadataflow.rst b/docs/finn/source_code/finn.transformation.fpgadataflow.rst index 9f8ec07930..f56b5fcf01 100644 --- a/docs/finn/source_code/finn.transformation.fpgadataflow.rst +++ b/docs/finn/source_code/finn.transformation.fpgadataflow.rst @@ -38,10 +38,10 @@ finn.transformation.fpgadataflow.compile\_cppsim :undoc-members: :show-inheritance: -finn.transformation.fpgadataflow.convert\_to\_hls\_layers ----------------------------------------------------------------- +finn.transformation.fpgadataflow.convert\_to\_hw\_layers +---------------------------------------------------------- -.. automodule:: finn.transformation.fpgadataflow.convert_to_hls_layers +.. automodule:: finn.transformation.fpgadataflow.convert_to_hw_layers :members: :undoc-members: :show-inheritance: @@ -79,22 +79,29 @@ finn.transformation.fpgadataflow.externalize\_params :show-inheritance: finn.transformation.fpgadataflow.floorplan ----------------------------------------------------- +----------------------------------------------- .. automodule:: finn.transformation.fpgadataflow.floorplan :members: :undoc-members: :show-inheritance: - finn.transformation.fpgadataflow.hlssynth\_ip ----------------------------------------------------- +----------------------------------------------- .. automodule:: finn.transformation.fpgadataflow.hlssynth_ip :members: :undoc-members: :show-inheritance: +finn.transformation.fpgadataflow.infer\_pixel\_padding\_deconv +---------------------------------------------------------------- + +.. 
automodule:: finn.transformation.fpgadataflow.infer_pixel_padding_deconv + :members: + :undoc-members: + :show-inheritance: + finn.transformation.fpgadataflow.insert\_dwc --------------------------------------------------- @@ -139,14 +146,6 @@ finn.transformation.fpgadataflow.insert\_tlastmarker :undoc-members: :show-inheritance: -finn.transformation.fpgadataflow.make\_deployment --------------------------------------------------------- - -.. automodule:: finn.transformation.fpgadataflow.make_deployment - :members: - :undoc-members: - :show-inheritance: - finn.transformation.fpgadataflow.make\_pynq\_driver ---------------------------------------------------------- @@ -173,6 +172,15 @@ finn.transformation.fpgadataflow.minimize\_accumulator\_width :show-inheritance: +finn.transformation.fpgadataflow.minimize\_weight\_bit\_width +-------------------------------------------------------------- + +.. automodule:: finn.transformation.fpgadataflow.minimize_weight_bit_width + :members: + :undoc-members: + :show-inheritance: + + finn.transformation.fpgadataflow.prepare\_cppsim ------------------------------------------------------- @@ -229,16 +237,24 @@ finn.transformation.fpgadataflow.set\_folding :undoc-members: :show-inheritance: -finn.transformation.fpgadataflow.synth\_ooc +finn.transformation.fpgadataflow.specialize\_layers ------------------------------------------------------- +.. automodule:: finn.transformation.fpgadataflow.specialize_layers + :members: + :undoc-members: + :show-inheritance: + +finn.transformation.fpgadataflow.synth\_ooc +--------------------------------------------- + .. automodule:: finn.transformation.fpgadataflow.synth_ooc :members: :undoc-members: :show-inheritance: finn.transformation.fpgadataflow.template\_driver -------------------------------------------------- +--------------------------------------------------- .. 
automodule:: finn.transformation.fpgadataflow.template_driver :members: @@ -246,7 +262,7 @@ finn.transformation.fpgadataflow.template\_driver :show-inheritance: finn.transformation.fpgadataflow.templates -------------------------------------------------- +----------------------------------------------- .. automodule:: finn.transformation.fpgadataflow.templates :members: diff --git a/docs/finn/source_code/finn.transformation.rst b/docs/finn/source_code/finn.transformation.rst index f42b595a50..8dc7e1afc2 100644 --- a/docs/finn/source_code/finn.transformation.rst +++ b/docs/finn/source_code/finn.transformation.rst @@ -15,7 +15,7 @@ Submodules finn.transformation.streamline Transformation Passes -===================== +====================== Base Class ---------- @@ -49,6 +49,14 @@ qonnx.transformation.change\_3d\_tensors\_to\_4d :undoc-members: :show-inheritance: +qonnx.transformation.change\_batchsize +---------------------------------------- + +.. automodule:: qonnx.transformation.change_batchsize + :members: + :undoc-members: + :show-inheritance: + qonnx.transformation.change\_datalayout -------------------------------------------- @@ -83,6 +91,14 @@ qonnx.transformation.double\_to\_single\_float :undoc-members: :show-inheritance: +qonnx.transformation.expose\_intermediate +------------------------------------------ + +.. automodule:: qonnx.transformation.expose_intermediate + :members: + :undoc-members: + :show-inheritance: + qonnx.transformation.extend\_partition ------------------------------------------ @@ -99,9 +115,16 @@ qonnx.transformation.extract\_conv\_bias :undoc-members: :show-inheritance: +qonnx.transformation.extract\_quant\_scale\_zeropt +---------------------------------------------------- + +.. automodule:: qonnx.transformation.extract_quant_scale_zeropt + :members: + :undoc-members: + :show-inheritance: qonnx.transformation.fold\_constants ------------------------------------------- +-------------------------------------- .. 
automodule:: qonnx.transformation.fold_constants :members: @@ -117,7 +140,7 @@ qonnx.transformation.gemm\_to\_matmul :show-inheritance: qonnx.transformation.general ----------------------------------- +------------------------------ .. automodule:: qonnx.transformation.general :members: @@ -165,7 +188,7 @@ qonnx.transformation.lower\_convs\_to\_matmul :show-inheritance: qonnx.transformation.make\_input\_chanlast ------------------------------------------- +--------------------------------------------- .. automodule:: qonnx.transformation.make_input_chanlast :members: @@ -180,6 +203,29 @@ qonnx.transformation.merge\_onnx\_models :undoc-members: :show-inheritance: +qonnx.transformation.pruning +------------------------------ + +.. automodule:: qonnx.transformation.pruning + :members: + :undoc-members: + :show-inheritance: + +qonnx.transformation.qcdq\_to\_qonnx +---------------------------------------- + +.. automodule:: qonnx.transformation.qcdq_to_qonnx + :members: + :undoc-members: + :show-inheritance: + +qonnx.transformation.qonnx\_to\_qcdq +------------------------------------- + +.. automodule:: qonnx.transformation.qonnx_to_qcdq + :members: + :undoc-members: + :show-inheritance: qonnx.transformation.quant\_constant\_folding ---------------------------------------------- @@ -189,6 +235,13 @@ qonnx.transformation.quant\_constant\_folding :undoc-members: :show-inheritance: +qonnx.transformation.quantize\_graph +------------------------------------- + +.. automodule:: qonnx.transformation.quantize_graph + :members: + :undoc-members: + :show-inheritance: qonnx.transformation.rebalance\_conv ---------------------------------------- @@ -199,13 +252,28 @@ qonnx.transformation.rebalance\_conv :show-inheritance: qonnx.transformation.remove -------------------------------------- +---------------------------- .. 
automodule:: qonnx.transformation.remove :members: :undoc-members: :show-inheritance: +qonnx.transformation.resize\_conv\_to\_deconv +----------------------------------------------- + +.. automodule:: qonnx.transformation.resize_conv_to_deconv + :members: + :undoc-members: + :show-inheritance: + +qonnx.transformation.subpixel\_to\_deconv +----------------------------------------------- + +.. automodule:: qonnx.transformation.subpixel_to_deconv + :members: + :undoc-members: + :show-inheritance: finn.transformation.move\_reshape ---------------------------------------- diff --git a/docs/finn/source_code/finn.util.rst b/docs/finn/source_code/finn.util.rst index 7ba3b252ab..2ec1502441 100644 --- a/docs/finn/source_code/finn.util.rst +++ b/docs/finn/source_code/finn.util.rst @@ -31,8 +31,16 @@ qonnx.util.config :undoc-members: :show-inheritance: +qonnx.util.convert +-------------------- + +.. automodule:: qonnx.util.convert + :members: + :undoc-members: + :show-inheritance: + qonnx.util.exec\_qonnx ----------------------- +------------------------ .. automodule:: qonnx.util.exec_qonnx :members: @@ -55,6 +63,37 @@ qonnx.util.onnx :undoc-members: :show-inheritance: +qonnx.util.prune\_channels +--------------------------- + +.. automodule:: qonnx.util.prune_channels + :members: + :undoc-members: + :show-inheritance: + +qonnx.util.random\_reseed +-------------------------- + +.. automodule:: qonnx.util.random_reseed + :members: + :undoc-members: + :show-inheritance: + +qonnx.util.range\_analysis +--------------------------- + +.. automodule:: qonnx.util.range_analysis + :members: + :undoc-members: + :show-inheritance: + +qonnx.util.test +-------------------- + +.. 
automodule:: qonnx.util.test + :members: + :undoc-members: + :show-inheritance: qonnx.util.to\_channels\_last ------------------------------ @@ -81,8 +120,6 @@ finn.util.create :undoc-members: :show-inheritance: - - finn.util.data\_packing ------------------------------ @@ -99,14 +136,6 @@ finn.util.fpgadataflow :undoc-members: :show-inheritance: -finn.util.gdrive ------------------------------ - -.. automodule:: finn.util.gdrive - :members: - :undoc-members: - :show-inheritance: - finn.util.hls --------------- diff --git a/docs/finn/tutorials.rst b/docs/finn/tutorials.rst index 7ac54501cf..39d25c2634 100644 --- a/docs/finn/tutorials.rst +++ b/docs/finn/tutorials.rst @@ -16,7 +16,7 @@ The notebooks in this folder should give a basic insight into FINN, how to get s * This notebook can help you to learn how to create and manipulate a simple ONNX model, also by using FINN -* 1_brevitas_network_import +* 1_brevitas_network_import_via_QONNX * This notebook shows how to import a Brevitas network and prepare it for the FINN flow. @@ -47,6 +47,15 @@ The notebooks in this folder are more developer oriented. They should help you t * Explains the basics of FINN custom ops and how to define a new one. +* 3_folding + + * Describes the use of FINN parallelization parameters (PE & SIMD), also called folding factors, to efficiently optimize models so as to extract the maximum performance out of them. + +* 4_advanced_builder_settings + + * Provides a more detailed look into the FINN builder tool and explores different options to customize your FINN design. + + FINN Example FPGA Flow Using MNIST Numerals ============================================ diff --git a/docs/finn/verification.rst b/docs/finn/verification.rst index e1a9ac4b31..578c941c36 100644 --- a/docs/finn/verification.rst +++ b/docs/finn/verification.rst @@ -4,18 +4,18 @@ Functional Verification *********************** -.. image:: ../../notebooks/end2end_example/bnn-pynq/verification.png - :scale: 70% +.. 
image:: ../../notebooks/end2end_example/bnn-pynq/verification.svg + :scale: 40% :align: center This part of the flow is covered by the Jupyter notebook about the verification of a simple fully-connected network, which you can find in the `end2end notebook folder `_. -When the network is transformed it is important to verify the functionality to make sure the transformation did not change the behaviour of the model. There are multiple ways of verification that can be applied in different stages of the network inside FINN. All can be accessed using the execution function in module :py:mod:`finn.core.onnx_exec`. The execution happens in most cases node by node, which supports networks that have a mixture of standard ONNX nodes, custom nodes and HLS custom nodes. A single node can be executed using one or more of the following methods: +When the network is transformed it is important to verify the functionality to make sure the transformation did not change the behaviour of the model. There are multiple ways of verification that can be applied in different stages of the network inside FINN. All can be accessed using the execution function in module :py:mod:`finn.core.onnx_exec`. The execution happens in most cases node by node, which supports networks that have a mixture of standard ONNX nodes, custom nodes and HLS/RTL custom nodes. A single node can be executed using one or more of the following methods: Simulation using Python ======================= -This simulation can be used right after the :ref:`brevitas_export` or when the network does not contain any HLS custom nodes, so right after the streamlining transformations and before the nodes are converted into HLS layers. +This simulation can be used right after the :ref:`brevitas_export` or when the network does not contain any HLS/RTL custom nodes yet, so right after the streamlining transformations and before the nodes are specialized into HLS/RTL layers. 
Simulation using C++ ==================== @@ -26,7 +26,7 @@ This simulation can be used for a model containing several HLS custom operations Emulation using PyVerilator =========================== -The emulation using PyVerilator can be used when IP blocks were generated, either node by node or of a whole (IP-stitched) design. For that purpose PyVerilator gets the generated verilog files. +The emulation using PyVerilator can be used when IP blocks/RTL modules were generated, either node by node or of a whole (IP-stitched) design. For that purpose PyVerilator gets the generated verilog files. For debugging purposes, it's possible to generate .vcd trace files that show the value of external & internal signals as the emulation is running. To enable this: - for node-by-node rtlsim, set the `rtlsim_trace` attribute of each node of interest to either a file name for the vcd or `default` to use the node name as the filename. diff --git a/docs/img/finn-examples-header.png b/docs/img/finn-examples-header.png deleted file mode 100644 index 50f8fa7761..0000000000 Binary files a/docs/img/finn-examples-header.png and /dev/null differ diff --git a/docs/img/imagenet.jpg b/docs/img/imagenet.jpg deleted file mode 100644 index 5cdd5aa303..0000000000 Binary files a/docs/img/imagenet.jpg and /dev/null differ diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000000..3a3730d2b9 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,16 @@ +brevitas@git+https://github.com/Xilinx/brevitas@master#egg=brevitas_examples +dataclasses-json==0.5.7 +docutils==0.19 +gspread==3.6.0 +importlib_resources +IPython +matplotlib +netron +pytest +pyverilator@git+https://github.com/maltanar/pyverilator@master#egg=pyverilator +qonnx@git+https://github.com/fastmachinelearning/qonnx@main#egg=qonnx +sphinx_rtd_theme==2.0.0 +torch +torchvision +tqdm +vcdvcd diff --git a/fetch-repos.sh b/fetch-repos.sh index 5b060f5bc8..073c052d67 100755 --- 
a/fetch-repos.sh +++ b/fetch-repos.sh @@ -27,17 +27,18 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -QONNX_COMMIT="dd35a8ff49d7225a07ffceeebe25a6361df48349" -FINN_EXP_COMMIT="9cbd2787b5160e2b44e0e8164a0df1457dbd5366" -BREVITAS_COMMIT="a5b71d6de1389d3e7db898fef72e014842670f03" -PYVERILATOR_COMMIT="766e457465f5c0dd315490d7b9cc5d74f9a76f4f" +QONNX_COMMIT="fd61cfeebbdaba351abf7e9d54cd785d7776fa4f" +FINN_EXP_COMMIT="de99347e936d51715f5356a1b6c64e37b91c23c2" +BREVITAS_COMMIT="84f42259ec869eb151af4cb8a8b23ad925f493db" +PYVERILATOR_COMMIT="ce0a08c20cb8c1d1e84181d6f392390f846adbd1" CNPY_COMMIT="4e8810b1a8637695171ed346ce68f6984e585ef4" -HLSLIB_COMMIT="d27f6b6c5d8f1bb208db395659389603f63ad4be" -OMX_COMMIT="d1065a788219ca0eb54d5e57600b1f9d7f67d4cc" +HLSLIB_COMMIT="16e5847a5e3ef76cffe84c8fad2f010d593457d3" +OMX_COMMIT="0b59762f9e4c4f7e5aa535ee9bc29f292434ca7a" AVNET_BDF_COMMIT="2d49cfc25766f07792c0b314489f21fe916b639b" XIL_BDF_COMMIT="8cf4bb674a919ac34e3d99d8d71a9e60af93d14e" +RFSOC4x2_BDF_COMMIT="13fb6f6c02c7dfd7e4b336b18b959ad5115db696" KV260_BDF_COMMIT="98e0d3efc901f0b974006bc4370c2a7ad8856c79" -EXP_BOARD_FILES_MD5="30eecc497c31050bd46d10ea20eba232" +EXP_BOARD_FILES_MD5="226ca927a16ea4ce579f1332675e9e9a" QONNX_URL="https://github.com/fastmachinelearning/qonnx.git" FINN_EXP_URL="https://github.com/Xilinx/finn-experimental.git" @@ -48,6 +49,7 @@ HLSLIB_URL="https://github.com/Xilinx/finn-hlslib.git" OMX_URL="https://github.com/maltanar/oh-my-xilinx.git" AVNET_BDF_URL="https://github.com/Avnet/bdf.git" XIL_BDF_URL="https://github.com/Xilinx/XilinxBoardStore.git" +RFSOC4x2_BDF_URL="https://github.com/RealDigitalOrg/RFSoC4x2-BSP.git" KV260_BDF_URL="https://github.com/Xilinx/XilinxBoardStore.git" QONNX_DIR="qonnx" @@ -59,6 +61,7 @@ HLSLIB_DIR="finn-hlslib" OMX_DIR="oh-my-xilinx" AVNET_BDF_DIR="avnet-bdf" 
XIL_BDF_DIR="xil-bdf" +RFSOC4x2_BDF_DIR="rfsoc4x2-bdf" KV260_SOM_BDF_DIR="kv260-som-bdf" # absolute path to this script, e.g. /home/user/bin/foo.sh @@ -107,6 +110,7 @@ fetch_board_files() { unzip -q pynq-z2.zip cp -r $SCRIPTPATH/deps/$AVNET_BDF_DIR/* $SCRIPTPATH/deps/board_files/ cp -r $SCRIPTPATH/deps/$XIL_BDF_DIR/boards/Xilinx/rfsoc2x2 $SCRIPTPATH/deps/board_files/; + cp -r $SCRIPTPATH/deps/$RFSOC4x2_BDF_DIR/board_files/rfsoc4x2 $SCRIPTPATH/deps/board_files/; cp -r $SCRIPTPATH/deps/$KV260_SOM_BDF_DIR/boards/Xilinx/kv260_som $SCRIPTPATH/deps/board_files/; cd $OLD_PWD } @@ -120,19 +124,26 @@ fetch_repo $HLSLIB_URL $HLSLIB_COMMIT $HLSLIB_DIR fetch_repo $OMX_URL $OMX_COMMIT $OMX_DIR fetch_repo $AVNET_BDF_URL $AVNET_BDF_COMMIT $AVNET_BDF_DIR fetch_repo $XIL_BDF_URL $XIL_BDF_COMMIT $XIL_BDF_DIR +fetch_repo $RFSOC4x2_BDF_URL $RFSOC4x2_BDF_COMMIT $RFSOC4x2_BDF_DIR fetch_repo $KV260_BDF_URL $KV260_BDF_COMMIT $KV260_SOM_BDF_DIR -# download extra Pynq board files and extract if needed -if [ ! -d "$SCRIPTPATH/deps/board_files" ]; then - fetch_board_files +# Can skip downloading of board files entirely if desired +if [ "$FINN_SKIP_BOARD_FILES" = "1" ]; then + echo "Skipping download and verification of board files" else - cd $SCRIPTPATH - BOARD_FILES_MD5=$(find deps/board_files/ -type f -exec md5sum {} \; | sort -k 2 | md5sum | cut -d' ' -f 1) - if [ "$BOARD_FILES_MD5" = "$EXP_BOARD_FILES_MD5" ]; then - echo "Verified board files folder content md5: $BOARD_FILES_MD5" - else - echo "Board files folder content mismatch, removing and re-downloading" - rm -rf deps/board_files/ + # download extra board files and extract if needed + if [ ! 
-d "$SCRIPTPATH/deps/board_files" ]; then fetch_board_files + else + cd $SCRIPTPATH + BOARD_FILES_MD5=$(find deps/board_files/ -type f -exec md5sum {} \; | sort -k 2 | md5sum | cut -d' ' -f 1) + if [ "$BOARD_FILES_MD5" = "$EXP_BOARD_FILES_MD5" ]; then + echo "Verified board files folder content md5: $BOARD_FILES_MD5" + else + echo "Board files folder md5: expected $BOARD_FILES_MD5 found $EXP_BOARD_FILES_MD5" + echo "Board files folder content mismatch, removing and re-downloading" + rm -rf deps/board_files/ + fetch_board_files + fi fi fi diff --git a/finn-rtllib/axi_info/component.xml b/finn-rtllib/axi_info/component.xml index d22637534f..c7632e2915 100644 --- a/finn-rtllib/axi_info/component.xml +++ b/finn-rtllib/axi_info/component.xml @@ -197,6 +197,10 @@ ASSOCIATED_BUSIF s_axi + + FREQ_TOLERANCE_HZ + -1 + @@ -228,7 +232,7 @@ viewChecksum - 7d682dfc + c9da9874 @@ -244,7 +248,7 @@ viewChecksum - 7d682dfc + c9da9874 @@ -258,7 +262,7 @@ viewChecksum - e11f9727 + 1e654f67 @@ -607,7 +611,7 @@ hdl/axi_info_top.sv systemVerilogSource - CHECKSUM_ec9ff0da + CHECKSUM_db6ccc10 @@ -692,17 +696,22 @@ axi_info_top_v1_0 package_project - 5 - 2022-05-30T14:16:13Z + 6 + 2023-05-24T06:36:33Z - 2022.1 - + 2022.2 + - + + + + + + diff --git a/finn-rtllib/axi_info/hdl/axi_info_top.sv b/finn-rtllib/axi_info/hdl/axi_info_top.sv index ab2cfc8bed..74aebe3ec7 100644 --- a/finn-rtllib/axi_info/hdl/axi_info_top.sv +++ b/finn-rtllib/axi_info/hdl/axi_info_top.sv @@ -38,7 +38,10 @@ module axi_info_top #( bit [31:0] CHECKSUM_COUNT )( //- Global Control ------------------ + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axi, ASSOCIATED_RESET ap_rst_n" *) + (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) input logic ap_clk, + (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) input logic ap_rst_n, //- AXI Lite ------------------------ diff --git a/finn-rtllib/dwc/hdl/dwc.sv b/finn-rtllib/dwc/hdl/dwc.sv new file mode 100644 index 0000000000..13b0cb34c4 --- /dev/null +++ 
b/finn-rtllib/dwc/hdl/dwc.sv @@ -0,0 +1,158 @@ +/****************************************************************************** + * Copyright (C) 2023, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Stream Data Width Converter. + * @author Thomas B. 
Preußer + *****************************************************************************/ +module dwc #( + int unsigned IBITS, + int unsigned OBITS +)( + //- Global Control ------------------ + input logic clk, + input logic rst, + + //- AXI Stream - Input -------------- + output logic irdy, + input logic ivld, + input logic [IBITS-1:0] idat, + + //- AXI Stream - Output ------------- + input logic ordy, + output logic ovld, + output logic [OBITS-1:0] odat +); + + if(IBITS == OBITS) begin : genNoop + assign irdy = ordy; + assign ovld = ivld; + assign odat = idat; + end : genNoop + else if(IBITS < OBITS) begin : genUp + + // Sanity Checking: integer upscaling + initial begin + if(OBITS % IBITS) begin + $error("Output width %0d is not a multiple of input width %0d.", OBITS, IBITS); + $finish; + end + end + + // Parallelizing Shift Register A and Sidestep Buffer B on Input Path + localparam int unsigned K = OBITS / IBITS; + typedef logic [IBITS-1:0] dat_t; + dat_t [K-1:0] ADat = 'x; + logic [$clog2(K):0] ACnt = K-1; // (empty) K-1, ..., 0, -1 (full/valid) + dat_t BDat = 'x; + logic BRdy = 1; + always_ff @(posedge clk) begin + if(rst) begin + ADat <= 'x; + ACnt <= K-1; + BDat <= 'x; + BRdy <= 1; + end + else begin + automatic type(ACnt) acnt = (ovld && ordy)? K-1 : ACnt; + automatic logic rdy = !ovld || ordy; + if((ivld || !BRdy) && rdy) begin + ADat <= { BRdy? 
idat : BDat, ADat[K-1:1] }; + acnt--; + end + ACnt <= acnt; + + if(BRdy) BDat <= idat; + BRdy <= rdy || (BRdy && !ivld); + end + end + + // Output Assignments + assign irdy = BRdy; + assign ovld = ACnt[$left(ACnt)]; + assign odat = ADat; + + end : genUp + else begin : genDown + + // Sanity Checking: integer downscaling + initial begin + if(IBITS % OBITS) begin + $error("Input width %0d is not a multiple of output width %0d.", IBITS, OBITS); + $finish; + end + end + + // Serializing Shift Register A and Sidestep Buffer B on Output Path + localparam int unsigned K = IBITS / OBITS; + typedef logic [OBITS-1:0] dat_t; + dat_t [ K-1:0] ADat = 'x; + logic [$clog2(K):0] ACnt = 1; // (full) -K+1, ..., -1, 0, 1 (empty/not valid) + dat_t BDat = 'x; + logic BRdy = 1; + dat_t CDat = 'x; + logic CVld = 0; + always_ff @(posedge clk) begin + if(rst) begin + ADat <= 'x; + ACnt <= 1; + BDat <= 'x; + BRdy <= 1; + CDat <= 'x; + CVld <= 0; + end + else begin + automatic type(ACnt) acnt = ACnt; + automatic logic ainc = 0; + if(irdy) begin + ADat <= idat; + acnt = ivld? -K+1 : 1; + end + else if(BRdy) begin + ADat <= { {OBITS{1'bx}}, ADat[K-1:1] }; + ainc = BRdy; + end; + ACnt <= acnt + ainc; + + if(BRdy) BDat <= ADat[0]; + BRdy <= !CVld || ordy || (BRdy && !ACnt[$left(ACnt)] && ACnt[0]); + + if(!CVld || ordy) CDat <= BRdy? ADat[0] : BDat; + CVld <= (CVld && !ordy) || !BRdy || ACnt[$left(ACnt)] || !ACnt[0]; + end + end + + // Output Assignments + assign irdy = BRdy && !ACnt[$left(ACnt)]; + assign ovld = CVld; + assign odat = CDat; + + end : genDown + +endmodule : dwc diff --git a/finn-rtllib/dwc/hdl/dwc_axi.sv b/finn-rtllib/dwc/hdl/dwc_axi.sv new file mode 100644 index 0000000000..dfe02fcb48 --- /dev/null +++ b/finn-rtllib/dwc/hdl/dwc_axi.sv @@ -0,0 +1,65 @@ +/****************************************************************************** + * Copyright (C) 2023, Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief AXI Stream Adapter for Data Width Converter. + * @author Thomas B. 
Preußer + *****************************************************************************/ +module dwc_axi #( + int unsigned IBITS, + int unsigned OBITS, + + localparam int unsigned AXI_IBITS = (IBITS+7)/8 * 8, + localparam int unsigned AXI_OBITS = (OBITS+7)/8 * 8 +)( + //- Global Control ------------------ + input logic ap_clk, + input logic ap_rst_n, + + //- AXI Stream - Input -------------- + output logic s_axis_tready, + input logic s_axis_tvalid, + input logic [AXI_IBITS-1:0] s_axis_tdata, + + //- AXI Stream - Output ------------- + input logic m_axis_tready, + output logic m_axis_tvalid, + output logic [AXI_OBITS-1:0] m_axis_tdata +); + + dwc #(.IBITS(IBITS), .OBITS(OBITS)) core ( + .clk(ap_clk), .rst(!ap_rst_n), + .irdy(s_axis_tready), .ivld(s_axis_tvalid), .idat(s_axis_tdata[IBITS-1:0]), + .ordy(m_axis_tready), .ovld(m_axis_tvalid), .odat(m_axis_tdata[OBITS-1:0]) + ); + if(OBITS < AXI_OBITS) begin + assign m_axis_tdata[AXI_OBITS-1:OBITS] = '0; + end + +endmodule : dwc_axi diff --git a/finn-rtllib/dwc/hdl/dwc_template.v b/finn-rtllib/dwc/hdl/dwc_template.v new file mode 100644 index 0000000000..01a0254040 --- /dev/null +++ b/finn-rtllib/dwc/hdl/dwc_template.v @@ -0,0 +1,71 @@ +/****************************************************************************** + * Copyright (C) 2023, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +module $TOP_MODULE_NAME$ #( + parameter IBITS = $IBITS$, + parameter OBITS = $OBITS$, + + parameter AXI_IBITS = (IBITS+7)/8 * 8, + parameter AXI_OBITS = (OBITS+7)/8 * 8 +)( + //- Global Control ------------------ + (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *) + input ap_clk, + (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) + input ap_rst_n, + + //- AXI Stream - Input -------------- + output in0_V_TREADY, + input in0_V_TVALID, + input [AXI_IBITS-1:0] in0_V_TDATA, + + //- AXI Stream - Output ------------- + input out_V_TREADY, + output out_V_TVALID, + output [AXI_OBITS-1:0] out_V_TDATA +); + + dwc_axi #( + .IBITS(IBITS), + .OBITS(OBITS) + ) impl ( + .ap_clk(ap_clk), + .ap_rst_n(ap_rst_n), + .s_axis_tready(in0_V_TREADY), + .s_axis_tvalid(in0_V_TVALID), + .s_axis_tdata(in0_V_TDATA), + 
.m_axis_tready(out_V_TREADY), + .m_axis_tvalid(out_V_TVALID), + .m_axis_tdata(out_V_TDATA) + ); + +endmodule diff --git a/finn-rtllib/dwc/sim/dwc_axi_tb.sv b/finn-rtllib/dwc/sim/dwc_axi_tb.sv new file mode 100644 index 0000000000..64435c1900 --- /dev/null +++ b/finn-rtllib/dwc/sim/dwc_axi_tb.sv @@ -0,0 +1,195 @@ +/****************************************************************************** + * Copyright (C) 2023, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). 
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for AXI Stream Data Width Converter. + * @author Thomas B. Preußer + *****************************************************************************/ +module dwc_axi_tb; + + localparam int unsigned DBITS = 8; + localparam int unsigned K = 3; + typedef logic [DBITS-1:0] dat_t; + + // Global Control + logic clk = 0; + always #5ns clk = !clk; + logic rst = 1; + initial begin + repeat(8) @(posedge clk); + rst <= 0; + end + + if(1) begin : blkUp + localparam int unsigned IBITS = DBITS; + localparam int unsigned OBITS = K * DBITS; + + //- AXI Stream - Input -------------- + uwire s_axis_tready; + logic s_axis_tvalid; + dat_t s_axis_tdata; + + //- AXI Stream - Output ------------- + logic m_axis_tready; + uwire m_axis_tvalid; + dat_t [K-1:0] m_axis_tdata; + + dwc_axi #(.IBITS(IBITS), .OBITS(OBITS)) dut ( + .ap_clk(clk), .ap_rst_n(!rst), + .s_axis_tready, .s_axis_tvalid, .s_axis_tdata, + .m_axis_tready, .m_axis_tvalid, .m_axis_tdata + ); + + // Stimulus: Feed + dat_t Q[$]; + initial begin + s_axis_tvalid = 0; + s_axis_tdata = 'x; + @(posedge clk iff !rst); + + repeat(57600) begin + automatic type(s_axis_tdata) dat; + std::randomize(dat); + + while($urandom()%7 < 2) @(posedge clk); + + s_axis_tvalid <= 1; + s_axis_tdata <= dat; + @(posedge clk iff s_axis_tready); + Q.push_back(dat); + + s_axis_tvalid <= 0; + s_axis_tdata <= 'x; + end + + repeat(16) @(posedge clk); + $finish; + end + + // Output Sink + initial begin + m_axis_tready = 0; + @(posedge clk iff !rst); + + forever begin + automatic dat_t [K-1:0] dat; + + while($urandom()%9 < 1) @(posedge clk); + + m_axis_tready <= 1; + @(posedge clk iff m_axis_tvalid); + assert(Q.size >= K) else begin + $error("Spurious output."); + $stop; + end + for(int unsigned i = 
0; i < K; i++) dat[i] = Q.pop_front(); + assert(m_axis_tdata == dat) else begin + $error("Output mismatch."); + $stop; + end + + m_axis_tready <= 0; + end + end + end : blkUp + + if(1) begin : blkDown + localparam int unsigned IBITS = K * DBITS; + localparam int unsigned OBITS = DBITS; + + //- AXI Stream - Input -------------- + uwire s_axis_tready; + logic s_axis_tvalid; + dat_t [K-1:0] s_axis_tdata; + + //- AXI Stream - Output ------------- + logic m_axis_tready; + uwire m_axis_tvalid; + dat_t m_axis_tdata; + + dwc_axi #(.IBITS(IBITS), .OBITS(OBITS)) dut ( + .ap_clk(clk), .ap_rst_n(!rst), + .s_axis_tready, .s_axis_tvalid, .s_axis_tdata, + .m_axis_tready, .m_axis_tvalid, .m_axis_tdata + ); + + // Stimulus: Feed + dat_t Q[$]; + initial begin + s_axis_tvalid = 0; + s_axis_tdata = 'x; + @(posedge clk iff !rst); + + repeat(57600) begin + automatic dat_t [K-1:0] dat; + std::randomize(dat); + + while($urandom()%7 < 2) @(posedge clk); + + s_axis_tvalid <= 1; + s_axis_tdata <= dat; + @(posedge clk iff s_axis_tready); + for(int unsigned i = 0; i < K; i++) Q.push_back(dat[i]); + + s_axis_tvalid <= 0; + s_axis_tdata <= 'x; + end + + repeat(16) @(posedge clk); + $finish; + end + + // Output Sink + initial begin + m_axis_tready = 0; + @(posedge clk iff !rst); + + forever begin + automatic dat_t dat; + + while($urandom()%9 < 1) @(posedge clk); + + m_axis_tready <= 1; + @(posedge clk iff m_axis_tvalid); + assert(Q.size) else begin + $error("Spurious output."); + $stop; + end + dat = Q.pop_front(); + assert(m_axis_tdata == dat) else begin + $error("Output mismatch: 0x%0x instead of 0x%0x", m_axis_tdata, dat); + $stop; + end + + m_axis_tready <= 0; + end + end + end : blkDown + +endmodule : dwc_axi_tb diff --git a/finn-rtllib/fifo/hdl/Q_srl.v b/finn-rtllib/fifo/hdl/Q_srl.v new file mode 100644 index 0000000000..11cef604e0 --- /dev/null +++ b/finn-rtllib/fifo/hdl/Q_srl.v @@ -0,0 +1,308 @@ +// original source: +// 
https://github.com/nachiket/tdfc/blob/master/verilog/queues/Q_srl_oreg3_prefull_SIMPLE.v + + +// Copyright (c) 1999 The Regents of the University of California +// Copyright (c) 2010 The Regents of the University of Pennsylvania +// Copyright (c) 2011 Department of Electrical and Electronic Engineering, Imperial College London +// Copyright (c) 2020 Xilinx +// +// Permission to use, copy, modify, and distribute this software and +// its documentation for any purpose, without fee, and without a +// written agreement is hereby granted, provided that the above copyright +// notice and this paragraph and the following two paragraphs appear in +// all copies. +// +// IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR +// DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING +// LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, +// EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF +// SUCH DAMAGE. +// +// THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, +// INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY +// AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON +// AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO +// PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. 
+// + +// Q_srl_oreg3_prefull_SIMPLE.v +// +// - In-page queue with parameterizable depth, bit width +// - Stream I/O is triple (data, valid, back-pressure), +// with EOS concatenated into the data +// - Flow control for input & output is combinationally decoupled +// - 2 <= depth <= 256 +// * (depth >= 2) is required to decouple I/O flow control, +// where empty => no produce, full => no consume, +// and depth 1 would ping-pong between the two at half rate +// * (depth <= 256) can be modified +// by changing ''synthesis loop_limit X'' below +// and changing ''addrwidth'' or its log computation +// - 1 <= width +// - Queue storage is in SRL16E, up to depth 16 per LUT per bit-slice, +// plus output register (for fast output) +// - Queue addressing is done by ''addr'' up-down counter +// - Queue fullness is checked by comparator (addr==depth) +// - Queue fullness is pre-computed for next cycle +// - Queue input back-pressure is pre-computed for next cycle +// - Queue output valid (state!=state__empty) is pre-computed for next cycle +// (necessary since SRL data output reg requires non-boolean state) +// - FSM has 3 states (empty, one, more) +// - When empty, continue to emit most recently emitted value (for debugging) +// +// - Queue slots used = / (state==state_empty) ? 0 +// | (state==state_one) ? 1 +// \ (state==state_more) ? addr+2 +// - Queue slots used <= depth +// - Queue slots remaining = depth - used +// = / (state==state_empty) ? depth +// | (state==state_one) ? depth-1 +// \ (state==state_more) ? 
depth-2-addr +// +// - Synplify 7.1 / 8.0 +// - Eylon Caspi, 9/11/03, 8/18/04, 3/29/05 + + +`ifdef Q_srl +`else +`define Q_srl + + +module Q_srl (clock, reset, i_d, i_v, i_r, o_d, o_v, o_r, count, maxcount); + + parameter depth = 16; // - greatest #items in queue (2 <= depth <= 256) + parameter width = 16; // - width of data (i_d, o_d) + + parameter addrwidth = $clog2(depth); + + input clock; + input reset; + + input [width-1:0] i_d; // - input stream data (concat data + eos) + input i_v; // - input stream valid + output i_r; // - input stream ready + wire i_b; // - input stream back-pressure + + output [width-1:0] o_d; // - output stream data (concat data + eos) + output o_v; // - output stream valid + input o_r; // - output stream ready + wire o_b; // - output stream back-pressure + + output [addrwidth:0] count; // - output number of elems in queue + output [addrwidth:0] maxcount; // - maximum observed count since reset + + reg [addrwidth:0] maxcount_reg; // - maximum count seen until now + reg [addrwidth-1:0] addr, addr_, a_; // - SRL16 address + // for data output + reg shift_en_; // - SRL16 shift enable + reg [width-1:0] srl [depth-2:0]; // - SRL16 memory + reg shift_en_o_; // - SRLO shift enable + reg [width-1:0] srlo_, srlo // - SRLO output reg + /* synthesis syn_allow_retiming=0 */ ; + + parameter state_empty = 2'd0; // - state empty : o_v=0 o_d=UNDEFINED + parameter state_one = 2'd1; // - state one : o_v=1 o_d=srlo + parameter state_more = 2'd2; // - state more : o_v=1 o_d=srlo + // #items in srl = addr+2 + + reg [1:0] state, state_; // - state register + + wire addr_full_; // - true iff addr==depth-2 on NEXT cycle + reg addr_full; // - true iff addr==depth-2 + wire addr_zero_; // - true iff addr==0 + wire o_v_reg_; // - true iff state_empty on NEXT cycle + reg o_v_reg // - true iff state_empty + /* synthesis syn_allow_retiming=0 */ ; + wire i_b_reg_; // - true iff !full on NEXT cycle + reg i_b_reg // - true iff !full + /* synthesis syn_allow_retiming=0 */ 
; + + assign addr_full_ = (state_==state_more) && (addr_==depth-2); + // - queue full + assign addr_zero_ = (addr==0); // - queue contains 2 (or 1,0) + assign o_v_reg_ = (state_!=state_empty); // - output valid if non-empty + assign i_b_reg_ = addr_full_; // - input bp if full + assign o_d = srlo; // - output data from queue + assign o_v = o_v_reg; // - output valid if non-empty + assign i_b = i_b_reg; // - input bp if full + assign maxcount = maxcount_reg; + + assign i_r = !i_b; + assign o_b = !o_r; + + assign count = (state==state_more ? addr+2 : (state==state_one ? 1 : 0)); + + // - ''always'' block with both FFs and SRL16 does not work, + // since FFs need reset but SRL16 does not + + always @(posedge clock) begin // - seq always: FFs + if (reset) begin + state <= state_empty; + addr <= 0; + addr_full <= 0; + o_v_reg <= 0; + + i_b_reg <= 0; + maxcount_reg <= 0; + + end + else begin + state <= state_; + addr <= addr_; + addr_full <= addr_full_; + o_v_reg <= o_v_reg_; + i_b_reg <= i_b_reg_; + maxcount_reg <= (count > maxcount_reg ? 
count : maxcount_reg); + end + end // always @ (posedge clock) + + always @(posedge clock) begin // - seq always: srlo + // - infer enabled output reg at end of shift chain + // - input first element from i_d, all subsequent elements from SRL16 + if (reset) begin + srlo <= 0; + end + else begin + if (shift_en_o_) begin + srlo <= srlo_; + end + end + end // always @ (posedge clock) + + always @(posedge clock) begin // - seq always: srl + // - infer enabled SRL16E from shifting srl array + // - no reset capability; srl[] contents undefined on reset + if (shift_en_) begin + // synthesis loop_limit 256 + for (a_=depth-2; a_>0; a_=a_-1) begin + srl[a_] = srl[a_-1]; + end + srl[0] <= i_d; + end + end // always @ (posedge clock or negedge reset) + + always @* begin // - combi always + srlo_ <= 'bx; + shift_en_o_ <= 1'bx; + shift_en_ <= 1'bx; + addr_ <= 'bx; + state_ <= 2'bx; + case (state) + + state_empty: begin // - (empty, will not produce) + if (i_v) begin // - empty & i_v => consume + srlo_ <= i_d; + shift_en_o_ <= 1; + shift_en_ <= 1'bx; + addr_ <= 0; + state_ <= state_one; + end + else begin // - empty & !i_v => idle + srlo_ <= 'bx; + shift_en_o_ <= 0; + shift_en_ <= 1'bx; + addr_ <= 0; + state_ <= state_empty; + end + end + + state_one: begin // - (contains one) + if (i_v && o_b) begin // - one & i_v & o_b => consume + srlo_ <= 'bx; + shift_en_o_ <= 0; + shift_en_ <= 1; + addr_ <= 0; + state_ <= state_more; + end + else if (i_v && !o_b) begin // - one & i_v & !o_b => cons+prod + srlo_ <= i_d; + shift_en_o_ <= 1; + shift_en_ <= 1; + addr_ <= 0; + state_ <= state_one; + end + else if (!i_v && o_b) begin // - one & !i_v & o_b => idle + srlo_ <= 'bx; + shift_en_o_ <= 0; + shift_en_ <= 1'bx; + addr_ <= 0; + state_ <= state_one; + end + else if (!i_v && !o_b) begin // - one & !i_v & !o_b => produce + srlo_ <= 'bx; + shift_en_o_ <= 0; + shift_en_ <= 1'bx; + addr_ <= 0; + state_ <= state_empty; + end + end // case: state_one + + state_more: begin // - (contains more than 
one) + if (addr_full || (depth==2)) begin + // - (full, will not consume) + // - (full here if depth==2) + if (o_b) begin // - full & o_b => idle + srlo_ <= 'bx; + shift_en_o_ <= 0; + shift_en_ <= 0; + addr_ <= addr; + state_ <= state_more; + end + else begin // - full & !o_b => produce + srlo_ <= srl[addr]; + shift_en_o_ <= 1; + shift_en_ <= 0; +// addr_ <= addr-1; +// state_ <= state_more; + addr_ <= addr_zero_ ? 0 : addr-1; + state_ <= addr_zero_ ? state_one : state_more; + end + end + else begin // - (mid: neither empty nor full) + if (i_v && o_b) begin // - mid & i_v & o_b => consume + srlo_ <= 'bx; + shift_en_o_ <= 0; + shift_en_ <= 1; + addr_ <= addr+1; + state_ <= state_more; + end + else if (i_v && !o_b) begin // - mid & i_v & !o_b => cons+prod + srlo_ <= srl[addr]; + shift_en_o_ <= 1; + shift_en_ <= 1; + addr_ <= addr; + state_ <= state_more; + end + else if (!i_v && o_b) begin // - mid & !i_v & o_b => idle + srlo_ <= 'bx; + shift_en_o_ <= 0; + shift_en_ <= 0; + addr_ <= addr; + state_ <= state_more; + end + else if (!i_v && !o_b) begin // - mid & !i_v & !o_b => produce + srlo_ <= srl[addr]; + shift_en_o_ <= 1; + shift_en_ <= 0; + addr_ <= addr_zero_ ? 0 : addr-1; + state_ <= addr_zero_ ? state_one : state_more; + end + end // else: !if(addr_full) + end // case: state_more + + default: begin + srlo_ <= 'bx; + shift_en_o_ <= 1'bx; + shift_en_ <= 1'bx; + addr_ <= 'bx; + state_ <= 2'bx; + end // case: default + + endcase // case(state) + end // always @ * + +endmodule // Q_srl + + +`endif // `ifdef Q_srl diff --git a/finn-rtllib/fifo/hdl/fifo_template.v b/finn-rtllib/fifo/hdl/fifo_template.v new file mode 100644 index 0000000000..3f14ae991f --- /dev/null +++ b/finn-rtllib/fifo/hdl/fifo_template.v @@ -0,0 +1,72 @@ +/****************************************************************************** + * Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ + +module $TOP_MODULE_NAME$( +//- Global Control ------------------ +(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V, ASSOCIATED_RESET = ap_rst_n" *) +(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) +input ap_clk, +(* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) +input ap_rst_n, + +output $COUNT_RANGE$ count, +output $COUNT_RANGE$ maxcount, + +//- AXI Stream - Input -------------- +output in0_V_TREADY, +input in0_V_TVALID, +input $IN_RANGE$ in0_V_TDATA, + +//- AXI Stream - Output -------------- +input out_V_TREADY, +output out_V_TVALID, +output $OUT_RANGE$ out_V_TDATA +); + +Q_srl #( +.depth($DEPTH$), +.width($WIDTH$) +) +impl +( + .clock(ap_clk), + .reset(!ap_rst_n), + .count(count), + .maxcount(maxcount), + .i_d(in0_V_TDATA), + .i_v(in0_V_TVALID), + .i_r(in0_V_TREADY), + .o_d(out_V_TDATA), + .o_v(out_V_TVALID), + .o_r(out_V_TREADY) +); + +endmodule diff --git a/finn-rtllib/fmpadding/hdl/fmpadding_template.v b/finn-rtllib/fmpadding/hdl/fmpadding_template.v index 0b0f40f86a..2347d9b394 100644 --- a/finn-rtllib/fmpadding/hdl/fmpadding_template.v +++ b/finn-rtllib/fmpadding/hdl/fmpadding_template.v @@ -31,10 +31,11 @@ module $TOP_MODULE_NAME$( //- Global Control ------------------ -(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite" *) -input ap_clk, -(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite" *) -input ap_rst_n, +(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite, ASSOCIATED_RESET = ap_rst_n" *) +(* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) +input ap_clk, +(* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) +input ap_rst_n, //- AXI Lite ------------------------ // Writing @@ -86,7 +87,7 @@ fmpadding_axi #( .INIT_YOFF($INIT_YOFF$), .INIT_YEND($INIT_YEND$) ) -$TOP_MODULE_NAME$_impl +impl ( .ap_clk(ap_clk), .ap_rst_n(ap_rst_n), diff --git a/finn-rtllib/memstream/component.xml 
b/finn-rtllib/memstream/component.xml index 63a8540a76..722da1d803 100644 --- a/finn-rtllib/memstream/component.xml +++ b/finn-rtllib/memstream/component.xml @@ -1,7 +1,7 @@ - xilinx.com - user + amd.com + finn memstream 1.0 @@ -37,201 +37,6 @@ - - m_axis_1 - - - - - - - TDATA - - - m_axis_1_tdata - - - - - TVALID - - - m_axis_1_tvalid - - - - - TREADY - - - m_axis_1_tready - - - - - - - true - - - - - - m_axis_2 - - - - - - - TDATA - - - m_axis_2_tdata - - - - - TVALID - - - m_axis_2_tvalid - - - - - TREADY - - - m_axis_2_tready - - - - - - - true - - - - - - m_axis_3 - - - - - - - TDATA - - - m_axis_3_tdata - - - - - TVALID - - - m_axis_3_tvalid - - - - - TREADY - - - m_axis_3_tready - - - - - - - true - - - - - - m_axis_4 - - - - - - - TDATA - - - m_axis_4_tdata - - - - - TVALID - - - m_axis_4_tvalid - - - - - TREADY - - - m_axis_4_tready - - - - - - - true - - - - - - m_axis_5 - - - - - - - TDATA - - - m_axis_5_tdata - - - - - TVALID - - - m_axis_5_tvalid - - - - - TREADY - - - m_axis_5_tready - - - - - - - true - - - - s_axilite @@ -393,16 +198,9 @@ - - - - true - - - - aresetn + ap_rst_n @@ -412,19 +210,19 @@ RST - aresetn + ap_rst_n POLARITY - ACTIVE_LOW + ACTIVE_LOW - aclk + ap_clk @@ -434,18 +232,22 @@ CLK - aclk + ap_clk + + ASSOCIATED_RESET + ap_rst_n + ASSOCIATED_BUSIF - m_axis_0:m_axis_1:m_axis_2:m_axis_3:m_axis_4:m_axis_5:s_axilite + m_axis_0:s_axilite - ASSOCIATED_RESET - aresetn + FREQ_TOLERANCE_HZ + -1 @@ -453,11 +255,13 @@ interface_aximm + interface_aximm reg0 - 0 - 65536 - 32 + reg0 + 0x0 + 4096 + 32 register @@ -468,15 +272,15 @@ xilinx_anylanguagesynthesis Synthesis :vivado.xilinx.com:synthesis - Verilog - memstream + SystemVerilog + memstream_axi_wrapper xilinx_anylanguagesynthesis_view_fileset viewChecksum - 1fc5a310 + 04464096 @@ -484,15 +288,27 @@ xilinx_anylanguagebehavioralsimulation Simulation :vivado.xilinx.com:simulation - Verilog - memstream + SystemVerilog + memstream_axi_wrapper xilinx_anylanguagebehavioralsimulation_view_fileset 
viewChecksum - d02d9990 + 9e058959 + + + + + xilinx_implementation + Implementation + :vivado.xilinx.com:implementation + memstream_axi_wrapper + + + viewChecksum + cd434062 @@ -506,7 +322,7 @@ viewChecksum - f960907f + 6c92393d @@ -520,14 +336,14 @@ viewChecksum - d2aad2c5 + 923e7b90 - aclk + ap_clk in @@ -540,7 +356,7 @@ - aresetn + ap_rst_n in @@ -582,11 +398,11 @@ - awaddr + awprot in - 15 + 2 0 @@ -602,11 +418,11 @@ - awprot + awaddr in - 2 + 10 0 @@ -766,11 +582,11 @@ - araddr + arprot in - 15 + 2 0 @@ -786,11 +602,11 @@ - arprot + araddr in - 2 + 10 0 @@ -868,29 +684,6 @@ - - m_axis_0_afull - - in - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - 0 - - - - - - true - - - - m_axis_0_tready @@ -925,352 +718,7 @@ out - 31 - 0 - - - - std_logic_vector - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - m_axis_1_afull - - in - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - 0 - - - - - - true - - - - - - m_axis_1_tready - - in - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - 1 - - - - - m_axis_1_tvalid - - out - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - m_axis_1_tdata - - out - - 31 - 0 - - - - std_logic_vector - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - m_axis_2_afull - - in - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - 0 - - - - - - true - - - - - - m_axis_2_tready - - in - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - 1 - - - - - m_axis_2_tvalid - - out - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - m_axis_2_tdata - - out - - 31 - 0 - - - - std_logic_vector - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - m_axis_3_afull - - 
in - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - 0 - - - - - - true - - - - - - m_axis_3_tready - - in - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - 1 - - - - - m_axis_3_tvalid - - out - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - m_axis_3_tdata - - out - - 31 - 0 - - - - std_logic_vector - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - m_axis_4_afull - - in - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - 0 - - - - - - true - - - - - - m_axis_4_tready - - in - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - 1 - - - - - m_axis_4_tvalid - - out - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - m_axis_4_tdata - - out - - 31 - 0 - - - - std_logic_vector - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - m_axis_5_afull - - in - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - 0 - - - - - - true - - - - - - m_axis_5_tready - - in - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - 1 - - - - - m_axis_5_tvalid - - out - - - std_logic - xilinx_anylanguagesynthesis - xilinx_anylanguagebehavioralsimulation - - - - - - m_axis_5_tdata - - out - - 31 + 31 0 @@ -1285,129 +733,29 @@ - CONFIG_EN - Config En - true + DEPTH + Depth + 512 - NSTREAMS - Nstreams - 6 - - - MEM_DEPTH - Mem Depth - 13824 - - - MEM_WIDTH - Mem Width - 32 + WIDTH + Width + 32 - MEM_INIT - Mem Init - ./ + INIT_FILE + Init File + RAM_STYLE Ram Style auto - - STRM0_WIDTH - Strm0 Width - 32 - - - STRM1_WIDTH - Strm1 Width - 32 - - - STRM2_WIDTH - Strm2 Width - 32 - - - STRM3_WIDTH - Strm3 Width - 32 - - - STRM4_WIDTH - Strm4 Width - 32 - - - STRM5_WIDTH - Strm5 Width - 32 - - 
- STRM0_DEPTH - Strm0 Depth - 2304 - - - STRM1_DEPTH - Strm1 Depth - 2304 - - - STRM2_DEPTH - Strm2 Depth - 2304 - - - STRM3_DEPTH - Strm3 Depth - 2304 - - - STRM4_DEPTH - Strm4 Depth - 2304 - - - STRM5_DEPTH - Strm5 Depth - 2304 - - - STRM0_OFFSET - Strm0 Offset - 0 - - - STRM1_OFFSET - Strm1 Offset - 2304 - - - STRM2_OFFSET - Strm2 Offset - 4608 - - - STRM3_OFFSET - Strm3 Offset - 6912 - - - STRM4_OFFSET - Strm4 Offset - 9216 - - - STRM5_OFFSET - Strm5 Offset - 11520 - AXILITE_ADDR_WIDTH Axilite Addr Width - 16 + 11 @@ -1417,13 +765,6 @@ ACTIVE_HIGH ACTIVE_LOW - - choice_list_e2bd1cd0 - auto - distributed - block - ultra - @@ -1433,71 +774,41 @@ verilogSource - hdl/memstream.v - verilogSource + hdl/memstream.sv + systemVerilogSource - hdl/memstream_multiblock.v - verilogSource + hdl/memstream_axi.sv + systemVerilogSource - hdl/memstream_singleblock.v + hdl/memstream_axi_wrapper.v verilogSource - - - hdl/mux.v - verilogSource - - - hdl/ramb18_sdp.v - verilogSource - - - hdl/ramb18_wf_dualport.v - verilogSource - CHECKSUM_9425c051 + CHECKSUM_7caabca7 xilinx_anylanguagebehavioralsimulation_view_fileset - hdl/memstream.v - verilogSource + hdl/memstream.sv + systemVerilogSource USED_IN_ipstatic xil_defaultlib - hdl/axilite_if.v - verilogSource + hdl/memstream_axi.sv + systemVerilogSource USED_IN_ipstatic xil_defaultlib - hdl/memstream_singleblock.v - verilogSource - USED_IN_ipstatic - xil_defaultlib - - - hdl/mux.v - verilogSource - USED_IN_ipstatic - xil_defaultlib - - - hdl/ramb18_wf_dualport.v - verilogSource - USED_IN_ipstatic - xil_defaultlib - - - hdl/memstream_multiblock.v + hdl/axilite_if.v verilogSource USED_IN_ipstatic xil_defaultlib - hdl/ramb18_sdp.v + hdl/memstream_axi_wrapper.v verilogSource USED_IN_ipstatic xil_defaultlib @@ -1508,7 +819,7 @@ xgui/memstream_v1_0.tcl tclSource - CHECKSUM_f960907f + CHECKSUM_32cad48d XGUI_VERSION_2 @@ -1520,132 +831,32 @@ - memstream_v1_0 + memstream - CONFIG_EN - Config En - true - - - NSTREAMS - Nstreams - 6 - - - 
MEM_DEPTH - Mem Depth - 13824 + DEPTH + Depth + 512 - MEM_WIDTH - Mem Width - 32 + WIDTH + Width + 32 - MEM_INIT - Mem Init - ./ + INIT_FILE + Init File + RAM_STYLE Ram Style - auto - - - STRM0_WIDTH - Strm0 Width - 32 - - - STRM1_WIDTH - Strm1 Width - 32 - - - STRM2_WIDTH - Strm2 Width - 32 - - - STRM3_WIDTH - Strm3 Width - 32 - - - STRM4_WIDTH - Strm4 Width - 32 - - - STRM5_WIDTH - Strm5 Width - 32 - - - STRM0_DEPTH - Strm0 Depth - 2304 - - - STRM1_DEPTH - Strm1 Depth - 2304 - - - STRM2_DEPTH - Strm2 Depth - 2304 - - - STRM3_DEPTH - Strm3 Depth - 2304 - - - STRM4_DEPTH - Strm4 Depth - 2304 - - - STRM5_DEPTH - Strm5 Depth - 2304 - - - STRM0_OFFSET - Strm0 Offset - 0 - - - STRM1_OFFSET - Strm1 Offset - 2304 - - - STRM2_OFFSET - Strm2 Offset - 4608 - - - STRM3_OFFSET - Strm3 Offset - 6912 - - - STRM4_OFFSET - Strm4 Offset - 9216 - - - STRM5_OFFSET - Strm5 Offset - 11520 + auto AXILITE_ADDR_WIDTH Axilite Addr Width - 16 + 11 @@ -1656,52 +867,40 @@ Component_Name - memstream_v1_0 + memstream_axi_wrapper_v1_0 - - aartix7 - akintex7 - artix7 - artix7l - azynq - kintex7 - kintex7l - kintexu - kintexuplus - qkintex7 - qkintex7l - qvirtex7 - qzynq - qzynqplus - versal - versalprime - virtex7 - virtexu - virtexuplus - virtexuplusHBM - virtexupluse58g - zynq - zynquplus - /UserIP - memstream_v1_0 + memstream + level_1 package_project + AMD 5 - 2020-10-09T15:31:57Z + + user.org:user:memstream_axi_wrapper:1.0 + + 2023-05-24T06:34:57Z + + + - 2020.1 - - - - - - + 2022.2 + + + + + + + + + + + diff --git a/finn-rtllib/memstream/gui/memstream_v1_0.gtcl b/finn-rtllib/memstream/gui/memstream_v1_0.gtcl index a68b85e1f5..271f9df453 100644 --- a/finn-rtllib/memstream/gui/memstream_v1_0.gtcl +++ b/finn-rtllib/memstream/gui/memstream_v1_0.gtcl @@ -1,2 +1,2 @@ # This file is automatically written. Do not modify. 
-proc gen_USERPARAMETER_AXILITE_ADDR_WIDTH_VALUE {MEM_DEPTH MEM_WIDTH } {expr 2+ceil(log($MEM_DEPTH*pow(2,ceil(log(($MEM_WIDTH+31)/32)/log(2))))/log(2))} +proc gen_USERPARAMETER_AXILITE_ADDR_WIDTH_VALUE {DEPTH WIDTH } {expr 2 + ceil(log($DEPTH*pow(2, ceil(log(($WIDTH+31)/32)/log(2))))/log(2))} diff --git a/finn-rtllib/memstream/hdl/memstream.sv b/finn-rtllib/memstream/hdl/memstream.sv new file mode 100644 index 0000000000..9cbef493a3 --- /dev/null +++ b/finn-rtllib/memstream/hdl/memstream.sv @@ -0,0 +1,176 @@ +/** + * Copyright (c) 2023, Xilinx + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of FINN nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @author Thomas B. Preußer + */ + +module memstream #( + int unsigned DEPTH, + int unsigned WIDTH, + + parameter INIT_FILE = "", + parameter RAM_STYLE = "auto" +)( + input logic clk, + input logic rst, + + // Configuration and readback interface - compatible with ap_memory + input logic config_ce, + input logic config_we, + input logic [31 :0] config_address, + input logic [WIDTH-1:0] config_d0, + + output logic config_rack, + output logic [WIDTH-1:0] config_q0, + + // Continuous output stream + input logic ordy, + output logic ovld, + output logic [WIDTH-1:0] odat +); + + typedef logic [$clog2(DEPTH)-1:0] addr_t; + typedef logic [WIDTH -1:0] data_t; + + uwire en; // Pipeline enable + uwire rollback; // Rollback stream reads if backpressure would block read back + + // Counter with pre-computed last indication for val == DEPTH-1 + typedef struct { + addr_t val; + logic lst; + } ptr_t; + + // Counter history to facilitate pipeline rollback + ptr_t Ptr[3] = '{ + 0: '{ val: 0, lst: DEPTH<2 }, + default: '{ default: 'x } + }; + + //----------------------------------------------------------------------- + // Stage #0: Address & Op + logic Wr1 = 0; // Write + logic Rb1 = 0; // Read back + logic Rs1 = 0; // Read stream + data_t Data1 = 'x; + if(1) begin : blkStage1 + // Increment for wrapping DEPTH-1 back to zero + localparam int unsigned WRAP_INC = 2**$bits(addr_t) - DEPTH + 1; + + uwire ptr_t ptr_eff = rollback? 
Ptr[2] : Ptr[0]; + uwire ptr_t ptr_nxt; + assign ptr_nxt.val = ptr_eff.val + (config_ce? 0 : !ptr_eff.lst? 1 : WRAP_INC); + assign ptr_nxt.lst = + DEPTH < 2? 1 : + config_ce? ptr_eff.lst : + ptr_eff.lst? 0 : + /* else */ ptr_eff.val == DEPTH-2; + + always_ff @(posedge clk) begin + if(rst) Ptr[0] <= '{ val: 0, lst: DEPTH<2 }; + else if(en) Ptr[0] <= ptr_nxt; + end + + // Issue next Memory Operation + always_ff @(posedge clk) begin + if(rst) begin + Wr1 <= 0; + Rb1 <= 0; + Rs1 <= 0; + Ptr[1] <= '{ default : 'x }; + Data1 <= 'x; + end + else if(en) begin + Wr1 <= 0; + Rb1 <= 0; + Rs1 <= 0; + if(config_ce) begin + if(config_we) Wr1 <= 1; + else Rb1 <= 1; + Ptr[1] <= '{ val: config_address, lst: 'x }; + Data1 <= config_d0; + end + else begin + Rs1 <= 1; + Ptr[1] <= ptr_eff; + Data1 <= 'x; + end + end + end + end : blkStage1 + + //----------------------------------------------------------------------- + // Stage #2: Memory Access + logic Rb2 = 0; + logic Rs2 = 0; + data_t Data2 = 'x; + if(1) begin : blkStage2 + (* RAM_STYLE = RAM_STYLE *) + data_t Mem[DEPTH]; + + // Optional Memory Initialization + if(INIT_FILE != "") initial $readmemh(INIT_FILE, Mem); + + // Execute Memory Operation + uwire addr_t addr = Ptr[1].val; + always_ff @(posedge clk) begin + if(en) begin + if(Wr1) Mem[addr] <= Data1; + Data2 <= Mem[addr]; + end + end + + // Copy Output Designation + always_ff @(posedge clk) begin + if(rst) begin + Rb2 <= 0; + Rs2 <= 0; + Ptr[2] <= '{ default: 'x }; + end + else if(en) begin + Rb2 <= Rb1; + Rs2 <= Rs1 && !rollback; + Ptr[2] <= Ptr[1]; + end + end + end : blkStage2 + + //----------------------------------------------------------------------- + // Output Interfaces + assign config_rack = Rb2; + assign config_q0 = Data2; + + assign ovld = Rs2; + assign odat = Data2; + + uwire backpressure = Rs2 && !ordy; + assign rollback = backpressure && (Rb1 || config_ce); + assign en = !backpressure || Rb1 || config_ce; + +endmodule : memstream diff --git 
a/finn-rtllib/memstream/hdl/memstream.v b/finn-rtllib/memstream/hdl/memstream.v deleted file mode 100644 index 2cd955f8d1..0000000000 --- a/finn-rtllib/memstream/hdl/memstream.v +++ /dev/null @@ -1,327 +0,0 @@ -/* - Copyright (c) 2020, Xilinx - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - - * Neither the name of FINN nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*/ - -module memstream -#( -//parameters to enable/disable axi-mm, set number of streams, set readmemh for memory, set per-stream offsets in memory, set per-stream widths - parameter CONFIG_EN = 1, - parameter NSTREAMS = 6,//1 up to 6 - - parameter MEM_DEPTH = 13824, - parameter MEM_WIDTH = 32, - parameter MEM_INIT = "./", - parameter RAM_STYLE = "auto", - - //widths per stream - parameter STRM0_WIDTH = 32, - parameter STRM1_WIDTH = 32, - parameter STRM2_WIDTH = 32, - parameter STRM3_WIDTH = 32, - parameter STRM4_WIDTH = 32, - parameter STRM5_WIDTH = 32, - - //depths per stream - parameter STRM0_DEPTH = 2304, - parameter STRM1_DEPTH = 2304, - parameter STRM2_DEPTH = 2304, - parameter STRM3_DEPTH = 2304, - parameter STRM4_DEPTH = 2304, - parameter STRM5_DEPTH = 2304, - - //offsets for each stream - parameter STRM0_OFFSET = 0, - parameter STRM1_OFFSET = 2304, - parameter STRM2_OFFSET = 4608, - parameter STRM3_OFFSET = 6912, - parameter STRM4_OFFSET = 9216, - parameter STRM5_OFFSET = 11520, - - parameter AXILITE_ADDR_WIDTH = 2+$clog2(MEM_DEPTH*(1<<$clog2((MEM_WIDTH+31)/32))) -) - -( - input aclk, - input aresetn, - - output awready, - input awvalid, - input [AXILITE_ADDR_WIDTH-1:0] awaddr, - input [2:0] awprot, - //write data - output wready, - input wvalid, - input [31:0] wdata, - input [3:0] wstrb, - //burst response - input bready, - output bvalid, - output [1:0] bresp, - - //Read channels - //read address - output arready, - input arvalid, - input [AXILITE_ADDR_WIDTH-1:0] araddr, - input [2:0] arprot, - //read data - input rready, - output rvalid, - output [1:0] rresp, - output [31:0] rdata, - - //multiple output AXI Streams, TDATA width rounded to multiple of 8 bits - input m_axis_0_afull, - input m_axis_0_tready, - output m_axis_0_tvalid, - output [((STRM0_WIDTH+7)/8)*8-1:0] m_axis_0_tdata, - - input m_axis_1_afull, - input m_axis_1_tready, - output m_axis_1_tvalid, - output [((STRM1_WIDTH+7)/8)*8-1:0] m_axis_1_tdata, - - input m_axis_2_afull, - input 
m_axis_2_tready, - output m_axis_2_tvalid, - output [((STRM2_WIDTH+7)/8)*8-1:0] m_axis_2_tdata, - - input m_axis_3_afull, - input m_axis_3_tready, - output m_axis_3_tvalid, - output [((STRM3_WIDTH+7)/8)*8-1:0] m_axis_3_tdata, - - input m_axis_4_afull, - input m_axis_4_tready, - output m_axis_4_tvalid, - output [((STRM4_WIDTH+7)/8)*8-1:0] m_axis_4_tdata, - - input m_axis_5_afull, - input m_axis_5_tready, - output m_axis_5_tvalid, - output [((STRM5_WIDTH+7)/8)*8-1:0] m_axis_5_tdata - - -); - -wire [31:0] config_address; -wire config_ce; -wire config_we; -wire config_rack; -wire [MEM_WIDTH-1:0] config_d0; -wire [MEM_WIDTH-1:0] config_q0; - -generate -if(NSTREAMS <= 2) begin: singleblock - - -memstream_singleblock -#( - .CONFIG_EN(CONFIG_EN), - .NSTREAMS(NSTREAMS), - .MEM_DEPTH(MEM_DEPTH), - .MEM_WIDTH(MEM_WIDTH), - .MEM_INIT(MEM_INIT), - .RAM_STYLE(RAM_STYLE), - - //widths per stream - .STRM0_WIDTH(STRM0_WIDTH), - .STRM1_WIDTH(STRM1_WIDTH), - - //depths per stream - .STRM0_DEPTH(STRM0_DEPTH), - .STRM1_DEPTH(STRM1_DEPTH), - - //offsets for each stream - .STRM0_OFFSET(STRM0_OFFSET), - .STRM1_OFFSET(STRM1_OFFSET) -) -mem -( - .aclk(aclk), - .aresetn(aresetn), - - .config_address(config_address), - .config_ce(config_ce), - .config_we(config_we), - .config_d0(config_d0), - .config_q0(config_q0), - .config_rack(config_rack), - - .m_axis_0_tready(m_axis_0_tready), - .m_axis_0_tvalid(m_axis_0_tvalid), - .m_axis_0_tdata(m_axis_0_tdata), - - .m_axis_1_tready(m_axis_1_tready), - .m_axis_1_tvalid(m_axis_1_tvalid), - .m_axis_1_tdata(m_axis_1_tdata) -); - -assign m_axis_2_tvalid = 0; -assign m_axis_2_tdata = 0; -assign m_axis_3_tvalid = 0; -assign m_axis_3_tdata = 0; -assign m_axis_4_tvalid = 0; -assign m_axis_4_tdata = 0; -assign m_axis_5_tvalid = 0; -assign m_axis_5_tdata = 0; - -end else begin: multiblock - - -memstream_multiblock -#( - .CONFIG_EN(CONFIG_EN), - .NSTREAMS(NSTREAMS), - .MEM_DEPTH(MEM_DEPTH), - .MEM_WIDTH(MEM_WIDTH), - .MEM_INIT(MEM_INIT), - .RAM_STYLE(RAM_STYLE), 
- - //widths per stream - .STRM0_WIDTH(STRM0_WIDTH), - .STRM1_WIDTH(STRM1_WIDTH), - .STRM2_WIDTH(STRM2_WIDTH), - .STRM3_WIDTH(STRM3_WIDTH), - .STRM4_WIDTH(STRM4_WIDTH), - .STRM5_WIDTH(STRM5_WIDTH), - - //depths per stream - .STRM0_DEPTH(STRM0_DEPTH), - .STRM1_DEPTH(STRM1_DEPTH), - .STRM2_DEPTH(STRM2_DEPTH), - .STRM3_DEPTH(STRM3_DEPTH), - .STRM4_DEPTH(STRM4_DEPTH), - .STRM5_DEPTH(STRM5_DEPTH), - - //offsets for each stream - .STRM0_OFFSET(STRM0_OFFSET), - .STRM1_OFFSET(STRM1_OFFSET), - .STRM2_OFFSET(STRM2_OFFSET), - .STRM3_OFFSET(STRM3_OFFSET), - .STRM4_OFFSET(STRM4_OFFSET), - .STRM5_OFFSET(STRM5_OFFSET) -) -mem -( - .aclk(aclk), - .aresetn(aresetn), - - .config_address(config_address), - .config_ce(config_ce), - .config_we(config_we), - .config_d0(config_d0), - .config_q0(config_q0), - - .m_axis_0_afull(m_axis_0_afull), - .m_axis_0_tready(m_axis_0_tready), - .m_axis_0_tvalid(m_axis_0_tvalid), - .m_axis_0_tdata(m_axis_0_tdata), - - .m_axis_1_afull(m_axis_1_afull), - .m_axis_1_tready(m_axis_1_tready), - .m_axis_1_tvalid(m_axis_1_tvalid), - .m_axis_1_tdata(m_axis_1_tdata), - - .m_axis_2_afull(m_axis_2_afull), - .m_axis_2_tready(m_axis_2_tready), - .m_axis_2_tvalid(m_axis_2_tvalid), - .m_axis_2_tdata(m_axis_2_tdata), - - .m_axis_3_afull(m_axis_3_afull), - .m_axis_3_tready(m_axis_3_tready), - .m_axis_3_tvalid(m_axis_3_tvalid), - .m_axis_3_tdata(m_axis_3_tdata), - - .m_axis_4_afull(m_axis_4_afull), - .m_axis_4_tready(m_axis_4_tready), - .m_axis_4_tvalid(m_axis_4_tvalid), - .m_axis_4_tdata(m_axis_4_tdata), - - .m_axis_5_afull(m_axis_5_afull), - .m_axis_5_tready(m_axis_5_tready), - .m_axis_5_tvalid(m_axis_5_tvalid), - .m_axis_5_tdata(m_axis_5_tdata) - -); - - -end -endgenerate - -axi4lite_if -#( - .ADDR_WIDTH(AXILITE_ADDR_WIDTH), - .DATA_WIDTH(32), - .IP_DATA_WIDTH(MEM_WIDTH) -) -config_if -( - //system signals - .aclk(aclk), - .aresetn(aresetn), - - //Write channels - //write address - .awready(awready), - .awvalid(awvalid), - .awaddr(awaddr), - .awprot(awprot), - //write 
data - .wready(wready), - .wvalid(wvalid), - .wdata(wdata), - .wstrb(wstrb), - //burst response - .bready(bready), - .bvalid(bvalid), - .bresp(bresp), - - //Read channels - //read address - .arready(arready), - .arvalid(arvalid), - .araddr(araddr), - .arprot(arprot), - //read data - .rready(rready), - .rvalid(rvalid), - .rresp(rresp), - .rdata(rdata), - - //IP-side interface - .ip_en(config_ce), - .ip_wen(config_we), - .ip_addr(config_address), - .ip_wdata(config_d0), - .ip_rack(config_rack), - .ip_rdata(config_q0) -); - -endmodule diff --git a/finn-rtllib/memstream/hdl/memstream_axi.sv b/finn-rtllib/memstream/hdl/memstream_axi.sv new file mode 100644 index 0000000000..136bcb1d7e --- /dev/null +++ b/finn-rtllib/memstream/hdl/memstream_axi.sv @@ -0,0 +1,136 @@ +/** + * Copyright (c) 2023, Xilinx + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of FINN nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @author Thomas B. Preußer + */ + +module memstream_axi #( + int unsigned DEPTH, + int unsigned WIDTH, + + parameter INIT_FILE = "", + parameter RAM_STYLE = "auto", + + localparam int unsigned AXILITE_ADDR_WIDTH = $clog2(DEPTH * (2**$clog2((WIDTH+31)/32))) + 2 +)( + // Global Control + input logic clk, + input logic rst, + + // AXI-lite Write + output logic awready, + input logic awvalid, + input logic [2:0] awprot, + input logic [AXILITE_ADDR_WIDTH-1:0] awaddr, + + output logic wready, + input logic wvalid, + input logic [31:0] wdata, + input logic [ 3:0] wstrb, + + input logic bready, + output logic bvalid, + output logic [1:0] bresp, + + // AXI-lite Read + output logic arready, + input logic arvalid, + input logic [2:0] arprot, + input logic [AXILITE_ADDR_WIDTH-1:0] araddr, + + input logic rready, + output logic rvalid, + output logic [ 1:0] rresp, + output logic [31:0] rdata, + + // Continuous output stream + input logic m_axis_0_tready, + output logic m_axis_0_tvalid, + output logic [((WIDTH+7)/8)*8-1:0] m_axis_0_tdata +); + + //----------------------------------------------------------------------- + // AXI-lite to ap_memory Adapter + uwire [31:0] config_address; + uwire config_ce; + uwire config_we; + uwire config_rack; + uwire [WIDTH-1:0] config_d0; + uwire [WIDTH-1:0] config_q0; + axi4lite_if #( + .ADDR_WIDTH(AXILITE_ADDR_WIDTH), + .DATA_WIDTH(32), + .IP_DATA_WIDTH(WIDTH) + ) config_if ( + .aclk(clk), .aresetn(!rst), + + 
// Write Channels + .awready, .awvalid, .awaddr, .awprot, + .wready, .wvalid, .wdata, .wstrb, + .bready, .bvalid, .bresp, + + // Read Channels + .arready, .arvalid, .araddr, .arprot, + .rready, .rvalid, .rresp, .rdata, + + // IP-side Interface + .ip_en(config_ce), + .ip_wen(config_we), + .ip_addr(config_address), + .ip_wdata(config_d0), + .ip_rack(config_rack), + .ip_rdata(config_q0) + ); + + //----------------------------------------------------------------------- + // Streaming Memory Backend + memstream #( + .DEPTH(DEPTH), + .WIDTH(WIDTH), + .INIT_FILE(INIT_FILE), + .RAM_STYLE(RAM_STYLE) + ) mem ( + .clk, .rst, + + .config_address, + .config_ce, + .config_we, + .config_d0, + .config_q0, + .config_rack, + + .ordy(m_axis_0_tready), + .ovld(m_axis_0_tvalid), + .odat(m_axis_0_tdata[WIDTH-1:0]) + ); + if($bits(m_axis_0_tdata) > WIDTH) begin + assign m_axis_0_tdata[$left(m_axis_0_tdata):WIDTH] = '0; + end + +endmodule : memstream_axi diff --git a/finn-rtllib/memstream/hdl/memstream_axi_wrapper.v b/finn-rtllib/memstream/hdl/memstream_axi_wrapper.v new file mode 100644 index 0000000000..13f5c82d6e --- /dev/null +++ b/finn-rtllib/memstream/hdl/memstream_axi_wrapper.v @@ -0,0 +1,123 @@ +/** + * Copyright (c) 2023, Xilinx + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of FINN nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @author Thomas B. Preußer + */ + +module memstream_axi_wrapper #( + parameter DEPTH = 512, + parameter WIDTH = 32, + + parameter INIT_FILE = "", + parameter RAM_STYLE = "auto", + + parameter AXILITE_ADDR_WIDTH = $clog2(DEPTH * (2**$clog2((WIDTH+31)/32))) + 2 +)( + // Global Control + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF m_axis_0, ASSOCIATED_RESET ap_rst_n" *) + (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) + input ap_clk, + (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) + input ap_rst_n, + + // AXI-lite Write + output awready, + input awvalid, + input [2:0] awprot, + input [AXILITE_ADDR_WIDTH-1:0] awaddr, + + output wready, + input wvalid, + input [31:0] wdata, + input [ 3:0] wstrb, + + input bready, + output bvalid, + output [1:0] bresp, + + // AXI-lite Read + output arready, + input arvalid, + input [2:0] arprot, + input [AXILITE_ADDR_WIDTH-1:0] araddr, + + input rready, + output rvalid, + output [ 1:0] rresp, + output [31:0] rdata, + + // Continuous output stream + input m_axis_0_tready, + output m_axis_0_tvalid, + output [((WIDTH+7)/8)*8-1:0] m_axis_0_tdata +); + + localparam INIT_FILTERED = +`ifdef SYNTHESIS + RAM_STYLE == "ultra"? 
"" : +`endif + INIT_FILE; + + memstream_axi #( + .DEPTH(DEPTH), .WIDTH(WIDTH), + .INIT_FILE(INIT_FILTERED), + .RAM_STYLE(RAM_STYLE) + ) core ( + .clk(ap_clk), .rst(!ap_rst_n), + + // AXI-lite Write + .awready(awready), + .awvalid(awvalid), + .awprot(awprot), + .awaddr(awaddr), + .wready(wready), + .wvalid(wvalid), + .wdata(wdata), + .wstrb(wstrb), + .bready(bready), + .bvalid(bvalid), + .bresp(bresp), + + // AXI-lite Read + .arready(arready), + .arvalid(arvalid), + .arprot(arprot), + .araddr(araddr), + .rready(rready), + .rvalid(rvalid), + .rresp(rresp), + .rdata(rdata), + + // Continuous output stream + .m_axis_0_tready(m_axis_0_tready), + .m_axis_0_tvalid(m_axis_0_tvalid), + .m_axis_0_tdata(m_axis_0_tdata) + ); + +endmodule : memstream_axi_wrapper diff --git a/finn-rtllib/memstream/hdl/memstream_multiblock.v b/finn-rtllib/memstream/hdl/memstream_multiblock.v deleted file mode 100644 index 4e6167132d..0000000000 --- a/finn-rtllib/memstream/hdl/memstream_multiblock.v +++ /dev/null @@ -1,474 +0,0 @@ -/* - Copyright (c) 2020, Xilinx - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - - * Neither the name of FINN nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -module memstream_multiblock -#( -//parameters to enable/disable axi-mm, set number of streams, set readmemh for memory, set per-stream offsets in memory, set per-stream widths - parameter CONFIG_EN = 1, - parameter NSTREAMS = 6,//1 up to 6 - - parameter MEM_DEPTH = 13824, - parameter MEM_WIDTH = 32, - parameter MEM_INIT = "./", - parameter RAM_STYLE = "auto", - - //widths per stream - parameter STRM0_WIDTH = 32, - parameter STRM1_WIDTH = 32, - parameter STRM2_WIDTH = 32, - parameter STRM3_WIDTH = 32, - parameter STRM4_WIDTH = 32, - parameter STRM5_WIDTH = 32, - - //depths per stream - parameter STRM0_DEPTH = 2304, - parameter STRM1_DEPTH = 2304, - parameter STRM2_DEPTH = 2304, - parameter STRM3_DEPTH = 2304, - parameter STRM4_DEPTH = 2304, - parameter STRM5_DEPTH = 2304, - - //offsets for each stream - parameter STRM0_OFFSET = 0, - parameter STRM1_OFFSET = 2304, - parameter STRM2_OFFSET = 4608, - parameter STRM3_OFFSET = 6912, - parameter STRM4_OFFSET = 9216, - parameter STRM5_OFFSET = 11520 -) - -( - input aclk, - input aresetn, - - //optional configuration interface compatible with ap_memory - input [31:0] config_address, - input config_ce, - input config_we, - input [31:0] config_d0, - output [31:0] config_q0, - output config_rack, - - //multiple output AXI Streams, TDATA width rounded to multiple of 8 bits - input m_axis_0_afull, - input m_axis_0_tready, - output m_axis_0_tvalid, - output [((STRM0_WIDTH+7)/8)*8-1:0] m_axis_0_tdata, - - 
input m_axis_1_afull, - input m_axis_1_tready, - output m_axis_1_tvalid, - output [((STRM1_WIDTH+7)/8)*8-1:0] m_axis_1_tdata, - - input m_axis_2_afull, - input m_axis_2_tready, - output m_axis_2_tvalid, - output [((STRM2_WIDTH+7)/8)*8-1:0] m_axis_2_tdata, - - input m_axis_3_afull, - input m_axis_3_tready, - output m_axis_3_tvalid, - output [((STRM3_WIDTH+7)/8)*8-1:0] m_axis_3_tdata, - - input m_axis_4_afull, - input m_axis_4_tready, - output m_axis_4_tvalid, - output [((STRM4_WIDTH+7)/8)*8-1:0] m_axis_4_tdata, - - input m_axis_5_afull, - input m_axis_5_tready, - output m_axis_5_tvalid, - output [((STRM5_WIDTH+7)/8)*8-1:0] m_axis_5_tdata - - -); - -//calculate number of RAMB18 blocks we need depth-wise -localparam NMEMBLOCKS = (MEM_DEPTH+1023) / 1024; //ceil(MEM_DEPTH/1024) - -//calculate width of address for each block -localparam BLOCKADRWIDTH = NMEMBLOCKS > 1 ? 10 : $clog2(MEM_DEPTH); - -//determine whether a stream needs to multiplex between memory blocks -localparam STRM0_MUX = ((STRM0_OFFSET/1024) != ((STRM0_OFFSET+STRM0_DEPTH)/1024)); -localparam STRM1_MUX = ((STRM1_OFFSET/1024) != ((STRM1_OFFSET+STRM1_DEPTH)/1024)); -localparam STRM2_MUX = ((STRM2_OFFSET/1024) != ((STRM2_OFFSET+STRM2_DEPTH)/1024)); -localparam STRM3_MUX = ((STRM3_OFFSET/1024) != ((STRM3_OFFSET+STRM3_DEPTH)/1024)); -localparam STRM4_MUX = ((STRM4_OFFSET/1024) != ((STRM4_OFFSET+STRM4_DEPTH)/1024)); -localparam STRM5_MUX = ((STRM5_OFFSET/1024) != ((STRM5_OFFSET+STRM5_DEPTH)/1024)); - -//determine what the base block of each stream is -localparam STRM0_BLOCK = (STRM0_OFFSET/1024); -localparam STRM1_BLOCK = (STRM1_OFFSET/1024); -localparam STRM2_BLOCK = (STRM2_OFFSET/1024); -localparam STRM3_BLOCK = (STRM3_OFFSET/1024); -localparam STRM4_BLOCK = (STRM4_OFFSET/1024); -localparam STRM5_BLOCK = (STRM5_OFFSET/1024); - -//determine what the end block of each stream is -localparam STRM0_END_BLOCK = ((STRM0_OFFSET+STRM0_DEPTH-1)/1024); -localparam STRM1_END_BLOCK = ((STRM1_OFFSET+STRM1_DEPTH-1)/1024); 
-localparam STRM2_END_BLOCK = ((STRM2_OFFSET+STRM2_DEPTH-1)/1024); -localparam STRM3_END_BLOCK = ((STRM3_OFFSET+STRM3_DEPTH-1)/1024); -localparam STRM4_END_BLOCK = ((STRM4_OFFSET+STRM4_DEPTH-1)/1024); -localparam STRM5_END_BLOCK = ((STRM5_OFFSET+STRM5_DEPTH-1)/1024); - -//determine the number of blocks spanned by each stream -localparam STRM0_NBLOCKS = STRM0_END_BLOCK - STRM0_BLOCK + 1; -localparam STRM1_NBLOCKS = STRM1_END_BLOCK - STRM1_BLOCK + 1; -localparam STRM2_NBLOCKS = STRM2_END_BLOCK - STRM2_BLOCK + 1; -localparam STRM3_NBLOCKS = STRM3_END_BLOCK - STRM3_BLOCK + 1; -localparam STRM4_NBLOCKS = STRM4_END_BLOCK - STRM4_BLOCK + 1; -localparam STRM5_NBLOCKS = STRM5_END_BLOCK - STRM5_BLOCK + 1; - -//TODO: check that memory width is equal to the widest stream -//TODO: check that the stream depths and offsets make sense, and that the memory depth is sufficient (or calculate depth here?) -initial begin - if((NSTREAMS < 1) | (NSTREAMS > 6)) begin - $display("Invalid setting for NSTREAMS, please set in range [1,6]"); - $finish(); - end -end - -//invert reset -wire rst; -assign rst = ~aresetn; - -//WARNING: pipeline depth is larger than the number of streams per port so we have in-flight writes that may see not-ready when they get executed -//solution: use prog-full to make sure we have an equal number of free slots in the stream to the read pipeline depth - -reg [$clog2(MEM_DEPTH)-1:0] strm0_addr = STRM0_OFFSET; -reg [$clog2(MEM_DEPTH)-1:0] strm1_addr = STRM1_OFFSET; -reg [$clog2(MEM_DEPTH)-1:0] strm2_addr = STRM2_OFFSET; -reg [$clog2(MEM_DEPTH)-1:0] strm3_addr = STRM3_OFFSET; -reg [$clog2(MEM_DEPTH)-1:0] strm4_addr = STRM4_OFFSET; -reg [$clog2(MEM_DEPTH)-1:0] strm5_addr = STRM5_OFFSET; - -reg strm0_incr_en; -reg strm1_incr_en; -reg strm2_incr_en; -reg strm3_incr_en; -reg strm4_incr_en; -reg strm5_incr_en; - -wire strm0_rst; -wire strm1_rst; -wire strm2_rst; -wire strm3_rst; -wire strm4_rst; -wire strm5_rst; - -reg strm0_ready; -reg strm1_ready; -reg strm2_ready; -reg 
strm3_ready; -reg strm4_ready; -reg strm5_ready; - -//arbiter: work on one stream at a time -//multiplex each port between (up to) half of the streams -reg [1:0] current_stream_porta = 0; -reg [1:0] current_stream_portb = 0; - -always @(posedge aclk) begin - if(rst) - current_stream_porta <= 0; - else case(current_stream_porta) - 0: current_stream_porta <= strm2_ready ? 1 : strm4_ready ? 2 : 0; - 1: current_stream_porta <= strm4_ready ? 2 : strm0_ready ? 0 : 1; - 2: current_stream_porta <= strm0_ready ? 0 : strm2_ready ? 1 : 2; - endcase - if(rst) - current_stream_portb <= 0; - else case(current_stream_portb) - 0: current_stream_portb <= strm3_ready ? 1 : strm5_ready ? 2 : 0; - 1: current_stream_portb <= strm5_ready ? 2 : strm1_ready ? 0 : 1; - 2: current_stream_portb <= strm1_ready ? 0 : strm3_ready ? 1 : 2; - endcase -end - -always @(posedge aclk) begin - if(rst) begin - strm0_incr_en <= 0; - strm1_incr_en <= 0; - strm2_incr_en <= 0; - strm3_incr_en <= 0; - strm4_incr_en <= 0; - strm5_incr_en <= 0; - end else begin - strm0_incr_en <= (current_stream_porta == 0) & strm0_ready; - strm1_incr_en <= (current_stream_portb == 0) & strm1_ready; - strm2_incr_en <= (current_stream_porta == 1) & strm2_ready; - strm3_incr_en <= (current_stream_portb == 1) & strm3_ready; - strm4_incr_en <= (current_stream_porta == 2) & strm4_ready; - strm5_incr_en <= (current_stream_portb == 2) & strm5_ready; - end -end - -assign strm0_rst = strm0_incr_en & (strm0_addr == (STRM0_OFFSET + STRM0_DEPTH-1)); -assign strm1_rst = strm1_incr_en & (strm1_addr == (STRM1_OFFSET + STRM1_DEPTH-1)); -assign strm2_rst = strm2_incr_en & (strm2_addr == (STRM2_OFFSET + STRM2_DEPTH-1)); -assign strm3_rst = strm3_incr_en & (strm3_addr == (STRM3_OFFSET + STRM3_DEPTH-1)); -assign strm4_rst = strm4_incr_en & (strm4_addr == (STRM4_OFFSET + STRM4_DEPTH-1)); -assign strm5_rst = strm5_incr_en & (strm5_addr == (STRM5_OFFSET + STRM5_DEPTH-1)); - -always @(posedge aclk) begin - strm0_ready <= ~m_axis_0_afull; - 
strm1_ready <= ~m_axis_1_afull & (NSTREAMS >= 2); - strm2_ready <= ~m_axis_2_afull & (NSTREAMS >= 3); - strm3_ready <= ~m_axis_3_afull & (NSTREAMS >= 4); - strm4_ready <= ~m_axis_4_afull & (NSTREAMS >= 5); - strm5_ready <= ~m_axis_5_afull & (NSTREAMS >= 6); -end - -//one address counter per stream; more LUTs but keeps routing short and local -always @(posedge aclk) begin - if(strm0_rst | rst) - strm0_addr <= STRM0_OFFSET; - else if(strm0_incr_en) - strm0_addr <= strm0_addr + 1; - if(strm1_rst | rst) - strm1_addr <= STRM1_OFFSET; - else if(strm1_incr_en) - strm1_addr <= strm1_addr + 1; - if(strm2_rst | rst) - strm2_addr <= STRM2_OFFSET; - else if(strm2_incr_en) - strm2_addr <= strm2_addr + 1; - if(strm3_rst | rst) - strm3_addr <= STRM3_OFFSET; - else if(strm3_incr_en) - strm3_addr <= strm3_addr + 1; - if(strm4_rst | rst) - strm4_addr <= STRM4_OFFSET; - else if(strm4_incr_en) - strm4_addr <= strm4_addr + 1; - if(strm5_rst | rst) - strm5_addr <= STRM5_OFFSET; - else if(strm5_incr_en) - strm5_addr <= strm5_addr + 1; -end - -reg [$clog2(MEM_DEPTH)-1:0] addra; -wire [MEM_WIDTH*NMEMBLOCKS-1:0] rdqa; - -reg [$clog2(MEM_DEPTH)-1:0] addrb; -wire [MEM_WIDTH*NMEMBLOCKS-1:0] rdqb; - -wire [NMEMBLOCKS-1:0] we; - -reg [1:0] addr_select_porta; -reg [1:0] addr_select_portb; - -//multiplex addresses of various streams into address ports of memory -always @(posedge aclk) begin - addr_select_porta <= current_stream_porta; - case(addr_select_porta) - 0: addra <= strm0_addr; - 1: addra <= strm2_addr; - 2: addra <= strm4_addr; - endcase - addr_select_portb <= current_stream_portb; - case(addr_select_portb) - 0: addrb <= strm1_addr; - 1: addrb <= strm3_addr; - 2: addrb <= strm5_addr; - endcase -end - -genvar g; -generate for(g=0; g 1) begin: multiblock - -wire [MEM_WIDTH-1:0] rdqmux[5:0]; - -reg [$clog2(MEM_DEPTH)-BLOCKADRWIDTH-1:0] rdblocka[2:0]; -reg [$clog2(MEM_DEPTH)-BLOCKADRWIDTH-1:0] rdblockb[2:0]; - -always @(posedge aclk) begin - rdblocka[0] <= 
addra[$clog2(MEM_DEPTH)-1:BLOCKADRWIDTH]; - rdblockb[0] <= addrb[$clog2(MEM_DEPTH)-1:BLOCKADRWIDTH]; - for(i=0; i<2; i=i+1) begin - rdblocka[i+1] <= rdblocka[i]; - rdblockb[i+1] <= rdblockb[i]; - end -end - -if(NSTREAMS >= 1) begin: en_strm0 - if(STRM0_MUX == 1) begin: mux0 - mux #(STRM0_NBLOCKS, MEM_WIDTH) m(rdqa[(STRM0_BLOCK+STRM0_NBLOCKS)*MEM_WIDTH-1:STRM0_BLOCK*MEM_WIDTH],rdqmux[0],rdblocka[1] - STRM0_BLOCK); - end else begin: nomux0 - assign rdqmux[0] = rdqa[(STRM0_BLOCK+1)*MEM_WIDTH-1:STRM0_BLOCK*MEM_WIDTH]; - end - assign m_axis_0_tdata = rdqmux[0][STRM0_WIDTH-1:0]; -end - -if(NSTREAMS >= 2) begin: en_strm1 - if(STRM1_MUX == 1) begin: mux1 - mux #(STRM1_NBLOCKS, MEM_WIDTH) m(rdqb[(STRM1_BLOCK+STRM1_NBLOCKS)*MEM_WIDTH-1:STRM1_BLOCK*MEM_WIDTH],rdqmux[1],rdblockb[1] - STRM1_BLOCK); - end else begin: nomux1 - assign rdqmux[1] = rdqb[(STRM1_BLOCK+1)*MEM_WIDTH-1:STRM1_BLOCK*MEM_WIDTH]; - end - assign m_axis_1_tdata = rdqmux[1][STRM1_WIDTH-1:0]; -end - -if(NSTREAMS >= 3) begin: en_strm2 - if(STRM2_MUX == 1) begin: mux2 - mux #(STRM2_NBLOCKS, MEM_WIDTH) m(rdqa[(STRM2_BLOCK+STRM2_NBLOCKS)*MEM_WIDTH-1:STRM2_BLOCK*MEM_WIDTH],rdqmux[2],rdblocka[1] - STRM2_BLOCK); - end else begin: nomux2 - assign rdqmux[2] = rdqa[(STRM2_BLOCK+1)*MEM_WIDTH-1:STRM2_BLOCK*MEM_WIDTH]; - end - assign m_axis_2_tdata = rdqmux[2][STRM2_WIDTH-1:0]; -end - -if(NSTREAMS >= 4) begin: en_strm3 - if(STRM3_MUX == 1) begin: mux3 - mux #(STRM3_NBLOCKS, MEM_WIDTH) m(rdqb[(STRM3_BLOCK+STRM3_NBLOCKS)*MEM_WIDTH-1:STRM3_BLOCK*MEM_WIDTH],rdqmux[3],rdblockb[1] - STRM3_BLOCK); - end else begin: nomux3 - assign rdqmux[3] = rdqb[(STRM3_BLOCK+1)*MEM_WIDTH-1:STRM3_BLOCK*MEM_WIDTH]; - end - assign m_axis_3_tdata = rdqmux[3][STRM3_WIDTH-1:0]; -end - -if(NSTREAMS >= 5) begin: en_strm4 - if(STRM4_MUX == 1) begin: mux4 - mux #(STRM4_NBLOCKS, MEM_WIDTH) m(rdqa[(STRM4_BLOCK+STRM4_NBLOCKS)*MEM_WIDTH-1:STRM4_BLOCK*MEM_WIDTH],rdqmux[4],rdblocka[1] - STRM4_BLOCK); - end else begin: nomux4 - assign rdqmux[4] = 
rdqa[(STRM4_BLOCK+1)*MEM_WIDTH-1:STRM4_BLOCK*MEM_WIDTH]; - end - assign m_axis_4_tdata = rdqmux[4][STRM4_WIDTH-1:0]; -end - -if(NSTREAMS >= 6) begin: en_strm5 - if(STRM5_MUX == 1) begin: mux5 - mux #(STRM5_NBLOCKS, MEM_WIDTH) m(rdqb[(STRM5_BLOCK+STRM5_NBLOCKS)*MEM_WIDTH-1:STRM5_BLOCK*MEM_WIDTH],rdqmux[5],rdblockb[1] - STRM5_BLOCK); - end else begin: nomux5 - assign rdqmux[5] = rdqb[(STRM5_BLOCK+1)*MEM_WIDTH-1:STRM5_BLOCK*MEM_WIDTH]; - end - assign m_axis_5_tdata = rdqmux[5][STRM5_WIDTH-1:0]; -end - -end else begin: singleblock - -if(NSTREAMS >= 1) begin: en_strm0_direct - assign m_axis_0_tdata = rdqa[STRM0_WIDTH-1:0]; -end -if(NSTREAMS >= 2) begin: en_strm1_direct - assign m_axis_1_tdata = rdqb[STRM1_WIDTH-1:0]; -end -if(NSTREAMS >= 3) begin: en_strm2_direct - assign m_axis_2_tdata = rdqa[STRM2_WIDTH-1:0]; -end -if(NSTREAMS >= 4) begin: en_strm3_direct - assign m_axis_3_tdata = rdqb[STRM3_WIDTH-1:0]; -end -if(NSTREAMS >= 5) begin: en_strm4_direct - assign m_axis_4_tdata = rdqa[STRM4_WIDTH-1:0]; -end -if(NSTREAMS >= 6) begin: en_strm5_direct - assign m_axis_5_tdata = rdqb[STRM5_WIDTH-1:0]; -end - -end -endgenerate - -//output to AXI Streams -reg tvalid_pipe0[2:0]; -reg tvalid_pipe1[2:0]; -reg tvalid_pipe2[2:0]; -reg tvalid_pipe3[2:0]; -reg tvalid_pipe4[2:0]; -reg tvalid_pipe5[2:0]; - -assign m_axis_0_tvalid = tvalid_pipe0[2]; -assign m_axis_1_tvalid = tvalid_pipe1[2]; -assign m_axis_2_tvalid = tvalid_pipe2[2]; -assign m_axis_3_tvalid = tvalid_pipe3[2]; -assign m_axis_4_tvalid = tvalid_pipe4[2]; -assign m_axis_5_tvalid = tvalid_pipe5[2]; - - -always @(posedge aclk) begin - tvalid_pipe0[0] <= strm0_incr_en; - tvalid_pipe1[0] <= strm1_incr_en; - tvalid_pipe2[0] <= strm2_incr_en; - tvalid_pipe3[0] <= strm3_incr_en; - tvalid_pipe4[0] <= strm4_incr_en; - tvalid_pipe5[0] <= strm5_incr_en; - for(i=0; i<2; i=i+1) begin: srl - tvalid_pipe0[i+1] <= tvalid_pipe0[i]; - tvalid_pipe1[i+1] <= tvalid_pipe1[i]; - tvalid_pipe2[i+1] <= tvalid_pipe2[i]; - tvalid_pipe3[i+1] <= 
tvalid_pipe3[i]; - tvalid_pipe4[i+1] <= tvalid_pipe4[i]; - tvalid_pipe5[i+1] <= tvalid_pipe5[i]; - end -end - -//dummy read, for now -assign config_q0 = 0; -assign config_rack = config_ce & ~config_we; - -endmodule diff --git a/finn-rtllib/memstream/hdl/memstream_singleblock.v b/finn-rtllib/memstream/hdl/memstream_singleblock.v deleted file mode 100644 index c9b8770aaa..0000000000 --- a/finn-rtllib/memstream/hdl/memstream_singleblock.v +++ /dev/null @@ -1,246 +0,0 @@ -/* - Copyright (c) 2020, Xilinx - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - - * Neither the name of FINN nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*/ - -/* - Implements a lightweight streamer for up to 2 streams in a single block of memory -*/ - -module memstream_singleblock -#( - parameter CONFIG_EN = 1, - parameter NSTREAMS = 2,//1 up to 2 - - parameter MEM_DEPTH = 512, - parameter MEM_WIDTH = 32, - parameter MEM_INIT = "./", - parameter RAM_STYLE = "auto", - - //widths per stream - parameter STRM0_WIDTH = 32, - parameter STRM1_WIDTH = 32, - - //depths per stream - parameter STRM0_DEPTH = 256, - parameter STRM1_DEPTH = 256, - - //offsets for each stream - parameter STRM0_OFFSET = 0, - parameter STRM1_OFFSET = 256 -) - -( - input aclk, - input aresetn, - - //optional configuration interface compatible with ap_memory - input [31:0] config_address, - input config_ce, - input config_we, - input [MEM_WIDTH-1:0] config_d0, - output [MEM_WIDTH-1:0] config_q0, - output config_rack, - - //multiple output AXI Streams, TDATA width rounded to multiple of 8 bits - input m_axis_0_tready, - output m_axis_0_tvalid, - output [((STRM0_WIDTH+7)/8)*8-1:0] m_axis_0_tdata, - - input m_axis_1_tready, - output m_axis_1_tvalid, - output [((STRM1_WIDTH+7)/8)*8-1:0] m_axis_1_tdata - -); - - -//TODO: check that memory width is equal to the widest stream -//TODO: check that the stream depths and offsets make sense, and that the memory depth is sufficient (or calculate depth here?) 
-initial begin - if((NSTREAMS < 1) | (NSTREAMS > 2)) begin - $display("Invalid setting for NSTREAMS, please set in range [1,2]"); - $finish(); - end -end - -//invert reset -wire rst; -assign rst = ~aresetn; - -wire strm0_incr_en; -wire strm1_incr_en; - -assign strm0_incr_en = m_axis_0_tready | ~m_axis_0_tvalid; -assign strm1_incr_en = m_axis_1_tready | ~m_axis_1_tvalid; - -reg rack_shift[1:0]; - -generate -if(MEM_DEPTH > 1) begin: use_ram - -//calculate width of memory address, with a minimum of 1 bit -localparam BLOCKADRWIDTH = $clog2(MEM_DEPTH); - -reg [BLOCKADRWIDTH-1:0] strm0_addr = STRM0_OFFSET; -wire strm0_rst; -assign strm0_rst = strm0_incr_en & (strm0_addr == (STRM0_OFFSET + STRM0_DEPTH-1)); - -//one address counter per stream; more LUTs but keeps routing short and local -always @(posedge aclk) begin - if(strm0_rst | rst) - strm0_addr <= STRM0_OFFSET; - else if(strm0_incr_en) - strm0_addr <= strm0_addr + 1; -end - -if(NSTREAMS == 1) begin: sdp - -ramb18_sdp -#( - .ID(0), - .DWIDTH(MEM_WIDTH), - .AWIDTH(BLOCKADRWIDTH), - .DEPTH(MEM_DEPTH), - .MEM_INIT(MEM_INIT), - .RAM_STYLE(RAM_STYLE) -) -ram -( - .clk(aclk), - - .ena(config_ce), - .wea(config_we), - .addra(config_address[BLOCKADRWIDTH-1:0]), - .wdataa(config_d0), - - .enb(strm0_incr_en | config_ce), - .enqb(strm0_incr_en | rack_shift[0]), - .addrb(config_ce ? 
config_address[BLOCKADRWIDTH-1:0] : strm0_addr), - .rdqb(m_axis_0_tdata) -); - - -end else begin: tdp - -reg [BLOCKADRWIDTH-1:0] strm1_addr = STRM1_OFFSET; -wire strm1_rst; -assign strm1_rst = strm1_incr_en & (strm1_addr == (STRM1_OFFSET + STRM1_DEPTH-1)); - -always @(posedge aclk) begin - if(strm1_rst | rst) - strm1_addr <= STRM1_OFFSET; - else if(strm1_incr_en) - strm1_addr <= strm1_addr + 1; -end - -ramb18_wf_dualport -#( - .ID(0), - .DWIDTH(MEM_WIDTH), - .AWIDTH(BLOCKADRWIDTH), - .DEPTH(MEM_DEPTH), - .MEM_INIT(MEM_INIT), - .RAM_STYLE(RAM_STYLE) -) -ram -( - .clk(aclk), - - .wea(config_we), - .ena(strm0_incr_en | config_ce), - .enqa(strm0_incr_en | config_ce_r), - .addra(config_we ? config_address[BLOCKADRWIDTH-1:0] : strm0_addr), - .wdataa(config_d0), - .rdqa(m_axis_0_tdata), - - .web(1'b0), - .enb(strm1_incr_en), - .enqb(strm1_incr_en), - .addrb(strm1_addr), - .wdatab('d0), - .rdqb(m_axis_1_tdata) -); - -end - -end else begin: bypass - -reg [MEM_WIDTH-1:0] singleval[0:0]; -initial begin - `ifdef SYNTHESIS - $readmemh({MEM_INIT,"memblock_synth_0.dat"}, singleval, 0, 0); - `else - $readmemh({MEM_INIT,"memblock_sim_0.dat"}, singleval, 0, 0); - `endif -end - -always @(posedge aclk) - if(config_ce & config_we) - singleval[0] <= config_d0; - -assign m_axis_0_tdata = singleval[0]; -assign m_axis_1_tdata = singleval[0]; - -end -endgenerate - -//signal valid after 2 tready cycles after initialization -//then stay valid -reg [1:0] tvalid_pipe0 = 2'd0; -reg [1:0] tvalid_pipe1 = 2'd0; - -assign m_axis_0_tvalid = tvalid_pipe0[1]; -assign m_axis_1_tvalid = tvalid_pipe1[1]; - -always @(posedge aclk) begin - if(rst) begin - tvalid_pipe0 <= 0; - end else if(strm0_incr_en) begin - tvalid_pipe0[0] <= 1; - tvalid_pipe0[1] <= tvalid_pipe0[0]; - end -end - -always @(posedge aclk) begin - if(rst) begin - tvalid_pipe1 <= 0; - end else if(strm1_incr_en) begin - tvalid_pipe1[0] <= 1; - tvalid_pipe1[1] <= tvalid_pipe1[0]; - end -end - -always @(posedge aclk) begin - rack_shift[0] <= 
config_ce & ~config_we; - rack_shift[1] <= rack_shift[0]; -end - -assign config_rack = rack_shift[1]; -assign config_q0 = m_axis_0_tdata; - -endmodule diff --git a/finn-rtllib/memstream/hdl/mux.v b/finn-rtllib/memstream/hdl/mux.v deleted file mode 100644 index f7087f9735..0000000000 --- a/finn-rtllib/memstream/hdl/mux.v +++ /dev/null @@ -1,44 +0,0 @@ -/* - Copyright (c) 2020, Xilinx - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - - * Neither the name of FINN nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*/ - -module mux -#( - parameter NINPUTS = 1, - parameter WIDTH = 16 -) -( - input [NINPUTS*WIDTH-1:0] in, - output [WIDTH-1:0] out, - input [$clog2(NINPUTS)-1:0] sel -); - -assign out = in >> (sel*WIDTH); - -endmodule diff --git a/finn-rtllib/memstream/hdl/ramb18_sdp.v b/finn-rtllib/memstream/hdl/ramb18_sdp.v deleted file mode 100644 index 8d2fbf9a98..0000000000 --- a/finn-rtllib/memstream/hdl/ramb18_sdp.v +++ /dev/null @@ -1,96 +0,0 @@ -/* - Copyright (c) 2020, Xilinx - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - - * Neither the name of FINN nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*/ - -module ramb18_sdp -#( - parameter ID = 0, - parameter DWIDTH = 18, - parameter AWIDTH = 10, - parameter DEPTH = 2**AWIDTH, - parameter MEM_INIT = "", - parameter RAM_STYLE = "auto" -) -( - input clk, - - input ena, - input wea, - input [AWIDTH-1:0] addra, - input [DWIDTH-1:0] wdataa, - - input enb, - input enqb, - input [AWIDTH-1:0] addrb, - output reg [DWIDTH-1:0] rdqb -); - -(* ram_style = RAM_STYLE *) reg [DWIDTH-1:0] mem[0:DEPTH-1]; -reg [DWIDTH-1:0] rdatab; - -`ifdef SYNTHESIS -reg [7:0] idx = ID; -`else -reg [15:0] idx; -`endif - -//initialize memory -initial begin - //note the hacky way of adding a filename memblock_ID.dat to the path provided in MEM_INIT - //ID can go up to 99 - if (ID < 0 && ID > 99) begin - $display("ID out of range [0-99]"); - $finish(); - end - //MEM_INIT path must be terminated by / - `ifdef SYNTHESIS - if (ID < 10) - $readmemh({MEM_INIT,"memblock_synth_",idx+8'd48,".dat"}, mem, 0, DEPTH-1); - else - $readmemh({MEM_INIT,"memblock_synth_",(idx/10)+8'd48,(idx%10)+8'd48,".dat"}, mem, 0, DEPTH-1); - `else - $sformat(idx,"%0d",ID); - if (ID < 10) - $readmemh({MEM_INIT,"memblock_sim_",idx[7:0],".dat"}, mem, 0, DEPTH-1); - else - $readmemh({MEM_INIT,"memblock_sim_",idx,".dat"}, mem, 0, DEPTH-1); - `endif -end - -//memory ports, with output pipeline register -always @(posedge clk) begin - if(wea) - mem[addra] <= wdataa; - if(enb) - rdatab <= mem[addrb]; - if(enqb) - rdqb <= rdatab; -end - -endmodule diff --git a/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v b/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v deleted file mode 100644 index c7850106ae..0000000000 --- a/finn-rtllib/memstream/hdl/ramb18_wf_dualport.v +++ /dev/null @@ -1,111 +0,0 @@ -/* - Copyright (c) 2020, Xilinx - All rights reserved. 
- - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - - * Neither the name of FINN nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*/ - -module ramb18_wf_dualport -#( - parameter ID = 0, - parameter DWIDTH = 18, - parameter AWIDTH = 10, - parameter DEPTH = 2**AWIDTH, - parameter MEM_INIT = "", - parameter RAM_STYLE = "auto" -) -( - input clk, - - input wea, - input ena, - input enqa, - input [AWIDTH-1:0] addra, - input [DWIDTH-1:0] wdataa, - output reg [DWIDTH-1:0] rdqa, - - input web, - input enb, - input enqb, - input [AWIDTH-1:0] addrb, - input [DWIDTH-1:0] wdatab, - output reg [DWIDTH-1:0] rdqb -); - -(* ram_style = RAM_STYLE *) reg [DWIDTH-1:0] mem[0:DEPTH-1]; -reg [DWIDTH-1:0] rdataa; -reg [DWIDTH-1:0] rdatab; - -`ifdef SYNTHESIS -reg [7:0] idx = ID; -`else -reg [15:0] idx; -`endif - -//initialize memory -initial begin - //note the hacky way of adding a filename memblock_ID.dat to the path provided in MEM_INIT - //ID can go up to 99 - if (ID < 0 && ID > 99) begin - $display("ID out of range [0-99]"); - $finish(); - end - //MEM_INIT path must be terminated by / - `ifdef SYNTHESIS - if (ID < 10) - $readmemh({MEM_INIT,"memblock_",idx+8'd48,".dat"}, mem, 0, DEPTH-1); - else - $readmemh({MEM_INIT,"memblock_",(idx/10)+8'd48,(idx%10)+8'd48,".dat"}, mem, 0, DEPTH-1); - `else - $sformat(idx,"%0d",ID); - if (ID < 10) - $readmemh({MEM_INIT,"memblock_",idx[7:0],".dat"}, mem, 0, DEPTH-1); - else - $readmemh({MEM_INIT,"memblock_",idx,".dat"}, mem, 0, DEPTH-1); - `endif -end - -//memory ports, with output pipeline register -always @(posedge clk) begin - if(ena) begin - if(wea) - mem[addra] <= wdataa; - rdataa <= mem[addra]; - end - if(enqa) - rdqa <= rdataa; -end -always @(posedge clk) begin - if(enb) begin - if(web) - mem[addrb] <= wdatab; - rdatab <= mem[addrb]; - end - if(enqb) - rdqb <= rdatab; -end - -endmodule diff --git a/finn-rtllib/memstream/sim/golden.dat b/finn-rtllib/memstream/sim/golden.dat deleted file mode 100644 index 1466271bca..0000000000 --- a/finn-rtllib/memstream/sim/golden.dat +++ /dev/null @@ -1,9216 +0,0 @@ -AFB2B66A -BB100CFF -1ED93E9B -1B8E800D -DA9E0150 -38B1C916 -93BC4E64 
-860F8373 -B31D708B -C2934023 -739C9593 -4C898A3D -CCC8F4C5 -8FA275E6 -47732CC7 -6857ABF0 -31671013 -6BC4AA43 -73D4F790 -2C6158B6 -FDC3B5D -6DC755F2 -E0E7E8C9 -7862E17 -3D4FFE1E -9AFFF447 -C862FD7D -A4C4D89A -D7D6EF51 -10E5A31D -79DA9C63 -A83060A8 -EA988813 -6B411BCF -85544B5A -5AC91DE6 -586E6779 -8FE8161B -4C57CC92 -74C918A6 -36B20D44 -5CB62FC0 -62FDB2E1 -4B1CB514 -526B7CEC -B3FA61D0 -C95DDBE -CC2BA600 -2466CD1D -3354A056 -CCED3EAC -6FFA09EE -F9648FAF -18CB5358 -EA506270 -66F385A6 -5B0246E5 -26218A76 -BC7CECFD -5969F6FF -3DAF5901 -C53D05BD -1EDA2D76 -5C0C0010 -7A6C0C8C -BF99E997 -C964C884 -4DE417F4 -8637312 -133B8C3A -D637DB88 -297288F6 -CF1D00B3 -426BD0F3 -4D258120 -8F7EC898 -E15482D9 -DFDFC442 -16A5C4AE -7A6A14DF -5E9C2807 -31BD3EA2 -BD6DCDBC -E47CD35E -FA4FE42 -CCDE0036 -345EBCB7 -64686255 -AE1D77EB -D2B42B84 -CD5E5824 -8DABAB1F -4E07FFCA -7F3B4C13 -1A62C962 -CE08835F -E8E05318 -DC25C7BF -132E4308 -5D0122D6 -B7451ACE -829D2507 -19329C7F -39FCA8F0 -DCD1A574 -17E2EEE -B2B6583A -2181E65 -7013A2A7 -46535CDE -C85BF5D3 -2FD5EFC2 -E05C5D2E -244F0F96 -F01D711F -F1CBB67E -6DAE6666 -84AD6F4A -B95BC84E -9DD54B95 -5A7CA1B -7B1447F4 -44A8EDA7 -20929E9 -40E62E02 -3D03CC3E -81EEF8C4 -1E686D13 -17C13B3D -A14967BE -D8693E0E -15A7FDD1 -19F51C6D -249D0C21 -51424939 -BA05F551 -C614827A -32841A0D -2F8B041 -11A2806 -DBF24199 -F246D9EB -52FFB23D -F3061A47 -B6D51EF3 -2DE434C3 -E1D3F874 -85270B0A -CC405B14 -DD3E9F23 -A0352F98 -67EE5731 -96892C65 -6D67A443 -16354414 -17959F75 -A554F236 -C585076 -2B665011 -7D503509 -77A4530 -6A13C8DC -31996F5 -916AD400 -E761D000 -D23CFD32 -CF3A5154 -C575A1CB -B91ACDBF -BEE7F338 -44C26212 -8124CD5B -245F7451 -DD6D18BA -6B838EC6 -5247AB98 -2F41FDAA -A780BD3B -1FD2F95 -6CDA39C -C31FA5A0 -AB56A5E1 -87F50441 -47093971 -BEBD81EC -2A7F6977 -8C83BD29 -FB067DAC -5FEBDCDC -8FB43F72 -EE45FC6D -4088691C -34F235D0 -43AB8E4D -67FA8BB5 -FC2D2C02 -DA77044C -22E6FC7 -6B6039A9 -BA6E3C45 -46DEC612 -8E7E0FF7 -438DE467 -F4525025 -7937973A -9ABE4BEF -8F8DF841 -F74C5087 
-7EDE1CA4 -FF3C7F98 -A025FE0B -59E5EDF6 -6DD27411 -65C080E6 -C86D872D -628B6B26 -B9316D56 -E09EFA8B -A8CD3F21 -C0CD8745 -F4D62BA7 -D4D7FB99 -E9174232 -7F068FC4 -767480FC -275BBBF7 -3470FF88 -E632ACD1 -85677507 -AE0E2C69 -E2C74DA9 -C307B72B -5FB5A769 -99C18162 -FAFB7660 -6E984733 -E17FD97B -EC5E6CA7 -3D659815 -30826B60 -300BE8E8 -86D0B096 -856F2CB0 -2A61ADE4 -24EEB996 -2FCB729B -8190CE0D -E64F7E6A -4D0D42F -CE29765B -C77DE893 -9264C299 -A200E419 -868B5EC6 -8452AC39 -59F7BDED -422E75B2 -74E6329A -38F053E8 -16F8BD5A -363A2E43 -8018AB7B -44AE4CF5 -C8F7B14B -52658A45 -7B46C7D8 -CD319C38 -19AC8957 -5F42CFAA -5DB4DBF7 -DF66DDBA -4FBCB611 -266DFB86 -4F0EE64C -1765E724 -E30C89CA -4705FCE8 -BB7636B3 -789EFEFC -AAC0F37F -424B1661 -234F05AB -1BC0ADF8 -7F9EC67E -500448E5 -BF4D1C45 -C5B64E3B -914F44FE -EB17F041 -1752165C -F5B72E31 -6D68C060 -4EF27C55 -8CEDFDC5 -E3996A56 -25C5C632 -430D930F -EE04DE4D -576E4921 -E13A2A6E -CFE21675 -B1067912 -4C888068 -3C3A1A6D -FCE12E0 -FAD6AD8B -F7DE2E0F -E8DC0DE7 -CC8721DF -34411355 -2C664D07 -ED034324 -F57FDA56 -8C70BCDF -3A6FF2C8 -C6440537 -8113D976 -A40176A1 -46D1D0D9 -877A407C -3FBCD395 -3E74C1D8 -72E22A13 -BA46116D -CFB14406 -21400896 -7AD34367 -2905F60C -C1F9C16F -2E0E5FCF -2EEB00A0 -9C2D94A9 -8DE1CF01 -5912596C -CF2CA22A -774E7D4F -805657AE -1BA223EF -236FD53F -C1ABFD4A -6B8DD778 -6A6E40D2 -70CF4F79 -950E8D35 -5E4F9545 -86AA4166 -28D056E9 -9C550D75 -CB435A3 -B875667E -F54E6E97 -BB7ACD6B -F11637E9 -C220E1FA -C7CAD54B -32853439 -65BA20C9 -1838F8C0 -C3CCE57D -7D2B69F9 -137AD6E9 -6C041B9 -296497AA -98C5E853 -D37AB835 -376764A9 -2F714011 -D24BE867 -B2BA4E -9EA785F9 -726FCED6 -6B4C6950 -44C6D5C0 -85DEA727 -733F5A86 -41785CFF -BB395E8A -100F8117 -276A08D3 -9268A16E -FBF63C19 -AA497F25 -E92E1DC3 -185B4692 -FE6377D6 -C50771B -D98BCD04 -50FC7D74 -BE5BC294 -2C9C4482 -12FBF6CD -D1E04AE4 -5C9679EE -889D2695 -3699F061 -933D06A9 -930DC367 -496D7A37 -C4161D19 -3E08728B -66388C70 -B2363734 -5D12926F -39B4AEF8 -1948B925 -321E08BC -27559FC2 -A543B709 
-4D28BC0 -46C64305 -F7B7D459 -97C4966B -A027A9C8 -43CABFA9 -F7C3643D -1128AB2A -AA4A1419 -AC6F2B46 -8F6FEFEF -34284D4D -D951EB81 -77AC6B7C -70F6E0B2 -FD7BE3CE -77BE497E -4883FBD6 -FCAB08D4 -9BC032A4 -67DA8A5C -82037EC1 -E3EC6CC9 -481B7623 -DA1F3873 -CE9E8787 -785CD654 -1661CF27 -42BD0C3C -990F261A -49F18930 -FA336094 -FFD6FC06 -B71077A6 -204B911E -BA1586D6 -8A2F6DBC -36B184AD -76017CAB -DA7E891E -88A51A1A -97AC49CB -2482BE28 -CE6BD009 -C7776DE0 -4E960944 -64081AF2 -56512D55 -D6D1C640 -EE78145B -54CC5EE0 -BE5D3E1F -8FC8816C -1D6AC407 -5D98F8F1 -18FECC5C -F3DE9A29 -93A19068 -AB623B35 -43FF1A02 -AA26434C -B071FDD5 -45AB6A2E -C1275AA7 -EADA5CDA -E427C95E -AE6E5B77 -89F3CA30 -9648C00A -330A03A7 -20DB35D6 -AA9946BF -A0E3050E -DEBB5819 -5047E2E -9C8FBEB9 -6B70D173 -8A99428D -230C88FE -3B26DBD4 -8DBED704 -EFF1C946 -C2381970 -71087497 -2268599D -FCE50AAE -460A49E5 -EC65BC4C -5A83C23C -DD44120F -D6E81BEB -D10235B7 -9362A387 -B3C9220C -46F21F0 -3D04FBC0 -63A2B38D -8F7DEF26 -F326457D -21933DC1 -775197FB -8D6C7C5F -B2D7D570 -147F9FF7 -78666356 -BAB7D249 -69B45EC6 -F56634ED -34738794 -26DF0163 -188DA00 -D2035A36 -FFBB8062 -62852DCF -55FC882A -849388E6 -43BE6E2C -D53EA2A2 -A228BC21 -9112A960 -5FCDE2F1 -79F42B27 -8AE37179 -1D722815 -5AE6DD26 -A8531C6F -EF386673 -AC761B14 -23C6BC3A -488D93B -AE6B0D63 -A4F1CEAC -43F80A43 -D9681EF6 -BA959674 -CCB852B8 -D9F4D79E -6403622F -75FAECC6 -7F43813F -51FC7BE6 -896A3A28 -CAF31C60 -76000EE7 -C1135AAB -6E83B2E6 -2AED1966 -C4F88A86 -21219EA -8AF14AD6 -14014BA2 -BC0BE2D5 -78757CE8 -C09D83DC -6B2021FE -D5AD900 -3685A49F -FD8B4BA0 -7B005539 -2F0C36EF -B41DBA0D -1DCF61B0 -CB3DA1A6 -24C0ADAA -BED01B2B -59C8C334 -11CCA76C -6F962508 -ABE672A6 -3C281A24 -A6C3DC39 -A72517B1 -FBA81175 -9906CEE4 -E8177FE1 -338D0184 -CC6650DF -840D8CA0 -4C55C42B -6B40F9CC -57B7E7B7 -B7C42442 -4500E9B -8C788183 -9B8F5FCE -49D0AEE1 -426B2271 -EC25BCE3 -7D63A976 -2EFFF592 -32A9E43C -AF5AFA52 -3ABE1133 -35B75ED7 -8F4271A9 -725A6EF -7ED7EB40 -37BD3B -7A0A5AF2 -F6492D7D -C2856688 
-9595C241 -C07F646A -7D394FDC -7A991B05 -2CE3AF30 -9929E6E6 -4AE66BD4 -F0F3D1A3 -F76F72E9 -6C2051E2 -72431DE4 -B1796A93 -E04FD748 -D19522B1 -71396A78 -4202F058 -4F2CEB1E -A186853F -8B4474AA -C679B644 -98E10D42 -E7CEB08C -733CA225 -3478B95C -A706A842 -9510B8EB -F47E426E -9A0A17EE -2DA8832B -E73536CC -E6CA4B40 -11A2708F -753AC1E1 -8C304DED -5FC83F07 -4F9A04C9 -E0737708 -9091DFDD -8E1B322 -2552D768 -7C894296 -EABDC081 -E3B2A37 -DEC7EC87 -37FFB6DC -2B2A0CD6 -7E797B13 -64ABD0C5 -1FF12252 -F81AFB24 -C16F1ABC -F0B5AAFC -F80281BA -E51C04D -EEF8BD3E -450A49DB -AC985D7B -CBD4D077 -CAA6370A -FDA6530C -20B71F06 -ED5A891E -BA51A622 -E9F8E132 -63C23719 -2F59EE96 -14D77539 -1A98FC31 -12FCC937 -F39AD8FB -3750DBA9 -564E45B -F74C47FD -1010AD3A -8BE0AED3 -28B27F7B -D5E8EEFA -DC0EFEFB -959F5394 -A10ECCB8 -5C366706 -3B82A5EE -74E377DD -9881CEF3 -D1A4BD88 -69106661 -B209B42 -B56EE86B -63F37839 -C5AB7736 -4AD627C4 -8A4C7E1C -F7CC6334 -3D6CAEC4 -A86A18D5 -8FD910B1 -972371C8 -A423E9B6 -CE8C76C7 -DF930841 -C9D4A7B0 -18521955 -F6F167FC -889F1625 -432C606A -CA5EB4D0 -AFE77C91 -EAF55F16 -6F9A9777 -33726C1D -DC7B1D64 -8031DC00 -CF13144F -84BF2AB -45F5FD45 -6AF06D8C -C50FBE6C -11B8A4A2 -16B780E1 -98033979 -8EFAAEC0 -DD984A5A -D6A80AFC -15C793A3 -EF458063 -B784551F -552CC380 -D1E05EBA -4A795261 -F2B25418 -66066848 -D935B481 -136D2C8F -7A25AEFB -7000439A -E147CC62 -68976C6E -69447DAB -C72506F3 -C6E3FE3B -4FB0FD96 -DB465740 -A254195C -B11EA223 -FC3C44B5 -A9A86F1C -8EED03E3 -24CFF3A -A1B488CE -FD75D002 -9FEF0461 -75DC6637 -B3D38CD2 -57C8F65D -C62026D0 -D6320A18 -5E961798 -80FE0097 -6DA57E68 -D1E8A3C7 -96D49CFC -A8D2DFBC -520D2C1 -151C3F1D -8180DCC7 -4461E43E -C895BF5C -18EE374 -33EA06D4 -75B9D006 -23B934C1 -C2E89F39 -444BCB75 -78077AA5 -ECA64716 -3C1E3FFD -F7DB9CEE -6EC313DD -9CABEC47 -675FA281 -16B8304D -3E38FEC -A9663BDE -8EF647F2 -B646C61C -2228E400 -2B411566 -7A72EB44 -88BD9AE9 -4EF4EBA3 -BCC822D9 -4668160D -695667C1 -CE51A675 -40DE9687 -877561EF -416F5AE6 -EF9304FE -34C1C9D3 -5B63E1BB -C50E9899 
-1831810D -25DE2CC1 -10539A77 -EE51D9B2 -462E5A70 -B0F8C3B7 -CA16E410 -1796F2E5 -573F6B28 -E157A965 -2640969A -153B4909 -7FC1290F -ABCAC2F -2A42D17 -BFFA3865 -7B12D8B9 -9321F9EF -E560B7A9 -36E18DD2 -57710FF9 -FAE1F933 -F717FEF8 -E86BAF7E -D0CE3E89 -C8755650 -704BB6ED -6309F650 -E21DDB4F -7CBF531C -7E0AFB8E -D6A1128B -60F16A1B -534186AF -72971F2E -428A867C -F571D32C -CD522E7B -13F6443 -38CDC9EC -D01C51E6 -2E575D3F -7E86B596 -C1460B28 -1403B019 -76D89A66 -4F2D9465 -9B87B1 -172A00A4 -4669559C -105C8A19 -3CD2DD63 -EF054D76 -8B9AB48 -64136500 -71C56349 -B7AEEDF5 -4145D7AC -D6A3E4C7 -2F9E0DF4 -31E418C8 -D2C839DE -63E919D9 -2F4D0353 -8812C572 -B88E671F -54D2BBE0 -E166998 -B7487741 -64312607 -5ADF6F3E -31A86BF1 -D8A96C85 -22AA3021 -AD4719B5 -49EB0670 -93B76AAF -B109648 -FBC7346C -2530A7B5 -C8525175 -15EC0A76 -315FACCE -D8C21A6F -9EDEF96D -6495575D -722A0577 -51EDE2ED -8109F168 -6CBA0929 -1ED88DCD -D79A67E2 -CE62A29C -6FE2A87F -D1E6E3B9 -601988A0 -6A045849 -A7E30F35 -E0EE4424 -AA89C628 -33D7A7A3 -FCD27B7A -80CAF9A4 -2E7F1302 -69F19C -80DBDC64 -392FBDC -E5981A33 -B4AF4210 -1DBFDB9F -31E5DF02 -5C571556 -EE256151 -9F573818 -200D540B -87743240 -1335188F -5A1E9D1F -FA267CB -688D2302 -80D32C1 -195719E -EF151174 -772EEC93 -DD2E2E4E -D8EA362D -3B24FC06 -FFFCF7FC -C571F2F4 -A8DAC7D -3BA7880C -16FC184D -7DBC453C -8F355780 -65C7ED3D -2202E50E -9EC765A9 -9D8F8CDA -CFA71D0B -7A463A33 -AA94D750 -359750D8 -B9A4BEFD -B153CD8C -93AFB5F4 -2676E0A0 -78C0805 -347133 -3B229F4D -4486A7BE -F3A0FAF3 -D29E9349 -A62C0FB4 -574D3763 -BCDAEE6E -BA27D40D -896903EB -8AE6171C -A911D78E -970FB490 -33B8A631 -893F7E3B -700EDF9D -EA7AC6E6 -6041F473 -FC6702EE -F225A258 -96A21B4 -CCA94D4D -FA6D00B7 -35580441 -F5E42BA -EE9AB535 -50874EBA -4454B2B -30653468 -9ABFE240 -29A13784 -EBF5F88F -B1769BB8 -EF22637D -A2FEEE4E -4B39E8F8 -38AD4316 -A3FCB454 -7D6F402 -18CEA9F0 -956B2CCE -6559ADC4 -F00F696E -C878E2A3 -3AB31BE4 -FF2E6E3A -3767BE32 -37CFBCBC -C307A74B -ED6A132B -8D5A1B70 -774C41D1 -A45F1CA9 -3FCF576A -C1BBAB8C 
-5B11B23A -620B6C8E -A6F5CB83 -450BFF8B -FBB9620D -BD936B56 -2FBF9A89 -2E000CD5 -E508C955 -2FB99422 -5043B664 -1C43CF3B -2D7E713F -FAD8A72B -7CF2FA33 -8FDD90A6 -8B5CDCDE -6CBF908F -740425F6 -D142F4B9 -2B30DF9D -3808D354 -508C4729 -E6FB0279 -FA0F9DF5 -2FFA33E1 -8A93B18 -FE7C0855 -E69193B1 -AA7E4DA -DCDD121D -4E7CD1 -14C03D9 -ACB60232 -818C10F0 -D8CAA46E -2CBC53B4 -46F82991 -9B24E92B -E1DBF265 -C6649C -87D0CA2F -C24A605 -AEB470E -8DC36FE7 -2D6B856E -9B459A3A -5C204000 -C7CC0BA9 -E637D8C4 -1F8C7240 -41788DF4 -27B94DFA -BBA5B2CD -51E1AB57 -FB14B16B -B6821713 -F955BAB9 -44FEBDEF -A484D04E -FCC08A15 -A117E11E -CAE09305 -789A734A -338EAB60 -183825B -61931C6E -ECBBBA86 -1AC53895 -BCEFB579 -CC68D938 -217A4ED1 -3CC6F2DE -12E55EF5 -FAE1CE98 -CF89DDCE -8FEFFF33 -8C27552E -6D63AA8F -B094E27C -4E7632FE -5D9DDBD8 -8E2766E6 -2EF9333E -98B9A7D4 -20D98AB -C12C8047 -5995F2BB -BB30E14 -C769CC0E -632D8C76 -B7FBE051 -3170D046 -D595ACCF -190326FC -D1D03166 -DA4420CD -81FA57FA -D8615FD4 -33AEF793 -E2B32AB3 -E2B2D613 -5A37DB74 -EBF473BC -62C5F8CF -624D5D2D -9A9006D4 -8515BED2 -7DD650C8 -D0BABA59 -1E635B2C -690CBFF7 -E4028EC4 -E4E5B3C2 -57607B0E -D4087B2 -3C06022A -813133A2 -B206699 -3827A132 -985BF479 -6C11EA62 -F58DA68F -818CD2B6 -F204828B -64A0D011 -A6F07C40 -6816D54D -8B00F959 -3B6A1891 -EF20520A -B5B90BD0 -D70B3B4 -7B165E3F -FBE60B95 -50656296 -6250C189 -B50E29BC -7BBB35AE -124AD7B3 -BAD38F67 -A0CA136 -FB03F6CB -B88FB36D -9025524E -4EB80454 -D07FEA2B -D9385E1F -B1EDF69A -11D2AE5C -9EEC00C3 -55916263 -AAD5CF88 -2740548B -662FB2DE -173DFA86 -8D734BE9 -D4A27E13 -E92A39A2 -A58A3F4A -A71CE9AC -B43ED5F -1600E2AD -265C4182 -4EA4F91 -1E3A0BD5 -62650FD0 -BC6E23A1 -3BF3E963 -5F6AFA4A -6BA2B659 -5C00047A -E8F81B0A -C30BF4A0 -DFF059E0 -4E3F93FE -D688F348 -3220541C -F8A72F57 -6D78CAE6 -AF13AA11 -BDB3229D -936DA76F -749DB9C1 -EBF347A6 -BBFA776B -6472B218 -6144ECA8 -E66CD255 -274BC846 -64C0C67A -95748CF2 -25DE3E48 -29A685B3 -CC8C7B15 -F18FA7CF -5F2D1C01 -6DFEC90F -CF834DDD -A72D9439 -BC6D83C3 
-9F888C34 -385D225F -168886B3 -98EF8EB2 -BD8ADDD1 -80DA0EE2 -F4196AC8 -6F020F21 -61136480 -4DA28475 -86A506E0 -1A75F4D7 -222C4645 -8C4486EE -98560E3C -944205C9 -D5E0BB3C -C9667421 -2932030 -BFE65EB0 -FB463370 -9FE77763 -DE8ED32D -FC9BDBEE -FD77E3F -288C605F -7475F3D -C3F75513 -C5AF2C40 -40FB62E2 -2C7C83E9 -A8A7E6CC -512E4560 -950C9D -EC507007 -65B7CEC6 -4A91094F -3BDA586B -7029FB6E -739B556A -678652AD -7B940AD3 -4A8728BC -76841FC0 -F53DEB4C -1B13B0F8 -80A5CFA8 -69C8B602 -6F984889 -14A53B17 -409BF6B7 -46D597EE -3502ED7D -315B1DE7 -E785791 -21871730 -78BE7E05 -D1536BC0 -F9708FE6 -EE4E143D -4E498B00 -A2113F88 -630DFE4E -3FA3D4B -F88D623D -3ADB0736 -BF25AD18 -CB89D619 -1D41D458 -EEFA6367 -7671EBAB -B98E8CFB -238D9F19 -C5155B -223C16B -E484FED9 -DD6A6680 -5192089B -CFF24757 -F2CD17B3 -CC3C7B1C -581E6ED2 -C2D7E5D2 -E9789543 -424EF913 -E6B10C7F -706C0B16 -6EC36BE6 -54C41CF4 -CD1EAD0D -17460ECA -452A78CC -D680E5A2 -57AA8EB1 -252EB084 -9DBB8E55 -BF759D75 -6E5E9F27 -30EBEFCA -C4514A4F -FE76382B -99A07A25 -F9017D0B -452226BA -3DD6111B -967464D -C0BAF41B -C4D39425 -767A57E4 -7183FC19 -844A33A5 -54F13F7 -C5854DAD -BE406FE9 -14340FCF -F665DC28 -701D2EA1 -A7B6AC6C -AC3167EF -C3CE6810 -C6844D77 -64887D7E -4EFF4E1C -8508CD3 -45CD4361 -3FAB9023 -9121F935 -46C5C6BE -272C83A9 -24762973 -EB858013 -FF2D23BA -6F5C8026 -A045E967 -7B844395 -2611E8E4 -8AF4659 -89FB4D33 -D9F50DF4 -CA6BD0F6 -A47A1386 -F78D3515 -2E73ABAE -36C0297B -DCF0FD32 -3930C7E1 -246799B2 -BF8BEEAF -7AD6D40C -7BDCB9B9 -7829D32C -EC826EC9 -ECE1D576 -4E3D613B -DCB44DB2 -67EA1BF2 -D1DE75BF -4609E175 -423132A3 -D33DD5F6 -D74829AF -FE0FB1F4 -C32939D9 -4FB97597 -1441DE62 -649D26B5 -4835C073 -1F67EAE0 -E28AE826 -DB808A84 -58FD0074 -1424245 -6BD9E7E1 -26476595 -E8C08661 -F1F0D3D5 -577263A7 -CB86C426 -EA57839B -C8B37BC9 -FBD2B525 -D033D0BC -A3A0474F -22EDE40F -CCD58291 -CB64AA7D -3176C162 -78DE2512 -ADD0A1B3 -EB41F141 -A7B5DAB1 -C68652ED -1F8E90D -31578AF4 -CFA12A8A -E20A88F2 -74AA9676 -3B353B5E -1956E731 -AA8B10C0 -63369269 
-C833A9E5 -9425A8E4 -89DB1783 -1BE23F63 -D84221B9 -F8D9FE9B -EA1FD309 -E16516F3 -8F0EA801 -F5256123 -F21B02D8 -F3335520 -F7729F5D -B7F2AF17 -6B97F182 -806347D9 -962A011D -A5427014 -B7358896 -E9D6A1C6 -2E3DBDE7 -94B06EA1 -4B3D9107 -26F1956B -1726E033 -6660681C -39E4E3D5 -E8CD4742 -78D71E0E -15733521 -89D0606F -D449755F -A2753DF9 -AC7ED71 -7803B9A9 -87CCA2B4 -23003317 -2A91CE6 -C37B28F5 -CD9A436B -893C12E2 -C1FB04FB -3D8230BC -737002C2 -15314ACB -F4D74B95 -6C8BCBFC -292459A8 -1692BDFF -DC68FEB8 -48DEF854 -4BAE6B50 -8B850B23 -AEDD7125 -5B740DA0 -AA83A652 -474C59D4 -A4B2D4D3 -451C3B83 -D93BD101 -BF10B243 -8AB74771 -68C5891 -C8EE35CC -D22DC638 -5C7FA2D3 -54A2001A -747538DC -AC75ECD3 -F1BBFFB4 -844C0E4B -D7D25E9E -460EC0ED -688BA8D7 -CA6E35E7 -9396DBBA -3E9C3E0C -5D29B720 -3E5BB85D -F1CFA9A -8EF00E21 -28669B1B -98BE145D -2696E360 -F91E3763 -B0E3F6FE -45699C1 -F5945549 -2CB64CA4 -F3508C44 -653BABD0 -773F51CB -9D228D81 -E4FAB747 -1DC767E3 -89A77290 -8E2A722 -45D00328 -42E979FA -C19D28EB -C6645B54 -5AD41E9A -93587C5A -719944B2 -B10FF0A7 -A57FE070 -78C8DFAE -138BFBAF -1126A4D8 -C9DB256B -EE01D5FF -A8EB81AB -80AB24B4 -95B129FD -802078 -A6F71D37 -334BFF82 -32678187 -4AA896B0 -149226EB -5B8C446 -D1799EBD -74EA35A0 -FA9B52C8 -FAC6A436 -9E543685 -C1184EE -2D8CF846 -C2AFF300 -18EED386 -80C04036 -77FA6FF7 -5D1512F0 -D2C0C9B7 -22DBA873 -62468BB9 -42C90933 -F7EA7A3C -69449140 -7DD1B0F0 -52AAADFF -2F8B7479 -70B719F9 -CD8E1081 -4B46932 -DB933B74 -1E7A04BF -75DC735A -C3925701 -7EC84718 -DFEE049D -E8B3328A -3A9936EE -F2E22D2A -1F2B5894 -DB44DCE5 -4F1DD5B4 -B66F3E9F -943480BE -ABA71BB2 -E4F15D5B -4C9D7A9C -B751518B -24C9762E -F9DA3386 -D13AB9B6 -5CFC891C -CBEDF3E9 -395421ED -5A3570B8 -1641D0A0 -AF9A9981 -A07CC659 -4BA92C0 -D94C7431 -AA749489 -372456FB -690097AE -B5EF28F3 -1F8F313B -6C45ECE2 -24F4CAD9 -40C5200C -920AFACD -A2E0DD6A -CEC81C6C -DED2D22F -4AEA1A34 -7504D5DA -1F8E8F02 -72100835 -BB4AE282 -A0154848 -EF3ECE2D -6DA87A1A -46D17BF -DAE80D31 -FA8CA757 -8F75F943 -AFFB5EDD -F1A09255 
-A80EDAB5 -5AC04A14 -B51A2E1E -FD9C51F4 -F99A5A90 -3EA5F0D -C4D40DFC -C0280AF9 -CEC83127 -FA1A5F6B -D603510E -3663D878 -A79682FB -B7313271 -7E37A2C7 -A1CB289D -C51B6F15 -EC66F0DA -80D5C268 -F3A52A28 -E056F895 -4A0A2418 -66E47974 -8E8CA911 -FD7E6D05 -70960317 -5D378166 -3A2D634 -CA6510C4 -93BBB6AB -4FE2CF83 -2273B7D4 -E372BB74 -8AD6B40E -496AA885 -11F4186 -8DEDF498 -5435E535 -5145EF8D -44AB3DF -7B449D2C -3489063E -F0A61E35 -A2F75775 -F691A0D2 -9CA997F2 -D64FFFB7 -DA79CC6A -2DEA4171 -D2E4D598 -C641D01 -79699CD2 -49FF5A89 -C967A1C4 -F4C7FF25 -9CD04F9A -374C3740 -7B6376BD -ECC505A1 -E76F3618 -42C0B205 -B28C63BC -2BA4280E -7278103B -83B861F6 -F862D563 -433B3F81 -358E4226 -2E9334B5 -2E9B7324 -23BF3CB0 -1E44A323 -BAA2480D -3B8483BD -419659C5 -91A9B2C2 -82574F8 -28A32CD0 -3534C89B -759FD52E -B260329C -82112334 -2D5B7F7B -816C0227 -ED5FAD1D -7BDFA5AE -B5C8006C -BD9691EA -36C28C33 -B8702558 -EB3E656A -D752A865 -FA94FF5E -AE5D43C3 -747587AD -6E5E5C96 -39312BCE -B13B468A -81543486 -1B57D2B3 -4D3D70A7 -2D4ECFBA -640E83F8 -4FD1588B -4EA4599A -E231E4F0 -A2D4437B -47D88CE6 -D048C6D1 -4CA7F923 -E9E435A8 -E93D6805 -C032C4A6 -E15934E3 -CB728ED0 -E7D65CEA -8E5D2F8B -1676D174 -B42D23CC -A1462E09 -CA718E2A -F5BA8F57 -EFA467ED -6DA31185 -895FB4A2 -649A7D89 -3B71CFA2 -C67F9D02 -DFBDDF09 -AAB8BDDB -870C617A -220F7717 -795DE75E -5C787D87 -BB94CBBC -99928778 -9D5C4DAB -4EEC433E -F4C08960 -F71FE87B -BF78D7C6 -671FB341 -4EAD6A0E -534B1D46 -1B4DE7CF -A7B45E06 -97F43041 -4B77382C -61EBC96C -336A9206 -E2A6FD02 -72E6EE51 -26144F77 -DD22DF66 -CBAFB596 -B9CE864D -CEBC372F -907981E8 -A9FA3C97 -6B1704B8 -B1160637 -FE603AC4 -274C6ED5 -6C317434 -77A16703 -2489D28D -2DBFB899 -4A3D882B -E81AF570 -1B8F583E -F1CFA601 -C7B776D2 -A26651A3 -303D5E43 -CD80678 -7E9DCEBA -E0F128C5 -4B1807BB -25B10534 -4117D98B -95079C39 -58C7BCE2 -AE0AF4E3 -331A0152 -DB3D821C -F4F11B78 -E2F55DDF -15BF23DA -15E7695F -1F40D321 -128A49CA -2D25CD8F -AE762164 -7EC8AC49 -1D9A1899 -97B6BAF0 -D7E07736 -A2566738 -A903EE89 -67CD354E 
-89C1C57A -97B3EF5C -240FC35D -52CE3A2C -15E8D7D2 -6A8A9E32 -4254550D -A345B8F1 -464C5420 -FD2E1DB2 -C629DA54 -81D24EFE -421E30F4 -E4008742 -62839D68 -AD78257A -23DBB6EE -49DAE0F2 -B1B07AAD -EC7791BA -3B4D3E2F -C241836D -C836E98A -EE9D6DA5 -33B5A570 -81D50D38 -6EE68232 -76677B3C -AF355302 -D2415D7 -1510CCAA -A6627F82 -A5A96453 -CD0B833E -5CF4C1E1 -C14866A -AFB8FE0E -B7D08BAC -4CBFF97E -F0191C3D -4E2A3EC -E76E048 -FF368683 -F4DF51 -8D0F29CD -91E431F5 -B6808051 -927E3404 -6ADBDD1 -5852A1E9 -394DFE4 -8990BE64 -A69026EF -3656791E -63C5AC11 -B9E88670 -9326F9CC -414EFA53 -B5028CB5 -22181175 -3B1A49C1 -22FEDBAC -A39731D2 -9C7E2E87 -E931F133 -D9AFCE3F -C2CC527A -A85B19BB -C66CB9EC -93558B54 -F5197362 -7EA88969 -B380F206 -56AC8890 -56D0C8A6 -B39C42A6 -7B966768 -1B6E37E5 -43429273 -668BAF0B -327CE28C -CEA34DC6 -EA727DD9 -2C1AE3E4 -802A7A51 -A1934827 -1A18C4BF -AEB9CA99 -D572EF76 -18DFC210 -11A4385C -671ED0D6 -D1E5D02E -9EE0AE12 -DF1EC812 -51BFF4B5 -CE089E79 -CE4BADF4 -75879327 -C98B6178 -D7B1E852 -95D6767 -1283D091 -20F90A2C -9020BD75 -504D84DD -D8982F3B -E41E0CF4 -55F4FE2E -2097DB6F -4B8B7790 -F3A1E487 -F4C274C1 -3452A00A -15587F21 -687D0671 -7EB3715 -945B9A90 -8C83F0D1 -8934F9BC -38A50D8A -7EF49EB5 -A45D34E3 -6C014201 -D4D19185 -821E216B -569485E9 -6DCC7357 -7711858C -852AA907 -591CCDF4 -775E7DDB -9463CA74 -DFF1EFEC -1F60E4B -2628AEE4 -EC89EF52 -49D232FB -E8BD7DD1 -EED418A8 -C35E3A33 -5C739CE7 -979E4B23 -B386E4FC -62F98F10 -2FEF090 -599508E2 -F3F9F428 -17A18287 -639B700A -AA9AA4A6 -B1AFC9E7 -FB6E8D34 -44F6A6D9 -EEFB7788 -9D616EA3 -78F3BDCF -A5E71361 -1D25ED7E -9059ACA7 -89118CEB -BDE78C2E -55B9E0E4 -FB6B9A -2DBAC44 -85C0DEFA -1E222914 -2413FBCA -C8569486 -E757EC3C -5ED9DB70 -3EA2086B -F4A4057D -E29E1B00 -C271490A -525A60E4 -9A286CE0 -61A42BC0 -D3F6ABE4 -9F31FB75 -335ADC59 -9EA61808 -232ACBB1 -270C7B13 -6EA6535D -F1D1B1A0 -AE9088BE -D9E4FD87 -3C8C0972 -5EAA57A -26997EF4 -3B02B885 -A4722715 -434BE51C -495165DA -BC9FC978 -18D8C1E -328203FD -12643D32 -65EFAAAF -71297EEC 
-EF8496AC -E5B7BF16 -2B2C5A0A -86B713DD -101E03D1 -14F4FB7E -34EBDF2E -2A9F4CF5 -7143B386 -448716E5 -C61C8469 -5F9F797D -6A89B910 -548E4139 -C48968FC -11F52973 -E18DC2B5 -7EEDA069 -2EE38156 -B8F99E97 -E066E1BB -ACC5C04E -6E645848 -98CA4890 -78191984 -84EC83C1 -C58D9987 -3AA63D1C -E17CA75A -CF8B5E23 -155BC19C -5809C3C5 -E2A7DAE3 -D55C1B6A -585BF6D2 -5D192255 -310467FC -ECA8FE97 -4ACDBA8C -E6319F8B -FD4F3E85 -47FF7B0 -B6FA3B69 -D75D49C2 -B831D3F4 -1D6282B8 -E335FE0A -C955B98D -87968F47 -B9600C1 -805AB6DD -2677ED62 -86AA7680 -836DD1B4 -82C073FF -F2664656 -DBE8C3BB -E4DA24B2 -AE14BE60 -1CF178AA -F2C661B -9ED5C4B4 -3B67F448 -426F85E0 -40195BA0 -66BDEE57 -3A128638 -A48D546B -7DC7834 -C7706566 -1E23F578 -CF55EC28 -F46031E2 -CFDD3546 -6CD58E9C -C40E02C2 -19558D54 -46E056B2 -C1581093 -20C057BD -34695F72 -1C4B7B13 -2FD3155E -152F2F86 -189E2F15 -31991472 -1B85405D -D1F72A1F -8AA93824 -CE409894 -9F6D30AD -E72C6DE5 -A31CC799 -694EB42E -C2D96633 -7F4776D2 -509C0781 -6A84F278 -E11739F5 -CC5EFAC4 -DDD81D37 -6960145A -E40C5DEC -70C068DF -1E6CC338 -592EDE93 -A19B8534 -DA27B1C9 -608D85FD -63AAE798 -509A13B -BAF29F05 -69342538 -5A2FD47D -5FA22C82 -AC7E3397 -4E546537 -4611C427 -DA39FAAC -445F1CE8 -5BC83B69 -64AB6C7D -F2B4EFB5 -DC0016AF -987EDDC1 -3354C952 -A5B9ECBD -E5B77548 -997279F9 -7C460F6 -82A1099 -B7CF0472 -ABC3726D -DD4155C0 -319B8C50 -CAE7E88C -910F1C5E -B1367D8E -56B78305 -8F4CB7A1 -8765A3AA -89624EB6 -22DE29BD -A12D4C67 -6BC56ADC -B587BB0F -3806EC0 -3C269C48 -9EA289A3 -B5EB4FDF -1ADB0729 -A991429C -CE574FF8 -CF071DB5 -CE0D372F -3D99AE5C -D6D56E7C -3A493434 -86AC7C63 -FAF8B585 -B9F1994 -89CB3A3D -7C8974F7 -2169640E -D74D62DA -8F0D850D -3B9D0225 -4E2CBB6A -BCA7006 -9DCE6E7B -3695D660 -EB344960 -F3D223F5 -6B8CA588 -45744961 -2F493968 -E9CBD376 -9B0FDE95 -F17603FE -B0825FF2 -5B1CCD35 -6F98639D -5CBBFA88 -890B3C42 -2DD4CA67 -DC9513B5 -A7B91C22 -83A897B6 -399ACDEC -AD11B2EF -11D76C5E -E170FB03 -9326B999 -87845BB9 -CA14B73D -943FE9FF -341ADB81 -D800A2CD -A7265DEE -1E7F3F7D -8AC49BD1 
-CCE49B1F -58764B66 -D57DF0D7 -229BE279 -42DB683C -D8530314 -F1FE931 -DE1A4EEB -DF35B43B -3E90F80 -B3934E4A -FD658EFA -E6CF1CFA -472B47E9 -20F155AD -77571441 -9FE03233 -8BC0043E -80E9B238 -D325F7D2 -F0333147 -FC86E62F -A5451DCE -D9374B52 -674D4083 -9952E9AC -B529BFF5 -B7E072D6 -5BCD2886 -8381AC4 -5CD6C7FF -F24E3549 -9EBB5EB9 -23F47A79 -49D578D0 -6CA5874A -2F3C83E6 -D975C720 -FB484F11 -3BCFB5C0 -3A66DB47 -B3BB4F33 -D5136C2 -D4AB89C5 -8A782859 -C8FE9ADA -B5D57BA5 -9C8D2781 -7D0919B5 -D362A6D6 -1006FFAA -3BB31D71 -7709BEE4 -8A348C59 -44A704D7 -96F2AFF3 -592DF706 -F3247289 -3E9BC2A8 -570D8349 -2F615AFC -B3802616 -B54191C6 -DD155718 -455945B6 -C74C7DF8 -232005C5 -6185D2D2 -8FACE1C -73D27EB -770D2680 -DB913D28 -90FC0FA5 -9DE358EA -2BD3287A -D5C8095A -DE541F30 -D10F0F61 -4657627D -739F2E93 -F9F7B479 -DFC6490 -3D554A13 -D3C6C2EE -80145765 -D601408B -52EFFD8 -A44B597A -9E65E39 -2A5CB536 -A0420638 -EA752AFA -A7DE4743 -18480882 -A559B83D -2DC4B6C -8F33055B -7C4E3B8D -52C7F9F7 -9FFA0A63 -A0413C90 -ECA35002 -AB4A7AD9 -A829613 -71904BCD -9560A35E -118EC2D1 -CA730775 -A631E447 -F526588 -C415CDC9 -DE509745 -C2C64E6B -4A3350CF -CB04DB23 -8D3BA4E2 -3FC18EC6 -C8CFB2C4 -C2B600BF -FE36BBA5 -EB4B302E -F2BD24D2 -A820E2B0 -DDE54189 -744E33AA -9E63B141 -21C2E601 -2C12D5AF -85AAD794 -EE1F97C2 -9096006 -14132FBE -FDDA365D -E3623A52 -9F52F94C -18F84D8D -F866F6EB -9759E208 -38195047 -E31F1936 -9D7E9182 -CEC2787B -975EB96B -12F202B -CA36D8E3 -A694168A -F033E484 -DAEA79C6 -C465D02A -154EBBA3 -FFE408B5 -977F7FD7 -59992C2 -72DAEF3B -47AD9078 -11CEA76E -3B88B352 -BA2FF2D9 -2A7F4E47 -DD6B398A -164FCDDE -CB7284FE -9FCF9606 -34406791 -104CC89C -A2F32BB7 -213E9CB0 -1E1E0B37 -7226FA86 -20502886 -4C1C9E90 -2D4D0ADC -D843214D -57730409 -614341B4 -ECF30446 -330F5216 -5FBA2C4F -B4102EF6 -D6129240 -7D5DFBEA -EB01FCDB -7CA7342 -46DFED3F -5BE1B2D8 -2F40EF9D -59622E77 -A6AEA365 -78133A87 -7FEF9106 -3956BCC5 -8C6509F9 -79525FD -D3A518F9 -A76193BA -3F552EED -F974C309 -12A5B04E -A71DD6D4 -D9FE2B7D -95F822BA 
-EDBE32B0 -92BFA916 -79899BA5 -3FBDC933 -BC0E7C30 -6D7FEA47 -1F1954E -4F2F17AC -F6EA71E3 -B8E34FFE -3BCD8BD6 -695B7934 -D4CE8358 -26B0699 -784EC0DD -625BC98B -8861D087 -44DF0DE -35B7517A -A8FA9A12 -244B927 -AF7A58C -BE48CF00 -95C13C21 -9D8DBCFD -AE8B4798 -ED04535D -47A2219C -C8B87734 -8355D2A5 -B4127CD6 -DDA3394A -36846F2C -F38282D0 -177D3FF5 -EE8924CA -5E6CB3D2 -1F6C2C7F -3EACD843 -51A77194 -51D89AA4 -DCC17C24 -DB5043E9 -25D52B74 -1C7176E2 -1F483DAF -24B587EA -6188E94F -C886E2F7 -7B24254F -A761DFA7 -357C70B5 -6BC46A7 -31B8CF7C -BACB7205 -6C1B0387 -50685794 -7726ACF -64C49E4D -7AF06B7F -D1F2AD02 -E4F5BB37 -2A8A4925 -4245E047 -B7CD8000 -6C72A8DD -19590349 -7F7EDB49 -5DAF5458 -5EEBC5E9 -6E84757D -AD3868FA -F85A2B5D -A8569A1 -88F1F6BE -AF363178 -D9A61BFD -A2959EC8 -C1343E46 -B34A697B -22530AC3 -70213F56 -1DDEECA5 -4DF030F3 -78A4B8E6 -F93B20A6 -27AB7A7B -F43A2969 -AEB9E421 -75A8F820 -52CD9316 -CA166F29 -C28D14E7 -51E4C76A -50249FCB -3EDA432D -C6C3EEB3 -6CFF2A56 -5B50A9CE -D2CEB19B -2F16746B -1C19CB24 -9CD2076 -3F804860 -FE59323F -62F1F95 -2CF56FAE -E1A3437E -973F442F -DB62AE6C -C0AA4F87 -67224779 -A28378EA -6C5BE4D5 -97F75FF8 -49922E2 -19ECBBCB -C89000E7 -436496D2 -29C94230 -21A4D75 -3DF46E1A -A6D150BF -4EDE1CCF -37A996E3 -B0F73D3C -33E41F15 -14076103 -7BC6082F -E98E377E -1E787464 -16AB93F5 -B8E3ECD1 -4A944320 -41E77D61 -8B669E91 -20F1F65 -F4D26572 -81D9D4AD -99843F88 -7066E60C -4D6B9549 -C79BBF94 -F53252E4 -EDB94B9F -EA504F01 -9BE5AD3C -98F301D4 -C1C0ED35 -3F2734C7 -76351C26 -AEC02AAC -B9D4A014 -A01F14A1 -2DD27A90 -27C43590 -5A06F84E -64CC23AC -76387C33 -A07A8306 -3BC362BF -5ED88200 -CA6DC828 -4DBF3E47 -F633C85E -96F44176 -76B2A46B -CF414D71 -AD77A07A -9A1F71BC -FDEE86EE -7A8AC33B -AD3C257D -BEFBD214 -5B562E2C -3527654F -FAFCD066 -575BF8E0 -BC2A071A -C903C2CF -EB1AB30 -7B8C7CA1 -5ED6E493 -E1C822C6 -368B9DDE -91122C29 -5B1358F8 -6DCADBBF -ED845AC -61E42CB5 -732B420B -39154876 -C10442B5 -E1CC1A11 -875215B9 -AE9E4FEC -B2435F4C -DBC844A -10FDB0DA -F85D3FC4 -608B78A1 
-DAE2B7B2 -DCD08039 -CC0962E7 -10602FA7 -62522FE1 -D3AFCD9D -2882BAA3 -70C31CD3 -A69E9A2A -975BB834 -2A35C91F -5FB2644F -69B2BF1 -9C365DDE -E4199E06 -ACCF8904 -DE105FEB -9C07AC45 -F75CF55 -EF6E3E9C -1FB088A2 -9A93BA86 -4E91C403 -E07827D7 -5F7593 -FC778EF4 -5B831E07 -354A60B2 -8D39DB34 -5C3C16CF -38489DCA -D83EBDED -F9E5BE76 -D2C7FCF3 -E868A2FA -D29E98A9 -5AFBCA1A -D01628BF -B2334643 -4EC99A5C -189E9585 -CC2B18FB -C692AC25 -A7F6B978 -C1530E03 -AC815E6 -6304151C -52EB83ED -C4921682 -96441A15 -56338D69 -5C82292 -FCA308FD -978D2310 -192DB3D1 -CA6B9EAA -7AD9F05D -E7C35D2B -AB5505FB -3DD6013C -532AAD00 -87EA4F8B -1AC88F4A -4BFC2053 -65356D9B -B03A54FF -6F585110 -2C75F6A4 -CFDC2733 -3E7BD30C -2DE068DD -F318385E -26CEC150 -532C4D5B -B264C41E -46229E71 -39E85376 -A074FDB6 -461E84CD -BADDA454 -77D4AD4E -479457C8 -F0E4F65E -DBA7730A -24D4FEE1 -9442683 -7725F0EA -F8647367 -5F4D5208 -6DC11B5C -4E65BE22 -EC0713FD -1D54F605 -4B0F99DD -E585AB57 -E14C5EA4 -B7909465 -12ABA66C -EEF519D -62F4CFD1 -48DEF31F -16B38659 -5528B313 -5C031870 -87ED6DE1 -55ACABF2 -FACEBE99 -3007B9E5 -F5C0C90F -E97F9A15 -951AE375 -67E41B2C -CF7F6BC3 -C7836B7F -88B077DB -DA60BEA0 -1FD6BE04 -95A08F39 -B7EA73B3 -10F6685D -A9C04118 -EAC17020 -CEEDC89 -7EFB007C -8D900B82 -4C2BCF1C -9B9BDFC5 -28846A96 -139B4D19 -32E0786A -72F19BF4 -66D61EB0 -609F7568 -3A785E09 -B6F2294F -96E73FE3 -99A0812E -1BBAE42 -9DF477DD -111FF2F7 -8A882B32 -2542FA4E -7BEAFF22 -405268CA -2427EDE6 -7D9F0726 -7EF6ABC7 -7F8DD904 -C3F2F4AB -213FB22D -62AD3732 -955CA4C7 -9E83055D -BE9C70CD -C0E6DDF0 -892D1B64 -56F3A648 -43547D3E -35EB967E -EBC18CA5 -D4DAC35A -9DDB564B -6DFD4F07 -CB02555B -425A1595 -B978D512 -B3D78E9F -A3EA970F -8E27124E -6A57B7D -26D405F2 -C8A1CED7 -7A6338C -A497AA49 -95602B8B -C6F1583D -CF5B6A58 -81F2D693 -A34B3C07 -B7180B4C -46C6E5CC -8C3736E9 -980482E6 -8A34B532 -B698520A -20E9DDDC -A5D8B27 -6A0B3989 -10071434 -C82002AE -8A343B26 -2FD61FC8 -C1257546 -FF154858 -1AFEAE33 -C2B1532D -D979A2DC -93F9FD3F -769B0DDF -4132C851 -A372D4CC 
-6A5532FB -E8F203C1 -A421B3A0 -B50F5C9F -AE5B067F -8CE6F896 -8BFFEABA -B0CCFB51 -D455681E -FDEEE781 -A4873A97 -E3FAC8DA -5039A29 -C703A1CF -E4E29AEE -39C0B0DB -DE5756E -303C7D43 -586246C -41ADBF9B -D1CD7207 -3BC8FD94 -7E50A650 -390914DC -ABD6170 -ECFBE529 -3D51360 -569802B4 -25F255D -1523D176 -9F98AEF0 -9DB1B681 -DAE01D8 -46D4F7B7 -47DD8DB6 -23BDB9D8 -90C47F30 -998BF564 -5D60F7E4 -309B5851 -9D246C3 -C1895130 -1F918DFB -6F303265 -71E0D0A7 -77F2FF64 -589BBF0D -A25C4510 -9F05AB6E -4990B583 -D335BD7 -6CBC0400 -D7894817 -36176CCF -1C6A98BE -53EE793B -4003C3B3 -9E46BEB5 -57647A51 -D5599FED -38156D3F -B1F425B1 -7AD6402D -74B619BE -A11B18AA -9C4211AF -DB076668 -7A94C4DD -6833F9A5 -A088A4AE -6A70BAFA -BC6740FF -B7F6508A -F3BAF225 -29BF8108 -7F074F1C -18B3D5C1 -8A948077 -BE0483D3 -46B195FE -D7AF0FD0 -C31414F4 -B5BD4871 -CFAC4C37 -57D2D42C -10A73F90 -407A80A8 -21C50A11 -22E165A0 -8361F9A8 -EDEA52BD -28F3650D -CAD63254 -9AB9033E -82BA1020 -E6E6A470 -9C829847 -BC3AB877 -A91A7C99 -1ABAB07E -583AD9D7 -9AFA901C -9AE116AB -27B4F5A6 -877D0225 -92DEB3AB -BAA1506D -EB04B325 -C275FBF2 -2331B6DD -74F623AE -933EC4BD -9470C6AF -6C0828EF -AAC0532D -318961A -29C176E6 -4011BAB1 -895DF78F -410AD703 -F363E54D -B4913DBE -6B5047EE -E7099A72 -E2961301 -E587CAE2 -1449E31A -EB048AC6 -D21BCEF -EACEF00E -EF09B5C6 -2C050BB2 -D660ACA0 -361BA74E -26D1A92E -10F1FD22 -DAD028BE -5DDB96F4 -A1C8F873 -66F44797 -DD6019B -618F707A -4E4525A0 -551B89EA -6A93FE33 -8219D90A -5E3E3FA6 -C9C25F24 -D4593D42 -CB12B9FF -B09814CE -DAF289CF -C59234E7 -6C96C435 -1E7337A5 -FE315E60 -451A4E00 -CC3E2B8 -EB1AABDF -B2D1AD85 -2A12A008 -B525A4EA -ABE700A4 -80603A44 -3E2E49F6 -48630509 -9673204F -7B0DEAD3 -B0B2B6D2 -68C0453E -BA31833B -4BD68812 -C64D0638 -A8987E25 -48850A6D -9B337E66 -1D99461A -D47AE0D1 -2E3023F7 -29CD452B -A211306A -15CD90B9 -D5D57C24 -727FA881 -51316FCD -BF62F735 -9E67B311 -51A2B90F -CF7C9936 -A537087E -3EB2EE91 -8F4D2C93 -F83E1906 -826C14F4 -6CBE676 -ED2DF931 -38270781 -4C567B1E -96BD9972 -E089656B -7DD03E9 
-534E777F -695B12CF -338EDC74 -D5E3DFDD -13937C2C -A386AB68 -CADAD94A -B624A652 -9E4D0656 -3BDD26F4 -8B9D1ADD -180D5005 -E8744FCF -6CA71503 -20697624 -49269DB9 -B27B12B1 -AC181CE2 -9289684A -E5D3A21F -6A79B5AE -EE6DD5DE -355DA7A4 -C5B13162 -5FFA0324 -602F32A7 -85BA4032 -DCBEE18A -D76BFC80 -4B72BA0 -4101BC2D -A3CB1CE3 -4C6262A3 -59198E3D -AAD7C84F -4DFE129E -E8153DB5 -66EA03BA -D3247EB4 -750DAFC0 -68FB3A27 -67005B98 -C2255031 -1D9106CC -7FD4C833 -491CF81A -28D5F0BD -E2275FB1 -762FF58D -D9D940D7 -C6B5CBDC -810E0D6B -DAFD7E89 -15C3544B -D7B6A237 -3DA125A3 -3272795 -A7BCF9DD -4FE52CD5 -3FB69C23 -4F106EA9 -3632D2EE -9DA08D3C -5282D2C7 -9575F24E -D390A80B -2897EB0A -A4B9FBE0 -DA3FD83B -EAA2A95A -73FC7AEE -CCDBF4F9 -3EA97EA4 -A8AD7E75 -C533A490 -3FCE73 -D451BBF2 -6A71BE12 -76E1EC5A -1845E1F8 -CD2B7C0F -4D92E7BD -81B44E4B -65E1B458 -6B69FD73 -86CE76BD -88B1CA29 -EA1F0D7F -43D393F9 -C85E394 -B5C665F0 -AE373F77 -46196293 -E6057838 -7C63A634 -C3F66075 -1F15C3E1 -ED457843 -83F9BA3C -D8B8A399 -852DA2FC -3B81F785 -DFA3848 -877B985B -1C82BEF1 -6482EA27 -A4F94E9D -9FB72748 -47CF963D -C514BF88 -4D4B79D -232D2991 -3DEB3B5C -49784213 -9D79AAEC -EB89F7E9 -B9F9993 -71528CF1 -E1390DCC -F4655453 -97847A30 -3C30D55E -72649CB1 -F0647A6 -C6C8AC04 -FB48D1A -39EA9573 -70C70D43 -3F6BAD93 -342ACF49 -F37B506D -EE64D0B3 -4DC05CFD -79E116BD -5458D922 -3957971C -970D89F1 -9AF398C7 -A9A651DF -D3A64902 -27339129 -2FCC3329 -B1C70D5C -3FCCAD9E -C10A34 -80B546E -7EC04275 -512434B7 -526742B7 -E96DE8A8 -27CE6F9D -FD566C7B -8DB1FE12 -93F810FE -C660877D -348D5704 -BB3F2FD7 -9F859C53 -907BB57E -318DA95D -BF1CF416 -3E8BF68B -BB8CE4F6 -A9954212 -D1A396D6 -C33F5A44 -2DC0A59D -5B66EF45 -1CB288E0 -D6874F40 -E275F00B -E6B62E72 -6BB1EE97 -389CF9D6 -8C093ED1 -D4CB36E1 -12F4840B -F18A2F83 -782EB525 -12BFBACE -78F772C4 -91988F79 -55BE57F8 -6605D204 -5A7471F4 -355005FE -267A8C9 -CAB49590 -9479E9EA -BEE93B2A -34E95C45 -61788682 -6B99ED61 -33D4D3D8 -DD149E5D -D3BED775 -287B4087 -A2552A0E -477D609D -96765321 -2696E220 
-3B6E26E8 -5CFFD0A4 -FDBF561C -4C41A4FC -B0637D44 -85DF60F0 -539171DD -9A1D1F12 -72ADB48A -D8C0C9CB -E4FE15BC -24EB5C50 -E1A9B3DC -360563C8 -F20C02CA -E9FBE774 -B2FEE97A -EF34194C -6DA8A0E1 -ED9FFA1 -4EB5D717 -47D296E0 -FA147414 -C1F868CB -761182D1 -6B9F8311 -7A99903C -95449FC9 -A349B21D -F2AA6E8E -CBD733B -1EAA2224 -C7CC9CD1 -DF3D1C7F -81343E5 -30682CA5 -65C5BDFE -811D5CC5 -8D2DEF35 -D8B4F4DD -9E121109 -FCA97592 -99E76951 -7CFB5D -8489CBDE -D7A8D721 -ADD1A5B5 -4A96DA59 -CE6C2C78 -17593D2D -F94AF7BA -6CE767D0 -DBCEDF25 -43629583 -CDB11A86 -BB630047 -8A579D2A -FC17AF19 -ED54597D -9BCAA00 -B7865C74 -BADFD092 -9AB0AF05 -AE371DB7 -EC0EE641 -A9781E96 -D1B8A429 -FE9A2043 -BA4C2CC0 -F243E36 -78A88066 -70925DF6 -97A35A05 -F18822EB -212A79D -666D7F82 -4558A3AC -FCF953EF -F8C6DD4A -C535BE4F -973A007C -4DB7E662 -C8995287 -B3527C60 -FA4F7A3A -D417AA12 -D861531D -11A81498 -5072EC65 -5886C667 -7EF848B3 -CA4ED80C -3DAEA7BC -34EC1028 -349C86EB -6423A583 -22A163C -339CC766 -E93138FD -7A79EA77 -E480913 -1220E06B -65ED8DDB -ADF487D5 -82CAE485 -A88E6546 -3A7F5961 -4672ECFA -425EB8F -AA3C4450 -44CA10FA -B1EAA942 -9EC93584 -E417CBF4 -B5F4C488 -EAB1DE5C -10446170 -C5F9C89A -391EF7F7 -10C62C73 -817FC74C -DA1A9F17 -FA38D673 -D2026552 -D7CD67A8 -4E0E21A6 -56812AAA -1D7294ED -575452A3 -90581C22 -82E00D73 -A8FECF07 -1CB1E500 -7F51D70F -F840E8D4 -DD73E72F -8DED415A -3F029F0D -C9CC871A -3388492A -AA1DEF8D -F2E93846 -F9CC596 -48221BB4 -6F7B2734 -F5A1010C -C0FB41C5 -8693416B -C8EAD749 -21ED8A7A -9FF52520 -613635AF -92C5E0FF -435C33AD -2550A70F -B17B7FE9 -9CC5F28E -690D4EB3 -5C5DCAC4 -25E14191 -B03B4C07 -50DCF2C0 -499BCF9A -5CCD6CF1 -ECBB2C48 -A2990792 -2105FDBF -3D62BECB -493AA5F0 -2CF5BAD2 -DFF53D23 -50D77C82 -35CDBF8D -E3BD4C29 -6A2FC510 -A9B2D0FD -404B053E -BF548C52 -E52081D2 -AD550AB1 -D4316A79 -776E6C42 -203A4395 -54DAB8DE -EB67FB95 -46E34074 -21679614 -C395F6BF -6D513D56 -93DDFEE7 -7D2866A -2283CD12 -12789536 -5C1F1037 -4170B23 -8BB451B5 -A9915ACA -784C0FE1 -50A95654 -CB574A -8A1690D5 
-D9753D9A -3084718F -8E429880 -D1B7693E -A7613422 -C1707E97 -D658E57C -1C2A8F42 -21BE34EE -E545D5C3 -23DF7522 -B7AD16A3 -C6E7279A -2AD251D -FF0BA8C9 -E586EA40 -D86C394D -1A0D6737 -5AE27469 -8A0F53FE -1A0DC5E9 -8A56C2C4 -AD3214FD -DD999E92 -E53F55E7 -5AB39BDD -119C7046 -19B8238 -E21A4F81 -5DE3F0F9 -BFB5E145 -5020F616 -C2794F78 -9B7D9F3A -8FBBF3F1 -1D9C111C -49FEEDAE -1C83E386 -BB5B0273 -C290FD8 -52C788BC -86C12DD3 -6608E8F1 -313C6430 -142570B6 -F75B9552 -C8F1E8B8 -F3E5AAB1 -9E4D9E8A -7E48E48F -2182FBF -F21DC3 -BD6E45C0 -8DC88EA2 -D5B67DA1 -C592692A -979B0A6B -783D09B0 -C2231CCF -5CBB3057 -4C10986F -3F738112 -BED7BBF2 -A2577A6D -13128005 -3C71262B -BC8E920B -40C44CC9 -C6C4B496 -5AA9CBD6 -C7A9741 -2A8EDC58 -D2253A26 -F343439A -13F71CF9 -A4BB5CE3 -FB52ADA9 -1AF0749E -ADABA787 -C22B2194 -C5132023 -846C2188 -33A64D52 -E5CE9022 -CAA4C044 -E7032B82 -30251130 -22463302 -954AA98D -52D6F132 -11E0FDD7 -D62BAE17 -9844BF8B -68ECD60A -E637BA92 -1D7BA1A7 -F091F891 -CC96CCF3 -E2C50AF4 -149FAA77 -F16F7294 -27212569 -B96E1119 -E7806734 -15A5818F -4E05DAF0 -F022D5A0 -303D930 -B92CF71 -377DE596 -8835F16D -2D0B6E77 -2A89FF6F -9EA75369 -FCDF31A7 -8F674B8 -34D270E7 -BFE6FD70 -F165A645 -675B8D2D -318F8DAB -9F52E28A -A464F277 -B998CE45 -9E932DF9 -2918A97F -EA5C5130 -952FECC3 -7DCBA50B -DEE7C01D -96B96F4F -1C6106A0 -85A1AC4E -D62EECAE -6387F846 -271EB1BB -E1A2582 -D1E03035 -9EC6EA57 -300E10D3 -CB91419 -52652E8 -8291BE30 -E1D52680 -5044FC2D -35E58D3F -C6A01A83 -814DA7BE -97A50A83 -DB801411 -D4C43BF3 -BC3D29C -E4A072E8 -6F51D4C3 -21A5886A -F744A91A -5E12BC21 -F86FDFF8 -C320E6BC -3DEC9656 -F89A6364 -F668339E -44999436 -F40A8A0F -71837448 -B09D47B3 -2D2CAB19 -3FF04F12 -D8E5CC71 -33F39593 -160D74D7 -FB841949 -95F0E78B -B9A6102A -A4D3C679 -4774D90A -AC55693 -8F3CF617 -5BDA2B57 -A548BA77 -B1158C29 -FE9A4D00 -B52446D2 -E6DA1712 -3EFF4A4A -41EF9936 -D65FB56B -E3AED57C -BFF89053 -192E499D -DD703817 -C2B8C9A2 -65A8417 -670D3446 -2E936BCB -8A14CEFA -CF71A41D -842BD0E9 -628148DC -9733E864 -1C57CF93 
-1A0CA311 -A1E13B05 -2C8F3844 -66C2361E -8981A417 -A4668A3C -271048C3 -6DD908BE -1A933D24 -BD0A78F8 -57C44DC3 -1EE04ABC -32275D51 -B25BCCC5 -509C83A2 -E5E1B85F -D45DFB17 -EF39D3BA -4F4F32D2 -8F1E52D -62A47A4F -7E4010A6 -189250D7 -CF3B51EF -5E9BE373 -E9719F77 -B2741A6D -CF19D7BA -993284DD -A1839978 -AC00E790 -ACD3A888 -1E74292 -6306A56B -F9EC26A3 -9FC5BC2 -2D6F22F -8CAAA98F -CD2135D6 -D2F5CD5A -CFCC3D48 -6AF7A18F -5A3EA067 -8DE9498F -A279E5FE -8C1D89E2 -5D15FE82 -AB291798 -40421279 -E101CFFC -D2D0D57B -5C977DF4 -68D4EF4D -22C36080 -81526010 -E5A41122 -160C517E -8BDCEC09 -5F12637A -F3714AF4 -D21C140F -B1EFABEE -E49A3E48 -E67BFC93 -C4BE9508 -21854565 -60757AA0 -FB5C43BB -150F6634 -115BE267 -3BE8F3E5 -EBF986EE -BA18FFF7 -82B52CF4 -50546F93 -118CCB96 -AA6603F1 -F434B7D1 -FC356F35 -C996ABD3 -CC8CF7C9 -4C2935D2 -2DC9EB76 -ECA4D776 -5D2D35A8 -7C747824 -ECAA990E -A6078345 -CF589355 -7E9AEC63 -859E12C -C2F31842 -6563A3BC -D43FE9EF -39D1717 -AB887505 -1AADAED9 -3D07A0C -7D2B456F -53C1B39B -DF349267 -FD9CC686 -5C1CB396 -89DD96DC -A0D8DA69 -F2A68012 -7F40A406 -1DBF2E24 -B31EAEB0 -5D5073EA -19C16D03 -10E50F00 -47D3D228 -A3C0E13B -5E801D5E -C58677AC -F6E9095C -E2C0938C -14CB070F -11B98703 -9FBA36D6 -5ADB369F -681BC767 -BEAE4008 -5A0AE129 -ACAD1673 -F9992AFA -2CA14EAA -F77F77B6 -2705BD3F -F9C3E6D6 -D3ED854E -4A5FB85D -54187218 -B9B8C83D -EBD38F57 -C0D17CF6 -8B464900 -3F8D26CA -C0FADB4A -7F79A367 -123EEC9B -99B683A9 -157062A4 -91DE43EF -65733625 -56DC9E5F -2C88A8E2 -83AE236C -DDBF0A9C -18873E45 -5040B3D7 -29927CA4 -B5A18202 -93CC4EA3 -5DC2F698 -A97A1713 -A104C149 -B9C5588A -AF182A52 -CFEC25AE -CB1C0A91 -143A132A -27C4A3B9 -D73DB7B0 -53AF7F76 -9A614866 -82A54DBB -D77A5A23 -AE3FA285 -8C2EEA1B -DD21D577 -186EBEF7 -DBACB855 -18E30376 -144A1FCD -773561F9 -F18F3C71 -4A13E021 -8738BA8E -1A9FF053 -56A546BF -860C6457 -9E5F2177 -B3CD57D8 -7A2CAF5E -F8D57DC7 -941CACB -E70A729F -7EDB09B5 -E972B09 -ADB7C542 -3832A659 -AF33DD9 -152082D4 -9A2A3452 -70B5EDBB -C6549E13 -D621FFE8 -15152F3A -7781B485 
-67B0DEA1 -C787B62B -75B9A705 -C2A30FD7 -41CF8EA -3D2B2148 -CA0445C0 -802799F6 -FCBCCE57 -F539ADB0 -54952BE5 -B343804A -25752CC0 -3F276012 -7228715B -7F61944C -DCB8676E -132DC654 -CBA2782E -33016B92 -30F194E -F2D953D8 -15A92EA -495D2D8B -4366F311 -8F8DC099 -C4B2611B -D90839F0 -CEDA9833 -5CA78F56 -5D5F4751 -7F37FE54 -5B8F6537 -6B89CDD1 -6728B0EF -D2BED44C -60293190 -F41CF0F0 -8BF08F76 -861F32B8 -2053AB98 -315DF7D5 -58BAE934 -F38B7C9A -653396B3 -E2152002 -A4E66BCB -C1E3F151 -AE7AF50A -545F0684 -643CF8AE -BBC4B464 -7B8F849C -334A660 -3FFF02AA -7EFF666D -F80965DF -42D34429 -B8037A02 -36CA2FBE -539208E3 -D03932C7 -5C619FA4 -FC641E3E -D01051F3 -51DF9226 -116CF628 -8055029F -4A9130C9 -5A2701CF -89251BD3 -52D99785 -B2C16C02 -83581080 -57D8A09C -6D551FEA -EE6334BF -7D8061F0 -8556CEF4 -D9418360 -82DE39D1 -AA9CAE96 -8D3C1056 -8C67B490 -C7BA78F -D46697F3 -879107FB -88F4FC5A -E7B0C68A -3BD94FEA -648EAA00 -22724D11 -B6F00ECF -488584F7 -A104F52 -FEE79F3B -689DBC3C -2DFDA897 -411EFFAC -546F5C25 -45562F46 -C17613D7 -40CD3300 -9908DC56 -5AE62418 -4A3C1C82 -A28631C4 -4AA65060 -5614DE71 -6512AAA2 -5AE841E7 -B04094A1 -AA8F8123 -593A95CB -21919833 -DFFAC729 -106727F1 -273A2977 -85E6CD4A -E9751C6F -DC308E67 -40F7722C -1D8986DC -489D6002 -7A869A39 -6E02A88F -A04E30C2 -B98C740D -3672EB58 -9702EBCB -2CD4FB56 -A0CB2C94 -47299608 -6BB5451D -36EB4DEF -763593B9 -40029F5 -9392B153 -777DA521 -3125CFB6 -E60A4DE6 -98B9CB40 -819091F6 -83D23CD3 -ECE09D62 -22EE60D5 -29A3F86D -797C0E72 -1EC708F -76F78D62 -E527F0A5 -F11AD3D0 -BBF11E9D -5E944B45 -D090FFCF -4B8F7B5C -96ABDB47 -2F5379A2 -38FD509C -F49D4D2E -F5538B3E -BAD3E277 -E9C9831A -22D3C209 -CEE03CFC -EB55F3D7 -C61B5224 -6C4E6ACA -A63B52BD -695DBE54 -3C68D8AE -847F8449 -72B426E6 -95642CE7 -B021A768 -AB094E2E -90D8A573 -D3BFF1FB -460DD461 -EF32D23C -868AEBDA -6BEC2EC0 -34D18392 -6C9D6621 -6CE02624 -75E6AE8F -B5BE7494 -A033B3BE -EED6D471 -99D40A8A -BC742254 -530DDD69 -77698872 -E89F0ACA -39716DFA -C811D562 -FA7770AC -1F68B8E -7D325ECE -8CD870A9 
-DE561FD2 -8D49A512 -979F1346 -CBC53E73 -E779994F -354561F2 -ECDDE60B -52EE9980 -46AC0C6F -555C8C8E -D382E1DE -2A9A602B -4F18FA80 -96068D7F -D1E5CBFA -957912AF -DC0A3107 -77CFB940 -E7161980 -EB44FE07 -C1597F4E -FFE737C9 -ECBD5506 -AF75488F -6D0BB14E -9ED0A181 -8EF54B6D -4E69EFD -9337A7B7 -A880D3A7 -97A5D09D -FD9F77A -7CECCBB1 -2869D0F4 -F1806C1 -F9FEB241 -7D368AA7 -FF972C5E -FEA0C745 -CC1413 -DD4CEA96 -FC8C6CEF -75727E51 -5A17C784 -422EDDB7 -6505031A -5662B865 -D7848124 -A93A9AC -D874DF58 -FEFDE7F8 -5B3E37E8 -5CDC346E -CAAFB037 -BF2135D8 -C6977D49 -8D61C84A -C6B1C620 -30AF013B -B98B3270 -CBBE51A9 -43E26F1 -99534D9A -11DEC7C2 -F3952B8C -52900E87 -80D2B350 -838A2A8C -F8BFC35A -AF0466F9 -CCFC01C9 -C4A559B8 -5FED8BFA -ECB87D1F -7BF187 -4662AA70 -1274E59B -41188FCB -A769BABA -38F43333 -D4645494 -3E464034 -6F3BBB27 -8149A2D5 -D3D96C7F -C04CB115 -DE3B6C40 -B94FC85F -E0E6291E -3E22885A -30D35E07 -81014DDD -A40ED586 -A713CBC9 -7E0CC084 -439FE695 -F4094931 -C293453E -741A83B0 -D9C2E5F3 -4E623673 -309436D5 -807620F7 -7DE3993B -8F31B5E7 -F12F65FD -66763A72 -D3606695 -ED7794EC -8BD7EF5B -5B3449BB -D9B93EBC -5CF89E53 -103CE7A -A1ADA14F -BD020E01 -F737C35B -8695E1B -2AAC416C -43B6BBD5 -31036C5F -E5A61222 -F3E01282 -9A93EECB -BA874043 -1D010D4C -3F45AF54 -662F04F8 -279C9BE3 -217787A0 -1D399000 -6669B218 -A8F4D699 -181ED599 -A584DCDF -97A49036 -C5D4A8F7 -3C7351B3 -E4A7A0A2 -9A13953B -A9649AB5 -E9B91DF8 -CA6E2F04 -F0B63E4F -C0F55BF2 -38EBAE63 -8D8A619A -1A798058 -E5C218FF -8B67C799 -A81704DD -2562EF33 -74B37ACB -B2C84D35 -2E0EC87 -5CAC361D -7FA10429 -DDC1672C -3574275D -A831D84E -65339BB4 -4B936FAF -8348EDC1 -B1802336 -601EDB14 -BB5E4EC -48CE4DD2 -4CC93BBC -E77987CA -6348CFF9 -90830A68 -1BF0414 -C2BC8AF9 -3EDED4A4 -66B38B85 -CD6A6E08 -92B71F79 -6BB2BA9D -B4EAF374 -5B723892 -C350B751 -D7A56661 -576B1A79 -C66D8E1D -442DA54F -ED0C819A -809EBE76 -413B884A -817EF987 -D76CDB84 -90F40F80 -2BEB3E69 -C2782488 -F07FF38C -93AD0DA3 -C3E8DFD3 -5B804608 -9CEFF79A -BC524335 -495E18F4 -7FEB37D1 
-A8F15A96 -3AE50033 -9DC5D0BC -D4A241D8 -8F3CC38A -4573A224 -5A3DA58B -B446C862 -69EFCA93 -83B911B -CD50A370 -2E05D74A -407D2B79 -AD108E34 -95EA144B -EA3DE818 -7AF026A3 -21366692 -4D5B7972 -C7D14546 -B6EF2543 -48E7457F -6947E018 -F6B2DD01 -9FF698B9 -EA11BADF -741FB523 -70901C0E -6A71C468 -8BD95624 -1D98077E -EF7CE480 -21F44B08 -563A0A30 -D9165A -7F8E8474 -219FFBE2 -FE1D6D6E -F7B8D66C -CA49F15D -C481484B -85D5310D -3FF17830 -8F69C740 -590A3DE5 -867A85CD -21C9758 -2E625FDE -7CD5B8DA -8BF43699 -AA17B723 -C0DBB2D3 -617F6819 -4D6BE357 -A2D89B90 -C4B19255 -748BC770 -4BA5F90C -2AB43820 -CB75746F -FE7480E4 -239B7D6 -2567653F -7BD1399F -55A842E4 -572D6A8D -CD1600C -6C880525 -1C18F7EC -C9C74D53 -AB3AB21E -F5EA5F69 -F6F730D5 -FA454FEB -978E940C -64D4DE80 -2BB0D31F -10268273 -D060E295 -85A74B89 -A7A3AE03 -7B8883FC -D0615497 -9D637210 -105C40E7 -F9FB184B -B4E67A79 -373530B8 -30E04C2 -47A1D75 -A6A67936 -1B789F9D -AAC21CCB -E00A8B8 -517BDE82 -B1004DA3 -3F745A4A -8FD0E21A -529E48CB -BE6AE2A5 -DFD7DE91 -145FF288 -2B1AD7B5 -C2AE7259 -88B84292 -373D8796 -5E4B4FC5 -971622EA -3C6F40B5 -5FBCF21A -144B7DE0 -C588DF6D -804B7F0E -4B6714FC -C1C2E61 -1CB08E0B -6355112C -1912B0BF -22263C9C -954A5DE3 -4520505E -459D0661 -70FF554F -F1FED0C0 -D1F602A5 -AE5D07A5 -B86AAF05 -452536BA -B00C120F -1431099A -42F0959A -FF1EAB1E -9FD43C93 -5076B428 -ACB3DAA -5D0BA50 -16E00180 -90E21E72 -D497B8D8 -8414A6CD -B933AC93 -18B2DC20 -5BCC1468 -101CA9C -5AF125FB -E65A4FBE -A5B927FC -A8163208 -CBC14C7C -A00E7C50 -62DDE328 -3704BAEC -B354A1A8 -1FEFA49E -BFA928AF -73EBAEEF -F21664AB -B82DC773 -397C3EC7 -6DF7A081 -7B57E52F -43B47A0D -4BB8B26E -748CD62D -1D057255 -3A01A19E -ED35DB9E -B9192006 -9DAAEE03 -6F88BC5B -41F22AAE -DAF9FD8B -8A8D06B2 -99E4A71A -E0E5802 -AF2050EE -35D07382 -3CDB4F32 -1587CDF9 -29E0BC17 -F6641B4C -35557A67 -20B08FD9 -F89BE3B8 -994D534E -5084DC42 -B49E2B0B -25AD0456 -B05DABB3 -102657BF -FA7342E8 -508B7BD7 -FED0EFE6 -5EFAD4C0 -15101C27 -420BBBF4 -1783F9D0 -CA890820 -BD3539D3 -578ED490 -1DA8E967 
-134F8B74 -D6C5A224 -8C8B1F06 -8977D881 -541937F5 -9013604E -4B54F163 -A9030FBF -A9EF1A9C -CB29FA97 -94A3F001 -4069BD15 -C0D5E43E -4E17F81E -90FFEC8B -32D0B0C7 -4044EC4C -7D7935C3 -BCFF474A -9AD1BF76 -2ED2D299 -263F8852 -4073932E -BEDCC036 -7A548119 -ADF45572 -7D8C451E -465569B8 -CA9E87A4 -731803CD -1DB59C5C -A90C6543 -A22221B0 -173A0706 -E040DBBC -941E546B -5503B9D7 -CC5D8948 -F7FE8FB5 -1AA3AAD0 -20229A2A -82CC4C33 -746BC086 -E9F90D08 -2B356E1A -14897456 -D9BC34FB -9056CB82 -1DD450BD -BF64BC9A -166164AD -94363CB2 -ED715F84 -CF4D9ACB -BC0EA0A1 -46E9697E -72428536 -D9569B91 -2B84C8EA -D4CDE0CD -E439EA2C -E19B71D5 -E45E8566 -541A4655 -845B296B -B2E478AE -1A35840C -C94F4E9F -A7AB9164 -AAF8D027 -82252CBF -20106216 -ACC1C08E -57E445D9 -FF68B8B3 -4DAE2000 -B5A7ACEC -1E9BE78A -88DC5BAF -C8A00837 -210B7F85 -E2A072CF -144DA567 -C6467799 -4BC0A056 -C60819E3 -B2B1ED7C -C0ADC696 -56F0E8AB -8D538C1E -879C3079 -6EE2F434 -7B9CD649 -94A30F21 -7DA211F1 -64035D90 -916A9128 -EC9C52F6 -92991BB2 -53F4309A -5AA71420 -F9B67D20 -45706BC1 -E71E83B -B091D34C -BE56577B -7D3CE09C -1A3F1DD2 -F90362F3 -3FD83E38 -E8274EA1 -CDFDF1C2 -62FD4CFB -C3A1DB75 -15E3C709 -B7F81AF6 -E58D41BC -5376E522 -698DCBFB -C76EBF96 -46682F6B -E5C0AE29 -50259284 -91A4E263 -4B03C104 -4B04D974 -914FF9B5 -783CEFF4 -4B232A85 -303E2F77 -6E902ACB -8D630D23 -9BE394EC -461237B1 -22760BF9 -B1F5BDC8 -F8557002 -9CA2BA41 -76418996 -B734B9D6 -C5D4B1EB -59F49A63 -4F9C6BB0 -219811DD -CB536800 -BDAC548A -824F1A42 -5CE7C68B -AC7A5DE8 -86D89A36 -49E127B3 -EE0E8BFB -4997152C -A43493BE -ED7179 -1049E699 -431EBDAC -379BEDAE -FBFB2AF6 -72C255F -F37B5D5C -2D15F748 -7759FCC8 -D6730ACA -52AE1913 -D709F4AA -581518C7 -BE85DA4D -1A24C4D7 -50ABC4ED -7B50804D -194F2CD7 -A56680A8 -1520F41A -A614FFCF -5F66A0AA -46877891 -4926E937 -74E93C8E -62515A1D -8F3F6DF7 -AA4D19C5 -8057E286 -8C90FAB5 -4AD3F2DF -D953B36F -37D20E08 -644A2AFC -5CF19FD -8C9431A7 -EEDC46C5 -F86BE6DC -6C12ED6C -5EDE86A5 -7E59C795 -5EB83E6 -6F36E55D -D9E35BDF -CC7E1D72 -21A42C4F 
-332994C1 -4E460BAE -C9A0955F -C080A0A0 -B2013D50 -E6CB68DE -E9C759D0 -4A1C7783 -D1028E6C -CEAC9773 -189398E7 -B57C20FE -D0D3E05C -6FEC2AAD -17643391 -1291E620 -978A16DB -37BE98F1 -9F773872 -1BEB32F2 -CF3DA84 -3088C11B -2BEB338A -1F308D75 -DD542BFE -C568D953 -BEFE8926 -B9E201D5 -EE6FA353 -826FBE38 -CC867513 -A00D32D6 -CE9B8989 -8D3CA53C -1718DB6C -CE2AABE9 -8FF0C7CD -DBEC0AA6 -E75EC71F -FF266269 -3D7D0B68 -D606EE1E -56F86B85 -6B67916A -B164B35A -D4E7337D -D7A68BBA -A39300CF -D7C72CA5 -A32F6380 -385F8023 -1FF83E95 -F4E55989 -6BED2F68 -C714269C -4D2E9366 -8C1A2FE6 -84756541 -6D353F18 -741B7419 -3BE84DCE -8FFA851F -FCA5E50F -519AC53 -2E36273C -995F9DF1 -A1A165BC -F5E804CE -DD395EDB -7B2D8A34 -FC3F84B1 -19EE5FEA -EB2CA6C2 -866CE073 -B60059C0 -35395446 -BD2B582E -C6E73349 -634D409 -B9AAD6A6 -81B516BC -6933344A -806F4464 -22AA3AB2 -A6FA442A -31DB2D66 -F64AFBC0 -480C5B8F -8CE98937 -F8BF9101 -395669D0 -A560F096 -C8A13D26 -9C62AC71 -C0EA2E1 -BDC5E76D -51C79BBC -E84416E5 -30CF1A91 -E87F3E55 -6CA51768 -4D09690F -D488F996 -ED850E82 -510DA36B -709F9D1 -A6AAD3D4 -E0C4B7BB -1A581776 -2F11B35C -748C7EFD -A2F0722A -A8C6D678 -915B88D8 -42E5FD90 -25B58AA4 -8FF166C2 -B5FC3947 -6427FBD0 -E1C01EC7 -91FD1568 -FE570CB2 -BBEE870B -811FA63F -BE89954D -C83ADB4F -C1B4D237 -65AC0055 -5E2B279A -3FC59820 -B1634DAF -AC02E4BB -B9D8412B -AB22C318 -9E528E95 -F4220FD4 -D83A7E2F -7C013BBC -23849524 -BEED0AF2 -C9AD6213 -4F367F0B -8FBA0438 -EC5899D7 -A4111441 -2D18DAF5 -E7349E7E -57AC8D6A -A27E98E3 -AA1A992A -5E7E0E0E -AE4AF437 -20A80262 -AE20A4C -2CA493A5 -FFC756B3 -68045EAC -A56BE46A -7B3EDB89 -BF17C1AB -445B3851 -FE16BE78 -23D0640A -694D05D9 -D76F0407 -AAC3808D -8D2609FF -BDBECF1E -D6074958 -7EA401E2 -CAD394F3 -4A67FBFE -A2A7FBED -59E0B573 -CEFE2B20 -2BE6EB1 -85FF9E57 -42C7617D -E9E01845 -43F02D16 -DF309F8A -880350B7 -65CE706E -CA6A2B8C -5C38AA9 -6C60FA8 -42BAB35F -9453366B -D5864332 -A25A3164 -F32EDF79 -C757635D -F6712B29 -4C43A3E0 -80D02D7C -A9DB16CA -55270F91 -3FE8F468 -AB0C835E -DD8A2F64 
-D9551C26 -4642684D -69D1935E -9A7A2413 -E0BEC20B -14724D4 -B4A43613 -559418E -1E4A709B -A32F1E7E -EFEFB7A4 -5B26F487 -E6CBF46D -7139D0C0 -EC214DFF -7045BA9D -A9AB902A -CAE7661B -3B50F210 -A065F80E -B353DA84 -E6538D1B -965D76CE -E7F01488 -A1E57BCD -76920B33 -4EC379D2 -43909492 -8F621446 -C9033570 -FEEEB7B8 -E6FFA222 -E8CDDAA2 -3C5C0252 -A63AF91A -D545D3D7 -28ABECA4 -EA14F18F -23FF43B0 -F9F0198 -24568599 -71F0C3DD -63975EB3 -BF3AF93A -7B95B627 -9B0D74D5 -20967FF3 -A621FE0C -6CFF968B -909CF3B8 -79B5DFFF -FC87A4BC -5BB19840 -DB7D8F85 -D4641400 -54449140 -CA93FF98 -85668EF3 -C871B119 -58D44D70 -D93434A8 -453FD827 -906A01B7 -FD446B38 -CB63F172 -E4B0DFD8 -D4FE1E63 -C78583A2 -1D7463DC -7D69FEE0 -93EECB26 -337FCA9A -5D5D7447 -1ACDDE16 -C4CB8D59 -F178B39F -292E3426 -7A1A4318 -DCCE0A6D -EEC1FCB9 -3B264208 -F9D7CB6 -9A23DA53 -58B2B3A4 -654072EB -6CA920C5 -E145E547 -F5FF4A8E -AB7C553C -2A84E62D -6F6AE7B2 -322DB9DE -17E670D3 -7BDFB473 -7CD05987 -5B12A205 -5E9FB325 -542A1478 -FF46384C -69DE91C9 -65B4C13E -78DA8BBF -D85BC864 -3882BAC6 -444A8F13 -886DBD37 -2613D1CA -7CF2397E -513D4563 -1C57D4F0 -32B75B54 -E18B4953 -B59C2B91 -98F11972 -594CCC07 -39BE7B96 -B14E5D15 -ED093697 -953DA37C -6FDD4B93 -8D678AE0 -8B149A9C -B9ED6AC -E4FE210B -44EB15E9 -805CE5D6 -62FF689B -E6C011C6 -42C85768 -EC22FC81 -16858F65 -6A6BC5F1 -E5090FDE -482D0881 -65EAB7D8 -620494B9 -6160FAE2 -542E102 -81BCAF6F -C31AABA5 -BEFFEDB4 -A802765 -68A8ED5B -A47FADCE -3EC1897A -4DBCCC04 -83EAFD50 -6B8E05E7 -4FA1891A -9C2FCD23 -9ED7C877 -15FF9D1F -67DE6F18 -D2932D4B -E4B31601 -60B47713 -C1326724 -1F5FD6C9 -2A54C06B -599854F5 -C2121D8C -2D0FAD3B -762DB289 -CCE2E11E -622AD608 -29836424 -C9F1F838 -4E0F9445 -16C53328 -B9F2FC2E -28FFB831 -7C216796 -E065DC2C -561328B -92EEB73E -BBC5AE83 -2DE49E4B -BB32B7FC -E59D7B63 -B3375867 -5523615E -5532A7B5 -6890882D -21F33D70 -EA855CD7 -CBB7B3A1 -DD9C122E -5CEAC143 -E9E4332A -6F658BF6 -57E90D54 -715AA7A1 -DE7768FF -D8A3302B -1BECD73C -AD442F70 -EBBCB63 -5D25E0FB -EF9854C7 -DEBB6E96 
-61591E99 -BE06EE6B -F74EDD0E -124B1712 -45833671 -1227307A -546B647C -9D2398D1 -DDB609E -EB68EAF7 -F05AFA0B -A6EABBB9 -60B5FC76 -992D25CF -A99743C -5FF72996 -E3D84005 -F47AC3D6 -D92BCBEB -3AD6BC2D -399AE49E -FFD7134A -80856732 -8C92A116 -D23F2A7F -1C1FF7CD -7E97215D -63CE5EAB -1E3D6441 -8CC7E1E2 -3144CABE -1B369565 -E681B9FD -3F72A224 -3146105D -68639F13 -61E4A798 -CF28AF43 -F18B6903 -F4D16333 -557BEB41 -F5DEEE8E -41F036AB -D0DBBD23 -E8E240CB -8FE50644 -8EF8CB38 -F8D6EBA6 -580EDAAC -25F0FEBF -1E09176D -CD156787 -8198153A -3D5D3DE3 -5132C51F -4B39B7FD -15BAA338 -AC2E0CAE -91DC2332 -3632CBA5 -2AD744AC -EF31B613 -6A9D8019 -17DE8C90 -E5CC66F7 -E81411C2 -C5B6931B -E8CF72F1 -ABF2E66 -5B7DEA27 -340E7880 -2B4ED84D -F6E86748 -9C181F92 -55DCA269 -1CEE9C9D -1DB0A271 -B1BB73B1 -2B802754 -596ED430 -25F4A422 -E186EA6C -A0793E1F -B54A8F34 -4EEA557C -A8085CD6 -276D7E7A -F711A6D4 -2534D88B -FA8CEFBD -A7E9E1C7 -EF6F2E -4620FD63 -7955C107 -50E0A968 -81DBA8B6 -92E0F3D4 -C78C01F7 -CFE5AB0F -C290FC3B -F12CC1D9 -56A9B1DA -69AC05FF -964D8EE -EB198C02 -A3D9435 -30D0BD52 -2A1A5868 -DF336813 -14C97AB3 -BA6717D1 -43FC05DC -32A6FFBC -C47276AB -DECB3B2F -1511FAA2 -155693C7 -E5BB37E4 -CB20ED97 -FDFD4014 -FFB25A3D -4F8B2CCE -8EC8D538 -A60DDEE4 -9E6196D0 -8895A4D -A2528B98 -D02F59B9 -47662556 -4FAB84CE -6C7FC2FC -F351CBF4 -F1917707 -B1F2737C -B46CC768 -F87757B9 -A24CA3F5 -74EC8337 -C46290C3 -77BBC380 -1B3087DC -C816F73C -6E2C562B -27C3E900 -4FB423EC -A77B1E37 -51063C80 -432108D2 -11F0367D -1D08F91D -D56068FA -F259DE46 -26CF3619 -6E6AF5EC -10AFB2EE -14F925E9 -5382204 -9F482CE6 -90B0897C -C768AA0B -654ED88C -AD60966B -8EB54FB3 -26275630 -A1C50A7E -21587F6E -9496FD06 -4B768A3F -1798404A -28C6B4D8 -5B579E3D -C79ECD09 -EC63FA6A -162A0135 -7FB7DDB1 -A0167E99 -196F14DB -CCD227F3 -3FB917CC -A3D30D38 -71874379 -E9E489BD -5DA989C2 -4F7C8E1 -F6E0502F -F8445D16 -25CC5FFA -FB06FF63 -CFEA3C99 -E41A8123 -6A5A256C -D7B67156 -50BDCCD2 -8165541 -F067F327 -B1E17258 -6901F3B0 -8B8CA0AC -CBA88A2D -4736E05D -DD5AD020 
-35B501DF -73C67F6F -F2C513F -E6CF7C2D -E6A85B1B -8AE4F7E6 -1ACA7CFC -BCFCC182 -2930369B -642DC973 -990B6772 -681EC185 -164AC235 -9C676AC8 -B200AD7D -F13B8C8D -9D22DB12 -CE95663D -CE956E42 -29485F4F -BC5D5F8E -DAB561EF -C4C15BAA -77B9192C -86E8BF86 -5933ECE -E50B93C6 -F8B0CFB0 -3286711B -DD558ED9 -DD043899 -4AFAB231 -637BB2D7 -87036D19 -9A30430F -27798B63 -4D6E407D -CEE251F5 -ADFFB995 -B5C885B2 -7DF6519C -6EF51C85 -B95DAF30 -65EA99E7 -772FBB19 -49DBE1EC -F386A79B -EECD2F55 -8935CCEC -BAC4C120 -C71F82EF -2DF7E67D -9BA39901 -9614A4E1 -C6304402 -236FC777 -D47A5719 -8098EC85 -799E34F4 -896EBD9 -BAB10372 -32ED359C -6F9F763B -9D517447 -22B55AB9 -8E6F4104 -15BEC5D3 -6252E010 -23B5E8E7 -D0B113BA -965C42E7 -F2A0C19A -24CB582E -1F449982 -2E805DF0 -851608AC -755273C7 -3529A161 -6395258D -C5BD7D0C -27BABE75 -E1628E4A -47E5CD77 -EE797B13 -AB11893E -2F65151B -9CE2B20B -233C28A5 -749A0C91 -846BC1E1 -8C36F8FE -1489CF6A -70FB6BE0 -D0A84133 -9734B9B7 -FF166A04 -D118033F -BDDB2D63 -6F6691F0 -44FB36D0 -EFF2B14E -AC02C863 -ADFD2972 -905F6E84 -7C0008A8 -4A043A53 -D104FDC0 -1687FF25 -E6CF8FCF -120143AE -53F92C72 -19E2E798 -EE8C6B94 -15CEA57D -C8968EBD -D50EFBA3 -A8EA5FE1 -E2D073FB -B4EE195F -8928A91F -6B9EB970 -C24B509C -5D340563 -85FC3F3B -934FA012 -A2AB8533 -A6BD3187 -105DF0E3 -243ADD05 -49C299EF -7A42F84C -C90A1935 -3268B298 -CFA3B2EE -470C6457 -E579D2C4 -BB10428B -78D10FE4 -11F21813 -8424CE28 -EA2B114 -8239463D -9804414B -44B4FD1D -82D50F88 -10AED1B6 -E4768ADE -E7235A66 -C8705714 -936532B0 -15C63108 -92A91B17 -154B2415 -9BF0D15C -5F451388 -1DC102A8 -96CAFC23 -B076C0DE -3EBDCC3D -6B2EE523 -C6777AA9 -F7F48C4A -B1E8ADBD -FA30AC90 -5173D22A -D22827A6 -6504AED6 -3115E6F6 -E8937768 -C5ACC0E9 -366E15FD -AB81C84C -C27AFE96 -7361C8B1 -613A0811 -595F48E4 -1619DFA6 -233D2474 -4C174E1C -E7DCC63F -308FDED9 -502A0AB0 -C5004E90 -B7FBEFEB -918A77FF -F7235A04 -5CCB8B7E -3BA4B1ED -32F47DAC -FF7348B1 -996C8E7 -7203F1B0 -70583A2C -4D8046A0 -551119AD -BE5B31AE -35400CC7 -E8ECD409 -D1C104E0 -1A0858F 
-F26946 -458C8B3F -E8D66E91 -2F3F6384 -B36EC71B -289CD4C6 -6CA9E35 -B198A8B -816873F1 -346D66C9 -BD906E97 -802E5969 -261BBBD1 -9D7605C6 -72C2CDE6 -6C8DBDB5 -D7C8DD7C -F43FB2C8 -A9F384E6 -78FDC918 -6D20841A -20755F34 -F4C6AF99 -19393B53 -A525AE84 -CE881A38 -3D075300 -9B0E4DCA -7EB7E7A1 -4C4FD44A -78483ED6 -32D9D894 -1CCD379A -EA5FEB4B -F7E001D -44FA69A5 -E99F66B6 -9E16CD0B -CD098C41 -6DAAD279 -5FE50411 -CC855E2 -130C6563 -356CD9A1 -BFB318B8 -2E963C0F -DC5A046A -FE16FB -A599857C -F72FE561 -2914E4FE -B247AE8D -6A6F13C0 -B1052C98 -8086E53A -845345BA -D43D5F7A -82B30F5E -4206EB1B -89CCA1AE -86289F6 -567F22DE -25624C58 -6A78EC3F -7EC32D03 -8017213D -3A141336 -D1CA4E6E -FA84C2C -FE670E0 -3238E01 -18DF1794 -A7B900AD -1FCE47CD -14EFDCB1 -C21B04A8 -4C3343A2 -E5E611B7 -ADD06EF0 -32C81695 -201A9FEE -BA8925BB -5182EEED -7DA4917E -CC331235 -C304ABE9 -C2A16075 -937E1C4C -CCA0184E -9DB6C45A -3F2A79C9 -151B469E -162F22DA -D955D54E -E857CC0E -FFF2005B -60AD87FD -85512214 -E0A506A0 -FAF1A145 -9DA17F03 -332D26D1 -9EDF9643 -7BBF2D9D -3414FEA0 -A8FE5964 -D4841879 -3AE4E5EA -BC6B6D60 -950F4693 -70FD0254 -177C7A1F -635FE5B9 -C0C5B6CD -15D1D22F -BA495903 -CC100F38 -A5F1E225 -5AB4584F -AC4731FD -ABB04167 -A0E153B4 -5982BDA9 -8E2EE3AF -D635C631 -7C6154A2 -9F0EEFEE -429B22CA -B1346D4E -6B21663D -6A7EDD8A -DA34A355 -217132F0 -683BA78 -9CD46320 -A5D3BC4F -3194AB03 -DD66F958 -E7506C47 -17EE83A2 -4E4D80A0 -EB56662F -BE889C58 -6F5F6745 -2A05C12F -13D266A0 -3B2B18C9 -EF435E02 -5604DB7F -D35888A2 -CCC34421 -55E24355 -7F607F34 -E493720B -C6A492D7 -7DC6A789 -E01474B2 -97D35C32 -71F32335 -D3083D7 -2327D424 -35EA4BA1 -F5B20C6F -3ED28FCC -453A76AE -192A79A6 -2E64285D -A9463AEB -374E22E0 -92A5CF8F -E707F8E8 -B8E2FF36 -E8E959EC -91D9796C -F03960F6 -B62467FA -8836A487 -6418A93F -60932160 -3B72687C -37BBD7CB -1001C76F -201999EE -5955A1CA -925351D4 -767540E3 -570BBF27 -A073D4D8 -FE96246A -44784995 -232C0150 -AB7BCE2 -D47BF099 -BFA6A422 -70F4BC01 -C2139449 -F9ACB817 -26657111 -13263449 -7989D26A -2E972B3D 
-2F1C1C6 -930E479 -23243FE7 -BA7DDF9C -50C8AB43 -952377D6 -4C6C2B3A -BDAF48F3 -1C0BAE6E -7F6A8C04 -F529B9FA -9ECA4162 -342E6562 -9BD5EB52 -A14DB3C9 -14B1DC2 -4E1BB6D1 -9A1158D5 -73F84EC -685BD9F5 -8CE72161 -5F116605 -BA861D43 -A7150AC2 -391A105B -C8D798E8 -16633750 -33B29C4C -54211362 -34C2D5FB -CA197734 -A635990A -4E606FD7 -9D56673B -89976DD5 -5F2D2794 -81E95955 -9377829 -5DED53B7 -FEAD5592 -1CC6419B -BD3A45C6 -65FACDCA -7EAD0EF3 -EB856702 -D857FA75 -3B92DC0D -E66AE58C -51912618 -C63C75BC -ED05B556 -17EC2B32 -9F692578 -C706059B -D88D5576 -C2661C7B -6D7751C2 -119292CE -418700CA -2A2BC3D8 -CA20D341 -8A8F325D -D4A2DC8D -959FD62 -67883F8E -FBD3686B -6B862363 -F8C13880 -FCACA893 -8215D90C -67567E2D -3B501BED -7AFBFAF4 -2EC3CC34 -B360BFD9 -716C5E9A -907B1432 -E253CBD1 -4DB52F87 -6A37A21F -C860A6A2 -72DFE5D2 -84E0705D -80DDC195 -1ECD4E92 -2D2035A1 -B10A5B53 -C9AA9A79 -E999CC8D -C8C790EB -F7629DFA -93158872 -FAB6E7DF -58A0A3D -6104EAC7 -2BACDD14 -A8E3DE88 -AC4E16F4 -F7042189 -5AA6D923 -F491667D -C769767B -46EE7E69 -CE4BAE4E -FA1BE581 -2BF14278 -5356E813 -6225B503 -D33A6F26 -1A629247 -BD844A35 -E33ADFB -EFE720D6 -3D49752E -AD542CEB -EE36C608 -99FD833C -BA893EF7 -47E4A8A9 -B269C1DC -CEF39BB2 -91FD5B03 -C02E6C1D -29A3817F -70894875 -8C851D1B -8446E920 -8CBAB8AE -D9D7B185 -97987DFC -ADE83493 -4CD1FC4F -1D82738C -27665936 -CE3C907 -990136FD -E1E40CF2 -A3E15CA6 -DB7D4E0F -D8E87ED -FC23DA2F -76A6A0C0 -1C7F403F -380BCEC9 -C2BDE917 -74145443 -14C0823C -8D73C415 -BD7B9DB4 -C83449E7 -364D21C7 -7F01C97E -9ED9F208 -51417FC4 -D557CFF2 -5ED6B81F -BC0EBF41 -608D56CA -60AA90AF -8FC8A8D6 -809BE4D9 -47CD9035 -8CE71201 -B442C067 -A380EF4D -7B74A914 -513ADF78 -63E5C752 -6D4F2B4B -82717D99 -EC19F48C -7D0D1EC5 -944D936F -358B8D1F -D3A7E17D -5E6DFD92 -D6D2B538 -133AC914 -22C4BFCB -A9F4ABBF -7DDED93D -6836C5 -3F10AEBF -71713080 -A1868A02 -EC341DE1 -33D409F1 -41EA5D35 -47F18F89 -7C062A2E -1C66DC90 -D5E11362 -FACCDD77 -D96EA1F2 -31676D3 -B00B9D1D -36F80278 -754F427 -3D8C40A3 -D1FB426C -ED4869D3 
-AD137726 -9704A7D6 -107A0E2D -AAD92A50 -58019B5B -F6FD55A -E876FBF7 -13451AEB -A530BF41 -11FCB24D -EF5D7F1B -BB65E3F3 -DCAF1904 -4262AE51 -8C2318E1 -96E7A13F -DDA281E3 -7B44E7BF -8048EB55 -AFC8D749 -D3F7E592 -23FF8DE -105E2923 -969758CE -B1BF840D -D301EDDB -42A3C6C4 -2C934ECA -B2FB9ACA -452302A4 -C96F49CB -D7342392 -48A6D82C -6B831657 -1A6989B2 -312D282B -9AC1D170 -3FB3070C -D83B178C -D894496D -5FFA91E8 -436E970D -54DC6812 -8CCA890F -96971388 -9CED7192 -216196F -BDBF8734 -441B7DC6 -8FCB2D4 -1C3375E3 -19EE1338 -E8BD4F25 -D65CD246 -85157D36 -34A4CE5A -BFF7BCD5 -41DD5123 -D92D0021 -C0265B3 -652BE05B -7B31FC27 -E8BBC732 -E5DB7686 -2D1EAFF8 -2283884 -CE0E4257 -1936BB27 -6ED44FBF -476ED2B -C249E9F6 -21C0827C -8DA28ECA -707E075B -10EFDAF6 -3DF4B474 -24AC5C3B -81F8A453 -8E1AF272 -E69E1816 -C40F1B4 -5AF2AD1A -C1236EE6 -78507240 -588C4851 -385396C3 -BE2210DE -E8FC3FE2 -B9E7C8F8 -A33939 -B9E8F7DB -F7DF1BA4 -400E6C2F -1139C2B3 -8195BA65 -A6052E5F -29E1F01D -512ABDD6 -ABE172A9 -350BB8FB -63D89399 -6C7CDD2F -F6E20A15 -36947843 -7D26A79A -133DF31B -AB375C67 -35D4F0E9 -8060F5A6 -94893A4F -1B4E1612 -431938A9 -F4F22D48 -E83BC91E -98D9DF02 -7CBB518A -947735EF -16DB6C38 -7BBEB95B -393A60CF -6984032C -F1879BA2 -F014440B -61CAEF50 -F9BAA90B -6D9CDB7A -4A4C3D3F -DD498DC8 -E27FE395 -AEA01257 -15FEAA99 -61A173A1 -28EFFD56 -A27152DF -10C613A7 -47AFE324 -5B4D4B5 -AF67027D -11ADBB9E -F8B22312 -4A9C0C1D -E94F39C8 -9AA4F0E2 -4C394A49 -41ABACE1 -6A96270B -171F3E81 -F29DB470 -A9E7F67E -6B445012 -B53EFB86 -B0AB92A -484432B2 -7C789E2 -116B012D -5A5434DA -83DD29B0 -418637F4 -C9E1FBB7 -FD84E0E9 -BB44A4ED -4847C699 -61807BB2 -F558A9F0 -264F9191 -697F6915 -EBC115CC -A1604C6E -9CD73651 -50ADAD72 -DE3698D8 -DAD728B2 -58F5527 -C58A4754 -C8CCF740 -A5CD4E0A -966E50B5 -6DEA9EAF -66DEDD5B -CE18EE1B -E0293294 -3C0C586C -ED04E099 -A1BB7722 -78AF5367 -3F0FBBB7 -4F623EEA -E3E1A85A -3C8EE1B0 -D2851D20 -F07248A0 -713EBA3 -8CCDC87C -B5ADE0C6 -54DC4354 -F7F43DE5 -AB512848 -69136DAC -71CEFCD8 -5F264F19 -D39D50DA 
-A184BC23 -57F38C31 -34DFEB30 -6B39F755 -60F7B6C8 -EA7FF406 -914CD331 -F4A15FC9 -68DB20A3 -6609D547 -18BD6EF6 -F5DDB763 -9E2C6236 -A9C0CD72 -EE8A864E -FA9A7891 -DCE7F5DE -4E5A9B63 -FBC574F8 -13C26C91 -70A2AD7F -9514018 -7786A6DF -708A442D -8AC98261 -57EC9F69 -D8B92F1F -5525E8BD -CFB927EB -47BA617A -4A71DA0F -9632F7DD -4A00D653 -3FC603A6 -A34C3C9F -EDFCB326 -BA31E996 -4158D5 -888F01B5 -F001473B -D67ACDF1 -587F7E20 -EC9AFA96 -6942D697 -76FEFEE9 -ED260881 -53D50BC9 -43FAA199 -DA4F8CB2 -D7FE8FC6 -7A659755 -394C88C8 -EFA3AFA -87710DA8 -DA1FF12A -C5D4E7F8 -4F0A47D7 -E7C2A799 -EE894D65 -20E4FD0E -8E51626 -17BB7611 -E48021B1 -4320CA45 -5315D225 -39684701 -3E943281 -B3B7B298 -A63E5C66 -11F2EAE5 -2E339781 -9BE79114 -187467D -9479787B -565D0658 -B43DBE73 -67F7EA80 -D1962413 -BF4B89AF -AC03F363 -1587941F -B7A14BD6 -AE1A36A4 -BF710690 -8009F7B0 -FB37D608 -58934215 -327E7B3E -A2BCED7 -57DB9C90 -3E7E56C9 -E554BE2A -6B6273A0 -766F5A68 -503BD141 -586BF1E1 -AF75978E -D93FB741 -75268390 -BDEAB299 -9871DD6A -9C042A7A -4CED46AC -706B559E -9C9CE827 -EFDAEFCB -A1AA3846 -330AAB65 -602F6FCE -DF14BBD9 -8BEF0FE8 -CEC4AC8B -28456573 -95AB0149 -43E11079 -B50D7970 -6F8F89C6 -B96DCC6C -E114C8BD -CF3F36AA -E02901C9 -8B452A2 -8AFEE7A2 -FD7C3D61 -4DA46DA5 -BD5C204A -83FB677D -42615EE0 -3783255C -9FA48033 -270F0FCB -157E94E0 -CC89D359 -715FCAEC -32EF8DFD -829D0BCF -E4FC364E -A629CB9D -7CE1FED6 -D6E9FEEA -24E55CE7 -8BB2DA23 -2FAEBFC0 -AD6EF205 -96142124 -6891653D -C5061A39 -9EA7F89C -D2CA9BBF -544A569 -E908D41E -EAA11FBF -4250EAF7 -6A5E60CF -5F84A53D -4324D154 -57320611 -DC3C692F -24685A97 -40F011E3 -25A224E -3712F01 -30F1AB94 -45F92B8A -450F8D4E -F3EFF92B -EA54D0BB -7E10A58D -D51BDF85 -FA6E7358 -A16E06FB -CA158DFF -9AAFDAD5 -AA48F649 -A4A78E50 -F2F73CFA -519FA6F5 -32933CF5 -9E55F1C2 -806019A2 -E56E0B7E -5F598AA3 -564C6D40 -757BDE5D -30757BFF -B906BD37 -52C6C503 -D2B00C73 -5969C7A1 -84FF193D -E668D8D1 -71E66078 -A200D7C6 -6585828A -FF8864E8 -B9EED36 -12C9F3AB -2F2C4A2D -2998FE0A -A1D47491 -59463A75 
-1347C537 -77000037 -E6AC6FFE -C74CADE7 -83B75335 -767A69EF -4248CAAE -1DAA4A34 -BBCDEA3E -CE177B23 -59449B11 -A9DC563D -85589ACB -8926A959 -CADAB503 -6A1E5AD1 -E79EAAB5 -9C25D798 -B4750BE3 -249329AF -724F7831 -F4D2E094 -CD605F43 -CCC933E3 -4231A56 -8D15BB64 -A7B1E394 -FF2B04CB -7260C6F0 -A483E58C -35E5FBAC -A3D734E9 -64BF02D7 -24F8B625 -FBDA78F6 -6FA335D5 -5CAAE8EA -EBE22B69 -9BE5C3B2 -81028FF8 -E20FD2C2 -CC8506BD -E079C912 -BDE0AE94 -AA4AD182 -AE682162 -AADAA077 -C757CE81 -E4BBF694 -8ACFF53D -D1E85D5E -E29E9979 -9DC46E06 -A8FB412B -CA71D109 -987A6F6D -E5A13D87 -BCF3C6D6 -DA5A6320 -E78095AF -C0C4710D -7F06A362 -FF3D8A8F -428A02D8 -2EBFAF55 -D25B93D4 -344E75CC -ABC855A9 -E3577D95 -843C4274 -F5326A2D -EC6EB288 -7C4C82E6 -A70953D8 -8D8B314 -8772F0BB -3BA5025 -1BE5CFF -9592B505 -B9FE16F1 -EF77DAF1 -4C7B4119 -8B8FEB44 -3542576F -375EBF3E -D0927BE5 -2C6A3AAE -45D18D70 -6126FAB3 -58146389 -FBF50CF3 -3129860E -4B721C54 -95BCFF3C -DDF12106 -1E2428D3 -827395A7 -35266B84 -3CC089A3 -B8198C2A -B8EBD35B -7EBB213B -A93DCCAE -CBB25C42 -2A03D874 -46F6CAA -82986B02 -47EA89A6 -2C3E7BDC -852B0630 -A928EB9 -66A2BC66 -BBB43A54 -A6F55CB7 -FE990460 -5FA8BA0E -1CD34B74 -1C0F2BE4 -FE6C53A3 -C325B6C1 -A980B3D1 -9F031392 -31E17C1B -38B6D6A3 -E30D49E5 -E83F8C4F -BCF13E0E -28124F6E -57AF5DDB -691BCC17 -BD071C94 -DF4984C2 -8579EA0F -92150479 -7BB67579 -58D6EB84 -97754D0C -F569F71B -9990D0B5 -56DAB760 -9E988907 -9679988F -3EC5E4F4 -328D67D9 -317EB4E7 -5E6D7E6A -BFEE035F -D12E6060 -4F2A7A2D -F65F5B73 -54AE1242 -ADAD3A5B -61A81471 -FB09DC55 -72874DB5 -5302F1D1 -8B5F6A90 -82E98E7F -E808315D -DDF5B32F -C35356A6 -6F1FF7AC -1549941D -1460BF8A -D53684E0 -1A384C42 -D319924E -B0B1824A -2772DB36 -BA61B594 -712F9397 -41F5740B -C00A34B2 -F2FCE526 -4C874DC6 -FD5ED831 -301E874C -CE244111 -D6AEAE23 -516AF534 -FC101FD2 -EACEA514 -C23A0FCD -650BA0E6 -5C877E20 -ACB5DAE4 -5E56E78C -1AE6F2A -705046AF -7F53EEE7 -AAB30590 -2A1BD5B6 -300A6D8F -FECD64C6 -A8FF2EC9 -27B583C1 -29CAE718 -66D59871 -16E8C79F -14D20B3B 
-446862AA -1C5EBC93 -3831B437 -556E9FE -B877897C -D6FE7901 -D19ABB8C -964EB757 -D1DAC489 -B60AFF4D -31D01640 -A963359E -E233B856 -58D923CF -EF31455B -EC071BC8 -94F64E2E -F9384093 -36C8A1F -AC4A701F -657CD41F -731CAD58 -374B9753 -EC20E4D1 -E58959AF -E83E1021 -B7C14D53 -A651DDBA -D54BD80B -7291E323 -31310762 -A54A712F -482BD448 -1FC7B562 -EA69143D -4342848D -C4BB4C5F -B0B43A48 -962EF559 -5C395F65 -6C40A83D -AEC344E3 -881E5E3A -42D50FC5 -144B9CA5 -15DE8B4E -AB91DED2 -17FCB1B5 -87804536 -102205D0 -E57C9F29 -5D08E2E1 -A4AA0B4D -4FB1351D -F3BFE5C6 -5C439E04 -33A0A6AB -826A9A49 -D165E206 -229A4A83 -4897797B -396C7F04 -474B2792 -351AD33 -ECCFA3E6 -901B77BB -42B16DDA -FB3F707C -C6816341 -CE19D1AD -8297E119 -4458AB5 -FD9CA7B6 -250517BA -2E23BFF5 -F0D1C983 -699A7882 -557EB3B1 -D0D5822D -D1117539 -F271C507 -9364161D -6793E35B -8AF902C6 -DA5443B8 -EE1E1A0 -B941E448 -DE0E773A -4A41AF87 -D4AA88C2 -80B09F9E -53F2B381 -1C8EA42E -3D15C64F -93FE9251 -B242B629 -F7ED2942 -6AAE674C -EBF19F56 -E299D4A8 -4F22DB1F -20998388 -4742F182 -F6626B60 -992FB48A -26822FD4 -784D31DD -B84CAF35 -B8163E9E -2A27EE0C -FF09CF79 -81C74BBE -C914DAC2 -E768AAF6 -FFA5171 -CA93E6BF -E495891A -482A252B -18F8FD7D -DE52E34B -A4986019 -E363E1CB -EAF53373 -59FEDE9F -2FAEAEB6 -DCE56F6D -F10257B2 -7609DFE6 -4D0D263A -12696B9B -A56E0541 -8F12E1B7 -9E8E5761 -98C5816A -F2F8EFA5 -B91C1CF3 -59A19F9B -9235B967 -A58D23DB -71377517 -C50BCDB3 -60D31A7A -874811FA -58A69900 -CD8198EE -E4FA90EE -51352862 -3654B5D6 -B0442DA9 -5BA67D5E -A9B84B57 -FF61069A -21102ABD -8E6B59D -1DBF72C0 -9772AC77 -F26B2827 -E985C97D -CC311683 -E8216C66 -13E346BE -199D0C57 -578B8B90 -84462520 -7B33C9F9 -E18A5CC0 -8F70C75D -B9773D99 -8A8BDCAF -78B8631C -1AA0C9F2 -76FDD536 -8CECE336 -999E6F4F -29EB2768 -3417B854 -A56B87D4 -CA2F016B -69DED6A1 -8AF8128C -27732A2E -654939F8 -F0DE0291 -501F84CA -815055FE -99B595F6 -627F49E7 -2A7BE8CB -959032DB -7FD03C7E -54ADDCA0 -62EB2DA4 -6E458899 -2FE00E32 -B2E74808 -35803F87 -7369F52B -1586B4DD -61B61CC6 -1BDD1B8F 
-C6BAFAF5 -C4339DA2 -E1D3A0DC -8AD49CC3 -673B67FD -D81B434E -A41C5AA6 -BED70576 -22877C0D -71A3DC2A -FDE1F4AB -4FA1751E -DADBAFB0 -1C44975B -76EE876B -E3B81546 -86466730 -6A3F403E -255A72F8 -2D2AAE1D -77717644 -63E003E8 -40CDF1FA -FF37E1B5 -F0FC3CCA -45BE9807 -D8611D58 -D62AB82 -EE875225 -B8149434 -FFD0F0EB -2F3699E6 -7EBD4BFA -3E393CC6 -39777EAC -FE2A33EF -9AECBEB3 -322B14DC -DA2EB056 -1C942882 -C42C7C32 -A20E0D02 -E91D2834 -D465D9D1 -FC60192C -D3B7FCA1 -1E9B03FA -40323FF4 -DFA3D47B -2C26930E -391E6E18 -E340B164 -36FD76AB -204B0D9D -5F5027DD -FB05E9F -33C3443D -ABF1832A -152FEBC6 -FD83B071 -310222F3 -E07F3402 -61818FE6 -6E14F915 -F89FE609 -86FC4F17 -C860D97A -51B0EF08 -779B9BA3 -6D9C0908 -D14ED3D6 -692E8084 -233DEE29 -B85FF171 -12FAD29A -D37B7593 -AEDD969F -8E76CAF6 -A7FDDB58 -B5B7DFEF -A8881968 -50D65153 -D57A8EEC -7D144C49 -99B10DC -5660CCA2 -C02A1001 -7EE499CE -8C281511 -8B43EDB4 -31E58C4 -E9EAB787 -48BD8C20 -87C33E72 -9FD28F45 -9D8374B3 -3AEBB8FE -D25F7E5E -65B705F8 -ACB7BA8A -C7CE28F4 -1A365014 -12997929 -BAC3250 -3DA4DE9C -D90B5C3B -731BC23E -F952A129 -E5FECF74 -26D6A0 -B61C74A2 -B18937FA -E034B86 -6B3E73E1 -FC5891FE -E6F5F72B -BE380D96 -DB6DA2C1 -8BCAC0F9 -FCE57C36 -10230AAB -8E0B6278 -962C5A14 -4C257AA0 -95B50454 -478B67C6 -4BB1F24A -9DE453A7 -241965D7 -DE5E4EEB -77BCEB46 -A87FC004 -4EF35145 -35910ECD -8900342B -C9A653E2 -9AA2501F -DD4D16E8 -A2340ACF -F846821 -9A2A16D3 -33BF35C8 -185C4C5E -9A3A7865 -6CA5232C -8A93214E -8F9C13E3 -CF212018 -777D973A -3531924D -DAEBD9FA -4C4BA7D1 -C6DD4E96 -72F0CF35 -AD82F177 -B8486F78 -C89FE003 -991E4764 -F49CB023 -14C3A164 -B6B2733F -F78D6623 -F1C9D84E -6CE9487C -68F59E42 -B13A9862 -A60DF7FC -5680C3EE -8DBB03F3 -FE660987 -7F302425 -98915B -3EFAFEFE -819E3A26 -CF086D8 -EDDF6ADF -314D6342 -C7DC4A97 -231D9E12 -C8F0BB37 -E2A20026 -A9539B54 -E2047DA5 -3E5C9D4E -F91C18A5 -37B1EDB1 -DE88277F -765DEA9D -555D803F -6FAD1516 -41299623 -66D3E9F -B040E22F -28C55A65 -F5BBEB1 -8F85CC9 -C1F1FCFB -E0ACADA -FD138889 -F4E18B1B -6EAD0B49 
-38441326 -17AEF5F -5A6EF970 -20ED5B3A -46A95C2B -CA7475C8 -8FA66C0 -3F831698 -E2C27DCC -7AB6C35D -9D979A50 -27F30FC -4FA19438 -321E637C -AD72B955 -C7BE128E -A428B5EC -48817E5 -7EBF668C -8DCEC036 -272C5582 -F8175767 -6ED7A880 -71E2497F -6EE3595D -D2579856 -15439021 -87C91FDA -A5682821 -E3FC8D77 -1545F959 -6341300 -D52520B7 -B0A0FAE6 -6F1C6BFB -226DE897 -4449D2DD -7E378981 -55A93F85 -91BFE157 -434EAE2F -AEC8DFBE -929F369C -DF654EA5 -CC2D5431 -152C1E93 -D800D93B -1969CB8D -46776BE7 -DF3D435C -2CD82C1F -241528BB -88B41461 -19463B47 -CD61AE6F -3C5DFE3 -8053B926 -5D0C9D00 -75240C8 -53A9DCF1 -B217E766 -616C0F89 -E73E36F5 -1E3E0BC3 -B6C474CC -9AFE8273 -AAA496CA -E9770A12 -9C3E2617 -3CB73C1B -2065FF5C -3A2B3E59 -280EF886 -B6A728CC -DDEE48DC -BE40F70 -449577CF -E5D72358 -5648EE48 -F6B9BB34 -F8E354C -84895AB6 -95DA9283 -882AF6A3 -4FBA089C -D27070D7 -17784421 -DDEBCE6E -4E6A43B3 -82AE90D7 -1A524C8F -D1C0C339 -993FA3FB -52CCA574 -523FF9E9 -764B2F69 -621F0749 -5C95BE3E -F2A36CAD -5C92ADE4 -F4238C46 -BDD0079D -CAE6D9F9 -5F3D1307 -9345998 -22C3C499 -631B8B0 -A6B9A88B -471749A7 -6BCD27C8 -5D371C05 -57081397 -F6CEF315 -1BACE19 -B7BF405 -5B6DD011 -BC74DA95 -781349E -F22A975C -72A5A101 -27BB6AED -933B9126 -14FBE3BB -50D095D9 -1CC937B1 -22CBC28 -1A6135EE -197E93EE -26A1CB1B -79BCF079 -A0134157 -9F232A75 -818BB26B -B2339659 -911E36A8 -AF2F9282 -347C34E8 -6255FF5B -1BB79854 -9A16AE8C -2A3D9B7D -93795FED -8284A6D4 -E58090F9 -A36C45A3 -F8065618 -4122FC06 -6F4DC90B -5336936D -F4E4BEDF -7A885091 -E19CB61D -9D398B7E -C9C4AF2D -A1C076FC -BF60AE9B -CBF56B80 -11038EE3 -4B78AA1C -59C72649 -D687CF08 -B182CC2E -43E4B13A -83126FE9 -EB042718 -627C8807 -47474E59 -3D317A4 -33919B88 -E00CD1A3 -3CC1F4AF -2E91597C -CDDAF2BE -3D3A18D6 -5BD6E47E -3D6A5286 -456410A0 -2B51CF4E -B55046FA -FA43946F -F90AC852 -A064AFA3 -F84235C4 -D316F3D2 -1BB0D769 -46905EBA -255EE03A -EB4D2C17 -6AFFB5CF -D755618F -ABECFB93 -594CBE9A -362C1B5 -ADFAAF67 -ECF2110C -E86FA43A -C789EFB4 -D9FDCC95 -F81FFEBB -C239F63C -16BBBF2F 
-B1AFC20E -B00BCEFB -D6B41A49 -A5856CBF -E2753B3C -8C03166E -537BA621 -B268C813 -C1B8E5B7 -1FCDD47C -BB257FF0 -37B89618 -6AD0F548 -C5EB6B1 -482EAE33 -1F898EA -C161076A -8112502F -77D0C22B -B1EF60B9 -D8122593 -D0ED144 -A258567E -7FCB11B8 -FC01313B -8A39DE11 -B9612887 -FAF9C5E9 -AFB24528 -C51F261D -15A83256 -E560FDB -5749D494 -61C88749 -F7C9978C -41583770 -73AF53AF -EDB828F7 -5B9A931F -B33EEF56 -3ED0DC67 -915BF5B -CD090180 -3659A346 -E09A572 -B0EB23 -F35F97ED -8708879A -E3761150 -FBCA868 -8EE5D700 -67931F7B -E3819B8F -FA9DD938 -3C3DD434 -FB62C866 -9D6A734E -2BE14923 -7ED6D7BE -423CF38D -CC4C4156 -898F3254 -405B1D62 -25995FCB -C062465 -12471B35 -6DB351F2 -5F23ABC5 -49EF7D2C -91B401B3 -85DE49E0 -81D81230 -9824E09D -767C5312 -E0744F5 -D99A77B9 -7657BA4F -46CA1289 -5D2AEFAC -ECDA74CB -DBA899D3 -AFC6E7B2 -DA79D8BB -F6508AA8 -6D0E5BF -76DD66F3 -DAA00B8F -C7EB98CF -65189199 -FC2F2235 -4F19D2CD -48D4E497 -67A7643D -777B5F1E -2F089D44 -4E841850 -2D371993 -B3ADA2E9 -421A44E9 -1D470C4D -81DA8998 -71D42D8D -E5F09965 -24BDEA19 -F8FB47FE -1CA01D53 -52A53F9B -B13279A7 -840C17AF -F27507D8 -36AA55D1 -29616808 -E5C25388 -404F7A96 -AF6CAD43 -AA2A8D86 -6D0D5DE5 -B60B5047 -F904AAE0 -9BCCB969 -73FFDDAF -AEC2E379 -DDC3B6E3 -85273FF -4F23EA7 -F1048821 -432CA7F7 -FEEFB49D -2749D00 -F0914942 -878203C4 -AB657B2F -FF754E6E -2A1B63BB -2B094F6C -8DD98DF4 -7E8810E3 -D17A81B6 -BF297F6D -FAE3391B -B28655B9 -2B4507BB -702B2563 -FFC8858A -B8DF3A03 -80018970 -4387C2E2 -81246EAC -1201F4B3 -9AF9F9B6 -29F63494 -98A87F7B -C637C322 -BCFB7066 -3505C623 -10BE77F4 -BE44797A -2EF31DB -C8DB4396 -FA7C2378 -AD3C30C3 -C3AEB714 -58183DA -5D961567 -1E42A328 -94430ED5 -866A3D67 -84B148EA -C823439 -80B57816 -D6395105 -B389CD22 -B574BF88 -F12CE1CF -C5B892E4 -94F6CE69 -9387A05E -C806C5C5 -B2823B0D -64F1253B -DD3B64F8 -4C6980E -BA9825C0 -573D9CE3 -A78DB442 -FB5510FE -C45DE1A4 -66DFA70F -47960901 -68D725DA -ACAE1E6B -60F9360 -8C9D39E -E78D5AE3 -A1A0BB75 -80E4ACAF -A0FD5042 -5E0CBC82 -C0474CF6 -840ADEA6 -6F972DE8 -5D16E0D1 
-86688917 -E08A3150 -BB5FB87 -2EE82F9C -62867EB6 -B592C066 -64852270 -7A7634F0 -58C6FA6D -E83506E1 -7DC3ADA6 -E972E4D5 -4877FABF -CB37BA71 -7BD3131E -9CA64901 -C072094E -A28F50EC -CBBE833A -225D213F -D4266D98 -3DA08099 -22481B45 -899C4804 -3A8630B2 -7227F512 -FDA1F80E -E5515F91 -6EECC93B -4611F561 -47AD2CF3 -ED2A807A -D694C082 -6DEB43CE -9DBD4F70 -8C918F0D -28C5219F -EB23A332 -AAAACB21 -9B053C22 -6C5AEEBE -B1941AF2 -DEFAA083 -255DAF18 -B513F3E8 -CDE47DE0 -43DD2231 -71BA21A -AB772E2E -510C581D -93A91FFB -ED683872 -E561882C -C503A74E -E274473E -3F7D95C2 -AD48EE4C -887342AA -F4D0DC01 -68023FEA -F996EC8B -F4E33500 -8191511B -AFE0184C -8A6D392B -EDFEA13A -AC3E90B2 -94E7E8DF -76F491E4 -D45224EF -D32B9CD0 -C7167945 -2D56F7E1 -994E7AAB -65EDCC15 -AEAF497A -BA11EA7A -53D5812F -DF05201B -10A9356 -ADAEF92 -508293CC -B45B1908 -DD8C2367 -A385DBEF -A77E11BF -DE9B1792 -A9FFDB94 -AE48AD8B -E7798E96 -BAAF5B51 -44648397 -80303BBA -FBE848C0 -74F37EC6 -C9C0EE6E -1D80DBC0 -6CA37DEC -995387B6 -BA2D99D0 -D1869967 -39D0BB45 -36E391CD -12D6AB0F -4CB16A65 -8BED7413 -99987FE8 -55BD54E3 -5568C11B -F63606C4 -AC4D0747 -3032CADB -52407898 -C461B987 -1F3C8122 -C7E1B1FA -BC1BF34A -724843D7 -2DAB612E -F5180E4E -67FE89A9 -B7641E8E -185E5197 -5FDD9BA3 -C6AC4D7E -DB020625 -16ED5F8D -5A2DB8DB -58F7DE17 -8231D332 -9977723E -CFF39DC3 -A8B71C3E -3335D9BC -D34AE6FB -31559150 -E6494443 -D6C0C713 -515C9C4F -AA09B03F -EB32806D -981F48D -DAB324BE -33EDC165 -88011009 -F1120840 -48119894 -137409C1 -7F45314A -DD74A5A7 -C2251ABF -AA45B420 -4ACBA24E -D020B449 -50E55E0F -D78DD382 -F6E82B05 -9957DCE -1410E573 -CA93CF29 -83DBB1D9 -7AD6D5D4 -7921516F -8399BEB7 -DF07D89D -77AB752E -6D6DBA45 -890771BA -E87CBF52 -F90A7590 -78967761 -6617D522 -2EEDE919 -F28BA9E9 -E1E3AA90 -2CBEBEF8 -1D8A37FB -9CE04F02 -680B5A92 -561178BA -A19545D0 -DBDA24E8 -A7863CD1 -F1B829CD -2BCBD34A -B8DFF2A6 -2787D144 -A075B93E -AA7BC361 -B560CBA7 -F8E79316 -417B968B -9FF31C37 -F88ADDD1 -99A6E199 -D3D400B5 -79F33397 -4AF6EA07 -93EC79F3 -F7D9C5B8 
-81D7EE3C -2898D7DC -4B8F67DB -D52D0F0B -10766E32 -E228EA2C -54C96B61 -74A99589 -7E60A886 -8FAF588 -634DD09 -1258CA8E -13E40785 -20861E8F -69BF3004 -E91E2BC8 -583A44C3 -36FD8D36 -572B4202 -BE43EB2C -65F871F3 -723C1C02 -65EBEF48 -8DD407C6 -513D6B1B -150993D3 -4C771124 -A18E6FE4 -C46071C8 -D824EA73 -7A54B17A -4AB1E70C -F7D078B5 -A315F9A4 -9A39A8C8 -CD34D2A6 -8CDEF63D -B273EFA6 -E15B8FB4 -BA2A092B -E540DF83 -33A3B82E -13BB16A4 -4AA79F4 -DCF1D80E -65B77A7E -80CB308 -9A407BA2 -D32D62B0 -DB34DA97 -109F323F -4B07538E -40AD97F -A810835D -6637380B -1ED7261B -DA642F4D -309A47D6 -9009C0E9 -7D9D6E1E -580CCE0B -67F92DAA -1936087F -342D9739 -A191FAF4 -2EF56C33 -EAB9AD66 -FB6E4FF8 -E58333E1 -E42B465D -2D61F572 -9FA12447 -848394C4 -599C9E50 -28675899 -8610332C -968735B8 -ACE06F66 -266C841B -8512CA53 -A25D3088 -D55264D0 -AC3678A9 -D1DF668E -5BEBD716 -DE986F08 -17DB60F5 -B88254C7 -BCA0E5B2 -E78B3459 -494B6F35 -5E0408F6 -A8638621 -62C27360 -8D98C864 -37EDB15B -ADC93344 -4197C21 -FEFE1A30 -ACD03EBB -A3A230A3 -45741EE4 -DE86AD8D -CDBB302B -303A5D5D -A42863D5 -9019ADA8 -EB8E036C -A5558A5D -A4D5AF4B -F04E0726 -C5AEA4BE -FCB9BC09 -3FF2E51A -53E510E9 -86FB3D5B -3031BBDC -1294451B -48879312 -972E95C1 -B8B861CE -FD180B55 -F2930D40 -31C5CF76 -8C132827 -CD696B0C -1446B194 -436D712D -9089677B -493A420F -DF82C186 -377516B8 -20ED2C1E -956EA0C3 -D26B4EEF -BFE59283 -B4D36719 -67B01DDD -6F3CA60 -BF6B98D -1B120FBA -7CF4D06 -83091BF6 -7D3F5D85 -D3E48FAD -E3025BBD -CA30F611 -64D1D991 -6A688C9 -D06F9682 -D346BF -E4DC58EB -4C4F7AB5 -9D5CBB9F -5536C074 -CCD9D1E4 -FADD0C6F -769C50EF -A1F0E40D -72EF3FEF -C421D7AC -182D7491 -3FDDA320 -49F136EE -4EFABBAA -7228A4DE -40A616A9 -EA37E4ED -5DADA164 -2F9C5671 -4D3D4CD3 -3A68B35E -7A26619D -11A14309 -D886253C -8F545687 -3666D9FB -131A5557 -9644C9A3 -FCC47DF7 -7CCDF226 -9FCBB958 -9DB97B96 -630B5596 -1B592B4C -2AB5341F -5817D559 -3C0A5FBE -F65E3830 -1D38ABAB -353E9D4 -41647BE0 -63DC6FC7 -CABC6846 -A7B8001D -2C018A1D -435D877E -3E5F838C -9709BC31 -ACA0EA75 -86A06AB 
-DBB06480 -2A09283F -D3A83953 -90967E13 -D055B4E1 -3365DA22 -E3FFD521 -50205ED7 -E907F5E6 -4D7D054C -C66CA376 -2A72C5C6 -793120B3 -170AC5FD -C4CFDAA2 -21A3CE3A -19F354F0 -FCE7F112 -279C9605 -AA9FBB98 -E269592C -B8E5DE7F -AE0A77D5 -45B4CF97 -6E9EE4C1 -C31F7C62 -D9E8C76C -75925FEC -EE34024B -73FEA2CD -BC601F7D -75776A1F -AC2A0090 -AA6E1956 -64C62B96 -D73C3066 -2F9C7E78 -7F1529BF -5974399A -79D31554 -2D559A9A -458A1BE -A820156A -26764010 -981D62C3 -A5C8534B -F8A5FAE0 -69EA2102 -2F62B77 -2AE14076 -88EB9A0A -36B5EF31 -73E63D55 -D6A15D81 -F5C8A216 -1EEFBC6A -8F16F5B6 -87064008 -7EEAA78F -35A4B04C -AE70F49 -9642CC0B -3199A9B1 -F0E6FE1C -F682DFA -E500C5B1 -AA1132D6 -3B3A2D9F -86C9A21E -BE1422DB -2218AF29 -64512A76 -C4624FF3 -F4E52FE4 -8473989E -269C4193 -B67528F3 -76FD1A6F -ACF6869B -DCEBBBFD -3ED92226 -3FEA0905 -2C4A131E -4CC5DF7B -63E3A62 -988BE035 -BB06A621 -61C2E087 -C2E46B3F -78010D43 -9EC6DFEB -3781CAAF -6D000EA0 -7E952EA8 -2874E849 -FAA54995 -45DB5F56 -8CB1094F -336FA04C -8CCD3F1C -A40704F0 -7AC652EF -83E998AF -8167F5FD -AA7527B6 -543AF979 -F21F16B6 -9A4E00F -1686D0AC -FB0EF404 -EBA9E0F4 -1A9BCC03 -F66D4C53 -4328EB30 -DF52A096 -4A61DDDE -3F19448E -5F3E0EDC -C9FEB2B1 -D8EDCB6 -4EAE672C -47FB8C0A -B4D64E67 -7F5AA323 -38796C27 -3ED30872 -6241EEE1 -AAFD55B6 -F31CA43A -54CE5828 -6D9103FC -665303B -ACD9B1CC -4961E187 -EEDB6D29 -544577B0 -9CC76FDC -718802FC -2EDC02F0 -6735768 -FC351962 -30F3C426 -7BD3050D -4C19A7C -97DC5F3C -720D7F42 -2F735FAA -B067A6FB -4F5EF847 -F500ABE8 -FD9E7B9E -8C37652E -B6189BE1 -BAEF411D -2584FC7F -FEA99C78 -873C71EE -51491598 -8BCC9600 -60A2176C -9D6D9475 -94E1A54E -78124EEF -4DDDA3D5 -DE77F79C -67E3A57B -1E75B5B5 -290C7ADC -30FDC46D -63BDBBD7 -9E61B234 -666593DE -8C7C1E27 -9C723CAF -EF1F2DDE -CA69CD52 -4DE571F3 -A0AD3A46 -902EB90 -D761B7BB -9F209F04 -15B1B5F -5C389CFF -B736B159 -97994EC -A2DBE074 -353360C5 -19E771B -94A72285 -2F4706A0 -64CC6476 -627BE8B7 -90FE94EA -7D02778 -2EEDEFD1 -9A5EF7C -E7B7B437 -F21A3517 -F33DF1F0 -7A865164 -4BFE70A7 
-88A8B45C -C0D320E2 -E93442D3 -AA086067 -11B873ED -1BE002FE -2E799A3 -2AACAAA0 -EB1A91C7 -9FA88D6D -4D956843 -75FB8348 -1584A0EB -4C9D1E1A -413548BF -FA0CF448 -90D1256 -BEB74BF9 -EE7C6510 -765277BA -A6081E2D -E616DE16 -EDFB0495 -12EDC382 -DA64FCA3 -E258DCC3 -92E0B54B -B41B389A -D818F160 -F8F1A55D -17916C31 -DBC21683 -3272DA3 -931C08B3 -9F8EA606 -232CB0D7 -EC870992 -B5F586AB -3ECEF68A -BF7BE567 -2C009224 -C2BE6397 -90EE0A64 -FC3E6BC3 -F1190F98 -1D05D7F8 -52AA90F8 -FF7C45B0 -7F5579FE -6609C7B -9B56CD69 -4A6830B1 -ECF9E86F -62331FA4 -294B7FAB -DC7DFBA7 -4DFA98F8 -CA6447C5 -B0416FDF -5FAD4523 -BBBEA8BD -47DA6D1D -FB598321 -E4A1EBBB -DD0CD41D -77FC8F60 -E4D74C7F -E4B2B064 -52EF568C -91E87E37 -FAF6069 -6E28131E -4D39B103 -59A3C4EC -3AA49C6E -D90E743 -44FC3B9A -7D181041 -AD89A0E7 -616A565F -129B06C1 -907298A -5E98085E -9648A06 -4FE2BFCA -F73FCCCC -62DC849B -BB543EC0 -EF301310 -9801EC66 -43557EE0 -2C382E49 -5151FB5C -3C1DCC5B -DD1C153B -77B3F30 -FDE0F3E1 -C967E75E -D5C68278 -6CC1FA37 -A3FED046 -5DE77F4E -FB7F40F6 -2C9191BB -D089B672 -1E9C6BAC -756468C2 -13352B81 -D2CC73C6 -55B4D4BD -8D6BD8F4 -65F7C5C0 -34A629D9 -79424449 -1CE03FD7 -451FC3D3 -255B39FA -F5F01286 -D1623E81 -4B33EB3D -CB2326EC -9C1189DE -1ED995BA -1298FE00 -A5FDB07F -D80D48D -575374E6 -3664F373 -5ED3FE -2171B235 -413BEA38 -FD67D4A -34F10135 -F4544A59 -16BA37D6 -649879DE -EE8D839B -A545FEF1 -4573F79 -D53FE034 -F4418DBF -92181012 -FB81741F -376DF3DE -19763A21 -47FB6EB7 -7F997F6A -CB94D301 -36461AC2 -A3C2378C -2541AE5 -67D92471 -EC619D04 -3BE21ECC -A441FB3D -A19F0955 -39492084 -6C680626 -C8D37B17 -68B215A0 -8B3846B1 -9B21F1DE -8021097 -EBCC81B2 -E9310566 -AD50FB31 -AF65F01B -739CBC38 -35573201 -F7F58733 -4015ACA -6AA65104 -33202FD0 -B5B1AE8B -C1C66F1C -8BA3BEC9 -E55A2ED0 -49ABBD4B -42DD0652 -A936340A -8EE63409 -5C64BE2D -4D47E9F -745994DC -7CCF78A6 -516C7BF5 -395F9C6 -58E11E54 -73EAA341 -E2D4631A -C3552D0F -4CF36F47 -3FE7034B -EEFCB8C6 -8219943B -E800BB09 -55544B91 -A3292FE8 -89BC5746 -F63B4EE1 -E866DAF9 
-E99B2D4B -BB57E938 -34FB7E1A -EBB559C1 -24838BA -48075561 -9E621607 -998E5D98 -DFCF97D6 -2ECF6FC5 -15EE774F -C3E53B77 -8EF5F879 -763B1F55 -5C90BD9 -267E7FCE -625E8032 -F12724C8 -635FC29F -36AF3D44 -B7D2299C -6E8F0DBE -A76006D5 -723C72E0 -ECA467C2 -5C7DFAD4 -23AC163E -F306D785 -67972062 -57D31D2C -4038D82E -D21756BD -257A9123 -BE96CEDC -917019D1 -362C4F33 -2A305FAF -D4389CC3 -4C435238 -D68F1F0C -372B2979 -A7D6B646 -53A2E4C2 -19E556E -62D716A7 -64918481 -4D3AA8F0 -BA8C6B54 -2468C102 -499AD5B3 -81AE28CD -42E94077 -C969675A -341B58FE -41159415 -ADE3FA94 -FF5F42BA -379C83ED -A7E678F -C2D60CBB -CC75230C -A12B9169 -9CF6EE67 -2DD905D3 -EACCF580 -367F9A41 -477BB16D -8438B576 -756D14EF -980599BD -C181C6AD -99A3EF95 -151D4F12 -CD85DFB7 -695F12C9 -4CF48772 -CB00E50D -B9E2AF4C -97EC19E3 -54810B59 -EC4F2D89 -ED77DA60 -19451088 -D5A52E95 -F6FAA3D3 -F2458DDF -D5AB6D8 -D4042924 -AEBEC90 -505DB6D0 -52505B2A -ED9CB8B3 -DB06312E -C508C5AF -4279ED2F -5C72A874 -15E22E84 -54E967EE -80A13FE3 -EE346264 -3569BCA7 -9AA9263B -2BEC95EA -966F3368 -B74F6A2B -25ADEA56 -30A1BCE9 -71EE7AB3 -74807D9C -E4C0D662 -A62305A1 -6B9FB6F0 -C2CAB758 -E3FA413E -5266648 -754C0A13 -C4FD0D47 -BEFA676C -786AFDA7 -297AA674 -F2895DA0 -72A98C20 -A662B307 -54DFB586 -8147050E -CF7C5819 -760EC4AA -F011339D -2D496BE5 -6FD43E03 -1DFD893E -814ADCDF -B7C38DCA -2149763D -EB58B9BA -9F1B81B2 -94C15E0C -5A9923B7 -6C4E0E11 -C63C3D44 -BF9AA840 -1A3E83C5 -B81CEED7 -7E9FD999 -C1A15CFF -B28F657F -287D5990 -8DB5B01E -E241144B -EB0EA64E -884A8775 -99F5DBEA -3DBB21D6 -CC9472CE -B932014E -22A35325 -7B22DCF6 -882BB2C3 -B47CDAE -28767633 -ED17CB12 -6302A17F -25D91C08 -4D61BFB6 -FA240AD0 -E9DBF560 -F0E9AD0E -835C152D -61E5F126 -C176F8FB -B793DC1C -622E04B -D9FB6072 -60124DA7 -8BEA323D -6C496459 -FBE1E578 -F1C73C9E -6A7C4C58 -43F1DB50 -E9BF93AC -B7DC5C72 -2E68083B -F3DE081F -AAA39D71 -73406424 -B99D0139 -E4FB0C67 -142AB82D -3312CC57 -7A3BEDB7 -6B6E42D2 -F8330EA0 -2FE05DA6 -3E6BB118 -3C73E09 -5FDB1471 -6A226A31 -88792727 -78708ED3 -7A095177 
-9CCAD23E -C3B75180 -226F8D4C -46DD1DBE -D799BE11 -1F852432 -7361585D -97380EF8 -4F1A8127 -2EB7A73C -35B892A7 -933075A1 -2B6D3BEB -BCDCA6F1 -E9409A22 -3A8E5575 -E37AE0CA -97C2866C -BA575BC0 -C16049A3 -79FED5B1 -6356E153 -98789BE6 -47B95292 -FBDEC30C -2275A4D -632C436D -FDCBB3FE -4E0ACB8D -36A77186 -593FDA25 -D9B74A5D -18021557 -3919EF9B -DDD00927 -B0C6DFEE -F761C0C7 -886DBB5 -807A21DF -778F06D1 -27A67D08 -2CBBD43E -2696EC44 -1F916066 -DE884377 -1472CADD -F30A91AE -89C35DEC -84E5487E -792613D4 -1E59B1A9 -B18BF896 -8D7034AC -A144CE10 -F2FFC2AD -2F5FBA7D -FFEDDB97 -7C506BFD -85B811DE -CC3AD4C0 -B6CC2F1 -BFD63C90 -281E81D7 -89E82B39 -E5371DE9 -5BB68ED3 -3DA62382 -3C8CBB1D -4BE92297 -878783A4 -F925E76B -77DE554E -7EB5914E -9B3F869E -F47FA82D -23E861F2 -19E38BDE -C26E5CA7 -317C9C64 -B96B12FC -F6EB43AE -F979DCAE -DD5BE081 -5B11401 -3C4A8866 -38C6F309 -2FE6DD71 -84E2BDC8 -2FA36F63 -F0D171C -8AAD8CA5 -92D5E506 -D4CF4E62 -82DFFC21 -2C686264 -CDDA9A2B -98CF101 -847DC151 -C0FEC6AC -A1638360 -DD36C966 -A6A8635A -F700C63D -48377DC5 -138CB9D1 -857331B5 -4844609F -E29224CA -A5079F42 -3B39EA92 -F020BFFE -4859CF8E -7C1B1E1E -DD95482D -24C31760 -3555FB83 -B1D20BED -403E6587 -D04E4309 -74F63A1 -EAFDC6CD -781795C6 -BA9A1FD1 -60F61FF3 -B93EE92A -7BCCFCDF -477FB17A -B508142D -D2BC8CD8 -F11D8200 -24A8149A -8F00F213 -3822F374 -E37B6219 -4727F504 -12CD7551 -5FD2779 -E8EC01F6 -29CE5CE4 -1EDDBCF9 -69AFBC0F -11B3CB87 -E39AE82B -E66CDCBF -6824DB75 -7183BE54 -12A11956 -ADA59196 -437E5E61 -F1A7F4A1 -671FDE0A -9202817E -33ABACB2 -B0705AB1 -39952407 -D3672EB1 -A03BD94B -B46D2252 -1DC47573 -EE4C78D4 -B6E4D8E0 -12C2206A -5656E1EE -4D9D4988 -35E36416 -3AC9C8F2 -2161B02C -1B5A8615 -62587331 -CC4036C -EACDCEC6 -F40C98DC -9C8FFDE9 -D87FB3C0 -C55AABE7 -1BE31E0B -C0796911 -C08C311 -E41B196D -E4FFB7A3 -2483C766 -FD348C63 -F294631A -7B74B50A -D6416CD9 -66559F6C -A7CE68E0 -ACD88C63 -BB49939B -7987A018 -E1797428 -CE39ECE8 -D7B3DA7 -8F2A3F0C -37E3C72E -21F1A24E -57AFCEF2 -AB8CF2 -15B5A4E9 -94094315 -29C3AEB6 
-A56B4233 -6D57E64E -3A7399D2 -103AE960 -8B93E67E -D5193079 -767DA47D -88AEDE6F -ABCFBF34 -2650782C -7A716475 -C86C9BBA -4423420D -3AF8FD02 -72E202EE -5A264F7B -4E103072 -4DA5A0E0 -59319F97 -B54F9AC -556DF0B3 -ABAD7DC0 -2A715C13 -9D443D0F -54BDC92C -1EC2B967 -80BE3AC2 -FA646E8A -2EE396F1 -8B0315E8 -9F52B6E -DAD30422 -2E9B6CDB -8686D47A -5D9DB3C7 -717E799B -20A4D4E5 -C2DC8AE4 -F630FADD -8C7DF047 -65F4928C -BE66D11E -6004484D -C1B509AB -FAA4C75F -B3D272A0 -7FE6F083 -A54B6584 -FC3292F -4D27DDFC -A1ABC224 -872FED55 -D235AEC -27ED8546 -1B170B2A -CE9E5C0 -2267B02 -285992BD -F855CC8 -8FFB1F6F -C7BDDF81 -349B4F5F -B9B28843 -D5D532A0 -8FD7BE3C -2DB04DE8 -C7D0C2FD -B6822987 -1FE0710D -8EADA490 -A03F99CF -F3E7F902 -F56CCCA3 -CED5B6BF -D6B3DC0D -92AA9FE8 -351208D -A1C9623B -5802547D -3480D77C -404D4E65 -679025BA -905FF962 -B7130CA8 -5AFA9CFE -2A654EFC -26218A8 -473A88A -5E3534CC -771FF1E1 -EADD6296 -DF7157B3 -D48E42E8 -3D6E848B -29CD6C -68732656 -A6C6D52A -B50279FF -705B645 -6DF7F119 -34152606 -72948D92 -18BEE72 -36BE21E3 -C34FD53A -9765DFF -E5C9B4AF -4604B155 -DEAC2388 -7841FE0C -2E275885 -3EE65330 -EB66439B -FF4AB5DE -67EDA5EA -BB722F57 -6A645B7 -DE9DD302 -5AC7601D -371B5D5B -42BAC84D -21C7AA9E -F4ECBE94 -554C8B8A -B7C8BB88 -4C77DB1D -D4D8F3AC -DAB292E5 -85D906E8 -47785703 -9CEE88D4 -7DB86DB7 -694B5A34 -DE77B361 -E8DE3CB9 -315EC35A -A71943BC -C297B8CA -55EA528C -A11AF15D -1490835E -19DA117B -403B0CC3 -FF7DE389 -ED6C22E8 -6F8A8782 -7BF2BA9B -6C95F5DF -F8270769 -AB421268 -F06B05EB -8FF7DE5F -F2AB2FCD -A5EDD602 -31F05712 -3C269177 -67D92F11 -38D8D3C5 -2047013B -8E8BA724 -EB6A773 -5AF14AD1 -49910D46 -C9D6F784 -B44B09CF -1AEA48EF -2F12BD47 -10E3F7C9 -39EA8108 -B88ADC9 -19DAC1B4 -554908DC -587A0A7E -109D1E5B -1920E3CF -BC49C914 -C1EB74A7 -A5E9A494 -5FA5B8C9 -320673C2 -CE643004 -720E4075 -FDFED2FE -89C22F8E -40887408 -3235FF6B -A906F59D -F6F98F12 -7122ECA4 -4CDFCB42 -391F2365 -53AE3667 -6CCCE2E2 -44877A8A -92561CAB -DA5DE0E7 -73B898D6 -2E37229E -ABAAED3C -21087331 -58C85412 
-8BB37690 -1256467F -6EE9FAF7 -DB0895D6 -954EF968 -1C7693BC -5786650F -7D441E12 -10AA9174 -492C6A3B -34374CC9 -98E59E7C -5B7BD4E0 -D1124C9F -B5B3362F -8ECC58C7 -8EB0E23E -72991400 -13DF853B -789E8DFE -D85E60DC -A168D4D -C3B6FA3A -11443EE2 -F63F9FDD -1A14A7A5 -5EEBFD5 -B24D582D -AEA8F125 -4AA038EE -5F6A1A16 -CBADD812 -340605AA -8BD8F6E9 -B85F3A6A -A585AE8C -6D12D2B3 -17C97329 -DBB835B9 -789C3DF4 -E048D462 -BECE080A -506DE5CA -63C4FA5C -7C2D8103 -689A3516 -B218BADF -8B7F0BDE -85B17891 -8888A9C6 -3DFC9FA8 -5F2859CD -FF72AE34 -9EA3FFCA -CF2194D2 -53B56E7F -C7009619 -B127FD51 -3A513DF0 -E9147D4B -2FDF3C37 -22FA1629 -61480015 -57EE267A -EE04DA43 -EB2D289C -2C102144 -B012EED -B1B339C8 -AC1EA89 -3A4420D0 -5623907B -B0613D35 -A70F1B2C -589E3EA7 -F998AB7D -9566E921 -B133DB2D -A3106F6A -EFB4518 -6AA3FB8F -C505C8DF -65032E33 -6D3942DF -333553CC -BF392E2 -6C77F980 -39211AFC -9E0B71C9 -A3BB7123 -7CE16B9A -F15BB634 -BD68DE3E -77BB27AB -BB72659C -BFA916CA -7022CF20 -EA64C93D -B61C32CC -20201879 -148DDADC -58977 -8D5CC2E6 -76E678BD -5655B362 -587EAB4A -599E3DCF -7B470038 -E87E82DB -9088EC5E -ED9F9E4C -3DD98E27 -5AFA5052 -3DF313C4 -BB22A60D -44D97BDA -601409F3 -CD1D3CFE -7EAE52D0 -41ABBAA0 -A1D7C883 -FFE2B4C9 -13717374 -9DD27EC8 -29301EF0 -87953D6C -9309161C -C91DFE7C -DD5EC452 -F6C27DF2 -43B433FD -6D16B93F -92F09DBA -ABB598EF -B49A721A -3A03EE56 -3177D3AF -5D24FD94 -FEF88FB2 -52B3170F -64264DCC -18B683B7 -6B21935F -901A396C -4601FB55 -51F2547E -DD37C23B -35E6B3DF -31ABC979 -C7223449 -ABCA9CFB -A8F57AFA -A097240 -78704130 -7F1D7661 -456C2409 -63E31F62 -FD0D4BB1 -97FCC39 -951A7C93 -893165C9 -E86163CC -25F5694C -8890910A -43F3AE36 -55D414A1 -1ADDD3BA -C7EDFDDF -5A8607BA -219D3208 -27BD79E2 -2E9EA4B8 -5D8F951A -F9E880D5 -B2C7612A -862CCCF3 -7EDC71AC -1B6EA644 -EC3AA9A0 -970224FD -6C0DD16A -C589D1B6 -71AC91EE -C75B0206 -50232786 -316AAD4D -F4D5A31B -E30CCF43 -BD72BEAD -26DE4F8F -56E97741 -9243E978 -F7E2363D -BAE2CF31 -6367CFB1 -B72ED4E6 -75216393 -4626E74F -61194364 -8D6726A8 -458611B8 
-1B536E4D -837AAD1F -F5A226D8 -8BB37701 -31F19003 -8E48DEEE -9DA11E9 -3BBB5BB4 -C6F15B5D -1A53A4EB -69AADAB -4FAE6295 -F0943601 -A449516E -BF7EE395 -176B1370 -F55873EE -553FEEF0 -9F3AB09 -2539B92E -F6803BC -BAA192FB -DBB0AD5A -B9C5415 -F92D0588 -88B9E738 -A033C767 -A1CA1EFF -5AC07200 -AC60C03D -17FE20F9 -B898B9AC -51AF425E -2706FC42 -F2A258E7 -353652D7 -CF3F89EE -63A13050 -5E6A7997 -153FD92F -1D0E8614 -6E504447 -5AAEC133 -9B6E5499 -64D5EAE6 -A29CFBAB -52B44B68 -8DC7C01A -704EB2F1 -395F1F7 -7D897418 -2FC66846 -ECCE81AE -21CD8E31 -B2EFA3D4 -16C4CD41 -D6A21ED0 -944897F9 -F495D730 -B4317C3C -8C074582 -22F6A9D9 -CE4425FB -FB08BCBA -DF07A006 -293AD5BA -BD224A44 -9DA6701B -DAB46DE4 -9F88773B -57CC02C7 -7A6B68E4 -55A54D48 -BCFC1C53 -DF64F920 -A9FE6014 -4C64DB55 -5FE9345F -412A1E48 -45D41945 -23B44D08 -8D5563A2 -26E5E437 -CECDF4D0 -1BE55025 -84329F92 -37C97F8F -C3CDE976 -580955A -C79E1131 -C5BC58E7 -7D14509B -3DE94089 -1B78FE71 -49A0ECD9 -501D09B1 -F30135CD -B0FA41B4 -33B11313 -32AB01B -635EBA76 -666D7FE5 -68CCC93 -59B0ADA5 -B305CBAA -1C553509 -5E564F7C -F057084C -52811FC8 -987465B2 -461DA750 -F0C471BB -3C9D3E64 -73C920AF -355A26B9 -3A1FDD13 -CEA3F7DD -66C0687 -1319291 -9045182D -174C724D -2A491012 -BA53519F -A62B41D8 -F6E1559E -25F93E6F -2A40C5F4 -C63D1AC2 -82598002 -2B81101A -63442848 -3788BB2D -74DDC016 -214CE0F4 -9CBAA8BD -9288E1AC -EF76E528 -719E7BAE -BD579EF6 -4E6B0C62 -6285F757 -9049BDA3 -80BFE3C1 -4344B7A7 -4552F1DD -DE2C0DAC -86346BE2 -A0A897E7 -1797D93 -6CF3C7F0 -7592D9E7 -CFB46F1E -17D6FF93 -87FF1727 -198FC755 -303540EF -78C07416 -46CB391E -8D441653 -3724DA3C -860D4DDF -A99F046E -4B167D86 -E2AFCBE9 -6608F2D2 -4E49A130 -3C64B760 -958BCEB3 -8C784B24 -5E07EF07 -7E6CAC6A -B69765D8 -65897B6D -60A8FB7D -6706E0E1 -142E4310 -15C4944C -F6A075AD -3CF66DF8 -CE1EFE72 -D6495864 -2BDEFA6B -9E511045 -F2E2E9A7 -B71B03EB -15DD8D69 -65E5A555 -52C644AE -301A8F69 -35075232 -17ADE8C4 -A2C808CC -F1A4C57B -D6EE3EF3 -85942F72 -26011F23 -D4211E97 -595E1A12 -6886CE0 -FBD6F396 -D10BD980 
-6615476D -4662EB8F -F80BE955 -93A6E68E -4C3D4CAA -5838D0CB -756FB6E4 -F0BC8312 -EB89BE83 -D34E119E -34F860EC -F371DC73 -BB166E0D -CE86AF89 -C177E633 -A19C1D9B -B1DCBF1B -D7310057 -2452939E -120A830 -F92A9928 -64877B92 -3D69A585 -178187B6 -146C0495 -9A3D8886 -C79478AD -9A429976 -29795A97 -32BD0034 -1EE08CD -8982284A -ED362AC4 -4A1AC734 -6FD164B3 -422ADEBA -9374B593 -BBFA8568 -1C0B26A5 -5DF68365 -CFA1D689 -1C9509C2 -1056EAC4 -D492D000 -64076487 -2C1FB65B -9E1DEBC7 -C5AECD05 -39652664 -57A1B9F9 -3652484 -E8CCF72B -CB7EC405 -7DA97E78 -7ACE1B2C -A5DC0B75 -40C14422 -777B17AF -5AA3FEDF -319C2B1C -AB8EEE5F -159D66E5 -3E479D0 -12AF93DE -55EA550A -38853E1F -FB943864 -781FA52E -4FB9C9FA -377D8866 -8411E296 -641D997F -1933684F -27A62DEF -50E15F68 -755BCD7C -5DF3466F -494A937C -8763C6BD -C04B98E0 -E9E067FF -444151AB -C5FC7398 -5EC7D30E -E0610B7E -76CEBB5 -B15D9821 -37B2D1E2 -CC1249BF -3E064388 -246B17B3 -4A342228 -529E849B -F25F250D -31F3E925 -D1112DCA -DA6A8BC9 -2A7789D8 -C0C2C72D -4BB23226 -68166638 -4EC7519F -D559B4B7 -8035E823 -DFB06DE0 -2B4B86 -83D6F12F -84AC7F7B -7139E98B -C42D8AE3 -2992AD9C -E1E24DA1 -838772BD -CA28D517 -3606947F -B9FDFA59 -6C4F8489 -76DBFFD4 -3F0BFDF6 -1B04AD1B -8BA40134 -842A54F6 -621A0DFE -1F3729FC -C53AFEFE -CD5F1E79 -D2C0C70 -30A4FF4F -D384C76 -D73B9B17 -C74DC3F9 -E5ACD113 -901E6D5D -D376A71F -57BA08F9 -17E25669 -F7485021 -BCD1B9C5 -90C1A916 -EEF9DE6E -6AD37907 -40B05A7B -4A56C1D -901093E1 -5424EEE9 -3336300D -8B1767F3 -707A4B23 -37290194 -13A5E016 -C25902C0 -5C04C3AE -B7D84F4D -D57A495F -EE168042 -1584DB78 -7DBFDBD3 -DBE2218D -9EED8CD4 -2A562C0F -C76F7E04 -8FCA82B8 -7211C54F -8E76E82C -9BAF59A6 -C1E7B9CE -28E9E29F -6746FB40 -7841DDA1 -37D07C7 -88A5CF5 -4B0B8A4E diff --git a/finn-rtllib/memstream/sim/memstream_tb.sv b/finn-rtllib/memstream/sim/memstream_tb.sv new file mode 100644 index 0000000000..4b2e850415 --- /dev/null +++ b/finn-rtllib/memstream/sim/memstream_tb.sv @@ -0,0 +1,212 @@ +/** + * Copyright (c) 2023, Xilinx + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of FINN nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @author Thomas B. 
Preußer + */ + +module memstream_tb; + localparam int unsigned DEPTH = 256; + localparam int unsigned DATA_WIDTH = 32; + + // Global Control + logic clk = 0; + always #5ns clk = !clk; + logic rst; + + // Configuration Interface + logic [31:0] config_address; + logic config_ce; + logic config_we; + logic [DATA_WIDTH-1:0] config_d0; + uwire config_rack; + uwire [DATA_WIDTH-1:0] config_q0; + + // Streamed Output + logic ordy; + uwire ovld; + uwire [DATA_WIDTH-1:0] odat; + + initial begin + config_address = 'x; + config_ce = 0; + config_we = 0; + config_d0 = 'x; + + ordy = 0; + + rst = 1; + repeat(16) @(posedge clk); + rst <= 0; + + // Write Parameters + config_ce <= 1; + config_we <= 1; + for(int unsigned i = 0; i < DEPTH; i++) begin + config_address <= i; + config_d0 <= i; + @(posedge clk); + end + config_address <= 'x; + config_ce <= 0; + config_we <= 0; + config_d0 <= 'x; + + rst <= 1; + @(posedge clk); + rst <= 0; + + // One Round of Stream Read + ordy <= 1; + for(int unsigned i = 0; i < DEPTH; i++) begin + @(posedge clk iff ovld); + assert(odat == i) else begin + $error("Unexpected output: %0d instead of %0d", odat, i); + $stop; + end + end + ordy <= 0; + + // Full Parameter Readback + if(1) begin + automatic logic [DATA_WIDTH-1:0] Q[$] = {}; + + config_ce <= 1; + for(int unsigned i = 0; i < DEPTH; i++) begin + config_address <= i; + @(posedge clk); + Q.push_back(i); + + if(config_rack) begin + automatic logic [DATA_WIDTH-1:0] exp = Q.pop_front(); + assert(config_q0 == exp) else begin + $error("Readback mismatch: %0d instead of %0d", config_q0, exp); + $stop; + end + end + end + config_address <= 'x; + config_ce <= 0; + + while(Q.size) begin + automatic logic [DATA_WIDTH-1:0] exp = Q.pop_front(); + + @(posedge clk iff config_rack); + assert(config_q0 == exp) else begin + $error("Readback mismatch: %0d instead of %0d", config_q0, exp); + $stop; + end + end + end + + repeat(6) @(posedge clk); + + // Another Round of Stream Read + ordy <= 1; + for(int unsigned i = 
0; i < DEPTH; i++) begin + @(posedge clk iff ovld); + assert(odat == i) else begin + $error("Unexpected output: %0d instead of %0d", odat, i); + $stop; + end + end + ordy <= 0; + + // A Round of Stream Read with intermittent Read Backs + if(1) begin + automatic logic [DATA_WIDTH-1:0] Q[$] = {}; + + for(int unsigned i = 0; i < DEPTH; i++) begin + do begin + // Randomly delayed Readiness + if($urandom()%5 != 0) ordy <= 1; + + // Issue and Check Random Read Backs + if($urandom()%9 == 0) begin + automatic int unsigned addr = $urandom() % DEPTH; + config_ce <= 1; + config_address <= addr; + Q.push_back(addr); + end + @(posedge clk); + config_ce <= 0; + config_address <= 'x; + + if(config_rack) begin + automatic logic [DATA_WIDTH-1:0] exp = Q.pop_front(); + assert(config_q0 == exp) else begin + $error("Readback mismatch: %0d instead of %0d", config_q0, exp); + $stop; + end + end + + end while(!ovld || !ordy); + ordy <= 0; + + assert(odat == i) else begin + $error("Unexpected output: %0d instead of %0d", odat, i); + $stop; + end + end + + while(Q.size) begin + automatic logic [DATA_WIDTH-1:0] exp = Q.pop_front(); + + @(posedge clk iff config_rack); + assert(config_q0 == exp) else begin + $error("Readback mismatch: %0d instead of %0d", config_q0, exp); + $stop; + end + end + end + ordy <= 0; + + repeat(2) @(posedge clk); + $display("Test completed."); + $finish; + end + + memstream #( + .DEPTH(DEPTH), + .WIDTH(DATA_WIDTH) + ) dut ( + .clk, .rst, + + .config_address, + .config_ce, + .config_we, + .config_d0, + .config_q0, + .config_rack, + + .ordy, + .ovld, + .odat + ); + +endmodule : memstream_tb diff --git a/finn-rtllib/memstream/sim/tb_memstream.v b/finn-rtllib/memstream/sim/tb_memstream.v deleted file mode 100644 index ad3efad5bd..0000000000 --- a/finn-rtllib/memstream/sim/tb_memstream.v +++ /dev/null @@ -1,369 +0,0 @@ -/* - Copyright (c) 2020, Xilinx - All rights reserved. 
- - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - - * Neither the name of FINN nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*/ - -`timescale 1ns/10ps - -module tb_memstream; - -//parameters to enable/disable axi-mm, set number of streams, set readmemh for memory, set per-stream offsets in memory, set per-stream widths -parameter CONFIG_EN = 1; -parameter NSTREAMS = 4;//1 up to 6 - -parameter MEM_DEPTH = 9216; -parameter MEM_WIDTH = 32; -parameter MEM_INIT = "./"; -parameter MEM_CHECK = "golden.dat"; - -//widths per stream -parameter STRM0_WIDTH = 32; -parameter STRM1_WIDTH = 32; -parameter STRM2_WIDTH = 32; -parameter STRM3_WIDTH = 32; -parameter STRM4_WIDTH = 1; -parameter STRM5_WIDTH = 1; - -//depths per stream -parameter STRM0_DEPTH = 2304; -parameter STRM1_DEPTH = 2304; -parameter STRM2_DEPTH = 2304; -parameter STRM3_DEPTH = 2304; -parameter STRM4_DEPTH = 1; -parameter STRM5_DEPTH = 1; - -//offsets for each stream -parameter STRM0_OFFSET = 0; -parameter STRM1_OFFSET = 2304; -parameter STRM2_OFFSET = 4608; -parameter STRM3_OFFSET = 6912; -parameter STRM4_OFFSET = 0; -parameter STRM5_OFFSET = 0; - - -reg clk; -reg rst; - -reg [31:0] config_address = 0; -reg config_ce = 0; -reg config_we = 0; -reg [31:0] config_d0 = 0; -wire [31:0] config_q0; - -//multiple wire AXI Streams -reg m_axis_0_afull; -reg m_axis_0_tready; -wire m_axis_0_tvalid; -wire [STRM0_WIDTH-1:0] m_axis_0_tdata; - -reg m_axis_1_afull; -reg m_axis_1_tready; -wire m_axis_1_tvalid; -wire [STRM1_WIDTH-1:0] m_axis_1_tdata; - -reg m_axis_2_afull; -reg m_axis_2_tready; -wire m_axis_2_tvalid; -wire [STRM2_WIDTH-1:0] m_axis_2_tdata; - -reg m_axis_3_afull; -reg m_axis_3_tready; -wire m_axis_3_tvalid; -wire [STRM3_WIDTH-1:0] m_axis_3_tdata; - -reg m_axis_4_afull; -reg m_axis_4_tready; -wire m_axis_4_tvalid; -wire [STRM4_WIDTH-1:0] m_axis_4_tdata; - -reg m_axis_5_afull; -reg m_axis_5_tready; -wire m_axis_5_tvalid; -wire [STRM5_WIDTH-1:0] m_axis_5_tdata; - -reg [MEM_WIDTH-1:0] golden[MEM_DEPTH-1:0]; -integer ptr0, ptr1, ptr2, ptr3, ptr4, ptr5; -integer done = 0; -reg [5:0] rng; - -//clock -initial begin - clk = 0; - forever #5 clk = 
~clk; -end - -initial begin - rst = 1; - config_ce = 0; - m_axis_0_afull = 0; - m_axis_1_afull = 0; - m_axis_2_afull = 0; - m_axis_3_afull = 0; - m_axis_4_afull = 0; - m_axis_5_afull = 0; - m_axis_0_tready = 1; - m_axis_1_tready = 1; - m_axis_2_tready = 1; - m_axis_3_tready = 1; - m_axis_4_tready = 1; - m_axis_5_tready = 1; - repeat(100) @(negedge clk); - rst = 0; - #100 - fork - begin - $display("Starting to generate random AFULL"); - while(~done) begin - rng = $random; - m_axis_0_afull = rng[0]; - m_axis_1_afull = rng[1]; - m_axis_2_afull = rng[2]; - m_axis_3_afull = rng[3]; - m_axis_4_afull = rng[4]; - m_axis_5_afull = rng[5]; - @(negedge clk); - end - end - join -end - - -//DUT -memstream -#( - CONFIG_EN, - NSTREAMS, - MEM_DEPTH, - MEM_WIDTH, - MEM_INIT, - - //widths per stream - STRM0_WIDTH, - STRM1_WIDTH, - STRM2_WIDTH, - STRM3_WIDTH, - STRM4_WIDTH, - STRM5_WIDTH, - - //depths per stream - STRM0_DEPTH, - STRM1_DEPTH, - STRM2_DEPTH, - STRM3_DEPTH, - STRM4_DEPTH, - STRM5_DEPTH, - - //offsets for each stream - STRM0_OFFSET, - STRM1_OFFSET, - STRM2_OFFSET, - STRM3_OFFSET, - STRM4_OFFSET, - STRM5_OFFSET -) -dut -( - clk, - ~rst, - - //optional AXI-Lite interface - config_address, - config_ce, - config_we, - config_d0, - config_q0, - - //multiple output AXI Streams - m_axis_0_afull, - m_axis_0_tready, - m_axis_0_tvalid, - m_axis_0_tdata, - - m_axis_1_afull, - m_axis_1_tready, - m_axis_1_tvalid, - m_axis_1_tdata, - - m_axis_2_afull, - m_axis_2_tready, - m_axis_2_tvalid, - m_axis_2_tdata, - - m_axis_3_afull, - m_axis_3_tready, - m_axis_3_tvalid, - m_axis_3_tdata, - - m_axis_4_afull, - m_axis_4_tready, - m_axis_4_tvalid, - m_axis_4_tdata, - - m_axis_5_afull, - m_axis_5_tready, - m_axis_5_tvalid, - m_axis_5_tdata - - -); - -//stream checkers -initial begin - ptr0 = STRM0_OFFSET; - ptr1 = STRM1_OFFSET; - ptr2 = STRM2_OFFSET; - ptr3 = STRM3_OFFSET; - ptr4 = STRM4_OFFSET; - ptr5 = STRM5_OFFSET; - fork - //check stream 0 - begin - $display("Starting stream 0 checker"); - 
while(~done & (NSTREAMS > 0)) begin - @(negedge clk); - if(m_axis_0_tvalid) begin - if(m_axis_0_tdata != golden[ptr0]) begin - $display("Mismatch on stream 0"); - $stop(); - end - //increment pointer - ptr0 = ptr0 + 1; - //rewind pointer if it's reached end - if(ptr0 == (STRM0_OFFSET + STRM0_DEPTH)) - ptr0 = STRM0_OFFSET; - end - end - end - //check stream 1 - begin - $display("Starting stream 1 checker"); - while(~done & (NSTREAMS > 1)) begin - @(negedge clk); - if(m_axis_1_tvalid) begin - if(m_axis_1_tdata != golden[ptr1]) begin - $display("Mismatch on stream 1"); - $stop(); - end - //increment pointer - ptr1 = ptr1 + 1; - //rewind pointer if it's reached end - if(ptr1 == (STRM1_OFFSET + STRM1_DEPTH)) - ptr1 = STRM1_OFFSET; - end - end - end - - //check stream 2 - begin - $display("Starting stream 2 checker"); - while(~done & (NSTREAMS > 2)) begin - @(negedge clk); - if(m_axis_2_tvalid) begin - if(m_axis_2_tdata != golden[ptr2]) begin - $display("Mismatch on stream 2"); - $stop(); - end - //increment pointer - ptr2 = ptr2 + 1; - //rewind pointer if it's reached end - if(ptr2 == (STRM2_OFFSET + STRM2_DEPTH)) - ptr2 = STRM2_OFFSET; - end - end - end - //check stream 3 - begin - $display("Starting stream 3 checker"); - while(~done & (NSTREAMS > 3)) begin - @(negedge clk); - if(m_axis_3_tvalid) begin - if(m_axis_3_tdata != golden[ptr3]) begin - $display("Mismatch on stream 3"); - $stop(); - end - //increment pointer - ptr3 = ptr3 + 1; - //rewind pointer if it's reached end - if(ptr3 == (STRM3_OFFSET + STRM3_DEPTH)) - ptr3 = STRM3_OFFSET; - end - end - end - //check stream 4 - begin - $display("Starting stream 4 checker"); - while(~done & (NSTREAMS > 4)) begin - @(negedge clk); - if(m_axis_4_tvalid) begin - if(m_axis_4_tdata != golden[ptr4]) begin - $display("Mismatch on stream 4"); - $stop(); - end - //increment pointer - ptr4 = ptr4 + 1; - //rewind pointer if it's reached end - if(ptr4 == (STRM4_OFFSET + STRM4_DEPTH)) - ptr4 = STRM4_OFFSET; - end - end - end - 
//check stream 5 - begin - $display("Starting stream 5 checker"); - while(~done & (NSTREAMS > 5)) begin - @(negedge clk); - if(m_axis_5_tvalid) begin - if(m_axis_5_tdata != golden[ptr5]) begin - $display("Mismatch on stream 5"); - $stop(); - end - //increment pointer - ptr5 = ptr5 + 1; - //rewind pointer if it's reached end - if(ptr5 == (STRM5_OFFSET + STRM5_DEPTH)) - ptr5 = STRM5_OFFSET; - end - end - end - join -end - -initial begin - done = 0; - $readmemh(MEM_CHECK,golden); -// $dumpfile("wave.vcd"); -// $dumpvars(0,tb_memstream); - @(negedge rst); - #10000000 - $display("Test done!"); - done = 1; - #1000 - $finish(); -end - -endmodule diff --git a/finn-rtllib/memstream/sim/tb_memstream_writes.v b/finn-rtllib/memstream/sim/tb_memstream_writes.v deleted file mode 100644 index c66807454b..0000000000 --- a/finn-rtllib/memstream/sim/tb_memstream_writes.v +++ /dev/null @@ -1,486 +0,0 @@ -/* - Copyright (c) 2020, Xilinx - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions are met: - - * Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright notice, - this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. - - * Neither the name of FINN nor the names of its - contributors may be used to endorse or promote products derived from - this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE - FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR - SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER - CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, - OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -`timescale 1ns/10ps - -module tb_memstream_writes; - -//parameters to enable/disable axi-mm, set number of streams, set readmemh for memory, set per-stream offsets in memory, set per-stream widths -parameter CONFIG_EN = 1; -parameter NSTREAMS = 2;//1 up to 6 - -parameter MEM_DEPTH = 40; -parameter MEM_WIDTH = 70; - -//widths per stream -parameter STRM0_WIDTH = 70; -parameter STRM1_WIDTH = 32; -parameter STRM2_WIDTH = 32; -parameter STRM3_WIDTH = 32; -parameter STRM4_WIDTH = 1; -parameter STRM5_WIDTH = 1; - -//depths per stream -parameter STRM0_DEPTH = 20; -parameter STRM1_DEPTH = 20; -parameter STRM2_DEPTH = 2304; -parameter STRM3_DEPTH = 2304; -parameter STRM4_DEPTH = 1; -parameter STRM5_DEPTH = 1; - -//offsets for each stream -parameter STRM0_OFFSET = 0; -parameter STRM1_OFFSET = 20; -parameter STRM2_OFFSET = 4608; -parameter STRM3_OFFSET = 6912; -parameter STRM4_OFFSET = 0; -parameter STRM5_OFFSET = 0; - - -reg clk; -reg rst; - -wire awready; -reg awvalid; -reg [31:0] awaddr; -reg [2:0] awprot; -//write data -wire wready; -reg wvalid; -reg [31:0] wdata; -reg [3:0] wstrb; -//burst response -reg bready; -wire bvalid; -wire [1:0] bresp; - -//Read channels -//read address -wire arready; -reg arvalid; -reg [31:0] araddr; -reg [2:0] arprot; -//read data -reg rready; -wire rvalid; -wire [1:0] rresp; -wire [31:0] rdata; - -//multiple wire AXI Streams -reg m_axis_0_afull; -reg m_axis_0_tready; -wire m_axis_0_tvalid; -wire [STRM0_WIDTH-1:0] 
m_axis_0_tdata; - -reg m_axis_1_afull; -reg m_axis_1_tready; -wire m_axis_1_tvalid; -wire [STRM1_WIDTH-1:0] m_axis_1_tdata; - -reg m_axis_2_afull; -reg m_axis_2_tready; -wire m_axis_2_tvalid; -wire [STRM2_WIDTH-1:0] m_axis_2_tdata; - -reg m_axis_3_afull; -reg m_axis_3_tready; -wire m_axis_3_tvalid; -wire [STRM3_WIDTH-1:0] m_axis_3_tdata; - -reg m_axis_4_afull; -reg m_axis_4_tready; -wire m_axis_4_tvalid; -wire [STRM4_WIDTH-1:0] m_axis_4_tdata; - -reg m_axis_5_afull; -reg m_axis_5_tready; -wire m_axis_5_tvalid; -wire [STRM5_WIDTH-1:0] m_axis_5_tdata; - -reg [MEM_WIDTH-1:0] golden[MEM_DEPTH-1:0]; -reg [MEM_WIDTH-1:0] gword; -integer ptr0, ptr1, ptr2, ptr3, ptr4, ptr5; -integer done = 0; -integer i, j; -reg [5:0] rng; - -parameter NFOLDS_PER_WORD = (MEM_WIDTH+31)/32; - -task axi_write; - input [MEM_WIDTH-1:0] data; - input [31:0] adr; - begin - for(j=0; j<(1<<$clog2(NFOLDS_PER_WORD)); j=j+1) begin - @(negedge clk); - awvalid = 1; - wvalid = 1; - wdata = data>>(j*32); - awaddr = (adr*(1<<$clog2(NFOLDS_PER_WORD))+j)*4; - fork - begin - @(posedge awready); - @(posedge clk) awvalid = 0; - end - begin - @(posedge wready); - @(posedge clk) wvalid = 0; - end - join - @(posedge clk); - end - end -endtask - -task axi_read; - input [31:0] adr; - output [MEM_WIDTH-1:0] data; - begin - data = 0; - for(j=0; j 0)) begin - @(negedge clk); - if(m_axis_0_tvalid & m_axis_0_tready) begin - if(m_axis_0_tdata != golden[ptr0]) begin - $display("Mismatch on stream 0"); - $stop(); - end - //increment pointer - ptr0 = ptr0 + 1; - //rewind pointer if it's reached end - if(ptr0 == (STRM0_OFFSET + STRM0_DEPTH)) - ptr0 = STRM0_OFFSET; - end - end - end - //check stream 1 - begin - $display("Starting stream 1 checker"); - while(~done & (NSTREAMS > 1)) begin - @(negedge clk); - if(m_axis_1_tvalid & m_axis_1_tready) begin - if(m_axis_1_tdata != golden[ptr1]) begin - $display("Mismatch on stream 1"); - $stop(); - end - //increment pointer - ptr1 = ptr1 + 1; - //rewind pointer if it's reached end - 
if(ptr1 == (STRM1_OFFSET + STRM1_DEPTH)) - ptr1 = STRM1_OFFSET; - end - end - end - //check stream 2 - begin - $display("Starting stream 2 checker"); - while(~done & (NSTREAMS > 2)) begin - @(negedge clk); - if(m_axis_2_tvalid & m_axis_2_tready) begin - if(m_axis_2_tdata != golden[ptr2]) begin - $display("Mismatch on stream 2"); - $stop(); - end - //increment pointer - ptr2 = ptr2 + 1; - //rewind pointer if it's reached end - if(ptr2 == (STRM2_OFFSET + STRM2_DEPTH)) - ptr2 = STRM2_OFFSET; - end - end - end - //check stream 3 - begin - $display("Starting stream 3 checker"); - while(~done & (NSTREAMS > 3)) begin - @(negedge clk); - if(m_axis_3_tvalid & m_axis_3_tready) begin - if(m_axis_3_tdata != golden[ptr3]) begin - $display("Mismatch on stream 3"); - $stop(); - end - //increment pointer - ptr3 = ptr3 + 1; - //rewind pointer if it's reached end - if(ptr3 == (STRM3_OFFSET + STRM3_DEPTH)) - ptr3 = STRM3_OFFSET; - end - end - end - //check stream 4 - begin - $display("Starting stream 4 checker"); - while(~done & (NSTREAMS > 4)) begin - @(negedge clk); - if(m_axis_4_tvalid & m_axis_4_tready) begin - if(m_axis_4_tdata != golden[ptr4]) begin - $display("Mismatch on stream 4"); - $stop(); - end - //increment pointer - ptr4 = ptr4 + 1; - //rewind pointer if it's reached end - if(ptr4 == (STRM4_OFFSET + STRM4_DEPTH)) - ptr4 = STRM4_OFFSET; - end - end - end - //check stream 5 - begin - $display("Starting stream 5 checker"); - while(~done & (NSTREAMS > 5)) begin - @(negedge clk); - if(m_axis_5_tvalid & m_axis_5_tready) begin - if(m_axis_5_tdata != golden[ptr5]) begin - $display("Mismatch on stream 5"); - $stop(); - end - //increment pointer - ptr5 = ptr5 + 1; - //rewind pointer if it's reached end - if(ptr5 == (STRM5_OFFSET + STRM5_DEPTH)) - ptr5 = STRM5_OFFSET; - end - end - end - join -end - -initial begin - done = 0; - @(negedge rst); - $dumpfile("wave.vcd"); - $dumpvars(0,tb_memstream_writes); - #50000 - $display("Test done!"); - done = 1; - #1000 - $finish(); -end - 
-endmodule diff --git a/finn-rtllib/memstream/xgui/memstream_v1_0.tcl b/finn-rtllib/memstream/xgui/memstream_v1_0.tcl index 87565bc561..e802d81c79 100644 --- a/finn-rtllib/memstream/xgui/memstream_v1_0.tcl +++ b/finn-rtllib/memstream/xgui/memstream_v1_0.tcl @@ -8,42 +8,21 @@ proc init_gui { IPINST } { #Adding Page set Page_0 [ipgui::add_page $IPINST -name "Page 0"] ipgui::add_param $IPINST -name "AXILITE_ADDR_WIDTH" -parent ${Page_0} - ipgui::add_param $IPINST -name "CONFIG_EN" -parent ${Page_0} - ipgui::add_param $IPINST -name "MEM_DEPTH" -parent ${Page_0} - ipgui::add_param $IPINST -name "MEM_INIT" -parent ${Page_0} - ipgui::add_param $IPINST -name "MEM_WIDTH" -parent ${Page_0} - ipgui::add_param $IPINST -name "NSTREAMS" -parent ${Page_0} - ipgui::add_param $IPINST -name "RAM_STYLE" -parent ${Page_0} -widget comboBox - ipgui::add_param $IPINST -name "STRM0_DEPTH" -parent ${Page_0} - ipgui::add_param $IPINST -name "STRM0_OFFSET" -parent ${Page_0} - ipgui::add_param $IPINST -name "STRM0_WIDTH" -parent ${Page_0} - ipgui::add_param $IPINST -name "STRM1_DEPTH" -parent ${Page_0} - ipgui::add_param $IPINST -name "STRM1_OFFSET" -parent ${Page_0} - ipgui::add_param $IPINST -name "STRM1_WIDTH" -parent ${Page_0} - ipgui::add_param $IPINST -name "STRM2_DEPTH" -parent ${Page_0} - ipgui::add_param $IPINST -name "STRM2_OFFSET" -parent ${Page_0} - ipgui::add_param $IPINST -name "STRM2_WIDTH" -parent ${Page_0} - ipgui::add_param $IPINST -name "STRM3_DEPTH" -parent ${Page_0} - ipgui::add_param $IPINST -name "STRM3_OFFSET" -parent ${Page_0} - ipgui::add_param $IPINST -name "STRM3_WIDTH" -parent ${Page_0} - ipgui::add_param $IPINST -name "STRM4_DEPTH" -parent ${Page_0} - ipgui::add_param $IPINST -name "STRM4_OFFSET" -parent ${Page_0} - ipgui::add_param $IPINST -name "STRM4_WIDTH" -parent ${Page_0} - ipgui::add_param $IPINST -name "STRM5_DEPTH" -parent ${Page_0} - ipgui::add_param $IPINST -name "STRM5_OFFSET" -parent ${Page_0} - ipgui::add_param $IPINST -name "STRM5_WIDTH" -parent 
${Page_0} - - + ipgui::add_param $IPINST -name "DEPTH" -parent ${Page_0} + ipgui::add_param $IPINST -name "INIT_FILE" -parent ${Page_0} + ipgui::add_param $IPINST -name "RAM_STYLE" -parent ${Page_0} + ipgui::add_param $IPINST -name "WIDTH" -parent ${Page_0} } -proc update_PARAM_VALUE.AXILITE_ADDR_WIDTH { PARAM_VALUE.AXILITE_ADDR_WIDTH PARAM_VALUE.MEM_DEPTH PARAM_VALUE.MEM_WIDTH } { +proc update_PARAM_VALUE.AXILITE_ADDR_WIDTH { PARAM_VALUE.AXILITE_ADDR_WIDTH PARAM_VALUE.DEPTH PARAM_VALUE.WIDTH } { # Procedure called to update AXILITE_ADDR_WIDTH when any of the dependent parameters in the arguments change + set AXILITE_ADDR_WIDTH ${PARAM_VALUE.AXILITE_ADDR_WIDTH} - set MEM_DEPTH ${PARAM_VALUE.MEM_DEPTH} - set MEM_WIDTH ${PARAM_VALUE.MEM_WIDTH} - set values(MEM_DEPTH) [get_property value $MEM_DEPTH] - set values(MEM_WIDTH) [get_property value $MEM_WIDTH] - set_property value [gen_USERPARAMETER_AXILITE_ADDR_WIDTH_VALUE $values(MEM_DEPTH) $values(MEM_WIDTH)] $AXILITE_ADDR_WIDTH + set DEPTH ${PARAM_VALUE.DEPTH} + set WIDTH ${PARAM_VALUE.WIDTH} + set values(DEPTH) [get_property value $DEPTH] + set values(WIDTH) [get_property value $WIDTH] + set_property value [gen_USERPARAMETER_AXILITE_ADDR_WIDTH_VALUE $values(DEPTH) $values(WIDTH)] $AXILITE_ADDR_WIDTH } proc validate_PARAM_VALUE.AXILITE_ADDR_WIDTH { PARAM_VALUE.AXILITE_ADDR_WIDTH } { @@ -51,48 +30,21 @@ proc validate_PARAM_VALUE.AXILITE_ADDR_WIDTH { PARAM_VALUE.AXILITE_ADDR_WIDTH } return true } -proc update_PARAM_VALUE.CONFIG_EN { PARAM_VALUE.CONFIG_EN } { - # Procedure called to update CONFIG_EN when any of the dependent parameters in the arguments change +proc update_PARAM_VALUE.DEPTH { PARAM_VALUE.DEPTH } { + # Procedure called to update DEPTH when any of the dependent parameters in the arguments change } -proc validate_PARAM_VALUE.CONFIG_EN { PARAM_VALUE.CONFIG_EN } { - # Procedure called to validate CONFIG_EN +proc validate_PARAM_VALUE.DEPTH { PARAM_VALUE.DEPTH } { + # Procedure called to validate DEPTH return true 
} -proc update_PARAM_VALUE.MEM_DEPTH { PARAM_VALUE.MEM_DEPTH } { - # Procedure called to update MEM_DEPTH when any of the dependent parameters in the arguments change +proc update_PARAM_VALUE.INIT_FILE { PARAM_VALUE.INIT_FILE } { + # Procedure called to update INIT_FILE when any of the dependent parameters in the arguments change } -proc validate_PARAM_VALUE.MEM_DEPTH { PARAM_VALUE.MEM_DEPTH } { - # Procedure called to validate MEM_DEPTH - return true -} - -proc update_PARAM_VALUE.MEM_INIT { PARAM_VALUE.MEM_INIT } { - # Procedure called to update MEM_INIT when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.MEM_INIT { PARAM_VALUE.MEM_INIT } { - # Procedure called to validate MEM_INIT - return true -} - -proc update_PARAM_VALUE.MEM_WIDTH { PARAM_VALUE.MEM_WIDTH } { - # Procedure called to update MEM_WIDTH when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.MEM_WIDTH { PARAM_VALUE.MEM_WIDTH } { - # Procedure called to validate MEM_WIDTH - return true -} - -proc update_PARAM_VALUE.NSTREAMS { PARAM_VALUE.NSTREAMS } { - # Procedure called to update NSTREAMS when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.NSTREAMS { PARAM_VALUE.NSTREAMS } { - # Procedure called to validate NSTREAMS +proc validate_PARAM_VALUE.INIT_FILE { PARAM_VALUE.INIT_FILE } { + # Procedure called to validate INIT_FILE return true } @@ -105,192 +57,29 @@ proc validate_PARAM_VALUE.RAM_STYLE { PARAM_VALUE.RAM_STYLE } { return true } -proc update_PARAM_VALUE.STRM0_DEPTH { PARAM_VALUE.STRM0_DEPTH } { - # Procedure called to update STRM0_DEPTH when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.STRM0_DEPTH { PARAM_VALUE.STRM0_DEPTH } { - # Procedure called to validate STRM0_DEPTH - return true -} - -proc update_PARAM_VALUE.STRM0_OFFSET { PARAM_VALUE.STRM0_OFFSET } { - # Procedure called to update STRM0_OFFSET when any of the dependent 
parameters in the arguments change -} - -proc validate_PARAM_VALUE.STRM0_OFFSET { PARAM_VALUE.STRM0_OFFSET } { - # Procedure called to validate STRM0_OFFSET - return true -} - -proc update_PARAM_VALUE.STRM0_WIDTH { PARAM_VALUE.STRM0_WIDTH } { - # Procedure called to update STRM0_WIDTH when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.STRM0_WIDTH { PARAM_VALUE.STRM0_WIDTH } { - # Procedure called to validate STRM0_WIDTH - return true -} - -proc update_PARAM_VALUE.STRM1_DEPTH { PARAM_VALUE.STRM1_DEPTH } { - # Procedure called to update STRM1_DEPTH when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.STRM1_DEPTH { PARAM_VALUE.STRM1_DEPTH } { - # Procedure called to validate STRM1_DEPTH - return true -} - -proc update_PARAM_VALUE.STRM1_OFFSET { PARAM_VALUE.STRM1_OFFSET } { - # Procedure called to update STRM1_OFFSET when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.STRM1_OFFSET { PARAM_VALUE.STRM1_OFFSET } { - # Procedure called to validate STRM1_OFFSET - return true -} - -proc update_PARAM_VALUE.STRM1_WIDTH { PARAM_VALUE.STRM1_WIDTH } { - # Procedure called to update STRM1_WIDTH when any of the dependent parameters in the arguments change +proc update_PARAM_VALUE.WIDTH { PARAM_VALUE.WIDTH } { + # Procedure called to update WIDTH when any of the dependent parameters in the arguments change } -proc validate_PARAM_VALUE.STRM1_WIDTH { PARAM_VALUE.STRM1_WIDTH } { - # Procedure called to validate STRM1_WIDTH +proc validate_PARAM_VALUE.WIDTH { PARAM_VALUE.WIDTH } { + # Procedure called to validate WIDTH return true } -proc update_PARAM_VALUE.STRM2_DEPTH { PARAM_VALUE.STRM2_DEPTH } { - # Procedure called to update STRM2_DEPTH when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.STRM2_DEPTH { PARAM_VALUE.STRM2_DEPTH } { - # Procedure called to validate STRM2_DEPTH - return true -} - -proc 
update_PARAM_VALUE.STRM2_OFFSET { PARAM_VALUE.STRM2_OFFSET } { - # Procedure called to update STRM2_OFFSET when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.STRM2_OFFSET { PARAM_VALUE.STRM2_OFFSET } { - # Procedure called to validate STRM2_OFFSET - return true -} - -proc update_PARAM_VALUE.STRM2_WIDTH { PARAM_VALUE.STRM2_WIDTH } { - # Procedure called to update STRM2_WIDTH when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.STRM2_WIDTH { PARAM_VALUE.STRM2_WIDTH } { - # Procedure called to validate STRM2_WIDTH - return true -} - -proc update_PARAM_VALUE.STRM3_DEPTH { PARAM_VALUE.STRM3_DEPTH } { - # Procedure called to update STRM3_DEPTH when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.STRM3_DEPTH { PARAM_VALUE.STRM3_DEPTH } { - # Procedure called to validate STRM3_DEPTH - return true -} - -proc update_PARAM_VALUE.STRM3_OFFSET { PARAM_VALUE.STRM3_OFFSET } { - # Procedure called to update STRM3_OFFSET when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.STRM3_OFFSET { PARAM_VALUE.STRM3_OFFSET } { - # Procedure called to validate STRM3_OFFSET - return true -} - -proc update_PARAM_VALUE.STRM3_WIDTH { PARAM_VALUE.STRM3_WIDTH } { - # Procedure called to update STRM3_WIDTH when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.STRM3_WIDTH { PARAM_VALUE.STRM3_WIDTH } { - # Procedure called to validate STRM3_WIDTH - return true -} - -proc update_PARAM_VALUE.STRM4_DEPTH { PARAM_VALUE.STRM4_DEPTH } { - # Procedure called to update STRM4_DEPTH when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.STRM4_DEPTH { PARAM_VALUE.STRM4_DEPTH } { - # Procedure called to validate STRM4_DEPTH - return true -} - -proc update_PARAM_VALUE.STRM4_OFFSET { PARAM_VALUE.STRM4_OFFSET } { - # Procedure called to update STRM4_OFFSET when any of the dependent 
parameters in the arguments change -} - -proc validate_PARAM_VALUE.STRM4_OFFSET { PARAM_VALUE.STRM4_OFFSET } { - # Procedure called to validate STRM4_OFFSET - return true -} -proc update_PARAM_VALUE.STRM4_WIDTH { PARAM_VALUE.STRM4_WIDTH } { - # Procedure called to update STRM4_WIDTH when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.STRM4_WIDTH { PARAM_VALUE.STRM4_WIDTH } { - # Procedure called to validate STRM4_WIDTH - return true -} - -proc update_PARAM_VALUE.STRM5_DEPTH { PARAM_VALUE.STRM5_DEPTH } { - # Procedure called to update STRM5_DEPTH when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.STRM5_DEPTH { PARAM_VALUE.STRM5_DEPTH } { - # Procedure called to validate STRM5_DEPTH - return true -} - -proc update_PARAM_VALUE.STRM5_OFFSET { PARAM_VALUE.STRM5_OFFSET } { - # Procedure called to update STRM5_OFFSET when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.STRM5_OFFSET { PARAM_VALUE.STRM5_OFFSET } { - # Procedure called to validate STRM5_OFFSET - return true -} - -proc update_PARAM_VALUE.STRM5_WIDTH { PARAM_VALUE.STRM5_WIDTH } { - # Procedure called to update STRM5_WIDTH when any of the dependent parameters in the arguments change -} - -proc validate_PARAM_VALUE.STRM5_WIDTH { PARAM_VALUE.STRM5_WIDTH } { - # Procedure called to validate STRM5_WIDTH - return true -} - - -proc update_MODELPARAM_VALUE.CONFIG_EN { MODELPARAM_VALUE.CONFIG_EN PARAM_VALUE.CONFIG_EN } { +proc update_MODELPARAM_VALUE.DEPTH { MODELPARAM_VALUE.DEPTH PARAM_VALUE.DEPTH } { # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.CONFIG_EN}] ${MODELPARAM_VALUE.CONFIG_EN} + set_property value [get_property value ${PARAM_VALUE.DEPTH}] ${MODELPARAM_VALUE.DEPTH} } -proc update_MODELPARAM_VALUE.NSTREAMS { MODELPARAM_VALUE.NSTREAMS PARAM_VALUE.NSTREAMS } { +proc 
update_MODELPARAM_VALUE.WIDTH { MODELPARAM_VALUE.WIDTH PARAM_VALUE.WIDTH } { # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.NSTREAMS}] ${MODELPARAM_VALUE.NSTREAMS} + set_property value [get_property value ${PARAM_VALUE.WIDTH}] ${MODELPARAM_VALUE.WIDTH} } -proc update_MODELPARAM_VALUE.MEM_DEPTH { MODELPARAM_VALUE.MEM_DEPTH PARAM_VALUE.MEM_DEPTH } { +proc update_MODELPARAM_VALUE.INIT_FILE { MODELPARAM_VALUE.INIT_FILE PARAM_VALUE.INIT_FILE } { # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.MEM_DEPTH}] ${MODELPARAM_VALUE.MEM_DEPTH} -} - -proc update_MODELPARAM_VALUE.MEM_WIDTH { MODELPARAM_VALUE.MEM_WIDTH PARAM_VALUE.MEM_WIDTH } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.MEM_WIDTH}] ${MODELPARAM_VALUE.MEM_WIDTH} -} - -proc update_MODELPARAM_VALUE.MEM_INIT { MODELPARAM_VALUE.MEM_INIT PARAM_VALUE.MEM_INIT } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.MEM_INIT}] ${MODELPARAM_VALUE.MEM_INIT} + set_property value [get_property value ${PARAM_VALUE.INIT_FILE}] ${MODELPARAM_VALUE.INIT_FILE} } proc update_MODELPARAM_VALUE.RAM_STYLE { MODELPARAM_VALUE.RAM_STYLE PARAM_VALUE.RAM_STYLE } { @@ -298,96 +87,6 @@ proc update_MODELPARAM_VALUE.RAM_STYLE { MODELPARAM_VALUE.RAM_STYLE PARAM_VALUE. 
set_property value [get_property value ${PARAM_VALUE.RAM_STYLE}] ${MODELPARAM_VALUE.RAM_STYLE} } -proc update_MODELPARAM_VALUE.STRM0_WIDTH { MODELPARAM_VALUE.STRM0_WIDTH PARAM_VALUE.STRM0_WIDTH } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STRM0_WIDTH}] ${MODELPARAM_VALUE.STRM0_WIDTH} -} - -proc update_MODELPARAM_VALUE.STRM1_WIDTH { MODELPARAM_VALUE.STRM1_WIDTH PARAM_VALUE.STRM1_WIDTH } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STRM1_WIDTH}] ${MODELPARAM_VALUE.STRM1_WIDTH} -} - -proc update_MODELPARAM_VALUE.STRM2_WIDTH { MODELPARAM_VALUE.STRM2_WIDTH PARAM_VALUE.STRM2_WIDTH } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STRM2_WIDTH}] ${MODELPARAM_VALUE.STRM2_WIDTH} -} - -proc update_MODELPARAM_VALUE.STRM3_WIDTH { MODELPARAM_VALUE.STRM3_WIDTH PARAM_VALUE.STRM3_WIDTH } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STRM3_WIDTH}] ${MODELPARAM_VALUE.STRM3_WIDTH} -} - -proc update_MODELPARAM_VALUE.STRM4_WIDTH { MODELPARAM_VALUE.STRM4_WIDTH PARAM_VALUE.STRM4_WIDTH } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STRM4_WIDTH}] ${MODELPARAM_VALUE.STRM4_WIDTH} -} - -proc update_MODELPARAM_VALUE.STRM5_WIDTH { MODELPARAM_VALUE.STRM5_WIDTH PARAM_VALUE.STRM5_WIDTH } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STRM5_WIDTH}] ${MODELPARAM_VALUE.STRM5_WIDTH} -} - -proc update_MODELPARAM_VALUE.STRM0_DEPTH { MODELPARAM_VALUE.STRM0_DEPTH 
PARAM_VALUE.STRM0_DEPTH } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STRM0_DEPTH}] ${MODELPARAM_VALUE.STRM0_DEPTH} -} - -proc update_MODELPARAM_VALUE.STRM1_DEPTH { MODELPARAM_VALUE.STRM1_DEPTH PARAM_VALUE.STRM1_DEPTH } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STRM1_DEPTH}] ${MODELPARAM_VALUE.STRM1_DEPTH} -} - -proc update_MODELPARAM_VALUE.STRM2_DEPTH { MODELPARAM_VALUE.STRM2_DEPTH PARAM_VALUE.STRM2_DEPTH } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STRM2_DEPTH}] ${MODELPARAM_VALUE.STRM2_DEPTH} -} - -proc update_MODELPARAM_VALUE.STRM3_DEPTH { MODELPARAM_VALUE.STRM3_DEPTH PARAM_VALUE.STRM3_DEPTH } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STRM3_DEPTH}] ${MODELPARAM_VALUE.STRM3_DEPTH} -} - -proc update_MODELPARAM_VALUE.STRM4_DEPTH { MODELPARAM_VALUE.STRM4_DEPTH PARAM_VALUE.STRM4_DEPTH } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STRM4_DEPTH}] ${MODELPARAM_VALUE.STRM4_DEPTH} -} - -proc update_MODELPARAM_VALUE.STRM5_DEPTH { MODELPARAM_VALUE.STRM5_DEPTH PARAM_VALUE.STRM5_DEPTH } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STRM5_DEPTH}] ${MODELPARAM_VALUE.STRM5_DEPTH} -} - -proc update_MODELPARAM_VALUE.STRM0_OFFSET { MODELPARAM_VALUE.STRM0_OFFSET PARAM_VALUE.STRM0_OFFSET } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value 
${PARAM_VALUE.STRM0_OFFSET}] ${MODELPARAM_VALUE.STRM0_OFFSET} -} - -proc update_MODELPARAM_VALUE.STRM1_OFFSET { MODELPARAM_VALUE.STRM1_OFFSET PARAM_VALUE.STRM1_OFFSET } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STRM1_OFFSET}] ${MODELPARAM_VALUE.STRM1_OFFSET} -} - -proc update_MODELPARAM_VALUE.STRM2_OFFSET { MODELPARAM_VALUE.STRM2_OFFSET PARAM_VALUE.STRM2_OFFSET } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STRM2_OFFSET}] ${MODELPARAM_VALUE.STRM2_OFFSET} -} - -proc update_MODELPARAM_VALUE.STRM3_OFFSET { MODELPARAM_VALUE.STRM3_OFFSET PARAM_VALUE.STRM3_OFFSET } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STRM3_OFFSET}] ${MODELPARAM_VALUE.STRM3_OFFSET} -} - -proc update_MODELPARAM_VALUE.STRM4_OFFSET { MODELPARAM_VALUE.STRM4_OFFSET PARAM_VALUE.STRM4_OFFSET } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STRM4_OFFSET}] ${MODELPARAM_VALUE.STRM4_OFFSET} -} - -proc update_MODELPARAM_VALUE.STRM5_OFFSET { MODELPARAM_VALUE.STRM5_OFFSET PARAM_VALUE.STRM5_OFFSET } { - # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value - set_property value [get_property value ${PARAM_VALUE.STRM5_OFFSET}] ${MODELPARAM_VALUE.STRM5_OFFSET} -} - proc update_MODELPARAM_VALUE.AXILITE_ADDR_WIDTH { MODELPARAM_VALUE.AXILITE_ADDR_WIDTH PARAM_VALUE.AXILITE_ADDR_WIDTH } { # Procedure called to set VHDL generic/Verilog parameter value(s) based on TCL parameter value set_property value [get_property value ${PARAM_VALUE.AXILITE_ADDR_WIDTH}] ${MODELPARAM_VALUE.AXILITE_ADDR_WIDTH} diff --git a/finn-rtllib/mvu/mvu_4sx4u.sv 
b/finn-rtllib/mvu/mvu_4sx4u.sv new file mode 100644 index 0000000000..0ac2628ee5 --- /dev/null +++ b/finn-rtllib/mvu/mvu_4sx4u.sv @@ -0,0 +1,527 @@ +/****************************************************************************** + * Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Matrix Vector Unit (MVU) core compute kernel utilizing DSP48. 
+ *****************************************************************************/ + +module mvu_4sx4u #( + int unsigned PE, + int unsigned SIMD, + int unsigned ACCU_WIDTH, + + int unsigned VERSION = 1, + bit SIGNED_ACTIVATIONS = 0, + bit FORCE_BEHAVIORAL = 0 +)( + // Global Control + input logic clk, + input logic rst, + input logic en, + + // Input + input logic last, + input logic zero, // ignore current inputs and force this partial product to zero + input logic signed [PE-1:0][SIMD-1:0][3:0] w, // signed weights + input logic [SIMD-1:0][3:0] a, // unsigned activations (override by SIGNED_ACTIVATIONS) + + // Output + output logic vld, + output logic signed [PE-1:0][ACCU_WIDTH-1:0] p +); + // for verilator always use behavioral code + localparam bit BEHAVIORAL = +`ifdef VERILATOR + 1 || +`endif + FORCE_BEHAVIORAL; + + typedef int unsigned leave_load_t[2*SIMD-1]; + function leave_load_t init_leave_loads(); + automatic leave_load_t res; + for(int i = 2*(SIMD-1); i >= int'(SIMD)-1; i--) res[i] = 1; + for(int i = SIMD-2; i >= 0; i--) res[i] = res[2*i+1] + res[2*i+2]; + return res; + endfunction : init_leave_loads + + // Pipeline for last indicator flag + logic [1:5] L = '0; + always_ff @(posedge clk) begin + if(rst) L <= '0; + else if(en) L <= { last, L[1:4] }; + end + assign vld = L[5]; + + // Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism + localparam int unsigned D[4:0] = '{ ACCU_WIDTH+22, 22, 15, 8, 0 }; // Lane offsets + + localparam int unsigned PIPE_COUNT = (PE+3)/4; + for(genvar c = 0; c < PIPE_COUNT; c++) begin : genPipes + + localparam int unsigned PE_BEG = 4*c; + localparam int unsigned PE_END = PE < 4*(c+1)?
PE : 4*(c+1); + localparam int unsigned PE_REM = 4*(c+1) - PE_END; + + uwire [57:0] p3[SIMD]; + uwire signed [ 1:0] h3[SIMD][3]; + for(genvar s = 0; s < SIMD; s++) begin : genSIMD + + // Input Lane Assembly + uwire [17:0] bb = { {(14){SIGNED_ACTIVATIONS && a[s][3]}}, a[s] }; + logic [29:0] aa; + logic [26:0] dd; + logic [ 1:0] xx[3:1]; + if(1) begin : blkVectorize + uwire [3:0] ww[PE_END - PE_BEG]; + for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin + assign ww[pe] = w[PE_BEG + pe][s]; + if(pe) begin + if(BEHAVIORAL) assign xx[pe + PE_REM] = zero? 0 : ww[pe] * a[s]; +`ifndef VERILATOR + else begin + LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x ( + .O6(xx[pe + PE_REM][1]), + .O5(xx[pe + PE_REM][0]), + .I5(1'b1), + .I4(zero), + .I3(ww[pe][1]), + .I2(a[s][1]), + .I1(ww[pe][0]), + .I0(a[s][0]) + ); + end +`endif + end + end + always_comb begin + dd = '0; + aa = '0; + for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin + dd[D[pe + PE_REM]+:3] = ww[pe]; + aa[D[pe + PE_REM]+ 3] = ww[pe][3]; + end + end + end : blkVectorize + + uwire [47:0] pp; + + // Note: Since the product B * AD is computed, + // rst can be only applied to AD and zero only to B + // with the same effect as zeroing both. + if(BEHAVIORAL) begin : genBehav + // Stage #1: Input Refine + logic signed [17:0] B1 = 0; + always_ff @(posedge clk) begin + if(zero) B1 <= 0; + else if(en) B1 <= bb; + end + + logic signed [26:0] AD1 = 0; + always_ff @(posedge clk) begin + if(rst) AD1 <= 0; + else if(en) AD1 <= dd - aa; + end + + // Stage #2: Multiply + logic signed [45:0] M2 = 0; + always_ff @(posedge clk) begin + if(rst) M2 <= 0; + else if(en) M2 <= +// synthesis translate_off + (B1 === '0) || (AD1 === '0)? 0 : +// synthesis translate_on + B1 * AD1; + end + + // Stage #3: Accumulate + logic signed [47:0] P3 = 0; + always_ff @(posedge clk) begin + if(rst) P3 <= 0; + else if(en) P3 <= M2 + (L[3]?
0 : P3); + end + + assign pp = P3; + end : genBehav +`ifndef VERILATOR + else begin : genDSP + localparam logic [6:0] OPMODE_INVERSION = 7'b010_01_01; + uwire [6:0] opmode = { { 1'b0, L[2], 1'b0 }, 4'b00_00 }; + case(VERSION) + 1: DSP48E1 #( + // Feature Control Attributes: Data Path Selection + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .USE_DPORT("TRUE"), // Select D port usage (TRUE or FALSE) + .USE_MULT("MULTIPLY"), // Select multiplier usage ("MULTIPLY", "DYNAMIC", or "NONE") + .USE_SIMD("ONE48"), // SIMD selection ("ONE48", "TWO24", "FOUR12") + + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // "NO_RESET", "RESET_MATCH", "RESET_NOT_MATCH" + .MASK('1), // 48-bit mask value for pattern detect (1=ignore) + .PATTERN('0), // 48-bit pattern match for pattern detect + .SEL_MASK("MASK"), // "C", "MASK", "ROUNDING_MODE1", "ROUNDING_MODE2" + .SEL_PATTERN("PATTERN"), // Select pattern value ("PATTERN" or "C") + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect ("PATDET" or "NO_PATDET") + + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0, 1 or 2) + .ADREG(1), // Number of pipeline stages for pre-adder (0 or 1) + .ALUMODEREG(0), // Number of pipeline stages for ALUMODE (0 or 1) + .AREG(0), // Number of pipeline stages for A (0, 1 or 2) + .BCASCREG(1), // Number of pipeline stages between B/BCIN and BCOUT (0, 1 or 2) + .BREG(1), // Number of pipeline stages for B (0, 1 or 2) + .CARRYINREG(0), // Number of pipeline stages for CARRYIN (0 or 1) + .CARRYINSELREG(0), // Number of pipeline stages for CARRYINSEL (0 or 1) + .CREG(0), // Number of pipeline stages for C (0 or 1) + .DREG(0), // Number of pipeline stages for D (0 or 1) + .INMODEREG(0), // Number of pipeline stages for INMODE (0 or 1) 
+ .MREG(1), // Number of multiplier pipeline stages (0 or 1) + .OPMODEREG(1), // Number of pipeline stages for OPMODE (0 or 1) + .PREG(1) // Number of pipeline stages for P (0 or 1) + ) dsp ( + // Cascade: 30-bit (each) output: Cascade Ports + .ACOUT(), // 30-bit output: A port cascade output + .BCOUT(), // 18-bit output: B port cascade output + .CARRYCASCOUT(), // 1-bit output: Cascade carry output + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade output + .PCOUT(), // 48-bit output: Cascade output + + // Control: 1-bit (each) output: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc output + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect output + .PATTERNDETECT(), // 1-bit output: Pattern detect output + .UNDERFLOW(), // 1-bit output: Underflow in add/acc output + + // Data: 4-bit (each) output: Data Ports + .CARRYOUT(), // 4-bit output: Carry output + .P(pp), // 48-bit output: Primary data output + + // Cascade: 30-bit (each) input: Cascade Ports + .ACIN('x), // 30-bit input: A cascade data input + .BCIN('x), // 18-bit input: B cascade input + .CARRYCASCIN('x), // 1-bit input: Cascade carry input + .MULTSIGNIN('x), // 1-bit input: Multiplier sign input + .PCIN('x), // 48-bit input: P cascade input + + // Control: 4-bit (each) input: Control Inputs/Status Bits + .CLK(clk), // 1-bit input: Clock input + .ALUMODE('0), // 4-bit input: ALU control input + .CARRYINSEL('0), // 3-bit input: Carry select input + .INMODE(5'b01100), // 5-bit input: INMODE control input + .OPMODE(opmode ^ OPMODE_INVERSION), // 7-bit input: Operation mode input + + // Data: 30-bit (each) input: Data Ports + .A(aa), // 30-bit input: A data input + .B(bb), // 18-bit input: B data input + .C('x), // 48-bit input: C data input + .CARRYIN('0), // 1-bit input: Carry input signal + .D(dd), // 25-bit input: D data input + + // Reset/Clock Enable: 1-bit (each) input: Reset/Clock Enable Inputs + .CEA1('0), // 1-bit input: Clock enable input for 1st stage 
AREG + .CEA2('0), // 1-bit input: Clock enable input for 2nd stage AREG + .CEAD(en), // 1-bit input: Clock enable input for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable input for ALUMODERE + .CEB1('0), // 1-bit input: Clock enable input for 1st stage BREG + .CEB2(en), // 1-bit input: Clock enable input for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable input for CREG + .CECARRYIN('0), // 1-bit input: Clock enable input for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable input for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable input for DREG + .CEINMODE('0), // 1-bit input: Clock enable input for INMODEREG + .CEM(en), // 1-bit input: Clock enable input for MREG + .CEP(en), // 1-bit input: Clock enable input for PREG + .RSTA('0), // 1-bit input: Reset input for AREG + .RSTB( // 1-bit input: Reset for BREG +// synthesis translate_off + rst || +// synthesis translate_on + zero + ), + .RSTC('0), // 1-bit input: Reset for CREG + .RSTD( // 1-bit input: Reset for DREG and ADREG +// synthesis translate_off + zero || +// synthesis translate_on + rst + ), + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTCTRL('0), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTINMODE('0), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(rst) // 1-bit input: Reset for PREG + ); + 2: DSP48E2 #( + // Feature Control Attributes: Data Path Selection + .AMULTSEL("AD"), // Selects A input to multiplier (A, AD) + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .BMULTSEL("B"), // Selects B input to multiplier (AD, B) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .PREADDINSEL("A"), // Selects input to pre-adder (A, B) + .RND('0), // Rounding Constant + .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) + 
.USE_SIMD("ONE48"), // SIMD selection (FOUR12, ONE58, TWO24) + .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) + .XORSIMD("XOR24_48_96"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) + + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH + .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). + .MASK('1), // 58-bit mask value for pattern detect (1=ignore) + .PATTERN('0), // 58-bit pattern match for pattern detect + .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 + .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) + + // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins + .IS_ALUMODE_INVERTED('0), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED('0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED('0), // Optional inversion for CLK + .IS_INMODE_INVERTED('0), // Optional inversion for INMODE + .IS_OPMODE_INVERTED({ 2'b00, OPMODE_INVERSION}), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED('0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED('0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED('0), // Optional inversion for RSTA + .IS_RSTB_INVERTED('0), // Optional inversion for RSTB + .IS_RSTCTRL_INVERTED('0), // Optional inversion for STCONJUGATE_A + .IS_RSTC_INVERTED('0), // Optional inversion for RSTC + .IS_RSTD_INVERTED('0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED('0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED('0), // Optional inversion for RSTM + .IS_RSTP_INVERTED('0), // Optional inversion for RSTP + + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0-2) + .ADREG(1), // Pipeline stages for 
pre-adder (0-1) + .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) + .AREG(0), // Pipeline stages for A (0-2) + .BCASCREG(1), // Number of pipeline stages between B/BCIN and BCOUT (0-2) + .BREG(1), // Pipeline stages for B (0-2) + .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) + .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) + .CREG(0), // Pipeline stages for C (0-1) + .DREG(0), // Pipeline stages for D (0-1) + .INMODEREG(0), // Pipeline stages for INMODE (0-1) + .MREG(1), // Multiplier pipeline stages (0-1) + .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) + .PREG(1) // Number of pipeline stages for P (0-1) + ) dsp ( + // Cascade outputs: Cascade Ports + .ACOUT(), // 34-bit output: A port cascade + .BCOUT(), // 24-bit output: B cascade + .CARRYCASCOUT(), // 1-bit output: Cascade carry + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade + .PCOUT(), // 58-bit output: Cascade output + + // Control outputs: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect + .PATTERNDETECT(), // 1-bit output: Pattern detect + .UNDERFLOW(), // 1-bit output: Underflow in add/acc + + // Data outputs: Data Ports + .CARRYOUT(), // 4-bit output: Carry + .P(pp), // 58-bit output: Primary data + .XOROUT(), // 8-bit output: XOR data + + // Cascade inputs: Cascade Ports + .ACIN('x), // 34-bit input: A cascade data + .BCIN('x), // 24-bit input: B cascade + .CARRYCASCIN('x), // 1-bit input: Cascade carry + .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade + .PCIN('x), // 58-bit input: P cascade + + // Control inputs: Control Inputs/Status Bits + .CLK(clk), // 1-bit input: Clock + .ALUMODE(4'h0), // 4-bit input: ALU control + .CARRYINSEL('0), // 3-bit input: Carry select + .INMODE(5'b01100), // 5-bit input: INMODE control + .OPMODE({ 2'b00, opmode }), // 9-bit input: Operation mode + + // Data inputs: Data Ports + .A(aa), // 34-bit input: A data + .B(bb), // 24-bit input: B 
data + .C('x), // 58-bit input: C data + .CARRYIN('0), // 1-bit input: Carry-in + .D(dd), // 27-bit input: D data + + // Reset/Clock Enable inputs: Reset/Clock Enable Inputs + .CEA1('0), // 1-bit input: Clock enable for 1st stage AREG + .CEA2('0), // 1-bit input: Clock enable for 2nd stage AREG + .CEAD(en), // 1-bit input: Clock enable for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE + .CEB1('0), // 1-bit input: Clock enable for 1st stage BREG + .CEB2(en), // 1-bit input: Clock enable for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable for CREG + .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable for DREG + .CEINMODE('0), // 1-bit input: Clock enable for INMODEREG + .CEM(en), // 1-bit input: Clock enable for MREG + .CEP(en), // 1-bit input: Clock enable for PREG + .RSTA('0), // 1-bit input: Reset for AREG + .RSTB( // 1-bit input: Reset for BREG +// synthesis translate_off + rst || +// synthesis translate_on + zero + ), + .RSTC('0), // 1-bit input: Reset for CREG + .RSTD( // 1-bit input: Reset for DREG and ADREG +// synthesis translate_off + zero || +// synthesis translate_on + rst + ), + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTCTRL('0), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTINMODE('0), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(rst) // 1-bit input: Reset for PREG + ); + default: initial begin + $error("Unknown version DSP48E%0d.", VERSION); + $finish; + end + endcase + end : genDSP +`endif + + // External Canary Pipeline + logic [1:0] X1[3:1] = '{ default: 0 }; + logic [1:0] X2[3:1] = '{ default: 0 }; + logic [1:0] X3[3:1] = '{ default: 0 }; + always_ff @(posedge clk) begin + if(rst) begin + X1 <= '{ default: 0 }; + X2 <= '{ default: 0 }; + X3 <= '{ default: 0 }; 
+ end + else if(en) begin + X1 <= xx; + X2 <= X1; + foreach(X3[i]) begin + X3[i] <= X2[i] + (L[3]? 2'h0 : pp[D[i]+:2]); + end + end + end + + // Derive actual cross-lane overflows + for(genvar i = 0; i < 3; i++) begin + assign h3[s][i] = pp[D[i+1]+:2] - X3[i+1]; + end + assign p3[s] = pp; + + end : genSIMD + + // Stage #4: Cross-SIMD Reduction + + // Count leaves reachable from each node + localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 1}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop + + uwire signed [ACCU_WIDTH -1:0] up4; + uwire signed [ACCU_WIDTH -8:0] hi4[3]; + uwire [$clog2(SIMD)+7:0] lo4[3]; + for(genvar i = 0; i < 4; i++) begin + localparam int unsigned LO_WIDTH = D[i+1] - D[i]; + localparam int unsigned HI_WIDTH = ACCU_WIDTH - LO_WIDTH; + + // Conclusive high part accumulation + if(i >= PE_REM && i < 3) begin : genHi + // Adder Tree across all SIMD high contributions, each from [-1:1] + uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0] tree; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s][i]; + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + uwire signed [$clog2(1+LEAVE_LOAD[n]):0] s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]); + assign tree[n] = s; + end + + // High Sideband Accumulation + logic signed [HI_WIDTH-1:0] Hi4 = 0; + always_ff @(posedge clk) begin + if(rst) Hi4 <= 0; + else if(en) Hi4 <= (L[4]? 
0 : Hi4) + $signed(tree[0]); + end + assign hi4[i] = Hi4; + end : genHi + else if (i < 3) begin : genHiZero + assign hi4[i] = '0; + end : genHiZero + + // Conclusive low part accumulation + if(i >= PE_REM) begin : blkLo + // Adder Tree across all SIMD low contributions + localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); + uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH]; + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1)); + uwire [NODE_WIDTH-1:0] s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]); + assign tree[n] = s; + end + + logic [ROOT_WIDTH-1:0] Lo4 = 0; + always_ff @(posedge clk) begin + if(rst) Lo4 <= 0; + else if(en) Lo4 <= tree[0]; + end + + if(i == 3) assign up4 = Lo4; + else assign lo4[i] = Lo4; + end : blkLo + else begin : blkLoZero + assign lo4[i] = '0; + end : blkLoZero + + end + + // Stage #5: Resolve lane totals + logic signed [3:0][ACCU_WIDTH-1:0] Res5 = '{ default: 0 }; + always_ff @(posedge clk) begin + if(rst) Res5 <= '{ default: 0 }; + else if(en) begin + Res5[3] <= up4 - hi4[2]; + Res5[2] <= $signed({ hi4[2], {(D[3] - D[2]){1'b0}} }) + $signed({ 1'b0, lo4[2] }) - hi4[1]; + Res5[1] <= $signed({ hi4[1], {(D[2] - D[1]){1'b0}} }) + $signed({ 1'b0, lo4[1] }) - hi4[0]; + Res5[0] <= $signed({ hi4[0], {(D[1] - D[0]){1'b0}} }) + $signed({ 1'b0, lo4[0] }); + end + end + + // Output + for(genvar pe = PE_BEG; pe < PE_END; pe++) begin + assign p[pe] = Res5[pe - PE_BEG + PE_REM]; + end + + end : genPipes + +endmodule : mvu_4sx4u diff --git a/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv new file mode 100644 index 0000000000..fbf48784f0 --- /dev/null +++ b/finn-rtllib/mvu/mvu_8sx8u_dsp48.sv @@ -0,0 +1,525 @@ +/****************************************************************************** + * Copyright (C) 2024, 
Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Matrix Vector Unit (MVU) core compute kernel utilizing DSP48. 
+ *****************************************************************************/ + +module mvu_8sx8u_dsp48 #( + int unsigned PE, + int unsigned SIMD, + int unsigned ACCU_WIDTH, + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, + + int unsigned VERSION = 1, + bit SIGNED_ACTIVATIONS = 0, + bit FORCE_BEHAVIORAL = 0 +)( + // Global Control + input logic clk, + input logic rst, + input logic en, + + // Input + input logic last, + input logic zero, // ignore current inputs and force this partial product to zero + input logic signed [PE-1:0][SIMD-1:0][WEIGHT_WIDTH -1:0] w, // signed weights + input logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] a, // unsigned activations (override by SIGNED_ACTIVATIONS) + + // Output + output logic vld, + output logic signed [PE-1:0][ACCU_WIDTH-1:0] p +); + // for verilator always use behavioral code + localparam bit BEHAVIORAL = +`ifdef VERILATOR + 1 || +`endif + FORCE_BEHAVIORAL; + + typedef int unsigned leave_load_t[2*SIMD-1]; + function leave_load_t init_leave_loads(); + automatic leave_load_t res; + for(int i = 2*(SIMD-1); i >= int'(SIMD)-1; i--) res[i] = 1; + for(int i = SIMD-2; i >= 0; i--) res[i] = res[2*i+1] + res[2*i+2]; + return res; + endfunction : init_leave_loads + + // Pipeline for last indicator flag + logic [1:5] L = '0; + always_ff @(posedge clk) begin + if(rst) L <= '0; + else if(en) L <= { last, L[1:4] }; + end + assign vld = L[5]; + + // Stages #1 - #3: DSP Lanes + cross-lane canaries duplicated with SIMD parallelism + localparam int unsigned SINGLE_PROD_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH; + localparam int unsigned D[2:0] = '{ ACCU_WIDTH+SINGLE_PROD_WIDTH, SINGLE_PROD_WIDTH, 0 }; // Lane offsets + + localparam int unsigned PIPE_COUNT = (PE+1)/2; + for(genvar c = 0; c < PIPE_COUNT; c++) begin : genPipes + + localparam int unsigned PE_BEG = 2*c; + localparam int unsigned PE_END = PE < 2*(c+1)?
PE : 2*(c+1); + localparam int unsigned PE_REM = 2*(c+1) - PE_END; + + uwire [57:0] p3[SIMD]; + uwire signed [ 1:0] h3[SIMD]; + for(genvar s = 0; s < SIMD; s++) begin : genSIMD + + // Input Lane Assembly + uwire [17:0] bb = { {(18-ACTIVATION_WIDTH){SIGNED_ACTIVATIONS && a[s][ACTIVATION_WIDTH-1]}}, a[s] }; + logic [29:0] aa; + logic [26:0] dd; + logic [ 1:0] xx; + if(1) begin : blkVectorize + uwire [WEIGHT_WIDTH-1:0] ww[PE_END - PE_BEG]; + for(genvar pe = 0; pe < PE_END - PE_BEG; pe++) begin + assign ww[pe] = w[PE_BEG + pe][s]; + if(pe) begin + if(BEHAVIORAL) assign xx = zero? 0 : ww[pe] * a[s]; +`ifndef VERILATOR + else begin + LUT6_2 #(.INIT(64'h0000_6AC0_0000_8888)) lut_x ( + .O6(xx[1]), + .O5(xx[0]), + .I5(1'b1), + .I4(zero), + .I3(ww[pe][1]), + .I2(a[s][1]), + .I1(ww[pe][0]), + .I0(a[s][0]) + ); + end +`endif + end + end + always_comb begin + dd = '0; + aa = '0; + for(int unsigned pe = 0; pe < PE_END - PE_BEG; pe++) begin + dd[D[pe + PE_REM] +: WEIGHT_WIDTH-1] = ww[pe]; + aa[D[pe + PE_REM] + WEIGHT_WIDTH-1] = ww[pe][WEIGHT_WIDTH-1]; + end + end + end : blkVectorize + + uwire [47:0] pp; + + // Note: Since the product B * AD is computed, + // rst can be only applied to AD and zero only to B + // with the same effect as zeroing both. + if(BEHAVIORAL) begin : genBehav + // Stage #1: Input Refine + logic signed [17:0] B1 = 0; + always_ff @(posedge clk) begin + if(zero) B1 <= 0; + else if(en) B1 <= bb; + end + + logic signed [26:0] AD1 = 0; + always_ff @(posedge clk) begin + if(rst) AD1 <= 0; + else if(en) AD1 <= dd - aa; + end + + // Stage #2: Multiply + logic signed [45:0] M2 = 0; + always_ff @(posedge clk) begin + if(rst) M2 <= 0; + else if(en) M2 <= +// synthesis translate_off + (B1 === '0) || (AD1 === '0)? 0 : +// synthesis translate_on + B1 * AD1; + end + + // Stage #3: Accumulate + logic signed [47:0] P3 = 0; + always_ff @(posedge clk) begin + if(rst) P3 <= 0; + else if(en) P3 <= M2 + (L[3]?
0 : P3); + end + + assign pp = P3; + end : genBehav +`ifndef VERILATOR + else begin : genDSP + localparam logic [6:0] OPMODE_INVERSION = 7'b010_01_01; + uwire [6:0] opmode = { { 1'b0, L[2], 1'b0 }, 4'b00_00 }; + case(VERSION) + 1: DSP48E1 #( + // Feature Control Attributes: Data Path Selection + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .USE_DPORT("TRUE"), // Select D port usage (TRUE or FALSE) + .USE_MULT("MULTIPLY"), // Select multiplier usage ("MULTIPLY", "DYNAMIC", or "NONE") + .USE_SIMD("ONE48"), // SIMD selection ("ONE48", "TWO24", "FOUR12") + + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // "NO_RESET", "RESET_MATCH", "RESET_NOT_MATCH" + .MASK('1), // 48-bit mask value for pattern detect (1=ignore) + .PATTERN('0), // 48-bit pattern match for pattern detect + .SEL_MASK("MASK"), // "C", "MASK", "ROUNDING_MODE1", "ROUNDING_MODE2" + .SEL_PATTERN("PATTERN"), // Select pattern value ("PATTERN" or "C") + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect ("PATDET" or "NO_PATDET") + + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0, 1 or 2) + .ADREG(1), // Number of pipeline stages for pre-adder (0 or 1) + .ALUMODEREG(0), // Number of pipeline stages for ALUMODE (0 or 1) + .AREG(0), // Number of pipeline stages for A (0, 1 or 2) + .BCASCREG(1), // Number of pipeline stages between B/BCIN and BCOUT (0, 1 or 2) + .BREG(1), // Number of pipeline stages for B (0, 1 or 2) + .CARRYINREG(0), // Number of pipeline stages for CARRYIN (0 or 1) + .CARRYINSELREG(0), // Number of pipeline stages for CARRYINSEL (0 or 1) + .CREG(0), // Number of pipeline stages for C (0 or 1) + .DREG(0), // Number of pipeline stages for D (0 or 1) + .INMODEREG(0), // Number of pipeline stages for INMODE (0 or 1) 
+ .MREG(1), // Number of multiplier pipeline stages (0 or 1) + .OPMODEREG(1), // Number of pipeline stages for OPMODE (0 or 1) + .PREG(1) // Number of pipeline stages for P (0 or 1) + ) dsp ( + // Cascade: 30-bit (each) output: Cascade Ports + .ACOUT(), // 30-bit output: A port cascade output + .BCOUT(), // 18-bit output: B port cascade output + .CARRYCASCOUT(), // 1-bit output: Cascade carry output + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade output + .PCOUT(), // 48-bit output: Cascade output + + // Control: 1-bit (each) output: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc output + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect output + .PATTERNDETECT(), // 1-bit output: Pattern detect output + .UNDERFLOW(), // 1-bit output: Underflow in add/acc output + + // Data: 4-bit (each) output: Data Ports + .CARRYOUT(), // 4-bit output: Carry output + .P(pp), // 48-bit output: Primary data output + + // Cascade: 30-bit (each) input: Cascade Ports + .ACIN('x), // 30-bit input: A cascade data input + .BCIN('x), // 18-bit input: B cascade input + .CARRYCASCIN('x), // 1-bit input: Cascade carry input + .MULTSIGNIN('x), // 1-bit input: Multiplier sign input + .PCIN('x), // 48-bit input: P cascade input + + // Control: 4-bit (each) input: Control Inputs/Status Bits + .CLK(clk), // 1-bit input: Clock input + .ALUMODE('0), // 4-bit input: ALU control input + .CARRYINSEL('0), // 3-bit input: Carry select input + .INMODE(5'b01100), // 5-bit input: INMODE control input + .OPMODE(opmode ^ OPMODE_INVERSION), // 7-bit input: Operation mode input + + // Data: 30-bit (each) input: Data Ports + .A(aa), // 30-bit input: A data input + .B(bb), // 18-bit input: B data input + .C('x), // 48-bit input: C data input + .CARRYIN('0), // 1-bit input: Carry input signal + .D(dd), // 25-bit input: D data input + + // Reset/Clock Enable: 1-bit (each) input: Reset/Clock Enable Inputs + .CEA1('0), // 1-bit input: Clock enable input for 1st stage 
AREG + .CEA2('0), // 1-bit input: Clock enable input for 2nd stage AREG + .CEAD(en), // 1-bit input: Clock enable input for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable input for ALUMODERE + .CEB1('0), // 1-bit input: Clock enable input for 1st stage BREG + .CEB2(en), // 1-bit input: Clock enable input for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable input for CREG + .CECARRYIN('0), // 1-bit input: Clock enable input for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable input for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable input for DREG + .CEINMODE('0), // 1-bit input: Clock enable input for INMODEREG + .CEM(en), // 1-bit input: Clock enable input for MREG + .CEP(en), // 1-bit input: Clock enable input for PREG + .RSTA('0), // 1-bit input: Reset input for AREG + .RSTB( // 1-bit input: Reset for BREG +// synthesis translate_off + rst || +// synthesis translate_on + zero + ), + .RSTC('0), // 1-bit input: Reset for CREG + .RSTD( // 1-bit input: Reset for DREG and ADREG +// synthesis translate_off + zero || +// synthesis translate_on + rst + ), + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTCTRL('0), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTINMODE('0), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(rst) // 1-bit input: Reset for PREG + ); + 2: DSP48E2 #( + // Feature Control Attributes: Data Path Selection + .AMULTSEL("AD"), // Selects A input to multiplier (A, AD) + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .BMULTSEL("B"), // Selects B input to multiplier (AD, B) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .PREADDINSEL("A"), // Selects input to pre-adder (A, B) + .RND('0), // Rounding Constant + .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) + 
.USE_SIMD("ONE48"), // SIMD selection (FOUR12, ONE58, TWO24) + .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) + .XORSIMD("XOR24_48_96"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) + + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH + .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). + .MASK('1), // 58-bit mask value for pattern detect (1=ignore) + .PATTERN('0), // 58-bit pattern match for pattern detect + .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 + .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) + + // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins + .IS_ALUMODE_INVERTED('0), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED('0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED('0), // Optional inversion for CLK + .IS_INMODE_INVERTED('0), // Optional inversion for INMODE + .IS_OPMODE_INVERTED({ 2'b00, OPMODE_INVERSION}), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED('0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED('0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED('0), // Optional inversion for RSTA + .IS_RSTB_INVERTED('0), // Optional inversion for RSTB + .IS_RSTCTRL_INVERTED('0), // Optional inversion for STCONJUGATE_A + .IS_RSTC_INVERTED('0), // Optional inversion for RSTC + .IS_RSTD_INVERTED('0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED('0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED('0), // Optional inversion for RSTM + .IS_RSTP_INVERTED('0), // Optional inversion for RSTP + + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(0), // Number of pipeline stages between A/ACIN and ACOUT (0-2) + .ADREG(1), // Pipeline stages for 
pre-adder (0-1) + .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) + .AREG(0), // Pipeline stages for A (0-2) + .BCASCREG(1), // Number of pipeline stages between B/BCIN and BCOUT (0-2) + .BREG(1), // Pipeline stages for B (0-2) + .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) + .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) + .CREG(0), // Pipeline stages for C (0-1) + .DREG(0), // Pipeline stages for D (0-1) + .INMODEREG(0), // Pipeline stages for INMODE (0-1) + .MREG(1), // Multiplier pipeline stages (0-1) + .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) + .PREG(1) // Number of pipeline stages for P (0-1) + ) dsp ( + // Cascade outputs: Cascade Ports + .ACOUT(), // 34-bit output: A port cascade + .BCOUT(), // 24-bit output: B cascade + .CARRYCASCOUT(), // 1-bit output: Cascade carry + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade + .PCOUT(), // 58-bit output: Cascade output + + // Control outputs: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect + .PATTERNDETECT(), // 1-bit output: Pattern detect + .UNDERFLOW(), // 1-bit output: Underflow in add/acc + + // Data outputs: Data Ports + .CARRYOUT(), // 4-bit output: Carry + .P(pp), // 58-bit output: Primary data + .XOROUT(), // 8-bit output: XOR data + + // Cascade inputs: Cascade Ports + .ACIN('x), // 34-bit input: A cascade data + .BCIN('x), // 24-bit input: B cascade + .CARRYCASCIN('x), // 1-bit input: Cascade carry + .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade + .PCIN('x), // 58-bit input: P cascade + + // Control inputs: Control Inputs/Status Bits + .CLK(clk), // 1-bit input: Clock + .ALUMODE(4'h0), // 4-bit input: ALU control + .CARRYINSEL('0), // 3-bit input: Carry select + .INMODE(5'b01100), // 5-bit input: INMODE control + .OPMODE({ 2'b00, opmode }), // 9-bit input: Operation mode + + // Data inputs: Data Ports + .A(aa), // 34-bit input: A data + .B(bb), // 24-bit input: B 
data + .C('x), // 58-bit input: C data + .CARRYIN('0), // 1-bit input: Carry-in + .D(dd), // 27-bit input: D data + + // Reset/Clock Enable inputs: Reset/Clock Enable Inputs + .CEA1('0), // 1-bit input: Clock enable for 1st stage AREG + .CEA2('0), // 1-bit input: Clock enable for 2nd stage AREG + .CEAD(en), // 1-bit input: Clock enable for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE + .CEB1('0), // 1-bit input: Clock enable for 1st stage BREG + .CEB2(en), // 1-bit input: Clock enable for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable for CREG + .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable for DREG + .CEINMODE('0), // 1-bit input: Clock enable for INMODEREG + .CEM(en), // 1-bit input: Clock enable for MREG + .CEP(en), // 1-bit input: Clock enable for PREG + .RSTA('0), // 1-bit input: Reset for AREG + .RSTB( // 1-bit input: Reset for BREG +// synthesis translate_off + rst || +// synthesis translate_on + zero + ), + .RSTC('0), // 1-bit input: Reset for CREG + .RSTD( // 1-bit input: Reset for DREG and ADREG +// synthesis translate_off + zero || +// synthesis translate_on + rst + ), + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTCTRL('0), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTINMODE('0), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(rst) // 1-bit input: Reset for PREG + ); + default: initial begin + $error("Unknown version DSP48E%0d.", VERSION); + $finish; + end + endcase + end : genDSP +`endif + + // External Canary Pipeline + logic [1:0] X1 = '{ default: 0 }; + logic [1:0] X2 = '{ default: 0 }; + logic [1:0] X3 = '{ default: 0 }; + always_ff @(posedge clk) begin + if(rst) begin + X1 <= '{ default: 0 }; + X2 <= '{ default: 0 }; + X3 <= '{ default: 0 }; + end + else 
if(en) begin + X1 <= xx; + X2 <= X1; + X3 <= X2 + (L[3]? 2'h0 : pp[D[1]+:2]); + end + end + + // Derive actual cross-lane overflows + assign h3[s] = pp[D[1]+:2] - X3; + + assign p3[s] = pp; + + end : genSIMD + + // Stage #4: Cross-SIMD Reduction + + // Count leaves reachable from each node + localparam leave_load_t LEAVE_LOAD = SIMD > 1 ? init_leave_loads() : '{ default: 0}; // SIMD=1 requires no adder tree, so zero-ing out, otherwise init_leave_loads ends up in infinite loop + + uwire signed [ACCU_WIDTH -1:0] up4; + uwire signed [ACCU_WIDTH -SINGLE_PROD_WIDTH:0] hi4; + uwire [$clog2(SIMD)+SINGLE_PROD_WIDTH-1:0] lo4; + + // Conclusive high part accumulation + if(PE_REM == 0) begin : genHi + localparam int unsigned HI_WIDTH = ACCU_WIDTH - D[1]; + // Adder Tree across all SIMD high contributions, each from [-1:1] + uwire signed [2*SIMD-2:0][$clog2(1+SIMD):0] tree; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = h3[s]; + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + uwire signed [$clog2(1+LEAVE_LOAD[n]):0] s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]); + assign tree[n] = s; + end + + // High Sideband Accumulation + logic signed [HI_WIDTH-1:0] Hi4 = 0; + always_ff @(posedge clk) begin + if(rst) Hi4 <= 0; + else if(en) Hi4 <= (L[4]? 
0 : Hi4) + $signed(tree[0]); + end + assign hi4 = Hi4; + end : genHi + else begin : genHiZero + assign hi4 = '0; + end : genHiZero + + for(genvar i = 0; i < 2; i++) begin + localparam int unsigned LO_WIDTH = D[i+1] - D[i]; + // Conclusive low part accumulation + if(i >= PE_REM) begin : blkLo + // Adder Tree across all SIMD low contributions + localparam int unsigned ROOT_WIDTH = $clog2(1 + SIMD*(2**LO_WIDTH-1)); + uwire [2*SIMD-2:0][ROOT_WIDTH-1:0] tree; + for(genvar s = 0; s < SIMD; s++) assign tree[SIMD-1+s] = p3[s][D[i]+:LO_WIDTH]; + for(genvar n = 0; n < SIMD-1; n++) begin + // Sum truncated to actual maximum bit width at this node + localparam int unsigned NODE_WIDTH = $clog2(1 + LEAVE_LOAD[n]*(2**LO_WIDTH-1)); + uwire [NODE_WIDTH-1:0] s = $signed(tree[2*n+1]) + $signed(tree[2*n+2]); + assign tree[n] = s; + end + + logic [ROOT_WIDTH-1:0] Lo4 = 0; + always_ff @(posedge clk) begin + if(rst) Lo4 <= 0; + else if(en) Lo4 <= tree[0]; + end + + if(i == 1) assign up4 = Lo4; + else assign lo4 = Lo4; + end : blkLo + else begin : blkLoZero + assign lo4 = '0; + end : blkLoZero + + end + + // Stage #5: Resolve lane totals + logic signed [1:0][ACCU_WIDTH-1:0] Res5 = '{ default: 0 }; + always_ff @(posedge clk) begin + if(rst) Res5 <= '{ default: 0 }; + else if(en) begin + Res5[1] <= up4 - hi4; + Res5[0] <= $signed({ hi4, {(D[1] - D[0]){1'b0}} }) + $signed({ 1'b0, lo4 }); + end + end + + // Output + for(genvar pe = PE_BEG; pe < PE_END; pe++) begin + assign p[pe] = Res5[pe - PE_BEG + PE_REM]; + end + + end : genPipes + +endmodule : mvu_8sx8u_dsp48 diff --git a/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv new file mode 100644 index 0000000000..3bbc7051b9 --- /dev/null +++ b/finn-rtllib/mvu/mvu_vvu_8sx9_dsp58.sv @@ -0,0 +1,430 @@ +/****************************************************************************** + * Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Matrix/Vector Vector Unit (MVU/VVU) core compute kernel utilizing DSP58. 
+ *****************************************************************************/ + +module mvu_vvu_8sx9_dsp58 #( + bit IS_MVU, + int unsigned PE, + int unsigned SIMD, + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, + int unsigned ACCU_WIDTH, + bit SIGNED_ACTIVATIONS = 0, + int unsigned SEGMENTLEN = 0, // Default to 0 (which implies a single segment) + bit FORCE_BEHAVIORAL = 0, + + localparam int unsigned ACTIVATION_ELEMENTS = (IS_MVU ? 1 : PE) * SIMD, + localparam int unsigned WEIGHT_ELEMENTS = PE*SIMD + ) + ( + // Global Control + input logic clk, + input logic rst, + input logic en, + + // Input + input logic last, + input logic zero, // ignore current inputs and force this partial product to zero + input logic [WEIGHT_ELEMENTS-1:0][WEIGHT_WIDTH-1:0] w, // weights + input logic [ACTIVATION_ELEMENTS-1:0][ACTIVATION_WIDTH-1:0] a, // activations + + // Ouput + output logic vld, + output logic [PE-1:0][ACCU_WIDTH-1:0] p + ); + // for verilator always use behavioral code + localparam bit BEHAVIORAL = +`ifdef VERILATOR + 1 || +`endif + FORCE_BEHAVIORAL; + +//-------------------- Declare global signals --------------------\\ + localparam int unsigned CHAINLEN = (SIMD+2)/3; + localparam int unsigned SEGLEN = SEGMENTLEN == 0 ? CHAINLEN : SEGMENTLEN; // Additional constant to default a SEGMENTLEN of '0' to the DSP-chain length + localparam int unsigned PE_ACTIVATION = IS_MVU ? 
1 : PE; + uwire [26:0] a_in_i [PE_ACTIVATION * CHAINLEN]; + uwire [23:0] b_in_i [PE][CHAINLEN]; + uwire [PE-1:0][CHAINLEN-1:0][57:0] pcout; // Array with packed dimension > 256 (with a loop-carried dependency) cannot be handled out-of-the-box with PyVerilator + +//-------------------- Shift register for opmode select signal --------------------\\ + localparam int unsigned MAX_PIPELINE_STAGES = (CHAINLEN + SEGLEN-1)/SEGLEN; // >=1 (== number of pipeline registers + 1 (A/B inputs always have 1 register)) + logic L [0:1+MAX_PIPELINE_STAGES] = '{default: 0}; // After MAX_PIPELINE_STAGES (== number of pipeline stages for input data), we have 3 additional cycles latency (A/B reg, Mreg, Preg). + // Thus, we add +2 (since OPMODE is buffered by 1 cycle in the DSP fabric) + + always_ff @(posedge clk) begin + if(rst) L <= '{default: 0}; + else if(en) begin + L[1+MAX_PIPELINE_STAGES] <= last; + L[0:MAX_PIPELINE_STAGES] <= L[1:1+MAX_PIPELINE_STAGES]; + end + end + assign vld = L[0]; + +//-------------------- Shift register for ZERO flag --------------------\\ + logic Z [0:MAX_PIPELINE_STAGES-2] = '{default:0}; // We need MAX_PIPELINE_STAGES-1 pipeline stages (note: INMODE is buffered inside DSP fabric) + + if (MAX_PIPELINE_STAGES > 1) begin : genZreg + always_ff @(posedge clk) begin + if (rst) Z <= '{default: 0}; + else if(en) begin + Z[0] <= zero; + if (MAX_PIPELINE_STAGES > 2) Z[1:MAX_PIPELINE_STAGES-2] <= Z[0:MAX_PIPELINE_STAGES-3]; + end + end + end; + +//-------------------- Buffer for input activations --------------------\\ + localparam int unsigned PAD_BITS_ACT = 9 - ACTIVATION_WIDTH; + for (genvar k=0; k1 ? TOTAL_PREGS-1 : 0; + localparam int LANES_OCCUPIED = i == CHAINLEN-1 ? 
SIMD - 3*i : 3; + + if (EXTERNAL_PREGS > 0) begin : genExternalPregAct + logic [0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][ACTIVATION_WIDTH-1:0] A = '{ default : 0}; + always_ff @(posedge clk) begin + if (rst) A <= '{default: 0}; + else if(en) begin + A[EXTERNAL_PREGS-1] <= +// synthesis translate_off + zero ? '1 : +// synthesis translate_on + a[SIMD*k + 3*i +: LANES_OCCUPIED]; + if (EXTERNAL_PREGS > 1) A[0:EXTERNAL_PREGS-2] <= A[1:EXTERNAL_PREGS-1]; + end + end + for (genvar j=0; j1 ? TOTAL_PREGS-1 : 0; + localparam int LANES_OCCUPIED = j == CHAINLEN-1 ? SIMD - 3*j : 3; + + if (EXTERNAL_PREGS > 0) begin : genExternalPregWeight + logic [0:PE-1][0:EXTERNAL_PREGS-1][LANES_OCCUPIED-1:0][WEIGHT_WIDTH-1:0] B = '{ default : 0}; + always_ff @(posedge clk) begin + if (rst) B <= '{default: 0}; + else if (en) begin + B[i][EXTERNAL_PREGS-1] <= +// synthesis translate_off + zero ? '1 : +// synthesis translate_on + //w[i][3*j +: LANES_OCCUPIED]; + w[SIMD*i+3*j +: LANES_OCCUPIED]; + if (EXTERNAL_PREGS > 1) B[i][0:EXTERNAL_PREGS-2] <= B[i][1:EXTERNAL_PREGS-1]; + end + end + for (genvar k = 0 ; k < LANES_OCCUPIED ; k++) begin : genBin + assign b_in_i[i][j][8*k +: 8] = PAD_BITS_WEIGHT == 0 ? B[i][0][k] : { {PAD_BITS_WEIGHT{B[i][0][k][WEIGHT_WIDTH-1]}}, B[i][0][k] }; + end : genBin + for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero + assign b_in_i[i][j][8*k +: 8] = 8'b0; + end : genBinZero + end : genExternalPregWeight + else begin : genInpDSPWeight + for (genvar k = 0; k < LANES_OCCUPIED; k++) begin : genBin + assign b_in_i[i][j][8*k +: 8] = +// synthesis translate_off + zero ? '1 : +// synthesis translate_on + PAD_BITS_WEIGHT == 0 ? 
w[SIMD*i+3*j+k] : { {PAD_BITS_WEIGHT{w[SIMD*i+3*j+k][WEIGHT_WIDTH-1]}}, w[SIMD*i+3*j+k] }; + end : genBin + for (genvar k=LANES_OCCUPIED; k<3; k++) begin : genBinZero + assign b_in_i[i][j][8*k +: 8] = 8'b0; + end : genBinZero + end : genInpDSPWeight + end : genWeightSIMD + end : genWeightPE + +//-------------------- Instantiate PE x CHAINLEN DSPs --------------------\\ + for (genvar i=0; i0 ? 2 : 1; // 1 : 0 + localparam bit PREG = (j+1)%SEGLEN==0 || j == CHAINLEN-1; + localparam bit FIRST = j == 0; + localparam bit LAST = j == CHAINLEN-1; + uwire [57:0] pp; + + if (LAST) begin : genPOUT + assign p[i] = pp[ACCU_WIDTH-1:0]; + end + + // Note: Since the product B * AD is computed, + // rst can be only applied to AD and zero only to B + // with the same effect as zeroing both. + if(BEHAVIORAL) begin : genBehav + // Stage #1: Input A/B + logic signed [33:0] Areg [INTERNAL_PREGS]; + always_ff @(posedge clk) begin + if (rst) Areg <= '{ default : 0}; + else if (en) begin + Areg[0] <= { 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] }; + if (INTERNAL_PREGS == 2) Areg[1] <= Areg[0]; + end + end + logic signed [23:0] Breg [INTERNAL_PREGS]; + always_ff @(posedge clk) begin + if (rst) Breg <= '{ default : 0}; + else if (en) begin + Breg[0] <= b_in_i[i][j]; + if (INTERNAL_PREGS == 2) Breg[1] <= Breg[0]; + end + end + + // Stage #2: Multiply-Accumulate + logic signed [57:0] Mreg; + logic InmodeZero = 0; + always_ff @(posedge clk) begin + if (rst) InmodeZero <= 0; + else if (en) InmodeZero <= ( TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero ); + end + always_ff @(posedge clk) begin + if (rst) Mreg <= 0; + else if (en) begin + automatic logic signed [57:0] m = 0; + for (int k = 0; k < 3; k++) begin + m = m + (InmodeZero ? 
0 : $signed(Areg[INTERNAL_PREGS-1][9*k +: 9]) * $signed(Breg[INTERNAL_PREGS-1][8*k +: 8])); + end + Mreg <= m; + end + end + + // Stage #3: Accumulate + logic signed [57:0] Preg; + logic Opmode = 0; + if (FIRST && !LAST) begin : genFirst + if (PREG) begin : genPregBehav + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= Mreg; + end + end + else assign Preg = Mreg; + end + else if (FIRST && LAST) begin : genSingle + always_ff @(posedge clk) begin + if (rst) Opmode <= 0; + else if (en) Opmode <= L[1]; + end + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= (Opmode ? 0 : Preg) + Mreg; + end + end + else if (!FIRST && LAST) begin : genLast + always_ff @(posedge clk) begin + if (rst) Opmode <= 0; + else if (en) Opmode <= L[1]; + end + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= (Opmode ? 0 : Preg) + Mreg + pcout[i][j-1]; + end + end + else begin : genMid + if (PREG) begin : genPregBehav + always_ff @(posedge clk) begin + if (rst) Preg <= 0; + else if (en) Preg <= Mreg + pcout[i][j-1]; + end + end + else assign Preg = Mreg + pcout[i][j-1]; + end + assign pp = Preg; + assign pcout[i][j] = Preg; + end : genBehav +`ifndef VERILATOR + else begin: genDSP + DSP58 #( + // Feature Control Attributes: Data Path Selection + .AMULTSEL("A"), // Selects A input to multiplier (A, AD) + .A_INPUT("DIRECT"), // Selects A input source, "DIRECT" (A port) or "CASCADE" (ACIN port) + .BMULTSEL("B"), // Selects B input to multiplier (AD, B) + .B_INPUT("DIRECT"), // Selects B input source, "DIRECT" (B port) or "CASCADE" (BCIN port) + .DSP_MODE("INT8"), // Configures DSP to a particular mode of operation. Set to INT24 for + // legacy mode. 
+ .PREADDINSEL("A"), // Selects input to pre-adder (A, B) + .RND(58'h000000000000000), // Rounding Constant + .USE_MULT("MULTIPLY"), // Select multiplier usage (DYNAMIC, MULTIPLY, NONE) + .USE_SIMD("ONE58"), // SIMD selection (FOUR12, ONE58, TWO24) + .USE_WIDEXOR("FALSE"), // Use the Wide XOR function (FALSE, TRUE) + .XORSIMD("XOR24_34_58_116"), // Mode of operation for the Wide XOR (XOR12_22, XOR24_34_58_116) + // Pattern Detector Attributes: Pattern Detection Configuration + .AUTORESET_PATDET("NO_RESET"), // NO_RESET, RESET_MATCH, RESET_NOT_MATCH + .AUTORESET_PRIORITY("RESET"), // Priority of AUTORESET vs. CEP (CEP, RESET). + .MASK(58'h0ffffffffffffff), // 58-bit mask value for pattern detect (1=ignore) + .PATTERN(58'h000000000000000), // 58-bit pattern match for pattern detect + .SEL_MASK("MASK"), // C, MASK, ROUNDING_MODE1, ROUNDING_MODE2 + .SEL_PATTERN("PATTERN"), // Select pattern value (C, PATTERN) + .USE_PATTERN_DETECT("NO_PATDET"), // Enable pattern detect (NO_PATDET, PATDET) + // Programmable Inversion Attributes: Specifies built-in programmable inversion on specific pins + .IS_ALUMODE_INVERTED(4'b0000), // Optional inversion for ALUMODE + .IS_CARRYIN_INVERTED(1'b0), // Optional inversion for CARRYIN + .IS_CLK_INVERTED(1'b0), // Optional inversion for CLK + .IS_INMODE_INVERTED(5'b00000), // Optional inversion for INMODE + .IS_NEGATE_INVERTED(3'b000), // Optional inversion for NEGATE + .IS_OPMODE_INVERTED({ LAST ? 2'b01 : 2'b00 , // W: LAST ? (L[1] ? 0 : P) : 0 + FIRST ? 3'b000 : 3'b001, // Z: FIRST ? 
0 : PCIN + 2'b01, // Y : M + 2'b01 // X: M + }), // Optional inversion for OPMODE + .IS_RSTALLCARRYIN_INVERTED(1'b0), // Optional inversion for RSTALLCARRYIN + .IS_RSTALUMODE_INVERTED(1'b0), // Optional inversion for RSTALUMODE + .IS_RSTA_INVERTED(1'b0), // Optional inversion for RSTA + .IS_RSTB_INVERTED(1'b0), // Optional inversion for RSTB + .IS_RSTCTRL_INVERTED(1'b0), // Optional inversion for STCONJUGATE_A + .IS_RSTC_INVERTED(1'b0), // Optional inversion for RSTC + .IS_RSTD_INVERTED(1'b0), // Optional inversion for RSTD + .IS_RSTINMODE_INVERTED(1'b0), // Optional inversion for RSTINMODE + .IS_RSTM_INVERTED(1'b0), // Optional inversion for RSTM + .IS_RSTP_INVERTED(1'b0), // Optional inversion for RSTP + // Register Control Attributes: Pipeline Register Configuration + .ACASCREG(INTERNAL_PREGS), // Number of pipeline stages between A/ACIN and ACOUT (0-2) + .ADREG(0), // Pipeline stages for pre-adder (0-1) + .ALUMODEREG(0), // Pipeline stages for ALUMODE (0-1) + .AREG(INTERNAL_PREGS), // Pipeline stages for A (0-2) + .BCASCREG(INTERNAL_PREGS), // Number of pipeline stages between B/BCIN and BCOUT (0-2) + .BREG(INTERNAL_PREGS), // Pipeline stages for B (0-2) + .CARRYINREG(0), // Pipeline stages for CARRYIN (0-1) + .CARRYINSELREG(0), // Pipeline stages for CARRYINSEL (0-1) + .CREG(0), // Pipeline stages for C (0-1) + .DREG(0), // Pipeline stages for D (0-1) + .INMODEREG(1), // Pipeline stages for INMODE (0-1) + .MREG(1), // Multiplier pipeline stages (0-1) + .OPMODEREG(1), // Pipeline stages for OPMODE (0-1) + .PREG(PREG), // Number of pipeline stages for P (0-1) + .RESET_MODE("SYNC") // Selection of synchronous or asynchronous reset. (ASYNC, SYNC). 
+ ) + DSP58_inst ( + // Cascade outputs: Cascade Ports + .ACOUT(), // 34-bit output: A port cascade + .BCOUT(), // 24-bit output: B cascade + .CARRYCASCOUT(), // 1-bit output: Cascade carry + .MULTSIGNOUT(), // 1-bit output: Multiplier sign cascade + .PCOUT(pcout[i][j]), // 58-bit output: Cascade output + // Control outputs: Control Inputs/Status Bits + .OVERFLOW(), // 1-bit output: Overflow in add/acc + .PATTERNBDETECT(), // 1-bit output: Pattern bar detect + .PATTERNDETECT(), // 1-bit output: Pattern detect + .UNDERFLOW(), // 1-bit output: Underflow in add/acc + // Data outputs: Data Ports + .CARRYOUT(), // 4-bit output: Carry + .P(pp), // 58-bit output: Primary data + .XOROUT(), // 8-bit output: XOR data + // Cascade inputs: Cascade Ports + .ACIN('x), // 34-bit input: A cascade data + .BCIN('x), // 24-bit input: B cascade + .CARRYCASCIN('x), // 1-bit input: Cascade carry + .MULTSIGNIN('x), // 1-bit input: Multiplier sign cascade + .PCIN(FIRST ? 'x : pcout[i][j-1]), // 58-bit input: P cascade + // Control inputs: Control Inputs/Status Bits + .ALUMODE(4'h0), // 4-bit input: ALU control + .CARRYINSEL('0), // 3-bit input: Carry select + .CLK(clk), // 1-bit input: Clock + .INMODE({ + INTERNAL_PREGS==2 ? 1'b0 : 1'b1, + 2'b00, + TOTAL_PREGS > 0 ? Z[TOTAL_PREGS-1] : zero, + INTERNAL_PREGS==2 ? 1'b0 : 1'b1 + }), // 5-bit input: INMODE control + .NEGATE('0), // 3-bit input: Negates the input of the multiplier + .OPMODE({ + LAST ? {1'b0, L[1]} : 2'b00, + 7'b000_0000 + }), // 9-bit input: Operation mode + // Data inputs: Data Ports + .A({ 7'bx, a_in_i[(IS_MVU ? 0 : CHAINLEN*i) + j] }), // 34-bit input: A data + .B(b_in_i[i][j]), // 24-bit input: B data + .C('x), // 58-bit input: C data + .CARRYIN('0), // 1-bit input: Carry-in + .D('x), // 27-bit input: D data + // Reset/Clock Enable inputs: Reset/Clock Enable Inputs + .ASYNC_RST('0), // 1-bit input: Asynchronous reset for all registers. + .CEA1(en), // 1-bit input: Clock enable for 1st stage AREG + .CEA2(INTERNAL_PREGS==2 ? 
en : '0), // 1-bit input: Clock enable for 2nd stage AREG + .CEAD('0), // 1-bit input: Clock enable for ADREG + .CEALUMODE('0), // 1-bit input: Clock enable for ALUMODE + .CEB1(en), // 1-bit input: Clock enable for 1st stage BREG + .CEB2(INTERNAL_PREGS==2 ? en : '0), // 1-bit input: Clock enable for 2nd stage BREG + .CEC('0), // 1-bit input: Clock enable for CREG + .CECARRYIN('0), // 1-bit input: Clock enable for CARRYINREG + .CECTRL(en), // 1-bit input: Clock enable for OPMODEREG and CARRYINSELREG + .CED('0), // 1-bit input: Clock enable for DREG + .CEINMODE(en), // 1-bit input: Clock enable for INMODEREG + .CEM(en), // 1-bit input: Clock enable for MREG + .CEP(PREG && en), // 1-bit input: Clock enable for PREG + .RSTA(rst), // 1-bit input: Reset for AREG + .RSTALLCARRYIN('0), // 1-bit input: Reset for CARRYINREG + .RSTALUMODE('0), // 1-bit input: Reset for ALUMODEREG + .RSTB(rst), // 1-bit input: Reset for BREG + .RSTC('0), // 1-bit input: Reset for CREG + .RSTCTRL(rst), // 1-bit input: Reset for OPMODEREG and CARRYINSELREG + .RSTD('0), // 1-bit input: Reset for DREG and ADREG + .RSTINMODE(rst), // 1-bit input: Reset for INMODE register + .RSTM(rst), // 1-bit input: Reset for MREG + .RSTP(PREG && rst) // 1-bit input: Reset for PREG + ); + end : genDSP +`endif + end : genDSPChain + end : genDSPPE + +endmodule : mvu_vvu_8sx9_dsp58 diff --git a/finn-rtllib/mvu/mvu_vvu_axi.sv b/finn-rtllib/mvu/mvu_vvu_axi.sv new file mode 100644 index 0000000000..6498530113 --- /dev/null +++ b/finn-rtllib/mvu/mvu_vvu_axi.sv @@ -0,0 +1,367 @@ +/****************************************************************************** + * Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. 
Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Matrix Vector Unit (MVU) & Vector Vector Unit (VVU) AXI-lite interface wrapper. + * @details + * The following compute cores are supported: + * - 4-bit MVU on DSP48 achieving 4 MACs/DSP, + * - (4,8]-bit MVU on DSP48 achieving 2 MACs/DSP, + * - [4,9]-bit MVU and VVU on DSP58 achieving 3 MACs/DSP, + * Folding hints: + * - PE scaling should divide MH. + * - SIMD scaling should divide MW. + * - Otherwise, keep SIMD and PE somewhat balanced. SIMD scaling tends to + * impact critical paths more than PE scaling. PE scaling implies a + * bigger fanout on the input activations. 
+ * - Full unfolding along MH (PE=MH) results in no replay buffer instantiated + *****************************************************************************/ + +module mvu_vvu_axi #( + bit IS_MVU, + parameter COMPUTE_CORE, + int unsigned MW, + int unsigned MH, + int unsigned PE, + int unsigned SIMD, + int unsigned SEGMENTLEN = 0, + + int unsigned ACTIVATION_WIDTH, + int unsigned WEIGHT_WIDTH, + int unsigned ACCU_WIDTH, + bit SIGNED_ACTIVATIONS = 0, + + bit PUMPED_COMPUTE = 0, + bit FORCE_BEHAVIORAL = 0, + bit M_REG_LUT = 1, + + // Safely deducible parameters + localparam int unsigned WEIGHT_STREAM_WIDTH = PE * SIMD * WEIGHT_WIDTH, + localparam int unsigned WEIGHT_STREAM_WIDTH_BA = (WEIGHT_STREAM_WIDTH + 7)/8 * 8, + localparam int unsigned INPUT_STREAM_WIDTH = (IS_MVU ? 1 : PE) * SIMD * ACTIVATION_WIDTH, + localparam int unsigned INPUT_STREAM_WIDTH_BA = (INPUT_STREAM_WIDTH + 7)/8 * 8, + localparam int unsigned OUTPUT_STREAM_WIDTH = PE*ACCU_WIDTH, + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (OUTPUT_STREAM_WIDTH + 7)/8 * 8, + localparam bit SIMD_UNEVEN = SIMD % 2 +)( + // Global Control + input logic ap_clk, + input logic ap_clk2x, // synchronous, double-speed clock; only used for PUMPED_COMPUTE + input logic ap_rst_n, + + // Weight Stream + input logic [WEIGHT_STREAM_WIDTH_BA-1:0] s_axis_weights_tdata, + input logic s_axis_weights_tvalid, + output logic s_axis_weights_tready, + + // Input Stream + input logic [INPUT_STREAM_WIDTH_BA-1:0] s_axis_input_tdata, + input logic s_axis_input_tvalid, + output logic s_axis_input_tready, + + // Output Stream + output logic [OUTPUT_STREAM_WIDTH_BA-1:0] m_axis_output_tdata, + output logic m_axis_output_tvalid, + input logic m_axis_output_tready +); + +//-------------------- Parameter sanity checks --------------------\\ + initial begin + if (MW % SIMD != 0) begin + $error("Matrix width (%0d) is not a multiple of SIMD (%0d).", MW, SIMD); + $finish; + end + if (MH % PE != 0) begin + $error("Matrix height (%0d) is not a 
multiple of PE (%0d).", MH, PE); + $finish; + end + if (WEIGHT_WIDTH > 8) begin + $error("Weight width of %0d-bits exceeds maximum of 8-bits", WEIGHT_WIDTH); + $finish; + end + if (ACTIVATION_WIDTH > 8) begin + if (!(SIGNED_ACTIVATIONS == 1 && ACTIVATION_WIDTH == 9 && COMPUTE_CORE == "mvu_vvu_8sx9_dsp58")) begin + $error("Activation width of %0d-bits exceeds maximum of 9-bits for signed numbers on DSP48", ACTIVATION_WIDTH); + $finish; + end + end + if (COMPUTE_CORE == "mvu_vvu_8sx9_dsp58") begin + if (SEGMENTLEN == 0) begin + $warning("Segment length of %0d defaults to chain length of %0d", SEGMENTLEN, (SIMD+2)/3); + end + if (SEGMENTLEN > (SIMD+2)/3) begin + $error("Segment length of %0d exceeds chain length of %0d", SEGMENTLEN, (SIMD+2)/3); + $finish; + end + end + if (!IS_MVU) begin + if (COMPUTE_CORE != "mvu_vvu_8sx9_dsp58" && COMPUTE_CORE != "mvu_vvu_lut") begin + $error("VVU only supported on DSP58 or LUT-based implementation"); + $finish; + end + end + end + + uwire clk = ap_clk; + uwire clk2x = ap_clk2x; + uwire rst = !ap_rst_n; + + //- Replay to Accommodate Neuron Fold ----------------------------------- + typedef logic [(IS_MVU? 1:PE)*SIMD-1:0][ACTIVATION_WIDTH-1:0] mvu_flatin_t; + uwire mvu_flatin_t amvau; + uwire alast; + uwire afin; + uwire avld; + uwire ardy; + + localparam int unsigned SF = MW/SIMD; + localparam int unsigned NF = MH/PE; + replay_buffer #(.LEN(SF), .REP(IS_MVU ? NF : 1), .W($bits(mvu_flatin_t))) activation_replay ( + .clk, .rst, + .ivld(s_axis_input_tvalid), .irdy(s_axis_input_tready), .idat(mvu_flatin_t'(s_axis_input_tdata)), + .ovld(avld), .ordy(ardy), .odat(amvau), .olast(alast), .ofin(afin) + ); + + //- Unflatten inputs into structured matrices --------------------------- + localparam int unsigned ACT_PE = IS_MVU? 
1 : PE; + typedef logic [PE -1:0][SIMD-1:0][WEIGHT_WIDTH -1:0] mvu_w_t; + typedef logic [ACT_PE-1:0][SIMD-1:0][ACTIVATION_WIDTH-1:0] mvu_a_t; + + uwire mvu_w_t mvu_w = s_axis_weights_tdata; + + //- Conditional Activations Layout Adjustment for VVU + uwire mvu_a_t amvau_i; + if (IS_MVU || (PE == 1)) begin : genMVUInput + assign amvau_i = amvau; + end : genMVUInput + else begin : genVVUInput + // The input stream will have the channels interleaved for VVU when PE>1 + // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] + // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: + // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to + // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i, P_1), ..., (S_i, P_i) + for(genvar pe = 0; pe < ACT_PE; pe++) begin + for(genvar simd = 0; simd < SIMD; simd++) begin + assign amvau_i[pe][simd] = amvau[simd*ACT_PE+pe]; + end + end + end : genVVUInput + + //- Flow Control Bracket around Compute Core ---------------------------- + uwire en; + uwire istb = avld && s_axis_weights_tvalid; + assign ardy = en && s_axis_weights_tvalid; + assign s_axis_weights_tready = en && avld; + + //- Conditionally Pumped DSP Compute ------------------------------------ + typedef logic [PE-1:0][ACCU_WIDTH-1:0] dsp_p_t; + uwire ovld; + uwire dsp_p_t odat; + if(1) begin : blkDsp + localparam int unsigned EFFECTIVE_SIMD = SIMD_UNEVEN && PUMPED_COMPUTE ? 
SIMD+1 : SIMD; + localparam int unsigned DSP_SIMD = EFFECTIVE_SIMD/(PUMPED_COMPUTE+1); + typedef logic [PE -1:0][DSP_SIMD-1:0][WEIGHT_WIDTH -1:0] dsp_w_t; + typedef logic [ACT_PE-1:0][DSP_SIMD-1:0][ACTIVATION_WIDTH-1:0] dsp_a_t; + + uwire dsp_clk; + uwire dsp_en; + + uwire dsp_last; + uwire dsp_zero; + uwire dsp_w_t dsp_w; + uwire dsp_a_t dsp_a; + + uwire dsp_vld; + uwire dsp_p_t dsp_p; + + if(!PUMPED_COMPUTE) begin : genUnpumpedCompute + assign dsp_clk = clk; + assign dsp_en = en; + + assign dsp_last = alast && avld; + assign dsp_zero = !istb; + assign dsp_w = mvu_w; + assign dsp_a = amvau_i; + + assign ovld = dsp_vld; + assign odat = dsp_p; + end : genUnpumpedCompute + else begin : genPumpedCompute + assign dsp_clk = clk2x; + + // Identify second fast cycle just before active slow clock edge + logic Active = 0; + if(1) begin : blkActive + uwire clk_lut[2]; // Put some LUT delay on the input from the fast clock net + (* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut0(.O(clk_lut[0]), .I0(clk)); + (* DONT_TOUCH = "TRUE", HLUTNM = "CLK_LUT" *) LUT1 #(.INIT(2'b10)) lut1(.O(clk_lut[1]), .I0(clk_lut[0])); + always_ff @(posedge clk2x) Active <= clk_lut[1]; + end : blkActive + + // The input for a slow cycle is split across two fast cycles along the SIMD dimension. + // - Both fast cycles are controlled by the same enable state. + // - A zero cycle is duplicated across both fast cycles. + // - The last flag must be restricted to the second fast cycle. + + dsp_w_t W = 'x; + for(genvar pe = 0; pe < PE; pe++) begin : genPERegW + + uwire [2*DSP_SIMD-1:0][WEIGHT_WIDTH-1:0] w; + for(genvar i = 0; i < SIMD; i++) assign w[i] = mvu_w[pe][i]; + for(genvar i = SIMD; i < 2*DSP_SIMD; i++) assign w[i] = 0; + + always_ff @(posedge clk2x) begin + if(rst) W[pe] <= 'x; + else if(en) W[pe] <= w[(Active? 
DSP_SIMD : 0) +: DSP_SIMD]; + end + + end : genPERegW + + dsp_a_t A = 'x; + for(genvar pe = 0; pe < ACT_PE; pe++) begin : genPERegA + + uwire [2*DSP_SIMD-1:0][ACTIVATION_WIDTH-1:0] a; + for(genvar i = 0; i < SIMD; i++) assign a[i] = amvau_i[pe][i]; + for(genvar i = SIMD; i < 2*DSP_SIMD; i++) assign a[i] = 0; + + always_ff @(posedge clk2x) begin + if(rst) A[pe] <= 'x; + else if(en) A[pe] <= a[(Active? DSP_SIMD : 0) +: DSP_SIMD]; + end + + end : genPERegA + + logic Zero = 1; + logic Last = 0; + always_ff @(posedge clk2x) begin + if(rst) begin + Zero <= 1; + Last <= 0; + end + else if(en) begin + Zero <= !istb; + Last <= alast && avld && Active; + end + end + + assign dsp_en = en; + assign dsp_last = Last; + assign dsp_zero = Zero; + assign dsp_w = W; + assign dsp_a = A; + + // Since no two consecutive last cycles will ever be asserted on the input, + // valid outputs will also always be spaced by, at least, one other cycle. + // We can always hold a captured output for two cycles to allow the slow + // clock to pick it up. 
+ logic Vld = 0; + dsp_p_t P = 'x; + always_ff @(posedge clk2x) begin + if(rst) begin + Vld <= 0; + P <= 'x; + end + else if(en) begin + if(dsp_vld) P <= dsp_p; + Vld <= dsp_vld || (Vld && !Active); + end + end + assign ovld = Vld; + assign odat = P; + + end : genPumpedCompute + + case(COMPUTE_CORE) + "mvu_vvu_8sx9_dsp58": + mvu_vvu_8sx9_dsp58 #(.IS_MVU(IS_MVU), .PE(PE), .SIMD(DSP_SIMD), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .clk(dsp_clk), .rst, .en(dsp_en), + .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), + .vld(dsp_vld), .p(dsp_p) + ); + "mvu_4sx4u": + mvu_4sx4u #(.PE(PE), .SIMD(DSP_SIMD), .ACCU_WIDTH(ACCU_WIDTH), .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .clk(dsp_clk), .rst, .en(dsp_en), + .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), + .vld(dsp_vld), .p(dsp_p) + ); + "mvu_8sx8u_dsp48": + mvu_8sx8u_dsp48 #(.PE(PE), .SIMD(DSP_SIMD), .ACCU_WIDTH(ACCU_WIDTH), .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL)) core ( + .clk(dsp_clk), .rst, .en(dsp_en), + .last(dsp_last), .zero(dsp_zero), .w(dsp_w), .a(dsp_a), + .vld(dsp_vld), .p(dsp_p) + ); + default: initial begin + $error("Unrecognized COMPUTE_CORE '%s'", COMPUTE_CORE); + $finish; + end + endcase + + end : blkDsp + +//-------------------- Output register slice --------------------\\ + // Make `en`computation independent from external inputs. + // Drive all outputs from registers. 
+ struct packed { + logic rdy; + logic [PE-1:0][ACCU_WIDTH-1:0] dat; + } A = '{ rdy: 1, default: 'x }; // side-step register used when encountering backpressure + struct packed { + logic vld; + logic [PE-1:0][ACCU_WIDTH-1:0] dat; + } B = '{ vld: 0, default: 'x }; // ultimate output register + + assign en = A.rdy; + uwire b_load = !B.vld || m_axis_output_tready; + + always_ff @(posedge clk) begin + if(rst) begin + A <= '{ rdy: 1, default: 'x }; + B <= '{ vld: 0, default: 'x }; + end + else begin + if(A.rdy) A.dat <= odat; + A.rdy <= (A.rdy && !ovld) || b_load; + + if(b_load) begin + B <= '{ + vld: ovld || !A.rdy, + dat: A.rdy? odat : A.dat + }; + end + end + end + assign m_axis_output_tvalid = B.vld; + // Why would we need a sign extension here potentially creating a higher signal load into the next FIFO? + // These extra bits should never be used. Why not 'x them out? + assign m_axis_output_tdata = { {(OUTPUT_STREAM_WIDTH_BA-OUTPUT_STREAM_WIDTH){B.dat[PE-1][ACCU_WIDTH-1]}}, B.dat}; + +endmodule : mvu_vvu_axi diff --git a/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v new file mode 100644 index 0000000000..50c15c1b02 --- /dev/null +++ b/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v @@ -0,0 +1,97 @@ +/****************************************************************************** + * Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Verilog AXI-lite wrapper for MVU & VVU. + *****************************************************************************/ + +module $MODULE_NAME_AXI_WRAPPER$ #( + parameter IS_MVU = $IS_MVU$, + parameter COMPUTE_CORE = "$COMPUTE_CORE$", + parameter PUMPED_COMPUTE = 0, + parameter MW = $MW$, + parameter MH = $MH$, + parameter PE = $PE$, + parameter SIMD = $SIMD$, + parameter ACTIVATION_WIDTH = $ACTIVATION_WIDTH$, + parameter WEIGHT_WIDTH = $WEIGHT_WIDTH$, + parameter ACCU_WIDTH = $ACCU_WIDTH$, + parameter SIGNED_ACTIVATIONS = $SIGNED_ACTIVATIONS$, + parameter SEGMENTLEN = $SEGMENTLEN$, + parameter FORCE_BEHAVIORAL = $FORCE_BEHAVIORAL$, + + // Safely deducible parameters + parameter WEIGHT_STREAM_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8 * 8, + parameter INPUT_STREAM_WIDTH_BA = ((IS_MVU == 1 ? 
1 : PE) * SIMD * ACTIVATION_WIDTH + 7) / 8 * 8, + parameter OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8 +)( + // Global Control + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF weights_V:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *) + (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) + input ap_clk, + // (* X_INTERFACE_PARAMETER = "ASSOCIATED_RESET ap_rst_n" *) + // (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk2x CLK" *) + // input ap_clk2x, + (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) + input ap_rst_n, + + // Weight Stream + input [WEIGHT_STREAM_WIDTH_BA-1:0] weights_V_TDATA, + input weights_V_TVALID, + output weights_V_TREADY, + // Input Stream + input [INPUT_STREAM_WIDTH_BA-1:0] in0_V_TDATA, + input in0_V_TVALID, + output in0_V_TREADY, + // Output Stream + output [OUTPUT_STREAM_WIDTH_BA-1:0] out_V_TDATA, + output out_V_TVALID, + input out_V_TREADY +); + +mvu_vvu_axi #( + .IS_MVU(IS_MVU), .COMPUTE_CORE(COMPUTE_CORE), .PUMPED_COMPUTE(PUMPED_COMPUTE), .MW(MW), .MH(MH), .PE(PE), .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), .WEIGHT_WIDTH(WEIGHT_WIDTH), .ACCU_WIDTH(ACCU_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), .SEGMENTLEN(SEGMENTLEN), .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL) + ) inst ( + .ap_clk(ap_clk), + .ap_clk2x(1'b0), // wired to ground since double-pumped compute not enabled through FINN for now + .ap_rst_n(ap_rst_n), + .s_axis_weights_tdata(weights_V_TDATA), + .s_axis_weights_tvalid(weights_V_TVALID), + .s_axis_weights_tready(weights_V_TREADY), + .s_axis_input_tdata(in0_V_TDATA), + .s_axis_input_tvalid(in0_V_TVALID), + .s_axis_input_tready(in0_V_TREADY), + .m_axis_output_tdata(out_V_TDATA), + .m_axis_output_tvalid(out_V_TVALID), + .m_axis_output_tready(out_V_TREADY) +); + +endmodule // $MODULE_NAME_AXI_WRAPPER$ diff --git a/finn-rtllib/mvu/replay_buffer.sv b/finn-rtllib/mvu/replay_buffer.sv new file mode 100644 index 0000000000..3e2766f63d --- /dev/null +++ b/finn-rtllib/mvu/replay_buffer.sv @@ -0,0 +1,181 @@ 
+/****************************************************************************** + * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Replay buffer for counted sequences on an AXI-lite stream. + * @author Thomas B. 
Preußer + *****************************************************************************/ + +module replay_buffer #( + int unsigned LEN, // Sequence length + int unsigned REP, // Sequence replay count + int unsigned W // Data width +)( + input logic clk, + input logic rst, + + input logic [W-1:0] idat, + input logic ivld, + output logic irdy, + + output logic [W-1:0] odat, + output logic olast, + output logic ofin, + output logic ovld, + input logic ordy +); + + if(LEN == 0) initial begin + $error("%m: Illegal zero sequence LEN."); + $finish; + end + if(REP == 0) initial begin + $error("%m: Illegal zero REP count."); + $finish; + end + + // Track position in Sequence + uwire last_item; + uwire shift; + if(LEN == 1) assign last_item = 1; + else begin + typedef logic [$clog2(LEN)-1:0] count_t; + count_t Count = 0; + logic Last = 0; + always_ff @(posedge clk) begin + if(rst) begin + Count <= 0; + Last <= 0; + end + else if(shift) begin + Count <= Count + (Last? 2**$clog2(LEN)-LEN+1 : 1); + Last <= (((LEN-2) & ~Count) == 0) && ((LEN&1) || !Last); + end + end + assign last_item = Last; + end + + if(REP == 1) begin + assign shift = ivld && ordy; + + assign irdy = ordy; + assign odat = idat; + assign olast = last_item; + assign ofin = last_item; + assign ovld = ivld; + end + else begin + + // Track Repetitions + uwire last_rep; + if(1) begin : blkRep + typedef logic [$clog2(REP)-1:0] rep_t; + rep_t RepCnt = 0; + logic RepLst = 0; + always_ff @(posedge clk) begin + if(rst) begin + RepCnt <= 0; + RepLst <= 0; + end + else if(last_item && shift) begin + RepCnt <= RepCnt + (RepLst? 2**$clog2(REP)-REP+1 : 1); + RepLst <= (((REP-2) & ~RepCnt) == 0) && ((REP&1) || !RepLst); + end + end + assign last_rep = RepLst; + end : blkRep + + localparam int unsigned AWIDTH = LEN < 2? 
1 : $clog2(LEN); + typedef logic [AWIDTH :0] ptr_t; // pointers with additional generational MSB + typedef logic [W -1:0] data_t; + + // Output Registers + data_t ODat; + logic OVld = 0; + logic OLst = 'x; + logic OFin = 'x; + assign odat = ODat; + assign olast = OLst; + assign ofin = OFin; + assign ovld = OVld; + + // Buffer Memory Management + data_t Mem[2**AWIDTH]; + ptr_t WP = 0; // Write Pointer + ptr_t RP = 0; // Read Pointer + ptr_t FP = 0; // Free Pointer + + // Operational Guards + // Occupancy: WP-FP + // WP-FP < 2**AWIDTH -> writing allowed + // - increments WP + // Availability: WP-RP + // WP-RP > 0 -> reading allowed + // - increments RP, last in sequence rewinds to FP for non-final repetition + // - increments FP in last repetition + assign irdy = !((WP-FP) >> AWIDTH); + + uwire wr = irdy && ivld; + uwire rd = !OVld || ordy; + always_ff @(posedge clk) begin + if(wr) Mem[WP[AWIDTH-1:0]] <= idat; + if(rd) ODat <= Mem[RP[AWIDTH-1:0]]; + end + + uwire vld = (RP != WP); + assign shift = rd && vld; + always_ff @(posedge clk) begin + if(rst) begin + WP <= 0; + RP <= 0; + FP <= 0; + + OVld <= 0; + OLst <= 'x; + OFin <= 'x; + end + else begin + if(wr) WP <= WP + 1; + if(rd) begin + if(vld) begin + automatic logic rewind = last_item && !last_rep; + RP <= RP + (rewind? 2**(AWIDTH+1)-LEN+1 : 1); + FP <= FP + last_rep; + end + + OVld <= vld; + OLst <= last_item; + OFin <= last_rep && last_item; + end + end + end + + end + +endmodule : replay_buffer diff --git a/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv b/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv new file mode 100644 index 0000000000..34b5d8eb53 --- /dev/null +++ b/finn-rtllib/mvu/tb/mvu_8sx9_tb.sv @@ -0,0 +1,165 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for MVU core compute kernel. 
+ *****************************************************************************/ + +module mvu_8sx9_tb(); + +//-------------------- Simulation parameters --------------------\\ + // Matrix & parallelism config + localparam int unsigned MH = 256; + localparam int unsigned PE = 16; + localparam int unsigned MW = 600; + localparam int unsigned SIMD = 60; + localparam int unsigned SEGMENTLEN = 4; + // Bit-width config + localparam int unsigned ACTIVATION_WIDTH = 8; + localparam int unsigned WEIGHT_WIDTH = 4; + localparam bit SIGNED_ACTIVATIONS = 1; + // Simulation constants + localparam int unsigned NF = MH/PE; + localparam int unsigned SF = MW/SIMD; + localparam int unsigned NUM_OF_DSP = SIMD/3; + + typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; + typedef activation_t activation_vector_t[SF]; + + function activation_vector_t init_ACTIVATIONS; + automatic activation_vector_t res; + std::randomize(res); + return res; + endfunction : init_ACTIVATIONS + + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; + typedef weight_t weight_matrix_t[NF][SF]; + + function weight_matrix_t init_WEIGHTS; + automatic weight_matrix_t res; + std::randomize(res); + return res; + endfunction : init_WEIGHTS; + + typedef logic signed [PE-1:0][57:0] output_t; + typedef output_t output_vector_t [NF]; + + function output_vector_t check_output(activation_vector_t a, weight_matrix_t w); + automatic output_vector_t res = '{default: 0}; + for (int j = 0; j 1) && !rst; + end + + // Compare computed output against golden output when vld flag is raised by DUT + always_ff @(posedge clk iff (vld && en)) begin + foreach(p[i]) begin + assert ($signed(p[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + else begin + $error(">>> [t=%0t] TEST failed (NF=%0d)! 
Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(p[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + $stop; + end + end + NF_CNT += 1; + end + + // Instantiate DUT + mvu_8sx9 #( + .PE(PE), + .SIMD(SIMD), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .SEGMENTLEN(SEGMENTLEN) + ) + dut ( + .clk, .rst, .en, .last, .zero, .a, .w, .vld, .p + ); + +endmodule : mvu_8sx9_tb diff --git a/finn-rtllib/mvu/tb/mvu_axi_tb.sv b/finn-rtllib/mvu/tb/mvu_axi_tb.sv new file mode 100644 index 0000000000..4ed7b4bf5f --- /dev/null +++ b/finn-rtllib/mvu/tb/mvu_axi_tb.sv @@ -0,0 +1,229 @@ +/****************************************************************************** + * Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for MVU AXI wrapper module. + *****************************************************************************/ + +module mvu_axi_tb(); + +//-------------------- Simulation parameters --------------------\\ + // Matrix & parallelism config + localparam bit IS_MVU = 1; + localparam string COMPUTE_CORE = "mvu_4sx4u"; + localparam int unsigned MW = 120; + localparam int unsigned MH = 40; + localparam int unsigned SIMD = 20; + localparam int unsigned PE = 10; + localparam int unsigned SEGMENTLEN = 2.0; + localparam bit FORCE_BEHAVIORAL = 1; + localparam bit M_REG_LUT = 1; + // Bit-width config + localparam int unsigned ACTIVATION_WIDTH = 4; + localparam int unsigned WEIGHT_WIDTH = 4; + localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); + localparam bit SIGNED_ACTIVATIONS = 0; + // Simulation constants + localparam int unsigned NF = MH/PE; + localparam int unsigned SF = MW/SIMD; + localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8; + localparam int unsigned ACTIVATION_WIDTH_BA = (SIMD*ACTIVATION_WIDTH+7)/8*8; + localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH; + localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - SIMD*ACTIVATION_WIDTH; + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; + + // Generate clk and reset signal + logic clk = 0; + always #5ns clk = !clk; + + logic ap_rst_n 
= 0; + initial begin + repeat(16) @(posedge clk); + ap_rst_n <= 1; + end + + uwire ap_clk = clk; + + // Generate activations + typedef logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; + typedef activation_t activation_vector_t[SF]; + + function activation_vector_t init_ACTIVATIONS; + automatic activation_vector_t res; + std::randomize(res); + return res; + endfunction : init_ACTIVATIONS + + activation_vector_t ACTIVATIONS = init_ACTIVATIONS(); + + struct { + activation_t dat; + logic vld; + logic rdy; + } activations; + + initial begin + activations.vld = 0; + activations.dat = 'X; + @(posedge clk iff ap_rst_n); + + for (int i=0; i= 0; + @(posedge clk); + end while (!(activations.vld === 1 && activations.rdy === 1)); + end + + activations.vld <= 0; + activations.dat <= 'x; + end + + // Generate weights + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; + typedef weight_t weight_matrix_t[NF][SF]; + + function weight_matrix_t init_WEIGHTS; + automatic weight_matrix_t res; + std::randomize(res); + return res; + endfunction : init_WEIGHTS; + + weight_matrix_t WEIGHTS = init_WEIGHTS(); + + struct { + weight_t dat; + logic vld; + logic rdy; + } weights; + + initial begin + weights.vld = 0; + weights.dat = 'X; + @(posedge clk iff ap_rst_n); + + weights.vld <= 1; + for (int i=0; i1 + // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] 
+ // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: + // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to + // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i) + for (int i = 0; i < NF; i++) begin + for (int j = 0; j < SF; j++) begin + for (int k = 0; k < PE; k++) begin + for (int l = 0; l < SIMD; l++) begin + if (SIGNED_ACTIVATIONS) + res[i][k] = $signed(res[i][k]) + $signed(a[j][l]) * $signed(w[i][j][k][l]); + else + res[i][k] = $signed(res[i][k]) + $signed({1'b0, a[j][l]}) * $signed(w[i][j][k][l]); + end + end + end + end + return res; + endfunction : check_output; + + output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS); + + int unsigned NF_CNT = 0; + initial begin + outputs.rdy = 0; + while (NF_CNT < NF) begin + // Loop until both rdy & vld are asserted + do begin + outputs.rdy <= $urandom()%7 >= 0; + @(posedge clk iff ap_rst_n); + end while (!(outputs.rdy === 1 && outputs.vld === 1)); + + // Compare produced outputs against golden outputs + foreach(outputs.dat[i]) begin + assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + else begin + $error(">>> [t=%0t] TEST failed (NF=%0d)! 
Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + $stop; + end + end + + NF_CNT += 1; + end + + $finish; + end + + // Instantiate DUT + mvu_vvu_axi #( + .IS_MVU(IS_MVU), + .COMPUTE_CORE(COMPUTE_CORE), + .MW(MW), + .MH(MH), + .PE(PE), + .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .SEGMENTLEN(SEGMENTLEN), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL), + .M_REG_LUT(M_REG_LUT) + ) + dut ( + .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld), + .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld), + .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), + .m_axis_output_tready(outputs.rdy) + ); + +endmodule : mvu_axi_tb diff --git a/finn-rtllib/mvu/tb/mvu_dsp58_tb.sv b/finn-rtllib/mvu/tb/mvu_dsp58_tb.sv new file mode 100644 index 0000000000..108980c497 --- /dev/null +++ b/finn-rtllib/mvu/tb/mvu_dsp58_tb.sv @@ -0,0 +1,142 @@ +module mvu_dsp58_tb; + + localparam int unsigned N = 1000; + + localparam int unsigned MW = 12; + localparam int unsigned MH = 4; + localparam int unsigned PE = 2; + localparam int unsigned SIMD = 6; + localparam int unsigned ACTIVATION_WIDTH = 8; + localparam int unsigned WEIGHT_WIDTH = 8; + localparam int unsigned ACCU_WIDTH = 24; + + //- Global Control ------------------ + logic clk = 1; + logic clk2x = 1; + always #5ns clk = !clk; + always #2.5ns clk2x = !clk2x; + + logic rst = 1; + initial begin + repeat(8) @(posedge clk); + rst <= 0; + end + + //- DUTs ---------------------------- + + // Weight Stream + logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] s_axis_weights_tdata; + logic s_axis_weights_tvalid[2]; + uwire s_axis_weights_tready[2]; + + // Input Stream + 
logic [SIMD-1:0][ACTIVATION_WIDTH-1:0] s_axis_input_tdata; + logic s_axis_input_tvalid[2]; + uwire s_axis_input_tready[2]; + + // Output Stream + uwire [PE-1:0][ACCU_WIDTH-1:0] m_axis_output_tdata[2]; + uwire m_axis_output_tvalid[2]; + logic m_axis_output_tready[2]; + + for(genvar i = 0; i < 2; i++) begin : genDUTs + mvu_vvu_axi #( + .IS_MVU(1), + .COMPUTE_CORE("mvu_vvu_8sx9_dsp58"), + .MW(MW), .MH(MH), + .PE(PE), .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), + .PUMPED_COMPUTE(i) + ) dut ( + .ap_clk(clk), .ap_clk2x(clk2x), .ap_rst_n(!rst), + .s_axis_weights_tdata, .s_axis_weights_tvalid(s_axis_weights_tvalid[i]), .s_axis_weights_tready(s_axis_weights_tready[i]), + .s_axis_input_tdata, .s_axis_input_tvalid (s_axis_input_tvalid [i]), .s_axis_input_tready (s_axis_input_tready [i]), + .m_axis_output_tdata(m_axis_output_tdata[i]), .m_axis_output_tvalid (m_axis_output_tvalid [i]), .m_axis_output_tready (m_axis_output_tready [i]) + ); + end : genDUTs + + + //- Stimuli ------------------------- + + // Weight Feed + initial begin + s_axis_weights_tvalid = '{ default: 0 }; + s_axis_weights_tdata = 'x; + @(posedge clk iff !rst); + + repeat(N * (MH/PE)*(MW/SIMD)) begin + automatic type(s_axis_weights_tdata) weights; + std::randomize(weights); + s_axis_weights_tdata <= weights; + s_axis_weights_tvalid <= '{ default: 1 }; + fork + begin + @(posedge clk iff s_axis_weights_tready[0]); + s_axis_weights_tvalid[0] <= 0; + end + begin + @(posedge clk iff s_axis_weights_tready[1]); + s_axis_weights_tvalid[1] <= 0; + end + join + end + end + + // Input Feed + initial begin + s_axis_input_tvalid = '{ default: 0 }; + s_axis_input_tdata = 'x; + @(posedge clk iff !rst); + + repeat(N * (MW/SIMD)) begin + automatic type(s_axis_input_tdata) in; + std::randomize(in); + s_axis_input_tdata <= in; + s_axis_input_tvalid <= '{ default: 1 }; + fork + begin + @(posedge clk iff s_axis_input_tready[0]); + s_axis_input_tvalid[0] <= 0; + end 
+ begin + @(posedge clk iff s_axis_input_tready[1]); + s_axis_input_tvalid[1] <= 0; + end + join + end + end + + // Output Capture and Comparison + initial begin + m_axis_output_tready = '{ default: 0 }; + @(posedge clk iff !rst); + + repeat(N * (MH/PE)) begin + automatic type(m_axis_output_tdata) res; + m_axis_output_tready <= '{ default: 1 }; + fork + begin + @(posedge clk iff m_axis_output_tvalid[0]); + m_axis_output_tready[0] <= 0; + res[0] = m_axis_output_tdata[0]; + end + begin + @(posedge clk iff m_axis_output_tvalid[1]); + m_axis_output_tready[1] <= 0; + res[1] = m_axis_output_tdata[1]; + end + join + assert(res[0] == res[1]) else begin + $error("Output mismatch: %0x <=> %0x", res[0], res[1]); + $stop; + end + while($urandom()%7 < MW/SIMD) @(posedge clk); // Occassional backpressure + end + + $display("Test completed."); + $finish; + end + +endmodule : mvu_dsp58_tb diff --git a/finn-rtllib/mvu/tb/replay_buffer_tb.sv b/finn-rtllib/mvu/tb/replay_buffer_tb.sv new file mode 100644 index 0000000000..5581354e0e --- /dev/null +++ b/finn-rtllib/mvu/tb/replay_buffer_tb.sv @@ -0,0 +1,130 @@ +/****************************************************************************** + * Copyright (C) 2023, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for replay_buffer module. + * @author Thomas B. Preußer + *****************************************************************************/ + +module replay_buffer_tb; + + // Global Control + logic clk = 0; + always #5ns clk = !clk; + uwire rst = 0; + + // DUT Geometries + localparam int unsigned DIMS[3] = '{ 7, 8, 10 }; + localparam int unsigned W = 8; + typedef logic [W-1:0] data_t; + + bit [2**$size(DIMS)-1:0] done = 0; + always_comb begin + if(&done) begin + $display("Test completed."); + $finish; + end + end + + // Parallel DUT Instantiations + for(genvar r = 0; r < $size(DIMS); r++) begin + for(genvar l = 0; l < $size(DIMS); l++) begin + localparam int unsigned REP = DIMS[r]; + localparam int unsigned LEN = DIMS[l]; + + data_t idat; + logic ivld; + uwire irdy; + + uwire data_t odat; + uwire olast; + uwire ofin; + uwire ovld; + logic ordy; + + replay_buffer #(.LEN(LEN), .REP(REP), .W(W)) dut ( + .clk, .rst, + .idat, .ivld, .irdy, + .odat, .olast, .ofin, .ovld, .ordy + ); + + // Input Feed: 0, 1, ..., 10*LEN-1 + initial begin + idat = 'x; + ivld = 0; + @(posedge clk iff !rst); + + for(int unsigned i = 0; i < 10*LEN; i++) begin + idat <= i; + ivld <= 
1; + @(posedge clk iff irdy); + idat <= 'x; + ivld <= 0; + while($urandom()%(REP-1) != 0) @(posedge clk); + end + end + + // Output Check + initial begin + automatic int unsigned base = 0; + + ordy = 0; + @(posedge clk iff !rst); + + for(int unsigned k = 0; k < 10; k++) begin + for(int unsigned j = 0; j < REP; j++) begin + for(int unsigned i = 0; i < LEN; i++) begin + ordy <= 1; + @(posedge clk iff ovld); + assert(odat == base+i) else begin + $error("#%0d.%0d: Data mismatch: %0d instead of %0d.", r, l, odat, base+i); + $stop; + end + assert(olast == (i == LEN-1)) else begin + $error("#%0d.%0d: Last mismatch.", r, l); + $stop; + end + assert(ofin == ((i == LEN-1) && (j == REP-1))) else begin + $error("#%0d.%0d: Fin mismatch.", r, l); + $stop; + end + + ordy <= 0; + while($urandom()%13 == 0) @(posedge clk); + end + end + base += LEN; + end + + done[$size(DIMS)*r + l] <= 1; + end + end + end + +endmodule : replay_buffer_tb diff --git a/finn-rtllib/mvu/tb/vvu_axi_tb.sv b/finn-rtllib/mvu/tb/vvu_axi_tb.sv new file mode 100644 index 0000000000..853dcc6e17 --- /dev/null +++ b/finn-rtllib/mvu/tb/vvu_axi_tb.sv @@ -0,0 +1,227 @@ +/****************************************************************************** + * Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for VVU AXI wrapper module. 
+ *****************************************************************************/ + +module vvu_axi_tb(); + +//-------------------- Simulation parameters --------------------\\ + // Matrix & parallelism config + localparam bit IS_MVU = 0; + localparam string COMPUTE_CORE = "mvu_vvu_8sx9_dsp58"; + localparam int unsigned MW = 25; // Kernel*Kernel + localparam int unsigned MH = 4; // Channels + localparam int unsigned SIMD = 1; // MW%SIMD == 0 + localparam int unsigned PE = 1; // MH%PE == 0 + localparam int unsigned SEGMENTLEN = 1.0; + localparam bit FORCE_BEHAVIORAL = 1; + localparam bit M_REG_LUT = 1; + // Bit-width config + localparam int unsigned ACTIVATION_WIDTH = 4; + localparam int unsigned WEIGHT_WIDTH = 4; + localparam int unsigned ACCU_WIDTH = ACTIVATION_WIDTH+WEIGHT_WIDTH+$clog2(MW); + localparam bit SIGNED_ACTIVATIONS = 1; + // Simulation constants + localparam int unsigned NF = MH/PE; + localparam int unsigned SF = MW/SIMD; + localparam int unsigned WEIGHT_WIDTH_BA = (PE*SIMD*WEIGHT_WIDTH+7)/8*8; + localparam int unsigned ACTIVATION_WIDTH_BA = (PE*SIMD*ACTIVATION_WIDTH+7)/8*8; + localparam int unsigned WEIGHT_WIDTH_BA_DELTA = WEIGHT_WIDTH_BA - PE*SIMD*WEIGHT_WIDTH; + localparam int unsigned ACTIVATION_WIDTH_BA_DELTA = ACTIVATION_WIDTH_BA - PE*SIMD*ACTIVATION_WIDTH; + localparam int unsigned OUTPUT_STREAM_WIDTH_BA = (PE*ACCU_WIDTH + 7)/8 * 8; + + // Generate clk and reset signal + logic clk = 0; + always #5ns clk = !clk; + + logic ap_rst_n = 0; + initial begin + repeat(16) @(posedge clk); + ap_rst_n <= 1; + end + + uwire ap_clk = clk; + + // Generate activations + typedef logic [PE*SIMD-1:0][ACTIVATION_WIDTH-1:0] activation_t; + typedef activation_t activation_vector_t[NF*SF]; + + function activation_vector_t init_ACTIVATIONS; + automatic activation_vector_t res; + std::randomize(res); + return res; + endfunction : init_ACTIVATIONS + + activation_vector_t ACTIVATIONS = init_ACTIVATIONS(); + + struct { + activation_t dat; + logic vld; + logic rdy; + } 
activations; + + initial begin + activations.vld = 0; + activations.dat = 'X; + @(posedge clk iff ap_rst_n); + + for (int i=0; i= 0; + @(posedge clk); + end while (!(activations.vld === 1 && activations.rdy === 1)); + end + + activations.vld <= 0; + activations.dat <= 'x; + end + + // Generate weights + typedef logic [PE-1:0][SIMD-1:0][WEIGHT_WIDTH-1:0] weight_t; + typedef weight_t weight_matrix_t[NF][SF]; + + function weight_matrix_t init_WEIGHTS; + automatic weight_matrix_t res; + std::randomize(res); + return res; + endfunction : init_WEIGHTS; + + weight_matrix_t WEIGHTS = init_WEIGHTS(); + + struct { + weight_t dat; + logic vld; + logic rdy; + } weights; + + initial begin + weights.vld = 0; + weights.dat = 'X; + @(posedge clk iff ap_rst_n); + + weights.vld <= 1; + for (int i=0; i1 + // Hence, we need to 'untangle' the input stream, i.e. [..][SIMD*PE][..] --> [..][PE][SIMD][..] + // Note that for each 'SIMD' (S) and 'PE' (P) element, we have something like: + // (S_0, P_0), ..., (S_0, P_i), (S_1, P_0), ..., (S_1, P_i), ..., (S_i, P_i) which we need to 'untangle' to + // (S_0, P_0), ..., (S_i, P_0), (S_0, P_1), ..., (S_i,, P_1), ..., (S_i, P_i) + for (int i = 0; i < NF; i++) begin + for (int j = 0; j < SF; j++) begin + for (int k = 0; k < PE; k++) begin + for (int l = 0; l < SIMD; l++) begin + if (SIGNED_ACTIVATIONS) + res[i][k] = $signed(res[i][k]) + $signed(a[i*SF+j][k + l*PE]) * $signed(w[i][j][k][l]); + else + res[i][k] = $signed(res[i][k]) + $signed({1'b0, a[i*SF+j][k + l*PE]}) * $signed(w[i][j][k][l]); + end + end + end + end + return res; + endfunction : check_output; + + output_vector_t GOLDEN_OUTPUT = check_output(ACTIVATIONS, WEIGHTS); + + int unsigned NF_CNT = 0; + initial begin + outputs.rdy = 0; + while (NF_CNT < NF) begin + // Loop until both rdy & vld are asserted + do begin + outputs.rdy <= $urandom()%7 >= 0; + @(posedge clk iff ap_rst_n); + end while (!(outputs.rdy === 1 && outputs.vld === 1)); + + // Compare produced outputs against golden 
outputs + foreach(outputs.dat[i]) begin + assert ($signed(outputs.dat[i]) == $signed(GOLDEN_OUTPUT[NF_CNT][i])) $display(">>> [t=%0t] Test succeeded (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + else begin + $error(">>> [t=%0t] TEST failed (NF=%0d)! Computed / GOLDEN = %0d / %0d", $time, NF_CNT, $signed(outputs.dat[i]), $signed(GOLDEN_OUTPUT[NF_CNT][i])); + $stop; + end + end + + NF_CNT += 1; + end + + $finish; + end + + // Instantiate DUT + mvu_vvu_axi #( + .IS_MVU(IS_MVU), + .COMPUTE_CORE(COMPUTE_CORE), + .MW(MW), + .MH(MH), + .PE(PE), + .SIMD(SIMD), + .ACTIVATION_WIDTH(ACTIVATION_WIDTH), + .WEIGHT_WIDTH(WEIGHT_WIDTH), + .ACCU_WIDTH(ACCU_WIDTH), + .SIGNED_ACTIVATIONS(SIGNED_ACTIVATIONS), + .SEGMENTLEN(SEGMENTLEN), + .FORCE_BEHAVIORAL(FORCE_BEHAVIORAL), + .M_REG_LUT(M_REG_LUT) + ) + dut ( + .ap_clk, .ap_rst_n, .s_axis_weights_tdata({ {WEIGHT_WIDTH_BA_DELTA{1'b0}}, weights.dat }), .s_axis_weights_tvalid(weights.vld), + .s_axis_weights_tready(weights.rdy), .s_axis_input_tdata({ {ACTIVATION_WIDTH_BA_DELTA{1'b0}}, activations.dat }), .s_axis_input_tvalid(activations.vld), + .s_axis_input_tready(activations.rdy), .m_axis_output_tdata(outputs.dat), .m_axis_output_tvalid(outputs.vld), + .m_axis_output_tready(outputs.rdy) + ); + +endmodule : vvu_axi_tb diff --git a/finn-rtllib/swg/swg_common.sv b/finn-rtllib/swg/swg_common.sv new file mode 100644 index 0000000000..c1d388550a --- /dev/null +++ b/finn-rtllib/swg/swg_common.sv @@ -0,0 +1,248 @@ +/****************************************************************************** + * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ + + +// loop controller used for both, "default" and "parallel", implementation styles +module swg_controller +import swg::*; #( + int unsigned LOOP_H_ITERATIONS, + int unsigned LOOP_W_ITERATIONS, + int unsigned LOOP_KH_ITERATIONS, + int unsigned LOOP_KW_ITERATIONS, + int unsigned LOOP_SIMD_ITERATIONS, + + int unsigned INCR_BITWIDTH, + + bit IS_DEPTHWISE, + + int HEAD_INCR_SIMD, + int HEAD_INCR_KW, + int HEAD_INCR_KH, + int HEAD_INCR_W, + int HEAD_INCR_H, + int TAIL_INCR_W, + int TAIL_INCR_H, + int TAIL_INCR_LAST, + + state_e INNERMOST_STATE +)( + input logic clk, + input logic rst_n, + + input logic advance, + output logic [INCR_BITWIDTH-1:0] addr_incr, + output logic [INCR_BITWIDTH-1:0] tail_incr +); + + // state and counters + state_e State = INNERMOST_STATE; + state_e state_next; + + logic signed [$clog2(LOOP_H_ITERATIONS +2)+1-1:0] Counter_loop_h = LOOP_H_ITERATIONS; + logic signed [$clog2(LOOP_W_ITERATIONS +2)+1-1:0] Counter_loop_w = LOOP_W_ITERATIONS; + logic signed [$clog2(LOOP_KH_ITERATIONS +2)+1-1:0] Counter_loop_kh = LOOP_KH_ITERATIONS; + logic signed [$clog2(LOOP_KW_ITERATIONS +2)+1-1:0] Counter_loop_kw = LOOP_KW_ITERATIONS; + logic signed [$clog2(LOOP_SIMD_ITERATIONS+2)+1-1:0] Counter_loop_simd = LOOP_SIMD_ITERATIONS; + + // combinational logic for addr_incr generation + always_comb begin : blkHead + unique case (State) + STATE_START : addr_incr = 0; + STATE_LOOP_SIMD : addr_incr = HEAD_INCR_SIMD; + STATE_LOOP_KW : addr_incr = HEAD_INCR_KW; + STATE_LOOP_KH : addr_incr = HEAD_INCR_KH; + STATE_LOOP_W : addr_incr = HEAD_INCR_W; + STATE_LOOP_H : addr_incr = HEAD_INCR_H; + endcase + end + + // combinational logic for tail_incr generation + uwire tail_incr_inner_condition = IS_DEPTHWISE? (Counter_loop_kh >= 0) : 0; + assign tail_incr = + tail_incr_inner_condition? 1 : + Counter_loop_w >= 0? TAIL_INCR_W : + Counter_loop_h >= 0? 
TAIL_INCR_H : + /* else */ TAIL_INCR_LAST; + + // combinational next state logic + always_comb begin : blkState + state_next = State; + if(State != INNERMOST_STATE) state_next = INNERMOST_STATE; + else begin + if(Counter_loop_simd < 0) begin + state_next = + (Counter_loop_kw >= 0)? STATE_LOOP_KW : + (Counter_loop_kh >= 0)? STATE_LOOP_KH : + (Counter_loop_w >= 0)? STATE_LOOP_W : + (Counter_loop_h >= 0)? STATE_LOOP_H : + /* else */ STATE_START; + end + end + end : blkState + + // sequential logic + always_ff @ (posedge clk) begin + if(!rst_n) begin + State <= INNERMOST_STATE; + Counter_loop_h <= LOOP_H_ITERATIONS; + Counter_loop_w <= LOOP_W_ITERATIONS; + Counter_loop_kh <= LOOP_KH_ITERATIONS; + Counter_loop_kw <= LOOP_KW_ITERATIONS; + Counter_loop_simd <= LOOP_SIMD_ITERATIONS; + end + else if(advance) begin + State <= state_next; + if (State == INNERMOST_STATE) begin + if(Counter_loop_simd >= 0) Counter_loop_simd <= Counter_loop_simd-1; + else begin + Counter_loop_simd <= LOOP_SIMD_ITERATIONS; + if(Counter_loop_kw >= 0) Counter_loop_kw <= Counter_loop_kw-1; + else begin + Counter_loop_kw <= LOOP_KW_ITERATIONS; + if(Counter_loop_kh >= 0) Counter_loop_kh <= Counter_loop_kh-1; + else begin + Counter_loop_kh <= LOOP_KH_ITERATIONS; + if(Counter_loop_w >= 0) Counter_loop_w <= Counter_loop_w-1; + else begin + Counter_loop_w <= LOOP_W_ITERATIONS; + if(Counter_loop_h >= 0) Counter_loop_h <= Counter_loop_h-1; + else Counter_loop_h <= LOOP_H_ITERATIONS; + end + end + end + end + end + end + end + +endmodule : swg_controller + +// buffer used in "default" implementation style +module swg_cyclic_buffer_addressable #( + int unsigned WIDTH, + int unsigned DEPTH, + parameter RAM_STYLE = "auto" +)( + input logic clk, + + input logic write_enable, + input logic [$clog2(DEPTH)-1:0] write_addr, + input logic [WIDTH-1:0] data_in, + + input logic read_enable, + input logic [$clog2(DEPTH)-1:0] read_addr, // absolute (!) 
read address of cyclic buffer + output logic [WIDTH-1:0] data_out +); + + (*ram_style=RAM_STYLE*) logic [WIDTH-1:0] Ram[DEPTH]; + logic [WIDTH-1:0] Out = 'x; + always_ff @(posedge clk) begin + if (read_enable) Out <= Ram[read_addr]; + if (write_enable) Ram[write_addr] <= data_in; + end + assign data_out = Out; + +endmodule : swg_cyclic_buffer_addressable + +// buffer used in "parallel" implementation style +module swg_reg_buffer +#( + int unsigned WIDTH = 1, + int unsigned DEPTH = 1 +) +( + input logic clk, + input logic shift_enable, + input logic [WIDTH-1:0] shift_in, + output logic [WIDTH-1:0] shift_out, + output logic [WIDTH*DEPTH-1:0] data_out +); + +logic [WIDTH-1:0] Data [DEPTH-1:0]; + +assign shift_out = Data[DEPTH-1]; + +for (genvar e=0; e 1) Data[DEPTH-1:1] <= Data[DEPTH-2:0]; + Data[0] <= shift_in; + end +end +endmodule : swg_reg_buffer + +// buffer used in "parallel" implementation style +module swg_ram_buffer +#( + int unsigned WIDTH, + int unsigned DEPTH, + parameter RAM_STYLE = "auto" +) +( + input logic clk, + input logic rst_n, + input logic shift_enable, + input logic [WIDTH-1:0] shift_in, + output logic [WIDTH-1:0] shift_out +); + +logic [WIDTH-1:0] Out_reg; +assign shift_out = Out_reg; + +logic [$clog2(DEPTH)-1:0] Addr_w = 0; +logic [$clog2(DEPTH)-1:0] Addr_r = 0; + +(*ram_style=RAM_STYLE*) logic [WIDTH-1:0] Ram [DEPTH-1:0]; + +always_ff @(posedge clk) begin + if (rst_n == 1'b0) begin + Addr_w <= 0; + Addr_r <= 1; + end else begin + if (shift_enable) begin + Ram[Addr_w] <= shift_in; + Out_reg <= Ram[Addr_r]; + + if (Addr_w == DEPTH-1) + Addr_w <= 0; + else + Addr_w <= Addr_w + 1; + + if (Addr_r == DEPTH-1) + Addr_r <= 0; + else + Addr_r <= Addr_r + 1; + end + end +end +endmodule : swg_ram_buffer diff --git a/finn-rtllib/swg/swg_pkg.sv b/finn-rtllib/swg/swg_pkg.sv new file mode 100644 index 0000000000..1200310aca --- /dev/null +++ b/finn-rtllib/swg/swg_pkg.sv @@ -0,0 +1,41 @@ 
+/****************************************************************************** + * Copyright (C) 2023, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ + +package swg; + typedef enum logic [2:0] { + STATE_START, + STATE_LOOP_SIMD, + STATE_LOOP_KW, + STATE_LOOP_KH, + STATE_LOOP_W, + STATE_LOOP_H + } state_e; +endpackage : swg diff --git a/finn-rtllib/swg/swg_template_axilite.v b/finn-rtllib/swg/swg_template_axilite.v index 9479c7f80d..1f39e4440e 100644 --- a/finn-rtllib/swg/swg_template_axilite.v +++ b/finn-rtllib/swg/swg_template_axilite.v @@ -1,8 +1,35 @@ +/****************************************************************************** + * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). 
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ -`timescale 1 ns / 1 ps - -module $TOP_MODULE_NAME$_axilite # -( +module $TOP_MODULE_NAME$_axilite #( // Users to add parameters here // User parameters ends @@ -12,8 +39,7 @@ module $TOP_MODULE_NAME$_axilite # parameter integer C_S_AXI_DATA_WIDTH = 32, // Width of S_AXI address bus parameter integer C_S_AXI_ADDR_WIDTH = 6 -) -( +)( // Users to add ports here output wire [C_S_AXI_DATA_WIDTH-1:0] cfg_reg0, output wire [C_S_AXI_DATA_WIDTH-1:0] cfg_reg1, diff --git a/finn-rtllib/swg/swg_template_default.sv b/finn-rtllib/swg/swg_template_default.sv index 06e65e9111..78a8d0a3b9 100644 --- a/finn-rtllib/swg/swg_template_default.sv +++ b/finn-rtllib/swg/swg_template_default.sv @@ -28,141 +28,6 @@ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -module $TOP_MODULE_NAME$_controller #( - int unsigned LOOP_H_ITERATIONS = $LOOP_H_ITERATIONS$, - int unsigned LOOP_W_ITERATIONS = $LOOP_W_ITERATIONS$, - int unsigned LOOP_KH_ITERATIONS = $LOOP_KH_ITERATIONS$, - int unsigned LOOP_KW_ITERATIONS = $LOOP_KW_ITERATIONS$, - int unsigned LOOP_SIMD_ITERATIONS = $LOOP_SIMD_ITERATIONS$, - - int unsigned INCR_BITWIDTH = $INCR_BITWIDTH$, - - bit IS_DEPTHWISE = $IS_DEPTHWISE$ -)( - input logic clk, - input logic rst_n, - - input logic advance, - output logic [INCR_BITWIDTH-1:0] addr_incr, - output logic [INCR_BITWIDTH-1:0] tail_incr -); - - // state and counters - typedef enum logic [2:0] { - STATE_START, - STATE_LOOP_SIMD, - STATE_LOOP_KW, - STATE_LOOP_KH, - STATE_LOOP_W, - STATE_LOOP_H - } state_e; - state_e State = $INNERMOST_STATE$; - state_e state_next; - - logic signed [$clog2(LOOP_H_ITERATIONS +2)+1-1:0] Counter_loop_h = LOOP_H_ITERATIONS; - logic signed [$clog2(LOOP_W_ITERATIONS +2)+1-1:0] Counter_loop_w = LOOP_W_ITERATIONS; - logic signed [$clog2(LOOP_KH_ITERATIONS +2)+1-1:0] Counter_loop_kh = LOOP_KH_ITERATIONS; - logic signed [$clog2(LOOP_KW_ITERATIONS +2)+1-1:0] Counter_loop_kw = LOOP_KW_ITERATIONS; - logic signed [$clog2(LOOP_SIMD_ITERATIONS+2)+1-1:0] Counter_loop_simd = LOOP_SIMD_ITERATIONS; - - // combinational logic for addr_incr generation - always_comb begin : blkHead - unique case (State) - 0 : addr_incr = 0; - 1 : addr_incr = $HEAD_INCR_SIMD$; - 2 : addr_incr = $HEAD_INCR_KW$; - 3 : addr_incr = $HEAD_INCR_KH$; - 4 : addr_incr = $HEAD_INCR_W$; - 5 : addr_incr = $HEAD_INCR_H$; - endcase - end - - // combinational logic for tail_incr generation - uwire tail_incr_inner_condition = IS_DEPTHWISE? (Counter_loop_kh >= 0) : 0; - assign tail_incr = - tail_incr_inner_condition? 1 : - Counter_loop_w >= 0? $TAIL_INCR_W$ : - Counter_loop_h >= 0? 
$TAIL_INCR_H$ : - /* else */ $TAIL_INCR_LAST$; - - // combinational next state logic - always_comb begin : blkState - state_next = State; - if(State != $INNERMOST_STATE$) state_next = $INNERMOST_STATE$; - else begin - if(Counter_loop_simd < 0) begin - state_next = - (Counter_loop_kw >= 0)? STATE_LOOP_KW : - (Counter_loop_kh >= 0)? STATE_LOOP_KH : - (Counter_loop_w >= 0)? STATE_LOOP_W : - (Counter_loop_h >= 0)? STATE_LOOP_H : - /* else */ STATE_START; - end - end - end : blkState - - // sequential logic - always_ff @ (posedge clk) begin - if(!rst_n) begin - State <= $INNERMOST_STATE$; - Counter_loop_h <= LOOP_H_ITERATIONS; - Counter_loop_w <= LOOP_W_ITERATIONS; - Counter_loop_kh <= LOOP_KH_ITERATIONS; - Counter_loop_kw <= LOOP_KW_ITERATIONS; - Counter_loop_simd <= LOOP_SIMD_ITERATIONS; - end - else if(advance) begin - State <= state_next; - if (State == $INNERMOST_STATE$) begin - if(Counter_loop_simd >= 0) Counter_loop_simd <= Counter_loop_simd-1; - else begin - Counter_loop_simd <= LOOP_SIMD_ITERATIONS; - if(Counter_loop_kw >= 0) Counter_loop_kw <= Counter_loop_kw-1; - else begin - Counter_loop_kw <= LOOP_KW_ITERATIONS; - if(Counter_loop_kh >= 0) Counter_loop_kh <= Counter_loop_kh-1; - else begin - Counter_loop_kh <= LOOP_KH_ITERATIONS; - if(Counter_loop_w >= 0) Counter_loop_w <= Counter_loop_w-1; - else begin - Counter_loop_w <= LOOP_W_ITERATIONS; - if(Counter_loop_h >= 0) Counter_loop_h <= Counter_loop_h-1; - else Counter_loop_h <= LOOP_H_ITERATIONS; - end - end - end - end - end - end - end - -endmodule : $TOP_MODULE_NAME$_controller - -module $TOP_MODULE_NAME$_cyclic_buffer_addressable #( - int unsigned WIDTH, - int unsigned DEPTH -)( - input logic clk, - - input logic write_enable, - input logic [$clog2(DEPTH)-1:0] write_addr, - input logic [WIDTH-1:0] data_in, - - input logic read_enable, - input logic [$clog2(DEPTH)-1:0] read_addr, // absolute (!) 
read address of cyclic buffer - output logic [WIDTH-1:0] data_out -); - - $RAM_STYLE$ logic [WIDTH-1:0] Ram[DEPTH]; - logic [WIDTH-1:0] Out = 'x; - always_ff @(posedge clk) begin - if (read_enable) Out <= Ram[read_addr]; - if (write_enable) Ram[write_addr] <= data_in; - end - assign data_out = Out; - -endmodule : $TOP_MODULE_NAME$_cyclic_buffer_addressable - module $TOP_MODULE_NAME$_impl #( int BIT_WIDTH, int SIMD, @@ -197,9 +62,10 @@ module $TOP_MODULE_NAME$_impl #( uwire window_buffer_read_enable; uwire [$clog2(BUF_ELEM_TOTAL)-1:0] window_buffer_write_addr; uwire [$clog2(BUF_ELEM_TOTAL)-1:0] window_buffer_read_addr; - $TOP_MODULE_NAME$_cyclic_buffer_addressable #( + swg_cyclic_buffer_addressable #( .WIDTH(BUF_IN_WIDTH), - .DEPTH(BUF_ELEM_TOTAL) + .DEPTH(BUF_ELEM_TOTAL), + .RAM_STYLE($RAM_STYLE$) ) window_buffer_inst ( .clk(ap_clk), @@ -216,7 +82,25 @@ module $TOP_MODULE_NAME$_impl #( uwire advance_controller; uwire signed [INCR_BITWIDTH-1:0] addr_incr; uwire [INCR_BITWIDTH-1:0] tail_incr; - $TOP_MODULE_NAME$_controller controller_inst ( + swg_controller #( + .LOOP_H_ITERATIONS($LOOP_H_ITERATIONS$), + .LOOP_W_ITERATIONS($LOOP_W_ITERATIONS$), + .LOOP_KH_ITERATIONS($LOOP_KH_ITERATIONS$), + .LOOP_KW_ITERATIONS($LOOP_KW_ITERATIONS$), + .LOOP_SIMD_ITERATIONS($LOOP_SIMD_ITERATIONS$), + .HEAD_INCR_SIMD($HEAD_INCR_SIMD$), + .HEAD_INCR_KW($HEAD_INCR_KW$), + .HEAD_INCR_KH($HEAD_INCR_KH$), + .HEAD_INCR_W($HEAD_INCR_W$), + .HEAD_INCR_H($HEAD_INCR_H$), + .TAIL_INCR_W($TAIL_INCR_W$), + .TAIL_INCR_H($TAIL_INCR_H$), + .TAIL_INCR_LAST($TAIL_INCR_LAST$), + .INCR_BITWIDTH($INCR_BITWIDTH$), + .IS_DEPTHWISE($IS_DEPTHWISE$), + .INNERMOST_STATE(swg::$INNERMOST_STATE$) + ) + controller_inst ( .clk(ap_clk), .rst_n(ap_rst_n), .advance(advance_controller), diff --git a/finn-rtllib/swg/swg_template_default_dynamic.sv b/finn-rtllib/swg/swg_template_default_dynamic.sv index eb53978b58..5a6fdda170 100644 --- a/finn-rtllib/swg/swg_template_default_dynamic.sv +++ 
b/finn-rtllib/swg/swg_template_default_dynamic.sv @@ -1,3 +1,34 @@ +/****************************************************************************** + * Copyright (C) 2022-2023, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ + module $TOP_MODULE_NAME$_controller #( int unsigned CNTR_BITWIDTH, int unsigned INCR_BITWIDTH, @@ -27,6 +58,8 @@ module $TOP_MODULE_NAME$_controller #( input logic [INCR_BITWIDTH-1:0] cfg_incr_tail_last ); + import swg::*; + // (dynamic) configuration registers logic [CNTR_BITWIDTH-1:0] Cfg_cntr_simd = $LOOP_SIMD_ITERATIONS$; logic [CNTR_BITWIDTH-1:0] Cfg_cntr_kw = $LOOP_KW_ITERATIONS$; @@ -62,14 +95,6 @@ module $TOP_MODULE_NAME$_controller #( end // state and counters - typedef enum logic [2:0] { - STATE_START, - STATE_LOOP_SIMD, - STATE_LOOP_KW, - STATE_LOOP_KH, - STATE_LOOP_W, - STATE_LOOP_H - } state_e; state_e State = $INNERMOST_STATE$; state_e state_next; @@ -152,31 +177,6 @@ module $TOP_MODULE_NAME$_controller #( endmodule : $TOP_MODULE_NAME$_controller -module $TOP_MODULE_NAME$_cyclic_buffer_addressable #( - int unsigned WIDTH, - int unsigned DEPTH -)( - input logic clk, - - input logic write_enable, - input logic [$clog2(DEPTH)-1:0] write_addr, - input logic [WIDTH-1:0] data_in, - - input logic read_enable, - input logic [$clog2(DEPTH)-1:0] read_addr, // absolute (!) 
read address of cyclic buffer - output logic [WIDTH-1:0] data_out -); - - $RAM_STYLE$ logic [WIDTH-1:0] Ram[DEPTH]; - logic [WIDTH-1:0] Out = 'x; - always_ff @(posedge clk) begin - if (read_enable) Out <= Ram[read_addr]; - if (write_enable) Ram[write_addr] <= data_in; - end - assign data_out = Out; - -endmodule : $TOP_MODULE_NAME$_cyclic_buffer_addressable - module $TOP_MODULE_NAME$_impl #( int BIT_WIDTH, int SIMD, @@ -242,9 +242,10 @@ module $TOP_MODULE_NAME$_impl #( uwire window_buffer_read_enable; uwire [$clog2(BUF_ELEM_TOTAL)-1:0] window_buffer_write_addr; uwire [$clog2(BUF_ELEM_TOTAL)-1:0] window_buffer_read_addr; - $TOP_MODULE_NAME$_cyclic_buffer_addressable #( + swg_cyclic_buffer_addressable #( .WIDTH(BUF_IN_WIDTH), - .DEPTH(BUF_ELEM_TOTAL) + .DEPTH(BUF_ELEM_TOTAL), + .RAM_STYLE($RAM_STYLE$) ) window_buffer_inst ( .clk(ap_clk), diff --git a/finn-rtllib/swg/swg_template_parallel.sv b/finn-rtllib/swg/swg_template_parallel.sv new file mode 100644 index 0000000000..b92f27b2ca --- /dev/null +++ b/finn-rtllib/swg/swg_template_parallel.sv @@ -0,0 +1,216 @@ +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +module $TOP_MODULE_NAME$_wb +#( + int unsigned IN_WIDTH = 1, // bit-width*C*MMV_in + int unsigned OUT_ELEM_WIDTH = 1, // bit-width*C + int unsigned OUT_WIDTH = 1, // bit-width*C*MMV_out + int unsigned BUFFER_ELEM_TOTAL = 1 +) +( + input logic clk, + input logic rst_n, + input logic shift_enable, + input logic [IN_WIDTH-1:0] data_in, + output logic [OUT_WIDTH-1:0] data_out +); + +$GENERATE_REG_FIFOS$ + +$GENERATE_BRAM_FIFOS$ + +// fixed interconnect between linear buffers +$GENERATE_BUFFER_CONNECTION$ + +// fixed REG FIFO -> output mapping +$GENERATE_OUTPUT_MAPPING$ + +endmodule : $TOP_MODULE_NAME$_wb + +module $TOP_MODULE_NAME$_impl #( + int unsigned BIT_WIDTH, + int unsigned SIMD, + int unsigned MMV_IN, + int unsigned MMV_OUT, + int unsigned LAST_READ_ELEM = $LAST_READ_ELEM$, + int unsigned FIRST_WRITE_ELEM = $FIRST_WRITE_ELEM$, + int unsigned LAST_WRITE_ELEM = $LAST_WRITE_ELEM$, + int unsigned BUF_ELEM_TOTAL = $BUF_ELEM_TOTAL$, + int unsigned INCR_BITWIDTH = $INCR_BITWIDTH$ +)( + input logic ap_clk, + input logic ap_rst_n, + + input logic in0_V_V_TVALID, + output logic in0_V_V_TREADY, + 
input logic [BIT_WIDTH * SIMD * MMV_IN-1:0] in0_V_V_TDATA, + + output logic out_V_V_TVALID, + input logic out_V_V_TREADY, + output logic [BIT_WIDTH * SIMD * MMV_OUT-1:0] out_V_V_TDATA +); + // derived constants + localparam int unsigned BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN; + localparam int unsigned BUF_OUT_ELEM_WIDTH = BIT_WIDTH * SIMD; + localparam int unsigned BUF_OUT_WIDTH = BIT_WIDTH * SIMD * MMV_OUT; + + // main buffer instantiation + uwire [BUF_IN_WIDTH -1:0] window_buffer_in; + uwire [BUF_OUT_WIDTH-1:0] window_buffer_out; + uwire window_buffer_shift_enable; + $TOP_MODULE_NAME$_wb + #( + .IN_WIDTH(BUF_IN_WIDTH), + .OUT_ELEM_WIDTH(BUF_OUT_ELEM_WIDTH), + .OUT_WIDTH(BUF_OUT_WIDTH), + .BUFFER_ELEM_TOTAL(BUF_ELEM_TOTAL) + ) + window_buffer_inst + ( + .clk(ap_clk), + .rst_n(ap_rst_n), + .data_in(window_buffer_in), + .shift_enable(window_buffer_shift_enable), + .data_out(window_buffer_out) + ); + + // controller instantiation + uwire advance_controller; + uwire signed [INCR_BITWIDTH-1:0] addr_incr; + uwire [INCR_BITWIDTH-1:0] tail_incr; + swg_controller #( + .LOOP_H_ITERATIONS($LOOP_H_ITERATIONS$), + .LOOP_W_ITERATIONS($LOOP_W_ITERATIONS$), + .LOOP_KH_ITERATIONS($LOOP_KH_ITERATIONS$), + .LOOP_KW_ITERATIONS($LOOP_KW_ITERATIONS$), + .LOOP_SIMD_ITERATIONS($LOOP_SIMD_ITERATIONS$), + .HEAD_INCR_SIMD($HEAD_INCR_SIMD$), + .HEAD_INCR_KW($HEAD_INCR_KW$), + .HEAD_INCR_KH($HEAD_INCR_KH$), + .HEAD_INCR_W($HEAD_INCR_W$), + .HEAD_INCR_H($HEAD_INCR_H$), + .TAIL_INCR_W($TAIL_INCR_W$), + .TAIL_INCR_H($TAIL_INCR_H$), + .TAIL_INCR_LAST($TAIL_INCR_LAST$), + .INCR_BITWIDTH($INCR_BITWIDTH$), + .IS_DEPTHWISE($IS_DEPTHWISE$), + .INNERMOST_STATE(swg::$INNERMOST_STATE$) + ) + controller_inst ( + .clk(ap_clk), + .rst_n(ap_rst_n), + .advance(advance_controller), + .addr_incr(addr_incr), + .tail_incr(tail_incr) + ); + + // counters/address registers + logic signed [$clog2(LAST_READ_ELEM+1)+1-1:0] Newest_buffered_elem = -1; + logic [$clog2(LAST_READ_ELEM+1)+1-1:0] Current_elem = 
FIRST_WRITE_ELEM; + + // control registers/signals + logic Writing_done = 0; + logic Write_done = 0; + uwire write_cmd = !($signed(Current_elem) > Newest_buffered_elem) && !Writing_done;; + uwire write_ok = write_cmd && (out_V_V_TREADY || Write_done); + uwire write_blocked = write_cmd && !out_V_V_TREADY && !Write_done; + + uwire reading_done = Newest_buffered_elem == LAST_READ_ELEM; + uwire read_cmd = !reading_done && (Writing_done || Newest_buffered_elem <= $signed(Current_elem)); + uwire read_ok = read_cmd && in0_V_V_TVALID && !write_blocked; + + // includes waiting on W if W-only cycle: wait only on W no R/W to wait for + uwire advance = read_ok || (!read_cmd && write_ok) || (!read_cmd && !write_cmd); + + // assign buffer control + assign window_buffer_shift_enable = advance; + assign advance_controller = write_ok; + + // assign I/O ports + assign window_buffer_in = in0_V_V_TDATA; + assign out_V_V_TDATA = window_buffer_out; + assign in0_V_V_TREADY = ap_rst_n && read_ok; //only asserted if data is available and we can store it (allowed) + assign out_V_V_TVALID = ap_rst_n && write_cmd && !Write_done; //only asserted if we have data available and it has not been read yet (don't wait for READY from sink) + + // write done logic + always_ff @(posedge ap_clk) begin + if(!ap_rst_n) begin + Write_done <= 1'b0; + end + else begin + if (advance) begin + Write_done <= 1'b0; //reset flag + end else if (write_ok) //successful W in this cycle, but R still outstanding + Write_done <= 1'b1; //write can happen even if read is blocked, but only for the current cycle! 
+ end + end + + // main process for advancing counters + always_ff @(posedge ap_clk) begin + if(!ap_rst_n) begin + Newest_buffered_elem <= -1; + Current_elem <= FIRST_WRITE_ELEM; + Writing_done <= 0; + end + else begin + if (read_ok) begin + Newest_buffered_elem <= Newest_buffered_elem+1; + + // check if this is the last read cycle (reading_done will be true afterwards) + if ((Newest_buffered_elem == LAST_READ_ELEM-1) && Writing_done) begin + // start processing of next FM if writing is done already (possible due to unused input elements at the tail end) + // todo: allow for read overlapping between feature maps (i.e., reading first elements from next FM while still writing last window of current FM) + Newest_buffered_elem <= -1; + Current_elem <= FIRST_WRITE_ELEM; + Writing_done <= 0; + end + end + + if (write_ok) begin + // check if this is the last write cycle (Writing_done will be true afterwards) + if (Current_elem == LAST_WRITE_ELEM) begin + Writing_done <= 1; + + if (reading_done || (read_ok && (Newest_buffered_elem == LAST_READ_ELEM - 1))) begin + // start processing of next FM if reading is done already, or completes in the same cycle + Newest_buffered_elem <= -1; + Current_elem <= FIRST_WRITE_ELEM; + Writing_done <= 0; + end + end + else + Current_elem <= $signed(Current_elem) + addr_incr; + end + end + end + +endmodule : $TOP_MODULE_NAME$_impl diff --git a/finn-rtllib/swg/swg_template_wrapper.v b/finn-rtllib/swg/swg_template_wrapper.v index 0cc3579a25..22dc6bd8cd 100644 --- a/finn-rtllib/swg/swg_template_wrapper.v +++ b/finn-rtllib/swg/swg_template_wrapper.v @@ -28,19 +28,19 @@ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -`timescale 1 ns / 1 ps module $TOP_MODULE_NAME$ ( -(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *) -input ap_clk, -(* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V" *) -input ap_rst_n, -input [BUF_IN_WIDTH-1:0] in0_V_TDATA, -input in0_V_TVALID, -output in0_V_TREADY, -output [BUF_OUT_WIDTH-1:0] out_V_TDATA, -output out_V_TVALID, -input out_V_TREADY + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *) + (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) + input ap_clk, + (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) + input ap_rst_n, + input [IN_WIDTH_PADDED-1:0] in0_V_TDATA, + input in0_V_TVALID, + output in0_V_TREADY, + output [OUT_WIDTH_PADDED-1:0] out_V_TDATA, + output out_V_TVALID, + input out_V_TREADY ); // top-level parameters (set via code-generation) @@ -48,28 +48,27 @@ parameter BIT_WIDTH = $BIT_WIDTH$; parameter SIMD = $SIMD$; parameter MMV_IN = $MMV_IN$; parameter MMV_OUT = $MMV_OUT$; +parameter IN_WIDTH_PADDED = $IN_WIDTH_PADDED$; +parameter OUT_WIDTH_PADDED = $OUT_WIDTH_PADDED$; // derived constants parameter BUF_IN_WIDTH = BIT_WIDTH * SIMD * MMV_IN; parameter BUF_OUT_WIDTH = BIT_WIDTH * SIMD * MMV_OUT; -$TOP_MODULE_NAME$_impl -#( - .BIT_WIDTH(BIT_WIDTH), - .SIMD(SIMD), - .MMV_IN(MMV_IN), - .MMV_OUT(MMV_OUT) -) -impl -( - .ap_clk(ap_clk), - .ap_rst_n(ap_rst_n), - .in0_V_V_TDATA(in0_V_TDATA), - .in0_V_V_TVALID(in0_V_TVALID), - .in0_V_V_TREADY(in0_V_TREADY), - .out_V_V_TDATA(out_V_TDATA), - .out_V_V_TVALID(out_V_TVALID), - .out_V_V_TREADY(out_V_TREADY) +$TOP_MODULE_NAME$_impl #( + .BIT_WIDTH(BIT_WIDTH), + .SIMD(SIMD), + .MMV_IN(MMV_IN), + .MMV_OUT(MMV_OUT) +) impl ( + .ap_clk(ap_clk), + .ap_rst_n(ap_rst_n), + .in0_V_V_TDATA(in0_V_TDATA[BUF_IN_WIDTH-1:0]), + .in0_V_V_TVALID(in0_V_TVALID), + .in0_V_V_TREADY(in0_V_TREADY), + .out_V_V_TDATA(out_V_TDATA[BUF_OUT_WIDTH-1:0]), + 
.out_V_V_TVALID(out_V_TVALID), + .out_V_V_TREADY(out_V_TREADY) ); -endmodule //TOP_MODULE_NAME +endmodule : $TOP_MODULE_NAME$ diff --git a/finn-rtllib/swg/swg_template_wrapper_dynamic.v b/finn-rtllib/swg/swg_template_wrapper_dynamic.v index ca870ace11..158f3132e3 100644 --- a/finn-rtllib/swg/swg_template_wrapper_dynamic.v +++ b/finn-rtllib/swg/swg_template_wrapper_dynamic.v @@ -1,4 +1,33 @@ -`timescale 1 ns / 1 ps +/****************************************************************************** + * Copyright (C) 2022, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). 
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ module $TOP_MODULE_NAME$ #( // top-level parameters (set via code-generation) @@ -6,6 +35,8 @@ module $TOP_MODULE_NAME$ #( parameter SIMD = $SIMD$, parameter MMV_IN = $MMV_IN$, parameter MMV_OUT = $MMV_OUT$, + parameter IN_WIDTH_PADDED = $IN_WIDTH_PADDED$, + parameter OUT_WIDTH_PADDED = $OUT_WIDTH_PADDED$, parameter CNTR_BITWIDTH = $CNTR_BITWIDTH$, parameter INCR_BITWIDTH = $INCR_BITWIDTH$, @@ -18,14 +49,15 @@ module $TOP_MODULE_NAME$ #( parameter integer C_s_axilite_ADDR_WIDTH = 6 ) ( - (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite" *) + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite, ASSOCIATED_RESET ap_rst_n" *) + (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) input ap_clk, - (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF in0_V:out_V:s_axilite" *) + (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) input ap_rst_n, - input [BUF_IN_WIDTH-1:0] in0_V_TDATA, + input [IN_WIDTH_PADDED-1:0] in0_V_TDATA, input in0_V_TVALID, output in0_V_TREADY, - output [BUF_OUT_WIDTH-1:0] out_V_TDATA, + output [OUT_WIDTH_PADDED-1:0] out_V_TDATA, output out_V_TVALID, input out_V_TREADY, @@ -113,23 +145,20 @@ $TOP_MODULE_NAME$_axilite # ( .cfg_reg15(cfg_last_write) ); -$TOP_MODULE_NAME$_impl -#( +$TOP_MODULE_NAME$_impl #( .BIT_WIDTH(BIT_WIDTH), .SIMD(SIMD), .MMV_IN(MMV_IN), .MMV_OUT(MMV_OUT), .CNTR_BITWIDTH(CNTR_BITWIDTH), .INCR_BITWIDTH(INCR_BITWIDTH) -) -impl -( +) impl ( .ap_clk(ap_clk), .ap_rst_n(ap_rst_n), - .in0_V_V_TDATA(in0_V_TDATA), + .in0_V_V_TDATA(in0_V_TDATA[BUF_IN_WIDTH-1:0]), .in0_V_V_TVALID(in0_V_TVALID), .in0_V_V_TREADY(in0_V_TREADY), - .out_V_V_TDATA(out_V_TDATA), + 
.out_V_V_TDATA(out_V_TDATA[BUF_OUT_WIDTH-1:0]), .out_V_V_TVALID(out_V_TVALID), .out_V_V_TREADY(out_V_TREADY), @@ -151,4 +180,4 @@ impl .cfg_last_write(cfg_last_write) ); -endmodule //TOP_MODULE_NAME +endmodule : $TOP_MODULE_NAME$ diff --git a/finn-rtllib/thresholding/hdl/axilite_if.v b/finn-rtllib/thresholding/hdl/axilite_if.v new file mode 100644 index 0000000000..2aeff770d2 --- /dev/null +++ b/finn-rtllib/thresholding/hdl/axilite_if.v @@ -0,0 +1,211 @@ +/****************************************************************************** + * Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). 
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *****************************************************************************/ + +module axi4lite_if +#( + parameter ADDR_WIDTH = 32, + parameter DATA_WIDTH = 32,//AXI4 spec requires this to be strictly 32 or 64 + parameter IP_DATA_WIDTH = 64//can be any power-of-2 multiple of DATA_WIDTH +) +( +//system signals +input aclk, +input aresetn,//active low, asynchronous assertion and synchronous deassertion + +//Write channels +//write address +output reg awready, +input awvalid, +input [ADDR_WIDTH-1:0] awaddr, +input [2:0] awprot, +//write data +output reg wready, +input wvalid, +input [DATA_WIDTH-1:0] wdata, +input [(DATA_WIDTH/8)-1:0] wstrb, +//burst response +input bready, +output reg bvalid, +output reg [1:0] bresp,//NOTE: 00 = OKAY, 10 = SLVERR (write error) + +//Read channels +//read address +output reg arready, +input arvalid, +input [ADDR_WIDTH-1:0] araddr, +input [2:0] arprot, +//read data +input rready, +output reg rvalid, +output reg [1:0] rresp,//NOTE: 00 = OKAY, 10 = SLVERR (read error) +output reg [DATA_WIDTH-1:0] rdata, + +//IP-side interface +output reg ip_en, +output reg ip_wen, +output reg [ADDR_WIDTH-1:0] ip_addr, +output [IP_DATA_WIDTH-1:0] ip_wdata, +input ip_rack, +input [IP_DATA_WIDTH-1:0] ip_rdata +); + +localparam RESP_OKAY = 2'b00; +localparam RESP_SLVERR = 2'b10; +//get ceil(log2(ceil(IP_DATA_WIDTH/DATA_WIDTH))) +localparam NFOLDS_LOG = $clog2((IP_DATA_WIDTH + DATA_WIDTH - 1) / DATA_WIDTH); + +reg internal_ren; +reg internal_wen; +reg internal_wack; +reg [ADDR_WIDTH-1:0] internal_raddr; +reg [ADDR_WIDTH-1:0] internal_waddr; +reg [DATA_WIDTH-1:0] internal_wdata; +wire [DATA_WIDTH-1:0] internal_rdata; +reg internal_error = 0; + +//check DATA_WIDTH +initial begin + if(DATA_WIDTH != 32 & DATA_WIDTH != 
64) begin + $display("AXI4Lite DATA_WIDTH must be 32 or 64"); + $finish; + end +end + +//transaction state machine +localparam STATE_IDLE = 0, + STATE_READ = 1, + STATE_WRITE = 2; + +reg [1:0] state; + +always @(posedge aclk or negedge aresetn) + if(~aresetn) + state <= STATE_IDLE; + else case(state) + STATE_IDLE: + if(awvalid & wvalid) + state <= STATE_WRITE; + else if(arvalid) + state <= STATE_READ; + STATE_READ: + if(rvalid & rready) + state <= STATE_IDLE; + STATE_WRITE: + if(bvalid & bready) + state <= STATE_IDLE; + default: state <= STATE_IDLE; + endcase + +//write-related internal signals +always @(*) begin + internal_waddr = awaddr >> $clog2(DATA_WIDTH/8); + internal_wdata = wdata; + internal_wen = (state == STATE_IDLE) & awvalid & wvalid; +end + +always @(posedge aclk) begin + awready <= internal_wen; + wready <= internal_wen; +end + +//read-related internal signals +always @(*) begin + internal_raddr = araddr >> $clog2(DATA_WIDTH/8); + internal_ren = (state == STATE_IDLE) & ~internal_wen & arvalid; +end + +always @(posedge aclk) + arready <= internal_ren; + +wire write_to_last_fold; + +always @(posedge aclk) begin + ip_wen <= write_to_last_fold; + ip_en <= internal_ren | write_to_last_fold; + if(internal_ren | write_to_last_fold) + ip_addr <= internal_ren ? (internal_raddr >> NFOLDS_LOG) : (internal_waddr >> NFOLDS_LOG); + internal_wack <= internal_wen; +end + +genvar i; +reg [(1<> (internal_rfold*DATA_WIDTH); + always @(posedge aclk) + if(internal_ren) + internal_rfold <= internal_raddr[NFOLDS_LOG-1:0]; + for(i=0; i<(1< + * + * @description + * Produces the N-bit count of those among 2^N-1 thresholds that are not + * larger than the corresponding input: + * y = Σ(T_i <= x) + * The result is computed by binary search. The runtime-configurable + * thresholds must be written in ascending order: + * i < j => T_i < T_j + * The design supports channel folding allowing each input to be processed + * with respect to a selectable set of thresholds. 
The corresponding + * threshold configuration relies on a channel address prefix. Inputs are + * accompanied by a channel selector. + * + * Parameter Layout as seen on AXI-Lite (row by row): + * | Base \ Offs | 0 1 2 ... 2^N-2 2^N-1 + * ---------+--------------------------------+------------------------------------ + * Chnl #0 | 0 | T_0 T_1 T_2 ... T_{2^N-2} 'x + * Chnl #1 | 2^N | T_0 T_1 T_2 ... T_{2^N-2} 'x + * Chnl #c | ((c/PE)*$clog2(PE) + c%PE)*2^N | T_0 T_1 T_2 ... T_{2^N-2} 'x + * + *****************************************************************************/ +module thresholding #( + int unsigned N, // output precision + int unsigned K, // input/threshold precision + int unsigned C, // number of channels + int unsigned PE, // parallel processing elements + + bit SIGNED = 1, // signed inputs + bit FPARG = 0, // floating-point inputs: [sign] | exponent | mantissa + int BIAS = 0, // offsetting the output [0, 2^N-1] -> [BIAS, 2^N-1 + BIAS] + + // Initial Thresholds + parameter THRESHOLDS_PATH = "", + bit USE_CONFIG = 1, + + // Force Use of On-Chip Memory Blocks + int unsigned DEPTH_TRIGGER_URAM = 0, // if non-zero, local mems of this depth or more go into URAM (prio) + int unsigned DEPTH_TRIGGER_BRAM = 0, // if non-zero, local mems of this depth or more go into BRAM + bit DEEP_PIPELINE = 0, + + localparam int unsigned CF = C/PE, // Channel fold + localparam int unsigned O_BITS = BIAS >= 0? + /* unsigned */ $clog2(2**N+BIAS) : + /* signed */ 1+$clog2(-BIAS >= 2**(N-1)? 
-BIAS : 2**N+BIAS) +)( + // Global Control + input logic clk, + input logic rst, + + // Threshold Configuration + input logic cfg_en, + input logic cfg_we, + input logic [$clog2(CF)+$clog2(PE)+N-1:0] cfg_a, + input logic [K-1:0] cfg_d, + output logic cfg_rack, + output logic [K-1:0] cfg_q, + + // Input Stream + output logic irdy, + input logic ivld, + input logic [PE-1:0][K-1:0] idat, + + // Output Stream + input logic ordy, + output logic ovld, + output logic [PE-1:0][O_BITS-1:0] odat +); + + // Parameter Constraints Checking + initial begin + if(CF*PE != C) begin + $error("Parallelism PE=%0d is not a multiple of channel count C=%0d.", PE, C); + $finish; + end + end + + // Operations within Pipeline + typedef enum logic [1:0] { + NOP = 2'b00, // No operation + TH = 2'b01, // Thresholding + WR = 2'b11, // Write (initialization) + RB = 2'b10, // Readback (validation) + CFG = 2'b1x // Config op (pointer-preserving) + } op_e; + + // Pipeline Link Type + typedef logic [$clog2(CF)+N-1:0] ptr_t; + typedef logic [K -1:0] val_t; + typedef struct packed { + op_e op; + ptr_t ptr; // WR/RB: address; TH: result + val_t val; // WR/RB: threshold value; TH: input value + } pipe_t; + + //----------------------------------------------------------------------- + // Pipeline Feed + // - configuration always takes precedence + // - number of pending thresholding ops capped to N+3 + // across pipeline and output FIFO: pipe:N + A:1 + B:1 + 1 + localparam int unsigned MAX_PENDING = (DEEP_PIPELINE+1)*N + 3; + pipe_t pipe[PE][N+1]; + if(1) begin : blkFeed + + // Thresholding Input Guard ensuring Output FIFO is never overrun + logic signed [$clog2(MAX_PENDING):0] GuardSem = MAX_PENDING-1; // MAX_PENDING-1, ..., 0, -1 + uwire th_full = GuardSem[$left(GuardSem)]; + always_ff @(posedge clk) begin + if(rst) GuardSem <= MAX_PENDING-1; + else begin + automatic logic dec = !(USE_CONFIG && cfg_en) && !th_full && ivld; + automatic logic inc = ovld && ordy; + GuardSem <= GuardSem + (inc == dec? 
0 : inc? 1 : -1); + end + end + + // PE Configuration Address Decoding + logic cfg_sel[PE]; + logic cfg_oob; + logic [N-1:0] cfg_ofs; + if(PE == 1) begin + assign cfg_sel[0] = 1; + assign cfg_oob = 0; + assign cfg_ofs = cfg_a[0+:N]; + end + else begin + uwire [$clog2(PE)-1:0] cfg_pe = cfg_a[N+:$clog2(PE)]; + always_comb begin + foreach(cfg_sel[pe]) begin + cfg_sel[pe] = USE_CONFIG && cfg_en && (cfg_pe == pe); + end + cfg_oob = (cfg_pe >= PE); + cfg_ofs = cfg_a[0+:N]; + if(cfg_oob && !cfg_we) begin + // Map readbacks from padded rows (non-existent PEs) to padded highest threshold index of first PE + cfg_sel[0] = 1; + cfg_ofs = '1; + end + end + end + + uwire ptr_t iptr; + assign iptr[0+:N] = cfg_ofs; + if(CF > 1) begin + // Channel Fold Rotation + logic [$clog2(CF)-1:0] CnlCnt = 0; + logic CnlLst = 0; + always_ff @(posedge clk) begin + if(rst) begin + CnlCnt <= 0; + CnlLst <= 0; + end + else if(!(USE_CONFIG && cfg_en) && !th_full && ivld) begin + CnlCnt <= CnlCnt + (CnlLst? 1-CF : 1); + CnlLst <= CnlCnt == CF-2; + end + end + + assign iptr[N+:$clog2(CF)] = USE_CONFIG && cfg_en? cfg_a[N+$clog2(PE)+:$clog2(CF)] : CnlCnt; + end + + for(genvar pe = 0; pe < PE; pe++) begin + assign pipe[pe][0] = '{ + op: USE_CONFIG && cfg_en? + (!cfg_sel[pe]? NOP : cfg_we? WR : RB) : + (ivld && !th_full? TH : NOP), + ptr: iptr, + val: !(USE_CONFIG && cfg_en)? idat[pe] : cfg_we? 
cfg_d : 0 + }; + end + + assign irdy = !(USE_CONFIG && cfg_en) && !th_full; + end : blkFeed + + //----------------------------------------------------------------------- + // Free-Running Thresholding Pipeline + for(genvar stage = 0; stage < N; stage++) begin : genStages + + localparam int unsigned SN = N-1-stage; + for(genvar pe = 0; pe < PE; pe++) begin : genPE + uwire pipe_t p = pipe[pe][stage]; + uwire cs = (p.ptr[SN:0] == 2**SN-1); + + // Threshold Memory + val_t Thresh; // Read-out register + if(1) begin : blkThresh + localparam int unsigned DEPTH = CF * 2**stage; + localparam RAM_STYLE = + DEPTH_TRIGGER_URAM && (DEPTH >= DEPTH_TRIGGER_URAM)? "ultra" : + DEPTH_TRIGGER_BRAM && (DEPTH >= DEPTH_TRIGGER_BRAM)? "block" : + // If BRAM trigger defined, force distributed memory below if Vivado may be tempted to use BRAM nonetheless. + DEPTH_TRIGGER_BRAM && (DEPTH >= 64)? "distributed" : "auto"; + + (* RAM_STYLE = RAM_STYLE *) + val_t Threshs[DEPTH]; + if(THRESHOLDS_PATH != "") begin + initial $readmemh($sformatf("%sthreshs_%0d_%0d.dat", THRESHOLDS_PATH, pe, stage), Threshs); + end + + if(USE_CONFIG) begin : genThreshMem + uwire we = (p.op ==? WR) && cs; + if((CF == 1) && (stage == 0)) begin + always @(posedge clk) begin + if(we) Threshs[0] <= p.val; + end + end + else begin + uwire [$clog2(CF)+stage-1:0] addr = p.ptr[$clog2(CF)+N-1:SN+1]; + always @(posedge clk) begin + if(we) Threshs[addr] <= p.val; + end + end + end : genThreshMem + + if((CF == 1) && (stage == 0)) begin + assign Thresh = Threshs[0]; + end + else begin + uwire [$clog2(CF)+stage-1:0] addr = p.ptr[$clog2(CF)+N-1:SN+1]; + always_ff @(posedge clk) begin + Thresh <= Threshs[addr]; + end + end + + end : blkThresh + + // Pipeline State + pipe_t P = '{ op: NOP, default: 'x }; + logic Reval = 0; + always_ff @(posedge clk) begin + if(rst) begin + P <= '{ op: NOP, default: 'x }; + Reval <= 0; + end + else begin + P <= p; + Reval <= (p.op ==? 
RB) && cs; + end + end + + logic cmp; + if(!SIGNED) assign cmp = $unsigned(Thresh) <= $unsigned(P.val); + else if(!FPARG) assign cmp = $signed(Thresh) <= $signed(P.val); + else begin : blkSignedFloat + uwire mag_eq = Thresh[K-2:0] == P.val[K-2:0]; + uwire mag_le = Thresh[K-2:0] <= P.val[K-2:0]; + always_comb begin + unique case({Thresh[K-1], P.val[K-1]}) + 2'b00: cmp = mag_le; + 2'b01: cmp = 0; + 2'b10: cmp = 1; + 2'b11: cmp = !mag_le || mag_eq; + default: cmp = 'x; + endcase + end + end : blkSignedFloat + + // Pipeline State Update + pipe_t pp; + always_comb begin + pp = P; + if(P.op !=? CFG) pp.ptr[SN] = cmp; + if(Reval) pp.val = Thresh; + end + + // Pipeline State Forward (potentially additional register) + pipe_t pf; + if(!DEEP_PIPELINE) assign pf = pp; + else begin + pipe_t Pf = '{ op: NOP, default: 'x }; + always_ff @(posedge clk) begin + if(rst) Pf <= '{ op: NOP, default: 'x }; + else Pf <= pp; + end + assign pf = Pf; + end + + assign pipe[pe][stage+1] = pf; + + end : genPE + end : genStages + + //----------------------------------------------------------------------- + // Configuration Readback + always_comb begin + cfg_rack = 0; + cfg_q = 0; + foreach(pipe[pe]) begin + automatic pipe_t p = pipe[pe][N]; + cfg_rack |= p.op ==? RB; + cfg_q |= p.val; + end + end + + //----------------------------------------------------------------------- + // Stream Output through FIFO + // - Depth of N + Output Reg to allow pipe to drain entirely under backpressure + // - Typically mapped to an SRL shift register + if(1) begin : blkStreamOutput + localparam int unsigned A_DEPTH = MAX_PENDING - 1; + logic [PE-1 : 0][N-1 : 0] ADat[A_DEPTH]; + logic signed [$clog2(A_DEPTH):0] APtr = '1; // -1, 0, 1, ..., A_DEPTH-1 + uwire avld = !APtr[$left(APtr)]; + + logic [PE-1:0][N-1:0] BDat = 'x; + logic BVld = 0; + + uwire aload = pipe[0][N].op ==? 
TH; + uwire bload = !BVld || ordy; + + always_ff @(posedge clk) begin + if(aload) begin + assert(APtr < $signed(A_DEPTH-1)) else begin + $error("Overrun after failing stream guard."); + $stop; + end + foreach(pipe[pe]) ADat[0][pe] <= pipe[pe][N].ptr; + for(int unsigned i = 1; i < A_DEPTH; i++) ADat[i] <= ADat[i-1]; + end + end + always_ff @(posedge clk) begin + if(rst) APtr <= '1; + else APtr <= APtr + (aload == (avld && bload)? 0 : aload? 1 : -1); + end + always_ff @(posedge clk) begin + if(rst) begin + BDat <= 'x; + BVld <= 0; + end + else if(bload) begin + BDat <= ADat[APtr]; + BVld <= avld; + end + end + + assign ovld = BVld; + for(genvar pe = 0; pe < PE; pe++) begin + assign odat[pe] = BDat[pe] + BIAS; + end + end : blkStreamOutput + +endmodule : thresholding diff --git a/finn-rtllib/thresholding/hdl/thresholding_axi.sv b/finn-rtllib/thresholding/hdl/thresholding_axi.sv new file mode 100644 index 0000000000..5c7182b214 --- /dev/null +++ b/finn-rtllib/thresholding/hdl/thresholding_axi.sv @@ -0,0 +1,164 @@ +/****************************************************************************** + * Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief All-AXI interface adapter for thresholding module. + * @author Thomas B. Preußer + * + * @description + * This AXI adapter fits the core thresholding functionality: + * - with AXI stream data interfaces with flow control + * - with implicit round-robin channel rotation as used by FINN, and + * - performs aligned byte address to parameter word address translation. 
+ *****************************************************************************/ + +module thresholding_axi #( + int unsigned N, // output precision + int unsigned K, // input/threshold precision + int unsigned C = 1, // Channels + int unsigned PE = 1, // Processing Parallelism, requires C = k*PE + + bit SIGNED = 1, // signed inputs + bit FPARG = 0, // floating-point inputs: [sign] | exponent | mantissa + int BIAS = 0, // offsetting the output [0, 2^N-1] -> [BIAS, 2^N-1 + BIAS] + + // Initial Thresholds + parameter THRESHOLDS_PATH = "", + + bit USE_AXILITE, // Implement AXI-Lite for threshold read/write + + // Force Use of On-Chip Memory Blocks + int unsigned DEPTH_TRIGGER_URAM = 0, // if non-zero, local mems of this depth or more go into URAM (prio) + int unsigned DEPTH_TRIGGER_BRAM = 0, // if non-zero, local mems of this depth or more go into BRAM + bit DEEP_PIPELINE = 0, + + localparam int unsigned CF = C/PE, // Channel Fold + localparam int unsigned ADDR_BITS = $clog2(CF) + $clog2(PE) + N + 2, + localparam int unsigned O_BITS = BIAS >= 0? + /* unsigned */ $clog2(2**N+BIAS) : + /* signed */ 1+$clog2(-BIAS >= 2**(N-1)? 
-BIAS : 2**N+BIAS) +)( + //- Global Control ------------------ + input logic ap_clk, + input logic ap_rst_n, + + //- AXI Lite ------------------------ + // Writing + input logic s_axilite_AWVALID, + output logic s_axilite_AWREADY, + input logic [ADDR_BITS-1:0] s_axilite_AWADDR, // lowest 2 bits (byte selectors) are ignored + + input logic s_axilite_WVALID, + output logic s_axilite_WREADY, + input logic [31:0] s_axilite_WDATA, + input logic [ 3:0] s_axilite_WSTRB, + + output logic s_axilite_BVALID, + input logic s_axilite_BREADY, + output logic [1:0] s_axilite_BRESP, + + // Reading + input logic s_axilite_ARVALID, + output logic s_axilite_ARREADY, + input logic [ADDR_BITS-1:0] s_axilite_ARADDR, + + output logic s_axilite_RVALID, + input logic s_axilite_RREADY, + output logic [31:0] s_axilite_RDATA, + output logic [ 1:0] s_axilite_RRESP, + + //- AXI Stream - Input -------------- + output logic s_axis_tready, + input logic s_axis_tvalid, + input logic [((PE*K+7)/8)*8-1:0] s_axis_tdata, + + //- AXI Stream - Output ------------- + input logic m_axis_tready, + output logic m_axis_tvalid, + output logic [((PE*O_BITS+7)/8)*8-1:0] m_axis_tdata +); + + //----------------------------------------------------------------------- + // AXI-lite Configuration Interface + uwire cfg_en; + uwire cfg_we; + uwire [ADDR_BITS-3:0] cfg_a; + uwire [K -1:0] cfg_d; + uwire cfg_rack; + uwire [K -1:0] cfg_q; + + if(USE_AXILITE) begin + uwire [ADDR_BITS-1:0] cfg_a0; + axi4lite_if #(.ADDR_WIDTH(ADDR_BITS), .DATA_WIDTH(32), .IP_DATA_WIDTH(K)) axi ( + .aclk(ap_clk), .aresetn(ap_rst_n), + + .awready(s_axilite_AWREADY), .awvalid(s_axilite_AWVALID), .awaddr(s_axilite_AWADDR), .awprot('x), + .wready(s_axilite_WREADY), .wvalid(s_axilite_WVALID), .wdata(s_axilite_WDATA), .wstrb(s_axilite_WSTRB), + .bready(s_axilite_BREADY), .bvalid(s_axilite_BVALID), .bresp(s_axilite_BRESP), + + .arready(s_axilite_ARREADY), .arvalid(s_axilite_ARVALID), .araddr(s_axilite_ARADDR), .arprot('x), + .rready(s_axilite_RREADY), 
.rvalid(s_axilite_RVALID), .rresp(s_axilite_RRESP), .rdata(s_axilite_RDATA), + + .ip_en(cfg_en), .ip_wen(cfg_we), .ip_addr(cfg_a0), .ip_wdata(cfg_d), + .ip_rack(cfg_rack), .ip_rdata(cfg_q) + ); + assign cfg_a = cfg_a0[ADDR_BITS-3:0]; + always_ff @(posedge ap_clk) begin + assert(!ap_rst_n || !cfg_en || (cfg_a0[ADDR_BITS-2+:2] === 3'h0)) else begin + $error("%m: Spurious high address bits."); + $stop; + end + end + end + else begin + assign cfg_en = 0; + assign cfg_we = 'x; + assign cfg_a = 'x; + assign cfg_d = 'x; + end + + //----------------------------------------------------------------------- + // Kernel Implementation + thresholding #( + .N(N), .K(K), .C(C), .PE(PE), + .SIGNED(SIGNED), .FPARG(FPARG), .BIAS(BIAS), + .THRESHOLDS_PATH(THRESHOLDS_PATH), .USE_CONFIG(USE_AXILITE), + .DEPTH_TRIGGER_URAM(DEPTH_TRIGGER_URAM), .DEPTH_TRIGGER_BRAM(DEPTH_TRIGGER_BRAM), + .DEEP_PIPELINE(DEEP_PIPELINE) + ) impl ( + .clk(ap_clk), .rst(!ap_rst_n), + + .cfg_en, .cfg_we, .cfg_a, .cfg_d, + .cfg_rack, .cfg_q, + + .irdy(s_axis_tready), .ivld(s_axis_tvalid), .idat(s_axis_tdata), + .ordy(m_axis_tready), .ovld(m_axis_tvalid), .odat(m_axis_tdata) + ); + +endmodule : thresholding_axi diff --git a/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v b/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v new file mode 100644 index 0000000000..f35db156f6 --- /dev/null +++ b/finn-rtllib/thresholding/hdl/thresholding_template_wrapper.v @@ -0,0 +1,121 @@ +/****************************************************************************** + * Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @author Thomas B. Preußer + * @brief Verilog wrapper for IP packaging. 
+ */ + +module $MODULE_NAME_AXI_WRAPPER$ #( + parameter N = $N$, // output precision + parameter K = $M$, // input/threshold precision + parameter C = $C$, // Channels + parameter PE = $PE$, // Processing Parallelism, requires C = k*PE + + parameter SIGNED = $SIGNED$, // signed inputs + parameter FPARG = 0, // floating-point inputs: [sign] | exponent | mantissa + parameter BIAS = $BIAS$, // offsetting the output [0, 2^N-1] -> [BIAS, 2^N-1 + BIAS] + + parameter THRESHOLDS_PATH = $THRESHOLDS_PATH$, // Directory with initial threshold data + parameter USE_AXILITE = $USE_AXILITE$, // Implement AXI-Lite for threshold read/write + + // Force Use of On-Chip Memory Blocks + parameter DEPTH_TRIGGER_URAM = $DEPTH_TRIGGER_URAM$, // if non-zero, local mems of this depth or more go into URAM (prio) + parameter DEPTH_TRIGGER_BRAM = $DEPTH_TRIGGER_BRAM$, // if non-zero, local mems of this depth or more go into BRAM + parameter DEEP_PIPELINE = $DEEP_PIPELINE$, // [bit] extra pipeline stages for easier timing closure + + parameter O_BITS = $O_BITS$ +)( + // Global Control + (* X_INTERFACE_PARAMETER = "ASSOCIATED_BUSIF s_axilite:in0_V:out_V, ASSOCIATED_RESET ap_rst_n" *) + (* X_INTERFACE_INFO = "xilinx.com:signal:clock:1.0 ap_clk CLK" *) + input ap_clk, + (* X_INTERFACE_PARAMETER = "POLARITY ACTIVE_LOW" *) + input ap_rst_n, + + //- AXI Lite ------------------------ + // Writing + input s_axilite_AWVALID, + output s_axilite_AWREADY, + input [$clog2(C/PE) + $clog2(PE) + N + 1:0] s_axilite_AWADDR, // lowest 2 bits (byte selectors) are ignored + + input s_axilite_WVALID, + output s_axilite_WREADY, + input [31:0] s_axilite_WDATA, + input [ 3:0] s_axilite_WSTRB, + + output s_axilite_BVALID, + input s_axilite_BREADY, + output [1:0] s_axilite_BRESP, + + // Reading + input s_axilite_ARVALID, + output s_axilite_ARREADY, + input [$clog2(C/PE) + $clog2(PE) + N + 1:0] s_axilite_ARADDR, + + output s_axilite_RVALID, + input s_axilite_RREADY, + output [31:0] s_axilite_RDATA, + output [ 1:0] 
s_axilite_RRESP, + + //- AXI Stream - Input -------------- + output in0_V_TREADY, + input in0_V_TVALID, + input [((PE*K+7)/8)*8-1:0] in0_V_TDATA, + + //- AXI Stream - Output ------------- + input out_V_TREADY, + output out_V_TVALID, + output [((PE*O_BITS+7)/8)*8-1:0] out_V_TDATA +); + + thresholding_axi #( + .N(N), .K(K), .C(C), .PE(PE), + .SIGNED(SIGNED), + .FPARG(FPARG), + .BIAS(BIAS), + .THRESHOLDS_PATH(THRESHOLDS_PATH), + .USE_AXILITE(USE_AXILITE), + .DEPTH_TRIGGER_URAM(DEPTH_TRIGGER_URAM), + .DEPTH_TRIGGER_BRAM(DEPTH_TRIGGER_BRAM), + .DEEP_PIPELINE(DEEP_PIPELINE) + ) core ( + .ap_clk(ap_clk), .ap_rst_n(ap_rst_n), + + .s_axilite_AWVALID(s_axilite_AWVALID), .s_axilite_AWREADY(s_axilite_AWREADY), .s_axilite_AWADDR(s_axilite_AWADDR), + .s_axilite_WVALID(s_axilite_WVALID), .s_axilite_WREADY(s_axilite_WREADY), .s_axilite_WDATA(s_axilite_WDATA), .s_axilite_WSTRB(s_axilite_WSTRB), + .s_axilite_BVALID(s_axilite_BVALID), .s_axilite_BREADY(s_axilite_BREADY), .s_axilite_BRESP(s_axilite_BRESP), + + .s_axilite_ARVALID(s_axilite_ARVALID), .s_axilite_ARREADY(s_axilite_ARREADY), .s_axilite_ARADDR(s_axilite_ARADDR), + .s_axilite_RVALID(s_axilite_RVALID), .s_axilite_RREADY(s_axilite_RREADY), .s_axilite_RDATA(s_axilite_RDATA), .s_axilite_RRESP(s_axilite_RRESP), + .s_axis_tready(in0_V_TREADY), .s_axis_tvalid(in0_V_TVALID), .s_axis_tdata(in0_V_TDATA), + .m_axis_tready(out_V_TREADY), .m_axis_tvalid(out_V_TVALID), .m_axis_tdata(out_V_TDATA) + ); + +endmodule // $MODULE_NAME_AXI_WRAPPER$ diff --git a/finn-rtllib/thresholding/sim/thresh_gen.sv b/finn-rtllib/thresholding/sim/thresh_gen.sv new file mode 100644 index 0000000000..ae30503f8f --- /dev/null +++ b/finn-rtllib/thresholding/sim/thresh_gen.sv @@ -0,0 +1,75 @@ +/****************************************************************************** + * Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ +module thresh_gen; + localparam int unsigned K = 9; + localparam int unsigned N = 4; + localparam int unsigned C = 6; + + typedef logic [K-1:0] thresh_t; + localparam thresh_t THRESHOLDS[C][2**N-1] = '{ + '{ 'h00, 'h01, 'h02, 'h03, 'h04, 'h05, 'h06, 'h07, 'h08, 'h09, 'h0a, 'h0b, 'h0c, 'h0d, 'h0e }, + '{ 'h10, 'h11, 'h12, 'h13, 'h14, 'h15, 'h16, 'h17, 'h18, 'h19, 'h1a, 'h1b, 'h1c, 'h1d, 'h1e }, + '{ 'h20, 'h21, 'h22, 'h23, 'h24, 'h25, 'h26, 'h27, 'h28, 'h29, 'h2a, 'h2b, 'h2c, 'h2d, 'h2e }, + '{ 'h30, 'h31, 'h32, 'h33, 'h34, 'h35, 'h36, 'h37, 'h38, 'h39, 'h3a, 'h3b, 'h3c, 'h3d, 'h3e }, + '{ 'h40, 'h41, 'h42, 'h43, 'h44, 'h45, 'h46, 'h47, 'h48, 'h49, 'h4a, 'h4b, 'h4c, 'h4d, 'h4e }, + '{ 'h50, 'h51, 'h52, 'h53, 'h54, 'h55, 'h56, 'h57, 'h58, 'h59, 'h5a, 'h5b, 'h5c, 'h5d, 'h5e } + }; + localparam THRESHOLDS_PATH = "./"; + + localparam int unsigned PE = 2; + localparam int unsigned CF = C/PE; + + for(genvar stage = 0; stage < N; stage++) begin + localparam int unsigned SN = N-1-stage; + for(genvar pe = 0; pe < PE; pe++) begin + initial begin + automatic string file = $sformatf("%sthreshs_%0d_%0d.dat", THRESHOLDS_PATH, pe, stage); + + automatic thresh_t threshs[CF * 2**stage]; + for(int unsigned c = 0; c < CF; c++) begin + for(int unsigned i = 0; i < 2**stage; i++) begin + threshs[(c << stage) + i] = THRESHOLDS[c*PE + pe][(i<<(N-stage)) + 2**SN-1]; + end + end + + $writememh(file, threshs); + end + end + end + + // Quit after running all initializers + initial begin + #1ns; + $display("Generation done."); + $finish; + end + +endmodule : thresh_gen diff --git a/finn-rtllib/thresholding/sim/thresholding_axi_tb.sv b/finn-rtllib/thresholding/sim/thresholding_axi_tb.sv new file mode 100644 index 0000000000..429fb7776f --- /dev/null +++ b/finn-rtllib/thresholding/sim/thresholding_axi_tb.sv @@ -0,0 +1,314 @@ +/****************************************************************************** + * 
Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for thresholding_axi. 
+ * @author Monica Chiosa + * + */ + +module thresholding_axi_tb #( + int unsigned N = 4, // output precision + int unsigned C = 6, // number of channels + int unsigned PE = 2, + real M0 = 7.3, // slope of the uniform thresholding line + real B0 = 3.1, // offset of the uniform thresholding line + bit THROTTLED = 1, + + localparam int unsigned CF = C/PE, // Channel Fold + localparam int unsigned ADDR_BITS = $clog2(CF) + $clog2(PE) + N + 2 +); + + //----------------------------------------------------------------------- + // Design Geometry + + // For each channel = [0,channel): + // M_channel = M0 + CX*channel + // B_channel = B0 + CX*channel + // Input/threshold precision computed according with the maximum posible value + localparam real CX = 1.375; + localparam int unsigned K = $clog2((2**N-1)*(M0+C*CX) + (B0+C*CX)); // unused sign + magnitude + localparam int unsigned C_BITS = C < 2? 1 : $clog2(C); + + localparam int unsigned MST_STRM_WROUNDS = 503; + + typedef int unsigned threshs_t[C][2**N-1]; + function threshs_t init_thresholds(); + automatic threshs_t res; + for(int unsigned c = 0; c < C; c++) begin + automatic real m = M0 + c*CX; + automatic real b = B0 + c*CX; + foreach(res[c][i]) begin + res[c][i] = int'($ceil(m*i + b)); + end + end + return res; + endfunction : init_thresholds + localparam threshs_t THRESHS = init_thresholds(); + + //----------------------------------------------------------------------- + // Clock and Reset Control + logic clk = 0; + always #5ns clk = !clk; + logic rst = 1; + initial begin + #10ns; + @(posedge clk); + rst <= 0; + end + + //----------------------------------------------------------------------- + // DUT + logic s_axilite_AWVALID; + uwire s_axilite_AWREADY; + logic [ADDR_BITS-1:0] s_axilite_AWADDR; // lowest 2 bits (byte selectors) are ignored + logic s_axilite_WVALID; + uwire s_axilite_WREADY; + logic [ 31:0] s_axilite_WDATA; + uwire s_axilite_BVALID; + logic s_axilite_BREADY; + uwire [ 1:0] s_axilite_BRESP; + logic 
s_axilite_ARVALID; + uwire s_axilite_ARREADY; + logic [ADDR_BITS-1:0] s_axilite_ARADDR; + uwire s_axilite_RVALID; + uwire s_axilite_RREADY = 1; + uwire [ 31:0] s_axilite_RDATA; + uwire [ 1:0] s_axilite_RRESP; + + uwire irdy; + logic ivld; + logic [PE-1:0][K-1:0] idat; + + logic ordy = 0; + uwire ovld; + uwire [PE-1:0][N-1:0] odat; + + thresholding_axi #(.N(N), .K(K), .C(C), .PE(PE), .SIGNED(0), .USE_AXILITE(1)) dut ( + .ap_clk(clk), .ap_rst_n(!rst), + + // Configuration + .s_axilite_AWVALID, .s_axilite_AWREADY, .s_axilite_AWADDR, + .s_axilite_WVALID, .s_axilite_WREADY, .s_axilite_WDATA, .s_axilite_WSTRB('1), + .s_axilite_BVALID, .s_axilite_BREADY, .s_axilite_BRESP, + .s_axilite_ARVALID, .s_axilite_ARREADY, .s_axilite_ARADDR, + .s_axilite_RVALID, .s_axilite_RREADY, .s_axilite_RDATA, .s_axilite_RRESP, + + // Stream Processing + .s_axis_tready(irdy), .s_axis_tvalid(ivld), .s_axis_tdata(idat), + .m_axis_tready(ordy), .m_axis_tvalid(ovld), .m_axis_tdata(odat) + ); + + //----------------------------------------------------------------------- + // Input Stimuli + typedef logic [PE-1:0][K-1:0] input_t; + typedef logic [$clog2(CF)+$clog2(PE)+N-1:0] addr_t; + input_t QW[$]; // Input Feed Tracing + addr_t QC[$]; + + int unsigned error_cnt = 0; + bit done = 0; + initial begin + // Report testbench details + $display("Testbench - tresholding K=%0d -> N=%0d", K, N); + for(int unsigned c = 0; c < C; c++) begin + $write("Channel #%0d: Thresholds = {", c); + for(int unsigned i = 0; i < 2**N-1; i++) $write(" %0d", THRESHS[c][i]); + $display(" }"); + end + + // Config + s_axilite_AWVALID = 0; + s_axilite_AWADDR = 'x; + s_axilite_WVALID = 0; + s_axilite_WDATA = 'x; + s_axilite_BREADY = 0; + s_axilite_ARVALID = 0; + s_axilite_ARADDR = 'x; + + // Stream Input + ivld = 0; + idat = 'x; + + @(posedge clk iff !rst); + + // Threshold Configuration + for(int unsigned c = 0; c < C; c+=PE) begin + automatic addr_t addr = 0; + if(CF > 1) addr[N+$clog2(PE)+:$clog2(CF)] = c/PE; + for(int unsigned 
pe = 0; pe < PE; pe++) begin + if(PE > 1) addr[N+:$clog2(PE)] = pe; + for(int unsigned t = 0; t < 2**N-1; t++) begin + addr[0+:N] = t; + fork + begin + s_axilite_AWVALID <= 1; + s_axilite_AWADDR <= { addr, 2'b00 }; + @(posedge clk iff s_axilite_AWREADY); + s_axilite_AWVALID <= 0; + s_axilite_AWADDR <= 'x; + end + begin + s_axilite_WVALID <= 1; + s_axilite_WDATA <= THRESHS[c+pe][t]; + @(posedge clk iff s_axilite_WREADY); + s_axilite_WVALID <= 0; + s_axilite_WDATA <= 'x; + end + begin + s_axilite_BREADY <= 1; + @(posedge clk iff s_axilite_BVALID); + assert(s_axilite_BRESP == '0) else begin + $error("Error on parameter write."); + $stop; + end + s_axilite_BREADY <= 0; + end + join + end + end + end + + fork + // Intermittent configuration readback + while(!done) begin + if(($urandom()%37) != 0) begin + s_axilite_ARVALID <= 0; + s_axilite_ARADDR <= 'x; + @(posedge clk); + end + else begin + automatic addr_t addr = $urandom()%(N-1); + if(PE > 1) addr[N+:$clog2(PE)] = $urandom()%PE; + if(CF > 1) addr[N+$clog2(PE)+:$clog2(CF)] = $urandom()%CF; + + s_axilite_ARVALID <= 1; + s_axilite_ARADDR <= { addr, 2'b00 }; + @(posedge clk iff s_axilite_ARREADY); + + QC.push_back(addr); + end + end + + // AXI4Stream MST Writes input values + repeat(MST_STRM_WROUNDS) begin + automatic input_t dat; + + while(THROTTLED && ($urandom()%7 == 0)) @(posedge clk); + + std::randomize(dat); + ivld <= 1; + idat <= dat; + @(posedge clk iff irdy); + ivld <= 0; + idat <= 'x; + QW.push_back(dat); + end + join_any + done <= 1; + repeat(N+6) @(posedge clk); + + assert(QW.size() == 0) else begin + $error("Missing %0d outputs.", QW.size()); + $stop; + end + assert(QC.size() == 0) else begin + $error("Missing %0d readback replies.", QC.size()); + $stop; + end + + $display("Test completed: %0d errors in %0d tests.", error_cnt, MST_STRM_WROUNDS); + $display("========================================="); + $finish; + end + + // Output Checker ------------------------------------------------------- + + // 
Configuration Readback + always_ff @(posedge clk iff s_axilite_RVALID) begin + assert(s_axilite_RRESP == '0) else begin + $error("Read back error."); + $stop; + end + assert(QC.size()) begin + automatic addr_t addr = QC.pop_front(); + automatic int unsigned cnl = + (CF == 1? 0 : addr[N+$clog2(PE)+:$clog2(CF)] * PE) + + (PE == 1? 0 : addr[N+:$clog2(PE)]); + automatic logic [K-1:0] exp = THRESHS[cnl][addr[0+:N]]; + assert(s_axilite_RDATA == exp) else begin + $error("Readback mismatch on #%0d.%0d: %0d instead of %0d", cnl, addr[0+:N], s_axilite_RDATA, exp); + $stop; + end + end + else begin + $error("Spurious readback output."); + $stop; + end + end + + // Stream Output + int unsigned OCnl = 0; + always @(posedge clk) begin + if(rst) begin + OCnl <= 0; + ordy <= 1'b0; + end + else begin + if(!ordy || ovld) ordy <= ($urandom()%5 != 0) || !THROTTLED; + + if(ordy && ovld) begin + assert(QW.size()) begin + automatic input_t x = QW.pop_front(); + + for(int unsigned pe = 0; pe < PE; pe++) begin + automatic int unsigned cnl = OCnl + pe; + + $display("Mapped CNL=%0d DAT=%3d -> #%2d", cnl, x[pe], odat[pe]); + assert( + ((odat[pe] == 0) || (THRESHS[cnl][odat[pe]-1] <= x[pe])) && + ((odat[pe] == 2**N-1) || (x[pe] < THRESHS[cnl][odat[pe]])) + ) else begin + $error("Output error on presumed input CNL=%0d DAT=0x%0x -> #%0d", cnl, x[pe], odat[pe]); + error_cnt++; + $stop; + end + end + end + else begin + $error("Spurious output."); + $stop; + end + + OCnl <= (OCnl + PE)%C; + end + end + end + +endmodule: thresholding_axi_tb diff --git a/finn-rtllib/thresholding/sim/thresholding_tb.sv b/finn-rtllib/thresholding/sim/thresholding_tb.sv new file mode 100644 index 0000000000..1564f28f0d --- /dev/null +++ b/finn-rtllib/thresholding/sim/thresholding_tb.sv @@ -0,0 +1,274 @@ +/****************************************************************************** + * Copyright (C) 2024, Advanced Micro Devices, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, + * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR + * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; + * OR BUSINESS INTERRUPTION). HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, + * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR + * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF + * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * @brief Testbench for thresholding_axi. 
+ * @author Monica Chiosa + * + */ + +module thresholding_tb #( + int unsigned K = 10, // input precision + int unsigned N = 4, // output precision + int unsigned C = 6, // number of channels + int unsigned PE = 2, + + localparam int unsigned CF = C/PE // Channel Fold +); + localparam bit DEEP_PIPELINE = 1; + + localparam int unsigned MST_STRM_WROUNDS = 507; + localparam bit THROTTLED = 1; + + //----------------------------------------------------------------------- + // Clock and Reset Control + logic clk = 0; + always #5ns clk = !clk; + logic rst = 1; + initial begin + #10ns; + @(posedge clk); + rst <= 0; + end + + //----------------------------------------------------------------------- + // Parallel Instances differing in Data Type + typedef logic [K -1:0] val_t; + typedef val_t threshs_t[C][2**N-1]; + typedef val_t [PE-1:0] input_t; + typedef logic [$clog2(CF)+$clog2(PE)+N-1:0] addr_t; + logic [0:2] term = '0; + always_comb begin + if(&term) $finish; + end + for(genvar i = 0; i < 3; i++) begin : genTypes + localparam bit SIGNED = i>0; + localparam bit FPARG = i>1; + + //- DUT ------------------------- + logic cfg_en; + logic cfg_we; + logic [$clog2(C)+N-1:0] cfg_a; + logic [K-1:0] cfg_d; + uwire cfg_rack; + uwire [K-1:0] cfg_q; + + uwire irdy; + logic ivld; + logic [PE-1:0][K-1:0] idat; + + logic ordy = 0; + uwire ovld; + uwire [PE-1:0][N-1:0] odat; + + thresholding #(.N(N), .K(K), .C(C), .PE(PE), .SIGNED(SIGNED), .FPARG(FPARG), .USE_CONFIG(1), .DEEP_PIPELINE(DEEP_PIPELINE)) dut ( + .clk, .rst, + + // Configuration + .cfg_en, .cfg_we, .cfg_a, .cfg_d, + .cfg_rack, .cfg_q, + + // Stream Processing + .irdy, .ivld, .idat, + .ordy, .ovld, .odat + ); + + //- Stimulus Driver ------------- + threshs_t THRESHS; + function val_t sigord(input val_t x); + automatic val_t res = x; + if(SIGNED) begin + if(FPARG && x[K-1]) res[K-2:0] = ~x[K-2:0]; + res[K-1] = !x[K-1]; + end + return res; + endfunction : sigord + + input_t QW[$]; // Input tracing + addr_t QC[$]; // Readback 
tracking + int unsigned error_cnt = 0; + bit done = 0; + initial begin + + // Generate thresholds + std::randomize(THRESHS); + foreach(THRESHS[c]) begin + val_t row[2**N-1] = THRESHS[c]; + row.sort with (sigord(item)); + THRESHS[c] = row; + end + + // Report test case details + $display("[%0d] Thresholding %s%s%0d -> uint%0d", i, SIGNED? "s" : "u", FPARG? "fp" : "int", K, N); + for(int unsigned c = 0; c < C; c++) begin + $write("[%0d] Channel #%0d: Thresholds = {", i, c); + for(int unsigned i = 0; i < 2**N-1; i++) $write(" %0X", THRESHS[c][i]); + $display(" }"); + end + + // Config + cfg_en = 0; + cfg_we = 'x; + cfg_a = 'x; + cfg_d = 'x; + + // Stream Input + ivld = 0; + idat = 'x; + + @(posedge clk iff !rst); + + // Threshold Configuratin + cfg_en <= 1; + cfg_we <= 1; + for(int unsigned c = 0; c < C; c+=PE) begin + if(CF > 1) cfg_a[N+$clog2(PE)+:$clog2(CF)] <= c/PE; + for(int unsigned pe = 0; pe < PE; pe++) begin + if(PE > 1) cfg_a[N+:$clog2(PE)] = pe; + for(int unsigned t = 0; t < 2**N-1; t++) begin + cfg_a[0+:N] <= t; + cfg_d <= THRESHS[c+pe][t]; + @(posedge clk); + end + end + end + cfg_d <= 'x; + + fork + // Intermittent configuration readback + while(!done) begin + cfg_en <= 0; + cfg_we <= 'x; + cfg_a <= 'x; + @(posedge clk); + if(($urandom()%41) == 0) begin + automatic addr_t addr = $urandom()%(N-1); + if(PE > 1) addr[N+:$clog2(PE)] = $urandom()%PE; + if(CF > 1) addr[N+$clog2(PE)+:$clog2(CF)] = $urandom()%CF; + + cfg_en <= 1; + cfg_we <= 0; + cfg_a <= addr; + @(posedge clk); + QC.push_back(addr); + end + end + + // AXI4Stream MST Writes input values + repeat(MST_STRM_WROUNDS) begin + automatic input_t dat; + + while(THROTTLED && ($urandom()%7 == 0)) @(posedge clk); + + std::randomize(dat); + ivld <= 1; + idat <= dat; + @(posedge clk iff irdy); + ivld <= 0; + idat <= 'x; + QW.push_back(dat); + end + join_any + done <= 1; + repeat((DEEP_PIPELINE+1)*N+8) @(posedge clk); + + assert(QW.size() == 0) else begin + $error("[%0d] Missing %0d outputs.", i, QW.size()); 
+ $stop; + end + assert(QC.size() == 0) else begin + $error("[%0d] Missing %0d readback replies.", i, QC.size()); + $stop; + end + + $display("[%0d] Test completed: %0d errors in %0d tests.", i, error_cnt, MST_STRM_WROUNDS); + $display("============================================="); + term[i] <= 1; + end + + //- Readback Checker -------------- + always_ff @(posedge clk iff cfg_rack) begin + assert(QC.size()) begin + automatic addr_t addr = QC.pop_front(); + automatic int unsigned cnl = + (CF == 1? 0 : addr[N+$clog2(PE)+:$clog2(CF)] * PE) + + (PE == 1? 0 : addr[N+:$clog2(PE)]); + automatic logic [K-1:0] exp = THRESHS[cnl][addr[0+:N]]; + assert(cfg_q == exp) else begin + $error("[%0d] Readback mismatch on #%0d.%0d: %0d instead of %0d", i, cnl, addr[0+:N], cfg_q, exp); + $stop; + end + end + else begin + $error("[%0d] Spurious readback output.", i); + $stop; + end + end + + // Output Checker + int unsigned OCnl = 0; + always @(posedge clk) begin + if(rst) begin + OCnl <= 0; + ordy <= 1'b0; + end + else begin + if(!ordy || ovld) ordy <= ($urandom()%5 != 0) || !THROTTLED; + + if(ordy && ovld) begin + assert(QW.size()) begin + automatic input_t x = QW.pop_front(); + + for(int unsigned pe = 0; pe < PE; pe++) begin + automatic int unsigned cnl = OCnl + pe; + + $display("[%0d] Mapped CNL=%0d DAT=%3x -> #%2d", i, cnl, x[pe], odat[pe]); + assert( + ((odat[pe] == 0) || (sigord(THRESHS[cnl][odat[pe]-1]) <= sigord(x[pe]))) && + ((odat[pe] == 2**N-1) || (sigord(x[pe]) < sigord(THRESHS[cnl][odat[pe]]))) + ) else begin + $error("[%0d] Output error on presumed input CNL=%0d DAT=0x%0x -> #%0d", i, cnl, x[pe], odat[pe]); + error_cnt++; + $stop; + end + end + end + else begin + $error("[%0d] Spurious output.", i); + $stop; + end + + OCnl <= (OCnl + PE)%C; + end + end + end + + end : genTypes + +endmodule: thresholding_tb diff --git a/notebooks/advanced/0_custom_analysis_pass.ipynb b/notebooks/advanced/0_custom_analysis_pass.ipynb index f8444520c3..5ed48ca6d8 100644 --- 
a/notebooks/advanced/0_custom_analysis_pass.ipynb +++ b/notebooks/advanced/0_custom_analysis_pass.ipynb @@ -52,7 +52,9 @@ "metadata": {}, "outputs": [], "source": [ - "showInNetron(\"../LFCW1A1.onnx\")" + "import os\n", + "notebook_dir = os.environ['FINN_ROOT'] + \"/notebooks\"\n", + "showInNetron(notebook_dir + \"/LFCW1A1.onnx\")" ] }, { @@ -69,7 +71,7 @@ "outputs": [], "source": [ "from qonnx.core.modelwrapper import ModelWrapper\n", - "model = ModelWrapper('../LFCW1A1.onnx')" + "model = ModelWrapper(notebook_dir + \"/LFCW1A1.onnx\")" ] }, { @@ -151,9 +153,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.10.12" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/notebooks/advanced/1_custom_transformation_pass.ipynb b/notebooks/advanced/1_custom_transformation_pass.ipynb index 391e852a71..91dd925b25 100644 --- a/notebooks/advanced/1_custom_transformation_pass.ipynb +++ b/notebooks/advanced/1_custom_transformation_pass.ipynb @@ -110,8 +110,11 @@ "metadata": {}, "outputs": [], "source": [ + "import os\n", + "notebook_dir = os.environ['FINN_ROOT'] + \"/notebooks\"\n", + "\n", "import onnx\n", - "onnx_model = onnx.load('../LFCW1A1.onnx')\n", + "onnx_model = onnx.load(notebook_dir + \"/LFCW1A1.onnx\")\n", "from qonnx.core.modelwrapper import ModelWrapper\n", "onnx_model = ModelWrapper(onnx_model)" ] @@ -122,7 +125,7 @@ "metadata": {}, "outputs": [], "source": [ - "showInNetron('../LFCW1A1.onnx')" + "showInNetron(notebook_dir + \"/LFCW1A1.onnx\")" ] }, { @@ -209,7 +212,7 @@ "\n", "To control the degree of parallelization the argument `num_workers` can be specified. When the Docker container is started, the env variable `NUM_DEFAULT_WORKERS` is set to 4 by default, this can be increased or decreased depending on the system. You can also set the number of workers manually to a specific value when calling a transformation that allows parallelization. 
If the value is set to 0, all available CPU cores are used.\n", "\n", - "In the following we want to take a closer look at the implementation using the compile transformation as example." + "In the following we want to take a closer look at the implementation using the compile transformation that is used for cpp simulation as example." ] }, { @@ -227,7 +230,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The class is derived from the NodeLocalTransformation class and performs the compilation at every node that is fpgadataflow node." + "The class is derived from the NodeLocalTransformation class and performs the compilation at every node that is an hls node." ] } ], @@ -247,9 +250,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.10.12" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/notebooks/advanced/2_custom_op.ipynb b/notebooks/advanced/2_custom_op.ipynb index 636da64dd5..bdd2976412 100644 --- a/notebooks/advanced/2_custom_op.ipynb +++ b/notebooks/advanced/2_custom_op.ipynb @@ -672,7 +672,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/notebooks/advanced/3_folding.ipynb b/notebooks/advanced/3_folding.ipynb new file mode 100644 index 0000000000..8c7b97d6c6 --- /dev/null +++ b/notebooks/advanced/3_folding.ipynb @@ -0,0 +1,668 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# FINN - Folding\n", + "--------------------------------------\n", + "**Note: We will utilize one of the intermediate models generated in the process of the cybersecurity end2end example**\n", + "\n", + "There is a local copy of `step_specialize_layers.onnx` in this directory, which was renamed to `cybsec_PE_SIMD.onnx` to be able to go through this tutorial without requisites. 
But you can also generate it yourself with the [third cybersecurity Jupyter notebook](../end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb). After the execution of the estimates only build flow, it can be found in `../end2end_example/cybersecurity/output_estimates_only/intermediate_models/step_specialize_layers.onnx`. \n", + "\n", + "This notebook describes the use of FINN parallelization parameters (PE & SIMD), also called folding factors, to efficiently optimize models so as to extract the maximum performance out of them. \n", + "\n", + "Please be aware that the folding factors can not be selected arbitrarily, each layer has constraints on which values the parallelization parameters can be set to, for more information see here: https://finn-dev.readthedocs.io/en/latest/internals.html#constraints-to-folding-factors-per-layer\n", + "\n", + "We'll use the utility function `showInNetron()` to visualize and interact with our network in the Jupyter Notebook and `showSrc()` to show source code of FINN library calls." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from finn.util.visualization import showInNetron, showSrc" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note: The build_flow in the cybsec_mlp notebook comprises a transformation step `step_target_fps_parallelization` that automatically sets custom parallelization parameters needed to achieve a given `target_fps` by invoking the [`SetFolding` transformation](https://github.com/Xilinx/finn/blob/main/src/finn/transformation/fpgadataflow/set_folding.py#L46).\n", + "\n", + "More details of the above step can be found [here](https://github.com/Xilinx/finn/blob/main/src/finn/builder/build_dataflow_steps.py#L394)." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook shows the manual version of this step and explains how these attributes can improve performance and what are their effects on resource utilization for developers who need to maximize the performance of their network. \n", + "\n", + "For that we will use the `cybsec_PE_SIMD.onnx` file as starting point. This intermediate model from the cybersecurity example is the model representation after the high-level ONNX layers are converted to HW layers and then specialized to either HLS or RTL variants. In this example, all nodes were converted to HLS variants this means that each node in the graph now corresponds to an HLS C++ function call and the parallelization parameters can be set using the node attributes.\n", + "\n", + "We will take this model to show how to set the folding factors manually and analyze the estimated execution clock cycles and the resource utilization of each layer in the network." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### FINN-style Dataflow Architectures \n", + "\n", + "We start with a quick recap of FINN-style dataflow architectures. 
The key idea in such architectures is to parallelize across layers as well as within layers by dedicating a proportionate amount of compute resources to each layer, as illustrated in the figure below.\n", + "\n", + "![](finn-dataflow.png)\n", + "\n", + "In practice, the layers are instantiated by function calls to optimized Vitis HLS building blocks from the [finn-hlslib](https://github.com/Xilinx/finn-hlslib) library or by RTL modules from the [finn-rtllib](https://github.com/Xilinx/finn/tree/main/finn-rtllib).\n", + "\n", + "Since each layer will be instantiated, we can flexibly set the parallelization of each layer and thus control resources and throughput of our network, as visualized in the image below:\n", + "\n", + "![](finn-folding.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Part-1 : Loading the ONNX model.\n", + "\n", + "As discussed above, the network needs to go through a few preparation steps before it can be fed into our estimation functions.\n", + "\n", + "The `.onnx` file loaded here is taken from the cybersecurity end2end example notebook. \n", + "We pick the onnx file `cybsec_PE_SIMD.onnx` to which the necessary transformations have been applied for this notebook. This means, network layers mapped to necessary FINN-HW blocks. In this case, the HLS variants of MatrixVectorActivation, `MVAU_hls` units. \n", + "\n", + "To interact with the `.onnx` file we use `ModelWrapper()`. This wrapper simplifies the access to different model attributes and allows us to apply custom transformations on the model.\n", + "\n", + "In the below cell, we load our onnx file and view the cybersecurity MLP network in Netron. Additionally, we call the transformation `GiveUniqueNodeNames` as a preparation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from qonnx.core.modelwrapper import ModelWrapper\n", + "from qonnx.transformation.general import GiveUniqueNodeNames\n", + "\n", + "model = ModelWrapper(os.environ[\"FINN_ROOT\"] + \"/notebooks/advanced/cybsec_PE_SIMD.onnx\")\n", + "model = model.transform(GiveUniqueNodeNames())\n", + "model_path = os.environ[\"FINN_ROOT\"] + \"/notebooks/advanced/cybsec_PE_SIMD_named_nodes.onnx\"\n", + "model.save(model_path)\n", + "\n", + "showInNetron(model_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Part 2 : Parallelization Parameters: PE & SIMD" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The computational parallelism can be varied by setting the folding factors or also called parallelization parameters **PE** and **SIMD** of each layer. These parallelization attributes are subject to certain constraints and should be selected accordingly.\n", + "\n", + "To see more details about how this is implemented in the HLS variant of the MatrixVectorActivation layer (`MVAU_hls`), please have a look at [this documentation](https://github.com/Xilinx/finn/blob/github-pages/docs/finn-sheduling-and-folding.pptx). A schematic of the folding in an MVAU for a fully-connected layer is shown below:\n", + "\n", + "![](finn-folding-mvau.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the case of the MVAU, `PE` & `SIMD` are subject to the following constraints: \n", + "\n", + "If `MW` is the number of input features and `MH` the number of output features:\n", + "\n", + " MW % SIMD == 0\n", + " MH % PE == 0\n", + " \n", + "Total folding in the case of the MVAU is defined as:\n", + "\n", + " Total folding = (MH/PE) x (MW/SIMD)\n", + "\n", + "In a streaming dataflow architecture like it is in FINN designs the throughput is determined by the slowest layer. 
So, the goal of adjusting these parameters is to get an almost balanced pipeline i.e. equalizing the throughput rate of layers in the generated dataflow architecture.\n", + "\n", + "The FINN compiler provides analysis passes to facilitate the exploration of the folding factors of each layer. In this notebook we will show how to use these functions and explore how the parallelization parameters affect the clock cycles and the resource utilization of the generated dataflow architecture.\n", + "\n", + "We start with a naive case where `PE` & `SIMD` values across all layers are 1, this is the starting point of our exploration and is also the state the network is in after the conversion to HLS layers. If you take a look at the model using Netron and click on one of the MVAU layers, you can see that `PE` and `SIMD` are both set to 1 by default." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "showInNetron(model_path)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We import the analysis passes `exp_cycles_per_layer()` and `res_estimation()` to estimate the number of clock cycles and resource utilization of each network layer." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer\n", + "from finn.analysis.fpgadataflow.res_estimation import res_estimation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Analysis passes in FINN return information about the model in form of a dictionary, you can learn more about analysis passes in general in this Jupyter notebook: [0_custom_analysis_pass.ipynb](0_custom_analysis_pass.ipynb).\n", + "\n", + "We start by calling the analysis pass `exp_cycles_per_layer()`, which returns a dictionary with the layer names as keys and the expected cycles as values. 
Afterwards, we plot the result in a block diagram." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cycles_dict = model.analysis(exp_cycles_per_layer)\n", + "cycles_dict" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "fig = plt.figure(figsize = (10, 5))\n", + "plt.bar(cycles_dict.keys(), cycles_dict.values(), color ='blue', width = 0.3)\n", + "plt.xlabel(\"Network layers\")\n", + "plt.ylabel(\"Number of clock cycles\")\n", + "plt.title(\"Clock cycles per layer PE=SIMD=1\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We observe that the bottleneck in the execution of the model on hardware would come from the execution of the first layer which takes estimated 38400 clock cycles to execute one set of its inputs.\n", + "\n", + "No matter how quickly the other layers execute, the throughput will be defined by the first layer's execution latency.\n", + "\n", + "Let's have a look now at the estimated resources per layer by calling another analysis pass.\n", + "The keys are again the layer names, but the values are now a dictionary with the resource estimates per layer." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "res_dict = model.analysis(res_estimation)\n", + "res_dict" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next to the absolute numbers of LUTs, BRAM, URAM and DSPs, the analysis pass also provides information about the efficiency of the memory usage. If the memory type is not utilized, the efficiency is by default 1. You can see that above for the `URAM_efficiency`. In all other cases the efficiency indicates the actual parameter storage needed divided by the allocated BRAM/URAM storage. 
So, this means in our example MVAU_hls_0 uses 5 block ram and they are 83% utilized. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After we extract that information from the model, we plot the number of LUTs. In this notebook we concentrate on the influence on the LUT usage, but by manipulating the code below, you can also extract information about memory and dsp usage." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Extracting LUTs from res_dict\n", + "LUTs = [res_dict[key][\"LUT\"] for key in res_dict.keys()] \n", + "\n", + "#Plotting the bar graph of each network layer with their corresponding LUT resource utilization\n", + "fig = plt.figure(figsize = (10, 5))\n", + "plt.bar(res_dict.keys(), LUTs, color ='green', width = 0.3)\n", + "plt.xlabel(\"Network layers\")\n", + "plt.ylabel(\"Number of LUTs\")\n", + "plt.title(\"No. of LUTs per layer PE=SIMD=1\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Since we identified above that the first layer takes the highest number of cycles to complete the execution, we will now try to adjust the folding parameters to reduce its latency at the expense of an increase in resource utilization." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Modify Parameters\n", + "\n", + "We now modify the parallelization parameters of the first network layer to reduce its latency.\n", + "We only extract the first `MVAU_hls` block from the model and set the parallelization parameters manually.\n", + "\n", + "In the first step, we left the `PE` & `SIMD` values for all the layers on default (=1) to establish a baseline and measure the estimated clock cycles and resource utilization for each of the individual layers.\n", + "\n", + "To set `PE` & `SIMD`, we will utilize functionality from the FINN compiler. 
Each layer type has a Python wrapper which can be instantiated using the `getCustomOp()` function. The wrapper offers several helper functions like `get_nodeattr()` and `set_nodeattr()` to access and set the attributes of a node." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from qonnx.custom_op.registry import getCustomOp\n", + "\n", + "list_of_mvaus = model.get_nodes_by_op_type(\"MVAU_hls\")\n", + "mvau0 = list_of_mvaus[0]\n", + "\n", + "mvau0_inst = getCustomOp(mvau0)\n", + "\n", + "# Get the node attributes to check the current setting\n", + "print(\"The parallelization parameters of %s were: \" % mvau0.name)\n", + "print(\"PE: \" + str(mvau0_inst.get_nodeattr(\"PE\")))\n", + "print(\"SIMD: \" + str(mvau0_inst.get_nodeattr(\"SIMD\")))\n", + "\n", + "# Set the new node attributes\n", + "mvau0_inst.set_nodeattr(\"PE\", 2)\n", + "mvau0_inst.set_nodeattr(\"SIMD\", 5)\n", + "\n", + "# Get the node attributes to check the updated setting\n", + "print(\"The parallelization parameters of %s are updated to: \" % mvau0.name)\n", + "print(\"PE: \" + str(mvau0_inst.get_nodeattr(\"PE\")))\n", + "print(\"SIMD: \" + str(mvau0_inst.get_nodeattr(\"SIMD\")))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We save the model and view it. On expanding the first `MVAU_hls` we can see the updated `PE` & `SIMD` parameters for that layer." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.save(\"cybsec_PE_SIMD_modified.onnx\")\n", + "showInNetron(\"cybsec_PE_SIMD_modified.onnx\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From the above total folding formula, we have reduced the total folding of our layer from `600 x 64` to `120 x 32`. Hence, resulting in an estimated `10x` decrease in the execution latency of our layer. \n", + "This can be observed in the new estimated clock cycles." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cycles_dict_updated = model.analysis(exp_cycles_per_layer)\n", + "cycles_dict_updated" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig = plt.figure(figsize = (10, 5))\n", + "plt.bar(cycles_dict_updated.keys(), cycles_dict_updated.values(), color ='blue', width = 0.3)\n", + "plt.xlabel(\"Network layers\")\n", + "plt.ylabel(\"Number of clock cycles\")\n", + "plt.title(\"Clock cycles per layer with updated folding factors\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This has of course consequences for the resource usage of the network." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "res_dict_updated = model.analysis(res_estimation)\n", + "res_dict_updated" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Extracting LUTs from res_dict\n", + "LUTs_updated = [res_dict_updated[key][\"LUT\"] for key in res_dict_updated.keys()] \n", + "\n", + "#Plotting the bar graph of each network layer with their corresponding LUT resource utilization\n", + "fig = plt.figure(figsize = (10, 5))\n", + "plt.bar(res_dict_updated.keys(), LUTs_updated, color ='green', width = 0.3)\n", + "plt.xlabel(\"Network Layers\")\n", + "plt.ylabel(\"LUT Utilisation\")\n", + "plt.title(\"No. 
of LUTs per layer with updated folding factors\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From these numbers, we see that the first layer has been removed as the bottleneck and that the entire network can now perform one inference in ~4096 clock cycles (when the pipeline is full) as compared to the earlier configuration where it took ~38400 execution cycles.\n", + "\n", + "This decrease in execution latency of the network though comes at a cost of a 45% increase in LUT resource utilization for the first layer of the network." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Important Note : StreamingDataWidthConverters" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next to resources and performance, folding factors (or parallelization parameters) are influencing also other properties of the generated design. Since we are able to generate results in parallel, the data that gets fed into the layer needs to be packed in a specific format to provide the correct data at the correct time for the internal parallelism. Also, the data that comes out of a layer will be in a specific format depending on the internal parallelism." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To analyze the influence of the folding factors on the data streams between layers, we first will import the original model (with `PE=SIMD=1`) and then we will import the updated model, so that we can compare the two of them." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dir_path = os.environ[\"FINN_ROOT\"] + \"/notebooks/advanced/\" \n", + "model_orig = ModelWrapper(dir_path + \"cybsec_PE_SIMD_named_nodes.onnx\")\n", + "model_updated = ModelWrapper(\"cybsec_PE_SIMD_modified.onnx\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the next step we extract the information from all layers. For MVAUs the input shape is (1, MW/SIMD, SIMD) and the output shape is (1, MH/PE, PE)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Original model\n", + "list_of_mvaus = model_orig.get_nodes_by_op_type(\"MVAU_hls\")\n", + "print(\"In the original model (pe=simd=1): \")\n", + "for mvau in list_of_mvaus:\n", + " mvau_inst = getCustomOp(mvau)\n", + " print(\"Layer: \" + mvau.name)\n", + " print(\"Input shape: \" + str(mvau_inst.get_folded_input_shape()))\n", + " print(\"Output shape: \" + str(mvau_inst.get_folded_output_shape()))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Updated model\n", + "list_of_mvaus = model_updated.get_nodes_by_op_type(\"MVAU_hls\")\n", + "print(\"In the original model (pe=simd=1): \")\n", + "for mvau in list_of_mvaus:\n", + " mvau_inst = getCustomOp(mvau)\n", + " print(\"Layer: \" + mvau.name)\n", + " print(\"Input shape: \" + str(mvau_inst.get_folded_input_shape()))\n", + " print(\"Output shape: \" + str(mvau_inst.get_folded_output_shape()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that the input and output shape for MVAU_hls_0 has changed after we have changed the folding factors. These changes have direct influence on the in/out stream width. We can have a closer look at the formula to calculate the stream width of an MVAU." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "showSrc(mvau_inst.get_instream_width)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "showSrc(mvau_inst.get_outstream_width)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The input stream width can be calculated by multiplying the input bit width with SIMD and the output stream width can be calculated by multiplying the output bit width with PE." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To connect two layers with each other for the final design, the input stream width of a node needs to match the output stream width of the preceding node. If that is not the case FINN inserts DataWidthConverters (DWCs) to resolve this mismatch. Let's have a look at the input/output stream width of the layers before updating the parallelization parameters." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Original model\n", + "list_of_mvaus = model_orig.get_nodes_by_op_type(\"MVAU_hls\")\n", + "print(\"In the original model (pe=simd=1): \")\n", + "for mvau in list_of_mvaus:\n", + " mvau_inst = getCustomOp(mvau)\n", + " print(\"Layer: \" + mvau.name)\n", + " print(\"Input stream width: \" + str(mvau_inst.get_instream_width()))\n", + " print(\"Output stream width: \" + str(mvau_inst.get_outstream_width()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the original model the output stream width of one layer matches the input stream width of the following layer. So there would be no DWC required when generating the final design." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For the updated model, the situation is different. Let's have a look how the stream widths have changed." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Updated model\n", + "list_of_mvaus = model_updated.get_nodes_by_op_type(\"MVAU_hls\")\n", + "print(\"In the original model (pe=simd=1): \")\n", + "for mvau in list_of_mvaus:\n", + " mvau_inst = getCustomOp(mvau)\n", + " print(\"Layer: \" + mvau.name)\n", + " print(\"Input stream width: \" + str(mvau_inst.get_instream_width()))\n", + " print(\"Output stream width: \" + str(mvau_inst.get_outstream_width()))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As we can see, the output stream width of MVAU_hls_0 has now changed to `4`, while the input stream width of MatrixVectorActivation_1 stayed `2`. So, the FINN compiler would insert a DWC between these nodes, we can manually invoke this behavior by first calling the transformation `InsertDWC` and then converting the resulting DWCs into an HLS or RTL variant by calling `SpecializeLayers`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from finn.transformation.fpgadataflow.insert_dwc import InsertDWC\n", + "from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers\n", + "\n", + "model_updated = model_updated.transform(InsertDWC())\n", + "model_updated = model_updated.transform(SpecializeLayers())\n", + "model_updated = model_updated.transform(GiveUniqueNodeNames())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_updated.save(\"cybsec_DWC.onnx\")\n", + "showInNetron(\"cybsec_DWC.onnx\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can observe in the model that a DWC was inserted between the first two layers.\n", + "Since the DWC will also be a hardware block in our final FINN design, it has a latency and resources associated with it. 
Let's have a final look in our resource estimates." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_dwc = ModelWrapper(\"cybsec_DWC.onnx\")\n", + "res_dict_dwc = model_dwc.analysis(res_estimation)\n", + "res_dict_dwc" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Since we have now one additional layer, we manipulate the data to shorten the layer names in the plot." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "layers = res_dict_dwc.keys()\n", + "# replace names of layers with abbreviations\n", + "layers = [n.replace(\"StreamingDataWidthConverter_Batch\", \"DWC\") for n in layers]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Extracting LUTs from res_dict\n", + "LUTs_dwc = [res_dict_dwc[key][\"LUT\"] for key in res_dict_dwc.keys()] \n", + "\n", + "#Plotting the bar graph of each network layer with their corresponding LUT resource utilization\n", + "fig = plt.figure(figsize = (10, 5))\n", + "plt.bar(layers, LUTs_dwc, color ='red', width = 0.3)\n", + "plt.xlabel(\"Network Layers\")\n", + "plt.ylabel(\"LUT Utilisation\")\n", + "plt.title(\"Estimated LUT values used for each network layer\")\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In the case of our example network, the `StreamingDataWidthConverter_Batch` layer does not consume a large number of LUT resources as shown in the graph. This might be different for larger models and if there are a higher number of DWCs inserted. Please be aware of this when setting the folding factors for your network." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/advanced/4_advanced_builder_settings.ipynb b/notebooks/advanced/4_advanced_builder_settings.ipynb new file mode 100644 index 0000000000..5139377342 --- /dev/null +++ b/notebooks/advanced/4_advanced_builder_settings.ipynb @@ -0,0 +1,1844 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "8fcff912", + "metadata": {}, + "source": [ + "# Advanced Builder settings\n", + "\n", + "\"drawing\"\n", + "\n", + "In this notebook, we'll use the FINN compiler to generate an FPGA accelerator with a streaming dataflow architecture from a small convolutional network trained on CIFAR-10. The key idea in streaming dataflow architectures is to parallelize across layers as well as within layers by dedicating a proportionate amount of compute resources to each layer, illustrated on the figure to the left. You can read more about the general concept in the [FINN](https://arxiv.org/pdf/1612.07119) and [FINN-R](https://dl.acm.org/doi/pdf/10.1145/3242897) papers. This is done by mapping each layer to a Vitis HLS or RTL description, parallelizing each layer's implementation to the appropriate degree and using on-chip FIFOs to link up the layers to create the full accelerator.\n", + "These implementations offer a good balance of performance and flexibility, but building them by hand is difficult and time-consuming. This is where the FINN compiler comes in: it can build streaming dataflow accelerators from an ONNX description to match the desired throughput." 
+ ] + }, + { + "cell_type": "markdown", + "id": "a830e730", + "metadata": {}, + "source": [ + "In this tutorial, we will have a more detailed look into the FINN builder tool and explore different options to customize your FINN design. We assume that you have already completed the [Cybersecurity notebooks](../end2end_example/cybersecurity) and that you have a basic understanding of how the FINN compiler works and how to use the FINN builder tool." + ] + }, + { + "cell_type": "markdown", + "id": "5ec9a0db", + "metadata": {}, + "source": [ + "## Outline\n", + "---------------\n", + "\n", + "1. [Introduction to the CNV-w2a2 network](#intro_cnv)\n", + "2. [Recap default builder flow](#recap_builder)\n", + "3. [Build steps](#build_step)\n", + " 1. [How to create a custom build step](#custom_step)\n", + "4. [Specialize layers configuration json](#specialize_layers)\n", + "5. [Folding configuration json](#folding_config)\n", + "6. [Additional builder arguments](#builder_arg)\n", + " 1. [Verification steps](#verify)\n", + " 2. [Other builder arguments](#other_args)\n", + " 3. [Examples for additional builder arguments & bitfile generation](#example_args)" + ] + }, + { + "cell_type": "markdown", + "id": "5dbed63f", + "metadata": {}, + "source": [ + "## Introduction to the CNV-w2a2 network \n", + "\n", + "The particular quantized neural network (QNN) we will be targeting in this notebook is referred to as CNV-w2a2 and it classifies 32x32 RGB images into one of ten CIFAR-10 classes. All weights and activations in this network are quantized to two bit, with the exception of the input (which is RGB with 8 bits per channel) and the final output (which is 32-bit numbers). It is similar to the convolutional neural network used in the [cnv_end2end_example](../end2end_example/bnn-pynq/cnv_end2end_example.ipynb) Jupyter notebook.\n", + "\n", + "\n", + "You'll have a chance to interactively examine the layers that make up the network in Netron. 
We start by setting the build directory to the directory this notebook is in and importing helper functions to use in the notebook to examine ONNX graphs and source code." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce459f3c", + "metadata": {}, + "outputs": [], + "source": [ + "from finn.util.visualization import showInNetron, showSrc\n", + "import os\n", + " \n", + "build_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"" + ] + }, + { + "cell_type": "markdown", + "id": "7fc6444c", + "metadata": {}, + "source": [ + "In the next step, we will export the trained network directly from Brevitas to the QONNX format. QONNX is the intermediate representation (IR) that is used as the frontend to the FINN compiler. Please note that the internal representation of the network is still the FINN-ONNX format. [QONNX and FINN-ONNX](https://finn.readthedocs.io/en/latest/internals.html#intermediate-representation-qonnx-and-finn-onnx) are extensions to the ONNX format to represent quantization, especially below 8 bit, in ONNX graphs. The main difference is that quantization in QONNX graphs is represented using dedicated quantization nodes ([more about QONNX](https://github.com/fastmachinelearning/qonnx)) while the quantization in FINN-ONNX is an annotation attached to the tensors." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe262964", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from finn.util.test import get_test_model_trained\n", + "from brevitas.export import export_qonnx\n", + "from qonnx.util.cleanup import cleanup as qonnx_cleanup\n", + "\n", + "cnv = get_test_model_trained(\"CNV\", 2, 2)\n", + "export_onnx_path = build_dir + \"/end2end_cnv_w2a2_export.onnx\"\n", + "export_qonnx(cnv, torch.randn(1, 3, 32, 32), export_onnx_path)\n", + "qonnx_cleanup(export_onnx_path, out_file=export_onnx_path)" + ] + }, + { + "cell_type": "markdown", + "id": "d24b632f", + "metadata": {}, + "source": [ + "After the export, we call a clean up function on the model. This makes sure, that for example all shapes in the network are inferred, constant folding was applied and all tensors and nodes have unique names. In the next step, we can visualize the graph using Netron. When scrolling through the graph, you can see the Quant nodes that indicate the quantization in the network. In the [first step](https://github.com/Xilinx/finn/blob/main/src/finn/builder/build_dataflow_steps.py#L260) of the FINN builder flow, the network gets converted from the QONNX format to the FINN-ONNX format. That means these Quant nodes will not be present in the graph anymore and instead the quantization will be attached as an annotation to the tensors." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "87f59da6", + "metadata": {}, + "outputs": [], + "source": [ + "showInNetron(build_dir+\"/end2end_cnv_w2a2_export.onnx\")" + ] + }, + { + "cell_type": "markdown", + "id": "c764ed76", + "metadata": {}, + "source": [ + "## Quick recap, how to setup up default builder flow for resource estimations " + ] + }, + { + "cell_type": "markdown", + "id": "a26e5418", + "metadata": {}, + "source": [ + "As a quick recap, let's set up the builder like we have done in the cybersecurity example to get the resource estimates for our example network." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9007705a", + "metadata": {}, + "outputs": [], + "source": [ + "## Quick recap on how to setup the default builder flow for resource estimations\n", + "\n", + "import finn.builder.build_dataflow as build\n", + "import finn.builder.build_dataflow_config as build_cfg\n", + "import os\n", + "import shutil\n", + "\n", + "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"\n", + "model_file = model_dir + \"/end2end_cnv_w2a2_export.onnx\"\n", + "\n", + "estimates_output_dir = build_dir + \"/output_estimates_only\"\n", + "\n", + "#Delete previous run results if exist\n", + "if os.path.exists(estimates_output_dir):\n", + " shutil.rmtree(estimates_output_dir)\n", + " print(\"Previous run results deleted!\")\n", + "\n", + "\n", + "cfg_estimates = build.DataflowBuildConfig(\n", + " output_dir = estimates_output_dir,\n", + " mvau_wwidth_max = 80,\n", + " target_fps = 10000,\n", + " synth_clk_period_ns = 10.0,\n", + " fpga_part = \"xc7z020clg400-1\",\n", + " steps = build_cfg.estimate_only_dataflow_steps,\n", + " generate_outputs=[\n", + " build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "02e4c0f0", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "build.build_dataflow_cfg(model_file, 
cfg_estimates);" + ] + }, + { + "cell_type": "markdown", + "id": "4fa0b9f5", + "metadata": {}, + "source": [ + "The output directory was created and we can extract information about our model and also how it was processed in the FINN compiler from the generated files. Let's focus on the intermediate models for now. You can find them in the output directory in the folder \"intermediate_models\"." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05a941ef", + "metadata": {}, + "outputs": [], + "source": [ + "!ls -t -r {build_dir}/output_estimates_only/intermediate_models" + ] + }, + { + "cell_type": "markdown", + "id": "d746eff3", + "metadata": {}, + "source": [ + "After each FINN builder step, the graph is saved as .onnx file. In the cell above we sort the intermediate models by time in descending order (`ls -t -r`) to visualize the builder flow. As you can see after the conversion to the FINN-ONNX format (`step_qonnx_to_finn`), the graph is prepared by tidy up and streamlining (`step_tidy_up` and `step_streamline`) and then the high level nodes are converted to HW abstraction layers (`step_convert_to_hw`). Then there is a partition created from all layers that were converted to HW layers (`step_create_dataflow_partition`), then we convert each of the HW abstraction layers into an HLS or RTL variant (`step_specialize_layers`). Afterwards optimizations are applied (`step_target_fps_parallelization`, `step_apply_folding_config` and `step_minimize_bit_width`). In the final step of this example we generate resource and performance reports for the network (`step_generate_estimate_reports`). Use the code below to investigate the network after each step." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72de8d4c", + "metadata": {}, + "outputs": [], + "source": [ + "model_to_investigate = \"step_qonnx_to_finn.onnx\"\n", + "showInNetron(build_dir+\"/output_estimates_only/intermediate_models/\"+model_to_investigate)" + ] + }, + { + "cell_type": "markdown", + "id": "bccebd0d", + "metadata": {}, + "source": [ + "The analysis of these .onnx files can help us identifying points in the flow in which we might need to intervene and provide the compiler with additional information. When investigating the network after the conversion to HW layers, we can see that there are layers that were not converted. We can see this by clicking on the different nodes. HW layers have the module `finn.custom_op.fpgadataflow`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6d86463a", + "metadata": {}, + "outputs": [], + "source": [ + "showInNetron(build_dir+\"/output_estimates_only/intermediate_models/step_convert_to_hw.onnx\")" + ] + }, + { + "cell_type": "markdown", + "id": "2719cc09", + "metadata": {}, + "source": [ + "As you can see in the graph, the first two nodes (a MultiThreshold and Transpose node) and the last two nodes (a Mul and Add node) are not converted into HW layers. FINN currently only converts integer only operations into HW layers, this means only when the input, output & weights are quantized to integer the node will be converted." + ] + }, + { + "cell_type": "markdown", + "id": "ff7fa549", + "metadata": {}, + "source": [ + "
\n", + "Important notice: We are working on supporting additional data types and this limitation might disappear in the near future.\n", + "
" + ] + }, + { + "cell_type": "markdown", + "id": "6e6d942e", + "metadata": {}, + "source": [ + "When we click on the `global_in` in the graph, we can see that the quantization annotation does not contain a data type. If no data type is set and it can not be derived from the preceeding node, the FINN compiler automatically assumes that the data type is floating point. This is why the first node does not get converted into an HW layer, the input is assumed to be floating point." + ] + }, + { + "cell_type": "markdown", + "id": "8b8994e6", + "metadata": {}, + "source": [ + "The solution to the problem depends on the actual data input.\n", + "1. The data set is quantized and `global_in` is an integer: We set the data type of the tensor `global_in` before passing the model to the FINN compiler using [helper functions of ModelWrapper](https://finn.readthedocs.io/en/latest/internals.html#helper-functions-for-tensors).\n", + "2. The data set is not quantized: we can either execute the first layer in software (e.g. as part of the Python driver) or we can add a preprocessing step into the graph." + ] + }, + { + "cell_type": "markdown", + "id": "7504dce7", + "metadata": {}, + "source": [ + "Even though in the example of the CNVw2a2, the inputs are 32x32 RGB images, so the input values are 8 bit (UINT8) \"quantized\", the input to the exported model is floating point. For training in Brevitas, these values were normalized between 0 and 1.0 and so the exported model expects floating point values as input. \n", + "This means we are in scenario 2. In the next section we will develop a custom step for the FINN builder flow to add preprocessing to our network.\n", + "\n", + "But before we move to the next section, let's take a look at the last two nodes in the graph that were not converted to HW layers." 
+ ] + }, + { + "cell_type": "markdown", + "id": "f9c2696b", + "metadata": {}, + "source": [ + "We have two nodes at the end of the graph that we were not able to convert: a floating poing scalar multiplication and addition. These operations are \"left-over\" from streamlining and cannot be merged into a succeeding thresholding operation. \n", + "\n", + "Our example is a network for image classification, so the output is a vector of 10 values that give a predicition score for each of the classes in the CIFAR-10 data set. If we are only interested in the Top-1 result of the classification, we can add a post-processing step which inserts a TopK node in the graph. \n", + "\n", + "Since the last two layers are scalar operations, they have the same influence on all predicition scores in the output vector and we can safely merge them into the TopK node. " + ] + }, + { + "cell_type": "markdown", + "id": "4fc8fbf5", + "metadata": {}, + "source": [ + "These pre-processing and post-processing steps are network dependent and we will need to write **custom steps** that can then be executed using the FINN builder tool.\n", + "\n", + "In the next section we will first look into how a standard build step inside FINN looks like and then we will write our own custom steps for pre- and post-processing and add them to the builder configuration." + ] + }, + { + "cell_type": "markdown", + "id": "7e561a91", + "metadata": {}, + "source": [ + "## Build steps " + ] + }, + { + "cell_type": "markdown", + "id": "fb18b21d", + "metadata": {}, + "source": [ + "The following steps are executed when using the `estimates_only`-flow." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f3fe1186", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"\\n\".join(build_cfg.estimate_only_dataflow_steps))" + ] + }, + { + "cell_type": "markdown", + "id": "dd3ef987", + "metadata": {}, + "source": [ + "You can have a closer look at each step by either using the `showSrc()` function or by accessing the doc string." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "313fac18", + "metadata": {}, + "outputs": [], + "source": [ + "import finn.builder.build_dataflow_steps as build_dataflow_steps\n", + "print(build_dataflow_steps.step_tidy_up.__doc__)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "029da0da", + "metadata": {}, + "outputs": [], + "source": [ + "import finn.builder.build_dataflow_steps as build_dataflow_steps\n", + "showSrc(build_dataflow_steps.step_tidy_up)" + ] + }, + { + "cell_type": "markdown", + "id": "2809f6a7", + "metadata": {}, + "source": [ + "Each steps gets the model (`model: ModelWrapper`) and the build configuration (`cfg: DataflowBuildConfig`) as input arguments. Then a certain sequence of transformations is applied to the model. In some of the steps, verification can be run to ensure that the applied transformations have not changed the behaviour of the network. In the end the modified model is returned." + ] + }, + { + "cell_type": "markdown", + "id": "e9c2c97f", + "metadata": {}, + "source": [ + "### How to create a custom build step " + ] + }, + { + "cell_type": "markdown", + "id": "537a44e7", + "metadata": {}, + "source": [ + "When writing our own custom steps, we use the same pattern. See below the code for the pre-processing for the example network." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b9d43cc8", + "metadata": {}, + "outputs": [], + "source": [ + "from finn.util.pytorch import ToTensor\n", + "from qonnx.transformation.merge_onnx_models import MergeONNXModels\n", + "from qonnx.core.modelwrapper import ModelWrapper\n", + "from qonnx.core.datatype import DataType\n", + "\n", + "def custom_step_add_pre_proc(model: ModelWrapper, cfg: build.DataflowBuildConfig):\n", + " ishape = model.get_tensor_shape(model.graph.input[0].name)\n", + " # preprocessing: torchvision's ToTensor divides uint8 inputs by 255\n", + " preproc = ToTensor()\n", + " export_qonnx(preproc, torch.randn(ishape), \"preproc.onnx\", opset_version=11)\n", + " preproc_model = ModelWrapper(\"preproc.onnx\")\n", + " # set input finn datatype to UINT8\n", + " preproc_model.set_tensor_datatype(preproc_model.graph.input[0].name, DataType[\"UINT8\"])\n", + " # merge pre-processing onnx model with cnv model (passed as input argument)\n", + " model = model.transform(MergeONNXModels(preproc_model))\n", + " return model\n", + " " + ] + }, + { + "cell_type": "markdown", + "id": "7a6798aa", + "metadata": {}, + "source": [ + "In the next step we can modify the builder configuration to execute a custom sequence of builder steps, including the newly implemented pre-processing custom step.\n", + "\n", + "For that we create a list `build_steps` which contains next to the standard steps from the `estimate_only` flow, also the new custom step to add the pre-processing. This list then gets passed in the build configuration." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f00b465", + "metadata": {}, + "outputs": [], + "source": [ + "## Builder flow with custom step for pre-processing\n", + "\n", + "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"\n", + "model_file = model_dir + \"/end2end_cnv_w2a2_export.onnx\"\n", + "\n", + "output_dir = build_dir + \"/output_pre_proc\"\n", + "\n", + "#Delete previous run results if exist\n", + "if os.path.exists(output_dir):\n", + " shutil.rmtree(output_dir)\n", + " print(\"Previous run results deleted!\")\n", + "\n", + "build_steps = [\n", + " custom_step_add_pre_proc,\n", + " \"step_qonnx_to_finn\",\n", + " \"step_tidy_up\",\n", + " \"step_streamline\",\n", + " \"step_convert_to_hw\",\n", + " \"step_create_dataflow_partition\",\n", + " \"step_specialize_layers\",\n", + " \"step_target_fps_parallelization\",\n", + " \"step_apply_folding_config\",\n", + " \"step_minimize_bit_width\",\n", + " \"step_generate_estimate_reports\",\n", + "]\n", + "\n", + "cfg_estimates = build.DataflowBuildConfig(\n", + " output_dir = output_dir,\n", + " mvau_wwidth_max = 80,\n", + " target_fps = 10000,\n", + " synth_clk_period_ns = 10.0,\n", + " fpga_part = \"xc7z020clg400-1\",\n", + " steps = build_steps,\n", + " generate_outputs=[\n", + " build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3a2bcea", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "build.build_dataflow_cfg(model_file, cfg_estimates)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "51b7dbd5", + "metadata": {}, + "outputs": [], + "source": [ + "!ls -t -r {build_dir}/output_pre_proc/intermediate_models" + ] + }, + { + "cell_type": "markdown", + "id": "4690049f", + "metadata": {}, + "source": [ + "An intermediate .onnx file after the execution of the custom step was automatically created, let's have a look at the graph." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "87e5651e", + "metadata": {}, + "outputs": [], + "source": [ + "showInNetron(build_dir+\"/output_pre_proc/intermediate_models/custom_step_add_pre_proc.onnx\")" + ] + }, + { + "cell_type": "markdown", + "id": "90c6bef9", + "metadata": {}, + "source": [ + "The graph is in QONNX format and a division by 255 is inserted in the beginning. We can now use the CIFAR-10 images directly as input to the graph and the new `global_in` tensor is UINT8.\n", + "\n", + "You can already have a look on how the intermediate models have changed by modifying the code in the cell above. Before we go into more detail, we will add another custom step to insert the post-processing. In this case this means the insertion of a TopK node." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8c6f1bd0", + "metadata": {}, + "outputs": [], + "source": [ + "from qonnx.transformation.insert_topk import InsertTopK\n", + "\n", + "def custom_step_add_post_proc(model: ModelWrapper, cfg: build.DataflowBuildConfig):\n", + " model = model.transform(InsertTopK(k=1))\n", + " return model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "57adbb44", + "metadata": {}, + "outputs": [], + "source": [ + "## Builder flow with custom step for pre-processing and post-processing\n", + "\n", + "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"\n", + "model_file = model_dir + \"/end2end_cnv_w2a2_export.onnx\"\n", + "\n", + "output_dir = build_dir + \"/output_pre_and_post_proc\"\n", + "\n", + "#Delete previous run results if exist\n", + "if os.path.exists(output_dir):\n", + " shutil.rmtree(output_dir)\n", + " print(\"Previous run results deleted!\")\n", + "\n", + "build_steps = [\n", + " custom_step_add_pre_proc,\n", + " custom_step_add_post_proc,\n", + " \"step_qonnx_to_finn\",\n", + " \"step_tidy_up\",\n", + " \"step_streamline\",\n", + " \"step_convert_to_hw\",\n", + " 
\"step_create_dataflow_partition\",\n", + " \"step_specialize_layers\",\n", + " \"step_target_fps_parallelization\",\n", + " \"step_apply_folding_config\",\n", + " \"step_minimize_bit_width\",\n", + " \"step_generate_estimate_reports\",\n", + "]\n", + "\n", + "cfg_estimates = build.DataflowBuildConfig(\n", + " output_dir = output_dir,\n", + " mvau_wwidth_max = 80,\n", + " target_fps = 10000,\n", + " synth_clk_period_ns = 10.0,\n", + " fpga_part = \"xc7z020clg400-1\",\n", + " steps = build_steps,\n", + " generate_outputs=[\n", + " build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0598b81", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "build.build_dataflow_cfg(model_file, cfg_estimates);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95230896", + "metadata": {}, + "outputs": [], + "source": [ + "!ls -t -r {build_dir}/output_pre_and_post_proc/intermediate_models" + ] + }, + { + "cell_type": "markdown", + "id": "3a0263b1", + "metadata": {}, + "source": [ + "You can use the code in the cell below to investigate the generated intermediate models. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "44127417", + "metadata": {}, + "outputs": [], + "source": [ + "model_to_investigate = \"custom_step_add_post_proc.onnx\"\n", + "showInNetron(build_dir+\"/output_pre_and_post_proc/intermediate_models/\"+model_to_investigate)" + ] + }, + { + "cell_type": "markdown", + "id": "5cc97505", + "metadata": {}, + "source": [ + "Let's have a look at the model after the conversion to hw, to verify that now all layers are correctly converted." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63131e3e", + "metadata": {}, + "outputs": [], + "source": [ + "showInNetron(build_dir+\"/output_pre_and_post_proc/intermediate_models/step_convert_to_hw.onnx\")" + ] + }, + { + "cell_type": "markdown", + "id": "8fd0af6b", + "metadata": {}, + "source": [ + "The model contains now a `Thresholding` layer in the beginning and a `LabelSelect` layer at the end. Please note, that there is still a `Transpose` node as the first layer of the graph, but we can solve this by converting the input data to the NHWC format before streaming it into the FINN accelerator." + ] + }, + { + "cell_type": "markdown", + "id": "a6edf5c4-9213-45cd-834f-615c12685d9e", + "metadata": {}, + "source": [ + "## Specialize layers configuration json " + ] + }, + { + "cell_type": "markdown", + "id": "4ae83d6e-c704-4c7f-a922-a4b470c0a55f", + "metadata": {}, + "source": [ + "The FINN compiler was developed with the assumption that the hardware blocks corresponding to the neural network layers are developed based on HLS. Although we do not want to abolish this HLS implementation at this time, it has become apparent over the years that for certain modules it makes sense to implement them in RTL. This allows us greater control over the resulting hardware and we can make optimal use of FPGA resources.\n" + ] + }, + { + "cell_type": "markdown", + "id": "ed72aabf-0517-422f-a686-6c70e7492114", + "metadata": {}, + "source": [ + "So, with the growth of more and more RTL variants of common FINN hardware building blocks, we introduced an additional builder step called `step_specialize_layers`. In this step HW nodes get specialized to either an HLS or RTL variant of the node. " + ] + }, + { + "cell_type": "markdown", + "id": "82a2bc39-8a37-49aa-a79d-2818e66ebd11", + "metadata": {}, + "source": [ + "They get converted either based on pre-determined rules or the user provides a configuration file which contains the desired setting. 
If the user preference cannot be fulfilled, a warning will be printed and the implementation style will be set to a default. " + ] + }, + { + "cell_type": "markdown", + "id": "bc90b589-7a92-4996-9704-02736ac4e60e", + "metadata": {}, + "source": [ + "The builder flow step before `step_create_dataflow_partition` generates a template json file to set the preferred implementation style per layer. We can copy it from one of the previous runs to this folder and manipulate it to pass it to a new build." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ddb88eb1-3f11-4343-ae7c-3e5e8cbc34dc", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "with open(build_dir+\"/output_pre_and_post_proc/template_specialize_layers_config.json\", 'r') as json_file:\n", + " specialize_layers_config = json.load(json_file)\n", + "\n", + "print(json.dumps(specialize_layers_config, indent=1))" + ] + }, + { + "cell_type": "markdown", + "id": "158d7d8c-a072-4a50-9714-43ebaefa53d1", + "metadata": {}, + "source": [ + "As you can see, each node is listed in the .json file and an empty string for the node attribute `preferred_impl_style` is instantiated by default. We can now use this .json and set the `preferred_impl_style` to pass to a new builder flow." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3f464d35-6774-4751-80b4-b6230e501539", + "metadata": {}, + "outputs": [], + "source": [ + "with open(build_dir+\"/output_pre_and_post_proc/template_specialize_layers_config.json\", 'r') as json_file:\n", + " specialize_layers_config = json.load(json_file)\n", + "\n", + "# Set all preferred_impl_style to all HLS\n", + "for key in specialize_layers_config:\n", + " if \"preferred_impl_style\" in specialize_layers_config[key]:\n", + " specialize_layers_config[key][\"preferred_impl_style\"] = \"hls\" \n", + "# Save as .json \n", + "with open(\"specialize_layers_all_hls.json\", \"w\") as jsonFile:\n", + " json.dump(specialize_layers_config, jsonFile)\n", + " \n", + "# Set SWG to RTL variant\n", + "for key in specialize_layers_config:\n", + " if \"preferred_impl_style\" in specialize_layers_config[key]:\n", + " if key.startswith(\"ConvolutionInputGenerator\"):\n", + " specialize_layers_config[key][\"preferred_impl_style\"] = \"rtl\"\n", + " else:\n", + " specialize_layers_config[key][\"preferred_impl_style\"] = \"hls\" \n", + "# Save as .json \n", + "with open(\"specialize_layers_swg_rtl.json\", \"w\") as jsonFile:\n", + " json.dump(specialize_layers_config, jsonFile)" + ] + }, + { + "cell_type": "markdown", + "id": "52592ea6-cd12-46b9-af91-5960b4749e7e", + "metadata": {}, + "source": [ + "We created two `specialize_layers_config_files`:\n", + "* One which sets all layers to `\"hls\"`\n", + "* One that sets `preferred_impl_style` for the ConvolutionInputGenerator to `\"rtl\"`" + ] + }, + { + "cell_type": "markdown", + "id": "701905d8-c5cc-4cc0-b872-156c5b9d0432", + "metadata": {}, + "source": [ + "In the following we will setup two build flows and run them to the estimate reports step. Afterwards we will investigate the intermediate .onnx files and compare the two runs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22ff1a91-7ef7-44cb-86d3-60b9af7a8c5e", + "metadata": {}, + "outputs": [], + "source": [ + "## Build flow with custom folding configuration\n", + "## specialize_layers_config_file = \"specialize_layers_all_hls.json\"\n", + "\n", + "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"\n", + "model_file = model_dir + \"/end2end_cnv_w2a2_export.onnx\"\n", + "\n", + "output_dir = build_dir + \"/output_all_hls\"\n", + "\n", + "#Delete previous run results if exist\n", + "if os.path.exists(output_dir):\n", + " shutil.rmtree(output_dir)\n", + " print(\"Previous run results deleted!\")\n", + "\n", + "build_steps = [\n", + " custom_step_add_pre_proc,\n", + " custom_step_add_post_proc,\n", + " \"step_qonnx_to_finn\",\n", + " \"step_tidy_up\",\n", + " \"step_streamline\",\n", + " \"step_convert_to_hw\",\n", + " \"step_create_dataflow_partition\",\n", + " \"step_specialize_layers\",\n", + " \"step_apply_folding_config\",\n", + " \"step_minimize_bit_width\",\n", + " \"step_generate_estimate_reports\",\n", + "]\n", + "\n", + "cfg_estimates = build.DataflowBuildConfig(\n", + " output_dir = output_dir,\n", + " mvau_wwidth_max = 80,\n", + " synth_clk_period_ns = 10.0,\n", + " fpga_part = \"xc7z020clg400-1\",\n", + " steps = build_steps,\n", + " specialize_layers_config_file = \"specialize_layers_all_hls.json\",\n", + " generate_outputs=[\n", + " build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9df41ff-ef6a-4d0e-ab36-241bb11ed241", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "build.build_dataflow_cfg(model_file, cfg_estimates);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ff617f21-6001-4bb7-9cf7-2cc2acd3fbec", + "metadata": {}, + "outputs": [], + "source": [ + "## Build flow with custom folding configuration\n", + "## specialize_layers_config_file = 
\"specialize_layers_swg_rtl.json\"\n", + "\n", + "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"\n", + "model_file = model_dir + \"/end2end_cnv_w2a2_export.onnx\"\n", + "\n", + "output_dir = build_dir + \"/output_swg_rtl\"\n", + "\n", + "#Delete previous run results if exist\n", + "if os.path.exists(output_dir):\n", + " shutil.rmtree(output_dir)\n", + " print(\"Previous run results deleted!\")\n", + "\n", + "build_steps = [\n", + " custom_step_add_pre_proc,\n", + " custom_step_add_post_proc,\n", + " \"step_qonnx_to_finn\",\n", + " \"step_tidy_up\",\n", + " \"step_streamline\",\n", + " \"step_convert_to_hw\",\n", + " \"step_create_dataflow_partition\",\n", + " \"step_specialize_layers\",\n", + " \"step_apply_folding_config\",\n", + " \"step_minimize_bit_width\",\n", + " \"step_generate_estimate_reports\",\n", + "]\n", + "\n", + "cfg_estimates = build.DataflowBuildConfig(\n", + " output_dir = output_dir,\n", + " mvau_wwidth_max = 80,\n", + " synth_clk_period_ns = 10.0,\n", + " fpga_part = \"xc7z020clg400-1\",\n", + " steps = build_steps,\n", + " specialize_layers_config_file = \"specialize_layers_swg_rtl.json\",\n", + " generate_outputs=[\n", + " build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f48ba95-f7b5-455b-8041-25b7341ad115", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "build.build_dataflow_cfg(model_file, cfg_estimates);" + ] + }, + { + "cell_type": "markdown", + "id": "bed4bedd-397d-4bd1-8531-c6ceac306715", + "metadata": {}, + "source": [ + "First we are looking into the intermediate model after `step_create_dataflow_partition` and then after `step_specialize_layers`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e64db23-98cb-494b-851f-3cc2c3847451", + "metadata": {}, + "outputs": [], + "source": [ + "showInNetron(build_dir+\"/output_all_hls/intermediate_models/step_create_dataflow_partition.onnx\")" + ] + }, + { + "cell_type": "markdown", + "id": "3e1a6351-367f-47a6-b802-a2613ea455a1", + "metadata": {}, + "source": [ + "Let's have a look first at the model which we specialize to \"all HLS\"." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f85d6c42-153d-4a40-b3cc-a4c8c89fe636", + "metadata": {}, + "outputs": [], + "source": [ + "showInNetron(build_dir+\"/output_all_hls/intermediate_models/step_specialize_layers.onnx\")" + ] + }, + { + "cell_type": "markdown", + "id": "e1520920-b7de-42a5-9ec8-e8503992fbd1", + "metadata": {}, + "source": [ + "As you can see, each op type has now a suffix indicating that it is an HLS variant of the node. Additionally, when you click on one of the node in the Netron visualization, you can see that module is set to `finn.custom_op.fpgadataflow.hls`.\n", + "\n", + "Let's now have a look at the model in which we specialized the ConvolutionInputGenerator to `\"rtl\"`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9f1f26a0-3a62-4920-bf40-5b1b798fa02e", + "metadata": {}, + "outputs": [], + "source": [ + "showInNetron(build_dir+\"/output_swg_rtl/intermediate_models/step_specialize_layers.onnx\")" + ] + }, + { + "cell_type": "markdown", + "id": "3f9c4de4-61ef-4698-ab23-87bf5953c5ae", + "metadata": {}, + "source": [ + "You can use the cells above to try out different settings and pass it to the builder flow. Please note that not all layers have HLS and RTL variants, so it might be that the setting you define in `specialize_layers_config.json` gets ignored and a sensible default is set instead. The FINN compiler will display a warning in this case." 
+ ] + }, + { + "cell_type": "markdown", + "id": "5ffbadd1", + "metadata": {}, + "source": [ + "## Folding configuration json " + ] + }, + { + "cell_type": "markdown", + "id": "c164040f", + "metadata": {}, + "source": [ + "The FINN compiler allows the user to implement a network in streaming dataflow architecture, this means every layer is implemented individually and the data is streamed through the accelerator. We can customize each layer for specific performance and resource requirements by adjusting the parallelism and resource type of each layer. In the FINN context we refer to this customization of parallelism in each layer as folding. To learn more details about the influence of folding factors/parallelism in FINN, please have a look at our [folding tutorial](./3_folding.ipynb).\n", + "\n", + "In this section, we will look into the interface over which we can influence the customization of each layer using the FINN builder tool: A json file containing the folding configuration." + ] + }, + { + "cell_type": "markdown", + "id": "1299b86d", + "metadata": {}, + "source": [ + "Depending on the invoked step, the FINN compiler can produce or consume a .json file containing the folding configuration for each layer. In the cell below, we will have a look at the automatically generated .json file, which is produced by `step_target_fps_parallelization`. We use this then as starting point to manipulate the folding configuration and feed it back into the builder tool." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f75f5634", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "with open(build_dir+\"/output_pre_and_post_proc/auto_folding_config.json\", 'r') as json_file:\n", + " folding_config = json.load(json_file)\n", + "\n", + "print(json.dumps(folding_config, indent=1))" + ] + }, + { + "cell_type": "markdown", + "id": "8de787a7", + "metadata": {}, + "source": [ + "As you can see from the printed cell above, the keys in the .json file are the node names of the layers in our network. For each of the layers, some node attributes are listed:\n", + "* `PE` and `SIMD` are the folding parameters that determine the parallelism of each layer, depending on the layer they can be set to different values, for details refer to [this table](https://finn-dev.readthedocs.io/en/latest/internals.html#constraints-to-folding-factors-per-layer).\n", + "* `mem_mode`: determines if the parameter memory will be implemented as part of the HLS/RTL code (`const`) or instantiated separately and connected with the layer over a memory streamer unit (`decoupled`). You can find more details in this part of the documentation: https://finn-dev.readthedocs.io/en/latest/internals.html#matrixvectoractivation-mem-mode . It is also possible to set the mem_mode to external which allows for the implementation for external weights.\n", + "* `ram_style`: when selecting `decoupled` mode, the FINN compiler allows us to choose which memory resource will be used for the layer. The argument `ram_style` is set to the selected memory type:\n", + " * `auto`: Vivado will make the decision if the implementation is using LUTRAM or BRAM\n", + " * `distributed`: LUTRAM will be used\n", + " * `block`: BRAM will be used\n", + " * `ultra`: URAM will be used, if available on the selected board\n", + "\n", + "* `resType`: This is a node attribute for the MVAU layer and can be set to `lut` or `dsp`. 
Please note that selecting `dsp` will not enable the optimized RTL variant of the MVAU but rather generate HLS code utilizing DSPs, this is not optimal yet but can give an additional parameter for design space exploration.\n", + "* `runtime_writeable_weights`: FINN offers the option to implement the weights as \"runtime writable\", this means you can write the weight values from the driver via an axilite interface." + ] + }, + { + "cell_type": "markdown", + "id": "fd1519fe", + "metadata": {}, + "source": [ + "In the following part of the tutorial, we will use the auto generated json file as starting point to create two new json files which explore the `ram_style` attribute. We will use one of the generated reports from the FINN builder to see the impact of these changes.\n", + "For that, we will extract the total resources from the *estimate_layer_resources.json* report in the following cell." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7f42774", + "metadata": {}, + "outputs": [], + "source": [ + "with open(build_dir+\"/output_pre_and_post_proc/report/estimate_layer_resources.json\", 'r') as json_file:\n", + " json_object = json.load(json_file)\n", + "\n", + "print(json.dumps(json_object[\"total\"], indent=1))" + ] + }, + { + "cell_type": "markdown", + "id": "0be3b0e1", + "metadata": {}, + "source": [ + "The FINN compiler estimates the network to use ~500 BRAM blocks and ~100k LUTs." 
+ ] + }, + { + "cell_type": "markdown", + "id": "d4d177dc", + "metadata": {}, + "source": [ + "We will use the `auto_folding_config.json` and create two folding configuration from that file:\n", + "* All `ram_style` attributes set to `distributed`\n", + "* All `ram_style` attributes set to `block`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "112af6fd", + "metadata": {}, + "outputs": [], + "source": [ + "with open(build_dir+\"/output_pre_and_post_proc/auto_folding_config.json\", 'r') as json_file:\n", + " folding_config = json.load(json_file)\n", + "\n", + "# Set all ram_style to LUT RAM\n", + "for key in folding_config:\n", + " if \"ram_style\" in folding_config[key]:\n", + " folding_config[key][\"ram_style\"] = \"distributed\" \n", + "# Save as .json \n", + "with open(\"folding_config_all_lutram.json\", \"w\") as jsonFile:\n", + " json.dump(folding_config, jsonFile)\n", + " \n", + "# Set all ram_style to BRAM\n", + "for key in folding_config:\n", + " if \"ram_style\" in folding_config[key]:\n", + " folding_config[key][\"ram_style\"] = \"block\" \n", + "# Save as .json \n", + "with open(\"folding_config_all_bram.json\", \"w\") as jsonFile:\n", + " json.dump(folding_config, jsonFile)" + ] + }, + { + "cell_type": "markdown", + "id": "0e64a499", + "metadata": {}, + "source": [ + "After generating these files, we will invoke the builder flow. To enable the FINN builder to take the generated folding configuration as input, we will need to set the additional builder argument `folding_config_file` and we will change the `build_steps` to not run `step_target_fps_parallelization`. The build step does not necessarily need to be excluded, but since we pass a separate folding configuration, the output from that step would be overwritten anyways, so we skip it for a faster execution." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cdd9f706", + "metadata": {}, + "outputs": [], + "source": [ + "## Build flow with custom folding configuration\n", + "## folding_config_file = \"folding_config_all_lutram.json\"\n", + "\n", + "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"\n", + "model_file = model_dir + \"/end2end_cnv_w2a2_export.onnx\"\n", + "\n", + "output_dir = build_dir + \"/output_all_lutram\"\n", + "\n", + "#Delete previous run results if exist\n", + "if os.path.exists(output_dir):\n", + " shutil.rmtree(output_dir)\n", + " print(\"Previous run results deleted!\")\n", + "\n", + "build_steps = [\n", + " custom_step_add_pre_proc,\n", + " custom_step_add_post_proc,\n", + " \"step_qonnx_to_finn\",\n", + " \"step_tidy_up\",\n", + " \"step_streamline\",\n", + " \"step_convert_to_hw\",\n", + " \"step_create_dataflow_partition\",\n", + " \"step_specialize_layers\",\n", + " \"step_apply_folding_config\",\n", + " \"step_minimize_bit_width\",\n", + " \"step_generate_estimate_reports\",\n", + "]\n", + "\n", + "cfg_estimates = build.DataflowBuildConfig(\n", + " output_dir = output_dir,\n", + " mvau_wwidth_max = 80,\n", + " synth_clk_period_ns = 10.0,\n", + " fpga_part = \"xc7z020clg400-1\",\n", + " steps = build_steps,\n", + " folding_config_file = \"folding_config_all_lutram.json\",\n", + " generate_outputs=[\n", + " build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "99b647c0", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "build.build_dataflow_cfg(model_file, cfg_estimates);" + ] + }, + { + "cell_type": "markdown", + "id": "e705767d", + "metadata": {}, + "source": [ + "We can now have a look at the produced model, when clicking on the individual nodes, you can see that all layers have the node attribute `ram_style` set to `distributed`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc680178", + "metadata": {}, + "outputs": [], + "source": [ + "showInNetron(build_dir+\"/output_all_lutram/intermediate_models/step_generate_estimate_reports.onnx\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "695ecfb1", + "metadata": {}, + "outputs": [], + "source": [ + "with open(build_dir+\"/output_all_lutram/report/estimate_layer_resources.json\", 'r') as json_file:\n", + " json_object = json.load(json_file)\n", + "\n", + "print(json.dumps(json_object[\"total\"], indent=1))" + ] + }, + { + "cell_type": "markdown", + "id": "55208c70", + "metadata": {}, + "source": [ + "The estimation report shows that BRAM utilization is down to zero and the LUT count went up to around 150k." + ] + }, + { + "cell_type": "markdown", + "id": "11b8430a", + "metadata": {}, + "source": [ + "Let's do the same with the folding configuration which sets all memory resources to use BRAM." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "59e8aaaa", + "metadata": {}, + "outputs": [], + "source": [ + "## Build flow with custom folding configuration\n", + "## folding_config_file = \"folding_config_all_bram.json\"\n", + "\n", + "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"\n", + "model_file = model_dir + \"/end2end_cnv_w2a2_export.onnx\"\n", + "\n", + "output_dir = build_dir + \"/output_all_bram\"\n", + "\n", + "#Delete previous run results if exist\n", + "if os.path.exists(output_dir):\n", + " shutil.rmtree(output_dir)\n", + " print(\"Previous run results deleted!\")\n", + "\n", + "build_steps = [\n", + " custom_step_add_pre_proc,\n", + " custom_step_add_post_proc,\n", + " \"step_qonnx_to_finn\",\n", + " \"step_tidy_up\",\n", + " \"step_streamline\",\n", + " \"step_convert_to_hw\",\n", + " \"step_create_dataflow_partition\",\n", + " \"step_specialize_layers\",\n", + " \"step_apply_folding_config\",\n", + " \"step_minimize_bit_width\",\n", + " 
\"step_generate_estimate_reports\",\n", + "]\n", + "\n", + "cfg_estimates = build.DataflowBuildConfig(\n", + " output_dir = output_dir,\n", + " mvau_wwidth_max = 80,\n", + " synth_clk_period_ns = 10.0,\n", + " fpga_part = \"xc7z020clg400-1\",\n", + " steps = build_steps,\n", + " folding_config_file = \"folding_config_all_bram.json\",\n", + " generate_outputs=[\n", + " build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2cdc1aa0", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "build.build_dataflow_cfg(model_file, cfg_estimates);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cd0388fd", + "metadata": {}, + "outputs": [], + "source": [ + "showInNetron(build_dir+\"/output_all_bram/intermediate_models/step_generate_estimate_reports.onnx\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e60a3efb", + "metadata": {}, + "outputs": [], + "source": [ + "with open(build_dir+\"/output_all_bram/report/estimate_layer_resources.json\", 'r') as json_file:\n", + " json_object = json.load(json_file)\n", + "\n", + "print(json.dumps(json_object[\"total\"], indent=1))" + ] + }, + { + "cell_type": "markdown", + "id": "97f87780", + "metadata": {}, + "source": [ + "The initial implementation already had a high utilization of BRAM, but the estimations went now up to ~500 BRAMs while the LUT count went down to ~99k." + ] + }, + { + "cell_type": "markdown", + "id": "e65a8ded", + "metadata": {}, + "source": [ + "You can use this example as a starting point to manipulate the folding configuration yourself. Instead of using the above code, you can also manually open one of the example .json files and set the values differently. Please be aware that the node attributes can not be set to arbitrary values. 
Especially the folding factors need to fulfil [certain constraints](https://finn-dev.readthedocs.io/en/latest/internals.html#constraints-to-folding-factors-per-layer). The other settings for node attributes can be best looked up in the individual custom operator classes: [e.g. for MVAU](https://github.com/Xilinx/finn/blob/dev/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py#L64)" + ] + }, + { + "cell_type": "markdown", + "id": "4a675834", + "metadata": {}, + "source": [ + "## Additional builder arguments " + ] + }, + { + "cell_type": "markdown", + "id": "f7012b9a", + "metadata": {}, + "source": [ + "In this section, we will have a peek into additional builder arguments the FINN compiler exposes. We will not be able to cover all, but you will be able to have a look at a list and we encourage you to take your time to look into the different options there are to customize the FINN builder configuration." + ] + }, + { + "cell_type": "markdown", + "id": "467d8829", + "metadata": {}, + "source": [ + "We start by enabling the verification flow in the builder. The FINN compiler applies multiple transformations to the model before it gets turned into hardware, so we need to make sure that the functional behavior of the network does not change." + ] + }, + { + "cell_type": "markdown", + "id": "e0c167f4", + "metadata": {}, + "source": [ + "### Verification steps " + ] + }, + { + "cell_type": "markdown", + "id": "308d52ba", + "metadata": {}, + "source": [ + "Earlier in the tutorial, we had a look at how build steps are written. When investigating the `step_tidy_up`, we can see that before the changed model is returned a verification step can be run. In the case of `step_tidy_up` it is the step `\"initial python\"` that can be initiated by setting `VerificationStepType.TIDY_UP_PYTHON`."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4fe7318e", + "metadata": {}, + "outputs": [], + "source": [ + "import finn.builder.build_dataflow_steps as build_dataflow_steps\n", + "showSrc(build_dataflow_steps.step_tidy_up)" + ] + }, + { + "cell_type": "markdown", + "id": "2bbb84fb", + "metadata": {}, + "source": [ + "Some of the default build steps have automatic verification enabled, when the corresponding verification step is set." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce1aa025", + "metadata": {}, + "outputs": [], + "source": [ + "showSrc(build_cfg.VerificationStepType)" + ] + }, + { + "cell_type": "markdown", + "id": "da1a2b88", + "metadata": {}, + "source": [ + "In the cells below, we will use an example input from the CIFAR-10 data set and use the forward pass in Brevitas to generate a reference output. We save the input as `input.npy` and the reference output as `expected_output.npy`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e157d03c", + "metadata": {}, + "outputs": [], + "source": [ + "# Get golden io pair from Brevitas and save as .npy files\n", + "from finn.util.test import get_trained_network_and_ishape, get_example_input, get_topk\n", + "import numpy as np\n", + "\n", + "\n", + "(brevitas_model, ishape) = get_trained_network_and_ishape(\"cnv\", 2, 2)\n", + "input_tensor_npy = get_example_input(\"cnv\")\n", + "input_tensor_torch = torch.from_numpy(input_tensor_npy).float()\n", + "input_tensor_torch = ToTensor().forward(input_tensor_torch).detach()\n", + "output_tensor_npy = brevitas_model.forward(input_tensor_torch).detach().numpy()\n", + "output_tensor_npy = get_topk(output_tensor_npy, k=1)\n", + "\n", + "np.save(\"input.npy\", input_tensor_npy)\n", + "np.save(\"expected_output.npy\", output_tensor_npy)" + ] + }, + { + "cell_type": "markdown", + "id": "d03450e7", + "metadata": {}, + "source": [ + "In the next step we set up the builder flow again, this time we 
will set the build argument `verify_steps` and pass a list of verification steps." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5cd3032b", + "metadata": {}, + "outputs": [], + "source": [ + "## Build flow with additional builder arguments enabled\n", + "## verification steps\n", + "\n", + "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"\n", + "model_file = model_dir + \"/end2end_cnv_w2a2_export.onnx\"\n", + "\n", + "output_dir = build_dir + \"/output_with_verification\"\n", + "\n", + "#Delete previous run results if exist\n", + "if os.path.exists(output_dir):\n", + " shutil.rmtree(output_dir)\n", + " print(\"Previous run results deleted!\")\n", + "\n", + "build_steps = [\n", + " custom_step_add_pre_proc,\n", + " custom_step_add_post_proc,\n", + " \"step_qonnx_to_finn\",\n", + " \"step_tidy_up\",\n", + " \"step_streamline\",\n", + " \"step_convert_to_hw\",\n", + " \"step_create_dataflow_partition\",\n", + " \"step_specialize_layers\",\n", + " \"step_target_fps_parallelization\",\n", + " \"step_apply_folding_config\",\n", + " \"step_minimize_bit_width\",\n", + " \"step_generate_estimate_reports\",\n", + "]\n", + "\n", + "cfg_estimates = build.DataflowBuildConfig(\n", + " output_dir = output_dir,\n", + " mvau_wwidth_max = 80,\n", + " target_fps = 10000,\n", + " synth_clk_period_ns = 10.0,\n", + " fpga_part = \"xc7z020clg400-1\",\n", + " steps = build_steps,\n", + " generate_outputs=[\n", + " build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n", + " ],\n", + " verify_steps=[\n", + " build_cfg.VerificationStepType.QONNX_TO_FINN_PYTHON,\n", + " build_cfg.VerificationStepType.TIDY_UP_PYTHON,\n", + " build_cfg.VerificationStepType.STREAMLINED_PYTHON,\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "1d05b985", + "metadata": {}, + "source": [ + "When executing the code below, the verification will be invoked in the background.
After the execution we can check if the verification was successful by investigating the output directory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a3a46e76", + "metadata": {}, + "outputs": [], + "source": [ + "%%time\n", + "build.build_dataflow_cfg(model_file, cfg_estimates);" + ] + }, + { + "cell_type": "markdown", + "id": "ca1d571d", + "metadata": {}, + "source": [ + "The output directory has now an additional directory called `verification_output`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ca74d537", + "metadata": {}, + "outputs": [], + "source": [ + "!ls {build_dir}/output_with_verification" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "908ecda4", + "metadata": {}, + "outputs": [], + "source": [ + "!ls {build_dir}/output_with_verification/verification_output" + ] + }, + { + "cell_type": "markdown", + "id": "bcbc6f49", + "metadata": {}, + "source": [ + "The directory contains three .npy files. These files are the saved output files from the different verification steps. The suffix indicates if the array matches with the expected output. In our case, the suffix is for all verification steps `_SUCCESS`. Since the outputs are saved as .npy, we can open and investigate the files simply in Python." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a1b6ca9", + "metadata": {}, + "outputs": [], + "source": [ + "verify_initial_python = np.load(build_dir + \"/output_with_verification/verification_output/verify_initial_python_0_SUCCESS.npy\")\n", + "print(\"The output of the verification step after the step_tidy_up is: \" + str(verify_initial_python))" + ] + }, + { + "cell_type": "markdown", + "id": "6558e19e", + "metadata": {}, + "source": [ + "If the generated output does not match the expected output, these files can be used for debugging." 
+ ] + }, + { + "cell_type": "markdown", + "id": "4609f94d", + "metadata": {}, + "source": [ + "### Other builder arguments " + ] + }, + { + "cell_type": "markdown", + "id": "37b6853d", + "metadata": {}, + "source": [ + "Next to the enablement of the verification flows, the FINN builder has numerous additional builder arguments to further customize your network. \n", + "Let's have a look at the options for the arguments. We want to only filter out the FINN specific arguments." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e9f6aa29", + "metadata": {}, + "outputs": [], + "source": [ + "# Filter out methods\n", + "builder_args = [m for m in dir(build_cfg.DataflowBuildConfig) if not m.startswith('_')]\n", + "print(\"\\n\".join(builder_args))" + ] + }, + { + "cell_type": "markdown", + "id": "b12ab370", + "metadata": {}, + "source": [ + "There are attributes that come from the dataclasses-json class: `to_dict`, `to_json`, `schema`, `from_json`, `from_dict`. This class is used for the implementation of the FINN builder. In this tutorial, we are mainly interested in the FINN specific arguments. \n", + "\n", + "Some of these arguments we have seen already in the Cybersecurity notebook and in this notebook, e.g. `target_fps`, `fpga_part` and `folding_config_file`. In the code of the FINN builder, the function of each builder argument is documented; you can have a look [here](https://github.com/Xilinx/finn/blob/dev/src/finn/builder/build_dataflow_config.py#L155) and scroll through the available builder arguments." + ] + }, + { + "cell_type": "markdown", + "id": "9aba0493", + "metadata": {}, + "source": [ + "So far, in this notebook, we only looked at configurations up to the generation of estimate reports, a lot of these builder arguments actually become relevant at a later stage in the FINN flow.\n", + "\n", + "Let's have a look at the default build dataflow steps for the complete FINN flow."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ec39b9f2", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"\\n\".join(build_cfg.default_build_dataflow_steps))" + ] + }, + { + "cell_type": "markdown", + "id": "b9bc5715", + "metadata": {}, + "source": [ + "You can see that after the generation of the estimate reports, the code generation and the ip generation is invoked (`step_hw_codegen` and `step_hw_ipgen`). The FIFO depths are determined and the FIFOs are inserted in the network (`step_set_fifo_depths`), we can then create an IP design of our whole network by stitching the IPs from each layer together (`step_create_stitched_ip`). At this point we have an implementation of the neural network that we can integrate within a bigger FPGA design, we can run performance measurements using simulation (`step_measure_rtlsim_performance`) and out-of-context synthesis (`step_out_of_context_synthesis`) for it.\n", + "The FINN builder also provides automatic system integration for Zynq and Alveo devices, this can be invoked by running `step_synthesize_bitfile`, `step_make_pynq_driver` and `step_deployment_package`." + ] + }, + { + "cell_type": "markdown", + "id": "76df000f", + "metadata": {}, + "source": [ + "You can have a closer look at each step by either using the `showSrc()` function or by accessing the doc string." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "caf49f03", + "metadata": {}, + "outputs": [], + "source": [ + "import finn.builder.build_dataflow_steps as build_dataflow_steps\n", + "print(build_dataflow_steps.step_hw_codegen.__doc__)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c84a9fbc", + "metadata": {}, + "outputs": [], + "source": [ + "showSrc(build_dataflow_steps.step_hw_codegen)" + ] + }, + { + "cell_type": "markdown", + "id": "c249f141", + "metadata": {}, + "source": [ + "This concludes the advanced builder settings tutorial. 
Below you can find code that can help you investigating more of the builder arguments and invoking the whole flow to generate a bitfile." + ] + }, + { + "cell_type": "markdown", + "id": "3b98eb65", + "metadata": {}, + "source": [ + "### Example for additional builder arguments & bitfile generation " + ] + }, + { + "cell_type": "markdown", + "id": "0dbdab42", + "metadata": {}, + "source": [ + "#### Standalone Thresholds" + ] + }, + { + "cell_type": "markdown", + "id": "e21ff36f", + "metadata": {}, + "source": [ + "In FINN, convolutions are expressed with three components:\n", + "* An Im2Col operation\n", + "* A matrix multiplication\n", + "* A MultiThreshold operation\n", + "\n", + "When converting these nodes into HW layers, by default the MatMul and the MultiThreshold gets converted into **one** component called Matrix-Vector-Activation Unit (MVAU). But the FINN compiler allows us to implement the activation separately. This gives an additional possibility for customization because we can adjust the folding parameters of the standalone threshold unit independently. \n", + "\n", + "If you would like to enable this feature, you can set the build argument `standalone_thresholds` to `True`. In the code below this feature is enabled and you can have a look at the generated .onnx file. Please note that you need to uncomment the code first." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2619ebde", + "metadata": {}, + "outputs": [], + "source": [ + "## Build flow with additional builder arguments enabled\n", + "## standalone_thresholds = True\n", + "\n", + "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"\n", + "model_file = model_dir + \"/end2end_cnv_w2a2_export.onnx\"\n", + "\n", + "output_dir = build_dir + \"/output_standalone_thresholds\"\n", + "\n", + "#Delete previous run results if exist\n", + "if os.path.exists(output_dir):\n", + " shutil.rmtree(output_dir)\n", + " print(\"Previous run results deleted!\")\n", + "\n", + "build_steps = [\n", + " custom_step_add_pre_proc,\n", + " custom_step_add_post_proc,\n", + " \"step_qonnx_to_finn\",\n", + " \"step_tidy_up\",\n", + " \"step_streamline\",\n", + " \"step_convert_to_hw\",\n", + " \"step_create_dataflow_partition\",\n", + " \"step_specialize_layers\",\n", + " \"step_target_fps_parallelization\",\n", + " \"step_apply_folding_config\",\n", + " \"step_minimize_bit_width\",\n", + " \"step_generate_estimate_reports\",\n", + "]\n", + "\n", + "cfg_estimates = build.DataflowBuildConfig(\n", + " output_dir = output_dir,\n", + " mvau_wwidth_max = 80,\n", + " target_fps = 10000,\n", + " synth_clk_period_ns = 10.0,\n", + " fpga_part = \"xc7z020clg400-1\",\n", + " standalone_thresholds = True,\n", + " steps = build_steps,\n", + " generate_outputs=[\n", + " build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b2e9bc42", + "metadata": {}, + "outputs": [], + "source": [ + "#%%time\n", + "#build.build_dataflow_cfg(model_file, cfg_estimates);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32ae296e", + "metadata": {}, + "outputs": [], + "source": [ + "#showInNetron(build_dir+\"/output_standalone_thresholds/intermediate_models/step_generate_estimate_reports.onnx\")" + ] + }, + { + "cell_type": "markdown", + "id": 
"601eb5f8", + "metadata": {}, + "source": [ + "#### Run the whole flow" + ] + }, + { + "cell_type": "markdown", + "id": "42aa929b", + "metadata": {}, + "source": [ + "The code below can be used to invoke the full builder flow and obtain more output products, be aware that this runs synthesis and bitfile generation and it might take over an hour. Please note that you need to uncomment the code first." + ] + }, + { + "cell_type": "markdown", + "id": "ffa2a352", + "metadata": {}, + "source": [ + "For an optimized design, we saved a local copy of the folding configuration for cnv-w2a2 on the Pynq-Z1 board from [finn-examples](https://github.com/Xilinx/finn-examples) in this folder. And will pass it to the build flow. Please also note below that we now pass the board as argument to the builder (`board = \"Pynq-Z1\"`) instead of just the fpga part. This time we will select all possible outputs to generate. Please be aware that running the full build might take a few hours." + ] + }, + { + "cell_type": "markdown", + "id": "8d1b041f-027c-444e-81ac-98ce9b6d1b51", + "metadata": {}, + "source": [ + "Note that we set one additional argument: `default_swg_exception = True`. This is done because this example is customized to fit on the Pynq-Z1 board, to optimize the resources we remove FIFOs between SWGs and MVAUs manually to avoid unnecessary buffering." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4efd46f4", + "metadata": {}, + "outputs": [], + "source": [ + "import finn.builder.build_dataflow as build\n", + "import finn.builder.build_dataflow_config as build_cfg\n", + "import os\n", + "import shutil\n", + "\n", + "## Build flow with hardware build\n", + "\n", + "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/advanced\"\n", + "model_file = model_dir + \"/end2end_cnv_w2a2_export.onnx\"\n", + "\n", + "output_dir = build_dir + \"/output_bitfile\"\n", + "\n", + "#Delete previous run results if exist\n", + "if os.path.exists(output_dir):\n", + " shutil.rmtree(output_dir)\n", + " print(\"Previous run results deleted!\")\n", + "\n", + "build_steps = [\n", + " custom_step_add_pre_proc,\n", + " custom_step_add_post_proc,\n", + " \"step_qonnx_to_finn\",\n", + " \"step_tidy_up\",\n", + " \"step_streamline\",\n", + " \"step_convert_to_hw\",\n", + " \"step_create_dataflow_partition\",\n", + " \"step_specialize_layers\",\n", + " \"step_target_fps_parallelization\",\n", + " \"step_apply_folding_config\",\n", + " \"step_minimize_bit_width\",\n", + " \"step_generate_estimate_reports\",\n", + " \"step_hw_codegen\",\n", + " \"step_hw_ipgen\",\n", + " \"step_set_fifo_depths\",\n", + " \"step_create_stitched_ip\",\n", + " \"step_measure_rtlsim_performance\",\n", + " \"step_out_of_context_synthesis\",\n", + " \"step_synthesize_bitfile\",\n", + " \"step_make_pynq_driver\",\n", + " \"step_deployment_package\",\n", + "]\n", + "\n", + "cfg_build = build.DataflowBuildConfig(\n", + " output_dir = output_dir,\n", + " mvau_wwidth_max = 80,\n", + " synth_clk_period_ns = 10.0,\n", + " #specialize_layers_config_file = \"specialize_layers_all_hls.json\",\n", + " folding_config_file = \"cnv-w2a2_folding_config.json\",\n", + " board = \"Pynq-Z1\",\n", + " shell_flow_type = build_cfg.ShellFlowType.VIVADO_ZYNQ,\n", + " steps = build_steps,\n", + " default_swg_exception = True,\n", + " generate_outputs=[\n", + " 
build_cfg.DataflowOutputType.ESTIMATE_REPORTS,\n", + " build_cfg.DataflowOutputType.STITCHED_IP,\n", + " build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE,\n", + " build_cfg.DataflowOutputType.OOC_SYNTH,\n", + " build_cfg.DataflowOutputType.BITFILE,\n", + " build_cfg.DataflowOutputType.PYNQ_DRIVER,\n", + " build_cfg.DataflowOutputType.DEPLOYMENT_PACKAGE,\n", + " ],\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c7ff6c19", + "metadata": {}, + "outputs": [], + "source": [ + "#%%time\n", + "#build.build_dataflow_cfg(model_file, cfg_build);" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/advanced/cnv-w2a2_folding_config.json b/notebooks/advanced/cnv-w2a2_folding_config.json new file mode 100644 index 0000000000..68409ff695 --- /dev/null +++ b/notebooks/advanced/cnv-w2a2_folding_config.json @@ -0,0 +1,79 @@ +{ + "Defaults": {}, + "Thresholding_hls_0": { + "PE": 1, + "ram_style": "distributed" + }, + "ConvolutionInputGenerator_rtl_0": { + "SIMD": 3, + "ram_style": "distributed" + }, + "MVAU_hls_0": { + "PE": 8, + "SIMD": 3, + "ram_style": "auto" + }, + "ConvolutionInputGenerator_rtl_1": { + "SIMD": 16, + "ram_style": "distributed" + }, + "MVAU_hls_1": { + "PE": 16, + "SIMD": 16, + "ram_style": "auto" + }, + "ConvolutionInputGenerator_rtl_2": { + "SIMD": 16, + "ram_style": "distributed" + }, + "MVAU_hls_2": { + "PE": 8, + "SIMD": 16, + "ram_style": "auto" + }, + "ConvolutionInputGenerator_rtl_3": { + "SIMD": 16, + "ram_style": "distributed" + }, + "MVAU_hls_3": { + "PE": 8, + "SIMD": 16, + "ram_style": "block" + }, + 
"ConvolutionInputGenerator_rtl_4": { + "SIMD": 8, + "ram_style": "distributed" + }, + "MVAU_hls_4": { + "PE": 4, + "SIMD": 8, + "ram_style": "auto" + }, + "ConvolutionInputGenerator_rtl_5": { + "SIMD": 8, + "ram_style": "distributed" + }, + "MVAU_hls_5": { + "PE": 1, + "SIMD": 8, + "ram_style": "auto" + }, + "MVAU_hls_6": { + "PE": 1, + "SIMD": 2, + "ram_style": "distributed" + }, + "MVAU_hls_7": { + "PE": 2, + "SIMD": 2, + "ram_style": "block" + }, + "MVAU_hls_8": { + "PE": 5, + "SIMD": 1, + "ram_style": "distributed" + }, + "LabelSelect_hls_0": { + "PE": 1 + } +} diff --git a/notebooks/advanced/cybsec_PE_SIMD.onnx b/notebooks/advanced/cybsec_PE_SIMD.onnx new file mode 100644 index 0000000000..8d42b2e37b Binary files /dev/null and b/notebooks/advanced/cybsec_PE_SIMD.onnx differ diff --git a/notebooks/advanced/finn-dataflow.png b/notebooks/advanced/finn-dataflow.png new file mode 100755 index 0000000000..ebe98d0fbd Binary files /dev/null and b/notebooks/advanced/finn-dataflow.png differ diff --git a/notebooks/advanced/finn-folding-mvau.png b/notebooks/advanced/finn-folding-mvau.png new file mode 100755 index 0000000000..bbba00182c Binary files /dev/null and b/notebooks/advanced/finn-folding-mvau.png differ diff --git a/notebooks/advanced/finn-folding.png b/notebooks/advanced/finn-folding.png new file mode 100755 index 0000000000..019b4aa1e7 Binary files /dev/null and b/notebooks/advanced/finn-folding.png differ diff --git a/notebooks/basics/0_how_to_work_with_onnx.ipynb b/notebooks/basics/0_how_to_work_with_onnx.ipynb index 35a83ea97b..f1b3dcf68b 100644 --- a/notebooks/basics/0_how_to_work_with_onnx.ipynb +++ b/notebooks/basics/0_how_to_work_with_onnx.ipynb @@ -613,9 +613,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.10.12" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/notebooks/basics/1_brevitas_network_import.ipynb 
b/notebooks/basics/1_brevitas_network_import_via_QONNX.ipynb similarity index 65% rename from notebooks/basics/1_brevitas_network_import.ipynb rename to notebooks/basics/1_brevitas_network_import_via_QONNX.ipynb index a884e90d75..5c2f10310f 100644 --- a/notebooks/basics/1_brevitas_network_import.ipynb +++ b/notebooks/basics/1_brevitas_network_import_via_QONNX.ipynb @@ -4,13 +4,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Importing Brevitas networks into FINN\n", + "# Importing Brevitas networks into FINN with the QONNX interchange format\n", + "\n", + "**Note: Previously it was possible to directly export the FINN-ONNX interchange format from Brevitas to pass to the FINN compiler. This support is deprecated and FINN uses the export to the QONNX format as a front end, internally FINN uses still the FINN-ONNX format.**\n", "\n", "In this notebook we'll go through an example of how to import a Brevitas-trained QNN into FINN. The steps will be as follows:\n", "\n", "1. Load up the trained PyTorch model\n", - "2. Call Brevitas FINN-ONNX export and visualize with Netron\n", - "3. Import into FINN and call cleanup transformations\n", + "2. Call Brevitas QONNX export and visualize with Netron\n", + "3. Import into FINN and converting QONNX to FINN-ONNX\n", "\n", "We'll use the following utility functions to print the source code for function calls (`showSrc()`) and to visualize a network using netron (`showInNetron()`) in the Jupyter notebook:" ] @@ -120,15 +122,18 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 2. Call Brevitas FINN-ONNX export and visualize with Netron\n", + "## 2. Call Brevitas QONNX export and visualize with Netron\n", + "\n", + "Brevitas comes with built-in QONNX export functionality. This is similar to the regular ONNX export capabilities of PyTorch, with a few differences:\n", "\n", - "Brevitas comes with built-in FINN-ONNX export functionality. 
This is similar to the regular ONNX export capabilities of PyTorch, with a few differences:\n", + "1. Weight and activation quantization is represented as a 'fake-quantization' with Quant and BipolarQuant nodes.\n", + "2. Truncation operations as required by average pooling are represented with a Trunc node.\n", "\n", - "1. The weight quantization logic is not exported as part of the graph; rather, the quantized weights themselves are exported.\n", - "2. Special quantization annotations are used to preserve the low-bit quantization information. ONNX (at the time of writing) supports 8-bit quantization as the minimum bitwidth, whereas FINN-ONNX quantization annotations can go down to binary/bipolar quantization.\n", - "3. Low-bit quantized activation functions are exported as MultiThreshold operators.\n", + "One can read more about how QONNX works and why it was developed here: https://xilinx.github.io/finn//2021/11/03/qonnx-and-finn.html\n", "\n", - "It's actually quite straightforward to export ONNX from our Brevitas model as follows:" + "Additionally QONNX comes with a set of tools for working with the format. 
These are maintained together with the Fast Machine Learning collaboration as an open-source project here: https://github.com/fastmachinelearning/qonnx\n", + "\n", + "It's actually quite straightforward to export QONNX from our Brevitas model as follows:" ] }, { @@ -137,10 +142,10 @@ "metadata": {}, "outputs": [], "source": [ - "import brevitas.onnx as bo\n", - "export_onnx_path = \"/tmp/LFCW1A1.onnx\"\n", + "from brevitas.export import export_qonnx\n", + "export_onnx_path = \"/tmp/LFCW1A1_qonnx.onnx\"\n", "input_shape = (1, 1, 28, 28)\n", - "bo.export_finn_onnx(lfc, input_shape, export_onnx_path)" + "export_qonnx(lfc, torch.randn(input_shape), export_onnx_path);" ] }, { @@ -156,23 +161,23 @@ "metadata": {}, "outputs": [], "source": [ - "showInNetron('/tmp/LFCW1A1.onnx')" + "showInNetron(export_onnx_path)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "When running this notebook in the FINN Docker container, you should be able to see an interactive visualization of the imported network above, and click on individual nodes to inspect their parameters. If you look at any of the MatMul nodes, you should be able to see that the weights are all {-1, +1} values, and the activations are Sign functions." + "When running this notebook in the FINN Docker container, you should be able to see an interactive visualization of the imported network above, and click on individual nodes to inspect their parameters. If you look at any of the MatMul nodes, you should be able to see that the weights are all {-1, +1} values." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## 3. Import into FINN and call cleanup transformations\n", + "## 3. Import into FINN and converting QONNX to FINN-ONNX\n", "\n", - "We will now import this ONNX model into FINN using the ModelWrapper, and examine some of the graph attributes from Python." + "We will first run a cleanup transformation on the exported QONNX model."
] }, { @@ -181,16 +186,10 @@ "metadata": {}, "outputs": [], "source": [ - "from qonnx.core.modelwrapper import ModelWrapper\n", - "model = ModelWrapper(export_onnx_path)\n", - "model.graph.node[8]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The ModelWrapper exposes a range of other useful functions as well. For instance, by convention the second input of the MatMul node will be a pre-initialized weight tensor, which we can view using the following:" + "from qonnx.util.cleanup import cleanup\n", + "\n", + "export_onnx_path_cleaned = \"/tmp/LFCW1A1-qonnx-clean.onnx\"\n", + "cleanup(export_onnx_path, out_file=export_onnx_path_cleaned)" ] }, { @@ -199,14 +198,14 @@ "metadata": {}, "outputs": [], "source": [ - "model.get_initializer(model.graph.node[8].input[1])" + "showInNetron(export_onnx_path_cleaned)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We can also examine the quantization annotations and shapes of various tensors using the convenience functions provided by ModelWrapper." + "We will now import this QONNX model into FINN using the ModelWrapper. Here we can immediately execute the model to verify correctness."
] }, { @@ -215,7 +214,14 @@ "metadata": {}, "outputs": [], "source": [ - "model.get_tensor_datatype(model.graph.node[8].input[1]).name" + "from qonnx.core.modelwrapper import ModelWrapper\n", + "import qonnx.core.onnx_exec as oxe\n", + "model = ModelWrapper(export_onnx_path_cleaned)\n", + "input_dict = {\"global_in\": nph.to_array(input_tensor)}\n", + "output_dict = oxe.execute_onnx(model, input_dict)\n", + "produced_qonnx = output_dict[list(output_dict.keys())[0]]\n", + "\n", + "produced_qonnx" ] }, { @@ -224,14 +230,14 @@ "metadata": {}, "outputs": [], "source": [ - "model.get_tensor_shape(model.graph.node[8].input[1])" + "np.isclose(produced, produced_qonnx).all()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "If we want to operate further on this model in FINN, it is a good idea to execute certain \"cleanup\" transformations on this graph. Here, we will run shape inference and constant folding on this graph, and visualize the resulting graph in Netron again." + "Using the `QONNXtoFINN` transformation we can convert the model to the FINN internal FINN-ONNX representation. Notably all Quant and BipolarQuant nodes will have disappeared and are converted into MultiThreshold nodes." 
] }, { @@ -240,12 +246,13 @@ "metadata": {}, "outputs": [], "source": [ - "from qonnx.transformation.fold_constants import FoldConstants\n", - "from qonnx.transformation.infer_shapes import InferShapes\n", - "model = model.transform(InferShapes())\n", - "model = model.transform(FoldConstants())\n", - "export_onnx_path_transformed = \"/tmp/LFCW1A1-clean.onnx\"\n", - "model.save(export_onnx_path_transformed)" + "from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN\n", + "model = ModelWrapper(export_onnx_path_cleaned)\n", + "\n", + "model = model.transform(ConvertQONNXtoFINN())\n", + "\n", + "export_onnx_path_converted = \"/tmp/LFCW1A1-qonnx-converted.onnx\"\n", + "model.save(export_onnx_path_converted)" ] }, { @@ -254,14 +261,14 @@ "metadata": {}, "outputs": [], "source": [ - "showInNetron('/tmp/LFCW1A1-clean.onnx')" + "showInNetron(export_onnx_path_converted)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We can see that the resulting graph has become smaller and simpler. Specifically, the input reshaping is now a single Reshape node instead of the Shape -> Gather -> Unsqueeze -> Concat -> Reshape sequence. We can now use the internal ONNX execution capabilities of FINN to ensure that we still get the same output from this model as we did with PyTorch." + "And once again we can execute the model with the FINN/QONNX execution engine." 
] }, { @@ -270,8 +277,8 @@ "metadata": {}, "outputs": [], "source": [ - "import finn.core.onnx_exec as oxe\n", - "input_dict = {\"0\": nph.to_array(input_tensor)}\n", + "model = ModelWrapper(export_onnx_path_cleaned)\n", + "input_dict = {\"global_in\": nph.to_array(input_tensor)}\n", "output_dict = oxe.execute_onnx(model, input_dict)\n", "produced_finn = output_dict[list(output_dict.keys())[0]]\n", "\n", @@ -284,7 +291,7 @@ "metadata": {}, "outputs": [], "source": [ - "np.isclose(produced, produced_finn).all()" + "np.isclose(produced_qonnx, produced_finn).all()" ] }, { @@ -311,9 +318,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.10.12" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb index 388accad3a..3141d54ddf 100644 --- a/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb +++ b/notebooks/end2end_example/bnn-pynq/cnv_end2end_example.ipynb @@ -46,8 +46,8 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The white fields show the state of the network representation in the respective step. The colored fields represent the transformations that are applied to the network to achieve a certain result. The diagram is divided into 5 sections represented by a different color, each of it includes several flow steps. The flow starts in top left corner with Brevitas export (green section), followed by the preparation of the network (blue section) for the Vitis HLS synthesis and Vivado IPI stitching (orange section), and finally building a PYNQ overlay bitfile and testing it on a PYNQ board (yellow section).\n", - "There is an additional section for functional verification (red section) on the left side of the diagram, which we will not cover in this notebook. 
For details please take a look in the verification notebook which you can find [here](tfc_end2end_verification.ipynb)\n", + "The white fields show the state of the network representation in the respective step. The colored fields represent the transformations that are applied to the network to achieve a certain result. The diagram is divided into 5 sections represented by a different color, each of it includes several flow steps. The flow starts in top left corner with Brevitas export (green section), followed by the preparation of the network (blue section) to bring the network into a form in which each layer can be represented by either a Vitis HLS function or a Verilog module. The model then gets passed to Vivado IPI stitching (orange section), and finally a PYNQ overlay bitfile is built and can be tested on a PYNQ board (yellow section).\n", + "There is an additional section for functional verification (red section) on the right side of the diagram, which we will not cover in this notebook. For details please take a look in the verification notebook which you can find [here](tfc_end2end_verification.ipynb)\n", "\n", "\n", "We will use the helper function `showInNetron` to show the ONNX model at the current transformation step. The Netron displays are interactive, but they only work when running the notebook actively and not on GitHub (i.e. if you are viewing this on GitHub you'll only see blank squares)." @@ -72,7 +72,7 @@ "source": [ "## 1. Brevitas Export, FINN Import and Tidy-Up\n", "\n", - "Similar to what we did in the TFC-w1a1 end-to-end notebook, we will start by exporting the [pretrained CNV-w1a1 network](https://github.com/Xilinx/brevitas/tree/master/src/brevitas_examples/bnn_pynq) to ONNX, importing that into FINN and running the \"tidy-up\" transformations to have a first look at the topology." 
+ "Similar to what we did in the TFC-w1a1 end-to-end notebook, we will start by exporting the [pretrained CNV-w1a1 network](https://github.com/Xilinx/brevitas/tree/master/src/brevitas_examples/bnn_pynq) to ONNX, importing that into FINN and running the \"tidy-up\" transformations to have a first look at the topology. The network will be exported in QONNX format and then converted into the FINN-ONNX format to prepare it for the FINN compiler." ] }, { @@ -81,17 +81,23 @@ "metadata": {}, "outputs": [], "source": [ + "import torch\n", "import onnx\n", "from finn.util.test import get_test_model_trained\n", - "import brevitas.onnx as bo\n", + "from brevitas.export import export_qonnx\n", + "from qonnx.util.cleanup import cleanup as qonnx_cleanup\n", "from qonnx.core.modelwrapper import ModelWrapper\n", + "from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN\n", "from qonnx.transformation.infer_shapes import InferShapes\n", "from qonnx.transformation.fold_constants import FoldConstants\n", "from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames, RemoveStaticGraphInputs\n", "\n", "cnv = get_test_model_trained(\"CNV\", 1, 1)\n", - "bo.export_finn_onnx(cnv, (1, 3, 32, 32), build_dir + \"/end2end_cnv_w1a1_export.onnx\")\n", - "model = ModelWrapper(build_dir + \"/end2end_cnv_w1a1_export.onnx\")\n", + "export_onnx_path = build_dir + \"/end2end_cnv_w1a1_export.onnx\"\n", + "export_qonnx(cnv, torch.randn(1, 3, 32, 32), export_onnx_path)\n", + "qonnx_cleanup(export_onnx_path, out_file=export_onnx_path)\n", + "model = ModelWrapper(export_onnx_path)\n", + "model = model.transform(ConvertQONNXtoFINN())\n", "model = model.transform(InferShapes())\n", "model = model.transform(FoldConstants())\n", "model = model.transform(GiveUniqueNodeNames())\n", @@ -148,10 +154,12 @@ "# preprocessing: torchvision's ToTensor divides uint8 inputs by 255\n", "totensor_pyt = ToTensor()\n", "chkpt_preproc_name = 
build_dir+\"/end2end_cnv_w1a1_preproc.onnx\"\n", - "bo.export_finn_onnx(totensor_pyt, ishape, chkpt_preproc_name)\n", + "export_qonnx(totensor_pyt, torch.randn(ishape), chkpt_preproc_name)\n", + "qonnx_cleanup(chkpt_preproc_name, out_file=chkpt_preproc_name)\n", + "pre_model = ModelWrapper(chkpt_preproc_name)\n", + "pre_model = pre_model.transform(ConvertQONNXtoFINN())\n", "\n", "# join preprocessing and core model\n", - "pre_model = ModelWrapper(chkpt_preproc_name)\n", "model = model.transform(MergeONNXModels(pre_model))\n", "# add input quantization annotation: UINT8 for all BNN-PYNQ models\n", "global_inp_name = model.graph.input[0].name\n", @@ -199,7 +207,7 @@ "\n", "![](cnv-mp-fc.png)\n", "\n", - "Note how the convolution layer looks very similar to the fully connected one in terms of the matrix-vector-threshold unit (MVTU), but now the MVTU is preceded by a sliding window unit that produces the matrix from the input image. All of these building blocks, including the `MaxPool` layer you see in this figure, exist as templated Vitis HLS C++ functions in [finn-hlslib](https://github.com/Xilinx/finn-hlslib).\n", + "Note how the convolution layer looks very similar to the fully connected one in terms of the matrix-vector-threshold unit (MVTU) or sometimes called matrix-vector-activation unit (MVAU). But now the MVTU is preceded by a sliding window unit that produces the matrix from the input image. All of these building blocks, including the `MaxPool` layer you see in this figure, exist as templated Vitis HLS C++ functions in [finn-hlslib](https://github.com/Xilinx/finn-hlslib) and/or as RTL modules in [finn-rtllib](https://github.com/Xilinx/finn/tree/main/finn-rtllib).\n", "\n", "\n", "To target this kind of hardware architecture with our network we'll apply a convolution lowering transformation, in addition to streamlining. 
You may recall the *streamlining transformation* that we applied to the TFC-w1a1 network, which is a series of mathematical simplifications that allow us to get rid of floating point scaling operations by implementing few-bit activations as thresholding operations. \n", @@ -240,11 +248,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We won't go into too much detail about what happens in each transformation and why they are called in the particular order they are (feel free to visualize the intermediate steps using Netron yourself if you are curious) but here is a brief summmmary:\n", + "We won't go into too much detail about what happens in each transformation and why they are called in the particular order they are (feel free to visualize the intermediate steps using Netron yourself if you are curious) but here is a brief summary:\n", "\n", "* `Streamline` moves floating point scaling and addition operations closer to the input of the nearest thresholding activation and absorbs them into thresholds\n", "* `LowerConvsToMatMul` converts ONNX `Conv` nodes into sequences of `Im2Col, MatMul` nodes as discussed above. `Im2Col` is a custom FINN ONNX high-level node type that implements the sliding window operator.\n", - "* `MakeMaxPoolNHWC` and `AbsorbTransposeIntoMultiThreshold` convert the *data layout* of the network into the NHWC data layout that finn-hlslib primitives use. NCHW means the tensor dimensions are ordered as `(N : batch, H : height, W : width, C : channels)` (assuming 2D images). The ONNX standard ops normally use the NCHW layout, but the ONNX intermediate representation itself does not dictate any data layout.\n", + "* `MakeMaxPoolNHWC` and `AbsorbTransposeIntoMultiThreshold` convert the *data layout* of the network into the NHWC data layout that finn-hlslib and finn-rtllib primitives use. NCHW means the tensor dimensions are ordered as `(N : batch, H : height, W : width, C : channels)` (assuming 2D images). 
The ONNX standard ops normally use the NCHW layout, but the ONNX intermediate representation itself does not dictate any data layout.\n", "* You may recall `ConvertBipolarMatMulToXnorPopcount` from the TFC-w1a1 example, which is needed to implement bipolar-by-bipolar (w1a1) networks correctly using finn-hlslib.\n", "\n", "Let's visualize the streamlined and lowered network with Netron. Observe how all the `Conv` nodes have turned into pairs of `Im2Col, MatMul` nodes, and many nodes including `BatchNorm, Mul, Add` nodes have disappeared and replaced with `MultiThreshold` nodes." @@ -263,9 +271,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## 3. Partitioning, Conversion to HLS Layers and Folding\n", + "## 3. Partitioning, Conversion to HW Layers and Folding\n", "\n", - "The next steps will be (again) very similar to what we did for the TFC-w1a1 network. We'll first convert the layers that we can put into the FPGA into their HLS equivalents and separate them out into a *dataflow partition*:\n" + "The next steps will be (again) very similar to what we did for the TFC-w1a1 network. 
We'll first convert the layers that we can put into the FPGA into their HW equivalents, separate them out into a *dataflow partition* and specialize them to HLS variants:\n" ] }, { @@ -274,27 +282,25 @@ "metadata": {}, "outputs": [], "source": [ - "import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls\n", + "import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw\n", "from finn.transformation.fpgadataflow.create_dataflow_partition import (\n", " CreateDataflowPartition,\n", ")\n", "from finn.transformation.move_reshape import RemoveCNVtoFCFlatten\n", + "from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers\n", "from qonnx.custom_op.registry import getCustomOp\n", "from qonnx.transformation.infer_data_layouts import InferDataLayouts\n", "\n", - "# choose the memory mode for the MVTU units, decoupled or const\n", - "mem_mode = \"decoupled\"\n", - "\n", "model = ModelWrapper(build_dir + \"/end2end_cnv_w1a1_streamlined.onnx\")\n", - "model = model.transform(to_hls.InferBinaryMatrixVectorActivation(mem_mode))\n", - "model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode))\n", + "model = model.transform(to_hw.InferBinaryMatrixVectorActivation())\n", + "model = model.transform(to_hw.InferQuantizedMatrixVectorActivation())\n", "# TopK to LabelSelect\n", - "model = model.transform(to_hls.InferLabelSelectLayer())\n", + "model = model.transform(to_hw.InferLabelSelectLayer())\n", "# input quantization (if any) to standalone thresholding\n", - "model = model.transform(to_hls.InferThresholdingLayer())\n", - "model = model.transform(to_hls.InferConvInpGen())\n", - "model = model.transform(to_hls.InferStreamingMaxPool())\n", - "# get rid of Reshape(-1, 1) operation between hlslib nodes\n", + "model = model.transform(to_hw.InferThresholdingLayer())\n", + "model = model.transform(to_hw.InferConvInpGen())\n", + "model = model.transform(to_hw.InferStreamingMaxPool())\n", + "# get rid of Reshape(-1, 1) 
operation between hw nodes\n", "model = model.transform(RemoveCNVtoFCFlatten())\n", "# get rid of Tranpose -> Tranpose identity seq\n", "model = model.transform(absorb.AbsorbConsecutiveTransposes())\n", @@ -306,7 +312,9 @@ "sdp_node = getCustomOp(sdp_node)\n", "dataflow_model_filename = sdp_node.get_nodeattr(\"model\")\n", "# save the dataflow partition with a different name for easier access\n", + "# and specialize the layers to HLS variants\n", "dataflow_model = ModelWrapper(dataflow_model_filename)\n", + "dataflow_model = dataflow_model.transform(SpecializeLayers())\n", "dataflow_model.save(build_dir + \"/end2end_cnv_w1a1_dataflow_model.onnx\")" ] }, @@ -314,7 +322,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Notice the additional `RemoveCNVtoFCFlatten` transformation that was not used for TFC-w1a1. In the last Netron visualization you may have noticed a `Reshape` operation towards the end of the network where the convolutional part of the network ends and the fully-connected layers started. That `Reshape` is essentialy a tensor flattening operation, which we can remove for the purposes of hardware implementation. We can examine the contents of the dataflow partition with Netron, and observe the `ConvolutionInputGenerator`, `MatrixVectorActivation` and `StreamingMaxPool_Batch` nodes that implement the sliding window, matrix multiply and maxpool operations in hlslib. *Note that the MatrixVectorActivation instances following the ConvolutionInputGenerator nodes are really implementing the convolutions, despite the name. The final three MatrixVectorActivation instances implement actual FC layers.*" + "Notice the additional `RemoveCNVtoFCFlatten` transformation that was not used for TFC-w1a1. In the last Netron visualization you may have noticed a `Reshape` operation towards the end of the network where the convolutional part of the network ends and the fully-connected layers started. 
That `Reshape` is essentially a tensor flattening operation, which we can remove for the purposes of hardware implementation. We can examine the contents of the dataflow partition with Netron, and observe the `ConvolutionInputGenerator`, `MatrixVectorActivation` and `StreamingMaxPool_Batch` nodes that implement the sliding window, matrix multiply and maxpool operations. *Note that the MatrixVectorActivation instances following the ConvolutionInputGenerator nodes are really implementing the convolutions, despite the name. The final three MatrixVectorActivation instances implement actual FC layers.*" ] }, { @@ -356,7 +364,7 @@ "outputs": [], "source": [ "model = ModelWrapper(build_dir + \"/end2end_cnv_w1a1_dataflow_model.onnx\")\n", - "fc_layers = model.get_nodes_by_op_type(\"MatrixVectorActivation\")\n", + "fc_layers = model.get_nodes_by_op_type(\"MVAU_hls\")\n", "# each tuple is (PE, SIMD, in_fifo_depth) for a layer\n", "folding = [\n", " (16, 3, [128]),\n", @@ -376,7 +384,7 @@ " fcl_inst.set_nodeattr(\"inFIFODepths\", ififodepth)\n", "\n", "# use same SIMD values for the sliding window operators\n", - "swg_layers = model.get_nodes_by_op_type(\"ConvolutionInputGenerator\")\n", + "swg_layers = model.get_nodes_by_op_type(\"ConvolutionInputGenerator_rtl\")\n", "for i in range(len(swg_layers)):\n", " swg_inst = getCustomOp(swg_layers[i])\n", " simd = folding[i][1]\n", @@ -390,7 +398,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Below we visualize in Netron to observe the `StreamingDataWidthConverter` and `StreamingFIFO` nodes that have been inserted into graph, as well as the folding factors in the `PE` and `SIMD` attributes of each `MatrixVectorActivation`." + "Below we visualize in Netron to observe the folding factors in the `PE` and `SIMD` attributes of each `MVAU_hls`."
] }, { @@ -508,12 +516,13 @@ "metadata": {}, "outputs": [], "source": [ - "import pkg_resources as pk\n", + "import importlib_resources\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "\n", - "fn = pk.resource_filename(\"finn.qnn-data\", \"cifar10/cifar10-test-data-class3.npz\")\n", - "x = np.load(fn)[\"arr_0\"]\n", + "ref = importlib_resources.files(\"finn.qnn-data\") / \"cifar10/cifar10-test-data-class3.npz\"\n", + "with importlib_resources.as_file(ref) as fn:\n", + " x = np.load(fn)[\"arr_0\"]\n", "x = x.reshape(3, 32,32).transpose(1, 2, 0)\n", "plt.imshow(x)" ] @@ -632,9 +641,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.10.12" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/notebooks/end2end_example/bnn-pynq/finn-design-flow-example.svg b/notebooks/end2end_example/bnn-pynq/finn-design-flow-example.svg index fa36be96c5..561770f2da 100755 --- a/notebooks/end2end_example/bnn-pynq/finn-design-flow-example.svg +++ b/notebooks/end2end_example/bnn-pynq/finn-design-flow-example.svg @@ -1 +1 @@ - + diff --git a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb index eec17b2fa7..bbaa74dbff 100644 --- a/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb +++ b/notebooks/end2end_example/bnn-pynq/tfc_end2end_example.ipynb @@ -33,7 +33,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The white fields show the state of the network representation in the respective step. The colored fields represent the transformations that are applied to the network to achieve a certain result. The diagram is divided into 5 sections represented by a different color, each of it includes several flow steps. 
The flow starts in top left corner with Brevitas export (green section), followed by the preparation of the network (blue section) for the Vitis HLS synthesis and Vivado IPI stitching (orange section), and finally building a PYNQ overlay bitfile and testing it on a PYNQ board (yellow section).\n", + "The white fields show the state of the network representation in the respective step. The colored fields represent the transformations that are applied to the network to achieve a certain result. The diagram is divided into 5 sections represented by a different color, each of it includes several flow steps. The flow starts in top left corner with Brevitas export (green section), followed by the preparation of the network (blue section) to bring the network into a form in which each layer can be represented by either a Vitis HLS function or a Verilog module. The model then gets passed to Vivado IPI stitching (orange section), and finally a PYNQ overlay bitfile is built and can be tested on a PYNQ board (yellow section).\n", "There is an additional section for functional verification (red section) on the right side of the diagram, which we will not cover in this notebook. 
For details please take a look in the verification notebook which you can find [here](tfc_end2end_verification.ipynb)\n", "\n", "\n", @@ -81,19 +81,23 @@ "metadata": {}, "outputs": [], "source": [ + "import torch\n", "import onnx\n", "from finn.util.test import get_test_model_trained\n", - "import brevitas.onnx as bo\n", + "from brevitas.export import export_qonnx\n", + "from qonnx.util.cleanup import cleanup as qonnx_cleanup\n", "\n", "tfc = get_test_model_trained(\"TFC\", 1, 1)\n", - "bo.export_finn_onnx(tfc, (1, 1, 28, 28), build_dir+\"/tfc_w1_a1.onnx\"); # semicolon added to suppress log" + "export_onnx_path = build_dir+\"/tfc_w1_a1.onnx\"\n", + "export_qonnx(tfc, torch.randn(1, 1, 28, 28), build_dir+\"/tfc_w1_a1.onnx\"); # semicolon added to suppress log\n", + "qonnx_cleanup(export_onnx_path, out_file=export_onnx_path)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The model was now exported, loaded with the pretrained weights and saved under the name \"tfc_w1_a1.onnx\".\n", + "The model was now exported in QONNX format, loaded with the pretrained weights and saved under the name \"tfc_w1_a1.onnx\".\n", "To visualize the exported model, Netron can be used. Netron is a visualizer for neural networks and allows interactive investigation of network properties. For example, you can click on the individual nodes and view the properties." ] }, @@ -110,7 +114,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now that we have the model in .onnx format, we can work with it using FINN. For that, `ModelWrapper` is used. It is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model. 'ModelWrapper' is imported from the [QONNX repo](https://github.com/fastmachinelearning/qonnx), this repository contains several functionality that is used in FINN." + "Now that we have the model in .onnx format, we can work with it using FINN. For that, `ModelWrapper` is used. 
It is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model. `ModelWrapper` is imported from the [QONNX repo](https://github.com/fastmachinelearning/qonnx), this repository contains several functionality that is used in FINN. The model was exported in QONNX format, to feed it into the FINN flow, our first step is to convert it to the FINN-ONNX format." ] }, { @@ -120,7 +124,26 @@ "outputs": [], "source": [ "from qonnx.core.modelwrapper import ModelWrapper\n", - "model = ModelWrapper(build_dir+\"/tfc_w1_a1.onnx\")" + "from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN\n", + "model = ModelWrapper(build_dir+\"/tfc_w1_a1.onnx\")\n", + "model = model.transform(ConvertQONNXtoFINN())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After the conversion we save the model and visualize it using Netron. As you can see, quantization is now expressed differently. Where we had Quant nodes before, there are now MultiThreshold nodes present in the graph." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model.save(build_dir+\"/tfc_w1_a1_finn.onnx\")\n", + "showInNetron(build_dir+\"/tfc_w1_a1_finn.onnx\")" ] }, { @@ -143,8 +166,9 @@ "* [FINN-style Dataflow Architectures](#dataflow_arch)\n", "* [Tidy-up transformations](#basic_trafo)\n", "* [Streamlining](#streamline)\n", - "* [Conversion to HLS layers](#hls_layers)\n", + "* [Conversion to HW layers](#hw_layers)\n", "* [Creating a Dataflow Partition](#dataflow_partition)\n", + "* [Specialize layers](#specialize_layers)\n", "* [Folding and Datawidth Converter, FIFO and TLastMarker Insertion](#folding)\n", "\n", "\n", @@ -161,7 +185,7 @@ "\n", "![](finn-hw-arch.png)\n", "\n", - "In practice, the compute arrays are instantiated by function calls to optimized Vitis HLS building blocks from the [finn-hlslib](https://github.com/Xilinx/finn-hlslib) library. As these function calls can only handle certain patterns/cases, we need to transform the network into an appropriate form so that we can replace network layers with these function calls, which is the goal of the network preparation process." + "In practice, the compute arrays are instantiated by function calls to optimized Vitis HLS building blocks from the [finn-hlslib](https://github.com/Xilinx/finn-hlslib) library or by Verilog modules from the [finn-rtllib](https://github.com/Xilinx/finn/tree/main/finn-rtllib). As these function calls/modules can only handle certain patterns/cases, we need to transform the network into an appropriate form so that we can replace network layers with these function calls/modules, which is the goal of the network preparation process." 
] }, { @@ -248,7 +272,7 @@ "\n", "In FINN, we can bake some of these pre/postprocessing operatings into the graph, and in some cases these can be highly beneficial for performance by allowing our accelerator to directly consume raw data instead of going through CPU preprocessing. \n", "\n", - "We'll demonstrate this for our small image classification network as follows. Brevitas preprocesses BNN-PYNQ network inputs with `torchvision.transforms.ToTensor()` [prior to training](https://github.com/Xilinx/brevitas/blob/master/src/brevitas_examples/bnn_pynq/trainer.py#L86), which converts 8-bit RGB values into floats between 0 and 1 by dividing the input by 255. We can achieve the same effect in FINN by exporting a single-node ONNX graph for division by 255 (which already exists as `finn.util.pytorch.ToTensor` and merging this with our original model. Finally, we're going to mark our input tensor as 8-bit to let FINN know which level of precision to use." + "We'll demonstrate this for our small image classification network as follows. Brevitas preprocesses BNN-PYNQ network inputs with `torchvision.transforms.ToTensor()` [prior to training](https://github.com/Xilinx/brevitas/blob/master/src/brevitas_examples/bnn_pynq/trainer.py#L93), which converts 8-bit RGB values into floats between 0 and 1 by dividing the input by 255. We can achieve the same effect in FINN by exporting a single-node ONNX graph for division by 255 (which already exists as `finn.util.pytorch.ToTensor` and merging this with our original model. Finally, we're going to mark our input tensor as 8-bit to let FINN know which level of precision to use." 
] }, { @@ -267,10 +291,12 @@ "# preprocessing: torchvision's ToTensor divides uint8 inputs by 255\n", "totensor_pyt = ToTensor()\n", "chkpt_preproc_name = build_dir+\"/tfc_w1_a1_preproc.onnx\"\n", - "bo.export_finn_onnx(totensor_pyt, ishape, chkpt_preproc_name)\n", + "export_qonnx(totensor_pyt, torch.randn(ishape), chkpt_preproc_name)\n", + "qonnx_cleanup(chkpt_preproc_name, out_file=chkpt_preproc_name)\n", + "pre_model = ModelWrapper(chkpt_preproc_name)\n", + "pre_model = pre_model.transform(ConvertQONNXtoFINN())\n", "\n", "# join preprocessing and core model\n", - "pre_model = ModelWrapper(chkpt_preproc_name)\n", "model = model.transform(MergeONNXModels(pre_model))\n", "# add input quantization annotation: UINT8 for all BNN-PYNQ models\n", "global_inp_name = model.graph.input[0].name\n", @@ -399,32 +425,25 @@ "model = model.transform(InferDataLayouts())\n", "model = model.transform(RemoveUnusedTensors())\n", "\n", - "model.save(build_dir+\"/tfc_w1a1_ready_for_hls_conversion.onnx\")\n", - "showInNetron(build_dir+\"/tfc_w1a1_ready_for_hls_conversion.onnx\")" + "model.save(build_dir+\"/tfc_w1a1_ready_for_hw_conversion.onnx\")\n", + "showInNetron(build_dir+\"/tfc_w1a1_ready_for_hw_conversion.onnx\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Observe the pairs of `XnorPopcountmatMul` and `MultiThreshold` layers following each other -- this is the particular pattern that the next step will be looking for in order to convert them to HLS layers." + "Observe the pairs of `XnorPopcountmatMul` and `MultiThreshold` layers following each other -- this is the particular pattern that the next step will be looking for in order to convert them to hardware (HW) layers." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Conversion to HLS layers \n", - "Converts the nodes to HLS layers that correspond to the functions in [finn-hls library](https://finn-hlslib.readthedocs.io/en/latest/). 
In our case this transformation converts pairs of binary XnorPopcountMatMul layers to MatrixVectorActivation layers. Any immediately following MultiThreshold layers will also be absorbed into the MVTU.\n", + "### Conversion to HW layers \n", + "Converts the nodes to HW layers, these layers are abstraction layers that do not directly correspond to an HLS or Verilog implementation but they will be converted in either one later in the flow. In our case this transformation converts pairs of binary XnorPopcountMatMul layers to MVAU layers (matrix vector activation unit). Any immediately following MultiThreshold layers will also be absorbed into the MVAU.\n", "\n", - "Below is the code for the transformation and the network is visualized using netron to create the new structure with `MatrixVectorActivation` nodes, which will correspond to a function call from the [finn-hlslib](https://finn-hlslib.readthedocs.io/en/latest/library/matrixvector.html) library." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Note:** The transformation `to_hls.InferBinaryMatrixVectorActivation` gets the string \"decoupled\" as argument, this indicates the `mem_mode` for the weights. In FINN there are different options to set the way the weights are stored and accessed. For details please have a look on the [FINN readthedocs website](https://finn.readthedocs.io/) under Internals." + "Below is the code for the transformation and the network is visualized using netron to create the new structure with `MVAU` nodes." 
] }, { @@ -433,22 +452,15 @@ "metadata": {}, "outputs": [], "source": [ - "import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls\n", - "model = ModelWrapper(build_dir+\"/tfc_w1a1_ready_for_hls_conversion.onnx\")\n", - "model = model.transform(to_hls.InferBinaryMatrixVectorActivation(\"decoupled\"))\n", + "import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw\n", + "model = ModelWrapper(build_dir+\"/tfc_w1a1_ready_for_hw_conversion.onnx\")\n", + "model = model.transform(to_hw.InferBinaryMatrixVectorActivation())\n", "# TopK to LabelSelect\n", - "model = model.transform(to_hls.InferLabelSelectLayer())\n", + "model = model.transform(to_hw.InferLabelSelectLayer())\n", "# input quantization (if any) to standalone thresholding\n", - "model = model.transform(to_hls.InferThresholdingLayer())\n", - "model.save(build_dir+\"/tfc_w1_a1_hls_layers.onnx\")\n", - "showInNetron(build_dir+\"/tfc_w1_a1_hls_layers.onnx\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Each MatrixVectorActivation node has two attributes that specify the degree of folding, PE and SIMD. In all nodes the values for these attributes are set as default to 1, which would correspond to a maximum folding (time multiplexing) and thus minimum performance. We will shortly cover how these can be adjusted, but first we want to separate the HLS layers from the non-HLS layers in this network." + "model = model.transform(to_hw.InferThresholdingLayer())\n", + "model.save(build_dir+\"/tfc_w1_a1_hw_layers.onnx\")\n", + "showInNetron(build_dir+\"/tfc_w1_a1_hw_layers.onnx\")" ] }, { @@ -457,7 +469,7 @@ "source": [ "### Creating a Dataflow Partition \n", "\n", - "In the graph above, you can see that there is a mixture of FINN HLS layers (MatrixVectorActivation and Thresholding_Batch) with one regular ONNX layers (Reshape). To create a bitstream, FINN needs a model with only HLS layers. 
In order to achieve this, we will use the `CreateDataflowPartition` transformation to create a \"dataflow partition\" in this graph, separating out the HLS layers into another model, and replacing them with a placeholder layer called StreamingDataflowPartition." + "In the graph above, you can see that there is a mixture of FINN HW layers (`MVAU` and `Thresholding`) with one regular ONNX layers (Reshape). To create a bitstream, FINN needs a model with only HW layers. In order to achieve this, we will use the `CreateDataflowPartition` transformation to create a \"dataflow partition\" in this graph, separating out the HLS layers into another model, and replacing them with a placeholder layer called StreamingDataflowPartition." ] }, { @@ -468,7 +480,7 @@ "source": [ "from finn.transformation.fpgadataflow.create_dataflow_partition import CreateDataflowPartition\n", "\n", - "model = ModelWrapper(build_dir+\"/tfc_w1_a1_hls_layers.onnx\")\n", + "model = ModelWrapper(build_dir+\"/tfc_w1_a1_hw_layers.onnx\")\n", "parent_model = model.transform(CreateDataflowPartition())\n", "parent_model.save(build_dir+\"/tfc_w1_a1_dataflow_parent.onnx\")\n", "showInNetron(build_dir+\"/tfc_w1_a1_dataflow_parent.onnx\")" @@ -478,7 +490,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can see that the `MatrixVectorActivation` instances and the `Thresholding_Batch` in the beginning have all been replaced with a single `StreamingDataflowPartition`, which has an attribute `model` that points to the extracted, HLS dataflow-only graph:" + "We can see that the `MVAU` instances and the `Thresholding` in the beginning have all been replaced with a single `StreamingDataflowPartition`, which has an attribute `model` that points to the extracted, HW dataflow-only graph:" ] }, { @@ -498,7 +510,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can see all the extracted `MatrixVectorActivation` instances and the `Thresholding_Batch` have been moved to the child (dataflow) model. 
We will load the child model with `ModelWrapper` and continue working on it." + "We can see all the extracted `MVAU` instances and the `Thresholding` have been moved to the child (dataflow) model. We will load the child model with `ModelWrapper` and continue working on it." ] }, { @@ -510,6 +522,60 @@ "model = ModelWrapper(dataflow_model_filename)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Specialize layers " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The network is converted to HW abstraction layers and we have excluded the non-HW layers to continue with the processing of the model. HW abstraction layers are abstract (placeholder) layers that can be either implemented in HLS or as an RTL module using FINN. In the next flow step, we convert each of these layers to either an HLS or RTL variant by calling the `SpecializeLayers` transformation. It is possible to let the FINN flow know a preference for the implementation style `{\"hls\", \"rtl\"}` and depending on the layer type this wish will be fulfilled or it will be set to a reasonable default. In the tfc example, we will set all layers to their HLS variants. To showcase how to set the preferred implementation, we will set the node attribute in the `Thresholding` layer to `\"hls\"`, for the `MVAUs` and the `LabelSelect` we will leave this node attribute empty and in this case by default it will be set to HLS." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "thresh_node = model.get_nodes_by_op_type(\"Thresholding\")[0]\n", + "thresh_node_inst = getCustomOp(thresh_node)\n", + "thresh_node_inst.set_nodeattr(\"preferred_impl_style\", \"hls\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Then we will call `SpecializeLayers` to convert each HW abstraction layer to (in this case) an HLS variant." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers\n", + "model = model.transform(SpecializeLayers())\n", + "\n", + "model.save(build_dir+\"/tfc_w1_a1_specialize_layers.onnx\")\n", + "showInNetron(build_dir+\"/tfc_w1_a1_specialize_layers.onnx\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Each node type has now a suffix (`_hls`) and the module (`\n", + "finn.custom_op.fpgadataflow.hls` also indicates that that the HLS variant of the layer is selected.\n", + "We can now proceed by adjusting the parallelism of each node to customize the performance and resource usage.)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -518,14 +584,17 @@ "\n", "*Folding* in FINN describes how much a layer is time-multiplexed in terms of execution resources. There are several *folding factors* for each layer, controlled by the PE (parallelization over outputs) and SIMD (parallelization over inputs) parameters as described by the original [FINN paper](https://arxiv.org/pdf/1612.07119). The higher the PE and SIMD values are set, the faster the generated accelerator will run, and the more FPGA resources it will consume. \n", "\n", - "Since the folding parameters are node attributes, they can be easily accessed and changed using a helper function of the `ModelWrapper`. But first we take a closer look at one of the nodes that implement a MatrixVectorActivation operation. This is where the Netron visualization helps us, in the above diagram we can see that the model contains four MatrixVectorActivation. So as an example we extract the second node of the graph." + "Each MVAU_hls node has two attributes that specify the degree of folding, PE and SIMD. In all nodes the values for these attributes are set as default to 1, which would correspond to a maximum folding (time multiplexing) and thus minimum performance. 
\n", + "\n", + "Since the folding parameters are node attributes, they can be easily accessed and changed using a helper function of the `ModelWrapper`. But first we take a closer look at one of the nodes that implement a Matrix-Vector-Activation operation. This is where the Netron visualization helps us, in the above diagram we can see that the model contains four `MVAUs`. So as an example we extract the second node of the graph." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "We can use the higher-level [HLSCustomOp](https://github.com/Xilinx/finn/blob/main/src/finn/custom_op/fpgadataflow/hlscustomop.py) wrappers for this node. These wrappers provide easy access to specific properties of these nodes, such as the folding factors (PE and SIMD). Let's have a look at which node attributes are defined by the CustomOp wrapper, and adjust the SIMD and PE attributes." + "We can use the higher-level CustomOp wrappers for this node. These wrappers provide easy access to specific properties of these nodes, such as the folding factors (PE and SIMD). Above, we have already used this abstraction to set the node attribute of the Thresholding HW layer.\n", + "Let's have a look at which node attributes are defined by the CustomOp wrapper, and adjust the SIMD and PE attributes." 
] }, { @@ -556,7 +625,7 @@ "metadata": {}, "outputs": [], "source": [ - "fc_layers = model.get_nodes_by_op_type(\"MatrixVectorActivation\")\n", + "fc_layers = model.get_nodes_by_op_type(\"MVAU_hls\")\n", "# (PE, SIMD, in_fifo_depth, out_fifo_depth, ramstyle) for each layer\n", "config = [\n", " (16, 49, [16], [64], \"block\"),\n", @@ -573,7 +642,7 @@ " fcl_inst.set_nodeattr(\"ram_style\", ramstyle)\n", " \n", "# set parallelism for input quantizer to be same as first layer's SIMD\n", - "inp_qnt_node = model.get_nodes_by_op_type(\"Thresholding_Batch\")[0]\n", + "inp_qnt_node = model.get_nodes_by_op_type(\"Thresholding_hls\")[0]\n", "inp_qnt = getCustomOp(inp_qnt_node)\n", "inp_qnt.set_nodeattr(\"PE\", 49)" ] @@ -650,7 +719,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "In previous versions of FINN, we had to manually go through several steps to generate HLS code, stitch IP, create a PYNQ project and run synthesis. All these steps are now performed by the `ZynqBuild` transform (or the `VitisBuild` transform for Alveo). **As this involves calling HLS synthesis and Vivado synthesis, this transformation will run for some time (up to half an hour depending on your PC).**" + "In previous versions of FINN, we had to manually go through several steps to generate HLS/RTL code, stitch IP, create a PYNQ project and run synthesis. All these steps are now performed by the `ZynqBuild` transform (or the `VitisBuild` transform for Alveo). **As this involves calling HLS synthesis and Vivado synthesis, this transformation will run for some time (up to half an hour depending on your PC).**" ] }, { @@ -732,7 +801,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can see that `StreamingFIFO` and `StreamingDataWidthConverter` instances have been automatically inserted into the graph prior to hardware build. 
Transformations like `ZynqBuild` use the `metadata_props` of the model to put in additional metadata information relevant to the results of the transformation. Let's examine the metadata for the current graph containing all layers:" + "We can see that `StreamingFIFO` and `StreamingDataWidthConverter` instances have been automatically inserted into the graph prior to hardware build. Both layer types are inserted as RTL variants. Transformations like `ZynqBuild` use the `metadata_props` of the model to put in additional metadata information relevant to the results of the transformation. Let's examine the metadata for the current graph containing all layers:" ] }, { @@ -1006,9 +1075,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.10.12" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb b/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb index 6c3b796509..a07a8d2254 100644 --- a/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb +++ b/notebooks/end2end_example/bnn-pynq/tfc_end2end_verification.ipynb @@ -7,16 +7,16 @@ "# FINN - Functional Verification of End-to-End Flow\n", "-----------------------------------------------------------------\n", "\n", - "**Important: This notebook depends on the tfc_end2end_example notebook, because we are using models that are available at intermediate steps in the end-to-end flow. So please make sure the needed .onnx files are generated to run this notebook.**\n", + "**Important: This notebook depends on the [tfc_end2end_example](tfc_end2end_example.ipynb) notebook, because we are using models that are available at intermediate steps in the end-to-end flow. 
So please make sure the needed .onnx files are generated to run this notebook.**\n", "\n", - "In this notebook, we will show how to take the intermediate results of the end-to-end tfc example and verify their functionality with different methods. In the following picture you can see the section in the end-to-end flow about the *Simulation & Emulation Flows*. Besides the methods in this notebook, there is another one that is covered in the Jupyter notebook [tfc_end2end_example](tfc_end2end_example.ipynb): remote execution. The remote execution allows functional verification directly on the PYNQ board, for details please have a look at the mentioned Jupyter notebook." + "In this notebook, we will show how to take the intermediate results of the end-to-end tfc example and verify their functionality with different methods. In the following picture you can see the section in the end-to-end flow about the *Simulation & Emulation Flows*. " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "\"Drawing\"" + "\"Drawing\"" ] }, { @@ -72,9 +72,9 @@ "source": [ "## Simulation using Python \n", "\n", - "If an ONNX model consists of [standard ONNX](https://github.com/onnx/onnx/blob/main/docs/Operators.md) nodes and/or FINN custom operations that do not belong to the fpgadataflow (`backend` $\\neq$ `fpgadataflow`) this model can be checked for functionality using Python.\n", + "If an ONNX model consists of [standard ONNX](https://github.com/onnx/onnx/blob/main/docs/Operators.md) nodes and/or FINN custom operations that do not belong to the fpgadataflow (`backend` $\\neq$ `fpgadataflow.hls` or `backend` $\\neq$ `fpgadataflow.rtl`) this model can be checked for functionality using Python.\n", "\n", - "To simulate a standard ONNX node [onnxruntime](https://github.com/microsoft/onnxruntime) is used. onnxruntime is an open source tool developed by Microsoft to run standard ONNX nodes. For the FINN custom op nodes execution, functions are defined. 
The following is an example of the execution function of a XNOR popcount node.\n" + "To simulate a standard ONNX node [onnxruntime](https://github.com/microsoft/onnxruntime) is used. onnxruntime is an open source tool developed by Microsoft to run standard ONNX nodes. For the FINN custom op nodes execution, functions are defined. The following is an example of the execution function of an XNOR popcount node.\n" ] }, { @@ -95,7 +95,7 @@ "\n", "This execution function and onnxruntime is used when `execute_onnx` from `onnx_exec` is applied to the model. The model is then simulated node by node and the result is stored in a context dictionary, which contains the values of each tensor at the end of the execution. To get the result, only the output tensor has to be extracted.\n", "\n", - "The procedure is shown below. We take the model right before the nodes should be converted into HLS layers and generate an input tensor to pass to the execution function. The input tensor is generated from the Brevitas example inputs." + "The procedure is shown below. We take the model right before the nodes should be converted into HW layers and generate an input tensor to pass to the execution function. The input tensor is generated from the Brevitas example inputs." 
] }, { @@ -108,7 +108,7 @@ "from qonnx.core.modelwrapper import ModelWrapper\n", "input_dict = {\"global_in\": nph.to_array(input_tensor)}\n", "\n", - "model_for_sim = ModelWrapper(build_dir+\"/tfc_w1a1_ready_for_hls_conversion.onnx\")" + "model_for_sim = ModelWrapper(build_dir+\"/tfc_w1a1_ready_for_hw_conversion.onnx\")" ] }, { @@ -121,12 +121,11 @@ "output_dict = oxe.execute_onnx(model_for_sim, input_dict, return_full_exec_context=False)\n", "output_pysim = output_dict[list(output_dict.keys())[0]]\n", "\n", - "\n", - "\n", - "if np.isclose(output_pysim, np.where(output_golden[0]==np.amax(output_golden[0])), atol=1e-3).all():\n", + "try:\n", + " assert np.isclose(output_pysim, np.where(output_golden[0]==np.amax(output_golden[0])), atol=1e-3).all()\n", " print(\"Results are the same!\")\n", - "else:\n", - " print(\"The results are not the same!\")" + "except AssertionError:\n", + " assert False, \"The results are not the same!\"" ] }, { @@ -142,7 +141,16 @@ "source": [ "## Simulation (cppsim) using C++\n", "\n", - "When dealing with HLS custom op nodes in FINN the simulation using Python is no longer sufficient. After the nodes have been converted to HLS layers, the simulation using C++ can be used. To do this, the input tensor is stored in a .npy file and C++ code is generated that reads the values from the .npy array, streams them to the corresponding finn-hlslib function and writes the result to a new .npy file. This in turn can be read in Python and processed in the FINN flow. For this example the model after setting the folding factors in the HLS layers is used, please be aware that this is not the full model, but the dataflow partition, so before executing at the end of this section we have to integrate the model back into the parent model." + "When dealing with HLS or RTL custom op nodes in FINN the simulation using Python is no longer sufficient. If the nodes are specialized to HLS layers, the simulation using C++ can be used. 
To do this, the input tensor is stored in a .npy file and C++ code is generated that reads the values from the .npy array, streams them to the corresponding `finn-hlslib` function and writes the result to a new .npy file. This in turn can be read in Python and processed in the FINN flow. For this example the model after setting the folding factors in the HLS variants of the layers, please be aware that this is not the full model, but the dataflow partition, so before executing at the end of this section we have to integrate the model back into the parent model." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "Note: HW layer can also be converted to RTL variants, in this case \"cppsim\" is not an option we can execute. If nevertheless \"cppsim\" is selected as execution mode for the layer, the execution defaults to the parent class. Like this, networks with a mix of HLS and RTL layers can be executed using \"cppsim\" for the HLS layers. \n", + "
" ] }, { @@ -159,7 +167,7 @@ "metadata": {}, "source": [ "To generate the code for this simulation and to generate the executable two transformations are used:\n", - "* `PrepareCppSim` which generates the C++ code for the corresponding hls layer\n", + "* `PrepareCppSim` which generates the C++ code for the corresponding HLS layer\n", "* `CompileCppSim` which compules the C++ code and stores the path to the executable" ] }, @@ -268,10 +276,11 @@ "output_dict = oxe.execute_onnx(parent_model, input_dict)\n", "output_cppsim = output_dict[list(output_dict.keys())[0]]\n", "\n", - "if np.isclose(output_cppsim, np.where(output_golden[0]==np.amax(output_golden[0])), atol=1e-3).all():\n", + "try:\n", + " assert np.isclose(output_cppsim, np.where(output_golden[0]==np.amax(output_golden[0])), atol=1e-3).all()\n", " print(\"Results are the same!\")\n", - "else:\n", - " print(\"The results are not the same!\")" + "except AssertionError:\n", + " assert False, \"The results are not the same!\"" ] }, { @@ -280,9 +289,9 @@ "source": [ "## Emulation (rtlsim) using PyVerilator\n", "\n", - "The emulation using [PyVerilator](https://github.com/maltanar/pyverilator) can be done after IP blocks are generated from the corresponding HLS layers. Pyverilator is a tool which makes it possible to simulate verilog files using verilator via a python interface.\n", + "The emulation using [PyVerilator](https://github.com/maltanar/pyverilator) can be done after IP blocks are generated from the corresponding HLS layers or for RTL layers directly using the generated Verilog files. Pyverilator is a tool which makes it possible to simulate verilog files using verilator via a python interface.\n", "\n", - "We have two ways to use rtlsim, one is to run the model node-by-node as with the simulation methods, but if the model is in the form of the dataflow partition, the part of the graph that consist of only HLS nodes could also be executed as whole." 
+ "We have two ways to use rtlsim, one is to run the model node-by-node as with the simulation methods, but if the model is in the form of the dataflow partition, the part of the graph that consist of only HLS/RTL nodes could also be executed as whole." ] }, { @@ -356,10 +365,11 @@ "output_dict = oxe.execute_onnx(model_for_rtlsim, input_dict)\n", "output_rtlsim = output_dict[list(output_dict.keys())[0]]\n", "\n", - "if np.isclose(output_rtlsim, np.where(output_golden[0]==np.amax(output_golden[0])), atol=1e-3).all():\n", + "try:\n", + " assert np.isclose(output_rtlsim, np.where(output_golden[0]==np.amax(output_golden[0])), atol=1e-3).all()\n", " print(\"Results are the same!\")\n", - "else:\n", - " print(\"The results are not the same!\")" + "except AssertionError:\n", + " assert False, \"The results are not the same!\"" ] }, { @@ -379,18 +389,14 @@ "source": [ "from finn.transformation.fpgadataflow.insert_dwc import InsertDWC\n", "from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO\n", + "from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers\n", "from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP\n", "\n", "child_model = ModelWrapper(build_dir + \"/tfc_w1_a1_dataflow_child.onnx\")\n", - "child_model = child_model.transform(InsertDWC())\n", - "\n", - "# set all impl_styles of the DWCs to hls to enable emulation\n", - "dwc_nodes = child_model.get_nodes_by_op_type(\"StreamingDataWidthConverter_Batch\")\n", - "for dwc in dwc_nodes:\n", - " dwc_inst = getCustomOp(dwc)\n", - " dwc_inst.set_nodeattr(\"impl_style\", \"hls\")\n", - " \n", + "child_model = child_model.transform(InsertDWC()) \n", "child_model = child_model.transform(InsertFIFO(create_shallow_fifos=True))\n", + "# DWC and FIFOs need to be specialized to either HLS or RTL variants\n", + "child_model = child_model.transform(SpecializeLayers())\n", "child_model.save(build_dir + \"/test.onnx\");\n", "child_model = 
child_model.transform(GiveUniqueNodeNames())\n", "child_model = child_model.transform(PrepareIP(test_fpga_part, target_clk_ns))\n", @@ -430,10 +436,11 @@ "metadata": {}, "outputs": [], "source": [ - "if np.isclose(output_rtlsim, np.where(output_golden[0]==np.amax(output_golden[0])), atol=1e-3).all():\n", + "try:\n", + " assert np.isclose(output_rtlsim, np.where(output_golden[0]==np.amax(output_golden[0])), atol=1e-3).all()\n", " print(\"Results are the same!\")\n", - "else:\n", - " print(\"The results are not the same!\")" + "except AssertionError:\n", + " assert False, \"The results are not the same!\"" ] } ], @@ -453,7 +460,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/notebooks/end2end_example/bnn-pynq/verification.png b/notebooks/end2end_example/bnn-pynq/verification.png deleted file mode 100755 index cb50ba1b67..0000000000 Binary files a/notebooks/end2end_example/bnn-pynq/verification.png and /dev/null differ diff --git a/notebooks/end2end_example/bnn-pynq/verification.svg b/notebooks/end2end_example/bnn-pynq/verification.svg new file mode 100755 index 0000000000..9cf8e86088 --- /dev/null +++ b/notebooks/end2end_example/bnn-pynq/verification.svg @@ -0,0 +1 @@ + diff --git a/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb index 3d77586258..da037050bb 100644 --- a/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb +++ b/notebooks/end2end_example/cybersecurity/1-train-mlp-with-brevitas.ipynb @@ -53,7 +53,7 @@ " * [(Option 1) Train the Model from Scratch](#train_scratch)\n", " * [(Option 2) Load Pre-Trained Parameters](#load_pretrained)\n", "* [Network Surgery Before Export](#network_surgery)\n", - "* [Export to FINN-ONNX](#export_finn_onnx)" + "* [Export to QONNX and Conversion to FINN-ONNX](#export_qonnx)" ] }, { @@ -62,8 +62,11 @@ 
"metadata": {}, "outputs": [], "source": [ + "import os\n", "import onnx\n", - "import torch" + "import torch\n", + "\n", + "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/end2end_example/cybersecurity\"" ] }, { @@ -483,13 +486,14 @@ "metadata": {}, "outputs": [], "source": [ + "import os\n", "import torch\n", "\n", "# Make sure the model is on CPU before loading a pretrained state_dict\n", "model = model.cpu()\n", "\n", "# Load pretrained weights\n", - "trained_state_dict = torch.load(\"state_dict.pth\")[\"models_state_dict\"][0]\n", + "trained_state_dict = torch.load(model_dir + \"/state_dict.pth\")[\"models_state_dict\"][0]\n", "\n", "model.load_state_dict(trained_state_dict, strict=False)" ] @@ -663,12 +667,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Export to FINN-ONNX \n", + "# Export to QONNX and Conversion to FINN-ONNX \n", "\n", "\n", "[ONNX](https://onnx.ai/) is an open format built to represent machine learning models, and the FINN compiler expects an ONNX model as input. We'll now export our network into ONNX to be imported and used in FINN for the next notebooks. Note that the particular ONNX representation used for FINN differs from standard ONNX, you can read more about this [here](https://finn.readthedocs.io/en/latest/internals.html#intermediate-representation-finn-onnx).\n", "\n", - "You can see below how we export a trained network in Brevitas into a FINN-compatible ONNX representation. Note how we create a `QuantTensor` instance with dummy data to tell Brevitas how our inputs look like, which will be used to set the input quantization annotation on the exported model." + "You can see below how we export a trained network in Brevitas into a FINN-compatible ONNX representation (QONNX). QONNX is the format we can export from Brevitas, to feed it into the FINN compiler, we will need to make a conversion to the FINN-ONNX format which is the intermediate representation the compiler works on. 
The conversion of the FINN-ONNX format is a FINN compiler transformation and to be able to apply it to our model, we will need to wrap it into [ModelWrapper](https://finn.readthedocs.io/en/latest/internals.html#modelwrapper). This is a wrapper around the ONNX model which provides several helper functions to make it easier to work with the model. Then we can call the conversion function to obtain the model in FINN-ONNX format." ] }, { @@ -677,10 +681,13 @@ "metadata": {}, "outputs": [], "source": [ - "import brevitas.onnx as bo\n", - "from brevitas.quant_tensor import QuantTensor\n", + "from brevitas.export import export_qonnx\n", + "from qonnx.util.cleanup import cleanup as qonnx_cleanup\n", + "from qonnx.core.modelwrapper import ModelWrapper\n", + "from qonnx.core.datatype import DataType\n", + "from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN\n", "\n", - "ready_model_filename = \"cybsec-mlp-ready.onnx\"\n", + "ready_model_filename = model_dir + \"/cybsec-mlp-ready.onnx\"\n", "input_shape = (1, 600)\n", "\n", "# create a QuantTensor instance to mark input as bipolar during export\n", @@ -688,18 +695,25 @@ "input_a = 2 * input_a - 1\n", "scale = 1.0\n", "input_t = torch.from_numpy(input_a * scale)\n", - "input_qt = QuantTensor(\n", - " input_t, scale=torch.tensor(scale), bit_width=torch.tensor(1.0), signed=True\n", - ")\n", "\n", "#Move to CPU before export\n", "model_for_export.cpu()\n", "\n", "# Export to ONNX\n", - "bo.export_finn_onnx(\n", - " model_for_export, export_path=ready_model_filename, input_t=input_qt\n", + "export_qonnx(\n", + " model_for_export, export_path=ready_model_filename, input_t=input_t\n", ")\n", "\n", + "# clean-up\n", + "qonnx_cleanup(ready_model_filename, out_file=ready_model_filename)\n", + "\n", + "# ModelWrapper\n", + "model = ModelWrapper(ready_model_filename)\n", + "# Setting the input datatype explicitly because it doesn't get derived from the export function\n", + 
"model.set_tensor_datatype(model.graph.input[0].name, DataType[\"BIPOLAR\"])\n", + "model = model.transform(ConvertQONNXtoFINN())\n", + "model.save(ready_model_filename)\n", + "\n", "print(\"Model saved to %s\" % ready_model_filename)" ] }, @@ -755,7 +769,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb b/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb index e4848a1f40..33b64e11c0 100644 --- a/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb +++ b/notebooks/end2end_example/cybersecurity/2-import-into-finn-and-verify.ipynb @@ -62,9 +62,11 @@ "metadata": {}, "outputs": [], "source": [ + "import os\n", "from qonnx.core.modelwrapper import ModelWrapper\n", "\n", - "ready_model_filename = \"cybsec-mlp-ready.onnx\"\n", + "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/end2end_example/cybersecurity\"\n", + "ready_model_filename = model_dir + \"/cybsec-mlp-ready.onnx\"\n", "model_for_sim = ModelWrapper(ready_model_filename)" ] }, @@ -151,7 +153,7 @@ "model_for_sim = model_for_sim.transform(InferDataTypes())\n", "model_for_sim = model_for_sim.transform(RemoveStaticGraphInputs())\n", "\n", - "verif_model_filename = \"cybsec-mlp-verification.onnx\"\n", + "verif_model_filename = model_dir + \"/cybsec-mlp-verification.onnx\"\n", "model_for_sim.save(verif_model_filename)" ] }, @@ -258,7 +260,8 @@ "\n", "# replace this with your trained network checkpoint if you're not\n", "# using the pretrained weights\n", - "trained_state_dict = torch.load(\"state_dict.pth\")[\"models_state_dict\"][0]\n", + "trained_state_dict = torch.load(model_dir + \"/state_dict.pth\")[\"models_state_dict\"][0]\n", + "\n", "# Uncomment the following line if you previously chose to train the network yourself\n", "#trained_state_dict = 
torch.load(\"state_dict_self-trained.pth\")\n", "\n", @@ -365,10 +368,11 @@ "metadata": {}, "outputs": [], "source": [ - "if ok == n_verification_inputs:\n", + "try:\n", + " assert ok == n_verification_inputs\n", " print(\"Verification succeeded. Brevitas and FINN-ONNX execution outputs are identical\")\n", - "else:\n", - " print(\"Verification failed. Brevitas and FINN-ONNX execution outputs are NOT identical\")" + "except AssertionError:\n", + " assert False, \"Verification failed. Brevitas and FINN-ONNX execution outputs are NOT identical\"" ] }, { @@ -395,7 +399,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb index a18cafd604..73cd25cf20 100644 --- a/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb +++ b/notebooks/end2end_example/cybersecurity/3-build-accelerator-with-finn.ipynb @@ -115,7 +115,8 @@ "import os\n", "import shutil\n", "\n", - "model_file = \"cybsec-mlp-ready.onnx\"\n", + "model_dir = os.environ['FINN_ROOT'] + \"/notebooks/end2end_example/cybersecurity\"\n", + "model_file = model_dir + \"/cybsec-mlp-ready.onnx\"\n", "\n", "estimates_output_dir = \"output_estimates_only\"\n", "\n", @@ -148,6 +149,15 @@ "build.build_dataflow_cfg(model_file, cfg_estimates)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert os.path.exists(estimates_output_dir + \"/report/estimate_network_performance.json\")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -255,7 +265,7 @@ "\n", "**Live FINN tutorial:** These next builds will take about 10 minutes to complete since multiple calls to Vivado and a call to RTL simulation are involved. 
While this is running, you can examine the generated files with noVNC -- it is running on **(your AWS URL):6080/vnc.html**\n", "\n", - "* Once the `step_hls_codegen [8/16]` below is completed, you can view the generated HLS code under its own folder for each layer: `/tmp/finn_dev_ubuntu/code_gen_ipgen_MatrixVectorActivation_XXXXXX`\n", + "* Once the `step_hls_codegen [8/16]` below is completed, you can view the generated HLS code under its own folder for each layer: `/tmp/finn_dev_ubuntu/code_gen_ipgen_MVAU_hls_XXXXXX`\n", " \n", "* Once the `step_create_stitched_ip [11/16]` below is completed, you can view the generated stitched IP in Vivado under `/home/ubuntu/finn/notebooks/end2end_example/cybersecurity/output_ipstitch_ooc_rtlsim/stitched_ip`\n", " " @@ -272,7 +282,7 @@ "import os\n", "import shutil\n", "\n", - "model_file = \"cybsec-mlp-ready.onnx\"\n", + "model_file = model_dir + \"/cybsec-mlp-ready.onnx\"\n", "\n", "rtlsim_output_dir = \"output_ipstitch_ooc_rtlsim\"\n", "\n", @@ -305,6 +315,17 @@ "build.build_dataflow_cfg(model_file, cfg_stitched_ip)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert os.path.exists(rtlsim_output_dir + \"/report/ooc_synth_and_timing.json\")\n", + "assert os.path.exists(rtlsim_output_dir + \"/report/rtlsim_performance.json\")\n", + "assert os.path.exists(rtlsim_output_dir + \"/final_hw_config.json\")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -412,7 +433,7 @@ "import os\n", "import shutil\n", "\n", - "model_file = \"cybsec-mlp-ready.onnx\"\n", + "model_file = model_dir + \"/cybsec-mlp-ready.onnx\"\n", "\n", "final_output_dir = \"output_final\"\n", "\n", @@ -638,7 +659,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/notebooks/end2end_example/cybersecurity/dataloader_quantized.py 
b/notebooks/end2end_example/cybersecurity/dataloader_quantized.py index 738811fa72..38505fb6ef 100644 --- a/notebooks/end2end_example/cybersecurity/dataloader_quantized.py +++ b/notebooks/end2end_example/cybersecurity/dataloader_quantized.py @@ -48,7 +48,6 @@ def __init__( onehot=False, train=True, ): - self.dataframe = ( pd.concat([pd.read_csv(file_path_train), pd.read_csv(file_path_test)]) .reset_index() @@ -77,9 +76,7 @@ def __getitem__(self, index): data_val = self.data[index][:-1] return data_val, target - def dec2bin( - self, column: pd.Series, number_of_bits: int, left_msb: bool = True - ) -> pd.Series: + def dec2bin(self, column: pd.Series, number_of_bits: int, left_msb: bool = True) -> pd.Series: """Convert a decimal pd.Series to binary pd.Series with numbers in their # base-2 equivalents. The output is a numpy nd array. @@ -133,6 +130,7 @@ def integer_encoding(self, df): def quantize_df(self, df): """Quantized the input dataframe. The scaling is done by multiplying every column by the inverse of the minimum of that column""" + # gets the smallest positive number of a vector def get_min_positive_number(vector): return vector[vector > 0].min() @@ -178,24 +176,18 @@ def char_split(s): column_data = np.clip( column_data, 0, 4294967295 ) # clip due to overflow of uint32 of matlab code - column_data = self.round_like_matlab_series( - column_data - ) # round like matlab + column_data = self.round_like_matlab_series(column_data) # round like matlab column_data = column_data.astype(np.uint32) # cast like matlab if column == "rate": column_data.update(pd.Series(dict_correct_rate_values)) python_quantized_df[column] = ( - self.dec2bin(column_data, maxbits, left_msb=False) - .reshape((-1, 1)) - .flatten() + self.dec2bin(column_data, maxbits, left_msb=False).reshape((-1, 1)).flatten() ) for column in python_quantized_df.columns: - python_quantized_df[column] = ( - python_quantized_df[column].apply(char_split).values - ) + python_quantized_df[column] = 
python_quantized_df[column].apply(char_split).values python_quantized_df_separated = pd.DataFrame( np.column_stack(python_quantized_df.values.T.tolist()) diff --git a/notebooks/end2end_example/cybersecurity/validate-unsw-nb15.py b/notebooks/end2end_example/cybersecurity/validate-unsw-nb15.py index 0ffb525544..c4570616d2 100644 --- a/notebooks/end2end_example/cybersecurity/validate-unsw-nb15.py +++ b/notebooks/end2end_example/cybersecurity/validate-unsw-nb15.py @@ -57,9 +57,7 @@ def make_unsw_nb15_test_batches(bsize, dataset_root): help='name of bitfile (i.e. "resizer.bit")', default="../bitfile/finn-accel.bit", ) - parser.add_argument( - "--dataset_root", help="dataset root dir for download/reuse", default="." - ) + parser.add_argument("--dataset_root", help="dataset root dir for download/reuse", default=".") # parse arguments args = parser.parse_args() bsize = args.batchsize diff --git a/requirements.txt b/requirements.txt index 83aad07d72..c2973f9432 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,19 +1,21 @@ bitstring==3.1.7 -clize==4.1.1 +clize==5.0.1 dataclasses-json==0.5.7 -docrep==0.2.7 gspread==3.6.0 -numpy==1.22.0 +importlib-resources==6.1.0 +ipython==8.12.2 +numpy==1.24.1 onnx==1.13.0 onnxoptimizer -onnxruntime==1.11.1 -pre-commit==2.9.2 +onnxruntime==1.16.1 +pre-commit==3.3.2 protobuf==3.20.3 psutil==5.9.4 -pyscaffold==3.2.1 -scipy==1.5.2 +pyscaffold==4.4 +scipy==1.10.1 setupext-janitor>=1.1.2 -sigtools==2.0.3 -toposort==1.5 +setuptools==68.2.2 +sigtools==4.0.1 +toposort==1.7.0 vcdvcd==1.0.5 wget==3.2 diff --git a/run-docker.sh b/run-docker.sh index 381be35293..e732492728 100755 --- a/run-docker.sh +++ b/run-docker.sh @@ -47,7 +47,7 @@ if [ -z "$FINN_XILINX_PATH" ];then fi if [ -z "$FINN_XILINX_VERSION" ];then - recho "Please set the FINN_XILINX_VERSION to the version of the Xilinx tools to use (e.g. 2020.1)" + recho "Please set the FINN_XILINX_VERSION to the version of the Xilinx tools to use (e.g. 
2022.2)" recho "FINN functionality depending on Vivado, Vitis or HLS will not be available." fi @@ -86,23 +86,29 @@ SCRIPTPATH=$(dirname "$SCRIPT") : ${ALVEO_BOARD="U250"} : ${ALVEO_TARGET_DIR="/tmp"} : ${PLATFORM_REPO_PATHS="/opt/xilinx/platforms"} -: ${XRT_DEB_VERSION="xrt_202210.2.13.466_18.04-amd64-xrt"} +: ${XRT_DEB_VERSION="xrt_202220.2.14.354_22.04-amd64-xrt"} : ${FINN_HOST_BUILD_DIR="/tmp/$DOCKER_INST_NAME"} : ${FINN_DOCKER_TAG="xilinx/finn:$(git describe --always --tags --dirty).$XRT_DEB_VERSION"} : ${FINN_DOCKER_PREBUILT="0"} : ${FINN_DOCKER_RUN_AS_ROOT="0"} : ${FINN_DOCKER_GPU="$(docker info | grep nvidia | wc -m)"} : ${FINN_DOCKER_EXTRA=""} +: ${FINN_DOCKER_BUILD_EXTRA=""} : ${FINN_SKIP_DEP_REPOS="0"} +: ${FINN_SKIP_BOARD_FILES="0"} : ${OHMYXILINX="${SCRIPTPATH}/deps/oh-my-xilinx"} : ${NVIDIA_VISIBLE_DEVICES=""} : ${DOCKER_BUILDKIT="1"} +: ${FINN_SINGULARITY=""} DOCKER_INTERACTIVE="" +# Catch FINN_DOCKER_EXTRA options being passed in without a trailing space +FINN_DOCKER_EXTRA+=" " + if [ "$1" = "test" ]; then gecho "Running test suite (all tests)" - DOCKER_CMD="python setup.py test" + DOCKER_CMD="pytest" elif [ "$1" = "quicktest" ]; then gecho "Running test suite (non-Vivado, non-slow tests)" DOCKER_CMD="quicktest.sh" @@ -116,8 +122,10 @@ elif [ "$1" = "notebook" ]; then DOCKER_CMD="jupyter notebook --allow-root --no-browser --ip=0.0.0.0 --port $JUPYTER_PORT $JUPYTER_PASSWD_ARG notebooks" FINN_DOCKER_EXTRA+="-e JUPYTER_PORT=$JUPYTER_PORT " FINN_DOCKER_EXTRA+="-e NETRON_PORT=$NETRON_PORT " - FINN_DOCKER_EXTRA+="-p $JUPYTER_PORT:$JUPYTER_PORT " - FINN_DOCKER_EXTRA+="-p $NETRON_PORT:$NETRON_PORT " + if [ -z "$FINN_SINGULARITY" ]; then + FINN_DOCKER_EXTRA+="-p $JUPYTER_PORT:$JUPYTER_PORT " + FINN_DOCKER_EXTRA+="-p $NETRON_PORT:$NETRON_PORT " + fi elif [ "$1" = "build_dataflow" ]; then BUILD_DATAFLOW_DIR=$(readlink -f "$2") FINN_DOCKER_EXTRA+="-v $BUILD_DATAFLOW_DIR:$BUILD_DATAFLOW_DIR " @@ -143,7 +151,7 @@ else fi -if [ "$FINN_DOCKER_GPU" != 0 ];then +if [ 
"$FINN_DOCKER_GPU" != 0 ] && [ -z "$FINN_SINGULARITY" ];then gecho "nvidia-docker detected, enabling GPUs" if [ ! -z "$NVIDIA_VISIBLE_DEVICES" ];then FINN_DOCKER_EXTRA+="--runtime nvidia -e NVIDIA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES " @@ -174,19 +182,18 @@ if [ "$FINN_SKIP_DEP_REPOS" = "0" ]; then fi # Build the FINN Docker image -if [ "$FINN_DOCKER_PREBUILT" = "0" ]; then +if [ "$FINN_DOCKER_PREBUILT" = "0" ] && [ -z "$FINN_SINGULARITY" ]; then # Need to ensure this is done within the finn/ root folder: OLD_PWD=$(pwd) cd $SCRIPTPATH - docker build -f docker/Dockerfile.finn --build-arg XRT_DEB_VERSION=$XRT_DEB_VERSION --tag=$FINN_DOCKER_TAG . + docker build -f docker/Dockerfile.finn --build-arg XRT_DEB_VERSION=$XRT_DEB_VERSION --tag=$FINN_DOCKER_TAG $FINN_DOCKER_BUILD_EXTRA . cd $OLD_PWD fi # Launch container with current directory mounted # important to pass the --init flag here for correct Vivado operation, see: # https://stackoverflow.com/questions/55733058/vivado-synthesis-hangs-in-docker-container-spawned-by-jenkins -DOCKER_EXEC="docker run -t --rm $DOCKER_INTERACTIVE --tty --init " -DOCKER_EXEC+="--hostname $DOCKER_INST_NAME " -DOCKER_EXEC+="-e SHELL=/bin/bash " +DOCKER_BASE="docker run -t --rm $DOCKER_INTERACTIVE --tty --init --hostname $DOCKER_INST_NAME " +DOCKER_EXEC="-e SHELL=/bin/bash " DOCKER_EXEC+="-w $SCRIPTPATH " DOCKER_EXEC+="-v $SCRIPTPATH:$SCRIPTPATH " DOCKER_EXEC+="-v $FINN_HOST_BUILD_DIR:$FINN_HOST_BUILD_DIR " @@ -201,7 +208,10 @@ DOCKER_EXEC+="-e PYNQ_PASSWORD=$PYNQ_PASSWORD " DOCKER_EXEC+="-e PYNQ_TARGET_DIR=$PYNQ_TARGET_DIR " DOCKER_EXEC+="-e OHMYXILINX=$OHMYXILINX " DOCKER_EXEC+="-e NUM_DEFAULT_WORKERS=$NUM_DEFAULT_WORKERS " -if [ "$FINN_DOCKER_RUN_AS_ROOT" = "0" ];then +# Workaround for FlexLM issue, see: +# https://community.flexera.com/t5/InstallAnywhere-Forum/Issues-when-running-Xilinx-tools-or-Other-vendor-tools-in-docker/m-p/245820#M10647 +DOCKER_EXEC+="-e LD_PRELOAD=/lib/x86_64-linux-gnu/libudev.so.1 " +if [ 
"$FINN_DOCKER_RUN_AS_ROOT" = "0" ] && [ -z "$FINN_SINGULARITY" ];then DOCKER_EXEC+="-v /etc/group:/etc/group:ro " DOCKER_EXEC+="-v /etc/passwd:/etc/passwd:ro " DOCKER_EXEC+="-v /etc/shadow:/etc/shadow:ro " @@ -241,6 +251,17 @@ if [ ! -z "$FINN_XILINX_PATH" ];then fi fi DOCKER_EXEC+="$FINN_DOCKER_EXTRA " -DOCKER_EXEC+="$FINN_DOCKER_TAG $DOCKER_CMD" -$DOCKER_EXEC +if [ -z "$FINN_SINGULARITY" ];then + CMD_TO_RUN="$DOCKER_BASE $DOCKER_EXEC $FINN_DOCKER_TAG $DOCKER_CMD" +else + SINGULARITY_BASE="singularity exec" + # Replace command options for Singularity + SINGULARITY_EXEC="${DOCKER_EXEC//"-e "/"--env "}" + SINGULARITY_EXEC="${SINGULARITY_EXEC//"-v "/"-B "}" + SINGULARITY_EXEC="${SINGULARITY_EXEC//"-w "/"--pwd "}" + CMD_TO_RUN="$SINGULARITY_BASE $SINGULARITY_EXEC $FINN_SINGULARITY /usr/local/bin/finn_entrypoint.sh $DOCKER_CMD" + gecho "FINN_SINGULARITY is set, launching Singularity container instead of Docker" +fi + +$CMD_TO_RUN diff --git a/setup.cfg b/setup.cfg index 1893aa4231..4834011dea 100644 --- a/setup.cfg +++ b/setup.cfg @@ -34,12 +34,12 @@ name = finn description = A Framework for Fast, Scalable Quantized Neural Network Inference author = Yaman Umuroglu -author-email = yamanu@xilinx.com +author_email = yamanu@xilinx.com license = new-bsd -long-description = file: README.md -long-description-content-type = text/markdown +long_description = file: README.md +long_description_content_type = text/markdown url = https://xilinx.github.io/finn/ -project-urls = +project_urls = Documentation = https://finn.readthedocs.io/ # Change if running only on Windows, Mac or Linux (comma-separated) platforms = any @@ -56,8 +56,6 @@ packages = find_namespace: include_package_data = True package_dir = =src -# DON'T CHANGE THE FOLLOWING LINE! IT WILL BE UPDATED BY PYSCAFFOLD! -setup_requires = pyscaffold>=3.2a0,<3.3a0 # The usage of test_requires is discouraged, see `Dependency Management` docs # tests_require = pytest; pytest-cov # Require a specific Python version, e.g. 
Python 2.7 or >= 3.4 @@ -81,6 +79,8 @@ docs = pytest netron vcdvcd + sphinx==5.0.2 + sphinx_rtd_theme==0.5.0 torchvision torch qonnx@git+https://github.com/fastmachinelearning/qonnx@main#egg=qonnx @@ -127,6 +127,12 @@ markers = transform: mark tests that test transformations (before hls layers) fpgadataflow: mark tests related to hls layers end2end: mark tests that run the end2end flow + notebooks: mark tests that execute all Jupyter notebooks + sanity_bnn: mark tests that execute the sanity BNN test + bnn_u250: mark tests that execute U250 BNN tests + bnn_kv260: mark tests that execute KV260 BNN tests + bnn_pynq: mark tests that execute Pynq-Z1 BNN tests + bnn_zcu104: mark tests that execute ZCU104 BNN tests norecursedirs = dist build diff --git a/setup.py b/setup.py index 8fd781462c..9a06632af1 100644 --- a/setup.py +++ b/setup.py @@ -35,17 +35,7 @@ PyScaffold helps you to put up the scaffold of your new Python project. Learn more under: https://pyscaffold.org/ """ -from pkg_resources import VersionConflict, require from setuptools import setup -import sys - -try: - require("setuptools>=38.3") -except VersionConflict: - print("Error: version of setuptools is too old (<38.3)!") - sys.exit(1) - - if __name__ == "__main__": - setup(use_pyscaffold=True) + setup() diff --git a/src/finn/analysis/fpgadataflow/dataflow_performance.py b/src/finn/analysis/fpgadataflow/dataflow_performance.py index 5726702666..a4bf40760e 100644 --- a/src/finn/analysis/fpgadataflow/dataflow_performance.py +++ b/src/finn/analysis/fpgadataflow/dataflow_performance.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -28,7 +29,7 @@ from qonnx.custom_op.registry import getCustomOp -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node def dataflow_performance(model): @@ -38,7 +39,7 @@ def dataflow_performance(model): for each node along the critical path. Preconditions: - - model consists of fpgadataflow nodes + - model consists of HLS/RTL nodes - model has cycle estimates annotated (see AnnotateCycles transformation) - nodes have unique names (see GiveUniqueNodeNames) @@ -52,7 +53,7 @@ def dataflow_performance(model): max_node_name = "" for node in model.graph.node: - if is_fpgadataflow_node(node) is True: + if is_hls_node(node) or is_rtl_node(node): inst = getCustomOp(node) node_cycles = int(inst.get_nodeattr("cycles_estimate")) if node_cycles > max_cycles: @@ -66,9 +67,7 @@ def dataflow_performance(model): max_pred_latency = 0 else: # find max of any of predecessors - pred_latencies = map( - lambda x: latency_at_node_output[x.name], predecessors - ) + pred_latencies = map(lambda x: latency_at_node_output[x.name], predecessors) max_pred_latency = max(pred_latencies) latency_at_node_output[node.name] = node_cycles + max_pred_latency critical_path_cycles = max(latency_at_node_output.values()) diff --git a/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py b/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py index e1517ec636..50585720fe 100644 --- a/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py +++ b/src/finn/analysis/fpgadataflow/exp_cycles_per_layer.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -28,7 +29,7 @@ import qonnx.custom_op.registry as registry -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node def exp_cycles_per_layer(model): @@ -41,7 +42,7 @@ def exp_cycles_per_layer(model): cycle_dict = {} for node in model.graph.node: - if is_fpgadataflow_node(node) is True: + if is_hls_node(node) or is_rtl_node(node): inst = registry.getCustomOp(node) cycle_dict[node.name] = int(inst.get_exp_cycles()) diff --git a/src/finn/analysis/fpgadataflow/floorplan_params.py b/src/finn/analysis/fpgadataflow/floorplan_params.py index d57b660bce..be03966fb9 100644 --- a/src/finn/analysis/fpgadataflow/floorplan_params.py +++ b/src/finn/analysis/fpgadataflow/floorplan_params.py @@ -1,4 +1,5 @@ # Copyright (c) 2020, Xilinx +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -45,7 +46,7 @@ def floorplan_params(model): } } for node in model.graph.node: - if is_fpgadataflow_node(node) is True: + if is_fpgadataflow_node(node): node_inst = getCustomOp(node) node_slr = node_inst.get_nodeattr("slr") node_pid = node_inst.get_nodeattr("partition_id") diff --git a/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py b/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py index 4d921438f6..330494315a 100644 --- a/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py +++ b/src/finn/analysis/fpgadataflow/hls_synth_res_estimation.py @@ -30,11 +30,12 @@ import warnings import xml.etree.ElementTree as ET -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node def hls_synth_res_estimation(model): - """Extracts the FPGA resource results from the Vivado HLS synthesis estimates. + """Extracts the FPGA resource results from the Vitis HLS synthesis estimates. 
+ Note that this analysis pass only works on nodes that have an HLS backend. Ensure that all nodes have unique names (by calling the GiveUniqueNodeNames transformation) prior to calling this analysis pass to ensure all nodes are visible in the results. @@ -43,7 +44,7 @@ def hls_synth_res_estimation(model): res_dict = {} for node in model.graph.node: - if is_fpgadataflow_node(node) is True: + if is_hls_node(node): # init values to zero res_dict[node.name] = dict() res_dict[node.name]["BRAM_18K"] = 0 diff --git a/src/finn/analysis/fpgadataflow/post_synth_res.py b/src/finn/analysis/fpgadataflow/post_synth_res.py index 8b9c5d2a04..7b65b60fa7 100644 --- a/src/finn/analysis/fpgadataflow/post_synth_res.py +++ b/src/finn/analysis/fpgadataflow/post_synth_res.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -31,7 +32,7 @@ from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp -from finn.transformation.move_reshape import _is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node def post_synth_res(model, override_synth_report_filename=None): @@ -85,8 +86,8 @@ def get_instance_stats(inst_name): row = root.findall(".//*[@contents='%s']/.." 
% inst_name) if row != []: node_dict = {} - row = row[0].getchildren() - for (restype, ind) in restype_to_ind.items(): + row = list(row[0]) + for restype, ind in restype_to_ind.items(): node_dict[restype] = int(row[ind].attrib["contents"]) return node_dict else: @@ -102,7 +103,7 @@ def get_instance_stats(inst_name): sdp_model = ModelWrapper(getCustomOp(node).get_nodeattr("model")) sdp_res_dict = post_synth_res(sdp_model, synth_report_filename) res_dict.update(sdp_res_dict) - elif _is_fpgadataflow_node(node): + elif is_hls_node(node) or is_rtl_node(node): node_dict = get_instance_stats(node.name) if node_dict is not None: res_dict[node.name] = node_dict diff --git a/src/finn/analysis/fpgadataflow/res_estimation.py b/src/finn/analysis/fpgadataflow/res_estimation.py index 406496bc0e..a6be1f1f53 100644 --- a/src/finn/analysis/fpgadataflow/res_estimation.py +++ b/src/finn/analysis/fpgadataflow/res_estimation.py @@ -28,7 +28,7 @@ import qonnx.custom_op.registry as registry -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node def res_estimation(model): @@ -41,7 +41,7 @@ def res_estimation(model): res_dict = {} for node in model.graph.node: - if is_fpgadataflow_node(node) is True: + if is_hls_node(node) or is_rtl_node(node): inst = registry.getCustomOp(node) res_dict[node.name] = inst.node_res_estimation() @@ -59,13 +59,10 @@ def res_estimation_complete(model): res_dict = {} for node in model.graph.node: - if is_fpgadataflow_node(node) is True: - op_type = node.op_type + if is_hls_node(node) or is_rtl_node(node): inst = registry.getCustomOp(node) - if ( - op_type == "MatrixVectorActivation" - or op_type == "VectorVectorActivation" - ): + op_type = node.op_type + if op_type.startswith("MVAU") or op_type.startswith("VVAU"): orig_restype = inst.get_nodeattr("resType") res_dict[node.name] = [] inst.set_nodeattr("resType", "dsp") @@ -73,7 +70,7 @@ def res_estimation_complete(model): inst.set_nodeattr("resType", 
"lut") res_dict[node.name].append(inst.node_res_estimation()) inst.set_nodeattr("resType", orig_restype) - elif op_type == "ConvolutionInputGenerator": + elif op_type.startswith("ConvolutionInputGenerator"): orig_ramstyle = inst.get_nodeattr("ram_style") res_dict[node.name] = [] inst.set_nodeattr("ram_style", "block") diff --git a/src/finn/builder/build_dataflow.py b/src/finn/builder/build_dataflow.py index d6864994a7..284cd2baa3 100644 --- a/src/finn/builder/build_dataflow.py +++ b/src/finn/builder/build_dataflow.py @@ -91,12 +91,8 @@ def resolve_build_steps(cfg: DataflowBuildConfig, partial: bool = True): return steps_as_fxns -def resolve_step_filename( - step_name: str, cfg: DataflowBuildConfig, step_delta: int = 0 -): - step_names = list( - map(lambda x: x.__name__, resolve_build_steps(cfg, partial=False)) - ) +def resolve_step_filename(step_name: str, cfg: DataflowBuildConfig, step_delta: int = 0): + step_names = list(map(lambda x: x.__name__, resolve_build_steps(cfg, partial=False))) assert step_name in step_names, "start_step %s not found" + step_name step_no = step_names.index(step_name) + step_delta assert step_no >= 0, "Invalid step+delta combination" @@ -150,19 +146,13 @@ def build_dataflow_cfg(model_filename, cfg: DataflowBuildConfig): for transform_step in build_dataflow_steps: try: step_name = transform_step.__name__ - print( - "Running step: %s [%d/%d]" - % (step_name, step_num, len(build_dataflow_steps)) - ) + print("Running step: %s [%d/%d]" % (step_name, step_num, len(build_dataflow_steps))) # redirect output to logfile if not cfg.verbose: sys.stdout = stdout_logger sys.stderr = stderr_logger # also log current step name to logfile - print( - "Running step: %s [%d/%d]" - % (step_name, step_num, len(build_dataflow_steps)) - ) + print("Running step: %s [%d/%d]" % (step_name, step_num, len(build_dataflow_steps))) # run the step step_start = time.time() model = transform_step(model, cfg) diff --git a/src/finn/builder/build_dataflow_config.py 
b/src/finn/builder/build_dataflow_config.py index a38cb6e572..e35c1cd346 100644 --- a/src/finn/builder/build_dataflow_config.py +++ b/src/finn/builder/build_dataflow_config.py @@ -1,4 +1,5 @@ # Copyright (c) 2020 Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -64,15 +65,6 @@ class DataflowOutputType(str, Enum): DEPLOYMENT_PACKAGE = "deployment_package" -class ComputeEngineMemMode(str, Enum): - """Memory mode for generated compute engines. See - https://finn.readthedocs.io/en/latest/internals.html#matrixvectoractivation-mem-mode - for more information.""" - - CONST = "const" - DECOUPLED = "decoupled" - - class VitisOptStrategyCfg(str, Enum): """Vitis optimization strategy with serializable string enum values.""" @@ -115,13 +107,15 @@ class VerificationStepType(str, Enum): "step_qonnx_to_finn", "step_tidy_up", "step_streamline", - "step_convert_to_hls", + "step_convert_to_hw", "step_create_dataflow_partition", + "step_specialize_layers", "step_target_fps_parallelization", "step_apply_folding_config", + "step_minimize_bit_width", "step_generate_estimate_reports", - "step_hls_codegen", - "step_hls_ipgen", + "step_hw_codegen", + "step_hw_ipgen", "step_set_fifo_depths", "step_create_stitched_ip", "step_measure_rtlsim_performance", @@ -136,16 +130,18 @@ class VerificationStepType(str, Enum): "step_qonnx_to_finn", "step_tidy_up", "step_streamline", - "step_convert_to_hls", + "step_convert_to_hw", "step_create_dataflow_partition", + "step_specialize_layers", "step_target_fps_parallelization", "step_apply_folding_config", + "step_minimize_bit_width", "step_generate_estimate_reports", ] -#: List of steps to run for a dataflow build including HLS code generation, but +#: List of steps to run for a dataflow build including HW code generation, but #: without any synthesis. 
-hls_codegen_dataflow_steps = estimate_only_dataflow_steps + ["step_hls_codegen"] +hw_codegen_dataflow_steps = estimate_only_dataflow_steps + ["step_hw_codegen"] @dataclass_json @@ -168,6 +164,14 @@ class DataflowBuildConfig: #: DataflowOutputType for available options. generate_outputs: List[DataflowOutputType] + #: (Optional) Path to configuration JSON file in which user can specify + #: a preferred implementation style (HLS or RTL) for each node. + #: The SpecializeLayers transformation picks up these settings and if possible + #: fulfills the desired implementation style for each layer by converting the + #: node into its HLS or RTL variant. + #: Will be applied with :py:mod:`qonnx.transformation.general.ApplyConfig` + specialize_layers_config_file: Optional[str] = None + #: (Optional) Path to configuration JSON file. May include parallelization, #: FIFO sizes, RAM and implementation style attributes and so on. #: If the parallelization attributes (PE, SIMD) are part of the config, @@ -228,11 +232,17 @@ class DataflowBuildConfig: mvau_wwidth_max: Optional[int] = 36 #: (Optional) Whether thresholding layers (which implement quantized - #: activations in FINN) will be implemented as stand-alone HLS layers, + #: activations in FINN) will be implemented as stand-alone HW layers, #: instead of being part of MatrixVectorActivation layer. This gives larger #: flexibility, and makes it possible to have runtime-writable thresholds. standalone_thresholds: Optional[bool] = False + #: (Optional) Whether optimizations that minimize the bit width of the + #: weights and accumulator will be applied. Because this optimization relies + #: on the the values of the weights, it will only be applied if runtime- + #: writeable weights is not enabled. + minimize_bit_width: Optional[bool] = True + #: Target board, only needed for generating full bitfiles where the FINN #: design is integrated into a shell. #: e.g. 
"Pynq-Z1" or "U250" @@ -259,9 +269,7 @@ class DataflowBuildConfig: #: When `auto_fifo_depths = True`, select which method will be used for #: setting the FIFO sizes. - auto_fifo_strategy: Optional[ - AutoFIFOSizingMethod - ] = AutoFIFOSizingMethod.LARGEFIFO_RTLSIM + auto_fifo_strategy: Optional[AutoFIFOSizingMethod] = AutoFIFOSizingMethod.LARGEFIFO_RTLSIM #: Avoid using C++ rtlsim for auto FIFO sizing and rtlsim throughput test #: if set to True, always using Python instead @@ -271,17 +279,14 @@ class DataflowBuildConfig: #: Only relevant when `auto_fifo_depths = True` large_fifo_mem_style: Optional[LargeFIFOMemStyle] = LargeFIFOMemStyle.AUTO - #: Target clock frequency (in nanoseconds) for Vivado HLS synthesis. + #: Target clock frequency (in nanoseconds) for Vitis HLS synthesis. #: e.g. `hls_clk_period_ns=5.0` will target a 200 MHz clock. #: If not specified it will default to synth_clk_period_ns hls_clk_period_ns: Optional[float] = None - #: Which memory mode will be used for compute layers - default_mem_mode: Optional[ComputeEngineMemMode] = ComputeEngineMemMode.DECOUPLED - - #: Force inference of RTL ConvolutionInputGenerator over HLS implementation - #: If set to False, falls back to the default behavior of InferConvInpGen() - force_rtl_conv_inp_gen: Optional[bool] = False + #: Call CapConvolutionFIFODepths in InsertAndSetFIFODepths transform + #: to make convolution FIFOs smaller where appropriate + default_swg_exception: Optional[bool] = False #: Which Vitis platform will be used. #: Only relevant when `shell_flow_type = ShellFlowType.VITIS_ALVEO` @@ -341,8 +346,8 @@ class DataflowBuildConfig: #: Override the number of inputs for rtlsim performance measurement. rtlsim_batch_size: Optional[int] = 1 - #: If set to True, FIFOs and DWCs with impl_style=vivado will be kept during - #: rtlsim, otherwise they will be replaced by HLS implementations. 
+ #: If set to True, FIFOs with impl_style=vivado will be kept during + #: rtlsim, otherwise they will be replaced by RTL implementations. rtlsim_use_vivado_comps: Optional[bool] = True def _resolve_hls_clk_period(self): @@ -358,9 +363,7 @@ def _resolve_driver_platform(self): elif self.shell_flow_type == ShellFlowType.VITIS_ALVEO: return "alveo" else: - raise Exception( - "Couldn't resolve driver platform for " + str(self.shell_flow_type) - ) + raise Exception("Couldn't resolve driver platform for " + str(self.shell_flow_type)) def _resolve_fpga_part(self): if self.fpga_part is None: @@ -402,8 +405,7 @@ def _resolve_vitis_platform(self): return alveo_default_platform[self.board] else: raise Exception( - "Could not resolve Vitis platform:" - " need either board or vitis_platform specified" + "Could not resolve Vitis platform:" " need either board or vitis_platform specified" ) def _resolve_verification_steps(self): @@ -421,8 +423,7 @@ def _resolve_verification_io_pair(self): ) verify_input_npy = np.load(self.verify_input_npy) assert os.path.isfile(self.verify_expected_output_npy), ( - "verify_expected_output_npy not found: " - + self.verify_expected_output_npy + "verify_expected_output_npy not found: " + self.verify_expected_output_npy ) verify_expected_output_npy = np.load(self.verify_expected_output_npy) return ( diff --git a/src/finn/builder/build_dataflow_steps.py b/src/finn/builder/build_dataflow_steps.py index 2ee898bc7d..443d2df54c 100644 --- a/src/finn/builder/build_dataflow_steps.py +++ b/src/finn/builder/build_dataflow_steps.py @@ -1,4 +1,5 @@ # Copyright (c) 2020 Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -52,7 +53,7 @@ from qonnx.util.config import extract_model_config_to_json from shutil import copy -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw import finn.transformation.streamline.absorb as absorb from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer @@ -89,6 +90,12 @@ from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver from finn.transformation.fpgadataflow.make_zynq_proj import ZynqBuild +from finn.transformation.fpgadataflow.minimize_accumulator_width import ( + MinimizeAccumulatorWidth, +) +from finn.transformation.fpgadataflow.minimize_weight_bit_width import ( + MinimizeWeightBitWidth, +) from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim @@ -102,6 +109,7 @@ SplitLargeFIFOs, ) from finn.transformation.fpgadataflow.set_folding import SetFolding +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.transformation.fpgadataflow.synth_ooc import SynthOutOfContext from finn.transformation.fpgadataflow.vitis_build import VitisBuild from finn.transformation.move_reshape import RemoveCNVtoFCFlatten @@ -139,9 +147,7 @@ def verify_step( in_npy = np.expand_dims(in_npy_all[b], axis=0) exp_out_npy = np.expand_dims(exp_out_npy_all[b], axis=0) if need_parent: - assert ( - cfg.save_intermediate_models - ), "Enable save_intermediate_models for verification" + assert cfg.save_intermediate_models, "Enable save_intermediate_models for verification" parent_model_fn = intermediate_models_dir + "/dataflow_parent.onnx" 
child_model_fn = intermediate_models_dir + "/verify_%s.onnx" % step_name model.save(child_model_fn) @@ -155,9 +161,7 @@ def verify_step( ) print("Attempting to force model shape on verification input") in_npy = in_npy.reshape(exp_ishape) - out_dict = execute_parent( - parent_model_fn, child_model_fn, in_npy, return_full_ctx=True - ) + out_dict = execute_parent(parent_model_fn, child_model_fn, in_npy, return_full_ctx=True) out_npy = out_dict[out_tensor_name] else: inp_tensor_name = model.graph.input[0].name @@ -214,25 +218,15 @@ def verify_step( def prepare_for_stitched_ip_rtlsim(verify_model, cfg): if not cfg.rtlsim_use_vivado_comps: need_restitch = False - # switch impl_style=vivado components to rtl/hls + # switch impl_style=vivado components to rtl # StreamingFIFO must have impl_style=rtl - for fifo_layer in verify_model.get_nodes_by_op_type("StreamingFIFO"): + for fifo_layer in verify_model.get_nodes_by_op_type("StreamingFIFO_rtl"): inst = getCustomOp(fifo_layer) if inst.get_nodeattr("impl_style") != "rtl": inst.set_nodeattr("impl_style", "rtl") inst.set_nodeattr("code_gen_dir_ipgen", "") inst.set_nodeattr("ipgen_path", "") need_restitch = True - # StreamingDataWidthConverter must have impl_style=hls - for dwc_layer in verify_model.get_nodes_by_op_type( - "StreamingDataWidthConverter_Batch" - ): - inst = getCustomOp(dwc_layer) - if inst.get_nodeattr("impl_style") != "hls": - inst.set_nodeattr("impl_style", "hls") - inst.set_nodeattr("code_gen_dir_ipgen", "") - inst.set_nodeattr("ipgen_path", "") - need_restitch = True # if we've made alterations to the model, need to do some re-prep if need_restitch: print("Need to regen/re-stitch some IP for STITCHED_IP_RTLSIM") @@ -336,48 +330,46 @@ def step_streamline(model: ModelWrapper, cfg: DataflowBuildConfig): return model -def step_convert_to_hls(model: ModelWrapper, cfg: DataflowBuildConfig): - """Convert eligible nodes to `HLSCustomOp` subclasses that represent HLS - layers. 
Which nodes and particular configurations can be converted to HLS - is limited, see the source code of the `convert_to_hls` module for more.""" +def step_convert_to_hw(model: ModelWrapper, cfg: DataflowBuildConfig): + """Convert eligible nodes to `HWCustomOp` subclasses that represent HW + layers. Which nodes and particular configurations can be converted to HW + is limited, see the source code of the `convert_to_hw` module for more. + In the end am empty json file is created which can be used to set user specific + preferred implementation styles for each node.""" - mem_mode = cfg.default_mem_mode.value if cfg.standalone_thresholds: # doing this first causes all threshold layers to be standalone - model = model.transform(to_hls.InferThresholdingLayer()) + model = model.transform(to_hw.InferThresholdingLayer()) # needed for bipolar MatMul layers - model = model.transform(to_hls.InferBinaryMatrixVectorActivation(mem_mode)) + model = model.transform(to_hw.InferBinaryMatrixVectorActivation()) # needed for non-bipolar MatMul layers - model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode)) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) # TopK to LabelSelect - model = model.transform(to_hls.InferLabelSelectLayer()) + model = model.transform(to_hw.InferLabelSelectLayer()) # input quantization (if any) as standalone threshold - model = model.transform(to_hls.InferThresholdingLayer()) + model = model.transform(to_hw.InferThresholdingLayer()) # needed for convolutions -- TODO always exec? 
need_conv = len(model.get_nodes_by_op_type("Im2Col")) > 0 if need_conv: - if cfg.force_rtl_conv_inp_gen: - model = model.transform(to_hls.InferConvInpGen(use_rtl_variant=True)) - else: - model = model.transform(to_hls.InferConvInpGen()) - model = model.transform(to_hls.InferStreamingMaxPool()) + model = model.transform(to_hw.InferConvInpGen()) + model = model.transform(to_hw.InferStreamingMaxPool()) model = model.transform(RemoveCNVtoFCFlatten()) # get rid of Tranpose -> Tranpose identity seq model = model.transform(absorb.AbsorbConsecutiveTransposes()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(InferDataLayouts()) + return model def step_create_dataflow_partition(model: ModelWrapper, cfg: DataflowBuildConfig): - """Separate consecutive groups of HLSCustomOp nodes into StreamingDataflowPartition + """Separate consecutive groups of HWCustomOp nodes into StreamingDataflowPartition nodes, which point to a separate ONNX file. Dataflow accelerator synthesis - can only be performed on those HLSCustomOp sub-graphs.""" + can only be performed on those HWCustomOp sub-graphs.""" parent_model = model.transform( CreateDataflowPartition( - partition_model_dir=cfg.output_dir - + "/intermediate_models/supported_op_partitions" + partition_model_dir=cfg.output_dir + "/intermediate_models/supported_op_partitions" ) ) sdp_nodes = parent_model.get_nodes_by_op_type("StreamingDataflowPartition") @@ -388,6 +380,31 @@ def step_create_dataflow_partition(model: ModelWrapper, cfg: DataflowBuildConfig if cfg.save_intermediate_models: parent_model.save(cfg.output_dir + "/intermediate_models/dataflow_parent.onnx") model = ModelWrapper(dataflow_model_filename) + + # create a configuration json file that can be used to set the specialize layer config + attrs = [ + "preferred_impl_style", + ] + extract_model_config_to_json( + model, cfg.output_dir + "/template_specialize_layers_config.json", attrs + ) + + return model + + +def step_specialize_layers(model: ModelWrapper, 
cfg: DataflowBuildConfig): + """Convert HW nodes to either an HLS or RTL variant of the node. HW nodes + get converted either based on pre-determined rules (details can be found + in `specialize_layers` source code) or the user provides a configuration file + which contains the desired setting. If the user preference cannot be fulfilled, + a warning will be printed and the implementation style will be set to a default.""" + + if cfg.specialize_layers_config_file is not None: + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(ApplyConfig(cfg.specialize_layers_config_file)) + model = model.transform(SpecializeLayers(cfg._resolve_fpga_part())) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) return model @@ -410,14 +427,15 @@ def step_target_fps_parallelization(model: ModelWrapper, cfg: DataflowBuildConfi hw_attrs = [ "PE", "SIMD", + "parallel_window", "ram_style", "resType", "mem_mode", "runtime_writeable_weights", + "depth_trigger_uram", + "depth_trigger_bram", ] - extract_model_config_to_json( - model, cfg.output_dir + "/auto_folding_config.json", hw_attrs - ) + extract_model_config_to_json(model, cfg.output_dir + "/auto_folding_config.json", hw_attrs) return model @@ -452,9 +470,7 @@ def step_generate_estimate_reports(model: ModelWrapper, cfg: DataflowBuildConfig with open(report_dir + "/estimate_layer_cycles.json", "w") as f: json.dump(estimate_layer_cycles, f, indent=2) estimate_layer_resources = model.analysis(res_estimation) - estimate_layer_resources["total"] = aggregate_dict_keys( - estimate_layer_resources - ) + estimate_layer_resources["total"] = aggregate_dict_keys(estimate_layer_resources) with open(report_dir + "/estimate_layer_resources.json", "w") as f: json.dump(estimate_layer_resources, f, indent=2) estimate_layer_resources_complete = model.analysis(res_estimation_complete) @@ -468,8 +484,7 @@ def step_generate_estimate_reports(model: ModelWrapper, cfg: DataflowBuildConfig est_fps = 
n_clock_cycles_per_sec / estimate_network_performance["max_cycles"] estimate_network_performance["estimated_throughput_fps"] = est_fps est_latency_ns = ( - estimate_network_performance["critical_path_cycles"] - * cfg.synth_clk_period_ns + estimate_network_performance["critical_path_cycles"] * cfg.synth_clk_period_ns ) estimate_network_performance["estimated_latency_ns"] = est_latency_ns with open(report_dir + "/estimate_network_performance.json", "w") as f: @@ -477,18 +492,27 @@ def step_generate_estimate_reports(model: ModelWrapper, cfg: DataflowBuildConfig return model -def step_hls_codegen(model: ModelWrapper, cfg: DataflowBuildConfig): - "Generate Vivado HLS code to prepare HLSCustomOp nodes for IP generation." +def step_minimize_bit_width(model: ModelWrapper, cfg: DataflowBuildConfig): + """Tighten the weight and accumulator bit widths for each layer.""" + if cfg.minimize_bit_width: + model = model.transform(MinimizeWeightBitWidth()) + model = model.transform(MinimizeAccumulatorWidth()) + # make sure the changed datatypes are propagated through the network + model = model.transform(InferDataTypes()) + return model - model = model.transform( - PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()) - ) + +def step_hw_codegen(model: ModelWrapper, cfg: DataflowBuildConfig): + """Generate Vitis HLS code to prepare HLSBackend nodes for IP generation. + And fills RTL templates for RTLBackend nodes.""" + + model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())) return model -def step_hls_ipgen(model: ModelWrapper, cfg: DataflowBuildConfig): - """Run Vivado HLS synthesis on generated code for HLSCustomOp nodes, - in order to generate IP blocks.""" +def step_hw_ipgen(model: ModelWrapper, cfg: DataflowBuildConfig): + """Run Vitis HLS synthesis on generated code for HLSBackend nodes, + in order to generate IP blocks. 
For RTL nodes this step does not do anything.""" model = model.transform(HLSSynthIP()) model = model.transform(ReplaceVerilogRelPaths()) @@ -516,6 +540,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): if cfg.auto_fifo_depths: if cfg.auto_fifo_strategy == "characterize": model = model.transform(InsertDWC()) + model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) model = model.transform( PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()) @@ -533,6 +558,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): create_shallow_fifos=True, ) ) + model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) elif cfg.auto_fifo_strategy == "largefifo_rtlsim": @@ -548,6 +574,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): InsertAndSetFIFODepths( cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period(), + swg_exception=cfg.default_swg_exception, vivado_ram_style=cfg.large_fifo_mem_style, force_python_sim=force_python_sim, ) @@ -563,6 +590,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): # need to make sure all FIFOs are created so that their depth can be # set by ApplyConfig, so create_shallow_fifos=True model = model.transform(InsertFIFO(create_shallow_fifos=True)) + model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) if cfg.folding_config_file is not None: @@ -572,6 +600,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): hw_attrs = [ "PE", "SIMD", + "parallel_window", "ram_style", "depth", "impl_style", @@ -580,10 +609,10 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): "runtime_writeable_weights", "inFIFODepths", "outFIFODepths", + "depth_trigger_uram", + "depth_trigger_bram", ] - 
extract_model_config_to_json( - model, cfg.output_dir + "/final_hw_config.json", hw_attrs - ) + extract_model_config_to_json(model, cfg.output_dir + "/final_hw_config.json", hw_attrs) # perform FIFO splitting and shallow FIFO removal only after the final config # json file has been written. otherwise, since these transforms may add/remove @@ -594,9 +623,7 @@ def step_set_fifo_depths(model: ModelWrapper, cfg: DataflowBuildConfig): # after FIFOs are ready to go, call PrepareIP and HLSSynthIP again # this will only run for the new nodes (e.g. FIFOs and DWCs) - model = model.transform( - PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period()) - ) + model = model.transform(PrepareIP(cfg._resolve_fpga_part(), cfg._resolve_hls_clk_period())) model = model.transform(HLSSynthIP()) return model @@ -633,9 +660,7 @@ def step_create_stitched_ip(model: ModelWrapper, cfg: DataflowBuildConfig): if cfg.verify_save_rtlsim_waveforms: report_dir = cfg.output_dir + "/report" os.makedirs(report_dir, exist_ok=True) - verify_model.set_metadata_prop( - "rtlsim_trace", "%s/verify_rtlsim.vcd" % (report_dir) - ) + verify_model.set_metadata_prop("rtlsim_trace", "%s/verify_rtlsim.vcd" % (report_dir)) verify_step(verify_model, cfg, "stitched_ip_rtlsim", need_parent=True) os.environ["LIVENESS_THRESHOLD"] = str(prev_liveness) return model @@ -656,9 +681,7 @@ def step_measure_rtlsim_performance(model: ModelWrapper, cfg: DataflowBuildConfi rtlsim_model = deepcopy(model) rtlsim_model = prepare_for_stitched_ip_rtlsim(rtlsim_model, cfg) # multi-in/out streams currently not supported in our C++ verilator driver - model_multi_io = ( - len(rtlsim_model.graph.input) > 1 or len(rtlsim_model.graph.output) > 1 - ) + model_multi_io = len(rtlsim_model.graph.input) > 1 or len(rtlsim_model.graph.output) > 1 force_python_rtlsim = cfg.force_python_rtlsim or model_multi_io if model_multi_io: warnings.warn( @@ -668,7 +691,6 @@ def step_measure_rtlsim_performance(model: ModelWrapper, cfg: DataflowBuildConfi 
rtlsim_bs = int(cfg.rtlsim_batch_size) orig_rtlsim_trace_depth = get_rtlsim_trace_depth() if force_python_rtlsim: - # run with single input to get latency assert rtlsim_bs > 0, "rtlsim batch size must be >0" if cfg.verify_save_rtlsim_waveforms: # set depth to 3 for layer-by-layer visibility @@ -677,12 +699,12 @@ def step_measure_rtlsim_performance(model: ModelWrapper, cfg: DataflowBuildConfi "rtlsim_trace", "%s/rtlsim_perf_batch_%d.vcd" % (report_dir, rtlsim_bs), ) - rtlsim_model.set_metadata_prop( - "extra_verilator_args", str(["-CFLAGS", "-O3"]) - ) + rtlsim_model.set_metadata_prop("extra_verilator_args", str(["-CFLAGS", "-O3"])) + # run with single input to get latency + rtlsim_latency_dict = throughput_test_rtlsim(rtlsim_model, 1) + # run with batch to get stable-state throughput rtlsim_perf_dict = throughput_test_rtlsim(rtlsim_model, rtlsim_bs) - rtlsim_latency = rtlsim_perf_dict["cycles"] - rtlsim_perf_dict["latency_cycles"] = rtlsim_latency + rtlsim_perf_dict["latency_cycles"] = rtlsim_latency_dict["cycles"] else: rtlsim_perf_dict = verilator_fifosim(model, rtlsim_bs) # keep keys consistent between the Python and C++-styles @@ -693,9 +715,22 @@ def step_measure_rtlsim_performance(model: ModelWrapper, cfg: DataflowBuildConfi rtlsim_perf_dict["runtime[ms]"] = runtime_s * 1000 rtlsim_perf_dict["throughput[images/s]"] = rtlsim_bs / runtime_s rtlsim_perf_dict["fclk[mhz]"] = fclk_mhz - for (key, val) in rtlsim_perf_dict.items(): + for key, val in rtlsim_perf_dict.items(): if "max_count" in key: del rtlsim_perf_dict[key] + # estimate stable-state throughput based on latency+throughput + if rtlsim_bs == 1: + rtlsim_perf_dict["stable_throughput[images/s]"] = rtlsim_perf_dict[ + "throughput[images/s]" + ] + else: + total_cycles = rtlsim_perf_dict["cycles"] + latency_cycles = rtlsim_perf_dict["latency_cycles"] + stablestate_cycles = total_cycles - latency_cycles + clk_ns = float(model.get_metadata_prop("clk_ns")) + fclk_mhz = 1 / (clk_ns * 0.001) + runtime_s = 
(stablestate_cycles * clk_ns) * (10**-9) + rtlsim_perf_dict["stable_throughput[images/s]"] = rtlsim_bs / runtime_s with open(report_dir + "/rtlsim_performance.json", "w") as f: json.dump(rtlsim_perf_dict, f, indent=2) @@ -722,13 +757,9 @@ def step_out_of_context_synthesis(model: ModelWrapper, cfg: DataflowBuildConfig) """Run out-of-context synthesis and generate reports. Depends on the DataflowOutputType.STITCHED_IP output product.""" if DataflowOutputType.OOC_SYNTH in cfg.generate_outputs: - assert ( - DataflowOutputType.STITCHED_IP in cfg.generate_outputs - ), "OOC needs stitched IP" + assert DataflowOutputType.STITCHED_IP in cfg.generate_outputs, "OOC needs stitched IP" model = model.transform( - SynthOutOfContext( - part=cfg._resolve_fpga_part(), clk_period_ns=cfg.synth_clk_period_ns - ) + SynthOutOfContext(part=cfg._resolve_fpga_part(), clk_period_ns=cfg.synth_clk_period_ns) ) report_dir = cfg.output_dir + "/report" os.makedirs(report_dir, exist_ok=True) @@ -819,13 +850,15 @@ def step_deployment_package(model: ModelWrapper, cfg: DataflowBuildConfig): "step_qonnx_to_finn": step_qonnx_to_finn, "step_tidy_up": step_tidy_up, "step_streamline": step_streamline, - "step_convert_to_hls": step_convert_to_hls, + "step_convert_to_hw": step_convert_to_hw, + "step_specialize_layers": step_specialize_layers, "step_create_dataflow_partition": step_create_dataflow_partition, "step_target_fps_parallelization": step_target_fps_parallelization, "step_apply_folding_config": step_apply_folding_config, + "step_minimize_bit_width": step_minimize_bit_width, "step_generate_estimate_reports": step_generate_estimate_reports, - "step_hls_codegen": step_hls_codegen, - "step_hls_ipgen": step_hls_ipgen, + "step_hw_codegen": step_hw_codegen, + "step_hw_ipgen": step_hw_ipgen, "step_set_fifo_depths": step_set_fifo_depths, "step_create_stitched_ip": step_create_stitched_ip, "step_measure_rtlsim_performance": step_measure_rtlsim_performance, diff --git a/src/finn/core/onnx_exec.py 
b/src/finn/core/onnx_exec.py index 2695113661..588e97e9e4 100644 --- a/src/finn/core/onnx_exec.py +++ b/src/finn/core/onnx_exec.py @@ -31,13 +31,10 @@ import qonnx.analysis.topology as ta from qonnx.core.onnx_exec import execute_onnx as execute_onnx_base -from finn.core.remote_exec import remote_exec from finn.core.rtlsim_exec import rtlsim_exec -def execute_onnx( - model, input_dict, return_full_exec_context=False, start_node=None, end_node=None -): +def execute_onnx(model, input_dict, return_full_exec_context=False, start_node=None, end_node=None): """Executes given ONNX ModelWrapper with given named inputs. If return_full_exec_context is False, a dict of named outputs is returned as indicated by the model.graph.output. @@ -51,13 +48,10 @@ def execute_onnx( # check if model has an execution mode set # if None, execute model node using the QONNX-provided execute_onnx impl - # if set to "remote_pynq" execute model on PYNQ board # if set to "rtlsim" execute model using pyverilator model_exec_mode = model.get_metadata_prop("exec_mode") if (model_exec_mode is None) or (model_exec_mode == ""): - return execute_onnx_base( - model, input_dict, return_full_exec_context, start_node, end_node - ) + return execute_onnx_base(model, input_dict, return_full_exec_context, start_node, end_node) if not model.check_all_tensor_shapes_specified(): raise Exception("Found unspecified tensor shapes, try infer_shapes") @@ -91,22 +85,17 @@ def execute_onnx( # check if model has an execution mode set # if None, execute model node by node using execute_node() - # if set to "remote_pynq" execute model on PYNQ board # if set to "rtlsim" execute model using pyverilator model_exec_mode = model.get_metadata_prop("exec_mode") if (model_exec_mode is None) or (model_exec_mode == ""): return execute_onnx_base() - elif model_exec_mode == "remote_pynq": - # use remote exec metadata built into model to execute on a remote PYNQ - remote_exec(model, execution_context) elif model_exec_mode == "rtlsim": # 
use stitched IP for rtlsim rtlsim_exec(model, execution_context) else: raise Exception( - """Metadata property "exec_mode" is set to an unknown value. - Can be left unset or has to be set to "remote_pynq" for remote execution - on PYNQ board or "rtlsim" for execution using pyverilator!""" + """Metadata property "exec_mode" is set to an unknown value. Can be left + unset or has to be set to "rtlsim" for execution using pyverilator!""" ) if return_full_exec_context: diff --git a/src/finn/core/remote_exec.py b/src/finn/core/remote_exec.py deleted file mode 100644 index f487b48f86..0000000000 --- a/src/finn/core/remote_exec.py +++ /dev/null @@ -1,119 +0,0 @@ -# Copyright (c) 2020 Xilinx, Inc. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of Xilinx nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import numpy as np -import os -import subprocess -import warnings - - -def remote_exec(model, execution_context): - """Executes the given model remotely on the pynq board. The metadata properties - related to the pynq board have to be set. The execution context contains the - input values.""" - # TODO fix for multi input-output - pynq_ip = model.get_metadata_prop("pynq_ip") - pynq_port = int(model.get_metadata_prop("pynq_port")) - pynq_username = model.get_metadata_prop("pynq_username") - pynq_password = model.get_metadata_prop("pynq_password") - pynq_target_dir = model.get_metadata_prop("pynq_target_dir") - deployment_dir = model.get_metadata_prop("pynq_deploy_dir") - platform = model.get_metadata_prop("platform") - assert platform in ["alveo", "zynq-iodma"] - bitfile = model.get_metadata_prop("bitfile") - bitfile = os.path.basename(bitfile) - if pynq_password == "": - if "zynq" in platform: - raise Exception("PYNQ board remote exec needs password for sudo") - else: - local_prefix = "" # assume we are using an ssh key - warnings.warn("Empty password, make sure you've set up an ssh key") - else: - local_prefix = "sshpass -p %s " % pynq_password - - if platform == "alveo": - # Alveo can run without sudo - remote_prefix = "" - elif "zynq" in platform: - # PYNQ Zynq boards need to execute with sudo - remote_prefix = "echo %s | sudo -S " % pynq_password - - inp = execution_context[model.graph.input[0].name] - # make copy of array before saving 
it - inp = inp.copy() - batchsize = inp.shape[0] - np.save(os.path.join(deployment_dir, "input.npy"), inp) - # extracting last folder of absolute path (deployment_dir) - deployment_folder = os.path.basename(os.path.normpath(deployment_dir)) - # copy input to PYNQ board - cmd = local_prefix + "scp -P{} -r {}/input.npy {}@{}:{}/{}".format( - pynq_port, - deployment_dir, - pynq_username, - pynq_ip, - pynq_target_dir, - deployment_folder, - ) - bash_command = ["/bin/bash", "-c", cmd] - process_scp_in = subprocess.Popen(bash_command, stdout=subprocess.PIPE) - process_scp_in.communicate() - - # use platform attribute for correct remote execution - if platform == "alveo": - remote_cmd = "bash -ic 'bash alveo_run.sh execute %d' \"" % batchsize - else: - remote_cmd = ( - "python3.6 driver.py --exec_mode=execute --batchsize={} " - "--bitfile={} --inputfile=input.npy --outputfile=output.npy " - '--platform={} "' - ).format(batchsize, bitfile, platform) - cmd = ( - local_prefix + 'ssh {}@{} -p {} "cd {}/{}; ' + remote_prefix + remote_cmd - ).format(pynq_username, pynq_ip, pynq_port, pynq_target_dir, deployment_folder) - bash_command = ["/bin/bash", "-c", cmd] - process_exec_accel = subprocess.Popen(bash_command, stdout=subprocess.PIPE) - process_exec_accel.communicate() - # remove stale output file from local dir, if any - try: - os.remove("{}/output.npy".format(deployment_dir)) - except FileNotFoundError: - pass - # copy generated output to local - cmd = local_prefix + "scp -P{} {}@{}:{}/{}/output.npy {}".format( - pynq_port, - pynq_username, - pynq_ip, - pynq_target_dir, - deployment_folder, - deployment_dir, - ) - bash_command = ["/bin/bash", "-c", cmd] - process_scp_out = subprocess.Popen(bash_command, stdout=subprocess.PIPE) - process_scp_out.communicate() - outp = np.load("{}/output.npy".format(deployment_dir)) - execution_context[model.graph.output[0].name] = outp diff --git a/src/finn/core/throughput_test.py b/src/finn/core/throughput_test.py index 
3533fd1339..08633be33b 100644 --- a/src/finn/core/throughput_test.py +++ b/src/finn/core/throughput_test.py @@ -28,90 +28,11 @@ import numpy as np import os -import subprocess -import warnings from qonnx.util.basic import gen_finn_dt_tensor from finn.core.rtlsim_exec import rtlsim_exec -def throughput_test_remote(model, batchsize=1000, timeout=None): - """Runs the throughput test for the given model remotely on the pynq board. - The metadata properties related to the pynq board have to be set. - Additionally a timeout for the SSH communication can be set. - Returns a dictionary with results of the throughput test. Returns None - if the test fails.""" - - pynq_ip = model.get_metadata_prop("pynq_ip") - pynq_port = int(model.get_metadata_prop("pynq_port")) - pynq_username = model.get_metadata_prop("pynq_username") - pynq_password = model.get_metadata_prop("pynq_password") - pynq_target_dir = model.get_metadata_prop("pynq_target_dir") - deployment_dir = model.get_metadata_prop("pynq_deploy_dir") - # extracting last folder of absolute path (deployment_dir) - deployment_folder = os.path.basename(os.path.normpath(deployment_dir)) - platform = model.get_metadata_prop("platform") - assert platform in ["alveo", "zynq-iodma"] - bitfile = model.get_metadata_prop("bitfile") - bitfile = os.path.basename(bitfile) - if pynq_password == "": - if "zynq" in platform: - raise Exception("PYNQ board remote exec needs password for sudo") - else: - local_prefix = "" # assume we are using an ssh key - warnings.warn("Empty password, make sure you've set up an ssh key") - else: - local_prefix = "sshpass -p %s " % pynq_password - - if platform == "alveo": - # Alveo can run without sudo but needs correct environment - remote_prefix = "conda activate finn-pynq-alveo; " - elif "zynq" in platform: - # PYNQ Zynq boards need to execute with sudo - remote_prefix = "echo %s | sudo -S " % pynq_password - - # use platform attribute for correct remote execution - if platform == "alveo": - remote_cmd = 
"bash -ic 'bash alveo_run.sh throughput_test %d' \"" % batchsize - else: - remote_cmd = ( - "python3.6 driver.py --exec_mode=throughput_test --batchsize={} " - "--bitfile={} --inputfile=input.npy --outputfile=output.npy " - '--platform={} "' - ).format(batchsize, bitfile, platform) - cmd = ( - local_prefix + 'ssh {}@{} -p {} "cd {}/{}; ' + remote_prefix + remote_cmd - ).format(pynq_username, pynq_ip, pynq_port, pynq_target_dir, deployment_folder) - bash_command = ["/bin/bash", "-c", cmd] - process_throughput_test = subprocess.Popen(bash_command, stdout=subprocess.PIPE) - process_throughput_test.communicate(timeout=timeout) - - # remove any pre-existing metrics file - try: - os.remove("{}/nw_metrics.txt".format(deployment_dir)) - except FileNotFoundError: - pass - - cmd = local_prefix + "scp -P{} {}@{}:{}/{}/nw_metrics.txt {}".format( - pynq_port, - pynq_username, - pynq_ip, - pynq_target_dir, - deployment_folder, - deployment_dir, - ) - bash_command = ["/bin/bash", "-c", cmd] - process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE) - process_compile.communicate(timeout=timeout) - - try: - with open("{}/nw_metrics.txt".format(deployment_dir), "r") as file: - res = eval(file.read()) - return res - except FileNotFoundError: - return None - - def throughput_test_rtlsim(model, batchsize=100): """Runs a throughput test for the given IP-stitched model. When combined with tracing, useful to determine bottlenecks and required FIFO sizes.""" diff --git a/src/finn/custom_op/fpgadataflow/__init__.py b/src/finn/custom_op/fpgadataflow/__init__.py index 56d4230a3a..aed2ab7fe1 100644 --- a/src/finn/custom_op/fpgadataflow/__init__.py +++ b/src/finn/custom_op/fpgadataflow/__init__.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020-2022, Xilinx, Inc. +# Copyright (C) 2023-2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -26,70 +27,57 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from finn.custom_op.fpgadataflow.addstreams_batch import AddStreams_Batch -from finn.custom_op.fpgadataflow.channelwise_op_batch import ChannelwiseOp_Batch -from finn.custom_op.fpgadataflow.checksum import CheckSum +from finn.custom_op.fpgadataflow.addstreams import AddStreams +from finn.custom_op.fpgadataflow.channelwise_op import ChannelwiseOp from finn.custom_op.fpgadataflow.concat import StreamingConcat from finn.custom_op.fpgadataflow.convolutioninputgenerator import ( ConvolutionInputGenerator, ) -from finn.custom_op.fpgadataflow.convolutioninputgenerator1d import ( - ConvolutionInputGenerator1D, -) -from finn.custom_op.fpgadataflow.convolutioninputgenerator_rtl import ( - ConvolutionInputGenerator_rtl, -) from finn.custom_op.fpgadataflow.downsampler import DownSampler -from finn.custom_op.fpgadataflow.duplicatestreams_batch import DuplicateStreams_Batch -from finn.custom_op.fpgadataflow.eltwise import StreamingEltwise -from finn.custom_op.fpgadataflow.fmpadding_batch import FMPadding_Batch -from finn.custom_op.fpgadataflow.fmpadding_rtl import FMPadding_rtl -from finn.custom_op.fpgadataflow.globalaccpool_batch import GlobalAccPool_Batch -from finn.custom_op.fpgadataflow.iodma import IODMA -from finn.custom_op.fpgadataflow.labelselect_batch import LabelSelect_Batch +from finn.custom_op.fpgadataflow.duplicatestreams import DuplicateStreams +from finn.custom_op.fpgadataflow.fmpadding import FMPadding +from finn.custom_op.fpgadataflow.fmpadding_pixel import FMPadding_Pixel +from finn.custom_op.fpgadataflow.globalaccpool import GlobalAccPool +from finn.custom_op.fpgadataflow.labelselect import LabelSelect from finn.custom_op.fpgadataflow.lookup import Lookup -from finn.custom_op.fpgadataflow.matrixvectoractivation import 
MatrixVectorActivation -from finn.custom_op.fpgadataflow.pool_batch import Pool_Batch +from finn.custom_op.fpgadataflow.matrixvectoractivation import MVAU +from finn.custom_op.fpgadataflow.pool import Pool from finn.custom_op.fpgadataflow.streamingdataflowpartition import ( StreamingDataflowPartition, ) -from finn.custom_op.fpgadataflow.streamingdatawidthconverter_batch import ( - StreamingDataWidthConverter_Batch, +from finn.custom_op.fpgadataflow.streamingdatawidthconverter import ( + StreamingDataWidthConverter, ) +from finn.custom_op.fpgadataflow.streamingeltwise import StreamingEltwise from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO -from finn.custom_op.fpgadataflow.streamingmaxpool_batch import StreamingMaxPool_Batch -from finn.custom_op.fpgadataflow.thresholding_batch import Thresholding_Batch -from finn.custom_op.fpgadataflow.tlastmarker import TLastMarker -from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour_Batch -from finn.custom_op.fpgadataflow.vectorvectoractivation import VectorVectorActivation +from finn.custom_op.fpgadataflow.streamingmaxpool import StreamingMaxPool +from finn.custom_op.fpgadataflow.thresholding import Thresholding +from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour +from finn.custom_op.fpgadataflow.vectorvectoractivation import VVAU custom_op = dict() # make sure new HLSCustomOp subclasses are imported here so that they get # registered and plug in correctly into the infrastructure -custom_op["DownSampler"] = DownSampler -custom_op["StreamingMaxPool_Batch"] = StreamingMaxPool_Batch -custom_op["MatrixVectorActivation"] = MatrixVectorActivation -custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator -custom_op["ConvolutionInputGenerator1D"] = ConvolutionInputGenerator1D -custom_op["ConvolutionInputGenerator_rtl"] = ConvolutionInputGenerator_rtl -custom_op["TLastMarker"] = TLastMarker -custom_op["StreamingDataWidthConverter_Batch"] = 
StreamingDataWidthConverter_Batch +custom_op["MVAU"] = MVAU custom_op["StreamingFIFO"] = StreamingFIFO -custom_op["GlobalAccPool_Batch"] = GlobalAccPool_Batch -custom_op["Pool_Batch"] = Pool_Batch -custom_op["FMPadding_Batch"] = FMPadding_Batch -custom_op["Thresholding_Batch"] = Thresholding_Batch -custom_op["AddStreams_Batch"] = AddStreams_Batch -custom_op["LabelSelect_Batch"] = LabelSelect_Batch -custom_op["DuplicateStreams_Batch"] = DuplicateStreams_Batch -custom_op["VectorVectorActivation"] = VectorVectorActivation -custom_op["ChannelwiseOp_Batch"] = ChannelwiseOp_Batch -custom_op["IODMA"] = IODMA +custom_op["Thresholding"] = Thresholding +custom_op["VVAU"] = VVAU custom_op["StreamingDataflowPartition"] = StreamingDataflowPartition -custom_op["UpsampleNearestNeighbour_Batch"] = UpsampleNearestNeighbour_Batch + +custom_op["AddStreams"] = AddStreams +custom_op["ChannelwiseOp"] = ChannelwiseOp +custom_op["ConvolutionInputGenerator"] = ConvolutionInputGenerator +custom_op["DownSampler"] = DownSampler +custom_op["DuplicateStreams"] = DuplicateStreams +custom_op["FMPadding"] = FMPadding +custom_op["FMPadding_Pixel"] = FMPadding_Pixel +custom_op["GlobalAccPool"] = GlobalAccPool +custom_op["LabelSelect"] = LabelSelect custom_op["Lookup"] = Lookup +custom_op["Pool"] = Pool custom_op["StreamingConcat"] = StreamingConcat -custom_op["CheckSum"] = CheckSum +custom_op["StreamingDataWidthConverter"] = StreamingDataWidthConverter custom_op["StreamingEltwise"] = StreamingEltwise -custom_op["FMPadding_rtl"] = FMPadding_rtl +custom_op["StreamingMaxPool"] = StreamingMaxPool +custom_op["UpsampleNearestNeighbour"] = UpsampleNearestNeighbour diff --git a/src/finn/custom_op/fpgadataflow/addstreams.py b/src/finn/custom_op/fpgadataflow/addstreams.py new file mode 100644 index 0000000000..ac61786ac1 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/addstreams.py @@ -0,0 +1,171 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import numpy as np +import warnings +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + + +class AddStreams(HWCustomOp): + """Abstraction layer for HW implementation of AddStreams.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = super().get_nodeattr_types() + my_attrs.update( + { + "NumChannels": ("i", True, ""), + "PE": ("i", True, ""), + # FINN DataTypes for inputs; output datatype inferred from input + "inputDataType": ("s", True, ""), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + "inFIFODepths": ("ints", False, [2, 2]), + } + ) + return my_attrs + + def get_normal_input_shape(self, ind=0): + ich = self.get_nodeattr("NumChannels") + vecs = list(self.get_nodeattr("numInputVectors")) + ishape = tuple(vecs + [ich]) + return ishape + + def get_folded_input_shape(self, ind=0): + ich = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + assert ich % pe == 0, "PE must divide NumChannels" + vecs = list(self.get_nodeattr("numInputVectors")) + ishape = tuple(vecs + [ich // pe, pe]) + return ishape + + def get_normal_output_shape(self, ind=0): + return self.get_normal_input_shape() + + def get_folded_output_shape(self, ind=0): + return self.get_folded_input_shape() + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input1 shape." + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[1])) + assert ishape == exp_ishape, "Unexpected input2 shape." 
+ return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype()), + str(idt), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType", idt.name) + # enforce output data type (calculated based on idt) + odt = self.get_output_datatype() + model.set_tensor_datatype(self.onnx_node.output[0], odt) + + def verify_node(self): + pass + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("inputDataType")] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + # we need to set output datatype to the next larger int or uint + # enhancement: consider specifying w/ explicit outputDataType attribute + # to allow overflow and use the same idt if user wants + idt = DataType[self.get_nodeattr("inputDataType")] + if idt.signed(): + return DataType.get_smallest_possible(2 * idt.min()) + else: + return DataType.get_smallest_possible(2 * idt.max()) + + def get_instream_width(self, ind=0): + """Returns input stream width.""" + ibits = self.get_input_datatype().bitwidth() + pe = self.get_nodeattr("PE") + in_width = pe * ibits + return in_width + + def get_outstream_width(self, ind=0): + """Returns output stream width.""" + obits = self.get_output_datatype().bitwidth() + pe = self.get_nodeattr("PE") + out_width = pe * obits + return out_width + + def get_number_output_values(self): + return np.prod(self.get_folded_output_shape()[:-1]) + + def get_exp_cycles(self): + # Channels/PE * batch size * fmdim * fmdim + return np.prod(self.get_folded_output_shape()[:-1]) + + def execute_node(self, context, graph): + # simulate behavior using Python + node = self.onnx_node + inp0_values = context[node.input[0]] + inp1_values = context[node.input[1]] + oshape = 
context[node.output[0]].shape + ishape0 = inp0_values.shape + ishape1 = inp1_values.shape + assert ishape0 == ishape1, "Shapes of inputs should be the same for Addstreams" + result = inp0_values + inp1_values + context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape) + + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + sname = self.hls_sname() + swidth = self.get_instream_width_padded() + intf_names["s_axis"] = [(x + "_" + sname, swidth) for x in ["in0", "in1"]] + return intf_names + + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + "in1": [0 for i in range(n_inps)], + }, + "outputs": {"out": []}, + } + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) diff --git a/src/finn/custom_op/fpgadataflow/channelwise_op.py b/src/finn/custom_op/fpgadataflow/channelwise_op.py new file mode 100644 index 0000000000..9bf4ebdf62 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/channelwise_op.py @@ -0,0 +1,234 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import onnxruntime as rt +import warnings +from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.util.basic import qonnx_make_model + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + +# ONNX i/o tensor shape assumptions for channelwise ops: +# input 0 is the input tensor, shape (..., NumChannels) +# input 1 is the channelwise parameter tensor, shape (NumChannels, params_per_channel) +# output 0 is the output tensor, shape (..., NumChannels) - same as input +# the ... here can be any shape (representing groups of vectors) + + +def get_smallest_possible(vals): + """Returns smallest (fewest bits) possible DataType that can represent + value. 
Prefers unsigned integers where possible.""" + vals = np.array(vals, dtype=np.float64) + for v in vals: + assert int(v) == v, "Error float value" + + for k in DataType.get_accumulator_dt_cands(): + dt = DataType[k] + + if dt in [DataType["BIPOLAR"], DataType["TERNARY"], DataType["FLOAT32"]]: + # not currently supported + continue + + if (dt.min() <= vals).all() and (vals <= dt.max()).all(): + return dt + + warnings.warn( + """InferChannelwiseLinearLayer: Output values may not be + representable with supported data types. + Setting maximum width data type available. + This will lead to errors if there are no constrains on the input + """ + ) + + if (0 <= vals).all(): + return DataType["UINT64"] + else: + return DataType["INT64"] + + +class ChannelwiseOp(HWCustomOp): + """Abstraction layer for HW implementation of ChannelwiseOp.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = { + # channelwise "map" function to apply: + # one of cmp_le, cmp_ge, add, mul + "Func": ("s", False, "cmp_le", {"cmp_le", "cmp_ge", "add", "mul"}), + "PE": ("i", True, 0), + "NumChannels": ("i", True, 0), + # string defining memory resource type for parameters + "ram_style": ("s", False, "distributed", {"distributed", "block"}), + # FINN DataTypes for inputs, weights, outputs + "inputDataType": ("s", True, ""), + "paramDataType": ("s", True, ""), + "outputDataType": ("s", True, ""), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def calc_tmem(self): + """Calculates and returns TMEM, the depth of the memory used + to store the channelwise op parameters.""" + chn = self.get_nodeattr("NumChannels") + pe = 
self.get_nodeattr("PE") + return chn // pe + + def make_shape_compatible_op(self, model): + oshape = self.get_normal_output_shape() + # implement tensor with correct shape + return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + # check input datatype against property + idt = model.get_tensor_datatype(node.input[0]) + + exp_idt_name = self.get_nodeattr("inputDataType") + if exp_idt_name != idt.name: + func = self.get_nodeattr("Func") + assert func in ["add", "mul"], "Bad input DataType for ChannelwiseOp layer" + + self.set_nodeattr("inputDataType", idt.name) + # update the func in ['add','mul'] cases + + # get parameter ranges + param = model.get_initializer(node.input[1]) + param_min = min(param.flatten()) + param_max = max(param.flatten()) + + # set function and determine output data type + if func == "add": + out_min = idt.min() + param_min + out_max = idt.max() + param_max + odt = get_smallest_possible([out_min, out_max]) + elif func == "mul": + possible_limits = [] + possible_limits += [idt.min() * param_min] + possible_limits += [idt.min() * param_max] + possible_limits += [idt.max() * param_min] + possible_limits += [idt.max() * param_max] + odt = get_smallest_possible(possible_limits) + + self.set_nodeattr("outputDataType", odt.name) + + # set output datatype from property + odt = self.get_output_datatype() + model.set_tensor_datatype(node.output[0], odt) + + def verify_node(self): + pass + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("inputDataType")] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + return DataType[self.get_nodeattr("outputDataType")] + + def get_instream_width(self, ind=0): + i_bits = self.get_input_datatype().bitwidth() + return i_bits * self.get_nodeattr("PE") + + def get_outstream_width(self, ind=0): + o_bits = self.get_output_datatype().bitwidth() + return o_bits * 
self.get_nodeattr("PE") + + def get_folded_input_shape(self, ind=0): + ich = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + fold = ich // pe + vecs = list(self.get_nodeattr("numInputVectors")) + folded_input_shape = tuple(vecs + [fold, pe]) + return folded_input_shape + + def get_folded_output_shape(self, ind=0): + # same shape as input + return self.get_folded_input_shape() + + def get_normal_input_shape(self, ind=0): + ich = self.get_nodeattr("NumChannels") + vecs = list(self.get_nodeattr("numInputVectors")) + normal_input_shape = tuple(vecs + [ich]) + return normal_input_shape + + def get_normal_output_shape(self, ind=0): + # same shape as input + return self.get_normal_input_shape() + + def get_number_output_values(self): + nf = np.prod(self.get_folded_output_shape()[:-1]) + return nf + + def get_exp_cycles(self): + # Channels/PE * batch size * fmdim * fmdim + return np.prod(self.get_folded_output_shape()[:-1]) + + def execute_node(self, context, graph): + # create a standard onnx node to help calculate the result + # depending on Func node attribute either a Mul or an Add node + node = self.onnx_node + func = self.get_nodeattr("Func") + inp_values = context[node.input[0]] + param_values = context[node.input[1]] + oshape = context[node.output[0]].shape + ishape = inp_values.shape + pshape = param_values.shape + inp = helper.make_tensor_value_info(node.input[0], TensorProto.FLOAT, ishape) + param = helper.make_tensor_value_info(node.input[1], TensorProto.FLOAT, pshape) + outp = helper.make_tensor_value_info(node.output[0], TensorProto.FLOAT, oshape) + node_func = helper.make_node( + func.capitalize(), + inputs=node.input, + outputs=[node.output[0]], + ) + graph_func = helper.make_graph( + nodes=[node_func], + name="single-add-exec", + inputs=[inp, param], + outputs=[outp], + ) + + opset_version = self.onnx_opset_version + opset_imports = [helper.make_opsetid("", opset_version)] + onnx_kwargs = {"opset_imports": opset_imports} + model_func = 
qonnx_make_model(graph_func, **onnx_kwargs) + idict = {node.input[0]: inp_values, node.input[1]: param_values} + sess = rt.InferenceSession(model_func.SerializeToString()) + result = sess.run(None, idict) + context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape) diff --git a/src/finn/custom_op/fpgadataflow/concat.py b/src/finn/custom_op/fpgadataflow/concat.py index 4437bcd198..210b6b7fdd 100644 --- a/src/finn/custom_op/fpgadataflow/concat.py +++ b/src/finn/custom_op/fpgadataflow/concat.py @@ -1,4 +1,5 @@ # Copyright (c) 2021, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -27,20 +28,18 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import numpy as np -import os from qonnx.core.datatype import DataType from qonnx.util.basic import roundup_to_integer_multiple -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp -class StreamingConcat(HLSCustomOp): - """Streaming concatenation node with dynamically generated HLS. +class StreamingConcat(HWCustomOp): + """Abstraction layer for HW implementation of Concat. 
Only supports concatenating along the last axis.""" - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): my_attrs = { @@ -127,238 +126,13 @@ def get_number_output_values(self): def get_exp_cycles(self): return np.prod(self.get_folded_output_shape()[:-1]) - def generate_params(self, model, path): - elems_per_stream = self.get_nodeattr("ElemsPerStream") - inp_streams = [] - commands = [] - idt = self.get_input_datatype() - total_elems = self.get_total_elems() - total_bw = idt.bitwidth() * total_elems - for (i, elems) in enumerate(elems_per_stream): - bw = idt.bitwidth() * elems - inp_stream = "hls::stream > &in%d" % (bw, i) - inp_streams.append(inp_stream) - cmd = "in%d.read()" % i - commands.append(cmd) - out_stream = "hls::stream > &out" % (total_bw) - inp_streams.append(out_stream) - - impl_hls_code = [] - impl_hls_code.append("void StreamingConcat(") - impl_hls_code.append(",".join(inp_streams)) - impl_hls_code.append(", unsigned int numReps) {") - impl_hls_code.append("for(unsigned int i = 0; i < numReps; i++) {") - impl_hls_code.append("#pragma HLS PIPELINE II=1") - impl_hls_code.append("ap_uint<%d> out_elem;" % total_bw) - # FIXME: the order of streams for concatenation works out differently - # for cppsim vs rtlsim, addressed via reversing the order of commands - # for now - impl_hls_code.append("#ifdef __SYNTHESIS__") - impl_hls_code.append("out_elem = (" + ",".join(commands[::-1]) + ");") - impl_hls_code.append("#else") - impl_hls_code.append("out_elem = (" + ",".join(commands) + ");") - impl_hls_code.append("#endif") - impl_hls_code.append("out.write(out_elem);") - impl_hls_code.append("}") - impl_hls_code.append("}") - impl_hls_code = "\n".join(impl_hls_code) - - impl_filename = "{}/concat_impl.hpp".format(path) - f_impl = open(impl_filename, "w") - f_impl.write(impl_hls_code) - f_impl.close() - def execute_node(self, context, graph): 
- mode = self.get_nodeattr("exec_mode") node = self.onnx_node - n_inps = len(self.onnx_node.input) - ishapes = [self.get_normal_input_shape(x) for x in range(n_inps)] - folded_ishapes = [self.get_folded_input_shape(x) for x in range(n_inps)] - exp_oshape = self.get_normal_output_shape() - folded_oshape = self.get_folded_output_shape() - export_idt = self.get_input_datatype() - - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - for i in range(n_inps): - inp = context[node.input[i]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert inp.shape == ishapes[i], "Input shape mismatch for " + node.input[i] - # reshape input into folded form - inp = inp.reshape(folded_ishapes[i]) - # make copy before saving array - reshaped_input = inp.copy() - np.save(os.path.join(code_gen_dir, "input_%d.npy" % i), reshaped_input) - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - assert ( - context[node.output[0]].shape == folded_oshape - ), "cppsim did not produce expected folded output shape" - context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) - elif mode == "rtlsim": - sim = self.get_rtlsim() - io_dict = {"inputs": {}, "outputs": {"out": []}} - for i in range(n_inps): - nbits = self.get_instream_width(i) - rtlsim_inp = npy_to_rtlsim_input( - "%s/input_%d.npy" % (code_gen_dir, i), - export_idt, - nbits, - reverse_inner=True, - ) - io_dict["inputs"]["in%d" % i] = rtlsim_inp - super().reset_rtlsim(sim) - super().toggle_clk(sim) - - self.rtlsim_multi_io(sim, io_dict) - rtlsim_output = io_dict["outputs"]["out"] - odt = 
self.get_output_datatype() - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, - out_npy_path, - odt, - out_shape, - packed_bits, - target_bits, - reverse_inner=True, - ) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output shape doesn't match expected shape.""" - - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "concat_impl.hpp"'] - - def defines(self, var): - num_reps = self.get_nodeattr("numInputVectors") - num_reps = np.prod(num_reps) - self.code_gen_dict["$DEFINES$"] = ["#define NumReps %d" % num_reps] - - def read_npy_data(self): - n_inputs = self.get_n_inputs() - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - npy_type = "float" - self.code_gen_dict["$READNPYDATA$"] = [] - idt = self.get_input_datatype() - idt_bw = idt.bitwidth() - elem_hls_type = idt.get_hls_datatype_str() - elem_bits = idt_bw - for i in range(n_inputs): - packed_bits = self.get_instream_width(i) - packed_hls_type = "ap_uint<%d>" % packed_bits - npy_in = "%s/input_%d.npy" % (code_gen_dir, i) - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in%d);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in, i) - ) - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - n_inputs = self.get_n_inputs() - for i in range(n_inputs): - packed_bits = self.get_instream_width(i) - packed_hls_type = "ap_uint<%d>" % packed_bits - stream_name = "in%d" % i - 
self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream<%s> %s ("%s");' - % (packed_hls_type, stream_name, stream_name) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) - ) - - def docompute(self): - self.code_gen_dict["$DOCOMPUTE$"] = [] - n_inputs = self.get_n_inputs() - in_stream_names = ["in%d" % x for x in range(n_inputs)] - in_stream_names = ",".join(in_stream_names) - comp_call = "StreamingConcat(%s, out, NumReps);" % (in_stream_names) - self.code_gen_dict["$DOCOMPUTE$"] = [comp_call] - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - oshape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - n_inputs = self.get_n_inputs() - in_streams = [] - for i in range(n_inputs): - iwidth = self.get_instream_width(i) - in_streams.append("hls::stream> &in%d" % (iwidth, i)) - in_streams = ",".join(in_streams) - total_width = self.get_input_datatype().bitwidth() * self.get_total_elems() - out_stream = "hls::stream> &out" % (total_width) - blackbox_hls = "void %s(%s, %s)" % (self.onnx_node.name, in_streams, out_stream) - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [blackbox_hls] - - def pragmas(self): - n_inputs = self.get_n_inputs() - pragmas = [] - for i in range(n_inputs): - pragmas.append( - "#pragma HLS INTERFACE axis port=in%d name=in%d_%s" 
- % (i, i, self.hls_sname()) - ) - self.code_gen_dict["$PRAGMAS$"] = pragmas - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" - ) + inp_values = [] + for inp in node.input: + inp_values.append(context[inp]) + result = np.concatenate(inp_values, axis=-1) + context[node.output[0]] = result def get_instream_width_padded(self, ind=0): in_width = self.get_instream_width(ind) @@ -370,7 +144,5 @@ def get_verilog_top_module_intf_names(self): sname = self.hls_sname() intf_names["s_axis"] = [] for i in range(n_inputs): - intf_names["s_axis"].append( - ("in%d_%s" % (i, sname), self.get_instream_width_padded(i)) - ) + intf_names["s_axis"].append(("in%d_%s" % (i, sname), self.get_instream_width_padded(i))) return intf_names diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py index 1566445999..96f49069c7 100644 --- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py +++ b/src/finn/custom_op/fpgadataflow/convolutioninputgenerator.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -26,36 +26,27 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import math import numpy as np -import os +from onnx import TensorProto, helper from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.general.im2col import compute_conv_output_dim +from qonnx.custom_op.registry import getCustomOp +from qonnx.util.basic import qonnx_make_model -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp # ONNX i/o tensor shape assumptions for ConvolutionInputGenerator: # input 0 is the input tensor, shape NHWC = (1, IFMDim, IFMDim, IFMChannels) # output 0 is the output tensor, shape NHWC: # = (1, OFMDim, OFMDim, (ConvKernelDim^2)*IFMChannels) -# note: the actual data layout produced by the hlslib kernels is different -# for depthwise and non-depthwise ops. -# * non-depthwise SWG: (1, OFMDim, OFMDim, K, K, IFMChannels/SIMD, SIMD) -# * depthwise SWG: (1, OFMDim, OFMDim, IFMChannels/SIMD, K, K, SIMD) -# see test_fpgadataflow_slidingwindow.py for an example of how to transform -# between the two layouts +class ConvolutionInputGenerator(HWCustomOp): + """Abstraction layer for HW implementation of ConvolutionInputGenerator""" -class ConvolutionInputGenerator(HLSCustomOp): - """Class that corresponds to one of the finn-hlslib ConvolutionInputGenerator - (sliding window) function variants. Depending on the combination of - attributes (e.g. 
depthwise or not, whether k % stride is 0) a different - variant will be picked for the actual HLS implementation.""" - - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): my_attrs = { @@ -82,23 +73,16 @@ def get_nodeattr_types(self): "distributed", {"auto", "block", "distributed", "ultra"}, ), + "parallel_window": ("i", False, 0, {0, 1}), + # 1D (True) or 2D (False) spatial data + "is1D": ("i", False, 0), + # Enable reprogrammable implementation to change FM dimensions, + # stride, or dilation during runtime (requires parallel_window = 0) + "dynamic_mode": ("i", False, 0, {0, 1}), } my_attrs.update(super().get_nodeattr_types()) return my_attrs - def get_nodeattr(self, name): - # overriding get_nodeattr to check for square kernel/img.. requirement - # since this can't be done with the attribute restriction in nodeattr_types - # TODO non-square can be enabled in theory but needs testing - ret = super().get_nodeattr(name) - props_to_check = ["ConvKernelDim", "IFMDim", "OFMDim", "Stride", "Dilation"] - if name in props_to_check: - is_square = ret[0] == ret[1] - assert is_square, "Only square %s supported" % name - if name == "Dilation": - assert ret[0] == ret[1] == 1, "Only dilation=1 supported" - return ret - def get_normal_input_shape(self, ind=0): ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") ifm_ch = self.get_nodeattr("IFMChannels") @@ -137,8 +121,12 @@ def get_folded_output_shape(self, ind=0): ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h) ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w) assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" - wf = int((k_h * k_w * ifm_ch) // simd) - folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd) + if self.use_parallel_window_output(): + wf = int((ifm_ch) // simd) + folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, k_h * k_w * simd) + 
else: + wf = int((k_h * k_w * ifm_ch) // simd) + folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd) return folded_oshape def make_shape_compatible_op(self, model): @@ -177,322 +165,93 @@ def get_instream_width(self, ind=0): return in_width def get_outstream_width(self, ind=0): - """Returns stream width, input and output stream width are equal for - the sliding window function, so the function to determine the input - stream width can be reused.""" - return self.get_instream_width() + if self.use_parallel_window_output(): + # feed all window pixels in parallel + k_h, k_w = self.get_nodeattr("ConvKernelDim") + return self.get_instream_width() * k_h * k_w + else: + # if parallel variant not in use: same width for output and input stream + return self.get_instream_width() def get_number_output_values(self): folded_oshape = self.get_folded_output_shape() num_output_elems = np.prod(folded_oshape[:-1]) return num_output_elems - def get_exp_cycles(self): - simd = self.get_nodeattr("SIMD") + def get_1d_conv_attrs_normalized(self): + # support both (1, D) and (D, 1) cases transparently: + # For the kernel, presenting the input data of size D as + # [H, W] = [Y, X] = [1, D] or [D, 1] + # effectively gives the same result. + # For consistency and ease of programming, this function + # returns the attributes of the layer as follows: + # [H, W] = [Y, X] = [1, D] or [D, 1] are always mapped to [1, D]. + # The dummy ('1') dimension is the Y-dimension. 
ifm_ch = self.get_nodeattr("IFMChannels") - k_h, k_w = self.get_nodeattr("ConvKernelDim") - ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") - ofm_dim_h, ofm_dim_w = self.get_nodeattr("OFMDim") - stride_h, stride_w = self.get_nodeattr("Stride") - dilation_h, dilation_w = self.get_nodeattr("Dilation") - - # since mmv != 1 is not supported yet, we set mmv for now to 1 - mmv = 1 - # see https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h - cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv - cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd) - max_cycles = max(cycles_write_block, cycles_read_block) - exp_cycles = ( - ifm_dim_w * k_h * dilation_h * (ifm_ch / simd) + ofm_dim_h * max_cycles - ) + k = self.get_nodeattr("ConvKernelDim") + ifm_dim = self.get_nodeattr("IFMDim") + ofm_dim = self.get_nodeattr("OFMDim") + stride = self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + + # see defines() for an explanation + if ifm_dim[1] == 1: + ifm_dim = ifm_dim[::-1] + ofm_dim = ofm_dim[::-1] + k = k[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + return (ifm_ch, ifm_dim, ofm_dim, k, stride, dilation) - return int(exp_cycles) + def get_exp_cycles(self): + return 0 def bram_estimation(self): - # NOTE: only tested with a square convolution - simd = self.get_nodeattr("SIMD") - ifm_ch = self.get_nodeattr("IFMChannels") - ifm_dim = self.get_nodeattr("IFMDim")[0] - k = self.get_nodeattr("ConvKernelDim")[0] - stride = self.get_nodeattr("Stride")[0] - ram_style = self.get_nodeattr("ram_style") - if ram_style == "block" or ram_style == "auto": - ram_depth = ifm_dim * ifm_ch / simd - if ram_depth <= 512: - ram_width = 36 - elif ram_depth <= 1024: - ram_width = 18 - elif ram_depth <= 2048: - ram_width = 9 - elif ram_depth <= 4096: - ram_width = 4 - elif ram_depth <= 8192: - ram_width = 2 - else: - ram_width = 1 - return int( - (k + stride) - * ( - math.ceil(simd * self.get_input_datatype().bitwidth() / 
ram_width) - * math.ceil(ifm_dim * ifm_ch / simd / ram_depth) - ) - ) - else: - return 0 + return 0 def lut_estimation(self): - # NOTE: only tested with a square convolution - simd = self.get_nodeattr("SIMD") - ifm_ch = self.get_nodeattr("IFMChannels") - ifm_dim = self.get_nodeattr("IFMDim")[0] - k = self.get_nodeattr("ConvKernelDim")[0] - stride = self.get_nodeattr("Stride")[0] - ram_style = self.get_nodeattr("ram_style") - if ram_style == "distributed": - ram_luts = int( - (k + stride) - * ( - simd - * self.get_input_datatype().bitwidth() - * math.ceil(ifm_dim * ifm_ch / simd / 64) - ) - ) - else: - ram_luts = 0 - return 300 + ram_luts + return 0 def uram_estimation(self): - # NOTE: only tested with a square convolution - simd = self.get_nodeattr("SIMD") - ifm_ch = self.get_nodeattr("IFMChannels") - ifm_dim = self.get_nodeattr("IFMDim")[0] - k = self.get_nodeattr("ConvKernelDim")[0] - stride = self.get_nodeattr("Stride")[0] - ram_style = self.get_nodeattr("ram_style") - if ram_style == "ultra": - return int( - (k + stride) - * ( - math.ceil(simd * self.get_input_datatype().bitwidth() / 64) - * math.ceil(ifm_dim * ifm_ch / simd / 4096) - ) - ) - else: - return 0 + return 0 def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") + # using Im2Col node to calculate output node = self.onnx_node - exp_ishape = self.get_normal_input_shape() - exp_oshape = self.get_normal_output_shape() - folded_ishape = self.get_folded_input_shape() - - # TODO ensure codegen dir exists - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - inp = context[node.input[0]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert ( - inp.shape == exp_ishape - ), """Input shape doesn't - match expected shape (1, ifm_dim_h, ifm_dim_w, ifm_ch).""" - if self.get_input_datatype() == DataType["BIPOLAR"]: - # store bipolar activations as binary - inp = (inp + 1) / 2 - export_idt = DataType["BINARY"] - else: - export_idt = self.get_input_datatype() - # reshape input into folded form - inp = inp.reshape(folded_ishape) - # make copy before saving array - reshaped_input = inp.copy() - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - assert ( - context[node.output[0]].shape == exp_oshape - ), "cppsim \ - did not produce expected output shape" - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = export_idt - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - # binary -> bipolar if needed - if self.get_output_datatype() == DataType["BIPOLAR"]: - out = context[node.output[0]] - out = 2 * out - 1 - context[node.output[0]] = out - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output - shape doesn't match expected shape (1, ofm_dim_h, ofm_dim_w, k_h*k_w*ifm_ch).""" - - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "slidingwindow.h"'] - - def defines(self, var): - numReps = 1 - ifm_dim = self.get_nodeattr("IFMDim")[0] + ifm_dim = self.get_nodeattr("IFMDim") + k = self.get_nodeattr("ConvKernelDim") + s = self.get_nodeattr("Stride") + d = self.get_nodeattr("Dilation") ifm_ch = self.get_nodeattr("IFMChannels") - ofm_dim = self.get_nodeattr("OFMDim")[0] - k = self.get_nodeattr("ConvKernelDim")[0] - stride = self.get_nodeattr("Stride")[0] - simd = self.get_nodeattr("SIMD") - ifm_precision = self.get_input_datatype().bitwidth() - - self.code_gen_dict["$DEFINES$"] = [ - """#define ConvKernelDim1 {}\n #define IFMChannels1 {}\n - #define Input_precision1 {}\n #define IFMDim1 {}\n - #define OFMDim1 {}\n #define SIMD1 {}\n - #define Stride1 {}\n #define numReps {}""".format( - k, ifm_ch, ifm_precision, ifm_dim, ofm_dim, simd, stride, numReps - ) - ] - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0);' - % (packed_hls_type, elem_hls_type, 
elem_bits, npy_type, npy_in) + inp_values = context[node.input[0]] + oshape = context[node.output[0]].shape + ishape = inp_values.shape + inp = helper.make_tensor_value_info(node.input[0], TensorProto.FLOAT, ishape) + outp = helper.make_tensor_value_info(node.output[0], TensorProto.FLOAT, oshape) + im2col_node = helper.make_node( + "Im2Col", + [node.input[0]], + [node.output[0]], + domain="qonnx.custom_op.general", + stride=[s[0], s[1]], + kernel_size=[k[0], k[1]], + dilations=[d[0], d[1]], + input_shape="(1,{},{},{})".format(ifm_dim[0], ifm_dim[1], ifm_ch), ) - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0 ("in0");'.format(self.get_instream_width()) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) + graph_im2col = helper.make_graph( + nodes=[im2col_node], + name="single-im2col-exec", + inputs=[inp], + outputs=[outp], ) - def docompute(self): - node = self.onnx_node - ram_style = self.get_nodeattr("ram_style") - map_to_hls_ram_style = { - "auto": "ap_resource_dflt()", - "block": "ap_resource_bram()", - "distributed": "ap_resource_lutram()", - "ultra": "ap_resource_uram()", - } - hls_ram_style = map_to_hls_ram_style[ram_style] - hls_call = node.op_type - - # check which ConvolutionInputGenerator is needed - k = self.get_nodeattr("ConvKernelDim")[0] - stride = self.get_nodeattr("Stride")[0] - - if k % stride != 0: - hls_call += "_kernel_stride" - - if self.get_nodeattr("depthwise") == 1: - self.code_gen_dict["$DOCOMPUTE$"] = [ - """{}_dws (in0, out, numReps, {});""".format( - hls_call, hls_ram_style - ) - ] - else: - self.code_gen_dict["$DOCOMPUTE$"] = [ - """{} (in0, out, numReps, {});""".format( - hls_call, hls_ram_style - ) - ] - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use 
binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - oshape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0, - hls::stream> &out)""".format( - self.onnx_node.name - ) - ] - - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" - ) + opset_version = self.onnx_opset_version + opset_imports = [helper.make_opsetid("", opset_version)] + onnx_kwargs = {"opset_imports": opset_imports} + model_im2col = ModelWrapper(qonnx_make_model(graph_im2col, **onnx_kwargs)) + model_im2col.set_tensor_datatype(node.input[0], self.get_input_datatype()) + # use execution function from Im2Col node + # this automatically updates the execution context + inst = getCustomOp(im2col_node) + inst.execute_node(context, model_im2col.graph) diff --git a/src/finn/custom_op/fpgadataflow/downsampler.py b/src/finn/custom_op/fpgadataflow/downsampler.py index b7efaff440..4f919d1b50 100644 --- a/src/finn/custom_op/fpgadataflow/downsampler.py +++ b/src/finn/custom_op/fpgadataflow/downsampler.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2023, 
Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -27,20 +27,22 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import numpy as np -import os import warnings +from onnx import TensorProto, helper from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp +from qonnx.util.basic import qonnx_make_model -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp -class DownSampler(HLSCustomOp): - """Corresponds to finn-hlslib ConvolutionInputGenerator_*_kernel1 function. +class DownSampler(HWCustomOp): + """Abstraction layer for HW implementation of DownSampling Basically performs a down sampling of the image removing rows and columns.""" - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): my_attrs = { @@ -174,180 +176,54 @@ def get_number_output_values(self): folded_oshape = self.get_folded_output_shape() return np.prod(folded_oshape[:-1]) - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "slidingwindow.h"'] - - def defines(self, var): - self.code_gen_dict["$DEFINES$"] = [] - - ifm_ch = self.get_nodeattr("NumChannels") - self.code_gen_dict["$DEFINES$"] += ["#define IFMChannels {}".format(ifm_ch)] - - ibits = self.get_input_datatype().bitwidth() - self.code_gen_dict["$DEFINES$"] += ["#define Input_precision {}".format(ibits)] - - idim = self.get_nodeattr("ImgDim") - self.code_gen_dict["$DEFINES$"] += ["#define IFMDim {}".format(idim)] - - simd = self.get_nodeattr("SIMD") - self.code_gen_dict["$DEFINES$"] += ["#define SIMD {}".format(simd)] - - stride = self.get_nodeattr("Stride") - 
self.code_gen_dict["$DEFINES$"] += ["#define Stride {}".format(stride)] - - batch_size = self.get_nodeattr("numInputVectors") - self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(batch_size)] - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) - ) - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0 ("in0");'.format(self.get_instream_width()) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) - ) - - def docompute(self): - dim_var = "1D" if (self.get_nodeattr("is1D") == 1) else "2D" - self.code_gen_dict["$DOCOMPUTE$"] = [ - f"""ConvolutionInputGenerator_{dim_var}_kernel1 (in0, out, numReps);""" - ] - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ 
- 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - oshape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)" - % (self.onnx_node.name, packed_hls_type, packed_hls_type) - ] - - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" - ) - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") + # using Im2Col node to calculate output node = self.onnx_node - exp_ishape = self.get_normal_input_shape() - exp_oshape = self.get_normal_output_shape() - folded_ishape = self.get_folded_input_shape() - - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + ifm_dim = self.get_nodeattr("ImgDim") + stride = self.get_nodeattr("Stride") + ifm_ch = self.get_nodeattr("NumChannels") + # check if 1D or 2D case + if self.get_nodeattr("is1D"): + if self.get_nodeattr("is1D_unitx"): + ifm_dim_w = 1 + sw = 1 + ifm_dim_h = ifm_dim + sh = stride + else: + ifm_dim_h = 1 + sh = 1 + ifm_dim_w = ifm_dim + sw = stride else: - raise Exception( - """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - inp = context[node.input[0]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert ( - inp.shape == exp_ishape - ), """Input shape doesn't - match expected shape (numInputVectors, ImgDim, ImgDim, NumChannels).""" - export_idt = self.get_input_datatype() - - reshaped_input = inp.reshape(folded_ishape) - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + ifm_dim_h = ifm_dim_w = ifm_dim + sh = sw = stride + inp_values = context[node.input[0]] + oshape = context[node.output[0]].shape + ishape = inp_values.shape + inp = helper.make_tensor_value_info(node.input[0], TensorProto.FLOAT, ishape) + outp = helper.make_tensor_value_info(node.output[0], TensorProto.FLOAT, oshape) + im2col_node = helper.make_node( + "Im2Col", + [node.input[0]], + [node.output[0]], + domain="qonnx.custom_op.general", + stride=[sh, sw], + kernel_size=[1, 1], + input_shape="(1,{},{},{})".format(ifm_dim_h, ifm_dim_w, ifm_ch), + ) + graph_im2col = helper.make_graph( + nodes=[im2col_node], + name="single-im2col-exec", + inputs=[inp], + outputs=[outp], + ) - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - assert ( - context[node.output[0]].shape == exp_oshape - ), "cppsim did not produce expected output shape" - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = export_idt - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, 
out_npy_path, odt, out_shape, packed_bits, target_bits - ) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output shape doesn't match expected shape - (1, OutputDim, OutputDim, NumChannels).""" + opset_version = self.onnx_opset_version + opset_imports = [helper.make_opsetid("", opset_version)] + onnx_kwargs = {"opset_imports": opset_imports} + model_im2col = ModelWrapper(qonnx_make_model(graph_im2col, **onnx_kwargs)) + model_im2col.set_tensor_datatype(node.input[0], self.get_input_datatype()) + # use execution function from Im2Col node + # this automatically updates the execution context + inst = getCustomOp(im2col_node) + inst.execute_node(context, model_im2col.graph) diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams.py b/src/finn/custom_op/fpgadataflow/duplicatestreams.py new file mode 100644 index 0000000000..8943ffc9e3 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/duplicatestreams.py @@ -0,0 +1,177 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import warnings +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + + +class DuplicateStreams(HWCustomOp): + """Abstraction layer for HW implementation of DuplicateStreams""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = { + "NumChannels": ("i", True, 0), + "PE": ("i", True, 0), + # how many duplicated output streams to create + "NumOutputStreams": ("i", True, 0), + # FINN DataTypes for input + "inputDataType": ("s", True, ""), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_num_output_streams(self): + return self.get_nodeattr("NumOutputStreams") + + 
def get_normal_input_shape(self, ind=0): + ch = self.get_nodeattr("NumChannels") + vecs = list(self.get_nodeattr("numInputVectors")) + ishape = tuple(vecs + [ch]) + return ishape + + def get_folded_input_shape(self, ind=0): + ch = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + vecs = list(self.get_nodeattr("numInputVectors")) + assert ch % pe == 0, "PE must divide NumChannels" + folds = int(ch / pe) + folded_ishape = tuple(vecs + [folds, pe]) + return folded_ishape + + def get_normal_output_shape(self, ind=0): + # since the output shape of both out streams are the same + # return independently from index + return self.get_normal_input_shape() + + def get_folded_output_shape(self, ind=0): + # since the output shape of both out streams are the same + # return independently from index + return self.get_folded_input_shape() + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input shape." 
+ num_out = self.get_num_output_streams() + assert len(self.onnx_node.output) == num_out, "Unexpected number of outputs" + + oshape = self.get_normal_output_shape() + ret = super().make_const_shape_op(oshape) + ret.output[:] = self.onnx_node.output + return ret + + def infer_node_datatype(self, model): + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype()), + str(idt), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType", idt.name) + odt = self.get_output_datatype() + for my_out in self.onnx_node.output: + model.set_tensor_datatype(my_out, odt) + + def verify_node(self): + pass + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("inputDataType")] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + return DataType[self.get_nodeattr("inputDataType")] + + def get_instream_width(self, ind=0): + """Returns input stream width.""" + ibits = self.get_input_datatype().bitwidth() + pe = self.get_nodeattr("PE") + in_width = pe * ibits + return in_width + + def get_outstream_width(self, ind=0): + """Returns output stream width.""" + obits = self.get_output_datatype().bitwidth() + pe = self.get_nodeattr("PE") + out_width = pe * obits + return out_width + + def get_number_output_values(self): + return self.get_num_output_streams() * np.prod(self.get_folded_output_shape()[1:-1]) + + def get_exp_cycles(self): + # Channels/PE * batch size * fmdim * fmdim + return np.prod(self.get_folded_output_shape()[:-1]) + + def execute_node(self, context, graph): + # passing input to both outputs to make + # abstraction layer executable + node = self.onnx_node + inp = context[node.input[0]] + exp_shape = self.get_normal_input_shape() + + output = inp + output = np.asarray([output], dtype=np.float32).reshape(*exp_shape) + for 
outp in node.output: + context[outp] = output + + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + n_outputs = self.get_num_output_streams() + sname = self.hls_sname() + intf_names["m_axis"] = [] + for i in range(n_outputs): + intf_names["m_axis"].append( + ("out%d_%s" % (i, sname), self.get_outstream_width_padded()) + ) + return intf_names + + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out0": [], "out1": []}, + } + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) diff --git a/src/finn/custom_op/fpgadataflow/fmpadding.py b/src/finn/custom_op/fpgadataflow/fmpadding.py new file mode 100644 index 0000000000..5767028ea7 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/fmpadding.py @@ -0,0 +1,172 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import numpy as np
+import warnings
+from qonnx.core.datatype import DataType
+
+from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp
+
+
+class FMPadding(HWCustomOp):
+    """Abstraction layer for HW implementation of FMPadding.
+    Pads input image by given amount."""
+
+    def __init__(self, onnx_node, **kwargs):
+        super().__init__(onnx_node, **kwargs)
+
+    def get_nodeattr_types(self):
+        my_attrs = {
+            # spatial size of input images
+            "ImgDim": ("ints", True, []),  # [H, W] = [Y, X]
+            # total padding (per dimension) to apply
+            "Padding": (
+                "ints",
+                True,
+                [1, 1, 1, 1],
+            ),  # [H_begin, W_begin, H_end, W_end] = [Y_begin, X_begin, Y_end, X_end]
+            # number of channels in input image
+            "NumChannels": ("i", True, 0),
+            # SIMD Input parallelism
+            "SIMD": ("i", False, 1),
+            # FINN input datatype
+            "inputDataType": ("s", True, ""),
+            # shape describing input vecs per execution
+            "numInputVectors": ("i", False, 1),
+        }
+        my_attrs.update(super().get_nodeattr_types())
+        return my_attrs
+
+    def get_padded_odim(self):
+        "Return the padded spatial size of the output."
+        idim_h, idim_w = self.get_nodeattr("ImgDim")
+        pad = self.get_nodeattr("Padding")
+        pad_h = pad[0] + pad[2]
+        pad_w = pad[1] + pad[3]
+        odim_h = idim_h + pad_h
+        odim_w = idim_w + pad_w
+        return [odim_h, odim_w]
+
+    def get_exp_cycles(self):
+        odim_h, odim_w = self.get_padded_odim()
+        channels = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        batch_size = self.get_nodeattr("numInputVectors")
+        exp_cycles = (channels / simd) * batch_size * odim_h * odim_w
+        return int(exp_cycles)
+
+    def get_normal_input_shape(self, ind=0):
+        idim_h, idim_w = self.get_nodeattr("ImgDim")
+        num_ch = self.get_nodeattr("NumChannels")
+        ishape = (1, idim_h, idim_w, num_ch)
+        return ishape
+
+    def get_normal_output_shape(self, ind=0):
+        odim_h, odim_w = self.get_padded_odim()
+        num_ch = self.get_nodeattr("NumChannels")
+
+        oshape = (1, odim_h, odim_w, num_ch)
+        return oshape
+
+    def get_folded_input_shape(self, ind=0):
+        normal_ishape = list(self.get_normal_input_shape())
+        ifm_ch = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        assert ifm_ch % simd == 0, "SIMD must divide input channels"
+        fold = int(normal_ishape[-1] / simd)
+        folded_ishape = normal_ishape[:-1] + [fold, simd]
+        return tuple(folded_ishape)
+
+    def get_folded_output_shape(self, ind=0):
+        normal_oshape = list(self.get_normal_output_shape())
+        ifm_ch = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        assert ifm_ch % simd == 0, "SIMD must divide input channels"
+        fold = int(normal_oshape[-1] / simd)
+        folded_oshape = normal_oshape[:-1] + [fold, simd]
+        return tuple(folded_oshape)
+
+    def make_shape_compatible_op(self, model):
+        exp_ishape = self.get_normal_input_shape()
+        oshape = self.get_normal_output_shape()
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert ishape == exp_ishape, "Unexpected input shape for FMPadding."
+ return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype()), + str(idt), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType", idt.name) + model.set_tensor_datatype(node.output[0], idt) + + def verify_node(self): + pass + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + ret = DataType[self.get_nodeattr("inputDataType")] + # the hlslib op always pads with zeros, so ensure that the DataType + # is able to represent zeros + assert ret.allowed(0), "FMPadding_Batch DataType must support zero" + return ret + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output. (Same as input datatype)""" + return self.get_input_datatype() + + def get_instream_width(self, ind=0): + ibits = self.get_input_datatype().bitwidth() + simd = self.get_nodeattr("SIMD") + return ibits * simd + + def get_outstream_width(self, ind=0): + obits = self.get_output_datatype().bitwidth() + simd = self.get_nodeattr("SIMD") + return obits * simd + + def get_number_output_values(self): + folded_oshape = self.get_folded_output_shape() + return np.prod(folded_oshape[:-1]) + + def execute_node(self, context, graph): + # simulate behavior with Python functionality + node = self.onnx_node + pad = self.get_nodeattr("Padding") + inp_values = context[node.input[0]] + oshape = context[node.output[0]].shape + result = np.pad( + inp_values, ((0, 0), (pad[0], pad[2]), (pad[1], pad[3]), (0, 0)), "constant" + ) + context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape) diff --git a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py b/src/finn/custom_op/fpgadataflow/fmpadding_batch.py deleted file mode 100644 index dfc55d283f..0000000000 --- a/src/finn/custom_op/fpgadataflow/fmpadding_batch.py 
+++ /dev/null @@ -1,391 +0,0 @@ -# Copyright (c) 2020, Xilinx -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import numpy as np -import os -import warnings -from qonnx.core.datatype import DataType - -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy - - -class FMPadding_Batch(HLSCustomOp): - """Corresponds to finn-hlslib FMPadding_Batch function. 
- Pads input image by given amount.""" - - def __init__(self, onnx_node): - super().__init__(onnx_node) - - def get_nodeattr_types(self): - my_attrs = { - # spatial size of input images - "ImgDim": ("ints", True, []), # [H, W] = [Y, X] - # total padding (per dimension) to apply - "Padding": ( - "ints", - True, - [1, 1, 1, 1], - ), # [H_begin, W_begin, H_end, W_end] = [Y_begin, X_begin, Y_end, X_end] - # number of channels in input image - "NumChannels": ("i", True, 0), - # SIMD Input parallelism - "SIMD": ("i", False, 1), - # FINN input datatype - "inputDataType": ("s", True, ""), - # shape describing input vecs per execution - "numInputVectors": ("i", False, 1), - } - my_attrs.update(super().get_nodeattr_types()) - return my_attrs - - def get_padded_odim(self): - "Return the padded spatial size of the output." - idim_h, idim_w = self.get_nodeattr("ImgDim") - pad = self.get_nodeattr("Padding") - pad_h = pad[0] + pad[2] - pad_w = pad[1] + pad[3] - odim_h = idim_h + pad_h - odim_w = idim_w + pad_w - return [odim_h, odim_w] - - def get_exp_cycles(self): - odim_h, odim_w = self.get_padded_odim() - channels = self.get_nodeattr("NumChannels") - simd = self.get_nodeattr("SIMD") - batch_size = self.get_nodeattr("numInputVectors") - exp_cycles = (channels / simd) * batch_size * odim_h * odim_w - return int(exp_cycles) - - def get_normal_input_shape(self, ind=0): - idim_h, idim_w = self.get_nodeattr("ImgDim") - num_ch = self.get_nodeattr("NumChannels") - ishape = (1, idim_h, idim_w, num_ch) - return ishape - - def get_normal_output_shape(self, ind=0): - odim_h, odim_w = self.get_padded_odim() - num_ch = self.get_nodeattr("NumChannels") - - oshape = (1, odim_h, odim_w, num_ch) - return oshape - - def get_folded_input_shape(self, ind=0): - normal_ishape = list(self.get_normal_input_shape()) - ifm_ch = self.get_nodeattr("NumChannels") - simd = self.get_nodeattr("SIMD") - assert ifm_ch % simd == 0, "SIMD must divide input channels" - fold = int(normal_ishape[-1] / simd) - 
folded_ishape = normal_ishape[:-1] + [fold, simd] - return tuple(folded_ishape) - - def get_folded_output_shape(self, ind=0): - normal_oshape = list(self.get_normal_output_shape()) - ifm_ch = self.get_nodeattr("NumChannels") - simd = self.get_nodeattr("SIMD") - assert ifm_ch % simd == 0, "SIMD must divide input channels" - fold = int(normal_oshape[-1] / simd) - folded_oshape = normal_oshape[:-1] + [fold, simd] - return tuple(folded_oshape) - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - oshape = self.get_normal_output_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == exp_ishape, "Unexpect input shape for SameResize." - return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - idt = model.get_tensor_datatype(node.input[0]) - if idt != self.get_input_datatype(): - warn_str = "inputDataType changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype()), - str(idt), - ) - warnings.warn(warn_str) - self.set_nodeattr("inputDataType", idt.name) - model.set_tensor_datatype(node.output[0], idt) - - def verify_node(self): - pass - - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - ret = DataType[self.get_nodeattr("inputDataType")] - # the hlslib op always pads with zeros, so ensure that the DataType - # is able to represent zeros - assert ret.allowed(0), "FMPadding_Batch DataType must support zero" - return ret - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output. 
(Same as input datatype)""" - return self.get_input_datatype() - - def get_instream_width(self, ind=0): - ibits = self.get_input_datatype().bitwidth() - simd = self.get_nodeattr("SIMD") - return ibits * simd - - def get_outstream_width(self, ind=0): - obits = self.get_output_datatype().bitwidth() - simd = self.get_nodeattr("SIMD") - return obits * simd - - def get_number_output_values(self): - folded_oshape = self.get_folded_output_shape() - return np.prod(folded_oshape[:-1]) - - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"'] - - def defines(self, var): - idim_h, idim_w = self.get_nodeattr("ImgDim") - odim_h, odim_w = self.get_padded_odim() - pad = self.get_nodeattr("Padding") - pad_h = pad[0] + pad[2] - pad_w = pad[1] + pad[3] - is_square_img = idim_h == idim_w - is_square_pad = pad_h == pad_w - - if is_square_img and is_square_pad: - self.code_gen_dict["$DEFINES$"] = [ - """#define ImgDim1 {}\n#define OutputDim1 {}\n - #define PaddingBefore1 {}\n#define PaddingBehind1 {}\n - #define NumChannels1 {}\n#define SIMD1 {}\n - #define numReps {}\n""".format( - idim_h, - odim_h, - pad[0], - pad[2], - self.get_nodeattr("NumChannels"), - self.get_nodeattr("SIMD"), - self.get_nodeattr("numInputVectors"), - ) - ] - else: - self.code_gen_dict["$DEFINES$"] = [ - """ - #define OutputDim1_x {}\n - #define OutputDim1_y {}\n - #define PaddingLeft1 {}\n - #define PaddingRight1 {}\n - #define PaddingTop1 {}\n - #define PaddingBottom1 {}\n - #define NumChannels1 {}\n - #define SIMD1 {}\n - #define numReps {}\n - """.format( - odim_w, - odim_h, - pad[1], - pad[3], - pad[0], - pad[2], - self.get_nodeattr("NumChannels"), - self.get_nodeattr("SIMD"), - self.get_nodeattr("numInputVectors"), - ) - ] - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = 
dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) - ) - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0 ("in0");'.format(self.get_instream_width()) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) - ) - - def docompute(self): - in_t = self.get_input_datatype().get_hls_datatype_str() - node = self.onnx_node - - idim_h, idim_w = self.get_nodeattr("ImgDim") - pad = self.get_nodeattr("Padding") - pad_h = pad[0] + pad[2] - pad_w = pad[1] + pad[3] - is_square_img = idim_h == idim_w - is_square_pad = pad_h == pad_w - - if is_square_img and is_square_pad: - hls_call = node.op_type - self.code_gen_dict["$DOCOMPUTE$"] = [ - """{} (in0, out, numReps);""".format( - hls_call, in_t - ) - ] - else: - hls_call = "FMPadding_nonsquare_Batch" - self.code_gen_dict["$DOCOMPUTE$"] = [ - """{} (in0, out, numReps);""".format( - hls_call, in_t - ) - ] - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 
'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - oshape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)" - % (self.onnx_node.name, packed_hls_type, packed_hls_type) - ] - - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" - ) - - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") - node = self.onnx_node - exp_ishape = self.get_normal_input_shape() - exp_oshape = self.get_normal_output_shape() - folded_ishape = self.get_folded_input_shape() - - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - inp = context[node.input[0]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert ( - inp.shape == exp_ishape - ), """Input shape doesn't - match expected shape (1, ImgDim_h, ImgDim_w, NumChannels).""" - export_idt = self.get_input_datatype() - - reshaped_input = inp.reshape(folded_ishape) - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - assert ( - context[node.output[0]].shape == exp_oshape - ), "cppsim did not produce expected output shape" - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = export_idt - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output shape doesn't match expected shape - (1, OutputDim_H, OutputDim_W, NumChannels).""" diff --git a/src/finn/custom_op/fpgadataflow/fmpadding_pixel.py b/src/finn/custom_op/fpgadataflow/fmpadding_pixel.py new file mode 100644 index 0000000000..b1f9900070 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/fmpadding_pixel.py @@ -0,0 +1,175 @@ +# Copyright (c) 2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of Xilinx nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import numpy as np +import warnings +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + + +class FMPadding_Pixel(HWCustomOp): + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = { + # spatial size of input images + "ImgDim": ("ints", True, []), + # stride to apply, can be non-square + "Stride": ("ints", True, []), + # number of channels in input image + "NumChannels": ("i", True, 0), + # SIMD Input parallelism + "SIMD": ("i", False, 1), + # FINN input datatype + "inputDataType": ("s", True, ""), + # shape describing input vecs per execution + "numInputVectors": ("i", False, 1), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_padded_odim(self): + "Return the padded spatial size of the output." 
+        idim_h, idim_w = self.get_nodeattr("ImgDim")
+        stride_h, stride_w = self.get_nodeattr("Stride")
+        odim_h = idim_h + (idim_h - 1) * (stride_h - 1)
+        odim_w = idim_w + (idim_w - 1) * (stride_w - 1)
+        return [odim_h, odim_w]
+
+    def get_exp_cycles(self):
+        odim_h, odim_w = self.get_padded_odim()
+        channels = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        batch_size = self.get_nodeattr("numInputVectors")
+        exp_cycles = (channels / simd) * batch_size * odim_h * odim_w
+        return int(exp_cycles)
+
+    def get_normal_input_shape(self, ind=0):
+        idim_h, idim_w = self.get_nodeattr("ImgDim")
+        num_ch = self.get_nodeattr("NumChannels")
+        ishape = (1, idim_h, idim_w, num_ch)
+        return ishape
+
+    def get_normal_output_shape(self, ind=0):
+        odim_h, odim_w = self.get_padded_odim()
+        num_ch = self.get_nodeattr("NumChannels")
+        oshape = (1, odim_h, odim_w, num_ch)
+        return oshape
+
+    def get_folded_input_shape(self, ind=0):
+        normal_ishape = list(self.get_normal_input_shape())
+        ifm_ch = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        assert ifm_ch % simd == 0, "SIMD must divide input channels"
+        fold = int(normal_ishape[-1] / simd)
+        folded_ishape = normal_ishape[:-1] + [fold, simd]
+        return tuple(folded_ishape)
+
+    def get_folded_output_shape(self, ind=0):
+        normal_oshape = list(self.get_normal_output_shape())
+        ifm_ch = self.get_nodeattr("NumChannels")
+        simd = self.get_nodeattr("SIMD")
+        assert ifm_ch % simd == 0, "SIMD must divide input channels"
+        fold = int(normal_oshape[-1] / simd)
+        folded_oshape = normal_oshape[:-1] + [fold, simd]
+        return tuple(folded_oshape)
+
+    def make_shape_compatible_op(self, model):
+        exp_ishape = self.get_normal_input_shape()
+        oshape = self.get_normal_output_shape()
+        ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0]))
+        assert ishape == exp_ishape, "Unexpected input shape for FMPadding_Pixel."
+ return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype()), + str(idt), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType", idt.name) + model.set_tensor_datatype(node.output[0], idt) + + def verify_node(self): + pass + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + ret = DataType[self.get_nodeattr("inputDataType")] + # the hlslib op always pads with zeros, so ensure that the DataType + # is able to represent zeros + assert ret.allowed(0), "FMPadding_Pixel DataType must support zero" + return ret + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output. (Same as input datatype)""" + return self.get_input_datatype() + + def get_instream_width(self, ind=0): + ibits = self.get_input_datatype().bitwidth() + simd = self.get_nodeattr("SIMD") + return ibits * simd + + def get_outstream_width(self, ind=0): + obits = self.get_output_datatype().bitwidth() + simd = self.get_nodeattr("SIMD") + return obits * simd + + def get_number_output_values(self): + folded_oshape = self.get_folded_output_shape() + return np.prod(folded_oshape[:-1]) + + def execute_node(self, context, graph): + # simulate behavior with Python functionality + node = self.onnx_node + s_h, s_w = self.get_nodeattr("Stride") + inp_values = context[node.input[0]] + ishape = inp_values.shape + result = np.zeros( + ( + ishape[0], + ishape[1] + (ishape[1] - 1) * (s_h - 1), + ishape[2] + (ishape[2] - 1) * (s_w - 1), + ishape[3], + ) + ) + for b in range(ishape[0]): + for h in range(ishape[1]): + for w in range(ishape[2]): + oh = h * s_h + ow = w * s_w + result[b, oh, ow, :] = inp_values[b, h, w, :] + oshape = context[node.output[0]].shape + context[node.output[0]] = np.asarray(result, 
dtype=np.float32).reshape(oshape) diff --git a/src/finn/custom_op/fpgadataflow/globalaccpool.py b/src/finn/custom_op/fpgadataflow/globalaccpool.py new file mode 100644 index 0000000000..4008cdc7c9 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/globalaccpool.py @@ -0,0 +1,160 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import numpy as np +import warnings +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + + +class GlobalAccPool(HWCustomOp): + """Abstraction layer for HW implementation of GlobalAccPool""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = { + "NumChannels": ("i", True, 0), + "PE": ("i", True, 0), + # FINN DataTypes for input + "inputDataType": ("s", True, ""), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_normal_input_shape(self, ind=0): + ch = self.get_nodeattr("NumChannels") + vecs = list(self.get_nodeattr("numInputVectors")) + ishape = tuple(vecs + [ch]) + return ishape + + def get_folded_input_shape(self, ind=0): + ch = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + vecs = list(self.get_nodeattr("numInputVectors")) + assert ch % pe == 0, "PE must divide NumChannels" + folds = int(ch / pe) + folded_ishape = tuple(vecs + [folds, pe]) + return folded_ishape + + def get_normal_output_shape(self, ind=0): + ch = self.get_nodeattr("NumChannels") + vecs = list(self.get_nodeattr("numInputVectors")) + if len(vecs) == 1: + oshape = tuple(vecs + [ch]) + elif len(vecs) == 3: + oshape = tuple([vecs[0]] + [1, 1, ch]) + return oshape + + def get_folded_output_shape(self, ind=0): + ch = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + unfolded_shape = list(self.get_normal_output_shape()) + assert ch % pe == 0, "PE must divide NumChannels" + folds = int(ch / pe) + oshape = tuple(unfolded_shape[:-1] + [folds, pe]) + return oshape + + def make_shape_compatible_op(self, model): + exp_ishape = 
self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input shape." + return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype()), + str(idt), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType", idt.name) + odt = self.get_output_datatype() + model.set_tensor_datatype(self.onnx_node.output[0], odt) + + def verify_node(self): + pass + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("inputDataType")] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + # determine data type from image size and input type + idt = DataType[self.get_nodeattr("inputDataType")] + vecs = list(self.get_nodeattr("numInputVectors")) + npixels = vecs[-1] * vecs[-2] + if idt.signed(): + extreme_value = npixels * idt.min() + else: + extreme_value = npixels * idt.max() + return DataType.get_smallest_possible(extreme_value) + + def get_instream_width(self, ind=0): + """Returns input stream width.""" + ibits = self.get_input_datatype().bitwidth() + pe = self.get_nodeattr("PE") + in_width = pe * ibits + return in_width + + def get_outstream_width(self, ind=0): + """Returns output stream width.""" + obits = self.get_output_datatype().bitwidth() + pe = self.get_nodeattr("PE") + out_width = pe * obits + return out_width + + def get_number_output_values(self): + return np.prod(self.get_folded_output_shape()[1:-1]) + + def get_exp_cycles(self): + # Channels/PE * batch size * idim * idim + Channels/PE + ch = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + folds = int(ch / pe) + return 
int(np.prod(self.get_folded_input_shape()[:-1]) + folds) + + def execute_node(self, context, graph): + # simulate behavior with Python functionality + node = self.onnx_node + inp_values = context[node.input[0]] + oshape = context[node.output[0]].shape + result = np.apply_over_axes(np.sum, inp_values, [1, 2]) + context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape) diff --git a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py b/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py deleted file mode 100644 index e7fa5bc004..0000000000 --- a/src/finn/custom_op/fpgadataflow/globalaccpool_batch.py +++ /dev/null @@ -1,340 +0,0 @@ -# Copyright (c) 2020, Xilinx -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import numpy as np -import os -import warnings -from qonnx.core.datatype import DataType - -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy - - -class GlobalAccPool_Batch(HLSCustomOp): - """Class that corresponds to finn-hlslib AccPool_Batch function.""" - - def __init__(self, onnx_node): - super().__init__(onnx_node) - - def get_nodeattr_types(self): - my_attrs = { - "NumChannels": ("i", True, 0), - "PE": ("i", True, 0), - # FINN DataTypes for input - "inputDataType": ("s", True, ""), - # number of input vectors, examples: - # [1] is a single vector (like a FC layer with batch=1) - # [4] is four vectors (like a FC layer with batch=4) - # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) - "numInputVectors": ("ints", False, [1]), - } - my_attrs.update(super().get_nodeattr_types()) - return my_attrs - - def get_normal_input_shape(self, ind=0): - ch = self.get_nodeattr("NumChannels") - vecs = list(self.get_nodeattr("numInputVectors")) - ishape = tuple(vecs + [ch]) - return ishape - - def get_folded_input_shape(self, ind=0): - ch = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - vecs = list(self.get_nodeattr("numInputVectors")) - assert ch % pe == 0, "PE must divide NumChannels" - folds = int(ch / pe) - folded_ishape = tuple(vecs + [folds, pe]) - return folded_ishape - - def get_normal_output_shape(self, ind=0): - ch 
= self.get_nodeattr("NumChannels") - vecs = list(self.get_nodeattr("numInputVectors")) - if len(vecs) == 1: - oshape = tuple(vecs + [ch]) - elif len(vecs) == 3: - oshape = tuple([vecs[0]] + [1, 1, ch]) - return oshape - - def get_folded_output_shape(self, ind=0): - ch = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - unfolded_shape = list(self.get_normal_output_shape()) - assert ch % pe == 0, "PE must divide NumChannels" - folds = int(ch / pe) - oshape = tuple(unfolded_shape[:-1] + [folds, pe]) - return oshape - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - oshape = self.get_normal_output_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == exp_ishape, "Unexpected input shape." - return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - idt = model.get_tensor_datatype(node.input[0]) - if idt != self.get_input_datatype(): - warn_str = "inputDataType changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype()), - str(idt), - ) - warnings.warn(warn_str) - self.set_nodeattr("inputDataType", idt.name) - odt = self.get_output_datatype() - model.set_tensor_datatype(self.onnx_node.output[0], odt) - - def verify_node(self): - info_messages = [] - # verify that "backend" is set to "fpgadataflow" - backend_value = self.get_nodeattr("backend") - if backend_value == "fpgadataflow": - info_messages.append("Attribute backend is set correctly") - else: - info_messages.append('Attribute backend should be set to "fpgadataflow"') - - # verify that all necessary attributes exist - try: - self.get_nodeattr("code_gen_dir_cppsim") - self.get_nodeattr("executable_path") - self.get_nodeattr("NumChannels") - self.get_nodeattr("PE") - self.get_nodeattr("inputDataType") - info_messages.append("All necessary attributes exist") - except Exception: - info_messages.append( - """The required GlobalAccPool_Batch attributes do 
not exist.""" - ) - - # verify that input data is 2D - if len(self.get_nodeattr("numInputVectors")) != 3: - info_messages.append("""GlobalAccPool_Batch requires 2D data input.""") - raise Exception - - return info_messages - - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("inputDataType")] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - # determine data type from image size and input type - idt = DataType[self.get_nodeattr("inputDataType")] - vecs = list(self.get_nodeattr("numInputVectors")) - npixels = vecs[-1] * vecs[-2] - if idt.signed(): - extreme_value = npixels * idt.min() - else: - extreme_value = npixels * idt.max() - return DataType.get_smallest_possible(extreme_value) - - def get_instream_width(self, ind=0): - """Returns input stream width.""" - ibits = self.get_input_datatype().bitwidth() - pe = self.get_nodeattr("PE") - in_width = pe * ibits - return in_width - - def get_outstream_width(self, ind=0): - """Returns output stream width.""" - obits = self.get_output_datatype().bitwidth() - pe = self.get_nodeattr("PE") - out_width = pe * obits - return out_width - - def get_number_output_values(self): - return np.prod(self.get_folded_output_shape()[1:-1]) - - def get_exp_cycles(self): - # Channels/PE * batch size * idim * idim + Channels/PE - ch = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - folds = int(ch / pe) - return int(np.prod(self.get_folded_input_shape()[:-1]) + folds) - - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") - node = self.onnx_node - exp_ishape = self.get_normal_input_shape() - exp_oshape = self.get_normal_output_shape() - folded_ishape = self.get_folded_input_shape() - - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for 
attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - inp = context[node.input[0]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert inp.shape == exp_ishape, """Input shape doesn't match expected shape .""" - export_idt = self.get_input_datatype() - # reshape input into folded form - inp = inp.reshape(folded_ishape) - # make copy before saving array - reshaped_input = inp.copy() - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - assert ( - context[node.output[0]].shape == exp_oshape - ), "cppsim \ - did not produce expected output shape" - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = self.get_output_datatype() - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output shape doesn't match expected shape.""" - - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "maxpool.h"'] - - def defines(self, var): - self.code_gen_dict["$DEFINES$"] = [] - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) - ) - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0 ("in0");'.format(self.get_instream_width()) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) - ) - - def docompute(self): - self.code_gen_dict["$DOCOMPUTE$"] = [ - """AccPool_Batch<{}, {}, {}, {}, {}> (in0, out, 1);""".format( - self.get_normal_input_shape()[1], - self.get_nodeattr("NumChannels"), - self.get_input_datatype().get_hls_datatype_str(), - self.get_nodeattr("PE"), - self.get_output_datatype().get_hls_datatype_str(), - ) - ] - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = 
self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - oshape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0, - hls::stream> &out)""".format( - self.onnx_node.name, - self.get_instream_width(), - self.get_outstream_width(), - ) - ] - - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" - ) diff --git a/src/finn/custom_op/fpgadataflow/hls/__init__.py b/src/finn/custom_op/fpgadataflow/hls/__init__.py new file mode 100644 index 0000000000..405c47a08d --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/__init__.py @@ -0,0 +1,81 @@ +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from finn.custom_op.fpgadataflow.hls.addstreams_hls import AddStreams_hls +from finn.custom_op.fpgadataflow.hls.channelwise_op_hls import ChannelwiseOp_hls +from finn.custom_op.fpgadataflow.hls.checksum_hls import CheckSum_hls +from finn.custom_op.fpgadataflow.hls.concat_hls import StreamingConcat_hls +from finn.custom_op.fpgadataflow.hls.convolutioninputgenerator_hls import ( + ConvolutionInputGenerator_hls, +) +from finn.custom_op.fpgadataflow.hls.downsampler_hls import DownSampler_hls +from finn.custom_op.fpgadataflow.hls.duplicatestreams_hls import DuplicateStreams_hls +from finn.custom_op.fpgadataflow.hls.fmpadding_hls import FMPadding_hls +from finn.custom_op.fpgadataflow.hls.fmpadding_pixel_hls import FMPadding_Pixel_hls +from finn.custom_op.fpgadataflow.hls.globalaccpool_hls import GlobalAccPool_hls +from finn.custom_op.fpgadataflow.hls.iodma_hls import IODMA_hls +from finn.custom_op.fpgadataflow.hls.labelselect_hls import LabelSelect_hls +from finn.custom_op.fpgadataflow.hls.lookup_hls import Lookup_hls +from finn.custom_op.fpgadataflow.hls.matrixvectoractivation_hls import MVAU_hls +from finn.custom_op.fpgadataflow.hls.pool_hls import Pool_hls +from 
finn.custom_op.fpgadataflow.hls.streamingdatawidthconverter_hls import ( + StreamingDataWidthConverter_hls, +) +from finn.custom_op.fpgadataflow.hls.streamingeltwise_hls import StreamingEltwise_hls +from finn.custom_op.fpgadataflow.hls.streamingmaxpool_hls import StreamingMaxPool_hls +from finn.custom_op.fpgadataflow.hls.thresholding_hls import Thresholding_hls +from finn.custom_op.fpgadataflow.hls.tlastmarker_hls import TLastMarker_hls +from finn.custom_op.fpgadataflow.hls.upsampler_hls import UpsampleNearestNeighbour_hls +from finn.custom_op.fpgadataflow.hls.vectorvectoractivation_hls import VVAU_hls + +custom_op = dict() + +# make sure new HLSCustomOp subclasses are imported here so that they get +# registered and plug in correctly into the infrastructure +custom_op["AddStreams_hls"] = AddStreams_hls +custom_op["ChannelwiseOp_hls"] = ChannelwiseOp_hls +custom_op["CheckSum_hls"] = CheckSum_hls +custom_op["ConvolutionInputGenerator_hls"] = ConvolutionInputGenerator_hls +custom_op["DownSampler_hls"] = DownSampler_hls +custom_op["DuplicateStreams_hls"] = DuplicateStreams_hls +custom_op["FMPadding_hls"] = FMPadding_hls +custom_op["FMPadding_Pixel_hls"] = FMPadding_Pixel_hls +custom_op["GlobalAccPool_hls"] = GlobalAccPool_hls +custom_op["IODMA_hls"] = IODMA_hls +custom_op["LabelSelect_hls"] = LabelSelect_hls +custom_op["Lookup_hls"] = Lookup_hls +custom_op["Pool_hls"] = Pool_hls +custom_op["StreamingConcat_hls"] = StreamingConcat_hls +custom_op["StreamingEltwise_hls"] = StreamingEltwise_hls +custom_op["StreamingDataWidthConverter_hls"] = StreamingDataWidthConverter_hls +custom_op["StreamingMaxPool_hls"] = StreamingMaxPool_hls +custom_op["Thresholding_hls"] = Thresholding_hls +custom_op["TLastMarker_hls"] = TLastMarker_hls +custom_op["UpsampleNearestNeighbour_hls"] = UpsampleNearestNeighbour_hls +custom_op["MVAU_hls"] = MVAU_hls +custom_op["VVAU_hls"] = VVAU_hls diff --git a/src/finn/custom_op/fpgadataflow/addstreams_batch.py 
b/src/finn/custom_op/fpgadataflow/hls/addstreams_hls.py similarity index 53% rename from src/finn/custom_op/fpgadataflow/addstreams_batch.py rename to src/finn/custom_op/fpgadataflow/hls/addstreams_hls.py index cd0af6b3ab..a3f0e043f8 100644 --- a/src/finn/custom_op/fpgadataflow/addstreams_batch.py +++ b/src/finn/custom_op/fpgadataflow/hls/addstreams_hls.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,81 +28,24 @@ import numpy as np import os -import warnings -from qonnx.core.datatype import DataType -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.addstreams import AddStreams +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy -class AddStreams_Batch(HLSCustomOp): +class AddStreams_hls(AddStreams, HLSBackend): """Class that corresponds to finn-hlslib AddStreams_Batch function.""" - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): - my_attrs = super().get_nodeattr_types() - my_attrs.update( - { - "NumChannels": ("i", True, ""), - "PE": ("i", True, ""), - # FINN DataTypes for inputs; output datatype inferred from input - "inputDataType": ("s", True, ""), - # number of input vectors, examples: - # [1] is a single vector (like a FC layer with batch=1) - # [4] is four vectors (like a FC layer with batch=4) - # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) - "numInputVectors": ("ints", False, [1]), - "inFIFODepths": ("ints", False, [2, 2]), - } - ) + my_attrs = {} + my_attrs.update(AddStreams.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs - def get_normal_input_shape(self, ind=0): - ich = 
self.get_nodeattr("NumChannels") - vecs = list(self.get_nodeattr("numInputVectors")) - ishape = tuple(vecs + [ich]) - return ishape - - def get_folded_input_shape(self, ind=0): - ich = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - assert ich % pe == 0, "PE must divide NumChannels" - vecs = list(self.get_nodeattr("numInputVectors")) - ishape = tuple(vecs + [ich // pe, pe]) - return ishape - - def get_normal_output_shape(self, ind=0): - return self.get_normal_input_shape() - - def get_folded_output_shape(self, ind=0): - return self.get_folded_input_shape() - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - oshape = self.get_normal_output_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == exp_ishape, "Unexpected input1 shape." - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[1])) - assert ishape == exp_ishape, "Unexpected input2 shape." - return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - idt = model.get_tensor_datatype(node.input[0]) - if idt != self.get_input_datatype(): - warn_str = "inputDataType changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype()), - str(idt), - ) - warnings.warn(warn_str) - self.set_nodeattr("inputDataType", idt.name) - # enforce output data type (calculated based on idt) - odt = self.get_output_datatype() - model.set_tensor_datatype(self.onnx_node.output[0], odt) - def verify_node(self): info_messages = [] # verify that "backend" is set to "fpgadataflow" @@ -121,48 +64,10 @@ def verify_node(self): self.get_nodeattr("inputDataType") info_messages.append("All necessary attributes exist") except Exception: - info_messages.append( - """The required LabelSelect_Batch attributes do not exist.""" - ) + info_messages.append("""The required LabelSelect_Batch attributes do not exist.""") return info_messages - def get_input_datatype(self, ind=0): - 
"""Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("inputDataType")] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - # we need to set output datatype to the next larger int or uint - # enhancement: consider specifying w/ explicit outputDataType attribute - # to allow overflow and use the same idt if user wants - idt = DataType[self.get_nodeattr("inputDataType")] - if idt.signed(): - return DataType.get_smallest_possible(2 * idt.min()) - else: - return DataType.get_smallest_possible(2 * idt.max()) - - def get_instream_width(self, ind=0): - """Returns input stream width.""" - ibits = self.get_input_datatype().bitwidth() - pe = self.get_nodeattr("PE") - in_width = pe * ibits - return in_width - - def get_outstream_width(self, ind=0): - """Returns output stream width.""" - obits = self.get_output_datatype().bitwidth() - pe = self.get_nodeattr("PE") - out_width = pe * obits - return out_width - - def get_number_output_values(self): - return np.prod(self.get_folded_output_shape()[:-1]) - - def get_exp_cycles(self): - # Channels/PE * batch size * fmdim * fmdim - return np.prod(self.get_folded_output_shape()[:-1]) - def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") node = self.onnx_node @@ -184,9 +89,7 @@ def execute_node(self, context, graph): inp = context[node.input[0]] assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert ( - inp.shape == exp_ishape - ), """Input0 shape doesn't match expected shape .""" + assert inp.shape == exp_ishape, """Input0 shape doesn't match expected shape .""" export_idt = self.get_input_datatype() # reshape input into folded form inp = inp.reshape(folded_ishape) @@ -197,9 +100,7 @@ def execute_node(self, context, graph): # exact same thing for input1 inp = context[node.input[1]] assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert ( - inp.shape == exp_ishape - ), """Input1 shape doesn't match expected shape 
.""" + assert inp.shape == exp_ishape, """Input1 shape doesn't match expected shape .""" export_idt = self.get_input_datatype() # reshape input into folded form inp = inp.reshape(folded_ishape) @@ -268,106 +169,85 @@ def read_npy_data(self): self.code_gen_dict["$READNPYDATA$"] = [] npy_in = "%s/input_0.npy" % code_gen_dir self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) ) npy_in = "%s/input_1.npy" % code_gen_dir self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in1);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + 'npy2apintstream<%s, %s, %d, %s>("%s", in1_%s);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) ) def strm_decl(self): self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0 ("in0");'.format(self.get_instream_width()) + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) ) self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in1 ("in1");'.format(self.get_instream_width()) + 'hls::stream> in1_{} ("in1_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) ) self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) ) def docompute(self): - node = self.onnx_node + hls_call = "AddStreams_Batch" self.code_gen_dict["$DOCOMPUTE$"] = [ - """{}<{}, {}, {}, {}, {}> (in0, in1, out, 1);""".format( - node.op_type, + """{}<{}, {}, {}, {}, {}> (in0_{}, in1_{}, out_{}, 
1);""".format( + hls_call, self.get_nodeattr("PE"), self.get_input_datatype().get_hls_datatype_str(), self.get_input_datatype().get_hls_datatype_str(), self.get_output_datatype().get_hls_datatype_str(), self.get_number_output_values(), + self.hls_sname(), + self.hls_sname(), + self.hls_sname(), ) ] - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - oshape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - def blackboxfunction(self): self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0, hls::stream> &in1, - hls::stream> &out)""".format( + """void {}(hls::stream> &in0_{}, hls::stream> &in1_{}, + hls::stream> &out_{})""".format( self.onnx_node.name, self.get_nodeattr("PE") * self.get_input_datatype().bitwidth(), + self.hls_sname(), self.get_nodeattr("PE") * self.get_input_datatype().bitwidth(), + self.hls_sname(), self.get_nodeattr("PE") * self.get_output_datatype().bitwidth(), + self.hls_sname(), ) ] def pragmas(self): self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() ] self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=in1 name=in1_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() + "#pragma HLS INTERFACE 
axis port=in1_" + self.hls_sname() ) self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() ) - - def get_verilog_top_module_intf_names(self): - intf_names = super().get_verilog_top_module_intf_names() - sname = self.hls_sname() - swidth = self.get_instream_width_padded() - intf_names["s_axis"] = [(x + "_" + sname, swidth) for x in ["in0", "in1"]] - return intf_names - - def derive_characteristic_fxns(self, period): - n_inps = np.prod(self.get_folded_input_shape()[:-1]) - io_dict = { - "inputs": { - "in0": [0 for i in range(n_inps)], - "in1": [0 for i in range(n_inps)], - }, - "outputs": {"out": []}, - } - super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") diff --git a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py b/src/finn/custom_op/fpgadataflow/hls/channelwise_op_hls.py similarity index 66% rename from src/finn/custom_op/fpgadataflow/channelwise_op_batch.py rename to src/finn/custom_op/fpgadataflow/hls/channelwise_op_hls.py index 46adca680d..14efa113dd 100644 --- a/src/finn/custom_op/fpgadataflow/channelwise_op_batch.py +++ b/src/finn/custom_op/fpgadataflow/hls/channelwise_op_hls.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,19 +28,17 @@ import numpy as np import os -import warnings from math import ceil from qonnx.core.datatype import DataType -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.channelwise_op import ChannelwiseOp +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend from finn.util.data_packing import ( npy_to_rtlsim_input, numpy_to_hls_code, rtlsim_output_to_npy, ) -from . 
import templates - # ONNX i/o tensor shape assumptions for channelwise ops: # input 0 is the input tensor, shape (..., NumChannels) # input 1 is the channelwise parameter tensor, shape (NumChannels, params_per_channel) @@ -48,118 +46,21 @@ # the ... here can be any shape (representing groups of vectors) -def get_smallest_possible(vals): - """Returns smallest (fewest bits) possible DataType that can represent - value. Prefers unsigned integers where possible.""" - vals = np.array(vals, dtype=np.float64) - for v in vals: - assert int(v) == v, "Error float value" - - for k in DataType.get_accumulator_dt_cands(): - dt = DataType[k] - - if dt in [DataType["BIPOLAR"], DataType["TERNARY"], DataType["FLOAT32"]]: - # not currently supported - continue - - if (dt.min() <= vals).all() and (vals <= dt.max()).all(): - return dt - - warnings.warn( - """InferChannelwiseLinearLayer: Output values may not be - representable with supported data types. - Setting maximum width data type available. - This will lead to errors if there are no constrains on the input - """ - ) - - if (0 <= vals).all(): - return DataType["UINT64"] - else: - return DataType["INT64"] - - -class ChannelwiseOp_Batch(HLSCustomOp): +class ChannelwiseOp_hls(ChannelwiseOp, HLSBackend): """Class that corresponds to finn-hls Thresholding_Batch function. It can implement a variety of channel-wise parametrized operations, including Add, Mul and multi-thresholding. 
""" - def __init__(self, onnx_node): - super().__init__(onnx_node) - self.decoupled_wrapper = templates.decoupled_wrapper + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): - my_attrs = { - # channelwise "map" function to apply: - # one of cmp_le, cmp_ge, add, mul - "Func": ("s", False, "cmp_le", {"cmp_le", "cmp_ge", "add", "mul"}), - "PE": ("i", True, 0), - "NumChannels": ("i", True, 0), - # string defining memory resource type for parameters - "ram_style": ("s", False, "distributed", {"distributed", "block"}), - # FINN DataTypes for inputs, weights, outputs - "inputDataType": ("s", True, ""), - "paramDataType": ("s", True, ""), - "outputDataType": ("s", True, ""), - # number of input vectors, examples: - # [1] is a single vector (like a FC layer with batch=1) - # [4] is four vectors (like a FC layer with batch=4) - # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) - "numInputVectors": ("ints", False, [1]), - } - my_attrs.update(super().get_nodeattr_types()) + my_attrs = {} + my_attrs.update(ChannelwiseOp.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs - def calc_tmem(self): - """Calculates and returns TMEM, the depth of the memory used - to store the channelwise op parameters.""" - chn = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - return chn // pe - - def make_shape_compatible_op(self, model): - oshape = self.get_normal_output_shape() - # implement tensor with correct shape - return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - # check input datatype against property - idt = model.get_tensor_datatype(node.input[0]) - - exp_idt_name = self.get_nodeattr("inputDataType") - if exp_idt_name != idt.name: - func = self.get_nodeattr("Func") - assert func in ["add", "mul"], "Bad input DataType for ChannelwiseOp layer" - - self.set_nodeattr("inputDataType", idt.name) 
- # update the func in ['add','mul'] cases - - # get parameter ranges - param = model.get_initializer(node.input[1]) - param_min = min(param.flatten()) - param_max = max(param.flatten()) - - # set function and determine output data type - if func == "add": - out_min = idt.min() + param_min - out_max = idt.max() + param_max - odt = get_smallest_possible([out_min, out_max]) - elif func == "mul": - possible_limits = [] - possible_limits += [idt.min() * param_min] - possible_limits += [idt.min() * param_max] - possible_limits += [idt.max() * param_min] - possible_limits += [idt.max() * param_max] - odt = get_smallest_possible(possible_limits) - - self.set_nodeattr("outputDataType", odt.name) - - # set output datatype from property - odt = self.get_output_datatype() - model.set_tensor_datatype(node.output[0], odt) - def verify_node(self): info_messages = [] # verify that "backend" is set to "fpgadataflow" @@ -181,9 +82,7 @@ def verify_node(self): self.get_nodeattr("outputDataType") info_messages.append("All necessary attributes exist") except Exception: - info_messages.append( - """The required Threshold_Batch attributes do not exist.""" - ) + info_messages.append("""The required Threshold_Batch attributes do not exist.""") return info_messages @@ -218,52 +117,6 @@ def lut_estimation(self): # total cost return comparator_cost + lutram_cost - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("inputDataType")] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - return DataType[self.get_nodeattr("outputDataType")] - - def get_instream_width(self, ind=0): - i_bits = self.get_input_datatype().bitwidth() - return i_bits * self.get_nodeattr("PE") - - def get_outstream_width(self, ind=0): - o_bits = self.get_output_datatype().bitwidth() - return o_bits * self.get_nodeattr("PE") - - def get_folded_input_shape(self, ind=0): - ich = self.get_nodeattr("NumChannels") - pe = 
self.get_nodeattr("PE") - fold = ich // pe - vecs = list(self.get_nodeattr("numInputVectors")) - folded_input_shape = tuple(vecs + [fold, pe]) - return folded_input_shape - - def get_folded_output_shape(self, ind=0): - # same shape as input - return self.get_folded_input_shape() - - def get_normal_input_shape(self, ind=0): - ich = self.get_nodeattr("NumChannels") - vecs = list(self.get_nodeattr("numInputVectors")) - normal_input_shape = tuple(vecs + [ich]) - return normal_input_shape - - def get_normal_output_shape(self, ind=0): - # same shape as input - return self.get_normal_input_shape() - - def get_number_output_values(self): - nf = np.prod(self.get_folded_output_shape()[:-1]) - return nf - - def get_exp_cycles(self): - # Channels/PE * batch size * fmdim * fmdim - return np.prod(self.get_folded_output_shape()[:-1]) - def get_template_param_values(self): """Returns the template parameter values according to input, output and weight data types.""" @@ -303,9 +156,7 @@ def get_hls_compatible_parameter_tensor(self, orig_param_vector): assert (orig_param_vector.astype(np.int32) == orig_param_vector).all() ret = orig_param_vector - assert ( - ret.shape[0] == chn - ), "Cardinality of parameter vector is not as expected (chn)" + assert ret.shape[0] == chn, "Cardinality of parameter vector is not as expected (chn)" # distribute rows between PEs ret = ret.reshape(tmem, pe).transpose() @@ -327,9 +178,7 @@ def generate_params(self, model, path): parameter_tensor = self.get_hls_compatible_parameter_tensor(parameters) pdt = DataType[self.get_nodeattr("paramDataType")] - parameters_hls_code = numpy_to_hls_code( - parameter_tensor, pdt, "parameters", False, True - ) + parameters_hls_code = numpy_to_hls_code(parameter_tensor, pdt, "parameters", False, True) # get input data type export_idt = self.get_input_datatype() if self.get_input_datatype() == DataType["BIPOLAR"]: @@ -433,9 +282,7 @@ def execute_node(self, context, graph): elif mode == "rtlsim": sim = self.get_rtlsim() 
nbits = self.get_instream_width() - inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) + inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) super().reset_rtlsim(sim) super().toggle_clk(sim) output = self.rtlsim(sim, inp) @@ -444,9 +291,7 @@ def execute_node(self, context, graph): packed_bits = self.get_outstream_width() out_npy_path = "{}/output.npy".format(code_gen_dir) out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) + rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) # load and reshape output output = np.load(out_npy_path) @@ -465,7 +310,6 @@ def global_includes(self): self.code_gen_dict["$GLOBALS$"] = ['#include "activations.hpp"'] self.code_gen_dict["$GLOBALS$"] += ['#include "params.h"'] - # TODO check and add whatever missing def defines(self, var): numInputVectors = list(self.get_nodeattr("numInputVectors")) numReps = numInputVectors[0] @@ -489,17 +333,15 @@ def read_npy_data(self): self.code_gen_dict["$READNPYDATA$"] = [] # note: the innermost dim is reversed for the input self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0, false);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) - ) - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0 ("in0");'.format(self.get_instream_width()) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) ) def docompute(self): @@ -515,10 +357,12 @@ def docompute(self): raise Exception("""Unexpeted input shape""") self.code_gen_dict["$DOCOMPUTE$"] = [ 
"""Thresholding_Batch<{}, NumChannels1, PE1, {}, {}> - (in0, out, threshs, numReps);""".format( + (in0_{}, out_{}, threshs, numReps);""".format( spatial_dim, tmpl_args["TSrcI"], tmpl_args["TDstI"], + self.hls_sname(), + self.hls_sname(), ) ] @@ -539,58 +383,46 @@ def dataoutstrm(self): # note: the innermost dim is not reversed for the output self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s", false);' + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);' % ( packed_hls_type, elem_hls_type, elem_bits, npy_type, + self.hls_sname(), shape_cpp_str, npy_out, ) ] - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - def blackboxfunction(self): self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0, - hls::stream> &out + """void {}(hls::stream> &in0_{}, + hls::stream> &out_{} )""".format( self.onnx_node.name, self.get_instream_width(), + self.hls_sname(), self.get_outstream_width(), + self.hls_sname(), ) ] def pragmas(self): self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() ] self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") # the channelwise parameter tensor is acc_type [PE][TMEM][N_PARAMS_PER_CHANNEL] # partition for parallel access along PE and N_PARAMS_PER_CHANNEL # dimensions (dims 1 and 3) self.code_gen_dict["$PRAGMAS$"].append( - ( - "#pragma HLS ARRAY_PARTITION variable=threshs.parameters " - "complete dim=1" - ) + ("#pragma HLS ARRAY_PARTITION variable=threshs.parameters " "complete dim=1") ) - # self.code_gen_dict["$PRAGMAS$"].append( - # ( - # "#pragma HLS 
ARRAY_PARTITION variable=threshs.parameters " - # "complete dim=3" - # ) - # ) - # set resource type ram_style = self.get_nodeattr("ram_style") pe = self.get_nodeattr("PE") @@ -600,17 +432,11 @@ def pragmas(self): if pe < ich: if ram_style == "distributed": self.code_gen_dict["$PRAGMAS$"].append( - ( - "#pragma HLS RESOURCE variable=threshs.parameters " - "core=ROM_2P_LUTRAM" - ) + ("#pragma HLS RESOURCE variable=threshs.parameters " "core=ROM_2P_LUTRAM") ) elif ram_style == "block": self.code_gen_dict["$PRAGMAS$"].append( - ( - "#pragma HLS RESOURCE variable=threshs.parameters " - "core=ROM_2P_BRAM" - ) + ("#pragma HLS RESOURCE variable=threshs.parameters " "core=ROM_2P_BRAM") ) else: raise Exception( diff --git a/src/finn/custom_op/fpgadataflow/checksum.py b/src/finn/custom_op/fpgadataflow/hls/checksum_hls.py similarity index 86% rename from src/finn/custom_op/fpgadataflow/checksum.py rename to src/finn/custom_op/fpgadataflow/hls/checksum_hls.py index c927c07df2..8a72ca3c6c 100644 --- a/src/finn/custom_op/fpgadataflow/checksum.py +++ b/src/finn/custom_op/fpgadataflow/hls/checksum_hls.py @@ -1,4 +1,5 @@ # Copyright (c) 2022, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -31,15 +32,16 @@ import warnings from qonnx.core.datatype import DataType -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy -class CheckSum(HLSCustomOp): +class CheckSum_hls(HWCustomOp, HLSBackend): """Class that corresponds to custom_hls checksum function.""" - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): my_attrs = { @@ -52,7 +54,8 @@ def get_nodeattr_types(self): # folded shape of input/output "folded_shape": ("ints", True, []), } - my_attrs.update(super().get_nodeattr_types()) + my_attrs.update(HWCustomOp.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs def make_shape_compatible_op(self, model): @@ -183,9 +186,7 @@ def execute_node(self, context, graph): np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) sim = self.get_rtlsim() nbits = self.get_instream_width() - inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) + inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) super().reset_rtlsim(sim) super().toggle_clk(sim) io_dict = { @@ -199,9 +200,7 @@ def execute_node(self, context, graph): packed_bits = self.get_outstream_width() out_npy_path = "{}/output.npy".format(code_gen_dir) out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) + rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) # load and reshape output output = np.load(out_npy_path) @@ -241,17 +240,28 @@ def read_npy_data(self): 
self.code_gen_dict["$READNPYDATA$"] = [] # note: the innermost dim is reversed for the input self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0, false);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) ) def strm_decl(self): self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0 ("in0");'.format(self.get_instream_width()) + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) ) self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) ) self.code_gen_dict["$STREAMDECLARATIONS$"].append("ap_uint<32> chk;") # set drain = false for cppsim @@ -259,7 +269,8 @@ def strm_decl(self): def docompute(self): self.code_gen_dict["$DOCOMPUTE$"] = [ - """checksum(in0, out, chk, drain);""" + """checksum(in0_%s, out_%s, chk, drain);""" + % (self.hls_sname(), self.hls_sname()) ] def dataoutstrm(self): @@ -279,38 +290,35 @@ def dataoutstrm(self): # note: the innermost dim is not reversed for the output self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s", false);' + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);' % ( packed_hls_type, elem_hls_type, elem_bits, npy_type, + self.hls_sname(), shape_cpp_str, npy_out, ), "std::vector checksum(1);", "checksum[0] = chk;", - 'cnpy::npy_save("%s/output_checksum.npy",&checksum[0],{1},"w");' - % code_gen_dir, + 'cnpy::npy_save("%s/output_checksum.npy",&checksum[0],{1},"w");' % code_gen_dir, ] - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - def blackboxfunction(self): 
self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """using T = ap_uint;\n void {}(hls::stream &in0, - hls::stream &out, ap_uint<32> &chk, ap_uint<1> &drain)""".format( - self.onnx_node.name + """using T = ap_uint;\n void {}(hls::stream &in0_{}, + hls::stream &out_{}, ap_uint<32> &chk, ap_uint<1> &drain)""".format( + self.onnx_node.name, self.hls_sname(), self.hls_sname() ) ] def pragmas(self): self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS interface axis port=in0 name=in0_" + self.hls_sname() + "#pragma HLS interface axis port=in0_" + self.hls_sname() ] self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS interface axis port=out name=out_" + self.hls_sname() + "#pragma HLS interface axis port=out_" + self.hls_sname() ) self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS interface s_axilite port=chk bundle=checksum" @@ -318,13 +326,9 @@ def pragmas(self): self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS interface s_axilite port=drain bundle=checksum" ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS interface ap_ctrl_none port=return" - ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS interface ap_ctrl_none port=return") self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS dataflow") - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS dataflow disable_start_propagation" - ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS dataflow disable_start_propagation") def get_verilog_top_module_intf_names(self): intf_names = super().get_verilog_top_module_intf_names() diff --git a/src/finn/custom_op/fpgadataflow/hls/concat_hls.py b/src/finn/custom_op/fpgadataflow/hls/concat_hls.py new file mode 100644 index 0000000000..008fa9cee8 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/concat_hls.py @@ -0,0 +1,267 @@ +# Copyright (c) 2021, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os + +from finn.custom_op.fpgadataflow.concat import StreamingConcat +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class StreamingConcat_hls(StreamingConcat, HLSBackend): + """Streaming concatenation node with dynamically generated HLS. 
+ Only supports concatenating along the last axis.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(StreamingConcat.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def generate_params(self, model, path): + elems_per_stream = self.get_nodeattr("ElemsPerStream") + inp_streams = [] + commands = [] + idt = self.get_input_datatype() + total_elems = self.get_total_elems() + total_bw = idt.bitwidth() * total_elems + for i, elems in enumerate(elems_per_stream): + bw = idt.bitwidth() * elems + inp_stream = "hls::stream > &in%d" % (bw, i) + inp_streams.append(inp_stream) + cmd = "in%d.read()" % i + commands.append(cmd) + out_stream = "hls::stream > &out" % (total_bw) + inp_streams.append(out_stream) + + impl_hls_code = [] + impl_hls_code.append("void StreamingConcat(") + impl_hls_code.append(",".join(inp_streams)) + impl_hls_code.append(", unsigned int numReps) {") + impl_hls_code.append("for(unsigned int i = 0; i < numReps; i++) {") + impl_hls_code.append("#pragma HLS PIPELINE II=1") + impl_hls_code.append("ap_uint<%d> out_elem;" % total_bw) + # FIXME: the order of streams for concatenation works out differently + # for cppsim vs rtlsim, addressed via reversing the order of commands + # for now + impl_hls_code.append("#ifdef __SYNTHESIS__") + impl_hls_code.append("out_elem = (" + ",".join(commands[::-1]) + ");") + impl_hls_code.append("#else") + impl_hls_code.append("out_elem = (" + ",".join(commands) + ");") + impl_hls_code.append("#endif") + impl_hls_code.append("out.write(out_elem);") + impl_hls_code.append("}") + impl_hls_code.append("}") + impl_hls_code = "\n".join(impl_hls_code) + + impl_filename = "{}/concat_impl.hpp".format(path) + f_impl = open(impl_filename, "w") + f_impl.write(impl_hls_code) + f_impl.close() + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = 
self.onnx_node + n_inps = len(self.onnx_node.input) + ishapes = [self.get_normal_input_shape(x) for x in range(n_inps)] + folded_ishapes = [self.get_folded_input_shape(x) for x in range(n_inps)] + exp_oshape = self.get_normal_output_shape() + folded_oshape = self.get_folded_output_shape() + export_idt = self.get_input_datatype() + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + for i in range(n_inps): + inp = context[node.input[i]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert inp.shape == ishapes[i], "Input shape mismatch for " + node.input[i] + # reshape input into folded form + inp = inp.reshape(folded_ishapes[i]) + # make copy before saving array + reshaped_input = inp.copy() + np.save(os.path.join(code_gen_dir, "input_%d.npy" % i), reshaped_input) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == folded_oshape + ), "cppsim did not produce expected folded output shape" + context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) + elif mode == "rtlsim": + sim = self.get_rtlsim() + io_dict = {"inputs": {}, "outputs": {"out": []}} + for i in range(n_inps): + nbits = self.get_instream_width(i) + rtlsim_inp = npy_to_rtlsim_input( + "%s/input_%d.npy" % (code_gen_dir, i), + export_idt, + nbits, + reverse_inner=True, + ) + io_dict["inputs"]["in%d" % i] = rtlsim_inp + super().reset_rtlsim(sim) + super().toggle_clk(sim) + + self.rtlsim_multi_io(sim, io_dict) + rtlsim_output = io_dict["outputs"]["out"] + odt = self.get_output_datatype() + target_bits = 
odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, + out_npy_path, + odt, + out_shape, + packed_bits, + target_bits, + reverse_inner=True, + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output shape doesn't match expected shape.""" + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "concat_impl.hpp"'] + + def defines(self, var): + num_reps = self.get_nodeattr("numInputVectors") + num_reps = np.prod(num_reps) + self.code_gen_dict["$DEFINES$"] = ["#define NumReps %d" % num_reps] + + def read_npy_data(self): + n_inputs = self.get_n_inputs() + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + npy_type = "float" + self.code_gen_dict["$READNPYDATA$"] = [] + idt = self.get_input_datatype() + idt_bw = idt.bitwidth() + elem_hls_type = idt.get_hls_datatype_str() + elem_bits = idt_bw + for i in range(n_inputs): + packed_bits = self.get_instream_width(i) + packed_hls_type = "ap_uint<%d>" % packed_bits + npy_in = "%s/input_%d.npy" % (code_gen_dir, i) + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in%d_%s);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + i, + self.hls_sname(), + ) + ) + + def strm_decl(self): + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + n_inputs = self.get_n_inputs() + for i in range(n_inputs): + packed_bits = self.get_instream_width(i) + packed_hls_type = "ap_uint<%d>" % packed_bits + stream_name = "in%d_%s" % (i, self.hls_sname()) + 
self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream<%s> %s ("%s");' % (packed_hls_type, stream_name, stream_name) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + + def docompute(self): + self.code_gen_dict["$DOCOMPUTE$"] = [] + n_inputs = self.get_n_inputs() + in_streams = [] + for i in range(n_inputs): + in_streams.append("in%d_%s" % (i, self.hls_sname())) + in_stream_names = ",".join(in_streams) + comp_call = "StreamingConcat(%s, out_%s, NumReps);" % ( + in_stream_names, + self.hls_sname(), + ) + self.code_gen_dict["$DOCOMPUTE$"] = [comp_call] + + def blackboxfunction(self): + n_inputs = self.get_n_inputs() + in_streams = [] + for i in range(n_inputs): + iwidth = self.get_instream_width(i) + in_streams.append("hls::stream> &in%d_%s" % (iwidth, i, self.hls_sname())) + in_streams = ",".join(in_streams) + total_width = self.get_input_datatype().bitwidth() * self.get_total_elems() + out_stream = "hls::stream> &out_%s" % ( + total_width, + self.hls_sname(), + ) + blackbox_hls = "void %s(%s, %s)" % (self.onnx_node.name, in_streams, out_stream) + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [blackbox_hls] + + def pragmas(self): + n_inputs = self.get_n_inputs() + pragmas = [] + for i in range(n_inputs): + pragmas.append("#pragma HLS INTERFACE axis port=in%d_%s" % (i, self.hls_sname())) + self.code_gen_dict["$PRAGMAS$"] = pragmas + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py b/src/finn/custom_op/fpgadataflow/hls/convolutioninputgenerator_hls.py similarity index 52% rename from src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py rename to 
src/finn/custom_op/fpgadataflow/hls/convolutioninputgenerator_hls.py index f1c84662cc..4a5c02ee06 100644 --- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator1d.py +++ b/src/finn/custom_op/fpgadataflow/hls/convolutioninputgenerator_hls.py @@ -1,4 +1,5 @@ # Copyright (c) 2020, Xilinx +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -31,15 +32,13 @@ import os import warnings from qonnx.core.datatype import DataType -from qonnx.custom_op.general.im2col import compute_conv_output_dim -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.convolutioninputgenerator import ( + ConvolutionInputGenerator, +) +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy -# This operation should only be used for 1D convolutions. Either the -# IFMDim_H or IFMDim_W should be '1', which represents the so-called -# dummy-dimension - # ONNX i/o tensor shape assumptions for ConvolutionInputGenerator1D: # input 0 is the input tensor, shape NHWC = (1, IFMDim_H, IFMDim_W, IFMChannels) # output 0 is the output tensor, shape NHWC: @@ -53,185 +52,59 @@ # between the two layouts -class ConvolutionInputGenerator1D(HLSCustomOp): +class ConvolutionInputGenerator_hls(ConvolutionInputGenerator, HLSBackend): """Class that corresponds to one of the 1D finn-hlslib ConvolutionInputGenerator (sliding window) function variants. Depending on the combination of attributes (e.g. 
depthwise or not, whether dilation is 0) a different variant will be picked for the actual HLS implementation.""" - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): - my_attrs = { - "ConvKernelDim": ("ints", True, []), # [H, W] = [Y, X] - "IFMChannels": ("i", True, 0), - "IFMDim": ("ints", True, []), # [H, W] = [Y, X] - "OFMDim": ("ints", True, []), # [H, W] = [Y, X] - "SIMD": ("i", True, 0), - "Stride": ("ints", True, []), # [H, W] = [Y, X] - "Dilation": ("ints", True, []), # [H, W] = [Y, X] - # FINN DataTypes for inputs, weights, outputs - "inputDataType": ("s", True, ""), - "outputDataType": ("s", True, ""), - "depthwise": ("i", False, 0, {0, 1}), - # FPGA resource type for ConvolutionInputGenerator input buffer - # auto -- let Vivado HLS decide - # block -- use BRAM - # distributed -- use LUTRAM - # ultra -- use URAM - "ram_style": ( - "s", - False, - "distributed", - {"auto", "block", "distributed", "ultra"}, - ), - "parallel_window": ("i", False, 0, {0, 1}), - } - my_attrs.update(super().get_nodeattr_types()) + my_attrs = {} + my_attrs.update(ConvolutionInputGenerator.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs - def get_normal_input_shape(self, ind=0): - ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") - ifm_ch = self.get_nodeattr("IFMChannels") - ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch) - return ishape - - def get_folded_input_shape(self, ind=0): - ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") - ifm_ch = self.get_nodeattr("IFMChannels") - simd = self.get_nodeattr("SIMD") - assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" - wf = int(ifm_ch / simd) - folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd) - return folded_ishape - - def get_normal_output_shape(self, ind=0): - k_h, k_w = self.get_nodeattr("ConvKernelDim") - ifm_dim_h, ifm_dim_w = 
self.get_nodeattr("IFMDim") - ifm_ch = self.get_nodeattr("IFMChannels") - stride_h, stride_w = self.get_nodeattr("Stride") - dilation_h, dilation_w = self.get_nodeattr("Dilation") - pad = 0 - ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h) - ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w) - oshape = (1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch) - return oshape - - def get_folded_output_shape(self, ind=0): - k_h, k_w = self.get_nodeattr("ConvKernelDim") - ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") - ifm_ch = self.get_nodeattr("IFMChannels") - stride_h, stride_w = self.get_nodeattr("Stride") - dilation_h, dilation_w = self.get_nodeattr("Dilation") - simd = self.get_nodeattr("SIMD") - pad = 0 - ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h) - ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w) - assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" - if self.use_parallel_window_output(): - wf = int((ifm_ch) // simd) - folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, k_h * k_w * simd) - else: - wf = int((k_h * k_w * ifm_ch) // simd) - folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd) - return folded_oshape - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - oshape = self.get_normal_output_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == exp_ishape, "Unexpect input shape for ConvInpGen." 
- return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - # data type stays the same - dtype = model.get_tensor_datatype(node.input[0]) - model.set_tensor_datatype(node.output[0], dtype) - - def verify_node(self): - pass - - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("inputDataType")] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - return DataType[self.get_nodeattr("outputDataType")] - - def get_instream_width(self, ind=0): - ibits = self.get_input_datatype().bitwidth() - simd = self.get_nodeattr("SIMD") - ifm_ch = self.get_nodeattr("IFMChannels") - assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" - in_width = simd * ibits - return in_width - - def get_outstream_width(self, ind=0): - if self.use_parallel_window_output(): - # feed all window pixels in parallel - k_h, k_w = self.get_nodeattr("ConvKernelDim") - return self.get_instream_width() * k_h * k_w - else: - # if parallel variant not in use: same width for output and input stream - return self.get_instream_width() - - def get_number_output_values(self): - folded_oshape = self.get_folded_output_shape() - num_output_elems = np.prod(folded_oshape[:-1]) - return num_output_elems - def get_swu_variant(self): - # checks which variant of the 1D ConvolutionInputGenerator (SWU) can be used - # We have 5 variants: ConvolutionInputGenerator_1D_parallel, + # checks which variant of the ConvolutionInputGenerator (SWU) can be used + # For the 2D case, we have 4 variants: + # ConvolutioninputGenerator, ConvolutioninputGenerator_dws, + # ConvolutioninputGenerator_kernel_stride, ConvolutioninputGenerator_kernel_stride_dws + # For the 1D case, we have 5 variants: ConvolutionInputGenerator_1D_parallel, # ConvolutionInputGenerator_1D_dws_naive, ConvolutionInputGenerator_1D, # ConvolutioninputGenerator_1D_dws, ConvolutionInputGenerator_1D_dws_stride is_dws = 
self.get_nodeattr("depthwise") - is_strided = np.prod(self.get_nodeattr("Stride")) > 1 - is_stride_2 = np.prod(self.get_nodeattr("Stride")) == 2 - is_dilated = np.prod(self.get_nodeattr("Dilation")) > 1 - if self.use_parallel_window_output(): - return "ConvolutionInputGenerator_1D_parallel" - if not is_dws: - return "ConvolutionInputGenerator_1D" - if is_dws: - if (is_strided and not is_stride_2) or (is_dilated): - return "ConvolutionInputGenerator_1D_dws_naive" - elif is_stride_2: - return "ConvolutionInputGenerator_1D_dws_stride" - else: - return "ConvolutionInputGenerator_1D_dws" - - def get_1d_conv_attrs_normalized(self): - # support both (1, D) and (D, 1) cases transparently: - # For the kernel, presenting the input data of size D as - # [H, W] = [Y, X] = [1, D] or [D, 1] - # effectively gives the same result. - # For consistency and ease of programming, this function - # returns the attributes of the layer as follows: - # [H, W] = [Y, X] = [1, D] or [D, 1] are always mapped to [1, D]. - # The dummy ('1') dimension is the Y-dimension. 
- ifm_ch = self.get_nodeattr("IFMChannels") - k = self.get_nodeattr("ConvKernelDim") - ifm_dim = self.get_nodeattr("IFMDim") - ofm_dim = self.get_nodeattr("OFMDim") - stride = self.get_nodeattr("Stride") - dilation = self.get_nodeattr("Dilation") - - # see defines() for an explanation - if ifm_dim[1] == 1: - ifm_dim = ifm_dim[::-1] - ofm_dim = ofm_dim[::-1] - k = k[::-1] - stride = stride[::-1] - dilation = dilation[::-1] - - return (ifm_ch, ifm_dim, ofm_dim, k, stride, dilation) + if self.get_nodeattr("is1D"): + is_strided = np.prod(self.get_nodeattr("Stride")) > 1 + is_stride_2 = np.prod(self.get_nodeattr("Stride")) == 2 + is_dilated = np.prod(self.get_nodeattr("Dilation")) > 1 + if self.use_parallel_window_output(): + return "ConvolutionInputGenerator_1D_parallel" + if not is_dws: + return "ConvolutionInputGenerator_1D" + if is_dws: + if (is_strided and not is_stride_2) or (is_dilated): + return "ConvolutionInputGenerator_1D_dws_naive" + elif is_stride_2: + return "ConvolutionInputGenerator_1D_dws_stride" + else: + return "ConvolutionInputGenerator_1D_dws" + else: + k = self.get_nodeattr("ConvKernelDim")[0] + stride = self.get_nodeattr("Stride")[0] + hls_call = "ConvolutionInputGenerator" + if k % stride != 0: + hls_call += "_kernel_stride" + if is_dws: + hls_call += "_dws" + return hls_call def use_parallel_window_output(self): - # Check if simple "ConvolutionInputGenerator_1D_parallel" variant can be used to + if not self.get_nodeattr("is1D"): + return False + # If 1D, check if simple "ConvolutionInputGenerator_1D_parallel" variant can be used to # feed window in parallel to the following layer, enabling full SIMD unfolding. 
stride = self.get_nodeattr("Stride") dilation = self.get_nodeattr("Dilation") @@ -245,13 +118,7 @@ def use_parallel_window_output(self): no_dilation = dilation_h == 1 and dilation_w == 1 supported_ram_style = ram_style in ["auto", "distributed"] if self.get_nodeattr("parallel_window") == 1: - if ( - fully_unfolded - and non_dws - and no_stride - and no_dilation - and supported_ram_style - ): + if fully_unfolded and non_dws and no_stride and no_dilation and supported_ram_style: return True else: warnings.warn( @@ -267,64 +134,88 @@ def use_parallel_window_output(self): def get_exp_cycles(self): simd = self.get_nodeattr("SIMD") - ( - ifm_ch, - [ifm_dim_h, ifm_dim_w], - [ofm_dim_h, ofm_dim_w], - [k_h, k_w], - [stride_h, stride_w], - [dilation_h, dilation_w], - ) = self.get_1d_conv_attrs_normalized() - - # since mmv != 1 is not supported yet, we set mmv for now to 1 - # mmv = 1 - # see https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h - swu_variant = self.get_swu_variant() - if swu_variant == "ConvolutionInputGenerator_1D_parallel": - exp_cycles = k_w + ofm_dim_w - elif swu_variant == "ConvolutionInputGenerator_1D": - exp_cycles = 1 + ofm_dim_w * k_w * ifm_ch / simd - elif swu_variant in [ - "ConvolutionInputGenerator_1D_dws", - "ConvolutionInputGenerator_1D_dws_stride", - ]: - exp_cycles = ( - 1 - + ofm_dim_w * k_w * ifm_ch / simd - + (ifm_ch / simd) * (k_w - 1) - - (k_w - 1) - ) - elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive": - cycles_read_block = ifm_dim_w * ifm_ch / simd - cycles_write_block = ofm_dim_w * k_w * ifm_ch / simd - exp_cycles = cycles_read_block + cycles_write_block + # 2D case + if not self.get_nodeattr("is1D"): + ifm_ch = self.get_nodeattr("IFMChannels") + k_h, k_w = self.get_nodeattr("ConvKernelDim") + ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") + ofm_dim_h, ofm_dim_w = self.get_nodeattr("OFMDim") + stride_h, stride_w = self.get_nodeattr("Stride") + dilation_h, dilation_w = self.get_nodeattr("Dilation") + 
+ # since mmv != 1 is not supported yet, we set mmv for now to 1 + mmv = 1 + # see https://github.com/Xilinx/finn-hlslib/blob/master/slidingwindow.h + cycles_write_block = (ofm_dim_w * k_w * k_h * (ifm_ch / simd)) / mmv + cycles_read_block = stride_w * ifm_dim_w * (ifm_ch / simd) + max_cycles = max(cycles_write_block, cycles_read_block) + exp_cycles = ifm_dim_w * k_h * dilation_h * (ifm_ch / simd) + ofm_dim_h * max_cycles + # 1D case + else: + ( + ifm_ch, + [ifm_dim_h, ifm_dim_w], + [ofm_dim_h, ofm_dim_w], + [k_h, k_w], + [stride_h, stride_w], + [dilation_h, dilation_w], + ) = self.get_1d_conv_attrs_normalized() + + swu_variant = self.get_swu_variant() + if swu_variant == "ConvolutionInputGenerator_1D_parallel": + exp_cycles = k_w + ofm_dim_w + elif swu_variant == "ConvolutionInputGenerator_1D": + exp_cycles = 1 + ofm_dim_w * k_w * ifm_ch / simd + elif swu_variant in [ + "ConvolutionInputGenerator_1D_dws", + "ConvolutionInputGenerator_1D_dws_stride", + ]: + exp_cycles = ( + 1 + ofm_dim_w * k_w * ifm_ch / simd + (ifm_ch / simd) * (k_w - 1) - (k_w - 1) + ) + elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive": + cycles_read_block = ifm_dim_w * ifm_ch / simd + cycles_write_block = ofm_dim_w * k_w * ifm_ch / simd + exp_cycles = cycles_read_block + cycles_write_block return int(exp_cycles) def bram_estimation(self): simd = self.get_nodeattr("SIMD") - ( - ifm_ch, - [ifm_dim_h, ifm_dim_w], - [ofm_dim_h, ofm_dim_w], - [k_h, k_w], - [stride_h, stride_w], - [dilation_h, dilation_w], - ) = self.get_1d_conv_attrs_normalized() + is1D = self.get_nodeattr("is1D") + if not is1D: + ifm_ch = self.get_nodeattr("IFMChannels") + ifm_dim = self.get_nodeattr("IFMDim")[0] + k = self.get_nodeattr("ConvKernelDim")[0] + stride = self.get_nodeattr("Stride")[0] + else: + ( + ifm_ch, + [ifm_dim_h, ifm_dim_w], + [ofm_dim_h, ofm_dim_w], + [k_h, k_w], + [stride_h, stride_w], + [dilation_h, dilation_w], + ) = self.get_1d_conv_attrs_normalized() ram_style = 
self.get_nodeattr("ram_style") swu_variant = self.get_swu_variant() if swu_variant == "ConvolutionInputGenerator_1D_parallel": return 0 if ram_style == "block" or ram_style == "auto": - if swu_variant == "ConvolutionInputGenerator_1D": - ram_depth = (k_w - 1) * ifm_ch / simd - elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive": - ram_depth = ifm_dim_w * ifm_ch / simd - elif swu_variant in [ - "ConvolutionInputGenerator_1D_dws", - "ConvolutionInputGenerator_1D_dws_stride", - ]: - ram_depth = k_w * ifm_ch / simd + if not is1D: + ram_depth = ifm_dim * ifm_ch / simd + else: + if swu_variant == "ConvolutionInputGenerator_1D": + ram_depth = (k_w - 1) * ifm_ch / simd + elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive": + ram_depth = ifm_dim_w * ifm_ch / simd + elif swu_variant in [ + "ConvolutionInputGenerator_1D_dws", + "ConvolutionInputGenerator_1D_dws_stride", + ]: + ram_depth = k_w * ifm_ch / simd + # after calculate the ram_depth depending on the variant + # determine ram_width if ram_depth <= 512: ram_width = 36 elif ram_depth <= 1024: @@ -337,80 +228,108 @@ def bram_estimation(self): ram_width = 2 else: ram_width = 1 - width_mul = math.ceil( - simd * self.get_input_datatype().bitwidth() / ram_width - ) - depth_mul = math.ceil(ram_depth / 18432) - return width_mul * depth_mul + + width_mul = math.ceil(simd * self.get_input_datatype().bitwidth() / ram_width) + if not is1D: + depth_mul = math.ceil(ifm_dim * ifm_ch / simd / ram_depth) + return int((k + stride) * width_mul * depth_mul) + else: + depth_mul = math.ceil(ram_depth / 18432) + return int(width_mul * depth_mul) else: return 0 def lut_estimation(self): simd = self.get_nodeattr("SIMD") - ( - ifm_ch, - [ifm_dim_h, ifm_dim_w], - [ofm_dim_h, ofm_dim_w], - [k_h, k_w], - [stride_h, stride_w], - [dilation_h, dilation_w], - ) = self.get_1d_conv_attrs_normalized() + is1D = self.get_nodeattr("is1D") + if not is1D: + ifm_ch = self.get_nodeattr("IFMChannels") + ifm_dim = self.get_nodeattr("IFMDim")[0] 
+ k = self.get_nodeattr("ConvKernelDim")[0] + stride = self.get_nodeattr("Stride")[0] + else: + ( + ifm_ch, + [ifm_dim_h, ifm_dim_w], + [ofm_dim_h, ofm_dim_w], + [k_h, k_w], + [stride_h, stride_w], + [dilation_h, dilation_w], + ) = self.get_1d_conv_attrs_normalized() ram_style = self.get_nodeattr("ram_style") swu_variant = self.get_swu_variant() if swu_variant == "ConvolutionInputGenerator_1D_parallel": - ram_luts = math.ceil( - simd * self.get_input_datatype().bitwidth() * (k_w + 1) / 64 - ) - elif ram_style == "distributed": - if swu_variant == "ConvolutionInputGenerator_1D": - ram_luts = math.ceil( - self.get_input_datatype().bitwidth() * (k_w - 1) * ifm_ch / 64 + ram_luts = math.ceil(simd * self.get_input_datatype().bitwidth() * (k_w + 1) / 64) + if ram_style == "distributed": + if not is1D: + ram_luts = int( + (k + stride) + * ( + simd + * self.get_input_datatype().bitwidth() + * math.ceil(ifm_dim * ifm_ch / simd / 64) + ) ) + if swu_variant == "ConvolutionInputGenerator_1D": + ram_luts = math.ceil(self.get_input_datatype().bitwidth() * (k_w - 1) * ifm_ch / 64) elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive": - ram_luts = math.ceil( - self.get_input_datatype().bitwidth() * ifm_dim_w * ifm_ch / 64 - ) + ram_luts = math.ceil(self.get_input_datatype().bitwidth() * ifm_dim_w * ifm_ch / 64) elif swu_variant in [ "ConvolutionInputGenerator_1D_dws", "ConvolutionInputGenerator_1D_dws_stride", ]: - ram_luts = math.ceil( - self.get_input_datatype().bitwidth() * k_w * ifm_ch / 64 - ) + ram_luts = math.ceil(self.get_input_datatype().bitwidth() * k_w * ifm_ch / 64) else: ram_luts = 0 return 300 + ram_luts def uram_estimation(self): simd = self.get_nodeattr("SIMD") - ( - ifm_ch, - [ifm_dim_h, ifm_dim_w], - [ofm_dim_h, ofm_dim_w], - [k_h, k_w], - [stride_h, stride_w], - [dilation_h, dilation_w], - ) = self.get_1d_conv_attrs_normalized() + is1D = self.get_nodeattr("is1D") + if not is1D: + ifm_ch = self.get_nodeattr("IFMChannels") + ifm_dim = 
self.get_nodeattr("IFMDim")[0] + k = self.get_nodeattr("ConvKernelDim")[0] + stride = self.get_nodeattr("Stride")[0] + else: + ( + ifm_ch, + [ifm_dim_h, ifm_dim_w], + [ofm_dim_h, ofm_dim_w], + [k_h, k_w], + [stride_h, stride_w], + [dilation_h, dilation_w], + ) = self.get_1d_conv_attrs_normalized() + ram_style = self.get_nodeattr("ram_style") swu_variant = self.get_swu_variant() if swu_variant == "ConvolutionInputGenerator_1D_parallel": return 0 - elif ram_style == "ultra": - if swu_variant == "ConvolutionInputGenerator_1D": - width_mul = math.ceil(simd * self.get_input_datatype().bitwidth() / 72) - depth_mul = math.ceil((k_w - 1) * ifm_ch / simd / 4096) - return width_mul * depth_mul - elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive": - width_mul = math.ceil(simd * self.get_input_datatype().bitwidth() / 72) - depth_mul = math.ceil(ifm_dim_w * ifm_ch / simd / 4096) - return width_mul * depth_mul - elif swu_variant in [ - "ConvolutionInputGenerator_1D_dws", - "ConvolutionInputGenerator_1D_dws_stride", - ]: - width_mul = math.ceil(simd * self.get_input_datatype().bitwidth() / 72) - depth_mul = math.ceil(k_w * ifm_ch / simd / 4096) - return width_mul * depth_mul + if ram_style == "ultra": + if not is1D: + return int( + (k + stride) + * ( + math.ceil(simd * self.get_input_datatype().bitwidth() / 64) + * math.ceil(ifm_dim * ifm_ch / simd / 4096) + ) + ) + else: + if swu_variant == "ConvolutionInputGenerator_1D": + width_mul = math.ceil(simd * self.get_input_datatype().bitwidth() / 72) + depth_mul = math.ceil((k_w - 1) * ifm_ch / simd / 4096) + return width_mul * depth_mul + elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive": + width_mul = math.ceil(simd * self.get_input_datatype().bitwidth() / 72) + depth_mul = math.ceil(ifm_dim_w * ifm_ch / simd / 4096) + return width_mul * depth_mul + elif swu_variant in [ + "ConvolutionInputGenerator_1D_dws", + "ConvolutionInputGenerator_1D_dws_stride", + ]: + width_mul = math.ceil(simd * 
self.get_input_datatype().bitwidth() / 72) + depth_mul = math.ceil(k_w * ifm_ch / simd / 4096) + return width_mul * depth_mul else: return 0 @@ -504,18 +423,28 @@ def global_includes(self): def defines(self, var): numReps = 1 - ( - ifm_ch, - [ifm_dim_h, ifm_dim_w], - [ofm_dim_h, ofm_dim_w], - [k_h, k_w], - [stride_h, stride_w], - [dilation_h, dilation_w], - ) = self.get_1d_conv_attrs_normalized() + is1D = self.get_nodeattr("is1D") simd = self.get_nodeattr("SIMD") ifm_precision = self.get_input_datatype().bitwidth() + if not is1D: + ifm_dim = self.get_nodeattr("IFMDim")[0] + ifm_ch = self.get_nodeattr("IFMChannels") + ofm_dim = self.get_nodeattr("OFMDim")[0] + k = self.get_nodeattr("ConvKernelDim")[0] + stride = self.get_nodeattr("Stride")[0] + else: + ( + ifm_ch, + [ifm_dim_h, ifm_dim_w], + [ofm_dim_h, ofm_dim_w], + [k_h, k_w], + [stride_h, stride_w], + [dilation_h, dilation_w], + ) = self.get_1d_conv_attrs_normalized() + swu_variant = self.get_swu_variant() + # check all different 1D scenarios if swu_variant in [ "ConvolutionInputGenerator_1D_parallel", "ConvolutionInputGenerator_1D", @@ -542,7 +471,7 @@ def defines(self, var): numReps, ) ] - if swu_variant == "ConvolutionInputGenerator_1D_dws": + elif swu_variant == "ConvolutionInputGenerator_1D_dws": self.code_gen_dict["$DEFINES$"] = [ """ #define ConvKernelDim1_x {}\n @@ -562,7 +491,7 @@ def defines(self, var): numReps, ) ] - if swu_variant == "ConvolutionInputGenerator_1D_dws_naive": + elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive": self.code_gen_dict["$DEFINES$"] = [ """ #define ConvKernelDim1_x {}\n @@ -586,33 +515,16 @@ def defines(self, var): numReps, ) ] - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits 
- elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) - ) - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0 ("in0");'.format(self.get_instream_width()) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) - ) + # default to 2D cases + else: + self.code_gen_dict["$DEFINES$"] = [ + """#define ConvKernelDim1 {}\n #define IFMChannels1 {}\n + #define Input_precision1 {}\n #define IFMDim1 {}\n + #define OFMDim1 {}\n #define SIMD1 {}\n + #define Stride1 {}\n #define numReps {}""".format( + k, ifm_ch, ifm_precision, ifm_dim, ofm_dim, simd, stride, numReps + ) + ] def docompute(self): ram_style = self.get_nodeattr("ram_style") @@ -625,45 +537,52 @@ def docompute(self): hls_ram_style = map_to_hls_ram_style[ram_style] swu_variant = self.get_swu_variant() - # check which ConvolutionInputGenerator is needed + # check which 1D ConvolutionInputGenerator is needed if swu_variant == "ConvolutionInputGenerator_1D_parallel": self.code_gen_dict["$DOCOMPUTE$"] = [ """{} - (in0, out, numReps, {});""".format( - swu_variant, hls_ram_style + (in0_{}, out_{}, numReps, {});""".format( + swu_variant, self.hls_sname(), self.hls_sname(), hls_ram_style ) ] - if swu_variant == "ConvolutionInputGenerator_1D": + elif swu_variant == "ConvolutionInputGenerator_1D": self.code_gen_dict["$DOCOMPUTE$"] = [ """{} - (in0, out, numReps, {});""".format( - swu_variant, hls_ram_style + (in0_{}, out_{}, numReps, {});""".format( + swu_variant, self.hls_sname(), self.hls_sname(), hls_ram_style ) ] - if swu_variant == "ConvolutionInputGenerator_1D_dws": + elif swu_variant == 
"ConvolutionInputGenerator_1D_dws": self.code_gen_dict["$DOCOMPUTE$"] = [ """{} - (in0, out, numReps, {});""".format( - swu_variant, hls_ram_style + (in0_{}, out_{}, numReps, {});""".format( + swu_variant, self.hls_sname(), self.hls_sname(), hls_ram_style ) ] - if swu_variant == "ConvolutionInputGenerator_1D_dws_stride": + elif swu_variant == "ConvolutionInputGenerator_1D_dws_stride": self.code_gen_dict["$DOCOMPUTE$"] = [ """{} - (in0, out, numReps, {});""".format( - swu_variant, hls_ram_style + (in0_{}, out_{}, numReps, {});""".format( + swu_variant, self.hls_sname(), self.hls_sname(), hls_ram_style ) ] - if swu_variant == "ConvolutionInputGenerator_1D_dws_naive": + elif swu_variant == "ConvolutionInputGenerator_1D_dws_naive": self.code_gen_dict["$DOCOMPUTE$"] = [ """{} - (in0, out, numReps, {});""".format( - swu_variant, hls_ram_style + (in0_{}, out_{}, numReps, {});""".format( + swu_variant, self.hls_sname(), self.hls_sname(), hls_ram_style + ) + ] + else: + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{} (in0_{}, out_{}, numReps, {});""".format( + swu_variant, self.hls_sname(), self.hls_sname(), hls_ram_style ) ] @@ -690,45 +609,32 @@ def dataoutstrm(self): multi_pixel_out = 1 self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s", true, 1, %d);' + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", true, 1, %d);' % ( packed_hls_type, elem_hls_type, elem_bits, npy_type, + self.hls_sname(), oshape_cpp_str, npy_out, multi_pixel_out, ) ] - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - def blackboxfunction(self): if self.use_parallel_window_output(): self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0, + """void {}(hls::stream> &in0_{}, hls::stream> - &out)""".format( - self.onnx_node.name + &out_{})""".format( + self.onnx_node.name, self.hls_sname(), self.hls_sname() ) ] else: self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0, - hls::stream> &out)""".format( - 
self.onnx_node.name + """void {}(hls::stream> &in0_{}, + hls::stream> &out_{})""".format( + self.onnx_node.name, self.hls_sname(), self.hls_sname() ) ] - - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" - ) diff --git a/src/finn/custom_op/fpgadataflow/hls/downsampler_hls.py b/src/finn/custom_op/fpgadataflow/hls/downsampler_hls.py new file mode 100644 index 0000000000..56f472b9c0 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/downsampler_hls.py @@ -0,0 +1,165 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os + +from finn.custom_op.fpgadataflow.downsampler import DownSampler +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class DownSampler_hls(DownSampler, HLSBackend): + """Corresponds to finn-hlslib ConvolutionInputGenerator_*_kernel1 function. + Basically performs a down sampling of the image removing rows and columns.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(DownSampler.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "slidingwindow.h"'] + + def defines(self, var): + self.code_gen_dict["$DEFINES$"] = [] + + ifm_ch = self.get_nodeattr("NumChannels") + self.code_gen_dict["$DEFINES$"] += ["#define IFMChannels {}".format(ifm_ch)] + + ibits = self.get_input_datatype().bitwidth() + self.code_gen_dict["$DEFINES$"] += ["#define Input_precision {}".format(ibits)] + + idim = self.get_nodeattr("ImgDim") + self.code_gen_dict["$DEFINES$"] += ["#define IFMDim {}".format(idim)] + + simd = self.get_nodeattr("SIMD") + self.code_gen_dict["$DEFINES$"] += ["#define SIMD {}".format(simd)] + + stride = self.get_nodeattr("Stride") + self.code_gen_dict["$DEFINES$"] += ["#define Stride 
{}".format(stride)] + + batch_size = self.get_nodeattr("numInputVectors") + self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(batch_size)] + + def docompute(self): + dim_var = "1D" if (self.get_nodeattr("is1D") == 1) else "2D" + sname = self.hls_sname() + self.code_gen_dict["$DOCOMPUTE$"] = [ + f"""ConvolutionInputGenerator_{dim_var}_kernel1 (in0_{sname}, out_{sname}, numReps);""" + ] + + def blackboxfunction(self): + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" + % ( + self.onnx_node.name, + packed_hls_type, + self.hls_sname(), + packed_hls_type, + self.hls_sname(), + ) + ] + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input shape doesn't + match expected shape (numInputVectors, ImgDim, ImgDim, NumChannels).""" + export_idt = self.get_input_datatype() + + reshaped_input = inp.reshape(folded_ishape) + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == exp_oshape + ), "cppsim did not produce expected output shape" + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output shape doesn't match expected shape + (1, OutputDim, OutputDim, NumChannels).""" diff --git a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py b/src/finn/custom_op/fpgadataflow/hls/duplicatestreams_hls.py similarity index 60% rename from src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py rename to src/finn/custom_op/fpgadataflow/hls/duplicatestreams_hls.py index 93cde15ca7..e19149435e 100644 --- a/src/finn/custom_op/fpgadataflow/duplicatestreams_batch.py +++ b/src/finn/custom_op/fpgadataflow/hls/duplicatestreams_hls.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,91 +28,24 @@ import numpy as np import os -import warnings -from qonnx.core.datatype import DataType -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.duplicatestreams import DuplicateStreams +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy -class DuplicateStreams_Batch(HLSCustomOp): +class DuplicateStreams_hls(DuplicateStreams, HLSBackend): """Class that corresponds to finn-hlslib function of the same name.""" - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): - my_attrs = { - "NumChannels": ("i", True, 0), - "PE": ("i", True, 0), - # how many duplicated output streams to create - "NumOutputStreams": ("i", True, 0), - # FINN DataTypes for input - "inputDataType": ("s", True, ""), - # number of input vectors, examples: - # [1] is a single vector (like a FC layer with batch=1) - # [4] is four 
vectors (like a FC layer with batch=4) - # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) - "numInputVectors": ("ints", False, [1]), - } - my_attrs.update(super().get_nodeattr_types()) + my_attrs = {} + my_attrs.update(DuplicateStreams.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs - def get_num_output_streams(self): - return self.get_nodeattr("NumOutputStreams") - - def get_normal_input_shape(self, ind=0): - ch = self.get_nodeattr("NumChannels") - vecs = list(self.get_nodeattr("numInputVectors")) - ishape = tuple(vecs + [ch]) - return ishape - - def get_folded_input_shape(self, ind=0): - ch = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - vecs = list(self.get_nodeattr("numInputVectors")) - assert ch % pe == 0, "PE must divide NumChannels" - folds = int(ch / pe) - folded_ishape = tuple(vecs + [folds, pe]) - return folded_ishape - - def get_normal_output_shape(self, ind=0): - # since the output shape of both out streams are the same - # return independently from index - return self.get_normal_input_shape() - - def get_folded_output_shape(self, ind=0): - # since the output shape of both out streams are the same - # return independently from index - return self.get_folded_input_shape() - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == exp_ishape, "Unexpected input shape." 
- num_out = self.get_num_output_streams() - assert len(self.onnx_node.output) == num_out, "Unexpected number of outputs" - - oshape = self.get_normal_output_shape() - ret = super().make_const_shape_op(oshape) - ret.output[:] = self.onnx_node.output - return ret - - def infer_node_datatype(self, model): - node = self.onnx_node - idt = model.get_tensor_datatype(node.input[0]) - if idt != self.get_input_datatype(): - warn_str = "inputDataType changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype()), - str(idt), - ) - warnings.warn(warn_str) - self.set_nodeattr("inputDataType", idt.name) - odt = self.get_output_datatype() - for my_out in self.onnx_node.output: - model.set_tensor_datatype(my_out, odt) - def verify_node(self): info_messages = [] # verify that "backend" is set to "fpgadataflow" @@ -132,43 +65,10 @@ def verify_node(self): self.get_nodeattr("inputDataType") info_messages.append("All necessary attributes exist") except Exception: - info_messages.append( - """The required GlobalAccPool_Batch attributes do not exist.""" - ) + info_messages.append("""The required GlobalAccPool_Batch attributes do not exist.""") return info_messages - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("inputDataType")] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - return DataType[self.get_nodeattr("inputDataType")] - - def get_instream_width(self, ind=0): - """Returns input stream width.""" - ibits = self.get_input_datatype().bitwidth() - pe = self.get_nodeattr("PE") - in_width = pe * ibits - return in_width - - def get_outstream_width(self, ind=0): - """Returns output stream width.""" - obits = self.get_output_datatype().bitwidth() - pe = self.get_nodeattr("PE") - out_width = pe * obits - return out_width - - def get_number_output_values(self): - return self.get_num_output_streams() * np.prod( - self.get_folded_output_shape()[1:-1] - ) - - def 
get_exp_cycles(self): - # Channels/PE * batch size * fmdim * fmdim - return np.prod(self.get_folded_output_shape()[:-1]) - def generate_params(self, model, path): n_outputs = self.get_num_output_streams() inp_streams = [] @@ -235,9 +135,7 @@ def execute_node(self, context, graph): # execute the precompiled model super().exec_precompiled_singlenode_model() # load output npy file - super().npy_to_dynamic_outputs( - context, ["output%d.npy" % i for i in range(n_outputs)] - ) + super().npy_to_dynamic_outputs(context, ["output%d.npy" % i for i in range(n_outputs)]) for i in range(n_outputs): assert ( context[node.output[i]].shape == exp_oshape @@ -298,29 +196,16 @@ def global_includes(self): def defines(self, var): self.code_gen_dict["$DEFINES$"] = [] - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) - ) - def strm_decl(self): n_outputs = self.get_num_output_streams() self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0 ("in0");'.format(self.get_instream_width()) + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) ) for i in range(n_outputs): - out_name = "out%d" % i + out_name = "out%d_%s" % (i, self.hls_sname()) self.code_gen_dict["$STREAMDECLARATIONS$"].append( 'hls::stream> %s ("%s");' % (self.get_outstream_width(), out_name, out_name) @@ -328,8 +213,13 @@ def strm_decl(self): def docompute(self): n_outputs = self.get_num_output_streams() - 
ostreams = ["out%d" % x for x in range(n_outputs)] - dc = "DuplicateStreamsCustom(in0, %s);" % (",".join(ostreams)) + ostreams = [] + for i in range(n_outputs): + ostreams.append("out%d_%s" % (i, self.hls_sname())) + dc = "DuplicateStreamsCustom(in0_%s, %s);" % ( + self.hls_sname(), + ",".join(ostreams), + ) self.code_gen_dict["$DOCOMPUTE$"] = [dc] def dataoutstrm(self): @@ -346,7 +236,7 @@ def dataoutstrm(self): outstrm_code = [] for i in range(n_outputs): - out_name = "out%d" % i + out_name = "out%d_%s" % (i, self.hls_sname()) npy_out = "%s/output%d.npy" % (code_gen_dir, i) outstrm_code.append( 'apintstream2npy<%s, %s, %d, %s>(%s, %s, "%s");' @@ -363,18 +253,19 @@ def dataoutstrm(self): self.code_gen_dict["$DATAOUTSTREAM$"] = outstrm_code - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - def blackboxfunction(self): n_outputs = self.get_num_output_streams() inp_streams = [] o_stream_w = self.get_outstream_width() i_stream_w = self.get_instream_width() - in_stream = "hls::stream > &in0" % (i_stream_w) + in_stream = "hls::stream > &in0_%s" % (i_stream_w, self.hls_sname()) inp_streams.append(in_stream) for i in range(n_outputs): - out_stream = "hls::stream > &out%d" % (o_stream_w, i) + out_stream = "hls::stream > &out%d_%s" % ( + o_stream_w, + i, + self.hls_sname(), + ) inp_streams.append(out_stream) self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ @@ -387,34 +278,10 @@ def blackboxfunction(self): def pragmas(self): n_outputs = self.get_num_output_streams() self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() ] for i in range(n_outputs): self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out%d name=out%d_%s" - % (i, i, self.hls_sname()) - ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" - ) - - def get_verilog_top_module_intf_names(self): - intf_names = 
super().get_verilog_top_module_intf_names() - n_outputs = self.get_num_output_streams() - sname = self.hls_sname() - intf_names["m_axis"] = [] - for i in range(n_outputs): - intf_names["m_axis"].append( - ("out%d_%s" % (i, sname), self.get_outstream_width_padded()) + "#pragma HLS INTERFACE axis port=out%d_%s" % (i, self.hls_sname()) ) - return intf_names - - def derive_characteristic_fxns(self, period): - n_inps = np.prod(self.get_folded_input_shape()[:-1]) - io_dict = { - "inputs": { - "in0": [0 for i in range(n_inps)], - }, - "outputs": {"out0": [], "out1": []}, - } - super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") diff --git a/src/finn/custom_op/fpgadataflow/hls/fmpadding_hls.py b/src/finn/custom_op/fpgadataflow/hls/fmpadding_hls.py new file mode 100644 index 0000000000..d57699af05 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/fmpadding_hls.py @@ -0,0 +1,212 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os + +from finn.custom_op.fpgadataflow.fmpadding import FMPadding +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class FMPadding_hls(FMPadding, HLSBackend): + """Corresponds to finn-hlslib FMPadding_Batch function. 
+ Pads input image by given amount.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(FMPadding.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"'] + + def defines(self, var): + idim_h, idim_w = self.get_nodeattr("ImgDim") + odim_h, odim_w = self.get_padded_odim() + pad = self.get_nodeattr("Padding") + pad_h = pad[0] + pad[2] + pad_w = pad[1] + pad[3] + is_square_img = idim_h == idim_w + is_square_pad = pad_h == pad_w + + if is_square_img and is_square_pad: + self.code_gen_dict["$DEFINES$"] = [ + """#define ImgDim1 {}\n#define OutputDim1 {}\n + #define PaddingBefore1 {}\n#define PaddingBehind1 {}\n + #define NumChannels1 {}\n#define SIMD1 {}\n + #define numReps {}\n""".format( + idim_h, + odim_h, + pad[0], + pad[2], + self.get_nodeattr("NumChannels"), + self.get_nodeattr("SIMD"), + self.get_nodeattr("numInputVectors"), + ) + ] + else: + self.code_gen_dict["$DEFINES$"] = [ + """ + #define OutputDim1_x {}\n + #define OutputDim1_y {}\n + #define PaddingLeft1 {}\n + #define PaddingRight1 {}\n + #define PaddingTop1 {}\n + #define PaddingBottom1 {}\n + #define NumChannels1 {}\n + #define SIMD1 {}\n + #define numReps {}\n + """.format( + odim_w, + odim_h, + pad[1], + pad[3], + pad[0], + pad[2], + self.get_nodeattr("NumChannels"), + self.get_nodeattr("SIMD"), + self.get_nodeattr("numInputVectors"), + ) + ] + + def docompute(self): + in_t = self.get_input_datatype().get_hls_datatype_str() + idim_h, idim_w = self.get_nodeattr("ImgDim") + pad = self.get_nodeattr("Padding") + pad_h = pad[0] + pad[2] + pad_w = pad[1] + pad[3] + is_square_img = idim_h == idim_w + is_square_pad = pad_h == pad_w + + if is_square_img and is_square_pad: + hls_call = "FMPadding_Batch" + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{} (in0_{}, out_{}, 
numReps);""".format( + hls_call, in_t, self.hls_sname(), self.hls_sname() + ) + ] + else: + hls_call = "FMPadding_nonsquare_Batch" + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{} (in0_{}, out_{}, numReps);""".format( + hls_call, in_t, self.hls_sname(), self.hls_sname() + ) + ] + + def blackboxfunction(self): + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" + % ( + self.onnx_node.name, + packed_hls_type, + self.hls_sname(), + packed_hls_type, + self.hls_sname(), + ) + ] + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input shape doesn't + match expected shape (1, ImgDim_h, ImgDim_w, NumChannels).""" + export_idt = self.get_input_datatype() + + reshaped_input = inp.reshape(folded_ishape) + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == exp_oshape + ), "cppsim did not produce expected output shape" + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output shape doesn't match expected shape + (1, OutputDim_H, OutputDim_W, NumChannels).""" diff --git a/src/finn/custom_op/fpgadataflow/hls/fmpadding_pixel_hls.py b/src/finn/custom_op/fpgadataflow/hls/fmpadding_pixel_hls.py new file mode 100644 index 0000000000..b7ba301fbc --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/fmpadding_pixel_hls.py @@ -0,0 +1,167 @@ +# Copyright (c) 2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of Xilinx nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import numpy as np +import os + +from finn.custom_op.fpgadataflow.fmpadding_pixel import FMPadding_Pixel +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class FMPadding_Pixel_hls(FMPadding_Pixel, HLSBackend): + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(FMPadding_Pixel.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"'] + + def defines(self, var): + odim_h, odim_w = self.get_padded_odim() + stride_h, stride_w = self.get_nodeattr("Stride") + self.code_gen_dict["$DEFINES$"] = [ + """ + #define OutputDim_x {}\n + #define OutputDim_y {}\n + #define Stride_x {}\n + #define Stride_y {}\n + #define NumChannels {}\n + #define SIMD {}\n + """.format( + odim_w, + odim_h, + stride_w, + stride_h, + self.get_nodeattr("NumChannels"), + self.get_nodeattr("SIMD"), + ) + ] + + def docompute(self): + in_t = self.get_input_datatype().get_hls_datatype_str() + odim_h, odim_w = self.get_padded_odim() + stride_h, stride_w = self.get_nodeattr("Stride") + hls_call = "FMPadding_Pixel_Nonsquare" + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{} (in0_{}, out_{});""".format( + hls_call, in_t, self.hls_sname(), self.hls_sname() 
+ ) + ] + + def blackboxfunction(self): + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" + % ( + self.onnx_node.name, + packed_hls_type, + self.hls_sname(), + packed_hls_type, + self.hls_sname(), + ) + ] + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input shape doesn't + match expected shape (1, ImgDim_h, ImgDim_w, NumChannels).""" + export_idt = self.get_input_datatype() + + reshaped_input = inp.reshape(folded_ishape) + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == exp_oshape + ), "cppsim did not produce expected output shape" + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = 
self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output shape doesn't match expected shape + (1, OutputDim_H, OutputDim_W, NumChannels).""" diff --git a/src/finn/custom_op/fpgadataflow/hls/globalaccpool_hls.py b/src/finn/custom_op/fpgadataflow/hls/globalaccpool_hls.py new file mode 100644 index 0000000000..9b2a7b25b0 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/globalaccpool_hls.py @@ -0,0 +1,176 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os + +from finn.custom_op.fpgadataflow.globalaccpool import GlobalAccPool +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class GlobalAccPool_hls(GlobalAccPool, HLSBackend): + """Class that corresponds to finn-hlslib AccPool_Batch function.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(GlobalAccPool.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def verify_node(self): + info_messages = [] + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify that all necessary attributes exist + try: + self.get_nodeattr("code_gen_dir_cppsim") + self.get_nodeattr("executable_path") + self.get_nodeattr("NumChannels") + self.get_nodeattr("PE") + self.get_nodeattr("inputDataType") + 
info_messages.append("All necessary attributes exist") + except Exception: + info_messages.append("""The required GlobalAccPool_Batch attributes do not exist.""") + + # verify that input data is 2D + if len(self.get_nodeattr("numInputVectors")) != 3: + info_messages.append("""GlobalAccPool_Batch requires 2D data input.""") + raise Exception + + return info_messages + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert inp.shape == exp_ishape, """Input shape doesn't match expected shape .""" + export_idt = self.get_input_datatype() + # reshape input into folded form + inp = inp.reshape(folded_ishape) + # make copy before saving array + reshaped_input = inp.copy() + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == exp_oshape + ), "cppsim \ + did not produce expected output shape" + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = 
self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output shape doesn't match expected shape.""" + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "maxpool.h"'] + + def defines(self, var): + self.code_gen_dict["$DEFINES$"] = [] + + def docompute(self): + self.code_gen_dict["$DOCOMPUTE$"] = [ + """AccPool_Batch<{}, {}, {}, {}, {}> (in0_{}, out_{}, 1);""".format( + self.get_normal_input_shape()[1], + self.get_nodeattr("NumChannels"), + self.get_input_datatype().get_hls_datatype_str(), + self.get_nodeattr("PE"), + self.get_output_datatype().get_hls_datatype_str(), + self.hls_sname(), + self.hls_sname(), + ) + ] + + def blackboxfunction(self): + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream> &in0_{}, + hls::stream> &out_{})""".format( + self.onnx_node.name, + self.get_instream_width(), + self.hls_sname(), + self.get_outstream_width(), + self.hls_sname(), + ) + ] diff --git a/src/finn/custom_op/fpgadataflow/iodma.py b/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py similarity index 81% rename from src/finn/custom_op/fpgadataflow/iodma.py rename to src/finn/custom_op/fpgadataflow/hls/iodma_hls.py index 65683079fc..8d9903f0f5 100644 --- a/src/finn/custom_op/fpgadataflow/iodma.py +++ b/src/finn/custom_op/fpgadataflow/hls/iodma_hls.py @@ -1,4 +1,5 @@ -# Copyright 
(c) 2020, Xilinx +# Copyright (c) 2020-2022, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -31,7 +32,8 @@ import warnings from qonnx.core.datatype import DataType -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp # the IODMA inerfaces a memory-mapped AXI interface and an AXI stream # direction "in": pulls data from AXI-MM to AXI stream @@ -47,7 +49,7 @@ # Interfaces # - AXI-MM name specified by intfName unless this is set to "" (empty, the default) -# in which case output AXI-MM are named "out" and input AXI-MM are named "in0" +# in which case output AXI-MM are named "out_V" and input AXI-MM are named "in0_V" # - AXI-MM interface width (in bits) is specified by intfWidth # - AXI-Stream interface width (in bits) is specified by streamWidth # - If inftWidth and streamWidth are not equal, the DMA core performs @@ -72,11 +74,11 @@ # -the folded shape is not defined -class IODMA(HLSCustomOp): +class IODMA_hls(HWCustomOp, HLSBackend): """Class that corresponds to finn-hlslib DMA function(s).""" - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): my_attrs = { @@ -97,7 +99,8 @@ def get_nodeattr_types(self): # name of axi-mm interface "intfName": ("s", False, ""), } - my_attrs.update(super().get_nodeattr_types()) + my_attrs.update(HWCustomOp.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs def get_normal_input_shape(self, ind=0): @@ -116,9 +119,7 @@ def get_folded_input_shape(self, ind=0): shape = list(self.get_normal_input_shape()) itype_bits = self.get_input_datatype().bitwidth() intfw = self.get_nodeattr("streamWidth") - assert ( - intfw % itype_bits 
== 0 - ), "Input stream width must be a multiple of datatype bits" + assert intfw % itype_bits == 0, "Input stream width must be a multiple of datatype bits" elems_per_word = intfw // itype_bits assert shape[-1] % elems_per_word == 0, "Fold depth must be integer" fold_depth = shape[-1] // elems_per_word @@ -133,9 +134,7 @@ def get_folded_output_shape(self, ind=0): shape = list(self.get_normal_output_shape()) itype_bits = self.get_output_datatype().bitwidth() intfw = self.get_nodeattr("streamWidth") - assert ( - intfw % itype_bits == 0 - ), "Input stream width must be a multiple of datatype bits" + assert intfw % itype_bits == 0, "Input stream width must be a multiple of datatype bits" elems_per_word = intfw // itype_bits assert shape[-1] % elems_per_word == 0, "Fold depth must be integer" fold_depth = shape[-1] // elems_per_word @@ -196,9 +195,7 @@ def get_number_output_values(self): stream_width = self.get_nodeattr("streamWidth") nelems = np.prod(oshape) nbits = nelems * itype_bits - assert ( - nbits % stream_width == 0 - ), "DMA: total transfer size must be word multiple" + assert nbits % stream_width == 0, "DMA: total transfer size must be word multiple" ovalues = nbits // stream_width return ovalues @@ -254,15 +251,23 @@ def docompute(self): # DWCs depend on AXI MM and out interface width if strmw == intfw: # case 0: AXI MM width = out width, no DWCs needed - self.code_gen_dict["$DOCOMPUTE$"] = [dma_inst_template % ("in0", "out")] + self.code_gen_dict["$DOCOMPUTE$"] = [ + dma_inst_template % ("in0_" + self.hls_sname(), "out_" + self.hls_sname()) + ] elif (strmw % intfw == 0) or (intfw % strmw == 0): # case 1: AXI MM width divisible by out width or vice versa # single DWC + single extra stream needed self.code_gen_dict["$DOCOMPUTE$"] = [ "hls::stream > dma2dwc;" % intfw, - dma_inst_template % ("in0", "dma2dwc"), + dma_inst_template % ("in0_" + self.hls_sname(), "dma2dwc"), dwc_inst_template - % (intfw, strmw, total_bits // intfw, "dma2dwc", "out"), + % ( + 
intfw, + strmw, + total_bits // intfw, + "dma2dwc", + "out_" + self.hls_sname(), + ), ] else: # case 2: AXI MM width not divisible by out width or vice versa @@ -271,26 +276,40 @@ def docompute(self): self.code_gen_dict["$DOCOMPUTE$"] = [ "hls::stream > dma2lcm;" % intfw, "hls::stream > lcm2out;" % width_lcm, - dma_inst_template % ("in0", "dma2lcm"), + dma_inst_template % ("in0_" + self.hls_sname(), "dma2lcm"), dwc_inst_template % (intfw, width_lcm, total_bits // intfw, "dma2lcm", "lcm2out"), dwc_inst_template - % (width_lcm, strmw, total_bits // width_lcm, "lcm2out", "out"), + % ( + width_lcm, + strmw, + total_bits // width_lcm, + "lcm2out", + "out_" + self.hls_sname(), + ), ] elif direction == "out": # in0 -> (DWCs) -> IODMA -> AXI MM # DWCs depend on AXI MM and out interface width if strmw == intfw: # case 0: in width = AXI MM width, no DWCs needed - self.code_gen_dict["$DOCOMPUTE$"] = [dma_inst_template % ("in0", "out")] + self.code_gen_dict["$DOCOMPUTE$"] = [ + dma_inst_template % ("in0_" + self.hls_sname(), "out_" + self.hls_sname()) + ] elif (strmw % intfw == 0) or (intfw % strmw == 0): # case 1: AXI MM width divisible by in width or vice versa # single DWC + single extra stream needed self.code_gen_dict["$DOCOMPUTE$"] = [ "hls::stream > dwc2dma;" % intfw, dwc_inst_template - % (strmw, intfw, total_bits // strmw, "in0", "dwc2dma"), - dma_inst_template % ("dwc2dma", "out"), + % ( + strmw, + intfw, + total_bits // strmw, + "in0_" + self.hls_sname(), + "dwc2dma", + ), + dma_inst_template % ("dwc2dma", "out_" + self.hls_sname()), ] else: # case 2: AXI MM width not divisible by out width or vice versa @@ -300,10 +319,16 @@ def docompute(self): "hls::stream > in2lcm;" % width_lcm, "hls::stream > lcm2dma;" % intfw, dwc_inst_template - % (strmw, width_lcm, total_bits // strmw, "in0", "in2lcm"), + % ( + strmw, + width_lcm, + total_bits // strmw, + "in0_" + self.hls_sname(), + "in2lcm", + ), dwc_inst_template % (width_lcm, intfw, total_bits // width_lcm, "in2lcm", 
"lcm2dma"), - dma_inst_template % ("lcm2dma", "out"), + dma_inst_template % ("lcm2dma", "out_" + self.hls_sname()), ] else: raise Exception("Unknown IODMA direction: %s" % direction) @@ -316,13 +341,25 @@ def blackboxfunction(self): direction = self.get_nodeattr("direction") if direction == "in": self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void %s(%s *in0, hls::stream<%s > &out, unsigned int numReps)" - % (self.onnx_node.name, packed_hls_type_in, packed_hls_type_out) + "void %s(%s *in0_%s, hls::stream<%s > &out_%s, unsigned int numReps)" + % ( + self.onnx_node.name, + packed_hls_type_in, + self.hls_sname(), + packed_hls_type_out, + self.hls_sname(), + ) ] elif direction == "out": self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void %s(hls::stream<%s > &in0, %s *out, unsigned int numReps)" - % (self.onnx_node.name, packed_hls_type_in, packed_hls_type_out) + "void %s(hls::stream<%s > &in0_%s, %s *out_%s, unsigned int numReps)" + % ( + self.onnx_node.name, + packed_hls_type_in, + self.hls_sname(), + packed_hls_type_out, + self.hls_sname(), + ) ] else: raise ValueError("Invalid IODMA direction, please set to in or out") @@ -339,32 +376,32 @@ def pragmas(self): if direction == "in": if intfname == "": self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE m_axi offset=slave port=in0" + "#pragma HLS INTERFACE m_axi offset=slave port=in0_" + self.hls_sname() ) else: self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE m_axi offset=slave port=%s" % (intfname) ) self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE s_axilite port=in0 bundle=control" + "#pragma HLS INTERFACE s_axilite port=in0_%s bundle=control" % (self.hls_sname()) ) self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() ) elif direction == "out": self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + 
"#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() ) if intfname == "": self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE m_axi offset=slave port=out" + "#pragma HLS INTERFACE m_axi offset=slave port=out_" + self.hls_sname() ) else: self.code_gen_dict["$PRAGMAS$"].append( "#pragma HLS INTERFACE m_axi offset=slave port=%s" % (intfname) ) self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE s_axilite port=out bundle=control" + "#pragma HLS INTERFACE s_axilite port=out_%s bundle=control" % (self.hls_sname()) ) else: raise ValueError("Invalid IODMA direction, please set to in or out") @@ -373,18 +410,6 @@ def pragmas(self): def execute_node(self, context, graph): pass - def dataoutstrm(self): - pass - - def read_npy_data(self): - pass - - def save_as_npy(self): - pass - - def strm_decl(self): - pass - def get_verilog_top_module_intf_names(self): intf_names = super().get_verilog_top_module_intf_names() if self.get_nodeattr("direction") == "out": diff --git a/src/finn/custom_op/fpgadataflow/labelselect_batch.py b/src/finn/custom_op/fpgadataflow/hls/labelselect_hls.py similarity index 54% rename from src/finn/custom_op/fpgadataflow/labelselect_batch.py rename to src/finn/custom_op/fpgadataflow/hls/labelselect_hls.py index 03f89bd7ec..1e2c0d034a 100644 --- a/src/finn/custom_op/fpgadataflow/labelselect_batch.py +++ b/src/finn/custom_op/fpgadataflow/hls/labelselect_hls.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -28,99 +28,24 @@ import numpy as np import os -from onnx import TensorProto, helper -from qonnx.core.datatype import DataType -from qonnx.util.basic import roundup_to_integer_multiple -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.labelselect import LabelSelect from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy -class LabelSelect_Batch(HLSCustomOp): +class LabelSelect_hls(LabelSelect, HLSBackend): """Class that corresponds to finn-hlslib LabelSelect_Batch function.""" - def __init__(self, onnx_node): - super().__init__(onnx_node) - odt_name = self.get_nodeattr("outputDataType") - if odt_name == "": - # If not provided compute min size - labels = self.get_nodeattr("Labels") - odt = DataType.get_smallest_possible(labels - 1) - # ensure a datatype divisible by 8-bits in case this is the last node - bw = roundup_to_integer_multiple(odt.bitwidth(), 8) - new_odt_name = odt.name.replace(str(odt.bitwidth()), str(bw)) - odt = DataType[new_odt_name] - odt_name = odt.name - self.set_nodeattr("outputDataType", odt_name) + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): - my_attrs = { - "Labels": ("i", True, 0), - "PE": ("i", True, 0), - "K": ("i", True, 0), - # FINN DataTypes for input - "inputDataType": ("s", True, ""), - "outputDataType": ("s", False, ""), - # number of input vectors, examples: - # [1] is a single vector (like a FC layer with batch=1) - # [4] is four vectors (like a FC layer with batch=4) - # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) - "numInputVectors": ("ints", False, [1]), - } - my_attrs.update(super().get_nodeattr_types()) + my_attrs = {} + my_attrs.update(LabelSelect.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) return 
my_attrs - def get_normal_input_shape(self, ind=0): - nlabels = self.get_nodeattr("Labels") - vecs = list(self.get_nodeattr("numInputVectors")) - ishape = tuple(vecs + [nlabels]) - return ishape - - def get_folded_input_shape(self, ind=0): - nlabels = self.get_nodeattr("Labels") - pe = self.get_nodeattr("PE") - vecs = list(self.get_nodeattr("numInputVectors")) - assert nlabels % pe == 0, "PE must divide Labels" - folds = int(nlabels / pe) - folded_ishape = tuple(vecs + [folds, pe]) - return folded_ishape - - def get_normal_output_shape(self, ind=0): - k = self.get_nodeattr("K") - vecs = list(self.get_nodeattr("numInputVectors")) - oshape = tuple(vecs + [k]) - return oshape - - def get_folded_output_shape(self, ind=0): - k = self.get_nodeattr("K") - vecs = list(self.get_nodeattr("numInputVectors")) - oshape = tuple(vecs + [k, 1]) - return oshape - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - oshape = self.get_normal_output_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == exp_ishape, "Unexpected input shape." 
- return helper.make_node( - "RandomNormal", - inputs=[], - outputs=[self.onnx_node.output[0]], - mean=0.0, - scale=1.0, - dtype=TensorProto.INT64, - shape=list(oshape), - ) - - def infer_node_datatype(self, model): - node = self.onnx_node - # check input datatype against property - idt = model.get_tensor_datatype(node.input[0]) - self.set_nodeattr("inputDataType", idt.name) - - odt = self.get_output_datatype() - model.set_tensor_datatype(self.onnx_node.output[0], odt) - def verify_node(self): info_messages = [] # verify that "backend" is set to "fpgadataflow" @@ -141,9 +66,7 @@ def verify_node(self): self.get_nodeattr("outputDataType") info_messages.append("All necessary attributes exist") except Exception: - info_messages.append( - """The required LabelSelect_Batch attributes do not exist.""" - ) + info_messages.append("""The required LabelSelect_Batch attributes do not exist.""") # verify that input data is 1D if len(self.get_nodeattr("numInputVectors")) > 1: @@ -152,30 +75,6 @@ def verify_node(self): return info_messages - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - ret = DataType[self.get_nodeattr("inputDataType")] - return ret - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - ret = DataType[self.get_nodeattr("outputDataType")] - return ret - - def get_instream_width(self, ind=0): - """Returns input stream width.""" - ibits = self.get_input_datatype().bitwidth() - pe = self.get_nodeattr("PE") - in_width = pe * ibits - return in_width - - def get_outstream_width(self, ind=0): - """Returns output stream width.""" - return self.get_output_datatype().bitwidth() - - def get_number_output_values(self): - return self.get_nodeattr("K") - def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") node = self.onnx_node @@ -275,83 +174,39 @@ def read_npy_data(self): # Also notice that StreamingDataWidthConverter_Batch performs LE packing self.code_gen_dict["$READNPYDATA$"].append( - 
'npy2apintstream<%s, %s, %d, %s>("%s", in0,false);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) - ) - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0 ("in0");'.format(self.get_instream_width()) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) ) def docompute(self): - node = self.onnx_node self.code_gen_dict["$DOCOMPUTE$"] = [ - """{}<{}, {}, {}, {}, {} > (in0, out, 1);""".format( - node.op_type, + """LabelSelect_Batch<{}, {}, {}, {}, {} > (in0_{}, out_{}, 1);""".format( self.get_nodeattr("Labels"), self.get_nodeattr("PE"), self.get_nodeattr("K"), self.get_input_datatype().get_hls_datatype_str(), self.get_output_datatype().get_hls_datatype_str(), + self.hls_sname(), + self.hls_sname(), ) ] - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - oshape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - def blackboxfunction(self): self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0, - hls::stream > &out)""".format( + """void {}(hls::stream> &in0_{}, + hls::stream > &out_{})""".format( 
self.onnx_node.name, self.get_nodeattr("PE"), self.get_input_datatype().bitwidth(), + self.hls_sname(), self.get_output_datatype().bitwidth(), + self.hls_sname(), ) ] - - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" - ) - - def get_exp_cycles(self): - nlabels = self.get_nodeattr("Labels") - pe = self.get_nodeattr("PE") - exp_cycles = nlabels / pe - return int(exp_cycles) diff --git a/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py b/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py new file mode 100644 index 0000000000..ba44deb898 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/lookup_hls.py @@ -0,0 +1,337 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os +from math import ceil, log2 +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.lookup import Lookup +from finn.util.data_packing import ( + npy_to_rtlsim_input, + numpy_to_hls_code, + pack_innermost_dim_as_hex_string, + rtlsim_output_to_npy, +) + + +class Lookup_hls(Lookup, HLSBackend): + "Streaming elementwise HLS lookup, mapping indices to values." 
+ + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(Lookup.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def global_includes(self): + mem_mode = self.get_nodeattr("mem_mode") + global_incls = [] + global_incls.append('#include "lookup.hpp"') + if mem_mode == "internal_embedded": + global_incls.append('#include "embeddings.hpp"') + self.code_gen_dict["$GLOBALS$"] = global_incls + + def defines(self, var): + n_inputs = np.prod(self.get_folded_input_shape()[:-1]) + dtype = self.get_input_datatype() + elem_hls_type = dtype.get_hls_datatype_str() + emb_type = DataType[self.get_nodeattr("EmbeddingType")] + emb_hls_type = emb_type.get_hls_datatype_str() + emb_dim = self.get_nodeattr("EmbeddingDim") + mem_mode = self.get_nodeattr("mem_mode") + my_defines = [] + my_defines.append("#define NumInputs %d" % n_inputs) + if mem_mode == "external": + ext_mem_width = self.get_nodeattr("ext_mem_width") + ext_mem_emb_size = self.get_folded_output_shape()[-2] + ext_mem_emb_align = ceil(log2(ext_mem_emb_size)) + my_defines.append("#define MemBits %d" % ext_mem_width) + my_defines.append("#define EmbeddingSize %d" % ext_mem_emb_size) + my_defines.append("#define EmbeddingAlign %d" % ext_mem_emb_align) + my_defines.append("#define T_SRC %s" % elem_hls_type) + my_defines.append("#define T_DST ap_uint") + elif mem_mode == "internal_embedded": + my_defines.append("#define NumEmbeddings %d" % self.get_nodeattr("NumEmbeddings")) + my_defines.append("#define EmbeddingDim %d" % emb_dim) + my_defines.append("#define InputType %s" % elem_hls_type) + my_defines.append("#define EmbeddingType %s" % emb_hls_type) + self.code_gen_dict["$DEFINES$"] = my_defines + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar 
storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "int64_t" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + oshape = self.get_folded_output_shape() + oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") + + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", %s);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + self.hls_sname(), + oshape_cpp_str, + npy_out, + "false", + ) + ] + + def docompute(self): + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "internal_embedded": + self.code_gen_dict["$DOCOMPUTE$"] = [ + """StreamingLookup(in0_%s, out_%s, embeddings);""" + % (self.hls_sname(), self.hls_sname()) + ] + elif mem_mode == "external": + self.code_gen_dict["$DOCOMPUTE$"] = [ + """StreamingLookup_ext(in0_%s, out_%s, mem, size, oob_count, + oob_irq);""" + % (self.hls_sname(), self.hls_sname()) + ] + + def blackboxfunction(self): + mem_mode = self.get_nodeattr("mem_mode") + ibits = self.get_instream_width() + packed_input_hls_type = "ap_uint<%d>" % ibits + obits = self.get_outstream_width() + packed_output_hls_type = "ap_uint<%d>" % 
obits + if mem_mode == "internal_embedded": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" + % ( + self.onnx_node.name, + packed_input_hls_type, + self.hls_sname(), + packed_output_hls_type, + self.hls_sname(), + ) + ] + elif mem_mode == "external": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + "void " + + self.onnx_node.name + + "(hls::stream &in0_%s, hls::stream &out_%s, " + % (self.hls_sname(), self.hls_sname()) + + "T_DST const *const mem, unsigned const size, " + + "unsigned &oob_count, bool &oob_irq)" + ] + + def pragmas(self): + mem_mode = self.get_nodeattr("mem_mode") + my_pragmas = ["#pragma HLS INTERFACE axis port=in0_" + self.hls_sname()] + my_pragmas.append("#pragma HLS INTERFACE axis port=out_" + self.hls_sname()) + my_pragmas.append("#pragma HLS INTERFACE ap_ctrl_none port=return") + if mem_mode == "internal_embedded": + my_pragmas.append("#pragma HLS BIND_STORAGE variable=embeddings type=ROM_2P impl=BRAM") + elif mem_mode == "external": + my_pragmas.append("#pragma HLS INTERFACE m_axi offset=slave port=mem") + my_pragmas.append("#pragma HLS INTERFACE s_axilite port=mem bundle=control") + my_pragmas.append("#pragma HLS INTERFACE s_axilite port=size bundle=control") + my_pragmas.append("#pragma HLS INTERFACE s_axilite port=oob_count bundle=control") + my_pragmas.append("#pragma HLS INTERFACE ap_none port=oob_irq") + else: + raise Exception("Unrecognized mem_mode: " + mem_mode) + self.code_gen_dict["$PRAGMAS$"] = my_pragmas + + def generate_params(self, model, path): + mem_mode = self.get_nodeattr("mem_mode") + embeddings = model.get_initializer(self.onnx_node.input[1]) + if mem_mode == "internal_embedded": + code_gen_dir = path + weight_filename = "{}/embeddings.hpp".format(code_gen_dir) + edt = DataType[self.get_nodeattr("EmbeddingType")] + # obits = self.get_outstream_width() + # packed_output_hls_type = "ap_uint<%d>" % obits + assert np.vectorize(edt.allowed)( + embeddings + ).all(), 
"Embeddings can't be expressed with type %s" % str(edt) + # reverse innertmost dim in embeddings to remain compatible with + # how we normally encode the data in FINN + embeddings_rev = np.flip(embeddings, -1) + embeddings_hls_code = numpy_to_hls_code(embeddings_rev, edt, "embeddings", True, False) + f_thresh = open(weight_filename, "w") + f_thresh.write(embeddings_hls_code) + f_thresh.close() + elif mem_mode == "external": + edt = DataType[self.get_nodeattr("EmbeddingType")] + ext_mem_width = self.get_nodeattr("ext_mem_width") + assert edt.bitwidth() == 8, ( + "Lookup with mem_mode=external " + + "only works with 8-bit embeddings but found " + + str(edt) + ) + emb_dim = self.get_nodeattr("EmbeddingDim") + # need to zero-pad embeddings in external mode for burst alignment + # compute how much padding we need + emb_elems_per_ext_mem_width = self.get_folded_output_shape()[-1] + ext_mem_emb_size = self.get_folded_output_shape()[-2] + ext_mem_emb_align = ceil(log2(ext_mem_emb_size)) + align_factor = int((ext_mem_width / 8) * 2**ext_mem_emb_align) + pad_amount = align_factor - emb_dim + embeddings_padded = np.pad(embeddings, [(0, 0), (0, pad_amount)]) + # reshape for packing the innermost dim + embeddings_padded = embeddings_padded.reshape(-1, emb_elems_per_ext_mem_width) + weight_filename = "%s/%s.dat" % (path, self.onnx_node.name) + ret = pack_innermost_dim_as_hex_string( + embeddings_padded, edt, ext_mem_width, True, prefix="" + ) + with open(weight_filename, "w") as f: + for current_line in ret: + f.write(current_line + "\n") + else: + raise Exception("Unrecognized mem_mode: " + mem_mode) + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = tuple(self.get_normal_input_shape()) + exp_oshape = tuple(self.get_normal_output_shape()) + folded_ishape = tuple(self.get_folded_input_shape()) + folded_oshape = tuple(self.get_folded_output_shape()) + mem_mode = self.get_nodeattr("mem_mode") + assert ( + 
mem_mode == "internal_embedded" + ), "Only mem_mode=internal_embedded is supported for simulation of Lookup layer" + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert inp.dtype == np.int64, "Inputs must be contained in int64 ndarray" + assert inp.shape == exp_ishape, """Input shape doesn't match expected shape.""" + export_idt = self.get_input_datatype() + odt = self.get_output_datatype() + + reshaped_input = inp.reshape(folded_ishape) + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == folded_oshape + ), "cppsim did not produce expected folded output shape" + context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, + out_npy_path, + odt, + out_shape, + packed_bits, + target_bits, + reverse_inner=True, + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + else: + raise 
Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output shape doesn't match expected shape.""" + + def get_ap_int_max_w(self): + parent_max = super().get_ap_int_max_w() + mem_mode = self.get_nodeattr("mem_mode") + ext_mem_width = self.get_nodeattr("ext_mem_width") + if mem_mode == "external": + return max(ext_mem_width, parent_max) + else: + return parent_max diff --git a/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py new file mode 100644 index 0000000000..94f8cc0845 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/matrixvectoractivation_hls.py @@ -0,0 +1,590 @@ +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import math +import numpy as np +import os +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.matrixvectoractivation import MVAU +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + +# ONNX i/o tensor shape assumptions for MatrixVectorActivation_hls: +# input 0 is the input tensor, shape (.., i_size) = (..., MW) +# input 1 is the weight tensor, shape (i_size, o_size) = (MW, MH) +# (optional) input 2 is the thresholds tensor, shape (o_size, n_thres) +# output 0 is the output tensor, shape (.., o_size) = (..., MH) +# the ... here can be any shape (representing groups of vectors) + + +class MVAU_hls(MVAU, HLSBackend): + """Corresponds to finn-hlslib MatrixVectorActivation_Batch function.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(MVAU.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def lut_estimation(self): + """Calculates resource estimations for LUTs based on: + - FINN-R: An End-to-End Deep-Learning Framework for Fast + Exploration of Quantized Neural Networks + - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien, + Y. Umuroglu, M. Leeser and K. Vissers + - 12. 
Sep 2018 + """ + # TODO add in/out FIFO contributions + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + MW = self.get_nodeattr("MW") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + # determine tdt with input and weight data types + idt = self.get_input_datatype() + A = idt.bitwidth() + # parameters from experiments in paper mentioned above + c0 = 300 + c1 = 1.1 + c2 = 0 + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if (mmode == "internal_decoupled" and mstyle == "distributed") or ( + mmode == "internal_embedded" and self.calc_wmem() <= 128 + ): + c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64) + + # multiplication + res_type = self.get_nodeattr("resType") + if res_type == "dsp": + mult_luts = 0 + else: + mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A) + # adder tree + addertree_luts = (W + A) * (2 * Q - 1) + # accumulator + acc_datatype = self.get_accumulator_datatype() + # if accDataType is not set, then it will default to INT32, which would + # be a large overestimate in most (if not all) cases. 
In this scenario, + # we would use the minimum accumulator as determined by the data types + # bound, derived in https://arxiv.org/abs/2301.13376 + alpha = math.log(MW, 2) + W + A - 1 - int(idt.signed()) + acc_bits = min( + acc_datatype.bitwidth(), + np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1), + ) + acc_luts = acc_bits + # thresholds and threshold comparators + thr_luts = 0 + comp_luts = 0 + noact = self.get_nodeattr("noActivation") + tmem_style = self.get_nodeattr("ram_style_thresholds") + if (noact == 0) and (tmem_style == "distributed"): + odt = self.get_output_datatype() + B = odt.bitwidth() + thr_luts = (2**B - 1) * acc_bits * math.ceil(self.calc_tmem() / 64) + comp_luts = (2**B - 1) * acc_bits + + return int( + c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2 + ) + + def dsp_estimation(self): + # multiplication + P = self.get_nodeattr("PE") + res_type = self.get_nodeattr("resType") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + idt = self.get_input_datatype() + A = idt.bitwidth() + if res_type == "dsp": + mult_dsp = P * Q * np.ceil((W + A) / 48) # TODO: more accurate modelling + else: + mult_dsp = 0 + return int(mult_dsp) + + def get_template_param_values(self): + """Returns the template parameter values according to input, output and weight + data types.""" + ret = dict() + inp_hls_str = self.get_input_datatype().get_hls_datatype_str() + out_hls_str = self.get_output_datatype().get_hls_datatype_str() + inp_is_binary = self.get_input_datatype() == DataType["BINARY"] + # out_is_binary = self.get_output_datatype() == DataType["BINARY"] + wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] + bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 + if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode): + raise Exception("True binary (non-bipolar) inputs not yet supported") + inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] + # out_is_bipolar 
= self.get_output_datatype() == DataType["BIPOLAR"] + wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] + # reinterpret inp/wt as bipolar if bin_xnor_mode is iset + inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) + wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) + # fill in TSrcI and TWeightI + # TODO check these with Giulio + # TODO handle non-bipolar binary inputs + if inp_is_bipolar and wt_is_bipolar: + ret["TSrcI"] = "Recast" + ret["TWeightI"] = "Identity" + elif (not inp_is_bipolar) and wt_is_bipolar: + ret["TSrcI"] = "Slice<%s>" % inp_hls_str + ret["TWeightI"] = "Recast" + elif inp_is_bipolar and (not wt_is_bipolar): + ret["TSrcI"] = "Recast" + ret["TWeightI"] = "Identity" + elif (not inp_is_bipolar) and (not wt_is_bipolar): + ret["TSrcI"] = "Slice<%s>" % inp_hls_str + ret["TWeightI"] = "Identity" + + # fill in TDstI + ret["TDstI"] = "Slice<%s>" % out_hls_str + + return ret + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"'] + self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"'] + + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode not in ["internal_embedded", "internal_decoupled", "external"]: + raise Exception( + """Please set mem_mode to "internal_embedded", "internal_decoupled", or "external", + currently no other parameter value is supported!""" + ) + self.code_gen_dict["$GLOBALS$"] += ['#include "mvau.hpp"'] + if self.calc_tmem() != 0: + # TODO find a better way of checking for no pregenerated thresholds + self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] + + def defines(self, var): + # Only ipgen mode: Make sure that SIMD parameter satisfies minimum requirements. + if var == "ipgen": + SIMD = self.get_nodeattr("SIMD") + MW = self.get_nodeattr("MW") + condition = SIMD >= (MW / 1024) + msg = ( + f"HLS synthesis of MatrixVectorActivation requires: " + f"SIMD >= MW / 1024. 
This is not fulfilled with: SIMD={SIMD} " + f"and MW={MW} for node: {self.onnx_node.name}." + ) + assert condition, msg + mem_mode = self.get_nodeattr("mem_mode") + numInputVectors = list(self.get_nodeattr("numInputVectors")) + numReps = np.prod(numInputVectors) + self.code_gen_dict["$DEFINES$"] = [ + """#define MW1 {}\n #define MH1 {}\n + #define SIMD1 {}\n #define PE1 {}\n #define WMEM1 {}\n + #define TMEM1 {}\n #define numReps {}""".format( + self.get_nodeattr("MW"), + self.get_nodeattr("MH"), + self.get_nodeattr("SIMD"), + self.get_nodeattr("PE"), + self.calc_wmem(), + self.calc_tmem(), + numReps, + ) + ] + if mem_mode == "internal_decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + self.code_gen_dict["$DEFINES$"].append("#define WP1 {}\n".format(wdt.bitwidth())) + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + # note: the innermost dim is reversed for the input + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "internal_decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + elem_bits = wdt.bitwidth() + packed_bits = self.get_weightstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = wdt.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/weights.npy" % code_gen_dir + + self.code_gen_dict["$READNPYDATA$"].append( + 
'npy2apintstream<%s, %s, %d, %s>("%s", weights_%s, false, numReps);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + def strm_decl(self): + mem_mode = self.get_nodeattr("mem_mode") + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + + if mem_mode == "internal_decoupled" or mem_mode == "external": + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> weights_{} ("weights_{}");'.format( + self.get_weightstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + + def docompute(self): + mem_mode = self.get_nodeattr("mem_mode") + map_to_hls_mult_style = { + "auto": "ap_resource_dflt()", + "lut": "ap_resource_lut()", + "dsp": "ap_resource_dsp()", + } + tmpl_args = self.get_template_param_values() + if self.calc_tmem() == 0: + odtype_hls_str = self.get_output_datatype().get_hls_datatype_str() + threshs = "PassThroughActivation<%s>()" % odtype_hls_str + else: + threshs = "threshs" + if mem_mode == "internal_embedded": + self.code_gen_dict["$DOCOMPUTE$"] = [ + """Matrix_Vector_Activate_Batch + (in0_{}, out_{}, weights, {}, numReps, {});""".format( + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + tmpl_args["TWeightI"], + self.hls_sname(), + self.hls_sname(), + threshs, + map_to_hls_mult_style[self.get_nodeattr("resType")], + ) + ] + elif mem_mode == "internal_decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + if wdt == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] + else: + export_wdt = wdt + wdtype_hls_str = export_wdt.get_hls_datatype_str() + self.code_gen_dict["$DOCOMPUTE$"] = [ + """Matrix_Vector_Activate_Stream_Batch + (in0_{}, 
out_{}, weights_{}, {}, numReps, {});""".format( + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + tmpl_args["TWeightI"], + wdtype_hls_str, + self.hls_sname(), + self.hls_sname(), + self.hls_sname(), + threshs, + map_to_hls_mult_style[self.get_nodeattr("resType")], + ) + ] + + else: + raise Exception( + """Please set mem_mode to "internal_embedded", "internal_decoupled", or "external", + currently no other parameter value is supported!""" + ) + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + shape = self.get_folded_output_shape() + shape_cpp_str = str(shape).replace("(", "{").replace(")", "}") + + # note: the innermost dim is not reversed for the output + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + self.hls_sname(), + shape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "internal_embedded": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream> &in0_{}, + hls::stream> &out_{} + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.hls_sname(), + self.get_outstream_width(), + self.hls_sname(), + ) + ] + elif mem_mode == "internal_decoupled" or mem_mode == "external": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}( + hls::stream> &in0_{}, + hls::stream> &weights_{}, + hls::stream> &out_{} + )""".format( + self.onnx_node.name, + self.get_instream_width(), + 
self.hls_sname(), + self.get_weightstream_width(), + self.hls_sname(), + self.get_outstream_width(), + self.hls_sname(), + ) + ] + + else: + raise Exception( + """Please set mem_mode to "internal_embedded" or "internal_decoupled", + currently no other parameter value is supported!""" + ) + + def pragmas(self): + mem_mode = self.get_nodeattr("mem_mode") + ram_style_thresholds = self.get_nodeattr("ram_style_thresholds") + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") + + if mem_mode == "internal_embedded": + self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') + # the weight tensor is ap_uint [PE][WMEM] + # partition for parallel access along the PE dimension (dim 1) + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1") + ) + elif mem_mode == "internal_decoupled" or mem_mode == "external": + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname() + ) + + else: + raise Exception( + """Please set mem_mode to "internal_embedded", "internal_decoupled", or external, + currently no other parameter value is supported!""" + ) + + # the threshold tensor is acc_type [PE][TMEM][N_THRES] + # partition for parallel access along PE and N_THRES + # dimensions (dims 1 and 3) + if self.calc_tmem() != 0: + # TODO find a better way of checking for no pregenerated thresholds + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=1") + ) + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3") + ) + # add resource pragma for thresholds if set + if ram_style_thresholds == "distributed": + 
self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_LUTRAM") + ) + elif ram_style_thresholds == "block": + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_BRAM") + ) + elif ram_style_thresholds == "auto": + # no pragma needed + pass + else: + raise Exception("Unrecognized ram_style_thresholds value:" + ram_style_thresholds) + + def get_ap_int_max_w(self): + # base class impl (max of inp/out stream widths) + max_of_io = super().get_ap_int_max_w() + # internal_decoupled mode weight stream + weightstream = self.get_weightstream_width() + # single PE weight entry + weight_bits = self.get_weight_datatype().bitwidth() + simd = self.get_nodeattr("SIMD") + single_pe_w = simd * weight_bits + return max([weightstream, max_of_io, single_pe_w]) + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + mem_mode = self.get_nodeattr("mem_mode") + node = self.onnx_node + + # TODO ensure codegen dir exists + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + # create a npy file fore each input of the node (in_ind is input index) + in_ind = 0 + for inputs in node.input: + # it is assumed that the first input of the node is the data input + # the second input are the weights + # the third input are the thresholds + if in_ind == 0: + assert ( + str(context[inputs].dtype) == "float32" + ), """Input datatype is + not float32 as expected.""" + expected_inp_shape = self.get_folded_input_shape() + reshaped_input = context[inputs].reshape(expected_inp_shape) + if self.get_input_datatype() == DataType["BIPOLAR"]: + # store bipolar activations as binary + reshaped_input = (reshaped_input + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = self.get_input_datatype() + # make copy before saving the array + reshaped_input = reshaped_input.copy() + np.save( + os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), + reshaped_input, + ) + elif in_ind > 2: + raise Exception("Unexpected input found for MatrixVectorActivation") + in_ind += 1 + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + # reinterpret binary output as bipolar where needed + if self.get_output_datatype() == DataType["BIPOLAR"]: + out = context[node.output[0]] + out = 2 * out - 1 + context[node.output[0]] = out + assert ( + context[node.output[0]].shape == self.get_normal_output_shape() + ), "cppsim did not produce expected output shape" + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) + self.reset_rtlsim(sim) + self.toggle_clk(sim) + if mem_mode == "external" or mem_mode == "internal_decoupled": + wnbits = self.get_weightstream_width() + export_wdt = self.get_weight_datatype() + # we have 
converted bipolar weights to binary for export, + # so use it as such for weight generation + if self.get_weight_datatype() == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] + wei = npy_to_rtlsim_input("{}/weights.npy".format(code_gen_dir), export_wdt, wnbits) + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict = { + "inputs": {"in0": inp, "weights": wei * num_w_reps}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + output = io_dict["outputs"]["out"] + else: + output = self.rtlsim(sim, inp) + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) + + # load and reshape output + output = np.load(out_npy_path) + oshape = self.get_normal_output_shape() + output = np.asarray([output], dtype=np.float32).reshape(*oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + def instantiate_ip(self, cmd): + # instantiate the HLS IP + vlnv = self.get_nodeattr("ip_vlnv") + node_name = self.onnx_node.name + if self.get_nodeattr("mem_mode") == "internal_decoupled": + cmd.append("create_bd_cell -type ip -vlnv %s /%s/%s" % (vlnv, node_name, node_name)) + else: + cmd.append("create_bd_cell -type ip -vlnv %s %s" % (vlnv, node_name)) diff --git a/src/finn/custom_op/fpgadataflow/pool_batch.py b/src/finn/custom_op/fpgadataflow/hls/pool_hls.py similarity index 57% rename from src/finn/custom_op/fpgadataflow/pool_batch.py rename to src/finn/custom_op/fpgadataflow/hls/pool_hls.py index 813f13e504..64c6ec33f8 100644 --- a/src/finn/custom_op/fpgadataflow/pool_batch.py +++ b/src/finn/custom_op/fpgadataflow/hls/pool_hls.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -30,11 +30,12 @@ import os from qonnx.core.datatype import DataType -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.pool import Pool from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy -class Pool_Batch(HLSCustomOp): +class Pool_hls(Pool, HLSBackend): """Class that corresponds to finn-hlslib Pool_batch function. 
Requires ConvolutionInputGenerator(depthwise == 1) to format its input @@ -54,152 +55,11 @@ class Pool_Batch(HLSCustomOp): """ def get_nodeattr_types(self): - my_attrs = { - "Channels": ("i", True, 0), - "PE": ("i", True, 1), - "KernelSize": ("ints", True, []), - # Function: - # - MaxPool - # - QuantAvgPool - # TODO add support for AvgPool and AccPool - "Function": ("s", True, "", {"MaxPool", "QuantAvgPool"}), - "OutImgDims": ("ints", True, []), - # FINN DataTypes for inputs/outputs - "InputDataType": ("s", True, ""), - "OutputDataType": ("s", True, ""), - "AccumBits": ("i", False, 0), - "Size": ("i", False, 1), - "BatchSize": ("i", False, 1), - } - - my_attrs.update(super().get_nodeattr_types()) + my_attrs = {} + my_attrs.update(Pool.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("InputDataType")] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - fxn = self.get_nodeattr("Function") - odt = DataType[self.get_nodeattr("OutputDataType")] - - if fxn == "MaxPool": - # Same as input - idt = DataType[self.get_nodeattr("InputDataType")] - assert odt == idt, "In datatype must be equal to out datatype for Maxpool" - elif fxn == "QuantAvgPool": - idt = DataType[self.get_nodeattr("InputDataType")] - assert ( - idt.signed() == odt.signed() - ), """QuantAvgPool: Can't mix signed - and unsigned datatypes""" - else: - raise Exception("Pool_Batch doesn't currently support " + fxn) - - return odt - - def get_normal_input_shape(self, ind=0): - ifm_ch = self.get_nodeattr("Channels") - odims = self.get_nodeattr("OutImgDims") - batch_size = self.get_nodeattr("BatchSize") - k = self.get_nodeattr("KernelSize") - k_prod = int(np.prod(k)) - ishape = (batch_size, *odims, k_prod * ifm_ch) - return ishape - - def get_folded_input_shape(self, ind=0): - normal_ishape = 
list(self.get_normal_input_shape()) - ifm_ch = self.get_nodeattr("Channels") - pe = self.get_nodeattr("PE") - assert ifm_ch % pe == 0, "PE must divide input channels" - fold = int(normal_ishape[-1] / pe) - folded_ishape = normal_ishape[:-1] + [fold, pe] - return tuple(folded_ishape) - - def get_normal_output_shape(self, ind=0): - ofm_ch = self.get_nodeattr("Channels") - odims = self.get_nodeattr("OutImgDims") - batch_size = self.get_nodeattr("BatchSize") - oshape = (batch_size, *odims, ofm_ch) - return oshape - - def get_folded_output_shape(self, ind=0): - normal_oshape = list(self.get_normal_output_shape()) - ifm_ch = self.get_nodeattr("Channels") - pe = self.get_nodeattr("PE") - assert ifm_ch % pe == 0, "PE must divide input channels" - fold = int(ifm_ch / pe) - folded_oshape = normal_oshape[:-1] + [fold, pe] - return tuple(folded_oshape) - - def get_number_output_values(self): - folded_oshape = self.get_folded_output_shape() - return np.prod(folded_oshape[1:-1]) - - def get_exp_cycles(self): - # (Channels * kernel * kernel) / PE * odim * odim * batch_size - ifm_ch = self.get_nodeattr("Channels") - pe = self.get_nodeattr("PE") - k = self.get_nodeattr("KernelSize") - k_prod = int(np.prod(k)) - odims = self.get_nodeattr("OutImgDims") - batch_size = self.get_nodeattr("BatchSize") - exp_cycles = ((ifm_ch * k_prod) / pe) * np.prod(odims) * batch_size - return int(exp_cycles) - - def get_instream_width(self, ind=0): - dt_bits = self.get_input_datatype().bitwidth() - pe = self.get_nodeattr("PE") - in_width = int(dt_bits * pe) - return in_width - - def get_outstream_width(self, ind=0): - dt_bits = self.get_output_datatype().bitwidth() - pe = self.get_nodeattr("PE") - out_width = int(dt_bits * pe) - return out_width - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - oshape = self.get_normal_output_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == exp_ishape, "Unexpected input shape for 
Pool_Batch." - return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - # data type stays the same - dtype = self.get_output_datatype() - model.set_tensor_datatype(node.output[0], dtype) - - def verify_node(self): - info_messages = [] - # verify that "backend" is set to "fpgadataflow" - backend_value = self.get_nodeattr("backend") - if backend_value == "fpgadataflow": - info_messages.append("Attribute backend is set correctly") - else: - info_messages.append('Attribute backend should be set to "fpgadataflow"') - - # verify the number of inputs - if len(self.onnx_node.input) == 1: - info_messages.append("The number of inputs is correct") - else: - info_messages.append("""Pool_Batch needs 1 data input""") - - # check supported function - fnx = self.get_nodeattr("Function") - if fnx in ["MaxPool", "QuantAvgPool"]: - info_messages.append( - "Attribute Function contains a supported pool function" - ) - else: - info_messages.append( - "Attribute Function contains an unsupported pool function" - ) - return info_messages - def global_includes(self): self.code_gen_dict["$GLOBALS$"] = ['#include "activations.hpp"'] self.code_gen_dict["$GLOBALS$"] += ['#include "maxpool.h"'] @@ -239,17 +99,15 @@ def read_npy_data(self): npy_in = "%s/input_0.npy" % code_gen_dir self.code_gen_dict["$READNPYDATA$"] = [] self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0,false);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) - ) - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0 ("in0");'.format(self.get_instream_width()) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + 
self.hls_sname(), + ) ) def docompute(self): @@ -272,17 +130,15 @@ def docompute(self): else: act_hls_dt = "ap_uint<{}>".format(accum_bits) self.code_gen_dict["$DOCOMPUTE$"] += [ - "QuantAvgPoolFunction<{},{},{}> pool_fxn;".format( - act_hls_dt, o_hls_dt, size - ) + "QuantAvgPoolFunction<{},{},{}> pool_fxn;".format(act_hls_dt, o_hls_dt, size) ] else: raise Exception("Pool_Batch doesn't currently support " + fxn) self.code_gen_dict["$DOCOMPUTE$"] += [ """Pool_batch, Slice< {} > > - (in0,out, pool_fxn, OFMDimTotal*numReps);""".format( - i_hls_dt, o_hls_dt + (in0_{}, out_{}, pool_fxn, OFMDimTotal*numReps);""".format( + i_hls_dt, o_hls_dt, self.hls_sname(), self.hls_sname() ) ] @@ -302,20 +158,18 @@ def dataoutstrm(self): oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s",false);' + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);' % ( packed_hls_type, elem_hls_type, elem_bits, npy_type, + self.hls_sname(), oshape_cpp_str, npy_out, ) ] - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - def blackboxfunction(self): packed_ibits = self.get_instream_width() packed_in_hls_type = "ap_uint<%d>" % packed_ibits @@ -323,20 +177,15 @@ def blackboxfunction(self): packed_obits = self.get_outstream_width() packed_out_hls_type = "ap_uint<%d>" % packed_obits self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)" - % (self.onnx_node.name, packed_in_hls_type, packed_out_hls_type) - ] - - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" + % ( + self.onnx_node.name, + packed_in_hls_type, + self.hls_sname(), + packed_out_hls_type, + self.hls_sname(), + ) ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() - ) - 
self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" - ) def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") diff --git a/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py b/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py new file mode 100644 index 0000000000..d1f58d3e87 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/streamingdatawidthconverter_hls.py @@ -0,0 +1,215 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.streamingdatawidthconverter import ( + StreamingDataWidthConverter, +) +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + +# does not do anything at the ONNX node-by-node level, and input-output +# tensor shapes are the same. performs data width conversion at the rtlsim level + + +class StreamingDataWidthConverter_hls(StreamingDataWidthConverter, HLSBackend): + """Class that corresponds to finn-hlslib StreamingDataWidthConverter_Batch + function.""" + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(StreamingDataWidthConverter.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"'] + + def defines(self, var): + numReps = 1 + numInWords = int(np.prod(self.get_folded_input_shape()[:-1])) + inWidth = self.get_nodeattr("inWidth") + outWidth = self.get_nodeattr("outWidth") + self.code_gen_dict["$DEFINES$"] = [ + "#define InWidth %d " % inWidth, + "#define OutWidth %d " % outWidth, + "#define NumInWords %d " % numInWords, + "#define numReps %d" % numReps, + ] + if self.needs_lcm(): + lcmWidth = self.get_iowidth_lcm() + assert numInWords % (lcmWidth / inWidth) == 0, "Error in DWC LCM calculation" 
+ numLCMToOut = numInWords // (lcmWidth / inWidth) + self.code_gen_dict["$DEFINES$"].append("#define LCMWidth %d" % lcmWidth) + self.code_gen_dict["$DEFINES$"].append("#define NumLCMToOut %d" % (numLCMToOut)) + + def strm_decl(self): + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) + ) + if self.needs_lcm(): + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> intermediate ("intermediate");'.format( + self.get_iowidth_lcm() + ) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + + def docompute(self): + # TODO continue with fxns below, they are copy-pasted + op = "StreamingDataWidthConverter_Batch" + if self.needs_lcm(): + self.code_gen_dict["$DOCOMPUTE$"] = [ + 'hls::stream> intermediate ("intermediate");'.format( + self.get_iowidth_lcm() + ), + "%s(in0_%s, intermediate, numReps);" + % (op, self.hls_sname()), + "%s(intermediate, out_%s, numReps);" + % (op, self.hls_sname()), + ] + else: + self.code_gen_dict["$DOCOMPUTE$"] = [ + "%s(in0_%s, out_%s, numReps);" + % (op, self.hls_sname(), self.hls_sname()) + ] + + def blackboxfunction(self): + in_packed_bits = self.get_instream_width() + in_packed_hls_type = "ap_uint<%d>" % in_packed_bits + out_packed_bits = self.get_outstream_width() + out_packed_hls_type = "ap_uint<%d>" % out_packed_bits + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" + % ( + self.onnx_node.name, + in_packed_hls_type, + self.hls_sname(), + out_packed_hls_type, + self.hls_sname(), + ) + ] + + def pragmas(self): + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis 
port=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") + if self.needs_lcm(): + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS DATAFLOW disable_start_propagation") + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_shape = self.get_normal_input_shape() + folded_ishape = self.get_folded_input_shape() + + # TODO ensure codegen dir exists + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert inp.shape == tuple(exp_shape), "Input shape does not match expected shape." 
+ + if self.get_input_datatype() == DataType["BIPOLAR"]: + # store bipolar activations as binary + inp = (inp + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = self.get_input_datatype() + # reshape input into folded shape + reshaped_input = inp.reshape(folded_ishape) + # make copy before saving array + reshaped_input = reshaped_input.copy() + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + if mode == "cppsim": + output = inp + output = np.asarray([output], dtype=np.float32).reshape(*exp_shape) + context[node.output[0]] = output + + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(exp_shape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to "rtlsim" """.format( + mode + ) + ) + # binary -> bipolar if needed + if self.get_output_datatype() == DataType["BIPOLAR"]: + out = context[node.output[0]] + out = 2 * out - 1 + context[node.output[0]] = out + assert context[node.output[0]].shape == tuple( + exp_shape + ), """Output + shape doesn't match expected shape, should be same as input shape""" diff --git a/src/finn/custom_op/fpgadataflow/eltwise.py b/src/finn/custom_op/fpgadataflow/hls/streamingeltwise_hls.py similarity index 54% rename from src/finn/custom_op/fpgadataflow/eltwise.py rename to src/finn/custom_op/fpgadataflow/hls/streamingeltwise_hls.py index 68ed6546c7..0d618d832a 100644 --- a/src/finn/custom_op/fpgadataflow/eltwise.py +++ b/src/finn/custom_op/fpgadataflow/hls/streamingeltwise_hls.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,112 +28,24 @@ import numpy as np import os -import warnings -from qonnx.core.datatype import DataType -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.streamingeltwise import StreamingEltwise from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy -class StreamingEltwise(HLSCustomOp): +class StreamingEltwise_hls(StreamingEltwise, HLSBackend): """Class that corresponds to finn-hlslib StreamingEltwise function.""" - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): - - my_attrs = super().get_nodeattr_types() - my_attrs.update( - { - "NumChannels": ("i", True, ""), - "PE": ("i", True, ""), - # FINN DataTypes for inputs; output datatype inferred from input - "inputDataType0": ("s", True, ""), - "inputDataType1": ("s", True, 
""), - # type of EltwiseFunction for the operation - "eltwiseOp": ("s", True, "", ["Add", "Sub", "AbsDiff"]), - # number of input vectors, examples: - # [1] is a single vector (like a FC layer with batch=1) - # [4] is four vectors (like a FC layer with batch=4) - # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) - "numInputVectors": ("ints", False, [1]), - "inFIFODepths": ("ints", False, [2, 2]), - } - ) + my_attrs = {} + my_attrs.update(StreamingEltwise.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs - def get_eltwise_op_lambda(self): - eltwise_op = self.get_nodeattr("eltwiseOp") - idt0 = self.get_input_datatype(0) - idt1 = self.get_input_datatype(1) - odt = self.get_output_datatype() - tin0 = idt0.get_hls_datatype_str() - tin1 = idt1.get_hls_datatype_str() - tout = odt.get_hls_datatype_str() - eltwise_ops = { - # "Add": "[](auto a, auto b) { return a + b; }", - # "Sub": "[](auto a, auto b) { return a - b; }", - # "AbsDiff": "[](auto a, auto b) { return a>b? 
a-b : b-a; }", - "Add": f"add<{tin0}, {tin1}, {tout}>()", - "Sub": f"sub<{tin0}, {tin1}, {tout}>()", - "AbsDiff": f"absdiff<{tin0}, {tin1}, {tout}>()", - } - return eltwise_ops[eltwise_op] - - def get_normal_input_shape(self, ind=0): - ich = self.get_nodeattr("NumChannels") - vecs = list(self.get_nodeattr("numInputVectors")) - ishape = tuple(vecs + [ich]) - return ishape - - def get_folded_input_shape(self, ind=0): - ich = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - assert ich % pe == 0, "PE must divide NumChannels" - vecs = list(self.get_nodeattr("numInputVectors")) - ishape = tuple(vecs + [ich // pe, pe]) - return ishape - - def get_normal_output_shape(self, ind=0): - return self.get_normal_input_shape() - - def get_folded_output_shape(self, ind=0): - return self.get_folded_input_shape() - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - oshape = self.get_normal_output_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == exp_ishape, "Unexpected input1 shape." - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[1])) - assert ishape == exp_ishape, "Unexpected input2 shape." 
- return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - idt0 = model.get_tensor_datatype(node.input[0]) - if idt0 != self.get_input_datatype(0): - warn_str = "inputDataType0 changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype(0)), - str(idt0), - ) - warnings.warn(warn_str) - self.set_nodeattr("inputDataType0", idt0.name) - idt1 = model.get_tensor_datatype(node.input[1]) - if idt1 != self.get_input_datatype(1): - warn_str = "inputDataType1 changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype(1)), - str(idt1), - ) - warnings.warn(warn_str) - self.set_nodeattr("inputDataType1", idt1.name) - # enforce output data type (calculated based on idt) - odt = self.get_output_datatype() - model.set_tensor_datatype(self.onnx_node.output[0], odt) - def verify_node(self): info_messages = [] # verify that "backend" is set to "fpgadataflow" @@ -154,66 +66,10 @@ def verify_node(self): self.get_nodeattr("eltwiseOp") info_messages.append("All necessary attributes exist") except Exception: - info_messages.append( - """The required StreamingEltwise attributes do not exist.""" - ) + info_messages.append("""The required StreamingEltwise attributes do not exist.""") return info_messages - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("inputDataType" + str(ind))] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - op = self.get_nodeattr("eltwiseOp") - idt0 = self.get_input_datatype(0) - idt1 = self.get_input_datatype(1) - assert idt0.signed() == idt1.signed(), ( - "%s: Inputs must have same signedness" % self.onnx_node.name - ) - idt0_min, idt0_max = idt0.min(), idt0.max() - idt1_min, idt1_max = idt1.min(), idt1.max() - cands = [ - idt0_min - idt1_min, - idt0_min - idt1_max, - idt0_max - idt1_min, - idt0_max - idt1_max, - ] - largest_magnitude = max(map(abs, cands)) - if op == "Add": - if 
idt0.signed(): - return DataType.get_smallest_possible(idt0.min() + idt1.min()) - else: - return DataType.get_smallest_possible(idt0.max() + idt1.max()) - elif op == "Sub": - return DataType.get_smallest_possible(-largest_magnitude) - elif op == "AbsDiff": - return DataType.get_smallest_possible(largest_magnitude) - else: - raise Exception("%s: Unknown eltWiseOp = %s" % (self.onnx_node.name, op)) - - def get_instream_width(self, ind=0): - """Returns input stream width.""" - ibits = self.get_input_datatype(ind).bitwidth() - pe = self.get_nodeattr("PE") - in_width = pe * ibits - return in_width - - def get_outstream_width(self, ind=0): - """Returns output stream width.""" - obits = self.get_output_datatype().bitwidth() - pe = self.get_nodeattr("PE") - out_width = pe * obits - return out_width - - def get_number_output_values(self): - return np.prod(self.get_folded_output_shape()[:-1]) - - def get_exp_cycles(self): - # Channels/PE * batch size * fmdim * fmdim - return np.prod(self.get_folded_output_shape()[:-1]) - def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") node = self.onnx_node @@ -235,9 +91,7 @@ def execute_node(self, context, graph): inp = context[node.input[0]] assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert ( - inp.shape == exp_ishape - ), """Input0 shape doesn't match expected shape .""" + assert inp.shape == exp_ishape, """Input0 shape doesn't match expected shape .""" export_idt0 = self.get_input_datatype(0) # reshape input into folded form inp = inp.reshape(folded_ishape) @@ -248,9 +102,7 @@ def execute_node(self, context, graph): # exact same thing for input1 inp = context[node.input[1]] assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert ( - inp.shape == exp_ishape - ), """Input1 shape doesn't match expected shape .""" + assert inp.shape == exp_ishape, """Input1 shape doesn't match expected shape .""" export_idt1 = self.get_input_datatype(1) # reshape input into folded 
form inp = inp.reshape(folded_ishape) @@ -354,25 +206,45 @@ def read_npy_data(self): self.code_gen_dict["$READNPYDATA$"] = [] npy_in = "%s/input_0.npy" % code_gen_dir self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0);' - % (packed_hls_type_0, elem_hls_type_0, elem_bits_0, npy_type, npy_in) + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' + % ( + packed_hls_type_0, + elem_hls_type_0, + elem_bits_0, + npy_type, + npy_in, + self.hls_sname(), + ) ) npy_in = "%s/input_1.npy" % code_gen_dir self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in1);' - % (packed_hls_type_1, elem_hls_type_1, elem_bits_1, npy_type, npy_in) + 'npy2apintstream<%s, %s, %d, %s>("%s", in1_%s);' + % ( + packed_hls_type_1, + elem_hls_type_1, + elem_bits_1, + npy_type, + npy_in, + self.hls_sname(), + ) ) def strm_decl(self): self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0 ("in0");'.format(self.get_instream_width(0)) + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(0), self.hls_sname(), self.hls_sname() + ) ) self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in1 ("in1");'.format(self.get_instream_width(1)) + 'hls::stream> in1_{} ("in1_{}");'.format( + self.get_instream_width(1), self.hls_sname(), self.hls_sname() + ) ) self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) ) def docompute(self): @@ -394,7 +266,7 @@ def docompute(self): out_hls_type, ) self.code_gen_dict["$DOCOMPUTE$"] = [ - """{}<{}, {}, {}, {}, {}, {}>(in0, in1, out, {});""".format( + """{}<{}, {}, {}, {}, {}, {}>(in0_{}, in1_{}, out_{}, {});""".format( "StreamingEltwise", self.get_nodeattr("NumChannels"), self.get_nodeattr("PE"), @@ -402,65 +274,35 @@ def docompute(self): 
slice_in0, slice_in1, slice_out, + self.hls_sname(), + self.hls_sname(), + self.hls_sname(), eltwise_op_str, ) ] - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - oshape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - def blackboxfunction(self): self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0, hls::stream> &in1, - hls::stream> &out)""".format( + """void {}(hls::stream> &in0_{}, hls::stream> &in1_{}, + hls::stream> &out_{})""".format( self.onnx_node.name, self.get_nodeattr("PE") * self.get_input_datatype(0).bitwidth(), + self.hls_sname(), self.get_nodeattr("PE") * self.get_input_datatype(1).bitwidth(), + self.hls_sname(), self.get_nodeattr("PE") * self.get_output_datatype().bitwidth(), + self.hls_sname(), ) ] def pragmas(self): self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() ] self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=in1 name=in1_" + self.hls_sname() + "#pragma HLS INTERFACE axis port=in1_" + self.hls_sname() ) self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" - 
) - - def get_verilog_top_module_intf_names(self): - intf_names = super().get_verilog_top_module_intf_names() - sname = self.hls_sname() - swidth = self.get_instream_width_padded() - intf_names["s_axis"] = [(x + "_" + sname, swidth) for x in ["in0", "in1"]] - return intf_names + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") diff --git a/src/finn/custom_op/fpgadataflow/hls/streamingmaxpool_hls.py b/src/finn/custom_op/fpgadataflow/hls/streamingmaxpool_hls.py new file mode 100755 index 0000000000..69db7b4606 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/streamingmaxpool_hls.py @@ -0,0 +1,222 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os +from qonnx.core.datatype import DataType +from qonnx.custom_op.general.maxpoolnhwc import compute_pool_output_dim + +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.streamingmaxpool import StreamingMaxPool +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class StreamingMaxPool_hls(StreamingMaxPool, HLSBackend): + """Class that corresponds to finn-hlslib StreamingMaxPool_batch function.""" + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(StreamingMaxPool.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def verify_node(self): + info_messages = [] + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify the number of inputs + if len(self.onnx_node.input) == 1: + info_messages.append("The number of inputs is correct") + else: + info_messages.append("""StreamingMaxPool_Batch needs 1 data input""") + + return info_messages + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "maxpool.h"'] + + def defines(self, var): + numReps = 1 + ifm_dim, k, ifm_ch = self.get_1d_attrs_normalized() + ceil_mode = 
self.get_nodeattr("CeilMode") + output_size = compute_pool_output_dim(ifm_dim[1], k[1], k[1], 0, ceil_mode) + + if self.is_1d(): + self.code_gen_dict["$DEFINES$"] = [ + """#define ImgDim {}\n #define PoolDim {}\n + #define NumChannels {}\n #define PE {}\n #define OutputSize {} + \n #define numReps {}""".format( + ifm_dim[1], + k[1], + self.get_nodeattr("NumChannels"), + self.get_nodeattr("PE"), + output_size, + numReps, + ) + ] + else: + self.code_gen_dict["$DEFINES$"] = [ + """#define ImgDim {}\n #define PoolDim {}\n + #define NumChannels {}\n #define numReps {}""".format( + ifm_dim[1], + k[1], + self.get_nodeattr("NumChannels"), + numReps, + ) + ] + + def docompute(self): + dtype = self.get_input_datatype() + if dtype.bitwidth() == 1: + if self.is_1d(): + raise Exception("Binary 1d MaxPool not implemented on HLS backend") + else: + op = "StreamingMaxPool" + self.code_gen_dict["$DOCOMPUTE$"] = [ + "%s(in0_%s, out_%s);" + % (op, self.hls_sname(), self.hls_sname()) + ] + else: + dtype = self.get_input_datatype() + dtype_hls = dtype.get_hls_datatype_str() + minval_str = str(int(dtype.min())) + if self.is_1d(): + op = "StreamingMaxPool_Precision_1d" + self.code_gen_dict["$DOCOMPUTE$"] = [ + """%s(in0_%s, out_%s);""" + % (op, dtype_hls, minval_str, self.hls_sname(), self.hls_sname()) + ] + else: + op = "StreamingMaxPool_Precision" + self.code_gen_dict["$DOCOMPUTE$"] = [ + "%s(in0_%s, out_%s);" + % (op, dtype_hls, minval_str, self.hls_sname(), self.hls_sname()) + ] + + def blackboxfunction(self): + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" + % ( + self.onnx_node.name, + packed_hls_type, + self.hls_sname(), + packed_hls_type, + self.hls_sname(), + ) + ] + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape 
= self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + + # TODO ensure codegen dir exists + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input shape doesn't + match expected shape (1, ifm_dim, ifm_dim, ifm_ch).""" + if self.get_input_datatype() == DataType["BIPOLAR"]: + # store bipolar activations as binary + inp = (inp + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = self.get_input_datatype() + + reshaped_input = inp.reshape(folded_ishape) + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == exp_oshape + ), "cppsim \ + did not produce expected output shape" + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], 
dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + # binary -> bipolar if needed + if self.get_output_datatype() == DataType["BIPOLAR"]: + out = context[node.output[0]] + out = 2 * out - 1 + context[node.output[0]] = out + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output + shape doesn't match expected shape (1, ofm_dim, ofm_dim, ifm_ch).""" diff --git a/src/finn/custom_op/fpgadataflow/thresholding_batch.py b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py similarity index 64% rename from src/finn/custom_op/fpgadataflow/thresholding_batch.py rename to src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py index d9745acf63..b753bc7a03 100644 --- a/src/finn/custom_op/fpgadataflow/thresholding_batch.py +++ b/src/finn/custom_op/fpgadataflow/hls/thresholding_hls.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -29,15 +29,12 @@ import numpy as np import os import textwrap -import warnings from math import ceil, log2 from qonnx.core.datatype import DataType -from qonnx.util.basic import ( - interleave_matrix_outer_dim_from_partitions, - roundup_to_integer_multiple, -) +from qonnx.util.basic import roundup_to_integer_multiple -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.thresholding import Thresholding from finn.util.data_packing import ( npy_to_rtlsim_input, numpy_to_hls_code, @@ -45,8 +42,6 @@ rtlsim_output_to_npy, ) -from . 
import templates - # ONNX i/o tensor shape assumptions for Thresholding: # input 0 is the input tensor, shape (..., NumChannels) # input 1 is the threshold tensor, shape (NumChannels, n_thres) @@ -54,39 +49,26 @@ # the ... here can be any shape (representing groups of vectors) -class Thresholding_Batch(HLSCustomOp): +class Thresholding_hls(Thresholding, HLSBackend): """Class that corresponds to finn-hls Thresholding_Batch function.""" - def __init__(self, onnx_node): - super().__init__(onnx_node) - self.decoupled_wrapper = templates.decoupled_wrapper + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): my_attrs = { - # parallelization; channels thresholded per cycle - "PE": ("i", True, 0), - # number of channels (each may have different thresholds) - "NumChannels": ("i", True, 0), - # number of steps in thresholding function - "numSteps": ("i", True, 1), + # memory mode for the thresholds + # internal_embedded -- embedded thresholds + # internal_decoupled -- default, streaming thresholds with streamer packaged inside IP + "mem_mode": ( + "s", + False, + "internal_decoupled", + {"internal_embedded", "internal_decoupled"}, + ), # string defining memory type "ram_style": ("s", False, "distributed", {"distributed", "block"}), - # FINN DataTypes for inputs, outputs - "inputDataType": ("s", True, ""), - "weightDataType": ("s", True, ""), - "outputDataType": ("s", True, ""), - # number of input vectors, examples: - # [1] is a single vector (like a FC layer with batch=1) - # [4] is four vectors (like a FC layer with batch=4) - # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) - "numInputVectors": ("ints", False, [1]), - # initialization value for the thresholding accumulator - "ActVal": ("i", False, 0), - # memory mode for the thresholds - # const -- embedded thresholds, default - # decoupled -- streaming thresholds with streamer packaged inside IP - "mem_mode": ("s", False, "const", {"const", 
"decoupled"}), - # (mem_mode = decoupled only) whether weights (thresholds) will be + # (mem_mode = internal_decoupled only) whether weights (thresholds) will be # writable through an AXI-lite interface during runtime # 1 for enabled, 0 for disabled. # see finn-rtllib/memstream/doc/README for more about the memory @@ -97,60 +79,10 @@ def get_nodeattr_types(self): # weight data from the weight FIFOs. "runtime_writeable_weights": ("i", False, 0, {0, 1}), } - my_attrs.update(super().get_nodeattr_types()) + my_attrs.update(Thresholding.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs - def calc_tmem(self): - """Calculates and returns TMEM.""" - mh = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - return mh // pe - - def make_shape_compatible_op(self, model): - oshape = self.get_normal_output_shape() - return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - idt = model.get_tensor_datatype(node.input[0]) - if idt != self.get_input_datatype(): - warn_str = "inputDataType changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype().name), - str(idt.name), - ) - warnings.warn(warn_str) - self.set_nodeattr("inputDataType", idt.name) - # set output datatype from property - odt = self.get_output_datatype() - model.set_tensor_datatype(node.output[0], odt) - - def verify_node(self): - info_messages = [] - # verify that "backend" is set to "fpgadataflow" - backend_value = self.get_nodeattr("backend") - if backend_value == "fpgadataflow": - info_messages.append("Attribute backend is set correctly") - else: - info_messages.append('Attribute backend should be set to "fpgadataflow"') - - # verify that all necessary attributes exist - # TODO collect automatically from get_nodeattr_types - try: - self.get_nodeattr("code_gen_dir_cppsim") - self.get_nodeattr("executable_path") - self.get_nodeattr("NumChannels") - self.get_nodeattr("PE") - 
self.get_nodeattr("inputDataType") - self.get_nodeattr("outputDataType") - info_messages.append("All necessary attributes exist") - except Exception: - info_messages.append( - """The required Threshold_Batch attributes do not exist.""" - ) - - return info_messages - def bram_estimation(self): """Calculates BRAM cost if resource set to BRAM""" style = self.get_nodeattr("ram_style") @@ -182,53 +114,9 @@ def lut_estimation(self): # total cost return comparator_cost + lutram_cost - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("inputDataType")] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - return DataType[self.get_nodeattr("outputDataType")] - - def get_weight_datatype(self): - """Returns FINN DataType of thresholds, here called weights.""" - return DataType[self.get_nodeattr("weightDataType")] - - def minimize_accumulator_width(self, model): - "Minimize threshold width ('accumulator width' here due to convention)" - thresholds = model.get_initializer(self.onnx_node.input[1]) - threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) - min_threshold = thresholds.min() - max_threshold = thresholds.max() - min_input = self.get_input_datatype().min() - max_input = self.get_input_datatype().max() - # get range required by threshold values - tdt_min = min(min_input, min_threshold) - tdt_max = max(max_input, max_threshold) - if tdt_min < 0: - if abs(tdt_min) > tdt_max: - tdt = DataType.get_smallest_possible(tdt_min) - else: - tdt = DataType.get_smallest_possible(-tdt_max - 1) - else: - tdt = DataType.get_smallest_possible(tdt_max) - assert np.vectorize(tdt.allowed)( - threshold_tensor - ).all(), "Thresholds can't be expressed with type %s" % str(tdt) - self.set_nodeattr("weightDataType", tdt.name) - return DataType[self.get_nodeattr("weightDataType")] - - def get_instream_width(self, ind=0): - i_bits = self.get_input_datatype().bitwidth() - return i_bits * 
self.get_nodeattr("PE") - - def get_outstream_width(self, ind=0): - o_bits = self.get_output_datatype().bitwidth() - return o_bits * self.get_nodeattr("PE") - def get_weightstream_width(self): - """Returns weight stream width. Used only in decoupled mode.""" - if self.get_nodeattr("mem_mode") == "decoupled": + """Returns weight stream width. Used only in internal_decoupled mode.""" + if self.get_nodeattr("mem_mode") == "internal_decoupled": pe = self.get_nodeattr("PE") wp = self.get_weight_datatype().bitwidth() n_thres_steps = self.get_nodeattr("numSteps") @@ -239,44 +127,16 @@ def get_weightstream_width(self): def get_weightstream_width_padded(self): """Returns weight stream width padded to a multiple of 8. This is required - by the AXI Stream spec. Used in decoupled mode.""" + by the AXI Stream spec. Used in internal_decoupled mode.""" weight_width = self.get_weightstream_width() return roundup_to_integer_multiple(weight_width, 8) def get_ap_int_max_w(self): - temp_value = super().get_ap_int_max_w() - weightstream = self.get_weightstream_width() - return max([weightstream, temp_value]) - - def get_folded_input_shape(self, ind=0): - ich = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - fold = ich // pe - vecs = list(self.get_nodeattr("numInputVectors")) - folded_input_shape = tuple(vecs + [fold, pe]) - return folded_input_shape - - def get_folded_output_shape(self, ind=0): - # same shape as input - return self.get_folded_input_shape() - - def get_normal_input_shape(self, ind=0): - ich = self.get_nodeattr("NumChannels") - vecs = list(self.get_nodeattr("numInputVectors")) - normal_input_shape = tuple(vecs + [ich]) - return normal_input_shape - - def get_normal_output_shape(self, ind=0): - # same shape as input - return self.get_normal_input_shape() - - def get_number_output_values(self): - nf = np.prod(self.get_folded_output_shape()[:-1]) - return nf - - def get_exp_cycles(self): - # Channels/PE * batch size * fmdim * fmdim - return 
np.prod(self.get_folded_output_shape()[:-1]) + ap_int_max_w = HLSBackend.get_ap_int_max_w(self) + if self.get_nodeattr("mem_mode") == "internal_decoupled": + weightstream = self.get_weightstream_width() + ap_int_max_w = max([weightstream, ap_int_max_w]) + return ap_int_max_w def get_template_param_values(self): """Returns the template parameter values according to input, output and weight @@ -291,63 +151,6 @@ def get_template_param_values(self): return ret - def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): - """Convert the original numpy weight matrix orig_weight_matrix into - a form suitable for passing to the hlslib call: - * ensure MH % PE == 0 - * for unsigned inputs, ensure thresholds are positive - * interleave rows between PEs - * reshape into (PE, TMEM, n_thres_steps) and return - """ - mh = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - tmem = mh // pe - assert mh % pe == 0, "Requirement NumChannels divisable by PE is violated." - assert ( - orig_thres_matrix.ndim == 2 - ), """Threshold matrix dimension is - not as expected (2).""" - n_thres_steps = orig_thres_matrix.shape[1] - assert n_thres_steps == self.get_nodeattr( - "numSteps" - ), "Mismatch in threshold steps" - if not self.get_input_datatype().signed(): - # ensure all thresholds are nonnegative - assert (orig_thres_matrix >= 0).all() - # ensure all thresholds are integer - assert np.equal( - np.mod(orig_thres_matrix, 1), 0 - ).all(), "Need int threshold tensor" - ret = orig_thres_matrix - # workaround for vivado_hls threshold bug - if ret[0][0] == 0 and n_thres_steps == 1: - ret = np.copy(ret) - ret[0][0] = 1 - warnings.warn( - "Setting 0-valued first threshold to 1 to avoid vivado_hls bug" - ) - # ensure channels = mh , duplicating if necessary - if ret.shape[0] == 1: - ret = np.tile(ret, (mh, 1)) - assert ( - ret.shape[0] == mh - ), "Channels of threshold matrix are not as expected (mh)" - # distribute rows between PEs - ret = 
interleave_matrix_outer_dim_from_partitions(ret, pe) - assert ( - ret.shape[0] == pe - ), """First dimension after distribution of the - rows between PEs is not as expected (pe)""" - assert ( - ret.shape[1] == tmem - ), """Second dimension after distribution of the - rows between PEs is not as expected (tmem)""" - assert ( - ret.shape[2] == n_thres_steps - ), """Third dimension after distribution of the - rows between PEs is not as expected (n_thres_steps)""" - return ret.reshape(1, pe, tmem, n_thres_steps) - def make_weight_file(self, weights, weight_file_mode, weight_file_name): """Produce a file containing given weights (thresholds) in appropriate format for this layer. This file can be used for either synthesis or @@ -361,7 +164,7 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): * weight_file_name : filename for the weight file to be generated """ - threshold_tensor = self.get_hls_compatible_threshold_tensor(weights) + threshold_tensor = self.get_hw_compatible_threshold_tensor(weights) tdt = self.get_weight_datatype() assert np.vectorize(tdt.allowed)( threshold_tensor @@ -455,36 +258,18 @@ def generate_params(self, model, path): code_gen_dir = path thresholds = model.get_initializer(self.onnx_node.input[1]) mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "const": + if mem_mode == "internal_embedded": # save thresholds in thresh.h weight_filename = "{}/thresh.h".format(code_gen_dir) self.make_weight_file(thresholds, "hls_header", weight_filename) - elif mem_mode == "decoupled": - # save decoupled weights for cppsim + elif mem_mode == "internal_decoupled": + # save internal_decoupled weights for cppsim weight_filename_sim = "{}/thresholds.npy".format(code_gen_dir) self.make_weight_file(thresholds, "decoupled_npy", weight_filename_sim) # also save weights as Verilog .dat file - # note that we provide two different .dat files, one for synth - # and one for synthesis. 
this is because URAM-based weights always - # need zero weights for synthesis, otherwise they get inferred - # as BRAM - weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format(code_gen_dir) - weight_filename_rtl_sim = "{}/memblock_sim_0.dat".format(code_gen_dir) - # sim weights are always the true weights - self.make_weight_file( - thresholds, "decoupled_verilog_dat", weight_filename_rtl_sim - ) - ram_style = self.get_nodeattr("ram_style") - if ram_style == "ultra": - # UltraRAM must have no memory initializer, or only zeroes - # otherwise BRAM will be inferred instead of URAM - # as a workaround we provide a zero-weight init here - synth_thresholds = np.zeros_like(thresholds, dtype=np.float32) - else: - synth_thresholds = thresholds - self.make_weight_file( - synth_thresholds, "decoupled_verilog_dat", weight_filename_rtl_synth - ) + # This file will be ignored when synthesizing UltraScale memory. + weight_filename_rtl = "{}/memblock.dat".format(code_gen_dir) + self.make_weight_file(thresholds, "decoupled_verilog_dat", weight_filename_rtl) else: raise Exception("Unrecognized mem_mode") @@ -545,18 +330,14 @@ def execute_node(self, context, graph): out = 2 * out - 1 context[node.output[0]] = out oshape = self.get_normal_output_shape() - assert ( - context[node.output[0]].shape == oshape - ), """Output shape is not as expected""" + assert context[node.output[0]].shape == oshape, """Output shape is not as expected""" elif mode == "rtlsim": sim = self.get_rtlsim() nbits = self.get_instream_width() - inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) + inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) super().reset_rtlsim(sim) super().toggle_clk(sim) - if self.get_nodeattr("mem_mode") == "decoupled": + if self.get_nodeattr("mem_mode") == "internal_decoupled": wnbits = self.get_weightstream_width() export_wdt = self.get_weight_datatype() wei = npy_to_rtlsim_input( @@ -569,7 +350,7 @@ def 
execute_node(self, context, graph): } self.rtlsim_multi_io(sim, io_dict) output = io_dict["outputs"]["out"] - elif self.get_nodeattr("mem_mode") == "const": + elif self.get_nodeattr("mem_mode") == "internal_embedded": output = self.rtlsim(sim, inp) else: raise Exception("Unrecognized mem_mode") @@ -578,9 +359,7 @@ def execute_node(self, context, graph): packed_bits = self.get_outstream_width() out_npy_path = "{}/output.npy".format(code_gen_dir) out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) + rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) # load and reshape output output = np.load(out_npy_path) @@ -597,7 +376,7 @@ def execute_node(self, context, graph): def global_includes(self): self.code_gen_dict["$GLOBALS$"] = ['#include "activations.hpp"'] - if self.get_nodeattr("mem_mode") == "const": + if self.get_nodeattr("mem_mode") == "internal_embedded": self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] # TODO check and add whatever missing @@ -615,13 +394,12 @@ def defines(self, var): total_spatial_size, ) ] - if self.get_nodeattr("mem_mode") == "decoupled": + if self.get_nodeattr("mem_mode") == "internal_decoupled": self.code_gen_dict["$DEFINES$"].append( "#define ActVal1 %d" % self.get_nodeattr("ActVal") ) self.code_gen_dict["$DEFINES$"].append( - "#define ThresType1 %s" - % self.get_weight_datatype().get_hls_datatype_str() + "#define ThresType1 %s" % self.get_weight_datatype().get_hls_datatype_str() ) self.code_gen_dict["$DEFINES$"].append( "#define NumSteps1 %d" % self.get_nodeattr("numSteps") @@ -639,11 +417,18 @@ def read_npy_data(self): self.code_gen_dict["$READNPYDATA$"] = [] # note: the innermost dim is reversed for the input self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0, false);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + 'npy2apintstream<%s, %s, %d, %s>("%s", 
in0_%s, false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) ) mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "decoupled": + if mem_mode == "internal_decoupled": tdt = self.get_weight_datatype() elem_bits = tdt.bitwidth() packed_bits = self.get_weightstream_width() @@ -653,49 +438,63 @@ def read_npy_data(self): npy_in = "%s/thresholds.npy" % code_gen_dir self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", weights, false, ImgDim1);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) + 'npy2apintstream<%s, %s, %d, %s>("%s", weights_%s, false, ImgDim1);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) ) def strm_decl(self): self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0 ("in0");'.format(self.get_instream_width()) + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) ) self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) ) mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "decoupled": + if mem_mode == "internal_decoupled": self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> weights ("weights");'.format( - self.get_weightstream_width() + 'hls::stream> weights_{} ("weights_{}");'.format( + self.get_weightstream_width(), self.hls_sname(), self.hls_sname() ) ) def docompute(self): tmpl_args = self.get_template_param_values() - node = self.onnx_node mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "const": + if mem_mode == "internal_embedded": self.code_gen_dict["$DOCOMPUTE$"] = [ - """{} - (in0, out, threshs, numReps);""".format( - node.op_type, + """Thresholding_Batch + 
(in0_{}, out_{}, threshs, numReps);""".format( tmpl_args["TSrcI"], tmpl_args["TDstI"], + self.hls_sname(), + self.hls_sname(), ) ] - elif mem_mode == "decoupled": + elif mem_mode == "internal_decoupled": # note that numReps is set to 1 in the invocation below, since # - for cppsim the repetition comes from the threshold stream reader+input # - for synth the unit runs continuously anyway (ap_ctrl_none) self.code_gen_dict["$DOCOMPUTE$"] = [ """{} - (in0, out, weights, numReps);""".format( + (in0_{}, out_{}, weights_{}, numReps);""".format( "Thresholding_Stream_Batch", tmpl_args["TSrcI"], tmpl_args["TDstI"], + self.hls_sname(), + self.hls_sname(), + self.hls_sname(), ) ] else: @@ -718,41 +517,44 @@ def dataoutstrm(self): # note: the innermost dim is not reversed for the output self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s", false);' + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);' % ( packed_hls_type, elem_hls_type, elem_bits, npy_type, + self.hls_sname(), shape_cpp_str, npy_out, ) ] - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - def blackboxfunction(self): - if self.get_nodeattr("mem_mode") == "const": + if self.get_nodeattr("mem_mode") == "internal_embedded": self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0, - hls::stream> &out + """void {}(hls::stream> &in0_{}, + hls::stream> &out_{} )""".format( self.onnx_node.name, self.get_instream_width(), + self.hls_sname(), self.get_outstream_width(), + self.hls_sname(), ) ] - elif self.get_nodeattr("mem_mode") == "decoupled": + elif self.get_nodeattr("mem_mode") == "internal_decoupled": self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0, - hls::stream> &weights, - hls::stream> &out + """void {}(hls::stream> &in0_{}, + hls::stream> &weights_{}, + hls::stream> &out_{} )""".format( self.onnx_node.name, self.get_instream_width(), + self.hls_sname(), self.get_weightstream_width(), + self.hls_sname(), 
self.get_outstream_width(), + self.hls_sname(), ) ] else: @@ -760,30 +562,22 @@ def blackboxfunction(self): def pragmas(self): self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() ] self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") - if self.get_nodeattr("mem_mode") == "const": + if self.get_nodeattr("mem_mode") == "internal_embedded": # the threshold tensor is acc_type [PE][TMEM][N_THRES] # partition for parallel access along PE and N_THRES # dimensions (dims 1 and 3) self.code_gen_dict["$PRAGMAS$"].append( - ( - "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " - "complete dim=1" - ) + ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=1") ) self.code_gen_dict["$PRAGMAS$"].append( - ( - "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " - "complete dim=3" - ) + ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3") ) # set resource type ram_style = self.get_nodeattr("ram_style") @@ -794,17 +588,11 @@ def pragmas(self): if pe < ich: if ram_style == "distributed": self.code_gen_dict["$PRAGMAS$"].append( - ( - "#pragma HLS RESOURCE variable=threshs.m_thresholds " - "core=ROM_2P_LUTRAM" - ) + ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_LUTRAM") ) elif ram_style == "block": self.code_gen_dict["$PRAGMAS$"].append( - ( - "#pragma HLS RESOURCE variable=threshs.m_thresholds " - "core=ROM_2P_BRAM" - ) + ("#pragma HLS RESOURCE variable=threshs.m_thresholds " "core=ROM_2P_BRAM") ) else: raise Exception( @@ -813,17 +601,16 @@ def pragmas(self): ram_style ) ) - elif 
self.get_nodeattr("mem_mode") == "decoupled": + elif self.get_nodeattr("mem_mode") == "internal_decoupled": self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=weights name=weights_" - + self.hls_sname() + "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname() ) def code_generation_ipi(self): cmd = [] # add streamer if needed mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "decoupled": + if mem_mode == "internal_decoupled": node_name = self.onnx_node.name runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 sname = self.hls_sname() @@ -837,8 +624,7 @@ def code_generation_ipi(self): cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) cmd.append( "create_bd_intf_pin -mode Master " - "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" - % (node_name, dout_name) + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, dout_name) ) cmd.append( "create_bd_intf_pin -mode Slave " @@ -850,30 +636,23 @@ def code_generation_ipi(self): % (self.get_nodeattr("ip_vlnv"), node_name, node_name) ) # instantiate a streamer and connect it to the HLS IP - strm_vlnv = "xilinx.com:user:memstream:1.0" + strm_vlnv = "amd.com:finn:memstream:1.0" strm_inst = node_name + "_wstrm" cmd.append( - "create_bd_cell -type ip -vlnv %s /%s/%s" - % (strm_vlnv, node_name, strm_inst) + "create_bd_cell -type ip -vlnv %s /%s/%s" % (strm_vlnv, node_name, strm_inst) ) cmd.append( "set_property -dict [list " - "CONFIG.NSTREAMS {1} " - "CONFIG.MEM_DEPTH {%d} " - "CONFIG.MEM_WIDTH {%d} " - "CONFIG.MEM_INIT {%s} " + "CONFIG.DEPTH {%d} " + "CONFIG.WIDTH {%d} " + "CONFIG.INIT_FILE {%s} " "CONFIG.RAM_STYLE {%s} " - "CONFIG.STRM0_DEPTH {%d} " - "CONFIG.STRM0_WIDTH {%d} " - "CONFIG.STRM0_OFFSET {0} " "] [get_bd_cells /%s/%s]" % ( self.calc_tmem(), self.get_weightstream_width_padded(), - self.get_nodeattr("code_gen_dir_ipgen") + "/", + self.get_nodeattr("code_gen_dir_ipgen") + "/memblock.dat", self.get_nodeattr("ram_style"), - 
self.calc_tmem(), - self.get_weightstream_width_padded(), node_name, strm_inst, ) @@ -884,11 +663,11 @@ def code_generation_ipi(self): % (node_name, strm_inst, node_name, node_name, sname) ) cmd.append( - "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aresetn]" + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_rst_n]" % (node_name, rst_name, node_name, strm_inst) ) cmd.append( - "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aclk]" + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_clk]" % (node_name, clk_name, node_name, strm_inst) ) cmd.append( @@ -914,8 +693,7 @@ def code_generation_ipi(self): axilite_name = self.get_verilog_top_module_intf_names()["axilite"][0] cmd.append( "create_bd_intf_pin -mode Slave " - "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s" - % (node_name, axilite_name) + "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s" % (node_name, axilite_name) ) cmd.append( "connect_bd_intf_net [get_bd_intf_pins %s/%s] " @@ -925,8 +703,8 @@ def code_generation_ipi(self): # TODO calculate and pass in segment size here cmd.append("assign_bd_address") cmd.append("save_bd_design") - elif mem_mode == "const": - # base class impl sufficient for const mode + elif mem_mode == "internal_embedded": + # base class impl sufficient for internal_embedded mode return super().code_generation_ipi() else: raise Exception("Unrecognized mem_mode for Thresholding_Batch") @@ -935,7 +713,7 @@ def code_generation_ipi(self): def get_verilog_top_module_intf_names(self): intf_names = super().get_verilog_top_module_intf_names() mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "decoupled": + if mem_mode == "internal_decoupled": # only expose axilite interface if attribute is set runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 if runtime_writable: @@ -967,10 +745,8 @@ def derive_characteristic_fxns(self, period): "outputs": {"out": []}, } mem_mode = self.get_nodeattr("mem_mode") - if mem_mode in ["decoupled", 
"external"]: + if mem_mode in ["internal_decoupled", "external"]: n_weight_inps = self.calc_tmem() num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) - io_dict["inputs"]["weights"] = [ - 0 for i in range(num_w_reps * n_weight_inps) - ] + io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)] super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) diff --git a/src/finn/custom_op/fpgadataflow/tlastmarker.py b/src/finn/custom_op/fpgadataflow/hls/tlastmarker_hls.py similarity index 81% rename from src/finn/custom_op/fpgadataflow/tlastmarker.py rename to src/finn/custom_op/fpgadataflow/hls/tlastmarker_hls.py index 1bd32442a1..2e908016e7 100644 --- a/src/finn/custom_op/fpgadataflow/tlastmarker.py +++ b/src/finn/custom_op/fpgadataflow/hls/tlastmarker_hls.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020-2022, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -26,10 +27,11 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp -class TLastMarker(HLSCustomOp): +class TLastMarker_hls(HWCustomOp, HLSBackend): """Node that adds/removes AXI stream TLAST signals where needed. Its behavior is transparent in node-by-node execution, only visible in IP-stitched rtlsim or actual hardware. 
@@ -37,8 +39,8 @@ class TLastMarker(HLSCustomOp): (needed by the FINN PYNQ shell) or at the beginning to remove the end-of-burst from DMA read.""" - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): my_attrs = { @@ -56,7 +58,8 @@ def get_nodeattr_types(self): # Vitis docs recommend using qdma_axis for external, ap_axiu for internal "Protocol": ("s", False, "external", {"external", "internal"}), } - my_attrs.update(super().get_nodeattr_types()) + my_attrs.update(HWCustomOp.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) return my_attrs def execute_node(self, context, graph): @@ -130,9 +133,9 @@ def docompute(self): self.code_gen_dict["$DOCOMPUTE$"] = [ "for(unsigned int i=0; i &in0, - hls::stream &out, unsigned int numIters)""" - % self.onnx_node.name + """void %s(hls::stream &in0_%s, + hls::stream &out_%s, unsigned int numIters)""" + % (self.onnx_node.name, self.hls_sname(), self.hls_sname()) ] else: self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void %s(hls::stream &in0, hls::stream &out)""" - % self.onnx_node.name + """void %s(hls::stream &in0_%s, + hls::stream &out_%s)""" + % (self.onnx_node.name, self.hls_sname(), self.hls_sname()) ] def pragmas(self): self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() ] self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() ) dyn_iters = self.get_nodeattr("DynIters") @@ -211,9 +218,7 @@ def pragmas(self): "#pragma HLS INTERFACE s_axilite port=numIters bundle=control" ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" - ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none 
port=return") def get_number_output_values(self): return self.get_nodeattr("NumIters") @@ -239,10 +244,10 @@ def get_outstream_width(self, ind=0): def strm_decl(self): self.code_gen_dict["$STREAMDECLARATIONS$"] = [] self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream in0 ("in0");' + 'hls::stream in0_%s ("in0_%s");' % (self.hls_sname(), self.hls_sname()) ) self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream out ("out");' + 'hls::stream out_%s ("out_%s");' % (self.hls_sname(), self.hls_sname()) ) def get_verilog_top_module_intf_names(self): diff --git a/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py b/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py new file mode 100644 index 0000000000..05d26eddb2 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/upsampler_hls.py @@ -0,0 +1,175 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np + +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.upsampler import UpsampleNearestNeighbour +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class UpsampleNearestNeighbour_hls(UpsampleNearestNeighbour, HLSBackend): + """ + Corresponds to finn-hlslib UpsampleNearestNeighbour_Batch function. + Upsampling is done with the Nearest Neighbour algorithm. + The layer expects square feature maps for the in and output. 
+ """ + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(UpsampleNearestNeighbour.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def verify_node(self): + pass + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "upsample.hpp"'] + + def defines(self, var): + self.code_gen_dict["$DEFINES$"] = [] + + ifm_ch = self.get_nodeattr("NumChannels") + self.code_gen_dict["$DEFINES$"] += ["#define IFMChannels {}".format(ifm_ch)] + + ibits = self.get_input_datatype().bitwidth() + self.code_gen_dict["$DEFINES$"] += ["#define Input_precision {}".format(ibits)] + + idim = self.get_nodeattr("IFMDim") + self.code_gen_dict["$DEFINES$"] += ["#define IFMDim {}".format(idim)] + + odim = self.get_nodeattr("OFMDim") + self.code_gen_dict["$DEFINES$"] += ["#define OFMDim {}".format(odim)] + + batch_size = self.get_nodeattr("numInputVectors") + self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(batch_size)] + + def docompute(self): + is_2d = self.get_nodeattr("DimMode") == 0 + batch = self.get_nodeattr("numInputVectors") + if is_2d: + self.code_gen_dict["$DOCOMPUTE$"] = [ + """UpsampleNearestNeighbour_Batch > (in0_%s, out_%s, numReps);""" + % (self.hls_sname(), self.hls_sname()) + ] + else: + assert batch == 1, "1D upsampler currently needs numReps=1" + self.code_gen_dict["$DOCOMPUTE$"] = [ + """UpsampleNearestNeighbour_1D > (in0_%s, out_%s);""" + % (self.hls_sname(), self.hls_sname()) + ] + + def blackboxfunction(self): + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + "void %s(hls::stream<%s > &in0_%s, hls::stream<%s > &out_%s)" + % ( + self.onnx_node.name, + packed_hls_type, + self.hls_sname(), + packed_hls_type, + self.hls_sname(), + ) + ] + + def execute_node(self, context, graph): + mode = 
self.get_nodeattr("exec_mode") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_oshape = self.get_folded_output_shape() + + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input shape doesn't + match expected shape (numInputVectors, ImgDim, ImgDim, NumChannels).""" + export_idt = self.get_input_datatype() + self.dynamic_input_to_npy(context, 1, target_dir=code_gen_dir) + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + assert ( + context[node.output[0]].shape == folded_oshape + ), "cppsim did not produce expected folded output shape" + context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = 
output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output shape doesn't match expected shape + (1, OutputDim, OutputDim, NumChannels).""" diff --git a/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py new file mode 100644 index 0000000000..3e10b640c5 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hls/vectorvectoractivation_hls.py @@ -0,0 +1,541 @@ +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import math +import numpy as np +import os +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hlsbackend import HLSBackend +from finn.custom_op.fpgadataflow.vectorvectoractivation import VVAU +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + + +class VVAU_hls(VVAU, HLSBackend): + """Corresponds to finn-hlslib Vector_Vector_Activate_Batch function""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(VVAU.get_nodeattr_types(self)) + my_attrs.update(HLSBackend.get_nodeattr_types(self)) + return my_attrs + + def lut_estimation(self): + """Calculates resource estimations for LUTs based on: + - FINN-R: An End-to-End Deep-Learning Framework for Fast + Exploration of Quantized Neural Networks + - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien, + Y. Umuroglu, M. Leeser and K. Vissers + - 12. 
Sep 2018 + """ + # TODO add in/out FIFO contributions + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + # determine tdt with input and weight data types + idt = self.get_input_datatype() + A = idt.bitwidth() + # parameters from experiments in paper mentioned above + c0 = 300 + c1 = 1.1 + c2 = 0 + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if (mmode == "internal_decoupled" and mstyle == "distributed") or ( + mmode == "internal_embedded" and self.calc_wmem() <= 128 + ): + c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64) + + # multiplication + res_type = self.get_nodeattr("resType") + if res_type == "dsp": + mult_luts = 0 + else: + mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A) + # adder tree + addertree_luts = (W + A) * (2 * Q - 1) + # accumulator + acc_datatype = self.get_accumulator_datatype() + acc_bits = acc_datatype.bitwidth() + k_h, k_w = self.get_nodeattr("Kernel") + # if accDataType is not set, then it will default to INT32, which would + # be a large overestimate in most (if not all) cases. 
In this scenario, + # we would use the minimum accumulator as determined by the data types + # bound, derived in https://arxiv.org/abs/2301.13376 + alpha = math.log(k_h * k_w, 2) + W + A - 1 - int(idt.signed()) + acc_bits = min( + acc_datatype.bitwidth(), + np.ceil(alpha + math.log(1 + pow(2, -alpha), 2) + 1), + ) + acc_luts = acc_bits + # thresholds and threshold comparators + thr_luts = 0 + comp_luts = 0 + noact = self.get_nodeattr("noActivation") + # TODO - add 'ram_style_threshold' node attribute + if noact == 0: + odt = self.get_output_datatype() + B = odt.bitwidth() + thr_luts = (2**B - 1) * acc_bits * self.calc_tmem() / 64 + comp_luts = (2**B - 1) * acc_bits + + return int( + c0 + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) + c2 + ) + + def dsp_estimation(self): + # multiplication + P = self.get_nodeattr("PE") + res_type = self.get_nodeattr("resType") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + idt = self.get_input_datatype() + A = idt.bitwidth() + if res_type == "dsp": + mult_dsp = P * np.ceil((W + A) / 48) # TODO: more accurate modelling + else: + mult_dsp = 0 + return int(mult_dsp) + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + mem_mode = self.get_nodeattr("mem_mode") + node = self.onnx_node + + # TODO ensure codegen dir exists + if mode == "cppsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + # create a npy file fore each input of the node (in_ind is input index) + in_ind = 0 + for inputs in node.input: + # it is assumed that the first input of the node is the data input + # the second input are the weights + # the third input are the thresholds + if in_ind == 0: + assert ( + str(context[inputs].dtype) == "float32" + ), """Input datatype is + not float32 as expected.""" + expected_inp_shape = self.get_folded_input_shape() + reshaped_input = context[inputs].reshape(expected_inp_shape) + if self.get_input_datatype() == DataType["BIPOLAR"]: + # store bipolar activations as binary + reshaped_input = (reshaped_input + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = self.get_input_datatype() + # make copy before saving the array + reshaped_input = reshaped_input.copy() + np.save( + os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), + reshaped_input, + ) + elif in_ind > 2: + raise Exception("Unexpected input found for VectorVectorActivation") + in_ind += 1 + + if mode == "cppsim": + # execute the precompiled model + super().exec_precompiled_singlenode_model() + # load output npy file + super().npy_to_dynamic_output(context) + # reinterpret binary output as bipolar where needed + if self.get_output_datatype() == DataType["BIPOLAR"]: + out = context[node.output[0]] + out = 2 * out - 1 + context[node.output[0]] = out + assert ( + context[node.output[0]].shape == self.get_normal_output_shape() + ), "cppsim did not produce expected output shape" + elif mode == "rtlsim": + sim = self.get_rtlsim() + nbits = self.get_instream_width() + inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + + if mem_mode == "external" or mem_mode == "internal_decoupled": + wnbits = self.get_weightstream_width() + export_wdt = self.get_weight_datatype() + # we 
have converted bipolar weights to binary for export, + # so use it as such for weight generation + if self.get_weight_datatype() == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] + wei = npy_to_rtlsim_input("{}/weights.npy".format(code_gen_dir), export_wdt, wnbits) + dim_h, dim_w = self.get_nodeattr("Dim") + num_w_reps = dim_h * dim_w + + io_dict = { + "inputs": {"in0": inp, "weights": wei * num_w_reps}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + output = io_dict["outputs"]["out"] + else: + output = self.rtlsim(sim, inp) + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) + + # load and reshape output + output = np.load(out_npy_path) + oshape = self.get_normal_output_shape() + output = np.asarray([output], dtype=np.float32).reshape(*oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + def get_template_param_values(self): + """Returns the template parameter values according to input, output and weight + data types.""" + ret = dict() + inp_hls_str = self.get_input_datatype().get_hls_datatype_str() + out_hls_str = self.get_output_datatype().get_hls_datatype_str() + inp_is_binary = self.get_input_datatype() == DataType["BINARY"] + # out_is_binary = self.get_output_datatype() == DataType["BINARY"] + wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] + bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 + if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode): + raise Exception("True binary (non-bipolar) inputs not yet supported") + inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] + # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"] + wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] + # reinterpret inp/wt as bipolar if bin_xnor_mode is iset + inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) + wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) + # fill in TSrcI and TWeightI + # TODO check these with Giulio + # TODO handle non-bipolar binary inputs + if inp_is_bipolar and wt_is_bipolar: + ret["TSrcI"] = "Recast" + ret["TWeightI"] = "Identity" + elif (not inp_is_bipolar) and wt_is_bipolar: + ret["TSrcI"] = "Slice<%s>" % inp_hls_str + ret["TWeightI"] = "Recast" + elif inp_is_bipolar and (not wt_is_bipolar): + ret["TSrcI"] = "Recast" + ret["TWeightI"] = "Identity" + elif (not inp_is_bipolar) and (not wt_is_bipolar): + ret["TSrcI"] = "Slice<%s>" % inp_hls_str + ret["TWeightI"] = "Identity" + + # fill in TDstI + ret["TDstI"] = "Slice<%s>" % out_hls_str + + return ret + + def global_includes(self): + self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"'] + self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"'] + 
mem_mode = self.get_nodeattr("mem_mode") + if mem_mode not in ["internal_embedded", "internal_decoupled", "external"]: + raise Exception( + """Please set mem_mode to "internal_embedded", "internal_decoupled", or "external", + currently no other parameter value is supported!""" + ) + if self.calc_tmem() != 0: + self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] + + def defines(self, var): + dim_h, dim_w = self.get_nodeattr("Dim") + numReps = 1 * dim_h * dim_w + k_h, k_w = self.get_nodeattr("Kernel") + innerProdDim = k_h * k_w + mem_mode = self.get_nodeattr("mem_mode") + + self.code_gen_dict["$DEFINES$"] = [ + """#define Channels1 {}\n #define InnerProdDim {}\n + #define SIMD1 {}\n #define PE1 {}\n #define numReps {}""".format( + self.get_nodeattr("Channels"), + innerProdDim, + self.get_nodeattr("SIMD"), + self.get_nodeattr("PE"), + numReps, + ) + ] + if mem_mode == "internal_decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + self.code_gen_dict["$DEFINES$"].append("#define WP1 {}\n".format(wdt.bitwidth())) + + def read_npy_data(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + # note: the innermost dim is reversed for the input + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s, false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "internal_decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + elem_bits = wdt.bitwidth() + packed_bits 
= self.get_weightstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = wdt.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/weights.npy" % code_gen_dir + + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", weights_%s, false, numReps);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + def strm_decl(self): + mem_mode = self.get_nodeattr("mem_mode") + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) + ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + if mem_mode == "internal_decoupled" or mem_mode == "external": + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> weights_{} ("weights_{}");'.format( + self.get_weightstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + + def docompute(self): + mem_mode = self.get_nodeattr("mem_mode") + map_to_hls_mult_style = { + "auto": "ap_resource_dflt()", + "lut": "ap_resource_lut()", + "dsp": "ap_resource_dsp()", + } + tmpl_args = self.get_template_param_values() + if self.calc_tmem() == 0: + odtype_hls_str = self.get_output_datatype().get_hls_datatype_str() + threshs = "PassThroughActivation<%s>()" % odtype_hls_str + else: + threshs = "threshs" + + if mem_mode == "internal_embedded": + self.code_gen_dict["$DOCOMPUTE$"] = [ + """Vector_Vector_Activate_Batch + (in0_{}, out_{}, weights, {}, numReps, {});""".format( + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + tmpl_args["TWeightI"], + self.hls_sname(), + self.hls_sname(), + threshs, + map_to_hls_mult_style[self.get_nodeattr("resType")], + ) + ] + elif mem_mode == "internal_decoupled" or mem_mode == "external": + wdt = self.get_weight_datatype() + 
if wdt == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] + else: + export_wdt = wdt + wdtype_hls_str = export_wdt.get_hls_datatype_str() + self.code_gen_dict["$DOCOMPUTE$"] = [ + """{} + (in0_{}, out_{}, weights_{}, {}, numReps, {});""".format( + "Vector_Vector_Activate_Stream_Batch", + tmpl_args["TSrcI"], + tmpl_args["TDstI"], + tmpl_args["TWeightI"], + wdtype_hls_str, + self.hls_sname(), + self.hls_sname(), + self.hls_sname(), + threshs, + map_to_hls_mult_style[self.get_nodeattr("resType")], + ) + ] + else: + raise Exception( + """Please set mem_mode to "internal_embedded", "internal_decoupled", or "external", + currently no other parameter value is supported!""" + ) + + def dataoutstrm(self): + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + shape = self.get_folded_output_shape() + shape_cpp_str = str(shape).replace("(", "{").replace(")", "}") + + # note: the innermost dim is not reversed for the output + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s", false);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + self.hls_sname(), + shape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + self.code_gen_dict["$SAVEASCNPY$"] = [] + + def blackboxfunction(self): + mem_mode = self.get_nodeattr("mem_mode") + if mem_mode == "internal_embedded": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}(hls::stream> &in0_{}, + hls::stream> &out_{} + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.hls_sname(), + self.get_outstream_width(), + self.hls_sname(), + ) + ] + elif mem_mode == 
"internal_decoupled" or mem_mode == "external": + self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ + """void {}( + hls::stream> &in0_{}, + hls::stream> &weights_{}, + hls::stream> &out_{} + )""".format( + self.onnx_node.name, + self.get_instream_width(), + self.hls_sname(), + self.get_weightstream_width(), + self.hls_sname(), + self.get_outstream_width(), + self.hls_sname(), + ) + ] + else: + raise Exception( + """Please set mem_mode to "internal_embedded" or "internal_decoupled", + currently no other parameter value is supported!""" + ) + + def pragmas(self): + mem_mode = self.get_nodeattr("mem_mode") + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() + ] + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") + + if mem_mode == "internal_embedded": + self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') + # the weight tensor is ap_uint [PE][WMEM] + # partition for parallel access along the PE dimension (dim 1) + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=weights.m_weights " "complete dim=1") + ) + elif mem_mode == "internal_decoupled" or mem_mode == "external": + self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=weights_" + self.hls_sname() + ) + else: + raise Exception( + """Please set mem_mode to "internal_embedded", "internal_decoupled", or external, + currently no other parameter value is supported!""" + ) + + if self.calc_tmem() != 0: + # TODO find a better way of checking for no pregenerated thresholds + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=1") + ) + self.code_gen_dict["$PRAGMAS$"].append( + ("#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " "complete dim=3") + ) + + def instantiate_ip(self, cmd): + # instantiate the 
HLS IP + vlnv = self.get_nodeattr("ip_vlnv") + node_name = self.onnx_node.name + if self.get_nodeattr("mem_mode") == "internal_decoupled": + cmd.append("create_bd_cell -type ip -vlnv %s /%s/%s" % (vlnv, node_name, node_name)) + else: + cmd.append("create_bd_cell -type ip -vlnv %s %s" % (vlnv, node_name)) diff --git a/src/finn/custom_op/fpgadataflow/hlsbackend.py b/src/finn/custom_op/fpgadataflow/hlsbackend.py new file mode 100644 index 0000000000..d8210fd684 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/hlsbackend.py @@ -0,0 +1,476 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os +import subprocess +from abc import ABC, abstractmethod +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow import templates +from finn.util.basic import CppBuilder, get_rtlsim_trace_depth, make_build_dir +from finn.util.hls import CallHLS +from finn.util.pyverilator import make_single_source_file + +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None + + +class HLSBackend(ABC): + """HLSBackend class all custom ops that correspond to a finn-hlslib + function are using functionality of. Contains different functions every HLS + custom node should have. Some as abstract methods, these have to be filled + when writing a new HLS custom op node.""" + + def get_nodeattr_types(self): + return { + "code_gen_dir_cppsim": ("s", False, ""), + "executable_path": ("s", False, ""), + "res_hls": ("s", False, ""), + } + + def get_all_verilog_paths(self): + "Return list of all folders containing Verilog code for this node." + + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + assert ( + code_gen_dir != "" + ), """Node attribute "code_gen_dir_ipgen" is + not set. 
Please run HLSSynthIP first.""" + verilog_path = "{}/project_{}/sol1/impl/verilog/".format(code_gen_dir, self.onnx_node.name) + # default impl only returns the HLS verilog codegen dir + return [verilog_path] + + def get_all_verilog_filenames(self, abspath=False): + "Return list of all Verilog files used for this node." + + verilog_files = [] + verilog_paths = self.get_all_verilog_paths() + for verilog_path in verilog_paths: + for f in os.listdir(verilog_path): + if f.endswith(".v"): + if abspath: + verilog_files += [verilog_path + "/" + f] + else: + verilog_files += [f] + return verilog_files + + def prepare_rtlsim(self): + """Creates a Verilator emulation library for the RTL code generated + for this node, sets the rtlsim_so attribute to its path and returns + a PyVerilator wrapper around it.""" + + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + + verilog_files = self.get_all_verilog_filenames(abspath=True) + single_src_dir = make_build_dir("rtlsim_" + self.onnx_node.name + "_") + tmp_build_dir = make_build_dir("pyverilator_" + self.onnx_node.name + "_") + target_file = single_src_dir + "/" + self.get_verilog_top_module_name() + ".v" + make_single_source_file(verilog_files, target_file) + + # build the Verilator emu library + sim = PyVerilator.build( + self.get_verilog_top_module_name() + ".v", + build_dir=tmp_build_dir, + verilog_path=[single_src_dir], + trace_depth=get_rtlsim_trace_depth(), + top_module_name=self.get_verilog_top_module_name(), + ) + # save generated lib filename in attribute + self.set_nodeattr("rtlsim_so", sim.lib._name) + return sim + + def code_generation_ipgen(self, model, fpgapart, clk): + """Generates c++ code and tcl script for ip generation.""" + node = self.onnx_node + + # generate top cpp file for ip generation + path = self.get_nodeattr("code_gen_dir_ipgen") + self.code_gen_dict["$AP_INT_MAX_W$"] = [str(self.get_ap_int_max_w())] + self.generate_params(model, path) + self.global_includes() + 
self.defines("ipgen") + self.blackboxfunction() + self.pragmas() + self.docompute() + + template = templates.ipgen_template + + for key in self.code_gen_dict: + # transform list into long string separated by '\n' + code_gen_line = "\n".join(self.code_gen_dict[key]) + template = template.replace(key, code_gen_line) + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + f = open(os.path.join(code_gen_dir, "top_{}.cpp".format(node.name)), "w") + f.write(template) + f.close() + self.code_gen_dict.clear() + + # generate tcl script for ip generation + self.code_gen_dict["$PROJECTNAME$"] = ["project_{}".format(node.name)] + self.code_gen_dict["$HWSRCDIR$"] = [code_gen_dir] + self.code_gen_dict["$FPGAPART$"] = [fpgapart] + self.code_gen_dict["$TOPFXN$"] = [node.name] + self.code_gen_dict["$CLKPERIOD$"] = [str(clk)] + self.code_gen_dict["$DEFAULT_DIRECTIVES$"] = self.ipgen_default_directives() + self.code_gen_dict["$EXTRA_DIRECTIVES$"] = self.ipgen_extra_directives() + + template = templates.ipgentcl_template + + for key in self.code_gen_dict: + # transform list into long string separated by '\n' + code_gen_line = "\n".join(self.code_gen_dict[key]) + template = template.replace(key, code_gen_line) + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + f = open(os.path.join(code_gen_dir, "hls_syn_{}.tcl".format(node.name)), "w") + f.write(template) + f.close() + self.code_gen_dict.clear() + + def ipgen_default_directives(self): + """Return list of default HLS synthesis directives""" + + default_directives = [ + "set_param hls.enable_hidden_option_error false", + "config_compile -disable_unroll_code_size_check -pipeline_style flp", + "config_interface -m_axi_addr64", + "config_rtl -module_auto_prefix", + "config_rtl -deadlock_detection none", + ] + return default_directives + + def ipgen_extra_directives(self): + "Return a list of extra tcl directives for HLS synthesis." 
+ return [] + + def ipgen_singlenode_code(self): + """Builds the bash script for IP generation using the CallHLS utility.""" + node = self.onnx_node + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + builder = CallHLS() + builder.append_tcl(code_gen_dir + "/hls_syn_{}.tcl".format(node.name)) + builder.set_ipgen_path(code_gen_dir + "/project_{}".format(node.name)) + builder.build(code_gen_dir) + ipgen_path = builder.ipgen_path + assert os.path.isdir(ipgen_path), "IPGen failed: %s not found" % (ipgen_path) + self.set_nodeattr("ipgen_path", ipgen_path) + ip_path = ipgen_path + "/sol1/impl/ip" + assert os.path.isdir(ip_path), "IPGen failed: %s not found. Check log under %s" % ( + ip_path, + code_gen_dir, + ) + self.set_nodeattr("ip_path", ip_path) + vlnv = "xilinx.com:hls:%s:1.0" % node.name + self.set_nodeattr("ip_vlnv", vlnv) + + def code_generation_cppsim(self, model): + """Generates c++ code for simulation (cppsim).""" + node = self.onnx_node + path = self.get_nodeattr("code_gen_dir_cppsim") + self.code_gen_dict["$AP_INT_MAX_W$"] = [str(self.get_ap_int_max_w())] + self.generate_params(model, path) + self.global_includes() + self.defines("cppsim") + self.read_npy_data() + self.strm_decl() + self.pragmas() + self.docompute() + self.dataoutstrm() + self.save_as_npy() + + template = templates.docompute_template + + for key in self.code_gen_dict: + # transform list into long string separated by '\n' + code_gen_line = "\n".join(self.code_gen_dict[key]) + template = template.replace(key, code_gen_line) + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + f = open(os.path.join(code_gen_dir, "execute_{}.cpp".format(node.op_type)), "w") + f.write(template) + f.close() + self.code_gen_dict.clear() + + def code_generation_ipi(self): + """Constructs and returns the TCL for node instantiation in Vivado IPI.""" + vlnv = self.get_nodeattr("ip_vlnv") + cmd = ["create_bd_cell -type ip -vlnv %s %s" % (vlnv, self.onnx_node.name)] + return cmd + + def 
compile_singlenode_code(self): + """Builds the bash script for compilation using the CppBuilder from + finn.util.basic and executes the script to produce the executable.""" + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + builder = CppBuilder() + # to enable additional debug features please uncomment the next line + # builder.append_includes("-DDEBUG") + builder.append_includes("-I$FINN_ROOT/src/finn/qnn-data/cpp") + builder.append_includes("-I$FINN_ROOT/deps/cnpy/") + builder.append_includes("-I$FINN_ROOT/deps/finn-hlslib") + builder.append_includes("-I$FINN_ROOT/custom_hls") + builder.append_includes("-I{}/include".format(os.environ["HLS_PATH"])) + builder.append_includes("--std=c++14") + builder.append_includes("-O3") + builder.append_sources(code_gen_dir + "/*.cpp") + builder.append_sources("$FINN_ROOT/deps/cnpy/cnpy.cpp") + builder.append_includes("-lz") + builder.set_executable_path(code_gen_dir + "/node_model") + builder.build(code_gen_dir) + self.set_nodeattr("executable_path", builder.executable_path) + + def dynamic_input_to_npy(self, context, count, target_dir=""): + """Saves input (given context) into .npy files. + + Count indicates the number of inputs that have to be saved.""" + node = self.onnx_node + if target_dir == "": + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + if code_gen_dir == "": + raise Exception( + """ + Found no codegen dir for this node, did you run the prepare_cppsim transformation? 
+ """ + ) + target_dir = code_gen_dir + # create a npy file for each input of the node (in_ind is input index) + # assuming dynamic inputs start from 0 + for in_ind in range(count): + current_input_name = node.input[in_ind] + input_array = context[current_input_name] + if in_ind == 0: + expected_inp_shape = self.get_folded_input_shape() + idt = self.get_input_datatype() + else: + expected_inp_shape = self.get_folded_input_shape(in_ind) + idt = self.get_input_datatype(in_ind) + reshaped_input = input_array.reshape(expected_inp_shape) + if idt == DataType["BIPOLAR"]: + # store bipolar activations as binary + reshaped_input = (reshaped_input + 1) / 2 + # make copy before saving the array + reshaped_input = reshaped_input.copy() + np.save( + os.path.join(target_dir, "input_{}.npy".format(in_ind)), + reshaped_input, + ) + + def npy_to_dynamic_output(self, context): + """Reads the output from an output.npy file generated from cppsim and + places its content into the context dictionary.""" + node = self.onnx_node + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + output = np.load("{}/output.npy".format(code_gen_dir)) + exp_shape = self.get_normal_output_shape() + context[node.output[0]] = output.reshape(exp_shape) + + def npy_to_dynamic_outputs(self, context, npy_list): + """Reads the output from .npy files generated from cppsim and places + their content into the context dictionary. 
+ npy_list is a list specifying which files to read, and its order must + match the order of node outputs.""" + node = self.onnx_node + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + for i in range(len(npy_list)): + output = np.load("{}/{}".format(code_gen_dir, npy_list[i])) + if i == 0: + exp_shape = self.get_normal_output_shape() + else: + exp_shape = self.get_normal_output_shape(i) + context[node.output[i]] = output.reshape(exp_shape) + + def exec_precompiled_singlenode_model(self): + """Executes precompiled executable.""" + executable_path = self.get_nodeattr("executable_path") + if executable_path == "": + raise Exception( + """ +Found no executable for this node, did you run the codegen and +compilation transformations? + """ + ) + process_execute = subprocess.Popen(executable_path, stdout=subprocess.PIPE) + process_execute.communicate() + + def hls_sname(self): + """Get the naming convention used by Vitis HLS for stream signals + Example: the TDATA for a stream called "out" would be out_V_TDATA. + """ + return "V" + + def execute_node(self, context, graph): + """Executes single node using cppsim or rtlsim.""" + mode = self.get_nodeattr("exec_mode") + if mode == "cppsim": + # save input(s) + self.dynamic_input_to_npy(context, 1) + # execute the precompiled model + self.exec_precompiled_singlenode_model() + # load output npy file + self.npy_to_dynamic_output(context) + elif mode == "rtlsim": + pass + + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + @abstractmethod + def global_includes(self): + """Function to set the global includes for c++ code that has to be generated + for cppsim or rtlsim, is member function of HLSBackend class but has to + be filled by every node.""" + pass + + @abstractmethod + def defines(self, var): + """Function to set the define commands for c++ code that has to be generated + for cppsim or rtlsim, is member function of HLSBackend class but has to + be filled by every node. + + var: makes it possible to reuse the function for different c++ code generation. + I.e. if set to "ipgen" in MatrixVectorActivation additional PRAGMA defines are + added.""" + pass + + def read_npy_data(self): + """Function to generate the commands for reading data from .npy file in c++, + might need to be overwritten depending on custom op.""" + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_input_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_instream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_in = "%s/input_0.npy" % code_gen_dir + self.code_gen_dict["$READNPYDATA$"] = [] + self.code_gen_dict["$READNPYDATA$"].append( + 'npy2apintstream<%s, %s, %d, %s>("%s", in0_%s);' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + npy_in, + self.hls_sname(), + ) + ) + + def strm_decl(self): + """Function to generate the commands for the stream declaration in c++, + is member function of HLSBackend class but might need to be filled + by node.""" + self.code_gen_dict["$STREAMDECLARATIONS$"] = [] + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> in0_{} ("in0_{}");'.format( + self.get_instream_width(), self.hls_sname(), self.hls_sname() + ) 
+ ) + self.code_gen_dict["$STREAMDECLARATIONS$"].append( + 'hls::stream> out_{} ("out_{}");'.format( + self.get_outstream_width(), self.hls_sname(), self.hls_sname() + ) + ) + + @abstractmethod + def docompute(self): + """Function to generate the commands for the computational part of the + c++ code, is member function of HLSBackend class but has to be filled + by every node.""" + pass + + def dataoutstrm(self): + """Function to generate the commands for reading out data from c++ and convert + into npy format, is member function of HLSBackend class might need to be filled + by node.""" + code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") + dtype = self.get_output_datatype() + if dtype == DataType["BIPOLAR"]: + # use binary for bipolar storage + dtype = DataType["BINARY"] + elem_bits = dtype.bitwidth() + packed_bits = self.get_outstream_width() + packed_hls_type = "ap_uint<%d>" % packed_bits + elem_hls_type = dtype.get_hls_datatype_str() + npy_type = "float" + npy_out = "%s/output.npy" % code_gen_dir + oshape = self.get_folded_output_shape() + oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") + + self.code_gen_dict["$DATAOUTSTREAM$"] = [ + 'apintstream2npy<%s, %s, %d, %s>(out_%s, %s, "%s");' + % ( + packed_hls_type, + elem_hls_type, + elem_bits, + npy_type, + self.hls_sname(), + oshape_cpp_str, + npy_out, + ) + ] + + def save_as_npy(self): + """Function to generate the commands for saving data in .npy file in c++""" + self.code_gen_dict["$SAVEASCNPY$"] = [] + + @abstractmethod + def blackboxfunction(self): + """Function to generate a blackbox function in c++ from which an IP block + will be generated, is member function of HLSBackend class but has to be filled + by every node.""" + pass + + def pragmas(self): + """Function to generate the pragma commands in c++, + might need to be overwritten depending on custom op.""" + self.code_gen_dict["$PRAGMAS$"] = [ + "#pragma HLS INTERFACE axis port=in0_" + self.hls_sname() + ] + 
self.code_gen_dict["$PRAGMAS$"].append( + "#pragma HLS INTERFACE axis port=out_" + self.hls_sname() + ) + self.code_gen_dict["$PRAGMAS$"].append("#pragma HLS INTERFACE ap_ctrl_none port=return") + + def get_ap_int_max_w(self): + """Return the maximum width of any ap_int used in this module. Used to set the + AP_INT_MAX_W definition for HLS.""" + instream = self.get_instream_width() + outstream = self.get_outstream_width() + ret = max([instream, outstream]) + assert ret <= 8191, "AP_INT_MAX_W=%d is larger than allowed maximum of 8191" % ret + return ret diff --git a/src/finn/custom_op/fpgadataflow/hlscustomop.py b/src/finn/custom_op/fpgadataflow/hwcustomop.py similarity index 51% rename from src/finn/custom_op/fpgadataflow/hlscustomop.py rename to src/finn/custom_op/fpgadataflow/hwcustomop.py index d1326607aa..57c0fec067 100644 --- a/src/finn/custom_op/fpgadataflow/hlscustomop.py +++ b/src/finn/custom_op/fpgadataflow/hwcustomop.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,24 +28,13 @@ import numpy as np import os -import subprocess import warnings from abc import abstractmethod from pyverilator.util.axi_utils import _read_signal, reset_rtlsim, rtlsim_multi_io -from qonnx.core.datatype import DataType from qonnx.custom_op.base import CustomOp from qonnx.util.basic import roundup_to_integer_multiple -from finn.util.basic import ( - CppBuilder, - get_rtlsim_trace_depth, - make_build_dir, - pyverilate_get_liveness_threshold_cycles, -) -from finn.util.hls import CallHLS -from finn.util.pyverilator import make_single_source_file - -from . 
import templates +from finn.util.basic import pyverilate_get_liveness_threshold_cycles try: from pyverilator import PyVerilator @@ -53,34 +42,21 @@ PyVerilator = None -class HLSCustomOp(CustomOp): - """HLSCustomOp class all custom ops that correspond to a finn-hlslib - function are based on. Contains different functions every fpgadataflow +class HWCustomOp(CustomOp): + """HWCustomOp class all custom ops that can be implemented with either + HLS or RTL backend are based on. Contains different functions every fpgadataflow custom node should have. Some as abstract methods, these have to be filled when writing a new fpgadataflow custom op node.""" - def __init__(self, onnx_node): - super().__init__(onnx_node) - + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) self.code_gen_dict = {} - # getting templates from templates.py - - # template for single node execution - self.docompute_template = templates.docompute_template - - # templates for single node ip generation - # cpp file - self.ipgen_template = templates.ipgen_template - # tcl script - self.ipgentcl_template = templates.ipgentcl_template - def get_nodeattr_types(self): return { "backend": ("s", True, "fpgadataflow"), - "code_gen_dir_cppsim": ("s", False, ""), + "preferred_impl_style": ("s", False, "", {"", "hls", "rtl"}), "code_gen_dir_ipgen": ("s", False, ""), - "executable_path": ("s", False, ""), "ipgen_path": ("s", False, ""), "ip_path": ("s", False, ""), "ip_vlnv": ("s", False, ""), @@ -89,7 +65,6 @@ def get_nodeattr_types(self): "cycles_estimate": ("i", False, 0), "rtlsim_trace": ("s", False, ""), "res_estimate": ("s", False, ""), - "res_hls": ("s", False, ""), "res_synth": ("s", False, ""), "rtlsim_so": ("s", False, ""), # partitioning info @@ -151,70 +126,6 @@ def get_verilog_top_module_intf_names(self): intf_names["ap_none"] = [] return intf_names - def get_verilog_top_filename(self): - "Return the Verilog top module filename for this node." 
- - verilog_file = "{}/project_{}/sol1/impl/verilog/{}.v".format( - self.get_nodeattr("code_gen_dir_ipgen"), - self.onnx_node.name, - self.get_verilog_top_module_name(), - ) - return verilog_file - - def get_all_verilog_paths(self): - "Return list of all folders containing Verilog code for this node." - - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - assert ( - code_gen_dir != "" - ), """Node attribute "code_gen_dir_ipgen" is - not set. Please run HLSSynthIP first.""" - verilog_path = "{}/project_{}/sol1/impl/verilog/".format( - code_gen_dir, self.onnx_node.name - ) - # default impl only returns the HLS verilog codegen dir - return [verilog_path] - - def get_all_verilog_filenames(self, abspath=False): - "Return list of all Verilog files used for this node." - - verilog_files = [] - verilog_paths = self.get_all_verilog_paths() - for verilog_path in verilog_paths: - for f in os.listdir(verilog_path): - if f.endswith(".v"): - if abspath: - verilog_files += [verilog_path + "/" + f] - else: - verilog_files += [f] - return verilog_files - - def prepare_rtlsim(self): - """Creates a Verilator emulation library for the RTL code generated - for this node, sets the rtlsim_so attribute to its path and returns - a PyVerilator wrapper around it.""" - - if PyVerilator is None: - raise ImportError("Installation of PyVerilator is required.") - - verilog_files = self.get_all_verilog_filenames(abspath=True) - single_src_dir = make_build_dir("rtlsim_" + self.onnx_node.name + "_") - tmp_build_dir = make_build_dir("pyverilator_" + self.onnx_node.name + "_") - target_file = single_src_dir + "/" + self.get_verilog_top_module_name() + ".v" - make_single_source_file(verilog_files, target_file) - - # build the Verilator emu library - sim = PyVerilator.build( - self.get_verilog_top_module_name() + ".v", - build_dir=tmp_build_dir, - verilog_path=[single_src_dir], - trace_depth=get_rtlsim_trace_depth(), - top_module_name=self.get_verilog_top_module_name(), - ) - # save generated lib 
filename in attribute - self.set_nodeattr("rtlsim_so", sim.lib._name) - return sim - def get_rtlsim(self): """Return a PyVerilator wrapper for the Verilator emulation library for this node.""" @@ -249,27 +160,27 @@ def uram_efficiency_estimation(self): def bram_estimation(self): """Function for BRAM resource estimation, is member function of - HLSCustomOp class but has to be filled by every node""" + HWCustomOp class but has to be filled by every node""" return 0 def uram_estimation(self): """Function for UltraRAM resource estimation, is member function of - HLSCustomOp class but has to be filled by every node""" + HWCustomOp class but has to be filled by every node""" return 0 def lut_estimation(self): """Function for LUT resource estimation, is member function of - HLSCustomOp class but has to be filled by every node""" + HWCustomOp class but has to be filled by every node""" return 0 def dsp_estimation(self): """Function for DSP resource estimation, is member function of - HLSCustomOp class but has to be filled by every node""" + HWCustomOp class but has to be filled by every node""" return 0 def get_exp_cycles(self): """Function for estimation of expected cycles for set folding, - is member function of HLSCustomOp class but has to be filled + is member function of HWCustomOp class but has to be filled by every node""" return 0 @@ -280,215 +191,6 @@ def get_op_and_param_counts(self): {op_ : , param_: }.""" return {} - def code_generation_ipgen(self, model, fpgapart, clk): - """Generates c++ code and tcl script for ip generation.""" - node = self.onnx_node - - # generate top cpp file for ip generation - path = self.get_nodeattr("code_gen_dir_ipgen") - self.code_gen_dict["$AP_INT_MAX_W$"] = [str(self.get_ap_int_max_w())] - self.generate_params(model, path) - self.global_includes() - self.defines("ipgen") - self.blackboxfunction() - self.pragmas() - self.docompute() - - template = self.ipgen_template - - for key in self.code_gen_dict: - # transform list into long 
string separated by '\n' - code_gen_line = "\n".join(self.code_gen_dict[key]) - template = template.replace(key, code_gen_line) - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - f = open(os.path.join(code_gen_dir, "top_{}.cpp".format(node.name)), "w") - f.write(template) - f.close() - self.code_gen_dict.clear() - - # generate tcl script for ip generation - self.code_gen_dict["$PROJECTNAME$"] = ["project_{}".format(node.name)] - self.code_gen_dict["$HWSRCDIR$"] = [code_gen_dir] - self.code_gen_dict["$FPGAPART$"] = [fpgapart] - self.code_gen_dict["$TOPFXN$"] = [node.name] - self.code_gen_dict["$CLKPERIOD$"] = [str(clk)] - self.code_gen_dict["$DEFAULT_DIRECTIVES$"] = self.ipgen_default_directives() - self.code_gen_dict["$EXTRA_DIRECTIVES$"] = self.ipgen_extra_directives() - - template = self.ipgentcl_template - - for key in self.code_gen_dict: - # transform list into long string separated by '\n' - code_gen_line = "\n".join(self.code_gen_dict[key]) - template = template.replace(key, code_gen_line) - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - f = open(os.path.join(code_gen_dir, "hls_syn_{}.tcl".format(node.name)), "w") - f.write(template) - f.close() - self.code_gen_dict.clear() - - def ipgen_default_directives(self): - """Return list of default HLS synthesis directives""" - - default_directives = [ - "set_param hls.enable_hidden_option_error false", - "config_compile -disable_unroll_code_size_check -pipeline_style flp", - "config_interface -m_axi_addr64", - "config_rtl -module_auto_prefix", - "config_rtl -deadlock_detection none", - ] - return default_directives - - def ipgen_extra_directives(self): - "Return a list of extra tcl directives for HLS synthesis." 
- return [] - - def ipgen_singlenode_code(self): - """Builds the bash script for IP generation using the CallHLS utility.""" - node = self.onnx_node - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - builder = CallHLS() - builder.append_tcl(code_gen_dir + "/hls_syn_{}.tcl".format(node.name)) - builder.set_ipgen_path(code_gen_dir + "/project_{}".format(node.name)) - builder.build(code_gen_dir) - ipgen_path = builder.ipgen_path - assert os.path.isdir(ipgen_path), "IPGen failed: %s not found" % (ipgen_path) - self.set_nodeattr("ipgen_path", ipgen_path) - ip_path = ipgen_path + "/sol1/impl/ip" - assert os.path.isdir( - ip_path - ), "IPGen failed: %s not found. Check log under %s" % (ip_path, code_gen_dir) - self.set_nodeattr("ip_path", ip_path) - vlnv = "xilinx.com:hls:%s:1.0" % node.name - self.set_nodeattr("ip_vlnv", vlnv) - - def code_generation_cppsim(self, model): - """Generates c++ code for simulation (cppsim).""" - node = self.onnx_node - path = self.get_nodeattr("code_gen_dir_cppsim") - self.code_gen_dict["$AP_INT_MAX_W$"] = [str(self.get_ap_int_max_w())] - self.generate_params(model, path) - self.global_includes() - self.defines("cppsim") - self.read_npy_data() - self.strm_decl() - self.pragmas() - self.docompute() - self.dataoutstrm() - self.save_as_npy() - - template = self.docompute_template - - for key in self.code_gen_dict: - # transform list into long string separated by '\n' - code_gen_line = "\n".join(self.code_gen_dict[key]) - template = template.replace(key, code_gen_line) - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - f = open(os.path.join(code_gen_dir, "execute_{}.cpp".format(node.op_type)), "w") - f.write(template) - f.close() - self.code_gen_dict.clear() - - def code_generation_ipi(self): - """Constructs and returns the TCL for node instantiation in Vivado IPI.""" - vlnv = self.get_nodeattr("ip_vlnv") - cmd = ["create_bd_cell -type ip -vlnv %s %s" % (vlnv, self.onnx_node.name)] - return cmd - - def 
compile_singlenode_code(self): - """Builds the bash script for compilation using the CppBuilder from - finn.util.basic and executes the script to produce the executable.""" - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - builder = CppBuilder() - # to enable additional debug features please uncommand the next line - # builder.append_includes("-DDEBUG") - builder.append_includes("-I$FINN_ROOT/src/finn/qnn-data/cpp") - builder.append_includes("-I$FINN_ROOT/deps/cnpy/") - builder.append_includes("-I$FINN_ROOT/deps/finn-hlslib") - builder.append_includes("-I$FINN_ROOT/custom_hls") - builder.append_includes("-I{}/include".format(os.environ["HLS_PATH"])) - builder.append_includes("--std=c++14") - builder.append_includes("-O3") - builder.append_sources(code_gen_dir + "/*.cpp") - builder.append_sources("$FINN_ROOT/deps/cnpy/cnpy.cpp") - builder.append_includes("-lz") - builder.set_executable_path(code_gen_dir + "/node_model") - builder.build(code_gen_dir) - self.set_nodeattr("executable_path", builder.executable_path) - - def dynamic_input_to_npy(self, context, count, target_dir=""): - """Saves input (given context) into .npy files. - - Count indicates the number of inputs that have to be saved.""" - node = self.onnx_node - if target_dir == "": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - if code_gen_dir == "": - raise Exception( - """ - Found no codegen dir for this node, did you run the prepare_cppsim transformation? 
- """ - ) - target_dir = code_gen_dir - # create a npy file for each input of the node (in_ind is input index) - # assuming dynamic inputs start from 0 - for in_ind in range(count): - current_input_name = node.input[in_ind] - input_array = context[current_input_name] - if in_ind == 0: - expected_inp_shape = self.get_folded_input_shape() - idt = self.get_input_datatype() - else: - expected_inp_shape = self.get_folded_input_shape(in_ind) - idt = self.get_input_datatype(in_ind) - reshaped_input = input_array.reshape(expected_inp_shape) - if idt == DataType["BIPOLAR"]: - # store bipolar activations as binary - reshaped_input = (reshaped_input + 1) / 2 - # make copy before saving the array - reshaped_input = reshaped_input.copy() - np.save( - os.path.join(target_dir, "input_{}.npy".format(in_ind)), - reshaped_input, - ) - - def npy_to_dynamic_output(self, context): - """Reads the output from an output.npy file generated from cppsim and - places its content into the context dictionary.""" - node = self.onnx_node - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - output = np.load("{}/output.npy".format(code_gen_dir)) - exp_shape = self.get_normal_output_shape() - context[node.output[0]] = output.reshape(exp_shape) - - def npy_to_dynamic_outputs(self, context, npy_list): - """Reads the output from .npy files generated from cppsim and places - their content into the context dictionary. 
- npy_list is a list specifying which files to read, and its order must - match the order of node outputs.""" - node = self.onnx_node - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - for i in range(len(npy_list)): - output = np.load("{}/{}".format(code_gen_dir, npy_list[i])) - if i == 0: - exp_shape = self.get_normal_output_shape() - else: - exp_shape = self.get_normal_output_shape(i) - context[node.output[i]] = output.reshape(exp_shape) - - def exec_precompiled_singlenode_model(self): - """Executes precompiled executable.""" - executable_path = self.get_nodeattr("executable_path") - if executable_path == "": - raise Exception( - """ -Found no executable for this node, did you run the codegen and -compilation transformations? - """ - ) - process_execute = subprocess.Popen(executable_path, stdout=subprocess.PIPE) - process_execute.communicate() - def reset_rtlsim(self, sim): """Sets reset input in pyverilator to zero, toggles the clock and set it back to one""" @@ -502,12 +204,6 @@ def toggle_clk(self, sim): sim.io.ap_clk = 1 sim.io.ap_clk = 0 - def hls_sname(self): - """Get the naming convention used by Vitis HLS for stream signals - Example: the TDATA for a stream called "out" would be out_V_TDATA. - """ - return "V" - def rtlsim(self, sim, inp, inp2=None): """Runs the pyverilator simulation by passing the input values to the simulation, toggle the clock and observing the execution time. 
Function contains also an @@ -608,104 +304,19 @@ def rtlsim_multi_io(self, sim, io_dict): ) self.set_nodeattr("cycles_rtlsim", total_cycle_count) - def execute_node(self, context, graph): - """Executes single node using cppsim or rtlsim.""" - mode = self.get_nodeattr("exec_mode") - if mode == "cppsim": - # save input(s) - self.dynamic_input_to_npy(context, 1) - # execute the precompiled model - self.exec_precompiled_singlenode_model() - # load output npy file - self.npy_to_dynamic_output(context) - elif mode == "rtlsim": - pass - - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - def generate_params(self, model, path): """Function to generate parameters (i.e. weights and thresholds), - is member function of HLSCustomOp class but has to be filled - by every node.""" + is member function of HWCustomOp class but has to be filled + by every node that needs to generate parameters.""" pass @abstractmethod def get_number_output_values(self): """Function to get the number of expected output values, - is member function of HLSCustomOp class but has to be filled - by every node.""" - pass - - @abstractmethod - def global_includes(self): - """Function to set the global includes for c++ code that has to be generated - for cppsim or rtlsim, is member function of HLSCustomOp class but has to - be filled by every node.""" - pass - - @abstractmethod - def defines(self, var): - """Function to set the define commands for c++ code that has to be generated - for cppsim or rtlsim, is member function of HLSCustomOp class but has to - be filled by every node. - - var: makes it possible to reuse the function for different c++ code generation. - I.e. 
if set to "ipgen" in MatrixVectorActivation additional PRAGMA defines are - added.""" - pass - - @abstractmethod - def read_npy_data(self): - """Function to generate the commands for reading data from .npy file in c++, - is member function of HLSCustomOp class but has to be filled by every node.""" - pass - - @abstractmethod - def strm_decl(self): - """Function to generate the commands for the stream declaration in c++, - is member function of HLSCustomOp class but has to be filled - by every node.""" - pass - - @abstractmethod - def docompute(self): - """Function to generate the commands for the computational part of the - c++ code, is member function of HLSCustomOp class but has to be filled - by every node.""" - pass - - @abstractmethod - def dataoutstrm(self): - """Function to generate the commands for reading out data from c++ and convert - into npy format, is member function of HLSCustomOp class but has to be filled - by every node.""" - pass - - @abstractmethod - def save_as_npy(self): - """Function to generate the commands for saving data in .npy file in c++, - is member function of HLSCustomOp class but has to be filled by every node.""" - pass - - @abstractmethod - def blackboxfunction(self): - """Function to generate a blackbock function in c++ from which an IP block - will be generated, is member function of HLSCustomOp class but has to be filled + is member function of HWCustomOp class but has to be filled by every node.""" pass - @abstractmethod - def pragmas(self): - """Function to generate the pragma commands in c++, is member function of - HLSCustomOp class but has to be filled by every node.""" - pass - def get_input_datatype(self, ind=0): """Returns FINN DataType of input stream ind.""" raise Exception("get_input_datatype not implemented for this op") @@ -750,28 +361,12 @@ def get_outstream_width_padded(self, ind=0): out_width = self.get_outstream_width(ind=ind) return roundup_to_integer_multiple(out_width, 8) - def get_ap_int_max_w(self): - 
"""Return the maximum width of any ap_int used in this module. Used to set the - AP_INT_MAX_W definition for HLS.""" - instream = self.get_instream_width() - outstream = self.get_outstream_width() - ret = max([instream, outstream]) - assert ret <= 32768, ( - "AP_INT_MAX_W=%d is larger than allowed maximum of 32768" % ret - ) - return ret - def derive_characteristic_fxns(self, period, override_rtlsim_dict=None): """Return the unconstrained characteristic functions for this node.""" # ensure rtlsim is ready - assert self.get_nodeattr("rtlsim_so") != "", ( - "rtlsim not ready for " + self.onnx_node.name - ) + assert self.get_nodeattr("rtlsim_so") != "", "rtlsim not ready for " + self.onnx_node.name if self.get_nodeattr("io_chrc_period") > 0: - warnings.warn( - "Skipping node %s: already has FIFO characteristic" - % self.onnx_node.name - ) + warnings.warn("Skipping node %s: already has FIFO characteristic" % self.onnx_node.name) return exp_cycles = self.get_exp_cycles() n_inps = np.prod(self.get_folded_input_shape()[:-1]) @@ -802,9 +397,7 @@ def derive_characteristic_fxns(self, period, override_rtlsim_dict=None): # extra dicts to keep track of cycle-by-cycle transaction behavior # note that we restrict key names to filter out weight streams etc txns_in = {key: [] for (key, value) in io_dict["inputs"].items() if "in" in key} - txns_out = { - key: [] for (key, value) in io_dict["outputs"].items() if "out" in key - } + txns_out = {key: [] for (key, value) in io_dict["outputs"].items() if "out" in key} def monitor_txns(sim_obj): for inp in txns_in: diff --git a/src/finn/custom_op/fpgadataflow/labelselect.py b/src/finn/custom_op/fpgadataflow/labelselect.py new file mode 100644 index 0000000000..f4b098cff7 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/labelselect.py @@ -0,0 +1,186 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import numpy as np +import onnxruntime as rt +from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.util.basic import qonnx_make_model, roundup_to_integer_multiple + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + + +class LabelSelect(HWCustomOp): + """Abstraction layer for HW implementation of LabelSelect""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + odt_name = self.get_nodeattr("outputDataType") + if odt_name == "": + # If not provided compute min size + labels = self.get_nodeattr("Labels") + odt = DataType.get_smallest_possible(labels - 1) + # ensure a datatype divisible by 8-bits in case this is the last node + bw = roundup_to_integer_multiple(odt.bitwidth(), 8) + new_odt_name = odt.name.replace(str(odt.bitwidth()), str(bw)) + odt = DataType[new_odt_name] + odt_name = odt.name + self.set_nodeattr("outputDataType", odt_name) + + def get_nodeattr_types(self): + my_attrs = { + "Labels": ("i", True, 0), + "PE": ("i", True, 0), + "K": ("i", True, 0), + # FINN DataTypes for input + "inputDataType": ("s", True, ""), + "outputDataType": ("s", False, ""), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_normal_input_shape(self, ind=0): + nlabels = self.get_nodeattr("Labels") + vecs = list(self.get_nodeattr("numInputVectors")) + ishape = tuple(vecs + [nlabels]) + return ishape + + def get_folded_input_shape(self, ind=0): + nlabels = self.get_nodeattr("Labels") + pe = self.get_nodeattr("PE") + vecs = list(self.get_nodeattr("numInputVectors")) + assert nlabels % pe == 0, "PE must divide Labels" + folds = int(nlabels / pe) + folded_ishape = tuple(vecs + [folds, pe]) + return 
folded_ishape + + def get_normal_output_shape(self, ind=0): + k = self.get_nodeattr("K") + vecs = list(self.get_nodeattr("numInputVectors")) + oshape = tuple(vecs + [k]) + return oshape + + def get_folded_output_shape(self, ind=0): + k = self.get_nodeattr("K") + vecs = list(self.get_nodeattr("numInputVectors")) + oshape = tuple(vecs + [k, 1]) + return oshape + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input shape." + return helper.make_node( + "RandomNormal", + inputs=[], + outputs=[self.onnx_node.output[0]], + mean=0.0, + scale=1.0, + dtype=TensorProto.INT64, + shape=list(oshape), + ) + + def infer_node_datatype(self, model): + node = self.onnx_node + # check input datatype against property + idt = model.get_tensor_datatype(node.input[0]) + self.set_nodeattr("inputDataType", idt.name) + + odt = self.get_output_datatype() + model.set_tensor_datatype(self.onnx_node.output[0], odt) + + def verify_node(self): + pass + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + ret = DataType[self.get_nodeattr("inputDataType")] + return ret + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + ret = DataType[self.get_nodeattr("outputDataType")] + return ret + + def get_instream_width(self, ind=0): + """Returns input stream width.""" + ibits = self.get_input_datatype().bitwidth() + pe = self.get_nodeattr("PE") + in_width = pe * ibits + return in_width + + def get_outstream_width(self, ind=0): + """Returns output stream width.""" + return self.get_output_datatype().bitwidth() + + def get_number_output_values(self): + return self.get_nodeattr("K") + + def execute_node(self, context, graph): + # create a standard add node to help calculate the result + node = self.onnx_node + k = self.get_nodeattr("K") + + inp_values = 
context[node.input[0]] + oshape = context[node.output[0]].shape + ishape = inp_values.shape + inp = helper.make_tensor_value_info(node.input[0], TensorProto.FLOAT, ishape) + k_inp = helper.make_tensor_value_info("k_inp", TensorProto.INT64, [1]) + outp = helper.make_tensor_value_info(node.output[0], TensorProto.INT64, oshape) + val_outp = helper.make_tensor_value_info("val_outp", TensorProto.FLOAT, oshape) + node_topk = helper.make_node( + "TopK", + inputs=[node.input[0], "k_inp"], + outputs=["val_outp", node.output[0]], + ) + graph_topk = helper.make_graph( + nodes=[node_topk], + name="single-add-exec", + inputs=[inp, k_inp], + outputs=[val_outp, outp], + ) + + opset_version = self.onnx_opset_version + opset_imports = [helper.make_opsetid("", opset_version)] + onnx_kwargs = {"opset_imports": opset_imports} + model_topk = qonnx_make_model(graph_topk, **onnx_kwargs) + idict = {node.input[0]: inp_values, "k_inp": [k]} + sess = rt.InferenceSession(model_topk.SerializeToString()) + result = sess.run(None, idict) + context[node.output[0]] = np.asarray(result[1], dtype=np.float32).reshape(oshape) + + def get_exp_cycles(self): + nlabels = self.get_nodeattr("Labels") + pe = self.get_nodeattr("PE") + exp_cycles = nlabels / pe + return int(exp_cycles) diff --git a/src/finn/custom_op/fpgadataflow/lookup.py b/src/finn/custom_op/fpgadataflow/lookup.py index fd3e2b5b1c..ab6228a5d6 100644 --- a/src/finn/custom_op/fpgadataflow/lookup.py +++ b/src/finn/custom_op/fpgadataflow/lookup.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -27,25 +27,22 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import numpy as np -import os +import onnxruntime as rt import warnings -from math import ceil, log2 +from math import ceil +from onnx import TensorProto, helper from qonnx.core.datatype import DataType +from qonnx.util.basic import qonnx_make_model -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import ( - npy_to_rtlsim_input, - numpy_to_hls_code, - pack_innermost_dim_as_hex_string, - rtlsim_output_to_npy, -) +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp -class Lookup(HLSCustomOp): - "Streaming elementwise HLS lookup, mapping indices to values." +class Lookup(HWCustomOp): + """Abstraction layer for HW implementation of streaming elementwise lookup, + mapping indices to values.""" - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): my_attrs = { @@ -60,9 +57,9 @@ def get_nodeattr_types(self): # Input shape "InputShape": ("ints", False, [1]), # Memory mode - # const : parameters baked into bitfile (BRAM) + # internal_embedded : parameters baked into bitfile (BRAM) # external : lookup performed in external memory over AXI MM - "mem_mode": ("s", False, "const", ["const", "external"]), + "mem_mode": ("s", False, "internal_embedded", ["internal_embedded", "external"]), # Width for AXI-MM interface # only relevant when mem_mode="external" "ext_mem_width": ("i", False, 32), @@ -93,7 +90,7 @@ def get_folded_output_shape(self, ind=0): ishape = self.get_normal_input_shape() mem_mode = self.get_nodeattr("mem_mode") emb_dim = self.get_nodeattr("EmbeddingDim") - if mem_mode == "const": + if mem_mode == "internal_embedded": oshape = list(ishape) + [emb_dim] elif mem_mode == "external": ext_mem_width = self.get_nodeattr("ext_mem_width") @@ -156,296 +153,43 @@ def get_number_output_values(self): folded_oshape = self.get_folded_output_shape() return np.prod(folded_oshape[:-1]) - def 
global_includes(self): - mem_mode = self.get_nodeattr("mem_mode") - global_incls = [] - global_incls.append('#include "lookup.hpp"') - if mem_mode == "const": - global_incls.append('#include "embeddings.hpp"') - self.code_gen_dict["$GLOBALS$"] = global_incls - - def defines(self, var): - n_inputs = np.prod(self.get_folded_input_shape()[:-1]) - dtype = self.get_input_datatype() - elem_hls_type = dtype.get_hls_datatype_str() - emb_type = DataType[self.get_nodeattr("EmbeddingType")] - emb_hls_type = emb_type.get_hls_datatype_str() - emb_dim = self.get_nodeattr("EmbeddingDim") - mem_mode = self.get_nodeattr("mem_mode") - my_defines = [] - my_defines.append("#define NumInputs %d" % n_inputs) - if mem_mode == "external": - ext_mem_width = self.get_nodeattr("ext_mem_width") - ext_mem_emb_size = self.get_folded_output_shape()[-2] - ext_mem_emb_align = ceil(log2(ext_mem_emb_size)) - my_defines.append("#define MemBits %d" % ext_mem_width) - my_defines.append("#define EmbeddingSize %d" % ext_mem_emb_size) - my_defines.append("#define EmbeddingAlign %d" % ext_mem_emb_align) - my_defines.append("#define T_SRC %s" % elem_hls_type) - my_defines.append("#define T_DST ap_uint") - elif mem_mode == "const": - my_defines.append( - "#define NumEmbeddings %d" % self.get_nodeattr("NumEmbeddings") - ) - my_defines.append("#define EmbeddingDim %d" % emb_dim) - my_defines.append("#define InputType %s" % elem_hls_type) - my_defines.append("#define EmbeddingType %s" % emb_hls_type) - self.code_gen_dict["$DEFINES$"] = my_defines - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "int64_t" - npy_in = "%s/input_0.npy" % code_gen_dir - 
self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) - ) - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s", %s);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - oshape_cpp_str, - npy_out, - "false", - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0 ("in0");'.format(self.get_instream_width()) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) - ) - - def docompute(self): - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "const": - self.code_gen_dict["$DOCOMPUTE$"] = [ - """StreamingLookup(in0, out, embeddings);""" - ] - elif mem_mode == "external": - self.code_gen_dict["$DOCOMPUTE$"] = [ - """StreamingLookup_ext(in0, out, mem, size, oob_count, - oob_irq);""" - ] - - def blackboxfunction(self): - mem_mode = self.get_nodeattr("mem_mode") - ibits = self.get_instream_width() - packed_input_hls_type = "ap_uint<%d>" % ibits - obits = self.get_outstream_width() - packed_output_hls_type = "ap_uint<%d>" % obits - if mem_mode == "const": - 
self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)" - % (self.onnx_node.name, packed_input_hls_type, packed_output_hls_type) - ] - elif mem_mode == "external": - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void " - + self.onnx_node.name - + "(hls::stream &in0, hls::stream &out, " - + "T_DST const *const mem, unsigned const size, " - + "unsigned &oob_count, bool &oob_irq)" - ] - - def pragmas(self): - mem_mode = self.get_nodeattr("mem_mode") - my_pragmas = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() - ] - my_pragmas.append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() - ) - my_pragmas.append("#pragma HLS INTERFACE ap_ctrl_none port=return") - if mem_mode == "const": - my_pragmas.append( - "#pragma HLS BIND_STORAGE variable=embeddings type=ROM_2P impl=BRAM" - ) - elif mem_mode == "external": - my_pragmas.append("#pragma HLS INTERFACE m_axi offset=slave port=mem") - my_pragmas.append("#pragma HLS INTERFACE s_axilite port=mem bundle=control") - my_pragmas.append( - "#pragma HLS INTERFACE s_axilite port=size bundle=control" - ) - my_pragmas.append( - "#pragma HLS INTERFACE s_axilite port=oob_count bundle=control" - ) - my_pragmas.append("#pragma HLS INTERFACE ap_none port=oob_irq") - else: - raise Exception("Unrecognized mem_mode: " + mem_mode) - self.code_gen_dict["$PRAGMAS$"] = my_pragmas - - def generate_params(self, model, path): - mem_mode = self.get_nodeattr("mem_mode") - embeddings = model.get_initializer(self.onnx_node.input[1]) - if mem_mode == "const": - code_gen_dir = path - weight_filename = "{}/embeddings.hpp".format(code_gen_dir) - edt = DataType[self.get_nodeattr("EmbeddingType")] - # obits = self.get_outstream_width() - # packed_output_hls_type = "ap_uint<%d>" % obits - assert np.vectorize(edt.allowed)( - embeddings - ).all(), "Embeddings can't be expressed with type %s" % str(edt) - # reverse innertmost dim in embeddings to remain compatible with - 
# how we normally encode the data in FINN - embeddings_rev = np.flip(embeddings, -1) - embeddings_hls_code = numpy_to_hls_code( - embeddings_rev, edt, "embeddings", True, False - ) - f_thresh = open(weight_filename, "w") - f_thresh.write(embeddings_hls_code) - f_thresh.close() - elif mem_mode == "external": - edt = DataType[self.get_nodeattr("EmbeddingType")] - ext_mem_width = self.get_nodeattr("ext_mem_width") - assert edt.bitwidth() == 8, ( - "Lookup with mem_mode=external " - + "only works with 8-bit embeddings but found " - + str(edt) - ) - emb_dim = self.get_nodeattr("EmbeddingDim") - # need to zero-pad embeddings in external mode for burst alignment - # compute how much padding we need - emb_elems_per_ext_mem_width = self.get_folded_output_shape()[-1] - ext_mem_emb_size = self.get_folded_output_shape()[-2] - ext_mem_emb_align = ceil(log2(ext_mem_emb_size)) - align_factor = int((ext_mem_width / 8) * 2**ext_mem_emb_align) - pad_amount = align_factor - emb_dim - embeddings_padded = np.pad(embeddings, [(0, 0), (0, pad_amount)]) - # reshape for packing the innermost dim - embeddings_padded = embeddings_padded.reshape( - -1, emb_elems_per_ext_mem_width - ) - weight_filename = "%s/%s.dat" % (path, self.onnx_node.name) - ret = pack_innermost_dim_as_hex_string( - embeddings_padded, edt, ext_mem_width, True, prefix="" - ) - with open(weight_filename, "w") as f: - for current_line in ret: - f.write(current_line + "\n") - else: - raise Exception("Unrecognized mem_mode: " + mem_mode) - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") + # create a standard add node to help calculate the result node = self.onnx_node - exp_ishape = tuple(self.get_normal_input_shape()) - exp_oshape = tuple(self.get_normal_output_shape()) - folded_ishape = tuple(self.get_folded_input_shape()) - folded_oshape = tuple(self.get_folded_output_shape()) - mem_mode = self.get_nodeattr("mem_mode") - assert ( - mem_mode == "const" - ), "Only mem_mode=const is supported 
for simulation of Lookup layer" - - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - inp = context[node.input[0]] - assert inp.dtype == np.int64, "Inputs must be contained in int64 ndarray" - assert inp.shape == exp_ishape, """Input shape doesn't match expected shape.""" - export_idt = self.get_input_datatype() - odt = self.get_output_datatype() + inp_values = context[node.input[0]] + ishape = inp_values.shape + data_values = context[node.input[1]] + dshape = data_values.shape + oshape = context[node.output[0]].shape + inp = helper.make_tensor_value_info(node.input[0], TensorProto.INT64, ishape) + data = helper.make_tensor_value_info(node.input[1], TensorProto.FLOAT, dshape) + outp = helper.make_tensor_value_info(node.output[0], TensorProto.FLOAT, oshape) + node_gather = helper.make_node( + "Gather", + inputs=[node.input[1], node.input[0]], + outputs=[node.output[0]], + ) + graph_gather = helper.make_graph( + nodes=[node_gather], + name="single-gather-exec", + inputs=[data, inp], + outputs=[outp], + ) - reshaped_input = inp.reshape(folded_ishape) - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - assert ( - context[node.output[0]].shape == folded_oshape - ), "cppsim did not produce expected folded output shape" - context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - 
super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, - out_npy_path, - odt, - out_shape, - packed_bits, - target_bits, - reverse_inner=True, - ) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output shape doesn't match expected shape.""" + opset_version = 13 + opset_imports = [helper.make_opsetid("", opset_version)] + onnx_kwargs = {"opset_imports": opset_imports} + model_gather = qonnx_make_model(graph_gather, **onnx_kwargs) + idict = {node.input[0]: inp_values, node.input[1]: data_values} + sess = rt.InferenceSession(model_gather.SerializeToString()) + result = sess.run(None, idict) + context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape) def bram_estimation(self): mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "const": + if mem_mode == "internal_embedded": # current calculation assumes embeddings always stored in BRAM_18Ks - # when mem_mode is const + # when mem_mode is internal_embedded width_factor = ceil(self.get_outstream_width() / 16) depth_factor = ceil(self.get_nodeattr("NumEmbeddings") / 1024) return width_factor * depth_factor @@ -461,15 +205,6 @@ def bram_efficiency_estimation(self): bram16_est_capacity = bram16_est * 18 * 1024 return ebits / bram16_est_capacity - def get_ap_int_max_w(self): - parent_max = super().get_ap_int_max_w() - mem_mode = self.get_nodeattr("mem_mode") - ext_mem_width = 
self.get_nodeattr("ext_mem_width") - if mem_mode == "external": - return max(ext_mem_width, parent_max) - else: - return parent_max - def get_verilog_top_module_intf_names(self): intf_names = super().get_verilog_top_module_intf_names() mem_mode = self.get_nodeattr("mem_mode") diff --git a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py index 72128fda4c..7bbe4c04e9 100644 --- a/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/matrixvectoractivation.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,25 +28,20 @@ import math import numpy as np -import os +import onnx.numpy_helper as np_helper +import qonnx.custom_op.general.xnorpopcount as xp import textwrap import warnings from qonnx.core.datatype import DataType +from qonnx.custom_op.general.multithreshold import multithreshold from qonnx.util.basic import ( calculate_matvec_accumulator_range, interleave_matrix_outer_dim_from_partitions, roundup_to_integer_multiple, ) -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import ( - npy_to_rtlsim_input, - numpy_to_hls_code, - pack_innermost_dim_as_hex_string, - rtlsim_output_to_npy, -) - -from . import templates +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp +from finn.util.data_packing import numpy_to_hls_code, pack_innermost_dim_as_hex_string # ONNX i/o tensor shape assumptions for MatrixVectorActivation: # input 0 is the input tensor, shape (.., i_size) = (..., MW) @@ -56,13 +51,11 @@ # the ... 
here can be any shape (representing groups of vectors) -class MatrixVectorActivation(HLSCustomOp): - """Class that corresponds to finn-hls Matrix_Vector_Activate(_Stream)_Batch - function.""" +class MVAU(HWCustomOp): + """Abstraction layer for HW implementation of MatrixVectorActivation layers.""" - def __init__(self, onnx_node): - super().__init__(onnx_node) - self.decoupled_wrapper = templates.decoupled_wrapper + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): my_attrs = { @@ -70,7 +63,7 @@ def get_nodeattr_types(self): "SIMD": ("i", True, 0), "MW": ("i", True, 0), "MH": ("i", True, 0), - "resType": ("s", False, "lut", {"auto", "lut", "dsp"}), + "resType": ("s", False, "auto", {"auto", "lut", "dsp"}), "ActVal": ("i", False, 0), # FINN DataTypes for inputs, weights, outputs "inputDataType": ("s", True, ""), @@ -89,11 +82,16 @@ def get_nodeattr_types(self): # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) "numInputVectors": ("ints", False, [1]), # memory mode for the FC weights - # const -- embedded weights, default, long compile/synth times - # decoupled -- streaming weights with weight streamer packaged inside IP + # internal_embedded -- embedded weights, long compile/synth times + # internal_decoupled -- default, streaming weights with streamer packaged inside IP # external -- streaming weights with external streamer - "mem_mode": ("s", False, "const", {"const", "decoupled", "external"}), - # FPGA resource type for memories in decoupled mode + "mem_mode": ( + "s", + False, + "internal_decoupled", + {"internal_embedded", "internal_decoupled", "external"}, + ), + # FPGA resource type for memories in internal_decoupled mode # auto -- let Vivado decide # block -- use BRAM # distributed -- use LUTRAM @@ -115,8 +113,8 @@ def get_nodeattr_types(self): "auto", {"auto", "block", "distributed"}, ), - # (mem_mode = decoupled only) whether weights will be writable through - # an AXI-lite 
interface during runtime + # (mem_mode = internal_decoupled only) whether weights will be + # writeable through an AXI-lite interface during runtime # 1 for enabled, 0 for disabled. # see finn-rtllib/memstream/doc/README for more about the memory # address map used for writable weights @@ -129,44 +127,40 @@ def get_nodeattr_types(self): my_attrs.update(super().get_nodeattr_types()) return my_attrs - def calc_wmem(self): - """Calculates and returns WMEM.""" - mw = self.get_nodeattr("MW") - mh = self.get_nodeattr("MH") - pe = self.get_nodeattr("PE") - simd = self.get_nodeattr("SIMD") - assert mh % pe == 0, "Requirement MH divisable by PE is violated." - assert mw % simd == 0, "Requirement MW divisable by SIMD is violated." - wmem = mw * mh // (pe * simd) - return wmem - - def calc_tmem(self): - """Calculates and returns TMEM.""" - if self.get_nodeattr("noActivation") == 1: - return 0 - else: - mh = self.get_nodeattr("MH") - pe = self.get_nodeattr("PE") - return mh // pe - - def make_shape_compatible_op(self, model): - oshape = self.get_normal_output_shape() - return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): + def execute_node(self, context, graph): node = self.onnx_node - idt = model.get_tensor_datatype(node.input[0]) - if idt != self.get_input_datatype(): - warn_str = "inputDataType changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype()), - str(idt), - ) - warnings.warn(warn_str) - self.set_nodeattr("inputDataType", idt.name) - # set output datatype from property - odt = self.get_output_datatype() - model.set_tensor_datatype(node.output[0], odt) + in_act = context[node.input[0]] + mvau_w_init = [x for x in graph.initializer if x.name == node.input[1]][0] + mvau_w = np_helper.to_array(mvau_w_init) + # Matrix multiplication + if self.get_nodeattr("binaryXnorMode"): + # Note: activation/weights are expected to be binary + # (by design coming from the transformation inferring this operation mode) + result = 
xp.xnorpopcountmatmul(in_act, mvau_w) + elif ( + self.get_nodeattr("inputDataType") == "BIPOLAR" + and self.get_nodeattr("weightDataType") == "BIPOLAR" + ): + # Convert to binary and use xnorpopcountmatmul function + result = xp.xnorpopcountmatmul((in_act + 1) / 2, (mvau_w + 1) / 2) + else: + # Regular matrix multiplication + result = np.matmul(in_act, mvau_w) + if self.get_nodeattr("noActivation") == 0: + mvau_thr_init = [x for x in graph.initializer if x.name == node.input[2]][0] + mvau_thr = np_helper.to_array(mvau_thr_init) + odt_is_bipolar = self.get_nodeattr("outputDataType") == "BIPOLAR" + out_scale = 2 if odt_is_bipolar else 1 + out_bias = -1 if odt_is_bipolar else self.get_nodeattr("ActVal") + if result.ndim == 4: + # NHWC to NCHW for multithreshold node + result = result.transpose((0, 3, 1, 2)) + result = multithreshold(result, mvau_thr, out_scale, out_bias) + if result.ndim == 4: + # NCHW to NHWC + result = result.transpose((0, 2, 3, 1)) + + context[node.output[0]] = result def verify_node(self): info_messages = [] @@ -192,9 +186,7 @@ def verify_node(self): self.get_nodeattr("outputDataType") info_messages.append("All necessary attributes exist") except Exception: - info_messages.append( - """The required MatrixVectorActivation attributes do not exist.""" - ) + info_messages.append("""The required MatrixVectorActivation attributes do not exist.""") # verify the number of inputs depending on noActivation value # check noActivation value to determine the number of inputs @@ -223,176 +215,26 @@ def verify_node(self): no_act ) ) - return info_messages - def uram_estimation(self): - P = self.get_nodeattr("PE") - Q = self.get_nodeattr("SIMD") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - D_in = self.get_nodeattr("MW") - D_out = self.get_nodeattr("MH") - omega = (D_in * D_out) / (Q * P) - mem_width = Q * W * P - mmode = self.get_nodeattr("mem_mode") - mstyle = self.get_nodeattr("ram_style") - if ( - (mmode == "decoupled" and mstyle != "ultra") - or 
(mmode == "const" and self.calc_wmem() <= 128) - or (mmode == "external") - ): - return 0 - width_multiplier = math.ceil(mem_width / 72) - depth_multiplier = math.ceil(omega / 4096) - return width_multiplier * depth_multiplier - - def bram_estimation(self): - """Calculates resource estimation for BRAM based on: - - FINN-R: An End-to-End Deep-Learning Framework for Fast - Exploration of Quantized Neural Networks - - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien, - Y. Umuroglu, M. Leeser and K. Vissers - - 12. Sep 2018 - """ - # TODO add in/out FIFO contributions - P = self.get_nodeattr("PE") - Q = self.get_nodeattr("SIMD") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - D_in = self.get_nodeattr("MW") - D_out = self.get_nodeattr("MH") - omega = (D_in * D_out) / (Q * P) - mem_width = Q * W * P - mmode = self.get_nodeattr("mem_mode") - mstyle = self.get_nodeattr("ram_style") - if ( - (mmode == "decoupled" and mstyle in ["distributed", "ultra"]) - or (mmode == "const" and self.calc_wmem() <= 128) - or (mmode == "external") - ): - return 0 - # assuming SDP mode RAMB18s (see UG573 Table 1-10) - # assuming decoupled (RTL) memory, which is more efficient than const (HLS) - if mem_width == 1: - return math.ceil(omega / 16384) - elif mem_width == 2: - return math.ceil(omega / 8192) - elif mem_width <= 4: - return (math.ceil(omega / 4096)) * (math.ceil(mem_width / 4)) - elif mem_width <= 9: - return (math.ceil(omega / 2048)) * (math.ceil(mem_width / 9)) - elif mem_width <= 18 or omega > 512: - return (math.ceil(omega / 1024)) * (math.ceil(mem_width / 18)) - else: - return (math.ceil(omega / 512)) * (math.ceil(mem_width / 36)) - - def bram_efficiency_estimation(self): - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - D_in = self.get_nodeattr("MW") - D_out = self.get_nodeattr("MH") - bram16_est = self.bram_estimation() - if bram16_est == 0: - return 1 - wbits = W * D_in * D_out - bram16_est_capacity = bram16_est * 36 * 512 - return wbits / 
bram16_est_capacity - - def uram_efficiency_estimation(self): - """Function for URAM efficiency estimation: actual parameter storage - needed divided by the allocated URAM storage (from estimation)""" - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - D_in = self.get_nodeattr("MW") - D_out = self.get_nodeattr("MH") - uram_est = self.uram_estimation() - if uram_est == 0: - return 1 - wbits = W * D_in * D_out - uram_est_capacity = uram_est * 72 * 4096 - return wbits / uram_est_capacity - - def lut_estimation(self): - """Calculates resource estimations for LUTs based on: - - FINN-R: An End-to-End Deep-Learning Framework for Fast - Exploration of Quantized Neural Networks - - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien, - Y. Umuroglu, M. Leeser and K. Vissers - - 12. Sep 2018 - """ - # TODO add in/out FIFO contributions - P = self.get_nodeattr("PE") - Q = self.get_nodeattr("SIMD") - MW = self.get_nodeattr("MW") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - # determine tdt with input and weight data types - idt = self.get_input_datatype() - A = idt.bitwidth() - # parameters from experiments in paper mentioned above - c0 = 300 - c1 = 1.1 - c2 = 0 - mmode = self.get_nodeattr("mem_mode") - mstyle = self.get_nodeattr("ram_style") - if (mmode == "decoupled" and mstyle == "distributed") or ( - mmode == "const" and self.calc_wmem() <= 128 - ): - c2 = (P * Q * W) * math.ceil(self.calc_wmem() / 64) - - # multiplication - res_type = self.get_nodeattr("resType") - if res_type == "dsp": - mult_luts = 0 - else: - mult_luts = Q * (2 * math.ceil((W + A) / 6) - 1) * (W + A) - # adder tree - addertree_luts = (W + A) * (2 * Q - 1) - # accumulator - acc_bits = W + A + np.ceil(math.log(MW, 2)) - acc_luts = acc_bits - # thresholds and threshold comparators - thr_luts = 0 - comp_luts = 0 - noact = self.get_nodeattr("noActivation") - if noact == 0: - odt = self.get_output_datatype() - B = odt.bitwidth() - thr_luts = (2**B - 1) * acc_bits * 
math.ceil(self.calc_tmem() / 64) - comp_luts = (2**B - 1) * acc_bits - - return int( - c0 - + c1 * (P * (mult_luts + addertree_luts + acc_luts + thr_luts + comp_luts)) - + c2 - ) - - def dsp_estimation(self): - # multiplication - P = self.get_nodeattr("PE") - res_type = self.get_nodeattr("resType") - Q = self.get_nodeattr("SIMD") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - idt = self.get_input_datatype() - A = idt.bitwidth() - if res_type == "dsp": - mult_dsp = P * Q * np.ceil((W + A) / 48) # TODO: more accurate modelling - else: - mult_dsp = 0 - return int(mult_dsp) + def make_shape_compatible_op(self, model): + oshape = self.get_normal_output_shape() + return super().make_const_shape_op(oshape) - def get_exp_cycles(self): - pe = self.get_nodeattr("PE") - simd = self.get_nodeattr("SIMD") - num_inp_vec = self.get_nodeattr("numInputVectors") - mh = self.get_nodeattr("MH") - mw = self.get_nodeattr("MW") - # since mmv != 1 is not supported yet, we set mmv for now to 1 - mmv = 1 - exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv - return int(exp_cycles) + def infer_node_datatype(self, model): + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype()), + str(idt), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType", idt.name) + # set output datatype from property + odt = self.get_output_datatype() + model.set_tensor_datatype(node.output[0], odt) def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" @@ -409,6 +251,10 @@ def get_weight_datatype(self): """Returns FINN DataType of weights.""" return DataType[self.get_nodeattr("weightDataType")] + def get_accumulator_datatype(self): + """Returns FINN DataType of accumulator""" + return DataType[self.get_nodeattr("accDataType")] + def get_output_datatype(self, ind=0): """Returns FINN DataType of 
output.""" return DataType[self.get_nodeattr("outputDataType")] @@ -424,9 +270,10 @@ def get_outstream_width(self, ind=0): return out_width def get_weightstream_width(self): - """Returns weight stream width. Used only in decoupled mode.""" + """Returns weight stream width. + Used only in internal_decoupled and external mode.""" if ( - self.get_nodeattr("mem_mode") == "decoupled" + self.get_nodeattr("mem_mode") == "internal_decoupled" or self.get_nodeattr("mem_mode") == "external" ): pe = self.get_nodeattr("PE") @@ -439,21 +286,10 @@ def get_weightstream_width(self): def get_weightstream_width_padded(self): """Returns weight stream width padded to a multiple of 8. This is required - by the AXI Stream spec. Used in decoupled mode.""" + by the AXI Stream spec. Used in internal_decoupled mode.""" weight_width = self.get_weightstream_width() return roundup_to_integer_multiple(weight_width, 8) - def get_ap_int_max_w(self): - # base class impl (max of inp/out stream widths) - max_of_io = super().get_ap_int_max_w() - # decoupled mode weight stream - weightstream = self.get_weightstream_width() - # single PE weight entry - weight_bits = self.get_weight_datatype().bitwidth() - simd = self.get_nodeattr("SIMD") - single_pe_w = simd * weight_bits - return max([weightstream, max_of_io, single_pe_w]) - def get_folded_input_shape(self, ind=0): mw = self.get_nodeattr("MW") mh = self.get_nodeattr("MH") @@ -498,149 +334,222 @@ def get_number_output_values(self): nf = np.prod(self.get_folded_output_shape()[:-1]) return nf - def get_template_param_values(self): - """Returns the template parameter values according to input, output and weight - data types.""" - ret = dict() - inp_hls_str = self.get_input_datatype().get_hls_datatype_str() - out_hls_str = self.get_output_datatype().get_hls_datatype_str() - inp_is_binary = self.get_input_datatype() == DataType["BINARY"] - # out_is_binary = self.get_output_datatype() == DataType["BINARY"] - wt_is_binary = self.get_weight_datatype() == 
DataType["BINARY"] - bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 - if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode): - raise Exception("True binary (non-bipolar) inputs not yet supported") - inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] - # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"] - wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] - # reinterpret inp/wt as bipolar if bin_xnor_mode is iset - inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) - wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) - # fill in TSrcI and TWeightI - # TODO check these with Giulio - # TODO handle non-bipolar binary inputs - if inp_is_bipolar and wt_is_bipolar: - ret["TSrcI"] = "Recast" - ret["TWeightI"] = "Identity" - elif (not inp_is_bipolar) and wt_is_bipolar: - ret["TSrcI"] = "Slice<%s>" % inp_hls_str - ret["TWeightI"] = "Recast" - elif inp_is_bipolar and (not wt_is_bipolar): - ret["TSrcI"] = "Recast" - ret["TWeightI"] = "Identity" - elif (not inp_is_bipolar) and (not wt_is_bipolar): - ret["TSrcI"] = "Slice<%s>" % inp_hls_str - ret["TWeightI"] = "Identity" + def calc_wmem(self): + """Calculates and returns WMEM.""" + mw = self.get_nodeattr("MW") + mh = self.get_nodeattr("MH") + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + assert mh % pe == 0, "Requirement MH divisable by PE is violated." + assert mw % simd == 0, "Requirement MW divisable by SIMD is violated." 
+ wmem = mw * mh // (pe * simd) + return wmem - # fill in TDstI - ret["TDstI"] = "Slice<%s>" % out_hls_str + def calc_tmem(self): + """Calculates and returns TMEM.""" + if self.get_nodeattr("noActivation") == 1: + return 0 + else: + mh = self.get_nodeattr("MH") + pe = self.get_nodeattr("PE") + return mh // pe - return ret + def uram_estimation(self): + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + D_in = self.get_nodeattr("MW") + D_out = self.get_nodeattr("MH") + omega = (D_in * D_out) / (Q * P) + mem_width = Q * W * P + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if ( + (mmode == "internal_decoupled" and mstyle != "ultra") + or (mmode == "internal_embedded" and self.calc_wmem() <= 128) + or (mmode == "external") + ): + return 0 + width_multiplier = math.ceil(mem_width / 72) + depth_multiplier = math.ceil(omega / 4096) + return width_multiplier * depth_multiplier - def get_hls_compatible_weight_tensor(self, orig_weight_matrix): - """Convert the original numpy weight matrix orig_weight_matrix into - a form suitable for passing to the hlslib call: - * ensure MH % PE == 0 and MW % SIMD == 0 - * for bipolar {-1,+1} weights, convert to binary {0, 1} - * interleave rows between PEs - * reshape into (1, PE, WMEM, SIMD) and return + def bram_estimation(self): + """Calculates resource estimation for BRAM based on: + - FINN-R: An End-to-End Deep-Learning Framework for Fast + Exploration of Quantized Neural Networks + - M. Blott, T. B. Preusser, N. J. Fraser, G. Gambardella, K. O'Brien, + Y. Umuroglu, M. Leeser and K. Vissers + - 12. 
Sep 2018 """ - mw = self.get_nodeattr("MW") - mh = self.get_nodeattr("MH") + # TODO add in/out FIFO contributions + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + D_in = self.get_nodeattr("MW") + D_out = self.get_nodeattr("MH") + omega = (D_in * D_out) / (Q * P) + mem_width = Q * W * P + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if ( + (mmode == "internal_decoupled" and mstyle in ["distributed", "ultra"]) + or (mmode == "internal_embedded" and self.calc_wmem() <= 128) + or (mmode == "external") + ): + return 0 + # assuming SDP mode RAMB18s (see UG573 Table 1-10) + # assuming internal_decoupled (RTL) memory, + # which is more efficient than internal_embedded (HLS) + if mem_width == 1: + return math.ceil(omega / 16384) + elif mem_width == 2: + return math.ceil(omega / 8192) + elif mem_width <= 4: + return (math.ceil(omega / 4096)) * (math.ceil(mem_width / 4)) + elif mem_width <= 9: + return (math.ceil(omega / 2048)) * (math.ceil(mem_width / 9)) + elif mem_width <= 18 or omega > 512: + return (math.ceil(omega / 1024)) * (math.ceil(mem_width / 18)) + else: + return (math.ceil(omega / 512)) * (math.ceil(mem_width / 36)) + + def bram_efficiency_estimation(self): + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + D_in = self.get_nodeattr("MW") + D_out = self.get_nodeattr("MH") + bram16_est = self.bram_estimation() + if bram16_est == 0: + return 1 + wbits = W * D_in * D_out + bram16_est_capacity = bram16_est * 36 * 512 + return wbits / bram16_est_capacity + + def uram_efficiency_estimation(self): + """Function for URAM efficiency estimation: actual parameter storage + needed divided by the allocated URAM storage (from estimation)""" + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + D_in = self.get_nodeattr("MW") + D_out = self.get_nodeattr("MH") + uram_est = self.uram_estimation() + if uram_est == 0: + return 1 + wbits = W * D_in * D_out + 
uram_est_capacity = uram_est * 72 * 4096 + return wbits / uram_est_capacity + + def get_exp_cycles(self): pe = self.get_nodeattr("PE") simd = self.get_nodeattr("SIMD") - wmem = self.calc_wmem() - assert orig_weight_matrix.shape == ( - mw, - mh, - ), """Weights matrix doesn't - have expected shape (mw, mh)""" - assert mw % simd == 0, "Requirement MH divisable by SIMD is violated." - assert mh % pe == 0, "Requirement MH divisable by PE is violated." - # start by transposing the original weight matrix, since ONNX and - # finn-hlslib use different assumptions - # ONNX uses (in_features, out_features) and matmul(x, W) - # finn-hlslib uses (out_features, in_features) and matmul(W, x) - ret = orig_weight_matrix.T - if self.get_weight_datatype() == DataType["BIPOLAR"]: - # convert bipolar to binary - ret = (ret + 1) / 2 - # interleave rows between PEs and reshape - # distribute rows between PEs - ret = interleave_matrix_outer_dim_from_partitions(ret, pe) - # create SIMD as innermost dimension and add a dummy outer dim - ret = ret.reshape(1, pe, wmem, simd) - # reverse the SIMD dimension - ret = np.flip(ret, axis=-1) - return ret + num_inp_vec = self.get_nodeattr("numInputVectors") + mh = self.get_nodeattr("MH") + mw = self.get_nodeattr("MW") + # since mmv != 1 is not supported yet, we set mmv for now to 1 + mmv = 1 + exp_cycles = (mh / pe) * (mw / simd) * np.prod(num_inp_vec) / mmv + return int(exp_cycles) def minimize_accumulator_width(self, model): + """Minimize the accumulator bit width according to the weight values, + input data types, and size of dot product""" weights = model.get_initializer(self.onnx_node.input[1]) # since in the calculation the values of the weight matrix are used, # for the bipolar case they need to be converted to bipolar if self.get_nodeattr("binaryXnorMode"): weights = 2 * weights - 1 + + thresholds = None if len(self.onnx_node.input) > 2: thresholds = model.get_initializer(self.onnx_node.input[2]) - else: - thresholds = None + idt = 
self.get_input_datatype() - # calculate minimum and maximum values of accumulator + (acc_min, acc_max) = calculate_matvec_accumulator_range(weights, idt) + # if runtime-writeable weights, then the values of the weights can + # change and we need to use the worst-case values from the datatypes + if self.get_nodeattr("runtime_writeable_weights"): + wdt = self.get_weight_datatype() + lower_worst = wdt.min() * np.ones_like(weights) + lower_range = calculate_matvec_accumulator_range(lower_worst, idt) + upper_worst = wdt.max() * np.ones_like(weights) + upper_range = calculate_matvec_accumulator_range(upper_worst, idt) + acc_min = min(min(lower_range), min(upper_range)) + acc_max = max(max(upper_range), max(upper_range)) + + # if the thresholds can be used to determine range, then adjust the range + # according to the known values of the thresholds if thresholds is not None: - threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) + threshold_tensor = self.get_hw_compatible_threshold_tensor(thresholds) # set threshold datatype (and accumulator datatype implicitly) min_threshold = thresholds.min() max_threshold = thresholds.max() # clip threshold values - clip_upper = None - clip_lower = None - if max_threshold > acc_max + 1: - clip_upper = acc_max + 1 - if min_threshold < acc_min: - clip_lower = acc_min - if (clip_lower is not None) or (clip_upper is not None): + if max_threshold > acc_max or min_threshold < acc_min: warnings.warn("Clipping some thresholds in %s" % self.onnx_node.name) - thresholds = np.clip(thresholds, clip_lower, clip_upper) + thresholds = np.clip(thresholds, acc_min, acc_max) model.set_initializer(self.onnx_node.input[2], thresholds) - threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) + threshold_tensor = self.get_hw_compatible_threshold_tensor(thresholds) min_threshold = thresholds.min() max_threshold = thresholds.max() - # get range required by threshold values - tdt_min = min(acc_min, min_threshold) - tdt_max = 
max(acc_max, max_threshold) - if tdt_min < 0: - if abs(tdt_min) > tdt_max: - tdt = DataType.get_smallest_possible(tdt_min) - else: - tdt = DataType.get_smallest_possible(-tdt_max - 1) - else: - tdt = DataType.get_smallest_possible(tdt_max) - assert np.vectorize(tdt.allowed)( + acc_min = min(min_threshold, acc_min) + acc_max = max(max_threshold, acc_max) + + # if the acc_range is always greater than 0, then acc_max <= 2^P - 1 + if acc_min >= 0: + acc_bit_width = np.log2(acc_max + 1) + acc_bit_width = math.ceil(acc_bit_width) + adt = DataType[f"UINT{acc_bit_width}"] + # if the acc_range is signed, then acc_min >= -2^{P-1} and acc_max <= + # 2^{P - 1} - 1, which means 2^{P - 1} >= max(-acc_min, 1 + acc_max) + else: + _acc_max = max(-acc_min, 1 + acc_max) + acc_bit_width = np.log2(_acc_max) + 1 + acc_bit_width = math.ceil(acc_bit_width) + adt = DataType[f"INT{acc_bit_width}"] + + # if activation, assert that the thresholds can be expressed with adt + if thresholds is not None: + assert np.vectorize(adt.allowed)( threshold_tensor ).all(), "Thresholds in %s can't be expressed with type %s" % ( self.onnx_node.name, - str(tdt), + str(adt), ) - self.set_nodeattr("accDataType", tdt.name) - else: - if acc_min < 0: - if abs(acc_min) > acc_max: - adt = DataType.get_smallest_possible(acc_min) - else: - adt = DataType.get_smallest_possible(-acc_max - 1) - else: - adt = DataType.get_smallest_possible(acc_max) - # ensure a datatype divisible by 8-bits in case this is the last node - bw = roundup_to_integer_multiple(adt.bitwidth(), 8) - new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw)) - adt = DataType[new_adt_name] - self.set_nodeattr("accDataType", adt.name) + + # if no activation, output and accumulator datatypes are the same + if self.get_nodeattr("noActivation"): + # if this is the last node in the graph, then ensure the datatype is + # divisibly by 8 bits + if model.find_direct_successors(self.onnx_node) is None: + bw = roundup_to_integer_multiple(adt.bitwidth(), 
8) + new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw)) + adt = DataType[new_adt_name] # for no-activation nodes, output dt = acc dt self.set_nodeattr("outputDataType", adt.name) + self.set_nodeattr("accDataType", adt.name) return DataType[self.get_nodeattr("accDataType")] - def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): + def minimize_weight_bit_width(self, model): + """Minimize the bit width based on the values of the weights""" + if not self.get_nodeattr("runtime_writeable_weights"): + weights = model.get_initializer(self.onnx_node.input[1]) + w_min = weights.min() + w_max = weights.max() + if w_min < 0: + if abs(w_min) > w_max: + wdt = DataType.get_smallest_possible(w_min) + else: + wdt = DataType.get_smallest_possible(-w_max - 1) + else: + wdt = DataType.get_smallest_possible(w_max) + self.set_nodeattr("weightDataType", wdt.name) + return DataType[self.get_nodeattr("weightDataType")] + + def get_hw_compatible_threshold_tensor(self, orig_thres_matrix): """Convert the original numpy weight matrix orig_weight_matrix into a form suitable for passing to the hlslib call: * ensure MH % PE == 0 @@ -671,19 +580,10 @@ def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): # ensure all thresholds are integer assert (orig_thres_matrix.astype(np.int32) == orig_thres_matrix).all() ret = orig_thres_matrix - # workaround for vivado_hls threshold bug - if ret[0][0] == 0 and n_thres_steps == 1: - ret = np.copy(ret) - ret[0][0] = 1 - warnings.warn( - "Setting 0-valued first threshold to 1 to avoid vivado_hls bug" - ) # ensure channels = mh , duplicating if necessary if ret.shape[0] == 1: ret = np.tile(ret, (mh, 1)) - assert ( - ret.shape[0] == mh - ), "Channels of threshold matrix are not as expected (mh)" + assert ret.shape[0] == mh, "Channels of threshold matrix are not as expected (mh)" # distribute rows between PEs ret = interleave_matrix_outer_dim_from_partitions(ret, pe) assert ( @@ -700,6 +600,43 @@ def 
get_hls_compatible_threshold_tensor(self, orig_thres_matrix): rows between PEs is not as expected (n_thres_steps)""" return ret.reshape(1, pe, tmem, n_thres_steps) + def get_hw_compatible_weight_tensor(self, orig_weight_matrix): + """Convert the original numpy weight matrix orig_weight_matrix into + a form suitable for passing to the hlslib call: + * ensure MH % PE == 0 and MW % SIMD == 0 + * for bipolar {-1,+1} weights, convert to binary {0, 1} + * interleave rows between PEs + * reshape into (1, PE, WMEM, SIMD) and return + """ + mw = self.get_nodeattr("MW") + mh = self.get_nodeattr("MH") + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + wmem = self.calc_wmem() + assert orig_weight_matrix.shape == ( + mw, + mh, + ), """Weights matrix doesn't + have expected shape (mw, mh)""" + assert mw % simd == 0, "Requirement MH divisable by SIMD is violated." + assert mh % pe == 0, "Requirement MH divisable by PE is violated." + # start by transposing the original weight matrix, since ONNX and + # finn-hlslib use different assumptions + # ONNX uses (in_features, out_features) and matmul(x, W) + # finn-hlslib uses (out_features, in_features) and matmul(W, x) + ret = orig_weight_matrix.T + if self.get_weight_datatype() == DataType["BIPOLAR"]: + # convert bipolar to binary + ret = (ret + 1) / 2 + # interleave rows between PEs and reshape + # distribute rows between PEs + ret = interleave_matrix_outer_dim_from_partitions(ret, pe) + # create SIMD as innermost dimension and add a dummy outer dim + ret = ret.reshape(1, pe, wmem, simd) + # reverse the SIMD dimension + ret = np.flip(ret, axis=-1) + return ret + def make_weight_file(self, weights, weight_file_mode, weight_file_name): """Produce a file containing given weights in appropriate format for this layer. 
This file can be used for either synthesis or run-time reconfig @@ -713,17 +650,15 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): * weight_file_name : filename for the weight file to be generated """ - # convert weights into hlslib-compatible format - weight_tensor = self.get_hls_compatible_weight_tensor(weights) + # convert weights into hlslib/rtllib-compatible format + weight_tensor = self.get_hw_compatible_weight_tensor(weights) export_wdt = self.get_weight_datatype() # we have converted bipolar weights to binary for export, # so use it as such for weight generation if self.get_weight_datatype() == DataType["BIPOLAR"]: export_wdt = DataType["BINARY"] if weight_file_mode == "hls_header": - weight_hls_code = numpy_to_hls_code( - weight_tensor, export_wdt, "weights", True, True - ) + weight_hls_code = numpy_to_hls_code(weight_tensor, export_wdt, "weights", True, True) # write weights into C++ header file as dictated by finn-hlslib f_weights = open(weight_file_name, "w") if export_wdt.bitwidth() != 1: @@ -746,7 +681,7 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): f_weights.write(weight_hls_code) f_weights.close() elif "decoupled" in weight_file_mode: - # create a weight stream for various flavors of decoupled mode: + # create a weight stream for various flavors of internal_decoupled mode: # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD) weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3)) # reverse SIMD flip for saving weights in .npy @@ -757,14 +692,10 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): pe = self.get_nodeattr("PE") simd = self.get_nodeattr("SIMD") # simd_flipped - weight_tensor_simd_flipped = weight_tensor_simd_flipped.reshape( - 1, -1, pe * simd - ) + weight_tensor_simd_flipped = weight_tensor_simd_flipped.reshape(1, -1, pe * simd) weight_tensor_simd_flipped = weight_tensor_simd_flipped.copy() # flipped - 
weight_tensor_pe_flipped = weight_tensor_pe_flipped.reshape( - 1, -1, pe * simd - ) + weight_tensor_pe_flipped = weight_tensor_pe_flipped.reshape(1, -1, pe * simd) weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy() if weight_file_mode == "decoupled_npy": # save weight stream into npy for cppsim @@ -815,42 +746,22 @@ def generate_params(self, model, path): code_gen_dir = path # weights, if not external weights = model.get_initializer(self.onnx_node.input[1]) - if mem_mode == "const": + if mem_mode == "internal_embedded": # save hlslib-compatible weights in params.h weight_filename = "{}/params.h".format(code_gen_dir) self.make_weight_file(weights, "hls_header", weight_filename) - elif mem_mode == "decoupled" or mem_mode == "external": + elif mem_mode == "internal_decoupled" or mem_mode == "external": weight_filename_sim = "{}/weights.npy".format(code_gen_dir) - # save decoupled weights for cppsim + # save internal_decoupled weights for cppsim self.make_weight_file(weights, "decoupled_npy", weight_filename_sim) - if mem_mode == "decoupled": + if mem_mode == "internal_decoupled": # also save weights as Verilog .dat file - # note that we provide two different .dat files, one for synth - # and one for synthesis. 
this is because URAM-based weights always - # need zero weights for synthesis, otherwise they get inferred - # as BRAM - weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format( - code_gen_dir - ) - weight_filename_rtl_sim = "{}/memblock_sim_0.dat".format(code_gen_dir) - # sim weights are always the true weights - self.make_weight_file( - weights, "decoupled_verilog_dat", weight_filename_rtl_sim - ) - ram_style = self.get_nodeattr("ram_style") - if ram_style == "ultra": - # UltraRAM must have no memory initializer, or only zeroes - # otherwise BRAM will be inferred instead of URAM - # as a workaround we provide a zero-weight init here - synth_weights = np.zeros_like(weights, dtype=np.float32) - else: - synth_weights = weights - self.make_weight_file( - synth_weights, "decoupled_verilog_dat", weight_filename_rtl_synth - ) + # This file will be ignored when synthesizing UltraScale memory. + weight_filename_rtl = "{}/memblock.dat".format(code_gen_dir) + self.make_weight_file(weights, "decoupled_verilog_dat", weight_filename_rtl) else: raise Exception( - """Please set mem_mode to "const", "decoupled", or "external", + """Please set mem_mode to "internal_embedded", "internal_decoupled", or "external", currently no other parameter value is supported!""" ) @@ -858,7 +769,7 @@ def generate_params(self, model, path): if len(self.onnx_node.input) > 2: thresholds = model.get_initializer(self.onnx_node.input[2]) if thresholds is not None: - threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) + threshold_tensor = self.get_hw_compatible_threshold_tensor(thresholds) # use UINT32 threshold export for bipolar times bipolar inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] @@ -903,410 +814,63 @@ def generate_params(self, model, path): f_thresh.write(thresholds_hls_code) f_thresh.close() - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") - mem_mode 
= self.get_nodeattr("mem_mode") - node = self.onnx_node - - # TODO ensure codegen dir exists - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - # create a npy file fore each input of the node (in_ind is input index) - in_ind = 0 - for inputs in node.input: - # it is assumed that the first input of the node is the data input - # the second input are the weights - # the third input are the thresholds - if in_ind == 0: - assert ( - str(context[inputs].dtype) == "float32" - ), """Input datatype is - not float32 as expected.""" - expected_inp_shape = self.get_folded_input_shape() - reshaped_input = context[inputs].reshape(expected_inp_shape) - if self.get_input_datatype() == DataType["BIPOLAR"]: - # store bipolar activations as binary - reshaped_input = (reshaped_input + 1) / 2 - export_idt = DataType["BINARY"] - else: - export_idt = self.get_input_datatype() - # make copy before saving the array - reshaped_input = reshaped_input.copy() - np.save( - os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), - reshaped_input, - ) - elif in_ind > 2: - raise Exception("Unexpected input found for MatrixVectorActivation") - in_ind += 1 - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - # reinterpret binary output as bipolar where needed - if self.get_output_datatype() == DataType["BIPOLAR"]: - out = context[node.output[0]] - out = 2 * out - 1 - context[node.output[0]] = out - assert ( - context[node.output[0]].shape == self.get_normal_output_shape() - ), "cppsim did not produce expected output shape" - elif mode == "rtlsim": - sim = self.get_rtlsim() - 
nbits = self.get_instream_width() - inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - if mem_mode == "external" or mem_mode == "decoupled": - wnbits = self.get_weightstream_width() - export_wdt = self.get_weight_datatype() - # we have converted bipolar weights to binary for export, - # so use it as such for weight generation - if self.get_weight_datatype() == DataType["BIPOLAR"]: - export_wdt = DataType["BINARY"] - wei = npy_to_rtlsim_input( - "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits - ) - num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) - io_dict = { - "inputs": {"in0": inp, "weights": wei * num_w_reps}, - "outputs": {"out": []}, - } - self.rtlsim_multi_io(sim, io_dict) - output = io_dict["outputs"]["out"] - else: - output = self.rtlsim(sim, inp) - odt = self.get_output_datatype() - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) - - # load and reshape output - output = np.load(out_npy_path) - oshape = self.get_normal_output_shape() - output = np.asarray([output], dtype=np.float32).reshape(*oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"'] - self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"'] - - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode not in ["const", "decoupled", "external"]: - raise Exception( - """Please set mem_mode to "const", "decoupled", or "external", - currently no other parameter value is supported!""" - ) - self.code_gen_dict["$GLOBALS$"] += ['#include "mvau.hpp"'] - if self.calc_tmem() != 0: - # TODO find a better way of checking for no pregenerated thresholds - self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] - - def defines(self, var): - # Only ipgen mode: Make sure that SIMD parameter satisfies minimum requirements. - if var == "ipgen": - SIMD = self.get_nodeattr("SIMD") - MW = self.get_nodeattr("MW") - condition = SIMD >= (MW / 1024) - msg = ( - f"HLS synthesis of MatrixVectorActivation requires: " - f"SIMD >= MW / 1024. This is not fulfilled with: SIMD={SIMD} " - f"and MW={MW} for node: {self.onnx_node.name}." 
- ) - assert condition, msg - mem_mode = self.get_nodeattr("mem_mode") - numInputVectors = list(self.get_nodeattr("numInputVectors")) - numReps = np.prod(numInputVectors) - self.code_gen_dict["$DEFINES$"] = [ - """#define MW1 {}\n #define MH1 {}\n - #define SIMD1 {}\n #define PE1 {}\n #define WMEM1 {}\n - #define TMEM1 {}\n #define numReps {}""".format( - self.get_nodeattr("MW"), - self.get_nodeattr("MH"), - self.get_nodeattr("SIMD"), - self.get_nodeattr("PE"), - self.calc_wmem(), - self.calc_tmem(), - numReps, - ) - ] - if mem_mode == "decoupled" or mem_mode == "external": - wdt = self.get_weight_datatype() - self.code_gen_dict["$DEFINES$"].append( - "#define WP1 {}\n".format(wdt.bitwidth()) - ) - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - # note: the innermost dim is reversed for the input - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0, false);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) - ) - - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "decoupled" or mem_mode == "external": - wdt = self.get_weight_datatype() - elem_bits = wdt.bitwidth() - packed_bits = self.get_weightstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = wdt.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/weights.npy" % code_gen_dir - - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", weights, false, numReps);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) - ) - - def 
strm_decl(self): - mem_mode = self.get_nodeattr("mem_mode") - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0 ("in0");'.format(self.get_instream_width()) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) - ) - - if mem_mode == "decoupled" or mem_mode == "external": - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> weights ("weights");'.format( - self.get_weightstream_width() - ) - ) + def get_op_and_param_counts(self): + in_features = self.get_nodeattr("MW") + out_features = self.get_nodeattr("MH") + weight_bits = self.get_weight_datatype().bitwidth() + inp_bits = self.get_input_datatype().bitwidth() + num_inp_vec = self.get_nodeattr("numInputVectors") + num_repetitions = int(np.prod(num_inp_vec)) + mac_count = in_features * out_features * num_repetitions + # cannonicalize op type: highest bitwidth operand first s.t. + # e.g. 
mac_8bx4b and mac_4bx8b don't appear as two different op types + bw1 = min(inp_bits, weight_bits) + bw2 = max(inp_bits, weight_bits) + mac_op_type = "op_mac_%dbx%db" % (bw1, bw2) + weight_param_type = "param_weight_%db" % (weight_bits) + weight_count = in_features * out_features + ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count} + if self.get_nodeattr("noActivation") == 0: + tdt = DataType[self.get_nodeattr("accDataType")] + thres_bits = tdt.bitwidth() + thres_param_type = "param_threshold_%db" % (thres_bits) + thres_count = out_features + ret_dict[thres_param_type] = thres_count + return ret_dict - def docompute(self): - mem_mode = self.get_nodeattr("mem_mode") - map_to_hls_mult_style = { - "auto": "ap_resource_dflt()", - "lut": "ap_resource_lut()", - "dsp": "ap_resource_dsp()", + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out": []}, } - tmpl_args = self.get_template_param_values() - if self.calc_tmem() == 0: - odtype_hls_str = self.get_output_datatype().get_hls_datatype_str() - threshs = "PassThroughActivation<%s>()" % odtype_hls_str - else: - threshs = "threshs" - if mem_mode == "const": - self.code_gen_dict["$DOCOMPUTE$"] = [ - """Matrix_Vector_Activate_Batch - (in0, out, weights, {}, numReps, {});""".format( - tmpl_args["TSrcI"], - tmpl_args["TDstI"], - tmpl_args["TWeightI"], - threshs, - map_to_hls_mult_style[self.get_nodeattr("resType")], - ) - ] - elif mem_mode == "decoupled" or mem_mode == "external": - wdt = self.get_weight_datatype() - if wdt == DataType["BIPOLAR"]: - export_wdt = DataType["BINARY"] - else: - export_wdt = wdt - wdtype_hls_str = export_wdt.get_hls_datatype_str() - self.code_gen_dict["$DOCOMPUTE$"] = [ - """Matrix_Vector_Activate_Stream_Batch - (in0, out, weights, {}, numReps, {});""".format( - tmpl_args["TSrcI"], - tmpl_args["TDstI"], - tmpl_args["TWeightI"], - wdtype_hls_str, - 
threshs, - map_to_hls_mult_style[self.get_nodeattr("resType")], - ) - ] - - else: - raise Exception( - """Please set mem_mode to "const", "decoupled", or "external", - currently no other parameter value is supported!""" - ) - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - shape = self.get_folded_output_shape() - shape_cpp_str = str(shape).replace("(", "{").replace(")", "}") - - # note: the innermost dim is not reversed for the output - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s", false);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - shape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "const": - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0, - hls::stream> &out - )""".format( - self.onnx_node.name, - self.get_instream_width(), - self.get_outstream_width(), - ) - ] - elif mem_mode == "decoupled" or mem_mode == "external": - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}( - hls::stream> &in0, - hls::stream> &weights, - hls::stream> &out - )""".format( - self.onnx_node.name, - self.get_instream_width(), - self.get_weightstream_width(), - self.get_outstream_width(), - ) - ] - - else: - raise Exception( - """Please set mem_mode to "const" or "decoupled", currently no other - parameter value is supported!""" - ) + if mem_mode in ["internal_decoupled", "external"]: + n_weight_inps = self.calc_wmem() + num_w_reps = 
np.prod(self.get_nodeattr("numInputVectors")) + io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)] + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) - def pragmas(self): + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() mem_mode = self.get_nodeattr("mem_mode") - ram_style_thresholds = self.get_nodeattr("ram_style_thresholds") - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" - ) - - if mem_mode == "const": - self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') - # the weight tensor is ap_uint [PE][WMEM] - # partition for parallel access along the PE dimension (dim 1) - self.code_gen_dict["$PRAGMAS$"].append( - ( - "#pragma HLS ARRAY_PARTITION variable=weights.m_weights " - "complete dim=1" - ) - ) - elif mem_mode == "decoupled" or mem_mode == "external": - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=weights name=weights_" - + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS stream depth=8 variable=weights" - ) - - else: - raise Exception( - """Please set mem_mode to "const", "decoupled", or external, - currently no other parameter value is supported!""" - ) - - # the threshold tensor is acc_type [PE][TMEM][N_THRES] - # partition for parallel access along PE and N_THRES - # dimensions (dims 1 and 3) - if self.calc_tmem() != 0: - # TODO find a better way of checking for no pregenerated thresholds - self.code_gen_dict["$PRAGMAS$"].append( - ( - "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " - "complete dim=1" - ) - ) - self.code_gen_dict["$PRAGMAS$"].append( - ( - "#pragma HLS ARRAY_PARTITION 
variable=threshs.m_thresholds " - "complete dim=3" - ) - ) - # add resource pragma for thresholds if set - if ram_style_thresholds == "distributed": - self.code_gen_dict["$PRAGMAS$"].append( - ( - "#pragma HLS RESOURCE variable=threshs.m_thresholds " - "core=ROM_2P_LUTRAM" - ) - ) - elif ram_style_thresholds == "block": - self.code_gen_dict["$PRAGMAS$"].append( - ( - "#pragma HLS RESOURCE variable=threshs.m_thresholds " - "core=ROM_2P_BRAM" - ) - ) - elif ram_style_thresholds == "auto": - # no pragma needed - pass - else: - raise Exception( - "Unrecognized ram_style_thresholds value:" + ram_style_thresholds - ) + sname = self.hls_sname() + if mem_mode == "external": + intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded())) + if mem_mode == "internal_decoupled": + # only expose axilite interface if attribute is set + runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 + if runtime_writable: + intf_names["axilite"] = ["s_axilite"] + return intf_names def code_generation_ipi(self): cmd = [] # add streamer if needed mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "decoupled": + if mem_mode == "internal_decoupled": runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 if self.get_nodeattr("ram_style") == "ultra": assert ( @@ -1324,43 +888,33 @@ def code_generation_ipi(self): cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) cmd.append( "create_bd_intf_pin -mode Master " - "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" - % (node_name, dout_name) + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, dout_name) ) cmd.append( "create_bd_intf_pin -mode Slave " "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) ) - # instantiate the hls ip - cmd.append( - "create_bd_cell -type ip -vlnv %s /%s/%s" - % (self.get_nodeattr("ip_vlnv"), node_name, node_name) - ) + # Instantiate either the HLS or RTL IP depending on operator + 
self.instantiate_ip(cmd) + # instantiate a streamer and connect it to the HLS IP - strm_vlnv = "xilinx.com:user:memstream:1.0" + strm_vlnv = "amd.com:finn:memstream:1.0" strm_inst = node_name + "_wstrm" cmd.append( - "create_bd_cell -type ip -vlnv %s /%s/%s" - % (strm_vlnv, node_name, strm_inst) + "create_bd_cell -type ip -vlnv %s /%s/%s" % (strm_vlnv, node_name, strm_inst) ) cmd.append( "set_property -dict [list " - "CONFIG.NSTREAMS {1} " - "CONFIG.MEM_DEPTH {%d} " - "CONFIG.MEM_WIDTH {%d} " - "CONFIG.MEM_INIT {%s} " + "CONFIG.DEPTH {%d} " + "CONFIG.WIDTH {%d} " + "CONFIG.INIT_FILE {%s} " "CONFIG.RAM_STYLE {%s} " - "CONFIG.STRM0_DEPTH {%d} " - "CONFIG.STRM0_WIDTH {%d} " - "CONFIG.STRM0_OFFSET {0} " "] [get_bd_cells /%s/%s]" % ( self.calc_wmem(), self.get_weightstream_width_padded(), - self.get_nodeattr("code_gen_dir_ipgen") + "/", + self.get_nodeattr("code_gen_dir_ipgen") + "/memblock.dat", self.get_nodeattr("ram_style"), - self.calc_wmem(), - self.get_weightstream_width_padded(), node_name, strm_inst, ) @@ -1371,11 +925,11 @@ def code_generation_ipi(self): % (node_name, strm_inst, node_name, node_name, sname) ) cmd.append( - "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aresetn]" + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_rst_n]" % (node_name, rst_name, node_name, strm_inst) ) cmd.append( - "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aclk]" + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_clk]" % (node_name, clk_name, node_name, strm_inst) ) cmd.append( @@ -1401,8 +955,7 @@ def code_generation_ipi(self): axilite_name = self.get_verilog_top_module_intf_names()["axilite"][0] cmd.append( "create_bd_intf_pin -mode Slave " - "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s" - % (node_name, axilite_name) + "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s" % (node_name, axilite_name) ) cmd.append( "connect_bd_intf_net [get_bd_intf_pins %s/%s] " @@ -1412,65 +965,9 @@ def code_generation_ipi(self): # TODO calculate and pass 
in segment size here cmd.append("assign_bd_address") cmd.append("save_bd_design") - elif mem_mode == "const" or mem_mode == "external": - # base class impl sufficient for const/external modes - return super().code_generation_ipi() + elif mem_mode == "internal_embedded" or mem_mode == "external": + # base class impl sufficient for internal_embedded/external modes + self.instantiate_ip(cmd) else: raise Exception("Unrecognized mem_mode for MatrixVectorActivation") return cmd - - def get_verilog_top_module_intf_names(self): - intf_names = super().get_verilog_top_module_intf_names() - mem_mode = self.get_nodeattr("mem_mode") - sname = self.hls_sname() - if mem_mode == "external": - intf_names["s_axis"].append( - ("weights_" + sname, self.get_weightstream_width_padded()) - ) - if mem_mode == "decoupled": - # only expose axilite interface if attribute is set - runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 - if runtime_writable: - intf_names["axilite"] = ["s_axilite"] - return intf_names - - def get_op_and_param_counts(self): - in_features = self.get_nodeattr("MW") - out_features = self.get_nodeattr("MH") - weight_bits = self.get_weight_datatype().bitwidth() - inp_bits = self.get_input_datatype().bitwidth() - num_inp_vec = self.get_nodeattr("numInputVectors") - num_repetitions = int(np.prod(num_inp_vec)) - mac_count = in_features * out_features * num_repetitions - # cannonicalize op type: highest bitwidth operand first s.t. - # e.g. 
mac_8bx4b and mac_4bx8b don't appear as two different op types - bw1 = min(inp_bits, weight_bits) - bw2 = max(inp_bits, weight_bits) - mac_op_type = "op_mac_%dbx%db" % (bw1, bw2) - weight_param_type = "param_weight_%db" % (weight_bits) - weight_count = in_features * out_features - ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count} - if self.get_nodeattr("noActivation") == 0: - tdt = DataType[self.get_nodeattr("accDataType")] - thres_bits = tdt.bitwidth() - thres_param_type = "param_threshold_%db" % (thres_bits) - thres_count = out_features - ret_dict[thres_param_type] = thres_count - return ret_dict - - def derive_characteristic_fxns(self, period): - n_inps = np.prod(self.get_folded_input_shape()[:-1]) - io_dict = { - "inputs": { - "in0": [0 for i in range(n_inps)], - }, - "outputs": {"out": []}, - } - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode in ["decoupled", "external"]: - n_weight_inps = self.calc_wmem() - num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) - io_dict["inputs"]["weights"] = [ - 0 for i in range(num_w_reps * n_weight_inps) - ] - super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) diff --git a/src/finn/custom_op/fpgadataflow/pool.py b/src/finn/custom_op/fpgadataflow/pool.py new file mode 100644 index 0000000000..35aee023b9 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/pool.py @@ -0,0 +1,224 @@ +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + + +class Pool(HWCustomOp): + """Abstraction layer for HW implementation of Pool. + Requires ConvolutionInputGenerator(depthwise == 1) to format its input + + Input shape (BatchSize,OutImgDim,OutImgDim,TotalKernelSize*Channels) + Output shape (BatchSize,OutImgDim,OutImgDim,Channels) + + Notes: + + * The input shape was chosen to be compatible with im2col (only true when there + is not folding). + * The actual data layout produced by the hlslib kernels is different + for depthwise ops. 
+ + * depthwise SWG: (1, OFMDim, OFMDim, IFMChannels/PE, K, K, PE) + + Channels can be folded using PE (SIMD from the input perspective) + """ + + def get_nodeattr_types(self): + my_attrs = { + "Channels": ("i", True, 0), + "PE": ("i", True, 1), + "KernelSize": ("ints", True, []), + # Function: + # - MaxPool + # - QuantAvgPool + # TODO add support for AvgPool and AccPool + "Function": ("s", True, "", {"MaxPool", "QuantAvgPool"}), + "OutImgDims": ("ints", True, []), + # FINN DataTypes for inputs/outputs + "InputDataType": ("s", True, ""), + "OutputDataType": ("s", True, ""), + "AccumBits": ("i", False, 0), + "Size": ("i", False, 1), + "BatchSize": ("i", False, 1), + } + + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("InputDataType")] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + fxn = self.get_nodeattr("Function") + odt = DataType[self.get_nodeattr("OutputDataType")] + + if fxn == "MaxPool": + # Same as input + idt = DataType[self.get_nodeattr("InputDataType")] + assert odt == idt, "In datatype must be equal to out datatype for Maxpool" + elif fxn == "QuantAvgPool": + idt = DataType[self.get_nodeattr("InputDataType")] + assert ( + idt.signed() == odt.signed() + ), """QuantAvgPool: Can't mix signed + and unsigned datatypes""" + else: + raise Exception("Pool_Batch doesn't currently support " + fxn) + + return odt + + def get_normal_input_shape(self, ind=0): + ifm_ch = self.get_nodeattr("Channels") + odims = self.get_nodeattr("OutImgDims") + batch_size = self.get_nodeattr("BatchSize") + k = self.get_nodeattr("KernelSize") + k_prod = int(np.prod(k)) + ishape = (batch_size, *odims, k_prod * ifm_ch) + return ishape + + def get_folded_input_shape(self, ind=0): + normal_ishape = list(self.get_normal_input_shape()) + ifm_ch = self.get_nodeattr("Channels") + pe = self.get_nodeattr("PE") + assert ifm_ch % 
pe == 0, "PE must divide input channels" + fold = int(normal_ishape[-1] / pe) + folded_ishape = normal_ishape[:-1] + [fold, pe] + return tuple(folded_ishape) + + def get_normal_output_shape(self, ind=0): + ofm_ch = self.get_nodeattr("Channels") + odims = self.get_nodeattr("OutImgDims") + batch_size = self.get_nodeattr("BatchSize") + oshape = (batch_size, *odims, ofm_ch) + return oshape + + def get_folded_output_shape(self, ind=0): + normal_oshape = list(self.get_normal_output_shape()) + ifm_ch = self.get_nodeattr("Channels") + pe = self.get_nodeattr("PE") + assert ifm_ch % pe == 0, "PE must divide input channels" + fold = int(ifm_ch / pe) + folded_oshape = normal_oshape[:-1] + [fold, pe] + return tuple(folded_oshape) + + def get_number_output_values(self): + folded_oshape = self.get_folded_output_shape() + return np.prod(folded_oshape[1:-1]) + + def get_exp_cycles(self): + # (Channels * kernel * kernel) / PE * odim * odim * batch_size + ifm_ch = self.get_nodeattr("Channels") + pe = self.get_nodeattr("PE") + k = self.get_nodeattr("KernelSize") + k_prod = int(np.prod(k)) + odims = self.get_nodeattr("OutImgDims") + batch_size = self.get_nodeattr("BatchSize") + exp_cycles = ((ifm_ch * k_prod) / pe) * np.prod(odims) * batch_size + return int(exp_cycles) + + def get_instream_width(self, ind=0): + dt_bits = self.get_input_datatype().bitwidth() + pe = self.get_nodeattr("PE") + in_width = int(dt_bits * pe) + return in_width + + def get_outstream_width(self, ind=0): + dt_bits = self.get_output_datatype().bitwidth() + pe = self.get_nodeattr("PE") + out_width = int(dt_bits * pe) + return out_width + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input shape for Pool_Batch." 
+ return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + # data type stays the same + dtype = self.get_output_datatype() + model.set_tensor_datatype(node.output[0], dtype) + + def verify_node(self): + info_messages = [] + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify the number of inputs + if len(self.onnx_node.input) == 1: + info_messages.append("The number of inputs is correct") + else: + info_messages.append("""Pool_Batch needs 1 data input""") + + # check supported function + fnx = self.get_nodeattr("Function") + if fnx in ["MaxPool", "QuantAvgPool"]: + info_messages.append("Attribute Function contains a supported pool function") + else: + info_messages.append("Attribute Function contains an unsupported pool function") + return info_messages + + def execute_node(self, context, graph): + # simulate behavior with Python functionality + node = self.onnx_node + fnx = self.get_nodeattr("Function") + k = self.get_nodeattr("KernelSize") + ch = self.get_nodeattr("Channels") + k2 = k[0] * k[1] + + inp_values = context[node.input[0]] + ishape = inp_values.shape + # reshape array to apply max or avg function only on kernel + tmp_shape = tuple(list(ishape)[:-1] + [k2, ch]) + tmp_values = inp_values.reshape(tmp_shape) + if fnx == "MaxPool": + result = np.max(tmp_values, axis=3) + elif fnx == "QuantAvgPool": + # determine bits to shift + ibits = self.get_input_datatype().bitwidth() + obits = self.get_output_datatype().bitwidth() + max_value = 2**ibits - 1 + max_value = max_value * k2 + max_bit_width = int(max_value).bit_length() + shift_bits = max_bit_width - obits + shift_bits = shift_bits if shift_bits >= 0 else 0 + result = np.sum(tmp_values, axis=3) + result = 
np.right_shift(result.astype(int), shift_bits) + oshape = context[node.output[0]].shape + context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape) diff --git a/finn-rtllib/memstream/sim/test.sh b/src/finn/custom_op/fpgadataflow/rtl/__init__.py old mode 100755 new mode 100644 similarity index 55% rename from finn-rtllib/memstream/sim/test.sh rename to src/finn/custom_op/fpgadataflow/rtl/__init__.py index 7cb0497d26..06067a4fca --- a/finn-rtllib/memstream/sim/test.sh +++ b/src/finn/custom_op/fpgadataflow/rtl/__init__.py @@ -1,6 +1,4 @@ -#!/bin/bash - -# Copyright (c) 2020, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,5 +26,26 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -iverilog ../hdl/*.v tb_memstream_writes.v -o sim -./sim +from finn.custom_op.fpgadataflow.rtl.convolutioninputgenerator_rtl import ( + ConvolutionInputGenerator_rtl, +) +from finn.custom_op.fpgadataflow.rtl.fmpadding_rtl import FMPadding_rtl +from finn.custom_op.fpgadataflow.rtl.matrixvectoractivation_rtl import MVAU_rtl +from finn.custom_op.fpgadataflow.rtl.streamingdatawidthconverter_rtl import ( + StreamingDataWidthConverter_rtl, +) +from finn.custom_op.fpgadataflow.rtl.streamingfifo_rtl import StreamingFIFO_rtl +from finn.custom_op.fpgadataflow.rtl.thresholding_rtl import Thresholding_rtl +from finn.custom_op.fpgadataflow.rtl.vectorvectoractivation_rtl import VVAU_rtl + +custom_op = dict() + +# make sure new HLSCustomOp subclasses are imported here so that they get +# registered and plug in correctly into the infrastructure +custom_op["ConvolutionInputGenerator_rtl"] = ConvolutionInputGenerator_rtl +custom_op["FMPadding_rtl"] = FMPadding_rtl +custom_op["StreamingDataWidthConverter_rtl"] = StreamingDataWidthConverter_rtl +custom_op["StreamingFIFO_rtl"] = 
StreamingFIFO_rtl +custom_op["MVAU_rtl"] = MVAU_rtl +custom_op["VVAU_rtl"] = VVAU_rtl +custom_op["Thresholding_rtl"] = Thresholding_rtl diff --git a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py similarity index 53% rename from src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py rename to src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py index 1afd23d3a1..321522e7ba 100755 --- a/src/finn/custom_op/fpgadataflow/convolutioninputgenerator_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/convolutioninputgenerator_rtl.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022, Advanced Micro Devices, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -29,11 +29,17 @@ import math import numpy as np import os +import shutil from qonnx.core.datatype import DataType from qonnx.custom_op.general import im2col from qonnx.custom_op.general.im2col import compute_conv_output_dim +from qonnx.custom_op.registry import getCustomOp +from qonnx.util.basic import roundup_to_integer_multiple -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.convolutioninputgenerator import ( + ConvolutionInputGenerator, +) +from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend from finn.util.basic import get_rtlsim_trace_depth, make_build_dir from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy @@ -55,169 +61,35 @@ # NOTE: "Parallel" implementation style not yet implemented in this version! -class ConvolutionInputGenerator_rtl(HLSCustomOp): - """Class that does not correspond to one of the finn-hlslib ConvolutionInputGenerator - (sliding window) function variants. 
Generates an RTL ConvolutionInputGenerator - implementation based on (System-)Verilog templates, defined in finn-rtllib/swg.""" +class ConvolutionInputGenerator_rtl(ConvolutionInputGenerator, RTLBackend): + """Class that corresponds to finn-rtllib swg module. + Generates an RTL ConvolutionInputGenerator implementation + based on (System-)Verilog templates, defined in finn-rtllib/swg.""" - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): my_attrs = { - "ConvKernelDim": ("ints", True, []), # [H, W] = [Y, X] - "IFMChannels": ("i", True, 0), - "IFMDim": ("ints", True, []), # [H, W] = [Y, X] - "OFMDim": ("ints", True, []), # [H, W] = [Y, X] - "SIMD": ("i", True, 0), # additional parallelization parameter - not yet implemented "M": ("i", False, 1), - # alternative implementation style - not yet implemented - "parallel_window": ("i", False, 0, {0}), - "Stride": ("ints", True, []), # [H, W] = [Y, X] - "Dilation": ("ints", True, []), # [H, W] = [Y, X] - # FINN DataTypes for inputs, weights, outputs - "inputDataType": ("s", True, ""), - "outputDataType": ("s", True, ""), - "depthwise": ("i", False, 0, {0, 1}), - # Enable reprogrammable implementation to change FM dimensions, - # stride, or dilation during runtime - "dynamic_mode": ("i", False, 0, {0, 1}), - # FPGA resource type for ConvolutionInputGenerator input buffer - # auto -- let Vivado decide - # block -- use BRAM - # distributed -- use LUTRAM - # ultra -- use URAM - "ram_style": ( - "s", - False, - "auto", - {"auto", "block", "distributed", "ultra"}, - ), - # attribute to save top module name - not user configurable - "gen_top_module": ("s", False, ""), } - my_attrs.update(super().get_nodeattr_types()) + my_attrs.update(ConvolutionInputGenerator.get_nodeattr_types(self)) + my_attrs.update(RTLBackend.get_nodeattr_types(self)) return my_attrs - def get_normal_input_shape(self, ind=0): - 
ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") - ifm_ch = self.get_nodeattr("IFMChannels") - ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch) - return ishape - - def get_folded_input_shape(self, ind=0): - ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") - ifm_ch = self.get_nodeattr("IFMChannels") - simd = self.get_nodeattr("SIMD") - assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" - wf = int(ifm_ch / simd) - folded_ishape = (1, ifm_dim_h, ifm_dim_w, wf, simd) - return folded_ishape - - def get_normal_output_shape(self, ind=0): - k_h, k_w = self.get_nodeattr("ConvKernelDim") - ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") - ifm_ch = self.get_nodeattr("IFMChannels") - stride_h, stride_w = self.get_nodeattr("Stride") - dilation_h, dilation_w = self.get_nodeattr("Dilation") - pad = 0 - ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h) - ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w) - oshape = (1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch) - return oshape - - def get_folded_output_shape(self, ind=0): - k_h, k_w = self.get_nodeattr("ConvKernelDim") - ifm_dim_h, ifm_dim_w = self.get_nodeattr("IFMDim") - ifm_ch = self.get_nodeattr("IFMChannels") - stride_h, stride_w = self.get_nodeattr("Stride") - dilation_h, dilation_w = self.get_nodeattr("Dilation") - simd = self.get_nodeattr("SIMD") - pad = 0 - ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, pad, dilation_h) - ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, pad, dilation_w) - assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" - if self.get_nodeattr("parallel_window"): - wf = int((ifm_ch) // simd) - folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, k_h * k_w * simd) - else: - wf = int((k_h * k_w * ifm_ch) // simd) - folded_oshape = (1, ofm_dim_h, ofm_dim_w, wf, simd) - return folded_oshape - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - oshape = 
self.get_normal_output_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == exp_ishape, "Unexpect input shape for ConvInpGen." - return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - # data type stays the same - dtype = model.get_tensor_datatype(node.input[0]) - model.set_tensor_datatype(node.output[0], dtype) - - def verify_node(self): - pass - - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("inputDataType")] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - return DataType[self.get_nodeattr("outputDataType")] - - def get_instream_width(self, ind=0): - ibits = self.get_input_datatype().bitwidth() - simd = self.get_nodeattr("SIMD") - ifm_ch = self.get_nodeattr("IFMChannels") - assert ifm_ch % simd == 0, "SIMD must divide IFMChannels" - in_width = simd * ibits - return in_width - - def get_outstream_width(self, ind=0): - if self.get_nodeattr("parallel_window"): - # feed all window pixels in parallel - k_h, k_w = self.get_nodeattr("ConvKernelDim") - return self.get_instream_width() * k_h * k_w - else: - # if parallel variant not in use: same width for output and input stream - return self.get_instream_width() - def get_number_input_values(self): + """Function to get the number of expected input values.""" folded_ishape = self.get_folded_input_shape() num_input_elems = np.prod(folded_ishape[:-1]) return num_input_elems - def get_number_output_values(self): - folded_oshape = self.get_folded_output_shape() - num_output_elems = np.prod(folded_oshape[:-1]) - return num_output_elems - - def get_1d_conv_attrs_normalized(self): - # normalize FM dimensions so that: - # [H, W] = [Y, X] = [1, D] or [D, 1] are always mapped to [1, D]. - # The dummy ('1') dimension is the Y-dimension. 
- ifm_ch = self.get_nodeattr("IFMChannels") - k = self.get_nodeattr("ConvKernelDim") - ifm_dim = self.get_nodeattr("IFMDim") - ofm_dim = self.get_nodeattr("OFMDim") - stride = self.get_nodeattr("Stride") - dilation = self.get_nodeattr("Dilation") - - if ifm_dim[1] == 1: - ifm_dim = ifm_dim[::-1] - ofm_dim = ofm_dim[::-1] - k = k[::-1] - stride = stride[::-1] - dilation = dilation[::-1] - - return (ifm_ch, ifm_dim, ofm_dim, k, stride, dilation) + def use_parallel_window_output(self): + return self.get_nodeattr("parallel_window") def get_buffer_depth(self): + """Returns total depth of the internal buffer, depending on + implementation style.""" ifm_ch = self.get_nodeattr("IFMChannels") k = self.get_nodeattr("ConvKernelDim") ifm_dim = self.get_nodeattr("IFMDim") @@ -232,96 +104,110 @@ def get_buffer_depth(self): mmv_in = 1 mmv_out = 1 channel_factor = int(ifm_ch / simd) - impl_style = self.select_impl_style() if impl_style == "default": - # compute minimal buffer length (assuming it holds 1 complete window) buffer_min_size = ( (k_h - 1) * dilation_h * w + (k_w - 1) * dilation_w + 1 ) * channel_factor - # add additional buffer space in case of stride > 1 # this minimizes cycle count as it allows an earlier pre-load of inputs buffer_depth = ( buffer_min_size + max( 0, - ((stride_w - 1) - (int(mmv_out * k_h * k_w / mmv_in))) - * channel_factor, + ((stride_w - 1) - (int(mmv_out * k_h * k_w / mmv_in))) * channel_factor, ) + max( 0, - ((stride_h - 1) * w - (int(mmv_out * k_h * k_w / mmv_in))) - * channel_factor, + ((stride_h - 1) * w - (int(mmv_out * k_h * k_w / mmv_in))) * channel_factor, ) ) - else: - buffer_depth = 0 - raise Exception("Requested impl. 
style not implemented") + elif impl_style == "parallel": + buffer_min_size = ( + (k_h - 1) * dilation_h * w + (k_w - 1) * dilation_w + ) * channel_factor + 1 + buffer_depth = buffer_min_size + 1 return buffer_depth def get_exp_cycles(self): - simd = self.get_nodeattr("SIMD") - ifm_ch = self.get_nodeattr("IFMChannels") - k = self.get_nodeattr("ConvKernelDim") - ifm_dim = self.get_nodeattr("IFMDim") - ofm_dim = self.get_nodeattr("OFMDim") - stride = self.get_nodeattr("Stride") - dilation = self.get_nodeattr("Dilation") - depthwise = self.get_nodeattr("depthwise") - ifm_dim_h, ifm_dim_w = ifm_dim - ofm_dim_h, ofm_dim_w = ofm_dim - k_h, k_w = k - stride_h, stride_w = stride - dilation_h, dilation_w = dilation - - channel_factor = int(ifm_ch / simd) + impl_style = self.select_impl_style() - if ifm_dim_h == 1 or ifm_dim_w == 1: - # 1D case - ( - ifm_ch, - [ifm_dim_h, ifm_dim_w], - [ofm_dim_h, ofm_dim_w], - [k_h, k_w], - [stride_h, stride_w], - [dilation_h, dilation_w], - ) = self.get_1d_conv_attrs_normalized() - - if depthwise: - exp_cycles = ( - +ofm_dim_w * k_w * channel_factor - + channel_factor * (k_w - 1) * (stride_w - 1) - - (k_w - 1) - + 2 - ) + if impl_style == "parallel": + exp_cycles = self.get_number_input_values() + 2 + elif impl_style == "default": + simd = self.get_nodeattr("SIMD") + ifm_ch = self.get_nodeattr("IFMChannels") + k = self.get_nodeattr("ConvKernelDim") + ifm_dim = self.get_nodeattr("IFMDim") + ofm_dim = self.get_nodeattr("OFMDim") + stride = self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + depthwise = self.get_nodeattr("depthwise") + ifm_dim_h, ifm_dim_w = ifm_dim + ofm_dim_h, ofm_dim_w = ofm_dim + k_h, k_w = k + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + channel_factor = int(ifm_ch / simd) + if ifm_dim_h == 1 or ifm_dim_w == 1: + # 1D case + ( + ifm_ch, + [ifm_dim_h, ifm_dim_w], + [ofm_dim_h, ofm_dim_w], + [k_h, k_w], + [stride_h, stride_w], + [dilation_h, dilation_w], + ) = 
self.get_1d_conv_attrs_normalized() + + if depthwise: + exp_cycles = ( + +ofm_dim_w * k_w * channel_factor + + channel_factor * (k_w - 1) * (stride_w - 1) + - (k_w - 1) + + 2 + ) + else: + exp_cycles = ofm_dim_w * k_w * channel_factor + 2 else: - exp_cycles = ofm_dim_w * k_w * channel_factor + 2 - else: - # 2D case - buffer_min_size = ( - (k_h - 1) * dilation_h * ifm_dim_w + (k_w - 1) * dilation_w + 1 - ) * channel_factor - cycles_write_block = ofm_dim_w * k_w * k_h * channel_factor - cycles_read_block = stride_w * ifm_dim_w * channel_factor - max_cycles = max(cycles_write_block, cycles_read_block) - if depthwise: - max_cycles += ofm_dim_w * (stride_w - 1) * (channel_factor - 1) - exp_cycles = buffer_min_size + ofm_dim_h * max_cycles # initial buffering - if depthwise: - exp_cycles += (stride_h - 1) * ifm_dim_w * channel_factor + # 2D case + buffer_min_size = ( + (k_h - 1) * dilation_h * ifm_dim_w + (k_w - 1) * dilation_w + 1 + ) * channel_factor + cycles_write_block = ofm_dim_w * k_w * k_h * channel_factor + cycles_read_block = stride_w * ifm_dim_w * channel_factor + max_cycles = max(cycles_write_block, cycles_read_block) + if depthwise: + max_cycles += ofm_dim_w * (stride_w - 1) * (channel_factor - 1) + exp_cycles = buffer_min_size + ofm_dim_h * max_cycles + if depthwise: + exp_cycles += (stride_h - 1) * ifm_dim_w * channel_factor return int(exp_cycles) def bram_estimation(self): simd = self.get_nodeattr("SIMD") ram_style = self.get_nodeattr("ram_style") + impl_style = self.select_impl_style() + [k_h, k_w] = self.get_nodeattr("ConvKernelDim") + [ifm_dim_h, ifm_dim_w] = self.get_nodeattr("IFMDim") + [dilation_h, dilation_w] = self.get_nodeattr("Dilation") - # NOTE: Actual BRAM usage might be lower in some cases. - # This does not account for the exact Vivado behavior yet. 
- buffer_width = simd * self.get_input_datatype().bitwidth() - buffer_depth = self.get_buffer_depth() if ram_style == "block" or ram_style == "auto": + buffer_width = simd * self.get_input_datatype().bitwidth() + if impl_style == "default": + buffer_depth = self.get_buffer_depth() + buffer_count = 1 + elif impl_style == "parallel": + if ifm_dim_h == 1 or ifm_dim_w == 1: + return 0 # 1D case (no line buffers needed) + kernel_width = (k_w - 1) * dilation_w + 1 + buffer_depth = (ifm_dim_w - kernel_width) + ifm_dim_w * (dilation_h - 1) + buffer_count = k_h - 1 + + # NOTE: Actual BRAM usage might be lower in some cases + # due to imperfect modeling of Vivado behavior if buffer_depth <= 512: ram_width = 36 elif buffer_depth <= 1024: @@ -356,7 +242,7 @@ def bram_estimation(self): remainder_cascade_width = math.ceil(buffer_width / remainder_width) cascade_savings = ram_cascade_width - remainder_cascade_width - return int(ram_cascade_depth * ram_cascade_width - cascade_savings) + return int((ram_cascade_depth * ram_cascade_width - cascade_savings) * buffer_count) else: return 0 @@ -374,31 +260,106 @@ def lut_estimation(self): def uram_estimation(self): simd = self.get_nodeattr("SIMD") ram_style = self.get_nodeattr("ram_style") - buffer_width = simd * self.get_input_datatype().bitwidth() - buffer_depth = self.get_buffer_depth() + impl_style = self.select_impl_style() + [k_h, k_w] = self.get_nodeattr("ConvKernelDim") + [ifm_dim_h, ifm_dim_w] = self.get_nodeattr("IFMDim") + [dilation_h, dilation_w] = self.get_nodeattr("Dilation") if ram_style == "ultra": + buffer_width = simd * self.get_input_datatype().bitwidth() + if impl_style == "default": + buffer_depth = self.get_buffer_depth() + buffer_count = 1 + elif impl_style == "parallel": + if ifm_dim_h == 1 or ifm_dim_w == 1: + return 0 # 1D case (no line buffers needed) + kernel_width = (k_w - 1) * dilation_w + 1 + buffer_depth = (ifm_dim_w - kernel_width) + ifm_dim_w * (dilation_h - 1) + buffer_count = k_h - 1 + ram_depth = 
4096 ram_width = 72 ram_cascade_depth = math.ceil(buffer_depth / ram_depth) ram_cascade_width = math.ceil(buffer_width / ram_width) - return int(ram_cascade_depth * ram_cascade_width) + return int(ram_cascade_depth * ram_cascade_width * buffer_count) else: return 0 def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") - node = self.onnx_node - exp_ishape = self.get_normal_input_shape() - exp_oshape = self.get_normal_output_shape() - folded_ishape = self.get_folded_input_shape() + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") if mode == "cppsim": - raise Exception( - "cppsim not possible for RTL SWG, please set exec_mode to rtlsim" - ) + ConvolutionInputGenerator.execute_node(self, context, graph) + # if depthwise = 1 + # interleave channels such that cppsim of ConvolutionInputGenerator_rtl + # has a notion of SIMD parallelism. Subsequent VVAU_{hls/rtl} expects + # the channels to be interleaved (i.e. to match their PE parallelism). + if self.get_nodeattr("depthwise"): + node = self.onnx_node + im2col_out = context[node.output[0]] + simd = getCustomOp(node).get_nodeattr("SIMD") + ofm_h, ofm_w = getCustomOp(node).get_nodeattr("OFMDim") + k_h, k_w = getCustomOp(node).get_nodeattr("ConvKernelDim") + ifm_ch = getCustomOp(node).get_nodeattr("IFMChannels") + im2col_out = im2col_out.reshape(1, ofm_h, ofm_w, k_h * k_w, ifm_ch // simd, simd) + im2col_out = im2col_out.transpose(0, 1, 2, 4, 3, 5) + im2col_out = im2col_out.reshape(1, ofm_h, ofm_w, ifm_ch * k_h * k_w) + context[node.output[0]] = im2col_out elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input shape doesn't match expected shape (1, ifm_dim, 
ifm_dim, ifm_ch).""" + if self.get_input_datatype() == DataType["BIPOLAR"]: + # store bipolar activations as binary + inp = (inp + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = self.get_input_datatype() + + # reshape input into folded form + inp = inp.reshape(folded_ishape) + # make copy before saving array + reshaped_input = inp.copy() + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + + # binary -> bipolar if needed + if self.get_output_datatype() == DataType["BIPOLAR"]: + out = context[node.output[0]] + out = 2 * out - 1 + context[node.output[0]] = out + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output + shape doesn't match expected shape (1, ofm_dim_h, ofm_dim_w, k_h*k_w*ifm_ch).""" else: raise Exception( """Invalid value for attribute exec_mode! 
Is currently set to: {} @@ -407,58 +368,9 @@ def execute_node(self, context, graph): ) ) - inp = context[node.input[0]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert ( - inp.shape == exp_ishape - ), """Input shape doesn't match expected shape (1, ifm_dim, ifm_dim, ifm_ch).""" - if self.get_input_datatype() == DataType["BIPOLAR"]: - # store bipolar activations as binary - inp = (inp + 1) / 2 - export_idt = DataType["BINARY"] - else: - export_idt = self.get_input_datatype() - - # reshape input into folded form - inp = inp.reshape(folded_ishape) - # make copy before saving array - reshaped_input = inp.copy() - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) - - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = export_idt - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - - # binary -> bipolar if needed - if self.get_output_datatype() == DataType["BIPOLAR"]: - out = context[node.output[0]] - out = 2 * out - 1 - context[node.output[0]] = out - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output - shape doesn't match expected shape (1, ofm_dim_h, ofm_dim_w, k_h*k_w*ifm_ch).""" - def prepare_codegen_default(self): - # Default implementation style for MMV_out = 1: addressable cyclic buffer - # Computing incremental addressing scheme directly.. 
+ """Fills code generation dict for the default implementation style by computing + the incremental addressing scheme for the circular buffer.""" if self.get_nodeattr("dynamic_mode"): template_select = "/finn-rtllib/swg/swg_template_default_dynamic.sv" else: @@ -488,9 +400,7 @@ def prepare_codegen_default(self): channel_factor = int(ifm_ch / simd) # compute minimal buffer length (assuming it holds 1 complete window) - buffer_min_size = ( - (k_h - 1) * dilation_h * w + (k_w - 1) * dilation_w + 1 - ) * channel_factor + buffer_min_size = ((k_h - 1) * dilation_h * w + (k_w - 1) * dilation_w + 1) * channel_factor buffer_actual_size = self.get_buffer_depth() code_gen_dict["$BUF_ELEM_TOTAL$"] = [str(buffer_actual_size)] @@ -528,13 +438,13 @@ def prepare_codegen_default(self): ) addr_incr_end_simd = -buffer_min_size + (channel_factor + 1) - # sanity check + # sanity check for wrap logic assert not ( abs(addr_incr_end_window) > buffer_actual_size - ), "ERROR: W increment > buffer size, wrap logic doesn't account for this" + ), "ERROR: W increment > buffer size, try setting parallel_window=1" assert not ( abs(addr_incr_end_row) > buffer_actual_size - ), "ERROR: H increment > buffer size, wrap logic doesn't account for this" + ), "ERROR: H increment > buffer size, try setting parallel_window=1" # set certain threshold indices to detect when reading/writing finishes code_gen_dict["$LAST_READ_ELEM$"] = [str(h * w * channel_factor - 1)] @@ -639,9 +549,270 @@ def prepare_codegen_default(self): return template_path, code_gen_dict + def prepare_codegen_parallel(self): + """Fills code generation dict for the parallel implementation style by computing + the loop controller configuration and partitioning the fixed buffer into + shift-registers (for parallel read access) and line buffers (for efficient + LUTRAM/BRAM/URAM implementation).""" + template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_template_parallel.sv" + code_gen_dict = {} + + ifm_ch = 
self.get_nodeattr("IFMChannels") + k = self.get_nodeattr("ConvKernelDim") + ifm_dim = self.get_nodeattr("IFMDim") + stride = self.get_nodeattr("Stride") + dilation = self.get_nodeattr("Dilation") + simd = self.get_nodeattr("SIMD") + M = self.get_nodeattr("M") + + k_h, k_w = k + h, w = ifm_dim + pad = [0, 0, 0, 0] # padding happens in separate padding node for now + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + pad_h = pad[0] + pad[2] + pad_w = pad[1] + pad[3] + out_dim_h = im2col.compute_conv_output_dim(h, k_h, stride_h, pad_h, dilation_h) + out_dim_w = im2col.compute_conv_output_dim(w, k_w, stride_w, pad_w, dilation_w) + mmv_in = M * 1 + mmv_out = M * k_h * k_w + channel_factor = int(ifm_ch / simd) + + # compute minimal buffer length (assuming it holds 1 complete window) + buffer_min_size = ((k_h - 1) * dilation_h * w + (k_w - 1) * dilation_w) * channel_factor + 1 + + buffer_actual_size = self.get_buffer_depth() + code_gen_dict["$BUF_ELEM_TOTAL$"] = [str(buffer_actual_size)] + + # compute some intermediate values, e.g., kernel "width" = k_w incl. 
dilation + # or cols/rows that are skipped due to imperfect stride<->dim combination + kernel_width = (k_w - 1) * dilation_w + 1 + kernel_height = (k_h - 1) * dilation_h + 1 + skip_columns = w % (kernel_width + (out_dim_w - 1) * stride_w) + skip_rows = h % (kernel_height + (out_dim_h - 1) * stride_h) + + # set certain threshold indices to detect when reading/writing finishes + code_gen_dict["$LAST_READ_ELEM$"] = [str(h * w * channel_factor - 1)] + code_gen_dict["$LAST_WRITE_ELEM$"] = [ + str(((h - skip_rows - 1) * w + (w - skip_columns)) * channel_factor - 1) + ] + + # re-use default controller loop structure + loop_h_iterations = out_dim_h + loop_w_iterations = out_dim_w + loop_kh_iterations = channel_factor + loop_kw_iterations = 1 + loop_simd_iterations = 1 + + if loop_kh_iterations == 1: + if loop_w_iterations == 1: + code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_H"] + loop_h_iterations -= 1 # -1 because state is initial state + else: + code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_W"] + loop_w_iterations -= 1 # -1 because state is initial state + else: + code_gen_dict["$INNERMOST_STATE$"] = ["STATE_LOOP_KH"] + loop_kh_iterations -= 1 # -1 because state is initial state + + # set head address increment values + addr_incr_end_simd = 1 + addr_incr_end_window_elem = 1 + addr_incr_end_window_row = 1 + addr_incr_end_window = (stride_w - 1) * channel_factor + 1 + addr_incr_end_row = ((skip_columns + (kernel_width - 1)) * channel_factor + 1) + ( + (stride_h - 1) * w * channel_factor + ) + + # add init value for CURRENT_ELEM counter = last elem of first window + code_gen_dict["$FIRST_WRITE_ELEM$"] = [str(buffer_min_size - 1)] + + cntr_bitwidth = math.ceil( + math.log2( + max( + loop_h_iterations - 2 + 1, + loop_w_iterations - 2 + 1, + loop_kh_iterations - 2 + 1, + loop_kw_iterations - 2 + 1, + loop_simd_iterations - 2 + 1, + ) + ) + ) + code_gen_dict["$CNTR_BITWIDTH$"] = [str(cntr_bitwidth)] + code_gen_dict["$LOOP_H_ITERATIONS$"] = [str(loop_h_iterations - 2)] 
+ code_gen_dict["$LOOP_W_ITERATIONS$"] = [str(loop_w_iterations - 2)] + code_gen_dict["$LOOP_KH_ITERATIONS$"] = [str(loop_kh_iterations - 2)] + code_gen_dict["$LOOP_KW_ITERATIONS$"] = [str(loop_kw_iterations - 2)] + code_gen_dict["$LOOP_SIMD_ITERATIONS$"] = [str(loop_simd_iterations - 2)] + + incr_bitwidth = 1 + math.ceil( + math.log2( + max( + abs(addr_incr_end_simd) + 1, + abs(addr_incr_end_window_elem) + 1, + abs(addr_incr_end_window_row) + 1, + abs(addr_incr_end_window) + 1, + abs(addr_incr_end_row) + 1, + ) + ) + ) + code_gen_dict["$INCR_BITWIDTH$"] = [str(incr_bitwidth)] + code_gen_dict["$HEAD_INCR_SIMD$"] = [str(addr_incr_end_simd)] + code_gen_dict["$HEAD_INCR_KW$"] = [str(addr_incr_end_window_elem)] + code_gen_dict["$HEAD_INCR_KH$"] = [str(addr_incr_end_window_row)] + code_gen_dict["$HEAD_INCR_W$"] = [str(addr_incr_end_window)] + code_gen_dict["$HEAD_INCR_H$"] = [str(addr_incr_end_row)] + # not used, set to zero: + code_gen_dict["$TAIL_INCR_W$"] = ["0"] + code_gen_dict["$TAIL_INCR_H$"] = ["0"] + code_gen_dict["$TAIL_INCR_LAST$"] = ["0"] + code_gen_dict["$IS_DEPTHWISE$"] = ["0"] + + code_gen_dict["$SIMD$"] = [str(simd)] + code_gen_dict["$MMV_IN$"] = [str(mmv_in)] + code_gen_dict["$MMV_OUT$"] = [str(mmv_out)] + + # prepare buffer partitioning into "reg_fifos" and "bram_fifos" + # use normalized ([H,W]=[1,W]) dimensions for 1D case + ( + ifm_ch, + [ifm_dim_h, ifm_dim_w], + [ofm_dim_h, ofm_dim_w], + [k_h, k_w], + [stride_h, stride_w], + [dilation_h, dilation_w], + ) = self.get_1d_conv_attrs_normalized() + + reg_fifos = [] + bram_fifos_depth = [] + + px_idx = 0 + for ky in range(k_h): + reg_fifo = [] + for kx in range(k_w): + for c in range(channel_factor): + if c < (channel_factor - 1): + if not (ky == 0 and kx == 0): + reg_fifo.append(-1) + px_idx += 1 + else: + reg_fifo.append(px_idx) + px_idx += 1 + if kx < (k_w - 1): + reg_fifo.extend([-1] * ((dilation_w - 1) * channel_factor)) + px_idx += (dilation_w - 1) * channel_factor + reg_fifos.append(reg_fifo) + + 
if ky < (k_h - 1): + line_buffer_len = ((w - kernel_width) + w * (dilation_h - 1)) * channel_factor + bram_fifos_depth.append(line_buffer_len) + px_idx += line_buffer_len + + code_gen_dict["$GENERATE_REG_FIFOS$"] = [] + for i, reg_fifo in enumerate(reg_fifos): + code_gen_dict["$GENERATE_REG_FIFOS$"].append( + """ + wire [IN_WIDTH-1:0] reg_fifo_{id}_in; + wire [IN_WIDTH-1:0] reg_fifo_{id}_out; + wire [IN_WIDTH*{len}-1:0] reg_fifo_{id}; + swg_reg_buffer + #( + .WIDTH(IN_WIDTH), + .DEPTH({len}) + ) + reg_buffer_inst_{id} + ( + .clk(clk), + .shift_enable(shift_enable), + .shift_in(reg_fifo_{id}_in), + .shift_out(reg_fifo_{id}_out), + .data_out(reg_fifo_{id}) + );""".format( + id=i, + len=len(reg_fifo), + ) + ) + + code_gen_dict["$GENERATE_BRAM_FIFOS$"] = [] + for i, bram_fifo_depth in enumerate(bram_fifos_depth): + code_gen_dict["$GENERATE_BRAM_FIFOS$"].append( + """ + wire [IN_WIDTH-1:0] bram_fifo_{id}_in; + wire [IN_WIDTH-1:0] bram_fifo_{id}_out; + swg_ram_buffer + #( + .WIDTH(IN_WIDTH), + .DEPTH({len}), + .RAM_STYLE("{ram_style}") + ) + ram_buffer_inst_{id} + ( + .clk(clk), + .rst_n(rst_n), + .shift_enable(shift_enable), + .shift_in(bram_fifo_{id}_in), + .shift_out(bram_fifo_{id}_out) + );""".format( + id=i, + len=bram_fifo_depth, + ram_style=self.get_nodeattr("ram_style"), + ) + ) + + code_gen_dict["$GENERATE_OUTPUT_MAPPING$"] = [] + out_idx = mmv_out - 1 + for fifo_id, reg_fifo in enumerate(reg_fifos): + for fifo_idx, access_idx in enumerate(reg_fifo): + if access_idx != -1: + code_gen_dict["$GENERATE_OUTPUT_MAPPING$"].append( + """assign data_out[OUT_ELEM_WIDTH*{out_idx}+:OUT_ELEM_WIDTH] + = reg_fifo_{fifo_id}[{access_idx}*{mmv}*OUT_ELEM_WIDTH+ + OUT_ELEM_WIDTH*{mmv_idx}+:OUT_ELEM_WIDTH];""".format( + out_idx=out_idx, + fifo_id=fifo_id, + access_idx=len(reg_fifo) - 1 - int((max(reg_fifo) - access_idx) / M), + mmv_idx=(max(reg_fifo) - access_idx) % M, + mmv=M, + ) + ) + # reversal: out_idx=0 -> oldest buffer element -> highest access_idx + out_idx = out_idx - 1 + 
assert out_idx == -1, "ERROR: Not all output vector elements connected" + + code_gen_dict["$GENERATE_BUFFER_CONNECTION$"] = [] + for i in range(len(reg_fifos)): + if i == 0: + # first FIFO containing newest elements -> input comes from input reg + code_gen_dict["$GENERATE_BUFFER_CONNECTION$"].append( + """assign reg_fifo_{fifo_id}_in = data_in;""".format( + fifo_id=i, + ) + ) + else: + # other REG FIFOs -> input comes from connected BRAM FIFO (line buffer) + input_fifo_id = i - 1 + code_gen_dict["$GENERATE_BUFFER_CONNECTION$"].append( + """assign reg_fifo_{fifo_id}_in = bram_fifo_{input_fifo_id}_out; + """.format( + fifo_id=i, input_fifo_id=input_fifo_id + ) + ) + for i in range(len(bram_fifos_depth)): + input_fifo_id = i + code_gen_dict["$GENERATE_BUFFER_CONNECTION$"].append( + """assign bram_fifo_{fifo_id}_in = reg_fifo_{input_fifo_id}_out; + """.format( + fifo_id=i, input_fifo_id=input_fifo_id + ) + ) + + return template_path, code_gen_dict + def select_impl_style(self): + """Selects implementation style based on folding configuration.""" simd = self.get_nodeattr("SIMD") M = self.get_nodeattr("M") + depthwise = self.get_nodeattr("depthwise") ifm_ch = self.get_nodeattr("IFMChannels") ifm_dim = self.get_nodeattr("IFMDim") stride = self.get_nodeattr("Stride") @@ -666,36 +837,37 @@ def select_impl_style(self): if self.get_nodeattr("parallel_window"): # mmv_in = M * 1 mmv_out = M * k_h * k_w - assert ( - ifm_ch == simd - ), "Constraint violated: SIMD must be equal to IFMChannels" else: # mmv_in = 1 mmv_out = 1 - assert ( - ifm_ch % simd == 0 - ), "Constraint violated: SIMD must divide IFMChannels" + assert ifm_ch % simd == 0, "Constraint violated: SIMD must divide IFMChannels" # choose implementation style if mmv_out > 1 or (k_h == 1 and k_w == 1): impl_style = "parallel" - assert ( - ifm_ch == simd - ), "Constraint violated: SIMD must be equal to IFMChannels" + if depthwise or (k_h == 1 and k_w == 1): + # allow SIMD < IFM_CH in depthwise mode (VVAU supports the 
resulting data layout) + # also allowed for 1x1 kernel since depthwise and non-depthwise are equivalent + assert ifm_ch % simd == 0, "Constraint violated: SIMD must divide IFMChannels" + else: + assert ifm_ch == simd, "Constraint violated: SIMD must be equal to IFMChannels" else: impl_style = "default" - assert ( - impl_style == "default" - ), "ERROR: Parallel window mode not yet implemented" return impl_style - def generate_hdl(self): + def generate_hdl(self, model, fpgapart, clk): + """Generates HDL code and wrapper for the IP, depending on required + implementation style.""" impl_style = self.select_impl_style() # prepare code generation by filling out dictionaries if impl_style == "default": template_path, code_gen_dict = self.prepare_codegen_default() + elif impl_style == "parallel": + template_path, code_gen_dict = self.prepare_codegen_parallel() + if self.get_nodeattr("dynamic_mode"): + raise Exception("Dynamic mode is not compatible with parallel_window") else: raise Exception("Requested impl. style not implemented") @@ -705,11 +877,14 @@ def generate_hdl(self): # (e.g. 
by GiveUniqueNodeNames(prefix) during MakeZynqProject) self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name()) code_gen_dict["$BIT_WIDTH$"] = [str(self.get_input_datatype().bitwidth())] + code_gen_dict["$IN_WIDTH_PADDED$"] = [ + str(roundup_to_integer_multiple(self.get_instream_width(), 8)) + ] + code_gen_dict["$OUT_WIDTH_PADDED$"] = [ + str(roundup_to_integer_multiple(self.get_outstream_width(), 8)) + ] ram_style = self.get_nodeattr("ram_style") - if ram_style == "auto": - code_gen_dict["$RAM_STYLE$"] = [""] - else: - code_gen_dict["$RAM_STYLE$"] = ['(* ram_style = "{}" *)'.format(ram_style)] + code_gen_dict["$RAM_STYLE$"] = ['"{}"'.format(ram_style)] # apply code generation to templates code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") @@ -721,9 +896,7 @@ def generate_hdl(self): template_select = "/finn-rtllib/swg/swg_template_wrapper.v" with open(os.environ["FINN_ROOT"] + template_select, "r") as f: template_wrapper = f.read() - with open( - os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_template_axilite.v", "r" - ) as f: + with open(os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_template_axilite.v", "r") as f: template_axilite = f.read() for key in code_gen_dict: # transform list into long string separated by '\n' @@ -732,16 +905,12 @@ def generate_hdl(self): template_wrapper = template_wrapper.replace(key, code_gen_line) template_axilite = template_axilite.replace(key, code_gen_line) with open( - os.path.join( - code_gen_dir, self.get_nodeattr("gen_top_module") + "_impl.sv" - ), + os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_impl.sv"), "w", ) as f: f.write(template) with open( - os.path.join( - code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v" - ), + os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"), "w", ) as f: f.write(template_wrapper) @@ -749,13 +918,15 @@ def generate_hdl(self): # AXI-Lite reg. 
file component is only needed for dynamic mode if self.get_nodeattr("dynamic_mode"): with open( - os.path.join( - code_gen_dir, self.get_nodeattr("gen_top_module") + "_axilite.v" - ), + os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_axilite.v"), "w", ) as f: f.write(template_axilite) + # Copy static source file for common core components + shutil.copy2(os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_common.sv", code_gen_dir) + shutil.copy2(os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_pkg.sv", code_gen_dir) + # set ipgen_path and ip_path so that HLS-Synth transformation # and stich_ip transformation do not complain self.set_nodeattr("ipgen_path", code_gen_dir) @@ -773,8 +944,10 @@ def prepare_rtlsim(self): code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") verilog_paths = [code_gen_dir] verilog_files = [ + "swg_pkg.sv", self.get_nodeattr("gen_top_module") + "_wrapper.v", self.get_nodeattr("gen_top_module") + "_impl.sv", + "swg_common.sv", ] if self.get_nodeattr("dynamic_mode"): verilog_files.append(self.get_nodeattr("gen_top_module") + "_axilite.v") @@ -796,8 +969,10 @@ def code_generation_ipi(self): code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") sourcefiles = [ + "swg_pkg.sv", self.get_nodeattr("gen_top_module") + "_wrapper.v", self.get_nodeattr("gen_top_module") + "_impl.sv", + "swg_common.sv", ] if self.get_nodeattr("dynamic_mode"): @@ -835,6 +1010,8 @@ def get_dynamic_config(self, ifm_dim=None, stride=None, dilation=None): apply (e.g. component must be synthesized for largest buffer size).""" # NOTE: For better driver integration, this functionality could be packaged # as a standalone function in the future + if self.select_impl_style() != "default": + raise Exception("Impl. 
style is incompatible with dynamic mode") if ifm_dim is None: ifm_dim = self.get_nodeattr("IFMDim") @@ -885,46 +1062,3 @@ def get_dynamic_config(self, ifm_dim=None, stride=None, dilation=None): "cfg_last_write": (15 * 4, int(code_gen_dict["$LAST_WRITE_ELEM$"][0])), } return config - - def code_generation_ipgen(self, model, fpgapart, clk): - """Normally: Generates C++ code and tcl script for IP generation. - Here: Generates (System-)Verilog code for IP generation.""" - self.generate_hdl() - - def ipgen_singlenode_code(self): - """Normally: Builds the bash script for IP generation.""" - pass - - def code_generation_cppsim(self, model): - """Normally: Generates C++ code for simulation (cppsim).""" - pass - - def compile_singlenode_code(self): - pass - - def global_includes(self): - pass - - def defines(self, var): - pass - - def read_npy_data(self): - pass - - def strm_decl(self): - pass - - def docompute(self): - pass - - def dataoutstrm(self): - pass - - def save_as_npy(self): - pass - - def blackboxfunction(self): - pass - - def pragmas(self): - pass diff --git a/src/finn/custom_op/fpgadataflow/fmpadding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py similarity index 52% rename from src/finn/custom_op/fpgadataflow/fmpadding_rtl.py rename to src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py index 5650d21885..cc49446ea3 100644 --- a/src/finn/custom_op/fpgadataflow/fmpadding_rtl.py +++ b/src/finn/custom_op/fpgadataflow/rtl/fmpadding_rtl.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022, Advanced Micro Devices, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -30,11 +30,10 @@ import numpy as np import os import shutil -import warnings -from qonnx.core.datatype import DataType from qonnx.util.basic import roundup_to_integer_multiple -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp +from finn.custom_op.fpgadataflow.fmpadding import FMPadding +from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend from finn.util.basic import get_rtlsim_trace_depth, make_build_dir from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy @@ -44,139 +43,24 @@ PyVerilator = None -class FMPadding_rtl(HLSCustomOp): +class FMPadding_rtl(FMPadding, RTLBackend): """CustomOp wrapper for the finn-rtllib fmpadding_axi component Supports adjusting the padding amount and spatial feature sizes at runtime.""" - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): my_attrs = { - # spatial size of input images - "ImgDim": ("ints", True, []), # [H, W] = [Y, X] - # total padding (per dimension) to apply - "Padding": ( - "ints", - True, - [1, 1, 1, 1], - ), # [H_begin, W_begin, H_end, W_end] = [Y_begin, X_begin, Y_end, X_end] - # number of channels in input image - "NumChannels": ("i", True, 0), - # SIMD Input parallelism - "SIMD": ("i", False, 1), - # FINN input datatype - "inputDataType": ("s", True, ""), - # shape describing input vecs per execution - "numInputVectors": ("i", False, 1), # Enable reprogrammable implementation to change FM dimensions, # stride, or dilation during runtime "dynamic_mode": ("i", False, 0, {0, 1}), - # attribute to save top module name - not user configurable - "gen_top_module": ("s", False, ""), } - my_attrs.update(super().get_nodeattr_types()) + my_attrs.update(FMPadding.get_nodeattr_types(self)) + my_attrs.update(RTLBackend.get_nodeattr_types(self)) return my_attrs - def get_padded_odim(self): 
- "Return the padded spatial size of the output." - idim_h, idim_w = self.get_nodeattr("ImgDim") - pad = self.get_nodeattr("Padding") - pad_h = pad[0] + pad[2] - pad_w = pad[1] + pad[3] - odim_h = idim_h + pad_h - odim_w = idim_w + pad_w - return [odim_h, odim_w] - - def get_exp_cycles(self): - odim_h, odim_w = self.get_padded_odim() - channels = self.get_nodeattr("NumChannels") - simd = self.get_nodeattr("SIMD") - batch_size = self.get_nodeattr("numInputVectors") - exp_cycles = (channels / simd) * batch_size * odim_h * odim_w - return int(exp_cycles) - - def get_normal_input_shape(self, ind=0): - idim_h, idim_w = self.get_nodeattr("ImgDim") - num_ch = self.get_nodeattr("NumChannels") - ishape = (1, idim_h, idim_w, num_ch) - return ishape - - def get_normal_output_shape(self, ind=0): - odim_h, odim_w = self.get_padded_odim() - num_ch = self.get_nodeattr("NumChannels") - - oshape = (1, odim_h, odim_w, num_ch) - return oshape - - def get_folded_input_shape(self, ind=0): - normal_ishape = list(self.get_normal_input_shape()) - ifm_ch = self.get_nodeattr("NumChannels") - simd = self.get_nodeattr("SIMD") - assert ifm_ch % simd == 0, "SIMD must divide input channels" - fold = int(normal_ishape[-1] / simd) - folded_ishape = normal_ishape[:-1] + [fold, simd] - return tuple(folded_ishape) - - def get_folded_output_shape(self, ind=0): - normal_oshape = list(self.get_normal_output_shape()) - ifm_ch = self.get_nodeattr("NumChannels") - simd = self.get_nodeattr("SIMD") - assert ifm_ch % simd == 0, "SIMD must divide input channels" - fold = int(normal_oshape[-1] / simd) - folded_oshape = normal_oshape[:-1] + [fold, simd] - return tuple(folded_oshape) - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - oshape = self.get_normal_output_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == exp_ishape, "Unexpected input shape for FMPadding_rtl." 
- return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - idt = model.get_tensor_datatype(node.input[0]) - if idt != self.get_input_datatype(): - warn_str = "inputDataType changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype()), - str(idt), - ) - warnings.warn(warn_str) - self.set_nodeattr("inputDataType", idt.name) - model.set_tensor_datatype(node.output[0], idt) - - def verify_node(self): - pass - - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - ret = DataType[self.get_nodeattr("inputDataType")] - # the hlslib op always pads with zeros, so ensure that the DataType - # is able to represent zeros - assert ret.allowed(0), "FMPadding_rtl DataType must support zero" - return ret - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output. (Same as input datatype)""" - return self.get_input_datatype() - - def get_instream_width(self, ind=0): - ibits = self.get_input_datatype().bitwidth() - simd = self.get_nodeattr("SIMD") - return ibits * simd - - def get_outstream_width(self, ind=0): - obits = self.get_output_datatype().bitwidth() - simd = self.get_nodeattr("SIMD") - return obits * simd - - def get_number_output_values(self): - folded_oshape = self.get_folded_output_shape() - return np.prod(folded_oshape[:-1]) - def get_verilog_top_module_intf_names(self): # Overload default HLSCustomOp implementation to add axilite control IF intf_names = super().get_verilog_top_module_intf_names() @@ -186,17 +70,52 @@ def get_verilog_top_module_intf_names(self): def execute_node(self, context, graph): mode = self.get_nodeattr("exec_mode") - node = self.onnx_node - exp_ishape = self.get_normal_input_shape() - exp_oshape = self.get_normal_output_shape() - folded_ishape = self.get_folded_input_shape() + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") if mode == "cppsim": - raise Exception( - "cppsim not possible for FMPadding_rtl, please set exec_mode 
to rtlsim" - ) + FMPadding.execute_node(self, context, graph) elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert ( + inp.shape == exp_ishape + ), """Input shape doesn't + match expected shape (1, ImgDim_h, ImgDim_w, NumChannels).""" + export_idt = self.get_input_datatype() + + reshaped_input = inp.reshape(folded_ishape) + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + + assert ( + context[node.output[0]].shape == exp_oshape + ), """Output shape doesn't match expected shape + (1, OutputDim_H, OutputDim_W, NumChannels).""" + else: raise Exception( """Invalid value for attribute exec_mode! 
Is currently set to: {} @@ -205,43 +124,6 @@ def execute_node(self, context, graph): ) ) - inp = context[node.input[0]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert ( - inp.shape == exp_ishape - ), """Input shape doesn't - match expected shape (1, ImgDim_h, ImgDim_w, NumChannels).""" - export_idt = self.get_input_datatype() - - reshaped_input = inp.reshape(folded_ishape) - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) - - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = export_idt - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output shape doesn't match expected shape - (1, OutputDim_H, OutputDim_W, NumChannels).""" - def get_template_values(self, ifm_dims, pads, chans, simd, idt): dimY, dimX = ifm_dims padT, padL, padB, padR = pads @@ -289,7 +171,7 @@ def get_dynamic_config(self, ifm_dims=None, pads=None): } return config - def generate_hdl(self): + def generate_hdl(self, model, fpgapart, clk): rtlsrc = os.environ["FINN_ROOT"] + "/finn-rtllib/fmpadding/hdl" template_path = rtlsrc + "/fmpadding_template.v" dims = self.get_nodeattr("ImgDim") @@ -375,46 +257,3 @@ def code_generation_ipi(self): % (self.get_nodeattr("gen_top_module"), self.onnx_node.name) ] return cmd - - def code_generation_ipgen(self, model, fpgapart, clk): - """Normally: 
Generates C++ code and tcl script for IP generation. - Here: Generates (System-)Verilog code for IP generation.""" - self.generate_hdl() - - def ipgen_singlenode_code(self): - """Normally: Builds the bash script for IP generation.""" - pass - - def code_generation_cppsim(self, model): - """Normally: Generates C++ code for simulation (cppsim).""" - pass - - def compile_singlenode_code(self): - pass - - def global_includes(self): - pass - - def defines(self, var): - pass - - def read_npy_data(self): - pass - - def strm_decl(self): - pass - - def docompute(self): - pass - - def dataoutstrm(self): - pass - - def save_as_npy(self): - pass - - def blackboxfunction(self): - pass - - def pragmas(self): - pass diff --git a/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py new file mode 100644 index 0000000000..d48b3a918d --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/rtl/matrixvectoractivation_rtl.py @@ -0,0 +1,292 @@ +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os +from pyverilator.util.axi_utils import reset_rtlsim, toggle_clk + +from finn.custom_op.fpgadataflow.matrixvectoractivation import MVAU +from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend +from finn.util.basic import get_rtlsim_trace_depth, make_build_dir +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None + + +# ONNX i/o tensor shape assumptions for MatrixVectorActivation_rtl: +# input 0 is the input tensor, shape (.., i_size) = (..., MW) +# input 1 is the weight tensor, shape (i_size, o_size) = (MW, MH) +# output 0 is the output tensor, shape (.., o_size) = (..., MH) +# the ... 
here can be any shape (representing groups of vectors) + + +class MVAU_rtl(MVAU, RTLBackend): + """Class that corresponds to finn-rtl Matrix Vector Unit.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = { + # Flag to indicate if Versal device is targeted + "is_versal": ("i", False, 0, {0, 1}), + } + my_attrs.update(MVAU.get_nodeattr_types(self)) + my_attrs.update(RTLBackend.get_nodeattr_types(self)) + return my_attrs + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + mem_mode = self.get_nodeattr("mem_mode") + node = self.onnx_node + + if mode == "cppsim": + MVAU.execute_node(self, context, graph) + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + # create a npy file fore each input of the node (in_ind is input index) + in_ind = 0 + for inputs in node.input: + # it is assumed that the first input of the node is the data input + # the second input are the weights + if in_ind == 0: + assert ( + str(context[inputs].dtype) == "float32" + ), """Input datatype is + not float32 as expected.""" + expected_inp_shape = self.get_folded_input_shape() + reshaped_input = context[inputs].reshape(expected_inp_shape) + export_idt = self.get_input_datatype() + # make copy before saving the array + reshaped_input = reshaped_input.copy() + np.save( + os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), + reshaped_input, + ) + elif in_ind > 1: + raise Exception("Unexpected input found for MatrixVectorActivation_rtl") + in_ind += 1 + + sim = self.get_rtlsim() + nbits = self.get_instream_width() + inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) + reset_rtlsim(sim) + toggle_clk(sim) + if mem_mode in ["external", "internal_decoupled"]: + wnbits = self.get_weightstream_width() + export_wdt = self.get_weight_datatype() + wei = npy_to_rtlsim_input( + "{}/weights.npy".format(code_gen_dir), export_wdt, 
wnbits + ) + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict = { + "inputs": {"in0": inp, "weights": wei * num_w_reps}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + output = io_dict["outputs"]["out"] + else: + output = self.rtlsim(sim, inp) + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) + # load and reshape output + output = np.load(out_npy_path) + oshape = self.get_normal_output_shape() + output = np.asarray([output], dtype=np.float32).reshape(*oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + def lut_estimation(self): + return 0 + + def dsp_estimation(self): + # multiplication + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + if self.get_nodeattr("is_versal"): + mult_dsp = P * np.ceil(Q / 3) + else: + mult_dsp = np.ceil(P / 4) * Q + return int(mult_dsp) + + def instantiate_ip(self, cmd): + # instantiate the RTL IP + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") + sourcefiles = [ + os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"), + rtllib_dir + "mvu_vvu_axi.sv", + rtllib_dir + "replay_buffer.sv", + rtllib_dir + "mvu_4sx4u.sv", + rtllib_dir + "mvu_vvu_8sx9_dsp58.sv", + rtllib_dir + "mvu_8sx8u_dsp48.sv", + ] + for f in sourcefiles: + cmd.append("add_files -norecurse %s" % (f)) + cmd.append( + "create_bd_cell -type hier -reference %s /%s/%s" + % ( + self.get_nodeattr("gen_top_module"), + self.onnx_node.name, + self.onnx_node.name, + ) + ) + + def _resolve_segment_len(self, 
clk): + # Insert pipeline registers in the DSP58 chain to meet target clock frequency + # ~0.741 ns seems the worst-case delay through first DSP + # ~0.605 ns seems to be (on average) delay for all subsequent DSPs + # clk >= (critical_path_dsps - 1) * 0.605 + 0.741 + assert ( + clk > 0.741 + ), """Infeasible clk target of {} ns has been set, + consider lowering the targeted clock frequency!""".format( + clk + ) + critical_path_dsps = np.floor((clk - 0.741) / 0.605 + 1) + max_chain_len = np.ceil(self.get_nodeattr("SIMD") / 3) + dsp_chain_len = critical_path_dsps if critical_path_dsps < max_chain_len else max_chain_len + return dsp_chain_len + + def _resolve_impl_style(self, fpgapart): + # Based on target device and activation/weight-width, choose the + # supported RTL compute core + assert ( + self.get_nodeattr("resType") != "lut" + ), """LUT-based RTL-MVU implementation currently not supported! + Please change resType for {} to 'dsp' or consider switching to HLS-based MVAU!""".format( + self.onnx_node.name + ) + + act_width = self.get_input_datatype(0).bitwidth() + weight_width = self.get_input_datatype(1).bitwidth() + is_versal_family = self.get_nodeattr("is_versal") + + if is_versal_family: + return "mvu_vvu_8sx9_dsp58" + else: + act_width = self.get_input_datatype(0).bitwidth() + weight_width = self.get_input_datatype(1).bitwidth() + if (act_width == 4 and weight_width == 4) and not (is_versal_family): + return "mvu_4sx4u" + else: + return "mvu_8sx8u_dsp48" + + def generate_hdl(self, model, fpgapart, clk): + # Generate params as part of IP preparation + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + self.generate_params(model, code_gen_dir) + + template_path, code_gen_dict = self.prepare_codegen_default(fpgapart, clk) + # add general parameters to dictionary + code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name()] + # save top module name so we can refer to it after this node has been renamed + # (e.g. 
by GiveUniqueNodeNames(prefix) during MakeZynqProject) + self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name()) + + # apply code generation to template + with open(template_path, "r") as f: + template_wrapper = f.read() + for key in code_gen_dict: + # transform list into long string separated by '\n' + code_gen_line = "\n".join(code_gen_dict[key]) + template_wrapper = template_wrapper.replace(key, code_gen_line) + with open( + os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"), + "w", + ) as f: + f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(0))) + with open( + os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"), + "w", + ) as f: + f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(1))) + + # set ipgen_path and ip_path so that HLS-Synth transformation + # and stich_ip transformation do not complain + self.set_nodeattr("ipgen_path", code_gen_dir) + self.set_nodeattr("ip_path", code_gen_dir) + + def prepare_codegen_default(self, fpgapart, clk): + template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v" + + code_gen_dict = {} + code_gen_dict["$IS_MVU$"] = [str(1)] + code_gen_dict["$COMPUTE_CORE$"] = [self._resolve_impl_style(fpgapart)] + code_gen_dict["$MW$"] = [str(self.get_nodeattr("MW"))] + code_gen_dict["$MH$"] = [str(self.get_nodeattr("MH"))] + code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))] + code_gen_dict["$SIMD$"] = [str(self.get_nodeattr("SIMD"))] + code_gen_dict["$ACTIVATION_WIDTH$"] = [str(self.get_input_datatype(0).bitwidth())] + code_gen_dict["$WEIGHT_WIDTH$"] = [str(self.get_input_datatype(1).bitwidth())] + code_gen_dict["$ACCU_WIDTH$"] = [str(self.get_output_datatype().bitwidth())] + code_gen_dict["$SIGNED_ACTIVATIONS$"] = ( + [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)] + ) + code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))] + + return template_path, code_gen_dict + + def 
prepare_rtlsim(self): + """Creates a Verilator emulation library for the RTL code generated + for this node, sets the rtlsim_so attribute to its path and returns + a PyVerilator wrapper around it.""" + + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + # Path to (System-)Verilog files used by top-module & path to top-module + verilog_paths = [code_gen_dir, os.environ["FINN_ROOT"] + "/finn-rtllib/mvu"] + verilog_files = [self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"] + + # build the Verilator emu library + sim = PyVerilator.build( + verilog_files, + build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), + verilog_path=verilog_paths, + trace_depth=get_rtlsim_trace_depth(), + top_module_name=self.get_verilog_top_module_name(), + ) + # save generated lib filename in attribute + self.set_nodeattr("rtlsim_so", sim.lib._name) + + return sim diff --git a/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py new file mode 100644 index 0000000000..e79782eb6d --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/rtl/streamingdatawidthconverter_rtl.py @@ -0,0 +1,218 @@ +# Copyright (C) 2023-2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import os +import shutil + +from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend +from finn.custom_op.fpgadataflow.streamingdatawidthconverter import ( + StreamingDataWidthConverter, +) +from finn.util.basic import get_rtlsim_trace_depth, make_build_dir +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None + + +class StreamingDataWidthConverter_rtl(StreamingDataWidthConverter, RTLBackend): + """Class that corresponds to finn-rtllib datawidth converter + module.""" + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(StreamingDataWidthConverter.get_nodeattr_types(self)) + my_attrs.update(RTLBackend.get_nodeattr_types(self)) + return my_attrs + + def check_divisible_iowidths(self): + iwidth = self.get_nodeattr("inWidth") + owidth = self.get_nodeattr("outWidth") + # the rtl module only supports + # stream widths that are divisible by + # integer 
width ratios + iwidth_d = iwidth % owidth == 0 + owidth_d = owidth % iwidth == 0 + assert ( + iwidth_d or owidth_d + ), """RTL implementation of DWC requires + stream widths that are integer width ratios + from each other. Input width is set to %s + and output width is set to %s """ % ( + iwidth, + owidth, + ) + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + + if mode == "cppsim": + StreamingDataWidthConverter.execute_node(self, context, graph) + elif mode == "rtlsim": + node = self.onnx_node + exp_ishape = self.get_normal_input_shape() + exp_oshape = self.get_normal_output_shape() + folded_ishape = self.get_folded_input_shape() + + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert inp.shape == tuple( + exp_ishape + ), """Input shape doesn't + match expected shape.""" + export_idt = self.get_input_datatype() + + reshaped_input = inp.reshape(folded_ishape) + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + + sim = self.get_rtlsim() + nbits = self.get_instream_width() + rtlsim_inp = npy_to_rtlsim_input( + "{}/input_0.npy".format(code_gen_dir), export_idt, nbits + ) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + rtlsim_output = self.rtlsim(sim, rtlsim_inp) + odt = export_idt + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy( + rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits + ) + # load and reshape output + output = np.load(out_npy_path) + output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) + context[node.output[0]] = output + + assert context[node.output[0]].shape == tuple( + exp_oshape + ), """Output shape doesn't match expected shape.""" + else: + raise Exception( + """Invalid value for attribute 
exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + def get_template_values(self): + topname = self.get_verilog_top_module_name() + ibits = self.get_instream_width() + obits = self.get_outstream_width() + code_gen_dict = { + "IBITS": int(ibits), + "OBITS": int(obits), + "TOP_MODULE_NAME": topname, + } + return code_gen_dict + + def generate_hdl(self, model, fpgapart, clk): + rtlsrc = os.environ["FINN_ROOT"] + "/finn-rtllib/dwc/hdl" + template_path = rtlsrc + "/dwc_template.v" + code_gen_dict = self.get_template_values() + # save top module name so we can refer to it after this node has been renamed + # (e.g. by GiveUniqueNodeNames(prefix) during MakeZynqProject) + self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name()) + + # apply code generation to templates + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + with open(template_path, "r") as f: + template = f.read() + for key_name in code_gen_dict: + key = "$%s$" % key_name + template = template.replace(key, str(code_gen_dict[key_name])) + + with open( + os.path.join(code_gen_dir, self.get_verilog_top_module_name() + ".v"), + "w", + ) as f: + f.write(template) + + sv_files = ["dwc_axi.sv", "dwc.sv"] + for sv_file in sv_files: + shutil.copy(rtlsrc + "/" + sv_file, code_gen_dir) + # set ipgen_path and ip_path so that HLS-Synth transformation + # and stich_ip transformation do not complain + self.set_nodeattr("ipgen_path", code_gen_dir) + self.set_nodeattr("ip_path", code_gen_dir) + + def prepare_rtlsim(self): + """Creates a Verilator emulation library for the RTL code generated + for this node, sets the rtlsim_so attribute to its path and returns + a PyVerilator wrapper around it.""" + # Modified to use generated (System-)Verilog instead of HLS output products + + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + 
verilog_paths = [code_gen_dir] + verilog_files = [ + "dwc_axi.sv", + "dwc.sv", + self.get_nodeattr("gen_top_module") + ".v", + ] + + # build the Verilator emu library + sim = PyVerilator.build( + verilog_files, + build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), + verilog_path=verilog_paths, + trace_depth=get_rtlsim_trace_depth(), + top_module_name=self.get_verilog_top_module_name(), + ) + # save generated lib filename in attribute + self.set_nodeattr("rtlsim_so", sim.lib._name) + return sim + + def code_generation_ipi(self): + """Constructs and returns the TCL for node instantiation in Vivado IPI.""" + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + + sourcefiles = [ + "dwc_axi.sv", + "dwc.sv", + self.get_nodeattr("gen_top_module") + ".v", + ] + + sourcefiles = [os.path.join(code_gen_dir, f) for f in sourcefiles] + + cmd = [] + for f in sourcefiles: + cmd += ["add_files -norecurse %s" % (f)] + cmd += [ + "create_bd_cell -type module -reference %s %s" + % (self.get_nodeattr("gen_top_module"), self.onnx_node.name) + ] + return cmd diff --git a/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py new file mode 100644 index 0000000000..dfae607622 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/rtl/streamingfifo_rtl.py @@ -0,0 +1,283 @@ +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import numpy as np +import os +import shutil +import warnings +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend +from finn.custom_op.fpgadataflow.streamingfifo import StreamingFIFO +from finn.util.basic import get_rtlsim_trace_depth, make_build_dir +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy + +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None + + +class StreamingFIFO_rtl(StreamingFIFO, RTLBackend): + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = { + # Toggle between rtl or IPI implementation + # rtl - use the rtl generated IP during stitching + # vivado - use the AXI Infrastructure FIFO + "impl_style": ("s", False, "rtl", {"rtl", "vivado"}), + } + my_attrs.update(StreamingFIFO.get_nodeattr_types(self)) + my_attrs.update(RTLBackend.get_nodeattr_types(self)) + + return my_attrs + + def 
get_adjusted_depth(self): + impl = self.get_nodeattr("impl_style") + depth = self.get_nodeattr("depth") + if impl == "vivado": + old_depth = depth + # round up depth to nearest power-of-2 + # Vivado FIFO impl may fail otherwise + depth = (1 << (depth - 1).bit_length()) if impl == "vivado" else depth + if old_depth != depth: + warnings.warn( + "%s: rounding-up FIFO depth from %d to %d for impl_style=vivado" + % (self.onnx_node.name, old_depth, depth) + ) + + return depth + + def get_verilog_top_module_intf_names(self): + ret = super().get_verilog_top_module_intf_names() + is_rtl = self.get_nodeattr("impl_style") == "rtl" + is_depth_monitor = self.get_nodeattr("depth_monitor") == 1 + if is_rtl and is_depth_monitor: + ret["ap_none"] = ["maxcount"] + return ret + + def generate_hdl(self, model, fpgapart, clk): + rtlsrc = os.environ["FINN_ROOT"] + "/finn-rtllib/fifo/hdl" + template_path = rtlsrc + "/fifo_template.v" + + # save top module name so we can refer to it after this node has been renamed + # (e.g. 
by GiveUniqueNodeNames(prefix) during MakeZynqProject) + topname = self.get_verilog_top_module_name() + self.set_nodeattr("gen_top_module", topname) + + code_gen_dict = {} + code_gen_dict["$TOP_MODULE_NAME$"] = topname + # make instream width a multiple of 8 for axi interface + in_width = self.get_instream_width_padded() + count_width = int(self.get_nodeattr("depth") - 1).bit_length() + code_gen_dict["$COUNT_RANGE$"] = "[{}:0]".format(count_width - 1) + code_gen_dict["$IN_RANGE$"] = "[{}:0]".format(in_width - 1) + code_gen_dict["$OUT_RANGE$"] = "[{}:0]".format(in_width - 1) + code_gen_dict["$WIDTH$"] = str(in_width) + code_gen_dict["$DEPTH$"] = str(self.get_nodeattr("depth")) + # apply code generation to templates + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + with open(template_path, "r") as f: + template = f.read() + for key_name in code_gen_dict: + key = "%s" % key_name + template = template.replace(key, str(code_gen_dict[key_name])) + with open( + os.path.join(code_gen_dir, self.get_verilog_top_module_name() + ".v"), + "w", + ) as f: + f.write(template) + + shutil.copy(rtlsrc + "/Q_srl.v", code_gen_dir) + # set ipgen_path and ip_path so that HLS-Synth transformation + # and stich_ip transformation do not complain + self.set_nodeattr("ipgen_path", code_gen_dir) + self.set_nodeattr("ip_path", code_gen_dir) + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + node = self.onnx_node + inp = context[node.input[0]] + exp_shape = self.get_normal_input_shape() + + if mode == "cppsim": + output = inp + output = np.asarray([output], dtype=np.float32).reshape(*exp_shape) + context[node.output[0]] = output + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + # create a npy file for the input of the node + assert ( + str(inp.dtype) == "float32" + ), """Input datatype is + not float32 as expected.""" + expected_inp_shape = self.get_folded_input_shape() + reshaped_input = 
inp.reshape(expected_inp_shape) + if DataType[self.get_nodeattr("dataType")] == DataType["BIPOLAR"]: + # store bipolar activations as binary + reshaped_input = (reshaped_input + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = DataType[self.get_nodeattr("dataType")] + # make copy before saving the array + reshaped_input = reshaped_input.copy() + np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) + sim = self.get_rtlsim() + nbits = self.get_instream_width() + inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) + super().reset_rtlsim(sim) + super().toggle_clk(sim) + output = self.rtlsim(sim, inp) + odt = DataType[self.get_nodeattr("dataType")] + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) + # load and reshape output + output = np.load(out_npy_path) + oshape = self.get_normal_output_shape() + output = np.asarray([output], dtype=np.float32).reshape(*oshape) + context[node.output[0]] = output + + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + def code_generation_ipi(self): + impl_style = self.get_nodeattr("impl_style") + if impl_style == "rtl": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + + sourcefiles = [ + "Q_srl.v", + self.get_nodeattr("gen_top_module") + ".v", + ] + + sourcefiles = [os.path.join(code_gen_dir, f) for f in sourcefiles] + + cmd = [] + for f in sourcefiles: + cmd += ["add_files -norecurse %s" % (f)] + cmd += [ + "create_bd_cell -type module -reference %s %s" + % (self.get_nodeattr("gen_top_module"), self.onnx_node.name) + ] + return cmd + elif impl_style == "vivado": + cmd = [] + node_name = self.onnx_node.name + depth = self.get_adjusted_depth() + ram_style = self.get_nodeattr("ram_style") + # create a hierarchy for this layer, with the same port names + clk_name = self.get_verilog_top_module_intf_names()["clk"][0] + rst_name = self.get_verilog_top_module_intf_names()["rst"][0] + dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0] + din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0] + cmd.append("create_bd_cell -type hier %s" % node_name) + cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name)) + cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) + cmd.append( + "create_bd_intf_pin -mode Master " + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, dout_name) + ) + cmd.append( + "create_bd_intf_pin -mode Slave " + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) + ) + # instantiate and configure DWC + cmd.append( + "create_bd_cell -type ip " + "-vlnv xilinx.com:ip:axis_data_fifo:2.0 /%s/fifo" % node_name + ) + cmd.append( + "set_property -dict [list CONFIG.FIFO_DEPTH {%d}] " + "[get_bd_cells /%s/fifo]" % (depth, node_name) + ) + cmd.append( + "set_property -dict [list CONFIG.FIFO_MEMORY_TYPE {%s}] " + "[get_bd_cells /%s/fifo]" % 
(ram_style, node_name) + ) + cmd.append( + "set_property -dict [list CONFIG.TDATA_NUM_BYTES {%d}] " + "[get_bd_cells /%s/fifo]" % (np.ceil(self.get_outstream_width() / 8), node_name) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/fifo/M_AXIS] " + "[get_bd_intf_pins %s/%s]" % (node_name, node_name, dout_name) + ) + cmd.append( + "connect_bd_intf_net [get_bd_intf_pins %s/fifo/S_AXIS] " + "[get_bd_intf_pins %s/%s]" % (node_name, node_name, din_name) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] " + "[get_bd_pins %s/fifo/s_axis_aresetn]" % (node_name, rst_name, node_name) + ) + cmd.append( + "connect_bd_net [get_bd_pins %s/%s] " + "[get_bd_pins %s/fifo/s_axis_aclk]" % (node_name, clk_name, node_name) + ) + return cmd + else: + raise Exception( + "FIFO implementation style %s not supported, please use rtl or vivado" % impl_style + ) + + def prepare_rtlsim(self): + assert self.get_nodeattr("impl_style") != "vivado", ( + "StreamingFIFO impl_style " + "cannot be vivado for rtlsim. Only impl_style=rtl supported." 
+ ) + # Modified to use generated (System-)Verilog instead of HLS output products + + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + verilog_paths = [code_gen_dir] + verilog_files = [ + "Q_srl.v", + self.get_nodeattr("gen_top_module") + ".v", + ] + # build the Verilator emu library + sim = PyVerilator.build( + verilog_files, + build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), + verilog_path=verilog_paths, + trace_depth=get_rtlsim_trace_depth(), + top_module_name=self.get_verilog_top_module_name(), + ) + # save generated lib filename in attribute + self.set_nodeattr("rtlsim_so", sim.lib._name) + return sim diff --git a/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py new file mode 100644 index 0000000000..67b41d0165 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/rtl/thresholding_rtl.py @@ -0,0 +1,559 @@ +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import math +import numpy as np +import os +import shutil +from pyverilator.util.axi_utils import reset_rtlsim, rtlsim_multi_io +from qonnx.core.datatype import DataType +from qonnx.util.basic import roundup_to_integer_multiple + +from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend +from finn.custom_op.fpgadataflow.thresholding import Thresholding +from finn.util.basic import ( + get_memutil_alternatives, + get_rtlsim_trace_depth, + make_build_dir, + mem_primitives_versal, + pyverilate_get_liveness_threshold_cycles, +) +from finn.util.data_packing import ( + npy_to_rtlsim_input, + pack_innermost_dim_as_hex_string, + rtlsim_output_to_npy, +) + +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None + + +class Thresholding_rtl(Thresholding, RTLBackend): + """Class that corresponds to finn-rtllib 'thresholding' function.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = { + # memory depth triggers for threshold storage + "depth_trigger_uram": ("i", False, 0), + "depth_trigger_bram": ("i", False, 0), + # enable uniform thres optimization + # doesn't actually do 
anything yet, only + # for resource estimations + "uniform_thres": ("i", False, 0, {0, 1}), + # enable deep pipelining for easier timing closure + # setting to 0 may save some FFs but otherwise leave on + "deep_pipeline": ("i", False, 1, {0, 1}), + } + my_attrs.update(Thresholding.get_nodeattr_types(self)) + my_attrs.update(RTLBackend.get_nodeattr_types(self)) + return my_attrs + + def get_pe_mem_geometries(self): + """return a list of (bitwidth, depth) for PE memory configurations to be used + in resource estimation + + for each bitwidth, the depth is calculated as the + number of thresholds that can be stored in a single + memory block + the bitwidth is the bitwidth of the threshold values + the depth is the number of thresholds that can be stored + in a single memory block + the number of memory blocks is calculated as the number + of thresholds divided by the depth + the number of memory blocks is then multiplied by the + number of PEs to get the total number of memory blocks + required for the entire layer + """ + pe = self.get_nodeattr("PE") + wdt = self.get_weight_datatype() + wdt_bits = wdt.bitwidth() + odt = self.get_output_datatype() + odt_bits = odt.bitwidth() + t_channels = self.get_nodeattr("NumChannels") + cf = t_channels / pe + is_uniform = self.get_nodeattr("uniform_thres") + if is_uniform: + ret = [(odt_bits - x, cf * (2**x)) for x in range(1, odt_bits)] + else: + ret = [(wdt_bits, (cf) * 2**x) for x in range(odt_bits)] + return ret + + def get_memory_estimate(self): + """return the memory estimate for this node""" + res_dict = {} + depth_trigger_bram = self.get_nodeattr("depth_trigger_bram") + depth_trigger_uram = self.get_nodeattr("depth_trigger_uram") + pe = self.get_nodeattr("PE") + ret = self.get_pe_mem_geometries() + for mem_cfg in ret: + (width, depth) = mem_cfg + primitives = mem_primitives_versal + if depth_trigger_bram != 0 or depth_trigger_uram != 0: + if depth >= depth_trigger_bram and depth < depth_trigger_uram: + primitives = {k: v 
for (k, v) in mem_primitives_versal.items() if "BRAM" in k} + elif depth >= depth_trigger_uram: + primitives = {k: v for (k, v) in mem_primitives_versal.items() if "URAM" in k} + alts = get_memutil_alternatives(mem_cfg, primitives) + primary_alt = alts[0] + res_type = primary_alt[0].split("_")[0] + res_count, eff, waste = primary_alt[1] + res_dict[res_type] = res_dict.get(res_type, 0) + pe * res_count + return res_dict + + def bram_estimation(self): + """return the number of BRAMs required for this node""" + res_dict = self.get_memory_estimate() + return res_dict.get("BRAM", 0) + + def uram_estimation(self): + """return the number of URAMs required for this node""" + res_dict = self.get_memory_estimate() + return res_dict.get("URAM", 0) + + def lut_estimation(self): + """return the number of LUTs required for this node""" + res_dict = self.get_memory_estimate() + return res_dict.get("LUTRAM", 0) + + def get_all_meminit_filenames(self, abspath=False): + "Return a list of all .dat memory initializer files used for this node" + dat_files = [] + t_path = self.get_nodeattr("code_gen_dir_ipgen") if abspath else "." + pe = self.get_nodeattr("PE") + output_data_type = self.get_nodeattr("outputDataType") # output precision + o_bitwidth = DataType[output_data_type].bitwidth() + for stage in range(o_bitwidth): + for pe_value in range(pe): + thresh_file = t_path + "/%s_threshs_%s_%s.dat" % ( + self.onnx_node.name, + pe_value, + stage, + ) + dat_files.append(thresh_file) + return dat_files + + def prepare_codegen_rtl_values(self, model): + """All dictionary values produced in this function are to replace + their key value(s) in the RTL template files""" + code_gen_dict = {} + + # TODO check for sortedness and size here? 
+ thresholds = model.get_initializer(self.onnx_node.input[1]) + bias = self.get_nodeattr("ActVal") # activation bias value + output_data_type = self.get_nodeattr("outputDataType") # output precision + input_data_type = self.get_nodeattr("inputDataType") # input/threshold precision + o_bitwidth = DataType[output_data_type].bitwidth() + + # The RTL expects 2^N-1 thresholds, but narrow range quantization will result in + # one less threshold, prepending a dummy threshold and reducing bias by 1 to compensate. + expected_thresholds = 2**o_bitwidth - 1 + n_thres_steps = self.get_nodeattr("numSteps") + if expected_thresholds != n_thres_steps and DataType[input_data_type].signed() is not True: + min_val = np.amin(thresholds, axis=1) + thresholds = np.insert(thresholds, 0, min_val, axis=1) + bias = bias - 1 + + # add dummy dimension as final dimension (that's what gets packed with next call) + thresholds = np.expand_dims(thresholds, axis=-1) + wdt = self.get_weight_datatype() + bw_hexdigit = roundup_to_integer_multiple(wdt.bitwidth(), 4) + t_packed = pack_innermost_dim_as_hex_string( + thresholds, + wdt, + bw_hexdigit, + prefix="", + ) + + t_path = self.get_nodeattr("code_gen_dir_ipgen") + pe = self.get_nodeattr("PE") + num_channels = self.get_nodeattr("NumChannels") # number of channels + + # If a single threshold value is found, broadcast the value + expected_shape = (num_channels, n_thres_steps) + if t_packed.shape == (1, 1): + t_packed = np.broadcast_to(t_packed, expected_shape) + + channel_fold = int(num_channels / pe) + + for stage in range(o_bitwidth): + sn = o_bitwidth - stage - 1 + for pe_value in range(pe): + thresh_file = t_path + "/%s_threshs_%s_%s.dat" % ( + self.onnx_node.name, + pe_value, + stage, + ) + threshs = np.zeros([channel_fold * (2**stage)], dtype="object") + for ch in range(channel_fold): + for i in range(2**stage): + threshs[(ch << stage) + i] = t_packed[ch * pe + pe_value][ + (i << (o_bitwidth - stage)) + 2**sn - 1 + ] + with open(thresh_file, 
"w") as f: + for val in threshs: + f.write(val + "\n") + code_gen_dict["$THRESHOLDS_PATH$"] = ['"./%s_"' % self.onnx_node.name] + + # Identify the module name + code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [ + self.get_verilog_top_module_name() + "_axi_wrapper" + ] + # Set the top module name - AXI wrapper + code_gen_dict["$TOP_MODULE$"] = code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] + + # Identify the module variables + i_bitwidth = DataType[input_data_type].bitwidth() + + code_gen_dict["$N$"] = [str(o_bitwidth)] # output precision - convert bitwidth to string + code_gen_dict["$M$"] = [ + str(i_bitwidth) + ] # input/threshold precision - convert bitwidth to string + code_gen_dict["$C$"] = [str(num_channels)] # number of channels + code_gen_dict["$BIAS$"] = [str(bias)] # activation bias value + code_gen_dict["$PE$"] = [str(pe)] # requires C = M*PE + + # Is the input datatype signed or unsigned? + # The thresholding core needs to know this when comparing weights to inputs + if self.get_input_datatype().signed(): + code_gen_dict["$SIGNED$"] = [str(1)] + else: + code_gen_dict["$SIGNED$"] = [str(0)] + + if bias >= 0: + o_bits = math.ceil(math.log2(2**o_bitwidth + bias)) + else: + o_bits = 1 + math.ceil( + math.log2(-bias if -bias >= 2 ** (o_bitwidth - 1) else 2**o_bitwidth + bias) + ) + + code_gen_dict["$O_BITS$"] = [str(int(o_bits))] + + rt_weights = self.get_nodeattr("runtime_writeable_weights") + code_gen_dict["$USE_AXILITE$"] = [str(rt_weights)] + + depth_trigger_uram = self.get_nodeattr("depth_trigger_uram") + depth_trigger_bram = self.get_nodeattr("depth_trigger_bram") + deep_pipeline = self.get_nodeattr("deep_pipeline") + code_gen_dict["$DEPTH_TRIGGER_URAM$"] = [str(depth_trigger_uram)] + code_gen_dict["$DEPTH_TRIGGER_BRAM$"] = [str(depth_trigger_bram)] + code_gen_dict["$DEEP_PIPELINE$"] = [str(deep_pipeline)] + return code_gen_dict + + def get_rtl_file_list(self): + """Thresholding binary search RTL file list""" + return [ + "axilite_if.v", + "thresholding.sv", + 
"thresholding_axi.sv", + "thresholding_template_wrapper.v", + ] + + def get_rtl_file_paths(self): + """Get full path of all RTL files""" + rtl_root_dir = os.environ["FINN_ROOT"] + "/finn-rtllib/thresholding/hdl/" + rtl_file_list = self.get_rtl_file_list() + rtl_file_paths = [rtl_root_dir + file for file in rtl_file_list] + return rtl_file_paths + + def get_rtl_template_data(self, path): + """Return RTL file contents as a template""" + with open(path, "r") as f: + template = f.read() + return template + + def fill_in_rtl_template_data(self, replace_dict, template_data): + """Use attribute values to finn in RTL template placeholders""" + template_data_cp = template_data + for key in replace_dict: + replacement_line = "\n".join(replace_dict[key]) + template_data_cp = template_data_cp.replace(key, replacement_line) + return template_data_cp + + def dump_rtl_data(self, dest_dir, filename, data): + """Dump filled-in-template RTL files for future synthesis step""" + # when generating template files, handle a special case: + # if the filename contains the word "template", replace that + # with the node name to distinguish between instances + if "template" in filename: + filename = self.get_nodeattr("gen_top_module") + ".v" + with open(os.path.join(dest_dir, filename), "w") as f: + f.write(data) + return + + def generate_hdl(self, model, fpgapart, clk): + """Prepare HDL files from templates for synthesis""" + # Generate a dictionary of values to put in RTL template + code_gen_dict = self.prepare_codegen_rtl_values(model) + + # Retrieve the destination directory for the final RTL files + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + + # Set the 'gen_top_module' attribute for use later + # by PyVerilator and IPI generation + self.set_nodeattr("gen_top_module", code_gen_dict["$TOP_MODULE$"][0]) + + weights = model.get_initializer(self.onnx_node.input[1]) + weights_fname = f"{code_gen_dir}/memblock.dat" + self.make_weight_file(weights, "decoupled", weights_fname) + + 
for rtl_file_path in self.get_rtl_file_paths(): + # read in original RTL template file + template_data = self.get_rtl_template_data(rtl_file_path) + # apply code generation to templates + data = self.fill_in_rtl_template_data(code_gen_dict, template_data) + # dump filled-in template to destination directory for compilation + file_only_path = rtl_file_path.split("/")[-1] + self.dump_rtl_data(code_gen_dir, file_only_path, data) + + # set ipgen_path and ip_path so that HLS-Synth transformation + # and stich_ip transformation do not complain + # i.e. during the HLSSynthIP() transformation + self.set_nodeattr("ipgen_path", code_gen_dir) + self.set_nodeattr("ip_path", code_gen_dir) + return + + def prepare_rtlsim(self): + """Creates a Verilator emulation library for the RTL code generated + for this node, sets the rtlsim_so attribute to its path and returns + a PyVerilator wrapper around it.""" + + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + verilog_paths = [code_gen_dir] + verilog_files = [ + x.replace("thresholding_template_wrapper", self.get_nodeattr("gen_top_module")) + for x in self.get_rtl_file_list() + ] + dat_files = self.get_all_meminit_filenames(abspath=True) + single_src_dir = make_build_dir("pyverilator_" + self.onnx_node.name + "_") + for dat_file in dat_files: + shutil.copy(dat_file, single_src_dir) + + # build the Verilator emulation library + sim = PyVerilator.build( + verilog_files, + build_dir=single_src_dir, + verilog_path=verilog_paths, + trace_depth=get_rtlsim_trace_depth(), + top_module_name=self.get_nodeattr("gen_top_module"), + auto_eval=False, + ) + + # save generated lib filename in attribute + self.set_nodeattr("rtlsim_so", sim.lib._name) + return sim + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + if mode == "cppsim": + 
Thresholding.execute_node(self, context, graph) + elif mode == "rtlsim": + node = self.onnx_node + # create a npy file fore each input of the node (in_ind is input index) + in_ind = 0 + for inputs in node.input: + # it is assumed that the first input of the node is the data input + # the second input are the thresholds + if in_ind == 0: + assert ( + str(context[inputs].dtype) == "float32" + ), """Input datatype is + not float32 as expected.""" + expected_inp_shape = self.get_folded_input_shape() + reshaped_input = context[inputs].reshape(expected_inp_shape) + + if self.get_input_datatype() == DataType["BIPOLAR"]: + # store bipolar activations as binary + reshaped_input = (reshaped_input + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = self.get_input_datatype() + + # make copy before saving the array + reshaped_input = reshaped_input.copy() + np.save( + os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), + reshaped_input, + ) + elif in_ind > 2: + raise Exception("Unexpected input found for Thresholding_rtl") + in_ind += 1 + + # Create a PyVerilator wrapper of the RTLSim .so + sim = self.get_rtlsim() + nbits = self.get_instream_width() + inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) + io_names = self.get_verilog_top_module_intf_names() + istream_name = io_names["s_axis"][0][0] + ostream_name = io_names["m_axis"][0][0] + io_dict = { + "inputs": {istream_name: inp}, + "outputs": {ostream_name: []}, + } + + trace_file = self.get_nodeattr("rtlsim_trace") + if trace_file == "default": + trace_file = self.onnx_node.name + ".vcd" + sname = "_" + + # Change into so directory to ensure threshold files can be found + rtlsim_so = self.get_nodeattr("rtlsim_so") + so_dir = os.path.dirname(os.path.realpath(rtlsim_so)) + olcwd = os.getcwd() + os.chdir(so_dir) + num_out_values = self.get_number_output_values() + reset_rtlsim(sim) + total_cycle_count = rtlsim_multi_io( + sim, + io_dict, + num_out_values, + 
trace_file=trace_file, + sname=sname, + liveness_threshold=pyverilate_get_liveness_threshold_cycles(), + ) + self.set_nodeattr("cycles_rtlsim", total_cycle_count) + os.chdir(olcwd) + output = io_dict["outputs"][ostream_name] + + # Manage output data + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + + rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) + + # load and reshape output + output = np.load(out_npy_path) + oshape = self.get_normal_output_shape() + output = np.asarray([output], dtype=np.float32).reshape(*oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + def code_generation_ipi(self): + """Constructs and returns the TCL commands for node instantiation as an RTL + block.""" + rtl_file_list = [ + x.replace("thresholding_template_wrapper", self.get_nodeattr("gen_top_module")) + for x in self.get_rtl_file_list() + ] + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + source_target = "./ip/verilog/rtl_ops/%s" % self.onnx_node.name + cmd = ["file mkdir %s" % source_target] + + for rtl_file in rtl_file_list: + cmd.append( + "add_files -copy_to %s -norecurse %s" + % (source_target, os.path.join(code_gen_dir, rtl_file)) + ) + + # Create an RTL block, not an IP core (-type ip) + cmd.append( + "create_bd_cell -type module -reference %s %s" + % (self.get_nodeattr("gen_top_module"), self.onnx_node.name) + ) + + return cmd + + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + if self.get_nodeattr("runtime_writeable_weights") == 1: + intf_names["axilite"] = ["s_axilite"] + + return intf_names + + def make_weight_file(self, weights, 
weight_file_mode, weight_file_name): + """Produce a file containing given weights (thresholds) in appropriate + format for this layer. This file can be used for either synthesis or + run-time reconfig of weights. + + Arguments: + + * weights : numpy array with weights to be put into the file + * weight_file_name : filename for the weight file to be generated + + """ + threshold_tensor = self.get_hw_compatible_threshold_tensor(weights) + tdt = self.get_weight_datatype() + assert np.vectorize(tdt.allowed)( + threshold_tensor + ).all(), "Thresholds can't be expressed with type %s" % str(tdt) + + pe = self.get_nodeattr("PE") + ch = self.get_nodeattr("NumChannels") + n_thres_steps = self.get_nodeattr("numSteps") + + # If a single threshold value is found, broadcast the value + n_thres_steps = self.get_nodeattr("numSteps") + expected_shape = (ch, n_thres_steps) + if weights.shape == (1, 1): + weights = np.broadcast_to(weights, expected_shape) + + width_padded = roundup_to_integer_multiple(weights.shape[1], 4) + weight_padded = np.zeros((weights.shape[0], width_padded)) + weight_padded[: weights.shape[0], :n_thres_steps] = weights + weight_stream = [] + wdt = self.get_weight_datatype() + bw_hexdigit = roundup_to_integer_multiple(wdt.bitwidth(), 32) + padding = np.zeros(width_padded, dtype=np.int32) + + chan_ind = 0 + cf = ch // pe + for fold in range(cf): + for c in range(2 ** (pe - 1).bit_length()): + if (c == 0 or c % pe != 0) and c < pe: + for w in weight_padded[chan_ind]: + w_packed = pack_innermost_dim_as_hex_string( + [w], wdt, bw_hexdigit, prefix="" + ).item() + weight_stream.append(w_packed) + chan_ind += 1 + else: + for z in padding: + w_packed = pack_innermost_dim_as_hex_string( + [z], wdt, bw_hexdigit, prefix="" + ).item() + weight_stream.append(w_packed) + with open(weight_file_name, "w") as f: + for val in weight_stream: + f.write(val + "\n") diff --git a/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py 
b/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py new file mode 100644 index 0000000000..27fc9f10a1 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/rtl/vectorvectoractivation_rtl.py @@ -0,0 +1,286 @@ +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import numpy as np +import os +from pyverilator.util.axi_utils import reset_rtlsim, toggle_clk +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.rtlbackend import RTLBackend +from finn.custom_op.fpgadataflow.vectorvectoractivation import VVAU +from finn.util.basic import get_rtlsim_trace_depth, make_build_dir +from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy +from finn.util.fpgadataflow import is_versal + +try: + from pyverilator import PyVerilator +except ModuleNotFoundError: + PyVerilator = None + + +class VVAU_rtl(VVAU, RTLBackend): + """Class that corresponds to finn-rtl Vector Vector Unit.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = {} + my_attrs.update(VVAU.get_nodeattr_types(self)) + my_attrs.update(RTLBackend.get_nodeattr_types(self)) + return my_attrs + + def execute_node(self, context, graph): + mode = self.get_nodeattr("exec_mode") + mem_mode = self.get_nodeattr("mem_mode") + node = self.onnx_node + + if mode == "cppsim": + VVAU.execute_node(self, context, graph) + elif mode == "rtlsim": + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + # create a npy file fore each input of the node (in_ind is input index) + in_ind = 0 + for inputs in node.input: + # it is assumed that the first input of the node is the data input + # the second input are the weights + # the third input are the thresholds + if in_ind == 0: + assert ( + str(context[inputs].dtype) == "float32" + ), """Input datatype is + not float32 as expected.""" + expected_inp_shape = self.get_folded_input_shape() + reshaped_input = context[inputs].reshape(expected_inp_shape) + if self.get_input_datatype() == DataType["BIPOLAR"]: + # store bipolar activations as binary + reshaped_input = (reshaped_input + 1) / 2 + export_idt = DataType["BINARY"] + else: + export_idt = self.get_input_datatype() + # make copy before saving the array + 
reshaped_input = reshaped_input.copy() + np.save( + os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), + reshaped_input, + ) + elif in_ind > 2: + raise Exception("Unexpected input found for VectorVectorActivation") + in_ind += 1 + + sim = self.get_rtlsim() + nbits = self.get_instream_width() + inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), export_idt, nbits) + reset_rtlsim(sim) + toggle_clk(sim) + + if mem_mode in ["external", "internal_decoupled"]: + wnbits = self.get_weightstream_width() + export_wdt = self.get_weight_datatype() + # we have converted bipolar weights to binary for export, + # so use it as such for weight generation + if self.get_weight_datatype() == DataType["BIPOLAR"]: + export_wdt = DataType["BINARY"] + wei = npy_to_rtlsim_input( + "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits + ) + dim_h, dim_w = self.get_nodeattr("Dim") + num_w_reps = dim_h * dim_w + + io_dict = { + "inputs": {"in0": inp, "weights": wei * num_w_reps}, + "outputs": {"out": []}, + } + self.rtlsim_multi_io(sim, io_dict) + output = io_dict["outputs"]["out"] + else: + output = self.rtlsim(sim, inp) + odt = self.get_output_datatype() + target_bits = odt.bitwidth() + packed_bits = self.get_outstream_width() + out_npy_path = "{}/output.npy".format(code_gen_dir) + out_shape = self.get_folded_output_shape() + rtlsim_output_to_npy(output, out_npy_path, odt, out_shape, packed_bits, target_bits) + + # load and reshape output + output = np.load(out_npy_path) + oshape = self.get_normal_output_shape() + output = np.asarray([output], dtype=np.float32).reshape(*oshape) + context[node.output[0]] = output + else: + raise Exception( + """Invalid value for attribute exec_mode! 
Is currently set to: {} + has to be set to one of the following value ("cppsim", "rtlsim")""".format( + mode + ) + ) + + def lut_estimation(self): + return 0 + + def dsp_estimation(self): + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + return int(P * np.ceil(Q / 3)) + + def instantiate_ip(self, cmd): + # instantiate the RTL IP + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + rtllib_dir = os.path.join(os.environ["FINN_ROOT"], "finn-rtllib/mvu/") + sourcefiles = [ + os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"), + rtllib_dir + "mvu_vvu_axi.sv", + rtllib_dir + "replay_buffer.sv", + rtllib_dir + "mvu_4sx4u.sv", + rtllib_dir + "mvu_vvu_8sx9_dsp58.sv", + rtllib_dir + "mvu_8sx8u_dsp48.sv", + ] + for f in sourcefiles: + cmd.append("add_files -norecurse %s" % (f)) + cmd.append( + "create_bd_cell -type hier -reference %s /%s/%s" + % ( + self.get_nodeattr("gen_top_module"), + self.onnx_node.name, + self.onnx_node.name, + ) + ) + + def generate_hdl(self, model, fpgapart, clk): + # Generate params as part of IP preparation + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + self.generate_params(model, code_gen_dir) + + template_path, code_gen_dict = self.prepare_codegen_default(fpgapart, clk) + # add general parameters to dictionary + code_gen_dict["$MODULE_NAME_AXI_WRAPPER$"] = [self.get_verilog_top_module_name()] + # save top module name so we can refer to it after this node has been renamed + # (e.g. 
by GiveUniqueNodeNames(prefix) during MakeZynqProject) + self.set_nodeattr("gen_top_module", self.get_verilog_top_module_name()) + + # apply code generation to template + with open(template_path, "r") as f: + template_wrapper = f.read() + for key in code_gen_dict: + # transform list into long string separated by '\n' + code_gen_line = "\n".join(code_gen_dict[key]) + template_wrapper = template_wrapper.replace(key, code_gen_line) + with open( + os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper.v"), + "w", + ) as f: + f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(0))) + with open( + os.path.join(code_gen_dir, self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"), + "w", + ) as f: + f.write(template_wrapper.replace("$FORCE_BEHAVIORAL$", str(1))) + + # set ipgen_path and ip_path so that HLS-Synth transformation + # and stich_ip transformation do not complain + self.set_nodeattr("ipgen_path", code_gen_dir) + self.set_nodeattr("ip_path", code_gen_dir) + + def _resolve_segment_len(self, clk): + # Insert pipeline registers in the DSP58 chain to meet target clock frequency + # ~0.741 ns seems the worst-case delay through first DSP + # ~0.605 ns seems to be (on average) delay for all subsequent DSPs + # clk >= (critical_path_dsps - 1) * 0.605 + 0.741 + assert ( + clk > 0.741 + ), """Infeasible clk target of {} ns has been set, + consider lowering the targeted clock frequency!""".format( + clk + ) + critical_path_dsps = np.floor((clk - 0.741) / 0.605 + 1) + max_chain_len = np.ceil(self.get_nodeattr("SIMD") / 3) + dsp_chain_len = critical_path_dsps if critical_path_dsps < max_chain_len else max_chain_len + return dsp_chain_len + + def _resolve_impl_style(self, fpgapart): + # Based on target device and activation/weight-width, choose the + # supported RTL compute core + assert ( + self.get_nodeattr("resType") != "lut" + ), """LUT-based RTL-VVU implementation currently not supported! 
+ Please change resType for {} to 'dsp' or consider switching to HLS-based VVAU!""".format( + self.onnx_node.name + ) + is_versal_family = is_versal(fpgapart) + assert ( + is_versal_family + ), "DSP-based (RTL) VVU currently only supported on Versal (DSP58) devices" + + return "mvu_vvu_8sx9_dsp58" + + def prepare_codegen_default(self, fpgapart, clk): + template_path = os.environ["FINN_ROOT"] + "/finn-rtllib/mvu/mvu_vvu_axi_wrapper.v" + + code_gen_dict = {} + code_gen_dict["$IS_MVU$"] = [str(0)] + code_gen_dict["$COMPUTE_CORE$"] = [self._resolve_impl_style(fpgapart)] + mw = int(np.prod(self.get_nodeattr("Kernel"))) + code_gen_dict["$MW$"] = [str(mw)] + code_gen_dict["$MH$"] = [str(self.get_nodeattr("Channels"))] + code_gen_dict["$PE$"] = [str(self.get_nodeattr("PE"))] + code_gen_dict["$SIMD$"] = [str(self.get_nodeattr("SIMD"))] + code_gen_dict["$ACTIVATION_WIDTH$"] = [str(self.get_input_datatype(0).bitwidth())] + code_gen_dict["$WEIGHT_WIDTH$"] = [str(self.get_input_datatype(1).bitwidth())] + code_gen_dict["$ACCU_WIDTH$"] = [str(self.get_output_datatype().bitwidth())] + code_gen_dict["$SIGNED_ACTIVATIONS$"] = ( + [str(1)] if (self.get_input_datatype(0).min() < 0) else [str(0)] + ) + code_gen_dict["$SEGMENTLEN$"] = [str(self._resolve_segment_len(clk))] + + return template_path, code_gen_dict + + def prepare_rtlsim(self): + """Creates a Verilator emulation library for the RTL code generated + for this node, sets the rtlsim_so attribute to its path and returns + a PyVerilator wrapper around it.""" + + if PyVerilator is None: + raise ImportError("Installation of PyVerilator is required.") + + code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") + # Path to (System-)Verilog files used by top-module & path to top-module + verilog_paths = [code_gen_dir, os.environ["FINN_ROOT"] + "/finn-rtllib/mvu"] + verilog_files = [self.get_nodeattr("gen_top_module") + "_wrapper_sim.v"] + + # build the Verilator emu library + sim = PyVerilator.build( + verilog_files, + 
build_dir=make_build_dir("pyverilator_" + self.onnx_node.name + "_"), + verilog_path=verilog_paths, + trace_depth=get_rtlsim_trace_depth(), + top_module_name=self.get_verilog_top_module_name(), + ) + # save generated lib filename in attribute + self.set_nodeattr("rtlsim_so", sim.lib._name) + + return sim diff --git a/tests/end2end/test_end2end_access_board.py b/src/finn/custom_op/fpgadataflow/rtlbackend.py similarity index 56% rename from tests/end2end/test_end2end_access_board.py rename to src/finn/custom_op/fpgadataflow/rtlbackend.py index ba3c49195b..2e4d647b22 100644 --- a/tests/end2end/test_end2end_access_board.py +++ b/src/finn/custom_op/fpgadataflow/rtlbackend.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -26,31 +26,39 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import pytest - -import subprocess - -from finn.util.test import get_build_env - - -@pytest.mark.board -@pytest.mark.end2end -def test_end2end_access_board(): - build_env = get_build_env("zynq", 5) - if build_env["ip"] == "": - pytest.skip("PYNQ board IP address not specified") - remote_cmd_base = [ - "ssh", - "-o", - "PreferredAuthentications=publickey", - "-o", - "PasswordAuthentication=no", - "%s@%s" % (build_env["username"], build_env["ip"]), - ] - test_text = "BoardIsAccessible" - touch_cmd = remote_cmd_base + ["echo %s" % test_text] - verif_res = subprocess.run( - touch_cmd, stdout=subprocess.PIPE, universal_newlines=True - ) - assert verif_res.returncode == 0 - assert verif_res.stdout.split("\n")[0] == test_text +from abc import ABC, abstractmethod + + +class RTLBackend(ABC): + """RTLBackend class all custom ops that correspond to a module in finn-rtllib + are using functionality of. 
Contains different functions every RTL + custom node should have. Some as abstract methods, these have to be filled + when writing a new RTL custom op node.""" + + def get_nodeattr_types(self): + return { + # attribute to save top module name - not user configurable + "gen_top_module": ("s", False, ""), + } + + @abstractmethod + def generate_hdl(self, model, fpgapart, clk): + pass + + @abstractmethod + def prepare_rtlsim(self): + pass + + @abstractmethod + def code_generation_ipi(self): + pass + + def code_generation_ipgen(self, model, fpgapart, clk): + self.generate_hdl(model, fpgapart, clk) + + # TODO: Implement alternative + def hls_sname(self): + """Get the naming convention used by Vitis HLS for stream signals + Example: the TDATA for a stream called "out" would be out_V_TDATA. + """ + return "V" diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter.py new file mode 100644 index 0000000000..4921caeb00 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter.py @@ -0,0 +1,216 @@ +# Copyright (C) 2023-2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import math +import numpy as np +import warnings +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + +# does not do anything at the ONNX node-by-node level, and input-output +# tensor shapes are the same. 
performs data width conversion at the rtlsim level + + +class StreamingDataWidthConverter(HWCustomOp): + """Abstraction layer for HW implementation of StreamingDataWidthConverter""" + + def get_nodeattr_types(self): + my_attrs = { + # shape of input/output tensors + "shape": ("ints", True, []), + # bit width of input and output streams + "inWidth": ("i", True, 0), + "outWidth": ("i", True, 0), + # FINN DataTypes for inputs/outputs + "dataType": ("s", True, ""), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("dataType")] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + return DataType[self.get_nodeattr("dataType")] + + def get_normal_input_shape(self, ind=0): + ishape = self.get_nodeattr("shape") + return ishape + + def get_normal_output_shape(self, ind=0): + oshape = self.get_nodeattr("shape") + return oshape + + def get_iowidth_lcm(self): + iwidth = self.get_nodeattr("inWidth") + owidth = self.get_nodeattr("outWidth") + return int(np.lcm(iwidth, owidth)) + + def needs_lcm(self): + iwidth = self.get_nodeattr("inWidth") + owidth = self.get_nodeattr("outWidth") + maxwidth = max(iwidth, owidth) + minwidth = min(iwidth, owidth) + return maxwidth % minwidth != 0 + + def check_divisible_iowidths(self): + pass + + def get_folded_input_shape(self, ind=0): + self.check_divisible_iowidths() + iwidth = self.get_nodeattr("inWidth") + ishape = self.get_normal_input_shape() + dummy_t = np.random.randn(*ishape) + ibits = self.get_input_datatype().bitwidth() + assert ( + iwidth % ibits == 0 + ), """DWC input width must be divisible by + input element bitwidth""" + ielems = int(iwidth // ibits) + ichannels = ishape[-1] + new_shape = [] + for i in ishape[:-1]: + new_shape.append(i) + new_shape.append(int(ichannels // ielems)) + new_shape.append(ielems) + dummy_t = dummy_t.reshape(new_shape) + return 
dummy_t.shape + + def get_folded_output_shape(self, ind=0): + self.check_divisible_iowidths() + owidth = self.get_nodeattr("outWidth") + oshape = self.get_normal_output_shape() + dummy_t = np.random.randn(*oshape) + obits = self.get_output_datatype().bitwidth() + assert ( + owidth % obits == 0 + ), """DWC output width must be divisible by + input element bitwidth""" + oelems = int(owidth // obits) + ochannels = oshape[-1] + new_shape = [] + for i in oshape[:-1]: + new_shape.append(i) + new_shape.append(int(ochannels // oelems)) + new_shape.append(oelems) + dummy_t = dummy_t.reshape(new_shape) + + return dummy_t.shape + + def get_number_output_values(self): + folded_oshape = self.get_folded_output_shape() + return np.prod(folded_oshape[:-1]) + + def get_instream_width(self, ind=0): + in_width = self.get_nodeattr("inWidth") + return in_width + + def get_outstream_width(self, ind=0): + out_width = self.get_nodeattr("outWidth") + return out_width + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == tuple(exp_ishape), "Unexpect input shape for StreamingDWC." 
+ return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype()), + str(idt), + ) + warnings.warn(warn_str) + self.set_nodeattr("dataType", idt.name) + # data type stays the same + model.set_tensor_datatype(node.output[0], idt) + + def verify_node(self): + info_messages = [] + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify the number of inputs + if len(self.onnx_node.input) == 1: + info_messages.append("The number of inputs is correct") + else: + info_messages.append("""StreamingDWC needs 1 data input""") + + return info_messages + + def execute_node(self, context, graph): + node = self.onnx_node + exp_shape = self.get_normal_input_shape() + inp = context[node.input[0]] + assert str(inp.dtype) == "float32", "Input datatype is not float32" + assert inp.shape == tuple(exp_shape), "Input shape does not match expected shape." 
+ + output = inp + output = np.asarray([output], dtype=np.float32).reshape(*exp_shape) + context[node.output[0]] = output + + def lut_estimation(self): + """Calculates resource estimations for LUTs""" + inw = self.get_instream_width() + outw = self.get_outstream_width() + + minw = min(inw, outw) + maxw = max(inw, outw) + + # sometimes widths aren't directly divisible + # this requires going up from input width to least common multiple + # then down to output width + intw = abs(maxw * minw) // math.gcd(maxw, minw) + + # we assume a shift-based implementation + # even if we don't use LUTs explicitly, we make some unavailable + # to other logic because they're tied into the DWC control sets + + cnt_luts = 0 + cset_luts = 0 + + if inw != intw: + cnt_luts += abs(math.ceil(math.log(inw / intw, 2))) + cset_luts += intw + if intw != outw: + cnt_luts += abs(math.ceil(math.log(intw / outw, 2))) + cset_luts += outw + + return int(cnt_luts + cset_luts) diff --git a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py b/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py deleted file mode 100644 index a80d2bbefa..0000000000 --- a/src/finn/custom_op/fpgadataflow/streamingdatawidthconverter_batch.py +++ /dev/null @@ -1,532 +0,0 @@ -# Copyright (c) 2020, Xilinx -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. 
-# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import math -import numpy as np -import os -import warnings -from qonnx.core.datatype import DataType - -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy - -# does not do anything at the ONNX node-by-node level, and input-output -# tensor shapes are the same. 
performs data width conversion at the rtlsim level - - -class StreamingDataWidthConverter_Batch(HLSCustomOp): - """Class that corresponds to finn-hlslib StreamingDataWidthConverter_Batch - function.""" - - def get_nodeattr_types(self): - my_attrs = { - # shape of input/output tensors - "shape": ("ints", True, []), - # bit width of input and output streams - "inWidth": ("i", True, 0), - "outWidth": ("i", True, 0), - # FINN DataTypes for inputs/outputs - "dataType": ("s", True, ""), - # Toggle between hls or IPI implementation - # hls - use the hls generated IP during stitching - # vivado - use the AXI Infrastructure DWC - "impl_style": ("s", False, "hls", {"hls", "vivado"}), - } - my_attrs.update(super().get_nodeattr_types()) - return my_attrs - - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("dataType")] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - return DataType[self.get_nodeattr("dataType")] - - def get_normal_input_shape(self, ind=0): - ishape = self.get_nodeattr("shape") - return ishape - - def get_normal_output_shape(self, ind=0): - oshape = self.get_nodeattr("shape") - return oshape - - def check_divisible_iowidths(self): - impl_style = self.get_nodeattr("impl_style") - iwidth = self.get_nodeattr("inWidth") - owidth = self.get_nodeattr("outWidth") - if impl_style == "vivado": - # the AXIS IP we use in vivado mode only supports - # stream widths that are divisible by 8 - iwidth_d8 = iwidth % 8 == 0 - owidth_d8 = owidth % 8 == 0 - assert ( - iwidth_d8 and owidth_d8 - ), """DWC impl_style=vivado requires - stream widths that are divisible by 8: (%d, %d)""" % ( - iwidth, - owidth, - ) - - def get_iowidth_lcm(self): - iwidth = self.get_nodeattr("inWidth") - owidth = self.get_nodeattr("outWidth") - return int(np.lcm(iwidth, owidth)) - - def needs_lcm(self): - iwidth = self.get_nodeattr("inWidth") - owidth = self.get_nodeattr("outWidth") - maxwidth = 
max(iwidth, owidth) - minwidth = min(iwidth, owidth) - impl_style = self.get_nodeattr("impl_style") - return (impl_style == "hls") and (maxwidth % minwidth != 0) - - def get_folded_input_shape(self, ind=0): - self.check_divisible_iowidths() - iwidth = self.get_nodeattr("inWidth") - ishape = self.get_normal_input_shape() - dummy_t = np.random.randn(*ishape) - ibits = self.get_input_datatype().bitwidth() - assert ( - iwidth % ibits == 0 - ), """DWC input width must be divisible by - input element bitwidth""" - ielems = int(iwidth // ibits) - ichannels = ishape[-1] - new_shape = [] - for i in ishape[:-1]: - new_shape.append(i) - new_shape.append(int(ichannels // ielems)) - new_shape.append(ielems) - dummy_t = dummy_t.reshape(new_shape) - return dummy_t.shape - - def get_folded_output_shape(self, ind=0): - self.check_divisible_iowidths() - owidth = self.get_nodeattr("outWidth") - oshape = self.get_normal_output_shape() - dummy_t = np.random.randn(*oshape) - obits = self.get_output_datatype().bitwidth() - assert ( - owidth % obits == 0 - ), """DWC output width must be divisible by - input element bitwidth""" - oelems = int(owidth // obits) - ochannels = oshape[-1] - new_shape = [] - for i in oshape[:-1]: - new_shape.append(i) - new_shape.append(int(ochannels // oelems)) - new_shape.append(oelems) - dummy_t = dummy_t.reshape(new_shape) - - return dummy_t.shape - - def get_number_output_values(self): - folded_oshape = self.get_folded_output_shape() - return np.prod(folded_oshape[:-1]) - - def get_instream_width(self, ind=0): - in_width = self.get_nodeattr("inWidth") - return in_width - - def get_outstream_width(self, ind=0): - out_width = self.get_nodeattr("outWidth") - return out_width - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - oshape = self.get_normal_output_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == tuple(exp_ishape), "Unexpect input shape for StreamingDWC." 
- return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - idt = model.get_tensor_datatype(node.input[0]) - if idt != self.get_input_datatype(): - warn_str = "inputDataType changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype()), - str(idt), - ) - warnings.warn(warn_str) - self.set_nodeattr("dataType", idt.name) - # data type stays the same - model.set_tensor_datatype(node.output[0], idt) - - def verify_node(self): - info_messages = [] - # verify that "backend" is set to "fpgadataflow" - backend_value = self.get_nodeattr("backend") - if backend_value == "fpgadataflow": - info_messages.append("Attribute backend is set correctly") - else: - info_messages.append('Attribute backend should be set to "fpgadataflow"') - - # verify the number of inputs - if len(self.onnx_node.input) == 1: - info_messages.append("The number of inputs is correct") - else: - info_messages.append("""StreamingDWC needs 1 data input""") - - return info_messages - - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "streamtools.h"'] - - def defines(self, var): - numReps = 1 - numInWords = int(np.prod(self.get_folded_input_shape()[:-1])) - inWidth = self.get_nodeattr("inWidth") - outWidth = self.get_nodeattr("outWidth") - self.code_gen_dict["$DEFINES$"] = [ - "#define InWidth %d " % inWidth, - "#define OutWidth %d " % outWidth, - "#define NumInWords %d " % numInWords, - "#define numReps %d" % numReps, - ] - if self.needs_lcm(): - lcmWidth = self.get_iowidth_lcm() - assert ( - numInWords % (lcmWidth / inWidth) == 0 - ), "Error in DWC LCM calculation" - numLCMToOut = numInWords // (lcmWidth / inWidth) - self.code_gen_dict["$DEFINES$"].append("#define LCMWidth %d" % lcmWidth) - self.code_gen_dict["$DEFINES$"].append( - "#define NumLCMToOut %d" % (numLCMToOut) - ) - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - if dtype == 
DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) - ) - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0 ("in0");'.format(self.get_instream_width()) - ) - if self.needs_lcm(): - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> intermediate ("intermediate");'.format( - self.get_iowidth_lcm() - ) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) - ) - - def docompute(self): - # TODO continue with fxns below, they are copy-pasted - op = "StreamingDataWidthConverter_Batch" - if self.needs_lcm(): - self.code_gen_dict["$DOCOMPUTE$"] = [ - 'hls::stream> intermediate ("intermediate");'.format( - self.get_iowidth_lcm() - ), - "%s(in0, intermediate, numReps);" % (op), - "%s(intermediate, out, numReps);" - % (op), - ] - else: - self.code_gen_dict["$DOCOMPUTE$"] = [ - "%s(in0, out, numReps);" % (op) - ] - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", 
"{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - oshape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - in_packed_bits = self.get_instream_width() - in_packed_hls_type = "ap_uint<%d>" % in_packed_bits - out_packed_bits = self.get_outstream_width() - out_packed_hls_type = "ap_uint<%d>" % out_packed_bits - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)" - % (self.onnx_node.name, in_packed_hls_type, out_packed_hls_type) - ] - - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" - ) - if self.needs_lcm(): - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS DATAFLOW disable_start_propagation" - ) - - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") - impl_style = self.get_nodeattr("impl_style") - node = self.onnx_node - exp_shape = self.get_normal_input_shape() - folded_ishape = self.get_folded_input_shape() - - # TODO ensure codegen dir exists - if mode == "cppsim": - assert impl_style == "hls", "DWC cppsim only possible when impl_style==hls" - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - assert impl_style == "hls", "DWC rtlsim only possible when impl_style==hls" - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - inp = context[node.input[0]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert inp.shape == tuple( - exp_shape - ), "Input shape does not match expected shape." - - if self.get_input_datatype() == DataType["BIPOLAR"]: - # store bipolar activations as binary - inp = (inp + 1) / 2 - export_idt = DataType["BINARY"] - else: - export_idt = self.get_input_datatype() - # reshape input into folded shape - reshaped_input = inp.reshape(folded_ishape) - # make copy before saving array - reshaped_input = reshaped_input.copy() - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) - - if mode == "cppsim": - output = inp - output = np.asarray([output], dtype=np.float32).reshape(*exp_shape) - context[node.output[0]] = output - - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = export_idt - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(exp_shape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to "rtlsim" """.format( - mode - ) - ) - # binary -> bipolar if needed - if self.get_output_datatype() == DataType["BIPOLAR"]: - out = context[node.output[0]] - out = 2 * out - 1 - context[node.output[0]] = out - assert context[node.output[0]].shape == tuple( - exp_shape - ), """Output - shape doesn't match expected shape, should be same as input shape""" - - def code_generation_ipi(self): - impl_style = self.get_nodeattr("impl_style") - if impl_style == "hls": - return super().code_generation_ipi() - elif impl_style == "vivado": - cmd = [] - node_name = self.onnx_node.name - # create a hierarchy for this layer, with the same port names - clk_name = self.get_verilog_top_module_intf_names()["clk"][0] - rst_name = self.get_verilog_top_module_intf_names()["rst"][0] - dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0] - din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0] - cmd.append("create_bd_cell -type hier %s" % node_name) - cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name)) - cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) - cmd.append( - "create_bd_intf_pin -mode Master " - "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" - % (node_name, dout_name) - ) - cmd.append( - "create_bd_intf_pin -mode Slave " - "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) - ) - # instantiate and configure DWC - cmd.append( - "create_bd_cell -type ip " - "-vlnv xilinx.com:ip:axis_dwidth_converter:1.1 /%s/dwc" % node_name - ) - cmd.append( - "set_property -dict " - "[list CONFIG.S_TDATA_NUM_BYTES.VALUE_SRC USER] " - "[get_bd_cells /%s/dwc]" % node_name - ) - cmd.append( - "set_property -dict " - "[list CONFIG.S_TDATA_NUM_BYTES {%d}] [get_bd_cells /%s/dwc]" - % (np.ceil(self.get_instream_width() / 8), node_name) - ) - cmd.append( - "set_property -dict " - "[list CONFIG.M_TDATA_NUM_BYTES {%d}] [get_bd_cells /%s/dwc]" - % 
(np.ceil(self.get_outstream_width() / 8), node_name) - ) - cmd.append( - "connect_bd_intf_net [get_bd_intf_pins %s/dwc/M_AXIS] " - "[get_bd_intf_pins %s/%s]" % (node_name, node_name, dout_name) - ) - cmd.append( - "connect_bd_intf_net [get_bd_intf_pins %s/dwc/S_AXIS] " - "[get_bd_intf_pins %s/%s]" % (node_name, node_name, din_name) - ) - cmd.append( - "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/dwc/aresetn]" - % (node_name, rst_name, node_name) - ) - cmd.append( - "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/dwc/aclk]" - % (node_name, clk_name, node_name) - ) - return cmd - else: - raise Exception( - "DWC implementation style %s not supported, please use hls or vivado" - % impl_style - ) - - def lut_estimation(self): - """Calculates resource estimations for LUTs""" - inw = self.get_instream_width() - outw = self.get_outstream_width() - - minw = min(inw, outw) - maxw = max(inw, outw) - - # sometimes withs aren't directly divisible - # this requires going up from input width to least common multiple - # then down to output width - intw = abs(maxw * minw) // math.gcd(maxw, minw) - - # we assume a shift-based implementation - # even if we don't use LUTs explicitly, we make some unavailable - # to other logic because they're tied into the DWC control sets - - cnt_luts = 0 - cset_luts = 0 - - if inw != intw: - cnt_luts += abs(math.ceil(math.log(inw / intw, 2))) - cset_luts += intw - if intw != outw: - cnt_luts += abs(math.ceil(math.log(intw / outw, 2))) - cset_luts += outw - - return int(cnt_luts + cset_luts) - - def prepare_rtlsim(self): - assert self.get_nodeattr("impl_style") != "vivado", ( - "StreamingDataWidthConverter impl_style " - "cannot be vivado for rtlsim. Only impl_style=rtl supported." 
- ) - super().prepare_rtlsim() - - def code_generation_ipgen(self, model, fpgapart, clk): - # no codegen required for impl_style=vivado since - # that uses premade, configurable AXIS IP - if self.get_nodeattr("impl_style") == "hls": - super().code_generation_ipgen(model, fpgapart, clk) - - def ipgen_singlenode_code(self): - # no IP generation required for impl_style=vivado since - # that uses premade, configurable AXIS IP - if self.get_nodeattr("impl_style") == "hls": - super().ipgen_singlenode_code() - else: - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - # set ipgen_path and ip_path so that HLSSynthIP - # and CreatedStitchedIP transformations do not complain - self.set_nodeattr("ipgen_path", code_gen_dir) - self.set_nodeattr("ip_path", code_gen_dir) diff --git a/src/finn/custom_op/fpgadataflow/streamingeltwise.py b/src/finn/custom_op/fpgadataflow/streamingeltwise.py new file mode 100644 index 0000000000..4681c144f7 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/streamingeltwise.py @@ -0,0 +1,216 @@ +# Copyright (c) 2022, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import warnings +from qonnx.core.datatype import DataType + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + + +class StreamingEltwise(HWCustomOp): + """Abstraction layer for HW implementation of StreamingEltwise""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = super().get_nodeattr_types() + my_attrs.update( + { + "NumChannels": ("i", True, ""), + "PE": ("i", True, ""), + # FINN DataTypes for inputs; output datatype inferred from input + "inputDataType0": ("s", True, ""), + "inputDataType1": ("s", True, ""), + # type of EltwiseFunction for the operation + "eltwiseOp": ("s", True, "", ["Add", "Sub", "AbsDiff"]), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + "inFIFODepths": ("ints", False, [2, 2]), + } + ) + return my_attrs + + def get_eltwise_op_lambda(self): + eltwise_op = self.get_nodeattr("eltwiseOp") + idt0 = self.get_input_datatype(0) + idt1 = 
self.get_input_datatype(1) + odt = self.get_output_datatype() + tin0 = idt0.get_hls_datatype_str() + tin1 = idt1.get_hls_datatype_str() + tout = odt.get_hls_datatype_str() + eltwise_ops = { + # "Add": "[](auto a, auto b) { return a + b; }", + # "Sub": "[](auto a, auto b) { return a - b; }", + # "AbsDiff": "[](auto a, auto b) { return a>b? a-b : b-a; }", + "Add": f"add<{tin0}, {tin1}, {tout}>()", + "Sub": f"sub<{tin0}, {tin1}, {tout}>()", + "AbsDiff": f"absdiff<{tin0}, {tin1}, {tout}>()", + } + return eltwise_ops[eltwise_op] + + def get_normal_input_shape(self, ind=0): + ich = self.get_nodeattr("NumChannels") + vecs = list(self.get_nodeattr("numInputVectors")) + ishape = tuple(vecs + [ich]) + return ishape + + def get_folded_input_shape(self, ind=0): + ich = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + assert ich % pe == 0, "PE must divide NumChannels" + vecs = list(self.get_nodeattr("numInputVectors")) + ishape = tuple(vecs + [ich // pe, pe]) + return ishape + + def get_normal_output_shape(self, ind=0): + return self.get_normal_input_shape() + + def get_folded_output_shape(self, ind=0): + return self.get_folded_input_shape() + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpected input1 shape." + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[1])) + assert ishape == exp_ishape, "Unexpected input2 shape." 
+ return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + idt0 = model.get_tensor_datatype(node.input[0]) + if idt0 != self.get_input_datatype(0): + warn_str = "inputDataType0 changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype(0)), + str(idt0), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType0", idt0.name) + idt1 = model.get_tensor_datatype(node.input[1]) + if idt1 != self.get_input_datatype(1): + warn_str = "inputDataType1 changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype(1)), + str(idt1), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType1", idt1.name) + # enforce output data type (calculated based on idt) + odt = self.get_output_datatype() + model.set_tensor_datatype(self.onnx_node.output[0], odt) + + def verify_node(self): + pass + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("inputDataType" + str(ind))] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + op = self.get_nodeattr("eltwiseOp") + idt0 = self.get_input_datatype(0) + idt1 = self.get_input_datatype(1) + assert idt0.signed() == idt1.signed(), ( + "%s: Inputs must have same signedness" % self.onnx_node.name + ) + idt0_min, idt0_max = idt0.min(), idt0.max() + idt1_min, idt1_max = idt1.min(), idt1.max() + cands = [ + idt0_min - idt1_min, + idt0_min - idt1_max, + idt0_max - idt1_min, + idt0_max - idt1_max, + ] + largest_magnitude = max(map(abs, cands)) + if op == "Add": + if idt0.signed(): + return DataType.get_smallest_possible(idt0.min() + idt1.min()) + else: + return DataType.get_smallest_possible(idt0.max() + idt1.max()) + elif op == "Sub": + return DataType.get_smallest_possible(-largest_magnitude) + elif op == "AbsDiff": + return DataType.get_smallest_possible(largest_magnitude) + else: + raise Exception("%s: Unknown eltWiseOp = %s" % (self.onnx_node.name, op)) + + 
def get_instream_width(self, ind=0): + """Returns input stream width.""" + ibits = self.get_input_datatype(ind).bitwidth() + pe = self.get_nodeattr("PE") + in_width = pe * ibits + return in_width + + def get_outstream_width(self, ind=0): + """Returns output stream width.""" + obits = self.get_output_datatype().bitwidth() + pe = self.get_nodeattr("PE") + out_width = pe * obits + return out_width + + def get_number_output_values(self): + return np.prod(self.get_folded_output_shape()[:-1]) + + def get_exp_cycles(self): + # Channels/PE * batch size * fmdim * fmdim + return np.prod(self.get_folded_output_shape()[:-1]) + + def execute_node(self, context, graph): + # simulate behavior using Python + node = self.onnx_node + inp0_values = context[node.input[0]] + inp1_values = context[node.input[1]] + eltwiseOp = self.get_nodeattr("eltwiseOp") + oshape = context[node.output[0]].shape + ishape0 = inp0_values.shape + ishape1 = inp1_values.shape + assert ishape0 == ishape1, "Shapes of inputs should be the same for Streamingeltwise" + # subtraction + result = inp0_values - inp1_values + if eltwiseOp == "Sub": + context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape) + elif eltwiseOp == "AbsDiff": + context[node.output[0]] = np.abs(np.asarray(result, dtype=np.float32)).reshape(oshape) + else: + raise Exception("%s: Unknown eltWiseOp = %s" % (node.name, eltwiseOp)) + + def get_verilog_top_module_intf_names(self): + intf_names = super().get_verilog_top_module_intf_names() + sname = self.hls_sname() + swidth = self.get_instream_width_padded() + intf_names["s_axis"] = [(x + "_" + sname, swidth) for x in ["in0", "in1"]] + return intf_names diff --git a/src/finn/custom_op/fpgadataflow/streamingfifo.py b/src/finn/custom_op/fpgadataflow/streamingfifo.py index 522305327f..1556575b00 100644 --- a/src/finn/custom_op/fpgadataflow/streamingfifo.py +++ b/src/finn/custom_op/fpgadataflow/streamingfifo.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2024, 
Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -27,23 +27,15 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import math import numpy as np -import os -import subprocess import warnings from qonnx.core.datatype import DataType -from shutil import copy -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.basic import get_finn_root -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp -from . import templates - -class StreamingFIFO(HLSCustomOp): - def __init__(self, onnx_node): - super().__init__(onnx_node) - self.strm_fifo_wrapper = templates.strm_fifo_wrapper +class StreamingFIFO(HWCustomOp): + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): my_attrs = super().get_nodeattr_types() @@ -53,12 +45,10 @@ def get_nodeattr_types(self): "depth": ("i", True, 0), # folded shape of input/output "folded_shape": ("ints", True, []), + # normal shape of input/output + "normal_shape": ("ints", True, []), # FINN DataTypes for inputs/outputs "dataType": ("s", True, ""), - # Toggle between hls or IPI implementation - # rtl - use the hls generated IP during stitching - # vivado - use the AXI Infrastructure FIFO - "impl_style": ("s", False, "rtl", {"rtl", "vivado"}), # FPGA resource type for FIFOs when impl_style is vivado # auto -- let Vivado decide # block -- use BRAM @@ -80,22 +70,6 @@ def get_nodeattr_types(self): return my_attrs - def get_adjusted_depth(self): - impl = self.get_nodeattr("impl_style") - depth = self.get_nodeattr("depth") - if impl == "vivado": - old_depth = depth - # round up depth to nearest power-of-2 - # Vivado FIFO impl may fail otherwise - depth = (1 << (depth - 1).bit_length()) if impl == "vivado" else depth - if old_depth != depth: - warnings.warn( - "%s: rounding-up FIFO depth 
from %d to %d for impl_style=vivado" - % (self.onnx_node.name, old_depth, depth) - ) - - return depth - def make_shape_compatible_op(self, model): exp_ishape = self.get_normal_input_shape() oshape = self.get_normal_output_shape() @@ -128,117 +102,12 @@ def get_verilog_top_module_intf_names(self): ret["ap_none"] = ["maxcount"] return ret - def get_verilog_top_module_name(self): - "Return the Verilog top module name for this node." - - node = self.onnx_node - prefixed_top_name = "%s" % (node.name) - return prefixed_top_name - - def code_generation_ipgen(self, model, fpgapart, clk): - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - verilog_dir = "{}/project_{}/sol1/impl/verilog".format( - code_gen_dir, self.onnx_node.name - ) - os.makedirs(verilog_dir) - # copy Q_srl.v from finn-rtllib to verilog directory - memstream_dir = get_finn_root() + "/finn-rtllib/memstream/hdl/" - Q_file = os.path.join(memstream_dir, "Q_srl.v") - copy(Q_file, verilog_dir) - - # empty code gen dictionary for new entries - self.code_gen_dict.clear() - self.code_gen_dict["$TOPNAME$"] = ["{}".format(self.onnx_node.name)] - self.code_gen_dict["$LAYER_NAME$"] = [ - "{}_{}".format(self.onnx_node.name, self.onnx_node.name) - ] - # make instream width a multiple of 8 for axi interface - in_width = self.get_instream_width_padded() - count_width = int(self.get_nodeattr("depth") - 1).bit_length() - self.code_gen_dict["$COUNT_RANGE$"] = ["[{}:0]".format(count_width - 1)] - self.code_gen_dict["$IN_RANGE$"] = ["[{}:0]".format(in_width - 1)] - self.code_gen_dict["$OUT_RANGE$"] = ["[{}:0]".format(in_width - 1)] - self.code_gen_dict["$WIDTH$"] = [str(in_width)] - self.code_gen_dict["$DEPTH$"] = [str(self.get_nodeattr("depth"))] - self.code_gen_dict["$HLS_SNAME$"] = [self.hls_sname()] - - template = self.strm_fifo_wrapper - - for key in self.code_gen_dict: - # transform list into long string separated by '\n' - code_gen_line = "\n".join(self.code_gen_dict[key]) - template = template.replace(key, 
code_gen_line) - f = open(os.path.join(verilog_dir, "{}.v".format(self.onnx_node.name)), "w") - f.write(template) - f.close() - self.code_gen_dict.clear() - - def ipgen_singlenode_code(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - verilog_dir = "{}/project_{}/sol1/impl/verilog".format( - code_gen_dir, self.onnx_node.name - ) - # prepare the IP packaging tcl template - template = templates.ip_package_tcl - self.code_gen_dict.clear() - self.code_gen_dict["$TOPNAME$"] = ["{}".format(self.onnx_node.name)] - # note: setting the root dir as absolute can cause path problems - # the ipgen script will be invoked from the sources dir so root_dir=. is OK - self.code_gen_dict["$VERILOG_DIR$"] = ["."] - self.code_gen_dict["$HLS_SNAME$"] = [self.hls_sname()] - for key in self.code_gen_dict: - # transform list into long string separated by '\n' - code_gen_line = "\n".join(self.code_gen_dict[key]) - template = template.replace(key, code_gen_line) - f = open(os.path.join(verilog_dir, "package_ip.tcl"), "w") - f.write(template) - f.close() - # create a shell script and call Vivado to invoke the IP pkg script - make_project_sh = verilog_dir + "/make_ip.sh" - working_dir = os.environ["PWD"] - with open(make_project_sh, "w") as f: - f.write("#!/bin/bash \n") - f.write("cd {}\n".format(verilog_dir)) - f.write("vivado -mode batch -source package_ip.tcl\n") - f.write("cd {}\n".format(working_dir)) - bash_command = ["bash", make_project_sh] - process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE) - process_compile.communicate() - # set ipgen_path and ip_path to point to the new packaged IP - self.set_nodeattr("ipgen_path", verilog_dir) - self.set_nodeattr("ip_path", verilog_dir) - vlnv = "xilinx.com:hls:%s:1.0" % (self.onnx_node.name) - self.set_nodeattr("ip_vlnv", vlnv) - self.code_gen_dict.clear() - def get_normal_input_shape(self, ind=0): depth = self.get_adjusted_depth() - assert depth >= 2, """Depth is too low""" + assert depth >= 1, """Depth is 
too low""" if depth > 256 and self.get_nodeattr("impl_style") == "rtl": - warnings.warn( - "Depth is high, set between 2 and 256 for efficient SRL implementation" - ) - # derive normal shape from folded shape - # StreamingFIFOs are inserted in between fpgadataflow nodes - # the folded shape could be for example (1, nf, pe) - # with nf (neuron folding): mh // pe - # the normal input shape is in this case (1, mh) - # so to achieve this the two inner dimensions are multiplied - # and together with all previous dimensions - # this gives the normal input shape - - folded_shape = self.get_nodeattr("folded_shape") - # extract inner dimension - inner_dim = folded_shape[-1] - # multiply with the next inner dimension - folding_factor = folded_shape[-2] * inner_dim - normal_ishape = [] - # create the normal_ishape - for i in range(len(folded_shape) - 2): - normal_ishape.append(folded_shape[i]) - normal_ishape.append(folding_factor) - - return normal_ishape + warnings.warn("Depth is high, set between 2 and 256 for efficient SRL implementation") + return self.get_nodeattr("normal_shape") def get_normal_output_shape(self, ind=0): return self.get_normal_input_shape() @@ -268,162 +137,13 @@ def get_output_datatype(self, ind=0): return DataType[self.get_nodeattr("dataType")] def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") node = self.onnx_node - inp = context[node.input[0]] - exp_shape = self.get_normal_input_shape() - - if mode == "cppsim": - output = inp - output = np.asarray([output], dtype=np.float32).reshape(*exp_shape) - context[node.output[0]] = output - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - # create a npy file for the input of the node - assert ( - str(inp.dtype) == "float32" - ), """Input datatype is - not float32 as expected.""" - expected_inp_shape = self.get_folded_input_shape() - reshaped_input = inp.reshape(expected_inp_shape) - if DataType[self.get_nodeattr("dataType")] == 
DataType["BIPOLAR"]: - # store bipolar activations as binary - reshaped_input = (reshaped_input + 1) / 2 - export_idt = DataType["BINARY"] - else: - export_idt = DataType[self.get_nodeattr("dataType")] - # make copy before saving the array - reshaped_input = reshaped_input.copy() - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) - sim = self.get_rtlsim() - nbits = self.get_instream_width() - inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - output = self.rtlsim(sim, inp) - odt = DataType[self.get_nodeattr("dataType")] - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) - # load and reshape output - output = np.load(out_npy_path) - oshape = self.get_normal_output_shape() - output = np.asarray([output], dtype=np.float32).reshape(*oshape) - context[node.output[0]] = output - - else: - raise Exception( - """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) + context[node.output[0]] = context[node.input[0]] def get_number_output_values(self): folded_oshape = self.get_folded_output_shape() return np.prod(folded_oshape[:-1]) - def global_includes(self): - pass - - def defines(self, var): - pass - - def read_npy_data(self): - pass - - def strm_decl(self): - pass - - def docompute(self): - pass - - def dataoutstrm(self): - pass - - def save_as_npy(self): - pass - - def blackboxfunction(self): - pass - - def pragmas(self): - pass - - def code_generation_ipi(self): - impl_style = self.get_nodeattr("impl_style") - if impl_style == "rtl": - return super().code_generation_ipi() - elif impl_style == "vivado": - cmd = [] - node_name = self.onnx_node.name - depth = self.get_adjusted_depth() - ram_style = self.get_nodeattr("ram_style") - # create a hierarchy for this layer, with the same port names - clk_name = self.get_verilog_top_module_intf_names()["clk"][0] - rst_name = self.get_verilog_top_module_intf_names()["rst"][0] - dout_name = self.get_verilog_top_module_intf_names()["m_axis"][0][0] - din_name = self.get_verilog_top_module_intf_names()["s_axis"][0][0] - cmd.append("create_bd_cell -type hier %s" % node_name) - cmd.append("create_bd_pin -dir I -type clk /%s/%s" % (node_name, clk_name)) - cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) - cmd.append( - "create_bd_intf_pin -mode Master " - "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" - % (node_name, dout_name) - ) - cmd.append( - "create_bd_intf_pin -mode Slave " - "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) - ) - # instantiate and configure DWC - cmd.append( - "create_bd_cell -type ip " - "-vlnv xilinx.com:ip:axis_data_fifo:2.0 /%s/fifo" % node_name - ) - cmd.append( - "set_property -dict [list CONFIG.FIFO_DEPTH {%d}] " - "[get_bd_cells /%s/fifo]" % (depth, node_name) - ) - cmd.append( - 
"set_property -dict [list CONFIG.FIFO_MEMORY_TYPE {%s}] " - "[get_bd_cells /%s/fifo]" % (ram_style, node_name) - ) - cmd.append( - "set_property -dict [list CONFIG.TDATA_NUM_BYTES {%d}] " - "[get_bd_cells /%s/fifo]" - % (np.ceil(self.get_outstream_width() / 8), node_name) - ) - cmd.append( - "connect_bd_intf_net [get_bd_intf_pins %s/fifo/M_AXIS] " - "[get_bd_intf_pins %s/%s]" % (node_name, node_name, dout_name) - ) - cmd.append( - "connect_bd_intf_net [get_bd_intf_pins %s/fifo/S_AXIS] " - "[get_bd_intf_pins %s/%s]" % (node_name, node_name, din_name) - ) - cmd.append( - "connect_bd_net [get_bd_pins %s/%s] " - "[get_bd_pins %s/fifo/s_axis_aresetn]" - % (node_name, rst_name, node_name) - ) - cmd.append( - "connect_bd_net [get_bd_pins %s/%s] " - "[get_bd_pins %s/fifo/s_axis_aclk]" % (node_name, clk_name, node_name) - ) - return cmd - else: - raise Exception( - "FIFO implementation style %s not supported, please use rtl or vivado" - % impl_style - ) - def bram_estimation(self): """Calculates resource estimation for BRAM""" impl = self.get_nodeattr("impl_style") @@ -487,10 +207,3 @@ def lut_estimation(self): ram_luts = 0 return int(address_luts + ram_luts) - - def prepare_rtlsim(self): - assert self.get_nodeattr("impl_style") != "vivado", ( - "StreamingFIFO impl_style " - "cannot be vivado for rtlsim. Only impl_style=rtl supported." - ) - super().prepare_rtlsim() diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool.py new file mode 100755 index 0000000000..59a8f092d0 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/streamingmaxpool.py @@ -0,0 +1,236 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import numpy as np +import onnxruntime as rt +import warnings +from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.custom_op.general.maxpoolnhwc import compute_pool_output_dim +from qonnx.util.basic import qonnx_make_model + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + +# TODO: consider splitting this into separate implementations for 1D and 2D +# similar to what we do for ConvolutionInputGenerator + + +class StreamingMaxPool(HWCustomOp): + """Abstraction layer for HW implementation of StreamingMaxPool""" + + def get_nodeattr_types(self): + my_attrs = { + "ImgDim": ("ints", True, []), # [H, W] = [Y, X] + "PoolDim": ("ints", True, []), # [H, W] = [Y, X] + "NumChannels": ("i", True, 0), + # parallelism control - only supported for 1D maxpool + "PE": ("i", False, 0), + # round up (instead of down) output size - only supported for 1D maxpool + "CeilMode": ("i", False, 0), + # FINN DataTypes for inputs/outputs + "dataType": ("s", True, ""), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("dataType")] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + return DataType[self.get_nodeattr("dataType")] + + def get_1d_attrs_normalized(self): + # support both (1, D) and (D, 1) cases transparently: + # assume the dummy ('1') dimension is the Y-dimension, i.e. 
+ # images and kernels (and their attributes) of dimension + # [H, W] = [Y, X] = [D, 1] or [1, D] are always mapped to [1, D] + ifm_dim = self.get_nodeattr("ImgDim") + k = self.get_nodeattr("PoolDim") + ifm_ch = self.get_nodeattr("NumChannels") + if ifm_dim[1] == 1: + ifm_dim = ifm_dim[::-1] + k = k[::-1] + return (ifm_dim, k, ifm_ch) + + def is_1d(self): + ifm_dim, k, ifm_ch = self.get_1d_attrs_normalized() + return (ifm_dim[0] == 1) and (k[0] == 1) + + def get_normal_input_shape(self, ind=0): + ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim") + ifm_ch = self.get_nodeattr("NumChannels") + ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch) + return ishape + + def get_folded_input_shape(self, ind=0): + ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim") + ifm_ch = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + nf = int(ifm_ch / pe) + if self.is_1d(): + folded_ishape = (1, ifm_dim_h, ifm_dim_w, nf, pe) + else: + folded_ishape = (1, ifm_dim_h, ifm_dim_w, 1, ifm_ch) + return folded_ishape + + def get_normal_output_shape(self, ind=0): + ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim") + k_h, k_w = tuple(self.get_nodeattr("PoolDim")) + ifm_ch = self.get_nodeattr("NumChannels") + ceil_mode = self.get_nodeattr("CeilMode") + if not self.is_1d(): + assert ifm_dim_h % k_h == 0, "StreamingMaxPool needs ImgDim_h % PoolDim_h == 0" + assert ifm_dim_w % k_w == 0, "StreamingMaxPool needs ImgDim_w % PoolDim_w == 0" + ofm_dim_h = compute_pool_output_dim(ifm_dim_h, k_h, k_h, 0, ceil_mode) + ofm_dim_w = compute_pool_output_dim(ifm_dim_w, k_w, k_w, 0, ceil_mode) + oshape = (1, ofm_dim_h, ofm_dim_w, ifm_ch) + return oshape + + def get_folded_output_shape(self, ind=0): + # even though there is no folding in the current hlslib op, + # insert a time multiplexing axis to remain compatible with the + # shapes produced by the rest of the dataflow pipeline + ifm_ch = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + nf = int(ifm_ch / pe) + ret = 
list(self.get_normal_output_shape()) + if self.is_1d(): + ret[-1] = nf + ret.append(pe) + else: + ret.insert(-1, 1) + return tuple(ret) + + def get_number_output_values(self): + folded_oshape = self.get_folded_output_shape() + return np.prod(folded_oshape[:-1]) + + def get_exp_cycles(self): + # derived from StreamingMaxPool_Batch loop nest + ifm_dim, k, ifm_ch = self.get_1d_attrs_normalized() + + warnings.warn( + """Estimated latency for layer {} can be lower than + actual latency!""".format( + self.onnx_node.name + ) + ) + if self.is_1d(): + _, _, _, nf, _ = self.get_folded_output_shape() + ceil_mode = self.get_nodeattr("CeilMode") + ofm_dim = compute_pool_output_dim(ifm_dim[1], k[1], k[1], 0, ceil_mode) + exp_cycles = ofm_dim * nf * (k[1] + 1) + return int(exp_cycles) + else: + # TODO: adjust inaccurate formula + return int(ifm_dim[1] * ifm_dim[1] * (1 + 1 / (k[1] * k[1]))) + + def get_instream_width(self, ind=0): + dt_bits = self.get_input_datatype().bitwidth() + pe = self.get_nodeattr("PE") + ifm_ch = self.get_nodeattr("NumChannels") + if self.is_1d(): + in_width = int(dt_bits * pe) + else: + in_width = int(dt_bits * ifm_ch) + return in_width + + def get_outstream_width(self, ind=0): + """For streaming maxpool out stream width is the same as in stream width""" + return self.get_instream_width() + + def make_shape_compatible_op(self, model): + exp_ishape = self.get_normal_input_shape() + oshape = self.get_normal_output_shape() + ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) + assert ishape == exp_ishape, "Unexpect input shape for StreamingMaxPool." 
+ return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype()), + str(idt), + ) + warnings.warn(warn_str) + self.set_nodeattr("dataType", idt.name) + # data type stays the same + model.set_tensor_datatype(node.output[0], idt) + + def verify_node(self): + pass + + def execute_node(self, context, graph): + # create a standard add node to help calculate the result + node = self.onnx_node + kernel_shape = self.get_nodeattr("PoolDim") + ceil_mode = self.get_nodeattr("CeilMode") + inp_values = context[node.input[0]] + dummy_out = context[node.output[0]] + # convert i/o NHWC -> NCHW + inp_values = np.transpose(inp_values, (0, 3, 1, 2)) + dummy_out = np.transpose(dummy_out, (0, 3, 1, 2)) + # handle 1d case + ishape = inp_values.shape + if ishape[2] == 1 or ishape[3] == 1: + inp_values = inp_values.reshape(ishape[0], ishape[1], ishape[2] * ishape[3]) + kernel_shape = [kernel_shape[0] * kernel_shape[1]] + # execute as regular MaxPool + inp = helper.make_tensor_value_info(node.input[0], TensorProto.FLOAT, inp_values.shape) + outp = helper.make_tensor_value_info(node.output[0], TensorProto.FLOAT, dummy_out.shape) + node_mp = helper.make_node( + "MaxPool", + inputs=[node.input[0]], + outputs=[node.output[0]], + kernel_shape=kernel_shape, + strides=kernel_shape, + ceil_mode=ceil_mode, + ) + graph_mp = helper.make_graph( + nodes=[node_mp], + name="single-mp-exec", + inputs=[inp], + outputs=[outp], + ) + + opset_version = self.onnx_opset_version + opset_imports = [helper.make_opsetid("", opset_version)] + onnx_kwargs = {"opset_imports": opset_imports} + model_mp = qonnx_make_model(graph_mp, **onnx_kwargs) + idict = {node.input[0]: inp_values} + sess = rt.InferenceSession(model_mp.SerializeToString()) + result = sess.run(None, idict) + result = 
np.asarray(result, dtype=np.float32).reshape(dummy_out.shape) + # convert output NCHW -> NHWC + result = np.transpose(result, (0, 2, 3, 1)) + context[node.output[0]] = result diff --git a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py b/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py deleted file mode 100755 index a0e60931ed..0000000000 --- a/src/finn/custom_op/fpgadataflow/streamingmaxpool_batch.py +++ /dev/null @@ -1,428 +0,0 @@ -# Copyright (c) 2020, Xilinx -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import numpy as np -import os -import warnings -from qonnx.core.datatype import DataType -from qonnx.custom_op.general.maxpoolnhwc import compute_pool_output_dim - -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy - -# TODO: consider splitting this into separate implementations for 1D and 2D -# similar to what we do for ConvolutionInputGenerator - - -class StreamingMaxPool_Batch(HLSCustomOp): - """Class that corresponds to finn-hlslib StreamingMaxPool_batch function.""" - - def get_nodeattr_types(self): - my_attrs = { - "ImgDim": ("ints", True, []), # [H, W] = [Y, X] - "PoolDim": ("ints", True, []), # [H, W] = [Y, X] - "NumChannels": ("i", True, 0), - # parallelism control - only supported for 1D maxpool - "PE": ("i", False, 0), - # round up (instead of down) output size - only supported for 1D maxpool - "CeilMode": ("i", False, 0), - # FINN DataTypes for inputs/outputs - "dataType": ("s", True, ""), - } - my_attrs.update(super().get_nodeattr_types()) - return my_attrs - - def get_input_datatype(self, ind=0): - """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("dataType")] - - def get_output_datatype(self, ind=0): - """Returns FINN DataType of output.""" - return DataType[self.get_nodeattr("dataType")] - - def get_1d_attrs_normalized(self): - # support both (1, D) and (D, 1) cases transparently: - # assume the dummy ('1') dimension is the Y-dimension, i.e. 
- # images and kernels (and their attributes) of dimension - # [H, W] = [Y, X] = [D, 1] or [1, D] are always mapped to [1, D] - ifm_dim = self.get_nodeattr("ImgDim") - k = self.get_nodeattr("PoolDim") - ifm_ch = self.get_nodeattr("NumChannels") - if ifm_dim[1] == 1: - ifm_dim = ifm_dim[::-1] - k = k[::-1] - return (ifm_dim, k, ifm_ch) - - def is_1d(self): - ifm_dim, k, ifm_ch = self.get_1d_attrs_normalized() - return (ifm_dim[0] == 1) and (k[0] == 1) - - def get_normal_input_shape(self, ind=0): - ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim") - ifm_ch = self.get_nodeattr("NumChannels") - ishape = (1, ifm_dim_h, ifm_dim_w, ifm_ch) - return ishape - - def get_folded_input_shape(self, ind=0): - ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim") - ifm_ch = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - nf = int(ifm_ch / pe) - if self.is_1d(): - folded_ishape = (1, ifm_dim_h, ifm_dim_w, nf, pe) - else: - folded_ishape = (1, ifm_dim_h, ifm_dim_w, 1, ifm_ch) - return folded_ishape - - def get_normal_output_shape(self, ind=0): - ifm_dim_h, ifm_dim_w = self.get_nodeattr("ImgDim") - k_h, k_w = tuple(self.get_nodeattr("PoolDim")) - ifm_ch = self.get_nodeattr("NumChannels") - ceil_mode = self.get_nodeattr("CeilMode") - if not self.is_1d(): - assert ( - ifm_dim_h % k_h == 0 - ), "StreamingMaxPool needs ImgDim_h % PoolDim_h == 0" - assert ( - ifm_dim_w % k_w == 0 - ), "StreamingMaxPool needs ImgDim_w % PoolDim_w == 0" - ofm_dim_h = compute_pool_output_dim(ifm_dim_h, k_h, k_h, 0, ceil_mode) - ofm_dim_w = compute_pool_output_dim(ifm_dim_w, k_w, k_w, 0, ceil_mode) - oshape = (1, ofm_dim_h, ofm_dim_w, ifm_ch) - return oshape - - def get_folded_output_shape(self, ind=0): - # even though there is no folding in the current hlslib op, - # insert a time multiplexing axis to remain compatible with the - # shapes produced by the rest of the dataflow pipeline - ifm_ch = self.get_nodeattr("NumChannels") - pe = self.get_nodeattr("PE") - nf = int(ifm_ch / pe) - ret = 
list(self.get_normal_output_shape()) - if self.is_1d(): - ret[-1] = nf - ret.append(pe) - else: - ret.insert(-1, 1) - return tuple(ret) - - def get_number_output_values(self): - folded_oshape = self.get_folded_output_shape() - return np.prod(folded_oshape[:-1]) - - def get_exp_cycles(self): - # derived from StreamingMaxPool_Batch loop nest - ifm_dim, k, ifm_ch = self.get_1d_attrs_normalized() - - warnings.warn( - """Estimated latency for layer {} can be lower than - actual latency!""".format( - self.onnx_node.name - ) - ) - if self.is_1d(): - _, _, _, nf, _ = self.get_folded_output_shape() - ceil_mode = self.get_nodeattr("CeilMode") - ofm_dim = compute_pool_output_dim(ifm_dim[1], k[1], k[1], 0, ceil_mode) - exp_cycles = ofm_dim * nf * (k[1] + 1) - return int(exp_cycles) - else: - # TODO: adjust inaccurate formula - return int(ifm_dim[1] * ifm_dim[1] * (1 + 1 / (k[1] * k[1]))) - - def get_instream_width(self, ind=0): - dt_bits = self.get_input_datatype().bitwidth() - pe = self.get_nodeattr("PE") - ifm_ch = self.get_nodeattr("NumChannels") - if self.is_1d(): - in_width = int(dt_bits * pe) - else: - in_width = int(dt_bits * ifm_ch) - return in_width - - def get_outstream_width(self, ind=0): - """For streaming maxpool out stream width is the same as in stream width""" - return self.get_instream_width() - - def make_shape_compatible_op(self, model): - exp_ishape = self.get_normal_input_shape() - oshape = self.get_normal_output_shape() - ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ishape == exp_ishape, "Unexpect input shape for StreamingMaxPool." 
- return super().make_const_shape_op(oshape) - - def infer_node_datatype(self, model): - node = self.onnx_node - idt = model.get_tensor_datatype(node.input[0]) - if idt != self.get_input_datatype(): - warn_str = "inputDataType changing for %s: %s -> %s " % ( - node.name, - str(self.get_input_datatype()), - str(idt), - ) - warnings.warn(warn_str) - self.set_nodeattr("dataType", idt.name) - # data type stays the same - model.set_tensor_datatype(node.output[0], idt) - - def verify_node(self): - info_messages = [] - # verify that "backend" is set to "fpgadataflow" - backend_value = self.get_nodeattr("backend") - if backend_value == "fpgadataflow": - info_messages.append("Attribute backend is set correctly") - else: - info_messages.append('Attribute backend should be set to "fpgadataflow"') - - # verify the number of inputs - if len(self.onnx_node.input) == 1: - info_messages.append("The number of inputs is correct") - else: - info_messages.append("""StreamingMaxPool_Batch needs 1 data input""") - - return info_messages - - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "maxpool.h"'] - - def defines(self, var): - numReps = 1 - ifm_dim, k, ifm_ch = self.get_1d_attrs_normalized() - ceil_mode = self.get_nodeattr("CeilMode") - output_size = compute_pool_output_dim(ifm_dim[1], k[1], k[1], 0, ceil_mode) - - if self.is_1d(): - self.code_gen_dict["$DEFINES$"] = [ - """#define ImgDim {}\n #define PoolDim {}\n - #define NumChannels {}\n #define PE {}\n #define OutputSize {} - \n #define numReps {}""".format( - ifm_dim[1], - k[1], - self.get_nodeattr("NumChannels"), - self.get_nodeattr("PE"), - output_size, - numReps, - ) - ] - else: - self.code_gen_dict["$DEFINES$"] = [ - """#define ImgDim {}\n #define PoolDim {}\n - #define NumChannels {}\n #define numReps {}""".format( - ifm_dim[1], - k[1], - self.get_nodeattr("NumChannels"), - numReps, - ) - ] - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = 
self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) - ) - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0 ("in0");'.format(self.get_instream_width()) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) - ) - - def docompute(self): - dtype = self.get_input_datatype() - if dtype.bitwidth() == 1: - if self.is_1d(): - raise Exception("Binary 1d MaxPool not implemented on HLS backend") - else: - op = "StreamingMaxPool" - self.code_gen_dict["$DOCOMPUTE$"] = [ - "%s(in0, out);" % (op) - ] - else: - dtype = self.get_input_datatype() - dtype_hls = dtype.get_hls_datatype_str() - minval_str = str(int(dtype.min())) - if self.is_1d(): - op = "StreamingMaxPool_Precision_1d" - self.code_gen_dict["$DOCOMPUTE$"] = [ - """%s(in0, out);""" - % (op, dtype_hls, minval_str) - ] - else: - op = "StreamingMaxPool_Precision" - self.code_gen_dict["$DOCOMPUTE$"] = [ - "%s(in0, out);" - % (op, dtype_hls, minval_str) - ] - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - 
npy_out = "%s/output.npy" % code_gen_dir - oshape = self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - oshape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)" - % (self.onnx_node.name, packed_hls_type, packed_hls_type) - ] - - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" - ) - - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") - node = self.onnx_node - exp_ishape = self.get_normal_input_shape() - exp_oshape = self.get_normal_output_shape() - folded_ishape = self.get_folded_input_shape() - - # TODO ensure codegen dir exists - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - inp = context[node.input[0]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert ( - inp.shape == exp_ishape - ), """Input shape doesn't - match expected shape (1, ifm_dim, ifm_dim, ifm_ch).""" - if self.get_input_datatype() == DataType["BIPOLAR"]: - # store bipolar activations as binary - inp = (inp + 1) / 2 - export_idt = DataType["BINARY"] - else: - export_idt = self.get_input_datatype() - - reshaped_input = inp.reshape(folded_ishape) - np.save(os.path.join(code_gen_dir, "input_0.npy"), reshaped_input) - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - assert ( - context[node.output[0]].shape == exp_oshape - ), "cppsim \ - did not produce expected output shape" - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = export_idt - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - # binary -> bipolar if needed - if self.get_output_datatype() == DataType["BIPOLAR"]: - out = context[node.output[0]] - out = 2 * out - 1 - context[node.output[0]] = out - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output - shape doesn't match expected shape (1, ofm_dim, ofm_dim, ifm_ch).""" diff --git a/src/finn/custom_op/fpgadataflow/templates.py b/src/finn/custom_op/fpgadataflow/templates.py index c7bbc3f139..3d89a0ab23 100644 --- a/src/finn/custom_op/fpgadataflow/templates.py +++ b/src/finn/custom_op/fpgadataflow/templates.py @@ -109,107 +109,6 @@ exit 0 """ -# verilog wrapper for decoupled mem mode -decoupled_wrapper = """ -module $TOPNAME$( -ap_clk, -ap_rst_n, -in0_$HLS_SNAME$_TDATA, -in0_$HLS_SNAME$_TVALID, -in0_$HLS_SNAME$_TREADY, -out_$HLS_SNAME$_TDATA, -out_$HLS_SNAME$_TVALID, -out_$HLS_SNAME$_TREADY -); - -input ap_clk; -input ap_rst_n; -input $IN_RANGE$ in0_$HLS_SNAME$_TDATA; -input in0_$HLS_SNAME$_TVALID; -output in0_$HLS_SNAME$_TREADY; -output $OUT_RANGE$ out_$HLS_SNAME$_TDATA; -output out_$HLS_SNAME$_TVALID; -input out_$HLS_SNAME$_TREADY; - -reg [31:0] config_address = 0; -reg config_ce = 0; -reg config_we = 0; -reg [31:0] config_d0 = 0; -wire [31:0] config_q0; - -//multiple wire AXI Streams -wire m_axis_0_afull; -// FIFO count to generate programmable full -wire [5:0] fifo_0_count; -wire m_axis_0_tready; -wire m_axis_0_tvalid; -wire $WEIGHT_RANGE$ m_axis_0_tdata; - -//memstream component - -memstream -#( -//parameters to enable/disable axi-mm, set number of streams, set readmemh for -// memory, set per-stream offsets in memory, set per-stream widths -.CONFIG_EN(1), -.NSTREAMS(1), -.MEM_DEPTH($MEM_DEPTH$), -.MEM_WIDTH($WEIGHT_WIDTH$), -.MEM_INIT("./"), -.RAM_STYLE("$RAM_STYLE$"), - -//widths per stream -.STRM0_WIDTH($WEIGHT_WIDTH$), - -//depths per stream -.STRM0_DEPTH($WSTREAM_DEPTH$), - -//offsets for each 
stream -.STRM0_OFFSET(0) -) -mem -( -.aclk(ap_clk), -.aresetn(ap_rst_n), - -//optional configuration interface compatible with ap_memory -.config_address(config_address), -.config_ce(config_ce), -.config_we(config_we), -.config_d0(config_d0), -.config_q0(config_q0), - -//multiple output AXI Streams, TDATA width rounded to multiple of 8 bits -.m_axis_0_afull(m_axis_0_afull), -.m_axis_0_tready(m_axis_0_tready), -.m_axis_0_tvalid(m_axis_0_tvalid), -.m_axis_0_tdata(m_axis_0_tdata) - - -); - - -//MVA_Stream_Unit - -$LAYER_NAME$ -MVA_Stream_U -( -.ap_clk(ap_clk), //input -.ap_rst_n(ap_rst_n), //input -.in0_$HLS_SNAME$_TDATA(in0_$HLS_SNAME$_TDATA), //$IN_RANGE$ input -.in0_$HLS_SNAME$_TVALID(in0_$HLS_SNAME$_TVALID), //input -.in0_$HLS_SNAME$_TREADY(in0_$HLS_SNAME$_TREADY), //output -.weights_$HLS_SNAME$_TDATA(m_axis_0_tdata), //$WEIGHT_RANGE$ input -.weights_$HLS_SNAME$_TVALID(m_axis_0_tvalid), //input -.weights_$HLS_SNAME$_TREADY(m_axis_0_tready), //output -.out_$HLS_SNAME$_TDATA(out_$HLS_SNAME$_TDATA), //$OUT_RANGE$ output -.out_$HLS_SNAME$_TVALID(out_$HLS_SNAME$_TVALID), //output -.out_$HLS_SNAME$_TREADY(out_$HLS_SNAME$_TREADY) //input -); - -endmodule -""" - ip_package_tcl = """ ## IP Info set Vendor "xilinx.com" @@ -313,49 +212,3 @@ ipx::save_core [ipx::current_core] ipx::archive_core $Top.zip [ipx::current_core] """ - -strm_fifo_wrapper = """ -module $TOPNAME$( -ap_clk, -ap_rst_n, -count, -maxcount, -in0_$HLS_SNAME$_TDATA, -in0_$HLS_SNAME$_TVALID, -in0_$HLS_SNAME$_TREADY, -out_$HLS_SNAME$_TDATA, -out_$HLS_SNAME$_TVALID, -out_$HLS_SNAME$_TREADY -); - -input ap_clk; -input ap_rst_n; -output $COUNT_RANGE$ count; -output $COUNT_RANGE$ maxcount; -input $IN_RANGE$ in0_$HLS_SNAME$_TDATA; -input in0_$HLS_SNAME$_TVALID; -output in0_$HLS_SNAME$_TREADY; -output $OUT_RANGE$ out_$HLS_SNAME$_TDATA; -output out_$HLS_SNAME$_TVALID; -input out_$HLS_SNAME$_TREADY; - -Q_srl #( -.depth($DEPTH$), -.width($WIDTH$) -) -$LAYER_NAME$ -( - .clock(ap_clk), - .reset(!ap_rst_n), - 
.count(count), - .maxcount(maxcount), - .i_d(in0_$HLS_SNAME$_TDATA), - .i_v(in0_$HLS_SNAME$_TVALID), - .i_r(in0_$HLS_SNAME$_TREADY), - .o_d(out_$HLS_SNAME$_TDATA), - .o_v(out_$HLS_SNAME$_TVALID), - .o_r(out_$HLS_SNAME$_TREADY) -); - -endmodule -""" diff --git a/src/finn/custom_op/fpgadataflow/thresholding.py b/src/finn/custom_op/fpgadataflow/thresholding.py new file mode 100644 index 0000000000..dde813a293 --- /dev/null +++ b/src/finn/custom_op/fpgadataflow/thresholding.py @@ -0,0 +1,268 @@ +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import numpy as np +import warnings +from qonnx.core.datatype import DataType +from qonnx.custom_op.general.multithreshold import multithreshold +from qonnx.util.basic import interleave_matrix_outer_dim_from_partitions + +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp + + +class Thresholding(HWCustomOp): + """Abstraction layer for HW implementation of Thresholding.""" + + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) + + def get_nodeattr_types(self): + my_attrs = { + # whether weights (thresholds) will be + # writable through an AXI-lite interface during runtime + # 1 for enabled, 0 for disabled. + "runtime_writeable_weights": ("i", False, 0, {0, 1}), + # parallelization; channels thresholded per cycle + "PE": ("i", True, 0), + # number of channels (each may have different thresholds) + "NumChannels": ("i", True, 0), + # number of steps in thresholding function. 
Used only in decoupled mode + "numSteps": ("i", True, 1), + # FINN DataTypes for inputs, outputs + "inputDataType": ("s", True, ""), + "weightDataType": ("s", True, ""), + "outputDataType": ("s", True, ""), + # number of input vectors, examples: + # [1] is a single vector (like a FC layer with batch=1) + # [4] is four vectors (like a FC layer with batch=4) + # [1, 4, 4] is four * four vectors (like a conv layer with batch=1) + "numInputVectors": ("ints", False, [1]), + # initialization value for the thresholding accumulator + "ActVal": ("i", False, 0), + } + my_attrs.update(super().get_nodeattr_types()) + return my_attrs + + def make_shape_compatible_op(self, model): + oshape = self.get_normal_output_shape() + return super().make_const_shape_op(oshape) + + def infer_node_datatype(self, model): + node = self.onnx_node + idt = model.get_tensor_datatype(node.input[0]) + if idt != self.get_input_datatype(): + warn_str = "inputDataType changing for %s: %s -> %s " % ( + node.name, + str(self.get_input_datatype().name), + str(idt.name), + ) + warnings.warn(warn_str) + self.set_nodeattr("inputDataType", idt.name) + # set output datatype from property + odt = self.get_output_datatype() + model.set_tensor_datatype(node.output[0], odt) + + def verify_node(self): + info_messages = [] + # verify that "backend" is set to "fpgadataflow" + backend_value = self.get_nodeattr("backend") + if backend_value == "fpgadataflow": + info_messages.append("Attribute backend is set correctly") + else: + info_messages.append('Attribute backend should be set to "fpgadataflow"') + + # verify that all necessary attributes exist + # TODO collect automatically from get_nodeattr_types + try: + self.get_nodeattr("code_gen_dir_cppsim") + self.get_nodeattr("executable_path") + self.get_nodeattr("NumChannels") + self.get_nodeattr("PE") + self.get_nodeattr("inputDataType") + self.get_nodeattr("outputDataType") + info_messages.append("All necessary attributes exist") + except Exception: + 
info_messages.append("""The required Threshold_Batch attributes do not exist.""") + + return info_messages + + def get_input_datatype(self, ind=0): + """Returns FINN DataType of input.""" + return DataType[self.get_nodeattr("inputDataType")] + + def get_output_datatype(self, ind=0): + """Returns FINN DataType of output.""" + return DataType[self.get_nodeattr("outputDataType")] + + def get_weight_datatype(self): + """Returns FINN DataType of thresholds, here called weights.""" + return DataType[self.get_nodeattr("weightDataType")] + + def get_weightstream_width(self): + """Returns weight stream width""" + pe = self.get_nodeattr("PE") + wp = self.get_weight_datatype().bitwidth() + n_thres_steps = self.get_nodeattr("numSteps") + w_width = pe * wp * n_thres_steps + return w_width + + def minimize_accumulator_width(self, model): + "Minimize threshold width ('accumulator width' here due to convention)" + thresholds = model.get_initializer(self.onnx_node.input[1]) + threshold_tensor = self.get_hw_compatible_threshold_tensor(thresholds) + min_threshold = thresholds.min() + max_threshold = thresholds.max() + min_input = self.get_input_datatype().min() + max_input = self.get_input_datatype().max() + # get range required by threshold values + tdt_min = min(min_input, min_threshold) + tdt_max = max(max_input, max_threshold) + if tdt_min < 0: + if abs(tdt_min) > tdt_max: + tdt = DataType.get_smallest_possible(tdt_min) + else: + tdt = DataType.get_smallest_possible(-tdt_max - 1) + else: + tdt = DataType.get_smallest_possible(tdt_max) + assert np.vectorize(tdt.allowed)( + threshold_tensor + ).all(), "Thresholds can't be expressed with type %s" % str(tdt) + self.set_nodeattr("weightDataType", tdt.name) + # Update QONNX DataType of tensor for consistency + model.set_tensor_datatype(self.onnx_node.input[1], tdt) + return DataType[self.get_nodeattr("weightDataType")] + + def get_instream_width(self, ind=0): + i_bits = self.get_input_datatype().bitwidth() + return i_bits * 
self.get_nodeattr("PE") + + def get_outstream_width(self, ind=0): + o_bits = self.get_output_datatype().bitwidth() + return o_bits * self.get_nodeattr("PE") + + def get_folded_input_shape(self, ind=0): + pe = self.get_nodeattr("PE") + fold = self.calc_tmem() + vecs = list(self.get_nodeattr("numInputVectors")) + folded_input_shape = tuple(vecs + [fold, pe]) + return folded_input_shape + + def get_folded_output_shape(self, ind=0): + # same shape as input + return self.get_folded_input_shape() + + def get_normal_input_shape(self, ind=0): + ich = self.get_nodeattr("NumChannels") + vecs = list(self.get_nodeattr("numInputVectors")) + normal_input_shape = tuple(vecs + [ich]) + return normal_input_shape + + def get_normal_output_shape(self, ind=0): + # same shape as input + return self.get_normal_input_shape() + + def get_number_output_values(self): + nf = np.prod(self.get_folded_output_shape()[:-1]) + return nf + + def get_exp_cycles(self): + # Channels/PE * batch size * fmdim * fmdim + return np.prod(self.get_folded_output_shape()[:-1]) + + def get_hw_compatible_threshold_tensor(self, orig_thres_matrix): + """Convert the original numpy weight matrix orig_weight_matrix into + a form suitable for passing to the hlslib call: + * ensure MH % PE == 0 + * for unsigned inputs, ensure thresholds are positive + * interleave rows between PEs + * reshape into (PE, TMEM, n_thres_steps) and return + """ + mh = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + tmem = mh // pe + assert mh % pe == 0, "Requirement NumChannels divisable by PE is violated." 
+ assert ( + orig_thres_matrix.ndim == 2 + ), """Threshold matrix dimension is + not as expected (2).""" + n_thres_steps = orig_thres_matrix.shape[1] + assert n_thres_steps == self.get_nodeattr("numSteps"), "Mismatch in threshold steps" + if not self.get_input_datatype().signed(): + # ensure all thresholds are nonnegative + assert (orig_thres_matrix >= 0).all() + # ensure all thresholds are integer + assert np.equal(np.mod(orig_thres_matrix, 1), 0).all(), "Need int threshold tensor" + ret = orig_thres_matrix + # ensure channels = mh , duplicating if necessary + if ret.shape[0] == 1: + ret = np.tile(ret, (mh, 1)) + assert ret.shape[0] == mh, "Channels of threshold matrix are not as expected (mh)" + # distribute rows between PEs + ret = interleave_matrix_outer_dim_from_partitions(ret, pe) + assert ( + ret.shape[0] == pe + ), """First dimension after distribution of the + rows between PEs is not as expected (pe)""" + assert ( + ret.shape[1] == tmem + ), """Second dimension after distribution of the + rows between PEs is not as expected (tmem)""" + assert ( + ret.shape[2] == n_thres_steps + ), """Third dimension after distribution of the + rows between PEs is not as expected (n_thres_steps)""" + return ret.reshape(1, pe, tmem, n_thres_steps) + + def execute_node(self, context, graph): + node = self.onnx_node + inp_values = context[node.input[0]] + th_val = context[node.input[1]] + # MT expects inputs to be in the shape (N,C,H,W) or (N, C) + # if 4D then input values in context are (N,H,W,C) and need to + # be transposed. 
+ # if 2D then inputs can be passed directly to MT function + is_4d = len(inp_values.shape) == 4 + if is_4d: + inp_values = np.transpose(inp_values, (0, 3, 1, 2)) + y = multithreshold(inp_values, th_val) + if is_4d: + y = y.transpose(0, 2, 3, 1) + act = DataType[self.get_nodeattr("outputDataType")] + if act == DataType["BIPOLAR"]: + # binary to bipolar + y = 2 * y - 1 + else: + # signed offset + y += act.min() + context[node.output[0]] = y + + def calc_tmem(self): + """Calculates and returns TMEM.""" + num_channels = self.get_nodeattr("NumChannels") + pe = self.get_nodeattr("PE") + return num_channels // pe diff --git a/src/finn/custom_op/fpgadataflow/upsampler.py b/src/finn/custom_op/fpgadataflow/upsampler.py index a018fd35aa..3348394e05 100644 --- a/src/finn/custom_op/fpgadataflow/upsampler.py +++ b/src/finn/custom_op/fpgadataflow/upsampler.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -27,22 +27,20 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import numpy as np +import onnxruntime as rt import warnings +from onnx import TensorProto, helper from qonnx.core.datatype import DataType +from qonnx.util.basic import qonnx_make_model -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import npy_to_rtlsim_input, rtlsim_output_to_npy +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp -class UpsampleNearestNeighbour_Batch(HLSCustomOp): - """ - Corresponds to finn-hlslib UpsampleNearestNeighbour_Batch function. - Upsampling is done with the Nearest Neighbour algorithm. - The layer expects square feature maps for the in and output. 
- """ +class UpsampleNearestNeighbour(HWCustomOp): + """Abstraction layer for HW implementation of UpsampleNearestNeighbour.""" - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): my_attrs = { @@ -107,9 +105,7 @@ def make_shape_compatible_op(self, model): exp_ishape = self.get_normal_input_shape() oshape = self.get_normal_output_shape() ishape = tuple(model.get_tensor_shape(self.onnx_node.input[0])) - assert ( - ishape == exp_ishape - ), "Unexpect input shape for UpsampleNearestNeighbour_Batch." + assert ishape == exp_ishape, "Unexpect input shape for UpsampleNearestNeighbour_Batch." return super().make_const_shape_op(oshape) def infer_node_datatype(self, model): @@ -152,184 +148,44 @@ def get_number_output_values(self): folded_oshape = self.get_folded_output_shape() return np.prod(folded_oshape[:-1]) - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "upsample.hpp"'] - - def defines(self, var): - self.code_gen_dict["$DEFINES$"] = [] - - ifm_ch = self.get_nodeattr("NumChannels") - self.code_gen_dict["$DEFINES$"] += ["#define IFMChannels {}".format(ifm_ch)] - - ibits = self.get_input_datatype().bitwidth() - self.code_gen_dict["$DEFINES$"] += ["#define Input_precision {}".format(ibits)] - - idim = self.get_nodeattr("IFMDim") - self.code_gen_dict["$DEFINES$"] += ["#define IFMDim {}".format(idim)] - + def execute_node(self, context, graph): + # create a standard resize node to help calculate the result + node = self.onnx_node + inp_values = context[node.input[0]] + ishape = inp_values.shape odim = self.get_nodeattr("OFMDim") - self.code_gen_dict["$DEFINES$"] += ["#define OFMDim {}".format(odim)] - - batch_size = self.get_nodeattr("numInputVectors") - self.code_gen_dict["$DEFINES$"] += ["#define numReps {}".format(batch_size)] - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = 
self.get_input_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) - ) - - def strm_decl(self): - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0 ("in0");'.format(self.get_instream_width()) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) - ) - - def docompute(self): - is_2d = self.get_nodeattr("DimMode") == 0 - batch = self.get_nodeattr("numInputVectors") - if is_2d: - self.code_gen_dict["$DOCOMPUTE$"] = [ - """UpsampleNearestNeighbour_Batch > (in0, out, numReps);""" - ] + idim = self.get_nodeattr("IFMDim") + if ishape[1] == ishape[2]: + scales_val = [1, int(round(odim / idim)), int(round(odim / idim)), 1] + elif ishape[1] > 1 and ishape[2] == 1: + scales_val = [1, int(round(odim / idim)), 1, 1] else: - assert batch == 1, "1D upsampler currently needs numReps=1" - self.code_gen_dict["$DOCOMPUTE$"] = [ - """UpsampleNearestNeighbour_1D > (in0, out);""" - ] - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - if dtype == DataType["BIPOLAR"]: - # use binary for bipolar storage - dtype = DataType["BINARY"] - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - oshape = 
self.get_folded_output_shape() - oshape_cpp_str = str(oshape).replace("(", "{").replace(")", "}") - - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s");' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - oshape_cpp_str, - npy_out, + warnings.warn( + """HW abstraction layer for Upsample cannot be executed. + Upsampling only supported for 1D H, or 2D square scaling""" ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - "void %s(hls::stream<%s > &in0, hls::stream<%s > &out)" - % (self.onnx_node.name, packed_hls_type, packed_hls_type) - ] - - def pragmas(self): - self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() + oshape = context[node.output[0]].shape + inp = helper.make_tensor_value_info(node.input[0], TensorProto.FLOAT, ishape) + scales = helper.make_tensor_value_info("scales", TensorProto.FLOAT, [4]) + outp = helper.make_tensor_value_info(node.output[0], TensorProto.FLOAT, oshape) + node_resize = helper.make_node( + "Resize", + inputs=[node.input[0], "", "scales"], + outputs=[node.output[0]], + mode="nearest", ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" + graph_resize = helper.make_graph( + nodes=[node_resize], + name="single-resize-exec", + inputs=[inp, scales], + outputs=[outp], ) - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") - node = self.onnx_node - exp_ishape = self.get_normal_input_shape() - exp_oshape = self.get_normal_output_shape() - folded_oshape = self.get_folded_output_shape() - - if mode == "cppsim": - code_gen_dir = 
self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - inp = context[node.input[0]] - assert str(inp.dtype) == "float32", "Input datatype is not float32" - assert ( - inp.shape == exp_ishape - ), """Input shape doesn't - match expected shape (numInputVectors, ImgDim, ImgDim, NumChannels).""" - export_idt = self.get_input_datatype() - self.dynamic_input_to_npy(context, 1, target_dir=code_gen_dir) - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - assert ( - context[node.output[0]].shape == folded_oshape - ), "cppsim did not produce expected folded output shape" - context[node.output[0]] = context[node.output[0]].reshape(*exp_oshape) - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - rtlsim_inp = npy_to_rtlsim_input( - "{}/input_0.npy".format(code_gen_dir), export_idt, nbits - ) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - rtlsim_output = self.rtlsim(sim, rtlsim_inp) - odt = export_idt - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - rtlsim_output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) - # load and reshape output - output = np.load(out_npy_path) - output = np.asarray([output], dtype=np.float32).reshape(*exp_oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - assert ( - context[node.output[0]].shape == exp_oshape - ), """Output shape doesn't match expected shape - (1, OutputDim, OutputDim, NumChannels).""" + opset_version = 13 + opset_imports = [helper.make_opsetid("", opset_version)] + onnx_kwargs = {"opset_imports": opset_imports} + model_resize = qonnx_make_model(graph_resize, **onnx_kwargs) + idict = {node.input[0]: inp_values, "scales": scales_val} + sess = rt.InferenceSession(model_resize.SerializeToString()) + result = sess.run(None, idict) + context[node.output[0]] = np.asarray(result, dtype=np.float32).reshape(oshape) diff --git a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py index d5e29ca22a..d95c6eb7cc 100644 --- a/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py +++ b/src/finn/custom_op/fpgadataflow/vectorvectoractivation.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -28,34 +28,31 @@ import math import numpy as np -import os +import onnx.numpy_helper as np_helper import textwrap import warnings from qonnx.core.datatype import DataType +from qonnx.custom_op.general.multithreshold import multithreshold from qonnx.util.basic import ( calculate_matvec_accumulator_range, interleave_matrix_outer_dim_from_partitions, roundup_to_integer_multiple, ) -from finn.custom_op.fpgadataflow.hlscustomop import HLSCustomOp -from finn.util.data_packing import ( - npy_to_rtlsim_input, - numpy_to_hls_code, - pack_innermost_dim_as_hex_string, - rtlsim_output_to_npy, -) +from finn.custom_op.fpgadataflow.hwcustomop import HWCustomOp +from finn.util.data_packing import numpy_to_hls_code, pack_innermost_dim_as_hex_string -class VectorVectorActivation(HLSCustomOp): - """Class that corresponds to finn-hlslib Vector_Vector_Activate_Batch function""" +class VVAU(HWCustomOp): + """Abstraction layer for HW implementation of VectorVectorActivation layers.""" - def __init__(self, onnx_node): - super().__init__(onnx_node) + def __init__(self, onnx_node, **kwargs): + super().__init__(onnx_node, **kwargs) def get_nodeattr_types(self): my_attrs = { "PE": ("i", True, 0), + "SIMD": ("i", False, 1), "Dim": ("ints", True, []), # [H, W] "Channels": ("i", True, 0), "Kernel": ("ints", True, []), # [H, W] @@ -70,11 +67,16 @@ def get_nodeattr_types(self): # no-activation mode (produce accumulators) "noActivation": ("i", False, 0, {0, 1}), # memory mode for the layer weights - # const -- embedded weights, default, long compile/synth times - # decoupled -- streaming weights with weight streamer packaged inside IP + # internal_embedded -- embedded weights, long compile/synth times + # internal_decoupled -- default, streaming weights with streamer packaged inside IP # external -- streaming weights with external streamer - "mem_mode": ("s", False, "const", {"const", "decoupled", "external"}), - # (mem_mode 
= decoupled only) whether weights will be writable through + "mem_mode": ( + "s", + False, + "internal_decoupled", + {"internal_embedded", "internal_decoupled", "external"}, + ), + # (mem_mode = internal_decoupled only) whether weights will be writable through # an AXI-lite interface during runtime # 1 for enabled, 0 for disabled. # see finn-rtllib/memstream/doc/README for more about the memory @@ -84,7 +86,7 @@ def get_nodeattr_types(self): # vector through the accelerator. This will get rid of any old # weight data from the weight FIFOs. "runtime_writeable_weights": ("i", False, 0, {0, 1}), - # FPGA resource type for memories in decoupled mode + # FPGA resource type for memories in internal_decoupled mode # auto -- let Vivado decide # block -- use BRAM # distributed -- use LUTRAM @@ -103,88 +105,66 @@ def get_nodeattr_types(self): my_attrs.update(super().get_nodeattr_types()) return my_attrs - def minimize_accumulator_width(self, model): - weights = model.get_initializer(self.onnx_node.input[1]) - k_h, k_w = self.get_nodeattr("Kernel") - fm = self.get_nodeattr("Channels") - # put weights into the shape expected by calculate_matvec_accumulator_range - weights = weights.reshape(fm, k_h * k_w).transpose() - if len(self.onnx_node.input) > 2: - thresholds = model.get_initializer(self.onnx_node.input[2]) - else: - thresholds = None - idt = self.get_input_datatype() - # calculate minimum and maximum values of accumulator - (acc_min, acc_max) = calculate_matvec_accumulator_range(weights, idt) - if thresholds is not None: - threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) - # set threshold datatype (and accumulator datatype implicitly) - min_threshold = thresholds.min() - max_threshold = thresholds.max() - # clip threshold values - clip_upper = None - clip_lower = None - if max_threshold > acc_max + 1: - clip_upper = acc_max + 1 - if min_threshold < acc_min: - clip_lower = acc_min - if (clip_lower is not None) or (clip_upper is not None): - 
warnings.warn("Clipping some thresholds in %s" % self.onnx_node.name) - thresholds = np.clip(thresholds, clip_lower, clip_upper) - model.set_initializer(self.onnx_node.input[2], thresholds) - threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) - min_threshold = thresholds.min() - max_threshold = thresholds.max() - # get range required by threshold values - tdt_min = min(acc_min, min_threshold) - tdt_max = max(acc_max, max_threshold) - if tdt_min < 0: - if abs(tdt_min) > tdt_max: - tdt = DataType.get_smallest_possible(tdt_min) - else: - tdt = DataType.get_smallest_possible(-tdt_max - 1) - else: - tdt = DataType.get_smallest_possible(tdt_max) - assert np.vectorize(tdt.allowed)( - threshold_tensor - ).all(), "Thresholds in %s can't be expressed with type %s" % ( - self.onnx_node.name, - str(tdt), - ) - self.set_nodeattr("accDataType", tdt.name) + def _infer_sparse_weight_tensor(self, W_conv, k_h, k_w, channels): + W_sparse = np.zeros((channels, channels, k_h, k_w), dtype=np.float32) + for ch in range(channels): + W_sparse[ch][ch] = W_conv[ch][0] + W_conv = W_sparse.astype(np.float32) + W_matmul = W_conv.transpose(0, 2, 3, 1) + W_matmul = W_matmul.reshape(channels, channels * k_h * k_w) + W_matmul = W_matmul.T + return W_matmul + + def execute_node(self, context, graph): + node = self.onnx_node + in_act = context[node.input[0]] + (_, dim_h, dim_w, _) = in_act.shape + (k_h, k_w) = self.get_nodeattr("Kernel") + channels = self.get_nodeattr("Channels") + producer = [x for x in graph.node if x.output[0] == node.input[0]] + if bool(producer) and ( + producer[0].op_type == "Im2Col" or producer[0].op_type == "ConvolutionInputGenerator" + ): + pe = channels else: - if acc_min < 0: - if abs(acc_min) > acc_max: - adt = DataType.get_smallest_possible(acc_min) - else: - adt = DataType.get_smallest_possible(-acc_max - 1) - else: - adt = DataType.get_smallest_possible(acc_max) - # ensure a datatype divisible by 8-bits in case this is the last node - bw = 
roundup_to_integer_multiple(adt.bitwidth(), 8) - new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw)) - adt = DataType[new_adt_name] - self.set_nodeattr("accDataType", adt.name) - # for no-activation nodes, output dt = acc dt - self.set_nodeattr("outputDataType", adt.name) - return DataType[self.get_nodeattr("accDataType")] + pe = self.get_nodeattr("PE") - def calc_wmem(self): - """Calculates and returns WMEM.""" - ch = self.get_nodeattr("Channels") - k_h, k_w = self.get_nodeattr("Kernel") - pe = self.get_nodeattr("PE") - wmem = k_h * k_w * ch // pe - return wmem + # Reorder the input activations. Note that PE gets interleaved by the SWG, + # so we have to untangle and for simplicity of computation assume pe=1. + # Note that PE has no effect on the QONNX node + in_act = in_act.reshape(1, dim_h, dim_w, channels // pe, k_h * k_w, pe) + in_act = in_act.transpose(0, 1, 2, 4, 3, 5) + in_act = in_act.reshape(1, dim_h, dim_w, channels * k_h * k_w) + # Reshape weights in appropriate format + vvau_w_init = [x for x in graph.initializer if x.name == node.input[1]][0] + vvau_w = np_helper.to_array(vvau_w_init) + vvau_w_onnx = self._infer_sparse_weight_tensor(vvau_w, k_h, k_w, channels) - def calc_tmem(self): - """Calculates and returns TMEM.""" - if self.get_nodeattr("noActivation") == 1: - return 0 + if ( + self.get_nodeattr("inputDataType") == "BIPOLAR" + and self.get_nodeattr("weightDataType") == "BIPOLAR" + ): + result = np.matmul(in_act, vvau_w_onnx) # result is in [N, H, W, C] format + result = (result + k_h * k_w) / 2 else: - ch = self.get_nodeattr("Channels") - pe = self.get_nodeattr("PE") - return ch // pe + result = np.matmul(in_act, vvau_w_onnx) # result is in [N, H, W, C] format + + if self.get_nodeattr("noActivation") == 0: + vvau_thr_init = [x for x in graph.initializer if x.name == node.input[2]][0] + vvau_thr = np_helper.to_array(vvau_thr_init) + odt_is_bipolar = self.get_nodeattr("outputDataType") == "BIPOLAR" + out_scale = 2 if odt_is_bipolar else 1 
+ out_bias = -1 if odt_is_bipolar else self.get_nodeattr("ActVal") + # NHWC to NCHW for multithreshold node + result = result.transpose((0, 3, 1, 2)) + result = multithreshold(result, vvau_thr, out_scale, out_bias) + # NCHW to NHWC + result = result.transpose((0, 2, 3, 1)) + + context[node.output[0]] = result + + def verify_node(self): + pass def make_shape_compatible_op(self, model): oshape = self.get_normal_output_shape() @@ -205,42 +185,76 @@ def infer_node_datatype(self, model): odt = self.get_output_datatype() model.set_tensor_datatype(node.output[0], odt) - def verify_node(self): - pass - def get_input_datatype(self, ind=0): """Returns FINN DataType of input.""" - return DataType[self.get_nodeattr("inputDataType")] + # when performing FIFO insertion on an FC layer with ext weights, the ind + # parameter can be > 0 (referring to the weights) so handle that here + if ind == 0: + return DataType[self.get_nodeattr("inputDataType")] + elif ind == 1: + return DataType[self.get_nodeattr("weightDataType")] + else: + raise Exception("Undefined input ind for this layer type") def get_weight_datatype(self): """Returns FINN DataType of weights.""" return DataType[self.get_nodeattr("weightDataType")] + def get_accumulator_datatype(self): + """Returns FINN DataType of accumulator""" + return DataType[self.get_nodeattr("accDataType")] + def get_output_datatype(self, ind=0): """Returns FINN DataType of output.""" return DataType[self.get_nodeattr("outputDataType")] def get_instream_width(self, ind=0): - i_bits = self.get_input_datatype().bitwidth() - in_width = i_bits * self.get_nodeattr("PE") + i_bits = self.get_input_datatype(ind).bitwidth() + simd = self.get_nodeattr("SIMD") + pe = self.get_nodeattr("PE") + in_width = i_bits * simd * pe return in_width + def get_weightstream_width(self): + """Returns weight stream width. 
Used only in internal_decoupled mode.""" + if ( + self.get_nodeattr("mem_mode") == "internal_decoupled" + or self.get_nodeattr("mem_mode") == "external" + ): + simd = self.get_nodeattr("SIMD") + pe = self.get_nodeattr("PE") + wp = self.get_weight_datatype().bitwidth() + w_width = simd * pe * wp + return w_width + else: + return 0 + def get_outstream_width(self, ind=0): o_bits = self.get_output_datatype().bitwidth() out_width = o_bits * self.get_nodeattr("PE") return out_width + def get_weightstream_width_padded(self): + """Returns weight stream width padded to a multiple of 8. This is required + by the AXI Stream spec. Used in internal_decoupled mode.""" + weight_width = self.get_weightstream_width() + return roundup_to_integer_multiple(weight_width, 8) + def get_folded_input_shape(self, ind=0): k_h, k_w = self.get_nodeattr("Kernel") - sf = k_h * k_w dim_h, dim_w = self.get_nodeattr("Dim") ch = self.get_nodeattr("Channels") + simd = self.get_nodeattr("SIMD") pe = self.get_nodeattr("PE") + kernel_2 = k_h * k_w + assert kernel_2 % simd == 0, "Requirement kernel (k_h * k_w) divisable by SIMD is violated." + sf = kernel_2 // simd + assert ch % pe == 0, "Requirement Channels divisable by PE is violated." 
nf = ch // pe if ind == 0: # calculate shape of input 0 - folded_input_shape = tuple([1, dim_h, dim_w, sf * nf, pe]) + folded_input_shape = tuple([1, dim_h, dim_w, sf * nf, simd * pe]) elif ind == 1 and self.get_nodeattr("mem_mode") == "external": # calculate shape of input 1 (weights) folded_input_shape = tuple([1, sf * nf, pe]) @@ -274,8 +288,107 @@ def get_number_output_values(self): nf = np.prod(self.get_folded_output_shape()[:-1]) return nf + def calc_wmem(self): + """Calculates and returns WMEM.""" + ch = self.get_nodeattr("Channels") + k_h, k_w = self.get_nodeattr("Kernel") + pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + wmem = (k_h * k_w * ch // pe) // simd + return wmem + + def calc_tmem(self): + """Calculates and returns TMEM.""" + if self.get_nodeattr("noActivation") == 1: + return 0 + else: + ch = self.get_nodeattr("Channels") + pe = self.get_nodeattr("PE") + return ch // pe + + def uram_estimation(self): + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + omega = self.calc_wmem() + mem_width = Q * W * P + mmode = self.get_nodeattr("mem_mode") + mstyle = self.get_nodeattr("ram_style") + if ( + (mmode == "internal_decoupled" and mstyle != "ultra") + or (mmode == "internal_embedded") + or (mmode == "external") + ): + return 0 + width_multiplier = math.ceil(mem_width / 72) + depth_multiplier = math.ceil(omega / 4096) + return width_multiplier * depth_multiplier + + def bram_estimation(self): + """Calculates resource estimation for BRAM""" + # TODO add in/out FIFO contributions + P = self.get_nodeattr("PE") + Q = self.get_nodeattr("SIMD") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + omega = self.calc_wmem() + mem_width = Q * W * P + # assuming SDP mode RAMB18s (see UG573 Table 1-10) + # since this is HLS memory, not using the full width of a BRAM + # assuming memories up to 128 deep get implemented in LUTs + mmode = self.get_nodeattr("mem_mode") + mstyle = 
self.get_nodeattr("ram_style") + if ( + (mmode == "internal_decoupled" and mstyle in ["distributed", "ultra"]) + or (mstyle == "auto" and self.calc_wmem() <= 128) + or (mmode == "internal_embedded" and self.calc_wmem() <= 128) + or (mmode == "external") + ): + return 0 + + if mem_width == 1: + return math.ceil(omega / 16384) + elif mem_width == 2: + return math.ceil(omega / 8192) + elif mem_width <= 4: + return (math.ceil(omega / 4096)) * (math.ceil(mem_width / 4)) + elif mem_width <= 9: + return (math.ceil(omega / 2048)) * (math.ceil(mem_width / 8)) + elif mem_width <= 18 or omega > 512: + return (math.ceil(omega / 1024)) * (math.ceil(mem_width / 16)) + else: + return (math.ceil(omega / 512)) * (math.ceil(mem_width / 32)) + + def bram_efficiency_estimation(self): + P = self.get_nodeattr("PE") + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + omega = self.calc_wmem() + bram16_est = self.bram_estimation() + if bram16_est == 0: + return 1 + wbits = W * P * omega + bram16_est_capacity = bram16_est * 36 * 512 + return wbits / bram16_est_capacity + + def uram_efficiency_estimation(self): + """Function for URAM efficiency estimation: actual parameter storage + needed divided by the allocated URAM storage (from estimation)""" + wdt = self.get_weight_datatype() + W = wdt.bitwidth() + D_in = int(np.prod(self.get_nodeattr("Kernel"))) + D_out = self.get_nodeattr("Channels") + uram_est = self.uram_estimation() + if uram_est == 0: + return 1 + wbits = W * D_in * D_out + uram_est_capacity = uram_est * 72 * 4096 + return wbits / uram_est_capacity + def get_exp_cycles(self): pe = self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") ch = self.get_nodeattr("Channels") dim_h, dim_w = self.get_nodeattr("Dim") k_h, k_w = self.get_nodeattr("Kernel") @@ -283,68 +396,110 @@ def get_exp_cycles(self): batch_size = 1 # since mmv != 1 is not supported yet, we set mmv for now to 1 mmv = 1 - exp_cycles = ((ch * k_h * k_w) / pe) * batch_size * (dim_h * dim_w) / mmv + exp_cycles = 
((ch * k_h * k_w) / pe / simd) * batch_size * (dim_h * dim_w) / mmv return int(exp_cycles) - def get_template_param_values(self): - """Returns the template parameter values according to input, output and weight - data types.""" - ret = dict() - inp_hls_str = self.get_input_datatype().get_hls_datatype_str() - out_hls_str = self.get_output_datatype().get_hls_datatype_str() - inp_is_binary = self.get_input_datatype() == DataType["BINARY"] - # out_is_binary = self.get_output_datatype() == DataType["BINARY"] - wt_is_binary = self.get_weight_datatype() == DataType["BINARY"] - bin_xnor_mode = self.get_nodeattr("binaryXnorMode") == 1 - if (inp_is_binary or wt_is_binary) and (not bin_xnor_mode): - raise Exception("True binary (non-bipolar) inputs not yet supported") - inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] - # out_is_bipolar = self.get_output_datatype() == DataType["BIPOLAR"] - wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] - # reinterpret inp/wt as bipolar if bin_xnor_mode is iset - inp_is_bipolar = inp_is_bipolar or (inp_is_binary and bin_xnor_mode) - wt_is_bipolar = wt_is_bipolar or (wt_is_binary and bin_xnor_mode) - # fill in TSrcI and TWeightI - # TODO check these with Giulio - # TODO handle non-bipolar binary inputs - if inp_is_bipolar and wt_is_bipolar: - ret["TSrcI"] = "Recast" - ret["TWeightI"] = "Identity" - elif (not inp_is_bipolar) and wt_is_bipolar: - ret["TSrcI"] = "Slice<%s>" % inp_hls_str - ret["TWeightI"] = "Recast" - elif inp_is_bipolar and (not wt_is_bipolar): - ret["TSrcI"] = "Recast" - ret["TWeightI"] = "Identity" - elif (not inp_is_bipolar) and (not wt_is_bipolar): - ret["TSrcI"] = "Slice<%s>" % inp_hls_str - ret["TWeightI"] = "Identity" + def minimize_accumulator_width(self, model): + """Minimize the accumulator bit width according to the weight values, + input data types, and size of dot product""" + weights = model.get_initializer(self.onnx_node.input[1]) + k_h, k_w = self.get_nodeattr("Kernel") + fm = 
self.get_nodeattr("Channels") + # put weights into the shape expected by calculate_matvec_accumulator_range + weights = weights.reshape(fm, k_h * k_w).transpose() + # since in the calculation the values of the weight matrix are used, + # for the bipolar case they need to be converted to bipolar + if self.get_nodeattr("binaryXnorMode"): + weights = 2 * weights - 1 + if len(self.onnx_node.input) > 2: + thresholds = model.get_initializer(self.onnx_node.input[2]) + else: + thresholds = None + idt = self.get_input_datatype() + + (acc_min, acc_max) = calculate_matvec_accumulator_range(weights, idt) + # if runtime-writeable weights, then the values of the weights can + # change and we need to use the worst-case values from the datatypes + if self.get_nodeattr("runtime_writeable_weights"): + wdt = self.get_weight_datatype() + lower_worst = wdt.min() * np.ones_like(weights) + lower_range = calculate_matvec_accumulator_range(lower_worst, idt) + upper_worst = wdt.max() * np.ones_like(weights) + upper_range = calculate_matvec_accumulator_range(upper_worst, idt) + acc_min = min(min(lower_range), min(upper_range)) + acc_max = max(max(upper_range), max(upper_range)) + + # if the thresholds can be used to determine range, then adjust the range + # according to the known values of the thresholds + if thresholds is not None: + threshold_tensor = self.get_hw_compatible_threshold_tensor(thresholds) + # set threshold datatype (and accumulator datatype implicitly) + min_threshold = thresholds.min() + max_threshold = thresholds.max() + # clip threshold values + if max_threshold > acc_max or min_threshold < acc_min: + warnings.warn("Clipping some thresholds in %s" % self.onnx_node.name) + thresholds = np.clip(thresholds, acc_min, acc_max) + model.set_initializer(self.onnx_node.input[2], thresholds) + threshold_tensor = self.get_hw_compatible_threshold_tensor(thresholds) + min_threshold = thresholds.min() + max_threshold = thresholds.max() + acc_min = min(min_threshold, acc_min) + acc_max 
= max(max_threshold, acc_max) + + # if the acc_range is always greater than 0, then acc_max <= 2^P - 1 + if acc_min >= 0: + acc_bit_width = np.log2(acc_max + 1) + acc_bit_width = math.ceil(acc_bit_width) + adt = DataType[f"UINT{acc_bit_width}"] + # if the acc_range is signed, then acc_min >= -2^{P-1} and acc_max <= + # 2^{P - 1} - 1, which means 2^{P - 1} >= max(-acc_min, 1 + acc_max) + else: + _acc_max = max(-acc_min, 1 + acc_max) + acc_bit_width = np.log2(_acc_max) + 1 + acc_bit_width = math.ceil(acc_bit_width) + adt = DataType[f"INT{acc_bit_width}"] - # fill in TDstI - ret["TDstI"] = "Slice<%s>" % out_hls_str + # if activation, assert that the thresholds can be expressed with adt + if thresholds is not None: + assert np.vectorize(adt.allowed)( + threshold_tensor + ).all(), "Thresholds in %s can't be expressed with type %s" % ( + self.onnx_node.name, + str(adt), + ) - return ret + # if no activation, output and accumulator datatypes are the same + if self.get_nodeattr("noActivation"): + # if this is the last node in the graph, then ensure the datatype is + # divisibly by 8 bits + if model.find_direct_successors(self.onnx_node) is None: + bw = roundup_to_integer_multiple(adt.bitwidth(), 8) + new_adt_name = adt.name.replace(str(adt.bitwidth()), str(bw)) + adt = DataType[new_adt_name] + # for no-activation nodes, output dt = acc dt + self.set_nodeattr("outputDataType", adt.name) + self.set_nodeattr("accDataType", adt.name) - def get_hls_compatible_weight_tensor(self, orig_weight_matrix): - pe = self.get_nodeattr("PE") - ch = self.get_nodeattr("Channels") - k_h, k_w = self.get_nodeattr("Kernel") - wmem = self.calc_wmem() - assert orig_weight_matrix.shape == ( - ch, - 1, - k_h, - k_w, - ), """Weights matrix doesn't - have expected shape (channels, 1, kernel_size, kernel_size)""" - ret = orig_weight_matrix - ret = ret.reshape(ch, k_h * k_w) - # distribute rows between PEs - ret = interleave_matrix_outer_dim_from_partitions(ret, pe) - ret = ret.reshape(1, pe, wmem, 1) - 
return ret + return DataType[self.get_nodeattr("accDataType")] + + def minimize_weight_bit_width(self, model): + """Minimize the bit width based on the values of the weights""" + if not self.get_nodeattr("runtime_writeable_weights"): + weights = model.get_initializer(self.onnx_node.input[1]) + w_min = weights.min() + w_max = weights.max() + if w_min < 0: + if abs(w_min) > w_max: + wdt = DataType.get_smallest_possible(w_min) + else: + wdt = DataType.get_smallest_possible(-w_max - 1) + else: + wdt = DataType.get_smallest_possible(w_max) + self.set_nodeattr("weightDataType", wdt.name) + return DataType[self.get_nodeattr("weightDataType")] - def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): + def get_hw_compatible_threshold_tensor(self, orig_thres_matrix): """Convert the original numpy weight matrix orig_weight_matrix into a form suitable for passing to the hlslib call: * ensure MH % PE == 0 @@ -375,19 +530,10 @@ def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): # ensure all thresholds are integer assert (orig_thres_matrix.astype(np.int32) == orig_thres_matrix).all() ret = orig_thres_matrix - # workaround for vivado_hls threshold bug - if ret[0][0] == 0 and n_thres_steps == 1: - ret = np.copy(ret) - ret[0][0] = 1 - warnings.warn( - "Setting 0-valued first threshold to 1 to avoid vivado_hls bug" - ) # ensure channels = mh , duplicating if necessary if ret.shape[0] == 1: ret = np.tile(ret, (ch, 1)) - assert ( - ret.shape[0] == ch - ), "Channels of threshold matrix are not as expected (ch)" + assert ret.shape[0] == ch, "Channels of threshold matrix are not as expected (ch)" # distribute rows between PEs ret = interleave_matrix_outer_dim_from_partitions(ret, pe) assert ( @@ -404,6 +550,29 @@ def get_hls_compatible_threshold_tensor(self, orig_thres_matrix): rows between PEs is not as expected (n_thres_steps)""" return ret.reshape(1, pe, tmem, n_thres_steps) + def get_hw_compatible_weight_tensor(self, orig_weight_matrix): + pe = 
self.get_nodeattr("PE") + simd = self.get_nodeattr("SIMD") + ch = self.get_nodeattr("Channels") + k_h, k_w = self.get_nodeattr("Kernel") + wmem = self.calc_wmem() + assert orig_weight_matrix.shape == ( + ch, + 1, + k_h, + k_w, + ), """Weights matrix doesn't + have expected shape (channels, 1, kernel_size, kernel_size)""" + ret = orig_weight_matrix + if self.get_weight_datatype() == DataType["BIPOLAR"]: + # convert bipolar to binary + ret = (ret + 1) / 2 + ret = ret.reshape(ch, k_h * k_w) + # distribute rows between PEs + ret = interleave_matrix_outer_dim_from_partitions(ret, pe) + ret = ret.reshape(1, pe, wmem, simd) + return ret + def make_weight_file(self, weights, weight_file_mode, weight_file_name): """Produce a file containing given weights in appropriate format for this layer. This file can be used for either synthesis or run-time reconfig @@ -418,21 +587,20 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): """ # convert weights into hlslib-compatible format - weight_tensor = self.get_hls_compatible_weight_tensor(weights) + weight_tensor = self.get_hw_compatible_weight_tensor(weights) export_wdt = self.get_weight_datatype() # we have converted bipolar weights to binary for export, # so use it as such for weight generation if self.get_weight_datatype() == DataType["BIPOLAR"]: export_wdt = DataType["BINARY"] if weight_file_mode == "hls_header": - weight_hls_code = numpy_to_hls_code( - weight_tensor, export_wdt, "weights", True, True - ) + weight_hls_code = numpy_to_hls_code(weight_tensor, export_wdt, "weights", True, True) # write weights into C++ header file as dictated by finn-hlslib f_weights = open(weight_file_name, "w") if export_wdt.bitwidth() != 1: f_weights.write( - "const FixedPointWeights<1,{},{},{}> weights = ".format( + "const FixedPointWeights<{},{},{},{}> weights = ".format( + self.get_nodeattr("SIMD"), export_wdt.get_hls_datatype_str(), self.get_nodeattr("PE"), self.calc_wmem(), @@ -440,7 +608,8 @@ def 
make_weight_file(self, weights, weight_file_mode, weight_file_name): ) else: f_weights.write( - "const BinaryWeights<1,{},{}> weights = ".format( + "const BinaryWeights<{},{},{}> weights = ".format( + self.get_nodeattr("SIMD"), self.get_nodeattr("PE"), self.calc_wmem(), ) @@ -448,39 +617,50 @@ def make_weight_file(self, weights, weight_file_mode, weight_file_name): f_weights.write(weight_hls_code) f_weights.close() elif "decoupled" in weight_file_mode: - # create a weight stream for various flavors of decoupled mode: + # create a weight stream for various flavors of internal_decoupled mode: # transpose weight tensor from (1, PE, WMEM, SIMD) to (1, WMEM, PE, SIMD) weight_tensor_unflipped = np.transpose(weight_tensor, (0, 2, 1, 3)) # reverse SIMD flip for saving weights in .npy weight_tensor_simd_flipped = np.flip(weight_tensor_unflipped, axis=-1) # PE flip for saving weights in .dat weight_tensor_pe_flipped = np.flip(weight_tensor_unflipped, axis=-2) + # SIMD & PE flip + weight_tensor_pe_simd_flipped = np.flip(weight_tensor_pe_flipped, axis=-1) # reshape weight tensor (simd_flipped and pe_flipped) to desired shape pe = self.get_nodeattr("PE") - simd = 1 + simd = self.get_nodeattr("SIMD") # simd_flipped - weight_tensor_simd_flipped = weight_tensor_simd_flipped.reshape( - 1, -1, pe * simd - ) + weight_tensor_simd_flipped = weight_tensor_simd_flipped.reshape(1, -1, pe * simd) weight_tensor_simd_flipped = weight_tensor_simd_flipped.copy() # flipped - weight_tensor_pe_flipped = weight_tensor_pe_flipped.reshape( - 1, -1, pe * simd - ) + weight_tensor_pe_flipped = weight_tensor_pe_flipped.reshape(1, -1, pe * simd) weight_tensor_pe_flipped = weight_tensor_pe_flipped.copy() + # SIMD & PE flipped + weight_tensor_pe_simd_flipped = weight_tensor_pe_simd_flipped.reshape(1, -1, pe * simd) + weight_tensor_pe_simd_flipped = weight_tensor_pe_simd_flipped.copy() if weight_file_mode == "decoupled_npy": # save weight stream into npy for cppsim - np.save(weight_file_name, 
weight_tensor_simd_flipped) + if self.onnx_node.op_type == "VVAU_rtl": + weight_tensor_unflipped = weight_tensor_unflipped.reshape(1, -1, pe * simd) + weight_tensor_unflipped = weight_tensor_unflipped.copy() + np.save(weight_file_name, weight_tensor_unflipped) + else: + np.save(weight_file_name, weight_tensor_simd_flipped) elif weight_file_mode == "decoupled_verilog_dat": # convert weight values into hexstring weight_width = self.get_weightstream_width() # pad to nearest 4 bits to get hex strings weight_width_padded = roundup_to_integer_multiple(weight_width, 4) - weight_tensor_pe_flipped = pack_innermost_dim_as_hex_string( - weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix="" - ) + if self.onnx_node.op_type == "VVAU_rtl": + weight_arr = pack_innermost_dim_as_hex_string( + weight_tensor_pe_simd_flipped, export_wdt, weight_width_padded, prefix="" + ) + else: + weight_arr = pack_innermost_dim_as_hex_string( + weight_tensor_pe_flipped, export_wdt, weight_width_padded, prefix="" + ) # add zeroes to pad out file to 1024 entries - weight_stream = weight_tensor_pe_flipped.flatten() + weight_stream = weight_arr.flatten() weight_stream = weight_stream.copy() with open(weight_file_name, "w") as f: for val in weight_stream: @@ -517,42 +697,22 @@ def generate_params(self, model, path): code_gen_dir = path # weights, if not external weights = model.get_initializer(self.onnx_node.input[1]) - if mem_mode == "const": + if mem_mode == "internal_embedded": # save hlslib-compatible weights in params.h weight_filename = "{}/params.h".format(code_gen_dir) self.make_weight_file(weights, "hls_header", weight_filename) - elif mem_mode == "decoupled" or mem_mode == "external": + elif mem_mode == "internal_decoupled" or mem_mode == "external": weight_filename_sim = "{}/weights.npy".format(code_gen_dir) - # save decoupled weights for cppsim + # save internal_decoupled weights for cppsim self.make_weight_file(weights, "decoupled_npy", weight_filename_sim) - if mem_mode == 
"decoupled": + if mem_mode == "internal_decoupled": # also save weights as Verilog .dat file - # note that we provide two different .dat files, one for synth - # and one for synthesis. this is because URAM-based weights always - # need zero weights for synthesis, otherwise they get inferred - # as BRAM - weight_filename_rtl_synth = "{}/memblock_synth_0.dat".format( - code_gen_dir - ) - weight_filename_rtl_sim = "{}/memblock_sim_0.dat".format(code_gen_dir) - # sim weights are always the true weights - self.make_weight_file( - weights, "decoupled_verilog_dat", weight_filename_rtl_sim - ) - ram_style = self.get_nodeattr("ram_style") - if ram_style == "ultra": - # UltraRAM must have no memory initializer, or only zeroes - # otherwise BRAM will be inferred instead of URAM - # as a workaround we provide a zero-weight init here - synth_weights = np.zeros_like(weights, dtype=np.float32) - else: - synth_weights = weights - self.make_weight_file( - synth_weights, "decoupled_verilog_dat", weight_filename_rtl_synth - ) + # This file will be ignored when synthesizing UltraScale memory. 
+ weight_filename_rtl = "{}/memblock.dat".format(code_gen_dir) + self.make_weight_file(weights, "decoupled_verilog_dat", weight_filename_rtl) else: raise Exception( - """Please set mem_mode to "const", "decoupled", or "external", + """Please set mem_mode to "internal_embedded", "internal_decoupled", or "external", currently no other parameter value is supported!""" ) @@ -560,7 +720,7 @@ def generate_params(self, model, path): if len(self.onnx_node.input) > 2: thresholds = model.get_initializer(self.onnx_node.input[2]) if thresholds is not None: - threshold_tensor = self.get_hls_compatible_threshold_tensor(thresholds) + threshold_tensor = self.get_hw_compatible_threshold_tensor(thresholds) # use UINT32 threshold export for bipolar times bipolar inp_is_bipolar = self.get_input_datatype() == DataType["BIPOLAR"] wt_is_bipolar = self.get_weight_datatype() == DataType["BIPOLAR"] @@ -605,356 +765,52 @@ def generate_params(self, model, path): f_thresh.write(thresholds_hls_code) f_thresh.close() - def execute_node(self, context, graph): - mode = self.get_nodeattr("exec_mode") - mem_mode = self.get_nodeattr("mem_mode") - node = self.onnx_node - - # TODO ensure codegen dir exists - if mode == "cppsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - elif mode == "rtlsim": - code_gen_dir = self.get_nodeattr("code_gen_dir_ipgen") - else: - raise Exception( - """Invalid value for attribute exec_mode! 
Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - # create a npy file fore each input of the node (in_ind is input index) - in_ind = 0 - for inputs in node.input: - # it is assumed that the first input of the node is the data input - # the second input are the weights - # the third input are the thresholds - if in_ind == 0: - assert ( - str(context[inputs].dtype) == "float32" - ), """Input datatype is - not float32 as expected.""" - expected_inp_shape = self.get_folded_input_shape() - reshaped_input = context[inputs].reshape(expected_inp_shape) - # make copy before saving the array - reshaped_input = reshaped_input.copy() - np.save( - os.path.join(code_gen_dir, "input_{}.npy".format(in_ind)), - reshaped_input, - ) - elif in_ind > 2: - raise Exception("Unexpected input found for VectorVectorActivation") - in_ind += 1 - - if mode == "cppsim": - # execute the precompiled model - super().exec_precompiled_singlenode_model() - # load output npy file - super().npy_to_dynamic_output(context) - assert ( - context[node.output[0]].shape == self.get_normal_output_shape() - ), "cppsim did not produce expected output shape" - elif mode == "rtlsim": - sim = self.get_rtlsim() - nbits = self.get_instream_width() - idt = self.get_input_datatype() - inp = npy_to_rtlsim_input("{}/input_0.npy".format(code_gen_dir), idt, nbits) - super().reset_rtlsim(sim) - super().toggle_clk(sim) - - if mem_mode == "external" or mem_mode == "decoupled": - wnbits = self.get_weightstream_width() - export_wdt = self.get_weight_datatype() - # we have converted bipolar weights to binary for export, - # so use it as such for weight generation - if self.get_weight_datatype() == DataType["BIPOLAR"]: - export_wdt = DataType["BINARY"] - wei = npy_to_rtlsim_input( - "{}/weights.npy".format(code_gen_dir), export_wdt, wnbits - ) - dim_h, dim_w = self.get_nodeattr("Dim") - num_w_reps = dim_h * dim_w - - io_dict = { - "inputs": {"in0": inp, 
"weights": wei * num_w_reps}, - "outputs": {"out": []}, - } - self.rtlsim_multi_io(sim, io_dict) - output = io_dict["outputs"]["out"] - else: - output = self.rtlsim(sim, inp) - odt = self.get_output_datatype() - target_bits = odt.bitwidth() - packed_bits = self.get_outstream_width() - out_npy_path = "{}/output.npy".format(code_gen_dir) - out_shape = self.get_folded_output_shape() - rtlsim_output_to_npy( - output, out_npy_path, odt, out_shape, packed_bits, target_bits - ) - - # load and reshape output - output = np.load(out_npy_path) - oshape = self.get_normal_output_shape() - output = np.asarray([output], dtype=np.float32).reshape(*oshape) - context[node.output[0]] = output - else: - raise Exception( - """Invalid value for attribute exec_mode! Is currently set to: {} - has to be set to one of the following value ("cppsim", "rtlsim")""".format( - mode - ) - ) - - def global_includes(self): - self.code_gen_dict["$GLOBALS$"] = ['#include "weights.hpp"'] - self.code_gen_dict["$GLOBALS$"] += ['#include "activations.hpp"'] - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode not in ["const", "decoupled", "external"]: - raise Exception( - """Please set mem_mode to "const", "decoupled", or "external", - currently no other parameter value is supported!""" - ) - if self.calc_tmem() != 0: - self.code_gen_dict["$GLOBALS$"] += ['#include "thresh.h"'] - - def defines(self, var): - dim_h, dim_w = self.get_nodeattr("Dim") - numReps = 1 * dim_h * dim_w + def get_op_and_param_counts(self): k_h, k_w = self.get_nodeattr("Kernel") - innerProdDim = k_h * k_w - mem_mode = self.get_nodeattr("mem_mode") - - self.code_gen_dict["$DEFINES$"] = [ - """#define Channels1 {}\n #define InnerProdDim {}\n - #define SIMD1 1\n #define PE1 {}\n #define numReps {}""".format( - self.get_nodeattr("Channels"), - innerProdDim, - self.get_nodeattr("PE"), - numReps, - ) - ] - if mem_mode == "decoupled" or mem_mode == "external": - wdt = self.get_weight_datatype() - self.code_gen_dict["$DEFINES$"].append( 
- "#define WP1 {}\n".format(wdt.bitwidth()) - ) - - def read_npy_data(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_input_datatype() - elem_bits = dtype.bitwidth() - packed_bits = self.get_instream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/input_0.npy" % code_gen_dir - self.code_gen_dict["$READNPYDATA$"] = [] - # note: the innermost dim is reversed for the input - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", in0, false);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) - ) - - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "decoupled" or mem_mode == "external": - wdt = self.get_weight_datatype() - elem_bits = wdt.bitwidth() - packed_bits = self.get_weightstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = wdt.get_hls_datatype_str() - npy_type = "float" - npy_in = "%s/weights.npy" % code_gen_dir - - self.code_gen_dict["$READNPYDATA$"].append( - 'npy2apintstream<%s, %s, %d, %s>("%s", weights, false, numReps);' - % (packed_hls_type, elem_hls_type, elem_bits, npy_type, npy_in) - ) - - def strm_decl(self): - mem_mode = self.get_nodeattr("mem_mode") - self.code_gen_dict["$STREAMDECLARATIONS$"] = [] - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> in0 ("in0");'.format(self.get_instream_width()) - ) - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> out ("out");'.format(self.get_outstream_width()) - ) - if mem_mode == "decoupled" or mem_mode == "external": - self.code_gen_dict["$STREAMDECLARATIONS$"].append( - 'hls::stream> weights ("weights");'.format( - self.get_weightstream_width() - ) - ) + fm = self.get_nodeattr("Channels") + dim_h, dim_w = self.get_nodeattr("Dim") + weight_bits = self.get_weight_datatype().bitwidth() + inp_bits = self.get_input_datatype().bitwidth() + num_repetitions = int(dim_h * 
dim_w) + mac_count = k_h * k_w * fm * num_repetitions + # cannonicalize op type: highest bitwidth operand first s.t. + # e.g. mac_8bx4b and mac_4bx8b don't appear as two different op types + bw1 = min(inp_bits, weight_bits) + bw2 = max(inp_bits, weight_bits) + mac_op_type = "op_mac_%dbx%db" % (bw1, bw2) + weight_param_type = "param_weight_%db" % (weight_bits) + weight_count = k_h * k_w * fm + ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count} + if self.get_nodeattr("noActivation") == 0: + tdt = DataType[self.get_nodeattr("accDataType")] + thres_bits = tdt.bitwidth() + thres_param_type = "param_threshold_%db" % (thres_bits) + thres_count = fm + ret_dict[thres_param_type] = thres_count + return ret_dict - def docompute(self): - mem_mode = self.get_nodeattr("mem_mode") - map_to_hls_mult_style = { - "auto": "ap_resource_dflt()", - "lut": "ap_resource_lut()", - "dsp": "ap_resource_dsp()", + def derive_characteristic_fxns(self, period): + n_inps = np.prod(self.get_folded_input_shape()[:-1]) + io_dict = { + "inputs": { + "in0": [0 for i in range(n_inps)], + }, + "outputs": {"out": []}, } - tmpl_args = self.get_template_param_values() - if self.calc_tmem() == 0: - odtype_hls_str = self.get_output_datatype().get_hls_datatype_str() - threshs = "PassThroughActivation<%s>()" % odtype_hls_str - else: - threshs = "threshs" - - if mem_mode == "const": - self.code_gen_dict["$DOCOMPUTE$"] = [ - """Vector_Vector_Activate_Batch - (in0, out, weights, {}, numReps, {});""".format( - tmpl_args["TSrcI"], - tmpl_args["TDstI"], - tmpl_args["TWeightI"], - threshs, - map_to_hls_mult_style[self.get_nodeattr("resType")], - ) - ] - elif mem_mode == "decoupled" or mem_mode == "external": - wdt = self.get_weight_datatype() - if wdt == DataType["BIPOLAR"]: - export_wdt = DataType["BINARY"] - else: - export_wdt = wdt - wdtype_hls_str = export_wdt.get_hls_datatype_str() - self.code_gen_dict["$DOCOMPUTE$"] = [ - """{} - (in0, out, weights, {}, numReps, {});""".format( - 
"Vector_Vector_Activate_Stream_Batch", - tmpl_args["TSrcI"], - tmpl_args["TDstI"], - tmpl_args["TWeightI"], - wdtype_hls_str, - threshs, - map_to_hls_mult_style[self.get_nodeattr("resType")], - ) - ] - else: - raise Exception( - """Please set mem_mode to "const", "decoupled", or "external", - currently no other parameter value is supported!""" - ) - - def dataoutstrm(self): - code_gen_dir = self.get_nodeattr("code_gen_dir_cppsim") - dtype = self.get_output_datatype() - elem_bits = dtype.bitwidth() - packed_bits = self.get_outstream_width() - packed_hls_type = "ap_uint<%d>" % packed_bits - elem_hls_type = dtype.get_hls_datatype_str() - npy_type = "float" - npy_out = "%s/output.npy" % code_gen_dir - shape = self.get_folded_output_shape() - shape_cpp_str = str(shape).replace("(", "{").replace(")", "}") - - # note: the innermost dim is not reversed for the output - self.code_gen_dict["$DATAOUTSTREAM$"] = [ - 'apintstream2npy<%s, %s, %d, %s>(out, %s, "%s", false);' - % ( - packed_hls_type, - elem_hls_type, - elem_bits, - npy_type, - shape_cpp_str, - npy_out, - ) - ] - - def save_as_npy(self): - self.code_gen_dict["$SAVEASCNPY$"] = [] - - def blackboxfunction(self): mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "const": - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}(hls::stream> &in0, - hls::stream> &out - )""".format( - self.onnx_node.name, - self.get_instream_width(), - self.get_outstream_width(), - ) - ] - elif mem_mode == "decoupled" or mem_mode == "external": - self.code_gen_dict["$BLACKBOXFUNCTION$"] = [ - """void {}( - hls::stream> &in0, - hls::stream> &weights, - hls::stream> &out - )""".format( - self.onnx_node.name, - self.get_instream_width(), - self.get_weightstream_width(), - self.get_outstream_width(), - ) - ] - else: - raise Exception( - """Please set mem_mode to "const" or "decoupled", currently no other - parameter value is supported!""" - ) - - def pragmas(self): - mem_mode = self.get_nodeattr("mem_mode") - 
self.code_gen_dict["$PRAGMAS$"] = [ - "#pragma HLS INTERFACE axis port=in0 name=in0_" + self.hls_sname() - ] - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=out name=out_" + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE ap_ctrl_none port=return" - ) - - if mem_mode == "const": - self.code_gen_dict["$PRAGMAS$"].append('#include "params.h"') - # the weight tensor is ap_uint [PE][WMEM] - # partition for parallel access along the PE dimension (dim 1) - self.code_gen_dict["$PRAGMAS$"].append( - ( - "#pragma HLS ARRAY_PARTITION variable=weights.m_weights " - "complete dim=1" - ) - ) - elif mem_mode == "decoupled" or mem_mode == "external": - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS INTERFACE axis port=weights name=weights_" - + self.hls_sname() - ) - self.code_gen_dict["$PRAGMAS$"].append( - "#pragma HLS stream depth=8 variable=weights" - ) - else: - raise Exception( - """Please set mem_mode to "const", "decoupled", or external, - currently no other parameter value is supported!""" - ) - - if self.calc_tmem() != 0: - # TODO find a better way of checking for no pregenerated thresholds - self.code_gen_dict["$PRAGMAS$"].append( - ( - "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " - "complete dim=1" - ) - ) - self.code_gen_dict["$PRAGMAS$"].append( - ( - "#pragma HLS ARRAY_PARTITION variable=threshs.m_thresholds " - "complete dim=3" - ) - ) + if mem_mode in ["internal_decoupled", "external"]: + n_weight_inps = self.calc_wmem() + num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) + io_dict["inputs"]["weights"] = [0 for i in range(num_w_reps * n_weight_inps)] + super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) def get_verilog_top_module_intf_names(self): intf_names = super().get_verilog_top_module_intf_names() mem_mode = self.get_nodeattr("mem_mode") sname = self.hls_sname() if mem_mode == "external": - intf_names["s_axis"].append( - ("weights_" + 
sname, self.get_weightstream_width_padded()) - ) - if mem_mode == "decoupled": + intf_names["s_axis"].append(("weights_" + sname, self.get_weightstream_width_padded())) + if mem_mode == "internal_decoupled": # only expose axilite interface if attribute is set runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 if runtime_writable: @@ -965,7 +821,7 @@ def code_generation_ipi(self): cmd = [] # add streamer if needed mem_mode = self.get_nodeattr("mem_mode") - if mem_mode == "decoupled": + if mem_mode == "internal_decoupled": runtime_writable = self.get_nodeattr("runtime_writeable_weights") == 1 if self.get_nodeattr("ram_style") == "ultra": assert ( @@ -983,43 +839,33 @@ def code_generation_ipi(self): cmd.append("create_bd_pin -dir I -type rst /%s/%s" % (node_name, rst_name)) cmd.append( "create_bd_intf_pin -mode Master " - "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" - % (node_name, dout_name) + "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, dout_name) ) cmd.append( "create_bd_intf_pin -mode Slave " "-vlnv xilinx.com:interface:axis_rtl:1.0 /%s/%s" % (node_name, din_name) ) - # instantiate the hls ip - cmd.append( - "create_bd_cell -type ip -vlnv %s /%s/%s" - % (self.get_nodeattr("ip_vlnv"), node_name, node_name) - ) + # Instantiate either the HLS or RTL IP depending on operator + self.instantiate_ip(cmd) + # instantiate a streamer and connect it to the HLS IP - strm_vlnv = "xilinx.com:user:memstream:1.0" + strm_vlnv = "amd.com:finn:memstream:1.0" strm_inst = node_name + "_wstrm" cmd.append( - "create_bd_cell -type ip -vlnv %s /%s/%s" - % (strm_vlnv, node_name, strm_inst) + "create_bd_cell -type ip -vlnv %s /%s/%s" % (strm_vlnv, node_name, strm_inst) ) cmd.append( "set_property -dict [list " - "CONFIG.NSTREAMS {1} " - "CONFIG.MEM_DEPTH {%d} " - "CONFIG.MEM_WIDTH {%d} " - "CONFIG.MEM_INIT {%s} " + "CONFIG.DEPTH {%d} " + "CONFIG.WIDTH {%d} " + "CONFIG.INIT_FILE {%s} " "CONFIG.RAM_STYLE {%s} " - "CONFIG.STRM0_DEPTH {%d} " - 
"CONFIG.STRM0_WIDTH {%d} " - "CONFIG.STRM0_OFFSET {0} " "] [get_bd_cells /%s/%s]" % ( self.calc_wmem(), self.get_weightstream_width_padded(), - self.get_nodeattr("code_gen_dir_ipgen") + "/", + self.get_nodeattr("code_gen_dir_ipgen") + "/memblock.dat", self.get_nodeattr("ram_style"), - self.calc_wmem(), - self.get_weightstream_width_padded(), node_name, strm_inst, ) @@ -1030,11 +876,11 @@ def code_generation_ipi(self): % (node_name, strm_inst, node_name, node_name, sname) ) cmd.append( - "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aresetn]" + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_rst_n]" % (node_name, rst_name, node_name, strm_inst) ) cmd.append( - "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/aclk]" + "connect_bd_net [get_bd_pins %s/%s] [get_bd_pins %s/%s/ap_clk]" % (node_name, clk_name, node_name, strm_inst) ) cmd.append( @@ -1060,8 +906,7 @@ def code_generation_ipi(self): axilite_name = self.get_verilog_top_module_intf_names()["axilite"][0] cmd.append( "create_bd_intf_pin -mode Slave " - "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s" - % (node_name, axilite_name) + "-vlnv xilinx.com:interface:aximm_rtl:1.0 /%s/%s" % (node_name, axilite_name) ) cmd.append( "connect_bd_intf_net [get_bd_intf_pins %s/%s] " @@ -1071,194 +916,9 @@ def code_generation_ipi(self): # TODO calculate and pass in segment size here cmd.append("assign_bd_address") cmd.append("save_bd_design") - elif mem_mode == "const" or mem_mode == "external": - # base class impl sufficient for const/external modes - return super().code_generation_ipi() + elif mem_mode == "internal_embedded" or mem_mode == "external": + # base class impl sufficient for internal_embedded/external modes + self.instantiate_ip(cmd) else: raise Exception("Unrecognized mem_mode for VectorVectorActivation") return cmd - - def uram_estimation(self): - P = self.get_nodeattr("PE") - Q = 1 - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - omega = self.calc_wmem() - mem_width = Q * W * P 
- mmode = self.get_nodeattr("mem_mode") - mstyle = self.get_nodeattr("ram_style") - if ( - (mmode == "decoupled" and mstyle != "ultra") - or (mmode == "const" and self.calc_wmem() <= 128) - or (mmode == "external") - ): - return 0 - width_multiplier = math.ceil(mem_width / 72) - depth_multiplier = math.ceil(omega / 4096) - return width_multiplier * depth_multiplier - - def bram_estimation(self): - """Calculates resource estimation for BRAM""" - # TODO add in/out FIFO contributions - P = self.get_nodeattr("PE") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - omega = self.calc_wmem() - # assuming SDP mode RAMB18s (see UG573 Table 1-10) - # since this is HLS memory, not using the full width of a BRAM - # assuming memories up to 128 deep get implemented in LUTs - mmode = self.get_nodeattr("mem_mode") - mstyle = self.get_nodeattr("ram_style") - if ( - (mmode == "decoupled" and mstyle in ["distributed", "ultra"]) - or (mmode == "const" and self.calc_wmem() <= 128) - or (mmode == "external") - ): - return 0 - - if W == 1: - return math.ceil(omega / 16384) * P - elif W == 2: - return math.ceil(omega / 8192) * P - elif W <= 4: - return (math.ceil(omega / 4096)) * (math.ceil(W / 4)) * P - elif W <= 9: - return (math.ceil(omega / 2048)) * (math.ceil(W / 8)) * P - elif W <= 18 or omega > 512: - return (math.ceil(omega / 1024)) * (math.ceil(W / 16)) * P - else: - return (math.ceil(omega / 512)) * (math.ceil(W / 32)) * P - - def bram_efficiency_estimation(self): - P = self.get_nodeattr("PE") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - omega = self.calc_wmem() - bram16_est = self.bram_estimation() - if bram16_est == 0: - return 1 - wbits = W * P * omega - bram16_est_capacity = bram16_est * 36 * 512 - return wbits / bram16_est_capacity - - def lut_estimation(self): - """Calculates resource estimations for LUTs based on: - - FINN-R: An End-to-End Deep-Learning Framework for Fast - Exploration of Quantized Neural Networks - - M. Blott, T. B. Preusser, N. J. 
Fraser, G. Gambardella, K. O'Brien, - Y. Umuroglu, M. Leeser and K. Vissers - - 12. Sep 2018 - """ - # TODO add in/out FIFO contributions - P = self.get_nodeattr("PE") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - # determine tdt with input and weight data types - idt = self.get_input_datatype() - A = idt.bitwidth() - # parameters from experiments in paper mentioned above - c0 = 300 - c1 = 1.1 - c2 = 0 - mmode = self.get_nodeattr("mem_mode") - mstyle = self.get_nodeattr("ram_style") - if (mmode == "decoupled" and mstyle == "distributed") or ( - mmode == "const" and self.calc_wmem() <= 128 - ): - c2 = (P * W) * math.ceil(self.calc_wmem() / 64) - - # multiplication - res_type = self.get_nodeattr("resType") - if res_type == "dsp": - mult_luts = 0 - else: - mult_luts = (2 * math.ceil((W + A) / 6) - 1) * (W + A) - # accumulator - k_h, k_w = self.get_nodeattr("Kernel") - acc_bits = W + A + math.ceil(math.log(k_h * k_w, 2)) - acc_luts = acc_bits - # thresholds and threshold comparators - thr_luts = 0 - comp_luts = 0 - noact = self.get_nodeattr("noActivation") - if noact == 0: - odt = self.get_output_datatype() - B = odt.bitwidth() - thr_luts = (2**B - 1) * acc_bits * math.ceil(self.calc_tmem() / 64) - comp_luts = (2**B - 1) * acc_bits - - return int(c0 + c1 * (P * (mult_luts + acc_luts + thr_luts + comp_luts)) + c2) - - def dsp_estimation(self): - # multiplication - P = self.get_nodeattr("PE") - res_type = self.get_nodeattr("resType") - wdt = self.get_weight_datatype() - W = wdt.bitwidth() - idt = self.get_input_datatype() - A = idt.bitwidth() - if res_type == "dsp": - mult_dsp = P * np.ceil((W + A) / 48) # TODO: more accurate modelling - else: - mult_dsp = 0 - return int(mult_dsp) - - def get_weightstream_width(self): - """Returns weight stream width. 
Used only in decoupled mode.""" - if ( - self.get_nodeattr("mem_mode") == "decoupled" - or self.get_nodeattr("mem_mode") == "external" - ): - pe = self.get_nodeattr("PE") - wp = self.get_weight_datatype().bitwidth() - w_width = pe * wp - return w_width - else: - return 0 - - def get_weightstream_width_padded(self): - """Returns weight stream width padded to a multiple of 8. This is required - by the AXI Stream spec. Used in decoupled mode.""" - weight_width = self.get_weightstream_width() - return roundup_to_integer_multiple(weight_width, 8) - - def get_op_and_param_counts(self): - k_h, k_w = self.get_nodeattr("Kernel") - fm = self.get_nodeattr("Channels") - dim_h, dim_w = self.get_nodeattr("Dim") - weight_bits = self.get_weight_datatype().bitwidth() - inp_bits = self.get_input_datatype().bitwidth() - num_repetitions = int(dim_h * dim_w) - mac_count = k_h * k_w * fm * num_repetitions - # cannonicalize op type: highest bitwidth operand first s.t. - # e.g. mac_8bx4b and mac_4bx8b don't appear as two different op types - bw1 = min(inp_bits, weight_bits) - bw2 = max(inp_bits, weight_bits) - mac_op_type = "op_mac_%dbx%db" % (bw1, bw2) - weight_param_type = "param_weight_%db" % (weight_bits) - weight_count = k_h * k_w * fm - ret_dict = {mac_op_type: mac_count, weight_param_type: weight_count} - if self.get_nodeattr("noActivation") == 0: - tdt = DataType[self.get_nodeattr("accDataType")] - thres_bits = tdt.bitwidth() - thres_param_type = "param_threshold_%db" % (thres_bits) - thres_count = fm - ret_dict[thres_param_type] = thres_count - return ret_dict - - def derive_characteristic_fxns(self, period): - n_inps = np.prod(self.get_folded_input_shape()[:-1]) - io_dict = { - "inputs": { - "in0": [0 for i in range(n_inps)], - }, - "outputs": {"out": []}, - } - mem_mode = self.get_nodeattr("mem_mode") - if mem_mode in ["decoupled", "external"]: - n_weight_inps = self.calc_wmem() - num_w_reps = np.prod(self.get_nodeattr("numInputVectors")) - io_dict["inputs"]["weights"] = [ - 0 
for i in range(num_w_reps * n_weight_inps) - ] - super().derive_characteristic_fxns(period, override_rtlsim_dict=io_dict) diff --git a/src/finn/qnn-data/build_dataflow/build.py b/src/finn/qnn-data/build_dataflow/build.py index 0d9d55a086..58d566a6e6 100644 --- a/src/finn/qnn-data/build_dataflow/build.py +++ b/src/finn/qnn-data/build_dataflow/build.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020 Xilinx, Inc. +# Copyright (C) 2020-2022 Xilinx, Inc. +# Copyright (C) 2022-2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -43,6 +44,7 @@ mvau_wwidth_max=10000, # can specify detailed folding/FIFO/etc config with: # folding_config_file="folding_config.json", + specialize_layers_config_file="specialize_layers_config.json", synth_clk_period_ns=10.0, board=platform_name, shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, diff --git a/src/finn/qnn-data/build_dataflow/dataflow_build_config.json b/src/finn/qnn-data/build_dataflow/dataflow_build_config.json index a053c1a22f..8165055fd5 100644 --- a/src/finn/qnn-data/build_dataflow/dataflow_build_config.json +++ b/src/finn/qnn-data/build_dataflow/dataflow_build_config.json @@ -4,7 +4,8 @@ "mvau_wwidth_max": 10000, "synth_clk_period_ns": 10.0, "board": "Pynq-Z1", - "standalone_thresholds": true, + "standalone_thresholds": false, + "folding_config_file": "folding_config.json", "shell_flow_type": "vivado_zynq", "verify_save_rtlsim_waveforms": true, "force_python_rtlsim": true, diff --git a/src/finn/qnn-data/build_dataflow/folding_config.json b/src/finn/qnn-data/build_dataflow/folding_config.json index 95167f1a30..124876c3db 100644 --- a/src/finn/qnn-data/build_dataflow/folding_config.json +++ b/src/finn/qnn-data/build_dataflow/folding_config.json @@ -1,30 +1,29 @@ { "Defaults": {}, - "Thresholding_Batch_0": { - "PE": 49, - "ram_style": "distributed" + "Thresholding_rtl_0": { + "PE": 49 }, - "MatrixVectorActivation_0": { + "MVAU_hls_0": { "PE": 16, "SIMD": 
49, "ram_style": "block" }, - "MatrixVectorActivation_1": { + "MVAU_hls_1": { "PE": 8, "SIMD": 8, "ram_style": "auto" }, - "MatrixVectorActivation_2": { + "MVAU_hls_2": { "PE": 8, "SIMD": 8, "ram_style": "auto" }, - "MatrixVectorActivation_3": { + "MVAU_hls_3": { "PE": 10, "SIMD": 8, "ram_style": "distributed" }, - "LabelSelect_Batch_0": { + "LabelSelect_hls_0": { "PE": 1 } } diff --git a/src/finn/qnn-data/build_dataflow/specialize_layers_config.json b/src/finn/qnn-data/build_dataflow/specialize_layers_config.json new file mode 100644 index 0000000000..9224a72907 --- /dev/null +++ b/src/finn/qnn-data/build_dataflow/specialize_layers_config.json @@ -0,0 +1,21 @@ +{ + "Defaults": {}, + "Thresholding_0": { + "preferred_impl_style": "rtl" + }, + "MVAU_0": { + "preferred_impl_style": "hls" + }, + "MVAU_1": { + "preferred_impl_style": "" + }, + "MVAU_2": { + "preferred_impl_style": "" + }, + "MVAU_3": { + "preferred_impl_style": "" + }, + "LabelSelect_0": { + "preferred_impl_style": "hls" + } +} diff --git a/src/finn/qnn-data/cybsec-mlp/validate-unsw-nb15.py b/src/finn/qnn-data/cybsec-mlp/validate-unsw-nb15.py index be09abad9c..e0e2a75f19 100644 --- a/src/finn/qnn-data/cybsec-mlp/validate-unsw-nb15.py +++ b/src/finn/qnn-data/cybsec-mlp/validate-unsw-nb15.py @@ -57,9 +57,7 @@ def make_unsw_nb15_test_batches(bsize, dataset_root, limit_batches): help='name of bitfile (i.e. "resizer.bit")', default="../bitfile/finn-accel.bit", ) - parser.add_argument( - "--dataset_root", help="dataset root dir for download/reuse", default="."
- ) + parser.add_argument("--dataset_root", help="dataset root dir for download/reuse", default=".") parser.add_argument( "--limit_batches", help="number of batches, -1 for max", type=int, default=-1 ) @@ -72,9 +70,7 @@ def make_unsw_nb15_test_batches(bsize, dataset_root, limit_batches): limit_batches = args.limit_batches print("Loading dataset...") - (test_imgs, test_labels) = make_unsw_nb15_test_batches( - bsize, dataset_root, limit_batches - ) + (test_imgs, test_labels) = make_unsw_nb15_test_batches(bsize, dataset_root, limit_batches) ok = 0 nok = 0 diff --git a/src/finn/qnn-data/templates/driver/driver_base.py b/src/finn/qnn-data/templates/driver/driver_base.py index 2096760580..aa54f84733 100644 --- a/src/finn/qnn-data/templates/driver/driver_base.py +++ b/src/finn/qnn-data/templates/driver/driver_base.py @@ -122,7 +122,7 @@ def load_external_weights(self): w_filenames = [] if not os.path.isdir(self.runtime_weight_dir): return - for (dirpath, dirnames, filenames) in os.walk(self.runtime_weight_dir): + for dirpath, dirnames, filenames in os.walk(self.runtime_weight_dir): w_filenames.extend(filenames) tmp_weight_dict = {} @@ -173,7 +173,7 @@ def load_runtime_weights(self, flush_accel=True, verify=True): w_filenames = [] if not os.path.isdir(self.runtime_weight_dir): return - for (dirpath, dirnames, filenames) in os.walk(self.runtime_weight_dir): + for dirpath, dirnames, filenames in os.walk(self.runtime_weight_dir): w_filenames.extend(filenames) rt_weight_dict = {} for w_filename in w_filenames: @@ -182,26 +182,29 @@ def load_runtime_weights(self, flush_accel=True, verify=True): dat = f.read() else: continue - layer_w = np.fromiter( - [int(x, 16) for x in dat.strip().split()], dtype=np.uint32 - ) + layer_w = np.fromiter([int(x, 16) for x in dat.strip().split()], dtype=np.uint32) sdp_ind = int(w_filename.split("_")[0]) layer_ind = int(w_filename.split("_")[1]) rt_weight_dict[(sdp_ind, layer_ind)] = layer_w for sdp_ind, layer_ind in rt_weight_dict.keys(): - 
cand_if_name = "StreamingDataflowPartition_%d/s_axilite_%d" % ( - sdp_ind, - layer_ind, - ) + cand_if_name = "StreamingDataflowPartition_%d" % sdp_ind if cand_if_name in self.ip_dict.keys(): - layer_mmio = getattr( - getattr(self, "StreamingDataflowPartition_%d" % sdp_ind), - "s_axilite_%d" % layer_ind, - ).mmio + layer_mmio = getattr(self, "StreamingDataflowPartition_%d" % sdp_ind).mmio layer_w = rt_weight_dict[(sdp_ind, layer_ind)] layer_mmio.write_mm(0, layer_w.tobytes()) if verify: - new_w = np.copy(layer_mmio.array[: layer_w.shape[0]]) + if self.platform == "alveo": + # Pynq for Alveo uses tinynumpy under the hood. There is a bug when going + # from a tinynumpy.ndarray to numpy.ndarray. To work around this, we first + # convert the tinynumpy.ndarray to a list and then copy the list to a + # numpy.ndarray. + # There is a known bug with larger sets of weights. Accesses to address + # spaces over 16KB do NOT work as intended. Be aware of this if seeing + # unexpected behaviour. + new_array = layer_mmio.array[: layer_w.shape[0]] + new_w = np.copy(np.array(([x for x in new_array]), dtype=layer_w.dtype)) + else: + new_w = np.copy(layer_mmio.array[: layer_w.shape[0]]) assert (layer_w == new_w).all() if flush_accel: # run accelerator to flush any stale weights from weight streamer FIFOs @@ -270,12 +273,12 @@ def batch_size(self, value): self.obuf_packed = [] for i in range(self.num_inputs): new_packed_ibuf = allocate( - shape=self.ishape_packed(i), dtype=np.uint8, cacheable=cacheable + shape=self.ishape_packed(i), dtype=np.uint8, cacheable=cacheable, target=self.device ) self.ibuf_packed_device.append(new_packed_ibuf) for o in range(self.num_outputs): new_packed_obuf = allocate( - shape=self.oshape_packed(o), dtype=np.uint8, cacheable=cacheable + shape=self.oshape_packed(o), dtype=np.uint8, cacheable=cacheable, target=self.device ) self.obuf_packed_device.append(new_packed_obuf) self.obuf_packed.append(np.empty_like(new_packed_obuf)) @@ -346,9 +349,7 @@ def 
execute_on_buffers(self, asynch=False, batch_size=None): assert batch_size <= self.batch_size, "Specified batch_size is too large." if self.platform == "zynq-iodma": for o in range(self.num_outputs): - assert ( - self.odma[o].read(0x00) & 0x4 != 0 - ), "Output DMA %d is not idle" % (o) + assert self.odma[o].read(0x00) & 0x4 != 0, "Output DMA %d is not idle" % (o) # manually launch IODMAs since signatures are missing for iwdma, iwbuf, iwdma_name in self.external_weights: iwdma.write(0x10, iwbuf.device_address) @@ -364,17 +365,13 @@ def execute_on_buffers(self, asynch=False, batch_size=None): self.idma[i].write(0x00, 1) elif self.platform == "alveo": for o in range(self.num_outputs): - assert self.odma_handle[o] is None, ( - "Output DMA %d is already running" % o - ) + assert self.odma_handle[o] is None, "Output DMA %d is already running" % o for i in range(self.num_inputs): self.idma[i].start(self.ibuf_packed_device[i], batch_size) for iwdma, iwbuf, iwdma_name in self.external_weights: iwdma.start(iwbuf, batch_size) for o in range(self.num_outputs): - self.odma_handle[o] = self.odma[o].start( - self.obuf_packed_device[o], batch_size - ) + self.odma_handle[o] = self.odma[o].start(self.obuf_packed_device[o], batch_size) else: raise Exception("Unrecognized platform: %s" % self.platform) # blocking behavior depends on asynch parameter @@ -390,9 +387,7 @@ def wait_until_finished(self): while status & 0x2 == 0: status = self.odma[o].read(0x00) elif self.platform == "alveo": - assert all( - [x is not None for x in self.odma_handle] - ), "No odma_handle to wait on" + assert all([x is not None for x in self.odma_handle]), "No odma_handle to wait on" for o in range(self.num_outputs): self.odma_handle[o].wait() self.odma_handle[o] = None @@ -406,9 +401,7 @@ def execute(self, input_npy): # if single input, convert to list to normalize how we process the input if not type(input_npy) is list: input_npy = [input_npy] - assert self.num_inputs == len( - input_npy - ), "Not all 
accelerator inputs are specified." + assert self.num_inputs == len(input_npy), "Not all accelerator inputs are specified." for i in range(self.num_inputs): ibuf_folded = self.fold_input(input_npy[i], ind=i) ibuf_packed = self.pack_input(ibuf_folded, ind=i) diff --git a/src/finn/qnn-data/templates/driver/validate.py b/src/finn/qnn-data/templates/driver/validate.py index 1b29d4342c..c8bc1c009d 100644 --- a/src/finn/qnn-data/templates/driver/validate.py +++ b/src/finn/qnn-data/templates/driver/validate.py @@ -38,9 +38,7 @@ parser.add_argument( "--batchsize", help="number of samples for inference", type=int, default=100 ) - parser.add_argument( - "--dataset", help="dataset to use (mnist of cifar10)", required=True - ) + parser.add_argument("--dataset", help="dataset to use (mnist or cifar10)", required=True) parser.add_argument( "--platform", help="Target platform: zynq-iodma alveo", default="zynq-iodma" ) diff --git a/src/finn/qnn-data/test_ext_weights/tfc-w1a1-extw.json b/src/finn/qnn-data/test_ext_weights/tfc-w1a1-extw.json index 442ea72d9a..9fe22443dc 100644 --- a/src/finn/qnn-data/test_ext_weights/tfc-w1a1-extw.json +++ b/src/finn/qnn-data/test_ext_weights/tfc-w1a1-extw.json @@ -1,30 +1,29 @@ { "Defaults": {}, - "Thresholding_Batch_0": { - "PE": 49, - "ram_style": "distributed" + "Thresholding_rtl_0": { + "PE": 49 }, - "MatrixVectorActivation_0": { + "MVAU_hls_0": { "PE": 16, "SIMD": 49, "ram_style": "block" }, - "MatrixVectorActivation_1": { + "MVAU_hls_1": { "PE": 8, "SIMD": 8, "mem_mode": "external" }, - "MatrixVectorActivation_2": { + "MVAU_hls_2": { "PE": 8, "SIMD": 8, "mem_mode": "external" }, - "MatrixVectorActivation_3": { + "MVAU_hls_3": { "PE": 10, "SIMD": 8, "ram_style": "distributed" }, - "LabelSelect_Batch_0": { + "LabelSelect_hls_0": { "PE": 1 } } diff --git a/src/finn/transformation/fpgadataflow/annotate_cycles.py b/src/finn/transformation/fpgadataflow/annotate_cycles.py index 7befad7aa7..6646434bdf 100644 ---
a/src/finn/transformation/fpgadataflow/annotate_cycles.py +++ b/src/finn/transformation/fpgadataflow/annotate_cycles.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -31,7 +32,7 @@ from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation -from finn.transformation.move_reshape import _is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node class AnnotateCycles(Transformation): @@ -46,7 +47,7 @@ def apply(self, model): graph = model.graph # annotate node cycles for node in graph.node: - if _is_fpgadataflow_node(node): + if is_hls_node(node) or is_rtl_node(node): op_inst = registry.getCustomOp(node) cycles = op_inst.get_exp_cycles() op_inst.set_nodeattr("cycles_estimate", cycles) diff --git a/src/finn/transformation/fpgadataflow/annotate_resources.py b/src/finn/transformation/fpgadataflow/annotate_resources.py index 0cc4234c8c..f07a5186d5 100644 --- a/src/finn/transformation/fpgadataflow/annotate_resources.py +++ b/src/finn/transformation/fpgadataflow/annotate_resources.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -34,7 +35,7 @@ from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation from finn.analysis.fpgadataflow.post_synth_res import post_synth_res from finn.analysis.fpgadataflow.res_estimation import res_estimation -from finn.transformation.move_reshape import _is_fpgadataflow_node +from finn.util.fpgadataflow import is_fpgadataflow_node class AnnotateResources(Transformation): @@ -68,7 +69,7 @@ def apply(self, model): children_dict = {} # annotate node resources for node in graph.node: - if _is_fpgadataflow_node(node) and node.name in self.res_dict.keys(): + if is_fpgadataflow_node(node) and node.name in self.res_dict.keys(): op_inst = registry.getCustomOp(node) op_inst.set_nodeattr("res_" + self.mode, str(self.res_dict[node.name])) children_dict[node.name] = self.res_dict[node.name] @@ -76,9 +77,7 @@ def apply(self, model): # recurse into model to manually annotate per-layer resources sdp_model_filename = getCustomOp(node).get_nodeattr("model") sdp_model = ModelWrapper(sdp_model_filename) - sdp_model = sdp_model.transform( - AnnotateResources(self.mode, self.res_dict) - ) + sdp_model = sdp_model.transform(AnnotateResources(self.mode, self.res_dict)) sdp_dict = sdp_model.get_metadata_prop("res_total_" + self.mode) sdp_dict = eval(sdp_dict) # save transformed model diff --git a/src/finn/transformation/fpgadataflow/cleanup.py b/src/finn/transformation/fpgadataflow/cleanup.py index 1d0efaf4bb..907b65eb9d 100644 --- a/src/finn/transformation/fpgadataflow/cleanup.py +++ b/src/finn/transformation/fpgadataflow/cleanup.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -53,7 +54,7 @@ def apply(self, model): model.set_metadata_prop("vivado_stitch_proj", "") for node in model.graph.node: op_type = node.op_type - if is_fpgadataflow_node(node) is True: + if is_fpgadataflow_node(node): try: # lookup op_type in registry of CustomOps inst = registry.getCustomOp(node) @@ -79,7 +80,5 @@ def apply(self, model): except KeyError: # exception if op_type is not supported - raise Exception( - "Custom op_type %s is currently not supported." % op_type - ) + raise Exception("Custom op_type %s is currently not supported." % op_type) return (model, False) diff --git a/src/finn/transformation/fpgadataflow/compile_cppsim.py b/src/finn/transformation/fpgadataflow/compile_cppsim.py index da337caa62..6190560265 100644 --- a/src/finn/transformation/fpgadataflow/compile_cppsim.py +++ b/src/finn/transformation/fpgadataflow/compile_cppsim.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -29,7 +30,7 @@ import qonnx.custom_op.registry as registry from qonnx.transformation.base import NodeLocalTransformation -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node class CompileCppSim(NodeLocalTransformation): @@ -50,7 +51,7 @@ def __init__(self, num_workers=None): def applyNodeLocal(self, node): op_type = node.op_type - if is_fpgadataflow_node(node) is True: + if is_hls_node(node): try: # lookup op_type in registry of CustomOps inst = registry.getCustomOp(node) @@ -70,7 +71,5 @@ def applyNodeLocal(self, node): in node attribute "executable_path".""" except KeyError: # exception if op_type is not supported - raise Exception( - "Custom op_type %s is currently not supported." % op_type - ) + raise Exception("Custom op_type %s is currently not supported." 
% op_type) return (node, False) diff --git a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py similarity index 83% rename from src/finn/transformation/fpgadataflow/convert_to_hls_layers.py rename to src/finn/transformation/fpgadataflow/convert_to_hw_layers.py index 7b8a1bf6b8..897d714bf8 100644 --- a/src/finn/transformation/fpgadataflow/convert_to_hls_layers.py +++ b/src/finn/transformation/fpgadataflow/convert_to_hw_layers.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2023-2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -40,17 +40,12 @@ from qonnx.util.basic import get_by_name from qonnx.util.onnx import nchw_to_nhwc -from finn.transformation.fpgadataflow.minimize_accumulator_width import ( - MinimizeAccumulatorWidth, -) - class InferConvInpGen(Transformation): """Convert Im2Col layers to ConvolutionInputGenerator layers.""" - def __init__(self, use_rtl_variant=False): + def __init__(self): super().__init__() - self.use_rtl_variant = use_rtl_variant def apply(self, model): graph = model.graph @@ -65,9 +60,7 @@ def apply(self, model): i2c_out_shape = model.get_tensor_shape(i2c_output) dt = model.get_tensor_datatype(i2c_input) if not dt.is_integer(): - warnings.warn( - "%s : Input is not int. Can't infer ConvInpGen." % n.name - ) + warnings.warn("%s : Input is not int. Can't infer ConvInpGen." 
% n.name) continue i2c_inst = getCustomOp(n) stride_h, stride_w = i2c_inst.get_nodeattr("stride") @@ -76,7 +69,6 @@ def apply(self, model): pad_h = pad_attr[0] + pad_attr[2] pad_w = pad_attr[1] + pad_attr[3] dilation_h, dilation_w = i2c_inst.get_nodeattr("dilations") - # temporary checks until non-square conv support is finalized pad_val = i2c_inst.get_nodeattr("pad_value") depthwise = i2c_inst.get_nodeattr("depthwise") ifm_ch = i2c_in_shape[-1] @@ -92,12 +84,8 @@ def apply(self, model): ConvInpGen_idim_w = ifm_dim_w if pad_h > 0 or pad_w > 0: - # if padding enabled, ensure pad_val supported by DataType - # assert dt.allowed(pad_val),"""FMPadding_Batch DataType - # must support pad_val""" assert pad_val == 0, ( - "%s : FMPadding_Batch doesn't currently support pad_val!= 0" - % n.name + "%s : FMPadding_Batch doesn't currently support pad_val!= 0" % n.name ) odim_padding_h = ifm_dim_h + pad_h @@ -117,12 +105,8 @@ def apply(self, model): ConvInpGen_idim_h = odim_padding_h ConvInpGen_idim_w = odim_padding_w - padding_optype = ( - "FMPadding_rtl" if self.use_rtl_variant else "FMPadding_Batch" - ) - padding_node = helper.make_node( - padding_optype, + "FMPadding", [i2c_input], [padding_out], domain="finn.custom_op.fpgadataflow", @@ -138,27 +122,37 @@ def apply(self, model): is_kernel_pointwise = k_h == 1 and k_w == 1 is_square_image = ConvInpGen_idim_h == ConvInpGen_idim_w - is_square_kernel = k_h == k_w is_equal_stride = stride_h == stride_w - is_1d_convolution = (k_h == 1 and k_w > 1 and ifm_dim_h == 1) or ( - k_h > 1 and k_w == 1 and ifm_dim_w == 1 - ) - - # Ensure that RTL variant is not inserted for unsupported configuration - is_rtl_variant_compatible = True - if is_kernel_pointwise: - is_rtl_variant_compatible = False - if self.use_rtl_variant: - warnings.warn( - """%s : RTL ConvInpGen requested for unsupported - configuration. 
Falling back to HLS implementation.""" - % n.name - ) - - if self.use_rtl_variant and is_rtl_variant_compatible: + is_1D = (ifm_dim_h == 1) or (ifm_dim_w == 1) + if (stride_h > 1 or stride_w > 1) and is_kernel_pointwise: + downsample_1D = is_1D + is1D_unitx = ifm_dim_w == 1 + downsample_2D = (not downsample_1D) and is_square_image and is_equal_stride + if not (downsample_1D or downsample_2D): + warnings.warn(f"Couldn't infer Downsample from {n.name},check config.") + continue + ConvInpGen_idim = max(ConvInpGen_idim_h, ConvInpGen_idim_w) + stride = max(stride_h, stride_w) + # create DownSampler node + ConvInpGen_node = helper.make_node( + "DownSampler", + [ConvInpGen_input], + [i2c_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + ImgDim=ConvInpGen_idim, + NumChannels=ifm_ch, + SIMD=ifm_ch, + Stride=stride, + inputDataType=dt.name, + name="DownSampler_" + n.name, + is1D=downsample_1D, + is1D_unitx=is1D_unitx, + ) + else: ConvInpGen_node = helper.make_node( - "ConvolutionInputGenerator_rtl", + "ConvolutionInputGenerator", [ConvInpGen_input], [i2c_output], domain="finn.custom_op.fpgadataflow", @@ -168,112 +162,15 @@ def apply(self, model): IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w], OFMDim=[ofm_dim_h, ofm_dim_w], SIMD=ifm_ch, - M=1, - parallel_window=0, Stride=[stride_h, stride_w], Dilation=[dilation_h, dilation_w], inputDataType=dt.name, outputDataType=dt.name, depthwise=depthwise, - name="ConvolutionInputGenerator_rtl_" + n.name, + is1D=is_1D, + name="ConvolutionInputGenerator_" + n.name, ) - graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) - else: - # Ensure that only supported HLS nodes are inserted - if (stride_h > 1 or stride_w > 1) and is_kernel_pointwise: - downsample_1D = (ifm_dim_h == 1) or (ifm_dim_w == 1) - is1D_unitx = ifm_dim_w == 1 - downsample_2D = ( - (not downsample_1D) and is_square_image and is_equal_stride - ) - if not (downsample_1D or downsample_2D): - warnings.warn( - f"Couldn't infer Downsample from 
{n.name},check config." - ) - continue - ConvInpGen_idim = max(ConvInpGen_idim_h, ConvInpGen_idim_w) - stride = max(stride_h, stride_w) - # create DownSampler node - ConvInpGen_node = helper.make_node( - "DownSampler", - [ConvInpGen_input], - [i2c_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - ImgDim=ConvInpGen_idim, - NumChannels=ifm_ch, - SIMD=ifm_ch, - Stride=stride, - inputDataType=dt.name, - name="DownSampler_" + n.name, - is1D=downsample_1D, - is1D_unitx=is1D_unitx, - ) - graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) - else: - # create equivalent ConvolutionInputGenerator node - if ( - is_square_image and is_square_kernel - ): # square images and square kernels - assert is_equal_stride, ( - """%s: Non-equal strides along different axes is not supported - for (non-)square convolutions""" - % n.name - ) - assert dilation_h == 1 and dilation_w == 1, ( - """%s: Dilation value != 1 is not supported - for square convolutions""" - % n.name - ) - ConvInpGen_node = helper.make_node( - "ConvolutionInputGenerator", - [ConvInpGen_input], - [i2c_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - ConvKernelDim=[k_h, k_w], - IFMChannels=ifm_ch, - IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w], - OFMDim=[ofm_dim_h, ofm_dim_w], - SIMD=ifm_ch, - Stride=[stride_h, stride_w], - Dilation=[dilation_h, dilation_w], - inputDataType=dt.name, - outputDataType=dt.name, - depthwise=depthwise, - name="ConvolutionInputGenerator_" + n.name, - ) - else: # 1D images and/or kernels - assert is_1d_convolution, ( - """%s: ConvolutionInputGenerator1D works only - for 1D convs""" - % n.name - ) - if dilation_h > 1 or dilation_w > 1: - assert depthwise == 1, ( - """%s: Dilation value > 1 is only supported for - 1D depthwise separable convolutions""" - % n.name - ) - ConvInpGen_node = helper.make_node( - "ConvolutionInputGenerator1D", - [ConvInpGen_input], - [i2c_output], - domain="finn.custom_op.fpgadataflow", - 
backend="fpgadataflow", - ConvKernelDim=[k_h, k_w], - IFMChannels=ifm_ch, - IFMDim=[ConvInpGen_idim_h, ConvInpGen_idim_w], - OFMDim=[ofm_dim_h, ofm_dim_w], - SIMD=ifm_ch, - Stride=[stride_h, stride_w], - Dilation=[dilation_h, dilation_w], - inputDataType=dt.name, - outputDataType=dt.name, - depthwise=depthwise, - name="ConvolutionInputGenerator1D_" + n.name, - ) - graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) + graph.node.insert(ConvInpGen_node_idx, ConvInpGen_node) # remove old nodes graph.node.remove(n) graph_modified = True @@ -283,10 +180,96 @@ def apply(self, model): return (model, graph_modified) +class InferThresholdingLayer(Transformation): + """Convert any MultiThreshold into a standalone thresholding HLS layer.""" + + def __init__(self): + super().__init__() + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for node in graph.node: + node_ind += 1 + if node.op_type == "MultiThreshold": + thl_input = node.input[0] + thl_threshold = node.input[1] + thl_output = node.output[0] + thl_in_shape = model.get_tensor_shape(thl_input) + thl_thres_shape = model.get_tensor_shape(thl_threshold) + idt = model.get_tensor_datatype(thl_input) + + # skip conversion for layers with float input + if not idt.is_integer(): + continue + + # check layout of inputs/outputs, and convert if needed + # check layout and convert if necessary + thl_in_layout = model.get_tensor_layout(thl_input) + if thl_in_layout == DataLayout.NCHW: + thl_input = nchw_to_nhwc(thl_input, model, node_ind) + node_ind += 1 + thl_in_shape = model.get_tensor_shape(thl_input) + + # keep track of where we need to insert the HLS Op + # it has to be ahead of the output transform + insert_point = node_ind + thl_output_layout = model.get_tensor_layout(thl_output) + if thl_output_layout == DataLayout.NCHW: + thl_output = nchw_to_nhwc(thl_output, model, node_ind, reverse=True) + node_ind += 1 + + # now safe to assume number of channels is in last dimension + ifc = 
int(thl_in_shape[-1]) + # create node with no parallelization first + pe = 1 + + odt = model.get_tensor_datatype(thl_output) + scale = getCustomOp(node).get_nodeattr("out_scale") + assert scale == 1.0, ( + node.name + ": MultiThreshold out_scale must be 1 for HLS conversion." + ) + actval = getCustomOp(node).get_nodeattr("out_bias") + assert int(actval) == actval, ( + node.name + ": MultiThreshold out_bias must be integer for HLS conversion." + ) + actval = int(actval) + + # a signed activation should always have a negative bias, + # but BIPOLAR uses the -1 as 0 encoding so the assert does not apply + if odt != DataType["BIPOLAR"]: + assert (not odt.signed()) or (actval < 0), ( + node.name + ": Signed output requires actval < 0" + ) + + new_node = helper.make_node( + "Thresholding", + [thl_input, thl_threshold], + [thl_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + NumChannels=ifc, + PE=pe, + numSteps=thl_thres_shape[1], + inputDataType=idt.name, + weightDataType=idt.name, + outputDataType=odt.name, + numInputVectors=list(thl_in_shape[:-1]), + ActVal=actval, + name="Thresholding_" + node.name, + ) + + graph.node.insert(insert_point, new_node) + # remove old node + graph.node.remove(node) + graph_modified = True + + return (model, graph_modified) + + class InferUpsample(Transformation): - """ - Convert Upsample and Resize nodes to layers to UpsampleNearestNeighbour_Batch nodes. - """ + """Convert Upsample and Resize nodes to layers to UpsampleNearestNeighbour nodes.""" def apply(self, model): graph = model.graph @@ -306,15 +289,13 @@ def apply(self, model): dt = model.get_tensor_datatype(n.input[0]) if not dt.is_integer(): warnings.warn( - "%s: Input not int. Can't infer UpsampleNearestNeighbour." - % n.name + "%s: Input not int. Can't infer UpsampleNearestNeighbour." % n.name ) continue if model.get_tensor_layout(n.input[0]) != DataLayout.NHWC: warnings.warn( - "%s: Input not NHWC. Can't infer UpsampleNearestNeighbour." 
- % n.name + "%s: Input not NHWC. Can't infer UpsampleNearestNeighbour." % n.name ) continue @@ -335,8 +316,7 @@ def apply(self, model): is_scale_square_2d = scales[1] == scales[2] is_scale_1d = scales[1] > 1 and scales[2] == 1 assert is_scale_square_2d or is_scale_1d, ( - "%s: Upsampling only supported for 1D H, or 2D square scaling" - % n.name + "%s: Upsampling only supported for 1D H, or 2D square scaling" % n.name ) assert scales[0] == scales[3] == 1, ( n.name + ": Upsampling is only supported for scales with " @@ -350,11 +330,10 @@ def apply(self, model): is_shape_1d = in_shape[1] > 1 and in_shape[2] == 1 assert is_shape_square_2d or is_shape_1d, ( - "%s: Upsampling is only supported for 1D H or 2D square inputs." - % n.name + "%s: Upsampling is only supported for 1D H or 2D square inputs." % n.name ) - # Extract information for HLS node + # Extract information for HW node IFMDim = in_shape[1] OFMDim = int(round(in_shape[1] * spatial_scale)) NumChannels = in_shape[-1] @@ -362,9 +341,9 @@ def apply(self, model): inputDataType = dt.name dim_mode = 0 if is_shape_square_2d else 1 - # Insert the HLSCustomOp node - Upsample_HLS_node = helper.make_node( - "UpsampleNearestNeighbour_Batch", + # Insert the HWCustomOp node + Upsample_HW_node = helper.make_node( + "UpsampleNearestNeighbour", [n.input[0]], [n.output[0]], domain="finn.custom_op.fpgadataflow", @@ -375,11 +354,11 @@ def apply(self, model): inputDataType=inputDataType, numInputVectors=numInputVectors, DimMode=dim_mode, - name="UpsampleNearestNeighbour_Batch_" + n.name, + name="UpsampleNearestNeighbour_" + n.name, ) # Remove the old node - graph.node.insert(node_ind, Upsample_HLS_node) + graph.node.insert(node_ind, Upsample_HW_node) # remove old nodes graph.node.remove(n) graph_modified = True @@ -387,7 +366,7 @@ def apply(self, model): class InferStreamingMaxPool(Transformation): - """Convert MaxPoolNHWC layers to StreamingMaxPool layers.""" + """Convert MaxPoolNHWC layers to StreamingMaxPool HW layers.""" def 
apply(self, model): graph = model.graph @@ -399,10 +378,15 @@ def apply(self, model): mp_input = node.input[0] mp_output = node.output[0] mp_in_shape = model.get_tensor_shape(mp_input) - # mp_out_shape = model.get_tensor_shape(mp_output) dt = model.get_tensor_datatype(mp_input) mp_inst = getCustomOp(node) k_h, k_w = mp_inst.get_nodeattr("kernel_shape") + s_h, s_w = mp_inst.get_nodeattr("strides") + if k_h != s_h or k_w != s_w: + warn_str = """Stride is not equal to kernel. Node cannot be converted to + StreamingMaxPool layer.""" + warnings.warn(warn_str) + continue ifm_ch = mp_in_shape[-1] ifm_dim_h = mp_in_shape[1] ifm_dim_w = mp_in_shape[2] @@ -414,9 +398,9 @@ def apply(self, model): pass_1d = is_1d and (not is_bipolar) pass_2d = (not is_1d) and is_divisable if pass_1d or pass_2d: - # create equivalent StreamingMaxPool_Batch node + # create equivalent StreamingMaxPool node new_node = helper.make_node( - "StreamingMaxPool_Batch", + "StreamingMaxPool", [mp_input], [mp_output], domain="finn.custom_op.fpgadataflow", @@ -427,24 +411,22 @@ def apply(self, model): dataType=dt.name, PE=pe, CeilMode=ceil_mode, - name="StreamingMaxPool_Batch_" + node.name, + name="StreamingMaxPool_" + node.name, ) graph.node.insert(node_ind, new_node) # remove old nodes graph.node.remove(node) graph_modified = True else: - warnings.warn(node.name + ": could not convert to HLS") + warnings.warn(node.name + ": could not convert to HW") if graph_modified: model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) return (model, graph_modified) -class InferPool_Batch(Transformation): - """If kernel_shape > strides, replace Pool layer with with of Im2col - + pool(with kernel_shape == strides), plus Transpose layers to keep the original - data layout.""" +class InferAddStreamsLayer(Transformation): + """Convert any Add into a AddStreams HW layer.""" def apply(self, model): graph = model.graph @@ -452,710 +434,376 @@ def apply(self, model): graph_modified = False for node in 
graph.node: node_ind += 1 - if node.op_type in ["MaxPool", "QuantAvgPool2d", "MaxPoolNHWC"]: - node_input = node.input[0] - ishape = model.get_tensor_shape(node_input) - node_output = node.output[0] - idt = model.get_tensor_datatype(node_input) - oshape = model.get_tensor_shape(node_output) - # only support 4D input tensors (1D convs need extra dummy dim) - if len(ishape) != 4: + if node.op_type == "Add": + in0 = node.input[0] + in1 = node.input[1] + result = node.output[0] + in0_shape = model.get_tensor_shape(in0) + in1_shape = model.get_tensor_shape(in1) + in0_static = not (model.get_initializer(in0) is None) + in1_static = not (model.get_initializer(in1) is None) + + # skip if different shapes on inputs + if in0_shape != in1_shape: + continue + # skip if any of inputs have initializers + # (this node is meant for adding two dynamic streams) + if in0_static or in1_static: continue - # extract pool parameters - if node.op_type == "MaxPool": - kh, kw = list(get_by_name(node.attribute, "kernel_shape").ints) - sh, sw = list(get_by_name(node.attribute, "strides").ints) - dlayout = "NCHW" - elif node.op_type == "QuantAvgPool2d": - inst = getCustomOp(node) - # QuantAvgPool2d has a single scalar attribute - # for kernel size and stride (implicit square) - kh = kw = inst.get_nodeattr("kernel") - sh = sw = inst.get_nodeattr("stride") - dlayout = inst.get_nodeattr("data_layout") - elif node.op_type == "MaxPoolNHWC": - inst = getCustomOp(node) - kh, kw = inst.get_nodeattr("kernel_shape") - sh, sw = inst.get_nodeattr("strides") - dlayout = "NHWC" - try: - pad = list(get_by_name(node.attribute, "pads").ints) - except AttributeError: - pad = [0, 0, 0, 0] + idt0 = model.get_tensor_datatype(in0) + idt1 = model.get_tensor_datatype(in1) - if not idt.is_integer(): + # skip if different data types on inputs + if idt0 != idt1: continue - if (kh < sh) or (kw < sw): - # TODO check/implement swg support + idt = idt0 + + # skip conversion for layers with float input + if not 
idt.is_integer(): continue - odt = model.get_tensor_datatype(node_output) + # check layout and convert if necessary + in0_layout = model.get_tensor_layout(in0) + in1_layout = model.get_tensor_layout(in1) + result_layout = model.get_tensor_layout(result) - if dlayout == "NCHW": - _, ifm_ch, ifm_h, ifm_w = ishape - _, ofm_ch, ofm_h, ofm_w = oshape - elif dlayout == "NHWC": - _, ifm_h, ifm_w, ifm_ch = ishape - _, ofm_h, ofm_w, ofm_ch = oshape - else: - raise Exception("Unknown dlayout: " + str(dlayout)) + if in0_layout == DataLayout.NCHW: + in0 = nchw_to_nhwc(in0, model, node_ind) + node_ind += 1 + in0_shape = model.get_tensor_shape(in0) - # if data layout NCHW, we need transpose nodes surrounding - # the hls layer - if dlayout == "NCHW": - # create new intermediate values - inp_trans_out = helper.make_tensor_value_info( - model.make_new_valueinfo_name(), - TensorProto.FLOAT, - (1, ifm_h, ifm_w, ifm_ch), # NHWC - ) - graph.value_info.append(inp_trans_out) - inp_trans_out = inp_trans_out.name - model.set_tensor_datatype(inp_trans_out, idt) + if in1_layout == DataLayout.NCHW: + in1 = nchw_to_nhwc(in1, model, node_ind) + node_ind += 1 + in1_shape = model.get_tensor_shape(in1) - pool_output = helper.make_tensor_value_info( - model.make_new_valueinfo_name(), - TensorProto.FLOAT, - (1, ofm_h, ofm_w, ofm_ch), - ) - graph.value_info.append(pool_output) - pool_output = pool_output.name - # model.set_tensor_datatype(pool_output, odt) + # keep track of where we need to insert the HW Op + # it has to be ahead of the output transform + insert_point = node_ind - im2col_out = helper.make_tensor_value_info( - model.make_new_valueinfo_name(), - TensorProto.FLOAT, - (1, ofm_h, ofm_w, ifm_ch * kh * kw), + if result_layout == DataLayout.NCHW: + result = nchw_to_nhwc(result, model, node_ind, reverse=True) + node_ind += 1 + + # now safe to assume num_channels is size of last dimension + num_channels = int(in0_shape[-1]) + # create node with no parallelization first + pe = 1 + + # create 
and insert new AddStreams node + new_node = helper.make_node( + "AddStreams", + [in0, in1], + [result], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + NumChannels=num_channels, + PE=pe, + inputDataType=idt.name, + numInputVectors=in0_shape[:-1], + name="AddStreams_" + node.name, ) - graph.value_info.append(im2col_out) - im2col_out = im2col_out.name - model.set_tensor_datatype(im2col_out, idt) + graph.node.insert(insert_point, new_node) + # remove old node + graph.node.remove(node) + graph_modified = True - # create new nodes - if dlayout == "NCHW": - # NCHW -> NHWC - inp_trans_node = helper.make_node( - "Transpose", [node_input], [inp_trans_out], perm=[0, 2, 3, 1] - ) - im2col_in = inp_trans_out - else: - im2col_in = node_input - pool_output = node_output + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) - accum_bits = 0 - pool_size_param = 0 # will be overridden if neededs - pad_value = 0 - if node.op_type in ["MaxPool", "MaxPoolNHWC"]: - pool_fxn = "MaxPool" - odt = idt - pad_value = idt.min() - elif node.op_type == "QuantAvgPool2d": - assert odt.is_integer(), """Output data type for QuantAvgPool2d - needs to be integer""" - assert all( - x == 0 for x in pad - ), "Padding is not supported for QuantAvgPool2d" - inst = getCustomOp(node) - pool_fxn = "QuantAvgPool" - pool_size_param = inst.get_shifts() - accum_bits = inst.get_accum_size() - else: - raise Exception( - "pad_value and pool_fxn not configured for {}".format( - node.op_type - ) +class InferDuplicateStreamsLayer(Transformation): + """Insert a DuplicateStreams HW layer for any tensor with fanout == 2""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for node in graph.node: + node_ind += 1 + successors = model.find_consumers(node.output[0]) + if successors is not None and len(successors) >= 2: + output_tensor = node.output[0] + n_outputs = len(successors) 
+ + dt = model.get_tensor_datatype(output_tensor) + + # skip conversion for layers with float input + if not dt.is_integer(): + continue + + # create clone tensors + out_shape = model.get_tensor_shape(output_tensor) + out_tensor_clones = [] + for i in range(n_outputs): + clone = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape ) + model.graph.value_info.append(clone) + out_tensor_clones += [clone.name] - # format input tensor - im2col_node = helper.make_node( - "Im2Col", - [im2col_in], - [im2col_out], - domain="qonnx.custom_op.general", - stride=[sh, sw], - kernel_size=[kh, kw], - pad_amount=pad, - pad_value=pad_value, - depthwise=1, - input_shape="(1,{},{},{})".format(ifm_h, ifm_w, ifm_ch), - name="Im2Col_" + node.name, - ) + num_ch = int(out_shape[-1]) + vecs = out_shape[:-1] - # Warning PE has to be equal to ifm_ch until Im2Col is replaced by - # ConvolutionInputGenerator with depthwise=1. - # For other settings the output will be incorrect due to incorrect input - # data layout - pool_node = helper.make_node( - "Pool_Batch", - [im2col_out], - [pool_output], + # create node with no parallelization first + pe = 1 + + dup_node = helper.make_node( + "DuplicateStreams", + [output_tensor], + out_tensor_clones, domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", - InputDataType=idt.name, - OutputDataType=odt.name, - Channels=ifm_ch, - PE=ifm_ch, - KernelSize=[kh, kw], - Function=pool_fxn, - OutImgDims=[ofm_h, ofm_w], - AccumBits=accum_bits, - Size=pool_size_param, - BatchSize=1, - name="Pool_Batch_" + node.name, + NumChannels=num_ch, + PE=pe, + inputDataType=dt.name, + numInputVectors=vecs, + NumOutputStreams=n_outputs, + outFIFODepths=[2] * n_outputs, + name="DuplicateStreams_" + node.name, ) - if dlayout == "NCHW": - # NHWC -> NCHW - out_trans_node = helper.make_node( - "Transpose", [pool_output], [node_output], perm=[0, 3, 1, 2] - ) + graph.node.insert(node_ind, dup_node) + + # connect successors to out tensor 
clone + clone_idx = 0 + for successor in successors: + for i, succ_input in enumerate(successor.input): + if succ_input == output_tensor: + successor.input[i] = out_tensor_clones[clone_idx] + clone_idx += 1 + # if one node has multiple connections to the same output + # find_direct_successors will return one node per input + # so break the inner loop will result in correct behaviour + break - # insert nodes where the conv is to preserve topological ordering - if dlayout == "NCHW": - graph.node.insert(node_ind, inp_trans_node) - graph.node.insert(node_ind + 1, im2col_node) - graph.node.insert(node_ind + 2, pool_node) - graph.node.insert(node_ind + 3, out_trans_node) - else: - graph.node.insert(node_ind, im2col_node) - graph.node.insert(node_ind + 1, pool_node) - # remove old node - graph.node.remove(node) graph_modified = True if graph_modified: + model = model.transform(SortGraph()) model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) return (model, graph_modified) -class InferBinaryMatrixVectorActivation(Transformation): - """Convert XnorPopcountMatMul layers to - MatrixVectorActivation layers. Any immediately following MultiThreshold - layers will also be absorbed into the MVTU.""" +class InferChannelwiseLinearLayer(Transformation): + """Convert any channel-wise Add/Mul into a HW layer.""" - def __init__(self, mem_mode="const"): - super().__init__() - self.mem_mode = mem_mode + def get_smallest_possible(self, vals): + """Returns smallest (fewest bits) possible DataType that can represent + value. 
Prefers unsigned integers where possible.""" + vals = np.array(vals, dtype=np.float64) + for v in vals: + assert int(v) == v, "Error float value" + + for k in DataType.get_accumulator_dt_cands(): + dt = DataType[k] + + if dt in [DataType["BIPOLAR"], DataType["TERNARY"], DataType["FLOAT32"]]: + # not currently supported + continue + + if (dt.min() <= vals).all() and (vals <= dt.max()).all(): + return dt + + warnings.warn( + """InferChannelwiseLinearLayer: Output values may not be + representable with supported data types. + Setting maximum width data type available. + This will lead to errors if there are no constrains on the input + """ + ) + + if (0 <= vals).all(): + return DataType["UINT64"] + else: + return DataType["INT64"] def apply(self, model): graph = model.graph node_ind = 0 graph_modified = False - for n in graph.node: + for node in graph.node: node_ind += 1 - if n.op_type == "XnorPopcountMatMul": - mm_input = n.input[0] - mm_weight = n.input[1] - mm_output = n.output[0] - mm_in_shape = model.get_tensor_shape(mm_input) - mm_out_shape = model.get_tensor_shape(mm_output) - assert model.get_tensor_datatype(mm_input) == DataType["BINARY"], ( - n.name - + """: First - input for xnorpopcount is not set to FINN DataType BINARY.""" - ) - assert model.get_tensor_datatype(mm_weight) == DataType["BINARY"], ( - n.name - + """: Second - input (weights) for xnorpopcount is not set to FINN DataType BINARY.""" - ) - idt = DataType["BINARY"] - wdt = DataType["BINARY"] - mm_output = n.output[0] - W = model.get_initializer(mm_weight) - # extract weight shape, note that ONNX and finn-hlslib - # make different assumptions about dim order here - # ONNX assumes W has (in, out) shape - # finn-hlslib assumes W has (out, in) shape - mh = int(W.shape[1]) - mw = int(W.shape[0]) + if node.op_type == "Add" or node.op_type == "Mul": + # assuming input[0] is dynamic + ll_input = node.input[0] + ll_output = node.output[0] + ll_in_shape = model.get_tensor_shape(ll_input) + + # check if 
input 1 has an initializer + ll_const = node.input[1] + if ll_const is not None: + ll_cinit = model.get_initializer(ll_const) + if ll_cinit is None: + # input 1 is also dynamic + continue + else: + continue + + # get number of channels and channel index from input + ll_in_layout = model.get_tensor_layout(ll_input) + if ll_in_layout == DataLayout.NHWC or ll_in_layout == DataLayout.NC: + ch_index = -1 + ch = ll_in_shape[-1] + elif ll_in_layout == DataLayout.NCHW: + ch_index = 1 + ch = ll_in_shape[1] + else: + continue + + # check if the shape of initializer is compatible + ll_cinit_shape = list(ll_cinit.shape) + if np.prod(ll_cinit_shape) == 1: + warnings.warn("Broadcasting " + str(node.op_type) + "(" + node.name + ")") + ll_cinit = np.full((ch), ll_cinit.flatten()[0]) + elif np.prod(ll_cinit_shape) != ch or ll_cinit_shape[ch_index] != ch: + # parameter shape not compatible with Channelwise + continue + + # check initializer contains integers as floats + if not (ll_cinit.astype(np.int32) == ll_cinit).all(): + continue + # all initializer conditions are met + + # check inputs + idt = model.get_tensor_datatype(ll_input) + if not idt.is_integer(): + # skip conversion for layers with float input + continue + + # check layout of inputs/outputs, and convert if needed + # check layout and convert if necessary + if ll_in_layout == DataLayout.NCHW: + ll_input = nchw_to_nhwc(ll_input, model, node_ind) + node_ind += 1 + ll_in_shape = model.get_tensor_shape(ll_input) + + # keep track of where we need to insert the HW Op + # it has to be ahead of the output transform + insert_point = node_ind + ll_output_layout = model.get_tensor_layout(ll_output) + if ll_output_layout == DataLayout.NCHW: + ll_output = nchw_to_nhwc(ll_output, model, node_ind, reverse=True) + node_ind += 1 + + # get parameter data type + param_min = min(ll_cinit.flatten()) + param_max = max(ll_cinit.flatten()) + pdt = self.get_smallest_possible([param_min, param_max]) + + # set function and determine output data 
type + if node.op_type == "Add": + func = "add" + out_min = idt.min() + param_min + out_max = idt.max() + param_max + odt = self.get_smallest_possible([out_min, out_max]) + elif node.op_type == "Mul": + func = "mul" + possible_limits = [] + possible_limits += [idt.min() * param_min] + possible_limits += [idt.min() * param_max] + possible_limits += [idt.max() * param_min] + possible_limits += [idt.max() * param_max] + odt = self.get_smallest_possible(possible_limits) + + model.set_initializer(ll_const, ll_cinit.reshape(ch)) + model.set_tensor_datatype(ll_output, odt) + # create node with no parallelization first pe = 1 - simd = 1 - wmem = mw * mh // (pe * simd) - assert mw * mh == wmem * pe * simd, ( - n.name - + """: Requirement (MW * MH) divisiable by - (WMEM * PE * SIMD) is violated.""" + assert ch % pe == 0, "Requirement IFC divisable by PE is violated." + # create and insert node + new_node = helper.make_node( + "ChannelwiseOp", + [ll_input, ll_const], + [ll_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + Func=func, + NumChannels=ch, + PE=pe, + inputDataType=idt.name, + paramDataType=pdt.name, + outputDataType=odt.name, + numInputVectors=list(ll_in_shape[:-1]), + name="ChannelwiseOp_" + node.name, ) - # see if we have any following thresholds - consumer = model.find_consumer(mm_output) - if consumer is not None and consumer.op_type == "MultiThreshold": - # TODO ensure integer thresholds? - # create MVTU (i.e. 
including activation) - mt_output = consumer.output[0] - mt_out_shape = model.get_tensor_shape(mt_output) - mt_thres = consumer.input[1] - T = model.get_initializer(mt_thres) - assert T.shape[0] == 1 or T.shape[0] == mh, ( - consumer.name - + """: First dimension of - thresholds neither 1 nor MH.""" - ) - odt = model.get_tensor_datatype(mt_output) - if odt.bitwidth() == 1: - # covers both bipolar and binary - actval = 0 - else: - actval = odt.min() - model.set_tensor_shape(mm_input, mm_in_shape) - model.set_tensor_shape(mt_output, mt_out_shape) - # create and insert new MatrixVectorActivation node - new_node = helper.make_node( - "MatrixVectorActivation", - [mm_input, mm_weight, mt_thres], - [mt_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - MW=mw, - MH=mh, - SIMD=simd, - PE=pe, - inputDataType=idt.name, - weightDataType=wdt.name, - outputDataType=odt.name, - ActVal=actval, - binaryXnorMode=1, - noActivation=0, - numInputVectors=list(mm_in_shape[:-1]), - mem_mode=self.mem_mode, - name=n.name, - ) - graph.node.insert(node_ind, new_node) - # remove old nodes - graph.node.remove(n) - graph.node.remove(consumer) - graph_modified = True - else: - # no activation, matmul only - odt = model.get_tensor_datatype(mm_output) - model.set_tensor_shape(mm_input, mm_in_shape) - model.set_tensor_shape(mm_output, mm_out_shape) - # create and insert new MatrixVectorActivation node - new_node = helper.make_node( - "MatrixVectorActivation", - [mm_input, mm_weight], - [mm_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - MW=mw, - MH=mh, - SIMD=simd, - PE=pe, - inputDataType=idt.name, - weightDataType=wdt.name, - outputDataType=odt.name, - ActVal=0, - binaryXnorMode=1, - noActivation=1, - numInputVectors=list(mm_in_shape[:-1]), - mem_mode=self.mem_mode, - name=n.name, - ) - graph.node.insert(node_ind, new_node) - # remove old node - graph.node.remove(n) - graph_modified = True + graph.node.insert(insert_point, new_node) + # remove 
old node + graph.node.remove(node) + graph_modified = True + if graph_modified: - model = model.transform(MinimizeAccumulatorWidth()) model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) return (model, graph_modified) -class InferQuantizedMatrixVectorActivation(Transformation): - """Convert MatMul layers with quantized inputs and weights to - MatrixVectorActivation layers. Any immediately following MultiThreshold - layers will also be absorbed into the MVTU.""" - - def __init__(self, mem_mode="const"): - super().__init__() - self.mem_mode = mem_mode +class InferLabelSelectLayer(Transformation): + """Convert any TopK into a LabelSelect HW layer.""" def apply(self, model): graph = model.graph node_ind = 0 graph_modified = False - for n in graph.node: + for node in graph.node: node_ind += 1 - if n.op_type == "MatMul" and model.get_tensor_sparsity(n.input[1]) is None: - mm_input = n.input[0] - mm_weight = n.input[1] - mm_output = n.output[0] - mm_in_shape = model.get_tensor_shape(mm_input) - mm_out_shape = model.get_tensor_shape(mm_output) - idt = model.get_tensor_datatype(mm_input) - wdt = model.get_tensor_datatype(mm_weight) - if idt.is_integer() and wdt.is_integer(): - mm_output = n.output[0] - W = model.get_initializer(mm_weight) - # extract weight shape, note that ONNX and finn-hlslib - # make different assumptions about dim order here - # ONNX assumes W has (in, out) shape - # finn-hlslib assumes W has (out, in) shape - mh = int(W.shape[1]) - mw = int(W.shape[0]) - # create node with no parallelization first - pe = 1 - simd = 1 - wmem = mw * mh // (pe * simd) - assert mw * mh == wmem * pe * simd, ( - n.name - + """: Requirement (MW * MH) divisible by - (WMEM * PE * SIMD) is violated.""" - ) - # see if we have any following thresholds - consumer = model.find_consumer(mm_output) - if consumer is not None and consumer.op_type == "MultiThreshold": - # TODO ensure integer thresholds? - # create MVTU (i.e. 
including activation) - mt_output = consumer.output[0] - mt_out_shape = model.get_tensor_shape(mt_output) - mt_thres = consumer.input[1] - T = model.get_initializer(mt_thres) - assert T.shape[0] == 1 or T.shape[0] == mh, ( - consumer.name - + """: First dimension of - thresholds neither 1 nor MH.""" - ) - odt = model.get_tensor_datatype(mt_output) - scale = getCustomOp(consumer).get_nodeattr("out_scale") - actval = getCustomOp(consumer).get_nodeattr("out_bias") - assert int(actval) == actval, ( - consumer.name - + ": out_bias must be integer for HLS conversion." - ) - actval = int(actval) - odt_is_bipolar = odt == DataType["BIPOLAR"] - bipolar_ok = ( - odt_is_bipolar and (scale == 2.0) and (actval == -1) - ) - assert scale == 1.0 or bipolar_ok, ( - consumer.name - + ": out_scale=1 or bipolar output needed for conversion." - ) - assert (not odt.signed()) or (actval < 0), ( - consumer.name + ": Signed output requres actval < 0" - ) - model.set_tensor_shape(mm_input, mm_in_shape) - model.set_tensor_shape(mt_output, mt_out_shape) - if bipolar_ok: - # remove bias for bipolar, since - # binary->bipolar is achieved by reinterpretation - actval = 0 - # create and insert new MatrixVectorActivation node - new_node = helper.make_node( - "MatrixVectorActivation", - [mm_input, mm_weight, mt_thres], - [mt_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - MW=mw, - MH=mh, - SIMD=simd, - PE=pe, - inputDataType=idt.name, - weightDataType=wdt.name, - outputDataType=odt.name, - ActVal=actval, - binaryXnorMode=0, - noActivation=0, - numInputVectors=list(mm_in_shape[:-1]), - mem_mode=self.mem_mode, - name="MatrixVectorActivation_" + n.name, - ) - graph.node.insert(node_ind, new_node) - # remove old nodes - graph.node.remove(n) - graph.node.remove(consumer) - graph_modified = True - else: - # no activation, matmul only - odt = model.get_tensor_datatype(mm_output) - model.set_tensor_shape(mm_input, mm_in_shape) - model.set_tensor_shape(mm_output, mm_out_shape) - 
# create and insert new MatrixVectorActivation node - new_node = helper.make_node( - "MatrixVectorActivation", - [mm_input, mm_weight], - [mm_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - MW=mw, - MH=mh, - SIMD=simd, - PE=pe, - inputDataType=idt.name, - weightDataType=wdt.name, - outputDataType=odt.name, - ActVal=0, - binaryXnorMode=0, - noActivation=1, - numInputVectors=list(mm_in_shape[:-1]), - mem_mode=self.mem_mode, - name="MatrixVectorActivation_" + n.name, - ) - graph.node.insert(node_ind, new_node) - # remove old node - graph.node.remove(n) - graph_modified = True - if graph_modified: - model = model.transform(MinimizeAccumulatorWidth()) - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - return (model, graph_modified) + if node.op_type == "TopK": + fc_input = node.input[0] + k_input = node.input[1] + val_output = node.output[0] + idx_output = node.output[1] + fc_in_shape = model.get_tensor_shape(fc_input) + idt = model.get_tensor_datatype(fc_input) -class InferVectorVectorActivation(Transformation): - """Convert MatMul layers with quantized inputs and weights to - VectorVectorActivation layers, if the sparsity annotation - of the weight matrix indicates that the MatMul layer belongs to - a depthwise convolution. 
Any immediately following MultiThreshold - layers will also be absorbed into the VVAU.""" + # skip conversion for layers with float input + if not idt.is_integer(): + continue - def __init__(self, mem_mode="const"): - super().__init__() - self.mem_mode = mem_mode + # skip conversion for if value output is connected (not supported) + if model.find_consumer(val_output) is not None: + continue - def apply(self, model): - graph = model.graph - node_ind = 0 - graph_modified = False - for n in graph.node: - node_ind += 1 - if ( - n.op_type == "MatMul" - and model.get_tensor_sparsity(n.input[1]) is not None - ): - sparsity = model.get_tensor_sparsity(n.input[1]) - try: - k_h, k_w = sparsity["dw"]["kernel_shape"] - except KeyError: - raise Exception( - n.name - + """: sparsity annotation doesn't indicate that MatMul - belongs to a depthwise convolution.""" - ) - - mm_input = n.input[0] - mm_weight = n.input[1] - mm_output = n.output[0] - mm_in_shape = model.get_tensor_shape(mm_input) - mm_out_shape = model.get_tensor_shape(mm_output) - idt = model.get_tensor_datatype(mm_input) - wdt = model.get_tensor_datatype(mm_weight) - if idt.is_integer() and wdt.is_integer(): - mm_output = n.output[0] - W = model.get_initializer(mm_weight) - # infer dense weight tensor from sparse weight matrix - # kernel size (k_h, k_w) which was extracted above and the value of - # the channels is used. 
- # the weight matrix has a shape of (k_h * k_w * Channels, Channels) - # we need to reverse the creation of the sparse weight matrix - # to achieve a weight tensor of shape (Channels, 1, k_h, k_w) - channels = int(W.shape[1]) - # transpose to achieve a shape of (k_h * k_w * Channels, Channels) - W = W.T - # reshape to (Channels, k_h, k_w, Channels) to transpose afterwards - # to (Channels, Channels, k_h, k_w) - W = W.reshape(channels, k_h, k_w, channels) - W = W.transpose(0, 3, 1, 2) - # now we can extract the values using a for loop over the channels - # and fill a zero numpy array in the correct shape - w_tensor = np.zeros((channels, 1, k_h, k_w), dtype=np.float32) - for ch in range(channels): - w_tensor[ch][0] = W[ch][ch] - model.set_initializer(mm_weight, w_tensor) - model.set_tensor_shape(mm_weight, (channels, 1, k_h, k_w)) - # create node with pe=channels as default - pe = channels - # see if we have any following thresholds - consumer = model.find_consumer(mm_output) - if consumer is not None and consumer.op_type == "MultiThreshold": - # create VVAU (i.e. including activation) - mt_output = consumer.output[0] - mt_out_shape = model.get_tensor_shape(mt_output) - mt_thres = consumer.input[1] - T = model.get_initializer(mt_thres) - assert T.shape[0] == 1 or T.shape[0] == channels, ( - consumer.name - + """: First dimension of - thresholds neither 1 nor Channels.""" - ) - odt = model.get_tensor_datatype(mt_output) - scale = getCustomOp(consumer).get_nodeattr("out_scale") - assert scale == 1.0, ( - consumer.name - + ": out_scale must be equal to 1.0 for HLS conversion." - ) - actval = getCustomOp(consumer).get_nodeattr("out_bias") - assert int(actval) == actval, ( - consumer.name - + ": out_bias must be integer for HLS conversion." 
- ) - actval = int(actval) - assert (not odt.signed()) or (actval < 0), ( - consumer.name + ": Signed output requres actval < 0" - ) - model.set_tensor_shape(mm_input, mm_in_shape) - model.set_tensor_shape(mt_output, mt_out_shape) - # create and insert new VectorVectorActivation node - new_node = helper.make_node( - "VectorVectorActivation", - [mm_input, mm_weight, mt_thres], - [mt_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - resType="lut", - PE=pe, - Dim=[mm_in_shape[1], mm_in_shape[2]], - Channels=channels, - Kernel=[k_h, k_w], - inputDataType=idt.name, - weightDataType=wdt.name, - outputDataType=odt.name, - ActVal=actval, - noActivation=0, - name="VectorVectorActivation_" + n.name, - mem_mode=self.mem_mode, - ) - graph.node.insert(node_ind, new_node) - # remove old nodes - graph.node.remove(n) - graph.node.remove(consumer) - graph_modified = True - else: - # no activation, matmul only - odt = model.get_tensor_datatype(mm_output) - model.set_tensor_shape(mm_input, mm_in_shape) - model.set_tensor_shape(mm_output, mm_out_shape) - # create and insert new VVAU node - new_node = helper.make_node( - "VectorVectorActivation", - [mm_input, mm_weight], - [mm_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - resType="lut", - PE=pe, - Dim=[mm_in_shape[1], mm_in_shape[2]], - Channels=channels, - Kernel=[k_h, k_w], - inputDataType=idt.name, - weightDataType=wdt.name, - outputDataType=odt.name, - ActVal=0, - noActivation=1, - name="VectorVectorActivation_" + n.name, - ) - graph.node.insert(node_ind, new_node) - # remove old node - graph.node.remove(n) - graph_modified = True - if graph_modified: - model = model.transform(MinimizeAccumulatorWidth()) - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - return (model, graph_modified) - - -class InferThresholdingLayer(Transformation): - """Convert any MultiThreshold into a standalone thresholding HLS layer.""" - - def __init__(self, 
mem_mode="const"): - super().__init__() - self.mem_mode = mem_mode - - def apply(self, model): - graph = model.graph - node_ind = 0 - graph_modified = False - for node in graph.node: - node_ind += 1 - if node.op_type == "MultiThreshold": - thl_input = node.input[0] - thl_threshold = node.input[1] - thl_output = node.output[0] - thl_in_shape = model.get_tensor_shape(thl_input) - thl_thres_shape = model.get_tensor_shape(thl_threshold) - idt = model.get_tensor_datatype(thl_input) - - # skip conversion for layers with float input - if not idt.is_integer(): - continue - - # check layout of inputs/outputs, and convert if needed - # check layout and convert if necessary - thl_in_layout = model.get_tensor_layout(thl_input) - if thl_in_layout == DataLayout.NCHW: - thl_input = nchw_to_nhwc(thl_input, model, node_ind) - node_ind += 1 - thl_in_shape = model.get_tensor_shape(thl_input) - - # keep track of where we need to insert the HLS Op - # it has to be ahead of the output transform - insert_point = node_ind - thl_output_layout = model.get_tensor_layout(thl_output) - if thl_output_layout == DataLayout.NCHW: - thl_output = nchw_to_nhwc(thl_output, model, node_ind, reverse=True) - node_ind += 1 - - # now safe to assume number of channels is in last dimension - ifc = int(thl_in_shape[-1]) + num_labels = int(fc_in_shape[-1]) + num_inp_vecs = list(fc_in_shape[:-1]) # create node with no parallelization first pe = 1 - odt = model.get_tensor_datatype(thl_output) - scale = getCustomOp(node).get_nodeattr("out_scale") - assert scale == 1.0, ( - node.name - + ": MultiThreshold out_scale must be 1 for HLS conversion." - ) - actval = getCustomOp(node).get_nodeattr("out_bias") - assert int(actval) == actval, ( - node.name - + ": MultiThreshold out_bias must be integer for HLS conversion." 
- ) - actval = int(actval) - assert (not odt.signed()) or (actval < 0), ( - node.name + ": Signed output requres actval < 0" - ) - # create and insert new Thresholding_Batch node + k = model.get_initializer(k_input)[0] + + # create and insert new LabelSelect node new_node = helper.make_node( - "Thresholding_Batch", - [thl_input, thl_threshold], - [thl_output], + "LabelSelect", + [fc_input], + [idx_output], domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", - NumChannels=ifc, + Labels=num_labels, PE=pe, - numSteps=thl_thres_shape[1], + K=k, inputDataType=idt.name, - weightDataType=idt.name, # will be set by MinimizeAccumulatorWidth - outputDataType=odt.name, - numInputVectors=list(thl_in_shape[:-1]), - ActVal=actval, - mem_mode=self.mem_mode, - name="Thresholding_Batch_" + node.name, + numInputVectors=num_inp_vecs, + name="LabelSelect_" + node.name, ) - graph.node.insert(insert_point, new_node) + graph.node.insert(node_ind, new_node) # remove old node graph.node.remove(node) graph_modified = True if graph_modified: - model = model.transform(MinimizeAccumulatorWidth()) model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) return (model, graph_modified) -class InferAddStreamsLayer(Transformation): - """Convert any Add into a AddStreams HLS layer.""" +class InferGlobalAccPoolLayer(Transformation): + """Convert any GlobalAveragePool into a GlobalAccPool HW layer and a scalar Mul.""" def apply(self, model): graph = model.graph @@ -1163,31 +811,12 @@ def apply(self, model): graph_modified = False for node in graph.node: node_ind += 1 - if node.op_type == "Add": + if node.op_type == "GlobalAveragePool": in0 = node.input[0] - in1 = node.input[1] result = node.output[0] in0_shape = model.get_tensor_shape(in0) - in1_shape = model.get_tensor_shape(in1) - in0_static = not (model.get_initializer(in0) is None) - in1_static = not (model.get_initializer(in1) is None) - - # skip if different shapes on inputs - if in0_shape != in1_shape: - 
continue - # skip if any of inputs have initializers - # (this node is meant for adding two dynamic streams) - if in0_static or in1_static: - continue - - idt0 = model.get_tensor_datatype(in0) - idt1 = model.get_tensor_datatype(in1) - # skip if different data types on inputs - if idt0 != idt1: - continue - - idt = idt0 + idt = model.get_tensor_datatype(in0) # skip conversion for layers with float input if not idt.is_integer(): @@ -1195,7 +824,6 @@ def apply(self, model): # check layout and convert if necessary in0_layout = model.get_tensor_layout(in0) - in1_layout = model.get_tensor_layout(in1) result_layout = model.get_tensor_layout(result) if in0_layout == DataLayout.NCHW: @@ -1203,12 +831,7 @@ def apply(self, model): node_ind += 1 in0_shape = model.get_tensor_shape(in0) - if in1_layout == DataLayout.NCHW: - in1 = nchw_to_nhwc(in1, model, node_ind) - node_ind += 1 - in1_shape = model.get_tensor_shape(in1) - - # keep track of where we need to insert the HLS Op + # keep track of where we need to insert the HW Op # it has to be ahead of the output transform insert_point = node_ind @@ -1216,25 +839,48 @@ def apply(self, model): result = nchw_to_nhwc(result, model, node_ind, reverse=True) node_ind += 1 - # now safe to assume num_channels is size of last dimension - num_channels = int(in0_shape[-1]) + num_ch = int(in0_shape[-1]) + vecs = in0_shape[:-1] # create node with no parallelization first pe = 1 - # create and insert new AddStreams_Batch node - new_node = helper.make_node( - "AddStreams_Batch", - [in0, in1], - [result], + # create an additional tensor of the same shape and layout as result + out_shape = model.get_tensor_shape(result) + pool_out = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape + ) + model.graph.value_info.append(pool_out) + pool_out = pool_out.name + model.set_tensor_layout(pool_out, model.get_tensor_layout(result)) + + new_pool = helper.make_node( + "GlobalAccPool", + [in0], + [pool_out], 
domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", - NumChannels=num_channels, + NumChannels=num_ch, PE=pe, inputDataType=idt.name, - numInputVectors=in0_shape[:-1], - name="AddStreams_Batch_" + node.name, + numInputVectors=vecs, + name="GlobalAccPool_" + node.name, ) - graph.node.insert(insert_point, new_node) + + mul_value = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), TensorProto.FLOAT, [1] + ) + model.graph.value_info.append(mul_value) + model.set_initializer( + mul_value.name, np.array(1 / (vecs[1] * vecs[2]), dtype=np.float32) + ) + new_mul = helper.make_node( + "Mul", + [pool_out, mul_value.name], + [result], + ) + graph.node.insert(insert_point, new_pool) + graph.node.insert(insert_point + 1, new_mul) + node_ind += 1 # remove old node graph.node.remove(node) graph_modified = True @@ -1245,112 +891,194 @@ def apply(self, model): return (model, graph_modified) -class InferDuplicateStreamsLayer(Transformation): - """Insert a DuplicateStreams HLS layer for any tensor with fanout == 2""" - - def apply(self, model): +class InferPool(Transformation): + """If kernel_shape > strides, replace Pool layer with with of Im2col + + pool(with kernel_shape == strides), plus Transpose layers to keep the original + data layout.""" + + def apply(self, model): graph = model.graph node_ind = 0 graph_modified = False for node in graph.node: node_ind += 1 - successors = model.find_consumers(node.output[0]) - if successors is not None and len(successors) >= 2: - output_tensor = node.output[0] - n_outputs = len(successors) + if node.op_type in ["MaxPool", "QuantAvgPool2d", "MaxPoolNHWC"]: + node_input = node.input[0] + ishape = model.get_tensor_shape(node_input) + node_output = node.output[0] + idt = model.get_tensor_datatype(node_input) + oshape = model.get_tensor_shape(node_output) + # only support 4D input tensors (1D convs need extra dummy dim) + if len(ishape) != 4: + continue - dt = model.get_tensor_datatype(output_tensor) + # extract pool 
parameters + if node.op_type == "MaxPool": + kh, kw = list(get_by_name(node.attribute, "kernel_shape").ints) + sh, sw = list(get_by_name(node.attribute, "strides").ints) + dlayout = "NCHW" + elif node.op_type == "QuantAvgPool2d": + inst = getCustomOp(node) + # QuantAvgPool2d has a single scalar attribute + # for kernel size and stride (implicit square) + kh = kw = inst.get_nodeattr("kernel") + sh = sw = inst.get_nodeattr("stride") + dlayout = inst.get_nodeattr("data_layout") + elif node.op_type == "MaxPoolNHWC": + inst = getCustomOp(node) + kh, kw = inst.get_nodeattr("kernel_shape") + sh, sw = inst.get_nodeattr("strides") + dlayout = "NHWC" + try: + pad = list(get_by_name(node.attribute, "pads").ints) + except AttributeError: + pad = [0, 0, 0, 0] - # skip conversion for layers with float input - if not dt.is_integer(): + if not idt.is_integer(): continue - # create clone tensors - out_shape = model.get_tensor_shape(output_tensor) - out_tensor_clones = [] - for i in range(n_outputs): - clone = helper.make_tensor_value_info( - model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape + if (kh < sh) or (kw < sw): + # TODO check/implement swg support + continue + + odt = model.get_tensor_datatype(node_output) + + if dlayout == "NCHW": + _, ifm_ch, ifm_h, ifm_w = ishape + _, ofm_ch, ofm_h, ofm_w = oshape + elif dlayout == "NHWC": + _, ifm_h, ifm_w, ifm_ch = ishape + _, ofm_h, ofm_w, ofm_ch = oshape + else: + raise Exception("Unknown dlayout: " + str(dlayout)) + + # if data layout NCHW, we need transpose nodes surrounding + # the hw layer + if dlayout == "NCHW": + # create new intermediate values + inp_trans_out = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + (1, ifm_h, ifm_w, ifm_ch), # NHWC ) - model.graph.value_info.append(clone) - out_tensor_clones += [clone.name] + graph.value_info.append(inp_trans_out) + inp_trans_out = inp_trans_out.name + model.set_tensor_datatype(inp_trans_out, idt) - num_ch = int(out_shape[-1]) - 
vecs = out_shape[:-1] + pool_output = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + (1, ofm_h, ofm_w, ofm_ch), + ) + graph.value_info.append(pool_output) + pool_output = pool_output.name - # create node with no parallelization first - pe = 1 + im2col_out = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + (1, ofm_h, ofm_w, ifm_ch * kh * kw), + ) + graph.value_info.append(im2col_out) + im2col_out = im2col_out.name + model.set_tensor_datatype(im2col_out, idt) - dup_node = helper.make_node( - "DuplicateStreams_Batch", - [output_tensor], - out_tensor_clones, + # create new nodes + if dlayout == "NCHW": + # NCHW -> NHWC + inp_trans_node = helper.make_node( + "Transpose", [node_input], [inp_trans_out], perm=[0, 2, 3, 1] + ) + im2col_in = inp_trans_out + else: + im2col_in = node_input + pool_output = node_output + + accum_bits = 0 + pool_size_param = 0 # will be overridden if neededs + pad_value = 0 + if node.op_type in ["MaxPool", "MaxPoolNHWC"]: + pool_fxn = "MaxPool" + odt = idt + pad_value = idt.min() + elif node.op_type == "QuantAvgPool2d": + assert odt.is_integer(), """Output data type for QuantAvgPool2d + needs to be integer""" + assert all(x == 0 for x in pad), "Padding is not supported for QuantAvgPool2d" + inst = getCustomOp(node) + pool_fxn = "QuantAvgPool" + pool_size_param = inst.get_shifts() + accum_bits = inst.get_accum_size() + + else: + raise Exception( + "pad_value and pool_fxn not configured for {}".format(node.op_type) + ) + + # format input tensor + im2col_node = helper.make_node( + "Im2Col", + [im2col_in], + [im2col_out], + domain="qonnx.custom_op.general", + stride=[sh, sw], + kernel_size=[kh, kw], + pad_amount=pad, + pad_value=pad_value, + depthwise=1, + input_shape="(1,{},{},{})".format(ifm_h, ifm_w, ifm_ch), + name="Im2Col_" + node.name, + ) + + # Warning PE has to be equal to ifm_ch until Im2Col is replaced by + # ConvolutionInputGenerator with depthwise=1. 
+ # For other settings the output will be incorrect due to incorrect input + # data layout + pool_node = helper.make_node( + "Pool", + [im2col_out], + [pool_output], domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", - NumChannels=num_ch, - PE=pe, - inputDataType=dt.name, - numInputVectors=vecs, - NumOutputStreams=n_outputs, - outFIFODepths=[2] * n_outputs, - name="DuplicateStreams_Batch_" + node.name, + InputDataType=idt.name, + OutputDataType=odt.name, + Channels=ifm_ch, + PE=ifm_ch, + KernelSize=[kh, kw], + Function=pool_fxn, + OutImgDims=[ofm_h, ofm_w], + AccumBits=accum_bits, + Size=pool_size_param, + BatchSize=1, + name="Pool_" + node.name, ) - graph.node.insert(node_ind, dup_node) - - # connect successors to out tensor clone - clone_idx = 0 - for successor in successors: - for i, succ_input in enumerate(successor.input): - if succ_input == output_tensor: - successor.input[i] = out_tensor_clones[clone_idx] - clone_idx += 1 - # if one node has multiple connections to the same output - # find_direct_successors will return one node per input - # so break the inner loop will result in correct behaviour - break + if dlayout == "NCHW": + # NHWC -> NCHW + out_trans_node = helper.make_node( + "Transpose", [pool_output], [node_output], perm=[0, 3, 1, 2] + ) + # insert nodes where the conv is to preserve topological ordering + if dlayout == "NCHW": + graph.node.insert(node_ind, inp_trans_node) + graph.node.insert(node_ind + 1, im2col_node) + graph.node.insert(node_ind + 2, pool_node) + graph.node.insert(node_ind + 3, out_trans_node) + else: + graph.node.insert(node_ind, im2col_node) + graph.node.insert(node_ind + 1, pool_node) + # remove old node + graph.node.remove(node) graph_modified = True if graph_modified: - model = model.transform(SortGraph()) model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) return (model, graph_modified) -class InferChannelwiseLinearLayer(Transformation): - """Convert any channel-wise Add/Mul into a 
HLS layer.""" - - def get_smallest_possible(self, vals): - """Returns smallest (fewest bits) possible DataType that can represent - value. Prefers unsigned integers where possible.""" - vals = np.array(vals, dtype=np.float64) - for v in vals: - assert int(v) == v, "Error float value" - - for k in DataType.get_accumulator_dt_cands(): - dt = DataType[k] - - if dt in [DataType["BIPOLAR"], DataType["TERNARY"], DataType["FLOAT32"]]: - # not currently supported - continue - - if (dt.min() <= vals).all() and (vals <= dt.max()).all(): - return dt - - warnings.warn( - """InferChannelwiseLinearLayer: Output values may not be - representable with supported data types. - Setting maximum width data type available. - This will lead to errors if there are no constrains on the input - """ - ) - - if (0 <= vals).all(): - return DataType["UINT64"] - else: - return DataType["INT64"] +class InferLookupLayer(Transformation): + """Convert Gather nodes with constant op0 into Lookup HW layers.""" def apply(self, model): graph = model.graph @@ -1358,285 +1086,12 @@ def apply(self, model): graph_modified = False for node in graph.node: node_ind += 1 - if node.op_type == "Add" or node.op_type == "Mul": - # assuming input[0] is dynamic - ll_input = node.input[0] - ll_output = node.output[0] - ll_in_shape = model.get_tensor_shape(ll_input) - - # check if input 1 has an initializer - ll_const = node.input[1] - if ll_const is not None: - ll_cinit = model.get_initializer(ll_const) - if ll_cinit is None: - # input 1 is also dynamic - continue - else: - continue - - # get number of channels and channel index from input - ll_in_layout = model.get_tensor_layout(ll_input) - if ll_in_layout == DataLayout.NHWC or ll_in_layout == DataLayout.NC: - ch_index = -1 - ch = ll_in_shape[-1] - elif ll_in_layout == DataLayout.NCHW: - ch_index = 1 - ch = ll_in_shape[1] - else: - continue - - # check if the shape of initializer is compatible - ll_cinit_shape = list(ll_cinit.shape) - if np.prod(ll_cinit_shape) == 1: 
- warnings.warn( - "Broadcasting " + str(node.op_type) + "(" + node.name + ")" - ) - ll_cinit = np.full((ch), ll_cinit.flatten()[0]) - elif np.prod(ll_cinit_shape) != ch or ll_cinit_shape[ch_index] != ch: - # parameter shape not compatible with Channelwise_batch - continue - - # check initializer contains integers as floats - if not (ll_cinit.astype(np.int32) == ll_cinit).all(): - continue - # all initializer conditions are met - - # check inputs - idt = model.get_tensor_datatype(ll_input) - if not idt.is_integer(): - # skip conversion for layers with float input - continue - - # check layout of inputs/outputs, and convert if needed - # check layout and convert if necessary - if ll_in_layout == DataLayout.NCHW: - ll_input = nchw_to_nhwc(ll_input, model, node_ind) - node_ind += 1 - ll_in_shape = model.get_tensor_shape(ll_input) - - # keep track of where we need to insert the HLS Op - # it has to be ahead of the output transform - insert_point = node_ind - ll_output_layout = model.get_tensor_layout(ll_output) - if ll_output_layout == DataLayout.NCHW: - ll_output = nchw_to_nhwc(ll_output, model, node_ind, reverse=True) - node_ind += 1 - - # get parameter data type - param_min = min(ll_cinit.flatten()) - param_max = max(ll_cinit.flatten()) - pdt = self.get_smallest_possible([param_min, param_max]) - - # set function and determine output data type - if node.op_type == "Add": - func = "add" - out_min = idt.min() + param_min - out_max = idt.max() + param_max - odt = self.get_smallest_possible([out_min, out_max]) - elif node.op_type == "Mul": - func = "mul" - possible_limits = [] - possible_limits += [idt.min() * param_min] - possible_limits += [idt.min() * param_max] - possible_limits += [idt.max() * param_min] - possible_limits += [idt.max() * param_max] - odt = self.get_smallest_possible(possible_limits) - - model.set_initializer(ll_const, ll_cinit.reshape(ch)) - model.set_tensor_datatype(ll_output, odt) - - # create node with no parallelization first - pe = 1 - assert 
ch % pe == 0, "Requirement IFC divisable by PE is violated." - # create and insert node - new_node = helper.make_node( - "ChannelwiseOp_Batch", - [ll_input, ll_const], - [ll_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - Func=func, - NumChannels=ch, - PE=pe, - inputDataType=idt.name, - paramDataType=pdt.name, - outputDataType=odt.name, - numInputVectors=list(ll_in_shape[:-1]), - name="ChannelwiseOp_Batch_" + node.name, - ) - graph.node.insert(insert_point, new_node) - # remove old node - graph.node.remove(node) - graph_modified = True - - if graph_modified: - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - return (model, graph_modified) - - -class InferLabelSelectLayer(Transformation): - """Convert any TopK into a LabelSelect HLS layer.""" - - def apply(self, model): - graph = model.graph - node_ind = 0 - graph_modified = False - for node in graph.node: - node_ind += 1 - if node.op_type == "TopK": - fc_input = node.input[0] - k_input = node.input[1] - val_output = node.output[0] - idx_output = node.output[1] - fc_in_shape = model.get_tensor_shape(fc_input) - - idt = model.get_tensor_datatype(fc_input) - - # skip conversion for layers with float input - if not idt.is_integer(): - continue - - # skip conversion for if value output is connected (not supported) - if model.find_consumer(val_output) is not None: - continue - - num_labels = int(fc_in_shape[-1]) - num_inp_vecs = list(fc_in_shape[:-1]) - # create node with no parallelization first - pe = 1 - - k = model.get_initializer(k_input)[0] - - # create and insert new LabelSelect_Batch node - new_node = helper.make_node( - "LabelSelect_Batch", - [fc_input], - [idx_output], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - Labels=num_labels, - PE=pe, - K=k, - inputDataType=idt.name, - numInputVectors=num_inp_vecs, - name="LabelSelect_Batch_" + node.name, - ) - graph.node.insert(node_ind, new_node) - # remove old node - 
graph.node.remove(node) - graph_modified = True - - if graph_modified: - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - return (model, graph_modified) - - -class InferGlobalAccPoolLayer(Transformation): - """Convert any GlobalAveragePool into a GlobalAccPool HLS layer and a scalar Mul.""" - - def apply(self, model): - graph = model.graph - node_ind = 0 - graph_modified = False - for node in graph.node: - node_ind += 1 - if node.op_type == "GlobalAveragePool": - in0 = node.input[0] - result = node.output[0] - in0_shape = model.get_tensor_shape(in0) - - idt = model.get_tensor_datatype(in0) - - # skip conversion for layers with float input - if not idt.is_integer(): - continue - - # check layout and convert if necessary - in0_layout = model.get_tensor_layout(in0) - result_layout = model.get_tensor_layout(result) - - if in0_layout == DataLayout.NCHW: - in0 = nchw_to_nhwc(in0, model, node_ind) - node_ind += 1 - in0_shape = model.get_tensor_shape(in0) - - # keep track of where we need to insert the HLS Op - # it has to be ahead of the output transform - insert_point = node_ind - - if result_layout == DataLayout.NCHW: - result = nchw_to_nhwc(result, model, node_ind, reverse=True) - node_ind += 1 - - num_ch = int(in0_shape[-1]) - vecs = in0_shape[:-1] - # create node with no parallelization first - pe = 1 - - # create an additional tensor of the same shape and layout as result - out_shape = model.get_tensor_shape(result) - pool_out = helper.make_tensor_value_info( - model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape - ) - model.graph.value_info.append(pool_out) - pool_out = pool_out.name - model.set_tensor_layout(pool_out, model.get_tensor_layout(result)) - - new_pool = helper.make_node( - "GlobalAccPool_Batch", - [in0], - [pool_out], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - NumChannels=num_ch, - PE=pe, - inputDataType=idt.name, - numInputVectors=vecs, - name="GlobalAccPool_Batch_" + node.name, - ) 
- - mul_value = helper.make_tensor_value_info( - model.make_new_valueinfo_name(), TensorProto.FLOAT, [1] - ) - model.graph.value_info.append(mul_value) - model.set_initializer( - mul_value.name, np.array(1 / (vecs[1] * vecs[2]), dtype=np.float32) - ) - new_mul = helper.make_node( - "Mul", - [pool_out, mul_value.name], - [result], - ) - graph.node.insert(insert_point, new_pool) - graph.node.insert(insert_point + 1, new_mul) - node_ind += 1 - # remove old node - graph.node.remove(node) - graph_modified = True - - if graph_modified: - model = model.transform(InferShapes()) - model = model.transform(InferDataTypes()) - return (model, graph_modified) - - -class InferLookupLayer(Transformation): - """Convert Gather nodes with constant op0 into Lookup HLS layers.""" - - def apply(self, model): - graph = model.graph - node_ind = 0 - graph_modified = False - for node in graph.node: - node_ind += 1 - if node.op_type == "Gather": - emb_name = node.input[0] - embs = model.get_initializer(emb_name) - axis = get_by_name(node.attribute, "axis") - # skip conversion if input0 is not constant - if embs is None: + if node.op_type == "Gather": + emb_name = node.input[0] + embs = model.get_initializer(emb_name) + axis = get_by_name(node.attribute, "axis") + # skip conversion if input0 is not constant + if embs is None: continue # skip conversion if axis != 0 if axis is not None and axis.i != 0: @@ -1677,7 +1132,7 @@ def apply(self, model): class InferConcatLayer(Transformation): """Convert suitable Concat nodes (operating on last/-1 axis) - into StreamingConcat HLS layers.""" + into StreamingConcat HW layers.""" def apply(self, model): graph = model.graph @@ -1699,9 +1154,7 @@ def apply(self, model): dt0 = model.get_tensor_datatype(node.input[0]) if dt0 is None: continue - dt_coherent = all( - [model.get_tensor_datatype(x) == dt0 for x in node.input] - ) + dt_coherent = all([model.get_tensor_datatype(x) == dt0 for x in node.input]) if not dt_coherent: continue # skip conversion if any 
inputs are static @@ -1795,7 +1248,7 @@ def apply(self, model): node_ind += 1 in1_shape = model.get_tensor_shape(in1) - # keep track of where we need to insert the HLS Op + # keep track of where we need to insert the HW Op # it has to be ahead of the output transform insert_point = node_ind @@ -1829,7 +1282,412 @@ def apply(self, model): graph.node.remove(nd) graph_modified = True - # if graph_modified: - # model = model.transform(InferShapes()) - # model = model.transform(InferDataTypes()) + return (model, graph_modified) + + +class InferBinaryMatrixVectorActivation(Transformation): + """Convert XnorPopcountMatMul layers to + MatrixVectorActivation layers. Any immediately following MultiThreshold + layers will also be absorbed into the MVTU.""" + + def __init__(self): + super().__init__() + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if n.op_type == "XnorPopcountMatMul": + mm_input = n.input[0] + mm_weight = n.input[1] + mm_output = n.output[0] + mm_in_shape = model.get_tensor_shape(mm_input) + mm_out_shape = model.get_tensor_shape(mm_output) + assert model.get_tensor_datatype(mm_input) == DataType["BINARY"], ( + n.name + + """: First + input for xnorpopcount is not Wset to FINN DataType BINARY.""" + ) + assert model.get_tensor_datatype(mm_weight) == DataType["BINARY"], ( + n.name + + """: Second + input (weights) for xnorpopcount is not set to FINN DataType BINARY.""" + ) + idt = DataType["BINARY"] + wdt = DataType["BINARY"] + mm_output = n.output[0] + W = model.get_initializer(mm_weight) + # extract weight shape, note that ONNX and finn-hlslib + # make different assumptions about dim order here + # ONNX assumes W has (in, out) shape + # finn-hlslib assumes W has (out, in) shape + mh = int(W.shape[1]) + mw = int(W.shape[0]) + # create node with no parallelization first + pe = 1 + simd = 1 + wmem = mw * mh // (pe * simd) + assert mw * mh == wmem * pe * simd, ( + n.name + + """: 
Requirement (MW * MH) divisiable by + (WMEM * PE * SIMD) is violated.""" + ) + # see if we have any following thresholds + consumer = model.find_consumer(mm_output) + if consumer is not None and consumer.op_type == "MultiThreshold": + # TODO ensure integer thresholds? + # create MVTU (i.e. including activation) + mt_output = consumer.output[0] + mt_out_shape = model.get_tensor_shape(mt_output) + mt_thres = consumer.input[1] + T = model.get_initializer(mt_thres) + assert T.shape[0] == 1 or T.shape[0] == mh, ( + consumer.name + + """: First dimension of + thresholds neither 1 nor MH.""" + ) + odt = model.get_tensor_datatype(mt_output) + if odt.bitwidth() == 1: + # covers both bipolar and binary + actval = 0 + else: + actval = odt.min() + model.set_tensor_shape(mm_input, mm_in_shape) + model.set_tensor_shape(mt_output, mt_out_shape) + # create and insert new MatrixVectorActivation node + new_node = helper.make_node( + "MVAU", + [mm_input, mm_weight, mt_thres], + [mt_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + MW=mw, + MH=mh, + SIMD=simd, + PE=pe, + inputDataType=idt.name, + weightDataType=wdt.name, + outputDataType=odt.name, + ActVal=actval, + binaryXnorMode=1, + noActivation=0, + numInputVectors=list(mm_in_shape[:-1]), + name=n.name, + ) + graph.node.insert(node_ind, new_node) + # remove old nodes + graph.node.remove(n) + graph.node.remove(consumer) + graph_modified = True + else: + # no activation, matmul only + odt = model.get_tensor_datatype(mm_output) + model.set_tensor_shape(mm_input, mm_in_shape) + model.set_tensor_shape(mm_output, mm_out_shape) + # create and insert new MatrixVectorActivation node + new_node = helper.make_node( + "MVAU", + [mm_input, mm_weight], + [mm_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + MW=mw, + MH=mh, + SIMD=simd, + PE=pe, + inputDataType=idt.name, + weightDataType=wdt.name, + outputDataType=odt.name, + ActVal=0, + binaryXnorMode=1, + noActivation=1, + 
numInputVectors=list(mm_in_shape[:-1]), + name=n.name, + ) + graph.node.insert(node_ind, new_node) + # remove old node + graph.node.remove(n) + graph_modified = True + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) + + +class InferQuantizedMatrixVectorActivation(Transformation): + """Convert MatMul layers with quantized inputs and weights to + MatrixVectorActivation layers.""" + + def __init__(self): + super().__init__() + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if n.op_type == "MatMul" and model.get_tensor_sparsity(n.input[1]) is None: + mm_input = n.input[0] + mm_weight = n.input[1] + mm_output = n.output[0] + mm_in_shape = model.get_tensor_shape(mm_input) + mm_out_shape = model.get_tensor_shape(mm_output) + idt = model.get_tensor_datatype(mm_input) + wdt = model.get_tensor_datatype(mm_weight) + if idt.is_integer() and wdt.is_integer(): + mm_output = n.output[0] + W = model.get_initializer(mm_weight) + # extract weight shape, note that ONNX and finn-hlslib + # make different assumptions about dim order here + # ONNX assumes W has (in, out) shape + # finn-hlslib assumes W has (out, in) shape + mh = int(W.shape[1]) + mw = int(W.shape[0]) + # create node with no parallelization first + pe = 1 + simd = 1 + wmem = mw * mh // (pe * simd) + assert mw * mh == wmem * pe * simd, ( + n.name + + """: Requirement (MW * MH) divisible by + (WMEM * PE * SIMD) is violated.""" + ) + # see if we have any following thresholds + consumer = model.find_consumer(mm_output) + if consumer is not None and consumer.op_type == "MultiThreshold": + # TODO ensure integer thresholds? + # create MVTU (i.e. 
including activation) + mt_output = consumer.output[0] + mt_out_shape = model.get_tensor_shape(mt_output) + mt_thres = consumer.input[1] + T = model.get_initializer(mt_thres) + assert T.shape[0] == 1 or T.shape[0] == mh, ( + consumer.name + + """: First dimension of + thresholds neither 1 nor MH.""" + ) + odt = model.get_tensor_datatype(mt_output) + scale = getCustomOp(consumer).get_nodeattr("out_scale") + actval = getCustomOp(consumer).get_nodeattr("out_bias") + assert int(actval) == actval, ( + consumer.name + ": out_bias must be integer for HLS conversion." + ) + actval = int(actval) + odt_is_bipolar = odt == DataType["BIPOLAR"] + bipolar_ok = odt_is_bipolar and (scale == 2.0) and (actval == -1) + assert scale == 1.0 or bipolar_ok, ( + consumer.name + ": out_scale=1 or bipolar output needed for conversion." + ) + assert (not odt.signed()) or (actval < 0), ( + consumer.name + ": Signed output requres actval < 0" + ) + model.set_tensor_shape(mm_input, mm_in_shape) + model.set_tensor_shape(mt_output, mt_out_shape) + if bipolar_ok: + # remove bias for bipolar, since + # binary->bipolar is achieved by reinterpretation + actval = 0 + # create and insert new MatrixVectorActivation node + new_node = helper.make_node( + "MVAU", + [mm_input, mm_weight, mt_thres], + [mt_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + MW=mw, + MH=mh, + SIMD=simd, + PE=pe, + inputDataType=idt.name, + weightDataType=wdt.name, + outputDataType=odt.name, + ActVal=actval, + binaryXnorMode=0, + noActivation=0, + numInputVectors=list(mm_in_shape[:-1]), + name="MVAU_" + n.name, + ) + graph.node.insert(node_ind, new_node) + # remove old nodes + graph.node.remove(n) + graph.node.remove(consumer) + graph_modified = True + else: + # no activation, matmul only + odt = model.get_tensor_datatype(mm_output) + model.set_tensor_shape(mm_input, mm_in_shape) + model.set_tensor_shape(mm_output, mm_out_shape) + # create and insert new MatrixVectorActivation node + new_node = 
helper.make_node( + "MVAU", + [mm_input, mm_weight], + [mm_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + MW=mw, + MH=mh, + SIMD=simd, + PE=pe, + inputDataType=idt.name, + weightDataType=wdt.name, + outputDataType=odt.name, + ActVal=0, + binaryXnorMode=0, + noActivation=1, + numInputVectors=list(mm_in_shape[:-1]), + name="MVAU_" + n.name, + ) + graph.node.insert(node_ind, new_node) + # remove old node + graph.node.remove(n) + graph_modified = True + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + return (model, graph_modified) + + +class InferVectorVectorActivation(Transformation): + """Convert MatMul layers with quantized inputs and weights to + VectorVectorActivation layers, if the sparsity annotation + of the weight matrix indicates that the MatMul layer belongs to + a depthwise convolution. Any immediately following MultiThreshold + layers will also be absorbed into the VVAU.""" + + def __init__(self): + super().__init__() + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if n.op_type == "MatMul" and model.get_tensor_sparsity(n.input[1]) is not None: + sparsity = model.get_tensor_sparsity(n.input[1]) + try: + k_h, k_w = sparsity["dw"]["kernel_shape"] + except KeyError: + raise Exception( + n.name + + """: sparsity annotation doesn't indicate that MatMul + belongs to a depthwise convolution.""" + ) + + mm_input = n.input[0] + mm_weight = n.input[1] + mm_output = n.output[0] + mm_in_shape = model.get_tensor_shape(mm_input) + mm_out_shape = model.get_tensor_shape(mm_output) + idt = model.get_tensor_datatype(mm_input) + wdt = model.get_tensor_datatype(mm_weight) + if idt.is_integer() and wdt.is_integer(): + mm_output = n.output[0] + W = model.get_initializer(mm_weight) + # infer dense weight tensor from sparse weight matrix + # kernel size (k_h, k_w) which was extracted above and the value of + # 
the channels is used. + # the weight matrix has a shape of (k_h * k_w * Channels, Channels) + # we need to reverse the creation of the sparse weight matrix + # to achieve a weight tensor of shape (Channels, 1, k_h, k_w) + channels = int(W.shape[1]) + # transpose to achieve a shape of (k_h * k_w * Channels, Channels) + W = W.T + # reshape to (Channels, k_h, k_w, Channels) to transpose afterwards + # to (Channels, Channels, k_h, k_w) + W = W.reshape(channels, k_h, k_w, channels) + W = W.transpose(0, 3, 1, 2) + # now we can extract the values using a for loop over the channels + # and fill a zero numpy array in the correct shape + w_tensor = np.zeros((channels, 1, k_h, k_w), dtype=np.float32) + for ch in range(channels): + w_tensor[ch][0] = W[ch][ch] + model.set_initializer(mm_weight, w_tensor) + model.set_tensor_shape(mm_weight, (channels, 1, k_h, k_w)) + # create node with pe=channels as default + pe = channels + # see if we have any following thresholds + consumer = model.find_consumer(mm_output) + if consumer is not None and consumer.op_type == "MultiThreshold": + # create VVAU (i.e. including activation) + mt_output = consumer.output[0] + mt_out_shape = model.get_tensor_shape(mt_output) + mt_thres = consumer.input[1] + T = model.get_initializer(mt_thres) + assert T.shape[0] == 1 or T.shape[0] == channels, ( + consumer.name + + """: First dimension of + thresholds neither 1 nor Channels.""" + ) + odt = model.get_tensor_datatype(mt_output) + scale = getCustomOp(consumer).get_nodeattr("out_scale") + assert scale == 1.0, ( + consumer.name + ": out_scale must be equal to 1.0 for HLS conversion." + ) + actval = getCustomOp(consumer).get_nodeattr("out_bias") + assert int(actval) == actval, ( + consumer.name + ": out_bias must be integer for HLS conversion." 
+ ) + actval = int(actval) + assert (not odt.signed()) or (actval < 0), ( + consumer.name + ": Signed output requres actval < 0" + ) + model.set_tensor_shape(mm_input, mm_in_shape) + model.set_tensor_shape(mt_output, mt_out_shape) + # create and insert new VectorVectorActivation node + new_node = helper.make_node( + "VVAU", + [mm_input, mm_weight, mt_thres], + [mt_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + PE=pe, + Dim=[mm_in_shape[1], mm_in_shape[2]], + Channels=channels, + Kernel=[k_h, k_w], + inputDataType=idt.name, + weightDataType=wdt.name, + outputDataType=odt.name, + ActVal=actval, + noActivation=0, + name="VVAU_" + n.name, + ) + graph.node.insert(node_ind, new_node) + # remove old nodes + graph.node.remove(n) + graph.node.remove(consumer) + graph_modified = True + else: + # no activation, matmul only + odt = model.get_tensor_datatype(mm_output) + model.set_tensor_shape(mm_input, mm_in_shape) + model.set_tensor_shape(mm_output, mm_out_shape) + # create and insert new VVAU node + new_node = helper.make_node( + "VVAU", + [mm_input, mm_weight], + [mm_output], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + PE=pe, + Dim=[mm_in_shape[1], mm_in_shape[2]], + Channels=channels, + Kernel=[k_h, k_w], + inputDataType=idt.name, + weightDataType=wdt.name, + outputDataType=odt.name, + ActVal=0, + noActivation=1, + name="VVAU_" + n.name, + ) + graph.node.insert(node_ind, new_node) + # remove old node + graph.node.remove(n) + graph_modified = True + if graph_modified: + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) return (model, graph_modified) diff --git a/src/finn/transformation/fpgadataflow/create_dataflow_partition.py b/src/finn/transformation/fpgadataflow/create_dataflow_partition.py index 07d6961be3..f34c6b90af 100644 --- a/src/finn/transformation/fpgadataflow/create_dataflow_partition.py +++ b/src/finn/transformation/fpgadataflow/create_dataflow_partition.py @@ -52,7 +52,7 @@ 
def __init__(self, partition_model_dir=None): def apply(self, model): def filter_fc_extw(x): - if x.op_type == "IODMA": + if x.op_type == "IODMA_hls": burst_mode = get_by_name(x.attribute, "burstMode") if burst_mode is not None: burst_mode = burst_mode.s.decode("UTF-8") diff --git a/src/finn/transformation/fpgadataflow/create_stitched_ip.py b/src/finn/transformation/fpgadataflow/create_stitched_ip.py index 8e2c69bad4..4212e2b58a 100644 --- a/src/finn/transformation/fpgadataflow/create_stitched_ip.py +++ b/src/finn/transformation/fpgadataflow/create_stitched_ip.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -26,8 +27,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import pkg_resources as pk - import json import multiprocessing as mp import os @@ -42,7 +41,7 @@ ReplaceVerilogRelPaths, ) from finn.util.basic import make_build_dir -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node def is_external_input(model, node, i): @@ -50,12 +49,13 @@ def is_external_input(model, node, i): # True only if input is unconnected and has no initializer # Only esception is second input of FC layers when mem_mode is external node_inst = getCustomOp(node) + op_type = node.op_type producer = model.find_producer(node.input[i]) if producer is None: if model.get_initializer(node.input[i]) is None: return True else: - if node.op_type == "MatrixVectorActivation": + if op_type.startswith("MVAU"): if node_inst.get_nodeattr("mem_mode") == "external": return True return False @@ -86,9 +86,7 @@ class CreateStitchedIP(Transformation): The packaged block design IP can be found under the ip subdirectory. 
""" - def __init__( - self, fpgapart, clk_ns, ip_name="finn_design", vitis=False, signature=[] - ): + def __init__(self, fpgapart, clk_ns, ip_name="finn_design", vitis=False, signature=[]): super().__init__() self.fpgapart = fpgapart self.clk_ns = clk_ns @@ -121,17 +119,13 @@ def connect_clk_rst(self, node): # make clock and reset external, if they aren't already if not self.clock_reset_are_external: self.connect_cmds.append( - "make_bd_pins_external [get_bd_pins %s/%s]" - % (inst_name, clock_intf_name) + "make_bd_pins_external [get_bd_pins %s/%s]" % (inst_name, clock_intf_name) ) self.connect_cmds.append("set_property name ap_clk [get_bd_ports ap_clk_0]") self.connect_cmds.append( - "make_bd_pins_external [get_bd_pins %s/%s]" - % (inst_name, reset_intf_name) - ) - self.connect_cmds.append( - "set_property name ap_rst_n [get_bd_ports ap_rst_n_0]" + "make_bd_pins_external [get_bd_pins %s/%s]" % (inst_name, reset_intf_name) ) + self.connect_cmds.append("set_property name ap_rst_n [get_bd_ports ap_rst_n_0]") self.clock_reset_are_external = True self.intf_names["clk"] = ["ap_clk"] self.intf_names["rst"] = ["ap_rst_n"] @@ -172,13 +166,9 @@ def connect_axi(self, node): ) self.connect_cmds.append("assign_bd_address") seg_name = "%s/Data_m_axi_gmem/SEG_%s_Reg" % (inst_name, ext_if_name) - self.connect_cmds.append( - "set_property offset 0 [get_bd_addr_segs {%s}]" % (seg_name) - ) + self.connect_cmds.append("set_property offset 0 [get_bd_addr_segs {%s}]" % (seg_name)) # TODO should propagate this information from the node instead of 4G - self.connect_cmds.append( - "set_property range 4G [get_bd_addr_segs {%s}]" % (seg_name) - ) + self.connect_cmds.append("set_property range 4G [get_bd_addr_segs {%s}]" % (seg_name)) self.intf_names["aximm"] = [(ext_if_name, aximm_intf_name[0][1])] self.has_aximm = True @@ -215,8 +205,7 @@ def connect_s_axis_external(self, node, idx=None): continue input_intf_name = input_intf_names[i][0] self.connect_cmds.append( - 
"make_bd_intf_pins_external [get_bd_intf_pins %s/%s]" - % (inst_name, input_intf_name) + "make_bd_intf_pins_external [get_bd_intf_pins %s/%s]" % (inst_name, input_intf_name) ) self.connect_cmds.append( "set_property name s_axis_%d [get_bd_intf_ports %s_0]" @@ -236,12 +225,10 @@ def connect_ap_none_external(self, node): for i in range(len(input_intf_names)): input_intf_name = input_intf_names[i] self.connect_cmds.append( - "make_bd_pins_external [get_bd_pins %s/%s]" - % (inst_name, input_intf_name) + "make_bd_pins_external [get_bd_pins %s/%s]" % (inst_name, input_intf_name) ) self.connect_cmds.append( - "set_property name %s [get_bd_ports %s_0]" - % (input_intf_name, input_intf_name) + "set_property name %s [get_bd_ports %s_0]" % (input_intf_name, input_intf_name) ) def insert_signature(self, checksum_count): @@ -267,12 +254,10 @@ def insert_signature(self, checksum_count): ) # set clk and reset self.connect_cmds.append( - "connect_bd_net [get_bd_ports ap_clk] [get_bd_pins %s/ap_clk]" - % signature_name + "connect_bd_net [get_bd_ports ap_clk] [get_bd_pins %s/ap_clk]" % signature_name ) self.connect_cmds.append( - "connect_bd_net [get_bd_ports ap_rst_n] [get_bd_pins %s/ap_rst_n]" - % signature_name + "connect_bd_net [get_bd_ports ap_rst_n] [get_bd_pins %s/ap_rst_n]" % signature_name ) fclk_mhz = 1 / (self.clk_ns * 0.001) fclk_hz = fclk_mhz * 1000000 @@ -290,9 +275,7 @@ def insert_signature(self, checksum_count): self.connect_cmds.append( "make_bd_intf_pins_external [get_bd_intf_pins %s/s_axi]" % signature_name ) - self.connect_cmds.append( - "set_property name s_axilite_info [get_bd_intf_ports s_axi_0]" - ) + self.connect_cmds.append("set_property name s_axilite_info [get_bd_intf_ports s_axi_0]") self.connect_cmds.append("assign_bd_address") def apply(self, model): @@ -303,14 +286,14 @@ def apply(self, model): ip_dirs.append("$::env(FINN_ROOT)/finn-rtllib/memstream") if self.signature: ip_dirs.append("$::env(FINN_ROOT)/finn-rtllib/axi_info") - if 
model.graph.node[0].op_type not in ["StreamingFIFO", "IODMA"]: + if model.graph.node[0].op_type not in ["StreamingFIFO_rtl", "IODMA_hls"]: warnings.warn( """First node is not StreamingFIFO or IODMA. You may experience incorrect stitched-IP rtlsim or hardware behavior. It is strongly recommended to insert FIFOs prior to calling CreateStitchedIP.""" ) - if model.graph.node[0].op_type == "StreamingFIFO": + if model.graph.node[0].op_type == "StreamingFIFO_rtl": firstfifo = getCustomOp(model.graph.node[0]) if firstfifo.get_nodeattr("impl_style") == "vivado": warnings.warn( @@ -320,7 +303,7 @@ def apply(self, model): ) for node in model.graph.node: # ensure that all nodes are fpgadataflow, and that IPs are generated - assert is_fpgadataflow_node( + assert is_hls_node(node) or is_rtl_node( node ), "All nodes must be FINN fpgadataflow nodes." node_inst = getCustomOp(node) @@ -337,12 +320,10 @@ def apply(self, model): if producer is None: continue j = list(producer.output).index(node.input[i]) - src_intf_name = getCustomOp( - producer - ).get_verilog_top_module_intf_names()["m_axis"][j][0] - dst_intf_name = node_inst.get_verilog_top_module_intf_names()[ - "s_axis" - ][i][0] + src_intf_name = getCustomOp(producer).get_verilog_top_module_intf_names()[ + "m_axis" + ][j][0] + dst_intf_name = node_inst.get_verilog_top_module_intf_names()["s_axis"][i][0] self.connect_cmds.append( "connect_bd_intf_net [get_bd_intf_pins %s/%s] " "[get_bd_intf_pins %s/%s]" @@ -371,7 +352,7 @@ def apply(self, model): if self.signature: # extract number of checksum layer from graph - checksum_layers = model.get_nodes_by_op_type("checksum") + checksum_layers = model.get_nodes_by_op_type("CheckSum_hls") self.insert_signature(len(checksum_layers)) # create a temporary folder for the project @@ -382,9 +363,10 @@ def apply(self, model): tcl = [] # create vivado project tcl.append( - "create_project %s %s -part %s" - % (prjname, vivado_stitch_proj_dir, self.fpgapart) + "create_project %s %s -part %s" % 
(prjname, vivado_stitch_proj_dir, self.fpgapart) ) + # no warnings on long module names + tcl.append("set_msg_config -id {[BD 41-1753]} -suppress") # add all the generated IP dirs to ip_repo_paths ip_dirs_str = " ".join(ip_dirs) tcl.append("set_property ip_repo_paths [%s] [current_project]" % ip_dirs_str) @@ -397,8 +379,7 @@ def apply(self, model): fclk_mhz = 1 / (self.clk_ns * 0.001) fclk_hz = fclk_mhz * 1000000 model.set_metadata_prop("clk_ns", str(self.clk_ns)) - tcl.append("set_property CONFIG.FREQ_HZ %f [get_bd_ports /ap_clk]" % fclk_hz) - tcl.append("regenerate_bd_layout") + tcl.append("set_property CONFIG.FREQ_HZ %d [get_bd_ports /ap_clk]" % round(fclk_hz)) tcl.append("validate_bd_design") tcl.append("save_bd_design") # create wrapper hdl (for rtlsim later on) @@ -412,12 +393,11 @@ def apply(self, model): wrapper_filename = "%s/hdl/%s_wrapper.v" % (bd_base, block_name) tcl.append("add_files -norecurse %s" % wrapper_filename) model.set_metadata_prop("wrapper_filename", wrapper_filename) - tcl.append("set_property top finn_design_wrapper [current_fileset]") + tcl.append("set_property top %s_wrapper [current_fileset]" % block_name) # synthesize to DCP and export stub, DCP and constraints if self.vitis: tcl.append( - "set_property SYNTH_CHECKPOINT_MODE Hierarchical [ get_files %s ]" - % bd_filename + "set_property SYNTH_CHECKPOINT_MODE Hierarchical [ get_files %s ]" % bd_filename ) tcl.append( "set_property -name {STEPS.SYNTH_DESIGN.ARGS.MORE OPTIONS} " @@ -450,6 +430,8 @@ def apply(self, model): ) % (vivado_stitch_proj_dir, block_vendor, block_library, block_name) ) + # Allow user to customize clock in deployment of stitched IP + tcl.append("set_property ipi_drc {ignore_freq_hz true} [ipx::current_core]") # in some cases, the IP packager seems to infer an aperture of 64K or 4G, # preventing address assignment of the DDR_LOW and/or DDR_HIGH segments # the following is a hotfix to remove this aperture during IODMA packaging @@ -467,16 +449,9 @@ def apply(self, 
model): # if targeting Vitis, add some properties to the IP if self.vitis: # replace source code with dcp - tcl.append( - "set_property sdx_kernel true [ipx::find_open_core %s]" % block_vlnv - ) - tcl.append( - "set_property sdx_kernel_type rtl [ipx::find_open_core %s]" % block_vlnv - ) - tcl.append( - "set_property supported_families { } [ipx::find_open_core %s]" - % block_vlnv - ) + tcl.append("set_property sdx_kernel true [ipx::find_open_core %s]" % block_vlnv) + tcl.append("set_property sdx_kernel_type rtl [ipx::find_open_core %s]" % block_vlnv) + tcl.append("set_property supported_families { } [ipx::find_open_core %s]" % block_vlnv) tcl.append( "set_property xpm_libraries {XPM_CDC XPM_MEMORY XPM_FIFO} " "[ipx::find_open_core %s]" % block_vlnv @@ -491,32 +466,20 @@ def apply(self, model): "ipx::remove_all_file " "[ipx::get_file_groups xilinx_anylanguagebehavioralsimulation]" ) - tcl.append( - "ipx::remove_all_file " - "[ipx::get_file_groups xilinx_anylanguagesynthesis]" - ) + tcl.append("ipx::remove_all_file " "[ipx::get_file_groups xilinx_anylanguagesynthesis]") tcl.append( "ipx::remove_file_group " "xilinx_anylanguagebehavioralsimulation [ipx::current_core]" ) - tcl.append( - "ipx::remove_file_group " - "xilinx_anylanguagesynthesis [ipx::current_core]" - ) + tcl.append("ipx::remove_file_group " "xilinx_anylanguagesynthesis [ipx::current_core]") # remove sim and src folders tcl.append("file delete -force %s/ip/sim" % vivado_stitch_proj_dir) tcl.append("file delete -force %s/ip/src" % vivado_stitch_proj_dir) # copy and add DCP, stub, and xdc tcl.append("file mkdir %s/ip/dcp" % vivado_stitch_proj_dir) tcl.append("file mkdir %s/ip/impl" % vivado_stitch_proj_dir) - tcl.append( - "file copy -force %s.dcp %s/ip/dcp" - % (block_name, vivado_stitch_proj_dir) - ) - tcl.append( - "file copy -force %s.xdc %s/ip/impl" - % (block_name, vivado_stitch_proj_dir) - ) + tcl.append("file copy -force %s.dcp %s/ip/dcp" % (block_name, vivado_stitch_proj_dir)) + tcl.append("file 
copy -force %s.xdc %s/ip/impl" % (block_name, vivado_stitch_proj_dir)) tcl.append("ipx::add_file_group xilinx_implementation [ipx::current_core]") tcl.append( "ipx::add_file impl/%s.xdc [ipx::get_file_groups xilinx_implementation]" @@ -527,37 +490,97 @@ def apply(self, model): "[ipx::get_files impl/%s.xdc " "-of_objects [ipx::get_file_groups xilinx_implementation]]" % block_name ) - tcl.append( - "ipx::add_file_group " "xilinx_synthesischeckpoint [ipx::current_core]" - ) + tcl.append("ipx::add_file_group " "xilinx_synthesischeckpoint [ipx::current_core]") tcl.append( "ipx::add_file dcp/%s.dcp " "[ipx::get_file_groups xilinx_synthesischeckpoint]" % block_name ) - tcl.append( - "ipx::add_file_group xilinx_simulationcheckpoint [ipx::current_core]" - ) + tcl.append("ipx::add_file_group xilinx_simulationcheckpoint [ipx::current_core]") tcl.append( "ipx::add_file dcp/%s.dcp " "[ipx::get_file_groups xilinx_simulationcheckpoint]" % block_name ) # add a rudimentary driver mdd to get correct ranges in xparameters.h later on - example_data_dir = pk.resource_filename("finn.qnn-data", "mdd-data/") + example_data_dir = os.environ["FINN_ROOT"] + "/src/finn/qnn-data/mdd-data" copytree(example_data_dir, vivado_stitch_proj_dir + "/data") - tcl.append("file copy -force data ip/") - tcl.append("ipx::add_file_group -type software_driver {} [ipx::current_core]") - tcl.append( - "set_property type mdd [ipx::add_file data/finn_design.mdd " - "[ipx::get_file_groups xilinx_softwaredriver -of_objects " - "[ipx::current_core]]]" - ) + + ##### + # Core Cleanup Operations tcl.append( - "set_property type tclSource [ipx::add_file data/finn_design.tcl " - "[ipx::get_file_groups xilinx_softwaredriver -of_objects " - "[ipx::current_core]]]" + """ +set core [ipx::current_core] + +# Add rudimentary driver +file copy -force data ip/ +set file_group [ipx::add_file_group -type software_driver {} $core] +set_property type mdd [ipx::add_file data/finn_design.mdd $file_group] +set_property type tclSource 
[ipx::add_file data/finn_design.tcl $file_group] + +# Remove all XCI references to subcores +set impl_files [ipx::get_file_groups xilinx_implementation -of $core] +foreach xci [ipx::get_files -of $impl_files {*.xci}] { + ipx::remove_file [get_property NAME $xci] $impl_files +} + +# Construct a single flat memory map for each AXI-lite interface port +foreach port [get_bd_intf_ports -filter {CONFIG.PROTOCOL==AXI4LITE}] { + set pin $port + set awidth "" + while { $awidth == "" } { + set pins [get_bd_intf_pins -of [get_bd_intf_nets -boundary_type lower -of $pin]] + set kill [lsearch $pins $pin] + if { $kill >= 0 } { set pins [lreplace $pins $kill $kill] } + if { [llength $pins] != 1 } { break } + set pin [lindex $pins 0] + set awidth [get_property CONFIG.ADDR_WIDTH $pin] + } + if { $awidth == "" } { + puts "CRITICAL WARNING: Unable to construct address map for $port." + } { + set range [expr 2**$awidth] + set range [expr $range < 4096 ? 4096 : $range] + puts "INFO: Building address map for $port: 0+:$range" + set name [get_property NAME $port] + set addr_block [ipx::add_address_block Reg0 [ipx::add_memory_map $name $core]] + set_property range $range $addr_block + set_property slave_memory_map_ref $name [ipx::get_bus_interfaces $name -of $core] + } +} + +# Finalize and Save +ipx::update_checksums $core +ipx::save_core $core + +# Remove stale subcore references from component.xml +file rename -force ip/component.xml ip/component.bak +set ifile [open ip/component.bak r] +set ofile [open ip/component.xml w] +set buf [list] +set kill 0 +while { [eof $ifile] != 1 } { + gets $ifile line + if { [string match {**} $line] == 1 } { + foreach l $buf { puts $ofile $l } + set buf [list $line] + } elseif { [llength $buf] > 0 } { + lappend buf $line + + if { [string match {**} $line] == 1 } { + if { $kill == 0 } { foreach l $buf { puts $ofile $l } } + set buf [list] + set kill 0 + } elseif { [string match {**} $line] == 1 } { + set kill 1 + } + } else { + puts $ofile $line + } +} 
+close $ifile +close $ofile +""" ) - tcl.append("ipx::update_checksums [ipx::find_open_core %s]" % block_vlnv) - tcl.append("ipx::save_core [ipx::find_open_core %s]" % block_vlnv) + # export list of used Verilog files (for rtlsim later on) tcl.append( "set all_v_files [get_files -filter {USED_IN_SYNTHESIS == 1 " diff --git a/src/finn/transformation/fpgadataflow/derive_characteristic.py b/src/finn/transformation/fpgadataflow/derive_characteristic.py index 67eb96995e..4d3ac7dc67 100644 --- a/src/finn/transformation/fpgadataflow/derive_characteristic.py +++ b/src/finn/transformation/fpgadataflow/derive_characteristic.py @@ -1,4 +1,5 @@ -# Copyright (c) 2022, Xilinx +# Copyright (C) 2022, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -32,7 +33,7 @@ from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.base import NodeLocalTransformation -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node class DeriveCharacteristic(NodeLocalTransformation): @@ -58,16 +59,14 @@ def __init__(self, period, num_workers=None, manual_bypass=False): def applyNodeLocal(self, node): op_type = node.op_type - if is_fpgadataflow_node(node) is True: + if is_hls_node(node) or is_rtl_node(node): try: # lookup op_type in registry of CustomOps inst = registry.getCustomOp(node) inst.derive_characteristic_fxns(period=self.period) except KeyError: # exception if op_type is not supported - raise Exception( - "Custom op_type %s is currently not supported." % op_type - ) + raise Exception("Custom op_type %s is currently not supported." 
% op_type) return (node, False) def apply(self, model: ModelWrapper): @@ -76,7 +75,7 @@ def apply(self, model: ModelWrapper): return (model, run_again) # apply manual fix for DuplicateStreams and AddStreams for # simple residual reconvergent paths with bypass - addstrm_nodes = model.get_nodes_by_op_type("AddStreams_Batch") + addstrm_nodes = model.get_nodes_by_op_type("AddStreams_hls") for addstrm_node in addstrm_nodes: # we currently only support the case where one branch is # a bypass @@ -85,8 +84,8 @@ def apply(self, model: ModelWrapper): if (b0 is None) or (b1 is None): warnings.warn("Found unsupported AddStreams, skipping") return (model, run_again) - b0_is_bypass = b0.op_type == "DuplicateStreams_Batch" - b1_is_bypass = b1.op_type == "DuplicateStreams_Batch" + b0_is_bypass = b0.op_type == "DuplicateStreams_hls" + b1_is_bypass = b1.op_type == "DuplicateStreams_hls" if (not b0_is_bypass) and (not b1_is_bypass): warnings.warn("Found unsupported AddStreams, skipping") return (model, run_again) @@ -103,24 +102,16 @@ def apply(self, model: ModelWrapper): # for DuplicateStreams, use comp_branch_first's input characterization # for AddStreams, use comp_branch_last's output characterization period = comp_branch_first.get_nodeattr("io_chrc_period") - comp_branch_first_f = comp_branch_first.get_nodeattr("io_characteristic")[ - : 2 * period - ] - comp_branch_last_f = comp_branch_last.get_nodeattr("io_characteristic")[ - 2 * period : - ] + comp_branch_first_f = comp_branch_first.get_nodeattr("io_characteristic")[: 2 * period] + comp_branch_last_f = comp_branch_last.get_nodeattr("io_characteristic")[2 * period :] ds_node_inst = registry.getCustomOp(ds_node) addstrm_node_inst = registry.getCustomOp(addstrm_node) ds_node_inst.set_nodeattr("io_chrc_period", period) ds_node_inst.set_nodeattr("io_characteristic", comp_branch_first_f * 2) addstrm_node_inst.set_nodeattr("io_chrc_period", period) addstrm_node_inst.set_nodeattr("io_characteristic", comp_branch_last_f * 2) - 
warnings.warn( - f"Set {ds_node.name} chrc. from {comp_branch_first.onnx_node.name}" - ) - warnings.warn( - f"Set {addstrm_node.name} chrc. from {comp_branch_last.onnx_node.name}" - ) + warnings.warn(f"Set {ds_node.name} chrc. from {comp_branch_first.onnx_node.name}") + warnings.warn(f"Set {addstrm_node.name} chrc. from {comp_branch_last.onnx_node.name}") return (model, run_again) @@ -140,16 +131,14 @@ def __init__(self, num_workers=None, io_fifo_depth=32): def applyNodeLocal(self, node): op_type = node.op_type - if is_fpgadataflow_node(node) is True: + if is_hls_node(node) or is_rtl_node(node): try: # lookup op_type in registry of CustomOps prod = registry.getCustomOp(node) - assert op_type != "StreamingFIFO", "Found existing FIFOs" + assert not (op_type.startswith("StreamingFIFO")), "Found existing FIFOs" period = prod.get_nodeattr("io_chrc_period") prod_chrc = prod.get_nodeattr("io_chrc_out")[0] - assert ( - len(prod_chrc) == 2 * period - ), "Found unexpected characterization attribute" + assert len(prod_chrc) == 2 * period, "Found unexpected characterization attribute" if any([x > 2 for x in prod.get_nodeattr("outFIFODepths")]): # FIFO depth already set, can skip this node return (node, False) @@ -186,14 +175,12 @@ def applyNodeLocal(self, node): # finally, check node inputs to ensure FIFOs are added to # any top-level inputs (at least self.io_fifo_depth deep) in_fifo_depths = prod.get_nodeattr("inFIFODepths") - for (i, input_name) in enumerate(node.input): + for i, input_name in enumerate(node.input): if input_name in [x.name for x in model.graph.input]: in_fifo_depths[i] = max(self.io_fifo_depth, in_fifo_depths[i]) prod.set_nodeattr("inFIFODepths", in_fifo_depths) except KeyError: # exception if op_type is not supported - raise Exception( - "Custom op_type %s is currently not supported." % op_type - ) + raise Exception("Custom op_type %s is currently not supported." 
% op_type) return (node, False) diff --git a/src/finn/transformation/fpgadataflow/externalize_params.py b/src/finn/transformation/fpgadataflow/externalize_params.py index 732b82c675..5e21d8cb2a 100644 --- a/src/finn/transformation/fpgadataflow/externalize_params.py +++ b/src/finn/transformation/fpgadataflow/externalize_params.py @@ -42,7 +42,7 @@ def apply(self, model): graph_modified = False def filter_fc_extw(x): - if x.op_type == "IODMA": + if x.op_type == "IODMA_hls": burst_mode = get_by_name(x.attribute, "burstMode") if burst_mode is not None: burst_mode = burst_mode.s.decode("UTF-8") @@ -64,11 +64,7 @@ def filter_fc_extw(x): assert iodma_init is not None # remove output-side initializer to get correct dataflow partitioning model.graph.initializer.remove( - [ - x - for x in model.graph.initializer - if x.name == extw_tensor_name_out - ][0] + [x for x in model.graph.initializer if x.name == extw_tensor_name_out][0] ) graph_modified = True diff --git a/src/finn/transformation/fpgadataflow/floorplan.py b/src/finn/transformation/fpgadataflow/floorplan.py index 549b94d9f2..b24145afcb 100644 --- a/src/finn/transformation/fpgadataflow/floorplan.py +++ b/src/finn/transformation/fpgadataflow/floorplan.py @@ -56,7 +56,6 @@ def __init__(self, floorplan=None): self.user_floorplan = floorplan def apply(self, model): - # read in a user-specified floorplan or generate a default one if self.user_floorplan is None: self.user_floorplan = model.analysis(floorplan_params) @@ -82,7 +81,7 @@ def apply(self, model): if node_slr == -1: unassigned_nodes += 1 node_inst.set_nodeattr("slr", default_slr) - if node.op_type == "StreamingDataWidthConverter_Batch": + if node.op_type.startswith("StreamingDataWidthConverter"): # if we have SLR assignment already. 
use that if node_slr != -1: continue @@ -96,7 +95,7 @@ def apply(self, model): narrow_neighbour = model.find_producer(node.input[0]) node_slr = getCustomOp(narrow_neighbour).get_nodeattr("slr") node_inst.set_nodeattr("slr", node_slr) - if node.op_type == "StreamingFIFO": + if node.op_type.startswith("StreamingFIFO"): # if we have SLR assignment already. use that if node_slr != -1: continue @@ -120,18 +119,16 @@ def apply(self, model): df_nodes = list( filter(lambda x: get_by_name(x.attribute, "backend") is not None, all_nodes) ) - dma_nodes = list(filter(lambda x: x.op_type == "IODMA", df_nodes)) + dma_nodes = list(filter(lambda x: x.op_type == "IODMA_hls", df_nodes)) non_dma_nodes = list(filter(lambda x: x not in dma_nodes, df_nodes)) dyn_tlastmarker_nodes = list( filter( - lambda x: x.op_type == "TLastMarker" + lambda x: x.op_type == "TLastMarker_hls" and getCustomOp(x).get_nodeattr("DynIters") == "true", non_dma_nodes, ) ) - non_dma_nodes = list( - filter(lambda x: x not in dyn_tlastmarker_nodes, non_dma_nodes) - ) + non_dma_nodes = list(filter(lambda x: x not in dyn_tlastmarker_nodes, non_dma_nodes)) for node in dma_nodes: node_inst = getCustomOp(node) @@ -153,7 +150,7 @@ def apply(self, model): continue elif not ( - node.op_type == "MatrixVectorActivation" + node.op_type.startswith("MVAU") and node_inst.get_nodeattr("mem_mode") is not None and node_inst.get_nodeattr("mem_mode") == "external" ): @@ -166,21 +163,18 @@ def apply(self, model): pre_inst = getCustomOp(pre_node) pre_slr = pre_inst.get_nodeattr("slr") if node_slr == pre_slr: - axilite_intf_name = pre_inst.get_verilog_top_module_intf_names()[ - "axilite" - ] + axilite_intf_name = pre_inst.get_verilog_top_module_intf_names()["axilite"] if len(axilite_intf_name) != 0: node_inst.set_nodeattr("partition_id", partition_cnt) partition_cnt += 1 else: partition_id = pre_inst.get_nodeattr("partition_id") node_inst.set_nodeattr("partition_id", partition_id) - break - else: - # no matching, new partition - 
node_inst.set_nodeattr("partition_id", partition_cnt) - partition_cnt += 1 + else: + # no matching, new partition + node_inst.set_nodeattr("partition_id", partition_cnt) + partition_cnt += 1 # save the updated floorplan floorplan = model.analysis(floorplan_params) diff --git a/src/finn/transformation/fpgadataflow/hlssynth_ip.py b/src/finn/transformation/fpgadataflow/hlssynth_ip.py index c091dbd5ed..5b901d9284 100644 --- a/src/finn/transformation/fpgadataflow/hlssynth_ip.py +++ b/src/finn/transformation/fpgadataflow/hlssynth_ip.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -31,18 +32,18 @@ import warnings from qonnx.transformation.base import NodeLocalTransformation -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node class HLSSynthIP(NodeLocalTransformation): - """For each node: generate IP block from code in folder + """For each HLS node: generate IP block from code in folder that is referenced in node attribute "code_gen_dir_ipgen" and save path of generated project in node attribute "ipgen_path". All nodes in the graph must have the fpgadataflow backend attribute. Any nodes that already have a ipgen_path attribute pointing to a valid path will be skipped. - This transformation calls Vivado HLS for synthesis, so it will run for + This transformation calls Vitis HLS for synthesis, so it will run for some time (minutes to hours depending on configuration). 
* num_workers (int or None) number of parallel workers, see documentation in @@ -54,7 +55,7 @@ def __init__(self, num_workers=None): def applyNodeLocal(self, node): op_type = node.op_type - if is_fpgadataflow_node(node) is True: + if is_hls_node(node): try: # lookup op_type in registry of CustomOps inst = registry.getCustomOp(node) @@ -64,11 +65,9 @@ def applyNodeLocal(self, node): ), """Node attribute "code_gen_dir_ipgen" is empty. Please run transformation PrepareIP first.""" - if not os.path.isdir( - inst.get_nodeattr("ipgen_path") - ) or not inst.get_nodeattr("code_gen_dir_ipgen") in inst.get_nodeattr( - "ipgen_path" - ): + if not os.path.isdir(inst.get_nodeattr("ipgen_path")) or not inst.get_nodeattr( + "code_gen_dir_ipgen" + ) in inst.get_nodeattr("ipgen_path"): # call the compilation function for this node inst.ipgen_singlenode_code() else: @@ -81,7 +80,5 @@ def applyNodeLocal(self, node): is empty.""" except KeyError: # exception if op_type is not supported - raise Exception( - "Custom op_type %s is currently not supported." % op_type - ) + raise Exception("Custom op_type %s is currently not supported." 
% op_type) return (node, False) diff --git a/src/finn/transformation/fpgadataflow/infer_pixel_padding_deconv.py b/src/finn/transformation/fpgadataflow/infer_pixel_padding_deconv.py new file mode 100644 index 0000000000..8dbf7071fc --- /dev/null +++ b/src/finn/transformation/fpgadataflow/infer_pixel_padding_deconv.py @@ -0,0 +1,205 @@ +import numpy as np +import warnings +from onnx import TensorProto, helper +from qonnx.transformation.base import Transformation +from qonnx.transformation.lower_convs_to_matmul import _auto_pad_to_explicit_padding +from qonnx.util.basic import get_by_name + + +class InferPixelPaddingDeconv(Transformation): + """ + Lowering and conversion of ConvTranspose (NCHW) nodes to + FMPadding_Pixel + Im2Col + MatMul (NHWC) surrounded by Transpose nodes + note: this transformation produces a mix of hw layers and non hw layers + to implement this on an FPGA the Im2Col and MatMul nodes need to be converted to hw layers + after applying this transformation and the resulting transpose nodes need to be streamlined. + See deconv test case under tests/fpgadataflow for an example. + """ + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if n.op_type == "ConvTranspose": + # conversion currently only supported for group=1 + group = get_by_name(n.attribute, "group").i + if group != 1: + warnings.warn( + "%s : Only group=1 is currently supported. Can't infer PixelPaddingDeconv." 
+ % n.name + ) + continue + deconv_input = n.input[0] + deconv_output = n.output[0] + idt = model.get_tensor_datatype(deconv_input) + odt = model.get_tensor_datatype(deconv_output) + k_h = get_by_name(n.attribute, "kernel_shape").ints[0] + k_w = get_by_name(n.attribute, "kernel_shape").ints[1] + stride_h = get_by_name(n.attribute, "strides").ints[0] + stride_w = get_by_name(n.attribute, "strides").ints[1] + weight_name = n.input[1] + W_conv = model.get_initializer(weight_name) + ifm_ch = model.get_tensor_shape(n.input[0])[1] # assume NCHW + ofm_ch = model.get_tensor_shape(n.output[0])[1] # assume NCHW + ifm_dim_h = model.get_tensor_shape(n.input[0])[2] # assume NCHW + ifm_dim_w = model.get_tensor_shape(n.input[0])[3] + ofm_dim_h = model.get_tensor_shape(n.output[0])[2] # assume NCHW + ofm_dim_w = model.get_tensor_shape(n.output[0])[3] + dilation_attr = get_by_name(n.attribute, "dilations") + if dilation_attr is not None: + dilation = dilation_attr.ints + else: + dilation = [1, 1] # default value + # handle both auto_pad and explicit padding + auto_pad = get_by_name(n.attribute, "auto_pad") + if auto_pad is not None: + # find equivalent specified padding + auto_pad = auto_pad.s.decode("utf-8") + if auto_pad == "NOTSET": + # use specified padding + pad = get_by_name(n.attribute, "pads").ints + else: + pad = _auto_pad_to_explicit_padding( + auto_pad, + ifm_dim_h, + ifm_dim_w, + k_h, + k_w, + stride_h, + stride_w, + len(model.get_tensor_shape(n.input[0])) - 2, + ) + else: + # use specified padding + pad = get_by_name(n.attribute, "pads").ints + + # If len(pad) == 2, assume no padding for other dimension + if len(pad) == 2: # only one dimension should be padded + assert ( + ifm_dim_h == 1 or ifm_dim_w == 1 + ), "Padding is assumed to be 1D, image is 2D" + # reuse ConvTranspose weights for new matmul weights + # conv weights are [IFM][OFM][k][k] + # We need to rotate the weights and make them [OFM][IFM][k][k] + # for pixel padding deconv to remain mathematically 
equivalent + # and then convert to [OFM][k][k][IFM] (to remain compatible + # with finn-hlslib and how it does im2col/sliding window) + W_conv = np.rot90(W_conv, 2, [2, 3]) + W_conv = np.moveaxis(W_conv, 0, 1) + W_matmul = W_conv.transpose(0, 2, 3, 1) # W_conv = [OFM, IFM, k_H, k_W] + # reshape into [OFM][k*k*IFM] matrix + W_matmul = W_matmul.reshape(ofm_ch, ifm_ch * k_h * k_w) + # transpose to get ONNX-compatible [k*k*IFM][OFM] matrix + W_matmul = W_matmul.T + model.set_initializer(weight_name, W_matmul) + + # Compute intermediate parameters + padded_odim_h = ifm_dim_h + (ifm_dim_h - 1) * (stride_h - 1) + padded_odim_w = ifm_dim_w + (ifm_dim_w - 1) * (stride_w - 1) + conv_padding = [dilation[0] * (k_h - 1) - pad[0]] * 4 + + # create new intermediate values + inp_trans_out = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + (1, ifm_dim_h, ifm_dim_w, ifm_ch), # NHWC + ) + padding_pixel_out = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + (1, padded_odim_h, padded_odim_w, ifm_ch), # NHWC + ) + graph.value_info.append(inp_trans_out) + graph.value_info.append(padding_pixel_out) + inp_trans_out = inp_trans_out.name + padding_pixel_out = padding_pixel_out.name + model.set_tensor_datatype(inp_trans_out, idt) + model.set_tensor_datatype(padding_pixel_out, idt) + + need_im2col = True + if all(p == 0 for p in conv_padding): + padding = 0 + + # k_h=k_w==1: pointwise convolution, thus no im2col needed + if k_h == 1 and k_w == 1 and padding == 0 and stride_h == 1 and stride_w == 1: + need_im2col = False + + if need_im2col: + im2col_out = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + (1, ofm_dim_h, ofm_dim_w, ifm_ch * k_h * k_w), + ) + graph.value_info.append(im2col_out) + im2col_out = im2col_out.name + model.set_tensor_datatype(im2col_out, idt) + + matmul_out = helper.make_tensor_value_info( + model.make_new_valueinfo_name(), + TensorProto.FLOAT, + (1, 
ofm_dim_h, ofm_dim_w, ofm_ch), + ) + graph.value_info.append(matmul_out) + matmul_out = matmul_out.name + model.set_tensor_datatype(matmul_out, odt) + + # create new nodes + + # NCHW -> NHWC + inp_trans_node = helper.make_node( + "Transpose", [deconv_input], [inp_trans_out], perm=[0, 2, 3, 1] + ) + # Pixel Padding + fmpadding_pixel_node = helper.make_node( + "FMPadding_Pixel", + [inp_trans_out], + [padding_pixel_out], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + ImgDim=(ifm_dim_h, ifm_dim_w), + Stride=[stride_h, stride_w], + NumChannels=ifm_ch, + inputDataType=str(idt.name), + numInputVectors=1, + SIMD=1, + ) + # lower input tensor + matmul_input = padding_pixel_out + if need_im2col: + matmul_input = im2col_out + im2col_node = helper.make_node( + "Im2Col", + [padding_pixel_out], + [im2col_out], + domain="qonnx.custom_op.general", + stride=[1, 1], + kernel_size=[k_h, k_w], + pad_amount=conv_padding, + input_shape="(1,{},{},{})".format(padded_odim_h, padded_odim_w, ifm_ch), + depthwise=False, + dilations=dilation, + ) + + # do matmul + matmul_node = helper.make_node("MatMul", [matmul_input, weight_name], [matmul_out]) + # NHWC -> NCHW + out_trans_node = helper.make_node( + "Transpose", [matmul_out], [deconv_output], perm=[0, 3, 1, 2] + ) + # insert nodes where the conv is to preserve topological ordering + graph.node.insert(node_ind, inp_trans_node) + if need_im2col: + graph.node.insert(node_ind + 1, fmpadding_pixel_node) + graph.node.insert(node_ind + 2, im2col_node) + graph.node.insert(node_ind + 3, matmul_node) + graph.node.insert(node_ind + 4, out_trans_node) + else: + graph.node.insert(node_ind + 1, fmpadding_pixel_node) + graph.node.insert(node_ind + 2, matmul_node) + graph.node.insert(node_ind + 3, out_trans_node) + # remove old nodes + graph.node.remove(n) + + return (model, graph_modified) diff --git a/src/finn/transformation/fpgadataflow/insert_dwc.py b/src/finn/transformation/fpgadataflow/insert_dwc.py index 632d1f813b..33cc3e86d3 
100644 --- a/src/finn/transformation/fpgadataflow/insert_dwc.py +++ b/src/finn/transformation/fpgadataflow/insert_dwc.py @@ -1,3 +1,31 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ from onnx import TensorProto from onnx import helper as oh from qonnx.custom_op.registry import getCustomOp @@ -7,19 +35,16 @@ def _is_dwc_node(node): - if node.op_type == "StreamingDataWidthConverter_Batch": - return True - else: - return False + return node.op_type.startswith("StreamingDataWidthConverter") def _suitable_node(node): if node is not None: - if is_fpgadataflow_node(node) is True: + if is_fpgadataflow_node(node): if _is_dwc_node(node): # no DWC for DWCs return False - elif node.op_type == "IODMA": + elif node.op_type == "IODMA_hls": # IODMA data shapes/widths need special handling return False else: @@ -48,8 +73,7 @@ def apply(self, model): if consumers == []: continue assert len(consumers) == 1, ( - n.name - + ": HLS node with fan-out higher than 1 cannot be stitched" + n.name + ": HW node with fan-out higher than 1 cannot be stitched" ) consumer = consumers[0] if _suitable_node(consumer) is True: @@ -61,9 +85,9 @@ def apply(self, model): # - if FC and external mem, it could be connected to input 1 # - if concat, could be connected to any input if ( - consumer.op_type == "MatrixVectorActivation" + consumer.op_type.startswith("MVAU") and n1.get_nodeattr("mem_mode") == "external" - ) or (consumer.op_type == "StreamingConcat"): + ) or (consumer.op_type.startswith("StreamingConcat")): # get input idx in_idx = None for idx, n_input in enumerate(consumer.input): @@ -81,15 +105,7 @@ def apply(self, model): dwc_in_width = n0.get_outstream_width() # determine dwc outwidth dwc_out_width = n1.get_instream_width() - larger_width = max(dwc_in_width, dwc_out_width) - smaller_width = min(dwc_in_width, dwc_out_width) - both_8bit_aligned = (larger_width % 8 == 0) and ( - smaller_width % 8 == 0 - ) - if both_8bit_aligned: - impl_style = "vivado" - else: - impl_style = "hls" + node_optype = "StreamingDataWidthConverter" # determine shape for dwc dwc_shape = n0.get_normal_output_shape() @@ -105,7 +121,7 @@ def apply(self, model): 
graph.value_info.append(dwc_output_tensor) dwc_node = oh.make_node( - "StreamingDataWidthConverter_Batch", + node_optype, [output_name], [dwc_output_tensor.name], domain="finn.custom_op.fpgadataflow", @@ -114,7 +130,6 @@ def apply(self, model): inWidth=dwc_in_width, outWidth=dwc_out_width, dataType=str(dtype.name), - impl_style=impl_style, ) # insert dwc graph.node.insert(node_ind + 1, dwc_node) diff --git a/src/finn/transformation/fpgadataflow/insert_fifo.py b/src/finn/transformation/fpgadataflow/insert_fifo.py index bfeee95e9b..9df193efcf 100644 --- a/src/finn/transformation/fpgadataflow/insert_fifo.py +++ b/src/finn/transformation/fpgadataflow/insert_fifo.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -37,7 +38,7 @@ def _is_fifo_node(node): - if node.op_type == "StreamingFIFO": + if node.op_type.startswith("StreamingFIFO"): return True else: return False @@ -45,8 +46,8 @@ def _is_fifo_node(node): def _suitable_node(node): if node is not None: - if is_fpgadataflow_node(node) is True: - if _is_fifo_node(node) is False: + if is_fpgadataflow_node(node): + if not _is_fifo_node(node): return True else: return False @@ -85,9 +86,7 @@ class InsertFIFO(Transformation): The other node attributes necessary to create a FIFO node are taken from the node the FIFO node is inserted after: 'folded_shape' and 'dtype'""" - def __init__( - self, create_shallow_fifos=False, max_qsrl_depth=None, vivado_ram_style="auto" - ): + def __init__(self, create_shallow_fifos=False, max_qsrl_depth=None, vivado_ram_style="auto"): super().__init__() self.create_shallow_fifos = create_shallow_fifos self.max_qsrl_depth = max_qsrl_depth @@ -129,6 +128,7 @@ def apply(self, model): folded output shape of the first node is not the same as the folded output shape of the second node. 
A streaming fifo can't be implemented in between these nodes.""" + n_shape = n0.get_normal_output_shape() # check if outFIFOdepths attribute of first node # and inFIFOdepths attribute of consumer node is equal @@ -151,10 +151,7 @@ def apply(self, model): graph.value_info.append(fifo_output_tensor) model.set_tensor_datatype(fifo_output_tensor.name, dtype) - if ( - self.max_qsrl_depth is None - or fifo_depth <= self.max_qsrl_depth - ): + if self.max_qsrl_depth is None or fifo_depth <= self.max_qsrl_depth: impl_style = "rtl" else: impl_style = "vivado" @@ -167,6 +164,7 @@ def apply(self, model): backend="fpgadataflow", depth=fifo_depth, folded_shape=fld_shape, + normal_shape=n_shape, dataType=str(dtype.name), impl_style=impl_style, ram_style=self.vivado_ram_style, @@ -188,14 +186,15 @@ def apply(self, model): first_node = model.find_consumer(graph_in_name) # insert FIFO as first node, except when first node is DMA if ( - first_node.op_type != "StreamingFIFO" - and first_node.op_type != "IODMA" + not first_node.op_type.startswith("StreamingFIFO") + and first_node.op_type != "IODMA_hls" ): inp_ind = list(first_node.input).index(graph_in_name) n_input = first_node.input[inp_ind] n0 = getCustomOp(first_node) # determine fifo node attributes fld_shape = n0.get_folded_input_shape(inp_ind) + n_shape = n0.get_normal_input_shape(inp_ind) dtype = n0.get_input_datatype(inp_ind) fifo_depth = n0.get_nodeattr("inFIFODepths")[inp_ind] @@ -204,7 +203,7 @@ def apply(self, model): fifo_output_tensor = oh.make_tensor_value_info( model.make_new_valueinfo_name(), TensorProto.FLOAT, - n0.get_normal_input_shape(), + n0.get_normal_input_shape(inp_ind), ) graph.value_info.append(fifo_output_tensor) model.set_tensor_datatype(fifo_output_tensor.name, dtype) @@ -221,6 +220,7 @@ def apply(self, model): backend="fpgadataflow", depth=fifo_depth, folded_shape=fld_shape, + normal_shape=n_shape, dataType=str(dtype.name), impl_style=impl_style, ram_style=self.vivado_ram_style, @@ -243,17 +243,18 @@ def 
apply(self, model): for graph_out_name in graph_out_names: final_node = model.find_producer(graph_out_name) if ( - final_node.op_type != "StreamingFIFO" - and final_node.op_type != "IODMA" + not final_node.op_type.startswith("StreamingFIFO") + and final_node.op_type != "IODMA_hls" ): assert ( - final_node.op_type != "TLastMarker" + final_node.op_type != "TLastMarker_hls" ), """Insert tlast marker should be done after inserting the FIFOs""" n0 = getCustomOp(final_node) out_ind = list(final_node.output).index(graph_out_name) # determine fifo node attributes fld_shape = n0.get_folded_output_shape(out_ind) + n_shape = n0.get_normal_output_shape(out_ind) dtype = n0.get_output_datatype(out_ind) fifo_depth = n0.get_nodeattr("outFIFODepths")[out_ind] @@ -279,6 +280,7 @@ def apply(self, model): backend="fpgadataflow", depth=fifo_depth, folded_shape=fld_shape, + normal_shape=n_shape, dataType=str(dtype.name), impl_style=impl_style, ram_style=self.vivado_ram_style, diff --git a/src/finn/transformation/fpgadataflow/insert_hook.py b/src/finn/transformation/fpgadataflow/insert_hook.py index 21ec3f049f..843a32a73e 100644 --- a/src/finn/transformation/fpgadataflow/insert_hook.py +++ b/src/finn/transformation/fpgadataflow/insert_hook.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2022, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -33,11 +34,11 @@ from qonnx.transformation.base import Transformation from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node def _is_hook_node(node): - if node.op_type in ["CheckSum"]: + if node.op_type in ["CheckSum_hls"]: return True else: return False @@ -45,8 +46,8 @@ def _is_hook_node(node): def _suitable_node(node): if node is not None: - if is_fpgadataflow_node(node) is True: - if _is_hook_node(node) is False: + if is_hls_node(node) or is_rtl_node(node): + if not _is_hook_node(node): return True else: return False @@ -74,15 +75,14 @@ def apply(self, model): for output_name in n.output: consumers = model.find_consumers(output_name) assert len(consumers) <= 1, ( - n.name - + ": HLS node with fan-out higher than 1 cannot be stitched" + n.name + ": HLS node with fan-out higher than 1 cannot be stitched" ) n0 = getCustomOp(n) n0_hook = n0.get_nodeattr("output_hook") if n0_hook in list_supported_hooks: if n0_hook == "checksum": if len(consumers) == 1: - if consumers[0].op_type == "CheckSum": + if consumers[0].op_type == "CheckSum_hls": continue n0_normal_oshape = n0.get_normal_output_shape() n0_folded_oshape = n0.get_folded_output_shape() @@ -100,10 +100,10 @@ def apply(self, model): [1], ) chk_node = oh.make_node( - "CheckSum", + "CheckSum_hls", [output_name], outputs=[chk_otensor.name, chk_result.name], - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", words_per_frame=words_per_frame, items_per_word=items_per_word, diff --git a/src/finn/transformation/fpgadataflow/insert_iodma.py b/src/finn/transformation/fpgadataflow/insert_iodma.py index 28bcd9598a..91d4ab1559 100644 --- a/src/finn/transformation/fpgadataflow/insert_iodma.py +++ b/src/finn/transformation/fpgadataflow/insert_iodma.py 
@@ -51,9 +51,7 @@ def __init__( self.insert_input = insert_input self.insert_output = insert_output self.insert_extmemw = insert_extmemw - assert ( - 2 ** math.log2(max_intfwidth) == max_intfwidth - ), "max_intfwidth must be a power of 2" + assert 2 ** math.log2(max_intfwidth) == max_intfwidth, "max_intfwidth must be a power of 2" self.max_intfwidth = max_intfwidth def get_mem_init(self, weights, pe, simd): @@ -108,7 +106,7 @@ def apply(self, model): graph_in_names = [x.name for x in model.graph.input] for graph_in_name in graph_in_names: first_node = model.find_consumer(graph_in_name) - if first_node.op_type == "IODMA": + if first_node.op_type == "IODMA_hls": # IODMA already inserted for this input continue else: @@ -122,13 +120,9 @@ def apply(self, model): padded_instream_width = first_node_inst.get_instream_width_padded() padded_instream_bytes = padded_instream_width // 8 # determine the feasible interface width - transfer_bits = padded_instream_width * np.prod( - in_folded_shape[:-1] - ) + transfer_bits = padded_instream_width * np.prod(in_folded_shape[:-1]) intfwidth = math.gcd(transfer_bits, self.max_intfwidth) - assert ( - intfwidth % 8 == 0 - ), "No feasible interface width for transfer size" + assert intfwidth % 8 == 0, "No feasible interface width for transfer size" # make new buffer first_node_in = oh.make_tensor_value_info( model.make_new_valueinfo_name(), TensorProto.FLOAT, in_shape @@ -140,7 +134,7 @@ def apply(self, model): # padding problems for i/o DMA first_node.input[0] = first_node_in.name dma_node = oh.make_node( - "IODMA", + "IODMA_hls", [graph_in_name], [first_node_in.name], numInputVectors=in_folded_shape[:-1], @@ -149,7 +143,7 @@ def apply(self, model): intfWidth=intfwidth, streamWidth=padded_instream_width, direction="in", - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", ) model.graph.node.insert(0, dma_node) @@ -159,7 +153,7 @@ def apply(self, model): graph_out_names = [x.name for 
x in model.graph.output] for graph_out_name in graph_out_names: final_node = model.find_producer(graph_out_name) - if final_node.op_type == "IODMA": + if final_node.op_type == "IODMA_hls": continue else: out_shape = model.get_tensor_shape(graph_out_name) @@ -169,18 +163,12 @@ def apply(self, model): # take advantage of AXI stream width padding for DMA alignment # (AXI streams are always padded to 8 bits) # this is the width of stream input to DMA - padded_outstream_width = ( - final_node_inst.get_outstream_width_padded() - ) + padded_outstream_width = final_node_inst.get_outstream_width_padded() padded_outstream_bytes = padded_outstream_width // 8 # determine the feasible interface width - transfer_bits = padded_outstream_width * np.prod( - out_folded_shape[:-1] - ) + transfer_bits = padded_outstream_width * np.prod(out_folded_shape[:-1]) intfwidth = math.gcd(transfer_bits, self.max_intfwidth) - assert ( - intfwidth % 8 == 0 - ), "No feasible interface width for transfer size" + assert intfwidth % 8 == 0, "No feasible interface width for transfer size" # make new buffer final_node_out = oh.make_tensor_value_info( model.make_new_valueinfo_name(), TensorProto.FLOAT, out_shape @@ -192,7 +180,7 @@ def apply(self, model): # FIXME: currently always using 8-bit dtypes to work around the # padding problems for i/o DMA dma_node = oh.make_node( - "IODMA", + "IODMA_hls", [final_node_out.name], [graph_out_name], numInputVectors=out_folded_shape[:-1], @@ -201,7 +189,7 @@ def apply(self, model): intfWidth=intfwidth, streamWidth=padded_outstream_width, direction="out", - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", ) model.graph.node.append(dma_node) @@ -211,8 +199,7 @@ def apply(self, model): # attached IODMA fc_extw_nodes = list( filter( - lambda x: x.op_type - in ["MatrixVectorActivation", "VectorVectorActivation"] + lambda x: x.op_type in ["MVAU_hls", "MVAU_rtl", "VVAU_hls", "VVAU_rtl"] and 
getCustomOp(x).get_nodeattr("mem_mode") == "external" and model.find_producer(x.input[1]) is None, all_nodes, @@ -226,9 +213,7 @@ def apply(self, model): # determine the feasible interface width transfer_bits = np.prod(w_shape) * w_dtype.bitwidth() intfwidth = math.gcd(transfer_bits, self.max_intfwidth) - assert ( - intfwidth % 8 == 0 - ), "No feasible interface width for transfer size" + assert intfwidth % 8 == 0, "No feasible interface width for transfer size" # calculate width of stream output from DMA pe = get_by_name(fc_node.attribute, "PE").i simd = get_by_name(fc_node.attribute, "SIMD").i @@ -245,7 +230,7 @@ def apply(self, model): model.set_tensor_datatype(fc_node_in.name, w_dtype) model.set_initializer(fc_node_in.name, W) dma_node = oh.make_node( - "IODMA", + "IODMA_hls", [fc_w_name], [fc_node_in.name], numInputVectors=[iodma_mem.shape[0]], @@ -255,7 +240,7 @@ def apply(self, model): streamWidth=streamWidth, direction="in", burstMode="wrap", - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", ) fc_node.input[1] = fc_node_in.name diff --git a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py index 1610916eb6..2131100dcf 100644 --- a/src/finn/transformation/fpgadataflow/insert_tlastmarker.py +++ b/src/finn/transformation/fpgadataflow/insert_tlastmarker.py @@ -35,7 +35,7 @@ class InsertTLastMarker(Transformation): - """Ensure that the graph is started/terminated with a TLastMarker node, inserting + """Ensure that the graph is started/terminated with a TLastMarker_hls node, inserting one if necessary. Use constructor args to determine type of TLastMarker to be inserted. More information available on the TLastMarker documentation. 
@@ -52,12 +52,10 @@ def apply(self, model): graph_out_name = model.graph.output[0].name final_node = model.find_producer(graph_out_name) graph_modified = False - if final_node.op_type != "TLastMarker" and not ( - final_node.op_type == "IODMA" - and get_by_name(final_node.attribute, "direction").s.decode("UTF-8") - == "out" + if final_node.op_type != "TLastMarker_hls" and not ( + final_node.op_type == "IODMA_hls" + and get_by_name(final_node.attribute, "direction").s.decode("UTF-8") == "out" ): - custom_op = getCustomOp(final_node) num_iters = int(custom_op.get_number_output_values()) stream_width = int(custom_op.get_outstream_width()) @@ -73,7 +71,7 @@ def apply(self, model): # reroute final node output to final_node_out_name final_node.output[0] = final_node_out.name tlast_node = oh.make_node( - "TLastMarker", + "TLastMarker_hls", [final_node_out.name], [graph_out_name], NumIters=num_iters, @@ -82,7 +80,7 @@ def apply(self, model): DynIters=(1 if self.dyniters else 0), Direction="out", Protocol=("external" if self.external else "internal"), - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", ) model.graph.node.append(tlast_node) @@ -105,28 +103,23 @@ def apply(self, model): # the input is in the list of graph inputs because it has an # initializer (TODO: fix this with a clean-up transform) if ( - first_node.op_type == "MatrixVectorActivation" + first_node.op_type.startswith("MVAU") and get_by_name(first_node.attribute, "mem_mode").s.decode("UTF-8") != "external" ): continue # 2. 
node is either a TLastMarker or an input IODMA - if first_node.op_type != "TLastMarker" and not ( - first_node.op_type == "IODMA" - and get_by_name(first_node.attribute, "direction").s.decode("UTF-8") - == "in" + if first_node.op_type != "TLastMarker_hls" and not ( + first_node.op_type == "IODMA_hls" + and get_by_name(first_node.attribute, "direction").s.decode("UTF-8") == "in" ): - custom_op = getCustomOp(first_node) num_iters = np.prod(custom_op.get_folded_input_shape()[1:-1]) inp_idx = list(first_node.input).index(graph_in_name) if inp_idx > 0: - if ( - first_node.op_type == "MatrixVectorActivation" - and inp_idx == 1 - ): + if first_node.op_type.startswith("MVAU") and inp_idx == 1: stream_width = int(custom_op.get_weightstream_width()) - elif first_node.op_type == "AddStreams_Batch" and inp_idx == 1: + elif first_node.op_type.startswith("AddStreams") and inp_idx == 1: stream_width = int(custom_op.get_instream_width()) else: raise Exception("No method to determine stream width") @@ -148,7 +141,7 @@ def apply(self, model): # reroute final node output to first_node_in_name first_node.input[inp_idx] = first_node_in.name tlast_node = oh.make_node( - "TLastMarker", + "TLastMarker_hls", [graph_in_name], [first_node_in.name], NumIters=num_iters, @@ -157,7 +150,7 @@ def apply(self, model): DynIters=(1 if self.dyniters else 0), Direction="in", Protocol=("external" if self.external else "internal"), - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", ) model.graph.node.insert(insert_idx, tlast_node) diff --git a/src/finn/transformation/fpgadataflow/make_deployment.py b/src/finn/transformation/fpgadataflow/make_deployment.py deleted file mode 100644 index d4684dc83c..0000000000 --- a/src/finn/transformation/fpgadataflow/make_deployment.py +++ /dev/null @@ -1,116 +0,0 @@ -# Copyright (c) 2020, Xilinx -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import os -import subprocess -from distutils.dir_util import copy_tree -from qonnx.transformation.base import Transformation -from shutil import copy - -import finn.transformation.fpgadataflow.templates as templates -from finn.util.basic import make_build_dir - - -class DeployToPYNQ(Transformation): - """Collects all necessary files for deployment and copies them to the PYNQ board. 
- Expects information about PYNQ board to make scp possible: - - IP address of board, username and password for board and target directory where - the files are stored on the board""" - - def __init__(self, ip, port, username, password, target_dir): - super().__init__() - self.ip = ip - self.port = port - self.username = username - self.password = password - self.target_dir = target_dir - - def apply(self, model): - # set metadata properties accordingly to user input specifications - model.set_metadata_prop("pynq_ip", self.ip) - model.set_metadata_prop("pynq_port", str(self.port)) - model.set_metadata_prop("pynq_username", self.username) - model.set_metadata_prop("pynq_password", self.password) - model.set_metadata_prop("pynq_target_dir", self.target_dir) - - # create directory for deployment files - deployment_dir = make_build_dir(prefix="pynq_deployment_") - model.set_metadata_prop("pynq_deployment_dir", deployment_dir) - - # get and copy necessary files - # .bit and .hwh file - bitfile = model.get_metadata_prop("bitfile") - hwh_file = model.get_metadata_prop("hw_handoff") - deploy_files = [bitfile, hwh_file] - - for dfile in deploy_files: - if dfile is not None: - copy(dfile, deployment_dir) - - # helper script for Alveo - platform = model.get_metadata_prop("platform") - if platform == "alveo": - alveo_run_sh = templates.alveo_run_sh_template - fill_dict = { - "$REMOTE_DEPLOY_DIR$": self.target_dir - + "/" - + os.path.basename(deployment_dir), - "$CONDA_ENV_NAME$": "finn-pynq-alveo", - "$REMOTE_XRT$": os.environ["XILINX_XRT"], - "$REMOTE_PLATFORM_REPO_PATHS$": os.environ["PLATFORM_REPO_PATHS"], - "$BITFILE$": os.path.basename(bitfile), - } - for key, value in fill_dict.items(): - alveo_run_sh = alveo_run_sh.replace(key, value) - alveo_run_sh_path = deployment_dir + "/alveo_run.sh" - with open(alveo_run_sh_path, "w") as f: - f.write(alveo_run_sh) - - # driver.py and python libraries - pynq_driver_dir = model.get_metadata_prop("pynq_driver_dir") - 
copy_tree(pynq_driver_dir, deployment_dir) - model.set_metadata_prop("pynq_deploy_dir", deployment_dir) - model.set_metadata_prop("exec_mode", "remote_pynq") - - # create target directory on PYNQ board - cmd = 'ssh {}@{} -p {} "mkdir -p {}"'.format( - self.username, self.ip, self.port, self.target_dir - ) - bash_command = ["/bin/bash", "-c", cmd] - process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE) - process_compile.communicate() - # copy directory to PYNQ board using scp - cmd = "scp -P{} -r {} {}@{}:{}".format( - self.port, deployment_dir, self.username, self.ip, self.target_dir - ) - bash_command = ["/bin/bash", "-c", cmd] - process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE) - process_compile.communicate() - - return (model, False) diff --git a/src/finn/transformation/fpgadataflow/make_pynq_driver.py b/src/finn/transformation/fpgadataflow/make_pynq_driver.py index dce98e54a3..ea9bd2aa26 100644 --- a/src/finn/transformation/fpgadataflow/make_pynq_driver.py +++ b/src/finn/transformation/fpgadataflow/make_pynq_driver.py @@ -26,9 +26,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import pkg_resources as pk - import numpy as np import os import qonnx @@ -56,14 +53,10 @@ def to_external_tensor(init, w_dtype): weight_width = init.shape[1] * w_dtype.bitwidth() weight_width_padded = roundup_to_integer_multiple(weight_width, 4) - hex_init = pack_innermost_dim_as_hex_string( - init, w_dtype, weight_width_padded, prefix="0x" - ) + hex_init = pack_innermost_dim_as_hex_string(init, w_dtype, weight_width_padded, prefix="0x") ext_weight = np.array([], dtype=np.uint8) for line in hex_init: - array_line = [ - x for x in reversed(hexstring2npbytearray(line, remove_prefix="0x")) - ] + array_line = [x for x in reversed(hexstring2npbytearray(line, remove_prefix="0x"))] ext_weight = np.append(ext_weight, array_line) return ext_weight @@ -88,14 +81,13 @@ def __init__(self, platform): self.platform = platform def apply(self, model): - # create a temporary folder for the generated driver pynq_driver_dir = make_build_dir(prefix="pynq_driver_") model.set_metadata_prop("pynq_driver_dir", pynq_driver_dir) # create the base FINN driver -- same for all accels - driver_base_template = pk.resource_filename( - "finn.qnn-data", "templates/driver/driver_base.py" + driver_base_template = ( + os.environ["FINN_ROOT"] + "/src/finn/qnn-data/templates/driver/driver_base.py" ) driver_base_py = pynq_driver_dir + "/driver_base.py" shutil.copy(driver_base_template, driver_base_py) @@ -115,9 +107,7 @@ def apply(self, model): files_to_copy.append( (qonnx_path + "/core/__init__.py", qonnx_target_path + "/core/__init__.py") ) - files_to_copy.append( - (qonnx_path + "/util/basic.py", qonnx_target_path + "/util/basic.py") - ) + files_to_copy.append((qonnx_path + "/util/basic.py", qonnx_target_path + "/util/basic.py")) files_to_copy.append( (qonnx_path + "/util/__init__.py", qonnx_target_path + "/util/__init__.py") ) @@ -133,7 +123,7 @@ def apply(self, model): finn_target_path + "/util/__init__.py", ) ) - for (src_file, target_file) in files_to_copy: + for src_file, target_file in 
files_to_copy: shutil.copy(src_file, target_file) # extract input-output shapes from the graph # TODO convert this to an analysis pass? @@ -156,7 +146,7 @@ def apply(self, model): Ensure CreateDataflowPartition called before driver creation.""" first_df_model = ModelWrapper(getCustomOp(i_consumer).get_nodeattr("model")) assert ( - first_df_model.graph.node[0].op_type == "IODMA" + first_df_model.graph.node[0].op_type == "IODMA_hls" ), "First partition must hold input IODMA" successors = model.find_direct_successors(i_consumer) successor_input_num = list(successors[0].input).index(i_consumer.output[0]) @@ -165,13 +155,9 @@ def apply(self, model): first_node = successor_df_model.find_consumer( successor_df_model.graph.input[successor_input_num].name ) - i_tensor_shape_folded = tuple( - getCustomOp(first_node).get_folded_input_shape() - ) + i_tensor_shape_folded = tuple(getCustomOp(first_node).get_folded_input_shape()) # generate dummy folded i/o tensors and their packed versions - i_tensor_dummy_folded = gen_finn_dt_tensor( - i_tensor_dt, i_tensor_shape_folded - ) + i_tensor_dummy_folded = gen_finn_dt_tensor(i_tensor_dt, i_tensor_shape_folded) i_tensor_dummy_packed = dpk.finnpy_to_packed_bytearray( i_tensor_dummy_folded, i_tensor_dt ) @@ -202,23 +188,17 @@ def apply(self, model): Ensure CreateDataflowPartition called before driver creation.""" df_model = ModelWrapper(getCustomOp(o_producer).get_nodeattr("model")) assert ( - df_model.graph.node[-1].op_type == "IODMA" + df_model.graph.node[-1].op_type == "IODMA_hls" ), "Partition must hold output IODMA" predecessors = model.find_direct_predecessors(o_producer) - predecessor_output_num = list(predecessors[0].output).index( - o_producer.input[0] - ) + predecessor_output_num = list(predecessors[0].output).index(o_producer.input[0]) predecessor_sdp = getCustomOp(predecessors[0]) predecessor_df_model = ModelWrapper(predecessor_sdp.get_nodeattr("model")) last_node = predecessor_df_model.find_producer( 
predecessor_df_model.graph.output[predecessor_output_num].name ) - o_tensor_shape_folded = tuple( - getCustomOp(last_node).get_folded_output_shape() - ) - o_tensor_dummy_folded = gen_finn_dt_tensor( - o_tensor_dt, o_tensor_shape_folded - ) + o_tensor_shape_folded = tuple(getCustomOp(last_node).get_folded_output_shape()) + o_tensor_dummy_folded = gen_finn_dt_tensor(o_tensor_dt, o_tensor_shape_folded) o_tensor_dummy_packed = dpk.finnpy_to_packed_bytearray( o_tensor_dummy_folded, o_tensor_dt ) @@ -253,20 +233,14 @@ def apply(self, model): sdp_inst = getCustomOp(node) idma_name = sdp_inst.get_nodeattr("instance_name") df_model = ModelWrapper(sdp_inst.get_nodeattr("model")) - assert df_model.graph.node[0].op_type == "IODMA" + assert df_model.graph.node[0].op_type == "IODMA_hls" iodma_node = getCustomOp(df_model.graph.node[0]) if iodma_node.get_nodeattr("burstMode") == "wrap": # input weights dma? - init_tensor = df_model.get_initializer( - iodma_node.onnx_node.input[0] - ) + init_tensor = df_model.get_initializer(iodma_node.onnx_node.input[0]) ext_weight_dma_cnt += 1 - w_dtype = df_model.get_tensor_datatype( - iodma_node.onnx_node.input[0] - ) + w_dtype = df_model.get_tensor_datatype(iodma_node.onnx_node.input[0]) init_external_tensor = to_external_tensor(init_tensor, w_dtype) - np.save( - weights_dir + "/" + idma_name + ".npy", init_external_tensor - ) + np.save(weights_dir + "/" + idma_name + ".npy", init_external_tensor) idma_idx += 1 # fill in the driver template @@ -293,8 +267,8 @@ def apply(self, model): # add validate.py to run full top-1 test (only for suitable networks) validate_py = pynq_driver_dir + "/validate.py" - validate_template = pk.resource_filename( - "finn.qnn-data", "templates/driver/validate.py" + validate_template = ( + os.environ["FINN_ROOT"] + "/src/finn/qnn-data/templates/driver/validate.py" ) shutil.copy(validate_template, validate_py) @@ -308,7 +282,7 @@ def apply(self, model): dataflow_model = ModelWrapper(dataflow_model_filename) 
rt_layer_ind = 0 for node in dataflow_model.graph.node: - if node.op_type in ["MatrixVectorActivation", "Thresholding_Batch"]: + if node.op_type.startswith("MVAU") or node.op_type.startswith("Thresholding"): node_inst = getCustomOp(node) is_rt_weights = node_inst.get_nodeattr("runtime_writeable_weights") if is_rt_weights == 1: @@ -318,9 +292,7 @@ def apply(self, model): rt_layer_ind, node.name, ) - node_inst.make_weight_file( - fcl_w, "decoupled_runtime", w_filename - ) + node_inst.make_weight_file(fcl_w, "decoupled_runtime", w_filename) rt_layer_ind += 1 elif node.op_type == "StreamingDataflowPartition": warnings.warn( diff --git a/src/finn/transformation/fpgadataflow/make_zynq_proj.py b/src/finn/transformation/fpgadataflow/make_zynq_proj.py index f48566326e..fc2047b08e 100644 --- a/src/finn/transformation/fpgadataflow/make_zynq_proj.py +++ b/src/finn/transformation/fpgadataflow/make_zynq_proj.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -45,6 +46,7 @@ from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.util.basic import make_build_dir, pynq_native_port_width, pynq_part_map from . 
import templates @@ -62,8 +64,8 @@ def collect_ip_dirs(model, ipstitch_path): ), """The directory that should contain the generated ip blocks doesn't exist.""" ip_dirs += [ip_dir_value] - if node.op_type in ["MatrixVectorActivation", "Thresholding_Batch"]: - if node_inst.get_nodeattr("mem_mode") == "decoupled": + if node.op_type.startswith("MVAU") or node.op_type == "Thresholding_hls": + if node_inst.get_nodeattr("mem_mode") == "internal_decoupled": need_memstreamer = True ip_dirs += [ipstitch_path + "/ip"] if need_memstreamer: @@ -92,7 +94,6 @@ def __init__(self, platform, enable_debug=False): self.enable_debug = 1 if enable_debug else 0 def apply(self, model): - # create a config file and empty list of xo files config = [] idma_idx = 0 @@ -110,15 +111,12 @@ def apply(self, model): ipstitch_path = kernel_model.get_metadata_prop("vivado_stitch_proj") if ipstitch_path is None or (not os.path.isdir(ipstitch_path)): raise Exception( - "No stitched IPI design found for %s, apply CreateStitchedIP first." - % node.name + "No stitched IPI design found for %s, apply CreateStitchedIP first." % node.name ) vivado_stitch_vlnv = kernel_model.get_metadata_prop("vivado_stitch_vlnv") if vivado_stitch_vlnv is None: - raise Exception( - "No vlnv found for %s, apply CreateStitchedIP first." % node.name - ) + raise Exception("No vlnv found for %s, apply CreateStitchedIP first." 
% node.name) ip_dirs = ["list"] ip_dirs += collect_ip_dirs(kernel_model, ipstitch_path) @@ -170,9 +168,7 @@ def apply(self, model): "[get_bd_intf_pins smartconnect_0/S%02d_AXI]" % (instance_names[node.name], aximm_idx) ) - assert ( - len(ifnames["axilite"]) == 1 - ), "Must have 1 AXI lite interface on IODMA nodes" + assert len(ifnames["axilite"]) == 1, "Must have 1 AXI lite interface on IODMA nodes" axilite_intf_name = ifnames["axilite"][0] assert axilite_intf_name is not None config.append( @@ -182,8 +178,7 @@ def apply(self, model): ) # assign_bd_address with appropriate range/offset config.append( - "assign_axi_addr_proc %s/%s" - % (instance_names[node.name], axilite_intf_name) + "assign_axi_addr_proc %s/%s" % (instance_names[node.name], axilite_intf_name) ) aximm_idx += 1 @@ -269,23 +264,18 @@ def apply(self, model): bash_command = ["bash", synth_project_sh] process_compile = subprocess.Popen(bash_command, stdout=subprocess.PIPE) process_compile.communicate() - bitfile_name = ( - vivado_pynq_proj_dir + "/finn_zynq_link.runs/impl_1/top_wrapper.bit" - ) + bitfile_name = vivado_pynq_proj_dir + "/finn_zynq_link.runs/impl_1/top_wrapper.bit" if not os.path.isfile(bitfile_name): raise Exception( - "Synthesis failed, no bitfile found. Check logs under %s" - % vivado_pynq_proj_dir + "Synthesis failed, no bitfile found. 
Check logs under %s" % vivado_pynq_proj_dir ) deploy_bitfile_name = vivado_pynq_proj_dir + "/resizer.bit" copy(bitfile_name, deploy_bitfile_name) # set bitfile attribute model.set_metadata_prop("bitfile", deploy_bitfile_name) hwh_name_alts = [ - vivado_pynq_proj_dir - + "/finn_zynq_link.srcs/sources_1/bd/top/hw_handoff/top.hwh", - vivado_pynq_proj_dir - + "/finn_zynq_link.gen/sources_1/bd/top/hw_handoff/top.hwh", + vivado_pynq_proj_dir + "/finn_zynq_link.srcs/sources_1/bd/top/hw_handoff/top.hwh", + vivado_pynq_proj_dir + "/finn_zynq_link.gen/sources_1/bd/top/hw_handoff/top.hwh", ] hwh_name = None for hwh_name_cand in hwh_name_alts: @@ -293,8 +283,7 @@ def apply(self, model): hwh_name = hwh_name_cand if not os.path.isfile(hwh_name): raise Exception( - "Synthesis failed, no bitfile found. Check logs under %s" - % vivado_pynq_proj_dir + "Synthesis failed, no bitfile found. Check logs under %s" % vivado_pynq_proj_dir ) deploy_hwh_name = vivado_pynq_proj_dir + "/resizer.hwh" copy(hwh_name, deploy_hwh_name) @@ -333,6 +322,7 @@ def apply(self, model): prep_transforms = [ InsertIODMA(self.axi_port_width), InsertDWC(), + SpecializeLayers(), Floorplan(), CreateDataflowPartition(partition_model_dir=self.partition_model_dir), ] @@ -348,23 +338,18 @@ def apply(self, model): dataflow_model_filename = sdp_node.get_nodeattr("model") kernel_model = ModelWrapper(dataflow_model_filename) kernel_model = kernel_model.transform(InsertFIFO()) + kernel_model = kernel_model.transform(SpecializeLayers()) kernel_model = kernel_model.transform(GiveUniqueNodeNames(prefix)) kernel_model.save(dataflow_model_filename) - kernel_model = kernel_model.transform( - PrepareIP(self.fpga_part, self.period_ns) - ) + kernel_model = kernel_model.transform(PrepareIP(self.fpga_part, self.period_ns)) kernel_model = kernel_model.transform(HLSSynthIP()) kernel_model = kernel_model.transform( - CreateStitchedIP( - self.fpga_part, self.period_ns, sdp_node.onnx_node.name, False - ) + 
CreateStitchedIP(self.fpga_part, self.period_ns, sdp_node.onnx_node.name, False) ) kernel_model.set_metadata_prop("platform", "zynq-iodma") kernel_model.save(dataflow_model_filename) # Assemble design from IPs - model = model.transform( - MakeZYNQProject(self.platform, enable_debug=self.enable_debug) - ) + model = model.transform(MakeZYNQProject(self.platform, enable_debug=self.enable_debug)) # set platform attribute for correct remote execution model.set_metadata_prop("platform", "zynq-iodma") diff --git a/src/finn/transformation/fpgadataflow/minimize_accumulator_width.py b/src/finn/transformation/fpgadataflow/minimize_accumulator_width.py index bc020ca428..61159fde0c 100644 --- a/src/finn/transformation/fpgadataflow/minimize_accumulator_width.py +++ b/src/finn/transformation/fpgadataflow/minimize_accumulator_width.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,6 +29,7 @@ from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation +from qonnx.transformation.infer_datatypes import InferDataTypes from finn.util.fpgadataflow import is_fpgadataflow_node @@ -41,9 +43,15 @@ def __init__(self): super().__init__() def apply(self, model): - for node in model.graph.node: - if is_fpgadataflow_node(node) is True: + for node_id in range(len(model.graph.node)): + # Since InferDataTypes potentially changes node attributes in each loop iterations, + # the for-loop cannot loop over a list of a snapshot of the graph's node protos + node = model.graph.node[node_id] + if is_fpgadataflow_node(node): inst = getCustomOp(node) if hasattr(inst, "minimize_accumulator_width"): inst.minimize_accumulator_width(model) + # Since this transformation is applied iteratively, we have to ensure that + # we propagate the new datatype to other layers + model = 
model.transform(InferDataTypes()) return (model, False) diff --git a/finn-rtllib/memstream/sim/gen_memblocks.sh b/src/finn/transformation/fpgadataflow/minimize_weight_bit_width.py similarity index 64% rename from finn-rtllib/memstream/sim/gen_memblocks.sh rename to src/finn/transformation/fpgadataflow/minimize_weight_bit_width.py index b6e6b656ad..49770f7d0c 100644 --- a/finn-rtllib/memstream/sim/gen_memblocks.sh +++ b/src/finn/transformation/fpgadataflow/minimize_weight_bit_width.py @@ -1,6 +1,4 @@ -#!/bin/bash - -# Copyright (c) 2020, Xilinx +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,12 +26,24 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -NLINES=`cat $1 | wc -l` -NBLOCKS=$(( ($NLINES + 1023) / 1024 )) -rm memblock_*.dat +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.base import Transformation + +from finn.util.fpgadataflow import is_fpgadataflow_node + + +class MinimizeWeightBitWidth(Transformation): + """For relevant nodes, call the weight bit width minimization + functions to save on resources. 
May alter tensor weightDataType + if the node does not have runtime writeable weights.""" + + def __init__(self): + super().__init__() -for (( i=0; i<$NBLOCKS; i++ )) -do - START=$(( 1 + $i * 1024 )) - tail -n +$START $1 | head -n 1024 >> memblock_$i.dat -done + def apply(self, model): + for node in model.graph.node: + if is_fpgadataflow_node(node): + inst = getCustomOp(node) + if hasattr(inst, "minimize_weight_bit_width"): + inst.minimize_weight_bit_width(model) + return (model, False) diff --git a/src/finn/transformation/fpgadataflow/prepare_cppsim.py b/src/finn/transformation/fpgadataflow/prepare_cppsim.py index 07021c1e8d..d4cc6dcc99 100644 --- a/src/finn/transformation/fpgadataflow/prepare_cppsim.py +++ b/src/finn/transformation/fpgadataflow/prepare_cppsim.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -34,7 +35,7 @@ from qonnx.util.basic import get_num_default_workers from finn.util.basic import make_build_dir -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node def _codegen_single_node(node, model): @@ -49,9 +50,7 @@ def _codegen_single_node(node, model): code_gen_dir = inst.get_nodeattr("code_gen_dir_cppsim") # ensure that there is a directory if code_gen_dir == "" or not os.path.isdir(code_gen_dir): - code_gen_dir = make_build_dir( - prefix="code_gen_cppsim_" + str(node.name) + "_" - ) + code_gen_dir = make_build_dir(prefix="code_gen_cppsim_" + str(node.name) + "_") inst.set_nodeattr("code_gen_dir_cppsim", code_gen_dir) # ensure that there is generated code inside the dir inst.code_generation_cppsim(model) @@ -80,7 +79,7 @@ def __init__(self, num_workers=None): self._num_workers = mp.cpu_count() def prepareCppSim_node(self, node): - if is_fpgadataflow_node(node) is True: + if is_hls_node(node): _codegen_single_node(node, 
self.model) return (node, False) diff --git a/src/finn/transformation/fpgadataflow/prepare_ip.py b/src/finn/transformation/fpgadataflow/prepare_ip.py index 2ebd6310f0..a74e0f7afc 100644 --- a/src/finn/transformation/fpgadataflow/prepare_ip.py +++ b/src/finn/transformation/fpgadataflow/prepare_ip.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -32,7 +33,7 @@ from qonnx.transformation.base import Transformation from finn.util.basic import make_build_dir -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node def _codegen_single_node(node, model, fpgapart, clk): @@ -47,9 +48,7 @@ def _codegen_single_node(node, model, fpgapart, clk): code_gen_dir = inst.get_nodeattr("code_gen_dir_ipgen") # ensure that there is a directory if code_gen_dir == "" or not os.path.isdir(code_gen_dir): - code_gen_dir = make_build_dir( - prefix="code_gen_ipgen_" + str(node.name) + "_" - ) + code_gen_dir = make_build_dir(prefix="code_gen_ipgen_" + str(node.name) + "_") inst.set_nodeattr("code_gen_dir_ipgen", code_gen_dir) # ensure that there is generated code inside the dir inst.code_generation_ipgen(model, fpgapart, clk) @@ -74,8 +73,15 @@ class PrepareIP(Transformation): will be skipped. Outcome if succesful: Node attribute "code_gen_dir_ipgen" contains path to folder - that contains generated C++ code that can be used to generate a Vivado IP block. - The subsequent transformation is HLSSynthIP""" + that contains: + + * For HLS layers: generated C++ code that can be used to generate a Vivado IP block. + The necessary subsequent transformation is HLSSynthIP. + + * For RTL layers: filled template verilog files that can be used to instantiate as + module during IP stitching. 
+ + """ def __init__(self, fpgapart, clk): super().__init__() @@ -84,6 +90,6 @@ def __init__(self, fpgapart, clk): def apply(self, model): for node in model.graph.node: - if is_fpgadataflow_node(node) is True: + if is_hls_node(node) or is_rtl_node(node): _codegen_single_node(node, model, self.fpgapart, self.clk) return (model, False) diff --git a/src/finn/transformation/fpgadataflow/prepare_rtlsim.py b/src/finn/transformation/fpgadataflow/prepare_rtlsim.py index 645d86cf14..b8f45deb1d 100644 --- a/src/finn/transformation/fpgadataflow/prepare_rtlsim.py +++ b/src/finn/transformation/fpgadataflow/prepare_rtlsim.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -32,7 +33,7 @@ from finn.transformation.fpgadataflow.replace_verilog_relpaths import ( ReplaceVerilogRelPaths, ) -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node try: from pyverilator import PyVerilator @@ -63,7 +64,7 @@ def apply(self, model): def applyNodeLocal(self, node): op_type = node.op_type - if is_fpgadataflow_node(node) is True: + if is_hls_node(node) or is_rtl_node(node): try: # lookup op_type in registry of CustomOps inst = registry.getCustomOp(node) @@ -74,7 +75,5 @@ def applyNodeLocal(self, node): ), "Failed to prepare RTLSim, no rtlsim_so attribute found." except KeyError: # exception if op_type is not supported - raise Exception( - "Custom op_type %s is currently not supported." % op_type - ) + raise Exception("Custom op_type %s is currently not supported." 
% op_type) return (node, False) diff --git a/src/finn/transformation/fpgadataflow/replace_verilog_relpaths.py b/src/finn/transformation/fpgadataflow/replace_verilog_relpaths.py index 4e7970caa0..de13166e73 100644 --- a/src/finn/transformation/fpgadataflow/replace_verilog_relpaths.py +++ b/src/finn/transformation/fpgadataflow/replace_verilog_relpaths.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -30,7 +31,7 @@ import qonnx.custom_op.registry as registry from qonnx.transformation.base import Transformation -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node class ReplaceVerilogRelPaths(Transformation): @@ -41,7 +42,7 @@ def __init__(self): def apply(self, model): for node in model.graph.node: - if is_fpgadataflow_node(node) is True: + if is_hls_node(node) or is_rtl_node(node): try: # lookup op_type in registry of CustomOps inst = registry.getCustomOp(node) diff --git a/src/finn/transformation/fpgadataflow/set_exec_mode.py b/src/finn/transformation/fpgadataflow/set_exec_mode.py index a08d153cb2..405ddb0c42 100644 --- a/src/finn/transformation/fpgadataflow/set_exec_mode.py +++ b/src/finn/transformation/fpgadataflow/set_exec_mode.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -29,12 +30,15 @@ import qonnx.custom_op.registry as registry from qonnx.transformation.base import Transformation -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node class SetExecMode(Transformation): """Set attribute exec_mode in all fpgadataflow nodes to specify which - kind of execution should be used ("cppsim" or "rtlsim")""" + kind of execution should be used ("cppsim" or "rtlsim"). + Note that RTL components do not support cppsim. When cppsim is selected + for RTL components, by default the execution of the HW op parent is + executed.""" def __init__(self, mode): super().__init__() @@ -43,7 +47,7 @@ def __init__(self, mode): def apply(self, model): for node in model.graph.node: op_type = node.op_type - if is_fpgadataflow_node(node) is True: + if is_hls_node(node) or is_rtl_node(node): try: # lookup op_type in registry of CustomOps inst = registry.getCustomOp(node) @@ -56,7 +60,5 @@ def apply(self, model): was not successful. Node attribute "exec_mode" is not set""" except KeyError: # exception if op_type is not supported - raise Exception( - "Custom op_type %s is currently not supported." % op_type - ) + raise Exception("Custom op_type %s is currently not supported." % op_type) return (model, False) diff --git a/src/finn/transformation/fpgadataflow/set_fifo_depths.py b/src/finn/transformation/fpgadataflow/set_fifo_depths.py index 35e7b9e6c9..82ee536d50 100644 --- a/src/finn/transformation/fpgadataflow/set_fifo_depths.py +++ b/src/finn/transformation/fpgadataflow/set_fifo_depths.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -47,7 +48,8 @@ from finn.transformation.fpgadataflow.insert_dwc import InsertDWC from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.prepare_ip import PrepareIP -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +from finn.util.fpgadataflow import is_hls_node, is_rtl_node from finn.util.pyverilator import pyverilate_stitched_ip, verilator_fifosim @@ -84,12 +86,13 @@ def optimize_depth(depth): class RemoveShallowFIFOs(Transformation): - """Remove small FIFOs as the streaming components have depth-2 FIFOs on the - input/outputs by default.""" + """Remove zero-depth FIFOs The threshold used to be 2 instead of 0, but + with increasing number of FINN RTL components 2-depth FIFOs are still + important for decoupling..""" # TODO add unit test - def __init__(self, shallow_threshold=2): + def __init__(self, shallow_threshold=0): self.shallow_threshold = shallow_threshold def apply(self, model): @@ -100,7 +103,7 @@ def apply(self, model): else: is_first_node = True if ( - node.op_type == "StreamingFIFO" + node.op_type.startswith("StreamingFIFO") and getCustomOp(node).get_nodeattr("depth") <= self.shallow_threshold and (not is_first_node) ): @@ -164,16 +167,16 @@ def apply(self, model): for node in model.graph.node: # look for following pattern: # ConvolutionInputGenerator -> StreamingFIFO -> MatrixVectorActivation - if node.op_type == "StreamingFIFO": + if node.op_type.startswith("StreamingFIFO"): fifo_prod = model.find_producer(node.input[0]) fifo_cons = model.find_consumer(node.output[0]) if fifo_prod is None: continue - if fifo_prod.op_type != "ConvolutionInputGenerator": + if not fifo_prod.op_type.startswith("ConvolutionInputGenerator"): continue if fifo_cons is None: continue - if fifo_cons.op_type != "MatrixVectorActivation": + if not 
fifo_cons.op_type.startswith("MVAU"): continue op_inst = getCustomOp(node) depth = op_inst.get_nodeattr("depth") @@ -240,7 +243,7 @@ def __init__( clk_ns=10.0, max_qsrl_depth=256, max_depth=None, - swg_exception=True, + swg_exception=False, vivado_ram_style="auto", force_python_sim=False, ): @@ -256,16 +259,17 @@ def __init__( def apply(self, model): # these optypes may potentially use external weights # we'll temporarily change them to use decoupled mode for FIFO sizing - extw_optypes = ["MatrixVectorActivation", "VectorVectorActivation"] + extw_optypes = ["MVAU_hls", "MVAU_rtl", "VVAU_hls", "VVAU_rtl"] # change external to decoupled and warn user # this way we are sure we have exactly one input/output modified_fc_nodes = [] for node in model.graph.node: # verify assumptions - assert is_fpgadataflow_node(node), "Found non-fpgadataflow node: " + str( + assert is_hls_node(node) or is_rtl_node(node), "Found non-fpgadataflow node: " + str( node ) - assert node.op_type != "StreamingFIFO", "Found existing StreamingFIFO node" + op_type = node.op_type + assert not op_type.startswith("StreamingFIFO"), "Found existing StreamingFIFO node" node = getCustomOp(node) ifd = node.get_nodeattr("inFIFODepths") ofd = node.get_nodeattr("outFIFODepths") @@ -281,36 +285,34 @@ def apply(self, model): ofd[o] = np.prod(node.get_folded_output_shape(o)[:-1]) node.set_nodeattr("inFIFODepths", ifd) node.set_nodeattr("outFIFODepths", ofd) - if node.onnx_node.op_type in extw_optypes: mmode = node.get_nodeattr("mem_mode") if mmode == "external": modified_fc_nodes.append(node.onnx_node.name) - node.set_nodeattr("mem_mode", "decoupled") + node.set_nodeattr("mem_mode", "internal_decoupled") reset_implementation(node) warnings.warn( - "Changed mem_mode from external to decoupled for " + "Changed mem_mode from external to internal_decoupled for " + node.onnx_node.name ) # insert stream infrastructure (DWC/FIFO) model = model.transform(InsertDWC()) model = 
model.transform(InsertFIFO(create_shallow_fifos=True)) + model = model.transform(SpecializeLayers(self.fpgapart)) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) # gather FIFO names, check they are of expected depth fifos = {} - fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO") + fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO_rtl") for node in fifo_nodes: fifos[node.name] = 0 node = getCustomOp(node) node.set_nodeattr("depth_monitor", 1) node.set_nodeattr("impl_style", "rtl") # check depths and fix as necessary - if (self.max_depth is not None) and ( - node.get_nodeattr("depth") != self.max_depth - ): + if (self.max_depth is not None) and (node.get_nodeattr("depth") != self.max_depth): node.set_nodeattr("depth", self.max_depth) # insert FIFOs and do all transformations for RTLsim @@ -373,14 +375,12 @@ def apply(self, model): ncycles = ncycles - 1 if not output_detected: - warnings.warn( - "No output detected, calculated FIFO depths may not be correct" - ) + warnings.warn("No output detected, calculated FIFO depths may not be correct") else: # do rtlsim in C++ for FIFO sizing # determine # inputs for FIFO sizing according to topology type swg_nodes = [ - x for x in model.graph.node if "ConvolutionInputGenerator" in x.op_type + x for x in model.graph.node if x.op_type.startswith("ConvolutionInputGenerator") ] if len(swg_nodes) == 0: # MLP, no layer overlap @@ -405,7 +405,7 @@ def apply(self, model): for node in model.graph.node: # set FIFO depth, reset FIFO implementation, # and set implementation/ram styles - if node.op_type == "StreamingFIFO": + if node.op_type.startswith("StreamingFIFO"): assert node.name in fifos, "FIFO node not found in size dictionary" # set depth of FIFO depth = optimize_depth(fifos[node.name]) @@ -443,15 +443,13 @@ def apply(self, model): # handle custom sizing for SWG FIFOs if desired if self.swg_exception: - model = model.transform( - 
CapConvolutionFIFODepths(max_qsrl_depth=self.max_qsrl_depth) - ) + model = model.transform(CapConvolutionFIFODepths(max_qsrl_depth=self.max_qsrl_depth)) # remove shallow FIFOs model = model.transform(RemoveShallowFIFOs()) # reflect final values in attributes for node in model.graph.node: - if node.op_type != "StreamingFIFO": + if not node.op_type.startswith("StreamingFIFO"): node_inst = getCustomOp(node) fifodepth_in = [] for node_inp in node.input: @@ -466,7 +464,7 @@ def apply(self, model): pass else: # there is a producer for this input - if prod.op_type == "StreamingFIFO": + if prod.op_type.startswith("StreamingFIFO"): prod_inst = getCustomOp(prod) fifodepth_in.append(prod_inst.get_nodeattr("depth")) else: @@ -485,7 +483,7 @@ def apply(self, model): pass else: # there is a consumer for this input - if cons.op_type == "StreamingFIFO": + if cons.op_type.startswith("StreamingFIFO"): cons_inst = getCustomOp(cons) fifodepth_out.append(cons_inst.get_nodeattr("depth")) else: @@ -572,14 +570,13 @@ def apply(self, model): graph_modified = False for node in graph.node: node_ind += 1 - if node.op_type == "StreamingFIFO": + if node.op_type == ("StreamingFIFO_rtl"): n_inst = getCustomOp(node) depth = n_inst.get_nodeattr("depth") - cfgs = get_fifo_split_configs( - depth, self.max_qsrl_depth, self.max_vivado_depth - ) + cfgs = get_fifo_split_configs(depth, self.max_qsrl_depth, self.max_vivado_depth) if len(cfgs) > 1: fld_shape = n_inst.get_folded_output_shape() + n_shape = n_inst.get_normal_output_shape() dtype = n_inst.get_nodeattr("dataType") ram_style = n_inst.get_nodeattr("ram_style") shape = model.get_tensor_shape(node.input[0]) @@ -598,13 +595,14 @@ def apply(self, model): graph.value_info.append(out_tensor) model.set_tensor_datatype(out_tensor.name, DataType[dtype]) fifo_node = helper.make_node( - "StreamingFIFO", + "StreamingFIFO_rtl", [inp], [outp], - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.rtl", backend="fpgadataflow", 
depth=fifo_depth, folded_shape=fld_shape, + normal_shape=n_shape, dataType=dtype, impl_style=impl_style, ram_style=ram_style, diff --git a/src/finn/transformation/fpgadataflow/set_folding.py b/src/finn/transformation/fpgadataflow/set_folding.py index 2301fccdd4..eaee499e6a 100644 --- a/src/finn/transformation/fpgadataflow/set_folding.py +++ b/src/finn/transformation/fpgadataflow/set_folding.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -34,7 +35,7 @@ from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles -from finn.util.fpgadataflow import is_fpgadataflow_node +from finn.util.fpgadataflow import is_hls_node, is_rtl_node def divisors(num): @@ -75,12 +76,12 @@ class SetFolding(Transformation): * the producer of the node is expected to be a ConvolutionInputGenerator with depthwise=1, whose SIMD value will be set equal to the PE value of its consumer node - + * the VVAU also supports SIMD ("input window") parallelism next to + PE ("channels"), but current ConvInpGen limitations require PE to be fully + unfolded before SIMD is increased """ - def __init__( - self, target_cycles_per_frame=1000, mvau_wwidth_max=36, two_pass_relaxation=True - ): + def __init__(self, target_cycles_per_frame=1000, mvau_wwidth_max=36, two_pass_relaxation=True): super().__init__() self.target_cycles_per_frame = target_cycles_per_frame self.mvau_wwidth_max = mvau_wwidth_max @@ -99,30 +100,33 @@ def apply(self, model): graph = model.graph # these ops use PE parallelism, up to a max value of NumChannels pe_ops = [ - "AddStreams_Batch", - "ChannelwiseOp_Batch", - "DuplicateStreams_Batch", - "GlobalAccPool_Batch", - "Thresholding_Batch", + "AddStreams_hls", + "ChannelwiseOp_hls", + "DuplicateStreams_hls", + 
"GlobalAccPool_hls", + "Thresholding_hls", + "Thresholding_rtl", ] # these ops use SIMD parallelism, up to a max value of NumChannels - # ConvolutionInputGenerator has a special case when depthwise=1 + # ConvolutionInputGenerator* has a special case when depthwise=1 + # ConvolutionInputGenerator_rtl supports additional parallelism by + # setting parallel_window=1 mode after maxing out SIMD simd_ops = [ - "DownSampler", - "FMPadding_Batch", - "ConvolutionInputGenerator", - "ConvolutionInputGenerator1D", + "DownSampler_hls", + "FMPadding_hls", + "FMPadding_Pixel_hls", + "ConvolutionInputGenerator_hls", "ConvolutionInputGenerator_rtl", ] # these ops are preceded by depthwise SWG and have special behavior, # as explained in the SetFolding docstring - depthwise_op_exceptions = ["VectorVectorActivation", "Pool_Batch"] + depthwise_op_exceptions = ["VVAU_hls", "VVAU_rtl", "Pool_hls"] for node in graph.node: - if not is_fpgadataflow_node(node): + if not (is_hls_node(node) or is_rtl_node(node)): continue op_type = node.op_type node_inst = getCustomOp(node) - if op_type == "MatrixVectorActivation": + if op_type in ["MVAU_hls", "MVAU_rtl"]: max_simd = node_inst.get_nodeattr("MW") max_pe = node_inst.get_nodeattr("MH") node_inst.set_nodeattr("PE", 1) @@ -138,8 +142,7 @@ def apply(self, model): # finish if target met break if ( - node_inst.get_weight_datatype().bitwidth() - * node_inst.get_nodeattr("SIMD") + node_inst.get_weight_datatype().bitwidth() * node_inst.get_nodeattr("SIMD") > self.mvau_wwidth_max ): # revert if we've gone above width threshold @@ -150,36 +153,64 @@ def apply(self, model): elif op_type in pe_ops: max_pe = node_inst.get_nodeattr("NumChannels") self.optimize_attribute_val(node_inst, max_pe, "PE") - elif op_type == "LabelSelect_Batch": + elif op_type == "LabelSelect_hls": max_pe = node_inst.get_nodeattr("Labels") self.optimize_attribute_val(node_inst, max_pe, "PE") elif op_type in depthwise_op_exceptions: + # init/reset SIMD of VVAU + if op_type in 
["VVAU_hls", "VVAU_rtl"]: + node_inst.set_nodeattr("SIMD", 1) max_pe = node_inst.get_nodeattr("Channels") self.optimize_attribute_val(node_inst, max_pe, "PE") + # increase SIMD for VVAU once PE is exhausted + pe = node_inst.get_nodeattr("PE") + cyc = node_inst.get_exp_cycles() + if ( + op_type in ["VVAU_hls", "VVAU_rtl"] + and pe == max_pe + and cyc > self.target_cycles_per_frame + ): + max_simd = np.prod(node_inst.get_nodeattr("Kernel")) + self.optimize_attribute_val(node_inst, max_simd, "SIMD") # also set the folding of the upsteam DW SWU # which must be identical to this node swu_node = model.find_producer(node.input[0]) if swu_node.op_type.startswith("ConvolutionInputGenerator"): swu_node_inst = getCustomOp(swu_node) - pe = node_inst.get_nodeattr("PE") swu_node_inst.set_nodeattr("SIMD", pe) + # enable parallel_window mode of RTL SWG if needed + if swu_node.op_type == "ConvolutionInputGenerator_rtl": + if op_type.startswith("VVAU") and node_inst.get_nodeattr("SIMD") > 1: + swu_node_inst.set_nodeattr("parallel_window", 1) + else: + swu_node_inst.set_nodeattr("parallel_window", 0) else: - if op_type == "VectorVectorActivation": + if op_type in ["VVAU_hls", "VVAU_rtl"]: ksize = np.prod(node_inst.get_nodeattr("Kernel")) - elif op_type == "Pool_Batch": + elif op_type == "Pool_hls": ksize = node_inst.get_nodeattr("KernelSize") else: raise Exception("Undefined edge case for %s" % op_type) if ksize != 1: # pointwise vvau/pool lack a SWU - raise Exception( - "Expected SWU on DW op input, found " + swu_node.op_type - ) + raise Exception("Expected SWU on DW op input, found " + swu_node.op_type) elif op_type in simd_ops: if op_type.startswith("ConvolutionInputGenerator"): depthwise = node_inst.get_nodeattr("depthwise") if depthwise == 0: max_simd = node_inst.get_nodeattr("IFMChannels") + # init/reset parallel_window mode of RTL SWG + if op_type == "ConvolutionInputGenerator_rtl": + node_inst.set_nodeattr("parallel_window", 0) self.optimize_attribute_val(node_inst, max_simd, 
"SIMD") + # enable parallel_window mode of RTL SWG if needed + simd = node_inst.get_nodeattr("SIMD") + cyc = node_inst.get_exp_cycles() + if ( + op_type == "ConvolutionInputGenerator_rtl" + and simd == max_simd + and cyc > self.target_cycles_per_frame + ): + node_inst.set_nodeattr("parallel_window", 1) else: # depthwise SWGs are handled separately continue @@ -187,9 +218,7 @@ def apply(self, model): max_simd = node_inst.get_nodeattr("NumChannels") self.optimize_attribute_val(node_inst, max_simd, "SIMD") else: - warnings.warn( - "SetFolding doesn't know how to handle op_type " + op_type - ) + warnings.warn("SetFolding doesn't know how to handle op_type " + op_type) model = model.transform(GiveUniqueNodeNames()) model = model.transform(AnnotateCycles()) diff --git a/src/finn/transformation/fpgadataflow/specialize_layers.py b/src/finn/transformation/fpgadataflow/specialize_layers.py new file mode 100644 index 0000000000..e71d6c23a4 --- /dev/null +++ b/src/finn/transformation/fpgadataflow/specialize_layers.py @@ -0,0 +1,323 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import warnings +from onnx import helper +from qonnx.core.datatype import DataType +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.base import Transformation + +from finn.custom_op.fpgadataflow.hls import custom_op as hls_variants +from finn.custom_op.fpgadataflow.rtl import custom_op as rtl_variants +from finn.util.fpgadataflow import is_versal + + +def _determine_impl_style(node, fpgapart): + optype = node.op_type + + # check if there is an HLS or RTL variant or both + hls_variant = optype + "_hls" in hls_variants.keys() + rtl_variant = optype + "_rtl" in rtl_variants.keys() + + # check if user has specified a preferred_impl_style + inst = getCustomOp(node) + impl_style = inst.get_nodeattr("preferred_impl_style") + + # if impl_style not set, for "simple" layers always try + # to use rtl variant if available + if impl_style == "": + if optype == "StreamingDataWidthConverter": + return _dwc_determine_impl_style(node) + if rtl_variant: + if optype == "MVAU": + inp_width_fit = ( + DataType[getCustomOp(node).get_nodeattr("inputDataType")].bitwidth() >= 4 + ) + weight_width_fit = ( + DataType[getCustomOp(node).get_nodeattr("weightDataType")].bitwidth() >= 4 + ) + 
if inp_width_fit and weight_width_fit and _mvu_rtl_possible(node): + return "rtl" + else: + return "hls" + elif optype == "VVAU": + inp_width_fit = ( + DataType[getCustomOp(node).get_nodeattr("inputDataType")].bitwidth() >= 4 + ) + weight_width_fit = ( + DataType[getCustomOp(node).get_nodeattr("weightDataType")].bitwidth() >= 4 + ) + if inp_width_fit and weight_width_fit and _vvu_rtl_possible(node, fpgapart): + return "rtl" + else: + return "hls" + return "rtl" + # but if no rtl variant, set impl_style to hls + elif hls_variant: + return "hls" + # if there is neither an rtl nor hls variant + # throw error + else: + raise Exception( + """Node {} with optype {} has no hw implementation variant)""".format( + node.name, optype + ) + ) + + # check if user setting can be fulfilled + # otherwise change impl_style + elif impl_style == "hls": + if optype == "ConvolutionInputGenerator": + if not _swg_hls_possible(node): + warn_str = ( + """Settings are not supported in HLS. Node %s will automatically be + set to RTL variant.""" + % node.name + ) + warnings.warn(warn_str) + return "rtl" + else: + return "hls" + + if hls_variant: + return "hls" + elif rtl_variant: + warn_str = """There is no HLS variant of %s. Node %s will automatically be + set to RTL variant.""" % ( + node.op_type, + node.name, + ) + warnings.warn(warn_str) + return "rtl" + else: + raise Exception( + """Node {} with optype {} has no hw implementation variant)""".format( + node.name, optype + ) + ) + elif impl_style == "rtl": + # rtl dwc does not support every inWidth to outWidth ratio + if optype == "StreamingDataWidthConverter": + if _dwc_determine_impl_style(node) != "rtl": + warn_str = """RTL implementation of DWC requires + stream widths that are integer width ratios + from each other. 
Node %s will automatically be + set to HLS variant.""" % ( + node.name, + ) + warnings.warn(warn_str) + return "hls" + else: + # user setting can be fulfilled + return "rtl" + elif optype == "MVAU": + if _mvu_rtl_possible(node): + return "rtl" + else: + warn_str = """There is no RTL variant for %s. The node will automatically be + set to HLS variant. Please check the bit-widths to be <= 8 and ensure the + thresholds are implemented as standalone layer""" % ( + node.name, + ) + warnings.warn(warn_str) + return "hls" + elif optype == "VVAU": + if _vvu_rtl_possible(node, fpgapart): + return "rtl" + else: + warn_str = """There is no RTL variant for %s. The node will automatically be + set to HLS variant. Please check the bit-widths to be <= 8 and ensure the + thresholds are implemented as standalone layer. Note that the RTL-variant + of this layer is only supported on Versal boards""" % ( + node.name, + ) + warnings.warn(warn_str) + return "hls" + + if rtl_variant: + return "rtl" + elif hls_variant: + warn_str = """There is no RTL variant of %s. Node %s will automatically be + set to HLS variant.""" % ( + node.op_type, + node.name, + ) + warnings.warn(warn_str) + return "hls" + else: + raise Exception( + """Node {} with optype {} has no hw implementation variant)""".format( + node.name, optype + ) + ) + else: + raise Exception( + """Invalid value for attribute preferred_impl_style! 
Is currently set to: {} + has to be set to one of the following value ("hls", "rtl")""".format( + impl_style + ) + ) + + +def _dwc_determine_impl_style(node): + # when possible use rtl variant + dwc = getCustomOp(node) + dwc_in_width = dwc.get_nodeattr("inWidth") + dwc_out_width = dwc.get_nodeattr("outWidth") + # check if rtl variant can be used + iwidth_d = dwc_in_width % dwc_out_width == 0 + owidth_d = dwc_out_width % dwc_in_width == 0 + if iwidth_d or owidth_d: + return "rtl" + else: + return "hls" + + +def _swg_hls_possible(node): + # there are some constraints to + # the HLS variant of the SWG + # first constraint to check is + # if user has set dynamic_mode to 1 + # this is only supported in rtl variant + swg = getCustomOp(node) + if swg.get_nodeattr("dynamic_mode"): + return False + # the 2D HLS implementation for SWG + # can only be used for square inputs + # and no dilation + if swg.get_nodeattr("is1D"): + return True + else: + # extract all attributes to check + k = swg.get_nodeattr("ConvKernelDim") + ifm_dim = swg.get_nodeattr("IFMDim") + ofm_dim = swg.get_nodeattr("OFMDim") + s = swg.get_nodeattr("Stride") + d = swg.get_nodeattr("Dilation") + # check if square and dilation=1 + if ( + k[0] == k[1] + and ifm_dim[0] == ifm_dim[1] + and ofm_dim[0] == ofm_dim[1] + and s[0] == s[1] + and d[0] == d[1] == 1 + ): + return True + else: + return False + + +def _mvu_rtl_possible(n): + # Checks whether RTL-based MVU is supported + # Currently, for DSP48 we only support computations up to + # 8sx8u (8-bit signed weights x 8-bit (un)signed activations) + # and for DSP58 we support up to 8sx9s. Next to that, + # embedded thresholding functionality is not supported and + # neither binaryxnormode computation. 
+ inp_width_in_range = ( + DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() <= 8 + ) or ( + DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() == 9 + and DataType[getCustomOp(n).get_nodeattr("inputDataType")].min() < 0 + ) + weight_width_in_range = DataType[getCustomOp(n).get_nodeattr("weightDataType")].bitwidth() <= 8 + signed_weights = DataType[getCustomOp(n).get_nodeattr("weightDataType")].min() < 0 + no_activation = getCustomOp(n).get_nodeattr("noActivation") == 1 + not_binaryxnor_mode = getCustomOp(n).get_nodeattr("binaryXnorMode") == 0 + + return ( + inp_width_in_range + and weight_width_in_range + and signed_weights + and no_activation + and not_binaryxnor_mode + ) + + +def _vvu_rtl_possible(n, fpgapart): + # Checks whether RTL-based VVU is supported + # Currently, we only support RTL-VVU on DSP58 up to 8sx9s inputs + # (8-bit signed weights x (9-bit signed OR 8-bit (un)signed) activations). + # Next to that, embedded thresholding functionality is not supported. 
+ in_width_in_range = ( + DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() <= 8 + ) or ( + DataType[getCustomOp(n).get_nodeattr("inputDataType")].bitwidth() == 9 + and DataType[getCustomOp(n).get_nodeattr("inputDataType")].min() < 0 + ) + weight_width_in_range = DataType[getCustomOp(n).get_nodeattr("weightDataType")].bitwidth() <= 8 + signed_weights = DataType[getCustomOp(n).get_nodeattr("weightDataType")].min() < 0 + is_versal_family = is_versal(fpgapart) + no_activation = getCustomOp(n).get_nodeattr("noActivation") == 1 + + return ( + in_width_in_range + and weight_width_in_range + and signed_weights + and is_versal_family + and no_activation + ) + + +class SpecializeLayers(Transformation): + """Specialize all layers to either HLS or RTL variants""" + + def __init__(self, fpgapart=""): + super().__init__() + self.fpgapart = fpgapart + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for node in graph.node: + # Skip nodes that are not hw layers + if not node.domain == "finn.custom_op.fpgadataflow": + continue + node_ind += 1 + impl_style = _determine_impl_style(node, self.fpgapart) + optype = node.op_type + "_" + impl_style + + new_node = helper.make_node( + optype, + node.input, + node.output, + domain="finn.custom_op.fpgadataflow." 
+ impl_style, + ) + # add all attributes + for attribute in node.attribute: + if attribute.name != "preferred_impl_style": + new_node.attribute.append(attribute) + if new_node.op_type == "MVAU_rtl": + is_versal_family = is_versal(self.fpgapart) + getCustomOp(new_node).set_nodeattr("is_versal", is_versal_family) + graph.node.insert(node_ind, new_node) + # remove old nodes + graph.node.remove(node) + graph_modified = True + return (model, graph_modified) diff --git a/src/finn/transformation/fpgadataflow/template_driver.py b/src/finn/transformation/fpgadataflow/template_driver.py index 05ee6ad920..a65e060ed9 100644 --- a/src/finn/transformation/fpgadataflow/template_driver.py +++ b/src/finn/transformation/fpgadataflow/template_driver.py @@ -62,6 +62,7 @@ import os from qonnx.core.datatype import DataType from driver_base import FINNExampleOverlay +from pynq.pl_server.device import Device # dictionary describing the I/O of the FINN-generated accelerator io_shape_dict = { @@ -90,6 +91,7 @@ parser.add_argument('--exec_mode', help='Please select functional verification ("execute") or throughput test ("throughput_test")', default="execute") parser.add_argument('--platform', help='Target platform: zynq-iodma alveo', default="$PLATFORM$") parser.add_argument('--batchsize', help='number of samples for inference', type=int, default=1) + parser.add_argument('--device', help='FPGA device to be used', type=int, default=0) parser.add_argument('--bitfile', help='name of bitfile (i.e. "resizer.bit")', default="resizer.bit") parser.add_argument('--inputfile', help='name(s) of input npy file(s) (i.e. "input.npy")', nargs="*", type=str, default=["input.npy"]) parser.add_argument('--outputfile', help='name(s) of output npy file(s) (i.e. 
"output.npy")', nargs="*", type=str, default=["output.npy"]) @@ -103,12 +105,14 @@ inputfile = args.inputfile outputfile = args.outputfile runtime_weight_dir = args.runtime_weight_dir + devID = args.device + device = Device.devices[devID] # instantiate FINN accelerator driver and pass batchsize and bitfile accel = FINNExampleOverlay( bitfile_name = bitfile, platform = platform, io_shape_dict = io_shape_dict, batch_size = batch_size, - runtime_weight_dir = runtime_weight_dir + runtime_weight_dir = runtime_weight_dir, device=device ) # for the remote execution the data from the input npy file has to be loaded, @@ -135,5 +139,5 @@ file.close() print("Results written to nw_metrics.txt") else: - raise Exception("Exec mode has to be set to remote_pynq or throughput_test") + raise Exception("Exec mode has to be set to execute or throughput_test") """ diff --git a/src/finn/transformation/fpgadataflow/templates.py b/src/finn/transformation/fpgadataflow/templates.py index f52bad0ffb..ccf4e7a943 100644 --- a/src/finn/transformation/fpgadataflow/templates.py +++ b/src/finn/transformation/fpgadataflow/templates.py @@ -117,9 +117,15 @@ } elseif {$BOARD == "RFSoC2x2"} { set_property board_part xilinx.com:rfsoc2x2:part0:1.1 [current_project] set ZYNQ_TYPE "zynq_us+" +} elseif {$BOARD == "RFSoC4x2"} { + set_property board_part realdigital.org:rfsoc4x2:part0:1.0 [current_project] + set ZYNQ_TYPE "zynq_us+" } elseif {$BOARD == "Ultra96"} { set_property board_part avnet.com:ultra96v1:part0:1.2 [current_project] set ZYNQ_TYPE "zynq_us+" +} elseif {$BOARD == "Ultra96-V2"} { + set_property board_part avnet.com:ultra96v2:part0:1.2 [current_project] + set ZYNQ_TYPE "zynq_us+" } elseif {$BOARD == "Pynq-Z2"} { set ZYNQ_TYPE "zynq_7000" set_property board_part tul.com.tw:pynq-z2:part0:1.0 [current_project] @@ -135,7 +141,8 @@ create_bd_design "top" if {$ZYNQ_TYPE == "zynq_us+"} { - create_bd_cell -type ip -vlnv xilinx.com:ip:zynq_ultra_ps_e:3.4 zynq_ps + set zynq_ps_vlnv [get_property VLNV 
[get_ipdefs "xilinx.com:ip:zynq_ultra_ps_e:*"]] + create_bd_cell -type ip -vlnv $zynq_ps_vlnv zynq_ps apply_bd_automation -rule xilinx.com:bd_rule:zynq_ultra_ps_e -config {apply_board_preset "1" } [get_bd_cells zynq_ps] #activate one slave port, deactivate the second master port set_property -dict [list CONFIG.PSU__USE__S_AXI_GP2 {1}] [get_bd_cells zynq_ps] @@ -144,7 +151,8 @@ set_property -dict [list CONFIG.PSU__OVERRIDE__BASIC_CLOCK {0}] [get_bd_cells zynq_ps] set_property -dict [list CONFIG.PSU__CRL_APB__PL0_REF_CTRL__FREQMHZ [expr int($FREQ_MHZ)]] [get_bd_cells zynq_ps] } elseif {$ZYNQ_TYPE == "zynq_7000"} { - create_bd_cell -type ip -vlnv xilinx.com:ip:processing_system7:5.5 zynq_ps + set zynq_ps_vlnv [get_property VLNV [get_ipdefs "xilinx.com:ip:processing_system7:*"]] + create_bd_cell -type ip -vlnv $zynq_ps_vlnv zynq_ps apply_bd_automation -rule xilinx.com:bd_rule:processing_system7 -config {make_external "FIXED_IO, DDR" apply_board_preset "1" Master "Disable" Slave "Disable" } [get_bd_cells zynq_ps] set_property -dict [list CONFIG.PCW_USE_S_AXI_HP0 {1}] [get_bd_cells zynq_ps] set_property -dict [list CONFIG.PCW_FPGA0_PERIPHERAL_FREQMHZ [expr int($FREQ_MHZ)]] [get_bd_cells zynq_ps] @@ -153,8 +161,10 @@ } #instantiate axi interconnect, axi smartconnect -create_bd_cell -type ip -vlnv xilinx.com:ip:axi_interconnect:2.1 axi_interconnect_0 -create_bd_cell -type ip -vlnv xilinx.com:ip:smartconnect:1.0 smartconnect_0 +set interconnect_vlnv [get_property VLNV [get_ipdefs -all "xilinx.com:ip:axi_interconnect:*" -filter design_tool_contexts=~*IPI*]] +set smartconnect_vlnv [get_property VLNV [get_ipdefs "xilinx.com:ip:smartconnect:*"]] +create_bd_cell -type ip -vlnv $interconnect_vlnv axi_interconnect_0 +create_bd_cell -type ip -vlnv $smartconnect_vlnv smartconnect_0 #set number of axilite interfaces, and number of axi master interfaces set_property -dict [list CONFIG.NUM_SI $NUM_AXIMM] [get_bd_cells smartconnect_0] set_property -dict [list CONFIG.NUM_MI $NUM_AXILITE] 
[get_bd_cells axi_interconnect_0] @@ -242,22 +252,6 @@ close_project """ -alveo_run_sh_template = """#!/bin/bash - -if [ "$#" -ne 2 ]; then - echo "Usage: alveo_run.sh " - exit -1 -fi - -cd $REMOTE_DEPLOY_DIR$ -eval "$(conda shell.bash hook)" -conda activate $CONDA_ENV_NAME$ -source $REMOTE_XRT$/setup.sh -export PLATFORM_REPO_PATHS=$REMOTE_PLATFORM_REPO_PATHS$ -python3.6 driver.py --exec_mode=$1 --batchsize=$2 --bitfile=$BITFILE$ \ - --inputfile=input.npy --outputfile=output.npy --platform=alveo -""" - vitis_gen_xml_report_tcl_template = """ open_project $VITIS_PROJ_PATH$/_x/link/vivado/vpl/prj/prj.xpr open_run impl_1 diff --git a/src/finn/transformation/fpgadataflow/vitis_build.py b/src/finn/transformation/fpgadataflow/vitis_build.py index e0a5666000..da7624b8ff 100644 --- a/src/finn/transformation/fpgadataflow/vitis_build.py +++ b/src/finn/transformation/fpgadataflow/vitis_build.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -49,6 +50,7 @@ from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.insert_iodma import InsertIODMA from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.util.basic import make_build_dir from . 
import templates @@ -56,9 +58,7 @@ def _check_vitis_envvars(): assert "VITIS_PATH" in os.environ, "VITIS_PATH must be set for Vitis" - assert ( - "PLATFORM_REPO_PATHS" in os.environ - ), "PLATFORM_REPO_PATHS must be set for Vitis" + assert "PLATFORM_REPO_PATHS" in os.environ, "PLATFORM_REPO_PATHS must be set for Vitis" assert ( "XILINX_XRT" in os.environ ), "XILINX_XRT must be set for Vitis, ensure the XRT env is sourced" @@ -97,9 +97,7 @@ def apply(self, model): # NOTE: this assumes the graph is Vitis-compatible: max one axi lite interface # developed from instructions in UG1393 (v2019.2) and package_xo documentation # package_xo is responsible for generating the kernel xml - assert ( - len(interfaces["axilite"]) <= 1 - ), "CreateVitisXO supports max 1 AXI lite interface" + assert len(interfaces["axilite"]) <= 1, "CreateVitisXO supports max 1 AXI lite interface" axilite_intf_name = None if len(interfaces["axilite"]) == 1: axilite_intf_name = interfaces["axilite"][0] @@ -114,14 +112,12 @@ def apply(self, model): ) arg_id += 1 args_string.append( - "{numReps:0:%s:%s:0x4:0x1C:uint:0}" - % (str(arg_id), axilite_intf_name) + "{numReps:0:%s:%s:0x4:0x1C:uint:0}" % (str(arg_id), axilite_intf_name) ) arg_id += 1 else: args_string.append( - "{numReps:0:%s:%s:0x4:0x10:uint:0}" - % (str(arg_id), axilite_intf_name) + "{numReps:0:%s:%s:0x4:0x10:uint:0}" % (str(arg_id), axilite_intf_name) ) arg_id += 1 for intf in interfaces["s_axis"] + interfaces["m_axis"]: @@ -139,9 +135,10 @@ def apply(self, model): model.set_metadata_prop("vitis_xo", xo_path) # generate the package_xo command in a tcl script - package_xo_string = ( - "package_xo -force -xo_path %s -kernel_name %s -ip_directory %s" - % (xo_path, self.ip_name, stitched_ip_dir) + package_xo_string = "package_xo -force -xo_path %s -kernel_name %s -ip_directory %s" % ( + xo_path, + self.ip_name, + stitched_ip_dir, ) for arg in args_string: package_xo_string += " -kernel_xml_args " + arg @@ -236,7 +233,7 @@ def apply(self, model): 
node_mem_port = sdp_node.get_nodeattr("mem_port") if node_mem_port == "": # configure good defaults based on board - if "u50" in self.platform or "u280" in self.platform: + if "u50" in self.platform or "u280" in self.platform or "u55c" in self.platform: # Use HBM where available (also U50 does not have DDR) mem_type = "HBM" mem_idx = 0 @@ -255,9 +252,7 @@ def apply(self, model): mem_type = "DDR" mem_idx = 1 node_mem_port = "%s[%d]" % (mem_type, mem_idx) - config.append( - "sp=%s.m_axi_gmem0:%s" % (instance_names[node.name], node_mem_port) - ) + config.append("sp=%s.m_axi_gmem0:%s" % (instance_names[node.name], node_mem_port)) # connect streams if producer is not None: for i in range(len(node.input)): @@ -281,14 +276,10 @@ def apply(self, model): # add Vivado physopt directives if desired if self.strategy == VitisOptStrategy.PERFORMANCE_BEST: config.append("[vivado]") - config.append( - "prop=run.impl_1.STEPS.OPT_DESIGN.ARGS.DIRECTIVE=ExploreWithRemap" - ) + config.append("prop=run.impl_1.STEPS.OPT_DESIGN.ARGS.DIRECTIVE=ExploreWithRemap") config.append("prop=run.impl_1.STEPS.PLACE_DESIGN.ARGS.DIRECTIVE=Explore") config.append("prop=run.impl_1.STEPS.PHYS_OPT_DESIGN.IS_ENABLED=true") - config.append( - "prop=run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=Explore" - ) + config.append("prop=run.impl_1.STEPS.PHYS_OPT_DESIGN.ARGS.DIRECTIVE=Explore") config.append("prop=run.impl_1.STEPS.ROUTE_DESIGN.ARGS.DIRECTIVE=Explore") config = "\n".join(config) + "\n" @@ -341,9 +332,7 @@ def apply(self, model): with open(gen_rep_xml_sh, "w") as f: f.write("#!/bin/bash \n") f.write("cd {}\n".format(link_dir)) - f.write( - "vivado -mode batch -source %s\n" % (link_dir + "/gen_report_xml.tcl") - ) + f.write("vivado -mode batch -source %s\n" % (link_dir + "/gen_report_xml.tcl")) f.write("cd {}\n".format(working_dir)) bash_command = ["bash", gen_rep_xml_sh] process_genxml = subprocess.Popen(bash_command, stdout=subprocess.PIPE) @@ -394,7 +383,7 @@ def __init__( def apply(self, model): 
_check_vitis_envvars() # prepare at global level, then break up into kernels - prep_transforms = [InsertIODMA(512), InsertDWC()] + prep_transforms = [InsertIODMA(512), InsertDWC(), SpecializeLayers()] for trn in prep_transforms: model = model.transform(trn) model = model.transform(GiveUniqueNodeNames()) @@ -416,21 +405,16 @@ def apply(self, model): dataflow_model_filename = sdp_node.get_nodeattr("model") kernel_model = ModelWrapper(dataflow_model_filename) kernel_model = kernel_model.transform(InsertFIFO()) + kernel_model = kernel_model.transform(SpecializeLayers()) kernel_model = kernel_model.transform(RemoveUnusedTensors()) kernel_model = kernel_model.transform(GiveUniqueNodeNames(prefix)) kernel_model.save(dataflow_model_filename) - kernel_model = kernel_model.transform( - PrepareIP(self.fpga_part, self.period_ns) - ) + kernel_model = kernel_model.transform(PrepareIP(self.fpga_part, self.period_ns)) kernel_model = kernel_model.transform(HLSSynthIP()) kernel_model = kernel_model.transform( - CreateStitchedIP( - self.fpga_part, self.period_ns, sdp_node.onnx_node.name, True - ) - ) - kernel_model = kernel_model.transform( - CreateVitisXO(sdp_node.onnx_node.name) + CreateStitchedIP(self.fpga_part, self.period_ns, sdp_node.onnx_node.name, True) ) + kernel_model = kernel_model.transform(CreateVitisXO(sdp_node.onnx_node.name)) kernel_model.set_metadata_prop("platform", "alveo") kernel_model.save(dataflow_model_filename) # Assemble design from kernels diff --git a/src/finn/transformation/move_reshape.py b/src/finn/transformation/move_reshape.py index cec04a182b..2e6639c5c6 100644 --- a/src/finn/transformation/move_reshape.py +++ b/src/finn/transformation/move_reshape.py @@ -1,22 +1,9 @@ import warnings from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.base import Transformation -from qonnx.util.basic import get_by_name, is_finn_op +from qonnx.util.basic import get_by_name - -def _is_fpgadataflow_node(node): - if node is not None: - if 
is_finn_op(node.domain): - n_backend = get_by_name(node.attribute, "backend") - if n_backend is None: - return False - backend_value = n_backend.s.decode("UTF-8") - if backend_value == "fpgadataflow": - return True - else: - return False - else: - return False +from finn.util.fpgadataflow import is_fpgadataflow_node class RemoveCNVtoFCFlatten(Transformation): @@ -34,10 +21,10 @@ def apply(self, model): oshape = model.get_tensor_shape(n.output[0]) if len(oshape) == 2 and ishape[0] == oshape[0]: producer = model.find_producer(n.input[0]) - if _is_fpgadataflow_node(producer) is True: + if is_fpgadataflow_node(producer): # standalone flatten, remove consumer = model.find_consumer(n.output[0]) - if _is_fpgadataflow_node(consumer) is True: + if is_fpgadataflow_node(consumer): graph_modified = True consumer.input[0] = n.input[0] graph.node.remove(n) @@ -48,15 +35,13 @@ def apply(self, model): perms = list(get_by_name(transp_node.attribute, "perm").ints) if perms == [0, 3, 1, 2]: producer = model.find_producer(transp_node.input[0]) - if _is_fpgadataflow_node(producer) is True: + if is_fpgadataflow_node(producer): consumer = model.find_consumer(n.output[0]) - if consumer.op_type == "MatrixVectorActivation": + if consumer.op_type.startswith("MVAU"): fc_inst = getCustomOp(consumer) mw = fc_inst.get_nodeattr("MW") mh = fc_inst.get_nodeattr("MH") - (b, h, w, c) = model.get_tensor_shape( - transp_node.input[0] - ) + (b, h, w, c) = model.get_tensor_shape(transp_node.input[0]) # absorb transpose into weight matrix, # allowing FC layer to operate on the NHWC input W = model.get_initializer(consumer.input[1]) @@ -78,8 +63,6 @@ def apply(self, model): into subsequent node" ) else: - warnings.warn( - "Unsupported transpose node before flatten layer" - ) + warnings.warn("Unsupported transpose node before flatten layer") return (model, graph_modified) diff --git a/src/finn/transformation/qonnx/convert_qonnx_to_finn.py b/src/finn/transformation/qonnx/convert_qonnx_to_finn.py index 
34f11d1e95..c921b3d472 100644 --- a/src/finn/transformation/qonnx/convert_qonnx_to_finn.py +++ b/src/finn/transformation/qonnx/convert_qonnx_to_finn.py @@ -66,9 +66,7 @@ class ConvertQONNXtoFINN(Transformation): def __init__( self, - filter_function=default_filter_function_generator( - max_multithreshold_bit_width=8 - ), + filter_function=default_filter_function_generator(max_multithreshold_bit_width=8), ): super().__init__() self._filter_function = filter_function diff --git a/src/finn/transformation/qonnx/fold_quant_weights.py b/src/finn/transformation/qonnx/fold_quant_weights.py index e8339ae244..0f6cbacb82 100644 --- a/src/finn/transformation/qonnx/fold_quant_weights.py +++ b/src/finn/transformation/qonnx/fold_quant_weights.py @@ -57,13 +57,9 @@ def apply(self, model): is_const_shape = (n.op_type == "Shape") and (ishape is not None) if is_all_constant_inputs or is_const_shape: # Check node validity - if ( - n.op_type == "Quant" - and not model.get_initializer(n.input[2]) == 0 - ): + if n.op_type == "Quant" and not model.get_initializer(n.input[2]) == 0: raise ValueError( - "Only Quant nodes with zero-point == 0 " - "are currently supported." + "Only Quant nodes with zero-point == 0 " "are currently supported." ) if model.is_fork_node(n): raise ValueError( @@ -73,8 +69,7 @@ def apply(self, model): target_node = model.find_direct_successors(n) if target_node is None: raise RuntimeError( - "Weights quantized with the Quant node must have " - "a successor node." + "Weights quantized with the Quant node must have " "a successor node." 
) else: target_node = target_node[0] @@ -102,7 +97,14 @@ def apply(self, model): model.set_initializer(node_out, q_node_output) else: # Check next operator type - mul_like_nodes = ["Mul", "Div", "Conv", "MatMul", "Gather"] + mul_like_nodes = [ + "Mul", + "Div", + "Conv", + "MatMul", + "Gather", + "ConvTranspose", + ] add_like_nodes = ["Add", "Sub"] all_supported_ops = mul_like_nodes.copy() all_supported_ops.extend(add_like_nodes) @@ -126,9 +128,7 @@ def apply(self, model): model.set_tensor_datatype(node_out, new_dtype) # Reshape scale for Conv if required - target_output_shape = model.get_tensor_shape( - target_node.output[0] - ) + target_output_shape = model.get_tensor_shape(target_node.output[0]) if target_node.op_type == "Conv" and len(scale.shape) > 0: conv_out_shape = [1] * len(target_output_shape) # only support per-output channel scaling @@ -160,9 +160,7 @@ def apply(self, model): "Can only constant fold scaled Quant weights " "if a successor exists." ) - assert ( - len(successor) == 1 - ), "Only implemented for a single consumer" + assert len(successor) == 1, "Only implemented for a single consumer" successor = successor[0] succ_output_name = successor.output[0] diff --git a/src/finn/transformation/qonnx/infer_quant_avg_pool_2d.py b/src/finn/transformation/qonnx/infer_quant_avg_pool_2d.py index 5a3f176f1f..52eb55355a 100644 --- a/src/finn/transformation/qonnx/infer_quant_avg_pool_2d.py +++ b/src/finn/transformation/qonnx/infer_quant_avg_pool_2d.py @@ -46,7 +46,7 @@ def _get_signed_from_upstream(model, trunc_node): # Check if the input of this node already has a FINN datatype signed = None inp_dt = model.get_tensor_datatype(node.input[0]) - if inp_dt is not None and inp_dt is not DataType["FLOAT32"]: + if inp_dt is not None and inp_dt != DataType["FLOAT32"]: signed = inp_dt.signed() # Go further up the graph, since the datatype inference works top down # these nodes should either be sign preserving ops or they already have a @@ -67,23 +67,23 @@ def 
_get_signed_from_upstream(model, trunc_node): ) next_node = next_node[0] out_dt = model.get_tensor_datatype(next_node.output[0]) - if out_dt is not None and out_dt is not DataType["FLOAT32"]: + if out_dt is not None and out_dt != DataType["FLOAT32"]: signed = out_dt.signed() break # Special cases where the node has an internal or intrinsic datatype. if next_node.op_type == "MultiThreshold": - mt_inst = getCustomOp(next_node) + mt_inst = getCustomOp(next_node, onnx_opset_version=9) out_dt = DataType[mt_inst.get_nodeattr("out_dtype")] - if out_dt is not None and out_dt is not DataType["FLOAT32"]: + if out_dt is not None and out_dt != DataType["FLOAT32"]: signed = out_dt.signed() break if next_node.op_type == "BipolarQuant": signed = True break if next_node.op_type == "Quant": - q_inst = getCustomOp(next_node) + q_inst = getCustomOp(next_node, onnx_opset_version=9) out_dt = q_inst.get_integer_datatype(model) - if out_dt is not None and out_dt is not DataType["FLOAT32"]: + if out_dt is not None and out_dt != DataType["FLOAT32"]: signed = out_dt.signed() break @@ -124,18 +124,10 @@ def apply(self, model): node_ind += 1 if n.op_type == "AveragePool": mul_node = model.find_direct_successors(n) - if ( - mul_node is not None - and len(mul_node) == 1 - and mul_node[0].op_type == "Mul" - ): + if mul_node is not None and len(mul_node) == 1 and mul_node[0].op_type == "Mul": mul_node = mul_node[0] t_node = model.find_direct_successors(mul_node) - if ( - t_node is not None - and len(t_node) == 1 - and t_node[0].op_type == "Trunc" - ): + if t_node is not None and len(t_node) == 1 and t_node[0].op_type == "Trunc": t_node = t_node[0] running_node_index = node_ind # Check node for compatibility @@ -143,27 +135,16 @@ def apply(self, model): k_s = get_by_name(n.attribute, "kernel_shape") if k_s is None or len(k_s.ints) != 2 or len(set(k_s.ints)) != 1: raise ValueError( - "FINN only supports average pooling with " - "2D square kernels." 
+ "FINN only supports average pooling with " "2D square kernels." ) k_s = k_s.ints[0] pads = get_by_name(n.attribute, "pads") - if ( - pads is None - or len(set(pads.ints)) != 1 - or pads.ints[0] != 0 - ): - raise ValueError( - "FINN dosn't support padding for average pooling." - ) + if pads is None or len(set(pads.ints)) != 1 or pads.ints[0] != 0: + raise ValueError("FINN dosn't support padding for average pooling.") stride = get_by_name(n.attribute, "strides") - if ( - stride is None - or len(stride.ints) != 2 - or len(set(stride.ints)) != 1 - ): + if stride is None or len(stride.ints) != 2 or len(set(stride.ints)) != 1: raise ValueError( "FINN only supports 2D strides with equal values in " "each direction." @@ -172,11 +153,7 @@ def apply(self, model): # Mul node mul_val = model.get_initializer(mul_node.input[1]) - if ( - mul_val is None - or len(mul_val.shape) != 0 - or mul_val != k_s * k_s - ): + if mul_val is None or len(mul_val.shape) != 0 or mul_val != k_s * k_s: raise ValueError( f"The Mul node after the AveragePool node must have " f"static initialization at the second input, " @@ -188,10 +165,10 @@ def apply(self, model): # Trunc node rounding_mode = get_by_name(t_node.attribute, "rounding_mode") - if rounding_mode is None or rounding_mode.s != b"FLOOR": + normalized_mode_string = rounding_mode.s.upper() + if rounding_mode is None or normalized_mode_string != b"FLOOR": raise ValueError( - "The Trunc node must have the rounding_mode " - "set to 'FLOOR'." + "The Trunc node must have the rounding_mode " "set to 'FLOOR'." ) for inp in t_node.input[1:]: if model.get_initializer(inp) is None: @@ -207,13 +184,8 @@ def apply(self, model): f"the Trunc node, it currently is {zero_pt}." 
) trunc_in_bits = model.get_initializer(t_node.input[3]).flatten() - trunc_out_bits = model.get_initializer( - t_node.input[4] - ).flatten() - if ( - len(trunc_in_bits.shape) != 1 - or len(trunc_out_bits.shape) != 1 - ): + trunc_out_bits = model.get_initializer(t_node.input[4]).flatten() + if len(trunc_in_bits.shape) != 1 or len(trunc_out_bits.shape) != 1: raise ValueError( f"Finn only supports scalar bit widths " f"for the Trunc node. The input bit width " @@ -228,9 +200,7 @@ def apply(self, model): # https://github.com/Xilinx/finn-base/blob/ # 7c2603a95e90e4de2575020e575c24eab6a15889/src/finn/custom_op/ # general/quantavgpool2d.py#L94 - ibits = math.floor( - math.log(2**trunc_in_bits / (k_s * k_s), 2) - ) + ibits = math.floor(math.log(2**trunc_in_bits / (k_s * k_s), 2)) # Get sign signed = _get_signed_from_upstream(model, t_node) # ToDo: Change this to NHWC, diff --git a/src/finn/transformation/qonnx/qonnx_activation_handlers.py b/src/finn/transformation/qonnx/qonnx_activation_handlers.py index 9819086d82..323e391df4 100644 --- a/src/finn/transformation/qonnx/qonnx_activation_handlers.py +++ b/src/finn/transformation/qonnx/qonnx_activation_handlers.py @@ -286,6 +286,7 @@ class QuantReluHandler(QuantActBaseHandler): def valid_predecessor_op_types(self): return [ "Relu", + "Selu", ] def _check_compatibility(self): @@ -293,16 +294,19 @@ def _check_compatibility(self): q_inst = getCustomOp(self._q_node) narrow = q_inst.get_nodeattr("narrow") signed = q_inst.get_nodeattr("signed") - if signed or narrow: - raise ValueError( - "FINN only supports unsigned and non-narrow Quant nodes " - "for Relu activations." - ) if not self._model.get_initializer(self._q_node.input[2]) == 0: raise ValueError( "Only Quant nodes with zero-point == 0 " "are currently supported for ReLu activations." 
) + act_node = self._model.find_direct_predecessors(self._q_node) + act_node = act_node[0] + if act_node.op_type == "Relu": + if signed or narrow: + raise ValueError( + "FINN only supports unsigned and non-narrow Quant nodes " + "for Relu activations." + ) elif self._q_node.op_type == "BipolarQuant": return else: @@ -312,7 +316,31 @@ def _calculate_act_bias(self): # No bias allowed for Relu activations, see: https://github.com/Xilinx/ # brevitas/blob/a5bfd6dc5e030f0047ac1ee47932b60e8e873e17/src/brevitas/ # export/onnx/finn/handler/act.py#L48 - bias = np.array([0.0], dtype=np_default_dtype) + act_node = self._model.find_direct_predecessors(self._q_node) + act_node = act_node[0] + if act_node.op_type == "Relu": + bias = np.array([0.0], dtype=np_default_dtype) + elif act_node.op_type == "Selu": + # Gather parameters + q_inst = getCustomOp(self._q_node) + if self._q_node.op_type == "Quant": + bit_width = self._model.get_initializer(self._q_node.input[3]) + narrow = q_inst.get_nodeattr("narrow") + elif self._q_node.op_type == "BipolarQuant": + bit_width = 1.0 + else: + raise RuntimeError("Got an unexpected quantizer node type") + # Calculate bias, see: https://github.com/Xilinx/brevitas/blob/ + # a5bfd6dc5e030f0047ac1ee47932b60e8e873e17/src/brevitas/export/ + # onnx/finn/handler/act.py#L64 + if bit_width == 1.0: + bias = np.array([-0.5], dtype=np_default_dtype) + else: + if narrow: + min_non_scaled_val = -(2 ** (bit_width - 1) - 1) + else: + min_non_scaled_val = -(2 ** (bit_width - 1)) + bias = np.array([min_non_scaled_val], dtype=np_default_dtype) return bias def _calculate_thresholds(self): @@ -323,27 +351,49 @@ def _calculate_thresholds(self): bit_width = 1.0 else: raise RuntimeError("Got an unexpected quantizer node type") - quant_scale = self._model.get_initializer(self._q_node.input[1]).astype( - np.float32 - ) - # q_inst = getCustomOp(self._q_node) - # narrow = q_inst.get_nodeattr("narrow") + quant_scale = 
self._model.get_initializer(self._q_node.input[1]).astype(np.float32) + act_node = self._model.find_direct_predecessors(self._q_node) + act_node = act_node[0] + if act_node.op_type == "Relu": + # Calculate thersholds, see: https://github.com/Xilinx/brevitas/blob/ + # a5bfd6dc5e030f0047ac1ee47932b60e8e873e17/src/brevitas/export/ + # onnx/finn/handler/act.py#L21 + num_distinct_values = 2**bit_width + num_thresholds = int(num_distinct_values - 1) + flat_scale = quant_scale.flatten().astype(np.float32) + num_scale_channels = flat_scale.shape[0] + step = np.abs(flat_scale).astype(np.float32) + min_threshold = step / 2 + thresholds = np.empty((num_scale_channels, num_thresholds), dtype=np_default_dtype) + for c in range(num_scale_channels): + for t in range(num_thresholds): + thresholds[c][t] = min_threshold[c] + step[c] * t - # Calculate thersholds, see: https://github.com/Xilinx/brevitas/blob/ - # a5bfd6dc5e030f0047ac1ee47932b60e8e873e17/src/brevitas/export/ - # onnx/finn/handler/act.py#L21 - num_distinct_values = 2**bit_width - num_thresholds = int(num_distinct_values - 1) - flat_scale = quant_scale.flatten().astype(np.float32) - num_scale_channels = flat_scale.shape[0] - step = np.abs(flat_scale).astype(np.float32) - min_threshold = step / 2 - thresholds = np.empty( - (num_scale_channels, num_thresholds), dtype=np_default_dtype - ) - for c in range(num_scale_channels): - for t in range(num_thresholds): - thresholds[c][t] = min_threshold[c] + step[c] * t + elif act_node.op_type == "Selu": + q_inst = getCustomOp(self._q_node) + narrow = q_inst.get_nodeattr("narrow") + if narrow: + num_distinct_values = 2**bit_width - 1 + else: + num_distinct_values = 2**bit_width + + num_thresholds = int(num_distinct_values - 1) + flat_scale = quant_scale.flatten().astype(np.float32) + num_scale_channels = flat_scale.shape[0] + scale = np.abs(flat_scale).astype(np.float32) + half_scale = scale / 2 + # alpha and lambda + # from 
https://pytorch.org/docs/stable/generated/torch.nn.SELU.html + alpha = 1.6732632423543772848170429916717 + selu_scale = 1.0507009873554804934193349852946 + thresholds = np.empty((num_scale_channels, num_thresholds), dtype=np_default_dtype) + for c in range(num_scale_channels): + for t in range(num_thresholds): + step = -1.0 + half_scale + scale[c] * t + if step <= 0: + thresholds[c][t] = np.log(step / (alpha * selu_scale) + 1) + else: + thresholds[c][t] = step / selu_scale # ToDo: The index 1 needs to be changed to -1 for the channels last format num_output_channels = self._model.get_tensor_shape(self._q_node.output[0])[1] @@ -367,14 +417,13 @@ def _remove_activation_node(self, multi_threshold_node): act_node = self._model.find_direct_predecessors(self._q_node) if act_node is None: raise RuntimeError( - "For handling of Relu activations a predecesor to " - "the Quant node must exist." + "For handling of Relu activations a predecesor to " "the Quant node must exist." ) act_node = act_node[0] - if not act_node.op_type == "Relu": + if act_node.op_type not in self.valid_predecessor_op_types(): raise RuntimeError( - "The predecesor of the Quant node must be Relu for handling " - "of Relu activations." + "The predecesor of the Quant node must be Relu or Selu for handling " + "of activations." ) # Reroute upstream tensor @@ -409,9 +458,7 @@ def _check_compatibility(self): q_inst = getCustomOp(self._q_node) signed = q_inst.get_nodeattr("signed") if not signed: - raise ValueError( - "FINN only supports signed Quant nodes for identity activations." 
- ) + raise ValueError("FINN only supports signed Quant nodes for identity activations.") if not self._model.get_initializer(self._q_node.input[2]) == 0: raise ValueError( "Only Quant nodes with zero-point == 0 " @@ -480,9 +527,7 @@ def _calculate_thresholds(self): num_scale_channels = flat_scale.shape[0] step = np.abs(flat_scale) half_step = step / 2.0 - thresholds = np.empty( - (num_scale_channels, num_thresholds), dtype=np_default_dtype - ) + thresholds = np.empty((num_scale_channels, num_thresholds), dtype=np_default_dtype) # compute the value of the smallest threshold, we'll neg-bias all # generated thresholds by this much min_threshold = -half_step - step * ((num_thresholds // 2) - 1) @@ -493,9 +538,7 @@ def _calculate_thresholds(self): thresholds[c][t] = min_threshold[c] + step[c] * t # ToDo: The index 1 needs to be changed to -1 for the channels last format - num_output_channels = self._model.get_tensor_shape(self._q_node.output[0])[ - 1 - ] + num_output_channels = self._model.get_tensor_shape(self._q_node.output[0])[1] final_shape = (num_output_channels, num_thresholds) if thresholds.shape != final_shape: thresholds = np.broadcast_to(thresholds, final_shape) @@ -517,9 +560,7 @@ def _calculate_act_scale(self): if bit_width != 1: scale = quant_scale else: - assert ( - quant_scale.flatten().shape[0] == 1 - ), "Unsupported BIPOLAR per channel scale" + assert quant_scale.flatten().shape[0] == 1, "Unsupported BIPOLAR per channel scale" assert quant_scale.flatten()[0] == 1.0, "Unsupported BIPOLAR scale != 1" scale = quant_scale * 2 return scale diff --git a/src/finn/transformation/qonnx/quant_act_to_multithreshold.py b/src/finn/transformation/qonnx/quant_act_to_multithreshold.py index 48dda3820d..1b1aea1bab 100644 --- a/src/finn/transformation/qonnx/quant_act_to_multithreshold.py +++ b/src/finn/transformation/qonnx/quant_act_to_multithreshold.py @@ -87,9 +87,7 @@ class ConvertQuantActToMultiThreshold(Transformation): def __init__( self, - 
filter_function=default_filter_function_generator( - max_multithreshold_bit_width=8 - ), + filter_function=default_filter_function_generator(max_multithreshold_bit_width=8), ): super().__init__() self._filter_function = filter_function diff --git a/src/finn/transformation/streamline/absorb.py b/src/finn/transformation/streamline/absorb.py index 73df52f890..e3e2468bba 100644 --- a/src/finn/transformation/streamline/absorb.py +++ b/src/finn/transformation/streamline/absorb.py @@ -80,9 +80,7 @@ def apply(self, model): steps = T.shape[-1] new_min = bias new_max = steps + bias - odt = DataType.get_smallest_possible(steps).name.replace( - "UINT", "INT" - ) + odt = DataType.get_smallest_possible(steps).name.replace("UINT", "INT") odt = DataType[odt] assert odt.allowed(new_max) and odt.allowed( new_min @@ -112,11 +110,7 @@ def apply(self, model): graph_modified = False for n in graph.node: node_ind += 1 - if ( - n.op_type == "Add" - and not model.is_fork_node(n) - and not model.is_join_node(n) - ): + if n.op_type == "Add" and not model.is_fork_node(n) and not model.is_join_node(n): consumer = model.find_consumer(n.output[0]) if consumer is not None and consumer.op_type == "MultiThreshold": add_weight_name = n.input[1] @@ -153,11 +147,7 @@ def apply(self, model): graph_modified = False for n in graph.node: node_ind += 1 - if ( - n.op_type == "Mul" - and not model.is_fork_node(n) - and not model.is_join_node(n) - ): + if n.op_type == "Mul" and not model.is_fork_node(n) and not model.is_join_node(n): mul_weight_name = n.input[1] A = model.get_initializer(mul_weight_name) assert A is not None, "Initializer for mul weights is not set." 
@@ -203,9 +193,7 @@ def apply(self, model): is_scalar = np.prod(A.shape) == 1 actual_ndims = len(tuple(filter(lambda x: x > 1, A.shape))) is_1d = actual_ndims == 1 - is_not_bipolar = ( - model.get_tensor_datatype(mul_weight_name) != DataType["BIPOLAR"] - ) + is_not_bipolar = model.get_tensor_datatype(mul_weight_name) != DataType["BIPOLAR"] is_signed = (A < 0).any() if is_signed and (is_scalar or is_1d) and is_not_bipolar: start_name = n.input[0] @@ -219,9 +207,7 @@ def apply(self, model): model.set_tensor_datatype(sign_mul_param_name, DataType["BIPOLAR"]) # replace original mul weight by magnitudes model.set_initializer(mul_weight_name, np.abs(A)) - new_mul = oh.make_node( - "Mul", [start_name, sign_mul_param_name], [middle_name] - ) + new_mul = oh.make_node("Mul", [start_name, sign_mul_param_name], [middle_name]) n.input[0] = middle_name graph.node.insert(node_ind - 1, new_mul) graph_modified = True @@ -338,13 +324,9 @@ def apply(self, model): mt_cand.output[0] ) # Create a new ValueInfoProto and set the shape - model.set_tensor_shape( - intermediate_tensor_name, intermediate_tensor_shape - ) + model.set_tensor_shape(intermediate_tensor_name, intermediate_tensor_shape) # Set the tensor layout - model.set_tensor_layout( - intermediate_tensor_name, DataLayout.NHWC - ) + model.set_tensor_layout(intermediate_tensor_name, DataLayout.NHWC) # Set the tensor FINN datatype model.set_tensor_datatype( intermediate_tensor_name, intermediate_tensor_finn_dtype @@ -379,8 +361,7 @@ def apply(self, model): for n in graph.node: node_ind += 1 if ( - n.op_type == "Reshape" - and (model.get_initializer(n.input[1]) == [1, -1]).all() + n.op_type == "Reshape" and (model.get_initializer(n.input[1]) == [1, -1]).all() ) or n.op_type == "Flatten": prod = model.find_producer(n.input[0]) if ( @@ -556,23 +537,17 @@ def apply(self, model): if sizes is not None: ishape = model.get_tensor_shape(mt_cand.input[0]) ns, cs, hs, ws = sizes / np.asarray(ishape) - model.set_initializer( - 
mt_cand.input[2], np.asarray([ns, cs, hs, ws]) - ) + model.set_initializer(mt_cand.input[2], np.asarray([ns, cs, hs, ws])) mt_cand.input.remove(mt_cand.input[3]) # scales already specified, transpose indices to NHWC scales = model.get_initializer(mt_cand.input[2]) assert scales is not None ns, cs, hs, ws = scales - model.set_initializer( - mt_cand.input[2], np.asarray([ns, hs, ws, cs]) - ) + model.set_initializer(mt_cand.input[2], np.asarray([ns, hs, ws, cs])) # get rid of first tranpose node mt_cand.input[0] = node.input[0] graph.node.remove(node) - is_last_node = mt_cand.output[0] in [ - x.name for x in model.graph.output - ] + is_last_node = mt_cand.output[0] in [x.name for x in model.graph.output] new_tensor_name = model.make_new_valueinfo_name() if is_last_node: diff --git a/src/finn/transformation/streamline/reorder.py b/src/finn/transformation/streamline/reorder.py index 29eefacc32..8ac2d7dad6 100644 --- a/src/finn/transformation/streamline/reorder.py +++ b/src/finn/transformation/streamline/reorder.py @@ -53,11 +53,7 @@ def apply(self, model): graph_modified = False for n in graph.node: node_ind += 1 - if ( - n.op_type == "Add" - and not model.is_fork_node(n) - and not model.is_join_node(n) - ): + if n.op_type == "Add" and not model.is_fork_node(n) and not model.is_join_node(n): consumer = model.find_consumer(n.output[0]) if ( consumer is not None @@ -73,9 +69,7 @@ def apply(self, model): A = model.get_initializer(mul_weight_name) B = model.get_initializer(add_weight_name) if (A is None) or (B is None): - warnings.warn( - "Mul or add does not have constant params, skipping" - ) + warnings.warn("Mul or add does not have constant params, skipping") continue start_name = n.input[0] middle_name = n.output[0] @@ -116,11 +110,7 @@ def apply(self, model): graph_modified = False for n in graph.node: node_ind += 1 - if ( - n.op_type == "Mul" - and not model.is_fork_node(n) - and not model.is_join_node(n) - ): + if n.op_type == "Mul" and not model.is_fork_node(n) and 
not model.is_join_node(n): consumer = model.find_consumer(n.output[0]) if ( consumer is not None @@ -174,11 +164,7 @@ def apply(self, model): graph_modified = False for n in graph.node: node_ind += 1 - if ( - n.op_type == "Add" - and not model.is_fork_node(n) - and not model.is_join_node(n) - ): + if n.op_type == "Add" and not model.is_fork_node(n) and not model.is_join_node(n): consumer = model.find_consumer(n.output[0]) if ( consumer is not None @@ -235,11 +221,7 @@ def apply(self, model): graph_modified = False for n in graph.node: node_ind += 1 - if ( - n.op_type == "Add" - and not model.is_fork_node(n) - and not model.is_join_node(n) - ): + if n.op_type == "Add" and not model.is_fork_node(n) and not model.is_join_node(n): consumer = model.find_consumer(n.output[0]) if ( consumer is not None @@ -317,11 +299,7 @@ def apply(self, model): graph_modified = False for n in graph.node: node_ind += 1 - if ( - n.op_type == "Mul" - and not model.is_fork_node(n) - and not model.is_join_node(n) - ): + if n.op_type == "Mul" and not model.is_fork_node(n) and not model.is_join_node(n): consumer = model.find_consumer(n.output[0]) if ( consumer is not None @@ -360,6 +338,55 @@ def apply(self, model): return (model, graph_modified) +class MoveScalarMulPastConvTranspose(Transformation): + """Move scalar mul operations past ConvTranspose operations. 
We want to have muls + next to each other such that they can be collapsed into a single mul.""" + + def apply(self, model): + graph = model.graph + node_ind = 0 + graph_modified = False + for n in graph.node: + node_ind += 1 + if n.op_type == "Mul" and not model.is_fork_node(n) and not model.is_join_node(n): + consumer = model.find_consumer(n.output[0]) + if ( + consumer is not None + and consumer.op_type == "ConvTranspose" + and not model.is_join_node(consumer) + ): + mul_weight_name = n.input[1] + A = model.get_initializer(mul_weight_name) + if A is None: + warnings.warn("Mul param is not constant, skipping") + continue + conv_node = consumer + mul_node = n + start_name = mul_node.input[0] + conv_in_name = conv_node.input[0] + conv_in_shape = model.get_tensor_shape(conv_in_name) + conv_out_name = conv_node.output[0] + conv_out_shape = model.get_tensor_shape(conv_out_name) + if all(x == 1 for x in A.shape): + # if the mul is scalar, we can simply swap the order of ops + # rewire mul input to be conv input + conv_node.input[0] = start_name + model.set_tensor_shape(start_name, conv_in_shape) + # use old conv input tensor as conv output + conv_node.output[0] = conv_in_name + model.set_tensor_shape(conv_in_name, conv_out_shape) + # use new conv output as new mul node input + mul_node.input[0] = conv_in_name + # use old conv output as new mul node output + mul_node.output[0] = conv_out_name + # move add node past conv node + graph.node.remove(mul_node) + graph.node.insert(node_ind, mul_node) + graph_modified = True + model = model.transform(InferShapes()) + return (model, graph_modified) + + class MoveMulPastDWConv(Transformation): """Move channelwise mul operations past depthwise conv operations. 
We want to have muls next to each other such that they can be collapsed into a single mul.""" @@ -370,11 +397,7 @@ def apply(self, model): graph_modified = False for n in graph.node: node_ind += 1 - if ( - n.op_type == "Mul" - and not model.is_fork_node(n) - and not model.is_join_node(n) - ): + if n.op_type == "Mul" and not model.is_fork_node(n) and not model.is_join_node(n): consumer = model.find_consumer(n.output[0]) if ( consumer is not None @@ -436,11 +459,7 @@ def apply(self, model): graph_modified = False for n in graph.node: node_ind += 1 - if ( - n.op_type == "Mul" - and not model.is_fork_node(n) - and not model.is_join_node(n) - ): + if n.op_type == "Mul" and not model.is_fork_node(n) and not model.is_join_node(n): consumer = model.find_consumer(n.output[0]) if ( consumer is not None @@ -465,9 +484,7 @@ def apply(self, model): maxpool_out_shape = model.get_tensor_shape(maxpool_out_name) # do not support non-2D MaxPool - kernel_shape = list( - get_by_name(maxpool_node.attribute, "kernel_shape").ints - ) + kernel_shape = list(get_by_name(maxpool_node.attribute, "kernel_shape").ints) if len(kernel_shape) != 2: continue @@ -675,9 +692,7 @@ def apply(self, model): if ceil_mode is not None: ceil_mode = ceil_mode.i else: - ceil_mode = ( - 0 # default to ceil_mode=0 (equivalent to np.floor) - ) + ceil_mode = 0 # default to ceil_mode=0 (equivalent to np.floor) n.op_type = "MaxPoolNHWC" n.domain = "qonnx.custom_op.general" start_name = n.input[0] @@ -702,9 +717,7 @@ def apply(self, model): if ceil_mode is not None: ceil_mode = ceil_mode.i else: - ceil_mode = ( - 0 # default to ceil_mode=0 (equivalent to np.floor) - ) + ceil_mode = 0 # default to ceil_mode=0 (equivalent to np.floor) n.op_type = "MaxPoolNHWC" n.domain = "qonnx.custom_op.general" start_name = producer.input[0] @@ -739,8 +752,7 @@ def apply(self, model): if n.op_type == "Upsample" or n.op_type == "Resize": if model.get_tensor_layout(n.input[0]) != DataLayout.NCHW: warnings.warn( - "%s: Input not NCHW. 
Can't operate transformation on node." - % n.name + "%s: Input not NCHW. Can't operate transformation on node." % n.name ) continue consumer = model.find_consumer(n.output[0]) @@ -818,7 +830,6 @@ def apply(self, model): and model.is_fork_node(n) and not model.is_join_node(n) ): - # Restrict this transform to operations with constant parameters # Assuming parameters is in input 1 if len(n.input) > 1: @@ -863,9 +874,7 @@ def apply(self, model): consumer_node.input[idx] = new_output_tensor_name break else: - raise Exception( - "Consumer should have the current node output as input" - ) + raise Exception("Consumer should have the current node output as input") graph.node.insert(node_ind, consumer_node) @@ -892,9 +901,7 @@ def __init__(self): class MoveTransposePastFork(MoveOpPastFork): def __init__(self): - super().__init__( - ["Transpose"], lambda x: {"perm": get_by_name(x.attribute, "perm").ints} - ) + super().__init__(["Transpose"], lambda x: {"perm": get_by_name(x.attribute, "perm").ints}) class MoveMaxPoolPastMultiThreshold(Transformation): @@ -918,9 +925,7 @@ def apply(self, model): mt_out = consumer.output[0] mt_odt = model.get_tensor_datatype(mt_out) if mt_odt.signed() and has_padding: - warnings.warn( - "Skipping padded MaxPool + signed-output MultiThreshold" - ) + warnings.warn("Skipping padded MaxPool + signed-output MultiThreshold") continue # check for non-decreasing thresholds and nonnegative # scale factor in MultiThreshold @@ -1031,11 +1036,7 @@ def apply(self, model): node_ind = 0 for n in graph.node: node_ind += 1 - if ( - n.op_type == "Flatten" - and not model.is_fork_node(n) - and not model.is_join_node(n) - ): + if n.op_type == "Flatten" and not model.is_fork_node(n) and not model.is_join_node(n): consumer = model.find_consumer(n.output[0]) if ( consumer is not None @@ -1121,11 +1122,7 @@ def apply(self, model): graph_modified = False for n in graph.node: node_ind += 1 - if ( - n.op_type == "Transpose" - and not model.is_fork_node(n) - and not 
model.is_join_node(n) - ): + if n.op_type == "Transpose" and not model.is_fork_node(n) and not model.is_join_node(n): consumer = model.find_consumer(n.output[0]) if ( consumer is not None diff --git a/src/finn/transformation/streamline/round_thresholds.py b/src/finn/transformation/streamline/round_thresholds.py index 601dab04cb..5ba5ee0ff5 100644 --- a/src/finn/transformation/streamline/round_thresholds.py +++ b/src/finn/transformation/streamline/round_thresholds.py @@ -57,8 +57,7 @@ def apply(self, model): model.set_tensor_datatype(n.input[1], idtype) graph_modified = True if idtype.is_integer() and ( - (Tnew < (idtype.min() - 1)).any() - or (Tnew > (idtype.max() + 1)).any() + (Tnew < (idtype.min() - 1)).any() or (Tnew > (idtype.max() + 1)).any() ): # clip any large thresholds to input range + 1 Tnew = np.clip(Tnew, idtype.min() - 1, idtype.max() + 1) diff --git a/src/finn/util/basic.py b/src/finn/util/basic.py index 3bc5b803db..1995d9f06a 100644 --- a/src/finn/util/basic.py +++ b/src/finn/util/basic.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020 Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -11,7 +11,7 @@ # this list of conditions and the following disclaimer in the documentation # and/or other materials provided with the distribution. # -# * Neither the name of Xilinx nor the names of its +# * Neither the name of FINN nor the names of its # contributors may be used to endorse or promote products derived from # this software without specific prior written permission. 
# @@ -30,16 +30,22 @@ import subprocess import sys import tempfile +from qonnx.util.basic import roundup_to_integer_multiple + +# test boards +test_board_map = ["Pynq-Z1", "KV260_SOM", "ZCU104", "U250"] # mapping from PYNQ board names to FPGA part names pynq_part_map = dict() pynq_part_map["Ultra96"] = "xczu3eg-sbva484-1-e" +pynq_part_map["Ultra96-V2"] = "xczu3eg-sbva484-1-i" pynq_part_map["Pynq-Z1"] = "xc7z020clg400-1" pynq_part_map["Pynq-Z2"] = "xc7z020clg400-1" pynq_part_map["ZCU102"] = "xczu9eg-ffvb1156-2-e" pynq_part_map["ZCU104"] = "xczu7ev-ffvc1156-2-e" pynq_part_map["ZCU111"] = "xczu28dr-ffvg1517-2-e" pynq_part_map["RFSoC2x2"] = "xczu28dr-ffvg1517-2-e" +pynq_part_map["RFSoC4x2"] = "xczu48dr-ffvg1517-2-e" pynq_part_map["KV260_SOM"] = "xck26-sfvc784-2LV-c" @@ -48,10 +54,12 @@ pynq_native_port_width["Pynq-Z1"] = 64 pynq_native_port_width["Pynq-Z2"] = 64 pynq_native_port_width["Ultra96"] = 128 +pynq_native_port_width["Ultra96-V2"] = 128 pynq_native_port_width["ZCU102"] = 128 pynq_native_port_width["ZCU104"] = 128 pynq_native_port_width["ZCU111"] = 128 pynq_native_port_width["RFSoC2x2"] = 128 +pynq_native_port_width["RFSoC4x2"] = 128 pynq_native_port_width["KV260_SOM"] = 128 # Alveo device and platform mappings @@ -60,12 +68,19 @@ alveo_part_map["U200"] = "xcu200-fsgd2104-2-e" alveo_part_map["U250"] = "xcu250-figd2104-2L-e" alveo_part_map["U280"] = "xcu280-fsvh2892-2L-e" +alveo_part_map["U55C"] = "xcu55c-fsvh2892-2L-e" alveo_default_platform = dict() -alveo_default_platform["U50"] = "xilinx_u50_gen3x16_xdma_201920_3" -alveo_default_platform["U200"] = "xilinx_u200_xdma_201830_2" +alveo_default_platform["U50"] = "xilinx_u50_gen3x16_xdma_5_202210_1" +alveo_default_platform["U200"] = "xilinx_u200_gen3x16_xdma_2_202110_1" alveo_default_platform["U250"] = "xilinx_u250_gen3x16_xdma_2_1_202010_1" -alveo_default_platform["U280"] = "xilinx_u280_xdma_201920_3" +alveo_default_platform["U280"] = "xilinx_u280_gen3x16_xdma_1_202211_1" +alveo_default_platform["U55C"] = 
"xilinx_u55c_gen3x16_xdma_3_202210_1" + +# Create a joint part map, encompassing other boards too +part_map = {**pynq_part_map, **alveo_part_map} +part_map["VEK280"] = "xcve2802-vsvh1760-2MP-e-S" +part_map["VCK190"] = "xcvc1902-vsva2197-2MP-e-S" def get_rtlsim_trace_depth(): @@ -219,3 +234,57 @@ def is_exe(fpath): return exe_file return None + + +mem_primitives_versal = { + "URAM_72x4096": (72, 4096), + "URAM_36x8192": (36, 8192), + "URAM_18x16384": (18, 16384), + "URAM_9x32768": (9, 32768), + "BRAM18_36x512": (36, 512), + "BRAM18_18x1024": (18, 1024), + "BRAM18_9x2048": (9, 2048), + "LUTRAM": (1, 64), +} + + +def get_memutil_alternatives( + req_mem_spec, mem_primitives=mem_primitives_versal, sort_min_waste=True +): + """Computes how many instances of a memory primitive are necessary to + implement a desired memory size, where req_mem_spec is the desired + size and the primitive_spec is the primitve size. The sizes are expressed + as tuples of (mem_width, mem_depth). Returns a list of tuples of the form + (primitive_name, (primitive_count, efficiency, waste)) where efficiency in + range [0,1] indicates how much of the total capacity is utilized, and waste + indicates how many bits of storage are wasted. If sort_min_waste is True, + the list is sorted by increasing waste. + """ + ret = [ + (primitive_name, memutil(req_mem_spec, primitive_spec)) + for (primitive_name, primitive_spec) in mem_primitives.items() + ] + if sort_min_waste: + ret = sorted(ret, key=lambda x: x[1][2]) + return ret + + +def memutil(req_mem_spec, primitive_spec): + """Computes how many instances of a memory primitive are necessary to + implemented a desired memory size, where req_mem_spec is the desired + size and the primitive_spec is the primitve size. The sizes are expressed + as tuples of (mem_width, mem_depth). 
Returns (primitive_count, efficiency, waste) + where efficiency in range [0,1] indicates how much of the total capacity is + utilized, and waste indicates how many bits of storage are wasted.""" + + req_width, req_depth = req_mem_spec + prim_width, prim_depth = primitive_spec + + match_width = roundup_to_integer_multiple(req_width, prim_width) + match_depth = roundup_to_integer_multiple(req_depth, prim_depth) + count_width = match_width // prim_width + count_depth = match_depth // prim_depth + count = count_depth * count_width + eff = (req_width * req_depth) / (count * prim_width * prim_depth) + waste = (count * prim_width * prim_depth) - (req_width * req_depth) + return (count, eff, waste) diff --git a/src/finn/util/create.py b/src/finn/util/create.py index ed3e1a843e..09ec4f334c 100644 --- a/src/finn/util/create.py +++ b/src/finn/util/create.py @@ -108,15 +108,11 @@ def hls_mlp_maker(layer_spec): odt = lyr["odt"] if i == 0: - global_in = helper.make_tensor_value_info( - current_in_name, TensorProto.FLOAT, [1, mw] - ) + global_in = helper.make_tensor_value_info(current_in_name, TensorProto.FLOAT, [1, mw]) model.graph.input.append(global_in) if i == len(layer_spec) - 1: - global_out = helper.make_tensor_value_info( - current_out_name, TensorProto.FLOAT, [1, mh] - ) + global_out = helper.make_tensor_value_info(current_out_name, TensorProto.FLOAT, [1, mh]) model.graph.output.append(global_out) # there are two ways to implement bipolar weights and inputs for @@ -147,7 +143,7 @@ def hls_mlp_maker(layer_spec): actval = 0 no_act = 1 FCLayer_node = helper.make_node( - "MatrixVectorActivation", + "MVAU", node_inp_list, [current_out_name], domain="finn.custom_op.fpgadataflow", diff --git a/src/finn/util/data_packing.py b/src/finn/util/data_packing.py index 797dad32a2..7698850029 100644 --- a/src/finn/util/data_packing.py +++ b/src/finn/util/data_packing.py @@ -149,9 +149,7 @@ def pack_innermost_dim_as_hex_string( ndarray = np.asarray(ndarray, dtype=np.float32) def fun(x): - 
return array2hexstring( - x, dtype, pad_to_nbits, reverse=reverse_inner, prefix=prefix - ) + return array2hexstring(x, dtype, pad_to_nbits, reverse=reverse_inner, prefix=prefix) return np.apply_along_axis(fun, ndarray.ndim - 1, ndarray) @@ -220,7 +218,7 @@ def unpack_innermost_dim_from_hex_string( if conv_dtype == DataType["BIPOLAR"]: ar_list = [2 * x - 1 for x in ar_list] # interpret values as signed values - elif conv_dtype.name.startswith("INT"): + elif conv_dtype.signed() and conv_dtype.is_integer(): mask = 2 ** (conv_dtype.bitwidth() - 1) ar_list = [-(x & mask) + (x & ~mask) for x in ar_list] @@ -232,9 +230,7 @@ def unpack_innermost_dim_from_hex_string( return array -def numpy_to_hls_code( - ndarray, dtype, hls_var_name, pack_innermost_dim=True, no_decl=False -): +def numpy_to_hls_code(ndarray, dtype, hls_var_name, pack_innermost_dim=True, no_decl=False): """Return C++ code representation of a numpy ndarray with FINN DataType dtype, using hls_var_name as the resulting C++ variable name. If pack_innermost_dim is specified, the innermost dimension of the ndarray @@ -311,9 +307,7 @@ def npy_to_rtlsim_input(input_file, input_dtype, pad_to_nbits, reverse_inner=Tru return packed_data -def rtlsim_output_to_npy( - output, path, dtype, shape, packedBits, targetBits, reverse_inner=True -): +def rtlsim_output_to_npy(output, path, dtype, shape, packedBits, targetBits, reverse_inner=True): """Convert a flattened sequence of Python arbitrary-precision integers output into a NumPy array, saved as npy file at path. 
Each arbitrary-precision integer is assumed to be a packed array of targetBits-bit elements, which @@ -418,9 +412,7 @@ def packed_bytearray_to_finnpy( """ - if ( - not issubclass(type(packed_bytearray), np.ndarray) - ) or packed_bytearray.dtype != np.uint8: + if (not issubclass(type(packed_bytearray), np.ndarray)) or packed_bytearray.dtype != np.uint8: raise Exception("packed_bytearray_to_finnpy needs NumPy uint8 arrays") if packed_bytearray.ndim == 0: raise Exception("packed_bytearray_to_finnpy expects at least 1D ndarray") @@ -446,9 +438,7 @@ def packed_bytearray_to_finnpy( if reverse_endian: packed_bytearray = np.flip(packed_bytearray, axis=-1) # convert innermost dim of byte array to hex strings - packed_hexstring = np.apply_along_axis( - npbytearray2hexstring, packed_dim, packed_bytearray - ) + packed_hexstring = np.apply_along_axis(npbytearray2hexstring, packed_dim, packed_bytearray) ret = unpack_innermost_dim_from_hex_string( packed_hexstring, dtype, output_shape, packed_bits, reverse_inner ) diff --git a/src/finn/util/fpgadataflow.py b/src/finn/util/fpgadataflow.py index 769ddb9465..3d3d343cd4 100644 --- a/src/finn/util/fpgadataflow.py +++ b/src/finn/util/fpgadataflow.py @@ -41,3 +41,39 @@ def is_fpgadataflow_node(node): is_node = True return is_node + + +def is_hls_node(node): + """Returns True if given node is hls node. Otherwise False.""" + is_node = False + if node is not None: + if node.domain == "finn.custom_op.fpgadataflow.hls": + n_backend = get_by_name(node.attribute, "backend") + if n_backend is not None: + backend_value = n_backend.s.decode("UTF-8") + if backend_value == "fpgadataflow": + is_node = True + + return is_node + + +def is_rtl_node(node): + """Returns True if given node is rtl node. 
Otherwise False.""" + is_node = False + if node is not None: + if node.domain == "finn.custom_op.fpgadataflow.rtl": + n_backend = get_by_name(node.attribute, "backend") + if n_backend is not None: + backend_value = n_backend.s.decode("UTF-8") + if backend_value == "fpgadataflow": + is_node = True + + return is_node + + +def is_versal(fpgapart): + """Returns whether board is part of the Versal family""" + return ( + fpgapart[0:4] in ["xcvc", "xcve", "xcvp", "xcvm", "xqvc", "xqvm"] + or fpgapart[0:5] == "xqrvc" + ) diff --git a/src/finn/util/gdrive.py b/src/finn/util/gdrive.py deleted file mode 100644 index d525437300..0000000000 --- a/src/finn/util/gdrive.py +++ /dev/null @@ -1,65 +0,0 @@ -# Copyright (c) 2020, Xilinx -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import gspread -import os -import warnings -from datetime import datetime - -from finn.util.basic import get_finn_root - - -def upload_to_end2end_dashboard(data_dict): - gdrive_key = get_finn_root() + "/gdrive-key/service_account.json" - if not os.path.isfile(gdrive_key): - warnings.warn("Google Drive key not found, skipping dashboard upload") - return - gc = gspread.service_account(filename=gdrive_key) - spreadsheet = gc.open("finn-end2end-dashboard") - worksheet = spreadsheet.get_worksheet(0) - keys = list(data_dict.keys()) - vals = list(data_dict.values()) - # check against existing header - existing_keys = worksheet.row_values(1) - if not set(existing_keys).issuperset(set(keys)): - # create new worksheet - dtstr = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - worksheet = spreadsheet.add_worksheet( - title="Dashboard " + dtstr, rows=10, cols=len(keys), index=0 - ) - # create header row with keys - worksheet.update("A1:1", [keys]) - # freeze and make header bold - worksheet.freeze(rows=1) - worksheet.format("A1:1", {"textFormat": {"bold": True}}) - # insert values into new row at appropriate positions - worksheet.insert_row([], index=2) - for i in range(len(keys)): - colind = existing_keys.index(keys[i]) - col_letter = chr(ord("A") + colind) - worksheet.update("%s2" % col_letter, vals[i]) diff --git a/src/finn/util/imagenet.py b/src/finn/util/imagenet.py index b4548bb352..1d63adf58b 100644 --- a/src/finn/util/imagenet.py +++ 
b/src/finn/util/imagenet.py @@ -137,8 +137,7 @@ def measure_topk(n_images, fxn_pre, fxn_exec, fxn_post, verbose=True, k=5): class_names = { 0: "tench, Tinca tinca", 1: "goldfish, Carassius auratus", - 2: "great white shark, white shark, man-eater, man-eating shark, " - "Carcharodon carcharias", + 2: "great white shark, white shark, man-eater, man-eating shark, " "Carcharodon carcharias", 3: "tiger shark, Galeocerdo cuvieri", 4: "hammerhead, hammerhead shark", 5: "electric ray, crampfish, numbfish, torpedo", @@ -184,8 +183,7 @@ def measure_topk(n_images, fxn_pre, fxn_exec, fxn_post, verbose=True, k=5): 45: "Gila monster, Heloderma suspectum", 46: "green lizard, Lacerta viridis", 47: "African chameleon, Chamaeleo chamaeleon", - 48: "Komodo dragon, Komodo lizard, dragon lizard, giant lizard, " - "Varanus komodoensis", + 48: "Komodo dragon, Komodo lizard, dragon lizard, giant lizard, " "Varanus komodoensis", 49: "African crocodile, Nile crocodile, Crocodylus niloticus", 50: "American alligator, Alligator mississipiensis", 51: "triceratops", @@ -286,8 +284,7 @@ def measure_topk(n_images, fxn_pre, fxn_exec, fxn_post, verbose=True, k=5): 144: "pelican", 145: "king penguin, Aptenodytes patagonica", 146: "albatross, mollymawk", - 147: "grey whale, gray whale, devilfish, Eschrichtius gibbosus, " - "Eschrichtius robustus", + 147: "grey whale, gray whale, devilfish, Eschrichtius gibbosus, " "Eschrichtius robustus", 148: "killer whale, killer, orca, grampus, sea wolf, Orcinus orca", 149: "dugong, Dugong dugon", 150: "sea lion", @@ -580,8 +577,7 @@ def measure_topk(n_images, fxn_pre, fxn_exec, fxn_post, verbose=True, k=5): 433: "bathing cap, swimming cap", 434: "bath towel", 435: "bathtub, bathing tub, bath, tub", - 436: "beach wagon, station wagon, wagon, estate car, beach waggon, " - "station waggon, waggon", + 436: "beach wagon, station wagon, wagon, estate car, beach waggon, " "station waggon, waggon", 437: "beacon, lighthouse, beacon light, pharos", 438: "beaker", 439: 
"bearskin, busby, shako", @@ -636,8 +632,7 @@ def measure_topk(n_images, fxn_pre, fxn_exec, fxn_post, verbose=True, k=5): 487: "cellular telephone, cellular phone, cellphone, cell, mobile phone", 488: "chain", 489: "chainlink fence", - 490: "chain mail, ring mail, mail, chain armor, chain armour, ring armor, " - "ring armour", + 490: "chain mail, ring mail, mail, chain armor, chain armour, ring armor, " "ring armour", 491: "chain saw, chainsaw", 492: "chest", 493: "chiffonier, commode", diff --git a/src/finn/util/platforms.py b/src/finn/util/platforms.py index 8212cb5712..8856ce0ab8 100644 --- a/src/finn/util/platforms.py +++ b/src/finn/util/platforms.py @@ -104,9 +104,7 @@ def compute_resources(self): def guide_resources(self): guide = [] # TODO: assert limits is of correct size - guide_res = ( - np.tile(np.array(self.compute_resources), (self.ndevices, 1)) - ).astype(int) + guide_res = (np.tile(np.array(self.compute_resources), (self.ndevices, 1))).astype(int) for i in range(self.nslr * self.ndevices): # when in multi-FPGA mode, subtract cost of UDP connection from eth_slr local_slr = i % self.nslr @@ -159,9 +157,7 @@ def compute_connection_cost(self): xlocal[i][j] = 1 # tile connection cost matrices for entire system for i in range(self.ndevices): - x[ - i * self.nslr : (i + 1) * self.nslr, i * self.nslr : (i + 1) * self.nslr - ] = xlocal + x[i * self.nslr : (i + 1) * self.nslr, i * self.nslr : (i + 1) * self.nslr] = xlocal # set cost for ethernet connections, assuming daisy-chaining for i in range(self.ndevices - 1): x[i * self.nslr + self.eth_slr][(i + 1) * self.nslr + self.eth_slr] = 10 @@ -182,9 +178,7 @@ def compute_connection_resource(self): slllocal[i][j] = self.sll_count[i][j] # tile connection cost matrices for entire system for i in range(self.ndevices): - sll[ - i * self.nslr : (i + 1) * self.nslr, i * self.nslr : (i + 1) * self.nslr - ] = slllocal + sll[i * self.nslr : (i + 1) * self.nslr, i * self.nslr : (i + 1) * self.nslr] = slllocal # set cost 
for ethernet connections, assuming daisy-chaining eth = np.full((self.nslr * self.ndevices, self.nslr * self.ndevices), 0) # no Eth throughput constraints from one SLR to itself @@ -467,11 +461,42 @@ def compute_resources(self): ] +class Alveo_NxU55C_Platform(Platform): + def __init__( + self, + ndevices=1, + limits=DEFAULT_RES_LIMITS, + avg_constraints=DEFAULT_AVG_CONSTRAINTS, + ): + sll_counts = [[0, 5000, 0], [5000, 0, 5000], [0, 5000, 0]] + super(Alveo_NxU55C_Platform, self).__init__( + nslr=3, + ndevices=ndevices, + sll_count=sll_counts, + ddr_slr=[], + hbm_slr=0, + eth_slr=1, + eth_gbps=100, + limits=limits, + avg_constraints=avg_constraints, + ) + + @property + def compute_resources(self): + # according to UG1120 + return [ + [386000, 773000, 2 * 600, 320, 2664], + [364000, 729000, 2 * 576, 320, 2784], + [381000, 763000, 2 * 600, 320, 2856], + ] + + platforms = dict() platforms["U50"] = Alveo_NxU50_Platform platforms["U200"] = Alveo_NxU200_Platform platforms["U250"] = Alveo_NxU250_Platform platforms["U280"] = Alveo_NxU280_Platform +platforms["U55C"] = Alveo_NxU55C_Platform platforms["Pynq-Z1"] = Zynq7020_Platform platforms["Pynq-Z2"] = Zynq7020_Platform platforms["Ultra96"] = ZU3EG_Platform diff --git a/src/finn/util/pyverilator.py b/src/finn/util/pyverilator.py index 8d18858569..7486402be5 100644 --- a/src/finn/util/pyverilator.py +++ b/src/finn/util/pyverilator.py @@ -26,8 +26,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import pkg_resources as pk - import numpy as np import os import shutil @@ -86,11 +84,7 @@ def file_to_basename(x): src_exts = [".v", ".sv"] all_verilog_files = list( - set( - filter( - lambda x: any(map(lambda y: x.endswith(y), src_exts)), all_verilog_srcs - ) - ) + set(filter(lambda x: any(map(lambda y: x.endswith(y), src_exts)), all_verilog_srcs)) ) verilog_header_dir = vivado_stitch_proj_dir + "/pyverilator_vh" @@ -98,9 +92,7 @@ def file_to_basename(x): # use custom version of axis infrastructure vh # to enable Verilator to simulate AMD/Xilinx components (e.g DWC) - custom_vh = pk.resource_filename( - "finn.qnn-data", "verilog/custom_axis_infrastructure.vh" - ) + custom_vh = os.environ["FINN_ROOT"] + "/src/finn/qnn-data/verilog/custom_axis_infrastructure.vh" shutil.copy(custom_vh, verilog_header_dir + "/axis_infrastructure_v1_1_0.vh") for fn in all_verilog_srcs: if fn.endswith(".vh"): @@ -118,6 +110,8 @@ def file_to_basename(x): if not remove_entry: filtered_verilog_files.append(vfile) remove_entry = True + elif "swg_pkg" in vfile: + continue else: filtered_verilog_files.append(vfile) @@ -135,9 +129,7 @@ def verilator_fifosim(model, n_inputs, max_iters=100000000): vivado_stitch_proj_dir = prepare_stitched_ip_for_verilator(model) verilog_header_dir = vivado_stitch_proj_dir + "/pyverilator_vh" build_dir = make_build_dir("verilator_fifosim_") - fifosim_cpp_fname = pk.resource_filename( - "finn.qnn-data", "cpp/verilator_fifosim.cpp" - ) + fifosim_cpp_fname = os.environ["FINN_ROOT"] + "/src/finn/qnn-data/cpp/verilator_fifosim.cpp" with open(fifosim_cpp_fname, "r") as f: fifosim_cpp_template = f.read() assert len(model.graph.input) == 1, "Only a single input stream is supported" @@ -146,9 +138,7 @@ def verilator_fifosim(model, n_inputs, max_iters=100000000): first_node = model.find_consumer(iname) oname = model.graph.output[0].name last_node = model.find_producer(oname) - assert (first_node is not None) and ( - last_node is not None - ), "Failed to find first/last 
nodes" + assert (first_node is not None) and (last_node is not None), "Failed to find first/last nodes" fnode_inst = getCustomOp(first_node) lnode_inst = getCustomOp(last_node) ishape_folded = fnode_inst.get_folded_input_shape() @@ -157,7 +147,7 @@ def verilator_fifosim(model, n_inputs, max_iters=100000000): fifo_log = [] fifo_log_templ = ' results_file << "maxcount%s" << "\\t" ' fifo_log_templ += "<< to_string(top->maxcount%s) << endl;" - fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO") + fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO_rtl") fifo_ind = 0 for fifo_node in fifo_nodes: fifo_node = getCustomOp(fifo_node) @@ -175,7 +165,7 @@ def verilator_fifosim(model, n_inputs, max_iters=100000000): "FIFO_DEPTH_LOGGING": fifo_log, } - for (key, val) in template_dict.items(): + for key, val in template_dict.items(): fifosim_cpp_template = fifosim_cpp_template.replace(f"@{key}@", str(val)) with open(build_dir + "/verilator_fifosim.cpp", "w") as f: @@ -196,7 +186,8 @@ def verilator_fifosim(model, n_inputs, max_iters=100000000): xpm_memory = f"{vivado_path}/data/ip/xpm/xpm_memory/hdl/xpm_memory.sv" xpm_cdc = f"{vivado_path}/data/ip/xpm/xpm_cdc/hdl/xpm_cdc.sv" xpm_fifo = f"{vivado_path}/data/ip/xpm/xpm_fifo/hdl/xpm_fifo.sv" - verilog_file_arg = ["finn_design_wrapper.v", xpm_memory, xpm_cdc, xpm_fifo] + swg_pkg = os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_pkg.sv" + verilog_file_arg = [swg_pkg, "finn_design_wrapper.v", xpm_memory, xpm_cdc, xpm_fifo] verilator_args = [ "perl", @@ -315,8 +306,10 @@ def file_to_basename(x): xpm_cdc = f"{vivado_path}/data/ip/xpm/xpm_cdc/hdl/xpm_cdc.sv" xpm_fifo = f"{vivado_path}/data/ip/xpm/xpm_fifo/hdl/xpm_fifo.sv" + swg_pkg = os.environ["FINN_ROOT"] + "/finn-rtllib/swg/swg_pkg.sv" + sim = PyVerilator.build( - [top_module_file_name, xpm_fifo, xpm_memory, xpm_cdc], + [swg_pkg, top_module_file_name, xpm_fifo, xpm_memory, xpm_cdc], verilog_path=[vivado_stitch_proj_dir, verilog_header_dir], build_dir=build_dir, 
trace_depth=get_rtlsim_trace_depth(), diff --git a/src/finn/util/test.py b/src/finn/util/test.py index bd8bde2820..2115e058a8 100644 --- a/src/finn/util/test.py +++ b/src/finn/util/test.py @@ -26,10 +26,9 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import pkg_resources as pk - import pytest +import importlib_resources as importlib import numpy as np import onnx import onnx.numpy_helper as nph @@ -106,37 +105,26 @@ def load_test_checkpoint_or_skip(filename): pytest.skip(filename + " not found from previous test step, skipping") -def get_build_env(kind, target_clk_ns): +def get_build_env(board, target_clk_ns): """Get board-related build environment for testing. - - kind = either zynq or alveo. + - board = any from pynq_part_map or alveo_part_map """ ret = {} - if kind == "zynq": - ret["board"] = os.getenv("PYNQ_BOARD", default="Pynq-Z1") - ret["part"] = pynq_part_map[ret["board"]] - ret["ip"] = os.getenv("PYNQ_IP", "") - ret["username"] = os.getenv("PYNQ_USERNAME", "xilinx") - ret["password"] = os.getenv("PYNQ_PASSWORD", "xilinx") - ret["port"] = os.getenv("PYNQ_PORT", 22) - ret["target_dir"] = os.getenv("PYNQ_TARGET_DIR", "/home/xilinx/finn") - ret["build_fxn"] = ZynqBuild(ret["board"], target_clk_ns) - elif kind == "alveo": - ret["board"] = os.getenv("ALVEO_BOARD", default="U250") - ret["part"] = alveo_part_map[ret["board"]] - ret["platform"] = alveo_default_platform[ret["board"]] - ret["ip"] = os.getenv("ALVEO_IP", "") - ret["username"] = os.getenv("ALVEO_USERNAME", "") - ret["password"] = os.getenv("ALVEO_PASSWORD", "") - ret["port"] = os.getenv("ALVEO_PORT", 22) - ret["target_dir"] = os.getenv("ALVEO_TARGET_DIR", "/tmp/finn_alveo_deploy") + if board in pynq_part_map: + ret["kind"] = "zynq" + ret["part"] = pynq_part_map[board] + ret["build_fxn"] = ZynqBuild(board, target_clk_ns) + elif board in alveo_part_map: + ret["kind"] = "alveo" + ret["part"] = 
alveo_part_map[board] ret["build_fxn"] = VitisBuild( ret["part"], target_clk_ns, - ret["platform"], + alveo_default_platform[board], strategy=VitisOptStrategy.BUILD_SPEED, ) else: - raise Exception("Unknown test build environment spec") + raise Exception("Unknown board specified") return ret @@ -148,10 +136,9 @@ def get_example_input(topology): onnx_tensor = onnx.load_tensor_from_string(raw_i) return nph.to_array(onnx_tensor) elif topology == "cnv": - fn = pk.resource_filename( - "finn.qnn-data", "cifar10/cifar10-test-data-class3.npz" - ) - input_tensor = np.load(fn)["arr_0"].astype(np.float32) + ref = importlib.files("finn.qnn-data") / "cifar10/cifar10-test-data-class3.npz" + with importlib.as_file(ref) as fn: + input_tensor = np.load(fn)["arr_0"].astype(np.float32) return input_tensor else: raise Exception("Unknown topology, can't return example input") diff --git a/src/finn/util/vcd.py b/src/finn/util/vcd.py index 1f77276d5a..69dd82c5ea 100644 --- a/src/finn/util/vcd.py +++ b/src/finn/util/vcd.py @@ -69,7 +69,7 @@ def get_fifo_count_max(vcd_file, fifo_count_signal): assert len(d) != 0, "FIFO count signal not found" events = list(d.values())[0]["tv"] max = 0 - for (time, val) in events: + for time, val in events: current = int(val, base=2) if current > max: max = current @@ -140,7 +140,7 @@ def get_stream_if_stats(vcd_file, if_base_name): status = {"V": 0, "R": 0} last_time = 0 total_rising_clock_edges = 0 - for (sig, time, val) in events: + for sig, time, val in events: # pyverilator generates 5 time units per sample time = time / 5 # pyverilator generates 4 samples per clock period diff --git a/tests/brevitas/king_charles.jpg b/tests/brevitas/king_charles.jpg index c1400a484e..d3639a69e9 100755 Binary files a/tests/brevitas/king_charles.jpg and b/tests/brevitas/king_charles.jpg differ diff --git a/tests/brevitas/test_brevitas_avg_pool_export.py b/tests/brevitas/test_brevitas_avg_pool_export.py index 669601ecb6..053b632221 100644 --- 
a/tests/brevitas/test_brevitas_avg_pool_export.py +++ b/tests/brevitas/test_brevitas_avg_pool_export.py @@ -30,10 +30,8 @@ import numpy as np import os import torch -from brevitas.export import FINNManager -from brevitas.export.onnx.generic.manager import BrevitasONNXManager -from brevitas.nn import QuantAvgPool2d -from brevitas.quant_tensor import QuantTensor +from brevitas.export import export_qonnx +from brevitas.nn import QuantIdentity, QuantReLU, TruncAvgPool2d from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.infer_datatypes import InferDataTypes @@ -48,10 +46,9 @@ @pytest.mark.brevitas_export -@pytest.mark.parametrize("QONNX_export", [False, True]) @pytest.mark.parametrize("kernel_size", [2, 3]) @pytest.mark.parametrize("stride", [1, 2]) -@pytest.mark.parametrize("signed", [True, False]) +@pytest.mark.parametrize("signed", [True]) # TODO: Add unsigned test case @pytest.mark.parametrize("bit_width", [2, 4]) @pytest.mark.parametrize("input_bit_width", [4, 8, 16]) @pytest.mark.parametrize("channels", [2, 4]) @@ -64,79 +61,56 @@ def test_brevitas_avg_pool_export( input_bit_width, channels, idim, - QONNX_export, ): - export_onnx_path = base_export_onnx_path.replace( - ".onnx", f"test_QONNX-{QONNX_export}.onnx" - ) - quant_avgpool = QuantAvgPool2d( + export_onnx_path = base_export_onnx_path.replace(".onnx", "test_QONNX.onnx") + if signed: + quant_node = QuantIdentity( + bit_width=input_bit_width, + return_quant_tensor=True, + ) + else: + quant_node = QuantReLU( + bit_width=input_bit_width, + return_quant_tensor=True, + ) + quant_avgpool = TruncAvgPool2d( kernel_size=kernel_size, stride=stride, bit_width=bit_width, return_quant_tensor=False, + float_to_int_impl_type="FLOOR", ) - quant_avgpool.eval() + model_brevitas = torch.nn.Sequential(quant_node, quant_avgpool) + model_brevitas.eval() # determine input - prefix = "INT" if signed else "UINT" - dt_name = prefix + str(input_bit_width) - dtype = 
DataType[dt_name] input_shape = (1, channels, idim, idim) - input_array = gen_finn_dt_tensor(dtype, input_shape) - # Brevitas QuantAvgPool layers need QuantTensors to export correctly - # which requires setting up a QuantTensor instance with the scale - # factor, zero point, bitwidth and signedness - scale_array = np.ones((1, channels, 1, 1)).astype(np.float32) - scale_array *= 0.5 - input_tensor = torch.from_numpy(input_array * scale_array).float() - scale_tensor = torch.from_numpy(scale_array).float() - zp = torch.tensor(0.0) - input_quant_tensor = QuantTensor( - input_tensor, scale_tensor, zp, input_bit_width, signed, training=False - ) + input_array = gen_finn_dt_tensor(DataType["FLOAT32"], input_shape) - # export - if QONNX_export: - BrevitasONNXManager.export( - quant_avgpool, - export_path=export_onnx_path, - input_t=input_quant_tensor, - ) - model = ModelWrapper(export_onnx_path) + input_tensor = torch.from_numpy(input_array).float() - # Statically set the additional inputs generated by the BrevitasONNXManager - model.graph.input.remove(model.graph.input[3]) - model.graph.input.remove(model.graph.input[2]) - model.graph.input.remove(model.graph.input[1]) - model.set_initializer("1", scale_array) - model.set_initializer("2", np.array(0.0).astype(np.float32)) - model.set_initializer("3", np.array(input_bit_width).astype(np.float32)) - model.save(export_onnx_path) + # export + export_qonnx( + model_brevitas, + export_path=export_onnx_path, + input_t=input_tensor, + ) + model = ModelWrapper(export_onnx_path) + model.save(export_onnx_path) - qonnx_cleanup(export_onnx_path, out_file=export_onnx_path) - model = ModelWrapper(export_onnx_path) - model = model.transform(ConvertQONNXtoFINN()) - model.save(export_onnx_path) - else: - FINNManager.export( - quant_avgpool, export_path=export_onnx_path, input_t=input_quant_tensor - ) + qonnx_cleanup(export_onnx_path, out_file=export_onnx_path) model = ModelWrapper(export_onnx_path) + model = 
model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) model = model.transform(InferDataTypes()) # reference brevitas output - ref_output_array = quant_avgpool(input_quant_tensor).detach().numpy() + ref_output_array = model_brevitas(input_tensor).detach().numpy() # finn output - if QONNX_export: - # Manually apply the Quant tensor scaling for QONNX - idict = {model.graph.input[0].name: input_array * scale_array} - else: - idict = {model.graph.input[0].name: input_array} + idict = {model.graph.input[0].name: input_array} odict = oxe.execute_onnx(model, idict, True) finn_output = odict[model.graph.output[0].name] # compare outputs assert np.isclose(ref_output_array, finn_output).all() # cleanup - # assert False os.remove(export_onnx_path) diff --git a/tests/brevitas/test_brevitas_cnv.py b/tests/brevitas/test_brevitas_cnv.py index 62aab2e3c2..3950a5b6a7 100644 --- a/tests/brevitas/test_brevitas_cnv.py +++ b/tests/brevitas/test_brevitas_cnv.py @@ -26,15 +26,13 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import pkg_resources as pk - import pytest -import brevitas.onnx as bo +import importlib_resources as importlib import numpy as np import os import torch -from brevitas.export.onnx.generic.manager import BrevitasONNXManager +from brevitas.export import export_qonnx from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.fold_constants import FoldConstants from qonnx.transformation.general import GiveUniqueNodeNames, RemoveStaticGraphInputs @@ -51,29 +49,24 @@ @pytest.mark.brevitas_export @pytest.mark.parametrize("abits", [1, 2]) @pytest.mark.parametrize("wbits", [1, 2]) -@pytest.mark.parametrize("QONNX_export", [False, True]) -def test_brevitas_cnv_export_exec(wbits, abits, QONNX_export): +def test_brevitas_cnv_export_exec(wbits, abits): if wbits > abits: pytest.skip("No wbits > abits cases at the moment") cnv = get_test_model_trained("CNV", wbits, abits) ishape = (1, 3, 32, 32) - if QONNX_export: - BrevitasONNXManager.export(cnv, ishape, export_onnx_path) - qonnx_cleanup(export_onnx_path, out_file=export_onnx_path) - model = ModelWrapper(export_onnx_path) - model = model.transform(ConvertQONNXtoFINN()) - model.save(export_onnx_path) - else: - bo.export_finn_onnx(cnv, ishape, export_onnx_path) + export_qonnx(cnv, torch.randn(ishape), export_onnx_path) + qonnx_cleanup(export_onnx_path, out_file=export_onnx_path) model = ModelWrapper(export_onnx_path) + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(InferShapes()) model = model.transform(FoldConstants()) model = model.transform(RemoveStaticGraphInputs()) assert len(model.graph.input) == 1 assert len(model.graph.output) == 1 - fn = pk.resource_filename("finn.qnn-data", "cifar10/cifar10-test-data-class3.npz") - input_tensor = np.load(fn)["arr_0"].astype(np.float32) + ref = importlib.files("finn.qnn-data") / "cifar10/cifar10-test-data-class3.npz" + with importlib.as_file(ref) as fn: + input_tensor = 
np.load(fn)["arr_0"].astype(np.float32) input_tensor = input_tensor / 255 assert input_tensor.shape == (1, 3, 32, 32) # run using FINN-based execution diff --git a/tests/brevitas/test_brevitas_debug.py b/tests/brevitas/test_brevitas_debug.py index 181d610fff..d6879a727b 100644 --- a/tests/brevitas/test_brevitas_debug.py +++ b/tests/brevitas/test_brevitas_debug.py @@ -34,12 +34,9 @@ import onnx.numpy_helper as nph import os import torch -from brevitas.export.onnx.generic.manager import BrevitasONNXManager +from brevitas.export import export_qonnx from pkgutil import get_data from qonnx.core.modelwrapper import ModelWrapper -from qonnx.transformation.fold_constants import FoldConstants -from qonnx.transformation.general import RemoveStaticGraphInputs -from qonnx.transformation.infer_shapes import InferShapes from qonnx.util.cleanup import cleanup as qonnx_cleanup import finn.core.onnx_exec as oxe @@ -48,41 +45,23 @@ @pytest.mark.brevitas_export -@pytest.mark.parametrize("QONNX_export", [False, True]) @pytest.mark.parametrize("QONNX_FINN_conversion", [False, True]) -def test_brevitas_debug(QONNX_export, QONNX_FINN_conversion): - if (not QONNX_export) and QONNX_FINN_conversion: - pytest.skip("This test configuration is not valid and is thus skipped.") +def test_brevitas_debug(QONNX_FINN_conversion): finn_onnx = "test_brevitas_debug.onnx" fc = get_test_model_trained("TFC", 2, 2) ishape = (1, 1, 28, 28) - if QONNX_export: - dbg_hook = bo.enable_debug(fc, proxy_level=True) - BrevitasONNXManager.export(fc, ishape, finn_onnx) - # DebugMarkers have the brevitas.onnx domain, so that needs adjusting - model = ModelWrapper(finn_onnx) - dbg_nodes = model.get_nodes_by_op_type("DebugMarker") - for dbg_node in dbg_nodes: - dbg_node.domain = "qonnx.custom_op.general" - model.save(finn_onnx) - qonnx_cleanup(finn_onnx, out_file=finn_onnx) - if QONNX_FINN_conversion: - model = ModelWrapper(finn_onnx) - model = model.transform(ConvertQONNXtoFINN()) - model.save(finn_onnx) - else: - 
dbg_hook = bo.enable_debug(fc) - bo.export_finn_onnx(fc, ishape, finn_onnx) + dbg_hook = bo.enable_debug(fc, proxy_level=True) + export_qonnx(fc, torch.randn(ishape), finn_onnx) + # DebugMarkers have the brevitas.onnx domain, so that needs adjusting + model = ModelWrapper(finn_onnx) + dbg_nodes = model.get_nodes_by_op_type("DebugMarker") + for dbg_node in dbg_nodes: + dbg_node.domain = "qonnx.custom_op.general" + model.save(finn_onnx) + qonnx_cleanup(finn_onnx, out_file=finn_onnx) + if QONNX_FINN_conversion: model = ModelWrapper(finn_onnx) - # DebugMarkers have the brevitas.onnx domain, so that needs adjusting - # ToDo: We should probably have transformation pass, which does this - # domain conversion for us? - dbg_nodes = model.get_nodes_by_op_type("DebugMarker") - for dbg_node in dbg_nodes: - dbg_node.domain = "qonnx.custom_op.general" - model = model.transform(InferShapes()) - model = model.transform(FoldConstants()) - model = model.transform(RemoveStaticGraphInputs()) + model = model.transform(ConvertQONNXtoFINN()) model.save(finn_onnx) model = ModelWrapper(finn_onnx) assert len(model.graph.input) == 1 @@ -106,17 +85,12 @@ def test_brevitas_debug(QONNX_export, QONNX_FINN_conversion): names_common = names_brevitas.intersection(names_finn) # The different exports return debug markers in different numbers and places print(len(names_common)) - if QONNX_export and not QONNX_FINN_conversion: + if not QONNX_FINN_conversion: assert len(names_common) == 12 - elif QONNX_export and QONNX_FINN_conversion: - assert len(names_common) == 8 else: - assert len(names_common) == 16 + assert len(names_common) == 8 for dbg_name in names_common: - if QONNX_export: - tensor_pytorch = dbg_hook.values[dbg_name].value.detach().numpy() - else: - tensor_pytorch = dbg_hook.values[dbg_name].detach().numpy() + tensor_pytorch = dbg_hook.values[dbg_name].value.detach().numpy() tensor_finn = output_dict[dbg_name] assert np.isclose(tensor_finn, tensor_pytorch, atol=1e-5).all() 
os.remove(finn_onnx) diff --git a/tests/brevitas/test_brevitas_deconv.py b/tests/brevitas/test_brevitas_deconv.py new file mode 100644 index 0000000000..dfcecc9187 --- /dev/null +++ b/tests/brevitas/test_brevitas_deconv.py @@ -0,0 +1,82 @@ +# Copyright (c) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of Xilinx nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+import pytest + +import brevitas.nn as qnn +import numpy as np +import os +import torch +from brevitas.export import export_qonnx +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.cleanup import cleanup as qonnx_cleanup + +import finn.core.onnx_exec as oxe +from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN + +export_path = "test_brevitas_deconv.onnx" + + +@pytest.mark.brevitas_export +@pytest.mark.parametrize("ifm_ch", [3]) +@pytest.mark.parametrize("ofm_ch", [5]) +@pytest.mark.parametrize("mh", [4]) +@pytest.mark.parametrize("mw", [4]) +@pytest.mark.parametrize("padding", [1]) +@pytest.mark.parametrize("stride", [2]) +@pytest.mark.parametrize("kw", [4]) +@pytest.mark.parametrize("bias", [False]) +def test_brevitas_QTransposeConv(ifm_ch, ofm_ch, mh, mw, padding, stride, kw, bias): + kh = kw + oh = stride * (mh - 1) - (2 * padding) + kh + if oh % mh != 0: + pytest.skip("Skip test because oh needs to be divisible by mh") + ishape = (1, ifm_ch, mh, mw) # NCHW + inp = torch.randn(ishape) + b_deconv = qnn.QuantConvTranspose2d( + in_channels=ifm_ch, + out_channels=ofm_ch, + kernel_size=kw, + stride=stride, + padding=padding, + bias=bias, + ) + # outp = el(inp) # expects NCHW data format + export_qonnx(b_deconv, input_t=inp, export_path=export_path, opset_version=11) + qonnx_cleanup(export_path, out_file=export_path) + model = ModelWrapper(export_path) + model = model.transform(ConvertQONNXtoFINN()) + model = model.transform(InferShapes()) + inp_tensor = np.random.uniform(low=-1.0, high=1.0, size=ishape).astype(np.float32) + idict = {model.graph.input[0].name: inp_tensor} + odict = oxe.execute_onnx(model, idict, True) + produced = odict[model.graph.output[0].name] + inp_tensor = torch.from_numpy(inp_tensor).float() + expected = b_deconv.forward(inp_tensor).detach().numpy() + assert np.isclose(produced, expected, atol=1e-3).all() + os.remove(export_path) diff --git 
a/tests/brevitas/test_brevitas_fc.py b/tests/brevitas/test_brevitas_fc.py index 211fdb629b..842d099f57 100644 --- a/tests/brevitas/test_brevitas_fc.py +++ b/tests/brevitas/test_brevitas_fc.py @@ -28,12 +28,11 @@ import pytest -import brevitas.onnx as bo import numpy as np import onnx import onnx.numpy_helper as nph import torch -from brevitas.export.onnx.generic.manager import BrevitasONNXManager +from brevitas.export import export_qonnx from pkgutil import get_data from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.fold_constants import FoldConstants @@ -56,26 +55,19 @@ @pytest.mark.parametrize("wbits", [1, 2]) # network topology / size @pytest.mark.parametrize("size", ["TFC", "SFC", "LFC"]) -# QONNX export -@pytest.mark.parametrize("QONNX_export", [False, True]) -def test_brevitas_fc_onnx_export_and_exec(size, wbits, abits, QONNX_export): +def test_brevitas_fc_onnx_export_and_exec(size, wbits, abits): if size == "LFC" and wbits == 2 and abits == 2: pytest.skip("No LFC-w2a2 present at the moment") if wbits > abits: pytest.skip("No wbits > abits cases at the moment") - nname = "%s_%dW%dA_QONNX-%d" % (size, wbits, abits, QONNX_export) + nname = "%s_%dW%dA" % (size, wbits, abits) finn_onnx = export_onnx_path + "/%s.onnx" % nname fc = get_test_model_trained(size, wbits, abits) ishape = (1, 1, 28, 28) - if QONNX_export: - BrevitasONNXManager.export(fc, ishape, finn_onnx) - qonnx_cleanup(finn_onnx, out_file=finn_onnx) - model = ModelWrapper(finn_onnx) - model = model.transform(ConvertQONNXtoFINN()) - model.save(finn_onnx) - else: - bo.export_finn_onnx(fc, ishape, finn_onnx) + export_qonnx(fc, torch.randn(ishape), finn_onnx) + qonnx_cleanup(finn_onnx, out_file=finn_onnx) model = ModelWrapper(finn_onnx) + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) model = model.transform(FoldConstants()) model = model.transform(RemoveStaticGraphInputs()) diff --git a/tests/brevitas/test_brevitas_mobilenet.py 
b/tests/brevitas/test_brevitas_mobilenet.py index b1475b6f4e..be200f6cd4 100644 --- a/tests/brevitas/test_brevitas_mobilenet.py +++ b/tests/brevitas/test_brevitas_mobilenet.py @@ -28,9 +28,9 @@ import pytest -import brevitas.onnx as bo import numpy as np import torch +from brevitas.export import export_qonnx from PIL import Image from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper @@ -45,9 +45,11 @@ from qonnx.transformation.infer_shapes import InferShapes from qonnx.transformation.insert_topk import InsertTopK from qonnx.transformation.merge_onnx_models import MergeONNXModels +from qonnx.util.cleanup import cleanup as qonnx_cleanup import finn.core.onnx_exec as oxe import finn.transformation.streamline.absorb as absorb +from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.util.basic import get_finn_root, make_build_dir from finn.util.pytorch import NormalizePreProc from finn.util.test import crop_center, get_test_model_trained, resize_smaller_side @@ -76,12 +78,12 @@ def test_brevitas_mobilenet(): std = 0.226 ch = 3 preproc = NormalizePreProc(mean, std, ch) - bo.export_finn_onnx(preproc, (1, 3, 224, 224), preproc_onnx) + export_qonnx(preproc, torch.randn(1, 3, 224, 224), preproc_onnx) + qonnx_cleanup(preproc_onnx, out_file=preproc_onnx) preproc_model = ModelWrapper(preproc_onnx) + preproc_model = preproc_model.transform(ConvertQONNXtoFINN()) # set input finn datatype to UINT8 - preproc_model.set_tensor_datatype( - preproc_model.graph.input[0].name, DataType["UINT8"] - ) + preproc_model.set_tensor_datatype(preproc_model.graph.input[0].name, DataType["UINT8"]) preproc_model = preproc_model.transform(InferShapes()) preproc_model = preproc_model.transform(GiveUniqueNodeNames()) preproc_model = preproc_model.transform(GiveUniqueParameterTensors()) @@ -89,7 +91,8 @@ def test_brevitas_mobilenet(): finn_onnx = export_onnx_path + "/quant_mobilenet_v1_4b_exported.onnx" mobilenet = 
get_test_model_trained("mobilenet", 4, 4) - bo.export_finn_onnx(mobilenet, (1, 3, 224, 224), finn_onnx) + export_qonnx(mobilenet, torch.randn(1, 3, 224, 224), finn_onnx) + qonnx_cleanup(finn_onnx, out_file=finn_onnx) # do forward pass in PyTorch/Brevitas input_tensor = preproc.forward(img_torch) @@ -100,7 +103,9 @@ def test_brevitas_mobilenet(): expected_top5_prob = [] for index in expected_top5: expected_top5_prob.append(expected_topk[index]) + model = ModelWrapper(finn_onnx) + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) model = model.transform(FoldConstants()) model = model.transform(InsertTopK()) @@ -121,4 +126,4 @@ def test_brevitas_mobilenet(): produced = odict[model.graph.output[0].name] produced_prob = odict["TopK_0_out0"] * a0 assert (produced.flatten() == expected_top5).all() - assert np.isclose(produced_prob.flatten(), expected_top5_prob).all() + assert np.isclose(produced_prob.flatten(), expected_top5_prob, atol=2.2 * 1e-1).all() diff --git a/tests/brevitas/test_brevitas_non_scaled_quanthardtanh_export.py b/tests/brevitas/test_brevitas_non_scaled_quanthardtanh_export.py index 5d70acb102..08a193714a 100644 --- a/tests/brevitas/test_brevitas_non_scaled_quanthardtanh_export.py +++ b/tests/brevitas/test_brevitas_non_scaled_quanthardtanh_export.py @@ -28,7 +28,6 @@ import pytest -import brevitas.onnx as bo import numpy as np import onnx # noqa import os @@ -36,7 +35,7 @@ from brevitas.core.quant import QuantType from brevitas.core.restrict_val import RestrictValueType from brevitas.core.scaling import ScalingImplType -from brevitas.export.onnx.generic.manager import BrevitasONNXManager +from brevitas.export import export_qonnx from brevitas.nn import QuantHardTanh from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.infer_shapes import InferShapes @@ -52,10 +51,7 @@ @pytest.mark.parametrize("abits", [1, 2, 4, 8]) @pytest.mark.parametrize("narrow_range", [False, True]) 
@pytest.mark.parametrize("max_val", [1.0, 1 - 2 ** (-7)]) -@pytest.mark.parametrize("QONNX_export", [False, True]) -def test_brevitas_act_export_qhardtanh_nonscaled( - abits, narrow_range, max_val, QONNX_export -): +def test_brevitas_act_export_qhardtanh_nonscaled(abits, narrow_range, max_val): def get_quant_type(bit_width): if bit_width is None: return QuantType.FP @@ -76,20 +72,13 @@ def get_quant_type(bit_width): scaling_impl_type=ScalingImplType.CONST, narrow_range=narrow_range, ) - if QONNX_export: - m_path = export_onnx_path - BrevitasONNXManager.export(b_act, ishape, m_path) - qonnx_cleanup(m_path, out_file=m_path) - model = ModelWrapper(m_path) - model = model.transform(ConvertQONNXtoFINN()) - model.save(m_path) - else: - bo.export_finn_onnx(b_act, ishape, export_onnx_path) - model = ModelWrapper(export_onnx_path) + m_path = export_onnx_path + export_qonnx(b_act, torch.randn(ishape), m_path) + qonnx_cleanup(m_path, out_file=m_path) + model = ModelWrapper(m_path) + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) - inp_tensor = np.random.uniform(low=min_val, high=max_val, size=ishape).astype( - np.float32 - ) + inp_tensor = np.random.uniform(low=min_val, high=max_val, size=ishape).astype(np.float32) idict = {model.graph.input[0].name: inp_tensor} odict = oxe.execute_onnx(model, idict, True) produced = odict[model.graph.output[0].name] diff --git a/tests/brevitas/test_brevitas_qconv2d.py b/tests/brevitas/test_brevitas_qconv2d.py index 214c55e5fd..4b27671891 100644 --- a/tests/brevitas/test_brevitas_qconv2d.py +++ b/tests/brevitas/test_brevitas_qconv2d.py @@ -28,7 +28,6 @@ import pytest -import brevitas.onnx as bo import numpy as np import os import torch @@ -36,7 +35,7 @@ from brevitas.core.restrict_val import RestrictValueType from brevitas.core.scaling import ScalingImplType from brevitas.core.stats import StatsOp -from brevitas.export.onnx.generic.manager import BrevitasONNXManager +from brevitas.export import 
export_qonnx from brevitas.nn import QuantConv2d from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper @@ -54,8 +53,7 @@ @pytest.mark.parametrize("dw", [False, True]) @pytest.mark.parametrize("bias", [True, False]) @pytest.mark.parametrize("in_channels", [32]) -@pytest.mark.parametrize("QONNX_export", [False, True]) -def test_brevitas_QConv2d(dw, bias, in_channels, QONNX_export): +def test_brevitas_QConv2d(dw, bias, in_channels): ishape = (1, 32, 111, 111) if dw is True: groups = in_channels @@ -94,16 +92,11 @@ def test_brevitas_QConv2d(dw, bias, in_channels, QONNX_export): weight_tensor = gen_finn_dt_tensor(DataType["INT4"], w_shape) b_conv.weight = torch.nn.Parameter(torch.from_numpy(weight_tensor).float()) b_conv.eval() - if QONNX_export: - m_path = export_onnx_path - BrevitasONNXManager.export(b_conv, ishape, m_path) - qonnx_cleanup(m_path, out_file=m_path) - model = ModelWrapper(m_path) - model = model.transform(ConvertQONNXtoFINN()) - model.save(m_path) - else: - bo.export_finn_onnx(b_conv, ishape, export_onnx_path) - model = ModelWrapper(export_onnx_path) + m_path = export_onnx_path + export_qonnx(b_conv, torch.randn(ishape), m_path) + qonnx_cleanup(m_path, out_file=m_path) + model = ModelWrapper(m_path) + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) inp_tensor = np.random.uniform(low=-1.0, high=1.0, size=ishape).astype(np.float32) idict = {model.graph.input[0].name: inp_tensor} diff --git a/tests/brevitas/test_brevitas_qlinear.py b/tests/brevitas/test_brevitas_qlinear.py index bcd75a5455..a6ea077e7a 100644 --- a/tests/brevitas/test_brevitas_qlinear.py +++ b/tests/brevitas/test_brevitas_qlinear.py @@ -28,12 +28,11 @@ import pytest -import brevitas.onnx as bo import numpy as np import os import torch from brevitas.core.quant import QuantType -from brevitas.export.onnx.generic.manager import BrevitasONNXManager +from brevitas.export import export_qonnx from brevitas.nn import 
QuantLinear from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper @@ -53,10 +52,7 @@ @pytest.mark.parametrize("in_features", [3]) @pytest.mark.parametrize("w_bits", [4]) @pytest.mark.parametrize("i_dtype", [DataType["UINT4"]]) -@pytest.mark.parametrize("QONNX_export", [False, True]) -def test_brevitas_qlinear( - bias, out_features, in_features, w_bits, i_dtype, QONNX_export -): +def test_brevitas_qlinear(bias, out_features, in_features, w_bits, i_dtype): i_shape = (1, in_features) w_shape = (out_features, in_features) b_linear = QuantLinear( @@ -68,21 +64,14 @@ def test_brevitas_qlinear( weight_quant_type=QuantType.INT, weight_scaling_per_output_channel=True, ) - weight_tensor_fp = np.random.uniform(low=-1.0, high=1.0, size=w_shape).astype( - np.float32 - ) + weight_tensor_fp = np.random.uniform(low=-1.0, high=1.0, size=w_shape).astype(np.float32) b_linear.weight.data = torch.from_numpy(weight_tensor_fp) b_linear.eval() - if QONNX_export: - m_path = export_onnx_path - BrevitasONNXManager.export(b_linear, i_shape, m_path) - qonnx_cleanup(m_path, out_file=m_path) - model = ModelWrapper(m_path) - model = model.transform(ConvertQONNXtoFINN()) - model.save(m_path) - else: - bo.export_finn_onnx(b_linear, i_shape, export_onnx_path) - model = ModelWrapper(export_onnx_path) + m_path = export_onnx_path + export_qonnx(b_linear, torch.randn(i_shape), m_path) + qonnx_cleanup(m_path, out_file=m_path) + model = ModelWrapper(m_path) + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) inp_tensor = gen_finn_dt_tensor(i_dtype, i_shape) idict = {model.graph.input[0].name: inp_tensor} diff --git a/tests/brevitas/test_brevitas_relu_act_export.py b/tests/brevitas/test_brevitas_relu_act_export.py index 3dc46ec31e..2254670202 100644 --- a/tests/brevitas/test_brevitas_relu_act_export.py +++ b/tests/brevitas/test_brevitas_relu_act_export.py @@ -28,20 +28,16 @@ import pytest -import brevitas.onnx as bo import numpy as np 
import onnx # noqa import os import torch -from brevitas.core.quant import QuantType -from brevitas.core.restrict_val import RestrictValueType from brevitas.core.scaling import ScalingImplType -from brevitas.export.onnx.generic.manager import BrevitasONNXManager +from brevitas.export import export_qonnx from brevitas.nn import QuantReLU from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.infer_shapes import InferShapes from qonnx.util.cleanup import cleanup as qonnx_cleanup -from torch import nn import finn.core.onnx_exec as oxe from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN @@ -51,63 +47,27 @@ @pytest.mark.brevitas_export @pytest.mark.parametrize("abits", [2, 4, 8]) -@pytest.mark.parametrize("max_val", [1.0, 1.5, 1 - 2 ** (-7)]) -@pytest.mark.parametrize( - "scaling_impl_type", [ScalingImplType.CONST, ScalingImplType.PARAMETER] -) -@pytest.mark.parametrize("QONNX_export", [False, True]) -def test_brevitas_act_export_relu(abits, max_val, scaling_impl_type, QONNX_export): - min_val = -1.0 - ishape = (1, 15) - +@pytest.mark.parametrize("ishape", [(1, 15), (1, 32, 1, 1)]) +def test_brevitas_act_export_relu( + abits, + ishape, +): b_act = QuantReLU( bit_width=abits, - max_val=max_val, - scaling_impl_type=scaling_impl_type, - restrict_scaling_type=RestrictValueType.LOG_FP, - quant_type=QuantType.INT, ) - if scaling_impl_type == ScalingImplType.PARAMETER: - checkpoint = { - "act_quant_proxy.fused_activation_quant_proxy.tensor_quant.\ -scaling_impl.learned_value": torch.tensor( - 0.49 - ).type( - torch.FloatTensor - ) - } - b_act.load_state_dict(checkpoint) - if QONNX_export: - m_path = export_onnx_path - BrevitasONNXManager.export(b_act, ishape, m_path) - qonnx_cleanup(m_path, out_file=m_path) - model = ModelWrapper(m_path) - model = model.transform(ConvertQONNXtoFINN()) - model.save(m_path) - else: - bo.export_finn_onnx(b_act, ishape, export_onnx_path) - model = ModelWrapper(export_onnx_path) + m_path = 
export_onnx_path + export_qonnx(b_act, torch.randn(ishape), m_path) + qonnx_cleanup(m_path, out_file=m_path) + model = ModelWrapper(m_path) + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) - inp_tensor = np.random.uniform(low=min_val, high=max_val, size=ishape).astype( - np.float32 - ) + inp_tensor = np.random.uniform(low=-1.0, high=6.0, size=ishape).astype(np.float32) idict = {model.graph.input[0].name: inp_tensor} odict = oxe.execute_onnx(model, idict, True) produced = odict[model.graph.output[0].name] inp_tensor = torch.from_numpy(inp_tensor).float() b_act.eval() expected = b_act.forward(inp_tensor).detach().numpy() - if not np.isclose(produced, expected, atol=1e-3).all(): - print(abits, max_val, scaling_impl_type) - print("scale: ", b_act.quant_act_scale().type(torch.FloatTensor).detach()) - if abits < 5: - print( - "thres:", - ", ".join(["{:8.4f}".format(x) for x in b_act.export_thres[0]]), - ) - print("input:", ", ".join(["{:8.4f}".format(x) for x in inp_tensor[0]])) - print("prod :", ", ".join(["{:8.4f}".format(x) for x in produced[0]])) - print("expec:", ", ".join(["{:8.4f}".format(x) for x in expected[0]])) assert np.isclose(produced, expected, atol=1e-3).all() os.remove(export_onnx_path) @@ -115,148 +75,32 @@ def test_brevitas_act_export_relu(abits, max_val, scaling_impl_type, QONNX_expor @pytest.mark.brevitas_export @pytest.mark.parametrize("abits", [2, 4, 8]) -@pytest.mark.parametrize("max_val", [1.0, 1.5, 1 - 2 ** (-7)]) -@pytest.mark.parametrize("scaling_per_channel", [True, False]) -@pytest.mark.parametrize("QONNX_export", [False, True]) -def test_brevitas_act_export_relu_imagenet( - abits, max_val, scaling_per_channel, QONNX_export +@pytest.mark.parametrize("ishape", [(1, 15, 4, 4), (1, 32, 1, 1)]) +def test_brevitas_act_export_relu_channel( + abits, + ishape, ): - out_channels = 32 - ishape = (1, out_channels, 1, 1) - min_val = -1.0 + ch = ishape[1] b_act = QuantReLU( bit_width=abits, - 
quant_type=QuantType.INT, - scaling_impl_type=ScalingImplType.PARAMETER, - scaling_per_channel=scaling_per_channel, - restrict_scaling_type=RestrictValueType.LOG_FP, - scaling_min_val=2e-16, max_val=6.0, - return_quant_tensor=False, - per_channel_broadcastable_shape=(1, out_channels, 1, 1), + scaling_impl_type=ScalingImplType.CONST, + scaling_per_output_channel=True, + per_channel_broadcastable_shape=(1, ch, 1, 1), ) - if scaling_per_channel is True: - rand_tensor = (2) * torch.rand((1, out_channels, 1, 1)) - else: - rand_tensor = torch.tensor(1.2398) - checkpoint = { - "act_quant_proxy.fused_activation_quant_proxy.tensor_quant.\ -scaling_impl.learned_value": rand_tensor.type( - torch.FloatTensor - ) - } - b_act.load_state_dict(checkpoint) - if QONNX_export: - m_path = export_onnx_path - BrevitasONNXManager.export(b_act, ishape, m_path) - qonnx_cleanup(m_path, out_file=m_path) - model = ModelWrapper(m_path) - model = model.transform(ConvertQONNXtoFINN()) - model.save(m_path) - else: - bo.export_finn_onnx(b_act, ishape, export_onnx_path) - model = ModelWrapper(export_onnx_path) + m_path = export_onnx_path + export_qonnx(b_act, torch.randn(ishape), m_path) + qonnx_cleanup(m_path, out_file=m_path) + model = ModelWrapper(m_path) + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) - inp_tensor = np.random.uniform(low=min_val, high=max_val, size=ishape).astype( - np.float32 - ) + inp_tensor = np.random.uniform(low=-1.0, high=6.0, size=ishape).astype(np.float32) idict = {model.graph.input[0].name: inp_tensor} odict = oxe.execute_onnx(model, idict, True) produced = odict[model.graph.output[0].name] inp_tensor = torch.from_numpy(inp_tensor).float() b_act.eval() expected = b_act.forward(inp_tensor).detach().numpy() - if not np.isclose(produced, expected, atol=1e-3).all(): - print(abits, max_val) - print("scale: ", b_act.quant_act_scale().type(torch.FloatTensor).detach()) - if abits < 5: - print( - "thres:", - ", ".join(["{:8.4f}".format(x) 
for x in b_act.export_thres[0]]), - ) - print("input:", ", ".join(["{:8.4f}".format(x) for x in inp_tensor[0]])) - print("prod :", ", ".join(["{:8.4f}".format(x) for x in produced[0]])) - print("expec:", ", ".join(["{:8.4f}".format(x) for x in expected[0]])) - - assert np.isclose(produced, expected, atol=1e-3).all() - os.remove(export_onnx_path) - - -class PyTorchTestModel(nn.Module): - def __init__(self, abits): - super(PyTorchTestModel, self).__init__() - out_channels = 32 - self.b_act = QuantReLU( - bit_width=abits, - quant_type=QuantType.INT, - scaling_impl_type=ScalingImplType.PARAMETER, - scaling_per_channel=True, - restrict_scaling_type=RestrictValueType.LOG_FP, - scaling_min_val=2e-16, - max_val=6.0, - return_quant_tensor=False, - per_channel_broadcastable_shape=(1, out_channels, 1, 1), - ) - - def forward(self, x): - act_out = self.b_act(x) - y0 = act_out * 2.0 - y1 = act_out * -1.0 - y = y0 + y1 - return y - - -@pytest.mark.brevitas_export -@pytest.mark.parametrize("abits", [2, 4, 8]) -@pytest.mark.parametrize("max_val", [1.0, 1.5, 1 - 2 ** (-7)]) -@pytest.mark.parametrize("scaling_per_channel", [True]) -@pytest.mark.parametrize("QONNX_export", [True]) -def test_brevitas_act_export_relu_forking( - abits, max_val, scaling_per_channel, QONNX_export -): - out_channels = 32 - ishape = (1, out_channels, 1, 1) - min_val = -1.0 - model_pyt = PyTorchTestModel(abits) - - rand_tensor = (2) * torch.rand((1, out_channels, 1, 1)) - - checkpoint = { - "b_act.act_quant_proxy.fused_activation_quant_proxy." 
- "tensor_quant.scaling_impl.learned_value": rand_tensor.type(torch.FloatTensor) - } - model_pyt.load_state_dict(checkpoint) - - if QONNX_export: - m_path = export_onnx_path - BrevitasONNXManager.export(model_pyt, ishape, m_path) - qonnx_cleanup(m_path, out_file=m_path) - model = ModelWrapper(m_path) - model = model.transform(ConvertQONNXtoFINN()) - model.save(m_path) - - model = ModelWrapper(export_onnx_path) - model = model.transform(InferShapes()) - inp_tensor = np.random.uniform(low=min_val, high=max_val, size=ishape).astype( - np.float32 - ) - idict = {model.graph.input[0].name: inp_tensor} - odict = oxe.execute_onnx(model, idict, True) - produced = odict[model.graph.output[0].name] - inp_tensor = torch.from_numpy(inp_tensor).float() - model_pyt.eval() - expected = model_pyt.forward(inp_tensor).detach().numpy() - if not np.isclose(produced, expected, atol=1e-3).all(): - print(abits, max_val) - print("scale: ", model_pyt.quant_act_scale().type(torch.FloatTensor).detach()) - if abits < 5: - print( - "thres:", - ", ".join(["{:8.4f}".format(x) for x in model_pyt.export_thres[0]]), - ) - print("input:", ", ".join(["{:8.4f}".format(x) for x in inp_tensor[0]])) - print("prod :", ", ".join(["{:8.4f}".format(x) for x in produced[0]])) - print("expec:", ", ".join(["{:8.4f}".format(x) for x in expected[0]])) assert np.isclose(produced, expected, atol=1e-3).all() os.remove(export_onnx_path) diff --git a/tests/brevitas/test_brevitas_scaled_qhardtanh_export.py b/tests/brevitas/test_brevitas_scaled_qhardtanh_export.py index 403d406105..e7d87faed8 100644 --- a/tests/brevitas/test_brevitas_scaled_qhardtanh_export.py +++ b/tests/brevitas/test_brevitas_scaled_qhardtanh_export.py @@ -28,7 +28,6 @@ import pytest -import brevitas.onnx as bo import numpy as np import onnx # noqa import os @@ -36,7 +35,7 @@ from brevitas.core.quant import QuantType from brevitas.core.restrict_val import RestrictValueType from brevitas.core.scaling import ScalingImplType -from 
brevitas.export.onnx.generic.manager import BrevitasONNXManager +from brevitas.export import export_qonnx from brevitas.nn import QuantHardTanh from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.infer_shapes import InferShapes @@ -53,12 +52,9 @@ @pytest.mark.parametrize("narrow_range", [False, True]) @pytest.mark.parametrize("min_val", [-1.0, -(1 - 2 ** (-7)), -2]) @pytest.mark.parametrize("max_val", [1.0, 1 - 2 ** (-7), 2]) -@pytest.mark.parametrize( - "scaling_impl_type", [ScalingImplType.CONST, ScalingImplType.PARAMETER] -) -@pytest.mark.parametrize("QONNX_export", [False, True]) +@pytest.mark.parametrize("scaling_impl_type", [ScalingImplType.CONST, ScalingImplType.PARAMETER]) def test_brevitas_act_export_qhardtanh_scaled( - abits, narrow_range, min_val, max_val, scaling_impl_type, QONNX_export + abits, narrow_range, min_val, max_val, scaling_impl_type ): def get_quant_type(bit_width): if bit_width is None: @@ -89,20 +85,13 @@ def get_quant_type(bit_width): ) } b_act.load_state_dict(checkpoint) - if QONNX_export: - m_path = export_onnx_path - BrevitasONNXManager.export(b_act, ishape, m_path) - qonnx_cleanup(m_path, out_file=m_path) - model = ModelWrapper(m_path) - model = model.transform(ConvertQONNXtoFINN()) - model.save(m_path) - else: - bo.export_finn_onnx(b_act, ishape, export_onnx_path) - model = ModelWrapper(export_onnx_path) + m_path = export_onnx_path + export_qonnx(b_act, torch.randn(ishape), m_path) + qonnx_cleanup(m_path, out_file=m_path) + model = ModelWrapper(m_path) + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) - inp_tensor = np.random.uniform(low=min_val, high=max_val, size=ishape).astype( - np.float32 - ) + inp_tensor = np.random.uniform(low=min_val, high=max_val, size=ishape).astype(np.float32) idict = {model.graph.input[0].name: inp_tensor} odict = oxe.execute_onnx(model, idict, True) produced = odict[model.graph.output[0].name] diff --git 
a/tests/brevitas/test_brevitas_selu_act_export.py b/tests/brevitas/test_brevitas_selu_act_export.py new file mode 100644 index 0000000000..c8d040dbee --- /dev/null +++ b/tests/brevitas/test_brevitas_selu_act_export.py @@ -0,0 +1,72 @@ +# Copyright (c) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of Xilinx nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import pytest + +import numpy as np +import onnx # noqa +import os +import torch +from brevitas.export import export_qonnx +from brevitas.nn import QuantIdentity +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.util.basic import get_preferred_onnx_opset +from qonnx.util.cleanup import cleanup as qonnx_cleanup + +import finn.core.onnx_exec as oxe +from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN + + +@pytest.mark.brevitas_export +@pytest.mark.parametrize("abits", [2, 4, 8]) +@pytest.mark.parametrize("ishape", [(1, 15), (1, 32, 1, 1)]) +@pytest.mark.parametrize("narrow", [True, False]) +def test_brevitas_act_export_selu(abits, ishape, narrow): + export_path = "test_brevitas_selu_act_export_%s.onnx" % str(abits) + b_act = torch.nn.Sequential(torch.nn.SELU(), QuantIdentity(bit_width=abits, narrow=narrow)) + + export_qonnx( + b_act, + torch.randn(ishape), + export_path, + opset_version=get_preferred_onnx_opset(), + ) + qonnx_cleanup(export_path, out_file=export_path) + model = ModelWrapper(export_path) + model = model.transform(ConvertQONNXtoFINN()) + + inp_tensor = np.random.uniform(low=-1.0, high=6.0, size=ishape).astype(np.float32) + idict = {model.graph.input[0].name: inp_tensor} + odict = oxe.execute_onnx(model, idict, True) + produced = odict[model.graph.output[0].name] + inp_tensor = torch.from_numpy(inp_tensor).float() + b_act.eval() + expected = b_act.forward(inp_tensor).detach().numpy() + + assert np.isclose(produced, expected, atol=1e-3).all() + os.remove(export_path) diff --git a/tests/brevitas/test_brevitas_validate_mobilenet.py b/tests/brevitas/test_brevitas_validate_mobilenet.py index 55915838e8..18f8fa9a41 100644 --- a/tests/brevitas/test_brevitas_validate_mobilenet.py +++ b/tests/brevitas/test_brevitas_validate_mobilenet.py @@ -35,6 +35,7 @@ import torch import torchvision.datasets as datasets import torchvision.transforms as transforms +from brevitas.export import export_qonnx from qonnx.core.modelwrapper 
import ModelWrapper from qonnx.transformation.fold_constants import FoldConstants from qonnx.transformation.general import ( @@ -48,10 +49,12 @@ from qonnx.transformation.infer_shapes import InferShapes from qonnx.transformation.insert_topk import InsertTopK from qonnx.transformation.merge_onnx_models import MergeONNXModels +from qonnx.util.cleanup import cleanup as qonnx_cleanup import finn.core.onnx_exec as oxe import finn.transformation.streamline.absorb as absorb import finn.util.imagenet as imagenet_util +from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.util.basic import make_build_dir from finn.util.pytorch import NormalizePreProc from finn.util.test import get_test_model_trained @@ -101,9 +104,6 @@ def test_brevitas_mobilenet_preproc(): @pytest.mark.brevitas_export @pytest.mark.slow -# marked as XFAIL until Brevitas export issues are resolved: -# https://github.com/Xilinx/brevitas/issues/173 -@pytest.mark.xfail def test_brevitas_compare_exported_mobilenet(): if "IMAGENET_VAL_PATH" not in os.environ.keys(): pytest.skip("Can't do validation without IMAGENET_VAL_PATH") @@ -113,8 +113,10 @@ def test_brevitas_compare_exported_mobilenet(): # export preprocessing preproc_onnx = export_onnx_path + "/quant_mobilenet_v1_4b_preproc.onnx" preproc = NormalizePreProc(mean, std, ch) - bo.export_finn_onnx(preproc, (1, 3, 224, 224), preproc_onnx) + export_qonnx(preproc, torch.randn(1, 3, 224, 224), preproc_onnx) + qonnx_cleanup(preproc_onnx, out_file=preproc_onnx) preproc_model = ModelWrapper(preproc_onnx) + preproc_model = preproc_model.transform(ConvertQONNXtoFINN()) preproc_model = preproc_model.transform(InferShapes()) preproc_model = preproc_model.transform(GiveUniqueNodeNames()) preproc_model = preproc_model.transform(GiveUniqueParameterTensors()) @@ -124,8 +126,10 @@ def test_brevitas_compare_exported_mobilenet(): mobilenet = get_test_model_trained("mobilenet", 4, 4) if debug_mode: dbg_hook = bo.enable_debug(mobilenet) - 
bo.export_finn_onnx(mobilenet, (1, 3, 224, 224), finn_onnx) + export_qonnx(mobilenet, torch.randn(1, 3, 224, 224), finn_onnx) + qonnx_cleanup(finn_onnx, out_file=finn_onnx) model = ModelWrapper(finn_onnx) + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) model = model.transform(FoldConstants()) model = model.transform(RemoveStaticGraphInputs()) @@ -145,9 +149,7 @@ def test_brevitas_compare_exported_mobilenet(): model = model.transform(MergeONNXModels(preproc_model)) model.save(export_onnx_path + "/quant_mobilenet_v1_4b.onnx") - with open( - export_onnx_path + "/mobilenet_validation.csv", "w", newline="" - ) as csvfile: + with open(export_onnx_path + "/mobilenet_validation.csv", "w", newline="") as csvfile: writer = csv.writer(csvfile) writer.writerow( [ @@ -164,7 +166,7 @@ def test_brevitas_compare_exported_mobilenet(): workload = imagenet_util.get_val_images(n_images, interleave_classes=True) all_inds_ok = True all_probs_ok = True - for (img_path, target_id) in workload: + for img_path, target_id in workload: img_np = imagenet_util.load_resize_crop(img_path) img_torch = torch.from_numpy(img_np).float() # do forward pass in PyTorch/Brevitas diff --git a/tests/end2end/test_end2end_bnn_pynq.py b/tests/end2end/test_end2end_bnn_pynq.py index 858363d6d3..556ba1d187 100644 --- a/tests/end2end/test_end2end_bnn_pynq.py +++ b/tests/end2end/test_end2end_bnn_pynq.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -28,20 +29,18 @@ import pytest -import brevitas.onnx as bo +import itertools import numpy as np # as of Feb'20 there is a bug that segfaults ONNX shape inference if we # import pytorch before onnx, so we make sure to import onnx first import onnx # NOQA import os -import subprocess import torch import warnings -from brevitas.export.onnx.generic.manager import BrevitasONNXManager -from collections import OrderedDict +from brevitas.export import export_qonnx from dataset_loading import cifar, mnist -from datetime import datetime +from distutils.dir_util import copy_tree from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp @@ -60,13 +59,13 @@ from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul from qonnx.transformation.merge_onnx_models import MergeONNXModels from qonnx.util.cleanup import cleanup as qonnx_cleanup -from scipy.stats import linregress +from shutil import copy -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw import finn.transformation.streamline.absorb as absorb from finn.analysis.fpgadataflow.dataflow_performance import dataflow_performance from finn.core.onnx_exec import execute_onnx -from finn.core.throughput_test import throughput_test_remote, throughput_test_rtlsim +from finn.core.throughput_test import throughput_test_rtlsim from finn.transformation.fpgadataflow.annotate_cycles import AnnotateCycles from finn.transformation.fpgadataflow.annotate_resources import AnnotateResources from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim @@ -76,12 +75,18 @@ from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.insert_dwc import 
InsertDWC -from finn.transformation.fpgadataflow.make_deployment import DeployToPYNQ from finn.transformation.fpgadataflow.make_pynq_driver import MakePYNQDriver +from finn.transformation.fpgadataflow.minimize_accumulator_width import ( + MinimizeAccumulatorWidth, +) +from finn.transformation.fpgadataflow.minimize_weight_bit_width import ( + MinimizeWeightBitWidth, +) from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.transformation.move_reshape import RemoveCNVtoFCFlatten from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.transformation.streamline import Streamline @@ -89,8 +94,8 @@ MakeMaxPoolNHWC, MoveScalarLinearPastInvariants, ) -from finn.util.basic import get_finn_root -from finn.util.gdrive import upload_to_end2end_dashboard +from finn.util.basic import get_finn_root, make_build_dir, test_board_map +from finn.util.fpgadataflow import is_fpgadataflow_node from finn.util.pytorch import ToTensor from finn.util.test import ( execute_parent, @@ -103,40 +108,21 @@ build_dir = os.environ["FINN_BUILD_DIR"] target_clk_ns = 20 -mem_mode = "decoupled" +mem_mode = "internal_decoupled" rtlsim_trace = False -def get_checkpoint_name(topology, wbits, abits, QONNX_export, step): - return build_dir + "/end2end_%s_w%da%d_QONNX-%d_%s.onnx" % ( +def get_checkpoint_name(topology, wbits, abits, step): + return build_dir + "/end2end_%s_w%da%d_%s.onnx" % ( topology, wbits, abits, - QONNX_export, step, ) -def get_dashboard_data(topology, wbits, abits): - stats_file = build_dir + "/end2end_%s_w%da%d.txt" % (topology, wbits, abits) - stats_dict = OrderedDict() - if os.path.isfile(stats_file): - with open(stats_file, 
"r") as f: - stats_dict_txt = f.read() - stats_dict = eval(stats_dict_txt) - return stats_dict - - -def update_dashboard_data(topology, wbits, abits, key, val): - stats_dict = get_dashboard_data(topology, wbits, abits) - stats_dict[key] = val - stats_file = build_dir + "/end2end_%s_w%da%d.txt" % (topology, wbits, abits) - with open(stats_file, "w") as f: - f.write(str(stats_dict)) - - def fold_tfc(model): - fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation") + fc_layers = model.get_nodes_by_op_type("MVAU_hls") # (PE, SIMD, ramstyle) for each layer config = [(16, 49, "block"), (8, 8, "auto"), (8, 8, "auto"), (10, 8, "distributed")] for fcl, (pe, simd, ramstyle) in zip(fc_layers, config): @@ -144,17 +130,18 @@ def fold_tfc(model): fcl_inst.set_nodeattr("PE", pe) fcl_inst.set_nodeattr("SIMD", simd) fcl_inst.set_nodeattr("ram_style", ramstyle) + fcl_inst.set_nodeattr("mem_mode", "internal_decoupled") # set parallelism for input quantizer to be same as first layer's SIMD - inp_qnt_node = model.get_nodes_by_op_type("Thresholding_Batch")[0] + inp_qnt_node = model.get_nodes_by_op_type("Thresholding_rtl")[0] inp_qnt = getCustomOp(inp_qnt_node) inp_qnt.set_nodeattr("PE", 49) - inp_qnt.set_nodeattr("mem_mode", "decoupled") - inp_qnt.set_nodeattr("runtime_writeable_weights", 1) + # TODO: update PYNQ driver to support runtime writeable weights for RTL Thresholding + # inp_qnt.set_nodeattr("runtime_writeable_weights", 1) return model def fold_lfc(model): - fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation") + fc_layers = model.get_nodes_by_op_type("MVAU_hls") # (PE, SIMD, ramstyle) for each layer config = [ (32, 49, "block"), @@ -168,15 +155,16 @@ def fold_lfc(model): fcl_inst.set_nodeattr("SIMD", simd) fcl_inst.set_nodeattr("ram_style", ramstyle) fcl_inst.set_nodeattr("runtime_writeable_weights", 1) + fcl_inst.set_nodeattr("mem_mode", "internal_decoupled") # set parallelism for input quantizer to be same as first layer's SIMD - inp_qnt_node = 
model.get_nodes_by_op_type("Thresholding_Batch")[0] + inp_qnt_node = model.get_nodes_by_op_type("Thresholding_rtl")[0] inp_qnt = getCustomOp(inp_qnt_node) inp_qnt.set_nodeattr("PE", 49) return model def fold_cnv_large(model): - fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation") + fc_layers = model.get_nodes_by_op_type("MVAU_hls") # each tuple is (PE, SIMD) for a layer folding = [ (16, 3), @@ -193,8 +181,9 @@ def fold_cnv_large(model): fcl_inst = getCustomOp(fcl) fcl_inst.set_nodeattr("PE", pe) fcl_inst.set_nodeattr("SIMD", simd) + fcl_inst.set_nodeattr("mem_mode", "internal_decoupled") - swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator") + swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator_hls") for i in range(len(swg_layers)): swg_inst = getCustomOp(swg_layers[i]) simd = folding[i][1] @@ -203,7 +192,7 @@ def fold_cnv_large(model): def fold_cnv_small(model): - fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation") + fc_layers = model.get_nodes_by_op_type("MVAU_hls") # each tuple is (PE, SIMD) for a layer folding = [ (8, 3, "distributed"), @@ -221,8 +210,9 @@ def fold_cnv_small(model): fcl_inst.set_nodeattr("PE", pe) fcl_inst.set_nodeattr("SIMD", simd) fcl_inst.set_nodeattr("ram_style", ramstyle) + fcl_inst.set_nodeattr("mem_mode", "internal_decoupled") - swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator") + swg_layers = model.get_nodes_by_op_type("ConvolutionInputGenerator_hls") for i in range(len(swg_layers)): swg_inst = getCustomOp(swg_layers[i]) simd = folding[i][1] @@ -269,7 +259,7 @@ def measure_top1_accuracy(model_chkpt, dataset, parent_chkpt=None): raise Exception("Unrecognized dataset") # move from dataset_loader layout to ONNX layout: NHWC -> NCHW testx = testx.transpose(0, 3, 1, 2) - model = ModelWrapper(model_chkpt) + model = load_test_checkpoint_or_skip(model_chkpt) iname = model.graph.input[0].name oname = model.graph.output[0].name if parent_chkpt is None: @@ -309,42 +299,179 
@@ def topology2dataset(topology): raise Exception("Unrecognized topology") -@pytest.mark.parametrize("wbits", [1, 2]) -@pytest.mark.parametrize("abits", [1, 2]) -@pytest.mark.parametrize("topology", ["lfc", "tfc", "cnv"]) -@pytest.mark.parametrize("QONNX_export", [False, True]) -@pytest.mark.end2end +def deploy_based_on_board(model, model_title, topology, wbits, abits, board): + # Check if a deployment directory for this board type already exists + if ("FINN_DEPLOY_DIR" in os.environ) and (board in os.environ["FINN_DEPLOY_DIR"]): + deploy_dir_root = os.environ["FINN_DEPLOY_DIR"] + else: + deploy_dir_root = make_build_dir(prefix="hw_deployment_" + board + "_") + # Set it for the next round if multiple bitstreams are selected for generation + os.environ["FINN_DEPLOY_DIR"] = deploy_dir_root + + # create directory for deployment files + deployment_dir = deploy_dir_root + "/" + board + "/" + model_title + os.makedirs(deployment_dir) + model.set_metadata_prop("pynq_deployment_dir", deployment_dir) + + # get and copy necessary files + # .bit and .hwh file + bitfile = model.get_metadata_prop("bitfile") + hwh_file = model.get_metadata_prop("hw_handoff") + deploy_files = [bitfile, hwh_file] + + for dfile in deploy_files: + if dfile is not None: + copy(dfile, deployment_dir) + + # create input and output test files + (input_tensor_npy, output_tensor_npy) = get_golden_io_pair( + topology, wbits, abits, return_topk=1 + ) + + # Some changes are required in order to prepare the input tensor data for hardware + # testing. The ONNX graphs for these models contain nodes that manipulate the input + # tensor shape which FINN considers when creating the model. The same input tensor + # shaping needs to be done here on the input data. + # For the convolutional models, the graph contains the Transpose node. The Brevitas + # model works in NCHW layout but the FINN kernels are optimized for NHWC. 
+ # The FC models contain a Reshape node, which FINN uses, so we therefore have to + # reshape the input tensor data to match the reshaping in the model + if topology == "cnv": + input_tensor_npy = input_tensor_npy.transpose(0, 2, 3, 1) + else: + input_shape = input_tensor_npy.shape + new_input_shape = (input_shape[0], np.prod(input_shape[1:])) + input_tensor_npy = input_tensor_npy.reshape(new_input_shape) + + np.save(os.path.join(deployment_dir, "input.npy"), input_tensor_npy.copy()) + np.save(os.path.join(deployment_dir, "output_reference.npy"), output_tensor_npy) + + # driver.py and python libraries + pynq_driver_dir = model.get_metadata_prop("pynq_driver_dir") + copy_tree(pynq_driver_dir, deployment_dir) + model.set_metadata_prop("pynq_deploy_dir", deployment_dir) + + +# parameters that make up inputs to test case(s) +def get_full_parameterized_test_list(marker, wbits_list, abits_list, topology_list, board_list): + test_cases = [ + ( + f"{marker}_w{param1}_a{param2}_{param3}_{param4}", + { + "wbits": param1, + "abits": param2, + "topology": param3, + "board": param4, + }, + ) + for param1, param2, param3, param4 in itertools.product( + wbits_list, + abits_list, + topology_list, + board_list, + ) + ] + return test_cases + + +def pytest_generate_tests(metafunc): + idlist = [] + argvalues = [] + scenarios = [] + + # Full set of test parameters + wbits = [1, 2] + abits = [1, 2] + topology = ["lfc", "tfc", "cnv"] + + # Separate the full list of markers used on command line. 
+ # This allows a user to select multiple markers + all_markers_used = metafunc.config.getoption("-m").split(" ") + + for marker in all_markers_used: + if "sanity_bnn" in marker: + # Define a set of sanity tests that target each of + # the supported boards with fixed parameters + scenarios.extend( + get_full_parameterized_test_list( + "sanity_bnn", + wbits_list=[1], + abits_list=[1], + topology_list=["lfc"], + board_list=[test_board_map[0]], + ) + ) + scenarios.extend( + get_full_parameterized_test_list( + "sanity_bnn", + wbits_list=[1], + abits_list=[2], + topology_list=["cnv"], + board_list=[test_board_map[1]], + ) + ) + scenarios.extend( + get_full_parameterized_test_list( + "sanity_bnn", + wbits_list=[2], + abits_list=[2], + topology_list=["tfc"], + board_list=[test_board_map[2]], + ) + ) + scenarios.extend( + get_full_parameterized_test_list( + "sanity_bnn", + wbits_list=[2], + abits_list=[2], + topology_list=["cnv"], + board_list=[test_board_map[3]], + ) + ) + + if "bnn_" in marker: + # Target the full set of parameters for a single board + # Extract the board name from the marker used, as it is in the form of 'bnn_' + bnn_board = next( + (element for element in test_board_map if marker.split("_")[1] in element.lower()), + None, + ) + test_cases = get_full_parameterized_test_list( + "bnn", wbits, abits, topology, [bnn_board] + ) + scenarios.extend(test_cases) + + if len(scenarios) > 0: + for scenario in scenarios: + idlist.append(scenario[0]) + items = scenario[1].items() + argnames = [x[0] for x in items] + argvalues.append([x[1] for x in items]) + metafunc.parametrize(argnames, argvalues, ids=idlist, scope="class") + + +@pytest.mark.sanity_bnn +@pytest.mark.bnn_pynq +@pytest.mark.bnn_zcu104 +@pytest.mark.bnn_kv260 +@pytest.mark.bnn_u250 class TestEnd2End: - def test_export(self, topology, wbits, abits, QONNX_export): + def test_export(self, topology, wbits, abits, board): if wbits > abits: pytest.skip("No wbits > abits end2end network configs for now") if 
topology == "lfc" and not (wbits == 1 and abits == 1): pytest.skip("Skipping certain lfc configs") (model, ishape) = get_trained_network_and_ishape(topology, wbits, abits) - chkpt_name = get_checkpoint_name(topology, wbits, abits, QONNX_export, "export") - if QONNX_export: - BrevitasONNXManager.export(model, ishape, chkpt_name) - qonnx_cleanup(chkpt_name, out_file=chkpt_name) - model = ModelWrapper(chkpt_name) - model = model.transform(ConvertQONNXtoFINN()) - model.save(chkpt_name) - else: - bo.export_finn_onnx(model, ishape, chkpt_name) - nname = "%s_w%da%d" % (topology, wbits, abits) - update_dashboard_data(topology, wbits, abits, "network", nname) - dtstr = datetime.now().strftime("%Y-%m-%d %H:%M:%S") - update_dashboard_data(topology, wbits, abits, "datetime", dtstr) - finn_commit = subprocess.check_output( - ["git", "rev-parse", "HEAD"], cwd=get_finn_root() - ) - finn_commit = finn_commit.decode("utf-8").strip() - update_dashboard_data(topology, wbits, abits, "finn-commit", finn_commit) + chkpt_name = get_checkpoint_name(topology, wbits, abits, "export") + export_qonnx(model, torch.randn(ishape), chkpt_name, opset_version=13) + qonnx_cleanup(chkpt_name, out_file=chkpt_name) + model = ModelWrapper(chkpt_name) + model = model.transform(ConvertQONNXtoFINN()) + model.save(chkpt_name) assert os.path.isfile(chkpt_name) - def test_import_and_tidy(self, topology, wbits, abits, QONNX_export): - prev_chkpt_name = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "export" - ) + def test_import_and_tidy(self, topology, wbits, abits, board): + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "export") model = load_test_checkpoint_or_skip(prev_chkpt_name) model = model.transform(InferShapes()) model = model.transform(FoldConstants()) @@ -352,24 +479,22 @@ def test_import_and_tidy(self, topology, wbits, abits, QONNX_export): model = model.transform(GiveReadableTensorNames()) model = model.transform(InferDataTypes()) model = 
model.transform(RemoveStaticGraphInputs()) - chkpt = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "import_and_tidy" - ) + chkpt = get_checkpoint_name(topology, wbits, abits, "import_and_tidy") model.save(chkpt) - def test_add_pre_and_postproc(self, topology, wbits, abits, QONNX_export): - prev_chkpt_name = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "import_and_tidy" - ) + def test_add_pre_and_postproc(self, topology, wbits, abits, board): + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "import_and_tidy") model = load_test_checkpoint_or_skip(prev_chkpt_name) global_inp_name = model.graph.input[0].name ishape = model.get_tensor_shape(global_inp_name) # preprocessing: torchvision's ToTensor divides uint8 inputs by 255 totensor_pyt = ToTensor() - chkpt_preproc_name = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "preproc" - ) - bo.export_finn_onnx(totensor_pyt, ishape, chkpt_preproc_name) + chkpt_preproc_name = get_checkpoint_name(topology, wbits, abits, "preproc") + export_qonnx(totensor_pyt, torch.randn(ishape), chkpt_preproc_name, opset_version=13) + qonnx_cleanup(chkpt_preproc_name, out_file=chkpt_preproc_name) + pre_model = ModelWrapper(chkpt_preproc_name) + pre_model = pre_model.transform(ConvertQONNXtoFINN()) + pre_model.save(chkpt_preproc_name) assert os.path.isfile(chkpt_preproc_name) # join preprocessing and core model pre_model = ModelWrapper(chkpt_preproc_name) @@ -381,9 +506,7 @@ def test_add_pre_and_postproc(self, topology, wbits, abits, QONNX_export): model.set_tensor_datatype(global_inp_name, DataType["UINT8"]) # postprocessing: insert Top-1 node at the end model = model.transform(InsertTopK(k=1)) - chkpt_name = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "pre_post" - ) + chkpt_name = get_checkpoint_name(topology, wbits, abits, "pre_post") # tidy-up again model = model.transform(InferShapes()) model = model.transform(FoldConstants()) @@ -394,10 +517,8 @@ def 
test_add_pre_and_postproc(self, topology, wbits, abits, QONNX_export): model.save(chkpt_name) assert os.path.isfile(chkpt_name) - def test_streamline(self, topology, wbits, abits, QONNX_export): - prev_chkpt_name = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "pre_post" - ) + def test_streamline(self, topology, wbits, abits, board): + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "pre_post") model = load_test_checkpoint_or_skip(prev_chkpt_name) model = model.transform(absorb.AbsorbSignBiasIntoMultiThreshold()) # move past any reshapes to be able to streamline input scaling @@ -413,66 +534,58 @@ def test_streamline(self, topology, wbits, abits, QONNX_export): model = model.transform(absorb.AbsorbScalarMulAddIntoTopK()) model = model.transform(InferDataLayouts()) model = model.transform(RemoveUnusedTensors()) - model.save( - get_checkpoint_name(topology, wbits, abits, QONNX_export, "streamline") - ) + model.save(get_checkpoint_name(topology, wbits, abits, "streamline")) - def test_convert_to_hls_layers(self, topology, wbits, abits, QONNX_export): - prev_chkpt_name = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "streamline" - ) + def test_convert_to_hw_layers(self, topology, wbits, abits, board): + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "streamline") model = load_test_checkpoint_or_skip(prev_chkpt_name) if topology == "tfc" and wbits == 1 and abits == 1: # use standalone thresholds for tfc-w1a1 to also exercise that option - model = model.transform(to_hls.InferThresholdingLayer()) + model = model.transform(to_hw.InferThresholdingLayer()) # needed for bipolar MatMul layers - model = model.transform(to_hls.InferBinaryMatrixVectorActivation(mem_mode)) + model = model.transform(to_hw.InferBinaryMatrixVectorActivation()) # needed for non-bipolar MatMul layers - model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode)) + model = 
model.transform(to_hw.InferQuantizedMatrixVectorActivation()) # TopK to LabelSelect - model = model.transform(to_hls.InferLabelSelectLayer()) + model = model.transform(to_hw.InferLabelSelectLayer()) # input quantization (if any) to standalone thresholding - model = model.transform(to_hls.InferThresholdingLayer()) + model = model.transform(to_hw.InferThresholdingLayer()) # needed for convolutions if "fc" not in topology: - model = model.transform(to_hls.InferConvInpGen()) - model = model.transform(to_hls.InferStreamingMaxPool()) + model = model.transform(to_hw.InferConvInpGen()) + model = model.transform(to_hw.InferStreamingMaxPool()) model = model.transform(RemoveCNVtoFCFlatten()) # get rid of Tranpose -> Tranpose identity seq model = model.transform(absorb.AbsorbConsecutiveTransposes()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(InferDataLayouts()) - model.save( - get_checkpoint_name( - topology, wbits, abits, QONNX_export, "convert_to_hls_layers" - ) - ) + model.save(get_checkpoint_name(topology, wbits, abits, "convert_to_hw_layers")) exp_layer_counts = { "tfc": [ ("Reshape", 1), - ("Thresholding_Batch", 1), - ("MatrixVectorActivation", 4), - ("LabelSelect_Batch", 1), + ("Thresholding", 1), + ("MVAU", 4), + ("LabelSelect", 1), ], "tfc-1-1": [ ("Reshape", 1), - ("Thresholding_Batch", 4), - ("MatrixVectorActivation", 4), - ("LabelSelect_Batch", 1), + ("Thresholding", 4), + ("MVAU", 4), + ("LabelSelect", 1), ], "lfc": [ ("Reshape", 1), - ("Thresholding_Batch", 1), - ("MatrixVectorActivation", 4), - ("LabelSelect_Batch", 1), + ("Thresholding", 1), + ("MVAU", 4), + ("LabelSelect", 1), ], "cnv": [ ("Transpose", 1), - ("Thresholding_Batch", 1), + ("Thresholding", 1), ("ConvolutionInputGenerator", 6), - ("MatrixVectorActivation", 9), - ("StreamingMaxPool_Batch", 2), - ("LabelSelect_Batch", 1), + ("MVAU", 9), + ("StreamingMaxPool", 2), + ("LabelSelect", 1), ], } if topology == "tfc" and wbits == 1 and abits == 1: @@ -480,54 +593,107 @@ def 
test_convert_to_hls_layers(self, topology, wbits, abits, QONNX_export): else: exp_key = topology exp_layer_counts = exp_layer_counts[exp_key] - for (op_type, exp_count) in exp_layer_counts: + for op_type, exp_count in exp_layer_counts: assert len(model.get_nodes_by_op_type(op_type)) == exp_count - def test_create_dataflow_partition(self, topology, wbits, abits, QONNX_export): - prev_chkpt_name = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "convert_to_hls_layers" - ) + def test_specialize_layers(self, topology, wbits, abits, board): + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "convert_to_hw_layers") + model = load_test_checkpoint_or_skip(prev_chkpt_name) + # set preferred impl style to hls for all layers + force_hls_boards = ["Pynq-Z1", "U250"] + if topology == "cnv" and wbits == 2 and abits == 2 and board in force_hls_boards: + for node in model.graph.node: + if is_fpgadataflow_node(node): + inst = getCustomOp(node) + inst.set_nodeattr("preferred_impl_style", "hls") + model = model.transform(SpecializeLayers()) + model = model.transform(GiveUniqueNodeNames()) + model.save(get_checkpoint_name(topology, wbits, abits, "specialize_layers")) + exp_layer_counts = { + "tfc": [ + ("Reshape", 1), + ("Thresholding_rtl", 1), + ("MVAU_hls", 4), + ("LabelSelect_hls", 1), + ], + "tfc-1-1": [ + ("Reshape", 1), + ("Thresholding_rtl", 4), + ("MVAU_hls", 4), + ("LabelSelect_hls", 1), + ], + "lfc": [ + ("Reshape", 1), + ("Thresholding_rtl", 1), + ("MVAU_hls", 4), + ("LabelSelect_hls", 1), + ], + "cnv": [ + ("Transpose", 1), + ("Thresholding_rtl", 1), + ("ConvolutionInputGenerator_rtl", 6), + ("MVAU_hls", 9), + ("StreamingMaxPool_hls", 2), + ("LabelSelect_hls", 1), + ], + "cnv-2-2": [ + ("Transpose", 1), + ("Thresholding_hls", 1), + ("ConvolutionInputGenerator_hls", 6), + ("MVAU_hls", 9), + ("StreamingMaxPool_hls", 2), + ("LabelSelect_hls", 1), + ], + } + if topology == "tfc" and wbits == 1 and abits == 1: + exp_key = "tfc-1-1" + elif topology == 
"cnv" and wbits == 2 and abits == 2 and board in force_hls_boards: + exp_key = "cnv-2-2" + else: + exp_key = topology + exp_layer_counts = exp_layer_counts[exp_key] + for op_type, exp_count in exp_layer_counts: + assert len(model.get_nodes_by_op_type(op_type)) == exp_count + + def test_create_dataflow_partition(self, topology, wbits, abits, board): + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "specialize_layers") model = load_test_checkpoint_or_skip(prev_chkpt_name) parent_model = model.transform(CreateDataflowPartition()) - parent_model_chkpt = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "dataflow_parent" - ) + parent_model_chkpt = get_checkpoint_name(topology, wbits, abits, "dataflow_parent") parent_model.save(parent_model_chkpt) sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0] sdp_node = getCustomOp(sdp_node) dataflow_model_filename = sdp_node.get_nodeattr("model") dataflow_model = load_test_checkpoint_or_skip(dataflow_model_filename) - dataflow_model_chkpt = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "dataflow_model" - ) + dataflow_model_chkpt = get_checkpoint_name(topology, wbits, abits, "dataflow_model") dataflow_model.save(dataflow_model_chkpt) - def test_fold(self, topology, wbits, abits, QONNX_export): - prev_chkpt_name = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "dataflow_model" - ) + def test_fold(self, topology, wbits, abits, board): + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "dataflow_model") model = load_test_checkpoint_or_skip(prev_chkpt_name) folding_fxn = get_folding_function(topology, wbits, abits) model = folding_fxn(model) - model.save(get_checkpoint_name(topology, wbits, abits, QONNX_export, "fold")) + model.save(get_checkpoint_name(topology, wbits, abits, "fold")) + + def test_minimize_bit_width(self, topology, wbits, abits, board): + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "fold") + model = 
load_test_checkpoint_or_skip(prev_chkpt_name) + model = model.transform(MinimizeAccumulatorWidth()) + model = model.transform(MinimizeWeightBitWidth()) + curr_chkpt_name = get_checkpoint_name(topology, wbits, abits, "minimize_bit_width") + model.save(curr_chkpt_name) @pytest.mark.slow @pytest.mark.vivado - def test_cppsim(self, topology, wbits, abits, QONNX_export): - prev_chkpt_name = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "fold" - ) + def test_cppsim(self, topology, wbits, abits, board): + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "minimize_bit_width") model = load_test_checkpoint_or_skip(prev_chkpt_name) model = model.transform(PrepareCppSim()) model = model.transform(CompileCppSim()) model = model.transform(SetExecMode("cppsim")) - cppsim_chkpt = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "cppsim" - ) + cppsim_chkpt = get_checkpoint_name(topology, wbits, abits, "cppsim") model.save(cppsim_chkpt) - parent_chkpt = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "dataflow_parent" - ) + parent_chkpt = get_checkpoint_name(topology, wbits, abits, "dataflow_parent") (input_tensor_npy, output_tensor_npy) = get_golden_io_pair( topology, wbits, abits, return_topk=1 ) @@ -536,56 +702,51 @@ def test_cppsim(self, topology, wbits, abits, QONNX_export): @pytest.mark.slow @pytest.mark.vivado - @pytest.mark.parametrize("kind", ["zynq", "alveo"]) - def test_ipgen(self, topology, wbits, abits, QONNX_export, kind): - if kind == "alveo" and ("VITIS_PATH" not in os.environ): + def test_ipgen(self, topology, wbits, abits, board): + build_data = get_build_env(board, target_clk_ns) + if build_data["kind"] == "alveo" and ("VITIS_PATH" not in os.environ): pytest.skip("VITIS_PATH not set") - prev_chkpt_name = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "fold" - ) + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "fold") model = load_test_checkpoint_or_skip(prev_chkpt_name) - 
test_fpga_part = get_build_env(kind, target_clk_ns)["part"] model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) + model = model.transform(PrepareIP(build_data["part"], target_clk_ns)) model = model.transform(HLSSynthIP()) - model.save( - get_checkpoint_name(topology, wbits, abits, QONNX_export, "ipgen_" + kind) - ) + model.save(get_checkpoint_name(topology, wbits, abits, "ipgen_" + board)) @pytest.mark.slow @pytest.mark.vivado - @pytest.mark.parametrize("kind", ["zynq", "alveo"]) - def test_set_fifo_depths(self, topology, wbits, abits, QONNX_export, kind): - prev_chkpt_name = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "ipgen_" + kind - ) + def test_set_fifo_depths(self, topology, wbits, abits, board): + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "ipgen_" + board) model = load_test_checkpoint_or_skip(prev_chkpt_name) - test_fpga_part = get_build_env(kind, target_clk_ns)["part"] - model = model.transform(InsertAndSetFIFODepths(test_fpga_part, target_clk_ns)) - fifo_layers = model.get_nodes_by_op_type("StreamingFIFO") - assert len(fifo_layers) > 0 - model.save( - get_checkpoint_name( - topology, wbits, abits, QONNX_export, "fifodepth_" + kind + test_fpga_part = get_build_env(board, target_clk_ns)["part"] + if topology == "cnv" and wbits == 2 and abits == 2 and board == "Pynq-Z1": + # Enabling swg_exception for this single test case. Disabling the exception results in + # a design that exceeds the resources of the Pynq-Z1 board. In future this should be + # revisited and handled correctly as the swg_exception is poorly justified. 
+ model = model.transform( + InsertAndSetFIFODepths(test_fpga_part, target_clk_ns, swg_exception=True) ) - ) + else: + model = model.transform(InsertAndSetFIFODepths(test_fpga_part, target_clk_ns)) + + fifo_layers = model.get_nodes_by_op_type("StreamingFIFO_rtl") + assert len(fifo_layers) > 0 + model.save(get_checkpoint_name(topology, wbits, abits, "fifodepth_" + board)) @pytest.mark.slow @pytest.mark.vivado - @pytest.mark.parametrize("kind", ["zynq"]) - def test_ipstitch_rtlsim(self, topology, wbits, abits, QONNX_export, kind): - prev_chkpt_name = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "fifodepth_" + kind - ) + def test_ipstitch_rtlsim(self, topology, wbits, abits, board): + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "fifodepth_" + board) model = load_test_checkpoint_or_skip(prev_chkpt_name) - test_fpga_part = get_build_env(kind, target_clk_ns)["part"] + test_fpga_part = get_build_env(board, target_clk_ns)["part"] model = model.transform(InsertDWC()) + model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(AnnotateCycles()) perf = model.analysis(dataflow_performance) latency = perf["critical_path_cycles"] # rtlsim only supports impl_style=rtl for StreamingFIFO, ensure that - for fifo_layer in model.get_nodes_by_op_type("StreamingFIFO"): + for fifo_layer in model.get_nodes_by_op_type("StreamingFIFO_rtl"): getCustomOp(fifo_layer).set_nodeattr("impl_style", "rtl") model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) @@ -593,17 +754,11 @@ def test_ipstitch_rtlsim(self, topology, wbits, abits, QONNX_export, kind): model.set_metadata_prop("exec_mode", "rtlsim") os.environ["LIVENESS_THRESHOLD"] = str(int(latency * 1.1)) if rtlsim_trace: - model.set_metadata_prop( - "rtlsim_trace", "%s_w%da%d.vcd" % (topology, wbits, abits) - ) + model.set_metadata_prop("rtlsim_trace", "%s_w%da%d.vcd" % (topology, wbits, abits)) 
os.environ["RTLSIM_TRACE_DEPTH"] = "3" - rtlsim_chkpt = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "ipstitch_rtlsim_" + kind - ) + rtlsim_chkpt = get_checkpoint_name(topology, wbits, abits, "ipstitch_rtlsim_" + board) model.save(rtlsim_chkpt) - parent_chkpt = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "dataflow_parent" - ) + parent_chkpt = get_checkpoint_name(topology, wbits, abits, "dataflow_parent") (input_tensor_npy, output_tensor_npy) = get_golden_io_pair( topology, wbits, abits, return_topk=1 ) @@ -612,11 +767,8 @@ def test_ipstitch_rtlsim(self, topology, wbits, abits, QONNX_export, kind): @pytest.mark.slow @pytest.mark.vivado - @pytest.mark.parametrize("kind", ["zynq"]) - def test_throughput_rtlsim(self, topology, wbits, abits, QONNX_export, kind): - prev_chkpt_name = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "ipstitch_rtlsim_" + kind - ) + def test_throughput_rtlsim(self, topology, wbits, abits, board): + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "ipstitch_rtlsim_" + board) model = load_test_checkpoint_or_skip(prev_chkpt_name) n_nodes = len(model.graph.node) perf_est = model.analysis(dataflow_performance) @@ -627,33 +779,18 @@ def test_throughput_rtlsim(self, topology, wbits, abits, QONNX_export, kind): ret = throughput_test_rtlsim(model, batchsize=batchsize) res_cycles = ret["cycles"] est_cycles = latency + cycles_per_sample_est * batchsize - # warnings.warn("Estimated & rtlsim performance: " + str(perf)) - # for (k, v) in perf.items(): - # update_dashboard_data(topology, wbits, abits, k, v) - update_dashboard_data(topology, wbits, abits, "cycles_rtlsim", latency) assert (abs(res_cycles - est_cycles) / res_cycles) < 0.15 @pytest.mark.slow @pytest.mark.vivado - @pytest.mark.parametrize("kind", ["zynq"]) - def test_validate_top1(self, topology, wbits, abits, QONNX_export, kind): + def test_validate_top1(self, topology, wbits, abits, board): if "TEST_END2END_VALIDATE_TOP1" not in 
os.environ: pytest.skip("TEST_END2END_VALIDATE_TOP1 not set") - prepostproc_chkpt = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "pre_post" - ) - streamline_chkpt = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "streamline" - ) - parent_chkpt = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "dataflow_parent" - ) - cppsim_chkpt = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "cppsim" - ) - rtlsim_chkpt = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "ipstitch_rtlsim_" + kind - ) + prepostproc_chkpt = get_checkpoint_name(topology, wbits, abits, "pre_post") + streamline_chkpt = get_checkpoint_name(topology, wbits, abits, "streamline") + parent_chkpt = get_checkpoint_name(topology, wbits, abits, "dataflow_parent") + cppsim_chkpt = get_checkpoint_name(topology, wbits, abits, "cppsim") + rtlsim_chkpt = get_checkpoint_name(topology, wbits, abits, "ipstitch_rtlsim_" + board) dataset = topology2dataset(topology) assert measure_top1_accuracy(prepostproc_chkpt, dataset) > 80 assert measure_top1_accuracy(streamline_chkpt, dataset) > 80 @@ -663,156 +800,33 @@ def test_validate_top1(self, topology, wbits, abits, QONNX_export, kind): @pytest.mark.slow @pytest.mark.vivado @pytest.mark.vitis - @pytest.mark.parametrize("kind", ["zynq", "alveo"]) - def test_build(self, topology, wbits, abits, QONNX_export, kind): - if kind == "alveo" and ("VITIS_PATH" not in os.environ): + def test_build(self, topology, wbits, abits, board): + build_data = get_build_env(board, target_clk_ns) + if build_data["kind"] == "alveo" and ("VITIS_PATH" not in os.environ): pytest.skip("VITIS_PATH not set") - prev_chkpt_name = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "fifodepth_" + kind - ) + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "fifodepth_" + board) model = load_test_checkpoint_or_skip(prev_chkpt_name) - cfg = get_build_env(kind, target_clk_ns) - model = model.transform(cfg["build_fxn"]) + 
model = model.transform(build_data["build_fxn"]) model = model.transform(AnnotateResources("synth")) - synth_dct = eval(model.get_metadata_prop("res_total_top_synth")) - for (k, v) in synth_dct.items(): - update_dashboard_data(topology, wbits, abits, k, v) - update_dashboard_data(topology, wbits, abits, "board", cfg["board"]) - model.save( - get_checkpoint_name(topology, wbits, abits, QONNX_export, "build_" + kind) - ) + model.save(get_checkpoint_name(topology, wbits, abits, "build_" + board)) @pytest.mark.slow @pytest.mark.vivado @pytest.mark.vitis - @pytest.mark.parametrize("kind", ["zynq", "alveo"]) - def test_make_pynq_driver(self, topology, wbits, abits, QONNX_export, kind): - if kind == "alveo" and ("VITIS_PATH" not in os.environ): + def test_make_pynq_driver(self, topology, wbits, abits, board): + build_data = get_build_env(board, target_clk_ns) + if build_data["kind"] == "alveo" and ("VITIS_PATH" not in os.environ): pytest.skip("VITIS_PATH not set") - prev_chkpt_name = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "build_" + kind - ) + prev_chkpt_name = get_checkpoint_name(topology, wbits, abits, "build_" + board) model = load_test_checkpoint_or_skip(prev_chkpt_name) - kind_to_driver_platform = {"zynq": "zynq-iodma", "alveo": "alveo"} - model = model.transform(MakePYNQDriver(kind_to_driver_platform[kind])) - model.save( - get_checkpoint_name(topology, wbits, abits, QONNX_export, "driver_" + kind) - ) + board_to_driver_platform = "alveo" if build_data["kind"] == "alveo" else "zynq-iodma" + model = model.transform(MakePYNQDriver(board_to_driver_platform)) + model.save(get_checkpoint_name(topology, wbits, abits, "driver_" + board)) - @pytest.mark.parametrize("kind", ["zynq", "alveo"]) - def test_deploy(self, topology, wbits, abits, QONNX_export, kind): - prev_chkpt_name = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "driver_" + kind - ) + def test_deploy(self, topology, wbits, abits, board): + prev_chkpt_name = 
get_checkpoint_name(topology, wbits, abits, "driver_" + board) model = load_test_checkpoint_or_skip(prev_chkpt_name) - cfg = get_build_env(kind, target_clk_ns) - if cfg["ip"] == "": - pytest.skip("PYNQ board IP address not specified") - model = model.transform( - DeployToPYNQ( - cfg["ip"], - cfg["port"], - cfg["username"], - cfg["password"], - cfg["target_dir"], - ) - ) + model_title = "%s_w%d_a%d_%s" % ("bnn", wbits, abits, topology) + deploy_based_on_board(model, model_title, topology, wbits, abits, board) # save the model to be able to link it to the parent - model.save( - get_checkpoint_name(topology, wbits, abits, QONNX_export, "deploy_" + kind) - ) - - @pytest.mark.parametrize("kind", ["zynq", "alveo"]) - def test_run_on_hw(self, topology, wbits, abits, QONNX_export, kind): - prev_chkpt_name = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "deploy_" + kind - ) - model = load_test_checkpoint_or_skip(prev_chkpt_name) # NOQA - cfg = get_build_env(kind, target_clk_ns) - if cfg["ip"] == "": - pytest.skip("PYNQ board IP address not specified") - (input_tensor_npy, output_tensor_npy) = get_golden_io_pair( - topology, wbits, abits, return_topk=1 - ) - parent_model = load_test_checkpoint_or_skip( - get_checkpoint_name(topology, wbits, abits, QONNX_export, "dataflow_parent") - ) - iname = parent_model.graph.input[0].name - oname = parent_model.graph.output[0].name - sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0] - sdp_node = getCustomOp(sdp_node) - sdp_node.set_nodeattr("model", prev_chkpt_name) - ret = execute_onnx(parent_model, {iname: input_tensor_npy}, True) - y = ret[oname] - assert np.isclose(y, output_tensor_npy).all() - - @pytest.mark.parametrize("kind", ["zynq", "alveo"]) - def test_throughput_hw(self, topology, wbits, abits, QONNX_export, kind): - prev_chkpt_name = get_checkpoint_name( - topology, wbits, abits, QONNX_export, "deploy_" + kind - ) - end2end_example = "%s_w%da%d_%s" % (topology, wbits, abits, kind) - 
model = load_test_checkpoint_or_skip(prev_chkpt_name) # NOQA - cfg = get_build_env(kind, target_clk_ns) - if cfg["ip"] == "": - pytest.skip("PYNQ board IP address not specified") - ret = dict() - # try a range of batch sizes, some may fail due to insufficient DMA - # buffers - bsize_range_in = [8**i for i in range(5)] - bsize_range = [] - for bsize in bsize_range_in: - res = throughput_test_remote(model, bsize) - if res is not None: - ret[bsize] = res - bsize_range.append(bsize) - else: - # assume we reached largest possible N - break - y = [ret[key]["runtime[ms]"] for key in bsize_range] - lrret = linregress(bsize_range, y) - ret_str = "" - ret_str += "\n" + "%s Throughput Test Results" % end2end_example - ret_str += "\n" + "-----------------------------" - ret_str += "\n" + "From linear regression:" - ret_str += "\n" + "Invocation overhead: %f ms" % lrret.intercept - ret_str += "\n" + "Time per sample: %f ms" % lrret.slope - ret_str += "\n" + "Raw data:" - - ret_str += "\n" + "{:<8} {:<16} {:<16} {:<16} {:<16} {:<16}".format( - "N", "runtime[ms]", "fclk[mhz]", "fps", "DRAM rd[MB/s]", "DRAM wr[MB/s]" - ) - for k in bsize_range: - v = ret[k] - ret_str += "\n" + "{:<8} {:<16} {:<16} {:<16} {:<16} {:<16}".format( - k, - np.round(v["runtime[ms]"], 4), - v["fclk[mhz]"], - np.round(v["throughput[images/s]"], 2), - np.round(v["DRAM_in_bandwidth[MB/s]"], 2), - np.round(v["DRAM_out_bandwidth[MB/s]"], 2), - ) - ret_str += "\n" + "-----------------------------" - warnings.warn(ret_str) - largest_bsize = bsize_range[-1] - update_dashboard_data( - topology, wbits, abits, "fclk[mhz]", ret[largest_bsize]["fclk[mhz]"] - ) - update_dashboard_data( - topology, - wbits, - abits, - "throughput[images/s]", - ret[largest_bsize]["throughput[images/s]"], - ) - - def test_upload_results_to_dashboard(self, topology, wbits, abits, QONNX_export): - # ToDo: Extend the dashboard to also upload QONNX exported models? 
- if QONNX_export: - pytest.skip("Dashboard data upload is disabled for QONNX exported models.") - else: - dashboard_data = get_dashboard_data(topology, wbits, abits) - if len(dashboard_data.keys()) > 0: - upload_to_end2end_dashboard(dashboard_data) - else: - pytest.skip("No data to upload to dashboard") + model.save(get_checkpoint_name(topology, wbits, abits, "deploy_" + board)) diff --git a/tests/end2end/test_end2end_cybsec_mlp.py b/tests/end2end/test_end2end_cybsec_mlp.py index 290afc3084..9ee07d57a3 100644 --- a/tests/end2end/test_end2end_cybsec_mlp.py +++ b/tests/end2end/test_end2end_cybsec_mlp.py @@ -26,23 +26,17 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import pkg_resources as pk - import pytest -import brevitas.onnx as bo import json import numpy as np import os import shutil -import subprocess import torch import torch.nn as nn -import wget from brevitas.core.quant import QuantType -from brevitas.export.onnx.generic.manager import BrevitasONNXManager +from brevitas.export import export_qonnx from brevitas.nn import QuantIdentity, QuantLinear, QuantReLU -from brevitas.quant_tensor import QuantTensor from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper from qonnx.util.cleanup import cleanup as qonnx_cleanup @@ -51,20 +45,20 @@ import finn.builder.build_dataflow_config as build_cfg from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.util.basic import make_build_dir -from finn.util.test import get_build_env, load_test_checkpoint_or_skip +from finn.util.test import load_test_checkpoint_or_skip target_clk_ns = 10 -build_kind = "zynq" +build_board = "Pynq-Z1" build_dir = os.environ["FINN_BUILD_DIR"] -def get_checkpoint_name(step, QONNX_export): +def get_checkpoint_name(step): if step == "build": # checkpoint for build step is an entire dir - return build_dir + 
"/end2end_cybsecmlp_build_QONNX-%d" % (QONNX_export) + return build_dir + "/end2end_cybsecmlp_build" else: # other checkpoints are onnx files - return build_dir + "/end2end_cybsecmlp_QONNX-%d_%s.onnx" % (QONNX_export, step) + return build_dir + "/end2end_cybsecmlp_%s.onnx" % step class CybSecMLPForExport(nn.Module): @@ -85,10 +79,9 @@ def forward(self, x): return out_final -@pytest.mark.parametrize("QONNX_export", [False, True]) @pytest.mark.end2end -def test_end2end_cybsec_mlp_export(QONNX_export): - assets_dir = pk.resource_filename("finn.qnn-data", "cybsec-mlp/") +def test_end2end_cybsec_mlp_export(): + assets_dir = os.environ["FINN_ROOT"] + "/src/finn/qnn-data/cybsec-mlp" # load up trained net in Brevitas input_size = 593 hidden1 = 64 @@ -112,88 +105,57 @@ def test_end2end_cybsec_mlp_export(QONNX_export): QuantReLU(bit_width=act_bit_width), QuantLinear(hidden3, num_classes, bias=True, weight_bit_width=weight_bit_width), ) - trained_state_dict = torch.load(assets_dir + "/state_dict.pth")[ - "models_state_dict" - ][0] + trained_state_dict = torch.load(assets_dir + "/state_dict.pth")["models_state_dict"][0] model.load_state_dict(trained_state_dict, strict=False) W_orig = model[0].weight.data.detach().numpy() # pad the second (593-sized) dimensions with 7 zeroes at the end W_new = np.pad(W_orig, [(0, 0), (0, 7)]) model[0].weight.data = torch.from_numpy(W_new) model_for_export = CybSecMLPForExport(model) - export_onnx_path = get_checkpoint_name("export", QONNX_export) + export_onnx_path = get_checkpoint_name("export") input_shape = (1, 600) - # create a QuantTensor instance to mark the input as bipolar during export - input_a = np.random.randint(0, 1, size=input_shape).astype(np.float32) - input_a = 2 * input_a - 1 - scale = 1.0 - input_t = torch.from_numpy(input_a * scale) - input_qt = QuantTensor( - input_t, scale=torch.tensor(scale), bit_width=torch.tensor(1.0), signed=True - ) - if QONNX_export: - # With the BrevitasONNXManager we need to manually set - # the 
FINN DataType at the input - BrevitasONNXManager.export( - model_for_export, input_shape, export_path=export_onnx_path - ) - model = ModelWrapper(export_onnx_path) - model.set_tensor_datatype(model.graph.input[0].name, DataType["BIPOLAR"]) - model.save(export_onnx_path) - qonnx_cleanup(export_onnx_path, out_file=export_onnx_path) - model = ModelWrapper(export_onnx_path) - model = model.transform(ConvertQONNXtoFINN()) - model.save(export_onnx_path) - else: - bo.export_finn_onnx( - model_for_export, export_path=export_onnx_path, input_t=input_qt - ) + # With the onnx export from Brevitas we need to manually set + # the FINN DataType at the input + export_qonnx(model_for_export, torch.randn(input_shape), export_path=export_onnx_path) + model = ModelWrapper(export_onnx_path) + model.set_tensor_datatype(model.graph.input[0].name, DataType["BIPOLAR"]) + model.save(export_onnx_path) + qonnx_cleanup(export_onnx_path, out_file=export_onnx_path) + model = ModelWrapper(export_onnx_path) + model = model.transform(ConvertQONNXtoFINN()) assert os.path.isfile(export_onnx_path) # fix input datatype - finn_model = ModelWrapper(export_onnx_path) - finnonnx_in_tensor_name = finn_model.graph.input[0].name - assert tuple(finn_model.get_tensor_shape(finnonnx_in_tensor_name)) == (1, 600) + finnonnx_in_tensor_name = model.graph.input[0].name + assert tuple(model.get_tensor_shape(finnonnx_in_tensor_name)) == (1, 600) # verify a few exported ops - if QONNX_export: - # The first "Mul" node doesn't exist in the QONNX export, - # because the QuantTensor scale is not exported. - # However, this node would have been unity scale anyways and - # the models are still equivalent. 
- assert finn_model.graph.node[0].op_type == "Add" - assert finn_model.graph.node[1].op_type == "Div" - assert finn_model.graph.node[2].op_type == "MatMul" - assert finn_model.graph.node[-1].op_type == "MultiThreshold" - else: - assert finn_model.graph.node[0].op_type == "Mul" - assert finn_model.get_initializer(finn_model.graph.node[0].input[1]) == 1.0 - assert finn_model.graph.node[1].op_type == "Add" - assert finn_model.graph.node[2].op_type == "Div" - assert finn_model.graph.node[3].op_type == "MatMul" - assert finn_model.graph.node[-1].op_type == "MultiThreshold" + # The first "Mul" node doesn't exist in the QONNX export, + # because the QuantTensor scale is not exported. + # However, this node would have been unity scale anyways and + # the models are still equivalent. + assert model.graph.node[0].op_type == "Add" + assert model.graph.node[1].op_type == "Div" + assert model.graph.node[2].op_type == "MatMul" + assert model.graph.node[-1].op_type == "MultiThreshold" # verify datatypes on some tensors - assert ( - finn_model.get_tensor_datatype(finnonnx_in_tensor_name) == DataType["BIPOLAR"] - ) - first_matmul_w_name = finn_model.get_nodes_by_op_type("MatMul")[0].input[1] - assert finn_model.get_tensor_datatype(first_matmul_w_name) == DataType["INT2"] + assert model.get_tensor_datatype(finnonnx_in_tensor_name) == DataType["BIPOLAR"] + first_matmul_w_name = model.get_nodes_by_op_type("MatMul")[0].input[1] + assert model.get_tensor_datatype(first_matmul_w_name) == DataType["INT2"] @pytest.mark.slow @pytest.mark.vivado @pytest.mark.end2end -@pytest.mark.parametrize("QONNX_export", [False, True]) -def test_end2end_cybsec_mlp_build(QONNX_export): - model_file = get_checkpoint_name("export", QONNX_export) +def test_end2end_cybsec_mlp_build(): + model_file = get_checkpoint_name("export") load_test_checkpoint_or_skip(model_file) - build_env = get_build_env(build_kind, target_clk_ns) - output_dir = make_build_dir(f"test_end2end_cybsec_mlp_build_QONNX-{QONNX_export}") + 
output_dir = make_build_dir("test_end2end_cybsec_mlp_build") cfg = build.DataflowBuildConfig( output_dir=output_dir, target_fps=1000000, synth_clk_period_ns=target_clk_ns, - board=build_env["board"], + board=build_board, shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, generate_outputs=[ build_cfg.DataflowOutputType.ESTIMATE_REPORTS, @@ -206,6 +168,7 @@ def test_end2end_cybsec_mlp_build(QONNX_export): # check the generated files assert os.path.isfile(output_dir + "/time_per_step.json") assert os.path.isfile(output_dir + "/final_hw_config.json") + assert os.path.isfile(output_dir + "/template_specialize_layers_config.json") assert os.path.isfile(output_dir + "/driver/driver.py") est_cycles_report = output_dir + "/report/estimate_layer_cycles.json" assert os.path.isfile(est_cycles_report) @@ -219,69 +182,11 @@ def test_end2end_cybsec_mlp_build(QONNX_export): # examine the report contents with open(est_cycles_report, "r") as f: est_cycles_dict = json.load(f) - assert est_cycles_dict["MatrixVectorActivation_0"] == 80 - assert est_cycles_dict["MatrixVectorActivation_1"] == 64 + assert est_cycles_dict["MVAU_hls_0"] == 80 + assert est_cycles_dict["MVAU_hls_1"] == 64 with open(est_res_report, "r") as f: est_res_dict = json.load(f) - assert est_res_dict["total"]["LUT"] == 11360.0 + assert est_res_dict["total"]["LUT"] == 7899.0 assert est_res_dict["total"]["BRAM_18K"] == 36.0 - shutil.copytree(output_dir + "/deploy", get_checkpoint_name("build", QONNX_export)) - - -@pytest.mark.end2end -@pytest.mark.xfail -@pytest.mark.parametrize("QONNX_export", [False, True]) -def test_end2end_cybsec_mlp_run_on_hw(QONNX_export): - build_env = get_build_env(build_kind, target_clk_ns) - assets_dir = pk.resource_filename("finn.qnn-data", "cybsec-mlp/") - deploy_dir = get_checkpoint_name("build", QONNX_export) - if not os.path.isdir(deploy_dir): - pytest.skip(deploy_dir + " not found from previous test step, skipping") - driver_dir = deploy_dir + "/driver" - assert 
os.path.isdir(driver_dir) - # put all assets into driver dir - shutil.copy(assets_dir + "/validate-unsw-nb15.py", driver_dir) - # put a copy of binarized dataset into driver dir - dataset_url = ( - "https://zenodo.org/record/4519767/files/unsw_nb15_binarized.npz?download=1" - ) - dataset_local = driver_dir + "/unsw_nb15_binarized.npz" - if not os.path.isfile(dataset_local): - wget.download(dataset_url, out=dataset_local) - assert os.path.isfile(dataset_local) - # create a shell script for running validation: 10 batches x 10 imgs - with open(driver_dir + "/validate.sh", "w") as f: - f.write( - """#!/bin/bash -cd %s/driver -echo %s | sudo -S python3.6 validate-unsw-nb15.py --batchsize=10 --limit_batches=10 - """ - % ( - build_env["target_dir"] + "/end2end_cybsecmlp_build", - build_env["password"], - ) - ) - # set up rsync command - remote_target = "%s@%s:%s" % ( - build_env["username"], - build_env["ip"], - build_env["target_dir"], - ) - rsync_res = subprocess.run(["rsync", "-avz", deploy_dir, remote_target]) - assert rsync_res.returncode == 0 - remote_verif_cmd = [ - "ssh", - "%s@%s" % (build_env["username"], build_env["ip"]), - "sh", - build_env["target_dir"] + "/end2end_cybsecmlp_build/driver/validate.sh", - ] - verif_res = subprocess.run( - remote_verif_cmd, - stdout=subprocess.PIPE, - universal_newlines=True, - input=build_env["password"], - ) - assert verif_res.returncode == 0 - log_output = verif_res.stdout.split("\n") - assert log_output[-3] == "batch 10 / 10 : total OK 93 NOK 7" - assert log_output[-2] == "Final accuracy: 93.000000" + shutil.copytree(output_dir + "/deploy", get_checkpoint_name("build")) + shutil.rmtree(get_checkpoint_name("build")) diff --git a/tests/end2end/test_end2end_mobilenet_v1.py b/tests/end2end/test_end2end_mobilenet_v1.py index 2f4df956ac..cbf89c2eae 100644 --- a/tests/end2end/test_end2end_mobilenet_v1.py +++ b/tests/end2end/test_end2end_mobilenet_v1.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, 
Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -27,11 +28,11 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. import pytest -import brevitas.onnx as bo import numpy as np import os import time import torch +from brevitas.export import export_qonnx from PIL import Image from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper @@ -52,8 +53,9 @@ from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul from qonnx.transformation.merge_onnx_models import MergeONNXModels from qonnx.transformation.remove import RemoveIdentityOps +from qonnx.util.cleanup import cleanup as qonnx_cleanup -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw import finn.transformation.streamline.absorb as absorb import finn.transformation.streamline.reorder as reorder from finn.core.onnx_exec import execute_onnx @@ -61,8 +63,16 @@ from finn.transformation.fpgadataflow.create_dataflow_partition import ( CreateDataflowPartition, ) +from finn.transformation.fpgadataflow.minimize_accumulator_width import ( + MinimizeAccumulatorWidth, +) +from finn.transformation.fpgadataflow.minimize_weight_bit_width import ( + MinimizeWeightBitWidth, +) from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.transformation.streamline import Streamline from finn.transformation.streamline.collapse_repeated import CollapseRepeatedMul from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds @@ -81,7 +91,6 @@ test_platform = alveo_default_platform[test_board] 
test_fpga_part = alveo_part_map[test_board] target_clk_ns = 3 -mem_mode = "decoupled" large_fifo_ram_style = "ultra" extra_fold = 1 first_layer_res_type = "dsp" @@ -95,12 +104,12 @@ def test_end2end_mobilenet_export(): std = 0.226 ch = 3 preproc = NormalizePreProc(mean, std, ch) - bo.export_finn_onnx(preproc, (1, 3, 224, 224), preproc_onnx) + export_qonnx(preproc, torch.randn(1, 3, 224, 224), preproc_onnx) + qonnx_cleanup(preproc_onnx, out_file=preproc_onnx) preproc_model = ModelWrapper(preproc_onnx) + preproc_model = preproc_model.transform(ConvertQONNXtoFINN()) # set input finn datatype to UINT8 - preproc_model.set_tensor_datatype( - preproc_model.graph.input[0].name, DataType["UINT8"] - ) + preproc_model.set_tensor_datatype(preproc_model.graph.input[0].name, DataType["UINT8"]) preproc_model = preproc_model.transform(InferShapes()) preproc_model = preproc_model.transform(FoldConstants()) preproc_model = preproc_model.transform(GiveUniqueNodeNames()) @@ -111,7 +120,8 @@ def test_end2end_mobilenet_export(): # export mobilenet finn_onnx = build_dir + "/end2end_mobilenet_export.onnx" mobilenet = get_test_model_trained("mobilenet", 4, 4) - bo.export_finn_onnx(mobilenet, (1, 3, 224, 224), finn_onnx) + export_qonnx(mobilenet, torch.randn(1, 3, 224, 224), finn_onnx) + qonnx_cleanup(finn_onnx, out_file=finn_onnx) # calculate golden output with pytorch/brevitas and save as .npy # get single image as input and prepare image @@ -145,10 +155,9 @@ def test_end2end_mobilenet_export(): @pytest.mark.end2end def test_end2end_mobilenet_tidy_and_merge_with_preproc(): - preproc_model = load_test_checkpoint_or_skip( - build_dir + "/end2end_mobilenet_preproc.onnx" - ) + preproc_model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_preproc.onnx") model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_export.onnx") + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) model = model.transform(FoldConstants()) model = 
model.transform(InsertTopK()) @@ -191,17 +200,13 @@ def test_end2end_mobilenet_streamline(): model = model.transform(GiveReadableTensorNames()) model = model.transform(InferDataTypes()) model.save(build_dir + "/end2end_mobilenet_streamlined.onnx") - assert ( - len(model.get_nodes_by_op_type("Add")) == 1 - ) # only final quantized bias Add op remains + assert len(model.get_nodes_by_op_type("Add")) == 1 # only final quantized bias Add op remains assert len(model.get_nodes_by_op_type("Mul")) == 0 # no Mul ops remain @pytest.mark.end2end def test_end2end_mobilenet_lowering(): - model = load_test_checkpoint_or_skip( - build_dir + "/end2end_mobilenet_streamlined.onnx" - ) + model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_streamlined.onnx") model = model.transform(LowerConvsToMatMul()) model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold()) model = model.transform(absorb.AbsorbConsecutiveTransposes()) @@ -213,31 +218,41 @@ def test_end2end_mobilenet_lowering(): @pytest.mark.end2end -def test_end2end_mobilenet_convert_to_hls_layers(): +@pytest.mark.xfail +def test_end2end_mobilenet_convert_to_hw_layers(): model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_lowered.onnx") - model = model.transform(to_hls.InferPool_Batch()) - model = model.transform(to_hls.InferConvInpGen()) - model = model.transform(to_hls.InferVectorVectorActivation()) - model = model.transform(to_hls.InferQuantizedMatrixVectorActivation(mem_mode)) - model = model.transform(to_hls.InferChannelwiseLinearLayer()) - model = model.transform(to_hls.InferLabelSelectLayer()) + model = model.transform(to_hw.InferPool()) + model = model.transform(to_hw.InferConvInpGen()) + model = model.transform(to_hw.InferThresholdingLayer()) + model = model.transform(to_hw.InferVectorVectorActivation()) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + model = model.transform(to_hw.InferChannelwiseLinearLayer()) + model = 
model.transform(to_hw.InferLabelSelectLayer()) model = model.transform(InferShapes()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) - model.save(build_dir + "/end2end_mobilenet_hls_layers.onnx") + model.save(build_dir + "/end2end_mobilenet_hw_layers.onnx") + + +@pytest.mark.end2end +def test_end2end_mobilenet_specialize_layers(): + model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_hw_layers.onnx") + model = model.transform(SpecializeLayers()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + model.save(build_dir + "/end2end_mobilenet_specialize_layers.onnx") @pytest.mark.end2end def test_end2end_mobilenet_folding(): - model = load_test_checkpoint_or_skip( - build_dir + "/end2end_mobilenet_hls_layers.onnx" - ) + model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_specialize_layers.onnx") # optional extra folding to use fewer resources # applied while setting the attributes on each node assert extra_fold in [1, 2, 4] - # set up folding for the depthwise conv layers impl'd by VVAUs + # set up folding for the conv layers impl'd by MVAUs # each value is PE for a layer - fc_layers = model.get_nodes_by_op_type("MatrixVectorActivation") + fc_layers = model.get_nodes_by_op_type("MVAU_hls") + fc_layers += model.get_nodes_by_op_type("MVAU_rtl") # each tuple is (PE, SIMD, ram_style) for a layer folding = [ (32, 3, "block"), @@ -266,7 +281,8 @@ def test_end2end_mobilenet_folding(): getCustomOp(fc_layers[0]).set_nodeattr("resType", first_layer_res_type) # set up folding for the depthwise conv layers impl'd by VVAUs # each value is PE for a layer - vvau_layers = model.get_nodes_by_op_type("VectorVectorActivation") + vvau_layers = model.get_nodes_by_op_type("VVAU_hls") + vvau_layers += model.get_nodes_by_op_type("VVAU_rtl") folding = [32, 32, 64, 16, 32, 8, 16, 16, 16, 16, 16, 4, 8] for vvau, pe in zip(vvau_layers, folding): vvau_inst = 
getCustomOp(vvau) @@ -277,11 +293,11 @@ def test_end2end_mobilenet_folding(): convinputgen_inst.set_nodeattr("SIMD", pe // extra_fold) # set SIMD in preceeding FMPadding to same value padding = model.find_direct_predecessors(convinputgen)[0] - if padding.op_type == "FMPadding_Batch": + if padding.op_type == "FMPadding_hls": padding_inst = getCustomOp(padding) padding_inst.set_nodeattr("SIMD", pe // extra_fold) # adjust final pooling layer + its inpgen - pool_node = model.get_nodes_by_op_type("Pool_Batch")[0] + pool_node = model.get_nodes_by_op_type("Pool_hls")[0] pool_inst = getCustomOp(pool_node) pool_inst.set_nodeattr("PE", 4 // extra_fold) pool_inpgen = model.find_direct_predecessors(pool_node)[0] @@ -292,8 +308,16 @@ def test_end2end_mobilenet_folding(): @pytest.mark.end2end -def test_end2end_mobilenet_create_dataflow_partition(): +def test_end2end_mobilenet_minimize_bit_width(): model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_folded.onnx") + model = model.transform(MinimizeAccumulatorWidth()) + model = model.transform(MinimizeWeightBitWidth()) + model = model.save(build_dir + "/end2end_mobilenet_minimize_bitwidth.onnx") + + +@pytest.mark.end2end +def test_end2end_mobilenet_create_dataflow_partition(): + model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_minimize_bitwidth.onnx") parent_model = model.transform(CreateDataflowPartition()) parent_model.save(build_dir + "/end2end_mobilenet_dataflow_parent.onnx") sdp_node = parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0] @@ -309,7 +333,7 @@ def test_end2end_mobilenet_create_dataflow_partition(): @pytest.mark.end2end @pytest.mark.xfail def test_end2end_mobilenet_cppsim(): - model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_folded.onnx") + model = load_test_checkpoint_or_skip(build_dir + "/end2end_mobilenet_minimize_bitwidth.onnx") x = np.load(build_dir + "/end2end_mobilenet_input.npy") inp_name = model.graph.input[0].name out_name = 
model.graph.output[0].name diff --git a/tests/end2end/test_ext_weights.py b/tests/end2end/test_ext_weights.py index 0a92c74a38..bac343bedf 100644 --- a/tests/end2end/test_ext_weights.py +++ b/tests/end2end/test_ext_weights.py @@ -1,4 +1,5 @@ -# Copyright (c) 2021, Xilinx +# Copyright (C) 2021-2022, Xilinx, Inc. +# Copyright (C) 2022-2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -26,8 +27,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import pkg_resources as pk - import pytest import os @@ -38,10 +37,9 @@ import finn.builder.build_dataflow as build import finn.builder.build_dataflow_config as build_cfg from finn.util.basic import make_build_dir -from finn.util.test import get_build_env, load_test_checkpoint_or_skip +from finn.util.test import load_test_checkpoint_or_skip target_clk_ns = 10 -build_kind = "zynq" build_dir = os.environ["FINN_BUILD_DIR"] onnx_zip_url = "https://github.com/Xilinx/finn-examples" onnx_zip_url += "/releases/download/v0.0.1a/onnx-models-bnn-pynq.zip" @@ -83,17 +81,15 @@ def test_end2end_ext_weights_download(): def test_end2end_ext_weights_build(): model_file = get_checkpoint_name("download") load_test_checkpoint_or_skip(model_file) - build_env = get_build_env(build_kind, target_clk_ns) - folding_config_file = pk.resource_filename( - "finn.qnn-data", "test_ext_weights/tfc-w1a1-extw.json" - ) + test_data = os.environ["FINN_ROOT"] + "/src/finn/qnn-data/test_ext_weights" + folding_config_file = test_data + "/tfc-w1a1-extw.json" output_dir = make_build_dir("test_end2end_ext_weights_build") cfg = build.DataflowBuildConfig( output_dir=output_dir, verbose=True, folding_config_file=folding_config_file, synth_clk_period_ns=target_clk_ns, - board=build_env["board"], + board="Pynq-Z1", shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, 
generate_outputs=[ build_cfg.DataflowOutputType.ESTIMATE_REPORTS, @@ -110,69 +106,3 @@ def test_end2end_ext_weights_build(): if os.path.isdir(get_checkpoint_name("build")): shutil.rmtree(get_checkpoint_name("build")) shutil.copytree(output_dir + "/deploy", get_checkpoint_name("build")) - - -@pytest.mark.board -@pytest.mark.end2end -@pytest.mark.xfail -def test_end2end_ext_weights_dataset(): - # make sure we have local copies of mnist dataset files - subprocess.check_output(["mkdir", "-p", mnist_local]) - for f in mnist_files: - if not os.path.isfile(mnist_local + "/" + f): - wget.download(mnist_url + "/" + f, out=mnist_local + "/" + f) - assert os.path.isfile(mnist_local + "/" + f) - # rsync to board - build_env = get_build_env(build_kind, target_clk_ns) - mnist_target = "%s@%s:%s" % (build_env["username"], build_env["ip"], "/tmp/") - - rsync_dataset_cmd = ["rsync", "-rv", mnist_local + "/", mnist_target] - subprocess.check_output(rsync_dataset_cmd) - - -@pytest.mark.end2end -@pytest.mark.xfail -def test_end2end_ext_weights_run_on_hw(): - build_env = get_build_env(build_kind, target_clk_ns) - deploy_dir = get_checkpoint_name("build") - if not os.path.isdir(deploy_dir): - pytest.skip(deploy_dir + " not found from previous test step, skipping") - driver_dir = deploy_dir + "/driver" - assert os.path.isdir(driver_dir) - # create a shell script for running validation: 10 batches x 10 imgs - with open(driver_dir + "/validate.sh", "w") as f: - f.write( - """#!/bin/bash -cd %s/driver -echo %s | sudo -S python3.6 validate.py --dataset mnist --bitfile %s - """ - % ( - build_env["target_dir"] + "/end2end_ext_weights_build", - build_env["password"], - "../bitfile/finn-accel.bit", - ) - ) - # set up rsync command - remote_target = "%s@%s:%s" % ( - build_env["username"], - build_env["ip"], - build_env["target_dir"], - ) - rsync_res = subprocess.run(["rsync", "-avz", deploy_dir, remote_target]) - assert rsync_res.returncode == 0 - remote_verif_cmd = [ - "ssh", - "%s@%s" % 
(build_env["username"], build_env["ip"]), - "sh", - build_env["target_dir"] + "/end2end_ext_weights_build/driver/validate.sh", - ] - verif_res = subprocess.run( - remote_verif_cmd, - stdout=subprocess.PIPE, - universal_newlines=True, - input=build_env["password"], - ) - assert verif_res.returncode == 0 - log_output = verif_res.stdout.split("\n") - assert log_output[-3] == "batch 100 / 100 : total OK 9296 NOK 704" - assert log_output[-2] == "Final accuracy: 92.960000" diff --git a/tests/fpgadataflow/test_code_gen_trafo.py b/tests/fpgadataflow/test_code_gen_trafo.py index f5edabbd4b..deb9dd43b4 100644 --- a/tests/fpgadataflow/test_code_gen_trafo.py +++ b/tests/fpgadataflow/test_code_gen_trafo.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -50,10 +51,10 @@ def test_code_gen_trafo(): outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, mh]) node_inp_list = ["inp", "weights", "thresh"] FCLayer_node = helper.make_node( - "MatrixVectorActivation", + "MVAU_hls", node_inp_list, ["outp"], - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", code_gen_dir="", executable_path="", diff --git a/tests/fpgadataflow/test_compilation_trafo.py b/tests/fpgadataflow/test_compilation_trafo.py index d04b68a56b..7022311d4c 100644 --- a/tests/fpgadataflow/test_compilation_trafo.py +++ b/tests/fpgadataflow/test_compilation_trafo.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -51,10 +52,10 @@ def test_compilation_trafo(): outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, mh]) node_inp_list = ["inp", "weights", "thresh"] FCLayer_node = helper.make_node( - "MatrixVectorActivation", + "MVAU_hls", node_inp_list, ["outp"], - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", code_gen_dir="", executable_path="", diff --git a/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py b/tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py similarity index 80% rename from tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py rename to tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py index 98a7c76ee4..c5d0281203 100644 --- a/tests/fpgadataflow/test_convert_to_hls_1d_conv_layer.py +++ b/tests/fpgadataflow/test_convert_to_hw_1d_conv_layer.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -41,7 +42,7 @@ from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP @@ -49,6 +50,8 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +from finn.util.fpgadataflow import is_fpgadataflow_node # conv_config: @@ -71,7 +74,7 @@ @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mode): +def test_convert_to_hw_1d_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mode): pad, kernel_size, stride, dilation = conv_config np.random.seed(0) idt = DataType["UINT4"] @@ -85,9 +88,6 @@ def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, use_rtl_swg, exec_ pad_h = pad[0] + pad[2] pad_w = pad[1] + pad[3] - if use_rtl_swg and exec_mode == "cppsim": - pytest.skip("cppsim not supported for RTL SWG") - if depthwise is True: group = out_chn = in_chn conv_param_shape = [out_chn, 1, k_h, k_w] @@ -96,12 +96,8 @@ def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, use_rtl_swg, exec_ out_chn = 20 conv_param_shape = [out_chn, in_chn, k_h, k_w] - out_feature_dim_h = compute_conv_output_dim( - in_feature_dim_h, k_h, stride_h, pad_h, dilation_h - ) - out_feature_dim_w = compute_conv_output_dim( - in_feature_dim_w, k_w, stride_w, pad_w, dilation_w - ) + 
out_feature_dim_h = compute_conv_output_dim(in_feature_dim_h, k_h, stride_h, pad_h, dilation_h) + out_feature_dim_w = compute_conv_output_dim(in_feature_dim_w, k_w, stride_w, pad_w, dilation_w) input_shape = [1, in_chn, in_feature_dim_h, in_feature_dim_w] output_shape = [1, out_chn, out_feature_dim_h, out_feature_dim_w] @@ -117,9 +113,7 @@ def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, use_rtl_swg, exec_ top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape) top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, output_shape) - value_info = [ - helper.make_tensor_value_info("p1", TensorProto.FLOAT, conv_param_shape) - ] + value_info = [helper.make_tensor_value_info("p1", TensorProto.FLOAT, conv_param_shape)] modelproto = qonnx_make_model( helper.make_graph( @@ -127,9 +121,7 @@ def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, use_rtl_swg, exec_ inputs=[top_in], outputs=[top_out], value_info=value_info, - nodes=[ - helper.make_node("Conv", ["top_in", "p1"], ["top_out"], **conv_config) - ], + nodes=[helper.make_node("Conv", ["top_in", "p1"], ["top_out"], **conv_config)], ) ) @@ -143,12 +135,23 @@ def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, use_rtl_swg, exec_ model = model.transform(InferDataTypes()) new_model = model.transform(LowerConvsToMatMul()) - new_model = new_model.transform(to_hls.InferConvInpGen(use_rtl_variant=use_rtl_swg)) + new_model = new_model.transform(to_hw.InferConvInpGen()) + if not use_rtl_swg: + for node in new_model.graph.node: + if is_fpgadataflow_node(node): + inst = getCustomOp(node) + inst.set_nodeattr("preferred_impl_style", "hls") if depthwise is True: - new_model = new_model.transform(to_hls.InferVectorVectorActivation()) + new_model = new_model.transform(to_hw.InferVectorVectorActivation()) + new_model = new_model.transform(SpecializeLayers()) else: - new_model = new_model.transform(to_hls.InferQuantizedMatrixVectorActivation()) - fc_node = 
new_model.get_nodes_by_op_type("MatrixVectorActivation")[0] + new_model = new_model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + new_model = new_model.transform(SpecializeLayers()) + # set folding parameters for MVAU + if new_model.get_nodes_by_op_type("MVAU_hls"): + fc_node = new_model.get_nodes_by_op_type("MVAU_hls")[0] + else: + fc_node = new_model.get_nodes_by_op_type("MVAU_rtl")[0] fc_inst = getCustomOp(fc_node) mw = fc_inst.get_nodeattr("MW") mh = fc_inst.get_nodeattr("MH") @@ -179,12 +182,12 @@ def test_convert_to_hls_1d_conv_layer(conv_config, depthwise, use_rtl_swg, exec_ assert oxe.compare_execution(model, new_model, inp_dict) if pad_h == 1 and pad_w == 1: - padding_node = new_model.get_nodes_by_op_type("FMPadding_Batch")[0] + padding_node = new_model.get_nodes_by_op_type("FMPadding_rtl")[0] padding_inst = getCustomOp(padding_node) assert padding_inst.get_nodeattr("SIMD") == in_chn if depthwise is True and exec_mode == "rtlsim": - node = new_model.get_nodes_by_op_type("VectorVectorActivation")[0] + node = new_model.get_nodes_by_op_type("VVAU_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = new_model.analysis(exp_cycles_per_layer) diff --git a/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py b/tests/fpgadataflow/test_convert_to_hw_channelwise_layer.py similarity index 73% rename from tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py rename to tests/fpgadataflow/test_convert_to_hw_channelwise_layer.py index 089d1ae420..4b063f8505 100644 --- a/tests/fpgadataflow/test_convert_to_hls_channelwise_layer.py +++ b/tests/fpgadataflow/test_convert_to_hw_channelwise_layer.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -38,21 +38,21 @@ from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers def prepare_inputs(input_tensor): return {"inp": input_tensor} -def make_single_maxpool_modelwrapper(onnx_op_name, ishape, idt, pdt, pshape): - +def make_single_channelwise_modelwrapper(onnx_op_name, ishape, idt, pdt, pshape): inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, ishape) outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, ishape) p0 = helper.make_tensor_value_info("p0", TensorProto.FLOAT, pshape) @@ -76,13 +76,9 @@ def make_single_maxpool_modelwrapper(onnx_op_name, ishape, idt, pdt, pshape): # parameter datatype -@pytest.mark.parametrize( - "pdt", [DataType["BIPOLAR"], DataType["UINT4"], DataType["INT2"]] -) +@pytest.mark.parametrize("pdt", [DataType["BIPOLAR"], DataType["UINT4"], DataType["INT2"]]) # input datatype -@pytest.mark.parametrize( - "idt", [DataType["INT32"], DataType["UINT4"], DataType["INT4"]] -) +@pytest.mark.parametrize("idt", [DataType["INT32"], DataType["UINT4"], DataType["INT4"]]) # function @pytest.mark.parametrize("onnx_op_name", ["Add", "Mul"]) # vector parameter or scalar parameter (broadcast) @@ -92,9 +88,7 @@ def make_single_maxpool_modelwrapper(onnx_op_name, ishape, idt, pdt, 
pshape): @pytest.mark.fpgadataflow @pytest.mark.vivado @pytest.mark.slow -def test_convert_to_hls_channelwise_layer( - pdt, idt, onnx_op_name, scalar_param, exec_mode -): +def test_convert_to_hw_channelwise_layer(pdt, idt, onnx_op_name, scalar_param, exec_mode): ifm_ch = 16 ifm_dim = 5 ishape = (1, ifm_ch, ifm_dim, ifm_dim) @@ -104,7 +98,7 @@ def test_convert_to_hls_channelwise_layer( pshape = (1, ifm_ch, 1, 1) np.random.seed(0) - model = make_single_maxpool_modelwrapper(onnx_op_name, ishape, idt, pdt, pshape) + model = make_single_channelwise_modelwrapper(onnx_op_name, ishape, idt, pdt, pshape) # Since the aren't Data types with a bit width of a non power of 2, # there are cases where the input won't use it full range. @@ -118,26 +112,32 @@ def test_convert_to_hls_channelwise_layer( input_dict = prepare_inputs(x) y_expected = oxe.execute_onnx(model, input_dict)["outp"] - new_model = model.transform(to_hls.InferChannelwiseLinearLayer()) - new_model = new_model.transform(GiveUniqueNodeNames()) + model = model.transform(to_hw.InferChannelwiseLinearLayer()) + model = model.transform(GiveUniqueNodeNames()) + + ctx_produced = oxe.execute_onnx(model, input_dict, return_full_exec_context=True) + y_produced = ctx_produced["outp"] + + assert (y_produced == y_expected).all() + assert model.graph.node[1].op_type == "ChannelwiseOp" + + model = model.transform(SpecializeLayers()) if exec_mode == "cppsim": - new_model = new_model.transform(PrepareCppSim()) - new_model = new_model.transform(CompileCppSim()) - new_model = new_model.transform(SetExecMode("cppsim")) + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + model = model.transform(SetExecMode("cppsim")) elif exec_mode == "rtlsim": - new_model = new_model.transform(SetExecMode("rtlsim")) - new_model = new_model.transform(GiveUniqueNodeNames()) - new_model = new_model.transform(PrepareIP("xc7z020clg400-1", 5)) - new_model = new_model.transform(HLSSynthIP()) - new_model = 
new_model.transform(PrepareRTLSim()) + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP("xc7z020clg400-1", 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) else: raise Exception("Unknown exec_mode") - ctx_produced = oxe.execute_onnx( - new_model, input_dict, return_full_exec_context=True - ) + ctx_produced = oxe.execute_onnx(model, input_dict, return_full_exec_context=True) y_produced = ctx_produced["outp"] assert (y_produced == y_expected).all() - assert new_model.graph.node[1].op_type == "ChannelwiseOp_Batch" + assert model.graph.node[1].op_type == "ChannelwiseOp_hls" diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_fc_transition.py b/tests/fpgadataflow/test_convert_to_hw_conv_fc_transition.py similarity index 76% rename from tests/fpgadataflow/test_convert_to_hls_conv_fc_transition.py rename to tests/fpgadataflow/test_convert_to_hw_conv_fc_transition.py index 3512c39cb3..f7b3c55c2a 100755 --- a/tests/fpgadataflow/test_convert_to_hls_conv_fc_transition.py +++ b/tests/fpgadataflow/test_convert_to_hw_conv_fc_transition.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -34,6 +35,7 @@ from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.general.im2col import compute_conv_output_dim +from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.general import GiveUniqueNodeNames, RemoveUnusedTensors from qonnx.transformation.infer_data_layouts import InferDataLayouts from qonnx.transformation.infer_datatypes import InferDataTypes @@ -42,14 +44,16 @@ from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw import finn.transformation.streamline.absorb as absorb from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.transformation.move_reshape import RemoveCNVtoFCFlatten from finn.transformation.streamline import Streamline from finn.transformation.streamline.reorder import MoveScalarLinearPastInvariants +from finn.util.fpgadataflow import is_fpgadataflow_node def get_multithreshold_rand_params(channels, num_of_thres, seed=None): @@ -78,7 +82,7 @@ def get_multithreshold_rand_params(channels, num_of_thres, seed=None): @pytest.mark.fpgadataflow @pytest.mark.vivado @pytest.mark.slow -def test_convert_to_hls_conv_fc_transition(conv_config, depthwise, use_reshape): +def test_convert_to_hw_conv_fc_transition(conv_config, depthwise, use_reshape): np.random.seed(0) idt = DataType["UINT4"] odt = DataType["UINT4"] @@ -102,12 +106,8 @@ def test_convert_to_hls_conv_fc_transition(conv_config, depthwise, use_reshape): out_chn = 8 conv_param_shape = [out_chn, in_chn, 
kernel_size_h, kernel_size_w] - output_size_h = compute_conv_output_dim( - input_size_h, kernel_size_h, stride_h, 2 * pad_h - ) - output_size_w = compute_conv_output_dim( - input_size_w, kernel_size_w, stride_w, 2 * pad_w - ) + output_size_h = compute_conv_output_dim(input_size_h, kernel_size_h, stride_h, 2 * pad_h) + output_size_w = compute_conv_output_dim(input_size_w, kernel_size_w, stride_w, 2 * pad_w) input_shape = [1, in_chn, input_size_h, input_size_w] fc_param_shape = [out_chn * output_size_h * output_size_w, fc_filters] @@ -120,34 +120,20 @@ def test_convert_to_hls_conv_fc_transition(conv_config, depthwise, use_reshape): conv_config["pads"] = [pad_h, pad_w, pad_h, pad_w] conv_config["strides"] = [stride_h, stride_w] - global_in = helper.make_tensor_value_info( - "global_in", TensorProto.FLOAT, input_shape - ) - global_out = helper.make_tensor_value_info( - "global_out", TensorProto.FLOAT, output_shape - ) + global_in = helper.make_tensor_value_info("global_in", TensorProto.FLOAT, input_shape) + global_out = helper.make_tensor_value_info("global_out", TensorProto.FLOAT, output_shape) value_info = [ - helper.make_tensor_value_info( - "conv_param", TensorProto.FLOAT, conv_param_shape - ), + helper.make_tensor_value_info("conv_param", TensorProto.FLOAT, conv_param_shape), helper.make_tensor_value_info("thres1_param", TensorProto.FLOAT, (out_chn, 15)), - helper.make_tensor_value_info( - "matmul_param", TensorProto.FLOAT, fc_param_shape - ), - helper.make_tensor_value_info( - "thres2_param", TensorProto.FLOAT, (fc_filters, 15) - ), + helper.make_tensor_value_info("matmul_param", TensorProto.FLOAT, fc_param_shape), + helper.make_tensor_value_info("thres2_param", TensorProto.FLOAT, (fc_filters, 15)), helper.make_tensor_value_info("reshape_shape", TensorProto.INT64, []), ] if use_reshape: - flatten_node = helper.make_node( - "Reshape", ["thres1_out", "reshape_shape"], ["flatten_out"] - ) + flatten_node = helper.make_node("Reshape", ["thres1_out", "reshape_shape"], 
["flatten_out"]) else: - flatten_node = helper.make_node( - "Flatten", ["thres1_out"], ["flatten_out"], axis=1 - ) + flatten_node = helper.make_node("Flatten", ["thres1_out"], ["flatten_out"], axis=1) modelproto = qonnx_make_model( helper.make_graph( @@ -156,9 +142,7 @@ def test_convert_to_hls_conv_fc_transition(conv_config, depthwise, use_reshape): outputs=[global_out], value_info=value_info, nodes=[ - helper.make_node( - "Conv", ["global_in", "conv_param"], ["conv_out"], **conv_config - ), + helper.make_node("Conv", ["global_in", "conv_param"], ["conv_out"], **conv_config), helper.make_node( "MultiThreshold", ["conv_out", "thres1_param"], @@ -167,9 +151,7 @@ def test_convert_to_hls_conv_fc_transition(conv_config, depthwise, use_reshape): out_dtype="UINT4", ), flatten_node, - helper.make_node( - "MatMul", ["flatten_out", "matmul_param"], ["matmul_out"] - ), + helper.make_node("MatMul", ["flatten_out", "matmul_param"], ["matmul_out"]), helper.make_node( "MultiThreshold", ["matmul_out", "thres2_param"], @@ -190,18 +172,10 @@ def test_convert_to_hls_conv_fc_transition(conv_config, depthwise, use_reshape): model.set_tensor_datatype("thres1_param", DataType["INT32"]) model.set_tensor_datatype("thres2_param", DataType["INT32"]) - model.set_initializer( - "conv_param", gen_finn_dt_tensor(conv_weight_dt, conv_param_shape) - ) - model.set_initializer( - "thres1_param", get_multithreshold_rand_params(out_chn, 15, seed=0) - ) - model.set_initializer( - "thres2_param", get_multithreshold_rand_params(fc_filters, 15, seed=0) - ) - model.set_initializer( - "matmul_param", gen_finn_dt_tensor(fc_weight_dt, fc_param_shape) - ) + model.set_initializer("conv_param", gen_finn_dt_tensor(conv_weight_dt, conv_param_shape)) + model.set_initializer("thres1_param", get_multithreshold_rand_params(out_chn, 15, seed=0)) + model.set_initializer("thres2_param", get_multithreshold_rand_params(fc_filters, 15, seed=0)) + model.set_initializer("matmul_param", gen_finn_dt_tensor(fc_weight_dt, 
fc_param_shape)) model.set_initializer("reshape_shape", np.array([1, -1], dtype=np.int64)) model = model.transform(InferShapes()) @@ -217,15 +191,20 @@ def test_convert_to_hls_conv_fc_transition(conv_config, depthwise, use_reshape): new_model = new_model.transform(InferDataLayouts()) new_model = new_model.transform(RemoveUnusedTensors()) - # convert_to_hls + # convert_to_hw if depthwise is True: - new_model = new_model.transform(to_hls.InferVectorVectorActivation()) - new_model = new_model.transform(to_hls.InferQuantizedMatrixVectorActivation()) - new_model = new_model.transform(to_hls.InferThresholdingLayer()) - new_model = new_model.transform(to_hls.InferConvInpGen()) - new_model = new_model.transform(to_hls.InferStreamingMaxPool()) + new_model = new_model.transform(to_hw.InferVectorVectorActivation()) + new_model = new_model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + new_model = new_model.transform(to_hw.InferThresholdingLayer()) + new_model = new_model.transform(to_hw.InferConvInpGen()) + new_model = new_model.transform(to_hw.InferStreamingMaxPool()) new_model = new_model.transform(RemoveCNVtoFCFlatten()) new_model = new_model.transform(absorb.AbsorbConsecutiveTransposes()) + for node in new_model.graph.node: + if is_fpgadataflow_node(node): + inst = getCustomOp(node) + inst.set_nodeattr("preferred_impl_style", "hls") + new_model = new_model.transform(SpecializeLayers()) new_model = new_model.transform(GiveUniqueNodeNames()) new_model = new_model.transform(InferDataLayouts()) diff --git a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py b/tests/fpgadataflow/test_convert_to_hw_conv_layer.py similarity index 79% rename from tests/fpgadataflow/test_convert_to_hls_conv_layer.py rename to tests/fpgadataflow/test_convert_to_hw_conv_layer.py index de31ef0f12..61f8af7806 100644 --- a/tests/fpgadataflow/test_convert_to_hls_conv_layer.py +++ b/tests/fpgadataflow/test_convert_to_hw_conv_layer.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright 
(C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -41,7 +42,7 @@ from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP @@ -49,6 +50,8 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +from finn.util.fpgadataflow import is_fpgadataflow_node # conv_config kernel_size,stride, pad @@ -62,7 +65,7 @@ @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_convert_to_hls_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mode): +def test_convert_to_hw_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mode): kernel_size, stride, pad = conv_config np.random.seed(0) idt = DataType["UINT4"] @@ -71,10 +74,7 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mod in_chn = 16 if use_rtl_swg and exec_mode == "cppsim": - pytest.skip("cppsim not supported for RTL SWG") - - if use_rtl_swg and kernel_size == 1: - pytest.skip("1x1 kernel not supported by current RTL SWG") + pytest.skip("Skip cppsim if SWG in rtl") if depthwise is True: group = out_chn = in_chn @@ -85,9 +85,7 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mod conv_param_shape = [out_chn, in_chn, kernel_size, kernel_size] total_pad = 2 * pad - out_feature_dim = 
compute_conv_output_dim( - in_feature_dim, kernel_size, stride, total_pad - ) + out_feature_dim = compute_conv_output_dim(in_feature_dim, kernel_size, stride, total_pad) input_shape = [1, in_chn, in_feature_dim, in_feature_dim] output_shape = [1, out_chn, out_feature_dim, out_feature_dim] @@ -103,9 +101,7 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mod top_in = helper.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape) top_out = helper.make_tensor_value_info("top_out", TensorProto.FLOAT, output_shape) - value_info = [ - helper.make_tensor_value_info("p1", TensorProto.FLOAT, conv_param_shape) - ] + value_info = [helper.make_tensor_value_info("p1", TensorProto.FLOAT, conv_param_shape)] modelproto = qonnx_make_model( helper.make_graph( @@ -113,9 +109,7 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mod inputs=[top_in], outputs=[top_out], value_info=value_info, - nodes=[ - helper.make_node("Conv", ["top_in", "p1"], ["top_out"], **conv_config) - ], + nodes=[helper.make_node("Conv", ["top_in", "p1"], ["top_out"], **conv_config)], ) ) @@ -129,12 +123,23 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mod model = model.transform(InferDataTypes()) new_model = model.transform(LowerConvsToMatMul()) - new_model = new_model.transform(to_hls.InferConvInpGen(use_rtl_variant=use_rtl_swg)) + new_model = new_model.transform(to_hw.InferConvInpGen()) + if not use_rtl_swg: + for node in new_model.graph.node: + if is_fpgadataflow_node(node): + inst = getCustomOp(node) + inst.set_nodeattr("preferred_impl_style", "hls") if depthwise is True: - new_model = new_model.transform(to_hls.InferVectorVectorActivation()) + new_model = new_model.transform(to_hw.InferVectorVectorActivation()) + new_model = new_model.transform(SpecializeLayers()) else: - new_model = new_model.transform(to_hls.InferQuantizedMatrixVectorActivation()) - fc_node = 
new_model.get_nodes_by_op_type("MatrixVectorActivation")[0] + new_model = new_model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + new_model = new_model.transform(SpecializeLayers()) + # set folding parameters for MVAU + if new_model.get_nodes_by_op_type("MVAU_hls"): + fc_node = new_model.get_nodes_by_op_type("MVAU_hls")[0] + else: + fc_node = new_model.get_nodes_by_op_type("MVAU_rtl")[0] fc_inst = getCustomOp(fc_node) mw = fc_inst.get_nodeattr("MW") mh = fc_inst.get_nodeattr("MH") @@ -164,10 +169,10 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mod inp_dict = {model.graph.input[0].name: x} assert oxe.compare_execution(model, new_model, inp_dict) - if kernel_size == 1 and stride > 1 and pad == 0: - assert new_model.graph.node[1].op_type == "DownSampler" + if not use_rtl_swg and kernel_size == 1 and stride > 1 and pad == 0: + assert new_model.graph.node[1].op_type == "DownSampler_hls" if exec_mode == "rtlsim": - node = new_model.get_nodes_by_op_type("DownSampler")[0] + node = new_model.get_nodes_by_op_type("DownSampler_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = new_model.analysis(exp_cycles_per_layer) @@ -179,12 +184,12 @@ def test_convert_to_hls_conv_layer(conv_config, depthwise, use_rtl_swg, exec_mod if use_rtl_swg: padding_node = new_model.get_nodes_by_op_type("FMPadding_rtl")[0] else: - padding_node = new_model.get_nodes_by_op_type("FMPadding_Batch")[0] + padding_node = new_model.get_nodes_by_op_type("FMPadding_hls")[0] padding_inst = getCustomOp(padding_node) assert padding_inst.get_nodeattr("SIMD") == in_chn if depthwise is True and exec_mode == "rtlsim": - node = new_model.get_nodes_by_op_type("VectorVectorActivation")[0] + node = new_model.get_nodes_by_op_type("VVAU_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = new_model.analysis(exp_cycles_per_layer) diff --git 
a/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py b/tests/fpgadataflow/test_convert_to_hw_layers_cnv.py similarity index 71% rename from tests/fpgadataflow/test_convert_to_hls_layers_cnv.py rename to tests/fpgadataflow/test_convert_to_hw_layers_cnv.py index 9997f28438..71f383ca23 100644 --- a/tests/fpgadataflow/test_convert_to_hls_layers_cnv.py +++ b/tests/fpgadataflow/test_convert_to_hw_layers_cnv.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -26,46 +27,57 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import pkg_resources as pk - import pytest -import brevitas.onnx as bo +import importlib_resources as importlib import numpy as np import os +import torch +from brevitas.export import export_qonnx from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount from qonnx.transformation.fold_constants import FoldConstants -from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames +from qonnx.transformation.general import ( + GiveReadableTensorNames, + GiveUniqueNodeNames, + GiveUniqueParameterTensors, +) from qonnx.transformation.infer_data_layouts import InferDataLayouts from qonnx.transformation.infer_shapes import InferShapes from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul +from qonnx.util.cleanup import cleanup as qonnx_cleanup import finn.core.onnx_exec as oxe -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw import finn.transformation.streamline.absorb as absorb from 
finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.transformation.streamline import Streamline from finn.transformation.streamline.reorder import MakeMaxPoolNHWC +from finn.util.fpgadataflow import is_fpgadataflow_node from finn.util.test import get_test_model_trained -export_onnx_path_cnv = "test_convert_to_hls_layers_cnv.onnx" +export_onnx_path_cnv = "test_convert_to_hw_layers_cnv.onnx" @pytest.mark.fpgadataflow @pytest.mark.vivado # Standalone or fused thresholding-based activation @pytest.mark.parametrize("fused_activation", [True, False]) -def test_convert_to_hls_layers_cnv_w1a1(fused_activation): +def test_convert_to_hw_layers_cnv_w1a1(fused_activation): cnv = get_test_model_trained("CNV", 1, 1) - bo.export_finn_onnx(cnv, (1, 3, 32, 32), export_onnx_path_cnv) + export_qonnx(cnv, torch.randn(1, 3, 32, 32), export_onnx_path_cnv) + qonnx_cleanup(export_onnx_path_cnv, out_file=export_onnx_path_cnv) model = ModelWrapper(export_onnx_path_cnv) + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) model = model.transform(FoldConstants()) model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveUniqueParameterTensors()) model = model.transform(GiveReadableTensorNames()) model = model.transform(Streamline()) model = model.transform(LowerConvsToMatMul()) @@ -75,10 +87,10 @@ def test_convert_to_hls_layers_cnv_w1a1(fused_activation): model = model.transform(ConvertBipolarMatMulToXnorPopcount()) model = model.transform(Streamline()) model = model.transform(InferDataLayouts()) - # model.save("golden.onnx") # load one of the test vectors - fn = pk.resource_filename("finn.qnn-data", 
"cifar10/cifar10-test-data-class3.npz") - input_tensor = np.load(fn)["arr_0"].astype(np.float32) + ref = importlib.files("finn.qnn-data") / "cifar10/cifar10-test-data-class3.npz" + with importlib.as_file(ref) as fn: + input_tensor = np.load(fn)["arr_0"].astype(np.float32) input_tensor = input_tensor / 255 assert input_tensor.shape == (1, 3, 32, 32) # generate expected value from streamlined net @@ -86,16 +98,24 @@ def test_convert_to_hls_layers_cnv_w1a1(fused_activation): expected_ctx = oxe.execute_onnx(model, input_dict, True) expected = expected_ctx[model.graph.output[0].name] - # if we infer thresholding first, all MultiThresholds get converted to HLS + # if we infer thresholding first, all MultiThresholds get converted to HW # subsequently, the FC inference will generate passthrough MVAUs if not fused_activation: - model = model.transform(to_hls.InferThresholdingLayer()) - model = model.transform(to_hls.InferBinaryMatrixVectorActivation()) - model = model.transform(to_hls.InferQuantizedMatrixVectorActivation()) + model = model.transform(to_hw.InferThresholdingLayer()) + + model = model.transform(to_hw.InferBinaryMatrixVectorActivation()) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + model = model.transform(to_hw.InferConvInpGen()) + model = model.transform(to_hw.InferStreamingMaxPool()) + for node in model.graph.node: + if is_fpgadataflow_node(node): + inst = getCustomOp(node) + inst.set_nodeattr("preferred_impl_style", "hls") + model = model.transform(SpecializeLayers()) for node in model.graph.node: - if node.op_type == "MatrixVectorActivation": + if node.op_type == "MVAU_hls": inst = getCustomOp(node) - inst.set_nodeattr("mem_mode", "decoupled") + inst.set_nodeattr("mem_mode", "internal_decoupled") mw = inst.get_nodeattr("MW") mh = inst.get_nodeattr("MH") if mh % 4 == 0: @@ -108,31 +128,27 @@ def test_convert_to_hls_layers_cnv_w1a1(fused_activation): else: simd = mw inst.set_nodeattr("SIMD", simd) - model = 
model.transform(to_hls.InferConvInpGen()) - model = model.transform(to_hls.InferStreamingMaxPool()) # check topology status finn_nodes = model.get_finn_nodes() if fused_activation: assert len(finn_nodes) == 18 else: assert len(finn_nodes) == 26 - thr_nodes = model.get_nodes_by_op_type("Thresholding_Batch") + thr_nodes = model.get_nodes_by_op_type("Thresholding_hls") assert len(thr_nodes) == 8 non_finn_nodes = model.get_non_finn_nodes() assert len(non_finn_nodes) == 5 exp_non_finn_nodes = ["Transpose", "Transpose", "Reshape", "Mul", "Add"] assert [x.op_type for x in non_finn_nodes] == exp_non_finn_nodes - fc_nodes = model.get_nodes_by_op_type("MatrixVectorActivation") + fc_nodes = model.get_nodes_by_op_type("MVAU_hls") assert len(fc_nodes) == 9 - swg_nodes = model.get_nodes_by_op_type("ConvolutionInputGenerator") + swg_nodes = model.get_nodes_by_op_type("ConvolutionInputGenerator_hls") assert len(swg_nodes) == 6 - mp_nodes = model.get_nodes_by_op_type("StreamingMaxPool_Batch") + mp_nodes = model.get_nodes_by_op_type("StreamingMaxPool_hls") assert len(mp_nodes) == 2 - # model.save("cnv-pre-compile.onnx") model = model.transform(PrepareCppSim()) model = model.transform(CompileCppSim()) model = model.transform(SetExecMode("cppsim")) - # model.save("cnv-post-compile.onnx") produced_ctx = oxe.execute_onnx(model, input_dict, True) produced = produced_ctx[model.graph.output[0].name] assert np.isclose(expected, produced, atol=1e-3).all() diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_fc.py b/tests/fpgadataflow/test_convert_to_hw_layers_fc.py similarity index 81% rename from tests/fpgadataflow/test_convert_to_hls_layers_fc.py rename to tests/fpgadataflow/test_convert_to_hw_layers_fc.py index fd4e3679d7..746ded9074 100644 --- a/tests/fpgadataflow/test_convert_to_hls_layers_fc.py +++ b/tests/fpgadataflow/test_convert_to_hw_layers_fc.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. 
+# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -28,66 +29,77 @@ import pytest -import brevitas.onnx as bo import numpy as np import onnx import onnx.numpy_helper as nph import os import torch +from brevitas.export import export_qonnx from pkgutil import get_data from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount from qonnx.transformation.fold_constants import FoldConstants -from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames +from qonnx.transformation.general import ( + GiveReadableTensorNames, + GiveUniqueNodeNames, + GiveUniqueParameterTensors, +) from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.cleanup import cleanup as qonnx_cleanup import finn.core.onnx_exec as oxe -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw import finn.transformation.streamline.absorb as absorb from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.transformation.streamline import Streamline from finn.transformation.streamline.round_thresholds import RoundAndClipThresholds from finn.util.test import get_test_model_trained -export_onnx_path = "test_convert_to_hls_layers_fc.onnx" +export_onnx_path = "test_convert_to_hw_layers_fc.onnx" @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_convert_to_hls_layers_tfc_w1a1(): +def test_convert_to_hw_layers_tfc_w1a1(): tfc = 
get_test_model_trained("TFC", 1, 1) - bo.export_finn_onnx(tfc, (1, 1, 28, 28), export_onnx_path) + export_qonnx(tfc, torch.randn(1, 1, 28, 28), export_onnx_path) + qonnx_cleanup(export_onnx_path, out_file=export_onnx_path) model = ModelWrapper(export_onnx_path) + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) model = model.transform(FoldConstants()) model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveUniqueParameterTensors()) model = model.transform(GiveReadableTensorNames()) model = model.transform(Streamline()) model = model.transform(ConvertBipolarMatMulToXnorPopcount()) model = model.transform(absorb.AbsorbAddIntoMultiThreshold()) model = model.transform(absorb.AbsorbMulIntoMultiThreshold()) model = model.transform(RoundAndClipThresholds()) - model = model.transform(to_hls.InferBinaryMatrixVectorActivation()) + model = model.transform(to_hw.InferBinaryMatrixVectorActivation()) + model = model.transform(SpecializeLayers()) fc0 = model.graph.node[2] - assert fc0.op_type == "MatrixVectorActivation" + assert fc0.op_type.startswith("MVAU") assert model.get_tensor_shape(fc0.input[0]) == [1, 784] assert model.get_tensor_shape(fc0.input[1]) == [784, 64] assert model.get_tensor_shape(fc0.input[2]) == [64, 1] fc1 = model.graph.node[3] - assert fc1.op_type == "MatrixVectorActivation" + assert fc1.op_type.startswith("MVAU") assert model.get_tensor_shape(fc1.input[0]) == [1, 64] assert model.get_tensor_shape(fc1.input[1]) == [64, 64] assert model.get_tensor_shape(fc1.input[2]) == [64, 1] fc2 = model.graph.node[4] - assert fc2.op_type == "MatrixVectorActivation" + assert fc2.op_type.startswith("MVAU") assert model.get_tensor_shape(fc2.input[0]) == [1, 64] assert model.get_tensor_shape(fc2.input[1]) == [64, 64] assert model.get_tensor_shape(fc2.input[2]) == [64, 1] fc3 = model.graph.node[5] - assert fc3.op_type == "MatrixVectorActivation" + assert fc3.op_type.startswith("MVAU") assert 
model.get_tensor_shape(fc3.input[0]) == [1, 64] assert model.get_tensor_shape(fc3.input[1]) == [64, 10] @@ -128,38 +140,39 @@ def test_convert_to_hls_layers_tfc_w1a1(): @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_convert_to_hls_layers_tfc_w1a2(): +def test_convert_to_hw_layers_tfc_w1a2(): tfc = get_test_model_trained("TFC", 1, 2) - bo.export_finn_onnx(tfc, (1, 1, 28, 28), export_onnx_path) + export_qonnx(tfc, torch.randn(1, 1, 28, 28), export_onnx_path) + qonnx_cleanup(export_onnx_path, out_file=export_onnx_path) model = ModelWrapper(export_onnx_path) + model = model.transform(ConvertQONNXtoFINN()) + model.save(export_onnx_path) model = model.transform(InferShapes()) model = model.transform(FoldConstants()) model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveUniqueParameterTensors()) model = model.transform(GiveReadableTensorNames()) model = model.transform(Streamline()) - from finn.transformation.fpgadataflow.convert_to_hls_layers import ( - InferQuantizedMatrixVectorActivation, - ) - - model = model.transform(InferQuantizedMatrixVectorActivation()) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + model = model.transform(SpecializeLayers()) fc0 = model.graph.node[2] - assert fc0.op_type == "MatrixVectorActivation" + assert fc0.op_type.startswith("MVAU") assert model.get_tensor_shape(fc0.input[0]) == [1, 784] assert model.get_tensor_shape(fc0.input[1]) == [784, 64] assert model.get_tensor_shape(fc0.input[2]) == [64, 2] fc1 = model.graph.node[3] - assert fc1.op_type == "MatrixVectorActivation" + assert fc1.op_type.startswith("MVAU") assert model.get_tensor_shape(fc1.input[0]) == [1, 64] assert model.get_tensor_shape(fc1.input[1]) == [64, 64] assert model.get_tensor_shape(fc1.input[2]) == [64, 2] fc2 = model.graph.node[4] - assert fc2.op_type == "MatrixVectorActivation" + assert fc2.op_type.startswith("MVAU") assert model.get_tensor_shape(fc2.input[0]) == [1, 64] assert model.get_tensor_shape(fc2.input[1]) 
== [64, 64] assert model.get_tensor_shape(fc2.input[2]) == [64, 2] fc3 = model.graph.node[5] - assert fc3.op_type == "MatrixVectorActivation" + assert fc3.op_type.startswith("MVAU") assert model.get_tensor_shape(fc3.input[0]) == [1, 64] assert model.get_tensor_shape(fc3.input[1]) == [64, 10] fc0w = getCustomOp(fc0) diff --git a/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py b/tests/fpgadataflow/test_convert_to_hw_layers_synthetic.py similarity index 78% rename from tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py rename to tests/fpgadataflow/test_convert_to_hw_layers_synthetic.py index c837a46a7c..6c83f10617 100644 --- a/tests/fpgadataflow/test_convert_to_hls_layers_synthetic.py +++ b/tests/fpgadataflow/test_convert_to_hw_layers_synthetic.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -46,10 +46,11 @@ from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.transformation.streamline.absorb import ( AbsorbConsecutiveTransposes, AbsorbScalarMulAddIntoTopK, @@ -67,7 +68,7 @@ export_onnx_path = "test_output_synthetic.onnx" # construct a synthetic graph to test: -# topk insertion, topk conversion to hls, add conversion to hls +# topk insertion, topk conversion to hw, add conversion to hw # graph should just be a sum @@ -91,21 +92,11 @@ def make_model(ch, ifmdim): add0_node = helper.make_node("Add", [inp.name, 
inp1_add0_ct.name], ["out_add0"]) add1_node = helper.make_node("Add", ["out_add0", inp1_add_ct.name], [inp1_add.name]) add2_node = helper.make_node("Add", ["out_add0", inp2_add_ct.name], [inp2_add.name]) - mul1_node = helper.make_node( - "Mul", [inp1_add.name, inp1_mul_ct.name], [inp1_mul.name] - ) - mul2_node = helper.make_node( - "Mul", [inp2_add.name, inp2_mul_ct.name], [inp2_mul.name] - ) - eltwise_add_node = helper.make_node( - "Add", [inp1_mul.name, inp2_mul.name], [eltwise_add.name] - ) - globalavgpool_node = helper.make_node( - "GlobalAveragePool", [eltwise_add.name], [pool.name] - ) - reshape_node = helper.make_node( - "Reshape", [pool.name, reshape_ct.name], [outp.name] - ) + mul1_node = helper.make_node("Mul", [inp1_add.name, inp1_mul_ct.name], [inp1_mul.name]) + mul2_node = helper.make_node("Mul", [inp2_add.name, inp2_mul_ct.name], [inp2_mul.name]) + eltwise_add_node = helper.make_node("Add", [inp1_mul.name, inp2_mul.name], [eltwise_add.name]) + globalavgpool_node = helper.make_node("GlobalAveragePool", [eltwise_add.name], [pool.name]) + reshape_node = helper.make_node("Reshape", [pool.name, reshape_ct.name], [outp.name]) graph = helper.make_graph( nodes=[ @@ -146,7 +137,7 @@ def make_model(ch, ifmdim): @pytest.mark.fpgadataflow @pytest.mark.vivado @pytest.mark.slow -def test_convert_to_hls_layers_synthetic(ch, ifmdim, idt): +def test_convert_to_hw_layers_synthetic(ch, ifmdim, idt): model = make_model(ch, ifmdim) model.save(export_onnx_path) model = ModelWrapper(export_onnx_path, fix_float64=True) @@ -155,7 +146,6 @@ def test_convert_to_hls_layers_synthetic(ch, ifmdim, idt): model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) model = model.transform(InferDataLayouts()) - # model.save("golden.onnx") # generate test vectors of correct shape if ifmdim == -1: input_tensor_shape = (1, ch) @@ -176,7 +166,7 @@ def test_convert_to_hls_layers_synthetic(ch, ifmdim, idt): model = model.transform(InferDataLayouts()) - # 
convert to hls + # convert to hw model.set_tensor_datatype(model.graph.input[0].name, idt) # extra streamlining model = model.transform(MoveScalarLinearPastInvariants()) @@ -189,35 +179,52 @@ def test_convert_to_hls_layers_synthetic(ch, ifmdim, idt): model = model.transform(InferDataLayouts()) model = model.transform(InferDataTypes()) - model = model.transform(to_hls.InferChannelwiseLinearLayer()) - model = model.transform(to_hls.InferAddStreamsLayer()) - model = model.transform(to_hls.InferGlobalAccPoolLayer()) + model = model.transform(to_hw.InferChannelwiseLinearLayer()) + model = model.transform(to_hw.InferAddStreamsLayer()) + model = model.transform(to_hw.InferGlobalAccPoolLayer()) model = model.transform(MoveScalarLinearPastInvariants()) model = model.transform(InsertTopK()) model = model.transform(AbsorbScalarMulAddIntoTopK()) model = model.transform(InferDataTypes()) - model = model.transform(to_hls.InferLabelSelectLayer()) + model = model.transform(to_hw.InferLabelSelectLayer()) model = model.transform(AbsorbConsecutiveTransposes()) model = model.transform(InferDataTypes()) - model = model.transform(to_hls.InferLabelSelectLayer()) - model = model.transform(to_hls.InferDuplicateStreamsLayer()) + model = model.transform(to_hw.InferDuplicateStreamsLayer()) model = model.transform(SortGraph()) - # model.save("golden_hls.onnx") # check topology status finn_nodes = model.get_finn_nodes() assert len(finn_nodes) == 9 - add_nodes = model.get_nodes_by_op_type("AddStreams_Batch") + add_nodes = model.get_nodes_by_op_type("AddStreams") + assert len(add_nodes) == 1 + pool_nodes = model.get_nodes_by_op_type("GlobalAccPool") + assert len(pool_nodes) == 1 + label_nodes = model.get_nodes_by_op_type("LabelSelect") + assert len(label_nodes) == 1 + channelwise_nodes = model.get_nodes_by_op_type("ChannelwiseOp") + assert len(channelwise_nodes) == 5 + dup_nodes = model.get_nodes_by_op_type("DuplicateStreams") + assert len(dup_nodes) == 1 + + output_hw = oxe.execute_onnx(model, 
input_dict, True) + + model = model.transform(SpecializeLayers()) + + # check topology status + + finn_nodes = model.get_finn_nodes() + assert len(finn_nodes) == 9 + add_nodes = model.get_nodes_by_op_type("AddStreams_hls") assert len(add_nodes) == 1 - pool_nodes = model.get_nodes_by_op_type("GlobalAccPool_Batch") + pool_nodes = model.get_nodes_by_op_type("GlobalAccPool_hls") assert len(pool_nodes) == 1 - label_nodes = model.get_nodes_by_op_type("LabelSelect_Batch") + label_nodes = model.get_nodes_by_op_type("LabelSelect_hls") assert len(label_nodes) == 1 - channelwise_nodes = model.get_nodes_by_op_type("ChannelwiseOp_Batch") + channelwise_nodes = model.get_nodes_by_op_type("ChannelwiseOp_hls") assert len(channelwise_nodes) == 5 - dup_nodes = model.get_nodes_by_op_type("DuplicateStreams_Batch") + dup_nodes = model.get_nodes_by_op_type("DuplicateStreams_hls") assert len(dup_nodes) == 1 model = model.transform(PrepareCppSim()) @@ -225,7 +232,13 @@ def test_convert_to_hls_layers_synthetic(ch, ifmdim, idt): model = model.transform(SetExecMode("cppsim")) output_dict = oxe.execute_onnx(model, input_dict, True) - produced_topk_hls = output_dict[model.graph.output[0].name] + + # verify execution + outp_name = model.graph.output[0].name + # comparison before and after layer specialization + assert (output_dict[outp_name] == output_hw[outp_name]).all() + # comparison with golden output + produced_topk_hls = output_dict[outp_name] topk_input = output_dict[model.graph.node[-1].input[0]] assert soft_verify_topk(topk_input, produced_topk_hls, 5) diff --git a/tests/fpgadataflow/test_convert_to_hls_pool_batch.py b/tests/fpgadataflow/test_convert_to_hw_pool_batch.py similarity index 80% rename from tests/fpgadataflow/test_convert_to_hls_pool_batch.py rename to tests/fpgadataflow/test_convert_to_hw_pool_batch.py index 6d628c9e53..d532cf345e 100644 --- a/tests/fpgadataflow/test_convert_to_hls_pool_batch.py +++ b/tests/fpgadataflow/test_convert_to_hw_pool_batch.py @@ -1,4 +1,4 @@ -# 
Copyright (c) 2020, Xilinx +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -38,7 +38,7 @@ from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP @@ -46,11 +46,10 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers -def make_single_maxpool_modelwrapper( - k, stride, pad, ifm_ch, ifm_dim, ofm_dim, idt, use_1d=False -): +def make_single_maxpool_modelwrapper(k, stride, pad, ifm_ch, ifm_dim, ofm_dim, idt, use_1d=False): odt = idt if use_1d: ishape = [1, ifm_ch, 1, ifm_dim] @@ -74,9 +73,7 @@ def make_single_maxpool_modelwrapper( pads=pads, strides=strides, ) - graph = helper.make_graph( - nodes=[mp_node], name="mp_graph", inputs=[inp], outputs=[outp] - ) + graph = helper.make_graph(nodes=[mp_node], name="mp_graph", inputs=[inp], outputs=[outp]) model = qonnx_make_model(graph, producer_name="mp-model") model = ModelWrapper(model) @@ -89,12 +86,8 @@ def make_single_maxpool_modelwrapper( def make_single_quantavpool_modelwrapper(k, stride, ifm_ch, ifm_dim, ofm_dim, idt, odt): - inp = helper.make_tensor_value_info( - "inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim, ifm_dim] - ) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ifm_ch, ofm_dim, ofm_dim] - ) + inp = helper.make_tensor_value_info("inp", 
TensorProto.FLOAT, [1, ifm_ch, ifm_dim, ifm_dim]) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, ifm_ch, ofm_dim, ofm_dim]) mp_node = helper.make_node( "QuantAvgPool2d", @@ -108,9 +101,7 @@ def make_single_quantavpool_modelwrapper(k, stride, ifm_ch, ifm_dim, ofm_dim, id signed=1 if idt.signed() else 0, data_layout="NCHW", ) - graph = helper.make_graph( - nodes=[mp_node], name="mp_graph", inputs=[inp], outputs=[outp] - ) + graph = helper.make_graph(nodes=[mp_node], name="mp_graph", inputs=[inp], outputs=[outp]) model = qonnx_make_model(graph, producer_name="mp-model") model = ModelWrapper(model) @@ -143,9 +134,7 @@ def prepare_inputs(input_tensor): @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_convert_to_hls_pool_batch( - idt, odt, pool_config, ifm_ch, pe, op_type, exec_mode -): +def test_convert_to_hw_pool(idt, odt, pool_config, ifm_ch, pe, op_type, exec_mode): k, stride, pad, ifm_dim = pool_config if ifm_ch % pe != 0: @@ -168,10 +157,6 @@ def test_convert_to_hls_pool_batch( # prepare input data input_dict = prepare_inputs(x) if op_type == "MaxPool": - # if idt.signed(): - # pytest.skip("""No support for signed input (see accu initialization - # in Pool_batch HLSLIB function). 
Skipping""") - if idt != odt: pytest.skip("Skipping Maxpool with idt != odt") @@ -184,24 +169,31 @@ def test_convert_to_hls_pool_batch( if idt.signed() != odt.signed(): pytest.skip("Skipping QuantAvgPool2d with idt.signed() != odt.signed()") - model = make_single_quantavpool_modelwrapper( - k, stride, ifm_ch, ifm_dim, ofm_dim, idt, odt - ) + model = make_single_quantavpool_modelwrapper(k, stride, ifm_ch, ifm_dim, ofm_dim, idt, odt) else: assert False, "{} is not a supported op_type".format(op_type) y_expected = oxe.execute_onnx(model, input_dict)["outp"] - new_model = model.transform(to_hls.InferPool_Batch()) + new_model = model.transform(to_hw.InferPool()) new_model = new_model.transform(GiveUniqueNodeNames()) + new_model = new_model.transform(to_hw.InferConvInpGen()) + # to test cppsim, set preferred_impl_style for swg to hls + inst = getCustomOp(new_model.get_nodes_by_op_type("ConvolutionInputGenerator")[0]) + inst.set_nodeattr("preferred_impl_style", "hls") + if pad != 0: + inst = getCustomOp(new_model.get_nodes_by_op_type("FMPadding")[0]) + inst.set_nodeattr("preferred_impl_style", "hls") + y_produced = oxe.execute_onnx(new_model, input_dict)["outp"] + assert (y_produced == y_expected).all() + new_model = new_model.transform(SpecializeLayers()) - new_model = new_model.transform(to_hls.InferConvInpGen()) # Folding for n in new_model.graph.node: if n.op_type.startswith("ConvolutionInputGenerator"): inst = getCustomOp(n) inst.set_nodeattr("SIMD", pe) - elif n.op_type == "Pool_Batch": + elif n.op_type.startswith("Pool"): inst = getCustomOp(n) inst.set_nodeattr("PE", pe) @@ -209,19 +201,15 @@ def test_convert_to_hls_pool_batch( if pad == 0: assert len(new_model.graph.node) == 4 assert new_model.graph.node[0].op_type == "Transpose" - assert new_model.graph.node[1].op_type.startswith( - "ConvolutionInputGenerator" - ) - assert new_model.graph.node[2].op_type == "Pool_Batch" + assert new_model.graph.node[1].op_type.startswith("ConvolutionInputGenerator") + assert 
new_model.graph.node[2].op_type.startswith("Pool") assert new_model.graph.node[3].op_type == "Transpose" else: assert len(new_model.graph.node) == 5 assert new_model.graph.node[0].op_type == "Transpose" - assert new_model.graph.node[1].op_type == "FMPadding_Batch" - assert new_model.graph.node[2].op_type.startswith( - "ConvolutionInputGenerator" - ) - assert new_model.graph.node[3].op_type == "Pool_Batch" + assert new_model.graph.node[1].op_type.startswith("FMPadding") + assert new_model.graph.node[2].op_type.startswith("ConvolutionInputGenerator") + assert new_model.graph.node[3].op_type.startswith("Pool") assert new_model.graph.node[4].op_type == "Transpose" else: # not currently converted to HLS, node stays as-is @@ -248,7 +236,7 @@ def test_convert_to_hls_pool_batch( assert (y_produced == y_expected).all() if exec_mode == "rtlsim": - node = new_model.get_nodes_by_op_type("Pool_Batch")[0] + node = new_model.get_nodes_by_op_type("Pool_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = new_model.analysis(exp_cycles_per_layer) diff --git a/tests/fpgadataflow/test_convert_to_hw_thresholding.py b/tests/fpgadataflow/test_convert_to_hw_thresholding.py new file mode 100755 index 0000000000..63cb5986e1 --- /dev/null +++ b/tests/fpgadataflow/test_convert_to_hw_thresholding.py @@ -0,0 +1,205 @@ +# Copyright (C) 2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. 
+# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import pytest + +import numpy as np +from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.multithreshold import multithreshold +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import gen_finn_dt_tensor + +import finn.core.onnx_exec as oxe +from finn.transformation.fpgadataflow.convert_to_hw_layers import InferThresholdingLayer +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers + +test_fpga_part = "xczu3eg-sbva484-1-e" +target_clk_ns = 5 + + +# Helper functions +def sort_thresholds_increasing(thresholds): + return np.sort(thresholds, axis=1) + + +def prepare_inputs(input_tensor): + return {"inp": input_tensor} + + +# n = batch, c = channel, h = height, w = width of 
feature map +# Standard = NCHW; FINN = NHWC +# Convert from NHWC(FINN) to NCHW(Standard) +def layout_FINN2NCHW(data): + return np.transpose(data, (0, 3, 1, 2)) + + +# Convert from NCHW(Standard) to NHWC(FINN) +def layout_NCHW2FINN(data): + return np.transpose(data, (0, 2, 3, 1)) + + +def generate_random_threshold_values(input_data_type, num_input_channels, num_steps): + return np.random.randint( + input_data_type.min(), + input_data_type.max() + 1, + (num_input_channels, num_steps), + ).astype(np.float32) + + +def generate_pe_value(fold, num_input_channels): + if fold == -1: + fold = num_input_channels + pe = num_input_channels // fold + assert num_input_channels % pe == 0 + return pe + + +def make_single_multithresholding_modelwrapper( + thresholds, + pe, + input_data_type, + output_data_type, + activation_bias, + num_input_vecs, +): + NumChannels = thresholds.shape[0] + + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, num_input_vecs + [NumChannels]) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, num_input_vecs + [NumChannels]) + + node_inp_list = ["inp", "thresh"] + + Multithresholding_node = helper.make_node( + "MultiThreshold", + node_inp_list, + ["outp"], + domain="qonnx.custom_op.general", + out_dtype=output_data_type.name, + out_bias=float(activation_bias), + out_scale=1.0, + ) + + graph = helper.make_graph( + nodes=[Multithresholding_node], + name="multithresholding_graph", + inputs=[inp], + outputs=[outp], + ) + + model = helper.make_model(graph, producer_name="multithresholding-model") + model = ModelWrapper(model) + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + model = model.transform(GiveUniqueNodeNames()) + + model.set_tensor_datatype("inp", input_data_type) + model.set_tensor_datatype("outp", output_data_type) + + model.set_tensor_datatype("thresh", input_data_type) + model.set_initializer("thresh", thresholds) + return model + + +# N.B. 
Fold values where C % PE != 0 fail +@pytest.mark.parametrize("activation", [DataType["INT4"], DataType["BIPOLAR"]]) +@pytest.mark.parametrize("input_data_type", [DataType["INT16"], DataType["UINT16"]]) +@pytest.mark.parametrize("fold", [-1, 1, 2, 4, 6]) +@pytest.mark.parametrize("num_input_channels", [16]) +@pytest.mark.parametrize("impl_style", ["hls", "rtl"]) +@pytest.mark.fpgadataflow +@pytest.mark.vivado +def test_convert_multithreshold_to_hardware( + impl_style, + activation, + input_data_type, + fold, + num_input_channels, +): + # Handle inputs to the test + pe = generate_pe_value(fold, num_input_channels) + num_steps = activation.get_num_possible_values() - 1 + + # Other non-input parameters + num_input_vecs = [1, 2, 2] + output_data_type = activation + if output_data_type == DataType["BIPOLAR"]: + activation_bias = 0 + else: + activation_bias = output_data_type.min() + + # Generate random thresholds and sort in ascending order + thresholds = generate_random_threshold_values(input_data_type, num_input_channels, num_steps) + + # provide non-decreasing/ascending thresholds + thresholds = sort_thresholds_increasing(thresholds) + + # Make a Multithreshold graph and convert to thresholding binary search node + model = make_single_multithresholding_modelwrapper( + thresholds, + pe, + input_data_type, + output_data_type, + activation_bias, + num_input_vecs, + ) + + model = model.transform(InferThresholdingLayer()) + + # Perform functional validation of the InferThresholdingLayer transform + x = gen_finn_dt_tensor(input_data_type, tuple(num_input_vecs + [num_input_channels])) + + x_nchw = layout_FINN2NCHW(x) + y_expected = multithreshold(x_nchw, thresholds) + + # convert back to NHWC for comparison to hw outputs + y_expected = layout_NCHW2FINN(y_expected) + if activation == DataType["BIPOLAR"]: + # binary to bipolar + y_expected = 2 * y_expected - 1 + else: + # signed offset + y_expected += activation.min() + + input_dict = prepare_inputs(x) + y_produced = 
oxe.execute_onnx(model, input_dict)["outp"] + + assert (y_produced == y_expected).all() + + # Transform to the specified implementation style, either the + # RTL or HLS according to test parameters + node = model.get_nodes_by_op_type(model.graph.node[0].op_type)[0] + inst = getCustomOp(node) + inst.set_nodeattr("preferred_impl_style", impl_style) + model = model.transform(SpecializeLayers()) + model = model.transform(InferShapes()) + assert model.graph.node[0].op_type == "Thresholding_" + str(impl_style) diff --git a/tests/fpgadataflow/test_depthwise_convolution.py b/tests/fpgadataflow/test_depthwise_convolution.py index 8ab22bcfdc..b8242df933 100644 --- a/tests/fpgadataflow/test_depthwise_convolution.py +++ b/tests/fpgadataflow/test_depthwise_convolution.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -45,7 +46,7 @@ import finn.core.onnx_exec as oxe from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim -from finn.transformation.fpgadataflow.convert_to_hls_layers import ( +from finn.transformation.fpgadataflow.convert_to_hw_layers import ( InferConvInpGen, InferVectorVectorActivation, ) @@ -54,10 +55,10 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers def set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding): - # set up reference model consisting of Im2Col + MatMul (+ MultiThreshold) ofm_ch = ifm_ch total_pad = 2 * padding @@ -84,16 +85,10 @@ def set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding): ) # set up onnx model - inp = oh.make_tensor_value_info( - "inp", 
TensorProto.FLOAT, [1, ifm_dim, ifm_dim, ifm_ch] - ) - outp = oh.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ofm_dim, ofm_dim, ofm_ch] - ) + inp = oh.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim, ifm_dim, ifm_ch]) + outp = oh.make_tensor_value_info("outp", TensorProto.FLOAT, [1, ofm_dim, ofm_dim, ofm_ch]) - W_sparse = oh.make_tensor_value_info( - "W_sparse", TensorProto.FLOAT, [ifm_ch * k * k, ofm_ch] - ) + W_sparse = oh.make_tensor_value_info("W_sparse", TensorProto.FLOAT, [ifm_ch * k * k, ofm_ch]) im2col_node = oh.make_node( "Im2Col", @@ -107,9 +102,7 @@ def set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding): depthwise=1, ) - matmul_node = oh.make_node( - "MatMul", inputs=["im2col_out", "W_sparse"], outputs=["outp"] - ) + matmul_node = oh.make_node("MatMul", inputs=["im2col_out", "W_sparse"], outputs=["outp"]) if act is None: node_list = [im2col_node, matmul_node] @@ -175,7 +168,7 @@ def set_up_reference_model(act, idt, wdt, k, ifm_dim, ifm_ch, stride, padding): @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_depthwise_conv_hls_cppsim(act, pe, k, stride, padding): +def test_depthwise_conv_hw_cppsim(act, pe, k, stride, padding): idt = wdt = DataType["INT4"] ifm_dim = 6 ifm_ch = 4 @@ -189,13 +182,14 @@ def test_depthwise_conv_hls_cppsim(act, pe, k, stride, padding): new_model = model.transform(InferConvInpGen()) new_model = new_model.transform(InferVectorVectorActivation()) - # set SIMD in ConvInputGen node and PE in VVAU node + new_model = new_model.transform(SpecializeLayers()) + # set SIMD in ConvInputGen node and PE in VVAU node for n in new_model.graph.node: - if n.op_type == "ConvolutionInputGenerator": + if n.op_type.startswith("ConvolutionInputGenerator"): convinputgen_node = getCustomOp(n) convinputgen_node.set_nodeattr("SIMD", pe) - elif n.op_type == "VectorVectorActivation": + elif n.op_type.startswith("VVAU"): vvau_node = getCustomOp(n) vvau_node.set_nodeattr("PE", pe) 
new_model = new_model.transform(SetExecMode("cppsim")) @@ -218,7 +212,7 @@ def test_depthwise_conv_hls_cppsim(act, pe, k, stride, padding): @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_depthwise_conv_hls_rtlsim(act, pe, k, stride, padding): +def test_depthwise_conv_hw_rtlsim(act, pe, k, stride, padding): idt = wdt = DataType["INT4"] ifm_dim = 6 ifm_ch = 4 @@ -232,13 +226,14 @@ def test_depthwise_conv_hls_rtlsim(act, pe, k, stride, padding): new_model = model.transform(InferConvInpGen()) new_model = new_model.transform(InferVectorVectorActivation()) - # set SIMD in ConvInputGen node and PE in VVAU node + new_model = new_model.transform(SpecializeLayers()) + # set SIMD in ConvInputGen node and PE in VVAU node for n in new_model.graph.node: - if n.op_type == "ConvolutionInputGenerator": + if n.op_type.startswith("ConvolutionInputGenerator"): convinputgen_node = getCustomOp(n) convinputgen_node.set_nodeattr("SIMD", pe) - elif n.op_type == "VectorVectorActivation": + elif n.op_type.startswith("VVAU"): vvau_node = getCustomOp(n) vvau_node.set_nodeattr("PE", pe) diff --git a/tests/fpgadataflow/test_fifosizing.py b/tests/fpgadataflow/test_fifosizing.py index f4f2b8dbff..338204c0c7 100644 --- a/tests/fpgadataflow/test_fifosizing.py +++ b/tests/fpgadataflow/test_fifosizing.py @@ -31,7 +31,8 @@ import json import shutil -from brevitas.export.onnx.generic.manager import BrevitasONNXManager +import torch +from brevitas.export import export_qonnx from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp @@ -45,7 +46,7 @@ def fetch_test_model(topology, wbits=2, abits=2): tmp_output_dir = make_build_dir("build_fifosizing_%s_" % topology) (model, ishape) = get_trained_network_and_ishape(topology, wbits, abits) chkpt_name = tmp_output_dir + "/model.onnx" - BrevitasONNXManager.export(model, ishape, chkpt_name) + export_qonnx(model, torch.randn(ishape), chkpt_name) return tmp_output_dir @@ -55,7 +56,7 @@ def 
fetch_test_model(topology, wbits=2, abits=2): @pytest.mark.parametrize( "method", ["largefifo_rtlsim_python", "largefifo_rtlsim_cpp", "characterize"] ) -@pytest.mark.parametrize("topology", ["tfc"]) +@pytest.mark.parametrize("topology", ["tfc", "cnv"]) def test_fifosizing_linear(method, topology): force_python_rtlsim = "python" in method method_key = "largefifo_rtlsim" if "largefifo_rtlsim" in method else "characterize" @@ -68,14 +69,13 @@ def test_fifosizing_linear(method, topology): force_python_rtlsim=force_python_rtlsim, synth_clk_period_ns=10.0, board="Pynq-Z1", - rtlsim_batch_size=100, + rtlsim_batch_size=100 if topology == "tfc" else 2, shell_flow_type=build_cfg.ShellFlowType.VIVADO_ZYNQ, generate_outputs=[ build_cfg.DataflowOutputType.ESTIMATE_REPORTS, build_cfg.DataflowOutputType.STITCHED_IP, build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, ], - default_mem_mode=build_cfg.ComputeEngineMemMode.DECOUPLED, ) build.build_dataflow_cfg(tmp_output_dir + "/model.onnx", cfg) with open(tmp_output_dir + "/report/estimate_network_performance.json") as f: @@ -83,8 +83,7 @@ def test_fifosizing_linear(method, topology): with open(tmp_output_dir + "/report/rtlsim_performance.json") as f: sim_data = json.load(f) assert ( - float(sim_data["throughput[images/s]"]) - / float(est_data["estimated_throughput_fps"]) + float(sim_data["stable_throughput[images/s]"]) / float(est_data["estimated_throughput_fps"]) > 0.9 ) # now run the same build using the generated folding and FIFO config @@ -97,12 +96,8 @@ def test_fifosizing_linear(method, topology): cfg_cmp.folding_config_file = tmp_output_dir + "/final_hw_config.json" build.build_dataflow_cfg(tmp_output_dir_cmp + "/model.onnx", cfg_cmp) - model0 = ModelWrapper( - tmp_output_dir + "/intermediate_models/step_create_stitched_ip.onnx" - ) - model1 = ModelWrapper( - tmp_output_dir_cmp + "/intermediate_models/step_create_stitched_ip.onnx" - ) + model0 = ModelWrapper(tmp_output_dir + "/intermediate_models/step_create_stitched_ip.onnx") 
+ model1 = ModelWrapper(tmp_output_dir_cmp + "/intermediate_models/step_create_stitched_ip.onnx") assert len(model0.graph.node) == len(model1.graph.node) for i in range(len(model0.graph.node)): diff --git a/tests/fpgadataflow/test_fpgadataflow_addstreams.py b/tests/fpgadataflow/test_fpgadataflow_addstreams.py index 1ad2c26610..530d94e13b 100644 --- a/tests/fpgadataflow/test_fpgadataflow_addstreams.py +++ b/tests/fpgadataflow/test_fpgadataflow_addstreams.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -44,6 +44,7 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers def make_addstreams_modelwrapper(ch, pe, idt): @@ -52,7 +53,7 @@ def make_addstreams_modelwrapper(ch, pe, idt): outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, ch]) addstreams_node = helper.make_node( - "AddStreams_Batch", + "AddStreams", ["inp1", "inp2"], ["outp"], domain="finn.custom_op.fpgadataflow", @@ -60,6 +61,7 @@ def make_addstreams_modelwrapper(ch, pe, idt): NumChannels=ch, PE=pe, inputDataType=idt.name, + preferred_impl_style="hls", ) graph = helper.make_graph( nodes=[addstreams_node], @@ -104,6 +106,18 @@ def test_fpgadataflow_addstreams(idt, ch, fold, exec_mode): model = make_addstreams_modelwrapper(ch, pe, idt) + # prepare input data + input_dict = prepare_inputs(x1, x2) + oshape = model.get_tensor_shape("outp") + y = x1 + x2 + y_expected = y.reshape(oshape) + + # test verification flow before specializing layer + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + assert (y_produced == y_expected).all(), "Execution of hw layer failed" + + model = 
model.transform(SpecializeLayers()) + if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) model = model.transform(CompileCppSim()) @@ -117,12 +131,6 @@ def test_fpgadataflow_addstreams(idt, ch, fold, exec_mode): else: raise Exception("Unknown exec_mode") - # prepare input data - input_dict = prepare_inputs(x1, x2) - - oshape = model.get_tensor_shape("outp") - y = x1 + x2 - y_expected = y.reshape(oshape) # execute model y_produced = oxe.execute_onnx(model, input_dict)["outp"] y_produced = y_produced.reshape(y_expected.shape) @@ -130,7 +138,7 @@ def test_fpgadataflow_addstreams(idt, ch, fold, exec_mode): assert (y_produced == y_expected).all(), exec_mode + " failed" if exec_mode == "rtlsim": - node = model.get_nodes_by_op_type("AddStreams_Batch")[0] + node = model.get_nodes_by_op_type("AddStreams_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) diff --git a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py index 13fab9a47f..d5fa7c779f 100644 --- a/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py +++ b/tests/fpgadataflow/test_fpgadataflow_channelwise_ops.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020-2022, Xilinx, Inc. +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -45,20 +46,19 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers def make_modelwrapper(C, pe, idt, odt, pdt, func, vecs): NumChannels = C.shape[0] inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, vecs + [NumChannels]) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, vecs + [NumChannels] - ) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, vecs + [NumChannels]) node_inp_list = ["inp", "const"] node = helper.make_node( - "ChannelwiseOp_Batch", + "ChannelwiseOp", node_inp_list, ["outp"], domain="finn.custom_op.fpgadataflow", @@ -70,6 +70,7 @@ def make_modelwrapper(C, pe, idt, odt, pdt, func, vecs): outputDataType=odt.name, paramDataType=pdt.name, numInputVectors=vecs, + preferred_impl_style="hls", ) graph = helper.make_graph(nodes=[node], name="graph", inputs=[inp], outputs=[outp]) @@ -111,13 +112,35 @@ def test_fpgadataflow_channelwise_ops(idt, act, pdt, nf, ich, func, vecs, exec_m # generate input and param data x = gen_finn_dt_tensor(idt, tuple(vecs + [ich])) - # C = np.random.randint(idt.min(), idt.max() + 1, ich).astype(np.float32) C = gen_finn_dt_tensor(pdt, (ich)) odt = act + # create model model = make_modelwrapper(C, pe, idt, odt, pdt, func, vecs) + # package input data as dictionary + input_dict = {"inp": x} + + oshape = model.get_tensor_shape("outp") + + C_reshaped = np.broadcast_to(C.flatten(), x.shape) + if func == "add": + y = x + C_reshaped + elif func == "mul": + y = x * C_reshaped + + y_expected = y.reshape(oshape) + + # verify hw abstraction layer + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + + y_produced = y_produced.reshape(y_expected.shape) + + assert (y_produced == 
y_expected).all(), "HW layer execution failed" + + model = model.transform(SpecializeLayers()) + if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) model = model.transform(CompileCppSim()) @@ -131,30 +154,18 @@ def test_fpgadataflow_channelwise_ops(idt, act, pdt, nf, ich, func, vecs, exec_m else: raise Exception("Unknown exec_mode") - # package input data as dictionary - input_dict = {"inp": x} - - oshape = model.get_tensor_shape("outp") - - C_reshaped = np.broadcast_to(C.flatten(), x.shape) - if func == "add": - y = x + C_reshaped - elif func == "mul": - y = x * C_reshaped - - y_expected = y.reshape(oshape) # execute model y_produced = oxe.execute_onnx(model, input_dict)["outp"] y_produced = y_produced.reshape(y_expected.shape) - assert (y_produced == y_expected).all(), "cppsim failed" + assert (y_produced == y_expected).all(), exec_mode + " failed" if exec_mode == "rtlsim": hls_synt_res_est = model.analysis(hls_synth_res_estimation) - assert "ChannelwiseOp_Batch_0" in hls_synt_res_est + assert "ChannelwiseOp_hls_0" in hls_synt_res_est - node = model.get_nodes_by_op_type("ChannelwiseOp_Batch")[0] + node = model.get_nodes_by_op_type("ChannelwiseOp_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) diff --git a/tests/fpgadataflow/test_fpgadataflow_checksum.py b/tests/fpgadataflow/test_fpgadataflow_checksum.py index cd404f5a63..34a48996c9 100644 --- a/tests/fpgadataflow/test_fpgadataflow_checksum.py +++ b/tests/fpgadataflow/test_fpgadataflow_checksum.py @@ -1,4 +1,5 @@ # Copyright (c) 2022, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -48,6 +49,7 @@ from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers test_fpga_part = "xczu3eg-sbva484-1-e" target_clk_ns = 5 @@ -70,10 +72,10 @@ def create_two_fc_model(): outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, m]) fc0 = helper.make_node( - "MatrixVectorActivation", + "MVAU_hls", ["inp", "w0"], ["mid"], - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", MW=m, MH=m, @@ -85,14 +87,14 @@ def create_two_fc_model(): ActVal=actval, binaryXnorMode=binary_xnor_mode, noActivation=no_act, - mem_mode="decoupled", + mem_mode="internal_decoupled", ) fc1 = helper.make_node( - "MatrixVectorActivation", + "MVAU_hls", ["mid", "w1"], ["outp"], - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", MW=m, MH=m, @@ -104,7 +106,7 @@ def create_two_fc_model(): ActVal=actval, binaryXnorMode=binary_xnor_mode, noActivation=no_act, - mem_mode="decoupled", + mem_mode="internal_decoupled", ) graph = helper.make_graph( @@ -151,7 +153,7 @@ def test_fpgadataflow_checksum(): model = model.transform(InferShapes()) assert ( - len(model.get_nodes_by_op_type("CheckSum")) == 2 + len(model.get_nodes_by_op_type("CheckSum_hls")) == 2 ), """Insertion of checksum layers was unsuccessful""" @@ -166,14 +168,15 @@ def test_fpgadataflow_checksum(): model = model.transform(CompileCppSim()) inp = {"global_in": x} y_cppsim = oxe.execute_onnx(model, inp, return_full_exec_context=True) - checksum0_cppsim = y_cppsim["CheckSum_0_out1"] - checksum1_cppsim = y_cppsim["CheckSum_1_out1"] + checksum0_cppsim = y_cppsim["CheckSum_hls_0_out1"] + checksum1_cppsim = 
y_cppsim["CheckSum_hls_1_out1"] # in this test case scenario the checksums are equal assert checksum0_cppsim == checksum1_cppsim, "CheckSums are not equal" # rtlsim model = model.transform(InsertFIFO(True)) + model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) @@ -187,7 +190,7 @@ def test_fpgadataflow_checksum(): def read_checksum_and_drain(sim): chk_addr = 16 drain_addr = 32 - for i in range(len(model.get_nodes_by_op_type("CheckSum"))): + for i in range(len(model.get_nodes_by_op_type("CheckSum_hls"))): axi_name = "s_axi_checksum_{}_".format(i) checksums.append(axilite_read(sim, chk_addr, basename=axi_name)) drain.append(axilite_read(sim, drain_addr, basename=axi_name)) @@ -196,7 +199,7 @@ def read_checksum_and_drain(sim): def write_drain(sim): addr = 32 - for i in range(len(model.get_nodes_by_op_type("CheckSum"))): + for i in range(len(model.get_nodes_by_op_type("CheckSum_hls"))): axi_name = "s_axi_checksum_{}_".format(i) axilite_write(sim, addr, drain_value, basename=axi_name) @@ -215,11 +218,7 @@ def write_drain(sim): ), """The second checksums do not match in cppsim vs. 
rtlsim""" - assert ( - checksum0_drain == 0 - ), "Drain read doesn't match drain write for first checksum" - assert ( - checksum1_drain == 0 - ), "Drain read doesn't match drain write for second checksum" + assert checksum0_drain == 0, "Drain read doesn't match drain write for first checksum" + assert checksum1_drain == 0, "Drain read doesn't match drain write for second checksum" # TODO: test for drain set to true diff --git a/tests/fpgadataflow/test_fpgadataflow_concat.py b/tests/fpgadataflow/test_fpgadataflow_concat.py index 5fff286e54..b52b14fca3 100644 --- a/tests/fpgadataflow/test_fpgadataflow_concat.py +++ b/tests/fpgadataflow/test_fpgadataflow_concat.py @@ -1,4 +1,5 @@ # Copyright (c) 2021, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -40,7 +41,7 @@ from finn.core.onnx_exec import execute_onnx from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim -from finn.transformation.fpgadataflow.convert_to_hls_layers import InferConcatLayer +from finn.transformation.fpgadataflow.convert_to_hw_layers import InferConcatLayer from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO @@ -48,6 +49,7 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers def make_concat_model(i_shapes, idt): @@ -90,11 +92,17 @@ def test_fpgadataflow_concat(exec_mode, idt): inp_dict[model.graph.input[i].name] = i_data[i] ret = execute_onnx(model, inp_dict) assert (ret[oname] == exp_out).all() - # call transformation to convert to HLS and verify conversion + # call 
transformation to convert to HW and verify conversion model = model.transform(InferConcatLayer()) assert model.graph.node[0].op_type == "StreamingConcat" assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow" + ret = execute_onnx(model, inp_dict) + assert (ret[oname] == exp_out).all() + model = model.transform(SpecializeLayers()) + assert model.graph.node[0].op_type == "StreamingConcat_hls" + assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow.hls" if exec_mode == "cppsim": + model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareCppSim()) model = model.transform(CompileCppSim()) model = model.transform(SetExecMode("cppsim")) @@ -129,11 +137,15 @@ def test_fpgadataflow_concat_stitchedip(): inp_dict[model.graph.input[i].name] = i_data[i] ret = execute_onnx(model, inp_dict) assert (ret[oname] == exp_out).all() - # call transformation to convert to HLS and verify conversion + # call transformation to convert to HW and verify conversion model = model.transform(InferConcatLayer()) assert model.graph.node[0].op_type == "StreamingConcat" assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow" + model = model.transform(SpecializeLayers()) + assert model.graph.node[0].op_type == "StreamingConcat_hls" + assert model.graph.node[0].domain == "finn.custom_op.fpgadataflow.hls" model = model.transform(InsertFIFO(create_shallow_fifos=True)) + model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(fpga_part, clk_ns)) model = model.transform(HLSSynthIP()) diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py index 3cfff9ac34..45ca74fbea 100644 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020-2022, Xilinx +# Copyright (C) 2023-2024, 
Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -32,11 +33,13 @@ from onnx import TensorProto, helper from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.im2col import compute_conv_output_dim from qonnx.custom_op.registry import getCustomOp from qonnx.transformation.general import GiveUniqueNodeNames from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP @@ -44,30 +47,34 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers -def make_single_im2col_modelwrapper( - k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt -): +def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw): + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + ofm_dim_h, ofm_dim_w = ofm_dim + odt = idt - inp = helper.make_tensor_value_info( - "inp", TensorProto.FLOAT, [1, ifm_dim, ifm_dim, ifm_ch] - ) + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]) outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ofm_dim, ofm_dim, k * k * ifm_ch] + "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch] ) im2col_node = helper.make_node( "Im2Col", ["inp"], ["outp"], - domain="qonnx.custom_op.general", - 
stride=[stride, stride], - kernel_size=[k, k], - input_shape=str((1, ifm_dim, ifm_dim, ifm_ch)), + domain="finn.custom_op.general", + stride=[stride_h, stride_w], + kernel_size=[k_h, k_w], + input_shape=str((1, ifm_dim_h, ifm_dim_w, ifm_ch)), + dilations=[dilation_h, dilation_w], pad_amount=[0, 0, 0, 0], pad_value=0, - dilations=[dilation, dilation], + depthwise=dw, ) graph = helper.make_graph( nodes=[im2col_node], name="im2col_graph", inputs=[inp], outputs=[outp] @@ -82,85 +89,117 @@ def make_single_im2col_modelwrapper( return model -def make_single_slidingwindow_modelwrapper( - k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt, dw=0 -): - odt = idt - inp = helper.make_tensor_value_info( - "inp", TensorProto.FLOAT, [1, ifm_dim, ifm_dim, ifm_ch] - ) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ofm_dim, ofm_dim, k * k * ifm_ch] - ) - - SlidingWindow_node = helper.make_node( - "ConvolutionInputGenerator", - ["inp"], - ["outp"], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - ConvKernelDim=[k, k], - IFMChannels=ifm_ch, - IFMDim=[ifm_dim, ifm_dim], - OFMDim=[ofm_dim, ofm_dim], - SIMD=simd, - Stride=[stride, stride], - Dilation=[dilation, dilation], - inputDataType=idt.name, - outputDataType=odt.name, - depthwise=dw, - ) - graph = helper.make_graph( - nodes=[SlidingWindow_node], - name="slidingwindow_graph", - inputs=[inp], - outputs=[outp], - ) - - model = qonnx_make_model(graph, producer_name="slidingwindow-model") - model = ModelWrapper(model) - - model.set_tensor_datatype("inp", idt) - model.set_tensor_datatype("outp", odt) - - return model - - def prepare_inputs(input_tensor): return {"inp": input_tensor} # input datatype -@pytest.mark.parametrize("idt", [DataType["BIPOLAR"], DataType["INT2"]]) +@pytest.mark.parametrize("idt", [DataType["INT2"], DataType["UINT4"]]) # kernel size -@pytest.mark.parametrize("k", [2, 3]) +@pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 5]]) # input dimension 
-@pytest.mark.parametrize("ifm_dim", [6, 8]) +@pytest.mark.parametrize("ifm_dim", [[8, 8], [1, 21]]) # input channels @pytest.mark.parametrize("ifm_ch", [2, 4]) # Stride -@pytest.mark.parametrize("stride", [1, 2]) +@pytest.mark.parametrize("stride", [[1, 1], [2, 2], [2, 1]]) # Dilation -# Currently only dilation value of 1 is supported -@pytest.mark.parametrize("dilation", [1]) +@pytest.mark.parametrize("dilation", [[1, 1], [2, 2], [2, 1]]) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) # input channel parallelism ("SIMD") -@pytest.mark.parametrize("simd", [1, 2]) +@pytest.mark.parametrize("simd", [1, 2, 4]) # depthwise @pytest.mark.parametrize("dw", [0, 1]) +# parallel_window enable (MMV_out = M*K) +@pytest.mark.parametrize("parallel_window", [0, 1]) +# in/out MMV ("M") +@pytest.mark.parametrize("m", [1]) +# Flip dimensions +@pytest.mark.parametrize("flip", [False]) +# implementation style +@pytest.mark.parametrize("impl_style", ["rtl", "hls"]) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado def test_fpgadataflow_slidingwindow( - idt, k, ifm_dim, ifm_ch, stride, dilation, exec_mode, simd, dw + idt, + k, + ifm_dim, + ifm_ch, + stride, + dilation, + exec_mode, + simd, + dw, + parallel_window, + m, + flip, + impl_style, ): - ofm_dim = int(((ifm_dim - k) / stride) + 1) + if flip: + if ( + ifm_dim[0] == ifm_dim[1] + and k[0] == k[1] + and stride[0] == stride[1] + and dilation[0] == dilation[1] + ): + pytest.skip("Dimension flip would have no effect") + k = k[::-1] + ifm_dim = ifm_dim[::-1] + stride = stride[::-1] + dilation = dilation[::-1] + + k_h, k_w = k + ifm_dim_h, ifm_dim_w = ifm_dim + stride_h, stride_w = stride + dilation_h, dilation_w = dilation + + kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation + kernel_height = (k_h - 1) * dilation_h + 1 # incl. 
dilation + + if simd > ifm_ch: + pytest.skip("SIMD cannot be larger than number of input channels") + if ifm_ch % simd != 0: + pytest.skip("SIMD must divide number of input channels") + if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") + if (k_h == 1 and dilation_h != 1) or (k_w == 1 and dilation_w != 1): + pytest.skip("Illegal convolution configuration: dilation for unitary kernel dim") + if ((stride_h > k_h) or (stride_w > k_w)) and not (parallel_window or (k_h == 1 and k_w == 1)): + pytest.skip("Not all combinations for stride > k edge case supported in default mode") + if parallel_window and simd != ifm_ch and not (dw or (k_h == 1 and k_w == 1)): + pytest.skip("Parallel window requires SIMD=C for non-depthwise case") + + ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) + ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) + ofm_dim = [ofm_dim_h, ofm_dim_w] + + x = gen_finn_dt_tensor(idt, (1, ifm_dim_h, ifm_dim_w, ifm_ch)) + # prepare input data + input_dict = prepare_inputs(x) + model = make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt, dw) + y_expected = oxe.execute_onnx(model, input_dict)["outp"] - x = gen_finn_dt_tensor(idt, (1, ifm_dim, ifm_dim, ifm_ch)) - model = make_single_slidingwindow_modelwrapper( - k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt, dw - ) + model = model.transform(to_hw.InferConvInpGen()) + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + assert (y_produced == y_expected).all() + # set impl_style + inst = getCustomOp(model.get_nodes_by_op_type("ConvolutionInputGenerator")[0]) + inst.set_nodeattr("preferred_impl_style", impl_style) + model = model.transform(SpecializeLayers()) + # set simd + inst = 
getCustomOp(model.graph.node[0]) + inst.set_nodeattr("SIMD", simd) + optype = model.graph.node[0].op_type + if optype == "ConvolutionInputGenerator_rtl": + inst.set_nodeattr("parallel_window", parallel_window) + inst.set_nodeattr("M", m) + if optype == "ConvolutionInputGenerator_hls": + if inst.get_nodeattr("is1D"): + inst.set_nodeattr("parallel_window", parallel_window) if exec_mode == "cppsim": model = model.transform(SetExecMode("cppsim")) @@ -175,30 +214,26 @@ def test_fpgadataflow_slidingwindow( else: raise Exception("Unknown exec_mode in test_fpgadataflow_slidingwindow") - # prepare input data - input_dict = prepare_inputs(x) # execute model y_produced = oxe.execute_onnx(model, input_dict)["outp"] - golden = make_single_im2col_modelwrapper( - k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt - ) - y_expected = oxe.execute_onnx(golden, input_dict)["outp"] if dw == 0: assert (y_produced == y_expected).all() else: - y_expected = y_expected.reshape( - 1, ofm_dim, ofm_dim, k * k, ifm_ch // simd, simd - ) + y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, k_h * k_w, ifm_ch // simd, simd) y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5) - y_expected = y_expected.reshape(1, ofm_dim, ofm_dim, ifm_ch * k * k) + y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, ifm_ch * k_h * k_w) assert (y_produced == y_expected).all() - if exec_mode == "rtlsim": - node = model.get_nodes_by_op_type("ConvolutionInputGenerator")[0] - inst = getCustomOp(node) - cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") - exp_cycles_dict = model.analysis(exp_cycles_per_layer) - exp_cycles = exp_cycles_dict[node.name] - assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) - assert exp_cycles != 0 + if exec_mode == "rtlsim" and impl_style == "hls": + nodes = model.get_nodes_by_op_type("ConvolutionInputGenerator_hls") + if nodes: + node = nodes[0] + inst = getCustomOp(node) + cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") + exp_cycles_dict = 
model.analysis(exp_cycles_per_layer) + exp_cycles = exp_cycles_dict[node.name] + assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) + assert exp_cycles != 0 + else: + assert model.graph.node[0].op_type == "ConvolutionInputGenerator_rtl" diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py deleted file mode 100644 index f467f37618..0000000000 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator1d.py +++ /dev/null @@ -1,276 +0,0 @@ -# Copyright (c) 2020, Xilinx -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import pytest - -import numpy as np -from onnx import TensorProto, helper -from qonnx.core.datatype import DataType -from qonnx.core.modelwrapper import ModelWrapper -from qonnx.custom_op.general.im2col import compute_conv_output_dim -from qonnx.custom_op.registry import getCustomOp -from qonnx.transformation.general import GiveUniqueNodeNames -from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model - -import finn.core.onnx_exec as oxe -from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer -from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim -from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP -from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim -from finn.transformation.fpgadataflow.prepare_ip import PrepareIP -from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim -from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode - -fpga_part = "xczu3eg-sbva484-1-e" - - -def make_single_im2col_modelwrapper( - k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt -): - k_h, k_w = k - ifm_dim_h, ifm_dim_w = ifm_dim - stride_h, stride_w = stride - dilation_h, dilation_w = dilation - ofm_dim_h, ofm_dim_w = ofm_dim - - odt = idt - inp = helper.make_tensor_value_info( - "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch] - ) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ofm_dim_h, 
ofm_dim_w, k_h * k_w * ifm_ch] - ) - - im2col_node = helper.make_node( - "Im2Col", - ["inp"], - ["outp"], - domain="qonnx.custom_op.general", - stride=[stride_h, stride_w], - kernel_size=[k_h, k_w], - input_shape=str((1, ifm_dim_h, ifm_dim_w, ifm_ch)), - dilations=[dilation_h, dilation_w], - pad_amount=[0, 0, 0, 0], - pad_value=0, - ) - graph = helper.make_graph( - nodes=[im2col_node], name="im2col_graph", inputs=[inp], outputs=[outp] - ) - - model = qonnx_make_model(graph, producer_name="im2col-model") - model = ModelWrapper(model) - - model.set_tensor_datatype("inp", idt) - model.set_tensor_datatype("outp", odt) - - return model - - -def make_single_slidingwindow_modelwrapper( - k, ifm_ch, ifm_dim, ofm_dim, simd, stride, dilation, idt, parallel_window, dw=0 -): - k_h, k_w = k - ifm_dim_h, ifm_dim_w = ifm_dim - stride_h, stride_w = stride - dilation_h, dilation_w = dilation - ofm_dim_h, ofm_dim_w = ofm_dim - - odt = idt - inp = helper.make_tensor_value_info( - "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch] - ) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch] - ) - - SlidingWindow_node = helper.make_node( - "ConvolutionInputGenerator1D", - ["inp"], - ["outp"], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - ConvKernelDim=[k_h, k_w], - IFMChannels=ifm_ch, - IFMDim=[ifm_dim_h, ifm_dim_w], - OFMDim=[ofm_dim_h, ofm_dim_w], - SIMD=simd, - Stride=[stride_h, stride_w], - Dilation=[dilation_h, dilation_w], - inputDataType=idt.name, - outputDataType=odt.name, - depthwise=dw, - parallel_window=parallel_window, - ) - graph = helper.make_graph( - nodes=[SlidingWindow_node], - name="slidingwindow_graph", - inputs=[inp], - outputs=[outp], - ) - - model = qonnx_make_model(graph, producer_name="slidingwindow-model") - model = ModelWrapper(model) - - model.set_tensor_datatype("inp", idt) - model.set_tensor_datatype("outp", odt) - - return model - - -def prepare_inputs(input_tensor): - 
return {"inp": input_tensor} - - -# input datatype -# @pytest.mark.parametrize("idt", [DataType["BIPOLAR"], DataType["INT8"]]) -@pytest.mark.parametrize("idt", [DataType["INT8"]]) -# kernel size -@pytest.mark.parametrize("k", [[4, 1]]) -# input dimension -@pytest.mark.parametrize("ifm_dim", [[10, 1]]) -# input channels -@pytest.mark.parametrize("ifm_ch", [1, 4]) -# Stride -@pytest.mark.parametrize("stride", [[1, 1], [2, 1]]) -# Dilation -@pytest.mark.parametrize("dilation", [[1, 1], [2, 1]]) -# execution mode -@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) -# input channel parallelism ("SIMD") -@pytest.mark.parametrize("simd", [1, 4]) -# depthwise -@pytest.mark.parametrize("dw", [0, 1]) -# Flip dimensions -@pytest.mark.parametrize("flip", [False, True]) -# Use parallel window output variant -@pytest.mark.parametrize("parallel_window", [False, True]) -@pytest.mark.fpgadataflow -@pytest.mark.slow -@pytest.mark.vivado -def test_fpgadataflow_slidingwindow_1d( - idt, - k, - ifm_dim, - ifm_ch, - stride, - dilation, - exec_mode, - simd, - dw, - flip, - parallel_window, -): - if flip: - k = k[::-1] - ifm_dim = ifm_dim[::-1] - stride = stride[::-1] - dilation = dilation[::-1] - - k_h, k_w = k - ifm_dim_h, ifm_dim_w = ifm_dim - stride_h, stride_w = stride - dilation_h, dilation_w = dilation - - if (dilation_h > 1 or dilation_w > 1) and (stride_h > 1 or stride_w > 1): - pytest.skip( - """Dilation value greater than 1 and stride greater than 1 - currently not supported for 1D convolutions""" - ) - if (dilation_h > 1 or dilation_w > 1) and dw == 0: - pytest.skip( - """Dilation value greater than 1 currently not supported - for non-dws 1D convolutions""" - ) - if simd > ifm_ch: - pytest.skip("SIMD cannot be larger than number of input channels") - - ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) - ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) - ofm_dim = [ofm_dim_h, ofm_dim_w] - - x = 
gen_finn_dt_tensor(idt, (1, ifm_dim_h, ifm_dim_w, ifm_ch)) - model = make_single_slidingwindow_modelwrapper( - k=k, - ifm_ch=ifm_ch, - ifm_dim=ifm_dim, - ofm_dim=ofm_dim, - simd=simd, - stride=stride, - dilation=dilation, - idt=idt, - parallel_window=parallel_window, - dw=dw, - ) - - if exec_mode == "cppsim": - model = model.transform(SetExecMode("cppsim")) - model = model.transform(PrepareCppSim()) - model = model.transform(CompileCppSim()) - elif exec_mode == "rtlsim": - model = model.transform(SetExecMode("rtlsim")) - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareIP(fpga_part, 5)) - model = model.transform(HLSSynthIP()) - model = model.transform(PrepareRTLSim()) - else: - raise Exception("Unknown exec_mode in test_fpgadataflow_slidingwindow") - - # prepare input data - input_dict = prepare_inputs(x) - # execute model - y_produced = oxe.execute_onnx(model, input_dict)["outp"] - golden = make_single_im2col_modelwrapper( - k=k, - ifm_ch=ifm_ch, - ifm_dim=ifm_dim, - ofm_dim=ofm_dim, - simd=simd, - stride=stride, - dilation=dilation, - idt=idt, - ) - y_expected = oxe.execute_onnx(golden, input_dict)["outp"] - - if dw == 0: - assert (y_produced == y_expected).all() - else: - y_expected = y_expected.reshape( - 1, ofm_dim_h, ofm_dim_w, k_h * k_w, ifm_ch // simd, simd - ) - y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5) - y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, ifm_ch * k_h * k_w) - assert (y_produced == y_expected).all() - - if exec_mode == "rtlsim": - node = model.get_nodes_by_op_type("ConvolutionInputGenerator1D")[0] - inst = getCustomOp(node) - cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") - exp_cycles_dict = model.analysis(exp_cycles_per_layer) - exp_cycles = exp_cycles_dict[node.name] - assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) - assert exp_cycles != 0 diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py 
deleted file mode 100755 index 58fc5ec04c..0000000000 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl.py +++ /dev/null @@ -1,260 +0,0 @@ -# Copyright (C) 2022, Advanced Micro Devices, Inc. -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * Neither the name of FINN nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import pytest - -from onnx import TensorProto, helper -from qonnx.core.datatype import DataType -from qonnx.core.modelwrapper import ModelWrapper -from qonnx.custom_op.general.im2col import compute_conv_output_dim -from qonnx.transformation.general import GiveUniqueNodeNames -from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model - -import finn.core.onnx_exec as oxe -from finn.transformation.fpgadataflow.prepare_ip import PrepareIP -from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim -from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode - - -def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilation, idt): - k_h, k_w = k - ifm_dim_h, ifm_dim_w = ifm_dim - stride_h, stride_w = stride - dilation_h, dilation_w = dilation - ofm_dim_h, ofm_dim_w = ofm_dim - - odt = idt - inp = helper.make_tensor_value_info( - "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch] - ) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch] - ) - - im2col_node = helper.make_node( - "Im2Col", - ["inp"], - ["outp"], - domain="finn.custom_op.general", - stride=[stride_h, stride_w], - kernel_size=[k_h, k_w], - input_shape=str((1, ifm_dim_h, ifm_dim_w, ifm_ch)), - dilations=[dilation_h, dilation_w], - pad_amount=[0, 0, 0, 0], - pad_value=0, - ) - graph = helper.make_graph( - nodes=[im2col_node], name="im2col_graph", inputs=[inp], outputs=[outp] - ) - - model = qonnx_make_model(graph, producer_name="im2col-model") - model = ModelWrapper(model) - - model.set_tensor_datatype("inp", idt) - model.set_tensor_datatype("outp", odt) - - return model - - -def make_single_slidingwindow_modelwrapper( - k, ifm_ch, ifm_dim, ofm_dim, simd, m, parallel_window, stride, dilation, idt, dw=0 -): - k_h, k_w = k - ifm_dim_h, ifm_dim_w = ifm_dim - stride_h, stride_w = stride - dilation_h, dilation_w = dilation - ofm_dim_h, ofm_dim_w = ofm_dim - - odt = idt - inp = 
helper.make_tensor_value_info( - "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch] - ) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch] - ) - - SlidingWindow_node = helper.make_node( - "ConvolutionInputGenerator_rtl", - ["inp"], - ["outp"], - domain="finn.custom_op.fpgadataflow", - backend="fpgadataflow", - ConvKernelDim=[k_h, k_w], - IFMChannels=ifm_ch, - IFMDim=[ifm_dim_h, ifm_dim_w], - OFMDim=[ofm_dim_h, ofm_dim_w], - SIMD=simd, - M=m, - parallel_window=parallel_window, - Stride=[stride_h, stride_w], - Dilation=[dilation_h, dilation_w], - inputDataType=idt.name, - outputDataType=odt.name, - depthwise=dw, - ) - graph = helper.make_graph( - nodes=[SlidingWindow_node], - name="slidingwindow_graph", - inputs=[inp], - outputs=[outp], - ) - - model = qonnx_make_model(graph, producer_name="slidingwindow-model") - model = ModelWrapper(model) - - model.set_tensor_datatype("inp", idt) - model.set_tensor_datatype("outp", odt) - - return model - - -def prepare_inputs(input_tensor): - return {"inp": input_tensor} - - -# input datatype -@pytest.mark.parametrize("idt", [DataType["UINT4"]]) -# kernel size -@pytest.mark.parametrize("k", [[2, 2], [3, 3], [1, 3]]) -# input dimension -@pytest.mark.parametrize("ifm_dim", [[24, 24], [15, 6], [13, 13], [1, 14]]) -# input channels -@pytest.mark.parametrize("ifm_ch", [6]) -# Stride -@pytest.mark.parametrize("stride", [[1, 1], [2, 2]]) -# Dilation -@pytest.mark.parametrize("dilation", [[1, 1], [2, 2]]) -# depthwise -@pytest.mark.parametrize("dw", [0, 1]) -# input channel parallelism ("SIMD") -@pytest.mark.parametrize("simd", [1, 2, 3, 6]) -# parallel_window enable (MMV_out = M*K) -@pytest.mark.parametrize("parallel_window", [0]) -# in/out MMV ("M") -@pytest.mark.parametrize("m", [1]) -# Flip dimensions -@pytest.mark.parametrize("flip", [False]) -@pytest.mark.slow -@pytest.mark.vivado -@pytest.mark.fpgadataflow -def test_fpgadataflow_slidingwindow_rtl( - idt, k, 
ifm_dim, ifm_ch, stride, dilation, dw, simd, m, parallel_window, flip -): - if flip: - if ( - ifm_dim[0] == ifm_dim[1] - and k[0] == k[1] - and stride[0] == stride[1] - and dilation[0] == dilation[1] - ): - pytest.skip("Dimension flip would have no effect") - k = k[::-1] - ifm_dim = ifm_dim[::-1] - stride = stride[::-1] - dilation = dilation[::-1] - - k_h, k_w = k - ifm_dim_h, ifm_dim_w = ifm_dim - stride_h, stride_w = stride - dilation_h, dilation_w = dilation - - kernel_width = (k_w - 1) * dilation_w + 1 # incl. dilation - kernel_height = (k_h - 1) * dilation_h + 1 # incl. dilation - - if simd > ifm_ch: - pytest.skip("SIMD cannot be larger than number of input channels") - if ifm_ch % simd != 0: - pytest.skip("SIMD must divide number of input channels") - if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: - pytest.skip( - "Illegal convolution configuration: kernel or stride > FM dimension" - ) - if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: - pytest.skip( - "Illegal convolution configuration: kernel or stride > FM dimension" - ) - if (k_h == 1 and (stride_h != 1 or dilation_h != 1)) or ( - k_w == 1 and (stride_w != 1 or dilation_w != 1) - ): - pytest.skip( - """Illegal convolution configuration: - stride or dilation defined for unitary kernel dim""" - ) - if k_h == 1 and k_w == 1 and simd != ifm_ch: - pytest.skip("1x1 Kernel only supported in parallel mode (SIMD=C)") - if parallel_window and simd != ifm_ch: - pytest.skip("Parallel window requires SIMD=C") - - ofm_dim_h = compute_conv_output_dim(ifm_dim_h, k_h, stride_h, 0, dilation_h) - ofm_dim_w = compute_conv_output_dim(ifm_dim_w, k_w, stride_w, 0, dilation_w) - ofm_dim = [ofm_dim_h, ofm_dim_w] - - x = gen_finn_dt_tensor(idt, (1, ifm_dim_h, ifm_dim_w, ifm_ch)) - model = make_single_slidingwindow_modelwrapper( - k=k, - ifm_ch=ifm_ch, - ifm_dim=ifm_dim, - ofm_dim=ofm_dim, - simd=simd, - m=m, - parallel_window=parallel_window, - stride=stride, - dilation=dilation, - idt=idt, - dw=dw, - ) - - model = 
model.transform(SetExecMode("rtlsim")) - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareIP("xc7z020clg400-1", 5)) - model = model.transform(PrepareRTLSim()) - - # prepare input data - input_dict = prepare_inputs(x) - # execute model - y_produced = oxe.execute_onnx(model, input_dict)["outp"] - golden = make_single_im2col_modelwrapper( - k=k, - ifm_ch=ifm_ch, - ifm_dim=ifm_dim, - ofm_dim=ofm_dim, - stride=stride, - dilation=dilation, - idt=idt, - ) - y_expected = oxe.execute_onnx(golden, input_dict)["outp"] - - if dw == 0: - assert (y_produced == y_expected).all() - else: - y_expected = y_expected.reshape( - 1, ofm_dim_h, ofm_dim_w, k_h * k_w, ifm_ch // simd, simd - ) - y_expected = y_expected.transpose(0, 1, 2, 4, 3, 5) - y_expected = y_expected.reshape(1, ofm_dim_h, ofm_dim_w, ifm_ch * k_h * k_w) - assert (y_produced == y_expected).all() diff --git a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py index 7f7bf649a9..6c0712b7b0 100644 --- a/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py +++ b/tests/fpgadataflow/test_fpgadataflow_convinputgenerator_rtl_dynamic.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022, Advanced Micro Devices, Inc. +# Copyright (c) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -48,7 +48,7 @@ from qonnx.util.basic import gen_finn_dt_tensor, get_by_name, qonnx_make_model import finn.core.onnx_exec as oxe -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw import finn.transformation.streamline.absorb as absorb from finn.core.onnx_exec import execute_onnx from finn.core.rtlsim_exec import rtlsim_exec @@ -60,35 +60,22 @@ from finn.transformation.fpgadataflow.insert_dwc import InsertDWC from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.util.basic import pyverilate_get_liveness_threshold_cycles -def create_conv_model( - idim_h, idim_w, ifm, k, stride, ofm, idt, wdt, pad_mode, depthwise -): +def create_conv_model(idim_h, idim_w, ifm, k, stride, ofm, idt, wdt, pad_mode, depthwise): np.random.seed(0) group = ifm if depthwise else 1 group_str = str(group) ishp = (1, ifm, idim_h, idim_w) - pad_0 = _auto_pad_to_explicit_padding( - pad_mode, idim_h, idim_w, k, k, stride, stride, 2 - ) - int_dim_h = compute_conv_output_dim( - idim_h, k, stride, total_pad=pad_0[0] + pad_0[2] - ) - int_dim_w = compute_conv_output_dim( - idim_w, k, stride, total_pad=pad_0[1] + pad_0[3] - ) + pad_0 = _auto_pad_to_explicit_padding(pad_mode, idim_h, idim_w, k, k, stride, stride, 2) + int_dim_h = compute_conv_output_dim(idim_h, k, stride, total_pad=pad_0[0] + pad_0[2]) + int_dim_w = compute_conv_output_dim(idim_w, k, stride, total_pad=pad_0[1] + pad_0[3]) - pad_1 = _auto_pad_to_explicit_padding( - pad_mode, int_dim_h, int_dim_w, k, k, stride, stride, 2 - ) - odim_h = compute_conv_output_dim( - int_dim_h, k, stride, total_pad=pad_1[0] + pad_1[2] - ) - odim_w = compute_conv_output_dim( - int_dim_w, k, stride, total_pad=pad_1[1] + pad_1[3] - ) 
+ pad_1 = _auto_pad_to_explicit_padding(pad_mode, int_dim_h, int_dim_w, k, k, stride, stride, 2) + odim_h = compute_conv_output_dim(int_dim_h, k, stride, total_pad=pad_1[0] + pad_1[2]) + odim_w = compute_conv_output_dim(int_dim_w, k, stride, total_pad=pad_1[1] + pad_1[3]) oshp = (1, ifm, odim_h, odim_w) if depthwise else (1, ofm, odim_h, odim_w) wshp = (ifm, 1, k, k) if depthwise else (ofm, ifm, k, k) wshp_1 = (ifm, 1, k, k) if depthwise else (ofm, ofm, k, k) @@ -189,6 +176,10 @@ def write_swg_config(sim): "ofm": 64, "depthwise": True, "pad_mode": "SAME_UPPER", + # run synthesis for one configuration + # this helped expose a bug in enum decls previously + # (which config the synth runs on does not matter) + "do_synth": True, } cfg1 = { "idims": [(32, 16), (16, 8)], @@ -198,6 +189,7 @@ def write_swg_config(sim): "ofm": 8, "depthwise": False, "pad_mode": "SAME_UPPER", + "do_synth": False, } cfg2 = { "idims": [(64, 128), (2, 4)], @@ -207,6 +199,7 @@ def write_swg_config(sim): "ofm": 64, "depthwise": True, "pad_mode": "SAME_UPPER", + "do_synth": False, } @@ -215,6 +208,7 @@ def write_swg_config(sim): @pytest.mark.vivado @pytest.mark.fpgadataflow def test_fpgadataflow_conv_dynamic(cfg): + do_synth = cfg["do_synth"] pad_mode = cfg["pad_mode"] depthwise = cfg["depthwise"] idims = cfg["idims"] @@ -255,16 +249,13 @@ def test_fpgadataflow_conv_dynamic(cfg): # convert to hardware and prepare simulation model = largest_model.transform(LowerConvsToMatMul()) - model = model.transform(to_hls.InferConvInpGen(use_rtl_variant=True)) - model = model.transform( - to_hls.InferQuantizedMatrixVectorActivation(mem_mode="decoupled") - ) - model = model.transform(to_hls.InferVectorVectorActivation()) + model = model.transform(to_hw.InferConvInpGen()) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + model = model.transform(to_hw.InferVectorVectorActivation()) model = model.transform(absorb.AbsorbConsecutiveTransposes()) + model = model.transform(SpecializeLayers()) 
parent_model = model.transform(CreateDataflowPartition()) - sdp_inst = getCustomOp( - parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0] - ) + sdp_inst = getCustomOp(parent_model.get_nodes_by_op_type("StreamingDataflowPartition")[0]) model = ModelWrapper(sdp_inst.get_nodeattr("model")) assert len(model.get_nodes_by_op_type("ConvolutionInputGenerator_rtl")) == 2 if pad_mode == "VALID": @@ -278,8 +269,10 @@ def test_fpgadataflow_conv_dynamic(cfg): getCustomOp(swg_node).set_nodeattr("dynamic_mode", 1) getCustomOp(swg_node).set_nodeattr("inFIFODepths", [16]) getCustomOp(swg_node).set_nodeattr("outFIFODepths", [16]) - comp_nodes = model.get_nodes_by_op_type("MatrixVectorActivation") - comp_nodes += model.get_nodes_by_op_type("VectorVectorActivation") + comp_nodes = model.get_nodes_by_op_type("MVAU_hls") + comp_nodes += model.get_nodes_by_op_type("MVAU_rtl") + comp_nodes += model.get_nodes_by_op_type("VVAU_hls") + comp_nodes += model.get_nodes_by_op_type("VVAU_rtl") for comp_node in comp_nodes: if depthwise: getCustomOp(comp_node).set_nodeattr("PE", 4) @@ -288,11 +281,12 @@ def test_fpgadataflow_conv_dynamic(cfg): getCustomOp(comp_node).set_nodeattr("PE", 4) model = model.transform(InsertDWC()) model = model.transform(InsertFIFO(create_shallow_fifos=True)) + model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) model = model.transform(PrepareIP("xc7z020clg400-1", 5)) model = model.transform(HLSSynthIP()) - model = model.transform(CreateStitchedIP("xc7z020clg400-1", 5)) + model = model.transform(CreateStitchedIP("xc7z020clg400-1", 5, vitis=do_synth)) model.set_metadata_prop("exec_mode", "rtlsim") # loop through experiment configurations @@ -324,15 +318,11 @@ def test_fpgadataflow_conv_dynamic(cfg): pad_nodes = model.get_nodes_by_op_type("FMPadding_rtl") padder0 = getCustomOp(pad_nodes[0]) update_tensor_dim(model, padder0.onnx_node.input[0], (idim_h, idim_w)) - 
update_tensor_dim( - model, padder0.onnx_node.output[0], (conv0_idim_h, conv0_idim_w) - ) + update_tensor_dim(model, padder0.onnx_node.output[0], (conv0_idim_h, conv0_idim_w)) pad_config0 = padder0.get_dynamic_config((idim_h, idim_w), pad0) padder1 = getCustomOp(pad_nodes[1]) update_tensor_dim(model, padder1.onnx_node.input[0], (int_dim_h, int_dim_w)) - update_tensor_dim( - model, padder1.onnx_node.output[0], (conv1_idim_h, conv1_idim_w) - ) + update_tensor_dim(model, padder1.onnx_node.output[0], (conv1_idim_h, conv1_idim_w)) pad_config1 = padder1.get_dynamic_config((int_dim_h, int_dim_w), pad1) configs = [ ("s_axilite_0_", pad_config0), @@ -373,9 +363,7 @@ def make_single_im2col_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, stride, dilatio ofm_dim_h, ofm_dim_w = ofm_dim odt = idt - inp = helper.make_tensor_value_info( - "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch] - ) + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]) outp = helper.make_tensor_value_info( "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch] ) @@ -415,15 +403,13 @@ def make_single_slidingwindow_modelwrapper( ofm_dim_h, ofm_dim_w = ofm_dim odt = idt - inp = helper.make_tensor_value_info( - "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch] - ) + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]) outp = helper.make_tensor_value_info( "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, k_h * k_w * ifm_ch] ) SlidingWindow_node = helper.make_node( - "ConvolutionInputGenerator_rtl", + "ConvolutionInputGenerator", ["inp"], ["outp"], domain="finn.custom_op.fpgadataflow", @@ -508,13 +494,9 @@ def test_fpgadataflow_slidingwindow_rtl_dynamic( if ifm_ch % simd != 0: pytest.skip("SIMD must divide number of input channels") if kernel_height > ifm_dim_h or stride_h > ifm_dim_h: - pytest.skip( - "Illegal convolution configuration: kernel or stride > FM dimension" - ) + 
pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") if kernel_width > ifm_dim_w or stride_w > ifm_dim_w: - pytest.skip( - "Illegal convolution configuration: kernel or stride > FM dimension" - ) + pytest.skip("Illegal convolution configuration: kernel or stride > FM dimension") if (k_h == 1 and (stride_h != 1 or dilation_h != 1)) or ( k_w == 1 and (stride_w != 1 or dilation_w != 1) ): @@ -541,9 +523,11 @@ def test_fpgadataflow_slidingwindow_rtl_dynamic( dw=dw, ) + model = model.transform(SpecializeLayers()) # Simulate using stitched-ip-rtlsim so we can use existing infrastructure # that supports hook functions to re-program configuration before rtlsim model = model.transform(InsertFIFO(True)) # required for proper simulation + model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP("xc7z020clg400-1", 5)) model = model.transform(HLSSynthIP()) @@ -570,7 +554,7 @@ def test_fpgadataflow_slidingwindow_rtl_dynamic( configs = [("s_axilite_0_", config)] # Also update FIFO nodes and corresponding tensors - fifo_node = model.get_nodes_by_op_type("StreamingFIFO")[0] + fifo_node = model.get_nodes_by_op_type("StreamingFIFO_rtl")[0] fifo_inst = getCustomOp(fifo_node) shape = fifo_inst.get_nodeattr("folded_shape") shape[1] = ifm_dim_h @@ -578,7 +562,7 @@ def test_fpgadataflow_slidingwindow_rtl_dynamic( fifo_inst.set_nodeattr("folded_shape", shape) update_tensor_dim(model, fifo_node.input[0], ifm_dim) - fifo_node = model.get_nodes_by_op_type("StreamingFIFO")[1] + fifo_node = model.get_nodes_by_op_type("StreamingFIFO_rtl")[1] fifo_inst = getCustomOp(fifo_node) shape = fifo_inst.get_nodeattr("folded_shape") shape[1] = ofm_dim_h diff --git a/tests/fpgadataflow/test_fpgadataflow_deconv.py b/tests/fpgadataflow/test_fpgadataflow_deconv.py new file mode 100644 index 0000000000..f1fc989066 --- /dev/null +++ b/tests/fpgadataflow/test_fpgadataflow_deconv.py @@ -0,0 +1,211 @@ +# Copyright (c) 
2024, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of Xilinx nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import pytest + +import numpy as np +import os +from onnx import TensorProto, helper +from qonnx.core.datatype import DataType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp +from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model + +import finn.core.onnx_exec as oxe +from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.convert_to_hw_layers import ( + InferConvInpGen, + InferQuantizedMatrixVectorActivation, +) +from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.infer_pixel_padding_deconv import ( + InferPixelPaddingDeconv, +) +from finn.transformation.fpgadataflow.minimize_accumulator_width import ( + MinimizeAccumulatorWidth, +) +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim +from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +from finn.util.basic import pynq_part_map + +test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1") +test_fpga_part = pynq_part_map[test_pynq_board] +target_clk_ns = 10 + + +def set_up_reference_model(idt, wdt, k, idim, ifm_ch, ofm_ch, stride, padding): + idim_h, idim_w = idim + stride_h, stride_w = stride + odim_h = (idim_h - 1) * stride_h - 2 * padding + (k - 1) + 1 + odim_w = (idim_w - 1) * stride_w - 2 * padding + (k - 1) + 1 + odt = DataType["INT32"] + + inp = helper.make_tensor_value_info( + "inp", + TensorProto.FLOAT, + [ + 1, + ifm_ch, + idim_h, + idim_w, + ], + ) + outp = 
helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, ofm_ch, odim_h, odim_w]) + + W = helper.make_tensor_value_info("W", TensorProto.FLOAT, [ifm_ch, ofm_ch, k, k]) + + ConvTranspose = helper.make_node( + "ConvTranspose", + ["inp", "W"], + ["outp"], + dilations=(1, 1), + group=1, + kernel_shape=(k, k), + pads=(padding, padding, padding, padding), + strides=(stride_h, stride_w), + ) + + node_list = [ConvTranspose] + value_info = [W] + + graph = helper.make_graph( + nodes=node_list, + name="convtranspose_graph", + inputs=[inp], + outputs=[outp], + value_info=value_info, + ) + + model = qonnx_make_model(graph, producer_name="convtranspose-model") + model = ModelWrapper(model) + + # initialize model + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype(model.graph.output[0].name, odt) + model.set_tensor_datatype("W", wdt) + + w_tensor = gen_finn_dt_tensor(wdt, [ifm_ch, ofm_ch, k, k]) + model.set_initializer("W", w_tensor) + + model = model.transform(InferShapes()) + + return model + + +# input image dimension +@pytest.mark.parametrize("idim", [[8, 8], [10, 8]]) +# number of rows and number of cols to add +@pytest.mark.parametrize("stride", [[2, 2], [2, 3]]) +# number of channels +@pytest.mark.parametrize("ifm_ch", [2]) +# number of channels +@pytest.mark.parametrize("ofm_ch", [4]) +# Input parallelism +@pytest.mark.parametrize("simd", [1, 2]) +# PE +@pytest.mark.parametrize("pe", [1, 2]) +# kernel size +@pytest.mark.parametrize("k", [2]) +# padding +@pytest.mark.parametrize("padding", [0, 1]) +# exec mode +@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +def test_fpgadataflow_deconv(idim, stride, ifm_ch, ofm_ch, simd, pe, k, padding, exec_mode): + idt = wdt = DataType["INT4"] + wdt = idt + idim_h, idim_w = idim + stride_h, stride_w = stride + + ref_model = set_up_reference_model(idt, wdt, k, idim, ifm_ch, ofm_ch, stride, padding) + + odim_h = (idim_h - 1) * stride_h - 2 
* padding + (k - 1) + 1 + odim_w = (idim_w - 1) * stride_w - 2 * padding + (k - 1) + 1 + + input_tensor = gen_finn_dt_tensor(idt, [1, ifm_ch, idim_h, idim_w]) + input_dict = {"inp": input_tensor} + + y_expected = oxe.execute_onnx(ref_model, input_dict)["outp"] + + model = ref_model.transform(InferPixelPaddingDeconv()) + model = model.transform(InferConvInpGen()) + model = model.transform(InferQuantizedMatrixVectorActivation()) + model = model.transform(InferShapes()) + model = model.transform(GiveUniqueNodeNames()) + + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + assert (y_produced == y_expected).all() + + model = model.transform(SpecializeLayers()) + model = model.transform(MinimizeAccumulatorWidth()) + + for n in model.graph.node: + if n.op_type.startswith("ConvolutionInputGenerator"): + convinputgen_node = getCustomOp(n) + convinputgen_node.set_nodeattr("SIMD", simd) + elif n.op_type.startswith("MVAU"): + mvau_node = getCustomOp(n) + mvau_node.set_nodeattr("PE", pe) + mvau_node.set_nodeattr("SIMD", simd) + + expected_oshape = (1, ofm_ch, odim_h, odim_w) + + # cppsim + if exec_mode == "cppsim": + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + model = model.transform(SetExecMode("cppsim")) + + # rtlsim + else: + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + model = model.transform(SetExecMode("rtlsim")) + + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + assert y_produced.shape == expected_oshape + assert (y_produced == y_expected).all() + + if exec_mode == "rtlsim": + node = model.get_nodes_by_op_type("FMPadding_Pixel_hls")[0] + inst = getCustomOp(node) + cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") + exp_cycles_dict = model.analysis(exp_cycles_per_layer) + exp_cycles = exp_cycles_dict[node.name] + assert np.isclose(exp_cycles, cycles_rtlsim, 
atol=10) + assert exp_cycles != 0 diff --git a/tests/fpgadataflow/test_fpgadataflow_downsampler.py b/tests/fpgadataflow/test_fpgadataflow_downsampler.py index 64da0a2368..25717a4152 100644 --- a/tests/fpgadataflow/test_fpgadataflow_downsampler.py +++ b/tests/fpgadataflow/test_fpgadataflow_downsampler.py @@ -39,7 +39,7 @@ from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul from qonnx.util.basic import gen_finn_dt_tensor -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.core.onnx_exec import execute_onnx from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim @@ -48,6 +48,7 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers def build_model(is_1d, in_dim, k, stride, dt_in, dt_w, pad_half=0, flip_1d=False): @@ -122,14 +123,15 @@ def test_fpgadataflow_downsampler(is_1d, flip_1d, exec_mode): stride = 2 dt_in = DataType["UINT8"] dt_w = DataType["INT2"] - model = build_model( - is_1d, in_dim, k, stride, dt_in, dt_w, pad_half=0, flip_1d=flip_1d - ) + model = build_model(is_1d, in_dim, k, stride, dt_in, dt_w, pad_half=0, flip_1d=flip_1d) inp = gen_finn_dt_tensor(dt_in, model.get_tensor_shape("in0")) idict = {"in0": inp} y_expected = execute_onnx(model, idict)["out0"] - model = model.transform(to_hls.InferConvInpGen()) + model = model.transform(to_hw.InferConvInpGen()) assert len(model.get_nodes_by_op_type("DownSampler")) == 1 + y_produced = execute_onnx(model, idict)["out0"] + assert (y_produced == y_expected).all() + model = model.transform(SpecializeLayers()) if exec_mode == "cppsim": model = 
model.transform(SetExecMode("cppsim")) model = model.transform(PrepareCppSim()) @@ -145,7 +147,7 @@ def test_fpgadataflow_downsampler(is_1d, flip_1d, exec_mode): y_produced = execute_onnx(model, idict)["out0"] assert (y_produced == y_expected).all() if exec_mode == "rtlsim": - node = model.get_nodes_by_op_type("DownSampler")[0] + node = model.get_nodes_by_op_type("DownSampler_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) diff --git a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py index 441bbce50a..62b9265466 100644 --- a/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py +++ b/tests/fpgadataflow/test_fpgadataflow_duplicatestreams.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020-2022, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -46,9 +47,10 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers -def make_dupstreams_modelwrapper(ch, pe, idim, idt, n_dupl): +def make_dupstreams_modelwrapper(ch, pe, idim, idt, n_dupl, impl_style): shape = [1, idim, idim, ch] inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, shape) out_names = [] @@ -56,12 +58,10 @@ def make_dupstreams_modelwrapper(ch, pe, idim, idt, n_dupl): for i in range(n_dupl): outp_name = "outp%d" % i out_names.append(outp_name) - out_vi.append( - helper.make_tensor_value_info(outp_name, TensorProto.FLOAT, shape) - ) + out_vi.append(helper.make_tensor_value_info(outp_name, TensorProto.FLOAT, shape)) dupstrm_node = helper.make_node( - "DuplicateStreams_Batch", + 
"DuplicateStreams", ["inp"], out_names, domain="finn.custom_op.fpgadataflow", @@ -71,10 +71,9 @@ def make_dupstreams_modelwrapper(ch, pe, idim, idt, n_dupl): PE=pe, inputDataType=idt.name, numInputVectors=[1, idim, idim], + preferred_impl_style=impl_style, ) - graph = helper.make_graph( - nodes=[dupstrm_node], name="graph", inputs=[inp], outputs=out_vi - ) + graph = helper.make_graph(nodes=[dupstrm_node], name="graph", inputs=[inp], outputs=out_vi) model = qonnx_make_model(graph, producer_name="addstreams-model") model = ModelWrapper(model) @@ -103,9 +102,11 @@ def prepare_inputs(input_tensor, idt): @pytest.mark.parametrize("n_dupl", [2, 3]) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +# impl_style +@pytest.mark.parametrize("impl_style", ["hls"]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_fpgadataflow_duplicatestreams(idt, ch, fold, imdim, n_dupl, exec_mode): +def test_fpgadataflow_duplicatestreams(idt, ch, fold, imdim, n_dupl, exec_mode, impl_style): if fold == -1: pe = 1 else: @@ -115,7 +116,19 @@ def test_fpgadataflow_duplicatestreams(idt, ch, fold, imdim, n_dupl, exec_mode): # generate input data x = gen_finn_dt_tensor(idt, (1, imdim, imdim, ch)) - model = make_dupstreams_modelwrapper(ch, pe, imdim, idt, n_dupl) + model = make_dupstreams_modelwrapper(ch, pe, imdim, idt, n_dupl, impl_style) + + # prepare input data and execute + input_dict = prepare_inputs(x, idt) + + # check behavior of hw abstraction layer + output_dict = oxe.execute_onnx(model, input_dict) + expected_y = x + for i in range(n_dupl): + y = output_dict["outp%d" % i] + assert (y == expected_y).all(), "HW layer execution failed" + + model = model.transform(SpecializeLayers()) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) @@ -130,17 +143,14 @@ def test_fpgadataflow_duplicatestreams(idt, ch, fold, imdim, n_dupl, exec_mode): else: raise Exception("Unknown exec_mode") - # prepare input data and execute - input_dict = prepare_inputs(x, 
idt) output_dict = oxe.execute_onnx(model, input_dict) - expected_y = x for i in range(n_dupl): y = output_dict["outp%d" % i] assert (y == expected_y).all(), exec_mode + " failed" if exec_mode == "rtlsim": - node = model.get_nodes_by_op_type("DuplicateStreams_Batch")[0] + node = model.get_nodes_by_op_type("DuplicateStreams_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) diff --git a/tests/fpgadataflow/test_fpgadataflow_dwc.py b/tests/fpgadataflow/test_fpgadataflow_dwc.py index 2bde148a14..7152d32a7b 100644 --- a/tests/fpgadataflow/test_fpgadataflow_dwc.py +++ b/tests/fpgadataflow/test_fpgadataflow_dwc.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020-2022, Xilinx, Inc. +# Copyright (C) 2023-2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -35,19 +36,25 @@ from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe +from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO +from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim +from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers -def make_single_dwc_modelwrapper(shape, inWidth, outWidth, finn_dtype, impl_style): - +def make_single_dwc_modelwrapper(shape, inWidth, outWidth, finn_dtype): inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, shape) outp = helper.make_tensor_value_info("outp", 
TensorProto.FLOAT, shape) + optype = "StreamingDataWidthConverter" + DWC_node = helper.make_node( - "StreamingDataWidthConverter_Batch", + optype, ["inp"], ["outp"], domain="finn.custom_op.fpgadataflow", @@ -56,12 +63,9 @@ def make_single_dwc_modelwrapper(shape, inWidth, outWidth, finn_dtype, impl_styl inWidth=inWidth, outWidth=outWidth, dataType=str(finn_dtype.name), - impl_style=impl_style, ) - graph = helper.make_graph( - nodes=[DWC_node], name="dwc_graph", inputs=[inp], outputs=[outp] - ) + graph = helper.make_graph(nodes=[DWC_node], name="dwc_graph", inputs=[inp], outputs=[outp]) model = qonnx_make_model(graph, producer_name="dwc-model") model = ModelWrapper(model) @@ -79,32 +83,86 @@ def prepare_inputs(input_tensor, dt): @pytest.mark.parametrize( "config", [ - ([1, 24], 6, 4, DataType["INT2"], "hls"), - ([1, 24], 4, 6, DataType["INT2"], "hls"), - ([1, 4], 2, 4, DataType["BIPOLAR"], "hls"), - ([1, 2, 8], 2, 4, DataType["BIPOLAR"], "hls"), - ([1, 4], 4, 2, DataType["INT2"], "hls"), - ([1, 2, 8], 4, 4, DataType["INT2"], "hls"), - ([1, 2, 8], 8, 16, DataType["INT2"], "vivado"), + ([1, 24], 6, 4, DataType["INT2"]), + ([1, 24], 4, 6, DataType["INT2"]), + ([1, 4], 2, 4, DataType["BIPOLAR"]), + ([1, 2, 8], 2, 4, DataType["BIPOLAR"]), + ([1, 4], 4, 2, DataType["INT2"]), + ([1, 2, 8], 4, 4, DataType["INT2"]), + ([1, 2, 8], 8, 16, DataType["INT2"]), + ], +) +@pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +def test_fpgadataflow_dwc(config, exec_mode): + shape, inWidth, outWidth, finn_dtype = config + + test_fpga_part = "xc7z020clg400-1" + # generate input data + x = gen_finn_dt_tensor(finn_dtype, shape) + input_dict = prepare_inputs(x, finn_dtype) + + model = make_single_dwc_modelwrapper(shape, inWidth, outWidth, finn_dtype) + # verify abstraction level execution + y = oxe.execute_onnx(model, input_dict)["outp"] + assert ( + y == x + ).all(), """The output values are not the same as the + 
input values anymore.""" + assert y.shape == tuple(shape), """The output shape is incorrect.""" + + model = model.transform(SpecializeLayers()) + model = model.transform(GiveUniqueNodeNames()) + if exec_mode == "cppsim": + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + model = model.transform(SetExecMode("cppsim")) + elif exec_mode == "rtlsim": + model = model.transform(PrepareIP(test_fpga_part, 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(PrepareRTLSim()) + y = oxe.execute_onnx(model, input_dict)["outp"] + + assert ( + y == x + ).all(), """The output values are not the same as the + input values anymore.""" + assert y.shape == tuple(shape), """The output shape is incorrect.""" + + +@pytest.mark.parametrize( + "config", + [ + ([1, 24], 6, 4, DataType["INT2"]), + ([1, 24], 4, 6, DataType["INT2"]), + ([1, 4], 2, 4, DataType["BIPOLAR"]), + ([1, 2, 8], 2, 4, DataType["BIPOLAR"]), + ([1, 4], 4, 2, DataType["INT2"]), + ([1, 2, 8], 4, 4, DataType["INT2"]), + ([1, 2, 8], 8, 16, DataType["INT2"]), ], ) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_dwc_rtlsim(config): - shape, inWidth, outWidth, finn_dtype, impl_style = config +def test_fpgadataflow_dwc_stitched_rtlsim(config): + shape, inWidth, outWidth, finn_dtype = config + test_fpga_part = "xc7z020clg400-1" target_clk_ns = 10.0 # generate input data x = gen_finn_dt_tensor(finn_dtype, shape) input_dict = prepare_inputs(x, finn_dtype) - model = make_single_dwc_modelwrapper( - shape, inWidth, outWidth, finn_dtype, impl_style - ) + model = make_single_dwc_modelwrapper(shape, inWidth, outWidth, finn_dtype) + model = model.transform(SpecializeLayers()) model = model.transform(InsertFIFO(create_shallow_fifos=True)) + model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) - model = model.transform(PrepareIP(test_fpga_part, 5)) + model = 
model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns)) model.set_metadata_prop("exec_mode", "rtlsim") diff --git a/tests/fpgadataflow/test_fpgadataflow_eltwise.py b/tests/fpgadataflow/test_fpgadataflow_eltwise.py index 6028a9b9f0..fbfcc8e28b 100644 --- a/tests/fpgadataflow/test_fpgadataflow_eltwise.py +++ b/tests/fpgadataflow/test_fpgadataflow_eltwise.py @@ -1,4 +1,5 @@ # Copyright (c) 2022, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -38,7 +39,7 @@ from qonnx.transformation.infer_shapes import InferShapes from qonnx.util.basic import gen_finn_dt_tensor -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.core.onnx_exec import execute_onnx from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim @@ -47,6 +48,7 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers def build_model(shp, dt0, dt1, do_abs): @@ -105,9 +107,17 @@ def test_fpgadataflow_eltwise(dt0, ch, fold, do_abs, exec_mode): in1 = gen_finn_dt_tensor(dt1, shp) idict = {"in0": in0, "in1": in1} y_expected = execute_onnx(model, idict)["out0"] - model = model.transform(to_hls.InferStreamingEltwise()) + model = model.transform(to_hw.InferStreamingEltwise()) assert len(model.graph.node) == 1 assert model.graph.node[0].op_type == "StreamingEltwise" + + y_produced = execute_onnx(model, idict)["out0"] + assert (y_produced == y_expected).all(), exec_mode + " 
failed" + + model = model.transform(SpecializeLayers()) + + assert len(model.graph.node) == 1 + assert model.graph.node[0].op_type == "StreamingEltwise_hls" getCustomOp(model.graph.node[0]).set_nodeattr("PE", pe) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) @@ -124,7 +134,7 @@ def test_fpgadataflow_eltwise(dt0, ch, fold, do_abs, exec_mode): y_produced = execute_onnx(model, idict)["out0"] assert (y_produced == y_expected).all(), exec_mode + " failed" if exec_mode == "rtlsim": - node = model.get_nodes_by_op_type("StreamingEltwise")[0] + node = model.get_nodes_by_op_type("StreamingEltwise_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) diff --git a/tests/fpgadataflow/test_fpgadataflow_fifo.py b/tests/fpgadataflow/test_fpgadataflow_fifo.py index efdb3bf6aa..1719da1454 100644 --- a/tests/fpgadataflow/test_fpgadataflow_fifo.py +++ b/tests/fpgadataflow/test_fpgadataflow_fifo.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -40,6 +41,7 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers build_dir = os.environ["FINN_BUILD_DIR"] test_fpga_part = "xc7z020clg400-1" @@ -47,7 +49,6 @@ def make_single_fifo_modelwrapper(Shape, Depth, fld_shape, finn_dtype): - inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, Shape) outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, Shape) @@ -59,12 +60,11 @@ def make_single_fifo_modelwrapper(Shape, Depth, fld_shape, finn_dtype): backend="fpgadataflow", depth=Depth, folded_shape=fld_shape, + normal_shape=Shape, dataType=str(finn_dtype.name), ) - graph = helper.make_graph( - nodes=[FIFO_node], name="fifo_graph", inputs=[inp], outputs=[outp] - ) + graph = helper.make_graph(nodes=[FIFO_node], name="fifo_graph", inputs=[inp], outputs=[outp]) model = qonnx_make_model(graph, producer_name="fifo-model") model = ModelWrapper(model) @@ -86,17 +86,17 @@ def prepare_inputs(input_tensor, dt): # outWidth @pytest.mark.parametrize("depth", [16]) # finn_dtype -@pytest.mark.parametrize("finn_dtype", [DataType["BIPOLAR"]]) # , DataType["INT2"]]) +@pytest.mark.parametrize("finn_dtype", [DataType["BIPOLAR"], DataType["INT2"]]) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado def test_fpgadataflow_fifo_rtlsim(Shape, folded_shape, depth, finn_dtype): - # generate input data x = gen_finn_dt_tensor(finn_dtype, Shape) input_dict = prepare_inputs(x, finn_dtype) model = make_single_fifo_modelwrapper(Shape, depth, folded_shape, finn_dtype) + model = model.transform(SpecializeLayers()) model = model.transform(SetExecMode("rtlsim")) model = model.transform(GiveUniqueNodeNames()) diff --git 
a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py index b95409fda8..45cc265ac7 100644 --- a/tests/fpgadataflow/test_fpgadataflow_fmpadding.py +++ b/tests/fpgadataflow/test_fpgadataflow_fmpadding.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020-2022, Xilinx, Inc. +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -46,6 +47,7 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.util.basic import pynq_part_map test_pynq_board = os.getenv("PYNQ_BOARD", default="Pynq-Z1") @@ -53,7 +55,7 @@ target_clk_ns = 10 -def make_single_fmpadding_modelwrapper(optype, idim, padding, num_ch, simd, idt): +def make_single_fmpadding_modelwrapper(impl_style, idim, padding, num_ch, simd, idt): pad_h = padding[0] + padding[2] pad_w = padding[1] + padding[3] idim_h, idim_w = idim @@ -62,15 +64,11 @@ def make_single_fmpadding_modelwrapper(optype, idim, padding, num_ch, simd, idt) odim_h = idim_h + pad_h odim_w = idim_w + pad_w - inp = helper.make_tensor_value_info( - "inp", TensorProto.FLOAT, [1, idim_h, idim_w, num_ch] - ) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, odim_h, odim_w, num_ch] - ) + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, idim_h, idim_w, num_ch]) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, odim_h, odim_w, num_ch]) FMPadding = helper.make_node( - optype, + "FMPadding", ["inp"], ["outp"], domain="finn.custom_op.fpgadataflow", @@ -81,6 +79,7 @@ def make_single_fmpadding_modelwrapper(optype, idim, padding, num_ch, simd, idt) inputDataType=str(idt.name), numInputVectors=1, SIMD=simd, + 
preferred_impl_style=impl_style, ) graph = helper.make_graph( @@ -99,9 +98,7 @@ def make_single_fmpadding_modelwrapper(optype, idim, padding, num_ch, simd, idt) # input image dimension @pytest.mark.parametrize("idim", [[8, 8], [10, 8]]) # number of rows and number of cols to add -@pytest.mark.parametrize( - "pad", [[1, 1, 1, 1], [1, 1, 2, 2], [1, 3, 2, 3], [7, 0, 8, 0]] -) +@pytest.mark.parametrize("pad", [[1, 1, 1, 1], [1, 1, 2, 2], [1, 3, 2, 3], [7, 0, 8, 0]]) # number of channels @pytest.mark.parametrize("num_ch", [2, 4]) # Input parallelism @@ -116,8 +113,6 @@ def make_single_fmpadding_modelwrapper(optype, idim, padding, num_ch, simd, idt) @pytest.mark.slow @pytest.mark.vivado def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode, impl_style): - if impl_style == "rtl" and mode == "cppsim": - pytest.skip("rtl implstyle has no cppsim, skipping") if num_ch % simd != 0: pytest.skip(" num_ch % simd != 0, skipping") @@ -131,9 +126,17 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode, impl_style): odim_h = idim_h + pad_h odim_w = idim_w + pad_w - optype = {"hls": "FMPadding_Batch", "rtl": "FMPadding_rtl"}[impl_style] + y_expected = np.pad(x, ((0, 0), (pad[0], pad[2]), (pad[1], pad[3]), (0, 0)), "constant") + expected_oshape = (1, odim_h, odim_w, num_ch) + + model = make_single_fmpadding_modelwrapper(impl_style, idim, pad, num_ch, simd, idt) + + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + assert y_produced.shape == expected_oshape + assert (y_produced == y_expected).all(), "HW layer execution failed" + + model = model.transform(SpecializeLayers()) - model = make_single_fmpadding_modelwrapper(optype, idim, pad, num_ch, simd, idt) model = model.transform(InferShapes()) model = model.transform(SetExecMode(mode)) model = model.transform(GiveUniqueNodeNames()) @@ -146,17 +149,13 @@ def test_fpgadataflow_fmpadding(idim, pad, num_ch, simd, idt, mode, impl_style): model = model.transform(PrepareRTLSim()) y_produced = 
oxe.execute_onnx(model, input_dict)["outp"] - expected_oshape = (1, odim_h, odim_w, num_ch) - assert y_produced.shape == expected_oshape - - y_expected = np.pad( - x, ((0, 0), (pad[0], pad[2]), (pad[1], pad[3]), (0, 0)), "constant" - ) + assert y_produced.shape == expected_oshape assert (y_produced == y_expected).all() if mode == "rtlsim": - node = model.get_nodes_by_op_type(optype)[0] + op_type = "FMPadding_" + impl_style + node = model.get_nodes_by_op_type(op_type)[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) diff --git a/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py b/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py index a2c3d09a55..9c2802aade 100644 --- a/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py +++ b/tests/fpgadataflow/test_fpgadataflow_globalaccpool.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020-2022, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -44,14 +45,15 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers -def make_accpool_modelwrapper(ch, pe, idim, idt): +def make_accpool_modelwrapper(ch, pe, idim, idt, impl_style): inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, idim, idim, ch]) outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, 1, 1, ch]) accpool_node = helper.make_node( - "GlobalAccPool_Batch", + "GlobalAccPool", ["inp"], ["outp"], domain="finn.custom_op.fpgadataflow", @@ -60,10 +62,9 @@ def make_accpool_modelwrapper(ch, pe, idim, idt): PE=pe, inputDataType=idt.name, numInputVectors=[1, idim, idim], + preferred_impl_style=impl_style, ) - graph = helper.make_graph( - nodes=[accpool_node], name="graph", inputs=[inp], outputs=[outp] - ) + graph = helper.make_graph(nodes=[accpool_node], name="graph", inputs=[inp], outputs=[outp]) model = qonnx_make_model(graph, producer_name="thresholding-model") model = ModelWrapper(model) @@ -87,9 +88,11 @@ def prepare_inputs(input_tensor, idt): @pytest.mark.parametrize("imdim", [7]) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +# impl_style +@pytest.mark.parametrize("impl_style", ["hls"]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_fpgadataflow_globalaccpool(idt, ch, fold, imdim, exec_mode): +def test_fpgadataflow_globalaccpool(idt, ch, fold, imdim, exec_mode, impl_style): if fold == -1: pe = 1 else: @@ -99,7 +102,17 @@ def test_fpgadataflow_globalaccpool(idt, ch, fold, imdim, exec_mode): # generate input data x = gen_finn_dt_tensor(idt, (1, imdim, imdim, ch)) - model = make_accpool_modelwrapper(ch, pe, imdim, idt) + # prepare input data and execute + input_dict = 
prepare_inputs(x, idt) + expected_y = np.sum(x, axis=(1, 2)).flatten() + + model = make_accpool_modelwrapper(ch, pe, imdim, idt, impl_style) + + y = oxe.execute_onnx(model, input_dict)["outp"] + + assert (y == expected_y).all(), "HW layer verification failed" + + model = model.transform(SpecializeLayers()) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) @@ -114,15 +127,12 @@ def test_fpgadataflow_globalaccpool(idt, ch, fold, imdim, exec_mode): else: raise Exception("Unknown exec_mode") - # prepare input data and execute - input_dict = prepare_inputs(x, idt) y = oxe.execute_onnx(model, input_dict)["outp"] - expected_y = np.sum(x, axis=(1, 2)).flatten() assert (y == expected_y).all(), exec_mode + " failed" if exec_mode == "rtlsim": - node = model.get_nodes_by_op_type("GlobalAccPool_Batch")[0] + node = model.get_nodes_by_op_type("GlobalAccPool_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) diff --git a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py index b220338e69..2061601b4a 100644 --- a/tests/fpgadataflow/test_fpgadataflow_ipstitch.py +++ b/tests/fpgadataflow/test_fpgadataflow_ipstitch.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -61,7 +62,7 @@ ip_stitch_model_dir = os.environ["FINN_BUILD_DIR"] -def create_one_fc_model(mem_mode="const"): +def create_one_fc_model(mem_mode="internal_embedded"): # create a model with a MatrixVectorActivation instance with no activation # the wider range of the full accumulator makes debugging a bit easier wdt = DataType["INT2"] @@ -78,10 +79,10 @@ def create_one_fc_model(mem_mode="const"): outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, m]) fc0 = helper.make_node( - "MatrixVectorActivation", + "MVAU_hls", ["inp", "w0"], ["outp"], - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", MW=m, MH=m, @@ -96,9 +97,7 @@ def create_one_fc_model(mem_mode="const"): mem_mode=mem_mode, ) - graph = helper.make_graph( - nodes=[fc0], name="fclayer_graph", inputs=[inp], outputs=[outp] - ) + graph = helper.make_graph(nodes=[fc0], name="fclayer_graph", inputs=[inp], outputs=[outp]) model = qonnx_make_model(graph, producer_name="fclayer-model") model = ModelWrapper(model) @@ -115,7 +114,7 @@ def create_one_fc_model(mem_mode="const"): return model -def create_two_fc_model(mem_mode="decoupled"): +def create_two_fc_model(mem_mode="internal_decoupled"): # create a model with two MatrixVectorActivation instances wdt = DataType["INT2"] idt = DataType["INT32"] @@ -132,10 +131,10 @@ def create_two_fc_model(mem_mode="decoupled"): outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, m]) fc0 = helper.make_node( - "MatrixVectorActivation", + "MVAU_hls", ["inp", "w0"], ["mid"], - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", MW=m, MH=m, @@ -151,10 +150,10 @@ def create_two_fc_model(mem_mode="decoupled"): ) fc1 = helper.make_node( - "MatrixVectorActivation", + "MVAU_hls", ["mid", "w1"], ["outp"], - domain="finn.custom_op.fpgadataflow", + 
domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", MW=m, MH=m, @@ -196,7 +195,7 @@ def create_two_fc_model(mem_mode="decoupled"): return model -@pytest.mark.parametrize("mem_mode", ["const", "decoupled"]) +@pytest.mark.parametrize("mem_mode", ["internal_embedded", "internal_decoupled"]) @pytest.mark.fpgadataflow @pytest.mark.vivado def test_fpgadataflow_ipstitch_gen_model(mem_mode): @@ -206,19 +205,16 @@ def test_fpgadataflow_ipstitch_gen_model(mem_mode): assert sdp_node.__class__.__name__ == "StreamingDataflowPartition" assert os.path.isfile(sdp_node.get_nodeattr("model")) model = load_test_checkpoint_or_skip(sdp_node.get_nodeattr("model")) - model.set_metadata_prop("exec_mode", "remote_pynq") model = model.transform(InsertTLastMarker()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(test_fpga_part, 5)) model = model.transform(HLSSynthIP()) - assert model.graph.node[0].op_type == "MatrixVectorActivation" - assert model.graph.node[-1].op_type == "TLastMarker" - model.save( - ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_gen_model_%s.onnx" % mem_mode - ) + assert model.graph.node[0].op_type == "MVAU_hls" + assert model.graph.node[-1].op_type == "TLastMarker_hls" + model.save(ip_stitch_model_dir + "/test_fpgadataflow_ipstitch_gen_model_%s.onnx" % mem_mode) -@pytest.mark.parametrize("mem_mode", ["const", "decoupled"]) +@pytest.mark.parametrize("mem_mode", ["internal_embedded", "internal_decoupled"]) @pytest.mark.fpgadataflow @pytest.mark.vivado def test_fpgadataflow_ipstitch_do_stitch(mem_mode): @@ -236,7 +232,7 @@ def test_fpgadataflow_ipstitch_do_stitch(mem_mode): model.save(ip_stitch_model_dir + "/test_fpgadataflow_ip_stitch_%s.onnx" % mem_mode) -@pytest.mark.parametrize("mem_mode", ["const", "decoupled"]) +@pytest.mark.parametrize("mem_mode", ["internal_embedded", "internal_decoupled"]) @pytest.mark.fpgadataflow @pytest.mark.vivado def test_fpgadataflow_ipstitch_rtlsim(mem_mode): @@ -285,7 +281,7 @@ def 
test_fpgadataflow_ipstitch_rtlsim(mem_mode): assert (rtlsim_res == x).all() -@pytest.mark.parametrize("mem_mode", ["const", "decoupled"]) +@pytest.mark.parametrize("mem_mode", ["internal_embedded", "internal_decoupled"]) @pytest.mark.fpgadataflow @pytest.mark.vivado @pytest.mark.slow @@ -340,7 +336,7 @@ def test_fpgadataflow_ipstitch_vitis_end2end(board, period_ns, extw): pytest.skip("VITIS_PATH not set") platform = alveo_default_platform[board] fpga_part = alveo_part_map[board] - model = create_two_fc_model("external" if extw else "decoupled") + model = create_two_fc_model("external" if extw else "internal_decoupled") if model.graph.node[0].op_type == "StreamingDataflowPartition": sdp_node = getCustomOp(model.graph.node[0]) assert sdp_node.__class__.__name__ == "StreamingDataflowPartition" diff --git a/tests/fpgadataflow/test_fpgadataflow_labelselect.py b/tests/fpgadataflow/test_fpgadataflow_labelselect.py index 553f263ba2..98ded66ca7 100644 --- a/tests/fpgadataflow/test_fpgadataflow_labelselect.py +++ b/tests/fpgadataflow/test_fpgadataflow_labelselect.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020-2022, Xilinx +# Copyright (C) 2023, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -42,15 +43,16 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.util.test import soft_verify_topk -def make_labelselect_modelwrapper(labels, pe, k, idt): +def make_labelselect_modelwrapper(labels, pe, k, idt, impl_style): inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, labels]) - outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, k]) + outp = helper.make_tensor_value_info("outp", TensorProto.INT64, [1, k]) labelselect_node = helper.make_node( - "LabelSelect_Batch", + "LabelSelect", ["inp"], ["outp"], domain="finn.custom_op.fpgadataflow", @@ -59,6 +61,7 @@ def make_labelselect_modelwrapper(labels, pe, k, idt): PE=pe, K=k, inputDataType=idt.name, + preferred_impl_style=impl_style, ) graph = helper.make_graph( nodes=[labelselect_node], @@ -81,9 +84,7 @@ def prepare_inputs(input_tensor, idt): return {"inp": input_tensor} -@pytest.mark.parametrize( - "idt", [DataType["UINT8"], DataType["UINT16"], DataType["INT16"]] -) +@pytest.mark.parametrize("idt", [DataType["UINT8"], DataType["UINT16"], DataType["INT16"]]) # labels @pytest.mark.parametrize("labels", [10, 100]) # folding @@ -92,9 +93,11 @@ def prepare_inputs(input_tensor, idt): @pytest.mark.parametrize("k", [1, 5]) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) +# impl style +@pytest.mark.parametrize("impl_style", ["hls"]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_fpgadataflow_labelselect(idt, labels, fold, k, exec_mode): +def test_fpgadataflow_labelselect(idt, labels, fold, k, exec_mode, impl_style): np.random.seed(0) if fold == -1: pe = 1 @@ -107,8 +110,15 @@ def test_fpgadataflow_labelselect(idt, labels, fold, k, 
exec_mode): # generate input data x = gen_finn_dt_tensor(idt, (1, labels)) + input_dict = prepare_inputs(x, idt) + + model = make_labelselect_modelwrapper(labels, pe, k, idt, impl_style) + + y = oxe.execute_onnx(model, input_dict)["outp"] + + assert soft_verify_topk(x, y, k), "HW layer execution failed" - model = make_labelselect_modelwrapper(labels, pe, k, idt) + model = model.transform(SpecializeLayers()) if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) @@ -123,8 +133,6 @@ def test_fpgadataflow_labelselect(idt, labels, fold, k, exec_mode): else: raise Exception("Unknown exec_mode") - # prepare input data and execute - input_dict = prepare_inputs(x, idt) y = oxe.execute_onnx(model, input_dict)["outp"] assert soft_verify_topk(x, y, k), exec_mode + " failed" diff --git a/tests/fpgadataflow/test_fpgadataflow_lookup.py b/tests/fpgadataflow/test_fpgadataflow_lookup.py index da4204c81a..cb15fa3ae5 100644 --- a/tests/fpgadataflow/test_fpgadataflow_lookup.py +++ b/tests/fpgadataflow/test_fpgadataflow_lookup.py @@ -1,4 +1,5 @@ -# Copyright (c) 2021, Xilinx +# Copyright (C) 2021-2022, Xilinx, Inc. +# Copyright (C) 2023-2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -30,7 +31,7 @@ import numpy as np import torch -from brevitas.export import FINNManager +from brevitas.export import export_qonnx from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp @@ -38,17 +39,22 @@ from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes from qonnx.util.basic import gen_finn_dt_tensor +from qonnx.util.cleanup import cleanup as qonnx_cleanup from torch import nn from finn.core.onnx_exec import execute_onnx from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim -from finn.transformation.fpgadataflow.convert_to_hls_layers import InferLookupLayer +from finn.transformation.fpgadataflow.convert_to_hw_layers import InferLookupLayer from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN + +export_onnx_path = "test_lookup.onnx" def make_lookup_model(embeddings, ishape, idt, edt): @@ -57,9 +63,7 @@ def make_lookup_model(embeddings, ishape, idt, edt): class LookupModel(nn.Module): def __init__(self, num_embeddings, embedding_dim): super().__init__() - self.lookup = nn.Embedding( - num_embeddings=num_embeddings, embedding_dim=embedding_dim - ) + self.lookup = nn.Embedding(num_embeddings=num_embeddings, embedding_dim=embedding_dim) def forward(self, x): x = self.lookup(x) @@ -67,8 +71,11 
@@ def forward(self, x): torch_model = LookupModel(num_embeddings, embedding_dim) input_t = torch.zeros(ishape, dtype=torch.int64) - ret = FINNManager.export(torch_model, input_t=input_t, opset_version=11) - model = ModelWrapper(ret) + export_qonnx(torch_model, input_t, export_onnx_path, opset_version=11) + qonnx_cleanup(export_onnx_path, out_file=export_onnx_path) + model = ModelWrapper(export_onnx_path) + model = model.transform(ConvertQONNXtoFINN()) + model = model.transform(InferShapes()) iname = model.graph.input[0].name ename = model.graph.node[0].input[0] model.set_tensor_datatype(iname, idt) @@ -115,13 +122,19 @@ def test_fpgadataflow_lookup(edt, embedding_cfg, exec_mode): ret = execute_onnx(model, {iname: itensor}) exp_out = np.take(embeddings, itensor, axis=0) assert (exp_out == ret[oname]).all() - # call transformation to convert to HLS and verify conversion + # call transformation to convert to HW layer and verify conversion model = model.transform(InferLookupLayer()) assert model.graph.node[0].op_type == "Lookup" assert model.graph.node[0].input[0] == iname assert model.graph.node[0].input[1] == ename assert model.graph.node[0].output[0] == oname + ret_hw = execute_onnx(model, {iname: itensor}) + assert (exp_out == ret_hw[oname]).all() + # call transformation to convert abstraction layer into HLS layer + model = model.transform(SpecializeLayers()) + assert model.graph.node[0].op_type == "Lookup_hls" if exec_mode == "cppsim": + model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareCppSim()) model = model.transform(CompileCppSim()) model = model.transform(SetExecMode("cppsim")) @@ -159,14 +172,10 @@ def test_fpgadataflow_lookup_external(): assert tuple(model.get_tensor_shape(ename)) == eshape assert tuple(model.get_tensor_shape(oname)) == exp_oshape assert (model.get_initializer(ename) == embeddings).all() - # itensor = gen_finn_dt_tensor(idt, ishape).astype(np.int64) - # itensor = np.clip(itensor, 0, num_embeddings - 1) - # ret = 
execute_onnx(model, {iname: itensor}) - # exp_out = np.take(embeddings, itensor, axis=0) - # assert (exp_out == ret[oname]).all() - # call transformation to convert to HLS and verify conversion model = model.transform(InferLookupLayer()) assert model.graph.node[0].op_type == "Lookup" + model = model.transform(SpecializeLayers()) + assert model.graph.node[0].op_type == "Lookup_hls" assert model.graph.node[0].input[0] == iname assert model.graph.node[0].input[1] == ename assert model.graph.node[0].output[0] == oname diff --git a/tests/fpgadataflow/test_fpgadataflow_mvau.py b/tests/fpgadataflow/test_fpgadataflow_mvau.py index b80ef76a19..2a22f3fc41 100644 --- a/tests/fpgadataflow/test_fpgadataflow_mvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_mvau.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -35,7 +35,12 @@ from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.general.multithreshold import multithreshold from qonnx.custom_op.registry import getCustomOp -from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.transformation.general import ( + ApplyConfig, + GiveReadableTensorNames, + GiveUniqueNodeNames, +) +from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.util.basic import ( calculate_signed_dot_prod_range, gen_finn_dt_tensor, @@ -43,15 +48,25 @@ ) import finn.core.onnx_exec as oxe +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.analysis.fpgadataflow.hls_synth_res_estimation import hls_synth_res_estimation from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.derive_characteristic import 
DeriveCharacteristic from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.minimize_accumulator_width import ( + MinimizeAccumulatorWidth, +) +from finn.transformation.fpgadataflow.minimize_weight_bit_width import ( + MinimizeWeightBitWidth, +) from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers def make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T=None, tdt=None): @@ -90,7 +105,7 @@ def make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T=None, tdt=Non actval = 0 no_act = 1 FCLayer_node = helper.make_node( - "MatrixVectorActivation", + "MVAU", node_inp_list, ["outp"], domain="finn.custom_op.fpgadataflow", @@ -127,16 +142,32 @@ def make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T=None, tdt=Non return model -def prepare_inputs(input_tensor, idt, wdt): +def make_single_matmul_modelwrapper(ifm, ofm, idt, wdt, W): + matmul_node = helper.make_node("MatMul", ["ifm", "weights"], ["ofm"]) + graph = helper.make_graph(nodes=[matmul_node], name="matmul_graph", inputs=[ifm], outputs=[ofm]) + + model = qonnx_make_model(graph, producer_name="fclayer-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("ifm", idt) + model.set_tensor_datatype("weights", wdt) + model.set_tensor_datatype( + "ofm", DataType["INT32"] + ) # At this step, the MatMul layer does not optimize the bit-width of the output datatype + model.set_initializer("weights", W) + # model.set_tensor_layout("ifm", DataLayout.NHWC) + + return model + + +def prepare_inputs(input_tensor, idt, wdt, inp_name="inp"): if wdt == 
DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: # convert bipolar to binary - return {"inp": (input_tensor + 1) / 2} + return {inp_name: (input_tensor + 1) / 2} else: - return {"inp": input_tensor} + return {inp_name: input_tensor} -# mem_mode: const or decoupled -@pytest.mark.parametrize("mem_mode", ["const", "decoupled", "external"]) # activation: None or DataType @pytest.mark.parametrize("act", [None, DataType["BIPOLAR"], DataType["INT4"]]) # weight datatype @@ -154,7 +185,7 @@ def prepare_inputs(input_tensor, idt, wdt): @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): +def test_fpgadataflow_mvau_hwop(idt, wdt, act, nf, sf, mw, mh): if nf == -1: nf = mh if sf == -1: @@ -191,10 +222,98 @@ def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): else: tdt = DataType["INT32"] model = make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T, tdt) + # prepare input data + input_dict = prepare_inputs(x, idt, wdt) + if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: + # convert inputs to binary and use xnorpopcountmatmul + y = xp.xnorpopcountmatmul((x + 1) / 2, (W + 1) / 2) + else: + y = np.matmul(x, W) + if T is not None: + # y = multithreshold(y, T) + if act == DataType["BIPOLAR"]: + # binary to bipolar + # y = 2 * y - 1 + y = multithreshold(y, T, 2, -1) + else: + # signed offset + # y += act.min() + y = multithreshold(y, T, 1, act.min()) + oshape = model.get_tensor_shape("outp") + y_expected = y.reshape(oshape) + # execute model + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + + y_produced = y_produced.reshape(y_expected.shape) + + assert (y_produced == y_expected).all(), "cppsim hw-op failed" + + +# mem_mode: internal_embedded or internal_decoupled +@pytest.mark.parametrize("mem_mode", ["internal_embedded", "internal_decoupled", "external"]) +# activation: None or DataType +@pytest.mark.parametrize("act", [None, 
DataType["BIPOLAR"], DataType["INT4"]]) +# weight datatype +@pytest.mark.parametrize("wdt", [DataType["BIPOLAR"], DataType["INT4"]]) +# input datatype +@pytest.mark.parametrize("idt", [DataType["BIPOLAR"], DataType["INT4"]]) +# neuron folding, -1 is maximum possible +@pytest.mark.parametrize("nf", [-1, 2, 1]) +# synapse folding, -1 is maximum possible +@pytest.mark.parametrize("sf", [-1, 2, 1]) +# HLS matrix width (input features) +@pytest.mark.parametrize("mw", [16]) +# HLS matrix height (output features) +@pytest.mark.parametrize("mh", [16]) +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +def test_fpgadataflow_mvau_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): + if nf == -1: + nf = mh + if sf == -1: + sf = mw + pe = mh // nf + simd = mw // sf + assert mh % pe == 0 + assert mw % sf == 0 + # generate weights + W = gen_finn_dt_tensor(wdt, (mw, mh)) + # generate input data + x = gen_finn_dt_tensor(idt, (1, mw)) + if act is None: + # no activation, produce accumulators + T = None + tdt = None + if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: + odt = DataType["UINT32"] + else: + odt = DataType["INT32"] + else: + odt = act + (min, max) = calculate_signed_dot_prod_range(idt, wdt, mw) + n_steps = act.get_num_possible_values() - 1 + T = np.random.randint(min, max - 1, (mh, n_steps)).astype(np.float32) + # provide non-decreasing thresholds + T = np.sort(T, axis=1) + # generate thresholds for activation + if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: + tdt = DataType["UINT32"] + # bias thresholds to be positive + T = np.ceil((T + mw) / 2) + assert (T >= 0).all() + else: + tdt = DataType["INT32"] + model = make_single_fclayer_modelwrapper(W, pe, simd, wdt, idt, odt, T, tdt) + model = model.transform(GiveUniqueNodeNames()) for node in model.graph.node: # lookup op_type in registry of CustomOps inst = getCustomOp(node) inst.set_nodeattr("mem_mode", mem_mode) + # Note: only HLS-based MVAU layers execute CPPsim + 
inst.set_nodeattr("preferred_impl_style", "hls") + model = model.transform(SpecializeLayers("xc7z020clg400-1")) + model = model.transform(GiveUniqueNodeNames()) model = model.transform(SetExecMode("cppsim")) model = model.transform(PrepareCppSim()) model = model.transform(CompileCppSim()) @@ -220,11 +339,11 @@ def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): y_produced = y_produced.reshape(y_expected.shape) - assert (y_produced == y_expected).all(), "cppsim failed" + assert (y_produced == y_expected).all(), "cppsim hls-op failed" -# mem_mode: const or decoupled -@pytest.mark.parametrize("mem_mode", ["const", "decoupled", "external"]) +# mem_mode: internal_embedded or internal_decoupled +@pytest.mark.parametrize("mem_mode", ["internal_embedded", "internal_decoupled", "external"]) # activation: None or DataType @pytest.mark.parametrize("act", [None, DataType["BIPOLAR"], DataType["INT4"]]) # weight datatype @@ -242,7 +361,7 @@ def test_fpgadataflow_fclayer_cppsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): +def test_fpgadataflow_mvau_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): if nf == -1: nf = mh if sf == -1: @@ -283,6 +402,7 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): # lookup op_type in registry of CustomOps inst = getCustomOp(node) inst.set_nodeattr("mem_mode", mem_mode) + inst.set_nodeattr("preferred_impl_style", "hls") # prepare input data input_dict = prepare_inputs(x, idt, wdt) @@ -303,6 +423,7 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): y_expected = y.reshape(oshape) # TODO split up into several dependent tests -- need to check how this # works for parametrized tests... 
+ model = model.transform(SpecializeLayers("xc7z020clg400-1")) model = model.transform(SetExecMode("rtlsim")) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP("xc7z020clg400-1", 5)) @@ -312,9 +433,9 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): assert (y_produced.reshape(y_expected.shape) == y_expected).all(), "rtlsim failed" hls_synt_res_est = model.analysis(hls_synth_res_estimation) - assert "MatrixVectorActivation_0" in hls_synt_res_est + assert "MVAU_hls_0" in hls_synt_res_est - node = model.get_nodes_by_op_type("MatrixVectorActivation")[0] + node = model.get_nodes_by_op_type("MVAU_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) @@ -323,10 +444,10 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): assert exp_cycles != 0 -# mem_mode: const or decoupled -@pytest.mark.parametrize("mem_mode", ["decoupled"]) +# mem_mode: internal_embedded or internal_decoupled +@pytest.mark.parametrize("mem_mode", ["internal_decoupled"]) # activation: None or DataType -@pytest.mark.parametrize("act", [DataType["INT4"]]) +@pytest.mark.parametrize("act", [None, DataType["INT4"]]) # weight datatype @pytest.mark.parametrize("wdt", [DataType["INT4"]]) # input datatype @@ -339,11 +460,15 @@ def test_fpgadataflow_fclayer_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): @pytest.mark.parametrize("mw", [128]) # HLS matrix height (output features) @pytest.mark.parametrize("mh", [128]) +# Backend +@pytest.mark.parametrize("preferred_impl_style", ["hls", "rtl"]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim( - mem_mode, idt, wdt, act, nf, sf, mw, mh +def test_fpgadataflow_mvau_large_depth_decoupled_mode_rtlsim( + mem_mode, idt, wdt, act, nf, sf, mw, mh, preferred_impl_style ): + if preferred_impl_style == "rtl" and act is not None: + 
pytest.skip("RTL-MVAU doesn't support const mem mode or embedded activations") if nf == -1: nf = mh if sf == -1: @@ -384,6 +509,8 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim( # lookup op_type in registry of CustomOps inst = getCustomOp(node) inst.set_nodeattr("mem_mode", mem_mode) + inst.set_nodeattr("resType", "auto") + inst.set_nodeattr("preferred_impl_style", preferred_impl_style) # prepare input data input_dict = prepare_inputs(x, idt, wdt) @@ -404,6 +531,9 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim( y_expected = y.reshape(oshape) # TODO split up into several dependent tests -- need to check how this # works for parametrized tests... + model = model.transform(SpecializeLayers("xc7z020clg400-1")) + model = model.transform(MinimizeWeightBitWidth()) + model = model.transform(MinimizeAccumulatorWidth()) model = model.transform(SetExecMode("rtlsim")) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP("xc7z020clg400-1", 5)) @@ -413,9 +543,13 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim( assert (y_produced.reshape(y_expected.shape) == y_expected).all(), "rtlsim failed" hls_synt_res_est = model.analysis(hls_synth_res_estimation) - assert "MatrixVectorActivation_0" in hls_synt_res_est + if preferred_impl_style == "hls": + assert "MVAU_hls_0" in hls_synt_res_est - node = model.get_nodes_by_op_type("MatrixVectorActivation")[0] + if preferred_impl_style == "hls": + node = model.get_nodes_by_op_type("MVAU_hls")[0] + else: + node = model.get_nodes_by_op_type("MVAU_rtl")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) @@ -424,10 +558,10 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim( assert exp_cycles != 0 -# mem_mode: const or decoupled -@pytest.mark.parametrize("mem_mode", ["decoupled", "const"]) +# mem_mode: internal_embedded or internal_decoupled 
+@pytest.mark.parametrize("mem_mode", ["internal_decoupled", "internal_embedded"]) # activation: None or DataType -@pytest.mark.parametrize("act", [DataType["INT4"]]) +@pytest.mark.parametrize("act", [None, DataType["INT4"]]) # weight datatype @pytest.mark.parametrize("wdt", [DataType["INT4"]]) # input datatype @@ -440,9 +574,15 @@ def test_fpgadataflow_fclayer_large_depth_decoupled_mode_rtlsim( @pytest.mark.parametrize("mw", [32]) # HLS matrix height (output features) @pytest.mark.parametrize("mh", [32]) +# Backend +@pytest.mark.parametrize("preferred_impl_style", ["hls", "rtl"]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_fclayer_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh): +def test_mvau_fifocharacterize_rtlsim( + mem_mode, idt, wdt, act, nf, sf, mw, mh, preferred_impl_style +): + if preferred_impl_style == "rtl" and (mem_mode == "internal_embedded" or act is not None): + pytest.skip("RTL-MVAU doesn't support const mem mode or embedded activations") if nf == -1: nf = mh if sf == -1: @@ -467,8 +607,13 @@ def test_fclayer_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh # lookup op_type in registry of CustomOps inst = getCustomOp(node) inst.set_nodeattr("mem_mode", mem_mode) + inst.set_nodeattr("resType", "auto") + inst.set_nodeattr("preferred_impl_style", preferred_impl_style) total_fold = nf * sf exp_total_cycles = total_fold + 10 + model = model.transform(SpecializeLayers("xc7z020clg400-1")) + model = model.transform(MinimizeWeightBitWidth()) + model = model.transform(MinimizeAccumulatorWidth()) model = model.transform(SetExecMode("rtlsim")) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP("xc7z020clg400-1", 5)) @@ -482,7 +627,101 @@ def test_fclayer_fifocharacterize_rtlsim(mem_mode, idt, wdt, act, nf, sf, mw, mh chrc_out = node_inst.get_nodeattr("io_chrc_out") assert chrc_in.shape == (1, 2 * exp_total_cycles) assert chrc_out.shape == (1, 2 * exp_total_cycles) - # first sf cycles 
should read input continuously - assert (chrc_in[0, :sf] == range(1, sf + 1)).all() + # total number of transactions == 2*SF + assert chrc_in[0, -1] == 2 * sf # all outputs should be produced within the exp n of cycles assert chrc_out[0, exp_total_cycles] == nf + + +@pytest.mark.parametrize("mh", [18]) +@pytest.mark.parametrize("mw", [128]) +@pytest.mark.parametrize("pe", [1, 6, 9, 18]) +@pytest.mark.parametrize("simd", [1, 4, 16, 64, 128]) +@pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]]) +@pytest.mark.parametrize("wdt", [DataType["INT4"], DataType["INT8"]]) +@pytest.mark.parametrize("part", ["xcvc1902-vsva2197-2MP-e-S", "xcku3p-ffva676-1-e"]) +@pytest.mark.parametrize("clk_ns", [1.66, 4]) +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +def test_fpgadataflow_rtl_mvau(mh, mw, pe, simd, idt, wdt, part, clk_ns): + if part == "xcku3p-ffva676-1-e" and clk_ns != 1.66: + pytest.skip( + """Skip test for varying clk for devices other than Versal, + since this variable only affects DSP58s""" + ) + + # Create test input vector (produced by SWG) + ofm_shape = (3, 3) + ofm_h, ofm_w = ofm_shape + ifm = helper.make_tensor_value_info("ifm", TensorProto.FLOAT, [1, ofm_h, ofm_w, mw]) + ofm = helper.make_tensor_value_info("ofm", TensorProto.FLOAT, (1, ofm_h, ofm_w, mh)) + W = gen_finn_dt_tensor(wdt, (mw, mh)) + model = make_single_matmul_modelwrapper(ifm, ofm, idt, wdt, W) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + + # Create MatMul & obtain golden reference output + A = gen_finn_dt_tensor( + model.get_tensor_datatype("global_in"), model.get_tensor_shape("global_in") + ) + input_dict = prepare_inputs(A, idt, wdt, inp_name="global_in") + + # Execute ONNX model + output_matmul = oxe.execute_onnx(model, input_dict)["global_out"] + + # Create MVAU (HLS) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + model = model.transform(GiveUniqueNodeNames()) + + # Apply 
convert-to-rtl step + model = model.transform(SpecializeLayers(part)) + model = model.transform(GiveUniqueNodeNames()) + + # Apply folding (i.e. specify to use DSPs) + folding_config = { + "Defaults": {}, + "MVAU_rtl_0": { + "PE": pe, + "SIMD": simd, + "resType": "dsp", + }, + } + model = model.transform(ApplyConfig(folding_config)) + model = model.transform(MinimizeWeightBitWidth()) + model = model.transform(MinimizeAccumulatorWidth()) + # make sure the changed datatypes are propagated through the network + model = model.transform(InferDataTypes()) + + # Run CPPsim + model = model.transform(SetExecMode("cppsim")) + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + output_mvau_hls = oxe.execute_onnx(model, input_dict)["global_out"] + assert ( + output_matmul == output_mvau_hls + ).all(), "Output of ONNX model not matching output of node-by-node CPPsim!" + + # Run node-by-node RTLsim + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(PrepareIP(part, clk_ns)) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + output_mvau_rtl = oxe.execute_onnx(model, input_dict)["global_out"] + assert ( + output_matmul == output_mvau_rtl + ).all(), "Output of ONNX model not matching output of node-by-node RTLsim!" + + # Run stitched-ip RTLsim + model = model.transform(InsertAndSetFIFODepths(part, clk_ns)) + model = model.transform(PrepareIP(part, clk_ns)) + model = model.transform(HLSSynthIP()) + model = model.transform(CreateStitchedIP(part, clk_ns)) + + model.set_metadata_prop("rtlsim_so", "") + model.set_metadata_prop("exec_mode", "rtlsim") + output_mvau_rtl_stitch = oxe.execute_onnx(model, input_dict)["global_out"] + + assert ( + output_matmul == output_mvau_rtl_stitch + ).all(), "Output of ONNX model not matching output of stitched-IP RTL model!" 
diff --git a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py index b3cf7b4229..1bc2d9d59e 100644 --- a/tests/fpgadataflow/test_fpgadataflow_res_estimate.py +++ b/tests/fpgadataflow/test_fpgadataflow_res_estimate.py @@ -38,6 +38,7 @@ res_estimation, res_estimation_complete, ) +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers def check_two_dict_for_equality(dict1, dict2): @@ -68,7 +69,7 @@ def test_res_estimate(): node_inp_list = ["inp", "weights", "thresh"] FCLayer_node = helper.make_node( - "MatrixVectorActivation", + "MVAU", node_inp_list, ["outp"], domain="finn.custom_op.fpgadataflow", @@ -95,13 +96,14 @@ def test_res_estimate(): model.set_tensor_datatype("outp", odt) model.set_tensor_datatype("weights", wdt) + model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) prod_resource_estimation = model.analysis(res_estimation) expect_resource_estimation = { - "MatrixVectorActivation_0": { + "MVAU_hls_0": { "BRAM_18K": 0, "BRAM_efficiency": 1, - "LUT": 357, + "LUT": 317, "DSP": 0, "URAM_efficiency": 1, "URAM": 0, @@ -115,11 +117,11 @@ def test_res_estimate(): prod_resource_estimation = model.analysis(res_estimation_complete) expect_resource_estimation = { - "MatrixVectorActivation_0": [ + "MVAU_hls_0": [ { "BRAM_18K": 0, "BRAM_efficiency": 1, - "LUT": 352, + "LUT": 313, "DSP": 1, "URAM": 0, "URAM_efficiency": 1, @@ -127,7 +129,7 @@ def test_res_estimate(): { "BRAM_18K": 0, "BRAM_efficiency": 1, - "LUT": 357, + "LUT": 317, "DSP": 0, "URAM": 0, "URAM_efficiency": 1, diff --git a/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py b/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py index 628721b429..0df7181a60 100644 --- a/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py +++ b/tests/fpgadataflow/test_fpgadataflow_streamingmaxpool.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020-2022, Xilinx +# Copyright (C) 2023, 
Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -40,12 +41,13 @@ import finn.core.onnx_exec as oxe from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim -from finn.transformation.fpgadataflow.convert_to_hls_layers import InferStreamingMaxPool +from finn.transformation.fpgadataflow.convert_to_hw_layers import InferStreamingMaxPool from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers def make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt, ceil_mode): @@ -53,9 +55,7 @@ def make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt, ceil_ ifm_dim_h, ifm_dim_w = ifm_dim ofm_dim_h, ofm_dim_w = ofm_dim odt = idt - inp = helper.make_tensor_value_info( - "inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch] - ) + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim_h, ifm_dim_w, ifm_ch]) outp = helper.make_tensor_value_info( "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, ifm_ch] ) @@ -70,9 +70,7 @@ def make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt, ceil_ ceil_mode=ceil_mode, pads=[0, 0, 0, 0], ) - graph = helper.make_graph( - nodes=[mp_node], name="mp_graph", inputs=[inp], outputs=[outp] - ) + graph = helper.make_graph(nodes=[mp_node], name="mp_graph", inputs=[inp], outputs=[outp]) model = qonnx_make_model(graph, producer_name="mp-model") model = ModelWrapper(model) @@ -96,7 +94,7 @@ def prepare_inputs(input_tensor): # input dimension 
@pytest.mark.parametrize("ifm_dim", [4, 10]) # input channels -@pytest.mark.parametrize("ifm_ch", [1, 3]) # 1,3 +@pytest.mark.parametrize("ifm_ch", [1, 3]) # pe @pytest.mark.parametrize("pe", [1, 3]) # ceil mode @@ -106,9 +104,7 @@ def prepare_inputs(input_tensor): @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado -def test_fpgadataflow_streamingmaxpool( - idt, dim_1d, k, ifm_dim, ifm_ch, pe, ceil_mode, exec_mode -): +def test_fpgadataflow_streamingmaxpool(idt, dim_1d, k, ifm_dim, ifm_ch, pe, ceil_mode, exec_mode): ifm_dim_h = ifm_dim k_h = k if dim_1d: @@ -138,18 +134,22 @@ def test_fpgadataflow_streamingmaxpool( # prepare input data input_dict = prepare_inputs(x) - golden = make_single_maxpoolnhwc_modelwrapper( - k, ifm_ch, ifm_dim, ofm_dim, idt, ceil_mode - ) + golden = make_single_maxpoolnhwc_modelwrapper(k, ifm_ch, ifm_dim, ofm_dim, idt, ceil_mode) y_expected = oxe.execute_onnx(golden, input_dict)["outp"] model = golden.transform(InferStreamingMaxPool()) model = model.transform(InferShapes()) - assert model.graph.node[0].op_type == "StreamingMaxPool_Batch" + assert model.graph.node[0].op_type == "StreamingMaxPool" + + # execute model + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + assert (y_produced == y_expected).all() + + model = model.transform(SpecializeLayers()) # Ensure PE value is set - streamingmaxpool_node = model.get_nodes_by_op_type("StreamingMaxPool_Batch")[0] + streamingmaxpool_node = model.get_nodes_by_op_type("StreamingMaxPool_hls")[0] getCustomOp(streamingmaxpool_node).set_nodeattr("PE", pe) if exec_mode == "cppsim": @@ -170,7 +170,7 @@ def test_fpgadataflow_streamingmaxpool( assert (y_produced == y_expected).all() if exec_mode == "rtlsim": - node = model.get_nodes_by_op_type("StreamingMaxPool_Batch")[0] + node = model.get_nodes_by_op_type("StreamingMaxPool_hls")[0] # inst = getCustomOp(node) # cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) diff --git 
a/tests/fpgadataflow/test_fpgadataflow_thresholding.py b/tests/fpgadataflow/test_fpgadataflow_thresholding.py index 96cd69c345..a6e7e41596 100644 --- a/tests/fpgadataflow/test_fpgadataflow_thresholding.py +++ b/tests/fpgadataflow/test_fpgadataflow_thresholding.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -51,40 +51,58 @@ from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers test_fpga_part = "xczu3eg-sbva484-1-e" target_clk_ns = 5 -def make_single_thresholding_modelwrapper( - T, pe, idt, odt, actval, mem_mode, n_inp_vecs -): +def generate_random_threshold_values(input_data_type, num_input_channels, num_steps): + return np.random.randint( + input_data_type.min(), + input_data_type.max() + 1, + (num_input_channels, num_steps), + ).astype(np.float32) + + +def sort_thresholds_increasing(thresholds): + return np.sort(thresholds, axis=1) + + +# n = batch, c = channel, h = height, w = width of feature map +# Standard = NCHW; FINN = NHWC +# Convert from NHWC(FINN) to NCHW(Standard) +def layout_FINN2NCHW(data): + return np.transpose(data, (0, 3, 1, 2)) + + +# Convert from NCHW(Standard) to NHWC(FINN) +def layout_NCHW2FINN(data): + return np.transpose(data, (0, 2, 3, 1)) + + +def make_single_thresholding_modelwrapper(impl_style, T, idt, odt, actval, n_inp_vecs): NumChannels = T.shape[0] - inp = helper.make_tensor_value_info( - "inp", TensorProto.FLOAT, n_inp_vecs + [NumChannels] - ) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, n_inp_vecs + [NumChannels] - ) + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, n_inp_vecs + [NumChannels]) + outp = 
helper.make_tensor_value_info("outp", TensorProto.FLOAT, n_inp_vecs + [NumChannels]) node_inp_list = ["inp", "thresh"] Thresholding_node = helper.make_node( - "Thresholding_Batch", + "Thresholding", node_inp_list, ["outp"], domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", NumChannels=NumChannels, - PE=pe, numSteps=T.shape[1], inputDataType=idt.name, weightDataType=idt.name, # will be set by MinimizeAccumulatorWidth outputDataType=odt.name, ActVal=actval, - mem_mode=mem_mode, numInputVectors=n_inp_vecs, + preferred_impl_style=impl_style, ) graph = helper.make_graph( nodes=[Thresholding_node], @@ -115,39 +133,83 @@ def make_single_thresholding_modelwrapper( # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) # memory mode -@pytest.mark.parametrize("mem_mode", ["const", "decoupled"]) +@pytest.mark.parametrize("mem_mode", ["internal_embedded", "internal_decoupled"]) +@pytest.mark.parametrize("impl_style", ["rtl", "hls"]) @pytest.mark.fpgadataflow @pytest.mark.vivado @pytest.mark.slow -def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode, mem_mode): +def test_fpgadataflow_thresholding(impl_style, idt, act, nf, ich, exec_mode, mem_mode): + # the mem_mode parameter can only be used for the hls thresholding + # so the test will only be executed once for impl_style=rtl and once skipped + # when the mem_mode is varied. Otherwise, the same test configuration would always + # run twice. 
+ if impl_style == "rtl" and mem_mode == "internal_decoupled": + pytest.skip( + "Skip, because test is identical to impl_style=rtl and mem_mode=internal_embedded" + ) if nf == -1: nf = ich pe = ich // nf n_inp_vecs = [1, 2, 2] assert ich % pe == 0 - # generate input data + # generate input data, data layout is NHWC for FINN x = gen_finn_dt_tensor(idt, tuple(n_inp_vecs + [ich])) odt = act n_steps = act.get_num_possible_values() - 1 - T = np.random.randint(idt.min(), idt.max() + 1, (ich, n_steps)).astype(np.float32) - # make the vivado_hls threshold bug appear (incorrect rtlsim result when first - # threshold of first channel is zero, while using BIPOLAR output) - if act == DataType["BIPOLAR"]: - T[0][0] = 0 - # provide non-decreasing thresholds - T = np.sort(T, axis=1) + + # Generate random, non-decreasing thresholds + thresholds = generate_random_threshold_values(idt, ich, n_steps) + + thresholds = sort_thresholds_increasing(thresholds) if odt == DataType["BIPOLAR"]: actval = 0 else: actval = odt.min() + # Build DUT model = make_single_thresholding_modelwrapper( - T, pe, idt, odt, actval, mem_mode, n_inp_vecs + impl_style, thresholds, idt, odt, actval, n_inp_vecs ) + # Expected Reference output + # multithreshold util fxn wants NCHW input, not NHWC + x_nchw = layout_FINN2NCHW(x) + y = multithreshold(x_nchw, thresholds) + + # convert back to NHWC for comparison to hw outputs + y = layout_NCHW2FINN(y) + if act == DataType["BIPOLAR"]: + # binary to bipolar + y = 2 * y - 1 + else: + # signed offset + y += act.min() + + oshape = model.get_tensor_shape("outp") + y_expected = y.reshape(oshape) + + # package input data as dictionary + input_dict = {"inp": x} + + # execute DUT + y_produced = oxe.execute_onnx(model, input_dict)["outp"] + + y_produced = y_produced.reshape(y_expected.shape) + + assert (y_produced == y_expected).all() + + model = model.transform(SpecializeLayers()) + # Make sure that SpecializeLayers did not default to HLS implementation unexpectedly + assert 
model.graph.node[0].op_type == "Thresholding_" + str(impl_style) + node = model.graph.node[0] + inst = getCustomOp(node) + inst.set_nodeattr("PE", pe) + if impl_style == "hls": + inst.set_nodeattr("mem_mode", mem_mode) + if exec_mode == "cppsim": model = model.transform(PrepareCppSim()) model = model.transform(CompileCppSim()) @@ -161,60 +223,49 @@ def test_fpgadataflow_thresholding(idt, act, nf, ich, exec_mode, mem_mode): else: raise Exception("Unknown exec_mode") - # package input data as dictionary - input_dict = {"inp": x} - - # multithreshold util fxn wants NCHW input, not NHWC - y = multithreshold(np.transpose(x, (0, 3, 1, 2)), T) - # convert back to NHWC for comparison to hw outputs - y = np.transpose(y, (0, 2, 3, 1)) - if act == DataType["BIPOLAR"]: - # binary to bipolar - y = 2 * y - 1 - else: - # signed offset - y += act.min() - - oshape = model.get_tensor_shape("outp") - y_expected = y.reshape(oshape) # execute model y_produced = oxe.execute_onnx(model, input_dict)["outp"] y_produced = y_produced.reshape(y_expected.shape) - assert (y_produced == y_expected).all(), "cppsim failed" + assert (y_produced == y_expected).all() if exec_mode == "rtlsim": - hls_synt_res_est = model.analysis(hls_synth_res_estimation) - assert "Thresholding_Batch_0" in hls_synt_res_est - - node = model.get_nodes_by_op_type("Thresholding_Batch")[0] + if impl_style == "hls": + hls_synt_res_est = model.analysis(hls_synth_res_estimation) + assert model.graph.node[0].name in hls_synt_res_est + node = model.get_nodes_by_op_type(model.graph.node[0].op_type)[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) exp_cycles = exp_cycles_dict[node.name] - assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) + assert np.isclose(exp_cycles, cycles_rtlsim, atol=15) assert exp_cycles != 0 +@pytest.mark.parametrize("impl_style", ["rtl", "hls"]) +# configuration (ch, pe) +@pytest.mark.parametrize("cfg", [(1, 1), (6, 
2), (6, 3), (8, 4)]) @pytest.mark.fpgadataflow @pytest.mark.vivado -def test_runtime_thresholds_single_layer(): +def test_runtime_thresholds_read(impl_style, cfg): + """Read back threshold weights during runtime + + 1. Create random initial weights T + 2. Execute model + 3. Read back weights via AXI + 4. Compare with initial weights T + """ + ch = cfg[0] + pe = cfg[1] n_inp_vecs = [1, 2, 2] - mem_mode = "decoupled" + hls_mem_mode = "internal_decoupled" act = DataType["INT4"] idt = DataType["INT16"] - nf = 8 - ich = 16 - pe = ich // nf - assert ich % pe == 0 - - # generate input data - in_tensor = gen_finn_dt_tensor(idt, tuple(n_inp_vecs + [ich])) - odt = act n_steps = act.get_num_possible_values() - 1 - T = np.random.randint(idt.min(), idt.max() + 1, (ich, n_steps)).astype(np.float32) + np.random.seed(2) + T = np.random.randint(idt.min(), idt.max() + 1, (ch, n_steps)).astype(np.float32) # provide non-decreasing thresholds T = np.sort(T, axis=1) @@ -223,19 +274,29 @@ def test_runtime_thresholds_single_layer(): else: actval = odt.min() - model = make_single_thresholding_modelwrapper( - T, pe, idt, odt, actval, mem_mode, n_inp_vecs - ) - op_inst = getCustomOp(model.graph.node[0]) + model = make_single_thresholding_modelwrapper(impl_style, T, idt, odt, actval, n_inp_vecs) + model = model.transform(SpecializeLayers()) + + # Make sure that specialize layer did not default to HLS implementation + assert model.graph.node[0].op_type == "Thresholding_" + str(impl_style) + + node = model.get_nodes_by_op_type(f"Thresholding_{impl_style}")[0] + op_inst = getCustomOp(node) + op_inst.set_nodeattr("PE", pe) + if impl_style == "hls": + op_inst.set_nodeattr("mem_mode", hls_mem_mode) op_inst.set_nodeattr("runtime_writeable_weights", 1) - op_inst.make_weight_file(T, "decoupled_runtime", "old_weights.dat") - with open("old_weights.dat", "r") as f: + + dat_fname = f"old_weights_{cfg}.dat" + op_inst.make_weight_file(T, "decoupled_runtime", dat_fname) + with open(dat_fname, "r") as f: 
old_weight_stream = f.read().strip() - os.remove("old_weights.dat") + os.remove(dat_fname) old_weight_stream = map(lambda x: int(x, 16), old_weight_stream.split("\n")) old_weight_stream = list(old_weight_stream) # need to create stitched IP for runtime weight testing model = model.transform(InsertFIFO(True)) + model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) @@ -245,66 +306,154 @@ def test_runtime_thresholds_single_layer(): # add two copies of the input tensor as the first one is just used to # "flush out" the pipeline (as mvau already starts receiving old weights while # we read/write new ones and reads seem to cause a disturbance too) + # generate input data + in_tensor = gen_finn_dt_tensor(idt, tuple(n_inp_vecs + [ch])) in_tensor = np.tile(in_tensor, (2, 1, 1, 1)) + exec_ctx = {"inp": in_tensor} extracted_weight_stream = [] def read_weights(sim): addr = 0 for i in range(len(old_weight_stream)): - extracted_weight_stream.append( - axilite_read(sim, addr, basename="s_axilite_0_") - ) + extracted_weight_stream.append(axilite_read(sim, addr, basename="s_axilite_0_")) addr += 4 rtlsim_exec(model, exec_ctx, pre_hook=read_weights) + + # Validate the AXI Read weights assert extracted_weight_stream == old_weight_stream - # only use second batch element in output; first will be invalid due to - # old weights (see above) - y = exec_ctx["outp"][1] + + y = exec_ctx["outp"][0] # multithreshold util fxn wants NCHW input, not NHWC expected = multithreshold(np.transpose(in_tensor, (0, 3, 1, 2)), T) # convert back to NHWC for comparison to hw outputs expected = np.transpose(expected, (0, 2, 3, 1))[1] - # expected = multithreshold(in_tensor, T)[1] if act == DataType["BIPOLAR"]: - # binary to bipolar + # binary to bipolarW expected = 2 * expected - 1 else: # signed offset expected += act.min() + + # Validate the output is as expected 
assert (y == expected).all() - new_weights = np.random.randint(idt.min(), idt.max() + 1, (ich, n_steps)).astype( - np.float32 - ) + +@pytest.mark.parametrize("impl_style", ["hls", "rtl"]) +# configuration (ch, pe) +@pytest.mark.parametrize("cfg", [(1, 1), (6, 2), (6, 3), (8, 4)]) +@pytest.mark.fpgadataflow +@pytest.mark.vivado +def test_runtime_thresholds_write(impl_style, cfg): + """Write threshold weights during runtime + + 1. Create random initial weights T_init + 2. Create model with initial weights + 3. Create new set of weights T_write + 4. Write T_write using AXI bus + 5. Read back using AXI bus to T_read + 6. Compare T_write and T_read + 7. Validate outputs with expected vectors + """ + ch = cfg[0] + pe = cfg[1] + + n_inp_vecs = [1, 2, 2] + hls_mem_mode = "internal_decoupled" + act = DataType["INT4"] + idt = DataType["INT16"] + + odt = act + n_steps = act.get_num_possible_values() - 1 + np.random.seed(2) + T_init = np.random.randint(idt.min(), idt.max() + 1, (ch, n_steps)).astype(np.float32) + # provide non-decreasing thresholds + T_init = np.sort(T_init, axis=1) + + if odt == DataType["BIPOLAR"]: + actval = 0 + else: + actval = odt.min() + + model = make_single_thresholding_modelwrapper(impl_style, T_init, idt, odt, actval, n_inp_vecs) + model = model.transform(SpecializeLayers()) + + # Validate that specialize layer did not default to HLS implementation + assert model.graph.node[0].op_type == "Thresholding_" + str(impl_style) + + op_inst = getCustomOp(model.graph.node[0]) + op_inst.set_nodeattr("PE", pe) + if impl_style == "hls": + op_inst.set_nodeattr("mem_mode", hls_mem_mode) + op_inst.set_nodeattr("runtime_writeable_weights", 1) + + # Make new weights for runtime write + np.random.seed(4) + T_write = np.random.randint(idt.min(), idt.max() + 1, (ch, n_steps)).astype(np.float32) # provide non-decreasing thresholds - new_weights = np.sort(T, axis=1) - op_inst.make_weight_file(new_weights, "decoupled_runtime", "new_weights.dat") - with 
open("new_weights.dat", "r") as f:
-        new_weight_stream = f.read().strip()
-    os.remove("new_weights.dat")
-    new_weight_stream = map(lambda x: int(x, 16), new_weight_stream.split("\n"))
-    new_weight_stream = list(new_weight_stream)
+    T_write = np.sort(T_write, axis=1)
+
+    dat_fname = f"T_write_{cfg}.dat"  # distinguish fname per parameter for distributed testing
+    op_inst.make_weight_file(T_write, "decoupled_runtime", dat_fname)
+    with open(dat_fname, "r") as f:
+        T_write_stream = f.read().strip()
+    os.remove(dat_fname)
+
+    T_write_stream = map(lambda x: int(x, 16), T_write_stream.split("\n"))
+    T_write_stream = list(T_write_stream)
+
+    # need to create stitched IP for runtime weight testing
+    model = model.transform(InsertFIFO(True))
+    model = model.transform(SpecializeLayers())
+    model = model.transform(GiveUniqueNodeNames())
+    model = model.transform(PrepareIP(test_fpga_part, target_clk_ns))
+    model = model.transform(HLSSynthIP())
+    model = model.transform(CreateStitchedIP(test_fpga_part, target_clk_ns))
+    model = model.transform(PrepareRTLSim())
+    model.set_metadata_prop("exec_mode", "rtlsim")
+    # add two copies of the input tensor as the first one is just used to
+    # "flush out" the pipeline (as mvau already starts receiving old weights while
+    # we read/write new ones and reads seem to cause a disturbance too)
+    # generate input data
+    in_tensor = gen_finn_dt_tensor(idt, tuple(n_inp_vecs + [ch]))
+    in_tensor = np.tile(in_tensor, (2, 1, 1, 1))
+
+    exec_ctx_write = {"inp": in_tensor}
 
     def write_weights(sim):
         addr = 0
-        for nw in new_weight_stream:
+        for nw in T_write_stream:
             axilite_write(sim, addr, nw, basename="s_axilite_0_")
             addr += 4
 
-    rtlsim_exec(model, exec_ctx, pre_hook=write_weights)
-    y = exec_ctx["outp"][1]
+    T_read_stream = []
+
+    def read_weights(sim):
+        addr = 0
+        for i in range(len(T_write_stream)):
+            T_read_stream.append(axilite_read(sim, addr, basename="s_axilite_0_"))
+            addr += 4
+
+    rtlsim_exec(model, exec_ctx_write, pre_hook=write_weights, 
post_hook=read_weights)
+
+    y = exec_ctx_write["outp"][1]
+
+    assert T_read_stream == T_write_stream
+
     # multithreshold util fxn wants NCHW input, not NHWC
-    expected = multithreshold(np.transpose(in_tensor, (0, 3, 1, 2)), new_weights)
+    expected = multithreshold(np.transpose(in_tensor, (0, 3, 1, 2)), T_write)
     # convert back to NHWC for comparison to hw outputs
     expected = np.transpose(expected, (0, 2, 3, 1))[1]
+
     if act == DataType["BIPOLAR"]:
-        # binary to bipolar
+        # binary to bipolar
         expected = 2 * expected - 1
     else:
         # signed offset
         expected += act.min()
+
+    # Validate the output is as expected
     assert (y == expected).all()
diff --git a/tests/fpgadataflow/test_fpgadataflow_upsampler.py b/tests/fpgadataflow/test_fpgadataflow_upsampler.py
index a08d31f7b0..b0da767eaa 100644
--- a/tests/fpgadataflow/test_fpgadataflow_upsampler.py
+++ b/tests/fpgadataflow/test_fpgadataflow_upsampler.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2020, Xilinx
+# Copyright (C) 2020-2022, Xilinx, Inc.
+# Copyright (C) 2023, Advanced Micro Devices, Inc.
 # All rights reserved.
# # Redistribution and use in source and binary forms, with or without @@ -32,7 +33,7 @@ import os import shutil import torch -from brevitas.export import FINNManager +from brevitas.export import export_qonnx from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.base import Transformation @@ -41,17 +42,20 @@ from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes from qonnx.transformation.make_input_chanlast import MakeInputChannelsLast +from qonnx.util.cleanup import cleanup as qonnx_cleanup from torch import nn import finn.core.onnx_exec as oxe import finn.transformation.streamline.absorb as absorb from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim -from finn.transformation.fpgadataflow.convert_to_hls_layers import InferUpsample +from finn.transformation.fpgadataflow.convert_to_hw_layers import InferUpsample from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers +from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.util.basic import make_build_dir tmpdir = os.environ["FINN_BUILD_DIR"] @@ -81,29 +85,6 @@ def apply(self, model): _to_chan_first_args = (0, 3, 1, 2) -class TransposeUpsampleIO(Transformation): - """ - Converts the inputs outputs for all Upsample and Resize nodes - from NCHW to NHWC. 
- """ - - def apply(self, model): - graph = model.graph - for n in graph.node: - if n.op_type == "Upsample" or n.op_type == "Resize": - # Set input shape - inp = n.input[0] - NCHW_shape = model.get_tensor_shape(inp) - NHWC_shape = [NCHW_shape[idx] for idx in _to_chan_last_args] - model.set_tensor_shape(inp, NHWC_shape) - # Set output shape - out = n.output[0] - NCHW_shape = model.get_tensor_shape(out) - NHWC_shape = [NCHW_shape[idx] for idx in _to_chan_last_args] - model.set_tensor_shape(out, NHWC_shape) - return model, False - - class PyTorchTestModel(nn.Module): def __init__(self, upscale_factor=2): super(PyTorchTestModel, self).__init__() @@ -154,10 +135,11 @@ def test_fpgadataflow_upsampler(dt, IFMDim, scale, NumChannels, exec_mode, is_1d # Get golden PyTorch and ONNX inputs golden_torch_float = torch_model(test_in) export_path = f"{tmpdir}/Upsample_exported.onnx" - FINNManager.export( - torch_model, input_shape=input_shape, export_path=export_path, opset_version=11 - ) + export_qonnx(torch_model, torch.randn(input_shape), export_path, opset_version=11) + qonnx_cleanup(export_path, out_file=export_path) model = ModelWrapper(export_path) + model = model.transform(ConvertQONNXtoFINN()) + model = model.transform(InferShapes()) input_dict = {model.graph.input[0].name: test_in.numpy().astype(np.int32)} input_dict = {model.graph.input[0].name: test_in.numpy()} golden_output_dict = oxe.execute_onnx(model, input_dict, True) @@ -169,7 +151,6 @@ def test_fpgadataflow_upsampler(dt, IFMDim, scale, NumChannels, exec_mode, is_1d # Prep model for execution model = ModelWrapper(export_path) - # model = model.transform(TransposeUpsampleIO()) model = model.transform(MakeInputChannelsLast()) model = model.transform(InferDataLayouts()) model = model.transform(absorb.AbsorbTransposeIntoResize()) @@ -182,8 +163,18 @@ def test_fpgadataflow_upsampler(dt, IFMDim, scale, NumChannels, exec_mode, is_1d # Check that all nodes are UpsampleNearestNeighbour_Batch nodes for n in 
model.get_finn_nodes(): - node_check = n.op_type == "UpsampleNearestNeighbour_Batch" - assert node_check, "All nodes should be UpsampleNearestNeighbour_Batch nodes." + node_check = n.op_type == "UpsampleNearestNeighbour" + assert node_check, "All nodes should be UpsampleNearestNeighbour nodes." + + test_in_transposed = test_in.numpy().transpose(_to_chan_last_args) + input_dict = {model.graph.input[0].name: test_in_transposed} + + # Run sim + output_dict = oxe.execute_onnx(model, input_dict, True) + test_result = output_dict[model.graph.output[0].name] + output_matches = np.isclose(golden_result, test_result, atol=atol).all() + + model = model.transform(SpecializeLayers()) # Prep sim if exec_mode == "cppsim": @@ -200,8 +191,6 @@ def test_fpgadataflow_upsampler(dt, IFMDim, scale, NumChannels, exec_mode, is_1d raise Exception("Unknown exec_mode") # Run sim - test_in_transposed = test_in.numpy().transpose(_to_chan_last_args) - input_dict = {model.graph.input[0].name: test_in_transposed} output_dict = oxe.execute_onnx(model, input_dict, True) test_result = output_dict[model.graph.output[0].name] output_matches = np.isclose(golden_result, test_result, atol=atol).all() diff --git a/tests/fpgadataflow/test_fpgadataflow_vvau.py b/tests/fpgadataflow/test_fpgadataflow_vvau.py index abf8ba0b9e..236176faa6 100644 --- a/tests/fpgadataflow/test_fpgadataflow_vvau.py +++ b/tests/fpgadataflow/test_fpgadataflow_vvau.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -32,19 +32,40 @@ from onnx import TensorProto, helper from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.general.im2col import compute_conv_output_dim from qonnx.custom_op.general.multithreshold import multithreshold from qonnx.custom_op.registry import getCustomOp -from qonnx.transformation.general import GiveUniqueNodeNames +from qonnx.transformation.general import ( + ApplyConfig, + GiveReadableTensorNames, + GiveUniqueNodeNames, +) +from qonnx.transformation.infer_datatypes import InferDataTypes +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul from qonnx.util.basic import gen_finn_dt_tensor, qonnx_make_model import finn.core.onnx_exec as oxe +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw from finn.analysis.fpgadataflow.exp_cycles_per_layer import exp_cycles_per_layer from finn.transformation.fpgadataflow.compile_cppsim import CompileCppSim +from finn.transformation.fpgadataflow.create_dataflow_partition import ( + CreateDataflowPartition, +) +from finn.transformation.fpgadataflow.create_stitched_ip import CreateStitchedIP from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP +from finn.transformation.fpgadataflow.minimize_accumulator_width import ( + MinimizeAccumulatorWidth, +) +from finn.transformation.fpgadataflow.minimize_weight_bit_width import ( + MinimizeWeightBitWidth, +) from finn.transformation.fpgadataflow.prepare_cppsim import PrepareCppSim from finn.transformation.fpgadataflow.prepare_ip import PrepareIP from finn.transformation.fpgadataflow.prepare_rtlsim import PrepareRTLSim from finn.transformation.fpgadataflow.set_exec_mode import SetExecMode +from finn.transformation.fpgadataflow.set_fifo_depths import InsertAndSetFIFODepths +from finn.transformation.fpgadataflow.specialize_layers import 
SpecializeLayers def _infer_sparse_weight_tensor(W_conv, k_h, k_w, channels): @@ -77,6 +98,7 @@ def _calculate_dot_prod_range(dt_a, dt_b, len): def _make_single_vvau_modelwrapper( W, pe, + simd, k_h, k_w, channels, @@ -87,7 +109,7 @@ def _make_single_vvau_modelwrapper( odt, T=None, tdt=None, - mem_mode="const", + mem_mode="internal_embedded", ): in_shape = [1, dim_h, dim_w, k_h * k_w * channels] # [N, H, W, K*K*CH] out_shape = [ @@ -103,19 +125,23 @@ def _make_single_vvau_modelwrapper( if T is not None: no_act = 0 node_inp_list = ["inp", "weights", "thresh"] - actval = odt.min() + if odt == DataType["BIPOLAR"]: + actval = 0 + else: + actval = odt.min() else: no_act = 1 node_inp_list = ["inp", "weights"] actval = 0 VVAU_node = helper.make_node( - "VectorVectorActivation", + "VVAU", node_inp_list, ["outp"], domain="finn.custom_op.fpgadataflow", backend="fpgadataflow", PE=pe, + SIMD=simd, Dim=[dim_h, dim_w], Channels=channels, Kernel=[k_h, k_w], @@ -128,9 +154,7 @@ def _make_single_vvau_modelwrapper( mem_mode=mem_mode, ) - graph = helper.make_graph( - nodes=[VVAU_node], name="vvau_graph", inputs=[inp], outputs=[outp] - ) + graph = helper.make_graph(nodes=[VVAU_node], name="vvau_graph", inputs=[inp], outputs=[outp]) model = qonnx_make_model(graph, producer_name="vvau-model") model = ModelWrapper(model) @@ -146,21 +170,22 @@ def _make_single_vvau_modelwrapper( model.set_tensor_datatype("thresh", tdt) model.set_initializer("thresh", T) - return model - + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) -def prepare_inputs(input_tensor): - return {"inp": input_tensor} + return model # input datatype -@pytest.mark.parametrize("idt", [DataType["UINT4"], DataType["UINT8"]]) +@pytest.mark.parametrize("idt", [DataType["BIPOLAR"], DataType["UINT4"]]) # weight datatype -@pytest.mark.parametrize("wdt", [DataType["INT4"]]) +@pytest.mark.parametrize("wdt", [DataType["BIPOLAR"], DataType["UINT4"]]) # activation: None or DataType 
-@pytest.mark.parametrize("act", [DataType["UINT4"], None]) +@pytest.mark.parametrize("act", [DataType["BIPOLAR"], DataType["UINT4"], None]) # PE -@pytest.mark.parametrize("pe", [1, "channels"]) +@pytest.mark.parametrize("pe", [1, 3, 6]) +# SIMD +@pytest.mark.parametrize("simd", [1, 9]) # Input image shape @pytest.mark.parametrize("dim_h", [10]) @pytest.mark.parametrize("dim_w", [10, 1]) @@ -168,31 +193,29 @@ def prepare_inputs(input_tensor): @pytest.mark.parametrize("k_h", [3]) @pytest.mark.parametrize("k_w", [3, 1]) # Number of input and output channels -@pytest.mark.parametrize("channels", [3, 4]) +@pytest.mark.parametrize("channels", [3, 6]) # memory mode -@pytest.mark.parametrize("mem_mode", ["const", "decoupled"]) +@pytest.mark.parametrize("mem_mode", ["internal_embedded", "internal_decoupled"]) # execution mode @pytest.mark.parametrize("exec_mode", ["cppsim", "rtlsim"]) @pytest.mark.fpgadataflow @pytest.mark.slow @pytest.mark.vivado def test_fpgadataflow_vvau( - idt, wdt, act, pe, dim_h, dim_w, k_h, k_w, channels, mem_mode, exec_mode + idt, wdt, act, pe, simd, dim_h, dim_w, k_h, k_w, channels, mem_mode, exec_mode ): - if pe == "channels": - pe = channels - if dim_w == 1 and k_w != 1: pytest.skip("1D image requires 1D kernel, skipping.") if channels % pe != 0: pytest.skip("Requirement Channels divisable by PE is violated.") + if (k_h * k_w) % simd != 0: + pytest.skip("Requirement kernel (k_h * k_w) divisable by SIMD is violated.") + # Generate weights in expected shape for ONNX and HLS node W = gen_finn_dt_tensor(wdt, (channels, 1, k_h, k_w)) # shape: [channels, 1, k, k] - W_onnx = _infer_sparse_weight_tensor( - W, k_h, k_w, channels - ) # shape: [k*k*channels, channels] + W_onnx = _infer_sparse_weight_tensor(W, k_h, k_w, channels) # shape: [k*k*channels, channels] # Generate inputs in expected format for ONNX and HLS node x = gen_finn_dt_tensor(idt, (1, dim_h, dim_w, k_h * k_w * channels)) @@ -203,18 +226,33 @@ def test_fpgadataflow_vvau( if act is None: T = 
None tdt = None - odt = DataType["INT32"] + if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: + odt = DataType["UINT32"] + else: + odt = DataType["INT32"] else: odt = act - (min_v, max_v) = _calculate_dot_prod_range(idt, wdt, k_h * k_w * channels) + (min_v, max_v) = _calculate_dot_prod_range(idt, wdt, k_h * k_w) n_steps = act.get_num_possible_values() - 1 T = np.random.randint(min_v, max_v - 1, (channels, n_steps)).astype(np.float32) T = np.sort(T, axis=1) - tdt = DataType["INT32"] + if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: + tdt = DataType["UINT32"] + # bias thresholds to be positive + T = np.ceil((T + (k_h * k_w)) / 2) + assert (T >= 0).all() + else: + tdt = DataType["INT32"] model = _make_single_vvau_modelwrapper( - W, pe, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T, tdt, mem_mode + W, pe, simd, k_h, k_w, channels, dim_h, dim_w, wdt, idt, odt, T, tdt, mem_mode ) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + + input_dict = prepare_inputs(x_vvau) + y_hwop = oxe.execute_onnx(model, input_dict)["global_out"] + model = model.transform(SpecializeLayers("xc7z020clg400-1")) if exec_mode == "cppsim": model = model.transform(SetExecMode("cppsim")) @@ -229,29 +267,204 @@ def test_fpgadataflow_vvau( else: raise Exception("Unknown exec_mode in test_fpgadataflow_vvau") - input_dict = prepare_inputs(x_vvau) - # Calculate output - y_expected = np.matmul(x, W_onnx) # Y is in [N, H, W, C] format + if wdt == DataType["BIPOLAR"] and idt == DataType["BIPOLAR"]: + # Simulate XNOR-popcount matrix multiplication, see + # qonnx.custom_op.general.xnorpopcount (not usable due to sparse W) + y_expected = np.matmul(x, W_onnx) + y_expected = (y_expected + (k_h * k_w)) / 2 + else: + y_expected = np.matmul(x, W_onnx) # Y is in [N, H, W, C] format + if T is not None: # Reshape Y, as multithreshold expects Y to be in [N, C, H, W] format y_expected = np.transpose(y_expected, (0, 3, 1, 2)) y_expected 
= multithreshold(y_expected, T) y_expected = np.transpose(y_expected, (0, 2, 3, 1)) - # signed offset - y_expected += act.min() + if act == DataType["BIPOLAR"]: + # binary to bipolar + y_expected = 2 * y_expected - 1 + else: + # signed offset + y_expected += act.min() - y_produced = oxe.execute_onnx(model, input_dict, return_full_exec_context=False)[ - "outp" - ] + y_produced = oxe.execute_onnx(model, input_dict, return_full_exec_context=False)["global_out"] - assert (y_produced == y_expected).all(), "cppsim failed" + assert (y_hwop == y_expected).all(), "VVAU HW-op mismatches with golden output!" + assert (y_produced == y_expected).all(), "VVAU specialized-op mismatches with golden output!" if exec_mode == "rtlsim": - node = model.get_nodes_by_op_type("VectorVectorActivation")[0] + node = model.get_nodes_by_op_type("VVAU_hls")[0] inst = getCustomOp(node) cycles_rtlsim = inst.get_nodeattr("cycles_rtlsim") exp_cycles_dict = model.analysis(exp_cycles_per_layer) exp_cycles = exp_cycles_dict[node.name] assert np.isclose(exp_cycles, cycles_rtlsim, atol=10) assert exp_cycles != 0 + + +def make_single_dw_conv_modelwrapper(conv_config, idt, wdt): + kernel_size, in_feature_dim, in_chn = conv_config + stride = 1 + pad = 0 + + out_feature_dim = compute_conv_output_dim(in_feature_dim, kernel_size, stride, pad) + group = out_chn = in_chn + + conv_param_shape = [out_chn, 1, kernel_size, kernel_size] + input_shape = [1, in_chn, in_feature_dim, in_feature_dim] + output_shape = [1, out_chn, out_feature_dim, out_feature_dim] + + conv_config = {} + conv_config["dilations"] = [1, 1] + conv_config["group"] = group + conv_config["kernel_shape"] = [kernel_size, kernel_size] + conv_config["pads"] = [pad, pad, pad, pad] + conv_config["strides"] = [stride, stride] + + ifm = helper.make_tensor_value_info("ifm", TensorProto.FLOAT, input_shape) + ofm = helper.make_tensor_value_info("ofm", TensorProto.FLOAT, output_shape) + weights = [helper.make_tensor_value_info("weights", TensorProto.FLOAT, 
conv_param_shape)] + + modelproto = qonnx_make_model( + helper.make_graph( + name="conv_test", + inputs=[ifm], + outputs=[ofm], + value_info=weights, + nodes=[helper.make_node("Conv", ["ifm", "weights"], ["ofm"], **conv_config)], + ) + ) + + model = ModelWrapper(modelproto) + model.set_tensor_datatype("ifm", idt) + model.set_tensor_datatype("weights", wdt) + model.set_initializer("weights", gen_finn_dt_tensor(wdt, conv_param_shape)) + + model = model.transform(InferShapes()) + model = model.transform(InferDataTypes()) + + return model + + +def prepare_inputs(input_tensor): + return {"global_in": input_tensor} + + +# kernel size (square) +@pytest.mark.parametrize("kernel_size", [3]) +# IFM size (square) +@pytest.mark.parametrize("in_feature_dim", [5]) +# input channels +@pytest.mark.parametrize("in_chn", [4]) +# input datatype +@pytest.mark.parametrize("idt", [DataType["INT8"]]) +# weight datatype +@pytest.mark.parametrize("wdt", [DataType["INT6"]]) +# targeted board +@pytest.mark.parametrize("part", ["xcvm1802-vsvd1760-2MP-e-S"]) +# pe +@pytest.mark.parametrize("pe", [1, 2, 4]) +# simd +@pytest.mark.parametrize("simd", [1, 3, 9]) +@pytest.mark.fpgadataflow +@pytest.mark.slow +@pytest.mark.vivado +def test_fpgadataflow_vvau_rtl(kernel_size, in_feature_dim, in_chn, idt, wdt, part, pe, simd): + # Create depthwise-separable convolution + conv_config = (kernel_size, in_feature_dim, in_chn) + model = make_single_dw_conv_modelwrapper(conv_config, idt, wdt) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + + # Obtain golden reference output + golden_in = gen_finn_dt_tensor( + model.get_tensor_datatype("global_in"), model.get_tensor_shape("global_in") + ) + input_dict = prepare_inputs(golden_in) + golden_out = oxe.execute_onnx(model, input_dict, return_full_exec_context=True)["global_out"] + + # Convert to HLS custom-op first + model = model.transform(LowerConvsToMatMul()) + model = 
model.transform(to_hw.InferConvInpGen()) + model = model.transform(to_hw.InferVectorVectorActivation()) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveReadableTensorNames()) + + output_vvau_hw = oxe.execute_onnx(model, input_dict, return_full_exec_context=True)[ + "global_out" + ] + assert ( + golden_out == output_vvau_hw + ).all(), "Output of ONNX model not matching output of HW-ops!" + + # Obtain second reference from HLS-based VVAU layer + model = model.transform(SpecializeLayers(part)) + model = model.transform(GiveUniqueNodeNames()) + + # Apply folding (i.e. specify to use DSPs) + folding_config = { + "Defaults": {}, + "ConvolutionInputGenerator_rtl_0": { + "SIMD": pe, + "parallel_window": 1, + }, + "VVAU_rtl_0": { + "PE": pe, + "SIMD": simd, + "resType": "dsp", + }, + } + model = model.transform(ApplyConfig(folding_config)) + model = model.transform(GiveUniqueNodeNames()) + model = model.transform(MinimizeWeightBitWidth()) + model = model.transform(MinimizeAccumulatorWidth()) + # make sure the changed datatypes are propagated through the network + model = model.transform(InferDataTypes()) + + # Run CPPsim + model = model.transform(SetExecMode("cppsim")) + model = model.transform(PrepareCppSim()) + model = model.transform(CompileCppSim()) + output_vvau_cppsim = oxe.execute_onnx(model, input_dict)["global_out"] + assert ( + golden_out == output_vvau_cppsim + ).all(), "Output of ONNX model not matching output of node-by-node CPPsim!" + + # Run node-by-node RTLsim + model = model.transform(SetExecMode("rtlsim")) + model = model.transform(PrepareIP(part, 5)) + model = model.transform(HLSSynthIP()) + model = model.transform(PrepareRTLSim()) + output_vvau_rtlsim = oxe.execute_onnx(model, input_dict, return_full_exec_context=True)[ + "global_out" + ] + + assert ( + golden_out == output_vvau_rtlsim + ).all(), "Output of ONNX model not matching output of specialized HW-ops!" 
+
+    # Stitched-IP RTLsim
+    model = model.transform(CreateDataflowPartition())
+    partition_model_path = getCustomOp(
+        model.get_nodes_by_op_type("StreamingDataflowPartition")[0]
+    ).get_nodeattr("model")
+    partitioned_model = ModelWrapper(partition_model_path)
+    # FIFOs needed for stitched-ip RTLsim, DWC needed for VVU operating on SIMD parallelism
+    partitioned_model = partitioned_model.transform(InsertAndSetFIFODepths(part, 5))
+    partitioned_model = partitioned_model.transform(PrepareIP(part, 5))
+    partitioned_model = partitioned_model.transform(HLSSynthIP())
+    partitioned_model = partitioned_model.transform(CreateStitchedIP(part, 5))
+    # set top-level prop for stitched-ip rtlsim and launch
+    partitioned_model.set_metadata_prop("exec_mode", "rtlsim")
+    # transpose input since we're now simulating HW layers (NCHW --> NHWC)
+    input_dict["global_in"] = np.transpose(input_dict["global_in"], (0, 2, 3, 1))
+    output_vvau_stitched = oxe.execute_onnx(
+        partitioned_model, input_dict, return_full_exec_context=True
+    )["global_out"]
+    # transpose hardware-generated outputs NHWC -> NCHW to be comparable
+    output_vvau_stitched = output_vvau_stitched.transpose(0, 3, 1, 2)
+
+    assert (
+        golden_out == output_vvau_stitched
+    ).all(), "Output of ONNX model not matching output of stitched-IP RTL model!"
diff --git a/tests/fpgadataflow/test_minimize_bit_width.py b/tests/fpgadataflow/test_minimize_bit_width.py
new file mode 100644
index 0000000000..4b26e7ac00
--- /dev/null
+++ b/tests/fpgadataflow/test_minimize_bit_width.py
@@ -0,0 +1,308 @@
+# Copyright (C) 2023, Advanced Micro Devices, Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+#   list of conditions and the following disclaimer.
+# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import pytest + +import numpy as np +from onnx import TensorProto, helper +from qonnx.core.datatype import BipolarType, DataType, IntType +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.custom_op.registry import getCustomOp +from qonnx.util.basic import gen_finn_dt_tensor, roundup_to_integer_multiple +from typing import Optional, Union + +from finn.custom_op.fpgadataflow.matrixvectoractivation import MVAU +from finn.custom_op.fpgadataflow.vectorvectoractivation import VVAU +from finn.transformation.fpgadataflow.minimize_accumulator_width import ( + MinimizeAccumulatorWidth, +) +from finn.transformation.fpgadataflow.minimize_weight_bit_width import ( + MinimizeWeightBitWidth, +) + + +def make_unit_test_model(wdt: DataType, idt: DataType, tdt: Optional[DataType] = None): + """Creates a toy finn-onnx model for unit testing. The VVAU-MVAU pair is based + on the first pair of MobileNetV1""" + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, 32, 32, 288]) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, 32, 32, 64]) + layer1 = helper.make_node( + "VVAU", + ["inp", "params0", "thresh0"] if tdt is not None else ["inp", "params0"], + ["hid"], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + PE=1, + Channels=32, + Dim=(32, 32), + Kernel=(3, 3), + inputDataType=idt.name, + outputDataType=idt.name, + weightDataType=wdt.name, + ActVal=tdt.min() if tdt is not None else 0, + noActivation=0 if tdt is not None else 1, + ) + layer2 = helper.make_node( + "MVAU", + ["hid", "params1", "thresh1"] if tdt is not None else ["hid", "params1"], + ["outp"], + domain="finn.custom_op.fpgadataflow", + backend="fpgadataflow", + MW=32, # matrix_width (num_inputs) + MH=64, # matrix_height (num_outputs) + SIMD=1, + PE=1, + inputDataType=idt.name, + outputDataType=idt.name, + weightDataType=wdt.name, + ActVal=tdt.min() if tdt is not None else 0, + noActivation=0 if tdt is not None else 1, + binaryXnorMode=0, + ) + graph = 
helper.make_graph( + nodes=[layer1, layer2], name="fclayer_graph", inputs=[inp], outputs=[outp] + ) + + model = helper.make_model(graph, producer_name="fclayer-model") + model = ModelWrapper(model) + + model.set_tensor_datatype("inp", idt) + model.set_tensor_datatype("outp", idt) + model.set_tensor_datatype("hid", idt) + model.set_tensor_datatype("params0", wdt) + model.set_tensor_datatype("params1", wdt) + model.set_initializer("params0", gen_finn_dt_tensor(wdt, (32, 1, 3, 3))) + model.set_initializer("params1", gen_finn_dt_tensor(wdt, (32, 64))) + # if the threshold data type is specified, then we need to generate + # some dummy threshold values + if tdt is not None: + model.set_tensor_datatype("thresh0", tdt) + model.set_tensor_datatype("thresh1", tdt) + # Create threshold tensors + n_steps: int = idt.get_num_possible_values() - 1 + thresholds: Optional[np.ndarray] = np.random.randint( + tdt.min(), tdt.max() - 1, (32, n_steps) + ).astype( + np.float32 + ) # generate thresholds for the activations + thresholds = np.sort(thresholds, axis=1) # provide non-decreasing thresholds + model.set_initializer("thresh0", thresholds) + thresholds: Optional[np.ndarray] = np.random.randint( + tdt.min(), tdt.max() - 1, (64, n_steps) + ).astype( + np.float32 + ) # generate thresholds for the activations + thresholds = np.sort(thresholds, axis=1) # provide non-decreasing thresholds + model.set_initializer("thresh1", thresholds) + return model + + +weight_data_types = [ + DataType["INT8"], + DataType["UINT8"], + DataType["INT7"], + DataType["UINT7"], + DataType["INT3"], + DataType["UINT3"], + # DataType["BIPOLAR"], # TODO - add support for bipolar weights + DataType["TERNARY"], +] + + +input_data_types = [ + DataType["INT8"], + DataType["UINT8"], + DataType["INT3"], + DataType["UINT3"], + DataType["BIPOLAR"], + DataType["TERNARY"], +] + + +@pytest.mark.parametrize("wdt", weight_data_types) +@pytest.mark.parametrize("rww", [True, False]) +@pytest.mark.fpgadataflow +def 
test_minimize_weight_bit_width(wdt: DataType, rww: bool): + """Testing MinimizeWeightBitWidth for VVAU and MVAU. + + :param wdt: (DataType) The data type that we are testing for the weights + :param rww: (bool) Whether or not to use runtime-writeable weights""" + if isinstance(wdt, BipolarType): + # current MinimizeWeightBitWidth sets {-1,1} to INT2, need to check + # for 0 in weights to minimize weight bit width to bipolar + pytest.skip("Not well-supported for this optimization") + + # Create a w8a8 model + def_wdt = DataType["UINT8"] + model = make_unit_test_model(def_wdt, DataType["INT8"]) + + # Create new weights for the model based on wdt + params0 = gen_finn_dt_tensor(wdt, (32, 1, 3, 3)) + params1 = gen_finn_dt_tensor(wdt, (32, 64)) + model.set_initializer("params0", params0) + model.set_initializer("params1", params1) + + # If runtime-writeable weights, specify as a node attribute + for node in model.graph.node: + inst = getCustomOp(node) + if isinstance(inst, (MVAU, VVAU)): + inst.set_nodeattr("runtime_writeable_weights", int(rww)) + + # Apply the optimization + model = model.transform(MinimizeWeightBitWidth()) + + # Iterate through each node to make sure it functioned properly + for node in model.graph.node: + inst = getCustomOp(node) + if isinstance(inst, (MVAU, VVAU)): + cur_wdt = DataType[inst.get_nodeattr("weightDataType")] + exp_wdt = def_wdt if rww else wdt + assert cur_wdt.bitwidth() == exp_wdt.bitwidth(), "Mismatched data types" + + +def calculate_accumulator_bit_width( + inst: Union[MVAU, VVAU], model: ModelWrapper +) -> Union[DataType, IntType]: + """Calculate the accumulator bit width using the closed-form expressions + derived in `Quantized Neural Networks for Low-Precision Accumulation + with Guaranteed Overflow Avoidance` (2023) by I.Colbert, A. Pappalardo, + J. 
Petri-Koenig + + :param inst: (HLSCustomOp) The instance of the MVAU or VVAU + :param model: (ModelWrapper) The instance of the whole model + """ + + def phi(x: float) -> float: + return np.log2(1 + pow(2, -x)) + + weights = model.get_initializer(inst.onnx_node.input[1]) + # since in the calculation the values of the weight matrix are used, + # for the bipolar case they need to be converted to bipolar + if inst.get_nodeattr("binaryXnorMode"): + weights = 2 * weights - 1 + # modify the weights based on if the node is a VVAU or MVAU + if isinstance(inst, MVAU): + K = inst.get_nodeattr("MW") # matrix_width = num_inputs + elif isinstance(inst, VVAU): + k_h, k_w = inst.get_nodeattr("Kernel") + K = k_h * k_w # size of kernels = num_inputs + fm = inst.get_nodeattr("Channels") + # put weights into the shape expected by calculate_matvec_accumulator_range + weights = weights.reshape(fm, k_h * k_w).transpose() + else: + raise Exception("Considering only MVAU and VVAU currently") + # collect attributes used to determine the accumulator bit width bound + wdt = inst.get_weight_datatype() + idt = inst.get_input_datatype() + rww = inst.get_nodeattr("runtime_writeable_weights") + # if runtime-writeable weights, then use the lower bound on the accumulator bit + # width as determined by the input and weight data types and size of dot product + if rww: + alpha = np.log2(K) + idt.bitwidth() + wdt.bitwidth() - 1.0 - float(idt.signed()) + P = np.ceil(alpha + phi(alpha) + 1.0) + # if not runtime-writable weights, then use the tighter bound on the accumulator + # bit width as determined by the weight values themselves + else: + beta = np.log2(abs(weights).sum(axis=0).max()) + idt.bitwidth() - float(idt.signed()) + P = np.ceil(beta + phi(beta) + 1.0) + # if the node is the last in the graph, then round up to the nearest 8 bits + if model.find_direct_successors(inst.onnx_node) is None: + P = roundup_to_integer_multiple(P, 8) + return DataType[f"INT{int(P)}"] + + +thresh_data_types = [ + 
 None, + DataType["INT32"], + DataType["INT24"], + DataType["INT16"], +] + +# Removing unsigned data types from weights +weight_data_types = [ + DataType["INT8"], + DataType["INT7"], + DataType["INT3"], + # DataType["BIPOLAR"], # TODO - add support for bipolar weights + DataType["TERNARY"], +] + + +@pytest.mark.parametrize("wdt", weight_data_types) +@pytest.mark.parametrize("idt", input_data_types) +@pytest.mark.parametrize("tdt", thresh_data_types) +@pytest.mark.parametrize("rww", [True, False]) +@pytest.mark.fpgadataflow +def test_minimize_accumulator_width(wdt: DataType, idt: DataType, tdt: DataType, rww: bool): + """Testing MinimizeAccumulatorWidth for VVAU and MVAU. + + :param wdt: (DataType) The data type that we are testing for the weights + :param idt: (DataType) The data type that we are testing for the activations + :param tdt: (DataType) The data type that we are testing for the thresholds + :param rww: (bool) Whether or not to use runtime-writeable weights""" + if (not wdt.signed()) or isinstance(wdt, BipolarType): + pytest.skip("Closed-form accumulator calculation is designed to consider signed weights") + + # Create uniform-precision model + model = make_unit_test_model(wdt, idt, tdt) + def_adt = DataType["INT32"] + + # If runtime-writeable weights, specify as a node attribute + for node in model.graph.node: + inst = getCustomOp(node) + if isinstance(inst, (MVAU, VVAU)): + inst.set_nodeattr("runtime_writeable_weights", int(rww)) + cur_adt = DataType[inst.get_nodeattr("accDataType")] + assert cur_adt.bitwidth() == def_adt.bitwidth(), "Default data type is incorrect" + + # Apply the optimization + model = model.transform(MinimizeAccumulatorWidth()) + + # Iterate through each node to make sure it functioned properly + for node in model.graph.node: + inst = getCustomOp(node) + if isinstance(inst, (MVAU, VVAU)): + cur_adt = DataType[inst.get_nodeattr("accDataType")] + cur_odt = DataType[inst.get_nodeattr("outputDataType")] + # Calculating expected 
accumulator bit width using a closed-form expression + # that is a slight over-approximation of the lower bound. The accumulator + # bit width minimization logic in the MVAU and VVAU is exact and should be + # less than or equal to this calculation + exp_adt = calculate_accumulator_bit_width(inst, model) + assert cur_adt.bitwidth() <= exp_adt.bitwidth(), "Mismatched accumulation data types" + + # if there is no activation, outputDataType = accDataType and if it is the last node + # it needs to be divisible by 8 + if inst.get_nodeattr("noActivation"): + assert ( + cur_adt.bitwidth() == cur_odt.bitwidth() + ), "outputDataType and accDataType should be equal" + if model.find_direct_successors(inst.onnx_node) is None: + assert ( + cur_adt.bitwidth() % 8 + ) == 0, "bit width of last node needs to be divisible by 8" diff --git a/tests/fpgadataflow/test_runtime_weights.py b/tests/fpgadataflow/test_runtime_weights.py index 16fed5c3cb..3e7822a077 100644 --- a/tests/fpgadataflow/test_runtime_weights.py +++ b/tests/fpgadataflow/test_runtime_weights.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (C) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -41,6 +42,7 @@ from finn.transformation.fpgadataflow.hlssynth_ip import HLSSynthIP from finn.transformation.fpgadataflow.insert_fifo import InsertFIFO from finn.transformation.fpgadataflow.prepare_ip import PrepareIP +from finn.transformation.fpgadataflow.specialize_layers import SpecializeLayers from finn.util.create import hls_random_mlp_maker test_fpga_part = "xczu3eg-sbva484-1-e" @@ -68,9 +70,10 @@ def test_runtime_weights_single_layer(): } layer_spec_list = [layer_spec] model = hls_random_mlp_maker(layer_spec_list) - fcl = model.get_nodes_by_op_type("MatrixVectorActivation")[0] + model = model.transform(SpecializeLayers()) + fcl = model.get_nodes_by_op_type("MVAU_hls")[0] op_inst = getCustomOp(fcl) - op_inst.set_nodeattr("mem_mode", "decoupled") + op_inst.set_nodeattr("mem_mode", "internal_decoupled") op_inst.set_nodeattr("runtime_writeable_weights", 1) old_weights = model.get_initializer(fcl.input[1]) op_inst.make_weight_file(old_weights, "decoupled_runtime", "old_weights.dat") @@ -80,6 +83,7 @@ def test_runtime_weights_single_layer(): old_weight_stream = map(lambda x: int(x, 16), old_weight_stream.split("\n")) old_weight_stream = list(old_weight_stream) model = model.transform(InsertFIFO(True)) + model = model.transform(SpecializeLayers()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(PrepareIP(test_fpga_part, target_clk_ns)) model = model.transform(HLSSynthIP()) @@ -96,9 +100,7 @@ def test_runtime_weights_single_layer(): def read_weights(sim): addr = 0 for i in range(len(old_weight_stream)): - extracted_weight_stream.append( - axilite_read(sim, addr, basename="s_axilite_0_") - ) + extracted_weight_stream.append(axilite_read(sim, addr, basename="s_axilite_0_")) addr += 4 rtlsim_exec(model, exec_ctx, pre_hook=read_weights) diff --git a/tests/fpgadataflow/test_set_folding.py b/tests/fpgadataflow/test_set_folding.py index 5355dd7044..19e459c222 100644 --- 
a/tests/fpgadataflow/test_set_folding.py +++ b/tests/fpgadataflow/test_set_folding.py @@ -45,7 +45,6 @@ def make_multi_fclayer_model(ch, wdt, adt, tdt, nnodes): - W = np.random.randint(wdt.min(), wdt.max() + 1, size=(ch, ch)) W = W.astype(np.float32) @@ -55,9 +54,7 @@ def make_multi_fclayer_model(ch, wdt, adt, tdt, nnodes): tensors = [] tensors.append(helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ch])) for i in range(1, nnodes): - inter = helper.make_tensor_value_info( - "inter_" + str(i), TensorProto.FLOAT, [1, ch] - ) + inter = helper.make_tensor_value_info("inter_" + str(i), TensorProto.FLOAT, [1, ch]) tensors.append(inter) tensors.append(helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, ch])) @@ -67,10 +64,10 @@ def make_multi_fclayer_model(ch, wdt, adt, tdt, nnodes): simd = 1 FCLayer_nodes += [ helper.make_node( - "MatrixVectorActivation", + "MVAU_hls", [tensors[i].name, "weights_" + str(i), "thresh_" + str(i)], [tensors[i + 1].name], - domain="finn.custom_op.fpgadataflow", + domain="finn.custom_op.fpgadataflow.hls", backend="fpgadataflow", MW=ch, MH=ch, @@ -115,10 +112,7 @@ def make_multi_fclayer_model(ch, wdt, adt, tdt, nnodes): @pytest.mark.parametrize("platform", ["Pynq-Z1", "Ultra96", "U200"]) @pytest.mark.fpgadataflow def test_set_folding(target_fps, platform): - - model = make_multi_fclayer_model( - 128, DataType["INT4"], DataType["INT2"], DataType["INT16"], 5 - ) + model = make_multi_fclayer_model(128, DataType["INT4"], DataType["INT2"], DataType["INT16"], 5) model = model.transform(GiveUniqueNodeNames()) parent_model = model.transform(CreateDataflowPartition()) diff --git a/tests/fpgadataflow/test_split_large_fifos.py b/tests/fpgadataflow/test_split_large_fifos.py index 85b4a2bfa8..d192755d06 100644 --- a/tests/fpgadataflow/test_split_large_fifos.py +++ b/tests/fpgadataflow/test_split_large_fifos.py @@ -1,4 +1,4 @@ -# Copyright (C) 2022, Advanced Micro Devices, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. 
# All rights reserved. # # Redistribution and use in source and binary forms, with or without @@ -31,7 +31,8 @@ import json import shutil -from brevitas.export.onnx.generic.manager import BrevitasONNXManager +import torch +from brevitas.export import export_qonnx from qonnx.core.modelwrapper import ModelWrapper from qonnx.custom_op.registry import getCustomOp @@ -46,15 +47,15 @@ def fetch_test_model(topology, wbits=2, abits=2): tmp_output_dir = make_build_dir("build_fifosizing_%s_" % topology) (model, ishape) = get_trained_network_and_ishape(topology, wbits, abits) chkpt_name = tmp_output_dir + "/model.onnx" - BrevitasONNXManager.export(model, ishape, chkpt_name) + export_qonnx(model, torch.randn(ishape), chkpt_name) return tmp_output_dir def get_folding_cfg(depth=65536): cfg = dict() cfg["Defaults"] = dict() - for i in range(3): - key = "StreamingFIFO_" + str(i) + for i in range(4): + key = "StreamingFIFO_rtl_" + str(i) cfg[key] = {"depth": depth, "ram_style": "auto", "impl_style": "vivado"} return cfg @@ -85,7 +86,6 @@ def test_split_large_fifos(depth, force_python_rtlsim): build_cfg.DataflowOutputType.STITCHED_IP, build_cfg.DataflowOutputType.RTLSIM_PERFORMANCE, ], - default_mem_mode=build_cfg.ComputeEngineMemMode.DECOUPLED, ) build.build_dataflow_cfg(tmp_output_dir + "/model.onnx", cfg) with open(tmp_output_dir + "/report/estimate_network_performance.json") as f: @@ -93,15 +93,11 @@ def test_split_large_fifos(depth, force_python_rtlsim): with open(tmp_output_dir + "/report/rtlsim_performance.json") as f: sim_data = json.load(f) assert ( - float(sim_data["throughput[images/s]"]) - / float(est_data["estimated_throughput_fps"]) - > 0.9 - ) - model = ModelWrapper( - tmp_output_dir + "/intermediate_models/step_set_fifo_depths.onnx" + float(sim_data["throughput[images/s]"]) / float(est_data["estimated_throughput_fps"]) > 0.9 ) + model = ModelWrapper(tmp_output_dir + "/intermediate_models/step_set_fifo_depths.onnx") # exclude final FIFO node (output FIFO, not part of 
test) - fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO")[:-1] + fifo_nodes = model.get_nodes_by_op_type("StreamingFIFO_rtl")[:-1] golden_cfg = get_fifo_split_configs(depth, 256, 32768) for i, fifo_node in enumerate(fifo_nodes): inst = getCustomOp(fifo_node) diff --git a/tests/notebooks/test_jupyter_notebooks.py b/tests/notebooks/test_jupyter_notebooks.py new file mode 100644 index 0000000000..e1415b9066 --- /dev/null +++ b/tests/notebooks/test_jupyter_notebooks.py @@ -0,0 +1,51 @@ +import pytest + +import nbformat +from nbconvert.preprocessors import ExecutePreprocessor + +from finn.util.basic import get_finn_root + +notebook_timeout_seconds = 3600 +notebook_basic_dir = get_finn_root() + "/notebooks/basics/" +notebook_advanced_dir = get_finn_root() + "/notebooks/advanced/" +notebook_cyber_dir = get_finn_root() + "/notebooks/end2end_example/cybersecurity/" +notebook_bnn_dir = get_finn_root() + "/notebooks/end2end_example/bnn-pynq/" + +basics_notebooks = [ + pytest.param(notebook_basic_dir + "0_how_to_work_with_onnx.ipynb"), + pytest.param(notebook_basic_dir + "1_brevitas_network_import_via_QONNX.ipynb"), +] + +advanced_notebooks = [ + pytest.param(notebook_advanced_dir + "0_custom_analysis_pass.ipynb"), + pytest.param(notebook_advanced_dir + "1_custom_transformation_pass.ipynb"), + pytest.param(notebook_advanced_dir + "2_custom_op.ipynb"), + pytest.param(notebook_advanced_dir + "3_folding.ipynb"), + pytest.param(notebook_advanced_dir + "4_advanced_builder_settings.ipynb"), +] + +cyber_notebooks = [ + pytest.param(notebook_cyber_dir + "1-train-mlp-with-brevitas.ipynb"), + pytest.param(notebook_cyber_dir + "2-import-into-finn-and-verify.ipynb"), + pytest.param(notebook_cyber_dir + "3-build-accelerator-with-finn.ipynb"), +] + +bnn_notebooks = [ + pytest.param(notebook_bnn_dir + "cnv_end2end_example.ipynb"), + pytest.param(notebook_bnn_dir + "tfc_end2end_example.ipynb"), + pytest.param(notebook_bnn_dir + "tfc_end2end_verification.ipynb"), +] + + 
+@pytest.mark.notebooks +@pytest.mark.parametrize( + "notebook", basics_notebooks + advanced_notebooks + cyber_notebooks + bnn_notebooks +) +def test_notebook_exec(notebook): + with open(notebook) as f: + nb = nbformat.read(f, as_version=4) + ep = ExecutePreprocessor(timeout=notebook_timeout_seconds, kernel_name="python3") + try: + assert ep.preprocess(nb) is not None, f"Got empty notebook for {notebook}" + except Exception: + assert False, f"Failed executing {notebook}" diff --git a/tests/transformation/streamline/test_absorb_mul_into_topk.py b/tests/transformation/streamline/test_absorb_mul_into_topk.py index 89ef74e0b3..1ca8fb06e9 100644 --- a/tests/transformation/streamline/test_absorb_mul_into_topk.py +++ b/tests/transformation/streamline/test_absorb_mul_into_topk.py @@ -71,18 +71,12 @@ def test_absorb_mul_into_topk(mul_positive, scalar): # initialize values # for mul if mul_positive is True: - a0_values = np.random.uniform(low=0.1, high=1, size=tuple(shape)).astype( - np.float32 - ) + a0_values = np.random.uniform(low=0.1, high=1, size=tuple(shape)).astype(np.float32) else: - a0_values = np.random.uniform(low=-1, high=-0.1, size=tuple(shape)).astype( - np.float32 - ) + a0_values = np.random.uniform(low=-1, high=-0.1, size=tuple(shape)).astype(np.float32) model.set_initializer("a0", a0_values) # for add - c0_values = np.random.uniform(low=-1, high=-0.1, size=tuple(shape)).astype( - np.float32 - ) + c0_values = np.random.uniform(low=-1, high=-0.1, size=tuple(shape)).astype(np.float32) model.set_initializer("c0", c0_values) model = model.transform(InsertTopK()) model = model.transform(InferShapes()) @@ -92,9 +86,7 @@ def test_absorb_mul_into_topk(mul_positive, scalar): model_transformed = model.transform(AbsorbScalarMulAddIntoTopK()) # compare execution results - inp_values = np.random.uniform(low=-10, high=10, size=(1, 1, 1, 1000)).astype( - np.float32 - ) + inp_values = np.random.uniform(low=-10, high=10, size=(1, 1, 1, 1000)).astype(np.float32) idict = 
{"global_in": inp_values} odict = oxe.execute_onnx(model, idict, True) y_indices = odict["global_out"] diff --git a/tests/transformation/streamline/test_absorb_transp_into_flatten.py b/tests/transformation/streamline/test_absorb_transp_into_flatten.py index 44b0c1d7e0..5b278bd552 100644 --- a/tests/transformation/streamline/test_absorb_transp_into_flatten.py +++ b/tests/transformation/streamline/test_absorb_transp_into_flatten.py @@ -65,9 +65,7 @@ def test_absorb_transp_into_flatten(perm, shape, ishape, data_layout): # model_transformed.save("test2.onnx") # verify transformation - inp_values = np.random.uniform(low=-1, high=1, size=tuple(ishape)).astype( - np.float32 - ) + inp_values = np.random.uniform(low=-1, high=1, size=tuple(ishape)).astype(np.float32) idict = {model.graph.input[0].name: inp_values} assert oxe.compare_execution(model, model_transformed, idict) diff --git a/tests/transformation/streamline/test_linear_past_eltwise.py b/tests/transformation/streamline/test_linear_past_eltwise.py index 4e5dcd6386..70fc395652 100644 --- a/tests/transformation/streamline/test_linear_past_eltwise.py +++ b/tests/transformation/streamline/test_linear_past_eltwise.py @@ -63,15 +63,9 @@ def make_model(shape): add1_node = helper.make_node("Add", [inp1.name, inp1_add_ct.name], [inp1_add.name]) add2_node = helper.make_node("Add", [inp2.name, inp2_add_ct.name], [inp2_add.name]) - mul1_node = helper.make_node( - "Mul", [inp1_add.name, inp1_mul_ct.name], [inp1_mul.name] - ) - mul2_node = helper.make_node( - "Mul", [inp2_add.name, inp2_mul_ct.name], [inp2_mul.name] - ) - eltwise_add_node = helper.make_node( - "Add", [inp1_mul.name, inp2_mul.name], [outp.name] - ) + mul1_node = helper.make_node("Mul", [inp1_add.name, inp1_mul_ct.name], [inp1_mul.name]) + mul2_node = helper.make_node("Mul", [inp2_add.name, inp2_mul_ct.name], [inp2_mul.name]) + eltwise_add_node = helper.make_node("Add", [inp1_mul.name, inp2_mul.name], [outp.name]) graph = helper.make_graph( nodes=[add1_node, 
add2_node, mul1_node, mul2_node, eltwise_add_node], name="graph", @@ -153,9 +147,7 @@ def test_linear_past_eltwise_add_multiple_forks(ch, ifmdim): num_of_params = 6 value_info = [] for i in range(num_of_params): - value_info += [ - helper.make_tensor_value_info("p" + str(i), TensorProto.FLOAT, input_shape) - ] + value_info += [helper.make_tensor_value_info("p" + str(i), TensorProto.FLOAT, input_shape)] modelproto = qonnx_make_model( helper.make_graph( @@ -180,9 +172,7 @@ def test_linear_past_eltwise_add_multiple_forks(ch, ifmdim): np.random.seed(0) for i in range(num_of_params): - model.set_initializer( - "p" + str(i), np.random.rand(*input_shape).astype(np.float32) - ) + model.set_initializer("p" + str(i), np.random.rand(*input_shape).astype(np.float32)) # need equal mults: model.set_initializer("p2", model.get_initializer("p1")) diff --git a/tests/transformation/streamline/test_maxpool_nhwc.py b/tests/transformation/streamline/test_maxpool_nhwc.py index d61eedaaf5..77dbf3a971 100644 --- a/tests/transformation/streamline/test_maxpool_nhwc.py +++ b/tests/transformation/streamline/test_maxpool_nhwc.py @@ -14,21 +14,13 @@ def create_maxpool(ifm_dim, ifm_ch, kernel_shape, pads, strides, ceil_mode, idt): - ofm_dim_h = compute_pool_output_dim( - ifm_dim[0], kernel_shape[0], strides[0], pads[0], ceil_mode - ) - ofm_dim_w = compute_pool_output_dim( - ifm_dim[1], kernel_shape[1], strides[1], pads[1], ceil_mode - ) - inp = oh.make_tensor_value_info( - "inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim[0], ifm_dim[1]] - ) + ofm_dim_h = compute_pool_output_dim(ifm_dim[0], kernel_shape[0], strides[0], pads[0], ceil_mode) + ofm_dim_w = compute_pool_output_dim(ifm_dim[1], kernel_shape[1], strides[1], pads[1], ceil_mode) + inp = oh.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim[0], ifm_dim[1]]) outp_mp = oh.make_tensor_value_info( "outp_mp", TensorProto.FLOAT, [1, ifm_ch, ofm_dim_h, ofm_dim_w] ) - outp = oh.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, 
ofm_dim_h, ofm_dim_w, ifm_ch] - ) + outp = oh.make_tensor_value_info("outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, ifm_ch]) maxpool_node = oh.make_node( "MaxPool", @@ -83,9 +75,7 @@ def create_maxpool(ifm_dim, ifm_ch, kernel_shape, pads, strides, ceil_mode, idt) @pytest.mark.parametrize("idt", [DataType["INT4"]]) def test_maxpool_nhwc(ifm_dim, ifm_ch, kernel_shape, pads, strides, ceil_mode, idt): # create MaxPool node - maxpool_model = create_maxpool( - ifm_dim, ifm_ch, kernel_shape, pads, strides, ceil_mode, idt - ) + maxpool_model = create_maxpool(ifm_dim, ifm_ch, kernel_shape, pads, strides, ceil_mode, idt) # generate input tensor for testing input_tensor = gen_finn_dt_tensor(idt, [1, ifm_ch, ifm_dim[0], ifm_dim[1]]) @@ -100,9 +90,7 @@ def test_maxpool_nhwc(ifm_dim, ifm_ch, kernel_shape, pads, strides, ceil_mode, i # execute transformed model output_node_name = maxpool_model.graph.output[0].name - output_dict = oxe.execute_onnx( - maxpool_model, input_dict, return_full_exec_context=False - ) + output_dict = oxe.execute_onnx(maxpool_model, input_dict, return_full_exec_context=False) output = output_dict[output_node_name] # compare outputs diff --git a/tests/transformation/streamline/test_move_chw_add_past_conv.py b/tests/transformation/streamline/test_move_chw_add_past_conv.py index e1b324a798..8b2f10b658 100644 --- a/tests/transformation/streamline/test_move_chw_add_past_conv.py +++ b/tests/transformation/streamline/test_move_chw_add_past_conv.py @@ -85,13 +85,9 @@ def test_move_chw_add_past_conv(idim, k, s, ich, och): model = ModelWrapper(model) # initialize model - a0_values = np.random.uniform(low=0, high=1, size=tuple(add_param_shape)).astype( - np.float32 - ) + a0_values = np.random.uniform(low=0, high=1, size=tuple(add_param_shape)).astype(np.float32) model.set_initializer("a0", a0_values) - a1_values = np.random.uniform(low=0, high=1, size=tuple(conv_param_shape)).astype( - np.float32 - ) + a1_values = np.random.uniform(low=0, high=1, 
size=tuple(conv_param_shape)).astype(np.float32) model.set_initializer("a1", a1_values) model = model.transform(InferShapes()) diff --git a/tests/transformation/streamline/test_move_identical_op_past_join_op.py b/tests/transformation/streamline/test_move_identical_op_past_join_op.py index 7be9763162..dd83681fc2 100644 --- a/tests/transformation/streamline/test_move_identical_op_past_join_op.py +++ b/tests/transformation/streamline/test_move_identical_op_past_join_op.py @@ -56,18 +56,10 @@ def create_model(perm): "Add", inputs=["out_transpose1", "out_transpose2"], outputs=["out_join1"] ) - in_transpose1 = oh.make_tensor_value_info( - "in_transpose1", TensorProto.FLOAT, in_shape - ) - in_transpose2 = oh.make_tensor_value_info( - "in_transpose2", TensorProto.FLOAT, in_shape - ) - out_transpose1 = oh.make_tensor_value_info( - "out_transpose1", TensorProto.FLOAT, out_shape - ) - out_transpose2 = oh.make_tensor_value_info( - "out_transpose2", TensorProto.FLOAT, out_shape - ) + in_transpose1 = oh.make_tensor_value_info("in_transpose1", TensorProto.FLOAT, in_shape) + in_transpose2 = oh.make_tensor_value_info("in_transpose2", TensorProto.FLOAT, in_shape) + out_transpose1 = oh.make_tensor_value_info("out_transpose1", TensorProto.FLOAT, out_shape) + out_transpose2 = oh.make_tensor_value_info("out_transpose2", TensorProto.FLOAT, out_shape) out_join1 = oh.make_tensor_value_info("out_join1", TensorProto.FLOAT, out_shape) graph = oh.make_graph( diff --git a/tests/transformation/streamline/test_move_maxpool_past_multithreshold.py b/tests/transformation/streamline/test_move_maxpool_past_multithreshold.py index 6126acd9e3..2dee153545 100644 --- a/tests/transformation/streamline/test_move_maxpool_past_multithreshold.py +++ b/tests/transformation/streamline/test_move_maxpool_past_multithreshold.py @@ -67,14 +67,10 @@ def test_move_maxpool_past_multithreshold(): value_info = [] thres1_shape = [1, 1] - value_info += [ - helper.make_tensor_value_info("thres1", TensorProto.FLOAT, 
thres1_shape) - ] + value_info += [helper.make_tensor_value_info("thres1", TensorProto.FLOAT, thres1_shape)] thres2_shape = [ch, 14] - value_info += [ - helper.make_tensor_value_info("thres2", TensorProto.FLOAT, thres2_shape) - ] + value_info += [helper.make_tensor_value_info("thres2", TensorProto.FLOAT, thres2_shape)] nodes = [] nodes += [helper.make_node("MaxPool", ["top_in"], ["t1"], **maxpool_config)] @@ -114,9 +110,7 @@ def test_move_maxpool_past_multithreshold(): model = model.transform(InferDataTypes()) model.set_initializer("thres1", np.array([[0]], dtype=np.float32)) - model.set_initializer( - "thres2", get_multithreshold_rand_params(*thres2_shape, seed=0) - ) + model.set_initializer("thres2", get_multithreshold_rand_params(*thres2_shape, seed=0)) # Transform new_model = model.transform(MoveMaxPoolPastMultiThreshold()) diff --git a/tests/transformation/streamline/test_move_mul_past_dw_conv.py b/tests/transformation/streamline/test_move_mul_past_dw_conv.py index 72a6650ec4..303b97c69f 100644 --- a/tests/transformation/streamline/test_move_mul_past_dw_conv.py +++ b/tests/transformation/streamline/test_move_mul_past_dw_conv.py @@ -65,14 +65,10 @@ def test_move_mul_past_dw_conv(ifm_dim, ifm_ch, k, stride, pad_amt, dw): ofm_dim = compute_conv_output_dim(ifm_dim, k, stride, total_pad) # set up onnx model - inp = helper.make_tensor_value_info( - "inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim, ifm_dim] - ) + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim, ifm_dim]) mul = helper.make_tensor_value_info("mul", TensorProto.FLOAT, [1, ifm_ch, 1, 1]) W = helper.make_tensor_value_info("W", TensorProto.FLOAT, W_shape) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ofm_ch, ofm_dim, ofm_dim] - ) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, ofm_ch, ofm_dim, ofm_dim]) Mul_node = helper.make_node("Mul", ["inp", "mul"], ["mul_out"]) diff --git 
a/tests/transformation/streamline/test_move_mul_past_maxpool.py b/tests/transformation/streamline/test_move_mul_past_maxpool.py index 3bae2905a0..61dddd56e9 100755 --- a/tests/transformation/streamline/test_move_mul_past_maxpool.py +++ b/tests/transformation/streamline/test_move_mul_past_maxpool.py @@ -65,13 +65,9 @@ def test_move_mul_past_maxpool(ifm_dim, ifm_ch, k, stride, pad, cw, negative): ofm_dim = compute_pool_output_dim(ifm_dim, k, stride, pad) # set up onnx model - inp = helper.make_tensor_value_info( - "inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim, ifm_dim] - ) + inp = helper.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim, ifm_dim]) mul = helper.make_tensor_value_info("mul", TensorProto.FLOAT, mul_shape) - outp = helper.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ofm_ch, ofm_dim, ofm_dim] - ) + outp = helper.make_tensor_value_info("outp", TensorProto.FLOAT, [1, ofm_ch, ofm_dim, ofm_dim]) Mul_node = helper.make_node("Mul", ["inp", "mul"], ["mul_out"]) diff --git a/tests/transformation/streamline/test_move_past_fork.py b/tests/transformation/streamline/test_move_past_fork.py index 7e77d7f9b3..e9433178c8 100644 --- a/tests/transformation/streamline/test_move_past_fork.py +++ b/tests/transformation/streamline/test_move_past_fork.py @@ -64,9 +64,7 @@ def test_move_past_fork_transpose(): new_model = model.transform(MoveTransposePastFork()) new_model = new_model.transform(GiveUniqueNodeNames()) nodes = new_model.graph.node - assert oxe.compare_execution( - model, new_model, {"in0": np.random.rand(*shp).astype(np.float32)} - ) + assert oxe.compare_execution(model, new_model, {"in0": np.random.rand(*shp).astype(np.float32)}) assert len(nodes) == 5 assert not new_model.is_fork_node(get_by_name(nodes, "Transpose_0")) @@ -120,9 +118,7 @@ def test_move_past_fork_linear(ch, ifmdim): for tensor_name in model.get_all_tensor_names(): if tensor_name.endswith("_param"): pshape = model.get_tensor_shape(tensor_name) - model.set_initializer( 
- tensor_name, np.random.rand(*pshape).astype(np.float32) - ) + model.set_initializer(tensor_name, np.random.rand(*pshape).astype(np.float32)) model = model.transform(GiveUniqueNodeNames()) # Transform new_model = model.transform(MoveLinearPastFork()) diff --git a/tests/transformation/streamline/test_move_scalar_past_convtranspose.py b/tests/transformation/streamline/test_move_scalar_past_convtranspose.py new file mode 100644 index 0000000000..7da22abd87 --- /dev/null +++ b/tests/transformation/streamline/test_move_scalar_past_convtranspose.py @@ -0,0 +1,106 @@ +# Copyright (C) 2023, Advanced Micro Devices, Inc. +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of FINN nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +import pytest + +import numpy as np +import onnx.helper as oh +from onnx import TensorProto +from qonnx.core.modelwrapper import ModelWrapper +from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.basic import qonnx_make_model + +import finn.core.onnx_exec as ox +from finn.transformation.streamline.reorder import MoveScalarMulPastConvTranspose + + +@pytest.mark.streamline +# input image dimension +@pytest.mark.parametrize("idim", [[8, 8], [10, 8]]) +# number of rows and number of cols to add +@pytest.mark.parametrize("stride", [[2, 2], [2, 3]]) +# number of channels +@pytest.mark.parametrize("ifm_ch", [2, 4]) +# number of channels +@pytest.mark.parametrize("ofm_ch", [2, 4]) +# kernel size +@pytest.mark.parametrize("k", [2, 4]) +# padding +@pytest.mark.parametrize("padding", [False, True]) +def test_move_scalar_past_conv(idim, stride, ifm_ch, ofm_ch, k, padding): + idim_h, idim_w = idim + stride_h, stride_w = stride + + odim_h = (idim_h - 1) * stride_h - 2 * padding + (k - 1) + 1 + odim_w = (idim_w - 1) * stride_w - 2 * padding + (k - 1) + 1 + + input_shape = [1, ifm_ch, idim_h, idim_w] + output_shape = [1, ofm_ch, odim_h, odim_w] + + conv_param_shape = [ifm_ch, ofm_ch, k, k] + + conv_config = {} + conv_config["dilations"] = [1, 1] + conv_config["group"] = 1 + conv_config["kernel_shape"] = [k, k] + if padding: + conv_config["pads"] = [1, 1, 1, 1] + else: + conv_config["pads"] = [0, 0, 0, 0] + conv_config["strides"] = 
[stride_h, stride_w] + + top_in = oh.make_tensor_value_info("top_in", TensorProto.FLOAT, input_shape) + top_out = oh.make_tensor_value_info("top_out", TensorProto.FLOAT, output_shape) + + value_info = [oh.make_tensor_value_info("p1", TensorProto.FLOAT, [1])] + value_info += [oh.make_tensor_value_info("p2", TensorProto.FLOAT, conv_param_shape)] + + modelproto = qonnx_make_model( + oh.make_graph( + name="test", + inputs=[top_in], + outputs=[top_out], + value_info=value_info, + nodes=[ + oh.make_node("Mul", ["top_in", "p1"], ["t1"]), + oh.make_node("ConvTranspose", ["t1", "p2"], ["top_out"], **conv_config), + ], + ) + ) + model = ModelWrapper(modelproto) + model = model.transform(InferShapes()) + + np.random.seed(0) + model.set_initializer("p1", *np.random.rand(1).astype(np.float32)) + model.set_initializer("p2", np.random.rand(*conv_param_shape).astype(np.float32)) + + new_model = model.transform(MoveScalarMulPastConvTranspose()) + inp_dict = {"top_in": np.random.rand(*input_shape).astype(np.float32)} + + assert ox.compare_execution(model, new_model, inp_dict) + assert new_model.graph.node[0].op_type == "ConvTranspose" + assert new_model.graph.node[1].op_type == "Mul" diff --git a/tests/transformation/streamline/test_move_scalar_past_matmul.py b/tests/transformation/streamline/test_move_scalar_past_matmul.py index 6c788294bc..e4f4357fff 100644 --- a/tests/transformation/streamline/test_move_scalar_past_matmul.py +++ b/tests/transformation/streamline/test_move_scalar_past_matmul.py @@ -63,9 +63,7 @@ def test_move_scalar_mul_past_matmul(): model = ModelWrapper(modelproto) model = model.transform(InferShapes()) model.set_initializer("mul_param", np.asarray([[3]], dtype=np.float32)) - model.set_initializer( - "matmul_param", np.asarray([[2, 4], [-1, 1]], dtype=np.float32) - ) + model.set_initializer("matmul_param", np.asarray([[2, 4], [-1, 1]], dtype=np.float32)) new_model = model.transform(MoveScalarMulPastMatMul()) inp_dict = {"top_in": np.asarray([[-1.0, 1.0]], 
dtype=np.float32)} assert ox.compare_execution(model, new_model, inp_dict) @@ -95,9 +93,7 @@ def test_move_scalar_add_past_matmul(): model = ModelWrapper(modelproto) model = model.transform(InferShapes()) model.set_initializer("add_param", np.asarray([[3]], dtype=np.float32)) - model.set_initializer( - "matmul_param", np.asarray([[2, 4], [-1, 1]], dtype=np.float32) - ) + model.set_initializer("matmul_param", np.asarray([[2, 4], [-1, 1]], dtype=np.float32)) new_model = model.transform(MoveScalarAddPastMatMul()) inp_dict = {"top_in": np.asarray([[-1.0, 1.0]], dtype=np.float32)} assert ox.compare_execution(model, new_model, inp_dict) diff --git a/tests/transformation/streamline/test_scale_resize_nhwc.py b/tests/transformation/streamline/test_scale_resize_nhwc.py index 5e107448f8..350f5b3133 100644 --- a/tests/transformation/streamline/test_scale_resize_nhwc.py +++ b/tests/transformation/streamline/test_scale_resize_nhwc.py @@ -18,9 +18,7 @@ def create_resize_transpose(ifm_dim, ifm_ch, scales, mode, idt): ofm_dim_h = ifm_dim[0] * scales[2] ofm_dim_w = ifm_dim[1] * scales[3] - inp = oh.make_tensor_value_info( - "inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim[0], ifm_dim[1]] - ) + inp = oh.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_ch, ifm_dim[0], ifm_dim[1]]) param = oh.make_tensor_value_info("scales", TensorProto.FLOAT, [4]) @@ -30,9 +28,7 @@ def create_resize_transpose(ifm_dim, ifm_ch, scales, mode, idt): outp_up = oh.make_tensor_value_info( "outp_up", TensorProto.FLOAT, [1, ifm_ch, ofm_dim_h, ofm_dim_w] ) - outp = oh.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, ifm_ch] - ) + outp = oh.make_tensor_value_info("outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, ifm_ch]) resize_node = oh.make_node( "Resize", @@ -73,18 +69,14 @@ def create_resize_transpose(ifm_dim, ifm_ch, scales, mode, idt): def create_transpose_resize(ifm_dim, ifm_ch, scales, mode, idt): ofm_dim_h = ifm_dim[0] * scales[2] ofm_dim_w = ifm_dim[1] * scales[3] 
- inp = oh.make_tensor_value_info( - "inp", TensorProto.FLOAT, [1, ifm_dim[0], ifm_dim[1], ifm_ch] - ) + inp = oh.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim[0], ifm_dim[1], ifm_ch]) param = oh.make_tensor_value_info("scales", TensorProto.FLOAT, [4]) # Not actually used, only needed for compliance with the Resize node interface roi = oh.make_tensor_value_info("roi", TensorProto.FLOAT, [4]) - outp = oh.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ifm_ch, ofm_dim_h, ofm_dim_w] - ) + outp = oh.make_tensor_value_info("outp", TensorProto.FLOAT, [1, ifm_ch, ofm_dim_h, ofm_dim_w]) outp_tr = oh.make_tensor_value_info( "outp_tr", TensorProto.FLOAT, [1, ifm_ch, ifm_dim[0], ifm_dim[1]] ) @@ -128,9 +120,7 @@ def create_transpose_resize(ifm_dim, ifm_ch, scales, mode, idt): def create_transpose_resize_transpose(ifm_dim, ifm_ch, scales, mode, idt): ofm_dim_h = ifm_dim[0] * scales[2] ofm_dim_w = ifm_dim[1] * scales[3] - inp = oh.make_tensor_value_info( - "inp", TensorProto.FLOAT, [1, ifm_dim[0], ifm_dim[1], ifm_ch] - ) + inp = oh.make_tensor_value_info("inp", TensorProto.FLOAT, [1, ifm_dim[0], ifm_dim[1], ifm_ch]) param = oh.make_tensor_value_info("scales", TensorProto.FLOAT, scales) @@ -144,9 +134,7 @@ def create_transpose_resize_transpose(ifm_dim, ifm_ch, scales, mode, idt): outp_up = oh.make_tensor_value_info( "outp_up", TensorProto.FLOAT, [1, ifm_ch, ofm_dim_h, ofm_dim_w] ) - outp = oh.make_tensor_value_info( - "outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, ifm_ch] - ) + outp = oh.make_tensor_value_info("outp", TensorProto.FLOAT, [1, ofm_dim_h, ofm_dim_w, ifm_ch]) transpose_node1 = onnx.helper.make_node( "Transpose", @@ -209,9 +197,7 @@ def check_transform(model): # input channels @pytest.mark.parametrize("ifm_ch", [3]) # scales -@pytest.mark.parametrize( - "scales", [[1, 1, i, j] for i in range(2, 5) for j in range(2, 5)] -) +@pytest.mark.parametrize("scales", [[1, 1, i, j] for i in range(2, 5) for j in range(2, 5)]) # mode 
@pytest.mark.parametrize("mode", ["nearest"]) # input datatype @@ -220,9 +206,7 @@ def test_scale_resize_nhwc(ifm_dim, ifm_ch, scales, mode, idt): # create models resize_model1 = create_resize_transpose(ifm_dim, ifm_ch, scales, mode, idt) resize_model2 = create_transpose_resize(ifm_dim, ifm_ch, scales, mode, idt) - resize_model3 = create_transpose_resize_transpose( - ifm_dim, ifm_ch, scales, mode, idt - ) + resize_model3 = create_transpose_resize_transpose(ifm_dim, ifm_ch, scales, mode, idt) # set initializers resize_model1.set_initializer("scales", np.array(scales, dtype=np.float32)) @@ -245,9 +229,7 @@ def test_scale_resize_nhwc(ifm_dim, ifm_ch, scales, mode, idt): # execute transformed model output_node_name1 = resize_model1.graph.output[0].name - output_dict1 = oxe.execute_onnx( - resize_model1, input_dict_nchw, return_full_exec_context=False - ) + output_dict1 = oxe.execute_onnx(resize_model1, input_dict_nchw, return_full_exec_context=False) output1 = output_dict1[output_node_name1] # compare outputs @@ -264,9 +246,7 @@ def test_scale_resize_nhwc(ifm_dim, ifm_ch, scales, mode, idt): # execute transformed model output_node_name2 = resize_model2.graph.output[0].name - output_dict2 = oxe.execute_onnx( - resize_model2, input_dict_nhwc, return_full_exec_context=False - ) + output_dict2 = oxe.execute_onnx(resize_model2, input_dict_nhwc, return_full_exec_context=False) output2 = output_dict2[output_node_name2] # compare outputs @@ -283,9 +263,7 @@ def test_scale_resize_nhwc(ifm_dim, ifm_ch, scales, mode, idt): # execute transformed model output_node_name3 = resize_model3.graph.output[0].name - output_dict3 = oxe.execute_onnx( - resize_model3, input_dict_nhwc, return_full_exec_context=False - ) + output_dict3 = oxe.execute_onnx(resize_model3, input_dict_nhwc, return_full_exec_context=False) output3 = output_dict3[output_node_name3] # compare outputs diff --git a/tests/transformation/streamline/test_sign_to_thres.py 
b/tests/transformation/streamline/test_sign_to_thres.py index 839680bd7a..1386592563 100644 --- a/tests/transformation/streamline/test_sign_to_thres.py +++ b/tests/transformation/streamline/test_sign_to_thres.py @@ -28,16 +28,19 @@ import pytest -import brevitas.onnx as bo import onnx import onnx.numpy_helper as nph import os +import torch +from brevitas.export import export_qonnx from pkgutil import get_data from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.fold_constants import FoldConstants from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.cleanup import cleanup as qonnx_cleanup import finn.core.onnx_exec as oxe +from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.transformation.streamline import ConvertSignToThres from finn.util.test import get_test_model_trained @@ -47,8 +50,10 @@ @pytest.mark.streamline def test_sign_to_thres(): lfc = get_test_model_trained("LFC", 1, 1) - bo.export_finn_onnx(lfc, (1, 1, 28, 28), export_onnx_path) + export_qonnx(lfc, torch.randn(1, 1, 28, 28), export_onnx_path) + qonnx_cleanup(export_onnx_path, out_file=export_onnx_path) model = ModelWrapper(export_onnx_path) + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) model = model.transform(FoldConstants()) new_model = model.transform(ConvertSignToThres()) diff --git a/tests/transformation/streamline/test_streamline_cnv.py b/tests/transformation/streamline/test_streamline_cnv.py index 6a82925012..8a91a49278 100644 --- a/tests/transformation/streamline/test_streamline_cnv.py +++ b/tests/transformation/streamline/test_streamline_cnv.py @@ -26,23 +26,26 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import pkg_resources as pk - import pytest -import brevitas.onnx as bo +import importlib_resources as importlib import numpy as np +import torch +from brevitas.export import export_qonnx from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.fold_constants import FoldConstants from qonnx.transformation.general import ( GiveReadableTensorNames, GiveUniqueNodeNames, + GiveUniqueParameterTensors, RemoveStaticGraphInputs, RemoveUnusedTensors, ) from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.cleanup import cleanup as qonnx_cleanup import finn.core.onnx_exec as oxe +from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.transformation.streamline import Streamline from finn.util.basic import make_build_dir from finn.util.test import get_test_model_trained @@ -63,16 +66,20 @@ def test_streamline_cnv(size, wbits, abits): nname = "%s_%dW%dA" % (size, wbits, abits) finn_onnx = export_onnx_path + "/%s.onnx" % nname fc = get_test_model_trained(size, wbits, abits) - bo.export_finn_onnx(fc, (1, 3, 32, 32), finn_onnx) + export_qonnx(fc, torch.randn(1, 3, 32, 32), finn_onnx) + qonnx_cleanup(finn_onnx, out_file=finn_onnx) model = ModelWrapper(finn_onnx) + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) model = model.transform(FoldConstants()) model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveUniqueParameterTensors()) model = model.transform(GiveReadableTensorNames()) model = model.transform(RemoveStaticGraphInputs()) # load one of the test vectors - fn = pk.resource_filename("finn.qnn-data", "cifar10/cifar10-test-data-class3.npz") - input_tensor = np.load(fn)["arr_0"].astype(np.float32) + ref = importlib.files("finn.qnn-data") / "cifar10/cifar10-test-data-class3.npz" + with importlib.as_file(ref) as fn: + input_tensor = np.load(fn)["arr_0"].astype(np.float32) input_tensor = input_tensor / 255 assert input_tensor.shape == (1, 3, 32, 32) # 
run using FINN-based execution diff --git a/tests/transformation/streamline/test_streamline_fc.py b/tests/transformation/streamline/test_streamline_fc.py index 9000821435..edc4a96fe2 100644 --- a/tests/transformation/streamline/test_streamline_fc.py +++ b/tests/transformation/streamline/test_streamline_fc.py @@ -28,22 +28,26 @@ import pytest -import brevitas.onnx as bo import numpy as np import onnx import onnx.numpy_helper as nph +import torch +from brevitas.export import export_qonnx from pkgutil import get_data from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.fold_constants import FoldConstants from qonnx.transformation.general import ( GiveReadableTensorNames, GiveUniqueNodeNames, + GiveUniqueParameterTensors, RemoveStaticGraphInputs, RemoveUnusedTensors, ) from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.cleanup import cleanup as qonnx_cleanup import finn.core.onnx_exec as oxe +from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.transformation.streamline import Streamline from finn.util.basic import make_build_dir from finn.util.test import get_test_model_trained @@ -66,11 +70,14 @@ def test_streamline_fc(size, wbits, abits): nname = "%s_%dW%dA" % (size, wbits, abits) finn_onnx = export_onnx_path + "/%s.onnx" % nname fc = get_test_model_trained(size, wbits, abits) - bo.export_finn_onnx(fc, (1, 1, 28, 28), finn_onnx) + export_qonnx(fc, torch.randn(1, 1, 28, 28), finn_onnx) + qonnx_cleanup(finn_onnx, out_file=finn_onnx) model = ModelWrapper(finn_onnx) + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) model = model.transform(FoldConstants()) model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveUniqueParameterTensors()) model = model.transform(GiveReadableTensorNames()) model = model.transform(RemoveStaticGraphInputs()) # load one of the test vectors diff --git 
a/tests/transformation/test_batchnorm_to_affine_bnn_pynq.py b/tests/transformation/test_batchnorm_to_affine_bnn_pynq.py index fd4e37807c..fd5033674b 100644 --- a/tests/transformation/test_batchnorm_to_affine_bnn_pynq.py +++ b/tests/transformation/test_batchnorm_to_affine_bnn_pynq.py @@ -26,22 +26,24 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import pkg_resources as pk - import pytest -import brevitas.onnx as bo +import importlib_resources as importlib import numpy as np import onnx import onnx.numpy_helper as nph import os +import torch +from brevitas.export import export_qonnx from pkgutil import get_data from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.batchnorm_to_affine import BatchNormToAffine from qonnx.transformation.fold_constants import FoldConstants from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.cleanup import cleanup as qonnx_cleanup import finn.core.onnx_exec as oxe +from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.util.test import get_test_model_trained export_onnx_path = "test_output_bn2affine.onnx" @@ -50,12 +52,15 @@ @pytest.mark.transform def test_batchnorm_to_affine_cnv_w1a1(): lfc = get_test_model_trained("CNV", 1, 1) - bo.export_finn_onnx(lfc, (1, 3, 32, 32), export_onnx_path) + export_qonnx(lfc, torch.randn(1, 3, 32, 32), export_onnx_path) + qonnx_cleanup(export_onnx_path, out_file=export_onnx_path) model = ModelWrapper(export_onnx_path) + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) model = model.transform(FoldConstants()) - fn = pk.resource_filename("finn.qnn-data", "cifar10/cifar10-test-data-class3.npz") - input_tensor = np.load(fn)["arr_0"].astype(np.float32) + ref = importlib.files("finn.qnn-data") / "cifar10/cifar10-test-data-class3.npz" + with importlib.as_file(ref) as fn: + input_tensor = 
np.load(fn)["arr_0"].astype(np.float32) input_tensor = input_tensor / 255 assert input_tensor.shape == (1, 3, 32, 32) input_dict = {"0": input_tensor} @@ -75,8 +80,10 @@ def test_batchnorm_to_affine_cnv_w1a1(): @pytest.mark.transform def test_batchnorm_to_affine_lfc_w1a1(): lfc = get_test_model_trained("LFC", 1, 1) - bo.export_finn_onnx(lfc, (1, 1, 28, 28), export_onnx_path) + export_qonnx(lfc, torch.randn(1, 1, 28, 28), export_onnx_path) + qonnx_cleanup(export_onnx_path, out_file=export_onnx_path) model = ModelWrapper(export_onnx_path) + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) model = model.transform(FoldConstants()) new_model = model.transform(BatchNormToAffine()) diff --git a/tests/transformation/test_infer_data_layouts_cnv.py b/tests/transformation/test_infer_data_layouts_cnv.py index 952ce306a4..fc9d98d24f 100644 --- a/tests/transformation/test_infer_data_layouts_cnv.py +++ b/tests/transformation/test_infer_data_layouts_cnv.py @@ -1,4 +1,5 @@ -# Copyright (c) 2020, Xilinx +# Copyright (c) 2020, Xilinx, Inc. +# Copyright (C) 2024, Advanced Micro Devices, Inc. # All rights reserved. 
# # Redistribution and use in source and binary forms, with or without @@ -28,19 +29,26 @@ import pytest -import brevitas.onnx as bo import os import qonnx.core.data_layout as DataLayout +import torch +from brevitas.export import export_qonnx from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.bipolar_to_xnor import ConvertBipolarMatMulToXnorPopcount from qonnx.transformation.fold_constants import FoldConstants -from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames +from qonnx.transformation.general import ( + GiveReadableTensorNames, + GiveUniqueNodeNames, + GiveUniqueParameterTensors, +) from qonnx.transformation.infer_data_layouts import InferDataLayouts from qonnx.transformation.infer_shapes import InferShapes from qonnx.transformation.lower_convs_to_matmul import LowerConvsToMatMul +from qonnx.util.cleanup import cleanup as qonnx_cleanup -import finn.transformation.fpgadataflow.convert_to_hls_layers as to_hls +import finn.transformation.fpgadataflow.convert_to_hw_layers as to_hw import finn.transformation.streamline.absorb as absorb +from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.transformation.streamline import Streamline from finn.transformation.streamline.reorder import MakeMaxPoolNHWC from finn.util.test import get_test_model_trained @@ -49,13 +57,17 @@ @pytest.mark.transform +@pytest.mark.xfail def test_infer_data_layouts_cnv(): cnv = get_test_model_trained("CNV", 1, 1) - bo.export_finn_onnx(cnv, (1, 3, 32, 32), export_onnx_path_cnv) + export_qonnx(cnv, torch.randn(1, 3, 32, 32), export_onnx_path_cnv) + qonnx_cleanup(export_onnx_path_cnv, out_file=export_onnx_path_cnv) model = ModelWrapper(export_onnx_path_cnv) + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) model = model.transform(FoldConstants()) model = model.transform(GiveUniqueNodeNames()) + model = model.transform(GiveUniqueParameterTensors()) model = 
model.transform(GiveReadableTensorNames()) model = model.transform(Streamline()) model = model.transform(InferDataLayouts()) @@ -90,10 +102,10 @@ def test_infer_data_layouts_cnv(): model = model.transform(absorb.AbsorbTransposeIntoMultiThreshold()) model = model.transform(ConvertBipolarMatMulToXnorPopcount()) model = model.transform(Streamline()) - model = model.transform(to_hls.InferBinaryMatrixVectorActivation()) - model = model.transform(to_hls.InferQuantizedMatrixVectorActivation()) - model = model.transform(to_hls.InferConvInpGen()) - model = model.transform(to_hls.InferStreamingMaxPool()) + model = model.transform(to_hw.InferBinaryMatrixVectorActivation()) + model = model.transform(to_hw.InferQuantizedMatrixVectorActivation()) + model = model.transform(to_hw.InferConvInpGen()) + model = model.transform(to_hw.InferStreamingMaxPool()) model = model.transform(GiveUniqueNodeNames()) model = model.transform(GiveReadableTensorNames()) model = model.transform(InferDataLayouts()) @@ -103,12 +115,10 @@ def test_infer_data_layouts_cnv(): # note: im2col output isn't really NHWC or any other common layout # since the concept of channels changes with lowering... 
but it is # conceptually close to NHWC since the innermost dim gets multiplied - assert ( - model.get_tensor_layout("ConvolutionInputGenerator_0_out0") == DataLayout.NHWC - ) - assert model.get_tensor_layout("MatrixVectorActivation_3_out0") == DataLayout.NHWC + assert model.get_tensor_layout("ConvolutionInputGenerator_0_out0") == DataLayout.NHWC + assert model.get_tensor_layout("MVAU_3_out0") == DataLayout.NHWC assert model.get_tensor_layout("Reshape_0_out0") == DataLayout.NC - assert model.get_tensor_layout("MatrixVectorActivation_6_out0") == DataLayout.NC + assert model.get_tensor_layout("MVAU_6_out0") == DataLayout.NC assert model.get_tensor_layout("global_out") == DataLayout.NC os.remove(export_onnx_path_cnv) diff --git a/tests/transformation/test_infer_datatypes_lfc.py b/tests/transformation/test_infer_datatypes_lfc.py index 9798005349..b9d9dc558f 100644 --- a/tests/transformation/test_infer_datatypes_lfc.py +++ b/tests/transformation/test_infer_datatypes_lfc.py @@ -28,15 +28,18 @@ import pytest -import brevitas.onnx as bo import os +import torch +from brevitas.export import export_qonnx from qonnx.core.datatype import DataType from qonnx.core.modelwrapper import ModelWrapper from qonnx.transformation.fold_constants import FoldConstants from qonnx.transformation.general import GiveReadableTensorNames, GiveUniqueNodeNames from qonnx.transformation.infer_datatypes import InferDataTypes from qonnx.transformation.infer_shapes import InferShapes +from qonnx.util.cleanup import cleanup as qonnx_cleanup +from finn.transformation.qonnx.convert_qonnx_to_finn import ConvertQONNXtoFINN from finn.util.test import get_test_model_trained export_onnx_path = "test_infer_datatypes.onnx" @@ -45,8 +48,10 @@ @pytest.mark.transform def test_infer_datatypes_lfc(): lfc = get_test_model_trained("LFC", 1, 1) - bo.export_finn_onnx(lfc, (1, 1, 28, 28), export_onnx_path) + export_qonnx(lfc, torch.randn(1, 1, 28, 28), export_onnx_path) + qonnx_cleanup(export_onnx_path, 
out_file=export_onnx_path) model = ModelWrapper(export_onnx_path) + model = model.transform(ConvertQONNXtoFINN()) model = model.transform(InferShapes()) model = model.transform(FoldConstants()) model = model.transform(GiveUniqueNodeNames()) diff --git a/tests/transformation/test_qonnx_to_finn.py b/tests/transformation/test_qonnx_to_finn.py index 7e438b4b8b..939082b87b 100644 --- a/tests/transformation/test_qonnx_to_finn.py +++ b/tests/transformation/test_qonnx_to_finn.py @@ -27,21 +27,16 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -import pkg_resources as pk - import pytest -import brevitas.export.onnx.generic as b_onnx -import brevitas.onnx as bo +import importlib_resources as importlib import numpy as np import onnx import onnx.numpy_helper as nph import torch +from brevitas.export import export_qonnx from pkgutil import get_data from qonnx.core.modelwrapper import ModelWrapper -from qonnx.transformation.fold_constants import FoldConstants -from qonnx.transformation.general import GiveUniqueNodeNames, RemoveStaticGraphInputs -from qonnx.transformation.infer_shapes import InferShapes from qonnx.util.cleanup import cleanup from tempfile import TemporaryDirectory @@ -59,10 +54,9 @@ def get_brev_model_and_sample_inputs(model_name, wbits, abits): brev_model = get_test_model_trained(model_name, wbits, abits) elif model_name == "CNV": in_shape = (1, 3, 32, 32) - fn = pk.resource_filename( - "finn.qnn-data", "cifar10/cifar10-test-data-class3.npz" - ) - input_tensor = np.load(fn)["arr_0"].astype(np.float32) + ref = importlib.files("finn.qnn-data") / "cifar10/cifar10-test-data-class3.npz" + with importlib.as_file(ref) as fn: + input_tensor = np.load(fn)["arr_0"].astype(np.float32) input_tensor = input_tensor / 255 brev_model = get_test_model_trained(model_name, wbits, abits) elif model_name == "mobilenet": @@ -96,7 +90,6 @@ def analysis_testing_for_no_quant_nodes(model): def test_QONNX_to_FINN(model_name, wbits, abits): if model_name == 
"mobilenet": pytest.xfail("MobileNet test is temporarily excluded from QONNX testing.") - if wbits > abits: pytest.skip("No wbits > abits cases at the moment") if model_name == "LFC" and wbits == 2 and abits == 2: @@ -105,42 +98,17 @@ def test_QONNX_to_FINN(model_name, wbits, abits): pytest.skip("Mobilenet only runs at W2A2, though it's technically W4A4.") # Get test config and model - ATOL = 1e-7 - brev_model, in_shape, input_tensor = get_brev_model_and_sample_inputs( - model_name, wbits, abits - ) + ATOL = 1e-6 + brev_model, in_shape, input_tensor = get_brev_model_and_sample_inputs(model_name, wbits, abits) temp_dir = TemporaryDirectory() qonnx_base_path = temp_dir.name + "/qonnx_{}.onnx" - finn_base_path = temp_dir.name + "/finn_{}.onnx" # Get Brevitas output torch_input_tensor = torch.from_numpy(input_tensor).float() brev_output = brev_model.forward(torch_input_tensor).detach().numpy() - # Get "clean" FINN model and it's output - _ = bo.export_finn_onnx(brev_model, in_shape, finn_base_path.format("raw")) - model = ModelWrapper(finn_base_path.format("raw")) - model = model.transform(GiveUniqueNodeNames()) - model = model.transform(InferShapes()) - model = model.transform(FoldConstants()) - model = model.transform(RemoveStaticGraphInputs()) - model.save(finn_base_path.format("clean")) - - model = ModelWrapper(finn_base_path.format("clean")) - input_dict = {model.graph.input[0].name: input_tensor} - output_dict = oxe.execute_onnx(model, input_dict, False) - finn_export_output = output_dict[model.graph.output[0].name] - # This test always fails on MobileNet for some reason - if model_name != "mobilenet": - assert np.isclose( - brev_output, finn_export_output, atol=ATOL - ).all(), "The output of the Brevitas model and the FINN model should match." 
- - # Get the equivalent QONNX model - b_onnx.function.DOMAIN_STRING = "qonnx.custom_op.general" - _ = b_onnx.manager.BrevitasONNXManager.export( - brev_model, in_shape, qonnx_base_path.format("raw") - ) + # Get QONNX model + _ = export_qonnx(brev_model, torch.randn(in_shape), qonnx_base_path.format("raw")) cleanup(qonnx_base_path.format("raw"), out_file=qonnx_base_path.format("clean")) # Compare output @@ -151,11 +119,6 @@ def test_QONNX_to_FINN(model_name, wbits, abits): assert np.isclose( brev_output, qonnx_export_output, atol=ATOL ).all(), "The output of the Brevitas model and the QONNX model should match." - # This test always fails on MobileNet for some reason - if model_name != "mobilenet": - assert np.isclose( - qonnx_export_output, finn_export_output, atol=ATOL - ).all(), "The output of the FINN model and the QONNX model should match." # Run QONNX to FINN conversion model = ModelWrapper(qonnx_base_path.format("clean")) @@ -167,9 +130,8 @@ def test_QONNX_to_FINN(model_name, wbits, abits): input_dict = {model.graph.input[0].name: input_tensor} output_dict = oxe.execute_onnx(model, input_dict, False) test_output = output_dict[model.graph.output[0].name] - assert np.isclose(test_output, finn_export_output, atol=ATOL).all(), ( - "The output of the FINN model " - "and the QONNX -> FINN converted model should match." + assert np.isclose(test_output, qonnx_export_output, atol=ATOL).all(), ( + "The output of the FINN model " "and the QONNX -> FINN converted model should match." ) # Run analysis passes on the converted model diff --git a/tests/util/test_build_dataflow.py b/tests/util/test_build_dataflow.py index 39f0b0dc89..c8f80a8e1b 100644 --- a/tests/util/test_build_dataflow.py +++ b/tests/util/test_build_dataflow.py @@ -26,8 +26,6 @@ # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import pkg_resources as pk - import pytest import numpy as np @@ -44,7 +42,7 @@ def test_end2end_build_dataflow_directory(): test_dir = make_build_dir("test_build_dataflow_directory_") target_dir = test_dir + "/build_dataflow" - example_data_dir = pk.resource_filename("finn.qnn-data", "build_dataflow/") + example_data_dir = os.environ["FINN_ROOT"] + "/src/finn/qnn-data/build_dataflow" copytree(example_data_dir, target_dir) build_dataflow_directory(target_dir) # check the generated files @@ -52,14 +50,13 @@ def test_end2end_build_dataflow_directory(): assert os.path.isfile(output_dir + "/time_per_step.json") assert os.path.isfile(output_dir + "/auto_folding_config.json") assert os.path.isfile(output_dir + "/final_hw_config.json") + assert os.path.isfile(output_dir + "/template_specialize_layers_config.json") assert os.path.isfile(output_dir + "/stitched_ip/ip/component.xml") assert os.path.isfile(output_dir + "/driver/driver.py") assert os.path.isfile(output_dir + "/report/estimate_layer_cycles.json") assert os.path.isfile(output_dir + "/report/estimate_layer_resources.json") assert os.path.isfile(output_dir + "/report/rtlsim_perf_batch_1.vcd") - assert os.path.isfile( - output_dir + "/report/estimate_layer_config_alternatives.json" - ) + assert os.path.isfile(output_dir + "/report/estimate_layer_config_alternatives.json") assert os.path.isfile(output_dir + "/report/estimate_network_performance.json") assert os.path.isfile(output_dir + "/report/ooc_synth_and_timing.json") assert os.path.isfile(output_dir + "/report/rtlsim_performance.json") @@ -71,16 +68,8 @@ def test_end2end_build_dataflow_directory(): verif_batchsize = np.load(target_dir + "/input.npy").shape[0] for i in range(verif_batchsize): verify_out_dir = output_dir + "/verification_output" - assert os.path.isfile( - verify_out_dir + f"/verify_initial_python_{i}_SUCCESS.npy" - ) - assert os.path.isfile( - verify_out_dir + f"/verify_streamlined_python_{i}_SUCCESS.npy" - ) - assert os.path.isfile( - 
verify_out_dir + f"/verify_folded_hls_cppsim_{i}_SUCCESS.npy" - ) - assert os.path.isfile( - verify_out_dir + f"/verify_stitched_ip_rtlsim_{i}_SUCCESS.npy" - ) + assert os.path.isfile(verify_out_dir + f"/verify_initial_python_{i}_SUCCESS.npy") + assert os.path.isfile(verify_out_dir + f"/verify_streamlined_python_{i}_SUCCESS.npy") + assert os.path.isfile(verify_out_dir + f"/verify_folded_hls_cppsim_{i}_SUCCESS.npy") + assert os.path.isfile(verify_out_dir + f"/verify_stitched_ip_rtlsim_{i}_SUCCESS.npy") assert os.path.isfile(output_dir + f"/report/verify_rtlsim_{i}.vcd") diff --git a/tests/util/test_create.py b/tests/util/test_create.py index dc44e4bd45..b8b439cf18 100644 --- a/tests/util/test_create.py +++ b/tests/util/test_create.py @@ -34,9 +34,7 @@ @pytest.mark.util -@pytest.mark.parametrize( - "bitwidth", [DataType["BIPOLAR"], DataType["INT2"], DataType["INT4"]] -) +@pytest.mark.parametrize("bitwidth", [DataType["BIPOLAR"], DataType["INT2"], DataType["INT4"]]) def test_hls_random_mlp_maker(bitwidth): w = bitwidth a = bitwidth diff --git a/tests/util/test_data_packing_hls.py b/tests/util/test_data_packing_hls.py index 859b926543..b95bcd5d42 100644 --- a/tests/util/test_data_packing_hls.py +++ b/tests/util/test_data_packing_hls.py @@ -105,16 +105,12 @@ def test_npy2apintstream(test_shape, dtype): ) with open(test_dir + "/compile.sh", "w") as f: f.write(cmd_compile) - compile = subprocess.Popen( - ["sh", "compile.sh"], stdout=subprocess.PIPE, cwd=test_dir - ) + compile = subprocess.Popen(["sh", "compile.sh"], stdout=subprocess.PIPE, cwd=test_dir) (stdout, stderr) = compile.communicate() # make copy before saving the array ndarray = ndarray.copy() np.save(npy_in, ndarray) - execute = subprocess.Popen( - "./test_npy2apintstream", stdout=subprocess.PIPE, cwd=test_dir - ) + execute = subprocess.Popen("./test_npy2apintstream", stdout=subprocess.PIPE, cwd=test_dir) (stdout, stderr) = execute.communicate() produced = np.load(npy_out) success = (produced == ndarray).all() 
diff --git a/tutorials/fpga_flow/gen_tb_data.py b/tutorials/fpga_flow/gen_tb_data.py index a525d92bfc..e73fd65094 100755 --- a/tutorials/fpga_flow/gen_tb_data.py +++ b/tutorials/fpga_flow/gen_tb_data.py @@ -48,9 +48,7 @@ tb_data.write("{:02X}".format(test_x[i][j][k])) tb_data.write("\n") tb_data.write( - "ffffffffffffffffffffffffffffffffffffffffffffffffffffff{:02X}\n".format( - test_y[i] - ) + "ffffffffffffffffffffffffffffffffffffffffffffffffffffff{:02X}\n".format(test_y[i]) ) print("Testbench data generated at " + file_name)